Coverage Report

Created: 2017-10-03 07:32

/Users/buildslave/jenkins/sharedspace/clang-stage2-coverage-R@2/llvm/lib/Target/X86/X86ISelLowering.cpp
Line
Count
Source (jump to first uncovered line)
1
//===-- X86ISelLowering.cpp - X86 DAG Lowering Implementation -------------===//
2
//
3
//                     The LLVM Compiler Infrastructure
4
//
5
// This file is distributed under the University of Illinois Open Source
6
// License. See LICENSE.TXT for details.
7
//
8
//===----------------------------------------------------------------------===//
9
//
10
// This file defines the interfaces that X86 uses to lower LLVM code into a
11
// selection DAG.
12
//
13
//===----------------------------------------------------------------------===//
14
15
#include "X86ISelLowering.h"
16
#include "Utils/X86ShuffleDecode.h"
17
#include "X86CallingConv.h"
18
#include "X86FrameLowering.h"
19
#include "X86InstrBuilder.h"
20
#include "X86IntrinsicsInfo.h"
21
#include "X86MachineFunctionInfo.h"
22
#include "X86ShuffleDecodeConstantPool.h"
23
#include "X86TargetMachine.h"
24
#include "X86TargetObjectFile.h"
25
#include "llvm/ADT/SmallBitVector.h"
26
#include "llvm/ADT/SmallSet.h"
27
#include "llvm/ADT/Statistic.h"
28
#include "llvm/ADT/StringExtras.h"
29
#include "llvm/ADT/StringSwitch.h"
30
#include "llvm/Analysis/EHPersonalities.h"
31
#include "llvm/CodeGen/IntrinsicLowering.h"
32
#include "llvm/CodeGen/MachineFrameInfo.h"
33
#include "llvm/CodeGen/MachineFunction.h"
34
#include "llvm/CodeGen/MachineInstrBuilder.h"
35
#include "llvm/CodeGen/MachineJumpTableInfo.h"
36
#include "llvm/CodeGen/MachineModuleInfo.h"
37
#include "llvm/CodeGen/MachineRegisterInfo.h"
38
#include "llvm/CodeGen/WinEHFuncInfo.h"
39
#include "llvm/IR/CallSite.h"
40
#include "llvm/IR/CallingConv.h"
41
#include "llvm/IR/Constants.h"
42
#include "llvm/IR/DerivedTypes.h"
43
#include "llvm/IR/DiagnosticInfo.h"
44
#include "llvm/IR/Function.h"
45
#include "llvm/IR/GlobalAlias.h"
46
#include "llvm/IR/GlobalVariable.h"
47
#include "llvm/IR/Instructions.h"
48
#include "llvm/IR/Intrinsics.h"
49
#include "llvm/MC/MCAsmInfo.h"
50
#include "llvm/MC/MCContext.h"
51
#include "llvm/MC/MCExpr.h"
52
#include "llvm/MC/MCSymbol.h"
53
#include "llvm/Support/CommandLine.h"
54
#include "llvm/Support/Debug.h"
55
#include "llvm/Support/ErrorHandling.h"
56
#include "llvm/Support/KnownBits.h"
57
#include "llvm/Support/MathExtras.h"
58
#include "llvm/Target/TargetLowering.h"
59
#include "llvm/Target/TargetOptions.h"
60
#include <algorithm>
61
#include <bitset>
62
#include <cctype>
63
#include <numeric>
64
using namespace llvm;
65
66
#define DEBUG_TYPE "x86-isel"
67
68
STATISTIC(NumTailCalls, "Number of tail calls");
69
70
static cl::opt<bool> ExperimentalVectorWideningLegalization(
71
    "x86-experimental-vector-widening-legalization", cl::init(false),
72
    cl::desc("Enable an experimental vector type legalization through widening "
73
             "rather than promotion."),
74
    cl::Hidden);
75
76
static cl::opt<int> ExperimentalPrefLoopAlignment(
77
    "x86-experimental-pref-loop-alignment", cl::init(4),
78
    cl::desc("Sets the preferable loop alignment for experiments "
79
             "(the last x86-experimental-pref-loop-alignment bits"
80
             " of the loop header PC will be 0)."),
81
    cl::Hidden);
82
83
static cl::opt<bool> MulConstantOptimization(
84
    "mul-constant-optimization", cl::init(true),
85
    cl::desc("Replace 'mul x, Const' with more effective instructions like "
86
             "SHIFT, LEA, etc."),
87
    cl::Hidden);
88
89
/// Call this when the user attempts to do something unsupported, like
90
/// returning a double without SSE2 enabled on x86_64. This is not fatal, unlike
91
/// report_fatal_error, so calling code should attempt to recover without
92
/// crashing.
93
static void errorUnsupported(SelectionDAG &DAG, const SDLoc &dl,
94
10
                             const char *Msg) {
95
10
  MachineFunction &MF = DAG.getMachineFunction();
96
10
  DAG.getContext()->diagnose(
97
10
      DiagnosticInfoUnsupported(*MF.getFunction(), Msg, dl.getDebugLoc()));
98
10
}
99
100
X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
101
                                     const X86Subtarget &STI)
102
9.65k
    : TargetLowering(TM), Subtarget(STI) {
103
9.63k
  bool UseX87 = !Subtarget.useSoftFloat() && Subtarget.hasX87();
104
9.65k
  X86ScalarSSEf64 = Subtarget.hasSSE2();
105
9.65k
  X86ScalarSSEf32 = Subtarget.hasSSE1();
106
9.65k
  MVT PtrVT = MVT::getIntegerVT(8 * TM.getPointerSize());
107
9.65k
108
9.65k
  // Set up the TargetLowering object.
109
9.65k
110
9.65k
  // X86 is weird. It always uses i8 for shift amounts and setcc results.
111
9.65k
  setBooleanContents(ZeroOrOneBooleanContent);
112
9.65k
  // X86-SSE is even stranger. It uses -1 or 0 for vector masks.
113
9.65k
  setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);
114
9.65k
115
9.65k
  // For 64-bit, since we have so many registers, use the ILP scheduler.
116
9.65k
  // For 32-bit, use the register pressure specific scheduling.
117
9.65k
  // For Atom, always use ILP scheduling.
118
9.65k
  if (Subtarget.isAtom())
119
38
    setSchedulingPreference(Sched::ILP);
120
9.61k
  else 
if (9.61k
Subtarget.is64Bit()9.61k
)
121
7.09k
    setSchedulingPreference(Sched::ILP);
122
9.61k
  else
123
2.52k
    setSchedulingPreference(Sched::RegPressure);
124
9.65k
  const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
125
9.65k
  setStackPointerRegisterToSaveRestore(RegInfo->getStackRegister());
126
9.65k
127
9.65k
  // Bypass expensive divides and use cheaper ones.
128
9.65k
  if (
TM.getOptLevel() >= CodeGenOpt::Default9.65k
) {
129
8.05k
    if (Subtarget.hasSlowDivide32())
130
36
      addBypassSlowDiv(32, 8);
131
8.05k
    if (
Subtarget.hasSlowDivide64() && 8.05k
Subtarget.is64Bit()750
)
132
678
      addBypassSlowDiv(64, 32);
133
8.05k
  }
134
9.65k
135
9.65k
  if (Subtarget.isTargetKnownWindowsMSVC() ||
136
9.65k
      
Subtarget.isTargetWindowsItanium()9.19k
) {
137
472
    // Setup Windows compiler runtime calls.
138
472
    setLibcallName(RTLIB::SDIV_I64, "_alldiv");
139
472
    setLibcallName(RTLIB::UDIV_I64, "_aulldiv");
140
472
    setLibcallName(RTLIB::SREM_I64, "_allrem");
141
472
    setLibcallName(RTLIB::UREM_I64, "_aullrem");
142
472
    setLibcallName(RTLIB::MUL_I64, "_allmul");
143
472
    setLibcallCallingConv(RTLIB::SDIV_I64, CallingConv::X86_StdCall);
144
472
    setLibcallCallingConv(RTLIB::UDIV_I64, CallingConv::X86_StdCall);
145
472
    setLibcallCallingConv(RTLIB::SREM_I64, CallingConv::X86_StdCall);
146
472
    setLibcallCallingConv(RTLIB::UREM_I64, CallingConv::X86_StdCall);
147
472
    setLibcallCallingConv(RTLIB::MUL_I64, CallingConv::X86_StdCall);
148
472
  }
149
9.65k
150
9.65k
  if (
Subtarget.isTargetDarwin()9.65k
) {
151
4.09k
    // Darwin should use _setjmp/_longjmp instead of setjmp/longjmp.
152
4.09k
    setUseUnderscoreSetJmp(false);
153
4.09k
    setUseUnderscoreLongJmp(false);
154
9.65k
  } else 
if (5.55k
Subtarget.isTargetWindowsGNU()5.55k
) {
155
69
    // MS runtime is weird: it exports _setjmp, but longjmp!
156
69
    setUseUnderscoreSetJmp(true);
157
69
    setUseUnderscoreLongJmp(false);
158
5.55k
  } else {
159
5.49k
    setUseUnderscoreSetJmp(true);
160
5.49k
    setUseUnderscoreLongJmp(true);
161
5.49k
  }
162
9.65k
163
9.65k
  // Set up the register classes.
164
9.65k
  addRegisterClass(MVT::i8, &X86::GR8RegClass);
165
9.65k
  addRegisterClass(MVT::i16, &X86::GR16RegClass);
166
9.65k
  addRegisterClass(MVT::i32, &X86::GR32RegClass);
167
9.65k
  if (Subtarget.is64Bit())
168
7.11k
    addRegisterClass(MVT::i64, &X86::GR64RegClass);
169
9.65k
170
9.65k
  for (MVT VT : MVT::integer_valuetypes())
171
57.9k
    setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote);
172
9.65k
173
9.65k
  // We don't accept any truncstore of integer registers.
174
9.65k
  setTruncStoreAction(MVT::i64, MVT::i32, Expand);
175
9.65k
  setTruncStoreAction(MVT::i64, MVT::i16, Expand);
176
9.65k
  setTruncStoreAction(MVT::i64, MVT::i8 , Expand);
177
9.65k
  setTruncStoreAction(MVT::i32, MVT::i16, Expand);
178
9.65k
  setTruncStoreAction(MVT::i32, MVT::i8 , Expand);
179
9.65k
  setTruncStoreAction(MVT::i16, MVT::i8,  Expand);
180
9.65k
181
9.65k
  setTruncStoreAction(MVT::f64, MVT::f32, Expand);
182
9.65k
183
9.65k
  // SETOEQ and SETUNE require checking two conditions.
184
9.65k
  setCondCodeAction(ISD::SETOEQ, MVT::f32, Expand);
185
9.65k
  setCondCodeAction(ISD::SETOEQ, MVT::f64, Expand);
186
9.65k
  setCondCodeAction(ISD::SETOEQ, MVT::f80, Expand);
187
9.65k
  setCondCodeAction(ISD::SETUNE, MVT::f32, Expand);
188
9.65k
  setCondCodeAction(ISD::SETUNE, MVT::f64, Expand);
189
9.65k
  setCondCodeAction(ISD::SETUNE, MVT::f80, Expand);
190
9.65k
191
9.65k
  // Promote all UINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have this
192
9.65k
  // operation.
193
9.65k
  setOperationAction(ISD::UINT_TO_FP       , MVT::i1   , Promote);
194
9.65k
  setOperationAction(ISD::UINT_TO_FP       , MVT::i8   , Promote);
195
9.65k
  setOperationAction(ISD::UINT_TO_FP       , MVT::i16  , Promote);
196
9.65k
197
9.65k
  if (
Subtarget.is64Bit()9.65k
) {
198
7.11k
    if (
!Subtarget.useSoftFloat() && 7.11k
Subtarget.hasAVX512()7.10k
)
199
7.11k
      // f32/f64 are legal, f80 is custom.
200
653
      setOperationAction(ISD::UINT_TO_FP   , MVT::i32  , Custom);
201
7.11k
    else
202
6.46k
      setOperationAction(ISD::UINT_TO_FP   , MVT::i32  , Promote);
203
7.11k
    setOperationAction(ISD::UINT_TO_FP     , MVT::i64  , Custom);
204
9.65k
  } else 
if (2.53k
!Subtarget.useSoftFloat()2.53k
) {
205
2.53k
    // We have an algorithm for SSE2->double, and we turn this into a
206
2.53k
    // 64-bit FILD followed by conditional FADD for other targets.
207
2.53k
    setOperationAction(ISD::UINT_TO_FP     , MVT::i64  , Custom);
208
2.53k
    // We have an algorithm for SSE2, and we turn this into a 64-bit
209
2.53k
    // FILD or VCVTUSI2SS/SD for other targets.
210
2.53k
    setOperationAction(ISD::UINT_TO_FP     , MVT::i32  , Custom);
211
2.53k
  }
212
9.65k
213
9.65k
  // Promote i1/i8 SINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have
214
9.65k
  // this operation.
215
9.65k
  setOperationAction(ISD::SINT_TO_FP       , MVT::i1   , Promote);
216
9.65k
  setOperationAction(ISD::SINT_TO_FP       , MVT::i8   , Promote);
217
9.65k
218
9.65k
  if (
!Subtarget.useSoftFloat()9.65k
) {
219
9.63k
    // SSE has no i16 to fp conversion, only i32.
220
9.63k
    if (
X86ScalarSSEf329.63k
) {
221
8.29k
      setOperationAction(ISD::SINT_TO_FP     , MVT::i16  , Promote);
222
8.29k
      // f32 and f64 cases are Legal, f80 case is not
223
8.29k
      setOperationAction(ISD::SINT_TO_FP     , MVT::i32  , Custom);
224
9.63k
    } else {
225
1.34k
      setOperationAction(ISD::SINT_TO_FP     , MVT::i16  , Custom);
226
1.34k
      setOperationAction(ISD::SINT_TO_FP     , MVT::i32  , Custom);
227
1.34k
    }
228
9.65k
  } else {
229
13
    setOperationAction(ISD::SINT_TO_FP     , MVT::i16  , Promote);
230
13
    setOperationAction(ISD::SINT_TO_FP     , MVT::i32  , Promote);
231
13
  }
232
9.65k
233
9.65k
  // Promote i1/i8 FP_TO_SINT to larger FP_TO_SINTS's, as X86 doesn't have
234
9.65k
  // this operation.
235
9.65k
  setOperationAction(ISD::FP_TO_SINT       , MVT::i1   , Promote);
236
9.65k
  setOperationAction(ISD::FP_TO_SINT       , MVT::i8   , Promote);
237
9.65k
238
9.65k
  if (
!Subtarget.useSoftFloat()9.65k
) {
239
9.63k
    // In 32-bit mode these are custom lowered.  In 64-bit mode F32 and F64
240
9.63k
    // are Legal, f80 is custom lowered.
241
9.63k
    setOperationAction(ISD::FP_TO_SINT     , MVT::i64  , Custom);
242
9.63k
    setOperationAction(ISD::SINT_TO_FP     , MVT::i64  , Custom);
243
9.63k
244
9.63k
    if (
X86ScalarSSEf329.63k
) {
245
8.29k
      setOperationAction(ISD::FP_TO_SINT     , MVT::i16  , Promote);
246
8.29k
      // f32 and f64 cases are Legal, f80 case is not
247
8.29k
      setOperationAction(ISD::FP_TO_SINT     , MVT::i32  , Custom);
248
9.63k
    } else {
249
1.34k
      setOperationAction(ISD::FP_TO_SINT     , MVT::i16  , Custom);
250
1.34k
      setOperationAction(ISD::FP_TO_SINT     , MVT::i32  , Custom);
251
1.34k
    }
252
9.65k
  } else {
253
13
    setOperationAction(ISD::FP_TO_SINT     , MVT::i16  , Promote);
254
13
    setOperationAction(ISD::FP_TO_SINT     , MVT::i32  , Expand);
255
13
    setOperationAction(ISD::FP_TO_SINT     , MVT::i64  , Expand);
256
13
  }
257
9.65k
258
9.65k
  // Handle FP_TO_UINT by promoting the destination to a larger signed
259
9.65k
  // conversion.
260
9.65k
  setOperationAction(ISD::FP_TO_UINT       , MVT::i1   , Promote);
261
9.65k
  setOperationAction(ISD::FP_TO_UINT       , MVT::i8   , Promote);
262
9.65k
  setOperationAction(ISD::FP_TO_UINT       , MVT::i16  , Promote);
263
9.65k
264
9.65k
  if (
Subtarget.is64Bit()9.65k
) {
265
7.11k
    if (
!Subtarget.useSoftFloat() && 7.11k
Subtarget.hasAVX512()7.10k
) {
266
653
      // FP_TO_UINT-i32/i64 is legal for f32/f64, but custom for f80.
267
653
      setOperationAction(ISD::FP_TO_UINT   , MVT::i32  , Custom);
268
653
      setOperationAction(ISD::FP_TO_UINT   , MVT::i64  , Custom);
269
7.11k
    } else {
270
6.46k
      setOperationAction(ISD::FP_TO_UINT   , MVT::i32  , Promote);
271
6.46k
      setOperationAction(ISD::FP_TO_UINT   , MVT::i64  , Expand);
272
6.46k
    }
273
9.65k
  } else 
if (2.53k
!Subtarget.useSoftFloat()2.53k
) {
274
2.53k
    // Since AVX is a superset of SSE3, only check for SSE here.
275
2.53k
    if (
Subtarget.hasSSE1() && 2.53k
!Subtarget.hasSSE3()1.20k
)
276
2.53k
      // Expand FP_TO_UINT into a select.
277
2.53k
      // FIXME: We would like to use a Custom expander here eventually to do
278
2.53k
      // the optimal thing for SSE vs. the default expansion in the legalizer.
279
246
      setOperationAction(ISD::FP_TO_UINT   , MVT::i32  , Expand);
280
2.53k
    else
281
2.53k
      // With AVX512 we can use vcvts[ds]2usi for f32/f64->i32, f80 is custom.
282
2.53k
      // With SSE3 we can use fisttpll to convert to a signed i64; without
283
2.53k
      // SSE, we're stuck with a fistpll.
284
2.28k
      setOperationAction(ISD::FP_TO_UINT   , MVT::i32  , Custom);
285
2.53k
286
2.53k
    setOperationAction(ISD::FP_TO_UINT     , MVT::i64  , Custom);
287
2.53k
  }
288
9.65k
289
9.65k
  // TODO: when we have SSE, these could be more efficient, by using movd/movq.
290
9.65k
  if (
!X86ScalarSSEf649.65k
) {
291
1.39k
    setOperationAction(ISD::BITCAST        , MVT::f32  , Expand);
292
1.39k
    setOperationAction(ISD::BITCAST        , MVT::i32  , Expand);
293
1.39k
    if (
Subtarget.is64Bit()1.39k
) {
294
22
      setOperationAction(ISD::BITCAST      , MVT::f64  , Expand);
295
22
      // Without SSE, i64->f64 goes through memory.
296
22
      setOperationAction(ISD::BITCAST      , MVT::i64  , Expand);
297
22
    }
298
9.65k
  } else 
if (8.25k
!Subtarget.is64Bit()8.25k
)
299
1.16k
    setOperationAction(ISD::BITCAST      , MVT::i64  , Custom);
300
9.65k
301
9.65k
  // Scalar integer divide and remainder are lowered to use operations that
302
9.65k
  // produce two results, to match the available instructions. This exposes
303
9.65k
  // the two-result form to trivial CSE, which is able to combine x/y and x%y
304
9.65k
  // into a single instruction.
305
9.65k
  //
306
9.65k
  // Scalar integer multiply-high is also lowered to use two-result
307
9.65k
  // operations, to match the available instructions. However, plain multiply
308
9.65k
  // (low) operations are left as Legal, as there are single-result
309
9.65k
  // instructions for this in x86. Using the two-result multiply instructions
310
9.65k
  // when both high and low results are needed must be arranged by dagcombine.
311
38.5k
  for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
312
38.5k
    setOperationAction(ISD::MULHS, VT, Expand);
313
38.5k
    setOperationAction(ISD::MULHU, VT, Expand);
314
38.5k
    setOperationAction(ISD::SDIV, VT, Expand);
315
38.5k
    setOperationAction(ISD::UDIV, VT, Expand);
316
38.5k
    setOperationAction(ISD::SREM, VT, Expand);
317
38.5k
    setOperationAction(ISD::UREM, VT, Expand);
318
38.5k
  }
319
9.65k
320
9.65k
  setOperationAction(ISD::BR_JT            , MVT::Other, Expand);
321
9.65k
  setOperationAction(ISD::BRCOND           , MVT::Other, Custom);
322
9.65k
  for (auto VT : { MVT::f32, MVT::f64, MVT::f80, MVT::f128,
323
77.1k
                   MVT::i8,  MVT::i16, MVT::i32, MVT::i64 }) {
324
77.1k
    setOperationAction(ISD::BR_CC,     VT, Expand);
325
77.1k
    setOperationAction(ISD::SELECT_CC, VT, Expand);
326
77.1k
  }
327
9.65k
  if (Subtarget.is64Bit())
328
7.11k
    setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Legal);
329
9.65k
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16  , Legal);
330
9.65k
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8   , Legal);
331
9.65k
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1   , Expand);
332
9.65k
  setOperationAction(ISD::FP_ROUND_INREG   , MVT::f32  , Expand);
333
9.65k
334
9.65k
  setOperationAction(ISD::FREM             , MVT::f32  , Expand);
335
9.65k
  setOperationAction(ISD::FREM             , MVT::f64  , Expand);
336
9.65k
  setOperationAction(ISD::FREM             , MVT::f80  , Expand);
337
9.65k
  setOperationAction(ISD::FLT_ROUNDS_      , MVT::i32  , Custom);
338
9.65k
339
9.65k
  // Promote the i8 variants and force them on up to i32 which has a shorter
340
9.65k
  // encoding.
341
9.65k
  setOperationPromotedToType(ISD::CTTZ           , MVT::i8   , MVT::i32);
342
9.65k
  setOperationPromotedToType(ISD::CTTZ_ZERO_UNDEF, MVT::i8   , MVT::i32);
343
9.65k
  if (
!Subtarget.hasBMI()9.65k
) {
344
8.83k
    setOperationAction(ISD::CTTZ           , MVT::i16  , Custom);
345
8.83k
    setOperationAction(ISD::CTTZ           , MVT::i32  , Custom);
346
8.83k
    setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i16  , Legal);
347
8.83k
    setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32  , Legal);
348
8.83k
    if (
Subtarget.is64Bit()8.83k
) {
349
6.35k
      setOperationAction(ISD::CTTZ         , MVT::i64  , Custom);
350
6.35k
      setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i64, Legal);
351
6.35k
    }
352
8.83k
  }
353
9.65k
354
9.65k
  if (
Subtarget.hasLZCNT()9.65k
) {
355
832
    // When promoting the i8 variants, force them to i32 for a shorter
356
832
    // encoding.
357
832
    setOperationPromotedToType(ISD::CTLZ           , MVT::i8   , MVT::i32);
358
832
    setOperationPromotedToType(ISD::CTLZ_ZERO_UNDEF, MVT::i8   , MVT::i32);
359
9.65k
  } else {
360
8.81k
    setOperationAction(ISD::CTLZ           , MVT::i8   , Custom);
361
8.81k
    setOperationAction(ISD::CTLZ           , MVT::i16  , Custom);
362
8.81k
    setOperationAction(ISD::CTLZ           , MVT::i32  , Custom);
363
8.81k
    setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i8   , Custom);
364
8.81k
    setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i16  , Custom);
365
8.81k
    setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32  , Custom);
366
8.81k
    if (
Subtarget.is64Bit()8.81k
) {
367
6.34k
      setOperationAction(ISD::CTLZ         , MVT::i64  , Custom);
368
6.34k
      setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i64, Custom);
369
6.34k
    }
370
8.81k
  }
371
9.65k
372
9.65k
  // Special handling for half-precision floating point conversions.
373
9.65k
  // If we don't have F16C support, then lower half float conversions
374
9.65k
  // into library calls.
375
9.65k
  if (Subtarget.useSoftFloat() ||
376
9.65k
      
(!Subtarget.hasF16C() && 9.63k
!Subtarget.hasAVX512()8.79k
)) {
377
8.28k
    setOperationAction(ISD::FP16_TO_FP, MVT::f32, Expand);
378
8.28k
    setOperationAction(ISD::FP_TO_FP16, MVT::f32, Expand);
379
8.28k
  }
380
9.65k
381
9.65k
  // There's never any support for operations beyond MVT::f32.
382
9.65k
  setOperationAction(ISD::FP16_TO_FP, MVT::f64, Expand);
383
9.65k
  setOperationAction(ISD::FP16_TO_FP, MVT::f80, Expand);
384
9.65k
  setOperationAction(ISD::FP_TO_FP16, MVT::f64, Expand);
385
9.65k
  setOperationAction(ISD::FP_TO_FP16, MVT::f80, Expand);
386
9.65k
387
9.65k
  setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::f16, Expand);
388
9.65k
  setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f16, Expand);
389
9.65k
  setLoadExtAction(ISD::EXTLOAD, MVT::f80, MVT::f16, Expand);
390
9.65k
  setTruncStoreAction(MVT::f32, MVT::f16, Expand);
391
9.65k
  setTruncStoreAction(MVT::f64, MVT::f16, Expand);
392
9.65k
  setTruncStoreAction(MVT::f80, MVT::f16, Expand);
393
9.65k
394
9.65k
  if (
Subtarget.hasPOPCNT()9.65k
) {
395
1.33k
    setOperationAction(ISD::CTPOP          , MVT::i8   , Promote);
396
9.65k
  } else {
397
8.31k
    setOperationAction(ISD::CTPOP          , MVT::i8   , Expand);
398
8.31k
    setOperationAction(ISD::CTPOP          , MVT::i16  , Expand);
399
8.31k
    setOperationAction(ISD::CTPOP          , MVT::i32  , Expand);
400
8.31k
    if (Subtarget.is64Bit())
401
5.93k
      setOperationAction(ISD::CTPOP        , MVT::i64  , Expand);
402
8.31k
  }
403
9.65k
404
9.65k
  setOperationAction(ISD::READCYCLECOUNTER , MVT::i64  , Custom);
405
9.65k
406
9.65k
  if (!Subtarget.hasMOVBE())
407
8.81k
    setOperationAction(ISD::BSWAP          , MVT::i16  , Expand);
408
9.65k
409
9.65k
  // These should be promoted to a larger select which is supported.
410
9.65k
  setOperationAction(ISD::SELECT          , MVT::i1   , Promote);
411
9.65k
  // X86 wants to expand cmov itself.
412
38.5k
  for (auto VT : { MVT::f32, MVT::f64, MVT::f80, MVT::f128 }) {
413
38.5k
    setOperationAction(ISD::SELECT, VT, Custom);
414
38.5k
    setOperationAction(ISD::SETCC, VT, Custom);
415
38.5k
  }
416
38.5k
  for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
417
38.5k
    if (
VT == MVT::i64 && 38.5k
!Subtarget.is64Bit()9.64k
)
418
2.53k
      continue;
419
36.0k
    setOperationAction(ISD::SELECT, VT, Custom);
420
36.0k
    setOperationAction(ISD::SETCC,  VT, Custom);
421
36.0k
  }
422
9.65k
423
9.65k
  // Custom action for SELECT MMX and expand action for SELECT_CC MMX
424
9.65k
  setOperationAction(ISD::SELECT, MVT::x86mmx, Custom);
425
9.65k
  setOperationAction(ISD::SELECT_CC, MVT::x86mmx, Expand);
426
9.65k
427
9.65k
  setOperationAction(ISD::EH_RETURN       , MVT::Other, Custom);
428
9.65k
  // NOTE: EH_SJLJ_SETJMP/_LONGJMP supported here is NOT intended to support
429
9.65k
  // SjLj exception handling but a light-weight setjmp/longjmp replacement to
430
9.65k
  // support continuation, user-level threading, and etc.. As a result, no
431
9.65k
  // other SjLj exception interfaces are implemented and please don't build
432
9.65k
  // your own exception handling based on them.
433
9.65k
  // LLVM/Clang supports zero-cost DWARF exception handling.
434
9.65k
  setOperationAction(ISD::EH_SJLJ_SETJMP, MVT::i32, Custom);
435
9.65k
  setOperationAction(ISD::EH_SJLJ_LONGJMP, MVT::Other, Custom);
436
9.65k
  setOperationAction(ISD::EH_SJLJ_SETUP_DISPATCH, MVT::Other, Custom);
437
9.65k
  if (TM.Options.ExceptionModel == ExceptionHandling::SjLj)
438
2
    setLibcallName(RTLIB::UNWIND_RESUME, "_Unwind_SjLj_Resume");
439
9.65k
440
9.65k
  // Darwin ABI issue.
441
19.2k
  for (auto VT : { MVT::i32, MVT::i64 }) {
442
19.2k
    if (
VT == MVT::i64 && 19.2k
!Subtarget.is64Bit()9.65k
)
443
2.53k
      continue;
444
16.7k
    setOperationAction(ISD::ConstantPool    , VT, Custom);
445
16.7k
    setOperationAction(ISD::JumpTable       , VT, Custom);
446
16.7k
    setOperationAction(ISD::GlobalAddress   , VT, Custom);
447
16.7k
    setOperationAction(ISD::GlobalTLSAddress, VT, Custom);
448
16.7k
    setOperationAction(ISD::ExternalSymbol  , VT, Custom);
449
16.7k
    setOperationAction(ISD::BlockAddress    , VT, Custom);
450
16.7k
  }
451
9.65k
452
9.65k
  // 64-bit shl, sra, srl (iff 32-bit x86)
453
19.3k
  for (auto VT : { MVT::i32, MVT::i64 }) {
454
19.3k
    if (
VT == MVT::i64 && 19.3k
!Subtarget.is64Bit()9.64k
)
455
2.53k
      continue;
456
16.7k
    setOperationAction(ISD::SHL_PARTS, VT, Custom);
457
16.7k
    setOperationAction(ISD::SRA_PARTS, VT, Custom);
458
16.7k
    setOperationAction(ISD::SRL_PARTS, VT, Custom);
459
16.7k
  }
460
9.65k
461
9.65k
  if (Subtarget.hasSSE1())
462
8.30k
    setOperationAction(ISD::PREFETCH      , MVT::Other, Legal);
463
9.65k
464
9.65k
  setOperationAction(ISD::ATOMIC_FENCE  , MVT::Other, Custom);
465
9.65k
466
9.65k
  // Expand certain atomics
467
38.5k
  for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
468
38.5k
    setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, VT, Custom);
469
38.5k
    setOperationAction(ISD::ATOMIC_LOAD_SUB, VT, Custom);
470
38.5k
    setOperationAction(ISD::ATOMIC_LOAD_ADD, VT, Custom);
471
38.5k
    setOperationAction(ISD::ATOMIC_LOAD_OR, VT, Custom);
472
38.5k
    setOperationAction(ISD::ATOMIC_LOAD_XOR, VT, Custom);
473
38.5k
    setOperationAction(ISD::ATOMIC_LOAD_AND, VT, Custom);
474
38.5k
    setOperationAction(ISD::ATOMIC_STORE, VT, Custom);
475
38.5k
  }
476
9.65k
477
9.65k
  if (
Subtarget.hasCmpxchg16b()9.65k
) {
478
2.84k
    setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, MVT::i128, Custom);
479
2.84k
  }
480
9.65k
481
9.65k
  // FIXME - use subtarget debug flags
482
9.65k
  if (
!Subtarget.isTargetDarwin() && 9.65k
!Subtarget.isTargetELF()5.55k
&&
483
9.65k
      
!Subtarget.isTargetCygMing()570
&&
!Subtarget.isTargetWin64()493
&&
484
9.65k
      
TM.Options.ExceptionModel != ExceptionHandling::SjLj211
) {
485
211
    setOperationAction(ISD::EH_LABEL, MVT::Other, Expand);
486
211
  }
487
9.65k
488
9.65k
  setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i32, Custom);
489
9.65k
  setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i64, Custom);
490
9.65k
491
9.65k
  setOperationAction(ISD::INIT_TRAMPOLINE, MVT::Other, Custom);
492
9.65k
  setOperationAction(ISD::ADJUST_TRAMPOLINE, MVT::Other, Custom);
493
9.65k
494
9.65k
  setOperationAction(ISD::TRAP, MVT::Other, Legal);
495
9.65k
  setOperationAction(ISD::DEBUGTRAP, MVT::Other, Legal);
496
9.65k
497
9.65k
  // VASTART needs to be custom lowered to use the VarArgsFrameIndex
498
9.65k
  setOperationAction(ISD::VASTART           , MVT::Other, Custom);
499
9.65k
  setOperationAction(ISD::VAEND             , MVT::Other, Expand);
500
9.65k
  bool Is64Bit = Subtarget.is64Bit();
501
9.65k
  setOperationAction(ISD::VAARG,  MVT::Other, Is64Bit ? 
Custom7.11k
:
Expand2.53k
);
502
9.65k
  setOperationAction(ISD::VACOPY, MVT::Other, Is64Bit ? 
Custom7.11k
:
Expand2.53k
);
503
9.65k
504
9.65k
  setOperationAction(ISD::STACKSAVE,          MVT::Other, Expand);
505
9.65k
  setOperationAction(ISD::STACKRESTORE,       MVT::Other, Expand);
506
9.65k
507
9.65k
  setOperationAction(ISD::DYNAMIC_STACKALLOC, PtrVT, Custom);
508
9.65k
509
9.65k
  // GC_TRANSITION_START and GC_TRANSITION_END need custom lowering.
510
9.65k
  setOperationAction(ISD::GC_TRANSITION_START, MVT::Other, Custom);
511
9.65k
  setOperationAction(ISD::GC_TRANSITION_END, MVT::Other, Custom);
512
9.65k
513
9.65k
  if (
!Subtarget.useSoftFloat() && 9.65k
X86ScalarSSEf649.63k
) {
514
8.24k
    // f32 and f64 use SSE.
515
8.24k
    // Set up the FP register classes.
516
726
    addRegisterClass(MVT::f32, Subtarget.hasAVX512() ? &X86::FR32XRegClass
517
7.52k
                                                     : &X86::FR32RegClass);
518
726
    addRegisterClass(MVT::f64, Subtarget.hasAVX512() ? &X86::FR64XRegClass
519
7.52k
                                                     : &X86::FR64RegClass);
520
8.24k
521
16.4k
    for (auto VT : { MVT::f32, MVT::f64 }) {
522
16.4k
      // Use ANDPD to simulate FABS.
523
16.4k
      setOperationAction(ISD::FABS, VT, Custom);
524
16.4k
525
16.4k
      // Use XORP to simulate FNEG.
526
16.4k
      setOperationAction(ISD::FNEG, VT, Custom);
527
16.4k
528
16.4k
      // Use ANDPD and ORPD to simulate FCOPYSIGN.
529
16.4k
      setOperationAction(ISD::FCOPYSIGN, VT, Custom);
530
16.4k
531
16.4k
      // We don't support sin/cos/fmod
532
16.4k
      setOperationAction(ISD::FSIN   , VT, Expand);
533
16.4k
      setOperationAction(ISD::FCOS   , VT, Expand);
534
16.4k
      setOperationAction(ISD::FSINCOS, VT, Expand);
535
16.4k
    }
536
8.24k
537
8.24k
    // Lower this to MOVMSK plus an AND.
538
8.24k
    setOperationAction(ISD::FGETSIGN, MVT::i64, Custom);
539
8.24k
    setOperationAction(ISD::FGETSIGN, MVT::i32, Custom);
540
8.24k
541
8.24k
    // Expand FP immediates into loads from the stack, except for the special
542
8.24k
    // cases we handle.
543
8.24k
    addLegalFPImmediate(APFloat(+0.0)); // xorpd
544
8.24k
    addLegalFPImmediate(APFloat(+0.0f)); // xorps
545
9.65k
  } else 
if (1.40k
UseX87 && 1.40k
X86ScalarSSEf321.38k
) {
546
43
    // Use SSE for f32, x87 for f64.
547
43
    // Set up the FP register classes.
548
0
    addRegisterClass(MVT::f32, Subtarget.hasAVX512() ? &X86::FR32XRegClass
549
43
                                                     : &X86::FR32RegClass);
550
43
    addRegisterClass(MVT::f64, &X86::RFP64RegClass);
551
43
552
43
    // Use ANDPS to simulate FABS.
553
43
    setOperationAction(ISD::FABS , MVT::f32, Custom);
554
43
555
43
    // Use XORP to simulate FNEG.
556
43
    setOperationAction(ISD::FNEG , MVT::f32, Custom);
557
43
558
43
    setOperationAction(ISD::UNDEF,     MVT::f64, Expand);
559
43
560
43
    // Use ANDPS and ORPS to simulate FCOPYSIGN.
561
43
    setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
562
43
    setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom);
563
43
564
43
    // We don't support sin/cos/fmod
565
43
    setOperationAction(ISD::FSIN   , MVT::f32, Expand);
566
43
    setOperationAction(ISD::FCOS   , MVT::f32, Expand);
567
43
    setOperationAction(ISD::FSINCOS, MVT::f32, Expand);
568
43
569
43
    // Special cases we handle for FP constants.
570
43
    addLegalFPImmediate(APFloat(+0.0f)); // xorps
571
43
    addLegalFPImmediate(APFloat(+0.0)); // FLD0
572
43
    addLegalFPImmediate(APFloat(+1.0)); // FLD1
573
43
    addLegalFPImmediate(APFloat(-0.0)); // FLD0/FCHS
574
43
    addLegalFPImmediate(APFloat(-1.0)); // FLD1/FCHS
575
43
576
43
    // Always expand sin/cos functions even though x87 has an instruction.
577
43
    setOperationAction(ISD::FSIN   , MVT::f64, Expand);
578
43
    setOperationAction(ISD::FCOS   , MVT::f64, Expand);
579
43
    setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
580
1.40k
  } else 
if (1.36k
UseX871.36k
) {
581
1.33k
    // f32 and f64 in x87.
582
1.33k
    // Set up the FP register classes.
583
1.33k
    addRegisterClass(MVT::f64, &X86::RFP64RegClass);
584
1.33k
    addRegisterClass(MVT::f32, &X86::RFP32RegClass);
585
1.33k
586
2.67k
    for (auto VT : { MVT::f32, MVT::f64 }) {
587
2.67k
      setOperationAction(ISD::UNDEF,     VT, Expand);
588
2.67k
      setOperationAction(ISD::FCOPYSIGN, VT, Expand);
589
2.67k
590
2.67k
      // Always expand sin/cos functions even though x87 has an instruction.
591
2.67k
      setOperationAction(ISD::FSIN   , VT, Expand);
592
2.67k
      setOperationAction(ISD::FCOS   , VT, Expand);
593
2.67k
      setOperationAction(ISD::FSINCOS, VT, Expand);
594
2.67k
    }
595
1.40k
    addLegalFPImmediate(APFloat(+0.0)); // FLD0
596
1.40k
    addLegalFPImmediate(APFloat(+1.0)); // FLD1
597
1.40k
    addLegalFPImmediate(APFloat(-0.0)); // FLD0/FCHS
598
1.40k
    addLegalFPImmediate(APFloat(-1.0)); // FLD1/FCHS
599
1.40k
    addLegalFPImmediate(APFloat(+0.0f)); // FLD0
600
1.40k
    addLegalFPImmediate(APFloat(+1.0f)); // FLD1
601
1.40k
    addLegalFPImmediate(APFloat(-0.0f)); // FLD0/FCHS
602
1.40k
    addLegalFPImmediate(APFloat(-1.0f)); // FLD1/FCHS
603
1.40k
  }
604
9.65k
605
9.65k
  // We don't support FMA.
606
9.65k
  setOperationAction(ISD::FMA, MVT::f64, Expand);
607
9.65k
  setOperationAction(ISD::FMA, MVT::f32, Expand);
608
9.65k
609
9.65k
  // Long double always uses X87, except f128 in MMX.
610
9.65k
  if (
UseX879.65k
) {
611
9.63k
    if (
Subtarget.is64Bit() && 9.63k
Subtarget.hasMMX()7.10k
) {
612
2.63k
      addRegisterClass(MVT::f128, &X86::FR128RegClass);
613
2.63k
      ValueTypeActions.setTypeAction(MVT::f128, TypeSoftenFloat);
614
2.63k
      setOperationAction(ISD::FABS , MVT::f128, Custom);
615
2.63k
      setOperationAction(ISD::FNEG , MVT::f128, Custom);
616
2.63k
      setOperationAction(ISD::FCOPYSIGN, MVT::f128, Custom);
617
2.63k
    }
618
9.63k
619
9.63k
    addRegisterClass(MVT::f80, &X86::RFP80RegClass);
620
9.63k
    setOperationAction(ISD::UNDEF,     MVT::f80, Expand);
621
9.63k
    setOperationAction(ISD::FCOPYSIGN, MVT::f80, Expand);
622
9.63k
    {
623
9.63k
      APFloat TmpFlt = APFloat::getZero(APFloat::x87DoubleExtended());
624
9.63k
      addLegalFPImmediate(TmpFlt);  // FLD0
625
9.63k
      TmpFlt.changeSign();
626
9.63k
      addLegalFPImmediate(TmpFlt);  // FLD0/FCHS
627
9.63k
628
9.63k
      bool ignored;
629
9.63k
      APFloat TmpFlt2(+1.0);
630
9.63k
      TmpFlt2.convert(APFloat::x87DoubleExtended(), APFloat::rmNearestTiesToEven,
631
9.63k
                      &ignored);
632
9.63k
      addLegalFPImmediate(TmpFlt2);  // FLD1
633
9.63k
      TmpFlt2.changeSign();
634
9.63k
      addLegalFPImmediate(TmpFlt2);  // FLD1/FCHS
635
9.63k
    }
636
9.63k
637
9.63k
    // Always expand sin/cos functions even though x87 has an instruction.
638
9.63k
    setOperationAction(ISD::FSIN   , MVT::f80, Expand);
639
9.63k
    setOperationAction(ISD::FCOS   , MVT::f80, Expand);
640
9.63k
    setOperationAction(ISD::FSINCOS, MVT::f80, Expand);
641
9.63k
642
9.63k
    setOperationAction(ISD::FFLOOR, MVT::f80, Expand);
643
9.63k
    setOperationAction(ISD::FCEIL,  MVT::f80, Expand);
644
9.63k
    setOperationAction(ISD::FTRUNC, MVT::f80, Expand);
645
9.63k
    setOperationAction(ISD::FRINT,  MVT::f80, Expand);
646
9.63k
    setOperationAction(ISD::FNEARBYINT, MVT::f80, Expand);
647
9.63k
    setOperationAction(ISD::FMA, MVT::f80, Expand);
648
9.63k
  }
649
9.65k
650
9.65k
  // Always use a library call for pow.
651
9.65k
  setOperationAction(ISD::FPOW             , MVT::f32  , Expand);
652
9.65k
  setOperationAction(ISD::FPOW             , MVT::f64  , Expand);
653
9.65k
  setOperationAction(ISD::FPOW             , MVT::f80  , Expand);
654
9.65k
655
9.65k
  setOperationAction(ISD::FLOG, MVT::f80, Expand);
656
9.65k
  setOperationAction(ISD::FLOG2, MVT::f80, Expand);
657
9.65k
  setOperationAction(ISD::FLOG10, MVT::f80, Expand);
658
9.65k
  setOperationAction(ISD::FEXP, MVT::f80, Expand);
659
9.65k
  setOperationAction(ISD::FEXP2, MVT::f80, Expand);
660
9.65k
  setOperationAction(ISD::FMINNUM, MVT::f80, Expand);
661
9.65k
  setOperationAction(ISD::FMAXNUM, MVT::f80, Expand);
662
9.65k
663
9.65k
  // Some FP actions are always expanded for vector types.
664
9.65k
  for (auto VT : { MVT::v4f32, MVT::v8f32, MVT::v16f32,
665
57.8k
                   MVT::v2f64, MVT::v4f64, MVT::v8f64 }) {
666
57.8k
    setOperationAction(ISD::FSIN,      VT, Expand);
667
57.8k
    setOperationAction(ISD::FSINCOS,   VT, Expand);
668
57.8k
    setOperationAction(ISD::FCOS,      VT, Expand);
669
57.8k
    setOperationAction(ISD::FREM,      VT, Expand);
670
57.8k
    setOperationAction(ISD::FCOPYSIGN, VT, Expand);
671
57.8k
    setOperationAction(ISD::FPOW,      VT, Expand);
672
57.8k
    setOperationAction(ISD::FLOG,      VT, Expand);
673
57.8k
    setOperationAction(ISD::FLOG2,     VT, Expand);
674
57.8k
    setOperationAction(ISD::FLOG10,    VT, Expand);
675
57.8k
    setOperationAction(ISD::FEXP,      VT, Expand);
676
57.8k
    setOperationAction(ISD::FEXP2,     VT, Expand);
677
57.8k
  }
678
9.65k
679
9.65k
  // First set operation action for all vector types to either promote
680
9.65k
  // (for widening) or expand (for scalarization). Then we will selectively
681
9.65k
  // turn on ones that can be effectively codegen'd.
682
907k
  for (MVT VT : MVT::vector_valuetypes()) {
683
907k
    setOperationAction(ISD::SDIV, VT, Expand);
684
907k
    setOperationAction(ISD::UDIV, VT, Expand);
685
907k
    setOperationAction(ISD::SREM, VT, Expand);
686
907k
    setOperationAction(ISD::UREM, VT, Expand);
687
907k
    setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT,Expand);
688
907k
    setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Expand);
689
907k
    setOperationAction(ISD::EXTRACT_SUBVECTOR, VT,Expand);
690
907k
    setOperationAction(ISD::INSERT_SUBVECTOR, VT,Expand);
691
907k
    setOperationAction(ISD::FMA,  VT, Expand);
692
907k
    setOperationAction(ISD::FFLOOR, VT, Expand);
693
907k
    setOperationAction(ISD::FCEIL, VT, Expand);
694
907k
    setOperationAction(ISD::FTRUNC, VT, Expand);
695
907k
    setOperationAction(ISD::FRINT, VT, Expand);
696
907k
    setOperationAction(ISD::FNEARBYINT, VT, Expand);
697
907k
    setOperationAction(ISD::SMUL_LOHI, VT, Expand);
698
907k
    setOperationAction(ISD::MULHS, VT, Expand);
699
907k
    setOperationAction(ISD::UMUL_LOHI, VT, Expand);
700
907k
    setOperationAction(ISD::MULHU, VT, Expand);
701
907k
    setOperationAction(ISD::SDIVREM, VT, Expand);
702
907k
    setOperationAction(ISD::UDIVREM, VT, Expand);
703
907k
    setOperationAction(ISD::CTPOP, VT, Expand);
704
907k
    setOperationAction(ISD::CTTZ, VT, Expand);
705
907k
    setOperationAction(ISD::CTLZ, VT, Expand);
706
907k
    setOperationAction(ISD::ROTL, VT, Expand);
707
907k
    setOperationAction(ISD::ROTR, VT, Expand);
708
907k
    setOperationAction(ISD::BSWAP, VT, Expand);
709
907k
    setOperationAction(ISD::SETCC, VT, Expand);
710
907k
    setOperationAction(ISD::FP_TO_UINT, VT, Expand);
711
907k
    setOperationAction(ISD::FP_TO_SINT, VT, Expand);
712
907k
    setOperationAction(ISD::UINT_TO_FP, VT, Expand);
713
907k
    setOperationAction(ISD::SINT_TO_FP, VT, Expand);
714
907k
    setOperationAction(ISD::SIGN_EXTEND_INREG, VT,Expand);
715
907k
    setOperationAction(ISD::TRUNCATE, VT, Expand);
716
907k
    setOperationAction(ISD::SIGN_EXTEND, VT, Expand);
717
907k
    setOperationAction(ISD::ZERO_EXTEND, VT, Expand);
718
907k
    setOperationAction(ISD::ANY_EXTEND, VT, Expand);
719
907k
    setOperationAction(ISD::SELECT_CC, VT, Expand);
720
85.2M
    for (MVT InnerVT : MVT::vector_valuetypes()) {
721
85.2M
      setTruncStoreAction(InnerVT, VT, Expand);
722
85.2M
723
85.2M
      setLoadExtAction(ISD::SEXTLOAD, InnerVT, VT, Expand);
724
85.2M
      setLoadExtAction(ISD::ZEXTLOAD, InnerVT, VT, Expand);
725
85.2M
726
85.2M
      // N.b. ISD::EXTLOAD legality is basically ignored except for i1-like
727
85.2M
      // types, we have to deal with them whether we ask for Expansion or not.
728
85.2M
      // Setting Expand causes its own optimisation problems though, so leave
729
85.2M
      // them legal.
730
85.2M
      if (VT.getVectorElementType() == MVT::i1)
731
13.6M
        setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand);
732
85.2M
733
85.2M
      // EXTLOAD for MVT::f16 vectors is not legal because f16 vectors are
734
85.2M
      // split/scalarized right now.
735
85.2M
      if (VT.getVectorElementType() == MVT::f16)
736
5.44M
        setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand);
737
85.2M
    }
738
907k
  }
739
9.65k
740
9.65k
  // FIXME: In order to prevent SSE instructions being expanded to MMX ones
741
9.65k
  // with -msoft-float, disable use of MMX as well.
742
9.65k
  if (
!Subtarget.useSoftFloat() && 9.65k
Subtarget.hasMMX()9.63k
) {
743
3.43k
    addRegisterClass(MVT::x86mmx, &X86::VR64RegClass);
744
3.43k
    // No operations on x86mmx supported, everything uses intrinsics.
745
3.43k
  }
746
9.65k
747
9.65k
  if (
!Subtarget.useSoftFloat() && 9.65k
Subtarget.hasSSE1()9.63k
) {
748
317
    addRegisterClass(MVT::v4f32, Subtarget.hasVLX() ? &X86::VR128XRegClass
749
7.97k
                                                    : &X86::VR128RegClass);
750
8.29k
751
8.29k
    setOperationAction(ISD::FNEG,               MVT::v4f32, Custom);
752
8.29k
    setOperationAction(ISD::FABS,               MVT::v4f32, Custom);
753
8.29k
    setOperationAction(ISD::FCOPYSIGN,          MVT::v4f32, Custom);
754
8.29k
    setOperationAction(ISD::BUILD_VECTOR,       MVT::v4f32, Custom);
755
8.29k
    setOperationAction(ISD::VECTOR_SHUFFLE,     MVT::v4f32, Custom);
756
8.29k
    setOperationAction(ISD::VSELECT,            MVT::v4f32, Custom);
757
8.29k
    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom);
758
8.29k
    setOperationAction(ISD::SELECT,             MVT::v4f32, Custom);
759
8.29k
    setOperationAction(ISD::UINT_TO_FP,         MVT::v4i32, Custom);
760
8.29k
  }
761
9.65k
762
9.65k
  if (
!Subtarget.useSoftFloat() && 9.65k
Subtarget.hasSSE2()9.63k
) {
763
317
    addRegisterClass(MVT::v2f64, Subtarget.hasVLX() ? &X86::VR128XRegClass
764
7.93k
                                                    : &X86::VR128RegClass);
765
8.25k
766
8.25k
    // FIXME: Unfortunately, -soft-float and -no-implicit-float mean XMM
767
8.25k
    // registers cannot be used even for integer operations.
768
317
    addRegisterClass(MVT::v16i8, Subtarget.hasVLX() ? &X86::VR128XRegClass
769
7.93k
                                                    : &X86::VR128RegClass);
770
317
    addRegisterClass(MVT::v8i16, Subtarget.hasVLX() ? &X86::VR128XRegClass
771
7.93k
                                                    : &X86::VR128RegClass);
772
317
    addRegisterClass(MVT::v4i32, Subtarget.hasVLX() ? &X86::VR128XRegClass
773
7.93k
                                                    : &X86::VR128RegClass);
774
317
    addRegisterClass(MVT::v2i64, Subtarget.hasVLX() ? &X86::VR128XRegClass
775
7.93k
                                                    : &X86::VR128RegClass);
776
8.25k
777
8.25k
    setOperationAction(ISD::MUL,                MVT::v16i8, Custom);
778
8.25k
    setOperationAction(ISD::MUL,                MVT::v4i32, Custom);
779
8.25k
    setOperationAction(ISD::MUL,                MVT::v2i64, Custom);
780
8.25k
    setOperationAction(ISD::UMUL_LOHI,          MVT::v4i32, Custom);
781
8.25k
    setOperationAction(ISD::SMUL_LOHI,          MVT::v4i32, Custom);
782
8.25k
    setOperationAction(ISD::MULHU,              MVT::v16i8, Custom);
783
8.25k
    setOperationAction(ISD::MULHS,              MVT::v16i8, Custom);
784
8.25k
    setOperationAction(ISD::MULHU,              MVT::v8i16, Legal);
785
8.25k
    setOperationAction(ISD::MULHS,              MVT::v8i16, Legal);
786
8.25k
    setOperationAction(ISD::MUL,                MVT::v8i16, Legal);
787
8.25k
    setOperationAction(ISD::FNEG,               MVT::v2f64, Custom);
788
8.25k
    setOperationAction(ISD::FABS,               MVT::v2f64, Custom);
789
8.25k
    setOperationAction(ISD::FCOPYSIGN,          MVT::v2f64, Custom);
790
8.25k
791
8.25k
    setOperationAction(ISD::SMAX,               MVT::v8i16, Legal);
792
8.25k
    setOperationAction(ISD::UMAX,               MVT::v16i8, Legal);
793
8.25k
    setOperationAction(ISD::SMIN,               MVT::v8i16, Legal);
794
8.25k
    setOperationAction(ISD::UMIN,               MVT::v16i8, Legal);
795
8.25k
796
8.25k
    setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v8i16, Custom);
797
8.25k
    setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v4i32, Custom);
798
8.25k
    setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v4f32, Custom);
799
8.25k
800
32.9k
    for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
801
32.9k
      setOperationAction(ISD::SETCC,              VT, Custom);
802
32.9k
      setOperationAction(ISD::CTPOP,              VT, Custom);
803
32.9k
      setOperationAction(ISD::CTTZ,               VT, Custom);
804
32.9k
    }
805
8.25k
806
24.7k
    for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32 }) {
807
24.7k
      setOperationAction(ISD::SCALAR_TO_VECTOR,   VT, Custom);
808
24.7k
      setOperationAction(ISD::BUILD_VECTOR,       VT, Custom);
809
24.7k
      setOperationAction(ISD::VECTOR_SHUFFLE,     VT, Custom);
810
24.7k
      setOperationAction(ISD::VSELECT,            VT, Custom);
811
24.7k
      setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
812
24.7k
    }
813
8.25k
814
8.25k
    // We support custom legalizing of sext and anyext loads for specific
815
8.25k
    // memory vector types which we can load as a scalar (or sequence of
816
8.25k
    // scalars) and extend in-register to a legal 128-bit vector type. For sext
817
8.25k
    // loads these must work with a single scalar load.
818
577k
    for (MVT VT : MVT::integer_vector_valuetypes()) {
819
577k
      setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v4i8, Custom);
820
577k
      setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v4i16, Custom);
821
577k
      setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v8i8, Custom);
822
577k
      setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i8, Custom);
823
577k
      setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i16, Custom);
824
577k
      setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i32, Custom);
825
577k
      setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4i8, Custom);
826
577k
      setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4i16, Custom);
827
577k
      setLoadExtAction(ISD::EXTLOAD, VT, MVT::v8i8, Custom);
828
577k
    }
829
8.25k
830
16.5k
    for (auto VT : { MVT::v2f64, MVT::v2i64 }) {
831
16.5k
      setOperationAction(ISD::BUILD_VECTOR,       VT, Custom);
832
16.5k
      setOperationAction(ISD::VECTOR_SHUFFLE,     VT, Custom);
833
16.5k
      setOperationAction(ISD::VSELECT,            VT, Custom);
834
16.5k
835
16.5k
      if (
VT == MVT::v2i64 && 16.5k
!Subtarget.is64Bit()8.25k
)
836
1.16k
        continue;
837
15.3k
838
15.3k
      setOperationAction(ISD::INSERT_VECTOR_ELT,  VT, Custom);
839
15.3k
      setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
840
15.3k
    }
841
8.25k
842
8.25k
    // Promote v16i8, v8i16, v4i32 load, select, and, or, xor to v2i64.
843
24.7k
    for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32 }) {
844
24.7k
      setOperationPromotedToType(ISD::AND,    VT, MVT::v2i64);
845
24.7k
      setOperationPromotedToType(ISD::OR,     VT, MVT::v2i64);
846
24.7k
      setOperationPromotedToType(ISD::XOR,    VT, MVT::v2i64);
847
24.7k
      setOperationPromotedToType(ISD::LOAD,   VT, MVT::v2i64);
848
24.7k
      setOperationPromotedToType(ISD::SELECT, VT, MVT::v2i64);
849
24.7k
    }
850
8.25k
851
8.25k
    // Custom lower v2i64 and v2f64 selects.
852
8.25k
    setOperationAction(ISD::SELECT,             MVT::v2f64, Custom);
853
8.25k
    setOperationAction(ISD::SELECT,             MVT::v2i64, Custom);
854
8.25k
855
8.25k
    setOperationAction(ISD::FP_TO_SINT,         MVT::v4i32, Legal);
856
8.25k
    setOperationAction(ISD::FP_TO_SINT,         MVT::v2i32, Custom);
857
8.25k
858
8.25k
    setOperationAction(ISD::SINT_TO_FP,         MVT::v4i32, Legal);
859
8.25k
    setOperationAction(ISD::SINT_TO_FP,         MVT::v2i32, Custom);
860
8.25k
861
8.25k
    setOperationAction(ISD::UINT_TO_FP,         MVT::v4i8,  Custom);
862
8.25k
    setOperationAction(ISD::UINT_TO_FP,         MVT::v4i16, Custom);
863
8.25k
    setOperationAction(ISD::UINT_TO_FP,         MVT::v2i32, Custom);
864
8.25k
865
8.25k
    // Fast v2f32 UINT_TO_FP( v2i32 ) custom conversion.
866
8.25k
    setOperationAction(ISD::UINT_TO_FP,         MVT::v2f32, Custom);
867
8.25k
868
8.25k
    setOperationAction(ISD::FP_EXTEND,          MVT::v2f32, Custom);
869
8.25k
    setOperationAction(ISD::FP_ROUND,           MVT::v2f32, Custom);
870
8.25k
871
8.25k
    for (MVT VT : MVT::fp_vector_valuetypes())
872
197k
      setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2f32, Legal);
873
8.25k
874
8.25k
    setOperationAction(ISD::BITCAST,            MVT::v2i32, Custom);
875
8.25k
    setOperationAction(ISD::BITCAST,            MVT::v4i16, Custom);
876
8.25k
    setOperationAction(ISD::BITCAST,            MVT::v8i8,  Custom);
877
8.25k
878
8.25k
    setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v2i64, Custom);
879
8.25k
    setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v4i32, Custom);
880
8.25k
    setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v8i16, Custom);
881
8.25k
882
8.25k
    // In the customized shift lowering, the legal v4i32/v2i64 cases
883
8.25k
    // in AVX2 will be recognized.
884
32.9k
    for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
885
32.9k
      setOperationAction(ISD::SRL,              VT, Custom);
886
32.9k
      setOperationAction(ISD::SHL,              VT, Custom);
887
32.9k
      setOperationAction(ISD::SRA,              VT, Custom);
888
32.9k
    }
889
8.25k
  }
890
9.65k
891
9.65k
  if (
!Subtarget.useSoftFloat() && 9.65k
Subtarget.hasSSSE3()9.63k
) {
892
3.96k
    setOperationAction(ISD::ABS,                MVT::v16i8, Legal);
893
3.96k
    setOperationAction(ISD::ABS,                MVT::v8i16, Legal);
894
3.96k
    setOperationAction(ISD::ABS,                MVT::v4i32, Legal);
895
3.96k
    setOperationAction(ISD::BITREVERSE,         MVT::v16i8, Custom);
896
3.96k
    setOperationAction(ISD::CTLZ,               MVT::v16i8, Custom);
897
3.96k
    setOperationAction(ISD::CTLZ,               MVT::v8i16, Custom);
898
3.96k
    setOperationAction(ISD::CTLZ,               MVT::v4i32, Custom);
899
3.96k
    setOperationAction(ISD::CTLZ,               MVT::v2i64, Custom);
900
3.96k
  }
901
9.65k
902
9.65k
  if (
!Subtarget.useSoftFloat() && 9.65k
Subtarget.hasSSE41()9.63k
) {
903
11.3k
    for (MVT RoundedTy : {MVT::f32, MVT::f64, MVT::v4f32, MVT::v2f64}) {
904
11.3k
      setOperationAction(ISD::FFLOOR,           RoundedTy,  Legal);
905
11.3k
      setOperationAction(ISD::FCEIL,            RoundedTy,  Legal);
906
11.3k
      setOperationAction(ISD::FTRUNC,           RoundedTy,  Legal);
907
11.3k
      setOperationAction(ISD::FRINT,            RoundedTy,  Legal);
908
11.3k
      setOperationAction(ISD::FNEARBYINT,       RoundedTy,  Legal);
909
11.3k
    }
910
2.83k
911
2.83k
    setOperationAction(ISD::SMAX,               MVT::v16i8, Legal);
912
2.83k
    setOperationAction(ISD::SMAX,               MVT::v4i32, Legal);
913
2.83k
    setOperationAction(ISD::UMAX,               MVT::v8i16, Legal);
914
2.83k
    setOperationAction(ISD::UMAX,               MVT::v4i32, Legal);
915
2.83k
    setOperationAction(ISD::SMIN,               MVT::v16i8, Legal);
916
2.83k
    setOperationAction(ISD::SMIN,               MVT::v4i32, Legal);
917
2.83k
    setOperationAction(ISD::UMIN,               MVT::v8i16, Legal);
918
2.83k
    setOperationAction(ISD::UMIN,               MVT::v4i32, Legal);
919
2.83k
920
2.83k
    // FIXME: Do we need to handle scalar-to-vector here?
921
2.83k
    setOperationAction(ISD::MUL,                MVT::v4i32, Legal);
922
2.83k
923
2.83k
    // We directly match byte blends in the backend as they match the VSELECT
924
2.83k
    // condition form.
925
2.83k
    setOperationAction(ISD::VSELECT,            MVT::v16i8, Legal);
926
2.83k
927
2.83k
    // SSE41 brings specific instructions for doing vector sign extend even in
928
2.83k
    // cases where we don't have SRA.
929
8.51k
    for (auto VT : { MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
930
8.51k
      setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, VT, Legal);
931
8.51k
      setOperationAction(ISD::ZERO_EXTEND_VECTOR_INREG, VT, Legal);
932
8.51k
    }
933
2.83k
934
198k
    for (MVT VT : MVT::integer_vector_valuetypes()) {
935
198k
      setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i8, Custom);
936
198k
      setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i16, Custom);
937
198k
      setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i32, Custom);
938
198k
    }
939
2.83k
940
2.83k
    // SSE41 also has vector sign/zero extending loads, PMOV[SZ]X
941
5.67k
    for (auto LoadExtOp : { ISD::SEXTLOAD, ISD::ZEXTLOAD }) {
942
5.67k
      setLoadExtAction(LoadExtOp, MVT::v8i16, MVT::v8i8,  Legal);
943
5.67k
      setLoadExtAction(LoadExtOp, MVT::v4i32, MVT::v4i8,  Legal);
944
5.67k
      setLoadExtAction(LoadExtOp, MVT::v2i32, MVT::v2i8,  Legal);
945
5.67k
      setLoadExtAction(LoadExtOp, MVT::v2i64, MVT::v2i8,  Legal);
946
5.67k
      setLoadExtAction(LoadExtOp, MVT::v4i32, MVT::v4i16, Legal);
947
5.67k
      setLoadExtAction(LoadExtOp, MVT::v2i64, MVT::v2i16, Legal);
948
5.67k
      setLoadExtAction(LoadExtOp, MVT::v2i64, MVT::v2i32, Legal);
949
5.67k
    }
950
2.83k
951
2.83k
    // i8 vectors are custom because the source register and source
952
2.83k
    // source memory operand types are not the same width.
953
2.83k
    setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v16i8, Custom);
954
2.83k
  }
955
9.65k
956
9.65k
  if (
!Subtarget.useSoftFloat() && 9.65k
Subtarget.hasXOP()9.62k
) {
957
110
    for (auto VT : { MVT::v16i8, MVT::v8i16,  MVT::v4i32, MVT::v2i64,
958
110
                     MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 })
959
880
      setOperationAction(ISD::ROTL, VT, Custom);
960
110
961
110
    // XOP can efficiently perform BITREVERSE with VPPERM.
962
110
    for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 })
963
440
      setOperationAction(ISD::BITREVERSE, VT, Custom);
964
110
965
110
    for (auto VT : { MVT::v16i8, MVT::v8i16,  MVT::v4i32, MVT::v2i64,
966
110
                     MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 })
967
880
      setOperationAction(ISD::BITREVERSE, VT, Custom);
968
110
  }
969
9.65k
970
9.65k
  if (
!Subtarget.useSoftFloat() && 9.65k
Subtarget.hasFp256()9.63k
) {
971
2.28k
    bool HasInt256 = Subtarget.hasInt256();
972
2.28k
973
317
    addRegisterClass(MVT::v32i8,  Subtarget.hasVLX() ? &X86::VR256XRegClass
974
1.97k
                                                     : &X86::VR256RegClass);
975
317
    addRegisterClass(MVT::v16i16, Subtarget.hasVLX() ? &X86::VR256XRegClass
976
1.97k
                                                     : &X86::VR256RegClass);
977
317
    addRegisterClass(MVT::v8i32,  Subtarget.hasVLX() ? &X86::VR256XRegClass
978
1.97k
                                                     : &X86::VR256RegClass);
979
317
    addRegisterClass(MVT::v8f32,  Subtarget.hasVLX() ? &X86::VR256XRegClass
980
1.97k
                                                     : &X86::VR256RegClass);
981
317
    addRegisterClass(MVT::v4i64,  Subtarget.hasVLX() ? &X86::VR256XRegClass
982
1.97k
                                                     : &X86::VR256RegClass);
983
317
    addRegisterClass(MVT::v4f64,  Subtarget.hasVLX() ? &X86::VR256XRegClass
984
1.97k
                                                     : &X86::VR256RegClass);
985
2.28k
986
4.57k
    for (auto VT : { MVT::v8f32, MVT::v4f64 }) {
987
4.57k
      setOperationAction(ISD::FFLOOR,     VT, Legal);
988
4.57k
      setOperationAction(ISD::FCEIL,      VT, Legal);
989
4.57k
      setOperationAction(ISD::FTRUNC,     VT, Legal);
990
4.57k
      setOperationAction(ISD::FRINT,      VT, Legal);
991
4.57k
      setOperationAction(ISD::FNEARBYINT, VT, Legal);
992
4.57k
      setOperationAction(ISD::FNEG,       VT, Custom);
993
4.57k
      setOperationAction(ISD::FABS,       VT, Custom);
994
4.57k
      setOperationAction(ISD::FCOPYSIGN,  VT, Custom);
995
4.57k
    }
996
2.28k
997
2.28k
    // (fp_to_int:v8i16 (v8f32 ..)) requires the result type to be promoted
998
2.28k
    // even though v8i16 is a legal type.
999
2.28k
    setOperationAction(ISD::FP_TO_SINT,         MVT::v8i16, Promote);
1000
2.28k
    setOperationAction(ISD::FP_TO_UINT,         MVT::v8i16, Promote);
1001
2.28k
    setOperationAction(ISD::FP_TO_SINT,         MVT::v8i32, Legal);
1002
2.28k
1003
2.28k
    setOperationAction(ISD::SINT_TO_FP,         MVT::v8i16, Promote);
1004
2.28k
    setOperationAction(ISD::SINT_TO_FP,         MVT::v8i32, Legal);
1005
2.28k
    setOperationAction(ISD::FP_ROUND,           MVT::v4f32, Legal);
1006
2.28k
1007
2.28k
    setOperationAction(ISD::UINT_TO_FP,         MVT::v8i8,  Custom);
1008
2.28k
    setOperationAction(ISD::UINT_TO_FP,         MVT::v8i16, Custom);
1009
2.28k
1010
2.28k
    for (MVT VT : MVT::fp_vector_valuetypes())
1011
54.9k
      setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4f32, Legal);
1012
2.28k
1013
2.28k
    // In the customized shift lowering, the legal v8i32/v4i64 cases
1014
2.28k
    // in AVX2 will be recognized.
1015
9.15k
    for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
1016
9.15k
      setOperationAction(ISD::SRL, VT, Custom);
1017
9.15k
      setOperationAction(ISD::SHL, VT, Custom);
1018
9.15k
      setOperationAction(ISD::SRA, VT, Custom);
1019
9.15k
    }
1020
2.28k
1021
2.28k
    setOperationAction(ISD::SELECT,            MVT::v4f64, Custom);
1022
2.28k
    setOperationAction(ISD::SELECT,            MVT::v4i64, Custom);
1023
2.28k
    setOperationAction(ISD::SELECT,            MVT::v8f32, Custom);
1024
2.28k
1025
6.86k
    for (auto VT : { MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
1026
6.86k
      setOperationAction(ISD::SIGN_EXTEND,     VT, Custom);
1027
6.86k
      setOperationAction(ISD::ZERO_EXTEND,     VT, Custom);
1028
6.86k
      setOperationAction(ISD::ANY_EXTEND,      VT, Custom);
1029
6.86k
    }
1030
2.28k
1031
2.28k
    setOperationAction(ISD::TRUNCATE,          MVT::v16i8, Custom);
1032
2.28k
    setOperationAction(ISD::TRUNCATE,          MVT::v8i16, Custom);
1033
2.28k
    setOperationAction(ISD::TRUNCATE,          MVT::v4i32, Custom);
1034
2.28k
    setOperationAction(ISD::BITREVERSE,        MVT::v32i8, Custom);
1035
2.28k
1036
9.15k
    for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
1037
9.15k
      setOperationAction(ISD::SETCC,           VT, Custom);
1038
9.15k
      setOperationAction(ISD::CTPOP,           VT, Custom);
1039
9.15k
      setOperationAction(ISD::CTTZ,            VT, Custom);
1040
9.15k
      setOperationAction(ISD::CTLZ,            VT, Custom);
1041
9.15k
    }
1042
2.28k
1043
2.28k
    if (Subtarget.hasAnyFMA()) {
1044
1.40k
      for (auto VT : { MVT::f32, MVT::f64, MVT::v4f32, MVT::v8f32,
1045
1.40k
                       MVT::v2f64, MVT::v4f64 })
1046
8.40k
        setOperationAction(ISD::FMA, VT, Legal);
1047
1.40k
    }
1048
2.28k
1049
9.15k
    for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
1050
9.15k
      setOperationAction(ISD::ADD, VT, HasInt256 ? Legal : Custom);
1051
9.15k
      setOperationAction(ISD::SUB, VT, HasInt256 ? Legal : Custom);
1052
9.15k
    }
1053
2.28k
1054
2.28k
    setOperationAction(ISD::MUL,       MVT::v4i64,  Custom);
1055
2.28k
    setOperationAction(ISD::MUL,       MVT::v8i32,  HasInt256 ? Legal : Custom);
1056
2.28k
    setOperationAction(ISD::MUL,       MVT::v16i16, HasInt256 ? Legal : Custom);
1057
2.28k
    setOperationAction(ISD::MUL,       MVT::v32i8,  Custom);
1058
2.28k
1059
2.28k
    setOperationAction(ISD::UMUL_LOHI, MVT::v8i32,  Custom);
1060
2.28k
    setOperationAction(ISD::SMUL_LOHI, MVT::v8i32,  Custom);
1061
2.28k
1062
2.28k
    setOperationAction(ISD::MULHU,     MVT::v16i16, HasInt256 ? Legal : Custom);
1063
2.28k
    setOperationAction(ISD::MULHS,     MVT::v16i16, HasInt256 ? Legal : Custom);
1064
2.28k
    setOperationAction(ISD::MULHU,     MVT::v32i8,  Custom);
1065
2.28k
    setOperationAction(ISD::MULHS,     MVT::v32i8,  Custom);
1066
2.28k
1067
6.86k
    for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32 }) {
1068
6.86k
      setOperationAction(ISD::ABS,  VT, HasInt256 ? Legal : Custom);
1069
6.86k
      setOperationAction(ISD::SMAX, VT, HasInt256 ? Legal : Custom);
1070
6.86k
      setOperationAction(ISD::UMAX, VT, HasInt256 ? Legal : Custom);
1071
6.86k
      setOperationAction(ISD::SMIN, VT, HasInt256 ? Legal : Custom);
1072
6.86k
      setOperationAction(ISD::UMIN, VT, HasInt256 ? Legal : Custom);
1073
6.86k
    }
1074
2.28k
1075
2.28k
    if (HasInt256) {
1076
1.54k
      setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v4i64,  Custom);
1077
1.54k
      setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v8i32,  Custom);
1078
1.54k
      setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v16i16, Custom);
1079
1.54k
1080
1.54k
      // The custom lowering for UINT_TO_FP for v8i32 becomes interesting
1081
1.54k
      // when we have a 256-bit-wide blend with immediate.
1082
1.54k
      setOperationAction(ISD::UINT_TO_FP, MVT::v8i32, Custom);
1083
1.54k
1084
1.54k
      // AVX2 also has wider vector sign/zero extending loads, VPMOV[SZ]X
1085
3.09k
      for (auto LoadExtOp : { ISD::SEXTLOAD, ISD::ZEXTLOAD }) {
1086
3.09k
        setLoadExtAction(LoadExtOp, MVT::v16i16, MVT::v16i8, Legal);
1087
3.09k
        setLoadExtAction(LoadExtOp, MVT::v8i32,  MVT::v8i8,  Legal);
1088
3.09k
        setLoadExtAction(LoadExtOp, MVT::v4i64,  MVT::v4i8,  Legal);
1089
3.09k
        setLoadExtAction(LoadExtOp, MVT::v8i32,  MVT::v8i16, Legal);
1090
3.09k
        setLoadExtAction(LoadExtOp, MVT::v4i64,  MVT::v4i16, Legal);
1091
3.09k
        setLoadExtAction(LoadExtOp, MVT::v4i64,  MVT::v4i32, Legal);
1092
3.09k
      }
1093
1.54k
    }
1094
2.28k
1095
2.28k
    for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
1096
18.3k
                     MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 }) {
1097
18.3k
      setOperationAction(ISD::MLOAD,  VT, Legal);
1098
18.3k
      setOperationAction(ISD::MSTORE, VT, Legal);
1099
18.3k
    }
1100
2.28k
1101
2.28k
    // Extract subvector is special because the value type
1102
2.28k
    // (result) is 128-bit but the source is 256-bit wide.
1103
2.28k
    for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64,
1104
13.7k
                     MVT::v4f32, MVT::v2f64 }) {
1105
13.7k
      setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Legal);
1106
13.7k
    }
1107
2.28k
1108
2.28k
    // Custom lower several nodes for 256-bit types.
1109
2.28k
    for (MVT VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64,
1110
13.7k
                    MVT::v8f32, MVT::v4f64 }) {
1111
13.7k
      setOperationAction(ISD::BUILD_VECTOR,       VT, Custom);
1112
13.7k
      setOperationAction(ISD::VECTOR_SHUFFLE,     VT, Custom);
1113
13.7k
      setOperationAction(ISD::VSELECT,            VT, Custom);
1114
13.7k
      setOperationAction(ISD::INSERT_VECTOR_ELT,  VT, Custom);
1115
13.7k
      setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
1116
13.7k
      setOperationAction(ISD::SCALAR_TO_VECTOR,   VT, Custom);
1117
13.7k
      setOperationAction(ISD::INSERT_SUBVECTOR,   VT, Legal);
1118
13.7k
      setOperationAction(ISD::CONCAT_VECTORS,     VT, Custom);
1119
13.7k
    }
1120
2.28k
1121
2.28k
    if (HasInt256)
1122
1.54k
      setOperationAction(ISD::VSELECT,         MVT::v32i8, Legal);
1123
2.28k
1124
2.28k
    // Promote v32i8, v16i16, v8i32 select, and, or, xor to v4i64.
1125
6.86k
    for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32 }) {
1126
6.86k
      setOperationPromotedToType(ISD::AND,    VT, MVT::v4i64);
1127
6.86k
      setOperationPromotedToType(ISD::OR,     VT, MVT::v4i64);
1128
6.86k
      setOperationPromotedToType(ISD::XOR,    VT, MVT::v4i64);
1129
6.86k
      setOperationPromotedToType(ISD::LOAD,   VT, MVT::v4i64);
1130
6.86k
      setOperationPromotedToType(ISD::SELECT, VT, MVT::v4i64);
1131
6.86k
    }
1132
2.28k
  }
1133
9.65k
1134
9.65k
  if (!Subtarget.useSoftFloat() && Subtarget.hasAVX512()) {
1135
726
    addRegisterClass(MVT::v16i32, &X86::VR512RegClass);
1136
726
    addRegisterClass(MVT::v16f32, &X86::VR512RegClass);
1137
726
    addRegisterClass(MVT::v8i64,  &X86::VR512RegClass);
1138
726
    addRegisterClass(MVT::v8f64,  &X86::VR512RegClass);
1139
726
1140
726
    addRegisterClass(MVT::v1i1,   &X86::VK1RegClass);
1141
726
    addRegisterClass(MVT::v8i1,   &X86::VK8RegClass);
1142
726
    addRegisterClass(MVT::v16i1,  &X86::VK16RegClass);
1143
726
1144
726
    for (MVT VT : MVT::fp_vector_valuetypes())
1145
17.4k
      setLoadExtAction(ISD::EXTLOAD, VT, MVT::v8f32, Legal);
1146
726
1147
2.17k
    for (auto ExtType : {ISD::ZEXTLOAD, ISD::SEXTLOAD, ISD::EXTLOAD}) {
1148
2.17k
      setLoadExtAction(ExtType, MVT::v16i32, MVT::v16i8,  Legal);
1149
2.17k
      setLoadExtAction(ExtType, MVT::v16i32, MVT::v16i16, Legal);
1150
2.17k
      setLoadExtAction(ExtType, MVT::v32i16, MVT::v32i8,  Legal);
1151
2.17k
      setLoadExtAction(ExtType, MVT::v8i64,  MVT::v8i8,   Legal);
1152
2.17k
      setLoadExtAction(ExtType, MVT::v8i64,  MVT::v8i16,  Legal);
1153
2.17k
      setLoadExtAction(ExtType, MVT::v8i64,  MVT::v8i32,  Legal);
1154
2.17k
    }
1155
726
1156
726
    for (MVT VT : {MVT::v2i64, MVT::v4i32, MVT::v8i32, MVT::v4i64, MVT::v8i16,
1157
726
                   MVT::v16i8, MVT::v16i16, MVT::v32i8, MVT::v16i32,
1158
8.71k
                   MVT::v8i64, MVT::v32i16, MVT::v64i8}) {
1159
8.71k
      MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
1160
8.71k
      setLoadExtAction(ISD::SEXTLOAD, VT, MaskVT, Custom);
1161
8.71k
      setLoadExtAction(ISD::ZEXTLOAD, VT, MaskVT, Custom);
1162
8.71k
      setLoadExtAction(ISD::EXTLOAD,  VT, MaskVT, Custom);
1163
8.71k
      setTruncStoreAction(VT, MaskVT, Custom);
1164
8.71k
    }
1165
726
1166
1.45k
    for (MVT VT : { MVT::v16f32, MVT::v8f64 }) {
1167
1.45k
      setOperationAction(ISD::FNEG,  VT, Custom);
1168
1.45k
      setOperationAction(ISD::FABS,  VT, Custom);
1169
1.45k
      setOperationAction(ISD::FMA,   VT, Legal);
1170
1.45k
      setOperationAction(ISD::FCOPYSIGN, VT, Custom);
1171
1.45k
    }
1172
726
1173
726
    setOperationAction(ISD::FP_TO_SINT,         MVT::v16i32, Legal);
1174
726
    setOperationAction(ISD::FP_TO_UINT,         MVT::v16i32, Legal);
1175
726
    setOperationAction(ISD::FP_TO_UINT,         MVT::v16i8, Legal);
1176
726
    setOperationAction(ISD::FP_TO_UINT,         MVT::v16i16, Legal);
1177
726
    setOperationAction(ISD::FP_TO_UINT,         MVT::v8i32, Legal);
1178
726
    setOperationAction(ISD::FP_TO_UINT,         MVT::v4i32, Legal);
1179
726
    setOperationAction(ISD::FP_TO_UINT,         MVT::v2i32, Custom);
1180
726
    setOperationAction(ISD::SINT_TO_FP,         MVT::v16i32, Legal);
1181
726
    setOperationAction(ISD::SINT_TO_FP,         MVT::v8i1,   Custom);
1182
726
    setOperationAction(ISD::SINT_TO_FP,         MVT::v16i1,  Custom);
1183
726
    setOperationAction(ISD::SINT_TO_FP,         MVT::v16i8,  Promote);
1184
726
    setOperationAction(ISD::SINT_TO_FP,         MVT::v16i16, Promote);
1185
726
    setOperationAction(ISD::UINT_TO_FP,         MVT::v16i32, Legal);
1186
726
    setOperationAction(ISD::UINT_TO_FP,         MVT::v8i32, Legal);
1187
726
    setOperationAction(ISD::UINT_TO_FP,         MVT::v4i32, Legal);
1188
726
    setOperationAction(ISD::UINT_TO_FP,         MVT::v16i8, Custom);
1189
726
    setOperationAction(ISD::UINT_TO_FP,         MVT::v16i16, Custom);
1190
726
    setOperationAction(ISD::SINT_TO_FP,         MVT::v16i1, Custom);
1191
726
    setOperationAction(ISD::UINT_TO_FP,         MVT::v16i1, Custom);
1192
726
    setOperationAction(ISD::SINT_TO_FP,         MVT::v8i1,  Custom);
1193
726
    setOperationAction(ISD::UINT_TO_FP,         MVT::v8i1,  Custom);
1194
726
    setOperationAction(ISD::SINT_TO_FP,         MVT::v4i1,  Custom);
1195
726
    setOperationAction(ISD::UINT_TO_FP,         MVT::v4i1,  Custom);
1196
726
    setOperationAction(ISD::SINT_TO_FP,         MVT::v2i1,  Custom);
1197
726
    setOperationAction(ISD::UINT_TO_FP,         MVT::v2i1,  Custom);
1198
726
    setOperationAction(ISD::FP_ROUND,           MVT::v8f32, Legal);
1199
726
    setOperationAction(ISD::FP_EXTEND,          MVT::v8f32, Legal);
1200
726
1201
726
    setTruncStoreAction(MVT::v8i64,   MVT::v8i8,   Legal);
1202
726
    setTruncStoreAction(MVT::v8i64,   MVT::v8i16,  Legal);
1203
726
    setTruncStoreAction(MVT::v8i64,   MVT::v8i32,  Legal);
1204
726
    setTruncStoreAction(MVT::v16i32,  MVT::v16i8,  Legal);
1205
726
    setTruncStoreAction(MVT::v16i32,  MVT::v16i16, Legal);
1206
726
    if (Subtarget.hasVLX()) {
1207
317
      setTruncStoreAction(MVT::v4i64, MVT::v4i8,  Legal);
1208
317
      setTruncStoreAction(MVT::v4i64, MVT::v4i16, Legal);
1209
317
      setTruncStoreAction(MVT::v4i64, MVT::v4i32, Legal);
1210
317
      setTruncStoreAction(MVT::v8i32, MVT::v8i8,  Legal);
1211
317
      setTruncStoreAction(MVT::v8i32, MVT::v8i16, Legal);
1212
317
1213
317
      setTruncStoreAction(MVT::v2i64, MVT::v2i8,  Legal);
1214
317
      setTruncStoreAction(MVT::v2i64, MVT::v2i16, Legal);
1215
317
      setTruncStoreAction(MVT::v2i64, MVT::v2i32, Legal);
1216
317
      setTruncStoreAction(MVT::v4i32, MVT::v4i8,  Legal);
1217
317
      setTruncStoreAction(MVT::v4i32, MVT::v4i16, Legal);
1218
726
    } else {
1219
409
      for (auto VT : {MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
1220
3.27k
           MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64}) {
1221
3.27k
        setOperationAction(ISD::MLOAD,  VT, Custom);
1222
3.27k
        setOperationAction(ISD::MSTORE, VT, Custom);
1223
3.27k
      }
1224
409
    }
1225
726
1226
726
    if (Subtarget.hasDQI()) {
1227
630
      for (auto VT : { MVT::v2i64, MVT::v4i64, MVT::v8i64 }) {
1228
630
        setOperationAction(ISD::SINT_TO_FP,     VT, Legal);
1229
630
        setOperationAction(ISD::UINT_TO_FP,     VT, Legal);
1230
630
        setOperationAction(ISD::FP_TO_SINT,     VT, Legal);
1231
630
        setOperationAction(ISD::FP_TO_UINT,     VT, Legal);
1232
630
      }
1233
210
      if (Subtarget.hasVLX()) {
1234
165
        // Fast v2f32 SINT_TO_FP( v2i32 ) custom conversion.
1235
165
        setOperationAction(ISD::SINT_TO_FP,    MVT::v2f32, Custom);
1236
165
        setOperationAction(ISD::FP_TO_SINT,    MVT::v2f32, Custom);
1237
165
        setOperationAction(ISD::FP_TO_UINT,    MVT::v2f32, Custom);
1238
165
      }
1239
210
    }
1240
726
    if (Subtarget.hasVLX()) {
1241
317
      setOperationAction(ISD::SINT_TO_FP,       MVT::v8i32, Legal);
1242
317
      setOperationAction(ISD::UINT_TO_FP,       MVT::v8i32, Legal);
1243
317
      setOperationAction(ISD::FP_TO_SINT,       MVT::v8i32, Legal);
1244
317
      setOperationAction(ISD::FP_TO_UINT,       MVT::v8i32, Legal);
1245
317
      setOperationAction(ISD::SINT_TO_FP,       MVT::v4i32, Legal);
1246
317
      setOperationAction(ISD::FP_TO_SINT,       MVT::v4i32, Legal);
1247
317
      setOperationAction(ISD::FP_TO_UINT,       MVT::v4i32, Legal);
1248
317
      setOperationAction(ISD::ZERO_EXTEND,      MVT::v4i32, Custom);
1249
317
      setOperationAction(ISD::ZERO_EXTEND,      MVT::v2i64, Custom);
1250
317
      setOperationAction(ISD::SIGN_EXTEND,      MVT::v4i32, Custom);
1251
317
      setOperationAction(ISD::SIGN_EXTEND,      MVT::v2i64, Custom);
1252
317
1253
317
      // FIXME: These commands are available on SSE/AVX2; add relevant patterns.
1254
317
      setLoadExtAction(ISD::EXTLOAD, MVT::v8i32, MVT::v8i8,  Legal);
1255
317
      setLoadExtAction(ISD::EXTLOAD, MVT::v8i32, MVT::v8i16, Legal);
1256
317
      setLoadExtAction(ISD::EXTLOAD, MVT::v4i32, MVT::v4i8,  Legal);
1257
317
      setLoadExtAction(ISD::EXTLOAD, MVT::v4i32, MVT::v4i16, Legal);
1258
317
      setLoadExtAction(ISD::EXTLOAD, MVT::v4i64, MVT::v4i8,  Legal);
1259
317
      setLoadExtAction(ISD::EXTLOAD, MVT::v4i64, MVT::v4i16, Legal);
1260
317
      setLoadExtAction(ISD::EXTLOAD, MVT::v4i64, MVT::v4i32, Legal);
1261
317
      setLoadExtAction(ISD::EXTLOAD, MVT::v2i64, MVT::v2i8,  Legal);
1262
317
      setLoadExtAction(ISD::EXTLOAD, MVT::v2i64, MVT::v2i16, Legal);
1263
317
      setLoadExtAction(ISD::EXTLOAD, MVT::v2i64, MVT::v2i32, Legal);
1264
317
    }
1265
726
1266
726
    setOperationAction(ISD::TRUNCATE,           MVT::v8i32, Custom);
1267
726
    setOperationAction(ISD::TRUNCATE,           MVT::v16i16, Custom);
1268
726
    setOperationAction(ISD::ZERO_EXTEND,        MVT::v16i32, Custom);
1269
726
    setOperationAction(ISD::ZERO_EXTEND,        MVT::v8i64, Custom);
1270
726
    setOperationAction(ISD::ANY_EXTEND,         MVT::v16i32, Custom);
1271
726
    setOperationAction(ISD::ANY_EXTEND,         MVT::v8i64, Custom);
1272
726
    setOperationAction(ISD::SIGN_EXTEND,        MVT::v16i32, Custom);
1273
726
    setOperationAction(ISD::SIGN_EXTEND,        MVT::v8i64, Custom);
1274
726
    setOperationAction(ISD::SIGN_EXTEND,        MVT::v16i8, Custom);
1275
726
    setOperationAction(ISD::SIGN_EXTEND,        MVT::v8i16, Custom);
1276
726
    setOperationAction(ISD::SIGN_EXTEND,        MVT::v16i16, Custom);
1277
726
1278
1.45k
    for (auto VT : { MVT::v16f32, MVT::v8f64 }) {
1279
1.45k
      setOperationAction(ISD::FFLOOR,           VT, Legal);
1280
1.45k
      setOperationAction(ISD::FCEIL,            VT, Legal);
1281
1.45k
      setOperationAction(ISD::FTRUNC,           VT, Legal);
1282
1.45k
      setOperationAction(ISD::FRINT,            VT, Legal);
1283
1.45k
      setOperationAction(ISD::FNEARBYINT,       VT, Legal);
1284
1.45k
    }
1285
726
1286
726
    setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v8i64,  Custom);
1287
726
    setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v16i32, Custom);
1288
726
1289
726
    // Without BWI we need to use custom lowering to handle MVT::v64i8 input.
1290
726
    setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v64i8, Custom);
1291
726
    setOperationAction(ISD::ZERO_EXTEND_VECTOR_INREG, MVT::v64i8, Custom);
1292
726
1293
726
    setOperationAction(ISD::CONCAT_VECTORS,     MVT::v8f64,  Custom);
1294
726
    setOperationAction(ISD::CONCAT_VECTORS,     MVT::v8i64,  Custom);
1295
726
    setOperationAction(ISD::CONCAT_VECTORS,     MVT::v16f32,  Custom);
1296
726
    setOperationAction(ISD::CONCAT_VECTORS,     MVT::v16i32,  Custom);
1297
726
    setOperationAction(ISD::CONCAT_VECTORS,     MVT::v16i1,   Custom);
1298
726
1299
726
    setOperationAction(ISD::MUL,                MVT::v8i64, Custom);
1300
726
    setOperationAction(ISD::MUL,                MVT::v16i32, Legal);
1301
726
1302
726
    setOperationAction(ISD::UMUL_LOHI,          MVT::v16i32,  Custom);
1303
726
    setOperationAction(ISD::SMUL_LOHI,          MVT::v16i32,  Custom);
1304
726
1305
726
    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v1i1, Custom);
1306
726
    setOperationAction(ISD::INSERT_SUBVECTOR,   MVT::v16i1, Custom);
1307
726
    setOperationAction(ISD::BUILD_VECTOR,       MVT::v1i1, Custom);
1308
726
    setOperationAction(ISD::SELECT,             MVT::v8f64, Custom);
1309
726
    setOperationAction(ISD::SELECT,             MVT::v8i64, Custom);
1310
726
    setOperationAction(ISD::SELECT,             MVT::v16f32, Custom);
1311
726
1312
726
1313
726
    // NonVLX sub-targets extend 128/256 vectors to use the 512 version.
1314
726
    setOperationAction(ISD::ABS,                MVT::v4i64, Legal);
1315
726
    setOperationAction(ISD::ABS,                MVT::v2i64, Legal);
1316
726
1317
1.45k
    for (auto VT : { MVT::v8i1, MVT::v16i1 }) {
1318
1.45k
      setOperationAction(ISD::ADD,              VT, Custom);
1319
1.45k
      setOperationAction(ISD::SUB,              VT, Custom);
1320
1.45k
      setOperationAction(ISD::MUL,              VT, Custom);
1321
1.45k
      setOperationAction(ISD::SETCC,            VT, Custom);
1322
1.45k
      setOperationAction(ISD::SELECT,           VT, Custom);
1323
1.45k
      setOperationAction(ISD::TRUNCATE,         VT, Custom);
1324
1.45k
1325
1.45k
      setOperationAction(ISD::BUILD_VECTOR,     VT, Custom);
1326
1.45k
      setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
1327
1.45k
      setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
1328
1.45k
      setOperationAction(ISD::VECTOR_SHUFFLE,   VT,  Custom);
1329
1.45k
      setOperationAction(ISD::VSELECT,          VT,  Expand);
1330
1.45k
    }
1331
726
1332
1.45k
    for (auto VT : { MVT::v16i32, MVT::v8i64 }) {
1333
1.45k
      setOperationAction(ISD::SMAX,             VT, Legal);
1334
1.45k
      setOperationAction(ISD::UMAX,             VT, Legal);
1335
1.45k
      setOperationAction(ISD::SMIN,             VT, Legal);
1336
1.45k
      setOperationAction(ISD::UMIN,             VT, Legal);
1337
1.45k
      setOperationAction(ISD::ABS,              VT, Legal);
1338
1.45k
      setOperationAction(ISD::SRL,              VT, Custom);
1339
1.45k
      setOperationAction(ISD::SHL,              VT, Custom);
1340
1.45k
      setOperationAction(ISD::SRA,              VT, Custom);
1341
1.45k
      setOperationAction(ISD::CTPOP,            VT, Custom);
1342
1.45k
      setOperationAction(ISD::CTTZ,             VT, Custom);
1343
1.45k
    }
1344
726
1345
726
    // NonVLX sub-targets extend 128/256 vectors to use the 512 version.
1346
726
    for (auto VT : {MVT::v4i32, MVT::v8i32, MVT::v16i32, MVT::v2i64, MVT::v4i64,
1347
4.35k
                    MVT::v8i64}) {
1348
4.35k
      setOperationAction(ISD::ROTL,             VT, Custom);
1349
4.35k
      setOperationAction(ISD::ROTR,             VT, Custom);
1350
4.35k
    }
1351
726
1352
726
    // Need to promote to 64-bit even though we have 32-bit masked instructions
1353
726
    // because the IR optimizers rearrange bitcasts around logic ops leaving
1354
726
    // too many variations to handle if we don't promote them.
1355
726
    setOperationPromotedToType(ISD::AND, MVT::v16i32, MVT::v8i64);
1356
726
    setOperationPromotedToType(ISD::OR,  MVT::v16i32, MVT::v8i64);
1357
726
    setOperationPromotedToType(ISD::XOR, MVT::v16i32, MVT::v8i64);
1358
726
1359
726
    if (Subtarget.hasCDI()) {
1360
209
      // NonVLX sub-targets extend 128/256 vectors to use the 512 version.
1361
209
      for (auto VT : {MVT::v4i32, MVT::v8i32, MVT::v16i32, MVT::v2i64,
1362
1.25k
                      MVT::v4i64, MVT::v8i64}) {
1363
1.25k
        setOperationAction(ISD::CTLZ,            VT, Legal);
1364
1.25k
        setOperationAction(ISD::CTTZ_ZERO_UNDEF, VT, Custom);
1365
1.25k
      }
1366
209
    } // Subtarget.hasCDI()
1367
726
1368
726
    if (Subtarget.hasDQI()) {
1369
210
      // NonVLX sub-targets extend 128/256 vectors to use the 512 version.
1370
210
      setOperationAction(ISD::MUL,             MVT::v2i64, Legal);
1371
210
      setOperationAction(ISD::MUL,             MVT::v4i64, Legal);
1372
210
      setOperationAction(ISD::MUL,             MVT::v8i64, Legal);
1373
210
    }
1374
726
1375
726
    if (Subtarget.hasVPOPCNTDQ()) {
1376
9
      // VPOPCNTDQ sub-targets extend 128/256 vectors to use the avx512
1377
9
      // version of popcntd/q.
1378
9
      for (auto VT : {MVT::v16i32, MVT::v8i64, MVT::v8i32, MVT::v4i64,
1379
9
                      MVT::v4i32, MVT::v2i64})
1380
54
        setOperationAction(ISD::CTPOP, VT, Legal);
1381
9
    }
1382
726
1383
726
    // Custom lower several nodes.
1384
726
    for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
1385
5.80k
                     MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 }) {
1386
5.80k
      setOperationAction(ISD::MGATHER,  VT, Custom);
1387
5.80k
      setOperationAction(ISD::MSCATTER, VT, Custom);
1388
5.80k
    }
1389
726
1390
726
    setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v1i1, Legal);
1391
726
1392
726
    // Extract subvector is special because the value type
1393
726
    // (result) is 256-bit but the source is 512-bit wide.
1394
726
    // 128-bit was made Legal under AVX1.
1395
726
    for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64,
1396
726
                     MVT::v8f32, MVT::v4f64 })
1397
4.35k
      setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Legal);
1398
726
    for (auto VT : { MVT::v2i1, MVT::v4i1, MVT::v8i1,
1399
726
                     MVT::v16i1, MVT::v32i1, MVT::v64i1 })
1400
4.35k
      setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Legal);
1401
726
1402
2.90k
    for (auto VT : { MVT::v16i32, MVT::v8i64, MVT::v16f32, MVT::v8f64 }) {
1403
2.90k
      setOperationAction(ISD::VECTOR_SHUFFLE,      VT, Custom);
1404
2.90k
      setOperationAction(ISD::INSERT_VECTOR_ELT,   VT, Custom);
1405
2.90k
      setOperationAction(ISD::BUILD_VECTOR,        VT, Custom);
1406
2.90k
      setOperationAction(ISD::VSELECT,             VT, Custom);
1407
2.90k
      setOperationAction(ISD::EXTRACT_VECTOR_ELT,  VT, Custom);
1408
2.90k
      setOperationAction(ISD::SCALAR_TO_VECTOR,    VT, Custom);
1409
2.90k
      setOperationAction(ISD::INSERT_SUBVECTOR,    VT, Legal);
1410
2.90k
      setOperationAction(ISD::MLOAD,               VT, Legal);
1411
2.90k
      setOperationAction(ISD::MSTORE,              VT, Legal);
1412
2.90k
      setOperationAction(ISD::MGATHER,             VT, Legal);
1413
2.90k
      setOperationAction(ISD::MSCATTER,            VT, Custom);
1414
2.90k
    }
1415
2.17k
    for (auto VT : { MVT::v64i8, MVT::v32i16, MVT::v16i32 }) {
1416
2.17k
      setOperationPromotedToType(ISD::LOAD,   VT, MVT::v8i64);
1417
2.17k
      setOperationPromotedToType(ISD::SELECT, VT, MVT::v8i64);
1418
2.17k
    }
1419
726
  } // has AVX-512
1420
9.65k
1421
9.65k
  if (!Subtarget.useSoftFloat() && Subtarget.hasBWI()) {
1422
304
    addRegisterClass(MVT::v32i16, &X86::VR512RegClass);
1423
304
    addRegisterClass(MVT::v64i8,  &X86::VR512RegClass);
1424
304
1425
304
    addRegisterClass(MVT::v32i1,  &X86::VK32RegClass);
1426
304
    addRegisterClass(MVT::v64i1,  &X86::VK64RegClass);
1427
304
1428
304
    setOperationAction(ISD::ADD,                MVT::v32i1, Custom);
1429
304
    setOperationAction(ISD::ADD,                MVT::v64i1, Custom);
1430
304
    setOperationAction(ISD::SUB,                MVT::v32i1, Custom);
1431
304
    setOperationAction(ISD::SUB,                MVT::v64i1, Custom);
1432
304
    setOperationAction(ISD::MUL,                MVT::v32i1, Custom);
1433
304
    setOperationAction(ISD::MUL,                MVT::v64i1, Custom);
1434
304
1435
304
    setOperationAction(ISD::SETCC,              MVT::v32i1, Custom);
1436
304
    setOperationAction(ISD::SETCC,              MVT::v64i1, Custom);
1437
304
    setOperationAction(ISD::MUL,                MVT::v32i16, Legal);
1438
304
    setOperationAction(ISD::MUL,                MVT::v64i8, Custom);
1439
304
    setOperationAction(ISD::MULHS,              MVT::v32i16, Legal);
1440
304
    setOperationAction(ISD::MULHU,              MVT::v32i16, Legal);
1441
304
    setOperationAction(ISD::CONCAT_VECTORS,     MVT::v32i1, Custom);
1442
304
    setOperationAction(ISD::CONCAT_VECTORS,     MVT::v64i1, Custom);
1443
304
    setOperationAction(ISD::CONCAT_VECTORS,     MVT::v32i16, Custom);
1444
304
    setOperationAction(ISD::CONCAT_VECTORS,     MVT::v64i8, Custom);
1445
304
    setOperationAction(ISD::INSERT_SUBVECTOR,   MVT::v32i1, Custom);
1446
304
    setOperationAction(ISD::INSERT_SUBVECTOR,   MVT::v64i1, Custom);
1447
304
    setOperationAction(ISD::INSERT_SUBVECTOR,   MVT::v32i16, Legal);
1448
304
    setOperationAction(ISD::INSERT_SUBVECTOR,   MVT::v64i8, Legal);
1449
304
    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v32i16, Custom);
1450
304
    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v64i8, Custom);
1451
304
    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v32i1,  Custom);
1452
304
    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v64i1, Custom);
1453
304
    setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v32i16, Custom);
1454
304
    setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v64i8, Custom);
1455
304
    setOperationAction(ISD::SELECT,             MVT::v32i1, Custom);
1456
304
    setOperationAction(ISD::SELECT,             MVT::v64i1, Custom);
1457
304
    setOperationAction(ISD::SIGN_EXTEND,        MVT::v32i8, Custom);
1458
304
    setOperationAction(ISD::ZERO_EXTEND,        MVT::v32i8, Custom);
1459
304
    setOperationAction(ISD::SIGN_EXTEND,        MVT::v32i16, Custom);
1460
304
    setOperationAction(ISD::ZERO_EXTEND,        MVT::v32i16, Custom);
1461
304
    setOperationAction(ISD::ANY_EXTEND,         MVT::v32i16, Custom);
1462
304
    setOperationAction(ISD::VECTOR_SHUFFLE,     MVT::v32i16, Custom);
1463
304
    setOperationAction(ISD::VECTOR_SHUFFLE,     MVT::v64i8, Custom);
1464
304
    setOperationAction(ISD::SIGN_EXTEND,        MVT::v64i8, Custom);
1465
304
    setOperationAction(ISD::ZERO_EXTEND,        MVT::v64i8, Custom);
1466
304
    setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v32i1, Custom);
1467
304
    setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v64i1, Custom);
1468
304
    setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v32i16, Custom);
1469
304
    setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v64i8, Custom);
1470
304
    setOperationAction(ISD::TRUNCATE,           MVT::v32i1, Custom);
1471
304
    setOperationAction(ISD::TRUNCATE,           MVT::v64i1, Custom);
1472
304
    setOperationAction(ISD::TRUNCATE,           MVT::v32i8, Custom);
1473
304
    setOperationAction(ISD::VECTOR_SHUFFLE,     MVT::v32i1, Custom);
1474
304
    setOperationAction(ISD::VECTOR_SHUFFLE,     MVT::v64i1, Custom);
1475
304
    setOperationAction(ISD::BUILD_VECTOR,       MVT::v32i1, Custom);
1476
304
    setOperationAction(ISD::BUILD_VECTOR,       MVT::v64i1, Custom);
1477
304
    setOperationAction(ISD::VSELECT,            MVT::v32i1, Expand);
1478
304
    setOperationAction(ISD::VSELECT,            MVT::v64i1, Expand);
1479
304
    setOperationAction(ISD::BITREVERSE,         MVT::v64i8, Custom);
1480
304
1481
304
    setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v32i16, Custom);
1482
304
1483
304
    setTruncStoreAction(MVT::v32i16,  MVT::v32i8, Legal);
1484
304
    if (Subtarget.hasVLX()) {
1485
196
      setTruncStoreAction(MVT::v16i16,  MVT::v16i8, Legal);
1486
196
      setTruncStoreAction(MVT::v8i16,   MVT::v8i8,  Legal);
1487
196
    }
1488
304
1489
304
    LegalizeAction Action = Subtarget.hasVLX() ? Legal : Custom;
1490
1.21k
    for (auto VT : { MVT::v32i8, MVT::v16i8, MVT::v16i16, MVT::v8i16 }) {
1491
1.21k
      setOperationAction(ISD::MLOAD,               VT, Action);
1492
1.21k
      setOperationAction(ISD::MSTORE,              VT, Action);
1493
1.21k
    }
1494
304
1495
304
    if (Subtarget.hasCDI()) {
1496
108
      setOperationAction(ISD::CTLZ,            MVT::v32i16, Custom);
1497
108
      setOperationAction(ISD::CTLZ,            MVT::v64i8,  Custom);
1498
108
    }
1499
304
1500
608
    for (auto VT : { MVT::v64i8, MVT::v32i16 }) {
1501
608
      setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
1502
608
      setOperationAction(ISD::VSELECT,      VT, Custom);
1503
608
      setOperationAction(ISD::ABS,          VT, Legal);
1504
608
      setOperationAction(ISD::SRL,          VT, Custom);
1505
608
      setOperationAction(ISD::SHL,          VT, Custom);
1506
608
      setOperationAction(ISD::SRA,          VT, Custom);
1507
608
      setOperationAction(ISD::MLOAD,        VT, Legal);
1508
608
      setOperationAction(ISD::MSTORE,       VT, Legal);
1509
608
      setOperationAction(ISD::CTPOP,        VT, Custom);
1510
608
      setOperationAction(ISD::CTTZ,         VT, Custom);
1511
608
      setOperationAction(ISD::SMAX,         VT, Legal);
1512
608
      setOperationAction(ISD::UMAX,         VT, Legal);
1513
608
      setOperationAction(ISD::SMIN,         VT, Legal);
1514
608
      setOperationAction(ISD::UMIN,         VT, Legal);
1515
608
1516
608
      setOperationPromotedToType(ISD::AND,  VT, MVT::v8i64);
1517
608
      setOperationPromotedToType(ISD::OR,   VT, MVT::v8i64);
1518
608
      setOperationPromotedToType(ISD::XOR,  VT, MVT::v8i64);
1519
608
    }
1520
304
1521
912
    for (auto ExtType : {ISD::ZEXTLOAD, ISD::SEXTLOAD, ISD::EXTLOAD}) {
1522
912
      setLoadExtAction(ExtType, MVT::v32i16, MVT::v32i8, Legal);
1523
912
      if (Subtarget.hasVLX()) {
1524
588
        // FIXME: These commands are available on SSE/AVX2; add relevant patterns.
1525
588
        setLoadExtAction(ExtType, MVT::v16i16, MVT::v16i8, Legal);
1526
588
        setLoadExtAction(ExtType, MVT::v8i16,  MVT::v8i8,  Legal);
1527
588
      }
1528
912
    }
1529
304
  }
1530
9.65k
1531
9.65k
  if (!Subtarget.useSoftFloat() && Subtarget.hasVLX()) {
1532
317
    addRegisterClass(MVT::v4i1,   &X86::VK4RegClass);
1533
317
    addRegisterClass(MVT::v2i1,   &X86::VK2RegClass);
1534
317
1535
634
    for (auto VT : { MVT::v2i1, MVT::v4i1 }) {
1536
634
      setOperationAction(ISD::ADD,                VT, Custom);
1537
634
      setOperationAction(ISD::SUB,                VT, Custom);
1538
634
      setOperationAction(ISD::MUL,                VT, Custom);
1539
634
      setOperationAction(ISD::VSELECT,            VT, Expand);
1540
634
1541
634
      setOperationAction(ISD::TRUNCATE,           VT, Custom);
1542
634
      setOperationAction(ISD::SETCC,              VT, Custom);
1543
634
      setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
1544
634
      setOperationAction(ISD::INSERT_VECTOR_ELT,  VT, Custom);
1545
634
      setOperationAction(ISD::SELECT,             VT, Custom);
1546
634
      setOperationAction(ISD::BUILD_VECTOR,       VT, Custom);
1547
634
      setOperationAction(ISD::VECTOR_SHUFFLE,     VT, Custom);
1548
634
    }
1549
317
1550
317
    setOperationAction(ISD::CONCAT_VECTORS,     MVT::v8i1, Custom);
1551
317
    setOperationAction(ISD::CONCAT_VECTORS,     MVT::v4i1, Custom);
1552
317
    setOperationAction(ISD::INSERT_SUBVECTOR,   MVT::v8i1, Custom);
1553
317
    setOperationAction(ISD::INSERT_SUBVECTOR,   MVT::v4i1, Custom);
1554
317
1555
634
    for (auto VT : { MVT::v2i64, MVT::v4i64 }) {
1556
634
      setOperationAction(ISD::SMAX, VT, Legal);
1557
634
      setOperationAction(ISD::UMAX, VT, Legal);
1558
634
      setOperationAction(ISD::SMIN, VT, Legal);
1559
634
      setOperationAction(ISD::UMIN, VT, Legal);
1560
634
    }
1561
317
  }
1562
9.65k
1563
9.65k
  // We want to custom lower some of our intrinsics.
1564
9.65k
  setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
1565
9.65k
  setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom);
1566
9.65k
  setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
1567
9.65k
  if (!Subtarget.is64Bit()) {
1568
2.53k
    setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i64, Custom);
1569
2.53k
    setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i64, Custom);
1570
2.53k
  }
1571
9.65k
1572
9.65k
  // Only custom-lower 64-bit SADDO and friends on 64-bit because we don't
1573
9.65k
  // handle type legalization for these operations here.
1574
9.65k
  //
1575
9.65k
  // FIXME: We really should do custom legalization for addition and
1576
9.65k
  // subtraction on x86-32 once PR3203 is fixed.  We really can't do much better
1577
9.65k
  // than generic legalization for 64-bit multiplication-with-overflow, though.
1578
38.5k
  for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
1579
38.5k
    if (VT == MVT::i64 && !Subtarget.is64Bit())
1580
2.53k
      continue;
1581
36.0k
    // Add/Sub/Mul with overflow operations are custom lowered.
1582
36.0k
    setOperationAction(ISD::SADDO, VT, Custom);
1583
36.0k
    setOperationAction(ISD::UADDO, VT, Custom);
1584
36.0k
    setOperationAction(ISD::SSUBO, VT, Custom);
1585
36.0k
    setOperationAction(ISD::USUBO, VT, Custom);
1586
36.0k
    setOperationAction(ISD::SMULO, VT, Custom);
1587
36.0k
    setOperationAction(ISD::UMULO, VT, Custom);
1588
36.0k
1589
36.0k
    // Support carry in as value rather than glue.
1590
36.0k
    setOperationAction(ISD::ADDCARRY, VT, Custom);
1591
36.0k
    setOperationAction(ISD::SUBCARRY, VT, Custom);
1592
36.0k
    setOperationAction(ISD::SETCCCARRY, VT, Custom);
1593
36.0k
  }
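  // Illustrative aside (not part of the original source): these ISD::*O nodes
  // typically come from the overflow intrinsics in LLVM IR, e.g. a signed add
  // with overflow is expressed as
  //   %res = call { i32, i1 } @llvm.sadd.with.overflow.i32(i32 %a, i32 %b)
  //   %sum = extractvalue { i32, i1 } %res, 0
  //   %ovf = extractvalue { i32, i1 } %res, 1
  // and the custom lowering lets the x86 backend reuse the CPU flags produced
  // by the add instead of recomputing the overflow condition.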
1594
9.65k
1595
9.65k
  if (!Subtarget.is64Bit()) {
1596
2.53k
    // These libcalls are not available in 32-bit.
1597
2.53k
    setLibcallName(RTLIB::SHL_I128, nullptr);
1598
2.53k
    setLibcallName(RTLIB::SRL_I128, nullptr);
1599
2.53k
    setLibcallName(RTLIB::SRA_I128, nullptr);
1600
2.53k
  }
1601
9.65k
1602
9.65k
  // Combine sin / cos into one node or libcall if possible.
1603
9.65k
  if (Subtarget.hasSinCos()) {
1604
1.32k
    setLibcallName(RTLIB::SINCOS_F32, "sincosf");
1605
1.32k
    setLibcallName(RTLIB::SINCOS_F64, "sincos");
1606
1.32k
    if (Subtarget.isTargetDarwin()) {
1607
1.32k
      // For MacOSX, we don't want the normal expansion of a libcall to sincos.
1608
1.32k
      // We want to issue a libcall to __sincos_stret to avoid memory traffic.
1609
1.32k
      setOperationAction(ISD::FSINCOS, MVT::f64, Custom);
1610
1.32k
      setOperationAction(ISD::FSINCOS, MVT::f32, Custom);
1611
1.32k
    }
1612
1.32k
  }
1613
9.65k
1614
9.65k
  if (Subtarget.isTargetWin64()) {
1615
315
    setOperationAction(ISD::SDIV, MVT::i128, Custom);
1616
315
    setOperationAction(ISD::UDIV, MVT::i128, Custom);
1617
315
    setOperationAction(ISD::SREM, MVT::i128, Custom);
1618
315
    setOperationAction(ISD::UREM, MVT::i128, Custom);
1619
315
    setOperationAction(ISD::SDIVREM, MVT::i128, Custom);
1620
315
    setOperationAction(ISD::UDIVREM, MVT::i128, Custom);
1621
315
  }
1622
9.65k
1623
9.65k
  // On 32-bit MSVC, `fmodf(f32)` is not defined - only `fmod(f64)`
1624
9.65k
  // is. We should promote the value to 64-bits to solve this.
1625
9.65k
  // This is what the CRT headers do - `fmodf` is an inline header
1626
9.65k
  // function casting to f64 and calling `fmod`.
1627
9.65k
  if (Subtarget.is32Bit() && (Subtarget.isTargetKnownWindowsMSVC() ||
1628
2.33k
                              Subtarget.isTargetWindowsItanium()))
1629
205
    for (ISD::NodeType Op :
1630
205
         {ISD::FCEIL, ISD::FCOS, ISD::FEXP, ISD::FFLOOR, ISD::FREM, ISD::FLOG,
1631
205
          ISD::FLOG10, ISD::FPOW, ISD::FSIN})
1632
1.84k
      if (isOperationExpand(Op, MVT::f32))
1633
1.81k
        setOperationAction(Op, MVT::f32, Promote);
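  // Illustrative aside (not part of the original source): with the Promote
  // action above, a call such as fmodf(x, y) on a 32-bit MSVC target is in
  // effect evaluated as (float)fmod((double)x, (double)y), mirroring what the
  // CRT's inline fmodf wrapper described in the comment does.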
1634
9.65k
1635
9.65k
  // We have target-specific dag combine patterns for the following nodes:
1636
9.65k
  setTargetDAGCombine(ISD::VECTOR_SHUFFLE);
1637
9.65k
  setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
1638
9.65k
  setTargetDAGCombine(ISD::INSERT_SUBVECTOR);
1639
9.65k
  setTargetDAGCombine(ISD::EXTRACT_SUBVECTOR);
1640
9.65k
  setTargetDAGCombine(ISD::BITCAST);
1641
9.65k
  setTargetDAGCombine(ISD::VSELECT);
1642
9.65k
  setTargetDAGCombine(ISD::SELECT);
1643
9.65k
  setTargetDAGCombine(ISD::SHL);
1644
9.65k
  setTargetDAGCombine(ISD::SRA);
1645
9.65k
  setTargetDAGCombine(ISD::SRL);
1646
9.65k
  setTargetDAGCombine(ISD::OR);
1647
9.65k
  setTargetDAGCombine(ISD::AND);
1648
9.65k
  setTargetDAGCombine(ISD::ADD);
1649
9.65k
  setTargetDAGCombine(ISD::FADD);
1650
9.65k
  setTargetDAGCombine(ISD::FSUB);
1651
9.65k
  setTargetDAGCombine(ISD::FNEG);
1652
9.65k
  setTargetDAGCombine(ISD::FMA);
1653
9.65k
  setTargetDAGCombine(ISD::FMINNUM);
1654
9.65k
  setTargetDAGCombine(ISD::FMAXNUM);
1655
9.65k
  setTargetDAGCombine(ISD::SUB);
1656
9.65k
  setTargetDAGCombine(ISD::LOAD);
1657
9.65k
  setTargetDAGCombine(ISD::MLOAD);
1658
9.65k
  setTargetDAGCombine(ISD::STORE);
1659
9.65k
  setTargetDAGCombine(ISD::MSTORE);
1660
9.65k
  setTargetDAGCombine(ISD::TRUNCATE);
1661
9.65k
  setTargetDAGCombine(ISD::ZERO_EXTEND);
1662
9.65k
  setTargetDAGCombine(ISD::ANY_EXTEND);
1663
9.65k
  setTargetDAGCombine(ISD::SIGN_EXTEND);
1664
9.65k
  setTargetDAGCombine(ISD::SIGN_EXTEND_INREG);
1665
9.65k
  setTargetDAGCombine(ISD::SIGN_EXTEND_VECTOR_INREG);
1666
9.65k
  setTargetDAGCombine(ISD::ZERO_EXTEND_VECTOR_INREG);
1667
9.65k
  setTargetDAGCombine(ISD::SINT_TO_FP);
1668
9.65k
  setTargetDAGCombine(ISD::UINT_TO_FP);
1669
9.65k
  setTargetDAGCombine(ISD::SETCC);
1670
9.65k
  setTargetDAGCombine(ISD::MUL);
1671
9.65k
  setTargetDAGCombine(ISD::XOR);
1672
9.65k
  setTargetDAGCombine(ISD::MSCATTER);
1673
9.65k
  setTargetDAGCombine(ISD::MGATHER);
1674
9.65k
1675
9.65k
  computeRegisterProperties(Subtarget.getRegisterInfo());
1676
9.65k
1677
9.65k
  MaxStoresPerMemset = 16; // For @llvm.memset -> sequence of stores
1678
9.65k
  MaxStoresPerMemsetOptSize = 8;
1679
9.65k
  MaxStoresPerMemcpy = 8; // For @llvm.memcpy -> sequence of stores
1680
9.65k
  MaxStoresPerMemcpyOptSize = 4;
1681
9.65k
  MaxStoresPerMemmove = 8; // For @llvm.memmove -> sequence of stores
1682
9.65k
  MaxStoresPerMemmoveOptSize = 4;
1683
9.65k
1684
9.65k
  // TODO: These control memcmp expansion in CGP and could be raised higher, but
1685
9.65k
  // that needs to be benchmarked and balanced with the potential use of vector
1686
9.65k
  // load/store types (PR33329, PR33914).
1687
9.65k
  MaxLoadsPerMemcmp = 2;
1688
9.65k
  MaxLoadsPerMemcmpOptSize = 2;
1689
9.65k
1690
9.65k
  // Set loop alignment to 2^ExperimentalPrefLoopAlignment bytes (default: 2^4).
1691
9.65k
  setPrefLoopAlignment(ExperimentalPrefLoopAlignment);
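  // For example, with the default option value of 4 this requests 2^4 = 16-byte
  // alignment for loop headers.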
1692
9.65k
1693
9.65k
  // An out-of-order CPU can speculatively execute past a predictable branch,
1694
9.65k
  // but a conditional move could be stalled by an expensive earlier operation.
1695
9.65k
  PredictableSelectIsExpensive = Subtarget.getSchedModel().isOutOfOrder();
1696
9.65k
  EnableExtLdPromotion = true;
1697
9.65k
  setPrefFunctionAlignment(4); // 2^4 bytes.
1698
9.65k
1699
9.65k
  verifyIntrinsicTables();
1700
9.65k
}
1701
1702
// This has so far only been implemented for 64-bit MachO.
1703
1.05k
bool X86TargetLowering::useLoadStackGuardNode() const {
1704
586
  return Subtarget.isTargetMachO() && Subtarget.is64Bit();
1705
1.05k
}
1706
1707
TargetLoweringBase::LegalizeTypeAction
1708
837k
X86TargetLowering::getPreferredVectorAction(EVT VT) const {
1709
837k
  if (ExperimentalVectorWideningLegalization &&
1710
598
      VT.getVectorNumElements() != 1 &&
1711
493
      VT.getVectorElementType().getSimpleVT() != MVT::i1)
1712
402
    return TypeWidenVector;
1713
836k
1714
836k
  return TargetLoweringBase::getPreferredVectorAction(VT);
1715
836k
}
1716
1717
EVT X86TargetLowering::getSetCCResultType(const DataLayout &DL,
1718
                                          LLVMContext& Context,
1719
105k
                                          EVT VT) const {
1720
105k
  if (!VT.isVector())
1721
95.7k
    return MVT::i8;
1722
9.39k
1723
9.39k
  if (VT.isSimple()) {
1724
9.37k
    MVT VVT = VT.getSimpleVT();
1725
9.37k
    const unsigned NumElts = VVT.getVectorNumElements();
1726
9.37k
    MVT EltVT = VVT.getVectorElementType();
1727
9.37k
    if (VVT.is512BitVector()) {
1728
498
      if (Subtarget.hasAVX512())
1729
145
        if (EltVT == MVT::i32 || EltVT == MVT::i64 ||
1730
145
            EltVT == MVT::f32 || EltVT == MVT::f64)
1731
93
          switch(NumElts) {
1732
42
          case  8: return MVT::v8i1;
1733
51
          case 16: return MVT::v16i1;
1734
405
        }
1735
405
      if (Subtarget.hasBWI())
1736
21
        if (EltVT == MVT::i8 || EltVT == MVT::i16)
1737
21
          switch(NumElts) {
1738
10
          case 32: return MVT::v32i1;
1739
11
          case 64: return MVT::v64i1;
1740
9.26k
        }
1741
498
    }
1742
9.26k
1743
9.26k
    if (Subtarget.hasBWI() && Subtarget.hasVLX())
1744
223
      return MVT::getVectorVT(MVT::i1, NumElts);
1745
9.04k
1746
9.04k
    if (!isTypeLegal(VT) && getTypeAction(Context, VT) == TypePromoteInteger) {
1747
237
      EVT LegalVT = getTypeToTransformTo(Context, VT);
1748
237
      EltVT = LegalVT.getVectorElementType().getSimpleVT();
1749
237
    }
1750
9.04k
1751
9.04k
    if (Subtarget.hasVLX() && EltVT.getSizeInBits() >= 32)
1752
14
      switch(NumElts) {
1753
0
      case 2: return MVT::v2i1;
1754
10
      case 4: return MVT::v4i1;
1755
4
      case 8: return MVT::v8i1;
1756
9.04k
      }
1757
9.37k
  }
1758
9.04k
1759
9.04k
  return VT.changeVectorElementTypeToInteger();
1760
9.04k
}
1761
1762
/// Helper for getByValTypeAlignment to determine
1763
/// the desired ByVal argument alignment.
1764
14
static void getMaxByValAlign(Type *Ty, unsigned &MaxAlign) {
1765
14
  if (MaxAlign == 16)
1766
0
    return;
1767
14
  if (VectorType *VTy = dyn_cast<VectorType>(Ty)) {
1768
1
    if (VTy->getBitWidth() == 128)
1769
1
      MaxAlign = 16;
1770
14
  } else if (ArrayType *ATy = dyn_cast<ArrayType>(Ty)) {
1771
0
    unsigned EltAlign = 0;
1772
0
    getMaxByValAlign(ATy->getElementType(), EltAlign);
1773
0
    if (EltAlign > MaxAlign)
1774
0
      MaxAlign = EltAlign;
1775
13
  } else if (StructType *STy = dyn_cast<StructType>(Ty)) {
1776
7
    for (auto *EltTy : STy->elements()) {
1777
7
      unsigned EltAlign = 0;
1778
7
      getMaxByValAlign(EltTy, EltAlign);
1779
7
      if (EltAlign > MaxAlign)
1780
1
        MaxAlign = EltAlign;
1781
7
      if (MaxAlign == 16)
1782
1
        break;
1783
14
    }
1784
13
  }
1785
14
}
1786
1787
/// Return the desired alignment for ByVal aggregate
1788
/// function arguments in the caller parameter area. For X86, aggregates
1789
/// that contain SSE vectors are placed at 16-byte boundaries while the rest
1790
/// are at 4-byte boundaries.
1791
unsigned X86TargetLowering::getByValTypeAlignment(Type *Ty,
1792
145
                                                  const DataLayout &DL) const {
1793
145
  if (Subtarget.is64Bit()) {
1794
35
    // Max of 8 and alignment of type.
1795
35
    unsigned TyAlign = DL.getABITypeAlignment(Ty);
1796
35
    if (TyAlign > 8)
1797
0
      return TyAlign;
1798
35
    return 8;
1799
35
  }
1800
110
1801
110
  unsigned Align = 4;
1802
110
  if (Subtarget.hasSSE1())
1803
7
    getMaxByValAlign(Ty, Align);
1804
145
  return Align;
1805
145
}
1806
1807
/// Returns the target specific optimal type for load
1808
/// and store operations as a result of memset, memcpy, and memmove
1809
/// lowering. If DstAlign is zero that means it's safe to destination
1810
/// alignment can satisfy any constraint. Similarly if SrcAlign is zero it
1811
/// means there isn't a need to check it against alignment requirement,
1812
/// probably because the source does not need to be loaded. If 'IsMemset' is
1813
/// true, that means it's expanding a memset. If 'ZeroMemset' is true, that
1814
/// means it's a memset of zero. 'MemcpyStrSrc' indicates whether the memcpy
1815
/// source is constant so it does not need to be loaded.
1816
/// It returns EVT::Other if the type should be determined using generic
1817
/// target-independent logic.
1818
EVT
1819
X86TargetLowering::getOptimalMemOpType(uint64_t Size,
1820
                                       unsigned DstAlign, unsigned SrcAlign,
1821
                                       bool IsMemset, bool ZeroMemset,
1822
                                       bool MemcpyStrSrc,
1823
733
                                       MachineFunction &MF) const {
1824
733
  const Function *F = MF.getFunction();
1825
733
  if (!F->hasFnAttribute(Attribute::NoImplicitFloat)) {
1826
733
    if (Size >= 16 &&
1827
511
        (!Subtarget.isUnalignedMem16Slow() ||
1828
359
         ((DstAlign == 0 || DstAlign >= 16) &&
1829
733
          (SrcAlign == 0 || SrcAlign >= 16)))) {
1830
193
      // FIXME: Check if unaligned 32-byte accesses are slow.
1831
193
      if (Size >= 32 && Subtarget.hasAVX()) {
1832
64
        // Although this isn't a well-supported type for AVX1, we'll let
1833
64
        // legalization and shuffle lowering produce the optimal codegen. If we
1834
64
        // choose an optimal type with a vector element larger than a byte,
1835
64
        // getMemsetStores() may create an intermediate splat (using an integer
1836
64
        // multiply) before we splat as a vector.
1837
64
        return MVT::v32i8;
1838
64
      }
1839
129
      if (Subtarget.hasSSE2())
1840
116
        return MVT::v16i8;
1841
13
      // TODO: Can SSE1 handle a byte vector?
1842
13
      if (Subtarget.hasSSE1())
1843
5
        return MVT::v4f32;
1844
540
    } else 
if (540
(!IsMemset || 540
ZeroMemset74
) &&
!MemcpyStrSrc514
&&
Size >= 8472
&&
1845
540
               
!Subtarget.is64Bit()358
&&
Subtarget.hasSSE2()179
) {
1846
115
      // Do not use f64 to lower memcpy if source is string constant. It's
1847
115
      // better to use i32 to avoid the loads.
1848
115
      // Also, do not use f64 to lower memset unless this is a memset of zeros.
1849
115
      // The gymnastics of splatting a byte value into an XMM register and then
1850
115
      // only using 8-byte stores (because this is a CPU with slow unaligned
1851
115
      // 16-byte accesses) makes that a loser.
1852
115
      return MVT::f64;
1853
115
    }
1854
433
  }
1855
433
  // This is a compromise. If we reach here, unaligned accesses may be slow on
1856
433
  // this target. However, creating smaller, aligned accesses could be even
1857
433
  // slower and would certainly be a lot more code.
1858
433
  if (Subtarget.is64Bit() && Size >= 8)
1859
210
    return MVT::i64;
1860
223
  return MVT::i32;
1861
223
}
1862
1863
147
bool X86TargetLowering::isSafeMemOpType(MVT VT) const {
1864
147
  if (VT == MVT::f32)
1865
0
    return X86ScalarSSEf32;
1866
147
  else if (VT == MVT::f64)
1867
6
    return X86ScalarSSEf64;
1868
141
  return true;
1869
141
}
1870
1871
bool
1872
X86TargetLowering::allowsMisalignedMemoryAccesses(EVT VT,
1873
                                                  unsigned,
1874
                                                  unsigned,
1875
40.8k
                                                  bool *Fast) const {
1876
40.8k
  if (Fast) {
1877
20.0k
    switch (VT.getSizeInBits()) {
1878
2.77k
    default:
1879
2.77k
      // 8-byte and under are always assumed to be fast.
1880
2.77k
      *Fast = true;
1881
2.77k
      break;
1882
10.6k
    case 128:
1883
10.6k
      *Fast = !Subtarget.isUnalignedMem16Slow();
1884
10.6k
      break;
1885
6.67k
    case 256:
1886
6.67k
      *Fast = !Subtarget.isUnalignedMem32Slow();
1887
6.67k
      break;
1888
40.8k
    // TODO: What about AVX-512 (512-bit) accesses?
1889
40.8k
    }
1890
40.8k
  }
1891
40.8k
  // Misaligned accesses of any size are always allowed.
1892
40.8k
  return true;
1893
40.8k
}
1894
1895
/// Return the entry encoding for a jump table in the
1896
/// current function.  The returned value is a member of the
1897
/// MachineJumpTableInfo::JTEntryKind enum.
1898
559
unsigned X86TargetLowering::getJumpTableEncoding() const {
1899
559
  // In GOT pic mode, each entry in the jump table is emitted as a @GOTOFF
1900
559
  // symbol.
1901
559
  if (isPositionIndependent() && Subtarget.isPICStyleGOT())
1902
2
    return MachineJumpTableInfo::EK_Custom32;
1903
557
1904
557
  // Otherwise, use the normal jump table encoding heuristics.
1905
557
  return TargetLowering::getJumpTableEncoding();
1906
557
}
1907
1908
67.8k
bool X86TargetLowering::useSoftFloat() const {
1909
67.8k
  return Subtarget.useSoftFloat();
1910
67.8k
}
1911
1912
void X86TargetLowering::markLibCallAttributes(MachineFunction *MF, unsigned CC,
1913
2.11k
                                              ArgListTy &Args) const {
1914
2.11k
1915
2.11k
  // Only relabel X86-32 for C / Stdcall CCs.
1916
2.11k
  if (Subtarget.is64Bit())
1917
1.28k
    return;
1918
823
  if (CC != CallingConv::C && CC != CallingConv::X86_StdCall)
1919
0
    return;
1920
823
  unsigned ParamRegs = 0;
1921
823
  if (auto *M = MF->getFunction()->getParent())
1922
823
    ParamRegs = M->getNumberRegisterParameters();
1923
823
1924
823
  // Mark the first N integer arguments as being passed in registers.
1925
1.60k
  for (unsigned Idx = 0; Idx < Args.size(); Idx++) {
1926
1.18k
    Type *T = Args[Idx].Ty;
1927
1.18k
    if (T->isPointerTy() || T->isIntegerTy())
1928
792
      if (MF->getDataLayout().getTypeAllocSize(T) <= 8) {
1929
408
        unsigned numRegs = 1;
1930
408
        if (MF->getDataLayout().getTypeAllocSize(T) > 4)
1931
90
          numRegs = 2;
1932
408
        if (ParamRegs < numRegs)
1933
396
          return;
1934
12
        ParamRegs -= numRegs;
1935
12
        Args[Idx].IsInReg = true;
1936
12
      }
1937
1.18k
  }
1938
2.11k
}
1939
1940
const MCExpr *
1941
X86TargetLowering::LowerCustomJumpTableEntry(const MachineJumpTableInfo *MJTI,
1942
                                             const MachineBasicBlock *MBB,
1943
48
                                             unsigned uid,MCContext &Ctx) const{
1944
48
  assert(isPositionIndependent() && Subtarget.isPICStyleGOT());
1945
48
  // In 32-bit ELF systems, our jump table entries are formed with @GOTOFF
1946
48
  // entries.
1947
48
  return MCSymbolRefExpr::create(MBB->getSymbol(),
1948
48
                                 MCSymbolRefExpr::VK_GOTOFF, Ctx);
1949
48
}
1950
1951
/// Returns relocation base for the given PIC jumptable.
1952
SDValue X86TargetLowering::getPICJumpTableRelocBase(SDValue Table,
1953
458
                                                    SelectionDAG &DAG) const {
1954
458
  if (!Subtarget.is64Bit())
1955
458
    // This doesn't have SDLoc associated with it, but is not really the
1956
458
    // same as a Register.
1957
164
    return DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(),
1958
164
                       getPointerTy(DAG.getDataLayout()));
1959
294
  return Table;
1960
294
}
1961
1962
/// This returns the relocation base for the given PIC jumptable,
1963
/// the same as getPICJumpTableRelocBase, but as an MCExpr.
1964
const MCExpr *X86TargetLowering::
1965
getPICJumpTableRelocBaseExpr(const MachineFunction *MF, unsigned JTI,
1966
495
                             MCContext &Ctx) const {
1967
495
  // X86-64 uses RIP relative addressing based on the jump table label.
1968
495
  if (Subtarget.isPICStyleRIPRel())
1969
326
    return TargetLowering::getPICJumpTableRelocBaseExpr(MF, JTI, Ctx);
1970
169
1971
169
  // Otherwise, the reference is relative to the PIC base.
1972
169
  return MCSymbolRefExpr::create(MF->getPICBaseSymbol(), Ctx);
1973
169
}
1974
1975
std::pair<const TargetRegisterClass *, uint8_t>
1976
X86TargetLowering::findRepresentativeClass(const TargetRegisterInfo *TRI,
1977
1.08M
                                           MVT VT) const {
1978
1.08M
  const TargetRegisterClass *RRC = nullptr;
1979
1.08M
  uint8_t Cost = 1;
1980
1.08M
  switch (VT.SimpleTy) {
1981
839k
  default:
1982
839k
    return TargetLowering::findRepresentativeClass(TRI, VT);
1983
38.6k
  case MVT::i8: case MVT::i16: case MVT::i32: case MVT::i64:
1984
38.6k
    RRC = Subtarget.is64Bit() ? &X86::GR64RegClass : &X86::GR32RegClass;
1985
38.6k
    break;
1986
9.65k
  case MVT::x86mmx:
1987
9.65k
    RRC = &X86::VR64RegClass;
1988
9.65k
    break;
1989
193k
  case MVT::f32: case MVT::f64:
1990
193k
  case MVT::v16i8: case MVT::v8i16: case MVT::v4i32: case MVT::v2i64:
1991
193k
  case MVT::v4f32: case MVT::v2f64:
1992
193k
  case MVT::v32i8: case MVT::v16i16: case MVT::v8i32: case MVT::v4i64:
1993
193k
  case MVT::v8f32: case MVT::v4f64:
1994
193k
  case MVT::v64i8: case MVT::v32i16: case MVT::v16i32: case MVT::v8i64:
1995
193k
  case MVT::v16f32: case MVT::v8f64:
1996
193k
    RRC = &X86::VR128XRegClass;
1997
193k
    break;
1998
241k
  }
1999
241k
  return std::make_pair(RRC, Cost);
2000
241k
}
2001
2002
405
unsigned X86TargetLowering::getAddressSpace() const {
2003
405
  if (Subtarget.is64Bit())
2004
270
    return (getTargetMachine().getCodeModel() == CodeModel::Kernel) ? 256 : 257;
2005
135
  return 256;
2006
135
}
2007
2008
1.11k
static bool hasStackGuardSlotTLS(const Triple &TargetTriple) {
2009
733
  return TargetTriple.isOSGlibc() || TargetTriple.isOSFuchsia() ||
2010
731
         
(TargetTriple.isAndroid() && 731
!TargetTriple.isAndroidVersionLT(17)17
);
2011
1.11k
}
2012
2013
static Constant* SegmentOffset(IRBuilder<> &IRB,
2014
405
                               unsigned Offset, unsigned AddressSpace) {
2015
405
  return ConstantExpr::getIntToPtr(
2016
405
      ConstantInt::get(Type::getInt32Ty(IRB.getContext()), Offset),
2017
405
      Type::getInt8PtrTy(IRB.getContext())->getPointerTo(AddressSpace));
2018
405
}
2019
2020
848
Value *X86TargetLowering::getIRStackGuard(IRBuilder<> &IRB) const {
2021
848
  // glibc, bionic, and Fuchsia have a special slot for the stack guard in
2022
848
  // tcbhead_t; use it instead of the usual global variable (see
2023
848
  // sysdeps/{i386,x86_64}/nptl/tls.h)
2024
848
  if (hasStackGuardSlotTLS(Subtarget.getTargetTriple())) {
2025
395
    if (Subtarget.isTargetFuchsia()) {
2026
2
      // <zircon/tls.h> defines ZX_TLS_STACK_GUARD_OFFSET with this value.
2027
2
      return SegmentOffset(IRB, 0x10, getAddressSpace());
2028
0
    } else {
2029
393
      // %fs:0x28, unless we're using a Kernel code model, in which case
2030
393
      // it's %gs:0x28.  gs:0x14 on i386.
2031
393
      unsigned Offset = (Subtarget.is64Bit()) ? 0x28 : 0x14;
2032
393
      return SegmentOffset(IRB, Offset, getAddressSpace());
2033
393
    }
2034
453
  }
2035
453
2036
453
  return TargetLowering::getIRStackGuard(IRB);
2037
453
}
2038
2039
329
void X86TargetLowering::insertSSPDeclarations(Module &M) const {
2040
329
  // MSVC CRT provides functionalities for stack protection.
2041
329
  if (Subtarget.getTargetTriple().isOSMSVCRT()) {
2042
63
    // MSVC CRT has a global variable holding security cookie.
2043
63
    M.getOrInsertGlobal("__security_cookie",
2044
63
                        Type::getInt8PtrTy(M.getContext()));
2045
63
2046
63
    // MSVC CRT has a function to validate security cookie.
2047
63
    auto *SecurityCheckCookie = cast<Function>(
2048
63
        M.getOrInsertFunction("__security_check_cookie",
2049
63
                              Type::getVoidTy(M.getContext()),
2050
63
                              Type::getInt8PtrTy(M.getContext())));
2051
63
    SecurityCheckCookie->setCallingConv(CallingConv::X86_FastCall);
2052
63
    SecurityCheckCookie->addAttribute(1, Attribute::AttrKind::InReg);
2053
63
    return;
2054
63
  }
2055
266
  // glibc, bionic, and Fuchsia have a special slot for the stack guard.
2056
266
  if (hasStackGuardSlotTLS(Subtarget.getTargetTriple()))
2057
0
    return;
2058
266
  TargetLowering::insertSSPDeclarations(M);
2059
266
}
2060
2061
690
Value *X86TargetLowering::getSDagStackGuard(const Module &M) const {
2062
690
  // MSVC CRT has a global variable holding security cookie.
2063
690
  if (Subtarget.getTargetTriple().isOSMSVCRT())
2064
63
    return M.getGlobalVariable("__security_cookie");
2065
627
  return TargetLowering::getSDagStackGuard(M);
2066
627
}
2067
2068
846
Value *X86TargetLowering::getSSPStackGuardCheck(const Module &M) const {
2069
846
  // MSVC CRT has a function to validate security cookie.
2070
846
  if (Subtarget.getTargetTriple().isOSMSVCRT())
2071
133
    return M.getFunction("__security_check_cookie");
2072
713
  return TargetLowering::getSSPStackGuardCheck(M);
2073
713
}
2074
2075
149
Value *X86TargetLowering::getSafeStackPointerLocation(IRBuilder<> &IRB) const {
2076
149
  if (Subtarget.getTargetTriple().isOSContiki())
2077
4
    return getDefaultSafeStackPointerLocation(IRB, false);
2078
145
2079
145
  // Android provides a fixed TLS slot for the SafeStack pointer. See the
2080
145
  // definition of TLS_SLOT_SAFESTACK in
2081
145
  // https://android.googlesource.com/platform/bionic/+/master/libc/private/bionic_tls.h
2082
145
  if (Subtarget.isTargetAndroid()) {
2083
7
    // %fs:0x48, unless we're using a Kernel code model, in which case it's %gs:
2084
7
    // %gs:0x24 on i386
2085
7
    unsigned Offset = (Subtarget.is64Bit()) ? 0x48 : 0x24;
2086
7
    return SegmentOffset(IRB, Offset, getAddressSpace());
2087
7
  }
2088
138
2089
138
  // Fuchsia is similar.
2090
138
  if (Subtarget.isTargetFuchsia()) {
2091
3
    // <zircon/tls.h> defines ZX_TLS_UNSAFE_SP_OFFSET with this value.
2092
3
    return SegmentOffset(IRB, 0x18, getAddressSpace());
2093
3
  }
2094
135
2095
135
  return TargetLowering::getSafeStackPointerLocation(IRB);
2096
135
}
2097
2098
bool X86TargetLowering::isNoopAddrSpaceCast(unsigned SrcAS,
2099
136
                                            unsigned DestAS) const {
2100
136
  assert(SrcAS != DestAS && "Expected different address spaces!");
2101
136
2102
136
  return SrcAS < 256 && DestAS < 256;
2103
136
}
2104
2105
//===----------------------------------------------------------------------===//
2106
//               Return Value Calling Convention Implementation
2107
//===----------------------------------------------------------------------===//
2108
2109
#include "X86GenCallingConv.inc"
2110
2111
bool X86TargetLowering::CanLowerReturn(
2112
    CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg,
2113
103k
    const SmallVectorImpl<ISD::OutputArg> &Outs, LLVMContext &Context) const {
2114
103k
  SmallVector<CCValAssign, 16> RVLocs;
2115
103k
  CCState CCInfo(CallConv, isVarArg, MF, RVLocs, Context);
2116
103k
  return CCInfo.CheckReturn(Outs, RetCC_X86);
2117
103k
}
2118
2119
170
const MCPhysReg *X86TargetLowering::getScratchRegisters(CallingConv::ID) const {
2120
170
  static const MCPhysReg ScratchRegs[] = { X86::R11, 0 };
2121
170
  return ScratchRegs;
2122
170
}
2123
2124
/// Lowers mask values (v*i1) to the local register values
2125
/// \returns DAG node after lowering to register type
2126
static SDValue lowerMasksToReg(const SDValue &ValArg, const EVT &ValLoc,
2127
246
                               const SDLoc &Dl, SelectionDAG &DAG) {
2128
246
  EVT ValVT = ValArg.getValueType();
2129
246
2130
246
  if ((ValVT == MVT::v8i1 && (ValLoc == MVT::i8 || ValLoc == MVT::i32)) ||
2131
246
      (ValVT == MVT::v16i1 && (ValLoc == MVT::i16 || ValLoc == MVT::i32))) {
2132
24
    // Two stage lowering might be required
2133
24
    // bitcast:   v8i1 -> i8 / v16i1 -> i16
2134
24
    // anyextend: i8   -> i32 / i16   -> i32
2135
24
    EVT TempValLoc = ValVT == MVT::v8i1 ? MVT::i8 : MVT::i16;
2136
24
    SDValue ValToCopy = DAG.getBitcast(TempValLoc, ValArg);
2137
24
    if (ValLoc == MVT::i32)
2138
18
      ValToCopy = DAG.getNode(ISD::ANY_EXTEND, Dl, ValLoc, ValToCopy);
2139
24
    return ValToCopy;
2140
222
  } else if ((ValVT == MVT::v32i1 && ValLoc == MVT::i32) ||
2141
222
             (ValVT == MVT::v64i1 && ValLoc == MVT::i64)) {
2142
54
    // One stage lowering is required
2143
54
    // bitcast:   v32i1 -> i32 / v64i1 -> i64
2144
54
    return DAG.getBitcast(ValLoc, ValArg);
2145
54
  } else
2146
168
    return DAG.getNode(ISD::SIGN_EXTEND, Dl, ValLoc, ValArg);
2147
0
}
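
Aside (not part of this file): the two-stage lowering described in the comments above (bitcast v8i1 -> i8, then any-extend i8 -> i32) has a direct scalar analogue, sketched below with hypothetical names purely as an illustration.

#include <cstdint>

static uint32_t lower_v8i1_to_i32(const bool lanes[8]) {
  uint8_t packed = 0;                  // "bitcast v8i1 -> i8": pack the lanes
  for (int i = 0; i < 8; ++i)
    packed |= uint8_t(lanes[i]) << i;
  return uint32_t(packed);             // "anyextend i8 -> i32": widen to the location type
}
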
2148
2149
/// Breaks v64i1 value into two registers and adds the new node to the DAG
2150
static void Passv64i1ArgInRegs(
2151
    const SDLoc &Dl, SelectionDAG &DAG, SDValue Chain, SDValue &Arg,
2152
    SmallVector<std::pair<unsigned, SDValue>, 8> &RegsToPass, CCValAssign &VA,
2153
3
    CCValAssign &NextVA, const X86Subtarget &Subtarget) {
2154
3
  assert((Subtarget.hasBWI() || Subtarget.hasBMI()) &&
2155
3
         "Expected AVX512BW or AVX512BMI target!");
2156
3
  assert(Subtarget.is32Bit() && "Expecting 32 bit target");
2157
3
  assert(Arg.getValueType() == MVT::i64 && "Expecting 64 bit value");
2158
3
  assert(VA.isRegLoc() && NextVA.isRegLoc() &&
2159
3
         "The value should reside in two registers");
2160
3
2161
3
  // Before splitting the value we cast it to i64
2162
3
  Arg = DAG.getBitcast(MVT::i64, Arg);
2163
3
2164
3
  // Splitting the value into two i32 types
2165
3
  SDValue Lo, Hi;
2166
3
  Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, Dl, MVT::i32, Arg,
2167
3
                   DAG.getConstant(0, Dl, MVT::i32));
2168
3
  Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, Dl, MVT::i32, Arg,
2169
3
                   DAG.getConstant(1, Dl, MVT::i32));
2170
3
2171
3
  // Attach the two i32 types into corresponding registers
2172
3
  RegsToPass.push_back(std::make_pair(VA.getLocReg(), Lo));
2173
3
  RegsToPass.push_back(std::make_pair(NextVA.getLocReg(), Hi));
2174
3
}
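
Aside (not part of this file): the pair of EXTRACT_ELEMENT nodes above splits the v64i1 value, viewed as an i64, into Lo/Hi i32 halves that travel in two 32-bit registers. A scalar sketch of the same computation (names hypothetical):

#include <cstdint>
#include <utility>

static std::pair<uint32_t, uint32_t> split_v64i1(uint64_t mask) {
  uint32_t lo = uint32_t(mask);         // EXTRACT_ELEMENT ..., 0
  uint32_t hi = uint32_t(mask >> 32);   // EXTRACT_ELEMENT ..., 1
  return {lo, hi};
}
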
2175
2176
SDValue
2177
X86TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
2178
                               bool isVarArg,
2179
                               const SmallVectorImpl<ISD::OutputArg> &Outs,
2180
                               const SmallVectorImpl<SDValue> &OutVals,
2181
67.2k
                               const SDLoc &dl, SelectionDAG &DAG) const {
2182
67.2k
  MachineFunction &MF = DAG.getMachineFunction();
2183
67.2k
  X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
2184
67.2k
2185
67.2k
  // In some cases we need to disable registers from the default CSR list.
2186
67.2k
  // For example, when they are used for argument passing.
2187
67.2k
  bool ShouldDisableCalleeSavedRegister =
2188
67.2k
      CallConv == CallingConv::X86_RegCall ||
2189
67.1k
      MF.getFunction()->hasFnAttribute("no_caller_saved_registers");
2190
67.2k
2191
67.2k
  if (CallConv == CallingConv::X86_INTR && !Outs.empty())
2192
0
    report_fatal_error("X86 interrupts may not return any value");
2193
67.2k
2194
67.2k
  SmallVector<CCValAssign, 16> RVLocs;
2195
67.2k
  CCState CCInfo(CallConv, isVarArg, MF, RVLocs, *DAG.getContext());
2196
67.2k
  CCInfo.AnalyzeReturn(Outs, RetCC_X86);
2197
67.2k
2198
67.2k
  SDValue Flag;
2199
67.2k
  SmallVector<SDValue, 6> RetOps;
2200
67.2k
  RetOps.push_back(Chain); // Operand #0 = Chain (updated below)
2201
67.2k
  // Operand #1 = Bytes To Pop
2202
67.2k
  RetOps.push_back(DAG.getTargetConstant(FuncInfo->getBytesToPopOnReturn(), dl,
2203
67.2k
                   MVT::i32));
2204
67.2k
2205
67.2k
  // Copy the result values into the output registers.
2206
126k
  for (unsigned I = 0, OutsIndex = 0, E = RVLocs.size(); I != E;
2207
67.2k
       ++I, ++OutsIndex) {
2208
59.6k
    CCValAssign &VA = RVLocs[I];
2209
59.6k
    assert(VA.isRegLoc() && "Can only return in registers!");
2210
59.6k
2211
59.6k
    // Add the register to the CalleeSaveDisableRegs list.
2212
59.6k
    if (ShouldDisableCalleeSavedRegister)
2213
187
      MF.getRegInfo().disableCalleeSavedRegister(VA.getLocReg());
2214
59.6k
2215
59.6k
    SDValue ValToCopy = OutVals[OutsIndex];
2216
59.6k
    EVT ValVT = ValToCopy.getValueType();
2217
59.6k
2218
59.6k
    // Promote values to the appropriate types.
2219
59.6k
    if (VA.getLocInfo() == CCValAssign::SExt)
2220
0
      ValToCopy = DAG.getNode(ISD::SIGN_EXTEND, dl, VA.getLocVT(), ValToCopy);
2221
59.6k
    else if (VA.getLocInfo() == CCValAssign::ZExt)
2222
0
      ValToCopy = DAG.getNode(ISD::ZERO_EXTEND, dl, VA.getLocVT(), ValToCopy);
2223
59.6k
    else if (VA.getLocInfo() == CCValAssign::AExt) {
2224
152
      if (ValVT.isVector() && ValVT.getVectorElementType() == MVT::i1)
2225
143
        ValToCopy = lowerMasksToReg(ValToCopy, VA.getLocVT(), dl, DAG);
2226
152
      else
2227
9
        ValToCopy = DAG.getNode(ISD::ANY_EXTEND, dl, VA.getLocVT(), ValToCopy);
2228
152
    }
2229
59.4k
    else if (VA.getLocInfo() == CCValAssign::BCvt)
2230
0
      ValToCopy = DAG.getBitcast(VA.getLocVT(), ValToCopy);
2231
59.6k
2232
59.6k
    assert(VA.getLocInfo() != CCValAssign::FPExt &&
2233
59.6k
           "Unexpected FP-extend for return value.");
2234
59.6k
2235
59.6k
    // If this is x86-64, and we disabled SSE, we can't return FP values,
2236
59.6k
    // or SSE or MMX vectors.
2237
59.6k
    if ((ValVT == MVT::f32 || ValVT == MVT::f64 ||
2238
59.6k
         VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) &&
2239
59.6k
        (Subtarget.is64Bit() && !Subtarget.hasSSE1())) {
2240
1
      errorUnsupported(DAG, dl, "SSE register return with SSE disabled");
2241
1
      VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts.
2242
59.6k
    } else if (ValVT == MVT::f64 &&
2243
59.6k
               (Subtarget.is64Bit() && !Subtarget.hasSSE2())) {
2244
0
      // Likewise we can't return F64 values with SSE1 only.  gcc does so, but
2245
0
      // llvm-gcc has never done it right and no one has noticed, so this
2246
0
      // should be OK for now.
2247
0
      errorUnsupported(DAG, dl, "SSE2 register return with SSE2 disabled");
2248
0
      VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts.
2249
0
    }
2250
59.6k
2251
59.6k
    // Returns in ST0/ST1 are handled specially: these are pushed as operands to
2252
59.6k
    // the RET instruction and handled by the FP Stackifier.
2253
59.6k
    if (VA.getLocReg() == X86::FP0 ||
2254
59.6k
        VA.getLocReg() == X86::FP1) {
2255
724
      // If this is a copy from an xmm register to ST(0), use an FPExtend to
2256
724
      // change the value to the FP stack register class.
2257
724
      if (isScalarFPTypeInSSEReg(VA.getValVT()))
2258
202
        ValToCopy = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f80, ValToCopy);
2259
724
      RetOps.push_back(ValToCopy);
2260
724
      // Don't emit a copytoreg.
2261
724
      continue;
2262
724
    }
2263
58.9k
2264
58.9k
    // 64-bit vector (MMX) values are returned in XMM0 / XMM1 except for v1i64
2265
58.9k
    // which is returned in RAX / RDX.
2266
58.9k
    if (Subtarget.is64Bit()) {
2267
49.5k
      if (ValVT == MVT::x86mmx) {
2268
102
        if (VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) {
2269
102
          ValToCopy = DAG.getBitcast(MVT::i64, ValToCopy);
2270
102
          ValToCopy = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64,
2271
102
                                  ValToCopy);
2272
102
          // If we don't have SSE2 available, convert to v4f32 so the generated
2273
102
          // register is legal.
2274
102
          if (!Subtarget.hasSSE2())
2275
0
            ValToCopy = DAG.getBitcast(MVT::v4f32, ValToCopy);
2276
102
        }
2277
102
      }
2278
49.5k
    }
2279
58.9k
2280
58.9k
    SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
2281
58.9k
2282
58.9k
    if (VA.needsCustom()) {
2283
1
      assert(VA.getValVT() == MVT::v64i1 &&
2284
1
             "Currently the only custom case is when we split v64i1 to 2 regs");
2285
1
2286
1
      Passv64i1ArgInRegs(dl, DAG, Chain, ValToCopy, RegsToPass, VA, RVLocs[++I],
2287
1
                         Subtarget);
2288
1
2289
1
      assert(2 == RegsToPass.size() &&
2290
1
             "Expecting two registers after Pass64BitArgInRegs");
2291
1
2292
1
      // Add the second register to the CalleeSaveDisableRegs list.
2293
1
      if (ShouldDisableCalleeSavedRegister)
2294
1
        MF.getRegInfo().disableCalleeSavedRegister(RVLocs[I].getLocReg());
2295
58.9k
    } else {
2296
58.9k
      RegsToPass.push_back(std::make_pair(VA.getLocReg(), ValToCopy));
2297
58.9k
    }
2298
58.9k
2299
58.9k
    // Add nodes to the DAG and add the values into the RetOps list
2300
58.9k
    for (auto &Reg : RegsToPass) {
2301
58.9k
      Chain = DAG.getCopyToReg(Chain, dl, Reg.first, Reg.second, Flag);
2302
58.9k
      Flag = Chain.getValue(1);
2303
58.9k
      RetOps.push_back(DAG.getRegister(Reg.first, Reg.second.getValueType()));
2304
58.9k
    }
2305
59.6k
  }
2306
67.2k
2307
67.2k
  // Swift calling convention does not require we copy the sret argument
2308
67.2k
  // into %rax/%eax for the return, and SRetReturnReg is not set for Swift.
2309
67.2k
2310
67.2k
  // All x86 ABIs require that for returning structs by value we copy
2311
67.2k
  // the sret argument into %rax/%eax (depending on ABI) for the return.
2312
67.2k
  // We saved the argument into a virtual register in the entry block,
2313
67.2k
  // so now we copy the value out and into %rax/%eax.
2314
67.2k
  //
2315
67.2k
  // Checking Function.hasStructRetAttr() here is insufficient because the IR
2316
67.2k
  // may not have an explicit sret argument. If FuncInfo.CanLowerReturn is
2317
67.2k
  // false, then an sret argument may be implicitly inserted in the SelDAG. In
2318
67.2k
  // either case FuncInfo->setSRetReturnReg() will have been called.
2319
67.2k
  if (unsigned SRetReg = FuncInfo->getSRetReturnReg()) {
2320
527
    // When we have both sret and another return value, we should use the
2321
527
    // original Chain stored in RetOps[0], instead of the current Chain updated
2322
527
    // in the above loop. If we only have sret, RetOps[0] equals to Chain.
2323
527
2324
527
    // For the case of sret and another return value, we have
2325
527
    //   Chain_0 at the function entry
2326
527
    //   Chain_1 = getCopyToReg(Chain_0) in the above loop
2327
527
    // If we use Chain_1 in getCopyFromReg, we will have
2328
527
    //   Val = getCopyFromReg(Chain_1)
2329
527
    //   Chain_2 = getCopyToReg(Chain_1, Val) from below
2330
527
2331
527
    // getCopyToReg(Chain_0) will be glued together with
2332
527
    // getCopyToReg(Chain_1, Val) into Unit A, getCopyFromReg(Chain_1) will be
2333
527
    // in Unit B, and we will have cyclic dependency between Unit A and Unit B:
2334
527
    //   Data dependency from Unit B to Unit A due to usage of Val in
2335
527
    //     getCopyToReg(Chain_1, Val)
2336
527
    //   Chain dependency from Unit A to Unit B
2337
527
2338
527
    // So here, we use RetOps[0] (i.e Chain_0) for getCopyFromReg.
2339
527
    SDValue Val = DAG.getCopyFromReg(RetOps[0], dl, SRetReg,
2340
527
                                     getPointerTy(MF.getDataLayout()));
2341
527
2342
527
    unsigned RetValReg
2343
157
        = (Subtarget.is64Bit() && !Subtarget.isTarget64BitILP32()) ?
2344
527
          X86::RAX : X86::EAX;
2345
527
    Chain = DAG.getCopyToReg(Chain, dl, RetValReg, Val, Flag);
2346
527
    Flag = Chain.getValue(1);
2347
527
2348
527
    // RAX/EAX now acts like a return value.
2349
527
    RetOps.push_back(
2350
527
        DAG.getRegister(RetValReg, getPointerTy(DAG.getDataLayout())));
2351
527
2352
527
    // Add the returned register to the CalleeSaveDisableRegs list.
2353
527
    if (ShouldDisableCalleeSavedRegister)
2354
0
      MF.getRegInfo().disableCalleeSavedRegister(RetValReg);
2355
527
  }
2356
67.2k
2357
67.2k
  const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
2358
67.2k
  const MCPhysReg *I =
2359
67.2k
      TRI->getCalleeSavedRegsViaCopy(&DAG.getMachineFunction());
2360
67.2k
  if (I) {
2361
130
    for (; *I; ++I) {
2362
120
      if (X86::GR64RegClass.contains(*I))
2363
120
        RetOps.push_back(DAG.getRegister(*I, MVT::i64));
2364
120
      else
2365
0
        llvm_unreachable("Unexpected register class in CSRsViaCopy!");
2366
120
    }
2367
10
  }
2368
67.2k
2369
67.2k
  RetOps[0] = Chain;  // Update chain.
2370
67.2k
2371
67.2k
  // Add the flag if we have it.
2372
67.2k
  if (Flag.getNode())
2373
54.8k
    RetOps.push_back(Flag);
2374
67.2k
2375
67.2k
  X86ISD::NodeType opcode = X86ISD::RET_FLAG;
2376
67.2k
  if (CallConv == CallingConv::X86_INTR)
2377
25
    opcode = X86ISD::IRET;
2378
67.2k
  return DAG.getNode(opcode, dl, MVT::Other, RetOps);
2379
67.2k
}
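
Aside (not part of this file): the sret handling above means that a struct too large to return in registers is returned through a hidden pointer argument, and that pointer is copied back into %rax (%eax on 32-bit) before the RET. A hedged C++ illustration of a function that gets this treatment under the SysV x86-64 ABI (type and function names made up):

struct Big { long v[4]; };

Big makeBig(long x) {         // callers effectively invoke: void makeBig(Big *sret, long x)
  Big b{{x, x + 1, x + 2, x + 3}};
  return b;                   // lowered roughly as: *sret = b; %rax = sret; ret
}
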
2380
2381
666
bool X86TargetLowering::isUsedByReturnOnly(SDNode *N, SDValue &Chain) const {
2382
666
  if (N->getNumValues() != 1 || !N->hasNUsesOfValue(1, 0))
2383
4
    return false;
2384
662
2385
662
  SDValue TCChain = Chain;
2386
662
  SDNode *Copy = *N->use_begin();
2387
662
  if (Copy->getOpcode() == ISD::CopyToReg) {
2388
62
    // If the copy has a glue operand, we conservatively assume it isn't safe to
2389
62
    // perform a tail call.
2390
62
    if (Copy->getOperand(Copy->getNumOperands()-1).getValueType() == MVT::Glue)
2391
2
      return false;
2392
60
    TCChain = Copy->getOperand(0);
2393
662
  } else if (Copy->getOpcode() != ISD::FP_EXTEND)
2394
569
    return false;
2395
91
2396
91
  bool HasRet = false;
2397
91
  for (SDNode::use_iterator UI = Copy->use_begin(), UE = Copy->use_end();
2398
203
       UI != UE; ++UI) {
2399
138
    if (UI->getOpcode() != X86ISD::RET_FLAG)
2400
25
      return false;
2401
113
    // If we are returning more than one value, we can definitely
2402
113
    // not make a tail call see PR19530
2403
113
    if (UI->getNumOperands() > 4)
2404
1
      return false;
2405
112
    if (UI->getNumOperands() == 4 &&
2406
102
        UI->getOperand(UI->getNumOperands()-1).getValueType() != MVT::Glue)
2407
0
      return false;
2408
112
    HasRet = true;
2409
112
  }
2410
91
2411
65
  if (!HasRet)
2412
4
    return false;
2413
61
2414
61
  Chain = TCChain;
2415
61
  return true;
2416
61
}
2417
2418
EVT X86TargetLowering::getTypeForExtReturn(LLVMContext &Context, EVT VT,
2419
2.83k
                                           ISD::NodeType ExtendKind) const {
2420
2.83k
  MVT ReturnMVT = MVT::i32;
2421
2.83k
2422
2.83k
  bool Darwin = Subtarget.getTargetTriple().isOSDarwin();
2423
2.83k
  if (VT == MVT::i1 || (!Darwin && (VT == MVT::i8 || VT == MVT::i16))) {
2424
1.51k
    // The ABI does not require i1, i8 or i16 to be extended.
2425
1.51k
    //
2426
1.51k
    // On Darwin, there is code in the wild relying on Clang's old behaviour of
2427
1.51k
    // always extending i8/i16 return values, so keep doing that for now.
2428
1.51k
    // (PR26665).
2429
1.51k
    ReturnMVT = MVT::i8;
2430
1.51k
  }
2431
2.83k
2432
2.83k
  EVT MinVT = getRegisterType(Context, ReturnMVT);
2433
2.83k
  return VT.bitsLT(MinVT) ? MinVT : VT;
2434
2.83k
}
2435
2436
/// Reads two 32 bit registers and creates a 64 bit mask value.
2437
/// \param VA The current 32 bit value that needs to be assigned.
2439
/// \param NextVA The next 32 bit value that needs to be assigned.
2439
/// \param Root The parent DAG node.
2440
/// \param [in,out] InFlag Represents SDvalue in the parent DAG node for
2441
///                        glue purposes. In the case the DAG is already using
2442
///                        physical register instead of virtual, we should glue
2443
///                        our new SDValue to InFlag SDvalue.
2444
/// \return a new SDvalue of size 64bit.
2445
static SDValue getv64i1Argument(CCValAssign &VA, CCValAssign &NextVA,
2446
                                SDValue &Root, SelectionDAG &DAG,
2447
                                const SDLoc &Dl, const X86Subtarget &Subtarget,
2448
3
                                SDValue *InFlag = nullptr) {
2449
3
  assert((Subtarget.hasBWI()) && "Expected AVX512BW target!");
2450
3
  assert(Subtarget.is32Bit() && "Expecting 32 bit target");
2451
3
  assert(VA.getValVT() == MVT::v64i1 &&
2452
3
         "Expecting first location of 64 bit width type");
2453
3
  assert(NextVA.getValVT() == VA.getValVT() &&
2454
3
         "The locations should have the same type");
2455
3
  assert(VA.isRegLoc() && NextVA.isRegLoc() &&
2456
3
         "The values should reside in two registers");
2457
3
2458
3
  SDValue Lo, Hi;
2459
3
  unsigned Reg;
2460
3
  SDValue ArgValueLo, ArgValueHi;
2461
3
2462
3
  MachineFunction &MF = DAG.getMachineFunction();
2463
3
  const TargetRegisterClass *RC = &X86::GR32RegClass;
2464
3
2465
3
  // Read a 32 bit value from the registers
2466
3
  if (nullptr == InFlag) {
2467
2
    // When no physical register is present,
2468
2
    // create an intermediate virtual register
2469
2
    Reg = MF.addLiveIn(VA.getLocReg(), RC);
2470
2
    ArgValueLo = DAG.getCopyFromReg(Root, Dl, Reg, MVT::i32);
2471
2
    Reg = MF.addLiveIn(NextVA.getLocReg(), RC);
2472
2
    ArgValueHi = DAG.getCopyFromReg(Root, Dl, Reg, MVT::i32);
2473
3
  } else {
2474
1
    // When a physical register is available read the value from it and glue
2475
1
    // the reads together.
2476
1
    ArgValueLo =
2477
1
      DAG.getCopyFromReg(Root, Dl, VA.getLocReg(), MVT::i32, *InFlag);
2478
1
    *InFlag = ArgValueLo.getValue(2);
2479
1
    ArgValueHi =
2480
1
      DAG.getCopyFromReg(Root, Dl, NextVA.getLocReg(), MVT::i32, *InFlag);
2481
1
    *InFlag = ArgValueHi.getValue(2);
2482
1
  }
2483
3
2484
3
  // Convert the i32 type into v32i1 type
2485
3
  Lo = DAG.getBitcast(MVT::v32i1, ArgValueLo);
2486
3
2487
3
  // Convert the i32 type into v32i1 type
2488
3
  Hi = DAG.getBitcast(MVT::v32i1, ArgValueHi);
2489
3
2490
3
  // Concatenate the two values together
2491
3
  return DAG.getNode(ISD::CONCAT_VECTORS, Dl, MVT::v64i1, Lo, Hi);
2492
3
}
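
Aside (not part of this file): this is the inverse of the v64i1 split shown earlier. The two 32-bit register halves are reassembled into one 64-bit mask, mirroring the v32i1 + v32i1 -> v64i1 CONCAT_VECTORS above. A scalar sketch (function name hypothetical):

#include <cstdint>

static uint64_t join_v64i1(uint32_t lo, uint32_t hi) {
  return (uint64_t(hi) << 32) | uint64_t(lo);
}
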
2493
2494
/// The function will lower a register of various sizes (8/16/32/64)
2495
/// to a mask value of the expected size (v8i1/v16i1/v32i1/v64i1)
2496
/// \returns a DAG node contains the operand after lowering to mask type.
2497
static SDValue lowerRegToMasks(const SDValue &ValArg, const EVT &ValVT,
2498
                               const EVT &ValLoc, const SDLoc &Dl,
2499
71
                               SelectionDAG &DAG) {
2500
71
  SDValue ValReturned = ValArg;
2501
71
2502
71
  if (ValVT == MVT::v1i1)
2503
7
    return DAG.getNode(ISD::SCALAR_TO_VECTOR, Dl, MVT::v1i1, ValReturned);
2504
64
2505
64
  
if (ValVT == MVT::v64i1) {
2506
28
    // In 32 bit machine, this case is handled by getv64i1Argument
2507
28
    assert(ValLoc == MVT::i64 && "Expecting only i64 locations");
2508
28
    // In 64 bit machine, There is no need to truncate the value only bitcast
2509
64
  } else {
2510
36
    MVT maskLen;
2511
36
    switch (ValVT.getSimpleVT().SimpleTy) {
2512
12
    case MVT::v8i1:
2513
12
      maskLen = MVT::i8;
2514
12
      break;
2515
12
    case MVT::v16i1:
2516
12
      maskLen = MVT::i16;
2517
12
      break;
2518
12
    case MVT::v32i1:
2519
12
      maskLen = MVT::i32;
2520
12
      break;
2521
0
    default:
2522
0
      llvm_unreachable("Expecting a vector of i1 types");
2523
36
    }
2524
36
2525
36
    ValReturned = DAG.getNode(ISD::TRUNCATE, Dl, maskLen, ValReturned);
2526
36
  }
2527
64
  return DAG.getBitcast(ValVT, ValReturned);
2528
71
}
2529
2530
/// Lower the result values of a call into the
2531
/// appropriate copies out of appropriate physical registers.
2532
///
2533
SDValue X86TargetLowering::LowerCallResult(
2534
    SDValue Chain, SDValue InFlag, CallingConv::ID CallConv, bool isVarArg,
2535
    const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
2536
    SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals,
2537
27.7k
    uint32_t *RegMask) const {
2538
27.7k
2539
27.7k
  const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
2540
27.7k
  // Assign locations to each value returned by this call.
2541
27.7k
  SmallVector<CCValAssign, 16> RVLocs;
2542
27.7k
  bool Is64Bit = Subtarget.is64Bit();
2543
27.7k
  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
2544
27.7k
                 *DAG.getContext());
2545
27.7k
  CCInfo.AnalyzeCallResult(Ins, RetCC_X86);
2546
27.7k
2547
27.7k
  // Copy all of the result registers out of their specified physreg.
2548
42.4k
  for (unsigned I = 0, InsIndex = 0, E = RVLocs.size(); I != E;
2549
27.7k
       ++I, ++InsIndex) {
2550
14.7k
    CCValAssign &VA = RVLocs[I];
2551
14.7k
    EVT CopyVT = VA.getLocVT();
2552
14.7k
2553
14.7k
    // In some calling conventions we need to remove the used registers
2554
14.7k
    // from the register mask.
2555
14.7k
    if (RegMask) {
2556
72
      for (MCSubRegIterator SubRegs(VA.getLocReg(), TRI, /*IncludeSelf=*/true);
2557
269
           SubRegs.isValid(); ++SubRegs)
2558
197
        RegMask[*SubRegs / 32] &= ~(1u << (*SubRegs % 32));
2559
72
    }
2560
14.7k
2561
14.7k
    // If this is x86-64, and we disabled SSE, we can't return FP values
2562
14.7k
    if ((CopyVT == MVT::f32 || CopyVT == MVT::f64 || CopyVT == MVT::f128) &&
2563
14.7k
        ((Is64Bit || Ins[InsIndex].Flags.isInReg()) && !Subtarget.hasSSE1())) {
2564
9
      errorUnsupported(DAG, dl, "SSE register return with SSE disabled");
2565
9
      VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts.
2566
9
    }
2567
14.7k
2568
14.7k
    // If we prefer to use the value in xmm registers, copy it out as f80 and
2569
14.7k
    // use a truncate to move it from fp stack reg to xmm reg.
2570
14.7k
    bool RoundAfterCopy = false;
2571
14.7k
    if ((VA.getLocReg() == X86::FP0 || VA.getLocReg() == X86::FP1) &&
2572
14.7k
        isScalarFPTypeInSSEReg(VA.getValVT())) {
2573
170
      if (!Subtarget.hasX87())
2574
0
        report_fatal_error("X87 register return with X87 disabled");
2575
170
      CopyVT = MVT::f80;
2576
170
      RoundAfterCopy = (CopyVT != VA.getLocVT());
2577
170
    }
2578
14.7k
2579
14.7k
    SDValue Val;
2580
14.7k
    if (VA.needsCustom()) {
2581
1
      assert(VA.getValVT() == MVT::v64i1 &&
2582
1
             "Currently the only custom case is when we split v64i1 to 2 regs");
2583
1
      Val =
2584
1
          getv64i1Argument(VA, RVLocs[++I], Chain, DAG, dl, Subtarget, &InFlag);
2585
14.7k
    } else {
2586
14.7k
      Chain = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), CopyVT, InFlag)
2587
14.7k
                  .getValue(1);
2588
14.7k
      Val = Chain.getValue(0);
2589
14.7k
      InFlag = Chain.getValue(2);
2590
14.7k
    }
2591
14.7k
2592
14.7k
    if (RoundAfterCopy)
2593
170
      Val = DAG.getNode(ISD::FP_ROUND, dl, VA.getValVT(), Val,
2594
170
                        // This truncation won't change the value.
2595
170
                        DAG.getIntPtrConstant(1, dl));
2596
14.7k
2597
14.7k
    if (VA.isExtInLoc() && (VA.getValVT().getScalarType() == MVT::i1)) {
2598
22
      if (VA.getValVT().isVector() &&
2599
22
          ((VA.getLocVT() == MVT::i64) || (VA.getLocVT() == MVT::i32) ||
2600
22
           (VA.getLocVT() == MVT::i16) || (VA.getLocVT() == MVT::i8))) {
2601
12
        // promoting a mask type (v*i1) into a register of type i64/i32/i16/i8
2602
12
        Val = lowerRegToMasks(Val, VA.getValVT(), VA.getLocVT(), dl, DAG);
2603
12
      } else
2604
10
        Val = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), Val);
2605
22
    }
2606
14.7k
2607
14.7k
    InVals.push_back(Val);
2608
14.7k
  }
2609
27.7k
2610
27.7k
  return Chain;
2611
27.7k
}
2612
2613
//===----------------------------------------------------------------------===//
2614
//                C & StdCall & Fast Calling Convention implementation
2615
//===----------------------------------------------------------------------===//
2616
//  StdCall calling convention seems to be standard for many Windows' API
2617
//  routines and around. It differs from C calling convention just a little:
2618
//  callee should clean up the stack, not caller. Symbols should be also
2619
//  decorated in some fancy way :) It doesn't support any vector arguments.
2620
//  For info on fast calling convention see Fast Calling Convention (tail call)
2621
//  implementation LowerX86_32FastCCCallTo.
2622
2623
/// CallIsStructReturn - Determines whether a call uses struct return
2624
/// semantics.
2625
enum StructReturnType {
2626
  NotStructReturn,
2627
  RegStructReturn,
2628
  StackStructReturn
2629
};
2630
static StructReturnType
2631
29.6k
callIsStructReturn(const SmallVectorImpl<ISD::OutputArg> &Outs, bool IsMCU) {
2632
29.6k
  if (Outs.empty())
2633
4.95k
    return NotStructReturn;
2634
24.6k
2635
24.6k
  const ISD::ArgFlagsTy &Flags = Outs[0].Flags;
2636
24.6k
  if (!Flags.isSRet())
2637
24.4k
    return NotStructReturn;
2638
247
  if (Flags.isInReg() || IsMCU)
2639
1
    return RegStructReturn;
2640
246
  return StackStructReturn;
2641
246
}
2642
2643
/// Determines whether a function uses struct return semantics.
2644
static StructReturnType
2645
13.2k
argsAreStructReturn(const SmallVectorImpl<ISD::InputArg> &Ins, bool IsMCU) {
2646
13.2k
  if (Ins.empty())
2647
2.08k
    return NotStructReturn;
2648
11.1k
2649
11.1k
  const ISD::ArgFlagsTy &Flags = Ins[0].Flags;
2650
11.1k
  if (!Flags.isSRet())
2651
10.7k
    return NotStructReturn;
2652
333
  if (Flags.isInReg() || IsMCU)
2653
3
    return RegStructReturn;
2654
330
  return StackStructReturn;
2655
330
}
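
Aside (not part of this file): the classification above keys off the 'sret' flag on the first argument. A hedged C-level picture of a function that would be classified this way (names made up):

struct Point3 { double x, y, z; };

Point3 origin();   // lowered with a hidden sret pointer argument
                   // -> StackStructReturn here, or RegStructReturn when that
                   //    pointer is passed 'inreg' (e.g. on MCU targets).
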
2656
2657
/// Make a copy of an aggregate at address specified by "Src" to address
2658
/// "Dst" with size and alignment information specified by the specific
2659
/// parameter attribute. The copy will be passed as a byval function parameter.
2660
static SDValue CreateCopyOfByValArgument(SDValue Src, SDValue Dst,
2661
                                         SDValue Chain, ISD::ArgFlagsTy Flags,
2662
98
                                         SelectionDAG &DAG, const SDLoc &dl) {
2663
98
  SDValue SizeNode = DAG.getConstant(Flags.getByValSize(), dl, MVT::i32);
2664
98
2665
98
  return DAG.getMemcpy(Chain, dl, Dst, Src, SizeNode, Flags.getByValAlign(),
2666
98
                       /*isVolatile*/false, /*AlwaysInline=*/true,
2667
98
                       /*isTailCall*/false,
2668
98
                       MachinePointerInfo(), MachinePointerInfo());
2669
98
}
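
Aside (not part of this file): a byval argument is passed by copying the whole aggregate into the outgoing argument area, which is what the getMemcpy above emits with the size and alignment taken from the byval attribute. A hedged sketch of the caller-side effect (struct and function names hypothetical):

#include <cstring>

struct Args { int v[16]; };

static void pass_byval(unsigned char *argSlot, const Args *src) {
  std::memcpy(argSlot, src, sizeof(Args));   // always inlined, not volatile
}
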
2670
2671
/// Return true if the calling convention is one that we can guarantee TCO for.
2672
24.9k
static bool canGuaranteeTCO(CallingConv::ID CC) {
2673
24.4k
  return (CC == CallingConv::Fast || CC == CallingConv::GHC ||
2674
24.9k
          CC == CallingConv::X86_RegCall || CC == CallingConv::HiPE ||
2675
24.3k
          CC == CallingConv::HHVM);
2676
24.9k
}
2677
2678
/// Return true if we might ever do TCO for calls with this calling convention.
2679
3.11k
static bool mayTailCallThisCC(CallingConv::ID CC) {
2680
3.11k
  switch (CC) {
2681
3.11k
  // C calling conventions:
2682
3.00k
  case CallingConv::C:
2683
3.00k
  case CallingConv::Win64:
2684
3.00k
  case CallingConv::X86_64_SysV:
2685
3.00k
  // Callee pop conventions:
2686
3.00k
  case CallingConv::X86_ThisCall:
2687
3.00k
  case CallingConv::X86_StdCall:
2688
3.00k
  case CallingConv::X86_VectorCall:
2689
3.00k
  case CallingConv::X86_FastCall:
2690
3.00k
    return true;
2691
107
  default:
2692
107
    return canGuaranteeTCO(CC);
2693
0
  }
2694
0
}
2695
2696
/// Return true if the function is being made into a tailcall target by
2697
/// changing its ABI.
2698
191k
static bool shouldGuaranteeTCO(CallingConv::ID CC, bool GuaranteedTailCallOpt) {
2699
143
  return GuaranteedTailCallOpt && canGuaranteeTCO(CC);
2700
191k
}
2701
2702
1.12k
bool X86TargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const {
2703
1.12k
  auto Attr =
2704
1.12k
      CI->getParent()->getParent()->getFnAttribute("disable-tail-calls");
2705
1.12k
  if (!CI->isTailCall() || Attr.getValueAsString() == "true")
2706
384
    return false;
2707
740
2708
740
  ImmutableCallSite CS(CI);
2709
740
  CallingConv::ID CalleeCC = CS.getCallingConv();
2710
740
  if (!mayTailCallThisCC(CalleeCC))
2711
0
    return false;
2712
740
2713
740
  return true;
2714
740
}
2715
2716
SDValue
2717
X86TargetLowering::LowerMemArgument(SDValue Chain, CallingConv::ID CallConv,
2718
                                    const SmallVectorImpl<ISD::InputArg> &Ins,
2719
                                    const SDLoc &dl, SelectionDAG &DAG,
2720
                                    const CCValAssign &VA,
2721
22.5k
                                    MachineFrameInfo &MFI, unsigned i) const {
2722
22.5k
  // Create the nodes corresponding to a load from this parameter slot.
2723
22.5k
  ISD::ArgFlagsTy Flags = Ins[i].Flags;
2724
22.5k
  bool AlwaysUseMutable = shouldGuaranteeTCO(
2725
22.5k
      CallConv, DAG.getTarget().Options.GuaranteedTailCallOpt);
2726
22.5k
  bool isImmutable = !AlwaysUseMutable && !Flags.isByVal();
2727
22.5k
  EVT ValVT;
2728
22.5k
  MVT PtrVT = getPointerTy(DAG.getDataLayout());
2729
22.5k
2730
22.5k
  // If value is passed by pointer we have address passed instead of the value
2731
22.5k
  // itself. No need to extend if the mask value and location share the same
2732
22.5k
  // absolute size.
2733
22.5k
  bool ExtendedInMem =
2734
3.03k
      VA.isExtInLoc() && VA.getValVT().getScalarType() == MVT::i1 &&
2735
17
      VA.getValVT().getSizeInBits() != VA.getLocVT().getSizeInBits();
2736
22.5k
2737
22.5k
  if (VA.getLocInfo() == CCValAssign::Indirect || ExtendedInMem)
2738
12
    ValVT = VA.getLocVT();
2739
22.5k
  else
2740
22.5k
    ValVT = VA.getValVT();
2741
22.5k
2742
22.5k
  // Calculate SP offset of interrupt parameter, re-arrange the slot normally
2743
22.5k
  // taken by a return address.
2744
22.5k
  int Offset = 0;
2745
22.5k
  if (CallConv == CallingConv::X86_INTR) {
2746
34
    // X86 interrupts may take one or two arguments.
2747
34
    // On the stack there will be no return address as in regular call.
2748
34
    // Offset of last argument need to be set to -4/-8 bytes.
2749
34
    // Where offset of the first argument out of two, should be set to 0 bytes.
2750
34
    Offset = (Subtarget.is64Bit() ? 8 : 4) * ((i + 1) % Ins.size() - 1);
2751
34
    if (Subtarget.is64Bit() && Ins.size() == 2) {
2752
10
      // The stack pointer needs to be realigned for 64 bit handlers with error
2753
10
      // code, so the argument offset changes by 8 bytes.
2754
10
      Offset += 8;
2755
10
    }
2756
34
  }
2757
22.5k
2758
22.5k
  // FIXME: For now, all byval parameter objects are marked mutable. This can be
2759
22.5k
  // changed with more analysis.
2760
22.5k
  // In case of tail call optimization mark all arguments mutable. Since they
2761
22.5k
  // could be overwritten by lowering of arguments in case of a tail call.
2762
22.5k
  if (Flags.isByVal()) {
2763
136
    unsigned Bytes = Flags.getByValSize();
2764
136
    if (Bytes == 0) Bytes = 1; // Don't create zero-sized stack objects.
2765
136
    int FI = MFI.CreateFixedObject(Bytes, VA.getLocMemOffset(), isImmutable);
2766
136
    // Adjust SP offset of interrupt parameter.
2767
136
    if (CallConv == CallingConv::X86_INTR) {
2768
25
      MFI.setObjectOffset(FI, Offset);
2769
25
    }
2770
136
    return DAG.getFrameIndex(FI, PtrVT);
2771
136
  }
2772
22.3k
2773
22.3k
  // This is an argument in memory. We might be able to perform copy elision.
2774
22.3k
  
if (22.3k
Flags.isCopyElisionCandidate()22.3k
) {
2775
264
    EVT ArgVT = Ins[i].ArgVT;
2776
264
    SDValue PartAddr;
2777
264
    if (
Ins[i].PartOffset == 0264
) {
2778
257
      // If this is a one-part value or the first part of a multi-part value,
2779
257
      // create a stack object for the entire argument value type and return a
2780
257
      // load from our portion of it. This assumes that if the first part of an
2781
257
      // argument is in memory, the rest will also be in memory.
2782
257
      int FI = MFI.CreateFixedObject(ArgVT.getStoreSize(), VA.getLocMemOffset(),
2783
257
                                     /*Immutable=*/false);
2784
257
      PartAddr = DAG.getFrameIndex(FI, PtrVT);
2785
257
      return DAG.getLoad(
2786
257
          ValVT, dl, Chain, PartAddr,
2787
257
          MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI));
2788
0
    } else {
2789
7
      // This is not the first piece of an argument in memory. See if there is
2790
7
      // already a fixed stack object including this offset. If so, assume it
2791
7
      // was created by the PartOffset == 0 branch above and create a load from
2792
7
      // the appropriate offset into it.
2793
7
      int64_t PartBegin = VA.getLocMemOffset();
2794
7
      int64_t PartEnd = PartBegin + ValVT.getSizeInBits() / 8;
2795
7
      int FI = MFI.getObjectIndexBegin();
2796
7
      for (; 
MFI.isFixedObjectIndex(FI)7
;
++FI0
) {
2797
6
        int64_t ObjBegin = MFI.getObjectOffset(FI);
2798
6
        int64_t ObjEnd = ObjBegin + MFI.getObjectSize(FI);
2799
6
        if (
ObjBegin <= PartBegin && 6
PartEnd <= ObjEnd6
)
2800
6
          break;
2801
6
      }
2802
7
      if (
MFI.isFixedObjectIndex(FI)7
) {
2803
6
        SDValue Addr =
2804
6
            DAG.getNode(ISD::ADD, dl, PtrVT, DAG.getFrameIndex(FI, PtrVT),
2805
6
                        DAG.getIntPtrConstant(Ins[i].PartOffset, dl));
2806
6
        return DAG.getLoad(
2807
6
            ValVT, dl, Chain, Addr,
2808
6
            MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI,
2809
6
                                              Ins[i].PartOffset));
2810
6
      }
2811
22.1k
    }
2812
264
  }
2813
22.1k
2814
22.1k
  int FI = MFI.CreateFixedObject(ValVT.getSizeInBits() / 8,
2815
22.1k
                                 VA.getLocMemOffset(), isImmutable);
2816
22.1k
2817
22.1k
  // Set SExt or ZExt flag.
2818
22.1k
  if (
VA.getLocInfo() == CCValAssign::ZExt22.1k
) {
2819
291
    MFI.setObjectZExt(FI, true);
2820
22.1k
  } else 
if (21.8k
VA.getLocInfo() == CCValAssign::SExt21.8k
) {
2821
59
    MFI.setObjectSExt(FI, true);
2822
59
  }
2823
22.1k
2824
22.1k
  // Adjust SP offset of interrupt parameter.
2825
22.1k
  if (
CallConv == CallingConv::X86_INTR22.1k
) {
2826
9
    MFI.setObjectOffset(FI, Offset);
2827
9
  }
2828
22.1k
2829
22.1k
  SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
2830
22.1k
  SDValue Val = DAG.getLoad(
2831
22.1k
      ValVT, dl, Chain, FIN,
2832
22.1k
      MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI));
2833
22.1k
  return ExtendedInMem
2834
3
             ? (VA.getValVT().isVector()
2835
3
                    ? DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VA.getValVT(), Val)
2836
3
                    : DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), Val))
2837
22.1k
             : Val;
2838
22.5k
}
2839
2840
// FIXME: Get this from tablegen.
2841
static ArrayRef<MCPhysReg> get64BitArgumentGPRs(CallingConv::ID CallConv,
2842
60
                                                const X86Subtarget &Subtarget) {
2843
60
  assert(Subtarget.is64Bit());
2844
60
2845
60
  if (
Subtarget.isCallingConvWin64(CallConv)60
) {
2846
19
    static const MCPhysReg GPR64ArgRegsWin64[] = {
2847
19
      X86::RCX, X86::RDX, X86::R8,  X86::R9
2848
19
    };
2849
19
    return makeArrayRef(std::begin(GPR64ArgRegsWin64), std::end(GPR64ArgRegsWin64));
2850
19
  }
2851
41
2852
41
  static const MCPhysReg GPR64ArgRegs64Bit[] = {
2853
41
    X86::RDI, X86::RSI, X86::RDX, X86::RCX, X86::R8, X86::R9
2854
41
  };
2855
41
  return makeArrayRef(std::begin(GPR64ArgRegs64Bit), std::end(GPR64ArgRegs64Bit));
2856
41
}
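
Aside (not part of this file): the register lists above are the integer argument registers of the two 64-bit ABIs, RDI/RSI/RDX/RCX/R8/R9 for SysV and RCX/RDX/R8/R9 (plus the 32-byte shadow area) for Win64. A small, hedged illustration of where they run out under SysV:

long sum7(long a, long b, long c, long d, long e, long f, long g) {
  // a..f arrive in RDI, RSI, RDX, RCX, R8, R9; 'g' is read from the caller's stack.
  return a + b + c + d + e + f + g;
}
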
2857
2858
// FIXME: Get this from tablegen.
2859
static ArrayRef<MCPhysReg> get64BitArgumentXMMs(MachineFunction &MF,
2860
                                                CallingConv::ID CallConv,
2861
60
                                                const X86Subtarget &Subtarget) {
2862
60
  assert(Subtarget.is64Bit());
2863
60
  if (
Subtarget.isCallingConvWin64(CallConv)60
) {
2864
19
    // The XMM registers which might contain var arg parameters are shadowed
2865
19
    // in their paired GPR.  So we only need to save the GPR to their home
2866
19
    // slots.
2867
19
    // TODO: __vectorcall will change this.
2868
19
    return None;
2869
19
  }
2870
41
2871
41
  const Function *Fn = MF.getFunction();
2872
41
  bool NoImplicitFloatOps = Fn->hasFnAttribute(Attribute::NoImplicitFloat);
2873
41
  bool isSoftFloat = Subtarget.useSoftFloat();
2874
41
  assert(!(isSoftFloat && NoImplicitFloatOps) &&
2875
41
         "SSE register cannot be used when SSE is disabled!");
2876
41
  if (
isSoftFloat || 41
NoImplicitFloatOps39
||
!Subtarget.hasSSE1()39
)
2877
41
    // Kernel mode asks for SSE to be disabled, so there are no XMM argument
2878
41
    // registers.
2879
4
    return None;
2880
37
2881
37
  static const MCPhysReg XMMArgRegs64Bit[] = {
2882
37
    X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
2883
37
    X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
2884
37
  };
2885
37
  return makeArrayRef(std::begin(XMMArgRegs64Bit), std::end(XMMArgRegs64Bit));
2886
37
}
2887
2888
#ifndef NDEBUG
2889
static bool isSortedByValueNo(const SmallVectorImpl<CCValAssign> &ArgLocs) {
2890
  return std::is_sorted(ArgLocs.begin(), ArgLocs.end(),
2891
                        [](const CCValAssign &A, const CCValAssign &B) -> bool {
2892
                          return A.getValNo() < B.getValNo();
2893
                        });
2894
}
2895
#endif
2896
2897
SDValue X86TargetLowering::LowerFormalArguments(
2898
    SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
2899
    const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
2900
70.7k
    SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
2901
70.7k
  MachineFunction &MF = DAG.getMachineFunction();
2902
70.7k
  X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
2903
70.7k
  const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();
2904
70.7k
2905
70.7k
  const Function *Fn = MF.getFunction();
2906
70.7k
  if (Fn->hasExternalLinkage() &&
2907
69.4k
      Subtarget.isTargetCygMing() &&
2908
237
      Fn->getName() == "main")
2909
14
    FuncInfo->setForceFramePointer(true);
2910
70.7k
2911
70.7k
  MachineFrameInfo &MFI = MF.getFrameInfo();
2912
70.7k
  bool Is64Bit = Subtarget.is64Bit();
2913
70.7k
  bool IsWin64 = Subtarget.isCallingConvWin64(CallConv);
2914
70.7k
2915
70.7k
  assert(
2916
70.7k
      !(isVarArg && canGuaranteeTCO(CallConv)) &&
2917
70.7k
      "Var args not supported with calling conv' regcall, fastcc, ghc or hipe");
2918
70.7k
2919
70.7k
  if (
CallConv == CallingConv::X86_INTR70.7k
) {
2920
25
    bool isLegal = Ins.size() == 1 ||
2921
9
                   
(Ins.size() == 2 && 9
((Is64Bit && 9
Ins[1].VT == MVT::i645
) ||
2922
9
                                        
(!Is64Bit && 4
Ins[1].VT == MVT::i324
)));
2923
25
    if (!isLegal)
2924
0
      report_fatal_error("X86 interrupts may take one or two arguments");
2925
70.7k
  }
2926
70.7k
2927
70.7k
  // Assign locations to all of the incoming arguments.
2928
70.7k
  SmallVector<CCValAssign, 16> ArgLocs;
2929
70.7k
  CCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext());
2930
70.7k
2931
70.7k
  // Allocate shadow area for Win64.
2932
70.7k
  if (IsWin64)
2933
1.10k
    CCInfo.AllocateStack(32, 8);
2934
70.7k
2935
70.7k
  CCInfo.AnalyzeArguments(Ins, CC_X86);
2936
70.7k
2937
70.7k
  // In vectorcall calling convention a second pass is required for the HVA
2938
70.7k
  // types.
2939
70.7k
  if (
CallingConv::X86_VectorCall == CallConv70.7k
) {
2940
51
    CCInfo.AnalyzeArgumentsSecondPass(Ins, CC_X86);
2941
51
  }
2942
70.7k
2943
70.7k
  // The next loop assumes that the locations are in the same order of the
2944
70.7k
  // input arguments.
2945
70.7k
  assert(isSortedByValueNo(ArgLocs) &&
2946
70.7k
         "Argument Location list must be sorted before lowering");
2947
70.7k
2948
70.7k
  SDValue ArgValue;
2949
208k
  for (unsigned I = 0, InsIndex = 0, E = ArgLocs.size(); I != E;
2950
138k
       
++I, ++InsIndex138k
) {
2951
138k
    assert(InsIndex < Ins.size() && "Invalid Ins index");
2952
138k
    CCValAssign &VA = ArgLocs[I];
2953
138k
2954
138k
    if (
VA.isRegLoc()138k
) {
2955
115k
      EVT RegVT = VA.getLocVT();
2956
115k
      if (
VA.needsCustom()115k
) {
2957
2
        assert(
2958
2
            VA.getValVT() == MVT::v64i1 &&
2959
2
            "Currently the only custom case is when we split v64i1 to 2 regs");
2960
2
2961
2
        // v64i1 values, in regcall calling convention, that are
2962
2
        // compiled to 32 bit arch, are split up into two registers.
2963
2
        ArgValue =
2964
2
            getv64i1Argument(VA, ArgLocs[++I], Chain, DAG, dl, Subtarget);
2965
115k
      } else {
2966
115k
        const TargetRegisterClass *RC;
2967
115k
        if (RegVT == MVT::i32)
2968
11.8k
          RC = &X86::GR32RegClass;
2969
103k
        else 
if (103k
Is64Bit && 103k
RegVT == MVT::i6496.5k
)
2970
30.8k
          RC = &X86::GR64RegClass;
2971
72.9k
        else 
if (72.9k
RegVT == MVT::f3272.9k
)
2972
1.87k
          
RC = Subtarget.hasAVX512() ? 1.87k
&X86::FR32XRegClass335
:
&X86::FR32RegClass1.54k
;
2973
71.0k
        else 
if (71.0k
RegVT == MVT::f6471.0k
)
2974
1.76k
          
RC = Subtarget.hasAVX512() ? 1.76k
&X86::FR64XRegClass264
:
&X86::FR64RegClass1.50k
;
2975
69.3k
        else 
if (69.3k
RegVT == MVT::f8069.3k
)
2976
6
          RC = &X86::RFP80RegClass;
2977
69.3k
        else 
if (69.3k
RegVT == MVT::f12869.3k
)
2978
186
          RC = &X86::FR128RegClass;
2979
69.1k
        else 
if (69.1k
RegVT.is512BitVector()69.1k
)
2980
9.25k
          RC = &X86::VR512RegClass;
2981
59.8k
        else 
if (59.8k
RegVT.is256BitVector()59.8k
)
2982
21.3k
          
RC = Subtarget.hasVLX() ? 21.3k
&X86::VR256XRegClass6.09k
:
&X86::VR256RegClass15.2k
;
2983
38.5k
        else 
if (38.5k
RegVT.is128BitVector()38.5k
)
2984
38.2k
          
RC = Subtarget.hasVLX() ? 38.2k
&X86::VR128XRegClass6.00k
:
&X86::VR128RegClass32.2k
;
2985
224
        else 
if (224
RegVT == MVT::x86mmx224
)
2986
219
          RC = &X86::VR64RegClass;
2987
5
        else 
if (5
RegVT == MVT::v1i15
)
2988
0
          RC = &X86::VK1RegClass;
2989
5
        else 
if (5
RegVT == MVT::v8i15
)
2990
0
          RC = &X86::VK8RegClass;
2991
5
        else 
if (5
RegVT == MVT::v16i15
)
2992
4
          RC = &X86::VK16RegClass;
2993
1
        else 
if (1
RegVT == MVT::v32i11
)
2994
0
          RC = &X86::VK32RegClass;
2995
1
        else 
if (1
RegVT == MVT::v64i11
)
2996
0
          RC = &X86::VK64RegClass;
2997
1
        else
2998
1
          llvm_unreachable("Unknown argument type!");
2999
115k
3000
115k
        unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC);
3001
115k
        ArgValue = DAG.getCopyFromReg(Chain, dl, Reg, RegVT);
3002
115k
      }
3003
115k
3004
115k
      // If this is an 8 or 16-bit value, it is really passed promoted to 32
3005
115k
      // bits.  Insert an assert[sz]ext to capture this, then truncate to the
3006
115k
      // right size.
3007
115k
      
if (115k
VA.getLocInfo() == CCValAssign::SExt115k
)
3008
143
        ArgValue = DAG.getNode(ISD::AssertSext, dl, RegVT, ArgValue,
3009
143
                               DAG.getValueType(VA.getValVT()));
3010
115k
      else 
if (115k
VA.getLocInfo() == CCValAssign::ZExt115k
)
3011
1.43k
        ArgValue = DAG.getNode(ISD::AssertZext, dl, RegVT, ArgValue,
3012
1.43k
                               DAG.getValueType(VA.getValVT()));
3013
114k
      else 
if (114k
VA.getLocInfo() == CCValAssign::BCvt114k
)
3014
0
        ArgValue = DAG.getBitcast(VA.getValVT(), ArgValue);
3015
115k
3016
115k
      if (
VA.isExtInLoc()115k
) {
3017
5.91k
        // Handle MMX values passed in XMM regs.
3018
5.91k
        if (
RegVT.isVector() && 5.91k
VA.getValVT().getScalarType() != MVT::i1348
)
3019
4
          ArgValue = DAG.getNode(X86ISD::MOVDQ2Q, dl, VA.getValVT(), ArgValue);
3020
5.90k
        else 
if (5.90k
VA.getValVT().isVector() &&
3021
403
                 VA.getValVT().getScalarType() == MVT::i1 &&
3022
403
                 
((VA.getLocVT() == MVT::i64) || 403
(VA.getLocVT() == MVT::i32)378
||
3023
5.90k
                  
(VA.getLocVT() == MVT::i16)344
||
(VA.getLocVT() == MVT::i8)344
)) {
3024
59
          // Promoting a mask type (v*i1) into a register of type i64/i32/i16/i8
3025
59
          ArgValue = lowerRegToMasks(ArgValue, VA.getValVT(), RegVT, dl, DAG);
3026
59
        } else
3027
5.84k
          ArgValue = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), ArgValue);
3028
5.91k
      }
3029
138k
    } else {
3030
22.5k
      assert(VA.isMemLoc());
3031
22.5k
      ArgValue =
3032
22.5k
          LowerMemArgument(Chain, CallConv, Ins, dl, DAG, VA, MFI, InsIndex);
3033
22.5k
    }
3034
138k
3035
138k
    // If value is passed via pointer - do a load.
3036
138k
    
if (138k
VA.getLocInfo() == CCValAssign::Indirect138k
)
3037
749
      ArgValue =
3038
749
          DAG.getLoad(VA.getValVT(), dl, Chain, ArgValue, MachinePointerInfo());
3039
138k
3040
138k
    InVals.push_back(ArgValue);
3041
138k
  }
3042
70.7k
3043
204k
  
for (unsigned I = 0, E = Ins.size(); 70.7k
I != E204k
;
++I134k
) {
3044
134k
    // Swift calling convention does not require we copy the sret argument
3045
134k
    // into %rax/%eax for the return. We don't set SRetReturnReg for Swift.
3046
134k
    if (CallConv == CallingConv::Swift)
3047
150
      continue;
3048
134k
3049
134k
    // All x86 ABIs require that for returning structs by value we copy the
3050
134k
    // sret argument into %rax/%eax (depending on ABI) for the return. Save
3051
134k
    // the argument into a virtual register so that we can access it from the
3052
134k
    // return points.
3053
134k
    
if (134k
Ins[I].Flags.isSRet()134k
) {
3054
549
      unsigned Reg = FuncInfo->getSRetReturnReg();
3055
549
      if (
!Reg549
) {
3056
549
        MVT PtrTy = getPointerTy(DAG.getDataLayout());
3057
549
        Reg = MF.getRegInfo().createVirtualRegister(getRegClassFor(PtrTy));
3058
549
        FuncInfo->setSRetReturnReg(Reg);
3059
549
      }
3060
549
      SDValue Copy = DAG.getCopyToReg(DAG.getEntryNode(), dl, Reg, InVals[I]);
3061
549
      Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Copy, Chain);
3062
549
      break;
3063
549
    }
3064
134k
  }
3065
70.7k
3066
70.7k
  unsigned StackSize = CCInfo.getNextStackOffset();
3067
70.7k
  // Align stack specially for tail calls.
3068
70.7k
  if (shouldGuaranteeTCO(CallConv,
3069
70.7k
                         MF.getTarget().Options.GuaranteedTailCallOpt))
3070
42
    StackSize = GetAlignedArgumentStackSize(StackSize, DAG);
3071
70.7k
3072
70.7k
  // If the function takes variable number of arguments, make a frame index for
3073
70.7k
  // the start of the first vararg value... for expansion of llvm.va_start. We
3074
70.7k
  // can skip this if there are no va_start calls.
3075
70.7k
  if (MFI.hasVAStart() &&
3076
84
      
(Is64Bit || 84
(CallConv != CallingConv::X86_FastCall &&
3077
70.7k
                   
CallConv != CallingConv::X86_ThisCall24
))) {
3078
84
    FuncInfo->setVarArgsFrameIndex(MFI.CreateFixedObject(1, StackSize, true));
3079
84
  }
3080
70.7k
3081
70.7k
  // Figure out if XMM registers are in use.
3082
70.7k
  assert(!(Subtarget.useSoftFloat() &&
3083
70.7k
           Fn->hasFnAttribute(Attribute::NoImplicitFloat)) &&
3084
70.7k
         "SSE register cannot be used when SSE is disabled!");
3085
70.7k
3086
70.7k
  // 64-bit calling conventions support varargs and register parameters, so we
3087
70.7k
  // have to do extra work to spill them in the prologue.
3088
70.7k
  if (
Is64Bit && 70.7k
isVarArg56.4k
&&
MFI.hasVAStart()79
) {
3089
60
    // Find the first unallocated argument registers.
3090
60
    ArrayRef<MCPhysReg> ArgGPRs = get64BitArgumentGPRs(CallConv, Subtarget);
3091
60
    ArrayRef<MCPhysReg> ArgXMMs = get64BitArgumentXMMs(MF, CallConv, Subtarget);
3092
60
    unsigned NumIntRegs = CCInfo.getFirstUnallocated(ArgGPRs);
3093
60
    unsigned NumXMMRegs = CCInfo.getFirstUnallocated(ArgXMMs);
3094
60
    assert(!(NumXMMRegs && !Subtarget.hasSSE1()) &&
3095
60
           "SSE register cannot be used when SSE is disabled!");
3096
60
3097
60
    // Gather all the live in physical registers.
3098
60
    SmallVector<SDValue, 6> LiveGPRs;
3099
60
    SmallVector<SDValue, 8> LiveXMMRegs;
3100
60
    SDValue ALVal;
3101
215
    for (MCPhysReg Reg : ArgGPRs.slice(NumIntRegs)) {
3102
215
      unsigned GPR = MF.addLiveIn(Reg, &X86::GR64RegClass);
3103
215
      LiveGPRs.push_back(
3104
215
          DAG.getCopyFromReg(Chain, dl, GPR, MVT::i64));
3105
215
    }
3106
60
    if (
!ArgXMMs.empty()60
) {
3107
37
      unsigned AL = MF.addLiveIn(X86::AL, &X86::GR8RegClass);
3108
37
      ALVal = DAG.getCopyFromReg(Chain, dl, AL, MVT::i8);
3109
294
      for (MCPhysReg Reg : ArgXMMs.slice(NumXMMRegs)) {
3110
294
        unsigned XMMReg = MF.addLiveIn(Reg, &X86::VR128RegClass);
3111
294
        LiveXMMRegs.push_back(
3112
294
            DAG.getCopyFromReg(Chain, dl, XMMReg, MVT::v4f32));
3113
294
      }
3114
37
    }
3115
60
3116
60
    if (
IsWin6460
) {
3117
19
      // Get to the caller-allocated home save location.  Add 8 to account
3118
19
      // for the return address.
3119
19
      int HomeOffset = TFI.getOffsetOfLocalArea() + 8;
3120
19
      FuncInfo->setRegSaveFrameIndex(
3121
19
          MFI.CreateFixedObject(1, NumIntRegs * 8 + HomeOffset, false));
3122
19
      // Fixup to set vararg frame on shadow area (4 x i64).
3123
19
      if (NumIntRegs < 4)
3124
11
        FuncInfo->setVarArgsFrameIndex(FuncInfo->getRegSaveFrameIndex());
3125
60
    } else {
3126
41
      // For X86-64, if there are vararg parameters that are passed via
3127
41
      // registers, then we must store them to their spots on the stack so
3128
41
      // they may be loaded by dereferencing the result of va_next.
3129
41
      FuncInfo->setVarArgsGPOffset(NumIntRegs * 8);
3130
41
      FuncInfo->setVarArgsFPOffset(ArgGPRs.size() * 8 + NumXMMRegs * 16);
3131
41
      FuncInfo->setRegSaveFrameIndex(MFI.CreateStackObject(
3132
41
          ArgGPRs.size() * 8 + ArgXMMs.size() * 16, 16, false));
3133
41
    }
3134
60
3135
60
    // Store the integer parameter registers.
3136
60
    SmallVector<SDValue, 8> MemOps;
3137
60
    SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(),
3138
60
                                      getPointerTy(DAG.getDataLayout()));
3139
60
    unsigned Offset = FuncInfo->getVarArgsGPOffset();
3140
215
    for (SDValue Val : LiveGPRs) {
3141
215
      SDValue FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()),
3142
215
                                RSFIN, DAG.getIntPtrConstant(Offset, dl));
3143
215
      SDValue Store =
3144
215
          DAG.getStore(Val.getValue(1), dl, Val, FIN,
3145
215
                       MachinePointerInfo::getFixedStack(
3146
215
                           DAG.getMachineFunction(),
3147
215
                           FuncInfo->getRegSaveFrameIndex(), Offset));
3148
215
      MemOps.push_back(Store);
3149
215
      Offset += 8;
3150
215
    }
3151
60
3152
60
    if (
!ArgXMMs.empty() && 60
NumXMMRegs != ArgXMMs.size()37
) {
3153
37
      // Now store the XMM (fp + vector) parameter registers.
3154
37
      SmallVector<SDValue, 12> SaveXMMOps;
3155
37
      SaveXMMOps.push_back(Chain);
3156
37
      SaveXMMOps.push_back(ALVal);
3157
37
      SaveXMMOps.push_back(DAG.getIntPtrConstant(
3158
37
                             FuncInfo->getRegSaveFrameIndex(), dl));
3159
37
      SaveXMMOps.push_back(DAG.getIntPtrConstant(
3160
37
                             FuncInfo->getVarArgsFPOffset(), dl));
3161
37
      SaveXMMOps.insert(SaveXMMOps.end(), LiveXMMRegs.begin(),
3162
37
                        LiveXMMRegs.end());
3163
37
      MemOps.push_back(DAG.getNode(X86ISD::VASTART_SAVE_XMM_REGS, dl,
3164
37
                                   MVT::Other, SaveXMMOps));
3165
37
    }
3166
60
3167
60
    if (!MemOps.empty())
3168
52
      Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOps);
3169
60
  }
3170
70.7k
3171
70.7k
  if (
isVarArg && 70.7k
MFI.hasMustTailInVarArgFunc()117
) {
3172
21
    // Find the largest legal vector type.
3173
21
    MVT VecVT = MVT::Other;
3174
21
    // FIXME: Only some x86_32 calling conventions support AVX512.
3175
21
    if (Subtarget.hasAVX512() &&
3176
2
        
(Is64Bit || 2
(CallConv == CallingConv::X86_VectorCall ||
3177
2
                     CallConv == CallingConv::Intel_OCL_BI)))
3178
1
      VecVT = MVT::v16f32;
3179
20
    else 
if (20
Subtarget.hasAVX()20
)
3180
3
      VecVT = MVT::v8f32;
3181
17
    else 
if (17
Subtarget.hasSSE2()17
)
3182
14
      VecVT = MVT::v4f32;
3183
21
3184
21
    // We forward some GPRs and some vector types.
3185
21
    SmallVector<MVT, 2> RegParmTypes;
3186
21
    MVT IntVT = Is64Bit ? MVT::i64 : MVT::i32;
3187
21
    RegParmTypes.push_back(IntVT);
3188
21
    if (VecVT != MVT::Other)
3189
18
      RegParmTypes.push_back(VecVT);
3190
21
3191
21
    // Compute the set of forwarded registers. The rest are scratch.
3192
21
    SmallVectorImpl<ForwardedRegister> &Forwards =
3193
21
        FuncInfo->getForwardedMustTailRegParms();
3194
21
    CCInfo.analyzeMustTailForwardedRegisters(Forwards, RegParmTypes, CC_X86);
3195
21
3196
21
    // Conservatively forward AL on x86_64, since it might be used for varargs.
3197
21
    if (Is64Bit && !CCInfo.isAllocated(X86::AL)) {
3198
9
      unsigned ALVReg = MF.addLiveIn(X86::AL, &X86::GR8RegClass);
3199
9
      Forwards.push_back(ForwardedRegister(ALVReg, X86::AL, MVT::i8));
3200
9
    }
3201
21
3202
21
    // Copy all forwards from physical to virtual registers.
3203
144
    for (ForwardedRegister &F : Forwards) {
3204
144
      // FIXME: Can we use a less constrained schedule?
3205
144
      SDValue RegVal = DAG.getCopyFromReg(Chain, dl, F.VReg, F.VT);
3206
144
      F.VReg = MF.getRegInfo().createVirtualRegister(getRegClassFor(F.VT));
3207
144
      Chain = DAG.getCopyToReg(Chain, dl, F.VReg, RegVal);
3208
144
    }
3209
21
  }
3210
70.7k
3211
70.7k
  // Some CCs need callee pop.
3212
70.7k
  if (X86::isCalleePop(CallConv, Is64Bit, isVarArg,
3213
70.7k
                       MF.getTarget().Options.GuaranteedTailCallOpt)) {
3214
209
    FuncInfo->setBytesToPopOnReturn(StackSize); // Callee pops everything.
3215
70.7k
  } else if (CallConv == CallingConv::X86_INTR && Ins.size() == 2) {
3216
9
    // X86 interrupts must pop the error code (and the alignment padding) if
3217
9
    // present.
3218
9
    FuncInfo->setBytesToPopOnReturn(Is64Bit ? 16 : 4);
3219
70.5k
  } else {
3220
70.5k
    FuncInfo->setBytesToPopOnReturn(0); // Callee pops nothing.
3221
70.5k
    // If this is an sret function, the return should pop the hidden pointer.
3222
70.5k
    if (!Is64Bit && !canGuaranteeTCO(CallConv) &&
3223
13.9k
        !Subtarget.getTargetTriple().isOSMSVCRT() &&
3224
13.2k
        argsAreStructReturn(Ins, Subtarget.isTargetMCU()) == StackStructReturn)
3225
330
      FuncInfo->setBytesToPopOnReturn(4);
3226
70.5k
  }
3227
70.7k
3228
70.7k
  if (!Is64Bit) {
3229
14.2k
    // RegSaveFrameIndex is X86-64 only.
3230
14.2k
    FuncInfo->setRegSaveFrameIndex(0xAAAAAAA);
3231
14.2k
    if (CallConv == CallingConv::X86_FastCall ||
3232
14.2k
        CallConv == CallingConv::X86_ThisCall)
3233
14.2k
      // fastcc functions can't have varargs.
3234
106
      FuncInfo->setVarArgsFrameIndex(0xAAAAAAA);
3235
14.2k
  }
3236
70.7k
3237
70.7k
  FuncInfo->setArgumentStackSize(StackSize);
3238
70.7k
3239
70.7k
  if (WinEHFuncInfo *EHInfo = MF.getWinEHFuncInfo()) {
3240
86
    EHPersonality Personality = classifyEHPersonality(Fn->getPersonalityFn());
3241
86
    if (Personality == EHPersonality::CoreCLR) {
3242
7
      assert(Is64Bit);
3243
7
      // TODO: Add a mechanism to frame lowering that will allow us to indicate
3244
7
      // that we'd prefer this slot be allocated towards the bottom of the frame
3245
7
      // (i.e. near the stack pointer after allocating the frame).  Every
3246
7
      // funclet needs a copy of this slot in its (mostly empty) frame, and the
3247
7
      // offset from the bottom of this and each funclet's frame must be the
3248
7
      // same, so the size of funclets' (mostly empty) frames is dictated by
3249
7
      // how far this slot is from the bottom (since they allocate just enough
3250
7
      // space to accommodate holding this slot at the correct offset).
3251
7
      int PSPSymFI = MFI.CreateStackObject(8, 8, /*isSS=*/false);
3252
7
      EHInfo->PSPSymFrameIdx = PSPSymFI;
3253
7
    }
3254
86
  }
3255
70.7k
3256
70.7k
  if (CallConv == CallingConv::X86_RegCall ||
3257
70.7k
      Fn->hasFnAttribute("no_caller_saved_registers")) {
3258
133
    const MachineRegisterInfo &MRI = MF.getRegInfo();
3259
133
    for (const auto &Pair : make_range(MRI.livein_begin(), MRI.livein_end()))
3260
391
      MF.getRegInfo().disableCalleeSavedRegister(Pair.first);
3261
133
  }
3262
70.7k
3263
70.7k
  return Chain;
3264
70.7k
}
3265
3266
SDValue X86TargetLowering::LowerMemOpCallTo(SDValue Chain, SDValue StackPtr,
3267
                                            SDValue Arg, const SDLoc &dl,
3268
                                            SelectionDAG &DAG,
3269
                                            const CCValAssign &VA,
3270
28.6k
                                            ISD::ArgFlagsTy Flags) const {
3271
28.6k
  unsigned LocMemOffset = VA.getLocMemOffset();
3272
28.6k
  SDValue PtrOff = DAG.getIntPtrConstant(LocMemOffset, dl);
3273
28.6k
  PtrOff = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()),
3274
28.6k
                       StackPtr, PtrOff);
3275
28.6k
  if (Flags.isByVal())
3276
96
    return CreateCopyOfByValArgument(Arg, PtrOff, Chain, Flags, DAG, dl);
3277
28.5k
3278
28.5k
  return DAG.getStore(
3279
28.5k
      Chain, dl, Arg, PtrOff,
3280
28.5k
      MachinePointerInfo::getStack(DAG.getMachineFunction(), LocMemOffset));
3281
28.5k
}
3282
3283
/// Emit a load of return address if tail call
3284
/// optimization is performed and it is required.
3285
SDValue X86TargetLowering::EmitTailCallLoadRetAddr(
3286
    SelectionDAG &DAG, SDValue &OutRetAddr, SDValue Chain, bool IsTailCall,
3287
8
    bool Is64Bit, int FPDiff, const SDLoc &dl) const {
3288
8
  // Adjust the Return address stack slot.
3289
8
  EVT VT = getPointerTy(DAG.getDataLayout());
3290
8
  OutRetAddr = getReturnAddressFrameIndex(DAG);
3291
8
3292
8
  // Load the "old" Return address.
3293
8
  OutRetAddr = DAG.getLoad(VT, dl, Chain, OutRetAddr, MachinePointerInfo());
3294
8
  return SDValue(OutRetAddr.getNode(), 1);
3295
8
}
3296
3297
/// Emit a store of the return address if tail call
3298
/// optimization is performed and it is required (FPDiff!=0).
3299
static SDValue EmitTailCallStoreRetAddr(SelectionDAG &DAG, MachineFunction &MF,
3300
                                        SDValue Chain, SDValue RetAddrFrIdx,
3301
                                        EVT PtrVT, unsigned SlotSize,
3302
97
                                        int FPDiff, const SDLoc &dl) {
3303
97
  // Store the return address to the appropriate stack slot.
3304
97
  if (!FPDiff) return Chain;
3305
8
  // Calculate the new stack slot for the return address.
3306
8
  int NewReturnAddrFI =
3307
8
    MF.getFrameInfo().CreateFixedObject(SlotSize, (int64_t)FPDiff - SlotSize,
3308
8
                                         false);
3309
8
  SDValue NewRetAddrFrIdx = DAG.getFrameIndex(NewReturnAddrFI, PtrVT);
3310
8
  Chain = DAG.getStore(Chain, dl, RetAddrFrIdx, NewRetAddrFrIdx,
3311
8
                       MachinePointerInfo::getFixedStack(
3312
8
                           DAG.getMachineFunction(), NewReturnAddrFI));
3313
8
  return Chain;
3314
8
}
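When FPDiff is non-zero, the relocated return-address slot above is created at a fixed offset of FPDiff - SlotSize relative to the incoming stack pointer. A minimal standalone sketch of that arithmetic, using assumed values that are not part of the generated report:

#include <cstdint>
#include <cstdio>

int main() {
  int FPDiff = -8;            // callee needs 8 extra bytes of argument space
  unsigned SlotSize = 8;      // 64-bit return-address slot
  std::int64_t NewSlot = static_cast<std::int64_t>(FPDiff) - SlotSize;
  std::printf("relocated return-address slot offset: %lld\n",
              static_cast<long long>(NewSlot));   // prints -16
  return 0;
}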
3315
3316
/// Returns a vector_shuffle mask for an movs{s|d}, movd
3317
/// operation of specified width.
3318
static SDValue getMOVL(SelectionDAG &DAG, const SDLoc &dl, MVT VT, SDValue V1,
3319
74
                       SDValue V2) {
3320
74
  unsigned NumElems = VT.getVectorNumElements();
3321
74
  SmallVector<int, 8> Mask;
3322
74
  Mask.push_back(NumElems);
3323
290
  for (unsigned i = 1; i != NumElems; ++i)
3324
216
    Mask.push_back(i);
3325
74
  return DAG.getVectorShuffle(VT, dl, V1, V2, Mask);
3326
74
}
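For a concrete feel for the mask getMOVL builds: with a 4-element type the mask is {4, 1, 2, 3}, so lane 0 is taken from V2 and lanes 1..3 from V1, matching the movss/movsd semantics. A small illustrative sketch (not part of the generated report):

#include <cassert>
#include <vector>

// Rebuilds the same mask shape as getMOVL, purely for illustration.
static std::vector<int> buildMOVLMask(unsigned NumElems) {
  std::vector<int> Mask;
  Mask.push_back(NumElems);            // lane 0 comes from the second vector
  for (unsigned i = 1; i != NumElems; ++i)
    Mask.push_back(i);                 // remaining lanes stay in the first vector
  return Mask;
}

int main() {
  assert((buildMOVLMask(4) == std::vector<int>{4, 1, 2, 3}));
  return 0;
}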
3327
3328
SDValue
3329
X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
3330
29.6k
                             SmallVectorImpl<SDValue> &InVals) const {
3331
29.6k
  SelectionDAG &DAG                     = CLI.DAG;
3332
29.6k
  SDLoc &dl                             = CLI.DL;
3333
29.6k
  SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs;
3334
29.6k
  SmallVectorImpl<SDValue> &OutVals     = CLI.OutVals;
3335
29.6k
  SmallVectorImpl<ISD::InputArg> &Ins   = CLI.Ins;
3336
29.6k
  SDValue Chain                         = CLI.Chain;
3337
29.6k
  SDValue Callee                        = CLI.Callee;
3338
29.6k
  CallingConv::ID CallConv              = CLI.CallConv;
3339
29.6k
  bool &isTailCall                      = CLI.IsTailCall;
3340
29.6k
  bool isVarArg                         = CLI.IsVarArg;
3341
29.6k
3342
29.6k
  MachineFunction &MF = DAG.getMachineFunction();
3343
29.6k
  bool Is64Bit        = Subtarget.is64Bit();
3344
29.6k
  bool IsWin64        = Subtarget.isCallingConvWin64(CallConv);
3345
29.6k
  StructReturnType SR = callIsStructReturn(Outs, Subtarget.isTargetMCU());
3346
29.6k
  bool IsSibcall      = false;
3347
29.6k
  X86MachineFunctionInfo *X86Info = MF.getInfo<X86MachineFunctionInfo>();
3348
29.6k
  auto Attr = MF.getFunction()->getFnAttribute("disable-tail-calls");
3349
29.6k
  const auto *CI = dyn_cast_or_null<CallInst>(CLI.CS.getInstruction());
3350
29.6k
  const Function *Fn = CI ? CI->getCalledFunction() : nullptr;
3351
26.9k
  bool HasNCSR = (CI && CI->hasFnAttr("no_caller_saved_registers")) ||
3352
29.6k
                 (Fn && Fn->hasFnAttribute("no_caller_saved_registers"));
3353
29.6k
3354
29.6k
  if (CallConv == CallingConv::X86_INTR)
3355
0
    report_fatal_error("X86 interrupts may not be called directly");
3356
29.6k
3357
29.6k
  if (Attr.getValueAsString() == "true")
3358
12
    isTailCall = false;
3359
29.6k
3360
29.6k
  if (Subtarget.isPICStyleGOT() &&
3361
29.6k
      !MF.getTarget().Options.GuaranteedTailCallOpt) {
3362
109
    // If we are using a GOT, disable tail calls to external symbols with
3363
109
    // default visibility. Tail calling such a symbol requires using a GOT
3364
109
    // relocation, which forces early binding of the symbol. This breaks code
3365
109
    // that requires lazy function symbol resolution. Using musttail or
3366
109
    // GuaranteedTailCallOpt will override this.
3367
109
    GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee);
3368
109
    if (!G || (!G->getGlobal()->hasLocalLinkage() &&
3369
60
               G->getGlobal()->hasDefaultVisibility()))
3370
103
      isTailCall = false;
3371
109
  }
3372
29.6k
3373
27.3k
  bool IsMustTail = CLI.CS && CLI.CS.isMustTailCall();
3374
29.6k
  if (IsMustTail) {
3375
63
    // Force this to be a tail call.  The verifier rules are enough to ensure
3376
63
    // that we can lower this successfully without moving the return address
3377
63
    // around.
3378
63
    isTailCall = true;
3379
29.6k
  } else if (isTailCall) {
3380
2.37k
    // Check if it's really possible to do a tail call.
3381
2.37k
    isTailCall = IsEligibleForTailCallOptimization(Callee, CallConv,
3382
2.37k
                    isVarArg, SR != NotStructReturn,
3383
2.37k
                    MF.getFunction()->hasStructRetAttr(), CLI.RetTy,
3384
2.37k
                    Outs, OutVals, Ins, DAG);
3385
2.37k
3386
2.37k
    // Sibcalls are automatically detected tailcalls which do not require
3387
2.37k
    // ABI changes.
3388
2.37k
    if (!MF.getTarget().Options.GuaranteedTailCallOpt && isTailCall)
3389
1.81k
      IsSibcall = true;
3390
2.37k
3391
2.37k
    if (isTailCall)
3392
1.85k
      ++NumTailCalls;
3393
29.5k
  }
3394
29.6k
3395
29.6k
  assert(!(isVarArg && canGuaranteeTCO(CallConv)) &&
3396
29.6k
         "Var args not supported with calling convention fastcc, ghc or hipe");
3397
29.6k
3398
29.6k
  // Analyze operands of the call, assigning locations to each operand.
3399
29.6k
  SmallVector<CCValAssign, 16> ArgLocs;
3400
29.6k
  CCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext());
3401
29.6k
3402
29.6k
  // Allocate shadow area for Win64.
3403
29.6k
  if (IsWin64)
3404
513
    CCInfo.AllocateStack(32, 8);
3405
29.6k
3406
29.6k
  CCInfo.AnalyzeArguments(Outs, CC_X86);
3407
29.6k
3408
29.6k
  // In vectorcall calling convention a second pass is required for the HVA
3409
29.6k
  // types.
3410
29.6k
  if (CallingConv::X86_VectorCall == CallConv) {
3411
8
    CCInfo.AnalyzeArgumentsSecondPass(Outs, CC_X86);
3412
8
  }
3413
29.6k
3414
29.6k
  // Get a count of how many bytes are to be pushed on the stack.
3415
29.6k
  unsigned NumBytes = CCInfo.getAlignedCallFrameSize();
3416
29.6k
  if (IsSibcall)
3417
29.6k
    // This is a sibcall. The memory operands are available in caller's
3418
29.6k
    // own caller's stack.
3419
1.81k
    NumBytes = 0;
3420
27.8k
  else if (MF.getTarget().Options.GuaranteedTailCallOpt &&
3421
50
           canGuaranteeTCO(CallConv))
3422
42
    NumBytes = GetAlignedArgumentStackSize(NumBytes, DAG);
3423
29.6k
3424
29.6k
  int FPDiff = 0;
3425
29.6k
  if (isTailCall && !IsSibcall && !IsMustTail) {
3426
34
    // Lower arguments at fp - stackoffset + fpdiff.
3427
34
    unsigned NumBytesCallerPushed = X86Info->getBytesToPopOnReturn();
3428
34
3429
34
    FPDiff = NumBytesCallerPushed - NumBytes;
3430
34
3431
34
    // Set the delta of movement of the returnaddr stackslot.
3432
34
    // But only set if delta is greater than previous delta.
3433
34
    if (FPDiff < X86Info->getTCReturnAddrDelta())
3434
8
      X86Info->setTCReturnAddrDelta(FPDiff);
3435
34
  }
3436
29.6k
3437
29.6k
  unsigned NumBytesToPush = NumBytes;
3438
29.6k
  unsigned NumBytesToPop = NumBytes;
3439
29.6k
3440
29.6k
  // If we have an inalloca argument, all stack space has already been allocated
3441
29.6k
  // for us and be right at the top of the stack.  We don't support multiple
3442
29.6k
  // arguments passed in memory when using inalloca.
3443
29.6k
  if (!Outs.empty() && Outs.back().Flags.isInAlloca()) {
3444
22
    NumBytesToPush = 0;
3445
22
    if (!ArgLocs.back().isMemLoc())
3446
1
      report_fatal_error("cannot use inalloca attribute on a register "
3447
1
                         "parameter");
3448
21
    if (ArgLocs.back().getLocMemOffset() != 0)
3449
0
      report_fatal_error("any parameter with the inalloca attribute must be "
3450
0
                         "the only memory argument");
3451
29.6k
  }
3452
29.6k
3453
29.6k
  if (!IsSibcall)
3454
27.8k
    Chain = DAG.getCALLSEQ_START(Chain, NumBytesToPush,
3455
27.8k
                                 NumBytes - NumBytesToPush, dl);
3456
29.6k
3457
29.6k
  SDValue RetAddrFrIdx;
3458
29.6k
  // Load return address for tail calls.
3459
29.6k
  if (isTailCall && FPDiff)
3460
8
    Chain = EmitTailCallLoadRetAddr(DAG, RetAddrFrIdx, Chain, isTailCall,
3461
8
                                    Is64Bit, FPDiff, dl);
3462
29.6k
3463
29.6k
  SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
3464
29.6k
  SmallVector<SDValue, 8> MemOpChains;
3465
29.6k
  SDValue StackPtr;
3466
29.6k
3467
29.6k
  // The next loop assumes that the locations are in the same order of the
3468
29.6k
  // input arguments.
3469
29.6k
  assert(isSortedByValueNo(ArgLocs) &&
3470
29.6k
         "Argument Location list must be sorted before lowering");
3471
29.6k
3472
29.6k
  // Walk the register/memloc assignments, inserting copies/loads.  In the case
3473
29.6k
  // of tail call optimization arguments are handle later.
3474
29.6k
  const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
3475
97.1k
  for (unsigned I = 0, OutIndex = 0, E = ArgLocs.size(); I != E;
3476
67.5k
       ++I, ++OutIndex) {
3477
67.5k
    assert(OutIndex < Outs.size() && "Invalid Out index");
3478
67.5k
    // Skip inalloca arguments, they have already been written.
3479
67.5k
    ISD::ArgFlagsTy Flags = Outs[OutIndex].Flags;
3480
67.5k
    if (Flags.isInAlloca())
3481
21
      continue;
3482
67.5k
3483
67.5k
    CCValAssign &VA = ArgLocs[I];
3484
67.5k
    EVT RegVT = VA.getLocVT();
3485
67.5k
    SDValue Arg = OutVals[OutIndex];
3486
67.5k
    bool isByVal = Flags.isByVal();
3487
67.5k
3488
67.5k
    // Promote the value if needed.
3489
67.5k
    switch (VA.getLocInfo()) {
3490
0
    default: llvm_unreachable("Unknown loc info!");
3491
66.0k
    case CCValAssign::Full: break;
3492
43
    case CCValAssign::SExt:
3493
43
      Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, RegVT, Arg);
3494
43
      break;
3495
1.16k
    case CCValAssign::ZExt:
3496
1.16k
      Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, RegVT, Arg);
3497
1.16k
      break;
3498
256
    case CCValAssign::AExt:
3499
256
      if (Arg.getValueType().isVector() &&
3500
103
          Arg.getValueType().getVectorElementType() == MVT::i1)
3501
103
        Arg = lowerMasksToReg(Arg, RegVT, dl, DAG);
3502
153
      else if (RegVT.is128BitVector()) {
3503
3
        // Special case: passing MMX values in XMM registers.
3504
3
        Arg = DAG.getBitcast(MVT::i64, Arg);
3505
3
        Arg = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Arg);
3506
3
        Arg = getMOVL(DAG, dl, MVT::v2i64, DAG.getUNDEF(MVT::v2i64), Arg);
3507
3
      } else
3508
150
        Arg = DAG.getNode(ISD::ANY_EXTEND, dl, RegVT, Arg);
3509
256
      break;
3510
0
    case CCValAssign::BCvt:
3511
0
      Arg = DAG.getBitcast(RegVT, Arg);
3512
0
      break;
3513
17
    case CCValAssign::Indirect: {
3514
17
      // Store the argument.
3515
17
      SDValue SpillSlot = DAG.CreateStackTemporary(VA.getValVT());
3516
17
      int FI = cast<FrameIndexSDNode>(SpillSlot)->getIndex();
3517
17
      Chain = DAG.getStore(
3518
17
          Chain, dl, Arg, SpillSlot,
3519
17
          MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI));
3520
17
      Arg = SpillSlot;
3521
17
      break;
3522
67.5k
    }
3523
67.5k
    }
3524
67.5k
3525
67.5k
    if (VA.needsCustom()) {
3526
2
      assert(VA.getValVT() == MVT::v64i1 &&
3527
2
             "Currently the only custom case is when we split v64i1 to 2 regs");
3528
2
      // Split v64i1 value into two registers
3529
2
      Passv64i1ArgInRegs(dl, DAG, Chain, Arg, RegsToPass, VA, ArgLocs[++I],
3530
2
                         Subtarget);
3531
67.5k
    } else if (VA.isRegLoc()) {
3532
38.3k
      RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
3533
38.3k
      if (isVarArg && IsWin64) {
3534
61
        // Win64 ABI requires argument XMM reg to be copied to the corresponding
3535
61
        // shadow reg if callee is a varargs function.
3536
61
        unsigned ShadowReg = 0;
3537
61
        switch (VA.getLocReg()) {
3538
0
        case X86::XMM0: ShadowReg = X86::RCX; break;
3539
4
        case X86::XMM1: ShadowReg = X86::RDX; break;
3540
0
        case X86::XMM2: ShadowReg = X86::R8; break;
3541
0
        case X86::XMM3: ShadowReg = X86::R9; break;
3542
61
        }
3543
61
        if (ShadowReg)
3544
4
          RegsToPass.push_back(std::make_pair(ShadowReg, Arg));
3545
61
      }
3546
67.5k
    } else if (!IsSibcall && (!isTailCall || isByVal)) {
3547
28.6k
      assert(VA.isMemLoc());
3548
28.6k
      if (!StackPtr.getNode())
3549
9.37k
        StackPtr = DAG.getCopyFromReg(Chain, dl, RegInfo->getStackRegister(),
3550
9.37k
                                      getPointerTy(DAG.getDataLayout()));
3551
67.5k
      MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, Arg,
3552
67.5k
                                             dl, DAG, VA, Flags));
3553
67.5k
    }
3554
67.5k
  }
3555
29.6k
3556
29.6k
  if (!MemOpChains.empty())
3557
9.37k
    Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains);
3558
29.6k
3559
29.6k
  if (Subtarget.isPICStyleGOT()) {
3560
111
    // ELF / PIC requires GOT in the EBX register before function calls via PLT
3561
111
    // GOT pointer.
3562
111
    if (!isTailCall) {
3563
105
      RegsToPass.push_back(std::make_pair(
3564
105
          unsigned(X86::EBX), DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(),
3565
105
                                          getPointerTy(DAG.getDataLayout()))));
3566
111
    } else {
3567
6
      // If we are tail calling and generating PIC/GOT style code load the
3568
6
      // address of the callee into ECX. The value in ecx is used as target of
3569
6
      // the tail jump. This is done to circumvent the ebx/callee-saved problem
3570
6
      // for tail calls on PIC/GOT architectures. Normally we would just put the
3571
6
      // address of GOT into ebx and then call target@PLT. But for tail calls
3572
6
      // ebx would be restored (since ebx is callee saved) before jumping to the
3573
6
      // target@PLT.
3574
6
3575
6
      // Note: The actual moving to ECX is done further down.
3576
6
      GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee);
3577
6
      if (G && !G->getGlobal()->hasLocalLinkage() &&
3578
4
          G->getGlobal()->hasDefaultVisibility())
3579
2
        Callee = LowerGlobalAddress(Callee, DAG);
3580
4
      else if (isa<ExternalSymbolSDNode>(Callee))
3581
0
        Callee = LowerExternalSymbol(Callee, DAG);
3582
6
    }
3583
111
  }
3584
29.6k
3585
29.6k
  if (Is64Bit && isVarArg && !IsWin64 && !IsMustTail) {
3586
1.70k
    // From AMD64 ABI document:
3587
1.70k
    // For calls that may call functions that use varargs or stdargs
3588
1.70k
    // (prototype-less calls or calls to functions containing ellipsis (...) in
3589
1.70k
    // the declaration) %al is used as hidden argument to specify the number
3590
1.70k
    // of SSE registers used. The contents of %al do not need to match exactly
3591
1.70k
    // the number of registers, but must be an upper bound on the number of SSE
3592
1.70k
    // registers used and is in the range 0 - 8 inclusive.
3593
1.70k
3594
1.70k
    // Count the number of XMM registers allocated.
3595
1.70k
    static const MCPhysReg XMMArgRegs[] = {
3596
1.70k
      X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
3597
1.70k
      X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
3598
1.70k
    };
3599
1.70k
    unsigned NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs);
3600
1.70k
    assert((Subtarget.hasSSE1() || !NumXMMRegs)
3601
1.70k
           && "SSE registers cannot be used when SSE is disabled");
3602
1.70k
3603
1.70k
    RegsToPass.push_back(std::make_pair(unsigned(X86::AL),
3604
1.70k
                                        DAG.getConstant(NumXMMRegs, dl,
3605
1.70k
                                                        MVT::i8)));
3606
1.70k
  }
3607
29.6k
3608
29.6k
  if (isVarArg && IsMustTail) {
3609
26
    const auto &Forwards = X86Info->getForwardedMustTailRegParms();
3610
179
    for (const auto &F : Forwards) {
3611
179
      SDValue Val = DAG.getCopyFromReg(Chain, dl, F.VReg, F.VT);
3612
179
      RegsToPass.push_back(std::make_pair(unsigned(F.PReg), Val));
3613
179
    }
3614
26
  }
3615
29.6k
3616
29.6k
  // For tail calls lower the arguments to the 'real' stack slots.  Sibcalls
3617
29.6k
  // don't need this because the eligibility check rejects calls that require
3618
29.6k
  // shuffling arguments passed in memory.
3619
29.6k
  if (!IsSibcall && isTailCall) {
3620
97
    // Force all the incoming stack arguments to be loaded from the stack
3621
97
    // before any new outgoing arguments are stored to the stack, because the
3622
97
    // outgoing stack slots may alias the incoming argument stack slots, and
3623
97
    // the alias isn't otherwise explicit. This is slightly more conservative
3624
97
    // than necessary, because it means that each store effectively depends
3625
97
    // on every argument instead of just those arguments it would clobber.
3626
97
    SDValue ArgChain = DAG.getStackArgumentTokenFactor(Chain);
3627
97
3628
97
    SmallVector<SDValue, 8> MemOpChains2;
3629
97
    SDValue FIN;
3630
97
    int FI = 0;
3631
327
    for (unsigned I = 0, OutsIndex = 0, E = ArgLocs.size(); I != E;
3632
230
         ++I, ++OutsIndex) {
3633
230
      CCValAssign &VA = ArgLocs[I];
3634
230
3635
230
      if (VA.isRegLoc()) {
3636
138
        if (VA.needsCustom()) {
3637
0
          assert((CallConv == CallingConv::X86_RegCall) &&
3638
0
                 "Expecting custom case only in regcall calling convention");
3639
0
          // This means that we are in special case where one argument was
3640
0
          // passed through two register locations - Skip the next location
3641
0
          ++I;
3642
0
        }
3643
138
3644
138
        continue;
3645
138
      }
3646
92
3647
230
      assert(VA.isMemLoc());
3648
92
      SDValue Arg = OutVals[OutsIndex];
3649
92
      ISD::ArgFlagsTy Flags = Outs[OutsIndex].Flags;
3650
92
      // Skip inalloca arguments.  They don't require any work.
3651
92
      if (Flags.isInAlloca())
3652
12
        continue;
3653
80
      // Create frame index.
3654
80
      int32_t Offset = VA.getLocMemOffset()+FPDiff;
3655
80
      uint32_t OpSize = (VA.getLocVT().getSizeInBits()+7)/8;
3656
80
      FI = MF.getFrameInfo().CreateFixedObject(OpSize, Offset, true);
3657
80
      FIN = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout()));
3658
80
3659
80
      if (Flags.isByVal()) {
3660
2
        // Copy relative to framepointer.
3661
2
        SDValue Source = DAG.getIntPtrConstant(VA.getLocMemOffset(), dl);
3662
2
        if (!StackPtr.getNode())
3663
0
          StackPtr = DAG.getCopyFromReg(Chain, dl, RegInfo->getStackRegister(),
3664
0
                                        getPointerTy(DAG.getDataLayout()));
3665
2
        Source = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()),
3666
2
                             StackPtr, Source);
3667
2
3668
2
        MemOpChains2.push_back(CreateCopyOfByValArgument(Source, FIN,
3669
2
                                                         ArgChain,
3670
2
                                                         Flags, DAG, dl));
3671
80
      } else {
3672
78
        // Store relative to framepointer.
3673
78
        MemOpChains2.push_back(DAG.getStore(
3674
78
            ArgChain, dl, Arg, FIN,
3675
78
            MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI)));
3676
78
      }
3677
230
    }
3678
97
3679
97
    if (!MemOpChains2.empty())
3680
48
      Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains2);
3681
97
3682
97
    // Store the return address to the appropriate stack slot.
3683
97
    Chain = EmitTailCallStoreRetAddr(DAG, MF, Chain, RetAddrFrIdx,
3684
97
                                     getPointerTy(DAG.getDataLayout()),
3685
97
                                     RegInfo->getSlotSize(), FPDiff, dl);
3686
97
  }
3687
29.6k
3688
29.6k
  // Build a sequence of copy-to-reg nodes chained together with token chain
3689
29.6k
  // and flag operands which copy the outgoing args into registers.
3690
29.6k
  SDValue InFlag;
3691
69.9k
  for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
3692
40.3k
    Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
3693
40.3k
                             RegsToPass[i].second, InFlag);
3694
40.3k
    InFlag = Chain.getValue(1);
3695
40.3k
  }
3696
29.6k
3697
29.6k
  if (DAG.getTarget().getCodeModel() == CodeModel::Large) {
3698
237
    assert(Is64Bit && "Large code model is only legal in 64-bit mode.");
3699
237
    // In the 64-bit large code model, we have to make all calls
3700
237
    // through a register, since the call instruction's 32-bit
3701
237
    // pc-relative offset may not be large enough to hold the whole
3702
237
    // address.
3703
29.6k
  } else if (Callee->getOpcode() == ISD::GlobalAddress) {
3704
25.8k
    // If the callee is a GlobalAddress node (quite common, every direct call
3705
25.8k
    // is) turn it into a TargetGlobalAddress node so that legalize doesn't hack
3706
25.8k
    // it.
3707
25.8k
    GlobalAddressSDNode* G = cast<GlobalAddressSDNode>(Callee);
3708
25.8k
3709
25.8k
    // We should use extra load for direct calls to dllimported functions in
3710
25.8k
    // non-JIT mode.
3711
25.8k
    const GlobalValue *GV = G->getGlobal();
3712
25.8k
    if (!GV->hasDLLImportStorageClass()) {
3713
25.8k
      unsigned char OpFlags = Subtarget.classifyGlobalFunctionReference(GV);
3714
25.8k
3715
25.8k
      Callee = DAG.getTargetGlobalAddress(
3716
25.8k
          GV, dl, getPointerTy(DAG.getDataLayout()), G->getOffset(), OpFlags);
3717
25.8k
3718
25.8k
      if (OpFlags == X86II::MO_GOTPCREL) {
3719
9
        // Add a wrapper.
3720
9
        Callee = DAG.getNode(X86ISD::WrapperRIP, dl,
3721
9
          getPointerTy(DAG.getDataLayout()), Callee);
3722
9
        // Add extra indirection
3723
9
        Callee = DAG.getLoad(
3724
9
            getPointerTy(DAG.getDataLayout()), dl, DAG.getEntryNode(), Callee,
3725
9
            MachinePointerInfo::getGOT(DAG.getMachineFunction()));
3726
9
      }
3727
25.8k
    }
3728
29.3k
  } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) {
3729
2.10k
    const Module *Mod = DAG.getMachineFunction().getFunction()->getParent();
3730
2.10k
    unsigned char OpFlags =
3731
2.10k
        Subtarget.classifyGlobalFunctionReference(nullptr, *Mod);
3732
2.10k
3733
2.10k
    Callee = DAG.getTargetExternalSymbol(
3734
2.10k
        S->getSymbol(), getPointerTy(DAG.getDataLayout()), OpFlags);
3735
3.49k
  } else if (Subtarget.isTarget64BitILP32() &&
3736
1.38k
             Callee->getValueType(0) == MVT::i32) {
3737
18
    // Zero-extend the 32-bit Callee address into a 64-bit according to x32 ABI
3738
18
    Callee = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, Callee);
3739
18
  }
3740
29.6k
3741
29.6k
  // Returns a chain & a flag for retval copy to use.
3742
29.6k
  SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
3743
29.6k
  SmallVector<SDValue, 8> Ops;
3744
29.6k
3745
29.6k
  if (!IsSibcall && isTailCall) {
3746
97
    Chain = DAG.getCALLSEQ_END(Chain,
3747
97
                               DAG.getIntPtrConstant(NumBytesToPop, dl, true),
3748
97
                               DAG.getIntPtrConstant(0, dl, true), InFlag, dl);
3749
97
    InFlag = Chain.getValue(1);
3750
97
  }
3751
29.6k
3752
29.6k
  Ops.push_back(Chain);
3753
29.6k
  Ops.push_back(Callee);
3754
29.6k
3755
29.6k
  if (isTailCall)
3756
1.91k
    Ops.push_back(DAG.getConstant(FPDiff, dl, MVT::i32));
3757
29.6k
3758
29.6k
  // Add argument registers to the end of the list so that they are known live
3759
29.6k
  // into the call.
3760
69.9k
  for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i)
3761
40.3k
    Ops.push_back(DAG.getRegister(RegsToPass[i].first,
3762
40.3k
                                  RegsToPass[i].second.getValueType()));
3763
29.6k
3764
29.6k
  // Add a register mask operand representing the call-preserved registers.
3765
29.6k
  // If HasNCSR is asserted (attribute NoCallerSavedRegisters exists) then we
3766
29.6k
  // set X86_INTR calling convention because it has the same CSR mask
3767
29.6k
  // (same preserved registers).
3768
29.6k
  const uint32_t *Mask = RegInfo->getCallPreservedMask(
3769
29.6k
      MF, HasNCSR ? (CallingConv::ID)CallingConv::X86_INTR : CallConv);
3770
29.6k
  assert(Mask && "Missing call preserved mask for calling convention");
3771
29.6k
3772
29.6k
  // If this is an invoke in a 32-bit function using a funclet-based
3773
29.6k
  // personality, assume the function clobbers all registers. If an exception
3774
29.6k
  // is thrown, the runtime will not restore CSRs.
3775
29.6k
  // FIXME: Model this more precisely so that we can register allocate across
3776
29.6k
  // the normal edge and spill and fill across the exceptional edge.
3777
29.6k
  if (!Is64Bit && CLI.CS && CLI.CS.isInvoke()) {
3778
171
    const Function *CallerFn = MF.getFunction();
3779
171
    EHPersonality Pers =
3780
171
        CallerFn->hasPersonalityFn()
3781
171
            ? classifyEHPersonality(CallerFn->getPersonalityFn())
3782
0
            : EHPersonality::Unknown;
3783
171
    if (isFuncletEHPersonality(Pers))
3784
47
      Mask = RegInfo->getNoPreservedMask();
3785
171
  }
3786
29.6k
3787
29.6k
  // Define a new register mask from the existing mask.
3788
29.6k
  uint32_t *RegMask = nullptr;
3789
29.6k
3790
29.6k
  // In some calling conventions we need to remove the used physical registers
3791
29.6k
  // from the reg mask.
3792
29.6k
  if (CallConv == CallingConv::X86_RegCall || HasNCSR) {
3793
70
    const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
3794
70
3795
70
    // Allocate a new Reg Mask and copy Mask.
3796
70
    RegMask = MF.allocateRegisterMask(TRI->getNumRegs());
3797
70
    unsigned RegMaskSize = (TRI->getNumRegs() + 31) / 32;
3798
70
    memcpy(RegMask, Mask, sizeof(uint32_t) * RegMaskSize);
3799
70
3800
70
    // Make sure all sub registers of the argument registers are reset
3801
70
    // in the RegMask.
3802
70
    for (auto const &RegPair : RegsToPass)
3803
128
      for (MCSubRegIterator SubRegs(RegPair.first, TRI, /*IncludeSelf=*/true);
3804
569
           SubRegs.isValid(); ++SubRegs)
3805
441
        RegMask[*SubRegs / 32] &= ~(1u << (*SubRegs % 32));
3806
70
3807
70
    // Create the RegMask Operand according to our updated mask.
3808
70
    Ops.push_back(DAG.getRegisterMask(RegMask));
3809
29.6k
  } else {
3810
29.5k
    // Create the RegMask Operand according to the static mask.
3811
29.5k
    Ops.push_back(DAG.getRegisterMask(Mask));
3812
29.5k
  }
3813
29.6k
3814
29.6k
  if (InFlag.getNode())
3815
15.7k
    Ops.push_back(InFlag);
3816
29.6k
3817
29.6k
  if (isTailCall) {
3818
1.91k
    // We used to do:
3819
1.91k
    //// If this is the first return lowered for this function, add the regs
3820
1.91k
    //// to the liveout set for the function.
3821
1.91k
    // This isn't right, although it's probably harmless on x86; liveouts
3822
1.91k
    // should be computed from returns not tail calls.  Consider a void
3823
1.91k
    // function making a tail call to a function returning int.
3824
1.91k
    MF.getFrameInfo().setHasTailCall();
3825
1.91k
    return DAG.getNode(X86ISD::TC_RETURN, dl, NodeTys, Ops);
3826
1.91k
  }
3827
27.7k
3828
27.7k
  Chain = DAG.getNode(X86ISD::CALL, dl, NodeTys, Ops);
3829
27.7k
  InFlag = Chain.getValue(1);
3830
27.7k
3831
27.7k
  // Create the CALLSEQ_END node.
3832
27.7k
  unsigned NumBytesForCalleeToPop;
3833
27.7k
  if (X86::isCalleePop(CallConv, Is64Bit, isVarArg,
3834
27.7k
                       DAG.getTarget().Options.GuaranteedTailCallOpt))
3835
154
    NumBytesForCalleeToPop = NumBytes;    // Callee pops everything
3836
27.5k
  else if (!Is64Bit && !canGuaranteeTCO(CallConv) &&
3837
10.4k
           !Subtarget.getTargetTriple().isOSMSVCRT() &&
3838
9.89k
           SR == StackStructReturn)
3839
27.5k
    // If this is a call to a struct-return function, the callee
3840
27.5k
    // pops the hidden struct pointer, so we have to push it back.
3841
27.5k
    // This is common for Darwin/X86, Linux & Mingw32 targets.
3842
27.5k
    // For MSVC Win32 targets, the caller pops the hidden struct pointer.
3843
210
    NumBytesForCalleeToPop = 4;
3844
27.5k
  else
3845
27.3k
    NumBytesForCalleeToPop = 0;  // Callee pops nothing.
3846
27.7k
3847
27.7k
  if (CLI.DoesNotReturn && !getTargetMachine().Options.TrapUnreachable) {
3848
4.30k
    // No need to reset the stack after the call if the call doesn't return. To
3849
4.30k
    // make the MI verify, we'll pretend the callee does it for us.
3850
4.30k
    NumBytesForCalleeToPop = NumBytes;
3851
4.30k
  }
3852
27.7k
3853
27.7k
  // Returns a flag for retval copy to use.
3854
27.7k
  if (!IsSibcall) {
3855
27.7k
    Chain = DAG.getCALLSEQ_END(Chain,
3856
27.7k
                               DAG.getIntPtrConstant(NumBytesToPop, dl, true),
3857
27.7k
                               DAG.getIntPtrConstant(NumBytesForCalleeToPop, dl,
3858
27.7k
                                                     true),
3859
27.7k
                               InFlag, dl);
3860
27.7k
    InFlag = Chain.getValue(1);
3861
27.7k
  }
3862
29.6k
3863
29.6k
  // Handle result values, copying them out of physregs into vregs that we
3864
29.6k
  // return.
3865
29.6k
  return LowerCallResult(Chain, InFlag, CallConv, isVarArg, Ins, dl, DAG,
3866
29.6k
                         InVals, RegMask);
3867
29.6k
}
3868
3869
//===----------------------------------------------------------------------===//
3870
//                Fast Calling Convention (tail call) implementation
3871
//===----------------------------------------------------------------------===//
3872
3873
//  Like the stdcall convention, the callee cleans up the arguments, except that ECX is
3874
//  reserved for storing the tail called function address. Only 2 registers are
3875
//  free for argument passing (inreg). Tail call optimization is performed
3876
//  provided:
3877
//                * tailcallopt is enabled
3878
//                * caller/callee are fastcc
3879
//  On X86_64 architecture with GOT-style position independent code only local
3880
//  (within module) calls are supported at the moment.
3881
//  To keep the stack aligned according to platform abi the function
3882
//  GetAlignedArgumentStackSize ensures that the argument delta is always a multiple
3883
//  of stack alignment. (Dynamic linkers need this - darwin's dyld for example)
3884
//  If a tail called function callee has more arguments than the caller the
3885
//  caller needs to make sure that there is room to move the RETADDR to. This is
3886
//  achieved by reserving an area the size of the argument delta right after the
3887
//  original RETADDR, but before the saved framepointer or the spilled registers
3888
//  e.g. caller(arg1, arg2) calls callee(arg1, arg2,arg3,arg4)
3889
//  stack layout:
3890
//    arg1
3891
//    arg2
3892
//    RETADDR
3893
//    [ new RETADDR
3894
//      move area ]
3895
//    (possible EBP)
3896
//    ESI
3897
//    EDI
3898
//    local1 ..
3899
3900
/// Make the stack size align e.g 16n + 12 aligned for a 16-byte align
3901
/// requirement.
3902
unsigned
3903
X86TargetLowering::GetAlignedArgumentStackSize(unsigned StackSize,
3904
84
                                               SelectionDAG& DAG) const {
3905
84
  const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
3906
84
  const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();
3907
84
  unsigned StackAlignment = TFI.getStackAlignment();
3908
84
  uint64_t AlignMask = StackAlignment - 1;
3909
84
  int64_t Offset = StackSize;
3910
84
  unsigned SlotSize = RegInfo->getSlotSize();
3911
84
  if ((Offset & AlignMask) <= (StackAlignment - SlotSize)) {
3912
84
    // Number smaller than 12 so just add the difference.
3913
84
    Offset += ((StackAlignment - SlotSize) - (Offset & AlignMask));
3914
84
  } else {
3915
0
    // Mask out lower bits, add stackalignment once plus the 12 bytes.
3916
0
    Offset = ((~AlignMask) & Offset) + StackAlignment +
3917
0
      (StackAlignment-SlotSize);
3918
0
  }
3919
84
  return Offset;
3920
84
}
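A minimal standalone sketch of the same alignment arithmetic, using an assumed 16-byte stack alignment and 4-byte slot size (illustrative only, not part of the generated report):

#include <cassert>
#include <cstdint>

// Same two-branch computation as GetAlignedArgumentStackSize above.
static uint64_t alignArgStackSize(uint64_t Offset, uint64_t StackAlignment,
                                  uint64_t SlotSize) {
  uint64_t AlignMask = StackAlignment - 1;
  if ((Offset & AlignMask) <= StackAlignment - SlotSize)
    Offset += (StackAlignment - SlotSize) - (Offset & AlignMask);
  else
    Offset = (~AlignMask & Offset) + StackAlignment + (StackAlignment - SlotSize);
  return Offset;
}

int main() {
  assert(alignArgStackSize(20, 16, 4) == 28);  // 16*1 + 12
  assert(alignArgStackSize(30, 16, 4) == 44);  // 16*2 + 12
  return 0;
}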
3921
3922
/// Return true if the given stack call argument is already available in the
3923
/// same position (relatively) of the caller's incoming argument stack.
3924
static
3925
bool MatchingStackOffset(SDValue Arg, unsigned Offset, ISD::ArgFlagsTy Flags,
3926
                         MachineFrameInfo &MFI, const MachineRegisterInfo *MRI,
3927
1.01k
                         const X86InstrInfo *TII, const CCValAssign &VA) {
3928
1.01k
  unsigned Bytes = Arg.getValueSizeInBits() / 8;
3929
1.01k
3930
1.02k
  for (;;) {
3931
1.02k
    // Look through nodes that don't alter the bits of the incoming value.
3932
1.02k
    unsigned Op = Arg.getOpcode();
3933
1.02k
    if (Op == ISD::ZERO_EXTEND || Op == ISD::ANY_EXTEND || Op == ISD::BITCAST) {
3934
13
      Arg = Arg.getOperand(0);
3935
13
      continue;
3936
13
    }
3937
1.01k
    if (Op == ISD::TRUNCATE) {
3938
6
      const SDValue &TruncInput = Arg.getOperand(0);
3939
6
      if (TruncInput.getOpcode() == ISD::AssertZext &&
3940
1
          cast<VTSDNode>(TruncInput.getOperand(1))->getVT() ==
3941
6
              Arg.getValueType()) {
3942
1
        Arg = TruncInput.getOperand(0);
3943
1
        continue;
3944
1
      }
3945
1.01k
    }
3946
1.01k
    break;
3947
1.01k
  }
3948
1.01k
3949
1.01k
  int FI = INT_MAX;
3950
1.01k
  if (Arg.getOpcode() == ISD::CopyFromReg) {
3951
210
    unsigned VR = cast<RegisterSDNode>(Arg.getOperand(1))->getReg();
3952
210
    if (!TargetRegisterInfo::isVirtualRegister(VR))
3953
15
      return false;
3954
195
    MachineInstr *Def = MRI->getVRegDef(VR);
3955
195
    if (!Def)
3956
3
      return false;
3957
192
    if (!Flags.isByVal()) {
3958
189
      if (!TII->isLoadFromStackSlot(*Def, FI))
3959
52
        return false;
3960
3
    } else {
3961
3
      unsigned Opcode = Def->getOpcode();
3962
3
      if ((Opcode == X86::LEA32r || Opcode == X86::LEA64r ||
3963
1
           Opcode == X86::LEA64_32r) &&
3964
3
          Def->getOperand(1).isFI()) {
3965
3
        FI = Def->getOperand(1).getIndex();
3966
3
        Bytes = Flags.getByValSize();
3967
3
      } else
3968
0
        return false;
3969
1.01k
    }
3970
801
  } else if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Arg)) {
3971
547
    if (Flags.isByVal())
3972
547
      // ByVal argument is passed in as a pointer but it's now being
3973
547
      // dereferenced. e.g.
3974
547
      // define @foo(%struct.X* %A) {
3975
547
      //   tail call @bar(%struct.X* byval %A)
3976
547
      // }
3977
1
      return false;
3978
546
    SDValue Ptr = Ld->getBasePtr();
3979
546
    FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr);
3980
546
    if (!FINode)
3981
27
      return false;
3982
519
    FI = FINode->getIndex();
3983
801
  } else if (Arg.getOpcode() == ISD::FrameIndex && Flags.isByVal()) {
3984
4
    FrameIndexSDNode *FINode = cast<FrameIndexSDNode>(Arg);
3985
4
    FI = FINode->getIndex();
3986
4
    Bytes = Flags.getByValSize();
3987
4
  } else
3988
250
    return false;
3989
663
3990
1.01k
  assert(FI != INT_MAX);
3991
663
  if (!MFI.isFixedObjectIndex(FI))
3992
0
    return false;
3993
663
3994
663
  if (Offset != MFI.getObjectOffset(FI))
3995
42
    return false;
3996
621
3997
621
  // If this is not byval, check that the argument stack object is immutable.
3998
621
  // inalloca and argument copy elision can create mutable argument stack
3999
621
  // objects. Byval objects can be mutated, but a byval call intends to pass the
4000
621
  // mutated memory.
4001
621
  if (!Flags.isByVal() && !MFI.isImmutableObjectIndex(FI))
4002
6
    return false;
4003
615
4004
615
  if (VA.getLocVT().getSizeInBits() > Arg.getValueSizeInBits()) {
4005
8
    // If the argument location is wider than the argument type, check that any
4006
8
    // extension flags match.
4007
8
    if (Flags.isZExt() != MFI.isObjectZExt(FI) ||
4008
8
        Flags.isSExt() != MFI.isObjectSExt(FI)) {
4009
1
      return false;
4010
1
    }
4011
614
  }
4012
614
4013
614
  return Bytes == MFI.getObjectSize(FI);
4014
614
}
4015
4016
/// Check whether the call is eligible for tail call optimization. Targets
4017
/// that want to do tail call optimization should implement this function.
4018
bool X86TargetLowering::IsEligibleForTailCallOptimization(
4019
    SDValue Callee, CallingConv::ID CalleeCC, bool isVarArg,
4020
    bool isCalleeStructRet, bool isCallerStructRet, Type *RetTy,
4021
    const SmallVectorImpl<ISD::OutputArg> &Outs,
4022
    const SmallVectorImpl<SDValue> &OutVals,
4023
2.37k
    const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG &DAG) const {
4024
2.37k
  if (!mayTailCallThisCC(CalleeCC))
4025
2
    return false;
4026
2.37k
4027
2.37k
  // If -tailcallopt is specified, make fastcc functions tail-callable.
4028
2.37k
  MachineFunction &MF = DAG.getMachineFunction();
4029
2.37k
  const Function *CallerF = MF.getFunction();
4030
2.37k
4031
2.37k
  // If the function return type is x86_fp80 and the callee return type is not,
4032
2.37k
  // then the FP_EXTEND of the call result is not a nop. It's not safe to
4033
2.37k
  // perform a tailcall optimization here.
4034
2.37k
  if (CallerF->getReturnType()->isX86_FP80Ty() && !RetTy->isX86_FP80Ty())
4035
0
    return false;
4036
2.37k
4037
2.37k
  CallingConv::ID CallerCC = CallerF->getCallingConv();
4038
2.37k
  bool CCMatch = CallerCC == CalleeCC;
4039
2.37k
  bool IsCalleeWin64 = Subtarget.isCallingConvWin64(CalleeCC);
4040
2.37k
  bool IsCallerWin64 = Subtarget.isCallingConvWin64(CallerCC);
4041
2.37k
4042
2.37k
  // Win64 functions have extra shadow space for argument homing. Don't do the
4043
2.37k
  // sibcall if the caller and callee have mismatched expectations for this
4044
2.37k
  // space.
4045
2.37k
  if (IsCalleeWin64 != IsCallerWin64)
4046
2
    return false;
4047
2.37k
4048
2.37k
  if (DAG.getTarget().Options.GuaranteedTailCallOpt) {
4049
38
    if (canGuaranteeTCO(CalleeCC) && CCMatch)
4050
34
      return true;
4051
4
    return false;
4052
4
  }
4053
2.33k
4054
2.33k
  // Look for obvious safe cases to perform tail call optimization that do not
4055
2.33k
  // require ABI changes. This is what gcc calls sibcall.
4056
2.33k
4057
2.33k
  // Can't do sibcall if stack needs to be dynamically re-aligned. PEI needs to
4058
2.33k
  // emit a special epilogue.
4059
2.33k
  const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
4060
2.33k
  if (RegInfo->needsStackRealignment(MF))
4061
4
    return false;
4062
2.33k
4063
2.33k
  // Also avoid sibcall optimization if either caller or callee uses struct
4064
2.33k
  // return semantics.
4065
2.33k
  if (isCalleeStructRet || isCallerStructRet)
4066
3
    return false;
4067
2.32k
4068
2.32k
  // Do not sibcall optimize vararg calls unless all arguments are passed via
4069
2.32k
  // registers.
4070
2.32k
  LLVMContext &C = *DAG.getContext();
4071
2.32k
  if (isVarArg && !Outs.empty()) {
4072
145
    // Optimizing for varargs on Win64 is unlikely to be safe without
4073
145
    // additional testing.
4074
145
    if (IsCalleeWin64 || IsCallerWin64)
4075
5
      return false;
4076
140
4077
140
    SmallVector<CCValAssign, 16> ArgLocs;
4078
140
    CCState CCInfo(CalleeCC, isVarArg, MF, ArgLocs, C);
4079
140
4080
140
    CCInfo.AnalyzeCallOperands(Outs, CC_X86);
4081
362
    for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i)
4082
295
      if (!ArgLocs[i].isRegLoc())
4083
73
        return false;
4084
145
  }
4085
2.32k
4086
2.32k
  // If the call result is in ST0 / ST1, it needs to be popped off the x87
4087
2.32k
  // stack.  Therefore, if it's not used by the call it is not safe to optimize
4088
2.32k
  // this into a sibcall.
4089
2.24k
  bool Unused = false;
4090
3.06k
  for (unsigned i = 0, e = Ins.size(); i != e; ++i) {
4091
1.09k
    if (!Ins[i].Used) {
4092
281
      Unused = true;
4093
281
      break;
4094
281
    }
4095
1.09k
  }
4096
2.24k
  if (Unused) {
4097
281
    SmallVector<CCValAssign, 16> RVLocs;
4098
281
    CCState CCInfo(CalleeCC, false, MF, RVLocs, C);
4099
281
    CCInfo.AnalyzeCallResult(Ins, RetCC_X86);
4100
562
    for (unsigned i = 0, e = RVLocs.size(); i != e; ++i) {
4101
283
      CCValAssign &VA = RVLocs[i];
4102
283
      if (VA.getLocReg() == X86::FP0 || VA.getLocReg() == X86::FP1)
4103
2
        return false;
4104
283
    }
4105
281
  }
4106
2.24k
4107
2.24k
  // Check that the call results are passed in the same way.
4108
2.24k
  if (!CCState::resultsCompatible(CalleeCC, CallerCC, MF, C, Ins,
4109
2.24k
                                  RetCC_X86, RetCC_X86))
4110
1
    return false;
4111
2.24k
  // The callee has to preserve all registers the caller needs to preserve.
4112
2.24k
  const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
4113
2.24k
  const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);
4114
2.24k
  if (!CCMatch) {
4115
77
    const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC);
4116
77
    if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved))
4117
7
      return false;
4118
2.23k
  }
4119
2.23k
4120
2.23k
  unsigned StackArgsSize = 0;
4121
2.23k
4122
2.23k
  // If the callee takes no arguments then go on to check the results of the
4123
2.23k
  // call.
4124
2.23k
  if (!Outs.empty()) {
4125
1.76k
    // Check if stack adjustment is needed. For now, do not do this if any
4126
1.76k
    // argument is passed on the stack.
4127
1.76k
    SmallVector<CCValAssign, 16> ArgLocs;
4128
1.76k
    CCState CCInfo(CalleeCC, isVarArg, MF, ArgLocs, C);
4129
1.76k
4130
1.76k
    // Allocate shadow area for Win64
4131
1.76k
    if (IsCalleeWin64)
4132
25
      CCInfo.AllocateStack(32, 8);
4133
1.76k
4134
1.76k
    CCInfo.AnalyzeCallOperands(Outs, CC_X86);
4135
1.76k
    StackArgsSize = CCInfo.getNextStackOffset();
4136
1.76k
4137
1.76k
    if (CCInfo.getNextStackOffset()) {
4138
645
      // Check if the arguments are already laid out in the right way as
4139
645
      // the caller's fixed stack objects.
4140
645
      MachineFrameInfo &MFI = MF.getFrameInfo();
4141
645
      const MachineRegisterInfo *MRI = &MF.getRegInfo();
4142
645
      const X86InstrInfo *TII = Subtarget.getInstrInfo();
4143
1.39k
      for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
4144
1.15k
        CCValAssign &VA = ArgLocs[i];
4145
1.15k
        SDValue Arg = OutVals[i];
4146
1.15k
        ISD::ArgFlagsTy Flags = Outs[i].Flags;
4147
1.15k
        if (VA.getLocInfo() == CCValAssign::Indirect)
4148
0
          return false;
4149
1.15k
        if (!VA.isRegLoc()) {
4150
1.01k
          if (!MatchingStackOffset(Arg, VA.getLocMemOffset(), Flags,
4151
1.01k
                                   MFI, MRI, TII, VA))
4152
397
            return false;
4153
1.01k
        }
4154
1.15k
      }
4155
645
    }
4156
1.76k
4157
1.36k
    bool PositionIndependent = isPositionIndependent();
4158
1.36k
    // If the tailcall address may be in a register, then make sure it's
4159
1.36k
    // possible to register allocate for it. In 32-bit, the call address can
4160
1.36k
    // only target EAX, EDX, or ECX since the tail call must be scheduled after
4161
1.36k
    // callee-saved registers are restored. These happen to be the same
4162
1.36k
    // registers used to pass 'inreg' arguments so watch out for those.
4163
1.36k
    if (!Subtarget.is64Bit() && ((!isa<GlobalAddressSDNode>(Callee) &&
4164
38
                                  !isa<ExternalSymbolSDNode>(Callee)) ||
4165
1.36k
                                 PositionIndependent)) {
4166
163
      unsigned NumInRegs = 0;
4167
163
      // In PIC we need an extra register to formulate the address computation
4168
163
      // for the callee.
4169
163
      unsigned MaxInRegs = PositionIndependent ? 2 : 3;
4170
163
4171
445
      for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
4172
285
        CCValAssign &VA = ArgLocs[i];
4173
285
        if (!VA.isRegLoc())
4174
268
          continue;
4175
17
        unsigned Reg = VA.getLocReg();
4176
17
        switch (Reg) {
4177
4
        default: break;
4178
13
        case X86::EAX: case X86::EDX: case X86::ECX:
4179
13
          if (++NumInRegs == MaxInRegs)
4180
3
            return false;
4181
10
          break;
4182
285
        }
4183
285
      }
4184
163
    }
4185
1.36k
4186
1.36k
    const MachineRegisterInfo &MRI = MF.getRegInfo();
4187
1.36k
    if (!parametersInCSRMatch(MRI, CallerPreserved, ArgLocs, OutVals))
4188
3
      return false;
4189
1.83k
  }
4190
1.83k
4191
1.83k
  bool CalleeWillPop =
4192
1.83k
      X86::isCalleePop(CalleeCC, Subtarget.is64Bit(), isVarArg,
4193
1.83k
                       MF.getTarget().Options.GuaranteedTailCallOpt);
4194
1.83k
4195
1.83k
  if (unsigned BytesToPop =
4196
24
          MF.getInfo<X86MachineFunctionInfo>()->getBytesToPopOnReturn()) {
4197
24
    // If we have bytes to pop, the callee must pop them.
4198
12
    bool CalleePopMatches = CalleeWillPop && BytesToPop == StackArgsSize;
4199
24
    if (!CalleePopMatches)
4200
17
      return false;
4201
1.81k
  } else if (CalleeWillPop && StackArgsSize > 0) {
4202
0
    // If we don't have bytes to pop, make sure the callee doesn't pop any.
4203
0
    return false;
4204
0
  }
4205
1.81k
4206
1.81k
  return true;
4207
1.81k
}
4208
4209
FastISel *
4210
X86TargetLowering::createFastISel(FunctionLoweringInfo &funcInfo,
4211
4.96k
                                  const TargetLibraryInfo *libInfo) const {
4212
4.96k
  return X86::createFastISel(funcInfo, libInfo);
4213
4.96k
}
4214
4215
//===----------------------------------------------------------------------===//
4216
//                           Other Lowering Hooks
4217
//===----------------------------------------------------------------------===//
4218
4219
5.97k
static bool MayFoldLoad(SDValue Op) {
4220
4.43k
  return Op.hasOneUse() && ISD::isNormalLoad(Op.getNode());
4221
5.97k
}
4222
4223
178
static bool MayFoldIntoStore(SDValue Op) {
4224
176
  return Op.hasOneUse() && ISD::isNormalStore(*Op.getNode()->use_begin());
4225
178
}
4226
4227
175
static bool MayFoldIntoZeroExtend(SDValue Op) {
4228
175
  if (Op.hasOneUse()) {
4229
175
    unsigned Opcode = Op.getNode()->use_begin()->getOpcode();
4230
175
    return (ISD::ZERO_EXTEND == Opcode);
4231
175
  }
4232
0
  return false;
4233
0
}
4234
4235
2.58M
static bool isTargetShuffle(unsigned Opcode) {
4236
2.58M
  switch(Opcode) {
4237
1.08M
  default: return false;
4238
1.50M
  case X86ISD::BLENDI:
4239
1.50M
  case X86ISD::PSHUFB:
4240
1.50M
  case X86ISD::PSHUFD:
4241
1.50M
  case X86ISD::PSHUFHW:
4242
1.50M
  case X86ISD::PSHUFLW:
4243
1.50M
  case X86ISD::SHUFP:
4244
1.50M
  case X86ISD::INSERTPS:
4245
1.50M
  case X86ISD::EXTRQI:
4246
1.50M
  case X86ISD::INSERTQI:
4247
1.50M
  case X86ISD::PALIGNR:
4248
1.50M
  case X86ISD::VSHLDQ:
4249
1.50M
  case X86ISD::VSRLDQ:
4250
1.50M
  case X86ISD::MOVLHPS:
4251
1.50M
  case X86ISD::MOVHLPS:
4252
1.50M
  case X86ISD::MOVLPS:
4253
1.50M
  case X86ISD::MOVLPD:
4254
1.50M
  case X86ISD::MOVSHDUP:
4255
1.50M
  case X86ISD::MOVSLDUP:
4256
1.50M
  case X86ISD::MOVDDUP:
4257
1.50M
  case X86ISD::MOVSS:
4258
1.50M
  case X86ISD::MOVSD:
4259
1.50M
  case X86ISD::UNPCKL:
4260
1.50M
  case X86ISD::UNPCKH:
4261
1.50M
  case X86ISD::VBROADCAST:
4262
1.50M
  case X86ISD::VPERMILPI:
4263
1.50M
  case X86ISD::VPERMILPV:
4264
1.50M
  case X86ISD::VPERM2X128:
4265
1.50M
  case X86ISD::VPERMIL2:
4266
1.50M
  case X86ISD::VPERMI:
4267
1.50M
  case X86ISD::VPPERM:
4268
1.50M
  case X86ISD::VPERMV:
4269
1.50M
  case X86ISD::VPERMV3:
4270
1.50M
  case X86ISD::VPERMIV3:
4271
1.50M
  case X86ISD::VZEXT_MOVL:
4272
1.50M
    return true;
4273
0
  }
4274
0
}
4275
4276
1.37M
static bool isTargetShuffleVariableMask(unsigned Opcode) {
4277
1.37M
  switch (Opcode) {
4278
1.32M
  default: return false;
4279
1.37M
  // Target Shuffles.
4280
46.5k
  case X86ISD::PSHUFB:
4281
46.5k
  case X86ISD::VPERMILPV:
4282
46.5k
  case X86ISD::VPERMIL2:
4283
46.5k
  case X86ISD::VPPERM:
4284
46.5k
  case X86ISD::VPERMV:
4285
46.5k
  case X86ISD::VPERMV3:
4286
46.5k
  case X86ISD::VPERMIV3:
4287
46.5k
    return true;
4288
46.5k
  // 'Faux' Target Shuffles.
4289
3.88k
  case ISD::AND:
4290
3.88k
  case X86ISD::ANDNP:
4291
3.88k
    return true;
4292
0
  }
4293
0
}
4294
4295
37
SDValue X86TargetLowering::getReturnAddressFrameIndex(SelectionDAG &DAG) const {
4296
37
  MachineFunction &MF = DAG.getMachineFunction();
4297
37
  const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
4298
37
  X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
4299
37
  int ReturnAddrIndex = FuncInfo->getRAIndex();
4300
37
4301
37
  if (ReturnAddrIndex == 0) {
4302
37
    // Set up a frame object for the return address.
4303
37
    unsigned SlotSize = RegInfo->getSlotSize();
4304
37
    ReturnAddrIndex = MF.getFrameInfo().CreateFixedObject(SlotSize,
4305
37
                                                          -(int64_t)SlotSize,
4306
37
                                                          false);
4307
37
    FuncInfo->setRAIndex(ReturnAddrIndex);
4308
37
  }
4309
37
4310
37
  return DAG.getFrameIndex(ReturnAddrIndex, getPointerTy(DAG.getDataLayout()));
4311
37
}
4312
4313
bool X86::isOffsetSuitableForCodeModel(int64_t Offset, CodeModel::Model M,
4314
2.72M
                                       bool hasSymbolicDisplacement) {
4315
2.72M
  // Offset should fit into 32 bit immediate field.
4316
2.72M
  if (!isInt<32>(Offset))
4317
547
    return false;
4318
2.72M
4319
2.72M
  // If we don't have a symbolic displacement - we don't have any extra
4320
2.72M
  // restrictions.
4321
2.72M
  if (!hasSymbolicDisplacement)
4322
2.66M
    return true;
4323
58.0k
4324
58.0k
  // FIXME: Some tweaks might be needed for medium code model.
4325
58.0k
  
  if (M != CodeModel::Small && M != CodeModel::Kernel)
4326
413
    return false;
4327
57.6k
4328
57.6k
  // For small code model we assume that latest object is 16MB before end of 31
4329
57.6k
  // bits boundary. We may also accept pretty large negative constants knowing
4330
57.6k
  // that all objects are in the positive half of address space.
4331
57.6k
  
  if (M == CodeModel::Small && Offset < 16*1024*1024)
4332
57.5k
    return true;
4333
83
4334
83
  // For kernel code model we know that all objects reside in the negative half
4335
83
  // of 32bits address space. We may not accept negative offsets, since they may
4336
83
  // be just off and we may accept pretty large positive ones.
4337
83
  
  if (M == CodeModel::Kernel && Offset >= 0)
4338
74
    return true;
4339
9
4340
9
  return false;
4341
9
}
4342
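A minimal standalone sketch of the offset rule reported above, assuming a simplified code-model enum (illustrative names only, not the LLVM API):

#include <cassert>
#include <cstdint>

enum class CM { Small, Kernel, Other };

// Mirrors the checks above for the case where a symbolic displacement is present.
static bool offsetFitsWithSymbol(int64_t Off, CM M) {
  if (Off < INT32_MIN || Off > INT32_MAX)
    return false;                          // must fit a 32-bit displacement
  if (M == CM::Small)
    return Off < 16 * 1024 * 1024;         // keep 16MB of headroom below 2^31
  if (M == CM::Kernel)
    return Off >= 0;                       // objects live in the negative half
  return false;
}

int main() {
  assert(offsetFitsWithSymbol(16 * 1024 * 1024 - 1, CM::Small));
  assert(!offsetFitsWithSymbol(16 * 1024 * 1024, CM::Small));
  assert(offsetFitsWithSymbol(8, CM::Kernel) && !offsetFitsWithSymbol(-8, CM::Kernel));
}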
4343
/// Determines whether the callee is required to pop its own arguments.
4344
/// Callee pop is necessary to support tail calls.
4345
bool X86::isCalleePop(CallingConv::ID CallingConv,
4346
101k
                      bool is64Bit, bool IsVarArg, bool GuaranteeTCO) {
4347
101k
  // If GuaranteeTCO is true, we force some calls to be callee pop so that we
4348
101k
  // can guarantee TCO.
4349
101k
  if (!IsVarArg && shouldGuaranteeTCO(CallingConv, GuaranteeTCO))
4350
50
    return true;
4351
101k
4352
101k
  switch (CallingConv) {
4353
100k
  default:
4354
100k
    return false;
4355
395
  case CallingConv::X86_StdCall:
4356
395
  case CallingConv::X86_FastCall:
4357
395
  case CallingConv::X86_ThisCall:
4358
395
  case CallingConv::X86_VectorCall:
4359
395
    return !is64Bit;
4360
0
  }
4361
0
}
4362
4363
/// \brief Return true if the condition is an unsigned comparison operation.
4364
54
static bool isX86CCUnsigned(unsigned X86CC) {
4365
54
  switch (X86CC) {
4366
0
  default:
4367
0
    llvm_unreachable("Invalid integer condition!");
4368
53
  case X86::COND_E:
4369
53
  case X86::COND_NE:
4370
53
  case X86::COND_B:
4371
53
  case X86::COND_A:
4372
53
  case X86::COND_BE:
4373
53
  case X86::COND_AE:
4374
53
    return true;
4375
1
  case X86::COND_G:
4376
1
  case X86::COND_GE:
4377
1
  case X86::COND_L:
4378
1
  case X86::COND_LE:
4379
1
    return false;
4380
0
  }
4381
0
}
4382
4383
34.1k
static X86::CondCode TranslateIntegerX86CC(ISD::CondCode SetCCOpcode) {
4384
34.1k
  switch (SetCCOpcode) {
4385
0
  
  default: llvm_unreachable("Invalid integer condition!");
4386
14.0k
  case ISD::SETEQ:  return X86::COND_E;
4387
1.20k
  case ISD::SETGT:  return X86::COND_G;
4388
261
  case ISD::SETGE:  return X86::COND_GE;
4389
862
  case ISD::SETLT:  return X86::COND_L;
4390
137
  case ISD::SETLE:  return X86::COND_LE;
4391
8.81k
  case ISD::SETNE:  return X86::COND_NE;
4392
3.71k
  case ISD::SETULT: return X86::COND_B;
4393
3.33k
  case ISD::SETUGT: return X86::COND_A;
4394
694
  case ISD::SETULE: return X86::COND_BE;
4395
1.06k
  case ISD::SETUGE: return X86::COND_AE;
4396
0
  }
4397
0
}
4398
4399
/// Do a one-to-one translation of a ISD::CondCode to the X86-specific
4400
/// condition code, returning the condition code and the LHS/RHS of the
4401
/// comparison to make.
4402
static X86::CondCode TranslateX86CC(ISD::CondCode SetCCOpcode, const SDLoc &DL,
4403
                               bool isFP, SDValue &LHS, SDValue &RHS,
4404
37.1k
                               SelectionDAG &DAG) {
4405
37.1k
  if (!isFP) {
4406
34.9k
    if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS)) {
4407
26.9k
      if (SetCCOpcode == ISD::SETGT && RHSC->isAllOnesValue()) {
4408
267
        // X > -1   -> X == 0, jump !sign.
4409
267
        RHS = DAG.getConstant(0, DL, RHS.getValueType());
4410
267
        return X86::COND_NS;
4411
267
      }
4412
26.7k
      
      if (SetCCOpcode == ISD::SETLT && RHSC->isNullValue()) {
4413
666
        // X < 0   -> X == 0, jump on sign.
4414
666
        return X86::COND_S;
4415
666
      }
4416
26.0k
      
      if (SetCCOpcode == ISD::SETLT && RHSC->getZExtValue() == 1) {
4417
375
        // X < 1   -> X <= 0
4418
375
        RHS = DAG.getConstant(0, DL, RHS.getValueType());
4419
375
        return X86::COND_LE;
4420
375
      }
4421
33.6k
    }
4422
33.6k
4423
33.6k
    return TranslateIntegerX86CC(SetCCOpcode);
4424
33.6k
  }
4425
2.20k
4426
2.20k
  // First determine if it is required or is profitable to flip the operands.
4427
2.20k
4428
2.20k
  // If LHS is a foldable load, but RHS is not, flip the condition.
4429
2.20k
  
  if (ISD::isNON_EXTLoad(LHS.getNode()) &&
4430
2.20k
      !ISD::isNON_EXTLoad(RHS.getNode())) {
4431
233
    SetCCOpcode = getSetCCSwappedOperands(SetCCOpcode);
4432
233
    std::swap(LHS, RHS);
4433
233
  }
4434
2.20k
4435
2.20k
  switch (SetCCOpcode) {
4436
2.12k
  default: break;
4437
80
  case ISD::SETOLT:
4438
80
  case ISD::SETOLE:
4439
80
  case ISD::SETUGT:
4440
80
  case ISD::SETUGE:
4441
80
    std::swap(LHS, RHS);
4442
80
    break;
4443
2.20k
  }
4444
2.20k
4445
2.20k
  // On a floating point condition, the flags are set as follows:
4446
2.20k
  // ZF  PF  CF   op
4447
2.20k
  //  0 | 0 | 0 | X > Y
4448
2.20k
  //  0 | 0 | 1 | X < Y
4449
2.20k
  //  1 | 0 | 0 | X == Y
4450
2.20k
  //  1 | 1 | 1 | unordered
4451
2.20k
  switch (SetCCOpcode) {
4452
0
  
  default: llvm_unreachable("Condcode should be pre-legalized away");
4453
266
  case ISD::SETUEQ:
4454
266
  case ISD::SETEQ:   return X86::COND_E;
4455
448
  case ISD::SETOLT:              // flipped
4456
448
  case ISD::SETOGT:
4457
448
  case ISD::SETGT:   return X86::COND_A;
4458
435
  case ISD::SETOLE:              // flipped
4459
435
  case ISD::SETOGE:
4460
435
  case ISD::SETGE:   return X86::COND_AE;
4461
312
  case ISD::SETUGT:              // flipped
4462
312
  case ISD::SETULT:
4463
312
  case ISD::SETLT:   return X86::COND_B;
4464
124
  case ISD::SETUGE:              // flipped
4465
124
  case ISD::SETULE:
4466
124
  case ISD::SETLE:   return X86::COND_BE;
4467
24
  case ISD::SETONE:
4468
24
  case ISD::SETNE:   return X86::COND_NE;
4469
188
  case ISD::SETUO:   return X86::COND_P;
4470
258
  case ISD::SETO:    return X86::COND_NP;
4471
154
  case ISD::SETOEQ:
4472
154
  case ISD::SETUNE:  return X86::COND_INVALID;
4473
0
  }
4474
0
}
4475
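A hedged illustration of the flag table above: a stand-in for ucomiss(X, Y) plus the CF/ZF tests that make SETOGT map to COND_A; standalone code, not the actual lowering.

#include <cassert>
#include <cmath>

struct Flags { bool ZF, PF, CF; };

static Flags ucomiss(float X, float Y) {                           // compare X against Y
  if (std::isnan(X) || std::isnan(Y)) return {true, true, true};   // unordered
  if (X > Y) return {false, false, false};
  if (X < Y) return {false, false, true};
  return {true, false, false};                                     // equal
}

static bool condA(Flags F) { return !F.CF && !F.ZF; }  // COND_A: X > Y
static bool condB(Flags F) { return F.CF; }            // COND_B: X < Y

int main() {
  assert(condA(ucomiss(2.0f, 1.0f)) && !condA(ucomiss(1.0f, 2.0f)));
  // SETOLT/SETOLE/SETUGT/SETUGE are handled by swapping the operands first,
  // which is why the switch above marks them as "flipped".
  assert(condB(ucomiss(1.0f, 2.0f)));
}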
4476
/// Is there a floating point cmov for the specific X86 condition code?
4477
/// Current x86 isa includes the following FP cmov instructions:
4478
/// fcmovb, fcmovbe, fcmove, fcmovu, fcmovae, fcmova, fcmovne, fcmovnu.
4479
724
static bool hasFPCMov(unsigned X86CC) {
4480
724
  switch (X86CC) {
4481
65
  default:
4482
65
    return false;
4483
659
  case X86::COND_B:
4484
659
  case X86::COND_BE:
4485
659
  case X86::COND_E:
4486
659
  case X86::COND_P:
4487
659
  case X86::COND_A:
4488
659
  case X86::COND_AE:
4489
659
  case X86::COND_NE:
4490
659
  case X86::COND_NP:
4491
659
    return true;
4492
0
  }
4493
0
}
4494
4495
4496
bool X86TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
4497
                                           const CallInst &I,
4498
12.4k
                                           unsigned Intrinsic) const {
4499
12.4k
4500
12.4k
  const IntrinsicData* IntrData = getIntrinsicWithChain(Intrinsic);
4501
12.4k
  if (!IntrData)
4502
11.9k
    return false;
4503
455
4504
455
  Info.opc = ISD::INTRINSIC_W_CHAIN;
4505
455
  Info.readMem = false;
4506
455
  Info.writeMem = false;
4507
455
  Info.vol = false;
4508
455
  Info.offset = 0;
4509
455
4510
455
  switch (IntrData->Type) {
4511
6
  case EXPAND_FROM_MEM: {
4512
6
    Info.ptrVal = I.getArgOperand(0);
4513
6
    Info.memVT = MVT::getVT(I.getType());
4514
6
    Info.align = 1;
4515
6
    Info.readMem = true;
4516
6
    break;
4517
455
  }
4518
5
  case COMPRESS_TO_MEM: {
4519
5
    Info.ptrVal = I.getArgOperand(0);
4520
5
    Info.memVT = MVT::getVT(I.getArgOperand(1)->getType());
4521
5
    Info.align = 1;
4522
5
    Info.writeMem = true;
4523
5
    break;
4524
455
  }
4525
114
  case TRUNCATE_TO_MEM_VI8:
4526
114
  case TRUNCATE_TO_MEM_VI16:
4527
114
  case TRUNCATE_TO_MEM_VI32: {
4528
114
    Info.ptrVal = I.getArgOperand(0);
4529
114
    MVT VT  = MVT::getVT(I.getArgOperand(1)->getType());
4530
114
    MVT ScalarVT = MVT::INVALID_SIMPLE_VALUE_TYPE;
4531
114
    if (IntrData->Type == TRUNCATE_TO_MEM_VI8)
4532
60
      ScalarVT = MVT::i8;
4533
54
    else if (IntrData->Type == TRUNCATE_TO_MEM_VI16)
4534
36
      ScalarVT = MVT::i16;
4535
18
    else if (IntrData->Type == TRUNCATE_TO_MEM_VI32)
4536
18
      ScalarVT = MVT::i32;
4537
114
4538
114
    Info.memVT = MVT::getVectorVT(ScalarVT, VT.getVectorNumElements());
4539
114
    Info.align = 1;
4540
114
    Info.writeMem = true;
4541
114
    break;
4542
114
  }
4543
330
  default:
4544
330
    return false;
4545
125
  }
4546
125
4547
125
  return true;
4548
125
}
4549
4550
/// Returns true if the target can instruction select the
4551
/// specified FP immediate natively. If false, the legalizer will
4552
/// materialize the FP immediate as a load from a constant pool.
4553
5.17k
bool X86TargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const {
4554
27.4k
  for (unsigned i = 0, e = LegalFPImmediates.size(); i != e; ++i) {
4555
24.8k
    if (Imm.bitwiseIsEqual(LegalFPImmediates[i]))
4556
2.62k
      return true;
4557
24.8k
  }
4558
2.55k
  return false;
4559
5.17k
}
4560
4561
bool X86TargetLowering::shouldReduceLoadWidth(SDNode *Load,
4562
                                              ISD::LoadExtType ExtTy,
4563
531
                                              EVT NewVT) const {
4564
531
  // "ELF Handling for Thread-Local Storage" specifies that R_X86_64_GOTTPOFF
4565
531
  // relocation target a movq or addq instruction: don't let the load shrink.
4566
531
  SDValue BasePtr = cast<LoadSDNode>(Load)->getBasePtr();
4567
531
  if (BasePtr.getOpcode() == X86ISD::WrapperRIP)
4568
9
    
    if (const auto *GA = dyn_cast<GlobalAddressSDNode>(BasePtr.getOperand(0)))
4569
4
      return GA->getTargetFlags() != X86II::MO_GOTTPOFF;
4570
527
  return true;
4571
527
}
4572
4573
/// \brief Returns true if it is beneficial to convert a load of a constant
4574
/// to just the constant itself.
4575
bool X86TargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm,
4576
88
                                                          Type *Ty) const {
4577
88
  assert(Ty->isIntegerTy());
4578
88
4579
88
  unsigned BitSize = Ty->getPrimitiveSizeInBits();
4580
88
  if (BitSize == 0 || BitSize > 64)
4581
0
    return false;
4582
88
  return true;
4583
88
}
4584
4585
15.9k
bool X86TargetLowering::convertSelectOfConstantsToMath(EVT VT) const {
4586
15.9k
  // TODO: It might be a win to ease or lift this restriction, but the generic
4587
15.9k
  // folds in DAGCombiner conflict with vector folds for an AVX512 target.
4588
15.9k
  if (VT.isVector() && Subtarget.hasAVX512())
4589
14.0k
    return false;
4590
1.88k
4591
1.88k
  return true;
4592
1.88k
}
4593
4594
bool X86TargetLowering::isExtractSubvectorCheap(EVT ResVT, EVT SrcVT,
4595
510
                                                unsigned Index) const {
4596
510
  if (!isOperationLegalOrCustom(ISD::EXTRACT_SUBVECTOR, ResVT))
4597
38
    return false;
4598
472
4599
472
  // Mask vectors support all subregister combinations and operations that
4600
472
  // extract half of vector.
4601
472
  
  if (ResVT.getVectorElementType() == MVT::i1)
4602
0
    
    return Index == 0 || ((ResVT.getSizeInBits() == SrcVT.getSizeInBits()*2) &&
4603
0
                          (Index == ResVT.getVectorNumElements()));
4604
472
4605
472
  return (Index % ResVT.getVectorNumElements()) == 0;
4606
472
}
4607
4608
2.95k
bool X86TargetLowering::isCheapToSpeculateCttz() const {
4609
2.95k
  // Speculate cttz only if we can directly use TZCNT.
4610
2.95k
  return Subtarget.hasBMI();
4611
2.95k
}
4612
4613
6.02k
bool X86TargetLowering::isCheapToSpeculateCtlz() const {
4614
6.02k
  // Speculate ctlz only if we can directly use LZCNT.
4615
6.02k
  return Subtarget.hasLZCNT();
4616
6.02k
}
4617
4618
bool X86TargetLowering::canMergeStoresTo(unsigned AddressSpace, EVT MemVT,
4619
59.2k
                                         const SelectionDAG &DAG) const {
4620
59.2k
  // Do not merge to float value size (128 bytes) if no implicit
4621
59.2k
  // float attribute is set.
4622
59.2k
  bool NoFloat = DAG.getMachineFunction().getFunction()->hasFnAttribute(
4623
59.2k
      Attribute::NoImplicitFloat);
4624
59.2k
4625
59.2k
  if (NoFloat) {
4626
471
    unsigned MaxIntSize = Subtarget.is64Bit() ? 64 : 32;
4627
471
    return (MemVT.getSizeInBits() <= MaxIntSize);
4628
471
  }
4629
58.7k
  return true;
4630
58.7k
}
4631
4632
18.2k
bool X86TargetLowering::isCtlzFast() const {
4633
18.2k
  return Subtarget.hasFastLZCNT();
4634
18.2k
}
4635
4636
bool X86TargetLowering::isMaskAndCmp0FoldingBeneficial(
4637
95
    const Instruction &AndI) const {
4638
95
  return true;
4639
95
}
4640
4641
534
bool X86TargetLowering::hasAndNotCompare(SDValue Y) const {
4642
534
  if (!Subtarget.hasBMI())
4643
496
    return false;
4644
38
4645
38
  // There are only 32-bit and 64-bit forms for 'andn'.
4646
38
  EVT VT = Y.getValueType();
4647
38
  if (VT != MVT::i32 && VT != MVT::i64)
4648
6
    return false;
4649
32
4650
32
  return true;
4651
32
}
4652
4653
48
MVT X86TargetLowering::hasFastEqualityCompare(unsigned NumBits) const {
4654
48
  MVT VT = MVT::getIntegerVT(NumBits);
4655
48
  if (isTypeLegal(VT))
4656
4
    return VT;
4657
44
4658
44
  // PMOVMSKB can handle this.
4659
44
  
  if (NumBits == 128 && isTypeLegal(MVT::v16i8))
4660
10
    return MVT::v16i8;
4661
34
4662
34
  // VPMOVMSKB can handle this.
4663
34
  
  if (NumBits == 256 && isTypeLegal(MVT::v32i8))
4664
6
    return MVT::v32i8;
4665
28
4666
28
  // TODO: Allow 64-bit type for 32-bit target.
4667
28
  // TODO: 512-bit types should be allowed, but make sure that those
4668
28
  // cases are handled in combineVectorSizedSetCCEquality().
4669
28
4670
28
  return MVT::INVALID_SIMPLE_VALUE_TYPE;
4671
28
}
4672
4673
/// Val is the undef sentinel value or equal to the specified value.
4674
409k
static bool isUndefOrEqual(int Val, int CmpVal) {
4675
341k
  return ((Val == SM_SentinelUndef) || (Val == CmpVal));
4676
409k
}
4677
4678
/// Val is either the undef or zero sentinel value.
4679
1.44M
static bool isUndefOrZero(int Val) {
4680
1.32M
  return ((Val == SM_SentinelUndef) || (Val == SM_SentinelZero));
4681
1.44M
}
4682
4683
/// Return true if every element in Mask, beginning
4684
/// from position Pos and ending in Pos+Size is the undef sentinel value.
4685
31.5k
static bool isUndefInRange(ArrayRef<int> Mask, unsigned Pos, unsigned Size) {
4686
50.3k
  for (unsigned i = Pos, e = Pos + Size; i != e; ++i)
4687
47.9k
    
    if (Mask[i] != SM_SentinelUndef)
4688
29.1k
      return false;
4689
2.33k
  return true;
4690
31.5k
}
4691
4692
/// Return true if Val is undef or if its value falls within the
4693
/// specified range [Low, Hi).
4694
61.7k
static bool isUndefOrInRange(int Val, int Low, int Hi) {
4695
59.7k
  return (Val == SM_SentinelUndef) || (Val >= Low && Val < Hi);
4696
61.7k
}
4697
4698
/// Return true if every element in Mask is undef or if its value
4699
/// falls within the specified range [Low, Hi).
4700
static bool isUndefOrInRange(ArrayRef<int> Mask,
4701
15.7k
                             int Low, int Hi) {
4702
15.7k
  for (int M : Mask)
4703
49.4k
    
    if (!isUndefOrInRange(M, Low, Hi))
4704
7.48k
      return false;
4705
8.28k
  return true;
4706
8.28k
}
4707
4708
/// Return true if Val is undef, zero or if its value falls within the
4709
/// specified range [Low, Hi).
4710
16
static bool isUndefOrZeroOrInRange(int Val, int Low, int Hi) {
4711
11
  return isUndefOrZero(Val) || (Val >= Low && Val < Hi);
4712
16
}
4713
4714
/// Return true if every element in Mask is undef, zero or if its value
4715
/// falls within the specified range [Low, Hi).
4716
14
static bool isUndefOrZeroOrInRange(ArrayRef<int> Mask, int Low, int Hi) {
4717
14
  for (int M : Mask)
4718
16
    
    if (!isUndefOrZeroOrInRange(M, Low, Hi))
4719
9
      return false;
4720
5
  return true;
4721
5
}
4722
4723
/// Return true if every element in Mask, beginning
4724
/// from position Pos and ending in Pos+Size, falls within the specified
4725
/// sequential range [Low, Low+Size), or is undef.
4726
static bool isSequentialOrUndefInRange(ArrayRef<int> Mask,
4727
259k
                                       unsigned Pos, unsigned Size, int Low) {
4728
377k
  for (unsigned i = Pos, e = Pos+Size; i != e; ++i, ++Low)
4729
347k
    
    if (!isUndefOrEqual(Mask[i], Low))
4730
229k
      return false;
4731
30.2k
  return true;
4732
259k
}
4733
4734
/// Return true if every element in Mask, beginning
4735
/// from position Pos and ending in Pos+Size, falls within the specified
4736
/// sequential range [Low, Low+Size), or is undef or is zero.
4737
static bool isSequentialOrUndefOrZeroInRange(ArrayRef<int> Mask, unsigned Pos,
4738
4.07k
                                             unsigned Size, int Low) {
4739
34.8k
  for (unsigned i = Pos, e = Pos + Size; i != e; ++i, ++Low)
4740
33.0k
    
    if (!isUndefOrZero(Mask[i]) && Mask[i] != Low)
4741
2.27k
      return false;
4742
1.79k
  return true;
4743
4.07k
}
4744
4745
/// Return true if every element in Mask, beginning
4746
/// from position Pos and ending in Pos+Size is undef or is zero.
4747
static bool isUndefOrZeroInRange(ArrayRef<int> Mask, unsigned Pos,
4748
57.6k
                                 unsigned Size) {
4749
98.0k
  for (unsigned i = Pos, e = Pos + Size; i != e; ++i)
4750
83.2k
    
    if (!isUndefOrZero(Mask[i]))
4751
42.7k
      return false;
4752
14.8k
  return true;
4753
57.6k
}
4754
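A small standalone illustration of the sentinel convention the helpers above rely on: -1 (SM_SentinelUndef) matches anything and -2 (SM_SentinelZero) marks a lane known to be zero; the constants are restated here for the example only.

#include <cassert>
#include <vector>

constexpr int Undef = -1, Zero = -2;   // stand-ins for the SM_Sentinel* values

static bool undefOrEqual(int Val, int Cmp) { return Val == Undef || Val == Cmp; }
static bool undefOrZero(int Val) { return Val == Undef || Val == Zero; }

int main() {
  std::vector<int> Mask = {0, Undef, 2, Zero};   // a 4-lane target shuffle mask
  assert(undefOrEqual(Mask[1], 1));              // an undef lane matches any index
  assert(!undefOrEqual(Mask[3], 3));             // a zeroed lane does not
  assert(undefOrZero(Mask[3]) && !undefOrZero(Mask[2]));
}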
4755
/// \brief Helper function to test whether a shuffle mask could be
4756
/// simplified by widening the elements being shuffled.
4757
///
4758
/// Appends the mask for wider elements in WidenedMask if valid. Otherwise
4759
/// leaves it in an unspecified state.
4760
///
4761
/// NOTE: This must handle normal vector shuffle masks and *target* vector
4762
/// shuffle masks. The latter have the special property of a '-2' representing
4763
/// a zero-ed lane of a vector.
4764
static bool canWidenShuffleElements(ArrayRef<int> Mask,
4765
144k
                                    SmallVectorImpl<int> &WidenedMask) {
4766
144k
  WidenedMask.assign(Mask.size() / 2, 0);
4767
371k
  for (int i = 0, Size = Mask.size(); i < Size; i += 2) {
4768
342k
    int M0 = Mask[i];
4769
342k
    int M1 = Mask[i + 1];
4770
342k
4771
342k
    // If both elements are undef, its trivial.
4772
342k
    if (M0 == SM_SentinelUndef && M1 == SM_SentinelUndef) {
4773
20.6k
      WidenedMask[i / 2] = SM_SentinelUndef;
4774
20.6k
      continue;
4775
20.6k
    }
4776
322k
4777
322k
    // Check for an undef mask and a mask value properly aligned to fit with
4778
322k
    // a pair of values. If we find such a case, use the non-undef mask's value.
4779
322k
    
    if (M0 == SM_SentinelUndef && M1 >= 0 && (M1 % 2) == 1) {
4780
1.19k
      WidenedMask[i / 2] = M1 / 2;
4781
1.19k
      continue;
4782
1.19k
    }
4783
321k
    
    if (M1 == SM_SentinelUndef && M0 >= 0 && (M0 % 2) == 0) {
4784
3.82k
      WidenedMask[i / 2] = M0 / 2;
4785
3.82k
      continue;
4786
3.82k
    }
4787
317k
4788
317k
    // When zeroing, we need to spread the zeroing across both lanes to widen.
4789
317k
    
    if (M0 == SM_SentinelZero || M1 == SM_SentinelZero) {
4790
50.7k
      if ((M0 == SM_SentinelZero || M0 == SM_SentinelUndef) &&
4791
50.7k
          (M1 == SM_SentinelZero || M1 == SM_SentinelUndef)) {
4792
31.9k
        WidenedMask[i / 2] = SM_SentinelZero;
4793
31.9k
        continue;
4794
31.9k
      }
4795
18.7k
      return false;
4796
18.7k
    }
4797
266k
4798
266k
    // Finally check if the two mask values are adjacent and aligned with
4799
266k
    // a pair.
4800
266k
    
    if (M0 != SM_SentinelUndef && (M0 % 2) == 0 && (M0 + 1) == M1) {
4801
169k
      WidenedMask[i / 2] = M0 / 2;
4802
169k
      continue;
4803
169k
    }
4804
96.8k
4805
96.8k
    // Otherwise we can't safely widen the elements used in this shuffle.
4806
96.8k
    return false;
4807
96.8k
  }
4808
28.8k
  assert(WidenedMask.size() == Mask.size() / 2 &&
4809
28.8k
         "Incorrect size of mask after widening the elements!");
4810
28.8k
4811
28.8k
  return true;
4812
144k
}
4813
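A sketch of the widening rule above on plain std::vector masks (sentinels as in the previous helpers: -1 undef, -2 zero); this mirrors the logic for illustration, it is not the LLVM API.

#include <cassert>
#include <vector>

static bool widen(const std::vector<int> &Mask, std::vector<int> &Out) {
  Out.assign(Mask.size() / 2, 0);
  for (size_t i = 0; i < Mask.size(); i += 2) {
    int M0 = Mask[i], M1 = Mask[i + 1];
    if (M0 == -1 && M1 == -1) { Out[i / 2] = -1; continue; }            // both undef
    if (M0 == -1 && M1 >= 0 && M1 % 2 == 1) { Out[i / 2] = M1 / 2; continue; }
    if (M1 == -1 && M0 >= 0 && M0 % 2 == 0) { Out[i / 2] = M0 / 2; continue; }
    if (M0 == -2 || M1 == -2) {                                         // zero lanes
      if ((M0 == -2 || M0 == -1) && (M1 == -2 || M1 == -1)) { Out[i / 2] = -2; continue; }
      return false;
    }
    if (M0 >= 0 && M0 % 2 == 0 && M0 + 1 == M1) { Out[i / 2] = M0 / 2; continue; }
    return false;                                                       // not pairable
  }
  return true;
}

int main() {
  std::vector<int> W;
  assert(widen({0, 1, 6, 7}, W) && W == std::vector<int>({0, 3}));  // e.g. v4i32 -> v2i64
  assert(!widen({0, 2, 4, 6}, W));                                   // elements not adjacent
}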
4814
/// Returns true if Elt is a constant zero or a floating point constant +0.0.
4815
5.24M
bool X86::isZeroNode(SDValue Elt) {
4816
5.21M
  return isNullConstant(Elt) || isNullFPConstant(Elt);
4817
5.24M
}
4818
4819
// Build a vector of constants.
4820
// Use an UNDEF node if MaskElt == -1.
4821
// Split 64-bit constants in the 32-bit mode.
4822
static SDValue getConstVector(ArrayRef<int> Values, MVT VT, SelectionDAG &DAG,
4823
1.48k
                              const SDLoc &dl, bool IsMask = false) {
4824
1.48k
4825
1.48k
  SmallVector<SDValue, 32>  Ops;
4826
1.48k
  bool Split = false;
4827
1.48k
4828
1.48k
  MVT ConstVecVT = VT;
4829
1.48k
  unsigned NumElts = VT.getVectorNumElements();
4830
1.48k
  bool In64BitMode = DAG.getTargetLoweringInfo().isTypeLegal(MVT::i64);
4831
1.48k
  if (!In64BitMode && VT.getVectorElementType() == MVT::i64) {
4832
80
    ConstVecVT = MVT::getVectorVT(MVT::i32, NumElts * 2);
4833
80
    Split = true;
4834
80
  }
4835
1.48k
4836
1.48k
  MVT EltVT = ConstVecVT.getVectorElementType();
4837
18.8k
  for (unsigned i = 0; i < NumElts; ++i) {
4838
1.11k
    bool IsUndef = Values[i] < 0 && IsMask;
4839
1.11k
    SDValue OpNode = IsUndef ? DAG.getUNDEF(EltVT) :
4840
16.2k
      DAG.getConstant(Values[i], dl, EltVT);
4841
17.3k
    Ops.push_back(OpNode);
4842
17.3k
    if (Split)
4843
632
      
      Ops.push_back(IsUndef ? DAG.getUNDEF(EltVT) :
4844
632
                    DAG.getConstant(0, dl, EltVT));
4845
17.3k
  }
4846
1.48k
  SDValue ConstsNode = DAG.getBuildVector(ConstVecVT, dl, Ops);
4847
1.48k
  if (Split)
4848
80
    ConstsNode = DAG.getBitcast(VT, ConstsNode);
4849
1.48k
  return ConstsNode;
4850
1.48k
}
4851
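Illustrative sketch of the 64-bit split used above when i64 is not legal: each 64-bit lane is emitted as two 32-bit lanes (low half first) and the result is later bitcast back to the 64-bit vector type. Standalone code, not the SelectionDAG API.

#include <cassert>
#include <cstdint>
#include <vector>

static std::vector<uint32_t> splitTo32(const std::vector<uint64_t> &Vals) {
  std::vector<uint32_t> Out;
  for (uint64_t V : Vals) {
    Out.push_back(static_cast<uint32_t>(V));        // low 32 bits
    Out.push_back(static_cast<uint32_t>(V >> 32));  // high 32 bits
  }
  return Out;
}

int main() {
  auto Out = splitTo32({0x0000000100000002ULL});
  assert(Out.size() == 2 && Out[0] == 2u && Out[1] == 1u);
}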
4852
static SDValue getConstVector(ArrayRef<APInt> Bits, APInt &Undefs,
4853
671
                              MVT VT, SelectionDAG &DAG, const SDLoc &dl) {
4854
671
  assert(Bits.size() == Undefs.getBitWidth() &&
4855
671
         "Unequal constant and undef arrays");
4856
671
  SmallVector<SDValue, 32> Ops;
4857
671
  bool Split = false;
4858
671
4859
671
  MVT ConstVecVT = VT;
4860
671
  unsigned NumElts = VT.getVectorNumElements();
4861
671
  bool In64BitMode = DAG.getTargetLoweringInfo().isTypeLegal(MVT::i64);
4862
671
  if (!In64BitMode && VT.getVectorElementType() == MVT::i64) {
4863
10
    ConstVecVT = MVT::getVectorVT(MVT::i32, NumElts * 2);
4864
10
    Split = true;
4865
10
  }
4866
671
4867
671
  MVT EltVT = ConstVecVT.getVectorElementType();
4868
10.2k
  for (unsigned i = 0, e = Bits.size(); i != e; ++i) {
4869
9.55k
    if (Undefs[i]) {
4870
124
      Ops.append(Split ? 2 : 1, DAG.getUNDEF(EltVT));
4871
124
      continue;
4872
124
    }
4873
9.43k
    const APInt &V = Bits[i];
4874
9.43k
    assert(V.getBitWidth() == VT.getScalarSizeInBits() && "Unexpected sizes");
4875
9.43k
    if (Split) {
4876
18
      Ops.push_back(DAG.getConstant(V.trunc(32), dl, EltVT));
4877
18
      Ops.push_back(DAG.getConstant(V.lshr(32).trunc(32), dl, EltVT));
4878
9.43k
    } else if (EltVT == MVT::f32) {
4879
152
      APFloat FV(APFloat::IEEEsingle(), V);
4880
152
      Ops.push_back(DAG.getConstantFP(FV, dl, EltVT));
4881
9.41k
    } else if (EltVT == MVT::f64) {
4882
60
      APFloat FV(APFloat::IEEEdouble(), V);
4883
60
      Ops.push_back(DAG.getConstantFP(FV, dl, EltVT));
4884
9.26k
    } else {
4885
9.20k
      Ops.push_back(DAG.getConstant(V, dl, EltVT));
4886
9.20k
    }
4887
9.55k
  }
4888
671
4889
671
  SDValue ConstsNode = DAG.getBuildVector(ConstVecVT, dl, Ops);
4890
671
  return DAG.getBitcast(VT, ConstsNode);
4891
671
}
4892
4893
/// Returns a vector of specified type with all zero elements.
4894
static SDValue getZeroVector(MVT VT, const X86Subtarget &Subtarget,
4895
14.2k
                             SelectionDAG &DAG, const SDLoc &dl) {
4896
14.2k
  assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector() ||
4897
14.2k
          VT.getVectorElementType() == MVT::i1) &&
4898
14.2k
         "Unexpected vector type");
4899
14.2k
4900
14.2k
  // Try to build SSE/AVX zero vectors as <N x i32> bitcasted to their dest
4901
14.2k
  // type. This ensures they get CSE'd. But if the integer type is not
4902
14.2k
  // available, use a floating-point +0.0 instead.
4903
14.2k
  SDValue Vec;
4904
14.2k
  if (!Subtarget.hasSSE2() && VT.is128BitVector()) {
4905
67
    Vec = DAG.getConstantFP(+0.0, dl, MVT::v4f32);
4906
14.2k
  } else if (VT.getVectorElementType() == MVT::i1) {
4907
288
    assert((Subtarget.hasBWI() || VT.getVectorNumElements() <= 16) &&
4908
288
           "Unexpected vector type");
4909
288
    assert((Subtarget.hasVLX() || VT.getVectorNumElements() >= 8) &&
4910
288
           "Unexpected vector type");
4911
288
    Vec = DAG.getConstant(0, dl, VT);
4912
14.1k
  } else {
4913
13.8k
    unsigned Num32BitElts = VT.getSizeInBits() / 32;
4914
13.8k
    Vec = DAG.getConstant(0, dl, MVT::getVectorVT(MVT::i32, Num32BitElts));
4915
13.8k
  }
4916
14.2k
  return DAG.getBitcast(VT, Vec);
4917
14.2k
}
4918
4919
static SDValue extractSubVector(SDValue Vec, unsigned IdxVal, SelectionDAG &DAG,
4920
11.0k
                                const SDLoc &dl, unsigned vectorWidth) {
4921
11.0k
  EVT VT = Vec.getValueType();
4922
11.0k
  EVT ElVT = VT.getVectorElementType();
4923
11.0k
  unsigned Factor = VT.getSizeInBits()/vectorWidth;
4924
11.0k
  EVT ResultVT = EVT::getVectorVT(*DAG.getContext(), ElVT,
4925
11.0k
                                  VT.getVectorNumElements()/Factor);
4926
11.0k
4927
11.0k
  // Extract the relevant vectorWidth bits.  Generate an EXTRACT_SUBVECTOR
4928
11.0k
  unsigned ElemsPerChunk = vectorWidth / ElVT.getSizeInBits();
4929
11.0k
  assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2");
4930
11.0k
4931
11.0k
  // This is the index of the first element of the vectorWidth-bit chunk
4932
11.0k
  // we want. Since ElemsPerChunk is a power of 2 just need to clear bits.
4933
11.0k
  IdxVal &= ~(ElemsPerChunk - 1);
4934
11.0k
4935
11.0k
  // If the input is a buildvector just emit a smaller one.
4936
11.0k
  if (Vec.getOpcode() == ISD::BUILD_VECTOR)
4937
1.16k
    return DAG.getBuildVector(ResultVT, dl,
4938
1.16k
                              Vec->ops().slice(IdxVal, ElemsPerChunk));
4939
9.85k
4940
9.85k
  SDValue VecIdx = DAG.getIntPtrConstant(IdxVal, dl);
4941
9.85k
  return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ResultVT, Vec, VecIdx);
4942
9.85k
}
4943
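Minimal sketch of the index rounding above: the element index is clamped to the start of its 128-bit (or 256-bit) chunk, which works because ElemsPerChunk is a power of two. Names here are illustrative only.

#include <cassert>

static unsigned chunkStart(unsigned IdxVal, unsigned ElemsPerChunk) {
  return IdxVal & ~(ElemsPerChunk - 1);   // clear the low bits
}

int main() {
  // Extracting element 5 of a v8i32 through a 128-bit chunk (4 elements per
  // chunk) starts the subvector at element 4.
  assert(chunkStart(5, 4) == 4);
  assert(chunkStart(3, 4) == 0);
}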
4944
/// Generate a DAG to grab 128-bits from a vector > 128 bits.  This
4945
/// sets things up to match to an AVX VEXTRACTF128 / VEXTRACTI128
4946
/// or AVX-512 VEXTRACTF32x4 / VEXTRACTI32x4
4947
/// instructions or a simple subregister reference. Idx is an index in the
4948
/// 128 bits we want.  It need not be aligned to a 128-bit boundary.  That makes
4949
/// lowering EXTRACT_VECTOR_ELT operations easier.
4950
static SDValue extract128BitVector(SDValue Vec, unsigned IdxVal,
4951
9.20k
                                   SelectionDAG &DAG, const SDLoc &dl) {
4952
9.20k
  assert((Vec.getValueType().is256BitVector() ||
4953
9.20k
          Vec.getValueType().is512BitVector()) && "Unexpected vector size!");
4954
9.20k
  return extractSubVector(Vec, IdxVal, DAG, dl, 128);
4955
9.20k
}
4956
4957
/// Generate a DAG to grab 256-bits from a 512-bit vector.
4958
static SDValue extract256BitVector(SDValue Vec, unsigned IdxVal,
4959
8
                                   SelectionDAG &DAG, const SDLoc &dl) {
4960
8
  assert(Vec.getValueType().is512BitVector() && "Unexpected vector size!");
4961
8
  return extractSubVector(Vec, IdxVal, DAG, dl, 256);
4962
8
}
4963
4964
static SDValue insertSubVector(SDValue Result, SDValue Vec, unsigned IdxVal,
4965
                               SelectionDAG &DAG, const SDLoc &dl,
4966
10.0k
                               unsigned vectorWidth) {
4967
10.0k
  assert((vectorWidth == 128 || vectorWidth == 256) &&
4968
10.0k
         "Unsupported vector width");
4969
10.0k
  // Inserting UNDEF is Result
4970
10.0k
  if (Vec.isUndef())
4971
649
    return Result;
4972
9.39k
  EVT VT = Vec.getValueType();
4973
9.39k
  EVT ElVT = VT.getVectorElementType();
4974
9.39k
  EVT ResultVT = Result.getValueType();
4975
9.39k
4976
9.39k
  // Insert the relevant vectorWidth bits.
4977
9.39k
  unsigned ElemsPerChunk = vectorWidth/ElVT.getSizeInBits();
4978
9.39k
  assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2");
4979
9.39k
4980
9.39k
  // This is the index of the first element of the vectorWidth-bit chunk
4981
9.39k
  // we want. Since ElemsPerChunk is a power of 2 just need to clear bits.
4982
9.39k
  IdxVal &= ~(ElemsPerChunk - 1);
4983
9.39k
4984
9.39k
  SDValue VecIdx = DAG.getIntPtrConstant(IdxVal, dl);
4985
9.39k
  return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResultVT, Result, Vec, VecIdx);
4986
9.39k
}
4987
4988
/// Generate a DAG to put 128-bits into a vector > 128 bits.  This
4989
/// sets things up to match to an AVX VINSERTF128/VINSERTI128 or
4990
/// AVX-512 VINSERTF32x4/VINSERTI32x4 instructions or a
4991
/// simple superregister reference.  Idx is an index in the 128 bits
4992
/// we want.  It need not be aligned to a 128-bit boundary.  That makes
4993
/// lowering INSERT_VECTOR_ELT operations easier.
4994
static SDValue insert128BitVector(SDValue Result, SDValue Vec, unsigned IdxVal,
4995
8.22k
                                  SelectionDAG &DAG, const SDLoc &dl) {
4996
8.22k
  assert(Vec.getValueType().is128BitVector() && "Unexpected vector size!");
4997
8.22k
  return insertSubVector(Result, Vec, IdxVal, DAG, dl, 128);
4998
8.22k
}
4999
5000
static SDValue insert256BitVector(SDValue Result, SDValue Vec, unsigned IdxVal,
5001
1.81k
                                  SelectionDAG &DAG, const SDLoc &dl) {
5002
1.81k
  assert(Vec.getValueType().is256BitVector() && "Unexpected vector size!");
5003
1.81k
  return insertSubVector(Result, Vec, IdxVal, DAG, dl, 256);
5004
1.81k
}
5005
5006
// Return true if the instruction zeroes the unused upper part of the
5007
// destination and accepts mask.
5008
4.54k
static bool isMaskedZeroUpperBitsvXi1(unsigned int Opcode) {
5009
4.54k
  switch (Opcode) {
5010
1.76k
  default:
5011
1.76k
    return false;
5012
2.77k
  case X86ISD::PCMPEQM:
5013
2.77k
  case X86ISD::PCMPGTM:
5014
2.77k
  case X86ISD::CMPM:
5015
2.77k
  case X86ISD::CMPMU:
5016
2.77k
    return true;
5017
0
  }
5018
0
}
5019
5020
/// Insert i1-subvector to i1-vector.
5021
static SDValue insert1BitVector(SDValue Op, SelectionDAG &DAG,
5022
2.76k
                                const X86Subtarget &Subtarget) {
5023
2.76k
5024
2.76k
  SDLoc dl(Op);
5025
2.76k
  SDValue Vec = Op.getOperand(0);
5026
2.76k
  SDValue SubVec = Op.getOperand(1);
5027
2.76k
  SDValue Idx = Op.getOperand(2);
5028
2.76k
5029
2.76k
  if (!isa<ConstantSDNode>(Idx))
5030
0
    return SDValue();
5031
2.76k
5032
2.76k
  unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
5033
2.76k
  if (IdxVal == 0 && Vec.isUndef()) // the operation is legal
5034
874
    return Op;
5035
1.88k
5036
1.88k
  MVT OpVT = Op.getSimpleValueType();
5037
1.88k
  MVT SubVecVT = SubVec.getSimpleValueType();
5038
1.88k
  unsigned NumElems = OpVT.getVectorNumElements();
5039
1.88k
  unsigned SubVecNumElems = SubVecVT.getVectorNumElements();
5040
1.88k
5041
1.88k
  assert(IdxVal + SubVecNumElems <= NumElems &&
5042
1.88k
         IdxVal % SubVecVT.getSizeInBits() == 0 &&
5043
1.88k
         "Unexpected index value in INSERT_SUBVECTOR");
5044
1.88k
5045
1.88k
  // There are 3 possible cases:
5046
1.88k
  // 1. Subvector should be inserted in the lower part (IdxVal == 0)
5047
1.88k
  // 2. Subvector should be inserted in the upper part
5048
1.88k
  //    (IdxVal + SubVecNumElems == NumElems)
5049
1.88k
  // 3. Subvector should be inserted in the middle (for example v2i1
5050
1.88k
  //    to v16i1, index 2)
5051
1.88k
5052
1.88k
  // If this node widens - by concatenating zeroes - the type of the result
5053
1.88k
  // of a node with instruction that zeroes all upper (irrelevant) bits of the
5054
1.88k
  // output register, mark this node as legal to enable replacing them with
5055
1.88k
  // the v8i1 version of the previous instruction during instruction selection.
5056
1.88k
  // For example, VPCMPEQDZ128rr instruction stores its v4i1 result in a k-reg,
5057
1.88k
  // while zeroing all the upper remaining 60 bits of the register. if the
5058
1.88k
  // result of such instruction is inserted into an allZeroVector, then we can
5059
1.88k
  // safely remove insert_vector (in instruction selection) as the cmp instr
5060
1.88k
  // already zeroed the rest of the register.
5061
1.88k
  if (ISD::isBuildVectorAllZeros(Vec.getNode()) && IdxVal == 0 &&
5062
1.88k
      (isMaskedZeroUpperBitsvXi1(SubVec.getOpcode()) ||
5063
948
       (SubVec.getOpcode() == ISD::AND &&
5064
918
        (isMaskedZeroUpperBitsvXi1(SubVec.getOperand(0).getOpcode()) ||
5065
948
         isMaskedZeroUpperBitsvXi1(SubVec.getOperand(1).getOpcode())))))
5066
1.85k
    return Op;
5067
37
5068
37
  // extend to natively supported kshift
5069
37
  
  MVT MinVT = Subtarget.hasDQI() ? MVT::v8i1 : MVT::v16i1;
5070
37
  MVT WideOpVT = OpVT;
5071
37
  if (OpVT.getSizeInBits() < MinVT.getStoreSizeInBits())
5072
19
    WideOpVT = MinVT;
5073
37
5074
37
  SDValue ZeroIdx = DAG.getIntPtrConstant(0, dl);
5075
37
  SDValue Undef = DAG.getUNDEF(WideOpVT);
5076
37
  SDValue WideSubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
5077
37
                                   Undef, SubVec, ZeroIdx);
5078
37
5079
37
  // Extract sub-vector if require.
5080
37
  auto ExtractSubVec = [&](SDValue V) {
5081
37
    return (WideOpVT == OpVT) ? 
V18
: DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl,
5082
19
                                                OpVT, V, ZeroIdx);
5083
37
  };
5084
37
5085
37
  if (Vec.isUndef()) {
5086
3
    if (IdxVal != 0) {
5087
3
      SDValue ShiftBits = DAG.getConstant(IdxVal, dl, MVT::i8);
5088
3
      WideSubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, WideSubVec,
5089
3
                               ShiftBits);
5090
3
    }
5091
3
    return ExtractSubVec(WideSubVec);
5092
3
  }
5093
34
5094
34
  
  if (ISD::isBuildVectorAllZeros(Vec.getNode())) {
5095
30
    NumElems = WideOpVT.getVectorNumElements();
5096
30
    unsigned ShiftLeft = NumElems - SubVecNumElems;
5097
30
    unsigned ShiftRight = NumElems - SubVecNumElems - IdxVal;
5098
30
    Vec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, WideSubVec,
5099
30
                      DAG.getConstant(ShiftLeft, dl, MVT::i8));
5100
30
    Vec = ShiftRight ? DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec,
5101
30
      DAG.getConstant(ShiftRight, dl, MVT::i8)) : Vec;
5102
30
    return ExtractSubVec(Vec);
5103
30
  }
5104
4
5105
4
  
  if (IdxVal == 0) {
5106
0
    // Zero lower bits of the Vec
5107
0
    SDValue ShiftBits = DAG.getConstant(SubVecNumElems, dl, MVT::i8);
5108
0
    Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, Undef, Vec, ZeroIdx);
5109
0
    Vec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec, ShiftBits);
5110
0
    Vec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Vec, ShiftBits);
5111
0
    // Merge them together, SubVec should be zero extended.
5112
0
    WideSubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
5113
0
                             getZeroVector(WideOpVT, Subtarget, DAG, dl),
5114
0
                             SubVec, ZeroIdx);
5115
0
    Vec =  DAG.getNode(ISD::OR, dl, WideOpVT, Vec, WideSubVec);
5116
0
    return ExtractSubVec(Vec);
5117
0
  }
5118
4
5119
4
  // Simple case when we put subvector in the upper part
5120
4
  
  if (IdxVal + SubVecNumElems == NumElems) {
5121
4
    // Zero upper bits of the Vec
5122
4
    WideSubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, WideSubVec,
5123
4
                             DAG.getConstant(IdxVal, dl, MVT::i8));
5124
4
    SDValue ShiftBits = DAG.getConstant(SubVecNumElems, dl, MVT::i8);
5125
4
    Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, Undef, Vec, ZeroIdx);
5126
4
    Vec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Vec, ShiftBits);
5127
4
    Vec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec, ShiftBits);
5128
4
    Vec = DAG.getNode(ISD::OR, dl, WideOpVT, Vec, WideSubVec);
5129
4
    return ExtractSubVec(Vec);
5130
4
  }
5131
0
  // Subvector should be inserted in the middle - use shuffle
5132
0
  WideSubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT, Undef,
5133
0
                           SubVec, ZeroIdx);
5134
0
  SmallVector<int, 64> Mask;
5135
0
  for (unsigned i = 0; i < NumElems; ++i)
5136
0
    Mask.push_back(i >= IdxVal && i < IdxVal + SubVecNumElems ?
5137
0
                    i : i + NumElems);
5138
2.76k
  return DAG.getVectorShuffle(OpVT, dl, WideSubVec, Vec, Mask);
5139
2.76k
}
5140
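Bit-level sketch of the "insert into an all-zero vector" path above, using a 16-bit integer to stand in for a v16i1 mask register (illustrative only). Shifting left by NumElems - SubVecNumElems discards any stale upper bits of the subvector, and the shift right places it at the requested index.

#include <cassert>
#include <cstdint>

static uint16_t insertIntoZero(uint16_t Sub, unsigned SubElems, unsigned Idx) {
  const unsigned NumElems = 16;
  unsigned ShiftLeft = NumElems - SubElems;
  unsigned ShiftRight = NumElems - SubElems - Idx;
  return static_cast<uint16_t>(static_cast<uint16_t>(Sub << ShiftLeft) >> ShiftRight);
}

int main() {
  // Insert the 2-bit subvector 0b10 at index 4 of a zeroed v16i1.
  assert(insertIntoZero(0b10, 2, 4) == 0b100000);
  // Stale bits above the subvector width are dropped.
  assert(insertIntoZero(0b11111110, 2, 0) == 0b10);
}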
5141
/// Concat two 128-bit vectors into a 256 bit vector using VINSERTF128
5142
/// instructions. This is used because creating CONCAT_VECTOR nodes of
5143
/// BUILD_VECTORS returns a larger BUILD_VECTOR while we're trying to lower
5144
/// large BUILD_VECTORS.
5145
static SDValue concat128BitVectors(SDValue V1, SDValue V2, EVT VT,
5146
                                   unsigned NumElems, SelectionDAG &DAG,
5147
3.85k
                                   const SDLoc &dl) {
5148
3.85k
  SDValue V = insert128BitVector(DAG.getUNDEF(VT), V1, 0, DAG, dl);
5149
3.85k
  return insert128BitVector(V, V2, NumElems / 2, DAG, dl);
5150
3.85k
}
5151
5152
static SDValue concat256BitVectors(SDValue V1, SDValue V2, EVT VT,
5153
                                   unsigned NumElems, SelectionDAG &DAG,
5154
909
                                   const SDLoc &dl) {
5155
909
  SDValue V = insert256BitVector(DAG.getUNDEF(VT), V1, 0, DAG, dl);
5156
909
  return insert256BitVector(V, V2, NumElems / 2, DAG, dl);
5157
909
}
5158
5159
/// Returns a vector of specified type with all bits set.
5160
/// Always build ones vectors as <4 x i32>, <8 x i32> or <16 x i32>.
5161
/// Then bitcast to their original type, ensuring they get CSE'd.
5162
2.61k
static SDValue getOnesVector(EVT VT, SelectionDAG &DAG, const SDLoc &dl) {
5163
2.61k
  assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) &&
5164
2.61k
         "Expected a 128/256/512-bit vector type");
5165
2.61k
5166
2.61k
  APInt Ones = APInt::getAllOnesValue(32);
5167
2.61k
  unsigned NumElts = VT.getSizeInBits() / 32;
5168
2.61k
  SDValue Vec = DAG.getConstant(Ones, dl, MVT::getVectorVT(MVT::i32, NumElts));
5169
2.61k
  return DAG.getBitcast(VT, Vec);
5170
2.61k
}
5171
5172
static SDValue getExtendInVec(unsigned Opc, const SDLoc &DL, EVT VT, SDValue In,
5173
2.30k
                              SelectionDAG &DAG) {
5174
2.30k
  EVT InVT = In.getValueType();
5175
2.30k
  assert((X86ISD::VSEXT == Opc || X86ISD::VZEXT == Opc) && "Unexpected opcode");
5176
2.30k
5177
2.30k
  if (VT.is128BitVector() && InVT.is128BitVector())
5178
510
    
    return X86ISD::VSEXT == Opc ? DAG.getSignExtendVectorInReg(In, DL, VT)
5179
468
                                : DAG.getZeroExtendVectorInReg(In, DL, VT);
5180
1.79k
5181
1.79k
  // For 256-bit vectors, we only need the lower (128-bit) input half.
5182
1.79k
  // For 512-bit vectors, we only need the lower input half or quarter.
5183
1.79k
  
  if (VT.getSizeInBits() > 128 && InVT.getSizeInBits() > 128) {
5184
486
    int Scale = VT.getScalarSizeInBits() / InVT.getScalarSizeInBits();
5185
486
    In = extractSubVector(In, 0, DAG, DL,
5186
486
                          std::max(128, (int)VT.getSizeInBits() / Scale));
5187
486
  }
5188
2.30k
5189
2.30k
  return DAG.getNode(Opc, DL, VT, In);
5190
2.30k
}
5191
5192
/// Returns a vector_shuffle node for an unpackl operation.
5193
static SDValue getUnpackl(SelectionDAG &DAG, const SDLoc &dl, MVT VT,
5194
113
                          SDValue V1, SDValue V2) {
5195
113
  SmallVector<int, 8> Mask;
5196
113
  createUnpackShuffleMask(VT, Mask, /* Lo = */ true, /* Unary = */ false);
5197
113
  return DAG.getVectorShuffle(VT, dl, V1, V2, Mask);
5198
113
}
5199
5200
/// Returns a vector_shuffle node for an unpackh operation.
5201
static SDValue getUnpackh(SelectionDAG &DAG, const SDLoc &dl, MVT VT,
5202
8
                          SDValue V1, SDValue V2) {
5203
8
  SmallVector<int, 8> Mask;
5204
8
  createUnpackShuffleMask(VT, Mask, /* Lo = */ false, /* Unary = */ false);
5205
8
  return DAG.getVectorShuffle(VT, dl, V1, V2, Mask);
5206
8
}
5207
5208
/// Return a vector_shuffle of the specified vector of zero or undef vector.
5209
/// This produces a shuffle where the low element of V2 is swizzled into the
5210
/// zero/undef vector, landing at element Idx.
5211
/// This produces a shuffle mask like 4,1,2,3 (idx=0) or  0,1,2,4 (idx=3).
5212
static SDValue getShuffleVectorZeroOrUndef(SDValue V2, int Idx,
5213
                                           bool IsZero,
5214
                                           const X86Subtarget &Subtarget,
5215
718
                                           SelectionDAG &DAG) {
5216
718
  MVT VT = V2.getSimpleValueType();
5217
718
  SDValue V1 = IsZero
5218
718
    ? getZeroVector(VT, Subtarget, DAG, SDLoc(V2)) : DAG.getUNDEF(VT);
5219
718
  int NumElems = VT.getVectorNumElements();
5220
718
  SmallVector<int, 16> MaskVec(NumElems);
5221
3.24k
  for (int i = 0; i != NumElems; ++i)
5222
718
    // If this is the insertion idx, put the low elt of V2 here.
5223
2.52k
    
    MaskVec[i] = (i == Idx) ? NumElems : i;
5224
718
  return DAG.getVectorShuffle(VT, SDLoc(V2), V1, V2, MaskVec);
5225
718
}
5226
5227
35.6M
static SDValue peekThroughBitcasts(SDValue V) {
5228
63.5M
  while (V.getNode() && V.getOpcode() == ISD::BITCAST)
5229
27.9M
    V = V.getOperand(0);
5230
35.6M
  return V;
5231
35.6M
}
5232
5233
2.32M
static SDValue peekThroughOneUseBitcasts(SDValue V) {
5234
4.42M
  while (V.getNode() && V.getOpcode() == ISD::BITCAST &&
5235
2.11M
         V.getOperand(0).hasOneUse())
5236
2.09M
    V = V.getOperand(0);
5237
2.32M
  return V;
5238
2.32M
}
5239
5240
4.25M
static const Constant *getTargetConstantFromNode(SDValue Op) {
5241
4.25M
  Op = peekThroughBitcasts(Op);
5242
4.25M
5243
4.25M
  auto *Load = dyn_cast<LoadSDNode>(Op);
5244
4.25M
  if (!Load)
5245
4.13M
    return nullptr;
5246
118k
5247
118k
  SDValue Ptr = Load->getBasePtr();
5248
118k
  if (Ptr->getOpcode() == X86ISD::Wrapper ||
5249
21.0k
      Ptr->getOpcode() == X86ISD::WrapperRIP)
5250
99.4k
    Ptr = Ptr->getOperand(0);
5251
118k
5252
118k
  auto *CNode = dyn_cast<ConstantPoolSDNode>(Ptr);
5253
118k
  if (!CNode || CNode->isMachineConstantPoolEntry())
5254
15.3k
    return nullptr;
5255
103k
5256
103k
  return dyn_cast<Constant>(CNode->getConstVal());
5257
103k
}
5258
5259
// Extract raw constant bits from constant pools.
5260
static bool getTargetConstantBitsFromNode(SDValue Op, unsigned EltSizeInBits,
5261
                                          APInt &UndefElts,
5262
                                          SmallVectorImpl<APInt> &EltBits,
5263
                                          bool AllowWholeUndefs = true,
5264
4.22M
                                          bool AllowPartialUndefs = true) {
5265
4.22M
  assert(EltBits.empty() && "Expected an empty EltBits vector");
5266
4.22M
5267
4.22M
  Op = peekThroughBitcasts(Op);
5268
4.22M
5269
4.22M
  EVT VT = Op.getValueType();
5270
4.22M
  unsigned SizeInBits = VT.getSizeInBits();
5271
4.22M
  assert((SizeInBits % EltSizeInBits) == 0 && "Can't split constant!");
5272
4.22M
  unsigned NumElts = SizeInBits / EltSizeInBits;
5273
4.22M
5274
4.22M
  // Bitcast a source array of element bits to the target size.
5275
80.8k
  auto CastBitData = [&](APInt &UndefSrcElts, ArrayRef<APInt> SrcEltBits) {
5276
80.8k
    unsigned NumSrcElts = UndefSrcElts.getBitWidth();
5277
80.8k
    unsigned SrcEltSizeInBits = SrcEltBits[0].getBitWidth();
5278
80.8k
    assert((NumSrcElts * SrcEltSizeInBits) == SizeInBits &&
5279
80.8k
           "Constant bit sizes don't match");
5280
80.8k
5281
80.8k
    // Don't split if we don't allow undef bits.
5282
60.6k
    bool AllowUndefs = AllowWholeUndefs || AllowPartialUndefs;
5283
80.8k
    if (UndefSrcElts.getBoolValue() && !AllowUndefs)
5284
32.7k
      return false;
5285
48.1k
5286
48.1k
    // If we're already the right size, don't bother bitcasting.
5287
48.1k
    
    if (NumSrcElts == NumElts) {
5288
37.6k
      UndefElts = UndefSrcElts;
5289
37.6k
      EltBits.assign(SrcEltBits.begin(), SrcEltBits.end());
5290
37.6k
      return true;
5291
37.6k
    }
5292
10.4k
5293
10.4k
    // Extract all the undef/constant element data and pack into single bitsets.
5294
10.4k
    APInt UndefBits(SizeInBits, 0);
5295
10.4k
    APInt MaskBits(SizeInBits, 0);
5296
10.4k
5297
72.8k
    for (unsigned i = 0; i != NumSrcElts; ++i) {
5298
62.3k
      unsigned BitOffset = i * SrcEltSizeInBits;
5299
62.3k
      if (UndefSrcElts[i])
5300
109
        UndefBits.setBits(BitOffset, BitOffset + SrcEltSizeInBits);
5301
62.3k
      MaskBits.insertBits(SrcEltBits[i], BitOffset);
5302
62.3k
    }
5303
10.4k
5304
10.4k
    // Split the undef/constant single bitset data into the target elements.
5305
10.4k
    UndefElts = APInt(NumElts, 0);
5306
10.4k
    EltBits.resize(NumElts, APInt(EltSizeInBits, 0));
5307
10.4k
5308
181k
    for (unsigned i = 0; i != NumElts; ++i) {
5309
170k
      unsigned BitOffset = i * EltSizeInBits;
5310
170k
      APInt UndefEltBits = UndefBits.extractBits(EltSizeInBits, BitOffset);
5311
170k
5312
170k
      // Only treat an element as UNDEF if all bits are UNDEF.
5313
170k
      if (UndefEltBits.isAllOnesValue()) {
5314
190
        if (!AllowWholeUndefs)
5315
0
          return false;
5316
190
        UndefElts.setBit(i);
5317
190
        continue;
5318
190
      }
5319
170k
5320
170k
      // If only some bits are UNDEF then treat them as zero (or bail if not
5321
170k
      // supported).
5322
170k
      
      if (UndefEltBits.getBoolValue() && !AllowPartialUndefs)
5323
0
        return false;
5324
170k
5325
170k
      APInt Bits = MaskBits.extractBits(EltSizeInBits, BitOffset);
5326
170k
      EltBits[i] = Bits.getZExtValue();
5327
170k
    }
5328
10.4k
    return true;
5329
80.8k
  };
5330
4.22M
5331
4.22M
  // Collect constant bits and insert into mask/undef bit masks.
5332
4.22M
  auto CollectConstantBits = [](const Constant *Cst, APInt &Mask, APInt &Undefs,
5333
1.10M
                                unsigned UndefBitIndex) {
5334
1.10M
    if (!Cst)
5335
0
      return false;
5336
1.10M
    
    if (isa<UndefValue>(Cst)) {
5337
381k
      Undefs.setBit(UndefBitIndex);
5338
381k
      return true;
5339
381k
    }
5340
720k
    
    if (auto *CInt = dyn_cast<ConstantInt>(Cst)) {
5341
715k
      Mask = CInt->getValue();
5342
715k
      return true;
5343
715k
    }
5344
4.58k
    
    if (auto *CFP = dyn_cast<ConstantFP>(Cst)) {
5345
4.58k
      Mask = CFP->getValueAPF().bitcastToAPInt();
5346
4.58k
      return true;
5347
4.58k
    }
5348
0
    return false;
5349
0
  };
5350
4.22M
5351
4.22M
  // Extract constant bits from build vector.
5352
4.22M
  if (ISD::isBuildVectorOfConstantSDNodes(Op.getNode())) {
5353
12.9k
    unsigned SrcEltSizeInBits = VT.getScalarSizeInBits();
5354
12.9k
    unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits;
5355
12.9k
5356
12.9k
    APInt UndefSrcElts(NumSrcElts, 0);
5357
12.9k
    SmallVector<APInt, 64> SrcEltBits(NumSrcElts, APInt(SrcEltSizeInBits, 0));
5358
174k
    for (unsigned i = 0, e = Op.getNumOperands(); i != e; ++i) {
5359
161k
      const SDValue &Src = Op.getOperand(i);
5360
161k
      if (Src.isUndef()) {
5361
532
        UndefSrcElts.setBit(i);
5362
532
        continue;
5363
532
      }
5364
161k
      auto *Cst = cast<ConstantSDNode>(Src);
5365
161k
      SrcEltBits[i] = Cst->getAPIntValue().zextOrTrunc(SrcEltSizeInBits);
5366
161k
    }
5367
12.9k
    return CastBitData(UndefSrcElts, SrcEltBits);
5368
12.9k
  }
5369
4.20M
5370
4.20M
  // Extract constant bits from constant pool vector.
5371
4.20M
  
  if (auto *Cst = getTargetConstantFromNode(Op)) {
5372
66.9k
    Type *CstTy = Cst->getType();
5373
66.9k
    if (!CstTy->isVectorTy() || (SizeInBits != CstTy->getPrimitiveSizeInBits()))
5374
18
      return false;
5375
66.9k
5376
66.9k
    unsigned SrcEltSizeInBits = CstTy->getScalarSizeInBits();
5377
66.9k
    unsigned NumSrcElts = CstTy->getVectorNumElements();
5378
66.9k
5379
66.9k
    APInt UndefSrcElts(NumSrcElts, 0);
5380
66.9k
    SmallVector<APInt, 64> SrcEltBits(NumSrcElts, APInt(SrcEltSizeInBits, 0));
5381
1.16M
    for (unsigned i = 0; i != NumSrcElts; ++i)
5382
1.10M
      
      if (!CollectConstantBits(Cst->getAggregateElement(i), SrcEltBits[i],
5383
1.10M
                               UndefSrcElts, i))
5384
0
        return false;
5385
66.9k
5386
66.9k
    return CastBitData(UndefSrcElts, SrcEltBits);
5387
4.14M
  }
5388
4.14M
5389
4.14M
  // Extract constant bits from a broadcasted constant pool scalar.
5390
4.14M
  
  if (Op.getOpcode() == X86ISD::VBROADCAST &&
5391
4.14M
      
      EltSizeInBits <= VT.getScalarSizeInBits()) {
5392
906
    if (auto *Broadcast = getTargetConstantFromNode(Op.getOperand(0))) {
5393
695
      unsigned SrcEltSizeInBits = Broadcast->getType()->getScalarSizeInBits();
5394
695
      unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits;
5395
695
5396
695
      APInt UndefSrcElts(NumSrcElts, 0);
5397
695
      SmallVector<APInt, 64> SrcEltBits(1, APInt(SrcEltSizeInBits, 0));
5398
695
      if (CollectConstantBits(Broadcast, SrcEltBits[0], UndefSrcElts, 0)) {
5399
695
        if (UndefSrcElts[0])
5400
0
          UndefSrcElts.setBits(0, NumSrcElts);
5401
695
        SrcEltBits.append(NumSrcElts - 1, SrcEltBits[0]);
5402
695
        return CastBitData(UndefSrcElts, SrcEltBits);
5403
695
      }
5404
4.14M
    }
5405
906
  }
5406
4.14M
5407
4.14M
  // Extract a rematerialized scalar constant insertion.
5408
4.14M
  
  if (Op.getOpcode() == X86ISD::VZEXT_MOVL &&
5409
754
      Op.getOperand(0).getOpcode() == ISD::SCALAR_TO_VECTOR &&
5410
4.14M
      
      isa<ConstantSDNode>(Op.getOperand(0).getOperand(0))) {
5411
252
    unsigned SrcEltSizeInBits = VT.getScalarSizeInBits();
5412
252
    unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits;
5413
252
5414
252
    APInt UndefSrcElts(NumSrcElts, 0);
5415
252
    SmallVector<APInt, 64> SrcEltBits;
5416
252
    auto *CN = cast<ConstantSDNode>(Op.getOperand(0).getOperand(0));
5417
252
    SrcEltBits.push_back(CN->getAPIntValue().zextOrTrunc(SrcEltSizeInBits));
5418
252
    SrcEltBits.append(NumSrcElts - 1, APInt(SrcEltSizeInBits, 0));
5419
252
    return CastBitData(UndefSrcElts, SrcEltBits);
5420
252
  }
5421
4.14M
5422
4.14M
  return false;
5423
4.14M
}
5424
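Standalone sketch of the bit re-slicing done by CastBitData above: source constant elements are concatenated into one bit string and then re-split at the requested element width (capped at 64-bit lanes here, for brevity). Illustrative code only, not the APInt-based implementation.

#include <cassert>
#include <cstdint>
#include <vector>

static std::vector<uint64_t> recast(const std::vector<uint8_t> &SrcBytes,
                                    unsigned DstBits) {
  uint64_t All = 0;
  for (size_t i = 0; i < SrcBytes.size(); ++i)
    All |= static_cast<uint64_t>(SrcBytes[i]) << (8 * i);   // concatenate i8 lanes
  std::vector<uint64_t> Out;
  for (unsigned Off = 0; Off < 8 * SrcBytes.size(); Off += DstBits)
    Out.push_back((All >> Off) & ((DstBits == 64) ? ~0ULL : ((1ULL << DstBits) - 1)));
  return Out;
}

int main() {
  // Eight i8 constants viewed as two i32 elements.
  auto Out = recast({1, 0, 0, 0, 2, 0, 0, 0}, 32);
  assert(Out.size() == 2 && Out[0] == 1 && Out[1] == 2);
}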
5425
static bool getTargetShuffleMaskIndices(SDValue MaskNode,
5426
                                        unsigned MaskEltSizeInBits,
5427
64.2k
                                        SmallVectorImpl<uint64_t> &RawMask) {
5428
64.2k
  APInt UndefElts;
5429
64.2k
  SmallVector<APInt, 64> EltBits;
5430
64.2k
5431
64.2k
  // Extract the raw target constant bits.
5432
64.2k
  // FIXME: We currently don't support UNDEF bits or mask entries.
5433
64.2k
  if (!getTargetConstantBitsFromNode(MaskNode, MaskEltSizeInBits, UndefElts,
5434
64.2k
                                     EltBits, /* AllowWholeUndefs */ false,
5435
64.2k
                                     /* AllowPartialUndefs */ false))
5436
36.4k
    return false;
5437
27.7k
5438
27.7k
  // Insert the extracted elements into the mask.
5439
27.7k
  for (APInt Elt : EltBits)
5440
490k
    RawMask.push_back(Elt.getZExtValue());
5441
64.2k
5442
64.2k
  return true;
5443
64.2k
}
5444
5445
/// Calculates the shuffle mask corresponding to the target-specific opcode.
5446
/// If the mask could be calculated, returns it in \p Mask, returns the shuffle
5447
/// operands in \p Ops, and returns true.
5448
/// Sets \p IsUnary to true if only one source is used. Note that this will set
5449
/// IsUnary for shuffles which use a single input multiple times, and in those
5450
/// cases it will adjust the mask to only have indices within that single input.
5451
/// It is an error to call this with non-empty Mask/Ops vectors.
5452
static bool getTargetShuffleMask(SDNode *N, MVT VT, bool AllowSentinelZero,
5453
                                 SmallVectorImpl<SDValue> &Ops,
5454
1.45M
                                 SmallVectorImpl<int> &Mask, bool &IsUnary) {
5455
1.45M
  unsigned NumElems = VT.getVectorNumElements();
5456
1.45M
  SDValue ImmN;
5457
1.45M
5458
1.45M
  assert(Mask.empty() && "getTargetShuffleMask expects an empty Mask vector");
5459
1.45M
  assert(Ops.empty() && "getTargetShuffleMask expects an empty Ops vector");
5460
1.45M
5461
1.45M
  IsUnary = false;
5462
1.45M
  bool IsFakeUnary = false;
5463
1.45M
  switch(N->getOpcode()) {
5464
12.0k
  case X86ISD::BLENDI:
5465
12.0k
    ImmN = N->getOperand(N->getNumOperands()-1);
5466
12.0k
    DecodeBLENDMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
5467
12.0k
    IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
5468
12.0k
    break;
5469
25.3k
  case X86ISD::SHUFP:
5470
25.3k
    ImmN = N->getOperand(N->getNumOperands()-1);
5471
25.3k
    DecodeSHUFPMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
5472
25.3k
    IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
5473
25.3k
    break;
5474
8.94k
  case X86ISD::INSERTPS:
5475
8.94k
    ImmN = N->getOperand(N->getNumOperands()-1);
5476
8.94k
    DecodeINSERTPSMask(cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
5477
8.94k
    IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
5478
8.94k
    break;
5479
188
  case X86ISD::EXTRQI:
5480
188
    if (isa<ConstantSDNode>(N->getOperand(1)) &&
5481
188
        
        isa<ConstantSDNode>(N->getOperand(2))) {
5482
188
      int BitLen = N->getConstantOperandVal(1);
5483
188
      int BitIdx = N->getConstantOperandVal(2);
5484
188
      DecodeEXTRQIMask(VT, BitLen, BitIdx, Mask);
5485
188
      IsUnary = true;
5486
188
    }
5487
188
    break;
5488
151
  case X86ISD::INSERTQI:
5489
151
    if (isa<ConstantSDNode>(N->getOperand(2)) &&
5490
151
        
        isa<ConstantSDNode>(N->getOperand(3))) {
5491
151
      int BitLen = N->getConstantOperandVal(2);
5492
151
      int BitIdx = N->getConstantOperandVal(3);
5493
151
      DecodeINSERTQIMask(VT, BitLen, BitIdx, Mask);
5494
151
      IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
5495
151
    }
5496
151
    break;
5497
4.88k
  case X86ISD::UNPCKH:
5498
4.88k
    DecodeUNPCKHMask(VT, Mask);
5499
4.88k
    IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
5500
4.88k
    break;
5501
926k
  case X86ISD::UNPCKL:
5502
926k
    DecodeUNPCKLMask(VT, Mask);
5503
926k
    IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
5504
926k
    break;
5505
984
  case X86ISD::MOVHLPS:
5506
984
    DecodeMOVHLPSMask(NumElems, Mask);
5507
984
    IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
5508
984
    break;
5509
603
  case X86ISD::MOVLHPS:
5510
603
    DecodeMOVLHPSMask(NumElems, Mask);
5511
603
    IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
5512
603
    break;
5513
1.54k
  case X86ISD::PALIGNR:
5514
1.54k
    assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
5515
1.54k
    ImmN = N->getOperand(N->getNumOperands()-1);
5516
1.54k
    DecodePALIGNRMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
5517
1.54k
    IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
5518
1.54k
    Ops.push_back(N->getOperand(1));
5519
1.54k
    Ops.push_back(N->getOperand(0));
5520
1.54k
    break;
5521
3.90k
  case X86ISD::VSHLDQ:
5522
3.90k
    assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
5523
3.90k
    ImmN = N->getOperand(N->getNumOperands() - 1);
5524
3.90k
    DecodePSLLDQMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
5525
3.90k
    IsUnary = true;
5526
3.90k
    break;
5527
1.04k
  case X86ISD::VSRLDQ:
5528
1.04k
    assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
5529
1.04k
    ImmN = N->getOperand(N->getNumOperands() - 1);
5530
1.04k
    DecodePSRLDQMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
5531
1.04k
    IsUnary = true;
5532
1.04k
    break;
5533
212k
  case X86ISD::PSHUFD:
5534
212k
  case X86ISD::VPERMILPI:
5535
212k
    ImmN = N->getOperand(N->getNumOperands()-1);
5536
212k
    DecodePSHUFMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
5537
212k
    IsUnary = true;
5538
212k
    break;
5539
15.2k
  case X86ISD::PSHUFHW:
5540
15.2k
    ImmN = N->getOperand(N->getNumOperands()-1);
5541
15.2k
    DecodePSHUFHWMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
5542
15.2k
    IsUnary = true;
5543
15.2k
    break;
5544
157k
  case X86ISD::PSHUFLW:
5545
157k
    ImmN = N->getOperand(N->getNumOperands()-1);
5546
157k
    DecodePSHUFLWMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
5547
157k
    IsUnary = true;
5548
157k
    break;
5549
7.16k
  case X86ISD::VZEXT_MOVL:
5550
7.16k
    DecodeZeroMoveLowMask(VT, Mask);
5551
7.16k
    IsUnary = true;
5552
7.16k
    break;
5553
258
  case X86ISD::VBROADCAST: {
5554
258
    SDValue N0 = N->getOperand(0);
5555
258
    // See if we're broadcasting from index 0 of an EXTRACT_SUBVECTOR. If so,
5556
258
    // add the pre-extracted value to the Ops vector.
5557
258
    if (N0.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
5558
42
        N0.getOperand(0).getValueType() == VT &&
5559
28
        N0.getConstantOperandVal(1) == 0)
5560
25
      Ops.push_back(N0.getOperand(0));
5561
258
5562
258
    // We only decode broadcasts of same-sized vectors, unless the broadcast
5563
258
    // came from an extract from the original width. If we found one, we
5564
258
    // pushed it the Ops vector above.
5565
258
    if (N0.getValueType() == VT || !Ops.empty()) {
5566
129
      DecodeVectorBroadcast(VT, Mask);
5567
129
      IsUnary = true;
5568
129
      break;
5569
129
    }
5570
129
    return false;
5571
129
  }
5572
1.10k
  case X86ISD::VPERMILPV: {
5573
1.10k
    IsUnary = true;
5574
1.10k
    SDValue MaskNode = N->getOperand(1);
5575
1.10k
    unsigned MaskEltSize = VT.getScalarSizeInBits();
5576
1.10k
    SmallVector<uint64_t, 32> RawMask;
5577
1.10k
    if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask)) {
5578
672
      DecodeVPERMILPMask(VT, RawMask, Mask);
5579
672
      break;
5580
672
    }
5581
432
    if (auto *C = getTargetConstantFromNode(MaskNode)) {
5582
218
      DecodeVPERMILPMask(C, MaskEltSize, Mask);
5583
218
      break;
5584
218
    }
5585
214
    return false;
5586
214
  }
5587
59.8k
  case X86ISD::PSHUFB: {
5588
59.8k
    IsUnary = true;
5589
59.8k
    SDValue MaskNode = N->getOperand(1);
5590
59.8k
    SmallVector<uint64_t, 32> RawMask;
5591
59.8k
    if (getTargetShuffleMaskIndices(MaskNode, 8, RawMask)) {
5592
24.8k
      DecodePSHUFBMask(RawMask, Mask);
5593
24.8k
      break;
5594
24.8k
    }
5595
35.0k
    if (auto *C = getTargetConstantFromNode(MaskNode)) {
5596
31.6k
      DecodePSHUFBMask(C, Mask);
5597
31.6k
      break;
5598
31.6k
    }
5599
3.37k
    return false;
5600
3.37k
  }
5601
3.44k
  case X86ISD::VPERMI:
5602
3.44k
    ImmN = N->getOperand(N->getNumOperands()-1);
5603
3.44k
    DecodeVPERMMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
5604
3.44k
    IsUnary = true;
5605
3.44k
    break;
5606
4.64k
  case X86ISD::MOVSS:
5607
4.64k
  case X86ISD::MOVSD:
5608
4.64k
    DecodeScalarMoveMask(VT, /* IsLoad */ false, Mask);
5609
4.64k
    break;
5610
1.27k
  case X86ISD::VPERM2X128:
5611
1.27k
    ImmN = N->getOperand(N->getNumOperands()-1);
5612
1.27k
    DecodeVPERM2X128Mask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
5613
1.27k
    IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
5614
1.27k
    break;
5615
469
  case X86ISD::MOVSLDUP:
5616
469
    DecodeMOVSLDUPMask(VT, Mask);
5617
469
    IsUnary = true;
5618
469
    break;
5619
1.39k
  case X86ISD::MOVSHDUP:
5620
1.39k
    DecodeMOVSHDUPMask(VT, Mask);
5621
1.39k
    IsUnary = true;
5622
1.39k
    break;
5623
1.15k
  case X86ISD::MOVDDUP:
5624
1.15k
    DecodeMOVDDUPMask(VT, Mask);
5625
1.15k
    IsUnary = true;
5626
1.15k
    break;
5627
0
  case X86ISD::MOVLPD:
5628
0
  case X86ISD::MOVLPS:
5629
0
    // Not yet implemented
5630
0
    return false;
5631
446
  case X86ISD::VPERMIL2: {
5632
446
    IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
5633
446
    unsigned MaskEltSize = VT.getScalarSizeInBits();
5634
446
    SDValue MaskNode = N->getOperand(2);
5635
446
    SDValue CtrlNode = N->getOperand(3);
5636
446
    if (ConstantSDNode *CtrlOp = dyn_cast<ConstantSDNode>(CtrlNode)) {
5637
446
      unsigned CtrlImm = CtrlOp->getZExtValue();
5638
446
      SmallVector<uint64_t, 32> RawMask;
5639
446
      if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask)) {
5640
330
        DecodeVPERMIL2PMask(VT, CtrlImm, RawMask, Mask);
5641
330
        break;
5642
330
      }
5643
116
      if (auto *C = getTargetConstantFromNode(MaskNode)) {
5644
44
        DecodeVPERMIL2PMask(C, CtrlImm, MaskEltSize, Mask);
5645
44
        break;
5646
44
      }
5647
72
    }
5648
72
    return false;
5649
72
  }
5650
552
  case X86ISD::VPPERM: {
5651
552
    IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
5652
552
    SDValue MaskNode = N->getOperand(2);
5653
552
    SmallVector<uint64_t, 32> RawMask;
5654
552
    if (getTargetShuffleMaskIndices(MaskNode, 8, RawMask)) {
5655
530
      DecodeVPPERMMask(RawMask, Mask);
5656
530
      break;
5657
530
    }
5658
22
    if (auto *C = getTargetConstantFromNode(MaskNode)) {
5659
8
      DecodeVPPERMMask(C, Mask);
5660
8
      break;
5661
8
    }
5662
14
    return false;
5663
14
  }
5664
2.27k
  case X86ISD::VPERMV: {
5665
2.27k
    IsUnary = true;
5666
2.27k
    // Unlike most shuffle nodes, VPERMV's mask operand is operand 0.
5667
2.27k
    Ops.push_back(N->getOperand(1));
5668
2.27k
    SDValue MaskNode = N->getOperand(0);
5669
2.27k
    SmallVector<uint64_t, 32> RawMask;
5670
2.27k
    unsigned MaskEltSize = VT.getScalarSizeInBits();
5671
2.27k
    if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask)) {
5672
1.38k
      DecodeVPERMVMask(RawMask, Mask);
5673
1.38k
      break;
5674
1.38k
    }
5675
891
    if (auto *C = getTargetConstantFromNode(MaskNode)) {
5676
729
      DecodeVPERMVMask(C, MaskEltSize, Mask);
5677
729
      break;
5678
729
    }
5679
162
    return false;
5680
162
  }
5681
2.92k
  case X86ISD::VPERMV3: {
5682
2.92k
    IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(2);
5683
2.92k
    // Unlike most shuffle nodes, VPERMV3's mask operand is the middle one.
5684
2.92k
    Ops.push_back(N->getOperand(0));
5685
2.92k
    Ops.push_back(N->getOperand(2));
5686
2.92k
    SDValue MaskNode = N->getOperand(1);
5687
2.92k
    unsigned MaskEltSize = VT.getScalarSizeInBits();
5688
2.92k
    if (auto *C = getTargetConstantFromNode(MaskNode)) {
5689
2.52k
      DecodeVPERMV3Mask(C, MaskEltSize, Mask);
5690
2.52k
      break;
5691
2.52k
    }
5692
402
    return false;
5693
402
  }
5694
334
  case X86ISD::VPERMIV3: {
5695
334
    IsUnary = IsFakeUnary = N->getOperand(1) == N->getOperand(2);
5696
334
    // Unlike most shuffle nodes, VPERMIV3's mask operand is the first one.
5697
334
    Ops.push_back(N->getOperand(1));
5698
334
    Ops.push_back(N->getOperand(2));
5699
334
    SDValue MaskNode = N->getOperand(0);
5700
334
    unsigned MaskEltSize = VT.getScalarSizeInBits();
5701
334
    if (auto *C = getTargetConstantFromNode(MaskNode)) {
5702
224
      DecodeVPERMV3Mask(C, MaskEltSize, Mask);
5703
224
      break;
5704
224
    }
5705
110
    return false;
5706
110
  }
5707
0
  default: llvm_unreachable("unknown target shuffle node");
5708
1.45M
  }
5709
1.45M
5710
1.45M
  // Empty mask indicates the decode failed.
5711
1.45M
  if (Mask.empty())
5712
346
    return false;
5713
1.45M
5714
1.45M
  // Check if we're getting a shuffle mask with zero'd elements.
5715
1.45M
  if (!AllowSentinelZero)
5716
58.3k
    if (any_of(Mask, [](int M) { return M == SM_SentinelZero; }))
5717
0
      return false;
5718
1.45M
5719
1.45M
  // If we have a fake unary shuffle, the shuffle mask is spread across two
5720
1.45M
  // inputs that are actually the same node. Re-map the mask to always point
5721
1.45M
  // into the first input.
5722
1.45M
  if (IsFakeUnary)
5723
17.7k
    for (int &M : Mask)
5724
88.3k
      if (M >= (int)Mask.size())
5725
42.3k
        M -= Mask.size();
5726
1.45M
5727
1.45M
  // If we didn't already add operands in the opcode-specific code, default to
5728
1.45M
  // adding 1 or 2 operands starting at 0.
5729
1.45M
  if (Ops.empty()) {
5730
1.44M
    Ops.push_back(N->getOperand(0));
5731
1.44M
    if (!IsUnary || IsFakeUnary)
5732
986k
      Ops.push_back(N->getOperand(1));
5733
1.44M
  }
5734
1.45M
5735
1.45M
  return true;
5736
1.45M
}
5737
5738
/// Check a target shuffle mask's inputs to see if we can set any values to
5739
/// SM_SentinelZero - this is for elements that are known to be zero
5740
/// (not just zeroable) from their inputs.
5741
/// Returns true if the target shuffle mask was decoded.
5742
static bool setTargetShuffleZeroElements(SDValue N,
5743
                                         SmallVectorImpl<int> &Mask,
5744
2.31M
                                         SmallVectorImpl<SDValue> &Ops) {
5745
2.31M
  bool IsUnary;
5746
2.31M
  if (!isTargetShuffle(N.getOpcode()))
5747
948k
    return false;
5748
1.37M
5749
1.37M
  MVT VT = N.getSimpleValueType();
5750
1.37M
  if (!getTargetShuffleMask(N.getNode(), VT, true, Ops, Mask, IsUnary))
5751
2.43k
    return false;
5752
1.36M
5753
1.36M
  SDValue V1 = Ops[0];
5754
1.36M
  SDValue V2 = IsUnary ? V1 : Ops[1];
5755
1.36M
5756
1.36M
  V1 = peekThroughBitcasts(V1);
5757
1.36M
  V2 = peekThroughBitcasts(V2);
5758
1.36M
5759
1.36M
  assert((VT.getSizeInBits() % Mask.size()) == 0 &&
5760
1.36M
         "Illegal split of shuffle value type");
5761
1.36M
  unsigned EltSizeInBits = VT.getSizeInBits() / Mask.size();
5762
1.36M
5763
1.36M
  // Extract known constant input data.
5764
1.36M
  APInt UndefSrcElts[2];
5765
1.36M
  SmallVector<APInt, 32> SrcEltBits[2];
5766
1.36M
  bool IsSrcConstant[2] = {
5767
1.36M
      getTargetConstantBitsFromNode(V1, EltSizeInBits, UndefSrcElts[0],
5768
1.36M
                                    SrcEltBits[0], true, false),
5769
1.36M
      getTargetConstantBitsFromNode(V2, EltSizeInBits, UndefSrcElts[1],
5770
1.36M
                                    SrcEltBits[1], true, false)};
5771
1.36M
5772
16.6M
  for (int i = 0, Size = Mask.size(); i < Size; ++i) {
5773
15.2M
    int M = Mask[i];
5774
15.2M
5775
15.2M
    // Already decoded as SM_SentinelZero / SM_SentinelUndef.
5776
15.2M
    if (M < 0)
5777
385k
      continue;
5778
14.8M
5779
14.8M
    // Determine shuffle input and normalize the mask.
5780
14.8M
    unsigned SrcIdx = M / Size;
5781
14.8M
    SDValue V = M < Size ? V1 : V2;
5782
14.8M
    M %= Size;
5783
14.8M
5784
14.8M
    // We are referencing an UNDEF input.
5785
14.8M
    if (V.isUndef()) {
5786
18.9k
      Mask[i] = SM_SentinelUndef;
5787
18.9k
      continue;
5788
18.9k
    }
5789
14.8M
5790
14.8M
    // SCALAR_TO_VECTOR - only the first element is defined, and the rest UNDEF.
5791
14.8M
    // TODO: We currently only set UNDEF for integer types - floats use the same
5792
14.8M
    // registers as vectors and many of the scalar folded loads rely on the
5793
14.8M
    // SCALAR_TO_VECTOR pattern.
5794
14.8M
    if (V.getOpcode() == ISD::SCALAR_TO_VECTOR &&
5795
14.8M
        (Size % V.getValueType().getVectorNumElements()) == 0) {
5796
9.94M
      int Scale = Size / V.getValueType().getVectorNumElements();
5797
9.94M
      int Idx = M / Scale;
5798
9.94M
      if (Idx != 0 && !VT.isFloatingPoint())
5799
4.95M
        Mask[i] = SM_SentinelUndef;
5800
4.98M
      else if (Idx == 0 && X86::isZeroNode(V.getOperand(0)))
5801
51
        Mask[i] = SM_SentinelZero;
5802
9.94M
      continue;
5803
9.94M
    }
5804
4.92M
5805
4.92M
    // Attempt to extract from the source's constant bits.
5806
4.92M
    if (IsSrcConstant[SrcIdx]) {
5807
24.1k
      if (UndefSrcElts[SrcIdx][M])
5808
2
        Mask[i] = SM_SentinelUndef;
5809
24.1k
      else if (SrcEltBits[SrcIdx][M] == 0)
5810
21.3k
        Mask[i] = SM_SentinelZero;
5811
24.1k
    }
5812
15.2M
  }
5813
2.31M
5814
2.31M
  assert(VT.getVectorNumElements() == Mask.size() &&
5815
2.31M
         "Different mask size from vector size!");
5816
2.31M
  return true;
5817
2.31M
}
5818
5819
// Attempt to decode ops that could be represented as a shuffle mask.
5820
// The decoded shuffle mask may contain a different number of elements to the
5821
// destination value type.
5822
static bool getFauxShuffleMask(SDValue N, SmallVectorImpl<int> &Mask,
5823
                               SmallVectorImpl<SDValue> &Ops,
5824
948k
                               SelectionDAG &DAG) {
5825
948k
  Mask.clear();
5826
948k
  Ops.clear();
5827
948k
5828
948k
  MVT VT = N.getSimpleValueType();
5829
948k
  unsigned NumElts = VT.getVectorNumElements();
5830
948k
  unsigned NumSizeInBits = VT.getSizeInBits();
5831
948k
  unsigned NumBitsPerElt = VT.getScalarSizeInBits();
5832
948k
  assert((NumBitsPerElt % 8) == 0 && (NumSizeInBits % 8) == 0 &&
5833
948k
         "Expected byte aligned value types");
5834
948k
5835
948k
  unsigned Opcode = N.getOpcode();
5836
948k
  switch (Opcode) {
5837
26.2k
  case ISD::AND:
5838
26.2k
  case X86ISD::ANDNP: {
5839
26.2k
    // Attempt to decode as a per-byte mask.
5840
26.2k
    APInt UndefElts;
5841
26.2k
    SmallVector<APInt, 32> EltBits;
5842
26.2k
    SDValue N0 = N.getOperand(0);
5843
26.2k
    SDValue N1 = N.getOperand(1);
5844
26.2k
    bool IsAndN = (X86ISD::ANDNP == Opcode);
5845
26.2k
    uint64_t ZeroMask = IsAndN ? 255 : 0;
5846
26.2k
    if (!getTargetConstantBitsFromNode(IsAndN ? N0 : N1, 8, UndefElts, EltBits))
5847
12.0k
      return false;
5848
91.8k
    for (int i = 0, e = (int)EltBits.size(); i != e; ++i) {
5849
87.9k
      if (UndefElts[i]) {
5850
64
        Mask.push_back(SM_SentinelUndef);
5851
64
        continue;
5852
64
      }
5853
87.8k
      uint64_t ByteBits = EltBits[i].getZExtValue();
5854
87.8k
      if (ByteBits != 0 && ByteBits != 255)
5855
10.2k
        return false;
5856
77.6k
      Mask.push_back(ByteBits == ZeroMask ? SM_SentinelZero : i);
5857
87.9k
    }
5858
3.88k
    Ops.push_back(IsAndN ? N1 : N0);
5859
3.88k
    return true;
5860
14.1k
  }
5861
647k
  case ISD::SCALAR_TO_VECTOR: {
5862
647k
    // Match against a scalar_to_vector of an extract from a vector,
5863
647k
    // for PEXTRW/PEXTRB we must handle the implicit zext of the scalar.
5864
647k
    SDValue N0 = N.getOperand(0);
5865
647k
    SDValue SrcExtract;
5866
647k
5867
647k
    if (N0.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
5868
647k
        
N0.getOperand(0).getValueType() == VT106
) {
5869
106
      SrcExtract = N0;
5870
647k
    } else if (N0.getOpcode() == ISD::AssertZext &&
5871
1.47k
               N0.getOperand(0).getOpcode() == X86ISD::PEXTRW &&
5872
647k
               cast<VTSDNode>(N0.getOperand(1))->getVT() == MVT::i16) {
5873
1.40k
      SrcExtract = N0.getOperand(0);
5874
1.40k
      assert(SrcExtract.getOperand(0).getValueType() == MVT::v8i16);
5875
647k
    } else if (N0.getOpcode() == ISD::AssertZext &&
5876
66
               N0.getOperand(0).getOpcode() == X86ISD::PEXTRB &&
5877
645k
               
cast<VTSDNode>(N0.getOperand(1))->getVT() == MVT::i86
) {
5878
6
      SrcExtract = N0.getOperand(0);
5879
6
      assert(SrcExtract.getOperand(0).getValueType() == MVT::v16i8);
5880
6
    }
5881
647k
5882
647k
    if (!SrcExtract || !isa<ConstantSDNode>(SrcExtract.getOperand(1)))
5883
645k
      return false;
5884
1.51k
5885
1.51k
    SDValue SrcVec = SrcExtract.getOperand(0);
5886
1.51k
    EVT SrcVT = SrcVec.getValueType();
5887
1.51k
    unsigned NumSrcElts = SrcVT.getVectorNumElements();
5888
1.51k
    unsigned NumZeros = (NumBitsPerElt / SrcVT.getScalarSizeInBits()) - 1;
5889
1.51k
5890
1.51k
    unsigned SrcIdx = SrcExtract.getConstantOperandVal(1);
5891
1.51k
    if (NumSrcElts <= SrcIdx)
5892
0
      return false;
5893
1.51k
5894
1.51k
    Ops.push_back(SrcVec);
5895
1.51k
    Mask.push_back(SrcIdx);
5896
1.51k
    Mask.append(NumZeros, SM_SentinelZero);
5897
1.51k
    Mask.append(NumSrcElts - Mask.size(), SM_SentinelUndef);
5898
1.51k
    return true;
5899
1.51k
  }
5900
9.90k
  case X86ISD::PINSRB:
5901
9.90k
  case X86ISD::PINSRW: {
5902
9.90k
    SDValue InVec = N.getOperand(0);
5903
9.90k
    SDValue InScl = N.getOperand(1);
5904
9.90k
    uint64_t InIdx = N.getConstantOperandVal(2);
5905
9.90k
    assert(InIdx < NumElts && "Illegal insertion index");
5906
9.90k
5907
9.90k
    // Attempt to recognise a PINSR*(VEC, 0, Idx) shuffle pattern.
5908
9.90k
    if (X86::isZeroNode(InScl)) {
5909
213
      Ops.push_back(InVec);
5910
3.28k
      for (unsigned i = 0; i != NumElts; ++i)
5911
3.07k
        Mask.push_back(i == InIdx ? SM_SentinelZero : (int)i);
5912
213
      return true;
5913
213
    }
5914
9.68k
5915
9.68k
    // Attempt to recognise a PINSR*(ASSERTZEXT(PEXTR*)) shuffle pattern.
5916
9.68k
    // TODO: Expand this to support INSERT_VECTOR_ELT/etc.
5917
9.68k
    unsigned ExOp =
5918
9.68k
        (X86ISD::PINSRB == Opcode ? X86ISD::PEXTRB : X86ISD::PEXTRW);
5919
9.68k
    if (InScl.getOpcode() != ISD::AssertZext ||
5920
204
        InScl.getOperand(0).getOpcode() != ExOp)
5921
9.48k
      return false;
5922
204
5923
204
    SDValue ExVec = InScl.getOperand(0).getOperand(0);
5924
204
    uint64_t ExIdx = InScl.getOperand(0).getConstantOperandVal(1);
5925
204
    assert(ExIdx < NumElts && "Illegal extraction index");
5926
204
    Ops.push_back(InVec);
5927
204
    Ops.push_back(ExVec);
5928
2.89k
    for (unsigned i = 0; i != NumElts; ++i)
5929
2.68k
      Mask.push_back(i == InIdx ? NumElts + ExIdx : i);
5930
204
    return true;
5931
204
  }
5932
734
  case X86ISD::PACKSS: {
5933
734
    // If we know input saturation won't happen we can treat this
5934
734
    // as a truncation shuffle.
5935
734
    if (DAG.ComputeNumSignBits(N.getOperand(0)) <= NumBitsPerElt ||
5936
471
        DAG.ComputeNumSignBits(N.getOperand(1)) <= NumBitsPerElt)
5937
266
      return false;
5938
468
5939
468
    Ops.push_back(N.getOperand(0));
5940
468
    Ops.push_back(N.getOperand(1));
5941
8.34k
    for (unsigned i = 0; i != NumElts; ++i)
5942
7.87k
      Mask.push_back(i * 2);
5943
468
    return true;
5944
468
  }
5945
7.50k
  case X86ISD::VSHLI:
5946
7.50k
  case X86ISD::VSRLI: {
5947
7.50k
    uint64_t ShiftVal = N.getConstantOperandVal(1);
5948
7.50k
    // Out of range bit shifts are guaranteed to be zero.
5949
7.50k
    if (NumBitsPerElt <= ShiftVal) {
5950
0
      Mask.append(NumElts, SM_SentinelZero);
5951
0
      return true;
5952
0
    }
5953
7.50k
5954
7.50k
    // We can only decode 'whole byte' bit shifts as shuffles.
5955
7.50k
    if ((ShiftVal % 8) != 0)
5956
2.50k
      break;
5957
5.00k
5958
5.00k
    uint64_t ByteShift = ShiftVal / 8;
5959
5.00k
    unsigned NumBytes = NumSizeInBits / 8;
5960
5.00k
    unsigned NumBytesPerElt = NumBitsPerElt / 8;
5961
5.00k
    Ops.push_back(N.getOperand(0));
5962
5.00k
5963
5.00k
    // Clear mask to all zeros and insert the shifted byte indices.
5964
5.00k
    Mask.append(NumBytes, SM_SentinelZero);
5965
5.00k
5966
5.00k
    if (X86ISD::VSHLI == Opcode) {
5967
9.80k
      for (unsigned i = 0; i != NumBytes; i += NumBytesPerElt)
5968
21.7k
        for (unsigned j = ByteShift; j != NumBytesPerElt; ++j)
5969
13.8k
          Mask[i + j] = i + j - ByteShift;
5970
5.00k
    } else {
5971
23.8k
      for (unsigned i = 0; i != NumBytes; i += NumBytesPerElt)
5972
55.4k
        for (unsigned j = ByteShift; j != NumBytesPerElt; ++j)
5973
34.6k
          Mask[i + j - ByteShift] = i + j;
5974
3.08k
    }
5975
5.00k
    return true;
5976
5.00k
  }
5977
347
  case ISD::ZERO_EXTEND_VECTOR_INREG:
5978
347
  case X86ISD::VZEXT: {
5979
347
    // TODO - add support for VPMOVZX with smaller input vector types.
5980
347
    SDValue Src = N.getOperand(0);
5981
347
    MVT SrcVT = Src.getSimpleValueType();
5982
347
    if (NumSizeInBits != SrcVT.getSizeInBits())
5983
34
      break;
5984
313
    DecodeZeroExtendMask(SrcVT.getScalarType(), VT, Mask);
5985
313
    Ops.push_back(Src);
5986
313
    return true;
5987
313
  }
5988
258k
  }
5989
258k
5990
258k
  return false;
5991
258k
}
5992
5993
/// Removes unused shuffle source inputs and adjusts the shuffle mask accordingly.
5994
static void resolveTargetShuffleInputsAndMask(SmallVectorImpl<SDValue> &Inputs,
5995
2.75M
                                              SmallVectorImpl<int> &Mask) {
5996
2.75M
  int MaskWidth = Mask.size();
5997
2.75M
  SmallVector<SDValue, 16> UsedInputs;
5998
14.5M
  for (int i = 0, e = Inputs.size(); i < e; ++i) {
5999
11.7M
    int lo = UsedInputs.size() * MaskWidth;
6000
11.7M
    int hi = lo + MaskWidth;
6001
72.6M
    if (any_of(Mask, [lo, hi](int i) { return (lo <= i) && (i < hi); })) {
6002
11.7M
      UsedInputs.push_back(Inputs[i]);
6003
11.7M
      continue;
6004
11.7M
    }
6005
30.0k
    for (int &M : Mask)
6006
234k
      if (lo <= M)
6007
31.4k
        M -= MaskWidth;
6008
11.7M
  }
6009
2.75M
  Inputs = UsedInputs;
6010
2.75M
}
6011
6012
/// Calls setTargetShuffleZeroElements to resolve a target shuffle mask's inputs
6013
/// and set the SM_SentinelUndef and SM_SentinelZero values. Then check the
6014
/// remaining input indices in case we now have a unary shuffle and adjust the
6015
/// inputs accordingly.
6016
/// Returns true if the target shuffle mask was decoded.
6017
static bool resolveTargetShuffleInputs(SDValue Op,
6018
                                       SmallVectorImpl<SDValue> &Inputs,
6019
                                       SmallVectorImpl<int> &Mask,
6020
2.31M
                                       SelectionDAG &DAG) {
6021
2.31M
  if (!setTargetShuffleZeroElements(Op, Mask, Inputs))
6022
948k
    if (!getFauxShuffleMask(Op, Mask, Inputs, DAG))
6023
936k
      return false;
6024
1.37M
6025
1.37M
  resolveTargetShuffleInputsAndMask(Inputs, Mask);
6026
1.37M
  return true;
6027
1.37M
}
6028
6029
/// Returns the scalar element that will make up the ith
6030
/// element of the result of the vector shuffle.
6031
static SDValue getShuffleScalarElt(SDNode *N, unsigned Index, SelectionDAG &DAG,
6032
207k
                                   unsigned Depth) {
6033
207k
  if (Depth == 6)
6034
11
    return SDValue();  // Limit search depth.
6035
207k
6036
207k
  SDValue V = SDValue(N, 0);
6037
207k
  EVT VT = V.getValueType();
6038
207k
  unsigned Opcode = V.getOpcode();
6039
207k
6040
207k
  // Recurse into ISD::VECTOR_SHUFFLE node to find scalars.
6041
207k
  if (const ShuffleVectorSDNode *SV = dyn_cast<ShuffleVectorSDNode>(N)) {
6042
39.7k
    int Elt = SV->getMaskElt(Index);
6043
39.7k
6044
39.7k
    if (Elt < 0)
6045
2.71k
      return DAG.getUNDEF(VT.getVectorElementType());
6046
37.0k
6047
37.0k
    unsigned NumElems = VT.getVectorNumElements();
6048
33.5k
    SDValue NewV = (Elt < (int)NumElems) ? SV->getOperand(0)
6049
3.50k
                                         : SV->getOperand(1);
6050
39.7k
    return getShuffleScalarElt(NewV.getNode(), Elt % NumElems, DAG, Depth+1);
6051
39.7k
  }
6052
168k
6053
168k
  // Recurse into target specific vector shuffles to find scalars.
6054
168k
  if (isTargetShuffle(Opcode)) {
6055
76.6k
    MVT ShufVT = V.getSimpleValueType();
6056
76.6k
    MVT ShufSVT = ShufVT.getVectorElementType();
6057
76.6k
    int NumElems = (int)ShufVT.getVectorNumElements();
6058
76.6k
    SmallVector<int, 16> ShuffleMask;
6059
76.6k
    SmallVector<SDValue, 16> ShuffleOps;
6060
76.6k
    bool IsUnary;
6061
76.6k
6062
76.6k
    if (!getTargetShuffleMask(N, ShufVT, true, ShuffleOps, ShuffleMask, IsUnary))
6063
2.38k
      return SDValue();
6064
74.2k
6065
74.2k
    int Elt = ShuffleMask[Index];
6066
74.2k
    if (Elt == SM_SentinelZero)
6067
10.7k
      return ShufSVT.isInteger() ? DAG.getConstant(0, SDLoc(N), ShufSVT)
6068
791
                                 : DAG.getConstantFP(+0.0, SDLoc(N), ShufSVT);
6069
63.5k
    if (Elt == SM_SentinelUndef)
6070
1.02k
      return DAG.getUNDEF(ShufSVT);
6071
62.4k
6072
0
    assert(0 <= Elt && Elt < (2*NumElems) && "Shuffle index out of range");
6073
62.4k
    SDValue NewV = (Elt < NumElems) ? ShuffleOps[0] : ShuffleOps[1];
6074
76.6k
    return getShuffleScalarElt(NewV.getNode(), Elt % NumElems, DAG,
6075
76.6k
                               Depth+1);
6076
76.6k
  }
6077
91.3k
6078
91.3k
  // Actual nodes that may contain scalar elements
6079
91.3k
  if (Opcode == ISD::BITCAST) {
6080
33.8k
    V = V.getOperand(0);
6081
33.8k
    EVT SrcVT = V.getValueType();
6082
33.8k
    unsigned NumElems = VT.getVectorNumElements();
6083
33.8k
6084
33.8k
    if (!SrcVT.isVector() || SrcVT.getVectorNumElements() != NumElems)
6085
32.5k
      return SDValue();
6086
58.8k
  }
6087
58.8k
6088
58.8k
  if (V.getOpcode() == ISD::SCALAR_TO_VECTOR)
6089
12.8k
    return (Index == 0) ? V.getOperand(0)
6090
2.18k
                        : DAG.getUNDEF(VT.getVectorElementType());
6091
45.9k
6092
45.9k
  if (V.getOpcode() == ISD::BUILD_VECTOR)
6093
2.83k
    return V.getOperand(Index);
6094
43.1k
6095
43.1k
  return SDValue();
6096
43.1k
}
6097
6098
/// Custom lower build_vector of v16i8.
6099
static SDValue LowerBuildVectorv16i8(SDValue Op, unsigned NonZeros,
6100
                                     unsigned NumNonZero, unsigned NumZero,
6101
                                     SelectionDAG &DAG,
6102
684
                                     const X86Subtarget &Subtarget) {
6103
684
  if (NumNonZero > 8 && !Subtarget.hasSSE41())
6104
22
    return SDValue();
6105
662
6106
662
  SDLoc dl(Op);
6107
662
  SDValue V;
6108
662
  bool First = true;
6109
662
6110
662
  // SSE4.1 - use PINSRB to insert each byte directly.
6111
662
  if (Subtarget.hasSSE41()) {
6112
10.8k
    for (unsigned i = 0; i < 16; ++i) {
6113
10.2k
      bool IsNonZero = (NonZeros & (1 << i)) != 0;
6114
10.2k
      if (IsNonZero) {
6115
6.42k
        // If the build vector contains zeros or our first insertion is not the
6116
6.42k
        // first index then insert into zero vector to break any register
6117
6.42k
        // dependency else use SCALAR_TO_VECTOR/VZEXT_MOVL.
6118
6.42k
        if (First) {
6119
641
          First = false;
6120
641
          if (NumZero || 0 != i)
6121
175
            V = getZeroVector(MVT::v16i8, Subtarget, DAG, dl);
6122
466
          else {
6123
466
            assert(0 == i && "Expected insertion into zero-index");
6124
466
            V = DAG.getAnyExtOrTrunc(Op.getOperand(i), dl, MVT::i32);
6125
466
            V = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, V);
6126
466
            V = DAG.getNode(X86ISD::VZEXT_MOVL, dl, MVT::v4i32, V);
6127
466
            V = DAG.getBitcast(MVT::v16i8, V);
6128
466
            continue;
6129
466
          }
6130
5.95k
        }
6131
5.95k
        V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v16i8, V,
6132
5.95k
                        Op.getOperand(i), DAG.getIntPtrConstant(i, dl));
6133
5.95k
      }
6134
10.2k
    }
6135
641
6136
641
    return V;
6137
641
  }
6138
21
6139
21
  // Pre-SSE4.1 - merge byte pairs and insert with PINSRW.
6140
357
  for (unsigned i = 0; i < 16; ++i) {
6141
336
    bool ThisIsNonZero = (NonZeros & (1 << i)) != 0;
6142
336
    if (ThisIsNonZero && First) {
6143
21
      if (NumZero)
6144
13
        V = getZeroVector(MVT::v8i16, Subtarget, DAG, dl);
6145
21
      else
6146
8
        V = DAG.getUNDEF(MVT::v8i16);
6147
21
      First = false;
6148
21
    }
6149
336
6150
336
    if ((i & 1) != 0) {
6151
168
      // FIXME: Investigate extending to i32 instead of just i16.
6152
168
      // FIXME: Investigate combining the first 4 bytes as a i32 instead.
6153
168
      SDValue ThisElt, LastElt;
6154
168
      bool LastIsNonZero = (NonZeros & (1 << (i - 1))) != 0;
6155
168
      if (LastIsNonZero) {
6156
41
        LastElt =
6157
41
            DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, Op.getOperand(i - 1));
6158
41
      }
6159
168
      if (ThisIsNonZero) {
6160
19
        ThisElt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, Op.getOperand(i));
6161
19
        ThisElt = DAG.getNode(ISD::SHL, dl, MVT::i16, ThisElt,
6162
19
                              DAG.getConstant(8, dl, MVT::i8));
6163
19
        if (LastIsNonZero)
6164
9
          ThisElt = DAG.getNode(ISD::OR, dl, MVT::i16, ThisElt, LastElt);
6165
19
      } else
6166
149
        ThisElt = LastElt;
6167
168
6168
168
      if (ThisElt) {
6169
51
        if (1 == i) {
6170
4
          V = NumZero ? DAG.getZExtOrTrunc(ThisElt, dl, MVT::i32)
6171
7
                      : DAG.getAnyExtOrTrunc(ThisElt, dl, MVT::i32);
6172
11
          V = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, V);
6173
11
          V = DAG.getNode(X86ISD::VZEXT_MOVL, dl, MVT::v4i32, V);
6174
11
          V = DAG.getBitcast(MVT::v8i16, V);
6175
51
        } else {
6176
40
          V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, V, ThisElt,
6177
40
                          DAG.getIntPtrConstant(i / 2, dl));
6178
40
        }
6179
51
      }
6180
168
    }
6181
336
  }
6182
684
6183
684
  return DAG.getBitcast(MVT::v16i8, V);
6184
684
}
6185
6186
/// Custom lower build_vector of v8i16.
6187
static SDValue LowerBuildVectorv8i16(SDValue Op, unsigned NonZeros,
6188
                                     unsigned NumNonZero, unsigned NumZero,
6189
                                     SelectionDAG &DAG,
6190
273
                                     const X86Subtarget &Subtarget) {
6191
273
  if (NumNonZero > 4 && !Subtarget.hasSSE41())
6192
24
    return SDValue();
6193
249
6194
249
  SDLoc dl(Op);
6195
249
  SDValue V;
6196
249
  bool First = true;
6197
2.24k
  for (unsigned i = 0; i < 8; ++i) {
6198
1.99k
    bool IsNonZero = (NonZeros & (1 << i)) != 0;
6199
1.99k
    if (IsNonZero) {
6200
1.71k
      // If the build vector contains zeros or our first insertion is not the
6201
1.71k
      // first index then insert into zero vector to break any register
6202
1.71k
      // dependency else use SCALAR_TO_VECTOR/VZEXT_MOVL.
6203
1.71k
      if (First) {
6204
249
        First = false;
6205
249
        if (NumZero || 0 != i)
6206
61
          V = getZeroVector(MVT::v8i16, Subtarget, DAG, dl);
6207
188
        else {
6208
188
          assert(0 == i && "Expected insertion into zero-index");
6209
188
          V = DAG.getAnyExtOrTrunc(Op.getOperand(i), dl, MVT::i32);
6210
188
          V = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, V);
6211
188
          V = DAG.getNode(X86ISD::VZEXT_MOVL, dl, MVT::v4i32, V);
6212
188
          V = DAG.getBitcast(MVT::v8i16, V);
6213
188
          continue;
6214
188
        }
6215
1.52k
      }
6216
1.52k
      V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, V,
6217
1.52k
                      Op.getOperand(i), DAG.getIntPtrConstant(i, dl));
6218
1.52k
    }
6219
1.99k
  }
6220
273
6221
273
  return V;
6222
273
}
6223
6224
/// Custom lower build_vector of v4i32 or v4f32.
6225
static SDValue LowerBuildVectorv4x32(SDValue Op, SelectionDAG &DAG,
6226
952
                                     const X86Subtarget &Subtarget) {
6227
952
  // Find all zeroable elements.
6228
952
  std::bitset<4> Zeroable;
6229
4.76k
  for (int i=0; i < 4; ++i) {
6230
3.80k
    SDValue Elt = Op->getOperand(i);
6231
3.37k
    Zeroable[i] = (Elt.isUndef() || X86::isZeroNode(Elt));
6232
3.80k
  }
6233
952
  assert(Zeroable.size() - Zeroable.count() > 1 &&
6234
952
         "We expect at least two non-zero elements!");
6235
952
6236
952
  // We only know how to deal with build_vector nodes where elements are either
6237
952
  // zeroable or extract_vector_elt with constant index.
6238
952
  SDValue FirstNonZero;
6239
952
  unsigned FirstNonZeroIdx;
6240
1.18k
  for (unsigned i=0; i < 4; ++i) {
6241
1.13k
    if (Zeroable[i])
6242
86
      continue;
6243
1.05k
    SDValue Elt = Op->getOperand(i);
6244
1.05k
    if (Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
6245
193
        !isa<ConstantSDNode>(Elt.getOperand(1)))
6246
879
      return SDValue();
6247
173
    // Make sure that this node is extracting from a 128-bit vector.
6248
173
    MVT VT = Elt.getOperand(0).getSimpleValueType();
6249
173
    if (!VT.is128BitVector())
6250
24
      return SDValue();
6251
149
    if (!FirstNonZero.getNode()) {
6252
71
      FirstNonZero = Elt;
6253
71
      FirstNonZeroIdx = i;
6254
71
    }
6255
1.13k
  }
6256
952
6257
49
  assert(FirstNonZero.getNode() && "Unexpected build vector of all zeros!");
6258
49
  SDValue V1 = FirstNonZero.getOperand(0);
6259
49
  MVT VT = V1.getSimpleValueType();
6260
49
6261
49
  // See if this build_vector can be lowered as a blend with zero.
6262
49
  SDValue Elt;
6263
49
  unsigned EltMaskIdx, EltIdx;
6264
49
  int Mask[4];
6265
108
  for (EltIdx = 0; EltIdx < 4; ++EltIdx) {
6266
94
    if (Zeroable[EltIdx]) {
6267
16
      // The zero vector will be on the right hand side.
6268
16
      Mask[EltIdx] = EltIdx+4;
6269
16
      continue;
6270
16
    }
6271
78
6272
78
    Elt = Op->getOperand(EltIdx);
6273
78
    // By construction, Elt is a EXTRACT_VECTOR_ELT with constant index.
6274
78
    EltMaskIdx = Elt.getConstantOperandVal(1);
6275
78
    if (Elt.getOperand(0) != V1 || EltMaskIdx != EltIdx)
6276
35
      break;
6277
43
    Mask[EltIdx] = EltIdx;
6278
43
  }
6279
49
6280
49
  if (EltIdx == 4) {
6281
14
    // Let the shuffle legalizer deal with blend operations.
6282
14
    SDValue VZero = getZeroVector(VT, Subtarget, DAG, SDLoc(Op));
6283
14
    if (V1.getSimpleValueType() != VT)
6284
0
      V1 = DAG.getBitcast(VT, V1);
6285
14
    return DAG.getVectorShuffle(VT, SDLoc(V1), V1, VZero, Mask);
6286
14
  }
6287
35
6288
35
  // See if we can lower this build_vector to a INSERTPS.
6289
35
  if (!Subtarget.hasSSE41())
6290
22
    return SDValue();
6291
13
6292
13
  SDValue V2 = Elt.getOperand(0);
6293
13
  if (Elt == FirstNonZero && EltIdx == FirstNonZeroIdx)
6294
13
    V1 = SDValue();
6295
13
6296
13
  bool CanFold = true;
6297
26
  for (unsigned i = EltIdx + 1; i < 4 && CanFold; ++i) {
6298
13
    if (Zeroable[i])
6299
0
      continue;
6300
13
6301
13
    SDValue Current = Op->getOperand(i);
6302
13
    SDValue SrcVector = Current->getOperand(0);
6303
13
    if (!V1.getNode())
6304
13
      V1 = SrcVector;
6305
13
    CanFold = (SrcVector == V1) && (Current.getConstantOperandVal(1) == i);
6306
13
  }
6307
13
6308
13
  if (!CanFold)
6309
13
    return SDValue();
6310
0
6311
13
  assert(V1.getNode() && "Expected at least two non-zero elements!");
6312
0
  if (V1.getSimpleValueType() != MVT::v4f32)
6313
0
    V1 = DAG.getBitcast(MVT::v4f32, V1);
6314
0
  if (V2.getSimpleValueType() != MVT::v4f32)
6315
0
    V2 = DAG.getBitcast(MVT::v4f32, V2);
6316
952
6317
952
  // Ok, we can emit an INSERTPS instruction.
6318
952
  unsigned ZMask = Zeroable.to_ulong();
6319
952
6320
952
  unsigned InsertPSMask = EltMaskIdx << 6 | EltIdx << 4 | ZMask;
6321
952
  assert((InsertPSMask & ~0xFFu) == 0 && "Invalid mask!");
6322
952
  SDLoc DL(Op);
6323
952
  SDValue Result = DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, V1, V2,
6324
952
                               DAG.getIntPtrConstant(InsertPSMask, DL));
6325
952
  return DAG.getBitcast(VT, Result);
6326
952
}
6327
6328
/// Return a vector logical shift node.
6329
static SDValue getVShift(bool isLeft, EVT VT, SDValue SrcOp, unsigned NumBits,
6330
                         SelectionDAG &DAG, const TargetLowering &TLI,
6331
57
                         const SDLoc &dl) {
6332
57
  assert(VT.is128BitVector() && "Unknown type for VShift");
6333
57
  MVT ShVT = MVT::v16i8;
6334
57
  unsigned Opc = isLeft ? X86ISD::VSHLDQ : X86ISD::VSRLDQ;
6335
57
  SrcOp = DAG.getBitcast(ShVT, SrcOp);
6336
57
  MVT ScalarShiftTy = TLI.getScalarShiftAmountTy(DAG.getDataLayout(), VT);
6337
57
  assert(NumBits % 8 == 0 && "Only support byte sized shifts");
6338
57
  SDValue ShiftVal = DAG.getConstant(NumBits/8, dl, ScalarShiftTy);
6339
57
  return DAG.getBitcast(VT, DAG.getNode(Opc, dl, ShVT, SrcOp, ShiftVal));
6340
57
}
6341
6342
static SDValue LowerAsSplatVectorLoad(SDValue SrcOp, MVT VT, const SDLoc &dl,
6343
1.04k
                                      SelectionDAG &DAG) {
6344
1.04k
6345
1.04k
  // Check if the scalar load can be widened into a vector load. And if
6346
1.04k
  // the address is "base + cst" see if the cst can be "absorbed" into
6347
1.04k
  // the shuffle mask.
6348
1.04k
  if (LoadSDNode *LD = dyn_cast<LoadSDNode>(SrcOp)) {
6349
27
    SDValue Ptr = LD->getBasePtr();
6350
27
    if (!ISD::isNormalLoad(LD) || LD->isVolatile())
6351
0
      return SDValue();
6352
27
    EVT PVT = LD->getValueType(0);
6353
27
    if (PVT != MVT::i32 && PVT != MVT::f32)
6354
0
      return SDValue();
6355
27
6356
27
    int FI = -1;
6357
27
    int64_t Offset = 0;
6358
27
    if (FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr)) {
6359
2
      FI = FINode->getIndex();
6360
2
      Offset = 0;
6361
27
    } else if (DAG.isBaseWithConstantOffset(Ptr) &&
6362
25
               isa<FrameIndexSDNode>(Ptr.getOperand(0))) {
6363
0
      FI = cast<FrameIndexSDNode>(Ptr.getOperand(0))->getIndex();
6364
0
      Offset = Ptr.getConstantOperandVal(1);
6365
0
      Ptr = Ptr.getOperand(0);
6366
25
    } else {
6367
25
      return SDValue();
6368
25
    }
6369
2
6370
2
    // FIXME: 256-bit vector instructions don't require a strict alignment,
6371
2
    // improve this code to support it better.
6372
2
    unsigned RequiredAlign = VT.getSizeInBits()/8;
6373
2
    SDValue Chain = LD->getChain();
6374
2
    // Make sure the stack object alignment is at least 16 or 32.
6375
2
    MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
6376
2
    if (DAG.InferPtrAlignment(Ptr) < RequiredAlign) {
6377
2
      if (MFI.isFixedObjectIndex(FI)) {
6378
2
        // Can't change the alignment. FIXME: It's possible to compute
6379
2
        // the exact stack offset and reference FI + adjust offset instead.
6380
2
        // If someone *really* cares about this. That's the way to implement it.
6381
2
        return SDValue();
6382
0
      } else {
6383
0
        MFI.setObjectAlignment(FI, RequiredAlign);
6384
0
      }
6385
2
    }
6386
2
6387
2
    // (Offset % 16 or 32) must be multiple of 4. Then address is then
6388
2
    // Ptr + (Offset & ~15).
6389
0
    if (Offset < 0)
6390
0
      return SDValue();
6391
0
    if ((Offset % RequiredAlign) & 3)
6392
0
      return SDValue();
6393
0
    int64_t StartOffset = Offset & ~int64_t(RequiredAlign - 1);
6394
0
    if (StartOffset) {
6395
0
      SDLoc DL(Ptr);
6396
0
      Ptr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr,
6397
0
                        DAG.getConstant(StartOffset, DL, Ptr.getValueType()));
6398
0
    }
6399
27
6400
27
    int EltNo = (Offset - StartOffset) >> 2;
6401
27
    unsigned NumElems = VT.getVectorNumElements();
6402
27
6403
27
    EVT NVT = EVT::getVectorVT(*DAG.getContext(), PVT, NumElems);
6404
27
    SDValue V1 = DAG.getLoad(NVT, dl, Chain, Ptr,
6405
27
                             LD->getPointerInfo().getWithOffset(StartOffset));
6406
27
6407
27
    SmallVector<int, 8> Mask(NumElems, EltNo);
6408
27
6409
27
    return DAG.getVectorShuffle(NVT, dl, V1, DAG.getUNDEF(NVT), Mask);
6410
27
  }
6411
1.01k
6412
1.01k
  return SDValue();
6413
1.01k
}
6414
6415
/// Given the initializing elements 'Elts' of a vector of type 'VT', see if the
6416
/// elements can be replaced by a single large load which has the same value as
6417
/// a build_vector or insert_subvector whose loaded operands are 'Elts'.
6418
///
6419
/// Example: <load i32 *a, load i32 *a+4, zero, undef> -> zextload a
6420
static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef<SDValue> Elts,
6421
                                        const SDLoc &DL, SelectionDAG &DAG,
6422
                                        const X86Subtarget &Subtarget,
6423
9.84k
                                        bool isAfterLegalize) {
6424
9.84k
  unsigned NumElems = Elts.size();
6425
9.84k
6426
9.84k
  int LastLoadedElt = -1;
6427
9.84k
  SmallBitVector LoadMask(NumElems, false);
6428
9.84k
  SmallBitVector ZeroMask(NumElems, false);
6429
9.84k
  SmallBitVector UndefMask(NumElems, false);
6430
9.84k
6431
9.84k
  // For each element in the initializer, see if we've found a load, zero or an
6432
9.84k
  // undef.
6433
21.4k
  for (unsigned i = 0; i < NumElems; ++i) {
6434
18.9k
    SDValue Elt = peekThroughBitcasts(Elts[i]);
6435
18.9k
    if (!Elt.getNode())
6436
0
      return SDValue();
6437
18.9k
6438
18.9k
    if (Elt.isUndef())
6439
2.40k
      UndefMask[i] = true;
6440
16.5k
    else if (X86::isZeroNode(Elt) || ISD::isBuildVectorAllZeros(Elt.getNode()))
6441
2.21k
      ZeroMask[i] = true;
6442
14.2k
    else if (ISD::isNON_EXTLoad(Elt.getNode())) {
6443
6.97k
      LoadMask[i] = true;
6444
6.97k
      LastLoadedElt = i;
6445
6.97k
      // Each loaded element must be the correct fractional portion of the
6446
6.97k
      // requested vector load.
6447
6.97k
      if ((NumElems * Elt.getValueSizeInBits()) != VT.getSizeInBits())
6448
0
        return SDValue();
6449
14.2k
    } else
6450
7.31k
      return SDValue();
6451
18.9k
  }
6452
2.53k
  assert((ZeroMask | UndefMask | LoadMask).count() == NumElems &&
6453
2.53k
         "Incomplete element masks");
6454
2.53k
6455
2.53k
  // Handle Special Cases - all undef or undef/zero.
6456
2.53k
  if (UndefMask.count() == NumElems)
6457
1
    return DAG.getUNDEF(VT);
6458
2.53k
6459
2.53k
  // FIXME: Should we return this as a BUILD_VECTOR instead?
6460
2.53k
  if ((ZeroMask | UndefMask).count() == NumElems)
6461
39
    return VT.isInteger() ? DAG.getConstant(0, DL, VT)
6462
6
                          : DAG.getConstantFP(0.0, DL, VT);
6463
2.49k
6464
2.49k
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
6465
2.49k
  int FirstLoadedElt = LoadMask.find_first();
6466
2.49k
  SDValue EltBase = peekThroughBitcasts(Elts[FirstLoadedElt]);
6467
2.49k
  LoadSDNode *LDBase = cast<LoadSDNode>(EltBase);
6468
2.49k
  EVT LDBaseVT = EltBase.getValueType();
6469
2.49k
6470
2.49k
  // Consecutive loads can contain UNDEFS but not ZERO elements.
6471
2.49k
  // Consecutive loads with UNDEFs and ZEROs elements require a
6472
2.49k
  // an additional shuffle stage to clear the ZERO elements.
6473
2.49k
  bool IsConsecutiveLoad = true;
6474
2.49k
  bool IsConsecutiveLoadWithZeros = true;
6475
4.46k
  for (int i = FirstLoadedElt + 1; i <= LastLoadedElt; ++i) {
6476
3.31k
    if (LoadMask[i]) {
6477
2.73k
      SDValue Elt = peekThroughBitcasts(Elts[i]);
6478
2.73k
      LoadSDNode *LD = cast<LoadSDNode>(Elt);
6479
2.73k
      if (!DAG.areNonVolatileConsecutiveLoads(
6480
2.73k
              LD, LDBase, Elt.getValueType().getStoreSizeInBits() / 8,
6481
2.73k
              i - FirstLoadedElt)) {
6482
1.34k
        IsConsecutiveLoad = false;
6483
1.34k
        IsConsecutiveLoadWithZeros = false;
6484
1.34k
        break;
6485
1.34k
      }
6486
578
    } else if (ZeroMask[i]) {
6487
110
      IsConsecutiveLoad = false;
6488
110
    }
6489
3.31k
  }
6490
2.49k
6491
461
  auto CreateLoad = [&DAG, &DL](EVT VT, LoadSDNode *LDBase) {
6492
461
    auto MMOFlags = LDBase->getMemOperand()->getFlags();
6493
461
    assert(!(MMOFlags & MachineMemOperand::MOVolatile) &&
6494
461
           "Cannot merge volatile loads.");
6495
461
    SDValue NewLd =
6496
461
        DAG.getLoad(VT, DL, LDBase->getChain(), LDBase->getBasePtr(),
6497
461
                    LDBase->getPointerInfo(), LDBase->getAlignment(), MMOFlags);
6498
461
    DAG.makeEquivalentMemoryOrdering(LDBase, NewLd);
6499
461
    return NewLd;
6500
461
  };
6501
2.49k
6502
2.49k
  // LOAD - all consecutive load/undefs (must start/end with a load).
6503
2.49k
  // If we have found an entire vector of loads and undefs, then return a large
6504
2.49k
  // load of the entire vector width starting at the base pointer.
6505
2.49k
  // If the vector contains zeros, then attempt to shuffle those elements.
6506
2.49k
  if (FirstLoadedElt == 0 && LastLoadedElt == (int)(NumElems - 1) &&
6507
2.49k
      (IsConsecutiveLoad || IsConsecutiveLoadWithZeros)) {
6508
507
    assert(LDBase && "Did not find base load for merging consecutive loads");
6509
507
    EVT EltVT = LDBase->getValueType(0);
6510
507
    // Ensure that the input vector size for the merged loads matches the
6511
507
    // cumulative size of the input elements.
6512
507
    if (VT.getSizeInBits() != EltVT.getSizeInBits() * NumElems)
6513
0
      return SDValue();
6514
507
6515
507
    if (isAfterLegalize && !TLI.isOperationLegal(ISD::LOAD, VT))
6516
0
      return SDValue();
6517
507
6518
507
    // Don't create 256-bit non-temporal aligned loads without AVX2 as these
6519
507
    // will lower to regular temporal loads and use the cache.
6520
507
    if (LDBase->isNonTemporal() && LDBase->getAlignment() >= 32 &&
6521
507
        VT.is256BitVector() && !Subtarget.hasInt256())
6522
46
      return SDValue();
6523
461
6524
461
    if (IsConsecutiveLoad)
6525
425
      return CreateLoad(VT, LDBase);
6526
36
6527
36
    // IsConsecutiveLoadWithZeros - we need to create a shuffle of the loaded
6528
36
    // vector and a zero vector to clear out the zero elements.
6529
36
    if (!isAfterLegalize && NumElems == VT.getVectorNumElements()) {
6530
36
      SmallVector<int, 4> ClearMask(NumElems, -1);
6531
368
      for (unsigned i = 0; i < NumElems; ++i) {
6532
332
        if (ZeroMask[i])
6533
58
          ClearMask[i] = i + NumElems;
6534
274
        else if (LoadMask[i])
6535
149
          ClearMask[i] = i;
6536
332
      }
6537
36
      SDValue V = CreateLoad(VT, LDBase);
6538
14
      SDValue Z = VT.isInteger() ? DAG.getConstant(0, DL, VT)
6539
22
                                 : DAG.getConstantFP(0.0, DL, VT);
6540
36
      return DAG.getVectorShuffle(VT, DL, V, Z, ClearMask);
6541
36
    }
6542
1.98k
  }
6543
1.98k
6544
1.98k
  int LoadSize =
6545
1.98k
      (1 + LastLoadedElt - FirstLoadedElt) * LDBaseVT.getStoreSizeInBits();
6546
1.98k
6547
1.98k
  // VZEXT_LOAD - consecutive 32/64-bit load/undefs followed by zeros/undefs.
6548
1.98k
  if (IsConsecutiveLoad && FirstLoadedElt == 0 &&
6549
619
      (LoadSize == 32 || LoadSize == 64) &&
6550
1.98k
      ((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()))) {
6551
259
    MVT VecSVT = VT.isFloatingPoint() ? MVT::getFloatingPointVT(LoadSize)
6552
284
                                      : MVT::getIntegerVT(LoadSize);
6553
543
    MVT VecVT = MVT::getVectorVT(VecSVT, VT.getSizeInBits() / LoadSize);
6554
543
    if (TLI.isTypeLegal(VecVT)) {
6555
530
      SDVTList Tys = DAG.getVTList(VecVT, MVT::Other);
6556
530
      SDValue Ops[] = { LDBase->getChain(), LDBase->getBasePtr() };
6557
530
      SDValue ResNode =
6558
530
          DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, DL, Tys, Ops, VecSVT,
6559
530
                                  LDBase->getPointerInfo(),
6560
530
                                  LDBase->getAlignment(),
6561
530
                                  false/*isVolatile*/, true/*ReadMem*/,
6562
530
                                  false/*WriteMem*/);
6563
530
      DAG.makeEquivalentMemoryOrdering(LDBase, ResNode);
6564
530
      return DAG.getBitcast(VT, ResNode);
6565
530
    }
6566
1.45k
  }
6567
1.45k
6568
1.45k
  return SDValue();
6569
1.45k
}
6570
6571
static Constant *getConstantVector(MVT VT, const APInt &SplatValue,
6572
106
                                   unsigned SplatBitSize, LLVMContext &C) {
6573
106
  unsigned ScalarSize = VT.getScalarSizeInBits();
6574
106
  unsigned NumElm = SplatBitSize / ScalarSize;
6575
106
6576
106
  SmallVector<Constant *, 32> ConstantVec;
6577
824
  for (unsigned i = 0; i < NumElm; i++) {
6578
718
    APInt Val = SplatValue.extractBits(ScalarSize, ScalarSize * i);
6579
718
    Constant *Const;
6580
718
    if (VT.isFloatingPoint()) {
6581
168
      if (ScalarSize == 32) {
6582
96
        Const = ConstantFP::get(C, APFloat(APFloat::IEEEsingle(), Val));
6583
168
      } else {
6584
72
        assert(ScalarSize == 64 && "Unsupported floating point scalar size");
6585
72
        Const = ConstantFP::get(C, APFloat(APFloat::IEEEdouble(), Val));
6586
72
      }
6587
168
    } else
6588
550
      Const = Constant::getIntegerValue(Type::getIntNTy(C, ScalarSize), Val);
6589
718
    ConstantVec.push_back(Const);
6590
718
  }
6591
106
  return ConstantVector::get(ArrayRef<Constant *>(ConstantVec));
6592
106
}
6593
6594
2.17k
static bool isUseOfShuffle(SDNode *N) {
6595
2.72k
  for (auto *U : N->uses()) {
6596
2.72k
    if (isTargetShuffle(U->getOpcode()))
6597
1.05k
      return true;
6598
1.66k
    if (U->getOpcode() == ISD::BITCAST) // Ignore bitcasts
6599
410
      return isUseOfShuffle(U);
6600
709
  }
6601
709
  return false;
6602
709
}
6603
6604
/// Attempt to use the vbroadcast instruction to generate a splat value
6605
/// from a splat BUILD_VECTOR which uses:
6606
///  a. A single scalar load, or a constant.
6607
///  b. Repeated pattern of constants (e.g. <0,1,0,1> or <0,1,2,3,0,1,2,3>).
6608
///
6609
/// The VBROADCAST node is returned when a pattern is found,
6610
/// or SDValue() otherwise.
6611
static SDValue lowerBuildVectorAsBroadcast(BuildVectorSDNode *BVOp,
6612
                                           const X86Subtarget &Subtarget,
6613
21.5k
                                           SelectionDAG &DAG) {
6614
21.5k
  // VBROADCAST requires AVX.
6615
21.5k
  // TODO: Splats could be generated for non-AVX CPUs using SSE
6616
21.5k
  // instructions, but there's less potential gain for only 128-bit vectors.
6617
21.5k
  if (!Subtarget.hasAVX())
6618
5.88k
    return SDValue();
6619
15.6k
6620
15.6k
  MVT VT = BVOp->getSimpleValueType(0);
6621
15.6k
  SDLoc dl(BVOp);
6622
15.6k
6623
15.6k
  assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) &&
6624
15.6k
         "Unsupported vector type for broadcast.");
6625
15.6k
6626
15.6k
  BitVector UndefElements;
6627
15.6k
  SDValue Ld = BVOp->getSplatValue(&UndefElements);
6628
15.6k
6629
15.6k
  // We need a splat of a single value to use broadcast, and it doesn't
6630
15.6k
  // make any sense if the value is only in one element of the vector.
6631
15.6k
  if (!Ld || (VT.getVectorNumElements() - UndefElements.count()) <= 1) {
6632
11.5k
    APInt SplatValue, Undef;
6633
11.5k
    unsigned SplatBitSize;
6634
11.5k
    bool HasUndef;
6635
11.5k
    // Check if this is a repeated constant pattern suitable for broadcasting.
6636
11.5k
    if (BVOp->isConstantSplat(SplatValue, Undef, SplatBitSize, HasUndef) &&
6637
7.82k
        SplatBitSize > VT.getScalarSizeInBits() &&
6638
11.5k
        
SplatBitSize < VT.getSizeInBits()7.79k
) {
6639
1.76k
      // Avoid replacing with broadcast when it's a use of a shuffle
6640
1.76k
      // instruction to preserve the present custom lowering of shuffles.
6641
1.76k
      if (isUseOfShuffle(BVOp) || BVOp->hasOneUse())
6642
1.46k
        return SDValue();
6643
302
      // replace BUILD_VECTOR with broadcast of the repeated constants.
6644
302
      const TargetLowering &TLI = DAG.getTargetLoweringInfo();
6645
302
      LLVMContext *Ctx = DAG.getContext();
6646
302
      MVT PVT = TLI.getPointerTy(DAG.getDataLayout());
6647
302
      if (Subtarget.hasAVX()) {
6648
302
        if (SplatBitSize <= 64 && Subtarget.hasAVX2() &&
6649
302
            !(SplatBitSize == 64 && Subtarget.is32Bit())) {
6650
112
          // Splatted value can fit in one INTEGER constant in constant pool.
6651
112
          // Load the constant and broadcast it.
6652
112
          MVT CVT = MVT::getIntegerVT(SplatBitSize);
6653
112
          Type *ScalarTy = Type::getIntNTy(*Ctx, SplatBitSize);
6654
112
          Constant *C = Constant::getIntegerValue(ScalarTy, SplatValue);
6655
112
          SDValue CP = DAG.getConstantPool(C, PVT);
6656
112
          unsigned Repeat = VT.getSizeInBits() / SplatBitSize;
6657
112
6658
112
          unsigned Alignment = cast<ConstantPoolSDNode>(CP)->getAlignment();
6659
112
          Ld = DAG.getLoad(
6660
112
              CVT, dl, DAG.getEntryNode(), CP,
6661
112
              MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
6662
112
              Alignment);
6663
112
          SDValue Brdcst = DAG.getNode(X86ISD::VBROADCAST, dl,
6664
112
                                       MVT::getVectorVT(CVT, Repeat), Ld);
6665
112
          return DAG.getBitcast(VT, Brdcst);
6666
190
        } else if (SplatBitSize == 32 || SplatBitSize == 64) {
6667
77
          // Splatted value can fit in one FLOAT constant in constant pool.
6668
77
          // Load the constant and broadcast it.
6669
77
          // AVX have support for 32 and 64 bit broadcast for floats only.
6670
77
          // No 64bit integer in 32bit subtarget.
6671
77
          MVT CVT = MVT::getFloatingPointVT(SplatBitSize);
6672
77
          // Lower the splat via APFloat directly, to avoid any conversion.
6673
77
          Constant *C =
6674
77
              SplatBitSize == 32
6675
14
                  ? ConstantFP::get(*Ctx,
6676
14
                                    APFloat(APFloat::IEEEsingle(), SplatValue))
6677
63
                  : ConstantFP::get(*Ctx,
6678
63
                                    APFloat(APFloat::IEEEdouble(), SplatValue));
6679
77
          SDValue CP = DAG.getConstantPool(C, PVT);
6680
77
          unsigned Repeat = VT.getSizeInBits() / SplatBitSize;
6681
77
6682
77
          unsigned Alignment = cast<ConstantPoolSDNode>(CP)->getAlignment();
6683
77
          Ld = DAG.getLoad(
6684
77
              CVT, dl, DAG.getEntryNode(), CP,
6685
77
              MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
6686
77
              Alignment);
6687
77
          SDValue Brdcst = DAG.getNode(X86ISD::VBROADCAST, dl,
6688
77
                                       MVT::getVectorVT(CVT, Repeat), Ld);
6689
77
          return DAG.getBitcast(VT, Brdcst);
6690
113
        } else if (SplatBitSize > 64) {
6691
106
          // Load the vector of constants and broadcast it.
6692
106
          MVT CVT = VT.getScalarType();
6693
106
          Constant *VecC = getConstantVector(VT, SplatValue, SplatBitSize,
6694
106
                                             *Ctx);
6695
106
          SDValue VCP = DAG.getConstantPool(VecC, PVT);
6696
106
          unsigned NumElm = SplatBitSize / VT.getScalarSizeInBits();
6697
106
          unsigned Alignment = cast<ConstantPoolSDNode>(VCP)->getAlignment();
6698
106
          Ld = DAG.getLoad(
6699
106
              MVT::getVectorVT(CVT, NumElm), dl, DAG.getEntryNode(), VCP,
6700
106
              MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
6701
106
              Alignment);
6702
106
          SDValue Brdcst = DAG.getNode(X86ISD::SUBV_BROADCAST, dl, VT, Ld);
6703
106
          return DAG.getBitcast(VT, Brdcst);
6704
106
        }
6705
9.75k
      }
6706
1.76k
    }
6707
9.75k
    return SDValue();
6708
9.75k
  }
6709
4.11k
6710
4.11k
  bool ConstSplatVal =
6711
2.16k
      (Ld.getOpcode() == ISD::Constant || Ld.getOpcode() == ISD::ConstantFP);
6712
4.11k
6713
4.11k
  // Make sure that all of the users of a non-constant load are from the
6714
4.11k
  // BUILD_VECTOR node.
6715
4.11k
  if (!ConstSplatVal && !BVOp->isOnlyUserOf(Ld.getNode()))
6716
60
    return SDValue();
6717
4.05k
6718
4.05k
  unsigned ScalarSize = Ld.getValueSizeInBits();
6719
4.05k
  bool IsGE256 = (VT.getSizeInBits() >= 256);
6720
4.05k
6721
4.05k
  // When optimizing for size, generate up to 5 extra bytes for a broadcast
6722
4.05k
  // instruction to save 8 or more bytes of constant pool data.
6723
4.05k
  // TODO: If multiple splats are generated to load the same constant,
6724
4.05k
  // it may be detrimental to overall size. There needs to be a way to detect
6725
4.05k
  // that condition to know if this is truly a size win.
6726
4.05k
  bool OptForSize = DAG.getMachineFunction().getFunction()->optForSize();
6727
4.05k
6728
4.05k
  // Handle broadcasting a single constant scalar from the constant pool
6729
4.05k
  // into a vector.
6730
4.05k
  // On Sandybridge (no AVX2), it is still better to load a constant vector
6731
4.05k
  // from the constant pool and not to broadcast it from a scalar.
6732
4.05k
  // But override that restriction when optimizing for size.
6733
4.05k
  // TODO: Check if splatting is recommended for other AVX-capable CPUs.
6734
4.05k
  if (ConstSplatVal && (Subtarget.hasAVX2() || OptForSize)) {
6735
1.85k
    EVT CVT = Ld.getValueType();
6736
1.85k
    assert(!CVT.isVector() && "Must not broadcast a vector type");
6737
1.85k
6738
1.85k
    // Splat f32, i32, v4f64, v4i64 in all cases with AVX2.
6739
1.85k
    // For size optimization, also splat v2f64 and v2i64, and for size opt
6740
1.85k
    // with AVX2, also splat i8 and i16.
6741
1.85k
    // With pattern matching, the VBROADCAST node may become a VMOVDDUP.
6742
1.85k
    if (ScalarSize == 32 || (IsGE256 && ScalarSize == 64) ||
6743
1.85k
        (OptForSize && (ScalarSize == 64 || Subtarget.hasAVX2()))) {
6744
811
      const Constant *C = nullptr;
6745
811
      if (ConstantSDNode *CI = dyn_cast<ConstantSDNode>(Ld))
6746
510
        C = CI->getConstantIntValue();
6747
301
      else if (ConstantFPSDNode *CF = dyn_cast<ConstantFPSDNode>(Ld))
6748
301
        C = CF->getConstantFPValue();
6749
811
6750
811
      assert(C && "Invalid constant type");
6751
811
6752
811
      const TargetLowering &TLI = DAG.getTargetLoweringInfo();
6753
811
      SDValue CP =
6754
811
          DAG.getConstantPool(C, TLI.getPointerTy(DAG.getDataLayout()));
6755
811
      unsigned Alignment = cast<ConstantPoolSDNode>(CP)->getAlignment();
6756
811
      Ld = DAG.getLoad(
6757
811
          CVT, dl, DAG.getEntryNode(), CP,
6758
811
          MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
6759
811
          Alignment);
6760
811
6761
811
      return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
6762
811
    }
6763
3.23k
  }
6764
3.23k
6765
3.23k
  bool IsLoad = ISD::isNormalLoad(Ld.getNode());
6766
3.23k
6767
3.23k
  // Handle AVX2 in-register broadcasts.
6768
3.23k
  if (!IsLoad && Subtarget.hasInt256() &&
6769
1.43k
      (ScalarSize == 32 || (IsGE256 && ScalarSize == 64)))
6770
234
    return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
6771
3.00k
6772
3.00k
  // The scalar source must be a normal load.
6773
3.00k
  
  if (!IsLoad)
6774
1.95k
    return SDValue();
6775
1.05k
6776
1.05k
  
  if (ScalarSize == 32 || (IsGE256 && ScalarSize == 64) ||
6777
232
      (Subtarget.hasVLX() && ScalarSize == 64))
6778
907
    return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
6779
148
6780
148
  // The integer check is needed for the 64-bit into 128-bit so it doesn't match
6781
148
  // double since there is no vbroadcastsd xmm
6782
148
  
  if (Subtarget.hasInt256() && Ld.getValueType().isInteger()) {
6783
120
    if (ScalarSize == 8 || ScalarSize == 16 || ScalarSize == 64)
6784
120
      return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
6785
28
  }
6786
28
6787
28
  // Unsupported broadcast.
6788
28
  return SDValue();
6789
28
}
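The arithmetic in the constant-pool path above is just a repeat count: the SplatBitSize-bit pattern is loaded once and broadcast VT.getSizeInBits() / SplatBitSize times, then bitcast back to VT. A minimal stand-alone sketch of that computation (plain C++; the helper name is made up and is not part of this file):
#include <cassert>
#include <cstdio>

// Given a vector of VTBits total bits whose content repeats every
// SplatBitSize bits, return how many broadcast lanes are needed.
static unsigned broadcastRepeatCount(unsigned VTBits, unsigned SplatBitSize) {
  assert(SplatBitSize != 0 && VTBits % SplatBitSize == 0 &&
         "splat pattern must evenly divide the vector width");
  return VTBits / SplatBitSize;
}

int main() {
  // e.g. a 256-bit vector whose bytes repeat every 64 bits is lowered as a
  // 4-lane broadcast of one 64-bit element, then bitcast back to VT.
  std::printf("%u\n", broadcastRepeatCount(256, 64)); // prints 4
}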
6790
6791
/// \brief For an EXTRACT_VECTOR_ELT with a constant index return the real
6792
/// underlying vector and index.
6793
///
6794
/// Modifies \p ExtractedFromVec to the real vector and returns the real
6795
/// index.
6796
static int getUnderlyingExtractedFromVec(SDValue &ExtractedFromVec,
6797
133
                                         SDValue ExtIdx) {
6798
133
  int Idx = cast<ConstantSDNode>(ExtIdx)->getZExtValue();
6799
133
  if (!isa<ShuffleVectorSDNode>(ExtractedFromVec))
6800
133
    return Idx;
6801
0
6802
0
  // For 256-bit vectors, LowerEXTRACT_VECTOR_ELT_SSE4 may have already
6803
0
  // lowered this:
6804
0
  //   (extract_vector_elt (v8f32 %vreg1), Constant<6>)
6805
0
  // to:
6806
0
  //   (extract_vector_elt (vector_shuffle<2,u,u,u>
6807
0
  //                           (extract_subvector (v8f32 %vreg0), Constant<4>),
6808
0
  //                           undef)
6809
0
  //                       Constant<0>)
6810
0
  // In this case the vector is the extract_subvector expression and the index
6811
0
  // is 2, as specified by the shuffle.
6812
0
  ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(ExtractedFromVec);
6813
0
  SDValue ShuffleVec = SVOp->getOperand(0);
6814
0
  MVT ShuffleVecVT = ShuffleVec.getSimpleValueType();
6815
0
  assert(ShuffleVecVT.getVectorElementType() ==
6816
0
         ExtractedFromVec.getSimpleValueType().getVectorElementType());
6817
0
6818
0
  int ShuffleIdx = SVOp->getMaskElt(Idx);
6819
0
  if (isUndefOrInRange(ShuffleIdx, 0, ShuffleVecVT.getVectorNumElements())) {
6820
0
    ExtractedFromVec = ShuffleVec;
6821
0
    return ShuffleIdx;
6822
0
  }
6823
0
  return Idx;
6824
0
}
6825
6826
924
static SDValue buildFromShuffleMostly(SDValue Op, SelectionDAG &DAG) {
6827
924
  MVT VT = Op.getSimpleValueType();
6828
924
6829
924
  // Skip if insert_vec_elt is not supported.
6830
924
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
6831
924
  if (!TLI.isOperationLegalOrCustom(ISD::INSERT_VECTOR_ELT, VT))
6832
119
    return SDValue();
6833
805
6834
805
  SDLoc DL(Op);
6835
805
  unsigned NumElems = Op.getNumOperands();
6836
805
6837
805
  SDValue VecIn1;
6838
805
  SDValue VecIn2;
6839
805
  SmallVector<unsigned, 4> InsertIndices;
6840
805
  SmallVector<int, 8> Mask(NumElems, -1);
6841
805
6842
2.70k
  for (unsigned i = 0; i != NumElems; ++i) {
6843
2.52k
    unsigned Opc = Op.getOperand(i).getOpcode();
6844
2.52k
6845
2.52k
    if (Opc == ISD::UNDEF)
6846
357
      continue;
6847
2.16k
6848
2.16k
    
    if (Opc != ISD::EXTRACT_VECTOR_ELT) {
6849
2.01k
      // Quit if more than 1 elements need inserting.
6850
2.01k
      if (InsertIndices.size() > 1)
6851
575
        return SDValue();
6852
1.43k
6853
1.43k
      InsertIndices.push_back(i);
6854
1.43k
      continue;
6855
1.43k
    }
6856
152
6857
152
    SDValue ExtractedFromVec = Op.getOperand(i).getOperand(0);
6858
152
    SDValue ExtIdx = Op.getOperand(i).getOperand(1);
6859
152
6860
152
    // Quit if non-constant index.
6861
152
    if (!isa<ConstantSDNode>(ExtIdx))
6862
19
      return SDValue();
6863
133
    int Idx = getUnderlyingExtractedFromVec(ExtractedFromVec, ExtIdx);
6864
133
6865
133
    // Quit if extracted from vector of different type.
6866
133
    if (ExtractedFromVec.getValueType() != VT)
6867
24
      return SDValue();
6868
109
6869
109
    
    if (!VecIn1.getNode())
6870
55
      VecIn1 = ExtractedFromVec;
6871
54
    else if (VecIn1 != ExtractedFromVec) {
6872
6
      if (!VecIn2.getNode())
6873
3
        VecIn2 = ExtractedFromVec;
6874
3
      else if (VecIn2 != ExtractedFromVec)
6875
3
        // Quit if more than 2 vectors to shuffle
6876
3
        return SDValue();
6877
106
    }
6878
106
6879
106
    
    if (ExtractedFromVec == VecIn1)
6880
103
      Mask[i] = Idx;
6881
3
    else if (ExtractedFromVec == VecIn2)
6882
3
      Mask[i] = Idx + NumElems;
6883
2.52k
  }
6884
805
6885
184
  
  if (!VecIn1.getNode())
6886
137
    return SDValue();
6887
47
6888
47
  
  VecIn2 = VecIn2.getNode() ? VecIn2 : DAG.getUNDEF(VT);
6889
47
  SDValue NV = DAG.getVectorShuffle(VT, DL, VecIn1, VecIn2, Mask);
6890
47
6891
47
  for (unsigned Idx : InsertIndices)
6892
15
    NV = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, NV, Op.getOperand(Idx),
6893
15
                     DAG.getIntPtrConstant(Idx, DL));
6894
924
6895
924
  return NV;
6896
924
}
6897
6898
1.82k
static SDValue ConvertI1VectorToInteger(SDValue Op, SelectionDAG &DAG) {
6899
1.82k
  assert(ISD::isBuildVectorOfConstantSDNodes(Op.getNode()) &&
6900
1.82k
         Op.getScalarValueSizeInBits() == 1 &&
6901
1.82k
         "Can not convert non-constant vector");
6902
1.82k
  uint64_t Immediate = 0;
6903
21.6k
  for (unsigned idx = 0, e = Op.getNumOperands(); idx < e; ++idx) {
6904
19.8k
    SDValue In = Op.getOperand(idx);
6905
19.8k
    if (!In.isUndef())
6906
19.8k
      Immediate |= (cast<ConstantSDNode>(In)->getZExtValue() & 0x1) << idx;
6907
19.8k
  }
6908
1.82k
  SDLoc dl(Op);
6909
1.82k
  MVT VT = MVT::getIntegerVT(std::max((int)Op.getValueSizeInBits(), 8));
6910
1.82k
  return DAG.getConstant(Immediate, dl, VT);
6911
1.82k
}
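The loop above packs the constant i1 lanes into an immediate by OR-ing each defined element into bit idx. A stand-alone sketch of the same bit-packing (hypothetical helper, not the DAG code):
#include <cstdint>
#include <cstdio>
#include <vector>

// Pack a constant vector of 0/1 lanes into an immediate, lowest lane first,
// mirroring the Immediate |= (Val & 0x1) << idx loop above.
static uint64_t packI1Vector(const std::vector<int> &Lanes) {
  uint64_t Immediate = 0;
  for (size_t Idx = 0; Idx < Lanes.size(); ++Idx)
    Immediate |= (static_cast<uint64_t>(Lanes[Idx]) & 0x1) << Idx;
  return Immediate;
}

int main() {
  // <8 x i1> <1,0,1,1,0,0,0,1> packs to 0b10001101 == 0x8D.
  std::printf("0x%llX\n",
              (unsigned long long)packI1Vector({1, 0, 1, 1, 0, 0, 0, 1}));
}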
6912
// Lower BUILD_VECTOR operation for v8i1 and v16i1 types.
6913
SDValue
6914
7.16k
X86TargetLowering::LowerBUILD_VECTORvXi1(SDValue Op, SelectionDAG &DAG) const {
6915
7.16k
6916
7.16k
  MVT VT = Op.getSimpleValueType();
6917
7.16k
  assert((VT.getVectorElementType() == MVT::i1) &&
6918
7.16k
         "Unexpected type in LowerBUILD_VECTORvXi1!");
6919
7.16k
6920
7.16k
  SDLoc dl(Op);
6921
7.16k
  if (ISD::isBuildVectorAllZeros(Op.getNode()))
6922
4.66k
    return DAG.getTargetConstant(0, dl, VT);
6923
2.50k
6924
2.50k
  
  if (ISD::isBuildVectorAllOnes(Op.getNode()))
6925
493
    return DAG.getTargetConstant(1, dl, VT);
6926
2.01k
6927
2.01k
  
  if (ISD::isBuildVectorOfConstantSDNodes(Op.getNode())) {
6928
1.81k
    if (VT == MVT::v64i1 && !Subtarget.is64Bit()) {
6929
1
      // Split the pieces.
6930
1
      SDValue Lower =
6931
1
          DAG.getBuildVector(MVT::v32i1, dl, Op.getNode()->ops().slice(0, 32));
6932
1
      SDValue Upper =
6933
1
          DAG.getBuildVector(MVT::v32i1, dl, Op.getNode()->ops().slice(32, 32));
6934
1
      // We have to manually lower both halves so getNode doesn't try to
6935
1
      // reassemble the build_vector.
6936
1
      Lower = LowerBUILD_VECTORvXi1(Lower, DAG);
6937
1
      Upper = LowerBUILD_VECTORvXi1(Upper, DAG);
6938
1
      return DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, Lower, Upper);
6939
1
    }
6940
1.81k
    SDValue Imm = ConvertI1VectorToInteger(Op, DAG);
6941
1.81k
    if (Imm.getValueSizeInBits() == VT.getSizeInBits())
6942
1.22k
      return DAG.getBitcast(VT, Imm);
6943
587
    SDValue ExtVec = DAG.getBitcast(MVT::v8i1, Imm);
6944
587
    return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, ExtVec,
6945
587
                        DAG.getIntPtrConstant(0, dl));
6946
587
  }
6947
195
6948
195
  // Vector has one or more non-const elements
6949
195
  uint64_t Immediate = 0;
6950
195
  SmallVector<unsigned, 16> NonConstIdx;
6951
195
  bool IsSplat = true;
6952
195
  bool HasConstElts = false;
6953
195
  int SplatIdx = -1;
6954
2.77k
  for (unsigned idx = 0, e = Op.getNumOperands(); idx < e; ++idx) {
6955
2.57k
    SDValue In = Op.getOperand(idx);
6956
2.57k
    if (In.isUndef())
6957
55
      continue;
6958
2.52k
    
    if (!isa<ConstantSDNode>(In))
6959
886
      NonConstIdx.push_back(idx);
6960
1.63k
    else {
6961
1.63k
      Immediate |= (cast<ConstantSDNode>(In)->getZExtValue() & 0x1) << idx;
6962
1.63k
      HasConstElts = true;
6963
1.63k
    }
6964
2.52k
    if (SplatIdx < 0)
6965
195
      SplatIdx = idx;
6966
2.32k
    else if (In != Op.getOperand(SplatIdx))
6967
2.31k
      IsSplat = false;
6968
2.57k
  }
6969
195
6970
195
  // for splat use " (select i1 splat_elt, all-ones, all-zeroes)"
6971
195
  if (IsSplat)
6972
4
    return DAG.getSelect(dl, VT, Op.getOperand(SplatIdx),
6973
4
                         DAG.getConstant(1, dl, VT),
6974
4
                         DAG.getConstant(0, dl, VT));
6975
191
6976
191
  // insert elements one by one
6977
191
  SDValue DstVec;
6978
191
  SDValue Imm;
6979
191
  if (Immediate) {
6980
5
    MVT ImmVT = MVT::getIntegerVT(std::max((int)VT.getSizeInBits(), 8));
6981
5
    Imm = DAG.getConstant(Immediate, dl, ImmVT);
6982
5
  }
6983
186
  else if (HasConstElts)
6984
180
    Imm = DAG.getConstant(0, dl, VT);
6985
186
  else
6986
6
    Imm = DAG.getUNDEF(VT);
6987
191
  if (Imm.getValueSizeInBits() == VT.getSizeInBits())
6988
191
    DstVec = DAG.getBitcast(VT, Imm);
6989
0
  else {
6990
0
    SDValue ExtVec = DAG.getBitcast(MVT::v8i1, Imm);
6991
0
    DstVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, ExtVec,
6992
0
                         DAG.getIntPtrConstant(0, dl));
6993
0
  }
6994
191
6995
1.05k
  for (unsigned i = 0, e = NonConstIdx.size(); i != e; ++i) {
6996
867
    unsigned InsertIdx = NonConstIdx[i];
6997
867
    DstVec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, DstVec,
6998
867
                         Op.getOperand(InsertIdx),
6999
867
                         DAG.getIntPtrConstant(InsertIdx, dl));
7000
867
  }
7001
7.16k
  return DstVec;
7002
7.16k
}
7003
7004
/// \brief Return true if \p N implements a horizontal binop and return the
7005
/// operands for the horizontal binop into V0 and V1.
7006
///
7007
/// This is a helper function of LowerToHorizontalOp().
7008
/// This function checks that the build_vector \p N in input implements a
7009
/// horizontal operation. Parameter \p Opcode defines the kind of horizontal
7010
/// operation to match.
7011
/// For example, if \p Opcode is equal to ISD::ADD, then this function
7012
/// checks if \p N implements a horizontal arithmetic add; if instead \p Opcode
7013
/// is equal to ISD::SUB, then this function checks if this is a horizontal
7014
/// arithmetic sub.
7015
///
7016
/// This function only analyzes elements of \p N whose indices are
7017
/// in range [BaseIdx, LastIdx).
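Concretely, for Opcode == ISD::ADD the build_vector being matched computes out[i] = A[2*i] + A[2*i+1] in the low half (and likewise from a second vector in the high half), which is the lane-wise behaviour of HADD. A scalar model of a 4-lane horizontal add (illustrative only; not taken from this file):
#include <array>
#include <cstdio>

// Scalar model of a 4-lane horizontal add: each output lane sums an
// adjacent pair, low output lanes from A, high output lanes from B.
static std::array<int, 4> hadd4(const std::array<int, 4> &A,
                                const std::array<int, 4> &B) {
  return {A[0] + A[1], A[2] + A[3], B[0] + B[1], B[2] + B[3]};
}

int main() {
  auto R = hadd4({1, 2, 3, 4}, {10, 20, 30, 40});
  std::printf("%d %d %d %d\n", R[0], R[1], R[2], R[3]); // 3 7 30 70
}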
7018
static bool isHorizontalBinOp(const BuildVectorSDNode *N, unsigned Opcode,
7019
                              SelectionDAG &DAG,
7020
                              unsigned BaseIdx, unsigned LastIdx,
7021
27.0k
                              SDValue &V0, SDValue &V1) {
7022
27.0k
  EVT VT = N->getValueType(0);
7023
27.0k
7024
27.0k
  assert(BaseIdx * 2 <= LastIdx && "Invalid Indices in input!");
7025
27.0k
  assert(VT.isVector() && VT.getVectorNumElements() >= LastIdx &&
7026
27.0k
         "Invalid Vector in input!");
7027
27.0k
7028
19.4k
  bool IsCommutable = (Opcode == ISD::ADD || Opcode == ISD::FADD);
7029
27.0k
  bool CanFold = true;
7030
27.0k
  unsigned ExpectedVExtractIdx = BaseIdx;
7031
27.0k
  unsigned NumElts = LastIdx - BaseIdx;
7032
27.0k
  V0 = DAG.getUNDEF(VT);
7033
27.0k
  V1 = DAG.getUNDEF(VT);
7034
27.0k
7035
27.0k
  // Check if N implements a horizontal binop.
7036
28.7k
  for (unsigned i = 0, e = NumElts; i != e && CanFold; ++i) {
7037
28.5k
    SDValue Op = N->getOperand(i + BaseIdx);
7038
28.5k
7039
28.5k
    // Skip UNDEFs.
7040
28.5k
    if (Op->isUndef()) {
7041
1.11k
      // Update the expected vector extract index.
7042
1.11k
      if (i * 2 == NumElts)
7043
91
        ExpectedVExtractIdx = BaseIdx;
7044
1.11k
      ExpectedVExtractIdx += 2;
7045
1.11k
      continue;
7046
1.11k
    }
7047
27.4k
7048
27.4k
    
    CanFold = Op->getOpcode() == Opcode && Op->hasOneUse();
7049
27.4k
7050
27.4k
    if (!CanFold)
7051
26.8k
      break;
7052
611
7053
611
    SDValue Op0 = Op.getOperand(0);
7054
611
    SDValue Op1 = Op.getOperand(1);
7055
611
7056
611
    // Try to match the following pattern:
7057
611
    // (BINOP (extract_vector_elt A, I), (extract_vector_elt A, I+1))
7058
611
    CanFold = (Op0.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
7059
600
        Op1.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
7060
598
        Op0.getOperand(0) == Op1.getOperand(0) &&
7061
592
        isa<ConstantSDNode>(Op0.getOperand(1)) &&
7062
592
        isa<ConstantSDNode>(Op1.getOperand(1)));
7063
611
    if (!CanFold)
7064
19
      break;
7065
592
7066
592
    unsigned I0 = cast<ConstantSDNode>(Op0.getOperand(1))->getZExtValue();
7067
592
    unsigned I1 = cast<ConstantSDNode>(Op1.getOperand(1))->getZExtValue();
7068
592
7069
592
    if (i * 2 < NumElts) {
7070
323
      if (V0.isUndef()) {
7071
173
        V0 = Op0.getOperand(0);
7072
173
        if (V0.getValueType() != VT)
7073
0
          return false;
7074
592
      }
7075
0
    } else {
7076
269
      if (V1.isUndef()) {
7077
157
        V1 = Op0.getOperand(0);
7078
157
        if (V1.getValueType() != VT)
7079
0
          return false;
7080
269
      }
7081
269
      
      if (i * 2 == NumElts)
7082
145
        ExpectedVExtractIdx = BaseIdx;
7083
269
    }
7084
592
7085
592
    
    SDValue Expected = (i * 2 < NumElts) ? V0 : V1;
7086
592
    if (I0 == ExpectedVExtractIdx)
7087
552
      
      CanFold = I1 == I0 + 1 && Op0.getOperand(0) == Expected;
7088
40
    else if (IsCommutable && I1 == ExpectedVExtractIdx) {
7089
14
      // Try to match the following dag sequence:
7090
14
      // (BINOP (extract_vector_elt A, I+1), (extract_vector_elt A, I))
7091
14
      CanFold = I0 == I1 + 1 && Op1.getOperand(0) == Expected;
7092
14
    } else
7093
26
      CanFold = false;
7094
28.5k
7095
28.5k
    ExpectedVExtractIdx += 2;
7096
28.5k
  }
7097
27.0k
7098
27.0k
  return CanFold;
7099
27.0k
}
7100
7101
/// \brief Emit a sequence of two 128-bit horizontal add/sub followed by
7102
/// a concat_vector.
7103
///
7104
/// This is a helper function of LowerToHorizontalOp().
7105
/// This function expects two 256-bit vectors called V0 and V1.
7106
/// At first, each vector is split into two separate 128-bit vectors.
7107
/// Then, the resulting 128-bit vectors are used to implement two
7108
/// horizontal binary operations.
7109
///
7110
/// The kind of horizontal binary operation is defined by \p X86Opcode.
7111
///
7112
/// \p Mode specifies how the 128-bit parts of V0 and V1 are passed in input to
7113
/// the two new horizontal binop.
7114
/// When Mode is set, the first horizontal binop dag node would take as input
7115
/// the lower 128-bit of V0 and the upper 128-bit of V0. The second
7116
/// horizontal binop dag node would take as input the lower 128-bit of V1
7117
/// and the upper 128-bit of V1.
7118
///   Example:
7119
///     HADD V0_LO, V0_HI
7120
///     HADD V1_LO, V1_HI
7121
///
7122
/// Otherwise, the first horizontal binop dag node takes as input the lower
7123
/// 128-bit of V0 and the lower 128-bit of V1, and the second horizontal binop
7124
/// dag node takes the upper 128-bit of V0 and the upper 128-bit of V1.
7125
///   Example:
7126
///     HADD V0_LO, V1_LO
7127
///     HADD V0_HI, V1_HI
7128
///
7129
/// If \p isUndefLO is set, then the algorithm propagates UNDEF to the lower
7130
/// 128-bits of the result. If \p isUndefHI is set, then UNDEF is propagated to
7131
/// the upper 128-bits of the result.
7132
static SDValue ExpandHorizontalBinOp(const SDValue &V0, const SDValue &V1,
7133
                                     const SDLoc &DL, SelectionDAG &DAG,
7134
                                     unsigned X86Opcode, bool Mode,
7135
16
                                     bool isUndefLO, bool isUndefHI) {
7136
16
  MVT VT = V0.getSimpleValueType();
7137
16
  assert(VT.is256BitVector() && VT == V1.getSimpleValueType() &&
7138
16
         "Invalid nodes in input!");
7139
16
7140
16
  unsigned NumElts = VT.getVectorNumElements();
7141
16
  SDValue V0_LO = extract128BitVector(V0, 0, DAG, DL);
7142
16
  SDValue V0_HI = extract128BitVector(V0, NumElts/2, DAG, DL);
7143
16
  SDValue V1_LO = extract128BitVector(V1, 0, DAG, DL);
7144
16
  SDValue V1_HI = extract128BitVector(V1, NumElts/2, DAG, DL);
7145
16
  MVT NewVT = V0_LO.getSimpleValueType();
7146
16
7147
16
  SDValue LO = DAG.getUNDEF(NewVT);
7148
16
  SDValue HI = DAG.getUNDEF(NewVT);
7149
16
7150
16
  if (Mode) {
7151
12
    // Don't emit a horizontal binop if the result is expected to be UNDEF.
7152
12
    if (!isUndefLO && !V0->isUndef())
7153
12
      LO = DAG.getNode(X86Opcode, DL, NewVT, V0_LO, V0_HI);
7154
12
    if (!isUndefHI && !V1->isUndef())
7155
8
      HI = DAG.getNode(X86Opcode, DL, NewVT, V1_LO, V1_HI);
7156
16
  } else {
7157
4
    // Don't emit a horizontal binop if the result is expected to be UNDEF.
7158
4
    if (!isUndefLO && (!V0_LO->isUndef() || !V1_LO->isUndef()))
7159
4
      LO = DAG.getNode(X86Opcode, DL, NewVT, V0_LO, V1_LO);
7160
4
7161
4
    if (!isUndefHI && (!V0_HI->isUndef() || !V1_HI->isUndef()))
7162
2
      HI = DAG.getNode(X86Opcode, DL, NewVT, V0_HI, V1_HI);
7163
4
  }
7164
16
7165
16
  return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LO, HI);
7166
16
}
7167
7168
/// Returns true iff \p BV builds a vector with the result equivalent to
7169
/// the result of ADDSUB operation.
7170
/// If true is returned then the operands of ADDSUB = Opnd0 +- Opnd1 operation
7171
/// are written to the parameters \p Opnd0 and \p Opnd1.
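The recognized idiom is a build_vector whose even lanes come from an FSUB and whose odd lanes come from an FADD of the same two inputs, i.e. the ADDSUBPS/PD semantics. A scalar model of that result (illustrative only; not taken from this file):
#include <array>
#include <cstdio>

// Scalar model of a 4-lane ADDSUB: even lanes subtract, odd lanes add.
static std::array<float, 4> addsub4(const std::array<float, 4> &A,
                                    const std::array<float, 4> &B) {
  return {A[0] - B[0], A[1] + B[1], A[2] - B[2], A[3] + B[3]};
}

int main() {
  auto R = addsub4({1, 2, 3, 4}, {10, 10, 10, 10});
  std::printf("%g %g %g %g\n", R[0], R[1], R[2], R[3]); // -9 12 -7 14
}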
7172
static bool isAddSub(const BuildVectorSDNode *BV,
7173
                     const X86Subtarget &Subtarget, SelectionDAG &DAG,
7174
21.6k
                     SDValue &Opnd0, SDValue &Opnd1) {
7175
21.6k
7176
21.6k
  MVT VT = BV->getSimpleValueType(0);
7177
21.6k
  if ((!Subtarget.hasSSE3() || (VT != MVT::v4f32 && VT != MVT::v2f64)) &&
7178
19.2k
      (!Subtarget.hasAVX() || (VT != MVT::v8f32 && VT != MVT::v4f64)) &&
7179
18.2k
      (!Subtarget.hasAVX512() || (VT != MVT::v16f32 && VT != MVT::v8f64)))
7180
17.9k
    return false;
7181
3.74k
7182
3.74k
  unsigned NumElts = VT.getVectorNumElements();
7183
3.74k
  SDValue InVec0 = DAG.getUNDEF(VT);
7184
3.74k
  SDValue InVec1 = DAG.getUNDEF(VT);
7185
3.74k
7186
3.74k
  // Odd-numbered elements in the input build vector are obtained from
7187
3.74k
  // adding two integer/float elements.
7188
3.74k
  // Even-numbered elements in the input build vector are obtained from
7189
3.74k
  // subtracting two integer/float elements.
7190
3.74k
  unsigned ExpectedOpcode = ISD::FSUB;
7191
3.74k
  unsigned NextExpectedOpcode = ISD::FADD;
7192
3.74k
  bool AddFound = false;
7193
3.74k
  bool SubFound = false;
7194
3.74k
7195
3.98k
  for (unsigned i = 0, e = NumElts; i != e; ++i) {
7196
3.93k
    SDValue Op = BV->getOperand(i);
7197
3.93k
7198
3.93k
    // Skip 'undef' values.
7199
3.93k
    unsigned Opcode = Op.getOpcode();
7200
3.93k
    if (Opcode == ISD::UNDEF) {
7201
159
      std::swap(ExpectedOpcode, NextExpectedOpcode);
7202
159
      continue;
7203
159
    }
7204
3.77k
7205
3.77k
    // Early exit if we found an unexpected opcode.
7206
3.77k
    
    if (Opcode != ExpectedOpcode)
7207
3.63k
      return false;
7208
137
7209
137
    SDValue Op0 = Op.getOperand(0);
7210
137
    SDValue Op1 = Op.getOperand(1);
7211
137
7212
137
    // Try to match the following pattern:
7213
137
    // (BINOP (extract_vector_elt A, i), (extract_vector_elt B, i))
7214
137
    // Early exit if we cannot match that sequence.
7215
137
    if (Op0.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
7216
127
        Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
7217
122
        !isa<ConstantSDNode>(Op0.getOperand(1)) ||
7218
122
        !isa<ConstantSDNode>(Op1.getOperand(1)) ||
7219
122
        Op0.getOperand(1) != Op1.getOperand(1))
7220
57
      return false;
7221
80
7222
80
    unsigned I0 = cast<ConstantSDNode>(Op0.getOperand(1))->getZExtValue();
7223
80
    if (I0 != i)
7224
0
      return false;
7225
80
7226
80
    // We found a valid add/sub node. Update the information accordingly.
7227
80
    
    if (i & 1)
7228
40
      AddFound = true;
7229
80
    else
7230
40
      SubFound = true;
7231
80
7232
80
    // Update InVec0 and InVec1.
7233
80
    if (InVec0.isUndef()) {
7234
34
      InVec0 = Op0.getOperand(0);
7235
34
      if (InVec0.getSimpleValueType() != VT)
7236
0
        return false;
7237
80
    }
7238
80
    
    if (InVec1.isUndef()) {
7239
34
      InVec1 = Op1.getOperand(0);
7240
34
      if (InVec1.getSimpleValueType() != VT)
7241
0
        return false;
7242
80
    }
7243
80
7244
80
    // Make sure that operands in input to each add/sub node always
7245
80
    // come from a same pair of vectors.
7246
80
    
    if (InVec0 != Op0.getOperand(0)) {
7247
4
      if (ExpectedOpcode == ISD::FSUB)
7248
0
        return false;
7249
4
7250
4
      // FADD is commutable. Try to commute the operands
7251
4
      // and then test again.
7252
4
      std::swap(Op0, Op1);
7253
4
      if (InVec0 != Op0.getOperand(0))
7254
0
        return false;
7255
80
    }
7256
80
7257
80
    
    if (InVec1 != Op1.getOperand(0))
7258
0
      return false;
7259
80
7260
80
    // Update the pair of expected opcodes.
7261
80
    std::swap(ExpectedOpcode, NextExpectedOpcode);
7262
80
  }
7263
3.74k
7264
3.74k
  // Don't try to fold this build_vector into an ADDSUB if the inputs are undef.
7265
48
  
  if (!AddFound || !SubFound || InVec0.isUndef() || InVec1.isUndef())
7266
26
    return false;
7267
22
7268
22
  Opnd0 = InVec0;
7269
22
  Opnd1 = InVec1;
7270
22
  return true;
7271
22
}
7272
7273
/// Returns true if is possible to fold MUL and an idiom that has already been
7274
/// recognized as ADDSUB(\p Opnd0, \p Opnd1) into FMADDSUB(x, y, \p Opnd1).
7275
/// If (and only if) true is returned, the operands of FMADDSUB are written to
7276
/// parameters \p Opnd0, \p Opnd1, \p Opnd2.
7277
///
7278
/// Prior to calling this function it should be known that there is some
7279
/// SDNode that potentially can be replaced with an X86ISD::ADDSUB operation
7280
/// using \p Opnd0 and \p Opnd1 as operands. Also, this method is called
7281
/// before replacement of such SDNode with ADDSUB operation. Thus the number
7282
/// of \p Opnd0 uses is expected to be equal to 2.
7283
/// For example, this function may be called for the following IR:
7284
///    %AB = fmul fast <2 x double> %A, %B
7285
///    %Sub = fsub fast <2 x double> %AB, %C
7286
///    %Add = fadd fast <2 x double> %AB, %C
7287
///    %Addsub = shufflevector <2 x double> %Sub, <2 x double> %Add,
7288
///                            <2 x i32> <i32 0, i32 3>
7289
/// There is a def for %Addsub here, which potentially can be replaced by
7290
/// X86ISD::ADDSUB operation:
7291
///    %Addsub = X86ISD::ADDSUB %AB, %C
7292
/// and such ADDSUB can further be replaced with FMADDSUB:
7293
///    %Addsub = FMADDSUB %A, %B, %C.
7294
///
7295
/// The main reason why this method is called before the replacement of the
7296
/// recognized ADDSUB idiom with ADDSUB operation is that such replacement
7297
/// is illegal sometimes. E.g. 512-bit ADDSUB is not available, while 512-bit
7298
/// FMADDSUB is.
7299
static bool isFMAddSub(const X86Subtarget &Subtarget, SelectionDAG &DAG,
7300
102
                       SDValue &Opnd0, SDValue &Opnd1, SDValue &Opnd2) {
7301
102
  if (Opnd0.getOpcode() != ISD::FMUL || Opnd0->use_size() != 2 ||
7302
24
      !Subtarget.hasAnyFMA())
7303
78
    return false;
7304
24
7305
24
  // FIXME: These checks must match the similar ones in
7306
24
  // DAGCombiner::visitFADDForFMACombine. It would be good to have one
7307
24
  // function that would answer if it is Ok to fuse MUL + ADD to FMADD
7308
24
  // or MUL + ADDSUB to FMADDSUB.
7309
24
  const TargetOptions &Options = DAG.getTarget().Options;
7310
24
  bool AllowFusion =
7311
24
      (Options.AllowFPOpFusion == FPOpFusion::Fast || Options.UnsafeFPMath);
7312
24
  if (!AllowFusion)
7313
2
    return false;
7314
22
7315
22
  Opnd2 = Opnd1;
7316
22
  Opnd1 = Opnd0.getOperand(1);
7317
22
  Opnd0 = Opnd0.getOperand(0);
7318
22
7319
22
  return true;
7320
22
}
7321
7322
/// Try to fold a build_vector that performs an 'addsub' or 'fmaddsub' operation
7323
/// accordingly to X86ISD::ADDSUB or X86ISD::FMADDSUB node.
7324
static SDValue lowerToAddSubOrFMAddSub(const BuildVectorSDNode *BV,
7325
                                       const X86Subtarget &Subtarget,
7326
21.6k
                                       SelectionDAG &DAG) {
7327
21.6k
  SDValue Opnd0, Opnd1;
7328
21.6k
  if (!isAddSub(BV, Subtarget, DAG, Opnd0, Opnd1))
7329
21.6k
    return SDValue();
7330
22
7331
22
  MVT VT = BV->getSimpleValueType(0);
7332
22
  SDLoc DL(BV);
7333
22
7334
22
  // Try to generate X86ISD::FMADDSUB node here.
7335
22
  SDValue Opnd2;
7336
22
  if (isFMAddSub(Subtarget, DAG, Opnd0, Opnd1, Opnd2))
7337
0
    return DAG.getNode(X86ISD::FMADDSUB, DL, VT, Opnd0, Opnd1, Opnd2);
7338
22
7339
22
  // Do not generate X86ISD::ADDSUB node for 512-bit types even though
7340
22
  // the ADDSUB idiom has been successfully recognized. There are no known
7341
22
  // X86 targets with 512-bit ADDSUB instructions!
7342
22
  // 512-bit ADDSUB idiom recognition was needed only as part of FMADDSUB idiom
7343
22
  // recognition.
7344
22
  
  if (VT.is512BitVector())
7345
0
    return SDValue();
7346
22
7347
22
  return DAG.getNode(X86ISD::ADDSUB, DL, VT, Opnd0, Opnd1);
7348
22
}
7349
7350
/// Lower BUILD_VECTOR to a horizontal add/sub operation if possible.
7351
static SDValue LowerToHorizontalOp(const BuildVectorSDNode *BV,
7352
                                   const X86Subtarget &Subtarget,
7353
21.6k
                                   SelectionDAG &DAG) {
7354
21.6k
  MVT VT = BV->getSimpleValueType(0);
7355
21.6k
  unsigned NumElts = VT.getVectorNumElements();
7356
21.6k
  unsigned NumUndefsLO = 0;
7357
21.6k
  unsigned NumUndefsHI = 0;
7358
21.6k
  unsigned Half = NumElts/2;
7359
21.6k
7360
21.6k
  // Count the number of UNDEF operands in the build_vector in input.
7361
145k
  for (unsigned i = 0, e = Half; i != e; ++i)
7362
123k
    
    if (BV->getOperand(i)->isUndef())
7363
5.61k
      NumUndefsLO++;
7364
21.6k
7365
145k
  for (unsigned i = Half, e = NumElts; i != e; ++i)
7366
123k
    
    if (BV->getOperand(i)->isUndef())
7367
11.1k
      NumUndefsHI++;
7368
21.6k
7369
21.6k
  // Early exit if this is either a build_vector of all UNDEFs or all the
7370
21.6k
  // operands but one are UNDEF.
7371
21.6k
  if (NumUndefsLO + NumUndefsHI + 1 >= NumElts)
7372
573
    return SDValue();
7373
21.0k
7374
21.0k
  SDLoc DL(BV);
7375
21.0k
  SDValue InVec0, InVec1;
7376
21.0k
  if ((VT == MVT::v4f32 || VT == MVT::v2f64) && Subtarget.hasSSE3()) {
7377
2.14k
    // Try to match an SSE3 float HADD/HSUB.
7378
2.14k
    if (isHorizontalBinOp(BV, ISD::FADD, DAG, 0, NumElts, InVec0, InVec1))
7379
51
      return DAG.getNode(X86ISD::FHADD, DL, VT, InVec0, InVec1);
7380
2.09k
7381
2.09k
    
    if (isHorizontalBinOp(BV, ISD::FSUB, DAG, 0, NumElts, InVec0, InVec1))
7382
28
      return DAG.getNode(X86ISD::FHSUB, DL, VT, InVec0, InVec1);
7383
18.9k
  } else if ((VT == MVT::v4i32 || VT == MVT::v8i16) && Subtarget.hasSSSE3()) {
7384
2.61k
    // Try to match an SSSE3 integer HADD/HSUB.
7385
2.61k
    if (isHorizontalBinOp(BV, ISD::ADD, DAG, 0, NumElts, InVec0, InVec1))
7386
17
      return DAG.getNode(X86ISD::HADD, DL, VT, InVec0, InVec1);
7387
2.59k
7388
2.59k
    
    if (isHorizontalBinOp(BV, ISD::SUB, DAG, 0, NumElts, InVec0, InVec1))
7389
6
      return DAG.getNode(X86ISD::HSUB, DL, VT, InVec0, InVec1);
7390
20.9k
  }
7391
20.9k
7392
20.9k
  
  if (!Subtarget.hasAVX())
7393
5.66k
    return SDValue();
7394
15.3k
7395
15.3k
  
  if ((VT == MVT::v8f32 || VT == MVT::v4f64)) {
7396
903
    // Try to match an AVX horizontal add/sub of packed single/double
7397
903
    // precision floating point values from 256-bit vectors.
7398
903
    SDValue InVec2, InVec3;
7399
903
    if (isHorizontalBinOp(BV, ISD::FADD, DAG, 0, Half, InVec0, InVec1) &&
7400
11
        isHorizontalBinOp(BV, ISD::FADD, DAG, Half, NumElts, InVec2, InVec3) &&
7401
10
        
        ((InVec0.isUndef() || InVec2.isUndef()) || InVec0 == InVec2) &&
7402
10
        ((InVec1.isUndef() || InVec3.isUndef()) || InVec1 == InVec3))
7403
10
      return DAG.getNode(X86ISD::FHADD, DL, VT, InVec0, InVec1);
7404
893
7405
893
    
    if (isHorizontalBinOp(BV, ISD::FSUB, DAG, 0, Half, InVec0, InVec1) &&
7406
5
        isHorizontalBinOp(BV, ISD::FSUB, DAG, Half, NumElts, InVec2, InVec3) &&
7407
4
        ((InVec0.isUndef() || InVec2.isUndef()) || InVec0 == InVec2) &&
7408
4
        ((InVec1.isUndef() || InVec3.isUndef()) || InVec1 == InVec3))
7409
4
      return DAG.getNode(X86ISD::FHSUB, DL, VT, InVec0, InVec1);
7410
14.4k
  } else if (VT == MVT::v8i32 || VT == MVT::v16i16) {
7411
2.04k
    // Try to match an AVX2 horizontal add/sub of signed integers.
7412
2.04k
    SDValue InVec2, InVec3;
7413
2.04k
    unsigned X86Opcode;
7414
2.04k
    bool CanFold = true;
7415
2.04k
7416
2.04k
    if (isHorizontalBinOp(BV, ISD::ADD, DAG, 0, Half, InVec0, InVec1) &&
7417
20
        isHorizontalBinOp(BV, ISD::ADD, DAG, Half, NumElts, InVec2, InVec3) &&
7418
10
        
        ((InVec0.isUndef() || InVec2.isUndef()) || InVec0 == InVec2) &&
7419
10
        ((InVec1.isUndef() || InVec3.isUndef()) || InVec1 == InVec3))
7420
10
      X86Opcode = X86ISD::HADD;
7421
2.03k
    else if (isHorizontalBinOp(BV, ISD::SUB, DAG, 0, Half, InVec0, InVec1) &&
7422
10
        isHorizontalBinOp(BV, ISD::SUB, DAG, Half, NumElts, InVec2, InVec3) &&
7423
0
        ((InVec0.isUndef() || InVec2.isUndef()) || InVec0 == InVec2) &&
7424
0
        ((InVec1.isUndef() || InVec3.isUndef()) || InVec1 == InVec3))
7425
0
      X86Opcode = X86ISD::HSUB;
7426
2.03k
    else
7427
2.03k
      CanFold = false;
7428
2.04k
7429
2.04k
    if (CanFold) {
7430
10
      // Fold this build_vector into a single horizontal add/sub.
7431
10
      // Do this only if the target has AVX2.
7432
10
      if (Subtarget.hasAVX2())
7433
5
        return DAG.getNode(X86Opcode, DL, VT, InVec0, InVec1);
7434
5
7435
5
      // Do not try to expand this build_vector into a pair of horizontal
7436
5
      // add/sub if we can emit a pair of scalar add/sub.
7437
5
      
      if (NumUndefsLO + 1 == Half || NumUndefsHI + 1 == Half)
7438
1
        return SDValue();
7439
4
7440
4
      // Convert this build_vector into a pair of horizontal binop followed by
7441
4
      // a concat vector.
7442
4
      bool isUndefLO = NumUndefsLO == Half;
7443
4
      bool isUndefHI = NumUndefsHI == Half;
7444
4
      return ExpandHorizontalBinOp(InVec0, InVec1, DL, DAG, X86Opcode, false,
7445
4
                                   isUndefLO, isUndefHI);
7446
4
    }
7447
14.4k
  }
7448
15.2k
7449
15.2k
  
  if ((VT == MVT::v8f32 || VT == MVT::v4f64 || VT == MVT::v8i32 ||
7450
15.2k
       VT == MVT::v16i16) && Subtarget.hasAVX()) {
7451
2.92k
    unsigned X86Opcode;
7452
2.92k
    if (isHorizontalBinOp(BV, ISD::ADD, DAG, 0, NumElts, InVec0, InVec1))
7453
6
      X86Opcode = X86ISD::HADD;
7454
2.92k
    else if (isHorizontalBinOp(BV, ISD::SUB, DAG, 0, NumElts, InVec0, InVec1))
7455
0
      X86Opcode = X86ISD::HSUB;
7456
2.92k
    else if (isHorizontalBinOp(BV, ISD::FADD, DAG, 0, NumElts, InVec0, InVec1))
7457
4
      X86Opcode = X86ISD::FHADD;
7458
2.91k
    else if (isHorizontalBinOp(BV, ISD::FSUB, DAG, 0, NumElts, InVec0, InVec1))
7459
2
      X86Opcode = X86ISD::FHSUB;
7460
2.91k
    else
7461
2.91k
      return SDValue();
7462
12
7463
12
    // Don't try to expand this build_vector into a pair of horizontal add/sub
7464
12
    // if we can simply emit a pair of scalar add/sub.
7465
12
    
    if (NumUndefsLO + 1 == Half || NumUndefsHI + 1 == Half)
7466
0
      return SDValue();
7467
12
7468
12
    // Convert this build_vector into two horizontal add/sub followed by
7469
12
    // a concat vector.
7470
12
    bool isUndefLO = NumUndefsLO == Half;
7471
12
    bool isUndefHI = NumUndefsHI == Half;
7472
12
    return ExpandHorizontalBinOp(InVec0, InVec1, DL, DAG, X86Opcode, true,
7473
12
                                 isUndefLO, isUndefHI);
7474
12
  }
7475
12.3k
7476
12.3k
  return SDValue();
7477
12.3k
}
7478
7479
/// If a BUILD_VECTOR's source elements all apply the same bit operation and
7480
/// one of their operands is constant, lower to a pair of BUILD_VECTOR and
7481
/// just apply the bit to the vectors.
7482
/// NOTE: Its not in our interest to start make a general purpose vectorizer
7483
/// from this, but enough scalar bit operations are created from the later
7484
/// legalization + scalarization stages to need basic support.
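In other words, build_vector (x0 | c0), (x1 | c1), ... is regrouped as one vector OR of build_vector(x0, x1, ...) with build_vector(c0, c1, ...). A scalar-array sketch of the regrouped form (illustrative only; the array names are placeholders echoing LHSElts/RHSElts):
#include <array>
#include <cstdio>

int main() {
  // Before: each lane is its own scalar OR -> build_vector(x0|c0, x1|c1, ...).
  // After:  gather the variable lanes and the constant lanes separately and
  //         apply a single vector-wide OR; the lanes are identical.
  std::array<unsigned, 4> LHSElts = {1, 2, 4, 8};     // variable operands
  std::array<unsigned, 4> RHSElts = {16, 16, 32, 32}; // canonicalized constants

  std::array<unsigned, 4> Result{};
  for (int i = 0; i < 4; ++i)
    Result[i] = LHSElts[i] | RHSElts[i]; // the one remaining vector OR
  std::printf("%u %u %u %u\n", Result[0], Result[1], Result[2], Result[3]);
  // 17 18 36 40
}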
7485
static SDValue lowerBuildVectorToBitOp(BuildVectorSDNode *Op,
7486
19.1k
                                       SelectionDAG &DAG) {
7487
19.1k
  SDLoc DL(Op);
7488
19.1k
  MVT VT = Op->getSimpleValueType(0);
7489
19.1k
  unsigned NumElems = VT.getVectorNumElements();
7490
19.1k
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
7491
19.1k
7492
19.1k
  // Check that all elements have the same opcode.
7493
19.1k
  // TODO: Should we allow UNDEFS and if so how many?
7494
19.1k
  unsigned Opcode = Op->getOperand(0).getOpcode();
7495
198k
  for (unsigned i = 1; i < NumElems; ++i)
7496
183k
    
    if (Opcode != Op->getOperand(i).getOpcode())
7497
3.62k
      return SDValue();
7498
19.1k
7499
19.1k
  // TODO: We may be able to add support for other Ops (ADD/SUB + shifts).
7500
15.5k
  switch (Opcode) {
7501
15.3k
  default:
7502
15.3k
    return SDValue();
7503
162
  case ISD::AND:
7504
162
  case ISD::XOR:
7505
162
  case ISD::OR:
7506
162
    if (!TLI.isOperationLegalOrPromote(Opcode, VT))
7507
0
      return SDValue();
7508
162
    break;
7509
162
  }
7510
162
7511
162
  SmallVector<SDValue, 4> LHSElts, RHSElts;
7512
455
  for (SDValue Elt : Op->ops()) {
7513
455
    SDValue LHS = Elt.getOperand(0);
7514
455
    SDValue RHS = Elt.getOperand(1);
7515
455
7516
455
    // We expect the canonicalized RHS operand to be the constant.
7517
455
    if (!isa<ConstantSDNode>(RHS))
7518
113
      return SDValue();
7519
342
    LHSElts.push_back(LHS);
7520
342
    RHSElts.push_back(RHS);
7521
342
  }
7522
162
7523
49
  SDValue LHS = DAG.getBuildVector(VT, DL, LHSElts);
7524
49
  SDValue RHS = DAG.getBuildVector(VT, DL, RHSElts);
7525
49
  return DAG.getNode(Opcode, DL, VT, LHS, RHS);
7526
19.1k
}
7527
7528
/// Create a vector constant without a load. SSE/AVX provide the bare minimum
7529
/// functionality to do this, so it's all zeros, all ones, or some derivation
7530
/// that is cheap to calculate.
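The two cheap idioms alluded to here are a register XOR'd with itself (all zero bits, the pxor/xorps pattern) and a register compared equal to itself (all one bits per lane, the pcmpeqd pattern). A scalar illustration of why those identities hold (plain C++, not the lowering code):
#include <cstdint>
#include <cstdio>

int main() {
  uint32_t X = 0xDEADBEEF;              // any value works
  uint32_t Zeros = X ^ X;               // xor-with-self -> 0x00000000
  uint32_t Ones = (X == X) ? ~0u : 0u;  // compare-equal-with-self -> all ones
  std::printf("%08X %08X\n", Zeros, Ones); // 00000000 FFFFFFFF
}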
7531
static SDValue materializeVectorConstant(SDValue Op, SelectionDAG &DAG,
7532
44.9k
                                         const X86Subtarget &Subtarget) {
7533
44.9k
  SDLoc DL(Op);
7534
44.9k
  MVT VT = Op.getSimpleValueType();
7535
44.9k
7536
44.9k
  // Vectors containing all zeros can be matched by pxor and xorps.
7537
44.9k
  if (ISD::isBuildVectorAllZeros(Op.getNode())) {
7538
18.8k
    // Canonicalize this to <4 x i32> to 1) ensure the zero vectors are CSE'd
7539
18.8k
    // and 2) ensure that i64 scalars are eliminated on x86-32 hosts.
7540
18.8k
    if (VT == MVT::v4i32 || VT == MVT::v8i32 || VT == MVT::v16i32)
7541
14.4k
      return Op;
7542
4.37k
7543
4.37k
    return getZeroVector(VT, Subtarget, DAG, DL);
7544
4.37k
  }
7545
26.1k
7546
26.1k
  // Vectors containing all ones can be matched by pcmpeqd on 128-bit width
7547
26.1k
  // vectors or broken into v4i32 operations on 256-bit vectors. AVX2 can use
7548
26.1k
  // vpcmpeqd on 256-bit vectors.
7549
26.1k
  
  if (Subtarget.hasSSE2() && ISD::isBuildVectorAllOnes(Op.getNode())) {
7550
4.44k
    if (VT == MVT::v4i32 || VT == MVT::v16i32 ||
7551
1.68k
        (VT == MVT::v8i32 && Subtarget.hasInt256()))
7552
3.45k
      return Op;
7553
994
7554
994
    return getOnesVector(VT, DAG, DL);
7555
994
  }
7556
21.6k
7557
21.6k
  return SDValue();
7558
21.6k
}
7559
7560
SDValue
7561
52.0k
X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
7562
52.0k
  SDLoc dl(Op);
7563
52.0k
7564
52.0k
  MVT VT = Op.getSimpleValueType();
7565
52.0k
  MVT ExtVT = VT.getVectorElementType();
7566
52.0k
  unsigned NumElems = Op.getNumOperands();
7567
52.0k
7568
52.0k
  // Generate vectors for predicate vectors.
7569
52.0k
  if (VT.getVectorElementType() == MVT::i1 && Subtarget.hasAVX512())
7570
7.16k
    return LowerBUILD_VECTORvXi1(Op, DAG);
7571
44.9k
7572
44.9k
  
  if (SDValue VectorConstant = materializeVectorConstant(Op, DAG, Subtarget))
7573
23.2k
    return VectorConstant;
7574
21.6k
7575
21.6k
  BuildVectorSDNode *BV = cast<BuildVectorSDNode>(Op.getNode());
7576
21.6k
  if (SDValue AddSub = lowerToAddSubOrFMAddSub(BV, Subtarget, DAG))
7577
22
    return AddSub;
7578
21.6k
  
  if (SDValue HorizontalOp = LowerToHorizontalOp(BV, Subtarget, DAG))
7579
137
    return HorizontalOp;
7580
21.5k
  
  if (SDValue Broadcast = lowerBuildVectorAsBroadcast(BV, Subtarget, DAG))
7581
2.36k
    return Broadcast;
7582
19.1k
  
  if (SDValue BitOp = lowerBuildVectorToBitOp(BV, DAG))
7583
49
    return BitOp;
7584
19.1k
7585
19.1k
  unsigned EVTBits = ExtVT.getSizeInBits();
7586
19.1k
7587
19.1k
  unsigned NumZero  = 0;
7588
19.1k
  unsigned NumNonZero = 0;
7589
19.1k
  uint64_t NonZeros = 0;
7590
19.1k
  bool IsAllConstants = true;
7591
19.1k
  SmallSet<SDValue, 8> Values;
7592
243k
  for (unsigned i = 0; i < NumElems; ++i) {
7593
224k
    SDValue Elt = Op.getOperand(i);
7594
224k
    if (Elt.isUndef())
7595
16.6k
      continue;
7596
207k
    Values.insert(Elt);
7597
207k
    if (Elt.getOpcode() != ISD::Constant &&
7598
34.8k
        Elt.getOpcode() != ISD::ConstantFP)
7599
28.8k
      IsAllConstants = false;
7600
207k
    if (X86::isZeroNode(Elt))
7601
24.2k
      NumZero++;
7602
183k
    else {
7603
183k
      assert(i < sizeof(NonZeros) * 8); // Make sure the shift is within range.
7604
183k
      NonZeros |= ((uint64_t)1 << i);
7605
183k
      NumNonZero++;
7606
183k
    }
7607
224k
  }
7608
19.1k
7609
19.1k
  // All undef vector. Return an UNDEF.  All zero vectors were handled above.
7610
19.1k
  if (NumNonZero == 0)
7611
25
    return DAG.getUNDEF(VT);
7612
19.0k
7613
19.0k
  // Special case for single non-zero, non-undef, element.
7614
19.0k
  
  if (NumNonZero == 1) {
7615
1.56k
    unsigned Idx = countTrailingZeros(NonZeros);
7616
1.56k
    SDValue Item = Op.getOperand(Idx);
7617
1.56k
7618
1.56k
    // If this is an insertion of an i64 value on x86-32, and if the top bits of
7619
1.56k
    // the value are obviously zero, truncate the value to i32 and do the
7620
1.56k
    // insertion that way.  Only do this if the value is non-constant or if the
7621
1.56k
    // value is a constant being inserted into element 0.  It is cheaper to do
7622
1.56k
    // a constant pool load than it is to do a movd + shuffle.
7623
1.56k
    if (ExtVT == MVT::i64 && !Subtarget.is64Bit() &&
7624
1.56k
        (!IsAllConstants || Idx == 0)) {
7625
0
      if (DAG.MaskedValueIsZero(Item, APInt::getHighBitsSet(64, 32))) {
7626
0
        // Handle SSE only.
7627
0
        assert(VT == MVT::v2i64 && "Expected an SSE value type!");
7628
0
        MVT VecVT = MVT::v4i32;
7629
0
7630
0
        // Truncate the value (which may itself be a constant) to i32, and
7631
0
        // convert it to a vector with movd (S2V+shuffle to zero extend).
7632
0
        Item = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Item);
7633
0
        Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, Item);
7634
0
        return DAG.getBitcast(VT, getShuffleVectorZeroOrUndef(
7635
0
                                      Item, Idx * 2, true, Subtarget, DAG));
7636
0
      }
7637
1.56k
    }
7638
1.56k
7639
1.56k
    // If we have a constant or non-constant insertion into the low element of
7640
1.56k
    // a vector, we can do this with SCALAR_TO_VECTOR + shuffle of zero into
7641
1.56k
    // the rest of the elements.  This will be matched as movd/movq/movss/movsd
7642
1.56k
    // depending on what the source datatype is.
7643
1.56k
    
    if (Idx == 0) {
7644
1.18k
      if (NumZero == 0)
7645
520
        return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
7646
661
7647
661
      
      if (ExtVT == MVT::i32 || ExtVT == MVT::f32 || ExtVT == MVT::f64 ||
7648
661
          (ExtVT == MVT::i64 && Subtarget.is64Bit())) {
7649
610
        assert((VT.is128BitVector() || VT.is256BitVector() ||
7650
610
                VT.is512BitVector()) &&
7651
610
               "Expected an SSE value type!");
7652
610
        Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
7653
610
        // Turn it into a MOVL (i.e. movss, movsd, or movd) to a zero vector.
7654
610
        return getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
7655
610
      }
7656
51
7657
51
      // We can't directly insert an i8 or i16 into a vector, so zero extend
7658
51
      // it to i32 first.
7659
51
      
      if (ExtVT == MVT::i16 || ExtVT == MVT::i8) {
7660
51
        Item = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, Item);
7661
51
        if (VT.getSizeInBits() >= 256) {
7662
10
          MVT ShufVT = MVT::getVectorVT(MVT::i32, VT.getSizeInBits()/32);
7663
10
          if (Subtarget.hasAVX()) {
7664
10
            Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, ShufVT, Item);
7665
10
            Item = getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
7666
10
          } else {
7667
0
            // Without AVX, we need to extend to a 128-bit vector and then
7668
0
            // insert into the 256-bit vector.
7669
0
            Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, Item);
7670
0
            SDValue ZeroVec = getZeroVector(ShufVT, Subtarget, DAG, dl);
7671
0
            Item = insert128BitVector(ZeroVec, Item, 0, DAG, dl);
7672
0
          }
7673
51
        } else {
7674
41
          assert(VT.is128BitVector() && "Expected an SSE value type!");
7675
41
          Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, Item);
7676
41
          Item = getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
7677
41
        }
7678
51
        return DAG.getBitcast(VT, Item);
7679
51
      }
7680
382
    }
7681
382
7682
382
    // Is it a vector logical left shift?
7683
382
    
    if (NumElems == 2 && Idx == 1 &&
7684
62
        X86::isZeroNode(Op.getOperand(0)) &&
7685
382
        !X86::isZeroNode(Op.getOperand(1))) {
7686
57
      unsigned NumBits = VT.getSizeInBits();
7687
57
      return getVShift(true, VT,
7688
57
                       DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
7689
57
                                   VT, Op.getOperand(1)),
7690
57
                       NumBits/2, DAG, *this, dl);
7691
57
    }
7692
325
7693
325
    
    if (IsAllConstants) // Otherwise, it's better to do a constpool load.
7694
225
      return SDValue();
7695
100
7696
100
    // Otherwise, if this is a vector with i32 or f32 elements, and the element
7697
100
    // is a non-constant being inserted into an element other than the low one,
7698
100
    // we can't use a constant pool load.  Instead, use SCALAR_TO_VECTOR (aka
7699
100
    // movd/movss) to move this into the low element, then shuffle it into
7700
100
    // place.
7701
100
    
    if (EVTBits == 32) {
7702
44
      Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
7703
44
      return getShuffleVectorZeroOrUndef(Item, Idx, NumZero > 0, Subtarget, DAG);
7704
44
    }
7705
17.5k
  }
7706
17.5k
7707
17.5k
  // Splat is obviously ok. Let legalizer expand it to a shuffle.
7708
17.5k
  
  if (Values.size() == 1) {
7709
3.94k
    if (EVTBits == 32) {
7710
1.09k
      // Instead of a shuffle like this:
7711
1.09k
      // shuffle (scalar_to_vector (load (ptr + 4))), undef, <0, 0, 0, 0>
7712
1.09k
      // Check if it's possible to issue this instead.
7713
1.09k
      // shuffle (vload ptr)), undef, <1, 1, 1, 1>
7714
1.09k
      unsigned Idx = countTrailingZeros(NonZeros);
7715
1.09k
      SDValue Item = Op.getOperand(Idx);
7716
1.09k
      if (Op.getNode()->isOnlyUserOf(Item.getNode()))
7717
1.04k
        return LowerAsSplatVectorLoad(Item, VT, dl, DAG);
7718
2.90k
    }
7719
2.90k
    return SDValue();
7720
2.90k
  }
7721
13.6k
7722
13.6k
  // A vector full of immediates; various special cases are already
7723
13.6k
  // handled, so this is best done with a single constant-pool load.
7724
13.6k
  
  if (IsAllConstants)
7725
9.30k
    return SDValue();
7726
4.32k
7727
4.32k
  // See if we can use a vector load to get all of the elements.
7728
4.32k
  
  if (VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) {
7729
4.32k
    SmallVector<SDValue, 64> Ops(Op->op_begin(), Op->op_begin() + NumElems);
7730
4.32k
    if (SDValue LD =
7731
4.32k
            EltsFromConsecutiveLoads(VT, Ops, dl, DAG, Subtarget, false))
7732
552
      return LD;
7733
3.76k
  }
7734
3.76k
7735
3.76k
  // For AVX-length vectors, build the individual 128-bit pieces and use
7736
3.76k
  // shuffles to put them in place.
7737
3.76k
  
  if (VT.is256BitVector() || VT.is512BitVector()) {
7738
748
    EVT HVT = EVT::getVectorVT(*DAG.getContext(), ExtVT, NumElems/2);
7739
748
7740
748
    // Build both the lower and upper subvector.
7741
748
    SDValue Lower =
7742
748
        DAG.getBuildVector(HVT, dl, Op->ops().slice(0, NumElems / 2));
7743
748
    SDValue Upper = DAG.getBuildVector(
7744
748
        HVT, dl, Op->ops().slice(NumElems / 2, NumElems /2));
7745
748
7746
748
    // Recreate the wider vector with the lower and upper part.
7747
748
    if (VT.is256BitVector())
7748
607
      return concat128BitVectors(Lower, Upper, VT, NumElems, DAG, dl);
7749
141
    return concat256BitVectors(Lower, Upper, VT, NumElems, DAG, dl);
7750
141
  }
7751
3.02k
7752
3.02k
  // Let legalizer expand 2-wide build_vectors.
7753
3.02k
  
  if (EVTBits == 64) {
7754
1.11k
    if (NumNonZero == 1) {
7755
0
      // One half is zero or undef.
7756
0
      unsigned Idx = countTrailingZeros(NonZeros);
7757
0
      SDValue V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT,
7758
0
                               Op.getOperand(Idx));
7759
0
      return getShuffleVectorZeroOrUndef(V2, Idx, true, Subtarget, DAG);
7760
0
    }
7761
1.11k
    return SDValue();
7762
1.11k
  }
7763
1.90k
7764
1.90k
  // If element VT is < 32 bits, convert it to inserts into a zero vector.
7765
1.90k
  
  if (EVTBits == 8 && NumElems == 16)
7766
684
    
    if (SDValue V = LowerBuildVectorv16i8(Op, NonZeros, NumNonZero, NumZero,
7767
684
                                          DAG, Subtarget))
7768
662
      return V;
7769
1.24k
7770
1.24k
  
  if (EVTBits == 16 && NumElems == 8)
7771
273
    
    if (SDValue V = LowerBuildVectorv8i16(Op, NonZeros, NumNonZero, NumZero,
7772
273
                                          DAG, Subtarget))
7773
249
      return V;
7774
998
7775
998
  // If element VT is == 32 bits and has 4 elems, try to generate an INSERTPS
7776
998
  
  if (EVTBits == 32 && NumElems == 4)
7777
952
    
    if (SDValue V = LowerBuildVectorv4x32(Op, DAG, Subtarget))
7778
14
      return V;
7779
984
7780
984
  // If element VT is == 32 bits, turn it into a number of shuffles.
7781
984
  
  if (NumElems == 4 && NumZero > 0) {
7782
60
    SmallVector<SDValue, 8> Ops(NumElems);
7783
300
    for (unsigned i = 0; i < 4; ++i) {
7784
240
      bool isZero = !(NonZeros & (1ULL << i));
7785
240
      if (isZero)
7786
93
        Ops[i] = getZeroVector(VT, Subtarget, DAG, dl);
7787
240
      else
7788
147
        Ops[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
7789
240
    }
7790
60
7791
180
    for (unsigned i = 0; i < 2; ++i) {
7792
120
      switch ((NonZeros & (0x3 << i*2)) >> (i*2)) {
7793
0
        default: break;
7794
11
        case 0:
7795
11
          Ops[i] = Ops[i*2];  // Must be a zero vector.
7796
11
          break;
7797
60
        case 1:
7798
60
          Ops[i] = getMOVL(DAG, dl, VT, Ops[i*2+1], Ops[i*2]);
7799
60
          break;
7800
11
        case 2:
7801
11
          Ops[i] = getMOVL(DAG, dl, VT, Ops[i*2], Ops[i*2+1]);
7802
11
          break;
7803
38
        case 3:
7804
38
          Ops[i] = getUnpackl(DAG, dl, VT, Ops[i*2], Ops[i*2+1]);
7805
38
          break;
7806
120
      }
7807
120
    }
7808
60
7809
60
    bool Reverse1 = (NonZeros & 0x3) == 2;
7810
60
    bool Reverse2 = ((NonZeros & (0x3 << 2)) >> 2) == 2;
7811
60
    int MaskVec[] = {
7812
60
      Reverse1 ? 1 : 0,
7813
60
      Reverse1 ? 0 : 1,
7814
60
      static_cast<int>(Reverse2 ? NumElems+1 : NumElems),
7815
60
      static_cast<int>(Reverse2 ? NumElems : NumElems+1)
7816
60
    };
7817
60
    return DAG.getVectorShuffle(VT, dl, Ops[0], Ops[1], MaskVec);
7818
924
  }
7819
924
7820
924
  
  if (Values.size() > 1 && VT.is128BitVector()) {
7821
924
    // Check for a build vector from mostly shuffle plus few inserting.
7822
924
    if (SDValue Sh = buildFromShuffleMostly(Op, DAG))
7823
47
      return Sh;
7824
877
7825
877
    // For SSE 4.1, use insertps to put the high elements into the low element.
7826
877
    
    if (Subtarget.hasSSE41()) {
7827
495
      SDValue Result;
7828
495
      if (!Op.getOperand(0).isUndef())
7829
494
        Result = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(0));
7830
495
      else
7831
1
        Result = DAG.getUNDEF(VT);
7832
495
7833
1.98k
      for (unsigned i = 1; i < NumElems; ++i) {
7834
1.48k
        if (Op.getOperand(i).isUndef()) continue;
7835
1.37k
        Result = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Result,
7836
1.37k
                             Op.getOperand(i), DAG.getIntPtrConstant(i, dl));
7837
1.37k
      }
7838
495
      return Result;
7839
495
    }
7840
382
7841
382
    // Otherwise, expand into a number of unpckl*, start by extending each of
7842
382
    // our (non-undef) elements to the full vector width with the element in the
7843
382
    // bottom slot of the vector (which generates no code for SSE).
7844
382
    SmallVector<SDValue, 8> Ops(NumElems);
7845
2.27k
    for (unsigned i = 0; i < NumElems; ++i) {
7846
1.88k
      if (!Op.getOperand(i).isUndef())
7847
1.66k
        Ops[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
7848
1.88k
      else
7849
220
        Ops[i] = DAG.getUNDEF(VT);
7850
1.88k
    }
7851
382
7852
382
    // Next, we iteratively mix elements, e.g. for v4f32:
7853
382
    //   Step 1: unpcklps 0, 1 ==> X: <?, ?, 1, 0>
7854
382
    //         : unpcklps 2, 3 ==> Y: <?, ?, 3, 2>
7855
382
    //   Step 2: unpcklpd X, Y ==>    <3, 2, 1, 0>
7856
1.21k
    for (unsigned Scale = 1; Scale < NumElems; Scale *= 2) {
7857
832
      // Generate scaled UNPCKL shuffle mask.
7858
832
      SmallVector<int, 16> Mask;
7859
2.33k
      for(unsigned i = 0; i != Scale; ++i)
7860
1.50k
        Mask.push_back(i);
7861
2.33k
      for (unsigned i = 0; i != Scale; ++i)
7862
1.50k
        Mask.push_back(NumElems+i);
7863
832
      Mask.append(NumElems - Mask.size(), SM_SentinelUndef);
7864
832
7865
2.33k
      for (unsigned i = 0, e = NumElems / (2 * Scale); i != e; ++i)
7866
1.50k
        Ops[i] = DAG.getVectorShuffle(VT, dl, Ops[2*i], Ops[(2*i)+1], Mask);
7867
832
    }
7868
924
    return Ops[0];
7869
924
  }
7870
0
  return SDValue();
7871
0
}
7872
7873
// 256-bit AVX can use the vinsertf128 instruction
7874
// to create 256-bit vectors from two other 128-bit ones.
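A scalar model of the concatenation this routine emits: the low NumElems/2 lanes come from V1 and the high lanes from V2 (illustrative only; the helper name is made up):
#include <array>
#include <cstdio>

// Concatenate two 4-lane halves into one 8-lane vector, low half first,
// mirroring what concat128BitVectors produces for a 256-bit result type.
static std::array<int, 8> concatHalves(const std::array<int, 4> &Lo,
                                       const std::array<int, 4> &Hi) {
  std::array<int, 8> R{};
  for (int i = 0; i < 4; ++i) { R[i] = Lo[i]; R[i + 4] = Hi[i]; }
  return R;
}

int main() {
  auto R = concatHalves({0, 1, 2, 3}, {4, 5, 6, 7});
  for (int V : R) std::printf("%d ", V); // 0 1 2 3 4 5 6 7
  std::printf("\n");
}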
7875
3.67k
static SDValue LowerAVXCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) {
7876
3.67k
  SDLoc dl(Op);
7877
3.67k
  MVT ResVT = Op.getSimpleValueType();
7878
3.67k
7879
3.67k
  assert((ResVT.is256BitVector() ||
7880
3.67k
          ResVT.is512BitVector()) && "Value type must be 256-/512-bit wide");
7881
3.67k
7882
3.67k
  SDValue V1 = Op.getOperand(0);
7883
3.67k
  SDValue V2 = Op.getOperand(1);
7884
3.67k
  unsigned NumElems = ResVT.getVectorNumElements();
7885
3.67k
  if (ResVT.is256BitVector())
7886
2.90k
    return concat128BitVectors(V1, V2, ResVT, NumElems, DAG, dl);
7887
768
7888
768
  
  if (Op.getNumOperands() == 4) {
7889
169
    MVT HalfVT = MVT::getVectorVT(ResVT.getVectorElementType(),
7890
169
                                  ResVT.getVectorNumElements()/2);
7891
169
    SDValue V3 = Op.getOperand(2);
7892
169
    SDValue V4 = Op.getOperand(3);
7893
169
    return concat256BitVectors(
7894
169
        concat128BitVectors(V1, V2, HalfVT, NumElems / 2, DAG, dl),
7895
169
        concat128BitVectors(V3, V4, HalfVT, NumElems / 2, DAG, dl), ResVT,
7896
169
        NumElems, DAG, dl);
7897
169
  }
7898
599
  return concat256BitVectors(V1, V2, ResVT, NumElems, DAG, dl);
7899
599
}
7900
7901
// Return true if all the operands of the given CONCAT_VECTORS node are zeros
7902
// except for the first one. (CONCAT_VECTORS Op, 0, 0,...,0)
7903
1.06k
static bool isExpandWithZeros(const SDValue &Op) {
7904
1.06k
  assert(Op.getOpcode() == ISD::CONCAT_VECTORS &&
7905
1.06k
         "Expand with zeros only possible in CONCAT_VECTORS nodes!");
7906
1.06k
7907
5.71k
  for (unsigned i = 1; i < Op.getNumOperands(); i++)
7908
4.73k
    if (!ISD::isBuildVectorAllZeros(Op.getOperand(i).getNode()))
7909
84
      return false;
7910
1.06k
7911
977
  return true;
7912
1.06k
}
7913
7914
// Returns true if the given node is a type promotion (by concatenating i1
7915
// zeros) of the result of a node that already zeros all upper bits of
7916
// k-register.
7917
1.05k
static SDValue isTypePromotionOfi1ZeroUpBits(SDValue Op) {
7918
1.05k
  unsigned Opc = Op.getOpcode();
7919
1.05k
7920
1.05k
  assert(Opc == ISD::CONCAT_VECTORS &&
7921
1.05k
         Op.getSimpleValueType().getVectorElementType() == MVT::i1 &&
7922
1.05k
         "Unexpected node to check for type promotion!");
7923
1.05k
7924
1.05k
  // As long as we are concatenating zeros to the upper part of a previous node
7925
1.05k
  // result, climb up the tree until a node with different opcode is
7926
1.05k
  // encountered
7927
2.03k
  while (Opc == ISD::INSERT_SUBVECTOR || Opc == ISD::CONCAT_VECTORS) {
7928
1.06k
    if (Opc == ISD::INSERT_SUBVECTOR) {
7929
0
      if (ISD::isBuildVectorAllZeros(Op.getOperand(0).getNode()) &&
7930
0
          Op.getConstantOperandVal(2) == 0)
7931
0
        Op = Op.getOperand(1);
7932
0
      else
7933
0
        return SDValue();
7934
1.06k
    } else { // Opc == ISD::CONCAT_VECTORS
7935
1.06k
      if (isExpandWithZeros(Op))
7936
977
        Op = Op.getOperand(0);
7937
1.06k
      else
7938
84
        return SDValue();
7939
977
    }
7940
977
    Opc = Op.getOpcode();
7941
977
  }
7942
1.05k
7943
1.05k
  // Check if the first inserted node zeroes the upper bits, or an 'and' result
7944
1.05k
  // of a node that zeros the upper bits (its masked version).
7945
975
  if (isMaskedZeroUpperBitsvXi1(Op.getOpcode()) ||
7946
510
      (Op.getOpcode() == ISD::AND &&
7947
475
       (isMaskedZeroUpperBitsvXi1(Op.getOperand(0).getOpcode()) ||
7948
975
        isMaskedZeroUpperBitsvXi1(Op.getOperand(1).getOpcode())))) {
7949
924
    return Op;
7950
924
  }
7951
51
7952
51
  return SDValue();
7953
51
}
7954
7955
static SDValue LowerCONCAT_VECTORSvXi1(SDValue Op,
7956
                                       const X86Subtarget &Subtarget,
7957
1.05k
                                       SelectionDAG & DAG) {
7958
1.05k
  SDLoc dl(Op);
7959
1.05k
  MVT ResVT = Op.getSimpleValueType();
7960
1.05k
  unsigned NumOfOperands = Op.getNumOperands();
7961
1.05k
7962
1.05k
  assert(isPowerOf2_32(NumOfOperands) &&
7963
1.05k
         "Unexpected number of operands in CONCAT_VECTORS");
7964
1.05k
7965
1.05k
  // If this node promotes - by concatenating zeroes - the type of the result
7966
1.05k
  // of a node with instruction that zeroes all upper (irrelevant) bits of the
7967
1.05k
  // output register, mark it as legal and catch the pattern in instruction
7968
1.05k
  // selection to avoid emitting extra insturctions (for zeroing upper bits).
7969
1.05k
  if (SDValue Promoted = isTypePromotionOfi1ZeroUpBits(Op)) {
7970
924
    SDValue ZeroC = DAG.getConstant(0, dl, MVT::i64);
7971
924
    SDValue AllZeros = DAG.getSplatBuildVector(ResVT, dl, ZeroC);
7972
924
    return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, AllZeros, Promoted,
7973
924
                       ZeroC);
7974
924
  }
7975
135
7976
135
  SDValue Undef = DAG.getUNDEF(ResVT);
7977
135
  if (NumOfOperands > 2) {
7978
15
    // Specialize the cases when all, or all but one, of the operands are undef.
7979
15
    unsigned NumOfDefinedOps = 0;
7980
15
    unsigned OpIdx = 0;
7981
87
    for (unsigned i = 0; i < NumOfOperands; i++)
7982
72
      if (!Op.getOperand(i).isUndef()) {
7983
26
        NumOfDefinedOps++;
7984
26
        OpIdx = i;
7985
26
      }
7986
15
    if (NumOfDefinedOps == 0)
7987
0
      return Undef;
7988
15
    if (NumOfDefinedOps == 1) {
7989
8
      unsigned SubVecNumElts =
7990
8
        Op.getOperand(OpIdx).getValueType().getVectorNumElements();
7991
8
      SDValue IdxVal = DAG.getIntPtrConstant(SubVecNumElts * OpIdx, dl);
7992
8
      return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Undef,
7993
8
                         Op.getOperand(OpIdx), IdxVal);
7994
8
    }
7995
7
7996
7
    MVT HalfVT = MVT::getVectorVT(ResVT.getVectorElementType(),
7997
7
                                  ResVT.getVectorNumElements()/2);
7998
7
    SmallVector<SDValue, 2> Ops;
7999
27
    for (unsigned i = 0; i < NumOfOperands/2; i++)
8000
20
      Ops.push_back(Op.getOperand(i));
8001
7
    SDValue Lo = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT, Ops);
8002
7
    Ops.clear();
8003
27
    for (unsigned i = NumOfOperands/2; i < NumOfOperands; i++)
8004
20
      Ops.push_back(Op.getOperand(i));
8005
15
    SDValue Hi = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT, Ops);
8006
15
    return DAG.getNode(ISD::CONCAT_VECTORS, dl, ResVT, Lo, Hi);
8007
15
  }
8008
120
8009
120
  // 2 operands
8010
120
  SDValue V1 = Op.getOperand(0);
8011
120
  SDValue V2 = Op.getOperand(1);
8012
120
  unsigned NumElems = ResVT.getVectorNumElements();
8013
120
  assert(V1.getValueType() == V2.getValueType() &&
8014
120
         V1.getValueType().getVectorNumElements() == NumElems/2 &&
8015
120
         "Unexpected operands in CONCAT_VECTORS");
8016
120
8017
120
  if (ResVT.getSizeInBits() >= 16)
8018
92
    return Op; // The operation is legal with KUNPCK
8019
28
8020
28
  bool IsZeroV1 = ISD::isBuildVectorAllZeros(V1.getNode());
8021
28
  bool IsZeroV2 = ISD::isBuildVectorAllZeros(V2.getNode());
8022
28
  SDValue ZeroVec = getZeroVector(ResVT, Subtarget, DAG, dl);
8023
28
  if (IsZeroV1 && IsZeroV2)
8024
0
    return ZeroVec;
8025
28
8026
28
  SDValue ZeroIdx = DAG.getIntPtrConstant(0, dl);
8027
28
  if (V2.isUndef())
8028
8
    return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Undef, V1, ZeroIdx);
8029
20
  if (IsZeroV2)
8030
15
    return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, ZeroVec, V1, ZeroIdx);
8031
5
8032
5
  SDValue IdxVal = DAG.getIntPtrConstant(NumElems/2, dl);
8033
5
  if (V1.isUndef())
8034
1
    return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Undef, V2, IdxVal);
8035
4
8036
4
  if (IsZeroV1)
8037
0
    return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, ZeroVec, V2, IdxVal);
8038
4
8039
4
  V1 = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Undef, V1, ZeroIdx);
8040
4
  return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, V1, V2, IdxVal);
8041
4
}
8042
8043
static SDValue LowerCONCAT_VECTORS(SDValue Op,
8044
                                   const X86Subtarget &Subtarget,
8045
4.73k
                                   SelectionDAG &DAG) {
8046
4.73k
  MVT VT = Op.getSimpleValueType();
8047
4.73k
  if (VT.getVectorElementType() == MVT::i1)
8048
1.05k
    return LowerCONCAT_VECTORSvXi1(Op, Subtarget, DAG);
8049
3.67k
8050
4.73k
  assert((VT.is256BitVector() && Op.getNumOperands() == 2) ||
8051
3.67k
         (VT.is512BitVector() && (Op.getNumOperands() == 2 ||
8052
3.67k
          Op.getNumOperands() == 4)));
8053
3.67k
8054
3.67k
  // AVX can use the vinsertf128 instruction to create 256-bit vectors
8055
3.67k
  // from two other 128-bit ones.
8056
3.67k
8057
3.67k
  // 512-bit vector may contain 2 256-bit vectors or 4 128-bit vectors
8058
3.67k
  return LowerAVXCONCAT_VECTORS(Op, DAG);
8059
3.67k
}
8060
8061
//===----------------------------------------------------------------------===//
8062
// Vector shuffle lowering
8063
//
8064
// This is an experimental code path for lowering vector shuffles on x86. It is
8065
// designed to handle arbitrary vector shuffles and blends, gracefully
8066
// degrading performance as necessary. It works hard to recognize idiomatic
8067
// shuffles and lower them to optimal instruction patterns without leaving
8068
// a framework that allows reasonably efficient handling of all vector shuffle
8069
// patterns.
8070
//===----------------------------------------------------------------------===//
8071
8072
/// \brief Tiny helper function to identify a no-op mask.
8073
///
8074
/// This is a somewhat boring predicate function. It checks whether the mask
8075
/// array input, which is assumed to be a single-input shuffle mask of the kind
8076
/// used by the X86 shuffle instructions (not a fully general
8077
/// ShuffleVectorSDNode mask) requires any shuffles to occur. Both undef and an
8078
/// in-place shuffle are 'no-op's.
8079
24.1k
static bool isNoopShuffleMask(ArrayRef<int> Mask) {
8080
71.2k
  for (int i = 0, Size = Mask.size(); i < Size; ++i) {
8081
64.7k
    assert(Mask[i] >= -1 && "Out of bound mask element!");
8082
64.7k
    if (Mask[i] >= 0 && Mask[i] != i)
8083
17.6k
      return false;
8084
64.7k
  }
8085
6.47k
  return true;
8086
24.1k
}
8087
8088
/// \brief Test whether there are elements crossing 128-bit lanes in this
8089
/// shuffle mask.
8090
///
8091
/// X86 divides up its shuffles into in-lane and cross-lane shuffle operations
8092
/// and we routinely test for these.
8093
38.9k
static bool is128BitLaneCrossingShuffleMask(MVT VT, ArrayRef<int> Mask) {
8094
38.9k
  int LaneSize = 128 / VT.getScalarSizeInBits();
8095
38.9k
  int Size = Mask.size();
8096
403k
  for (int i = 0; i < Size; ++i)
8097
369k
    
if (369k
Mask[i] >= 0 && 369k
(Mask[i] % Size) / LaneSize != i / LaneSize300k
)
8098
5.56k
      return true;
8099
33.3k
  return false;
8100
38.9k
}
8101
8102
/// \brief Test whether a shuffle mask is equivalent within each sub-lane.
8103
///
8104
/// This checks a shuffle mask to see if it is performing the same
8105
/// lane-relative shuffle in each sub-lane. This trivially implies
8106
/// that it is also not lane-crossing. It may however involve a blend from the
8107
/// same lane of a second vector.
8108
///
8109
/// The specific repeated shuffle mask is populated in \p RepeatedMask, as it is
8110
/// non-trivial to compute in the face of undef lanes. The representation is
8111
/// suitable for use with existing 128-bit shuffles as entries from the second
8112
/// vector have been remapped to [LaneSize, 2*LaneSize).
8113
static bool isRepeatedShuffleMask(unsigned LaneSizeInBits, MVT VT,
8114
                                  ArrayRef<int> Mask,
8115
57.9k
                                  SmallVectorImpl<int> &RepeatedMask) {
8116
57.9k
  auto LaneSize = LaneSizeInBits / VT.getScalarSizeInBits();
8117
57.9k
  RepeatedMask.assign(LaneSize, -1);
8118
57.9k
  int Size = Mask.size();
8119
586k
  for (int i = 0; i < Size; ++i) {
8120
538k
    assert(Mask[i] == SM_SentinelUndef || Mask[i] >= 0);
8121
538k
    if (Mask[i] < 0)
8122
68.5k
      continue;
8123
470k
    if ((Mask[i] % Size) / LaneSize != i / LaneSize)
8124
470k
      // This entry crosses lanes, so there is no way to model this shuffle.
8125
8.07k
      return false;
8126
462k
8127
462k
    // Ok, handle the in-lane shuffles by detecting if and when they repeat.
8128
462k
    // Adjust second vector indices to start at LaneSize instead of Size.
8129
462k
    int LocalM = Mask[i] < Size ? Mask[i] % LaneSize
8130
69.3k
                                : Mask[i] % LaneSize + LaneSize;
8131
462k
    if (RepeatedMask[i % LaneSize] < 0)
8132
462k
      // This is the first non-undef entry in this slot of a 128-bit lane.
8133
390k
      RepeatedMask[i % LaneSize] = LocalM;
8134
71.2k
    else if (RepeatedMask[i % LaneSize] != LocalM)
8135
71.2k
      // Found a mismatch with the repeated mask.
8136
2.49k
      return false;
8137
538k
  }
8138
47.4k
  return true;
8139
57.9k
}
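Aside (not part of the coverage listing): the repeated-mask check above folds a wide mask into one per-lane pattern, remapping second-input indices into [LaneSize, 2*LaneSize). A minimal standalone C++ sketch of the same idea, restricted to plain ints, is shown below; the helper name isLaneRepeated and the example mask are assumptions for illustration only.

// Illustrative sketch: does an 8-element mask repeat the same 4-element
// pattern in both 128-bit lanes? Undef is represented by -1.
#include <cstdio>
#include <vector>

static bool isLaneRepeated(const std::vector<int> &Mask, int LaneSize,
                           std::vector<int> &Repeated) {
  int Size = (int)Mask.size();
  Repeated.assign(LaneSize, -1);
  for (int i = 0; i < Size; ++i) {
    if (Mask[i] < 0)
      continue;
    if ((Mask[i] % Size) / LaneSize != i / LaneSize)
      return false; // element crosses lanes
    int LocalM = Mask[i] < Size ? Mask[i] % LaneSize
                                : Mask[i] % LaneSize + LaneSize;
    if (Repeated[i % LaneSize] < 0)
      Repeated[i % LaneSize] = LocalM;
    else if (Repeated[i % LaneSize] != LocalM)
      return false; // lanes disagree
  }
  return true;
}

int main() {
  std::vector<int> Rep;
  // <0,8,1,9, 4,12,5,13> interleaves each lane the same way -> repeated <0,4,1,5>.
  bool Ok = isLaneRepeated({0, 8, 1, 9, 4, 12, 5, 13}, 4, Rep);
  std::printf("%d:", Ok);
  for (int M : Rep)
    std::printf(" %d", M);
  std::printf("\n");
  return 0;
}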
8140
8141
/// Test whether a shuffle mask is equivalent within each 128-bit lane.
8142
static bool
8143
is128BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask,
8144
57.4k
                                SmallVectorImpl<int> &RepeatedMask) {
8145
57.4k
  return isRepeatedShuffleMask(128, VT, Mask, RepeatedMask);
8146
57.4k
}
8147
8148
/// Test whether a shuffle mask is equivalent within each 256-bit lane.
8149
static bool
8150
is256BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask,
8151
563
                                SmallVectorImpl<int> &RepeatedMask) {
8152
563
  return isRepeatedShuffleMask(256, VT, Mask, RepeatedMask);
8153
563
}
8154
8155
/// Test whether a target shuffle mask is equivalent within each sub-lane.
8156
/// Unlike isRepeatedShuffleMask we must respect SM_SentinelZero.
8157
static bool isRepeatedTargetShuffleMask(unsigned LaneSizeInBits, MVT VT,
8158
                                        ArrayRef<int> Mask,
8159
3.63k
                                        SmallVectorImpl<int> &RepeatedMask) {
8160
3.63k
  int LaneSize = LaneSizeInBits / VT.getScalarSizeInBits();
8161
3.63k
  RepeatedMask.assign(LaneSize, SM_SentinelUndef);
8162
3.63k
  int Size = Mask.size();
8163
18.8k
  for (int i = 0; i < Size; ++i) {
8164
16.1k
    assert(isUndefOrZero(Mask[i]) || (Mask[i] >= 0));
8165
16.1k
    if (Mask[i] == SM_SentinelUndef)
8166
174
      continue;
8167
15.9k
    if (Mask[i] == SM_SentinelZero) {
8168
666
      if (!isUndefOrZero(RepeatedMask[i % LaneSize]))
8169
3
        return false;
8170
663
      RepeatedMask[i % LaneSize] = SM_SentinelZero;
8171
663
      continue;
8172
663
    }
8173
15.2k
    if ((Mask[i] % Size) / LaneSize != i / LaneSize)
8174
15.2k
      // This entry crosses lanes, so there is no way to model this shuffle.
8175
602
      return false;
8176
14.6k
8177
14.6k
    // Ok, handle the in-lane shuffles by detecting if and when they repeat.
8178
14.6k
    // Adjust second vector indices to start at LaneSize instead of Size.
8179
14.6k
    int LocalM =
8180
14.6k
        Mask[i] < Size ? Mask[i] % LaneSize : Mask[i] % LaneSize + LaneSize;
8181
14.6k
    if (RepeatedMask[i % LaneSize] == SM_SentinelUndef)
8182
14.6k
      // This is the first non-undef entry in this slot of a 128-bit lane.
8183
12.6k
      RepeatedMask[i % LaneSize] = LocalM;
8184
2.02k
    else if (RepeatedMask[i % LaneSize] != LocalM)
8185
2.02k
      // Found a mismatch with the repeated mask.
8186
320
      return false;
8187
16.1k
  }
8188
2.70k
  return true;
8189
3.63k
}
8190
8191
/// \brief Checks whether a shuffle mask is equivalent to an explicit list of
8192
/// arguments.
8193
///
8194
/// This is a fast way to test a shuffle mask against a fixed pattern:
8195
///
8196
///   if (isShuffleEquivalent(Mask, 3, 2, {1, 0})) { ... }
8197
///
8198
/// It returns true if the mask is exactly as wide as the argument list, and
8199
/// each element of the mask is either -1 (signifying undef) or the value given
8200
/// in the argument.
8201
static bool isShuffleEquivalent(SDValue V1, SDValue V2, ArrayRef<int> Mask,
8202
218k
                                ArrayRef<int> ExpectedMask) {
8203
218k
  if (Mask.size() != ExpectedMask.size())
8204
179
    return false;
8205
217k
8206
217k
  int Size = Mask.size();
8207
217k
8208
217k
  // If the values are build vectors, we can look through them to find
8209
217k
  // equivalent inputs that make the shuffles equivalent.
8210
217k
  auto *BV1 = dyn_cast<BuildVectorSDNode>(V1);
8211
217k
  auto *BV2 = dyn_cast<BuildVectorSDNode>(V2);
8212
217k
8213
376k
  for (int i = 0; i < Size; ++i) {
8214
367k
    assert(Mask[i] >= -1 && "Out of bound mask element!");
8215
367k
    if (Mask[i] >= 0 && Mask[i] != ExpectedMask[i]) {
8216
211k
      auto *MaskBV = Mask[i] < Size ? BV1 : BV2;
8217
211k
      auto *ExpectedBV = ExpectedMask[i] < Size ? BV1 : BV2;
8218
211k
      if (!MaskBV || !ExpectedBV ||
8219
3.15k
          MaskBV->getOperand(Mask[i] % Size) !=
8220
3.15k
              ExpectedBV->getOperand(ExpectedMask[i] % Size))
8221
208k
        return false;
8222
211k
    }
8223
367k
  }
8224
217k
8225
9.19k
  return true;
8226
218k
}
8227
8228
/// Checks whether a target shuffle mask is equivalent to an explicit pattern.
8229
///
8230
/// The masks must be exactly the same width.
8231
///
8232
/// If an element in Mask matches SM_SentinelUndef (-1) then the corresponding
8233
/// value in ExpectedMask is always accepted. Otherwise the indices must match.
8234
///
8235
/// SM_SentinelZero is accepted as a valid negative index but must match in both.
8236
static bool isTargetShuffleEquivalent(ArrayRef<int> Mask,
8237
407k
                                      ArrayRef<int> ExpectedMask) {
8238
407k
  int Size = Mask.size();
8239
407k
  if (Size != (int)ExpectedMask.size())
8240
188k
    return false;
8241
219k
8242
368k
  for (int i = 0; i < Size; ++i)
8243
355k
    if (Mask[i] == SM_SentinelUndef)
8244
26.1k
      continue;
8245
329k
    else if (Mask[i] < 0 && Mask[i] != SM_SentinelZero)
8246
0
      return false;
8247
329k
    else if (Mask[i] != ExpectedMask[i])
8248
206k
      return false;
8249
219k
8250
12.7k
  return true;
8251
407k
}
8252
8253
// Merges a general DAG shuffle mask and zeroable bit mask into a target shuffle
8254
// mask.
8255
static SmallVector<int, 64> createTargetShuffleMask(ArrayRef<int> Mask,
8256
13.6k
                                                    const APInt &Zeroable) {
8257
13.6k
  int NumElts = Mask.size();
8258
13.6k
  assert(NumElts == (int)Zeroable.getBitWidth() && "Mismatch mask sizes");
8259
13.6k
8260
13.6k
  SmallVector<int, 64> TargetMask(NumElts, SM_SentinelUndef);
8261
164k
  for (int i = 0; i != NumElts; ++i) {
8262
150k
    int M = Mask[i];
8263
150k
    if (M == SM_SentinelUndef)
8264
16.0k
      continue;
8265
150k
    assert(0 <= M && M < (2 * NumElts) && "Out of range shuffle index");
8266
134k
    TargetMask[i] = (Zeroable[i] ? SM_SentinelZero : M);
8267
150k
  }
8268
13.6k
  return TargetMask;
8269
13.6k
}
8270
8271
// Check if the shuffle mask is suitable for the AVX vpunpcklwd or vpunpckhwd
8272
// instructions.
8273
1.04k
static bool isUnpackWdShuffleMask(ArrayRef<int> Mask, MVT VT) {
8274
1.04k
  if (VT != MVT::v8i32 && VT != MVT::v8f32)
8275
0
    return false;
8276
1.04k
8277
1.04k
  SmallVector<int, 8> Unpcklwd;
8278
1.04k
  createUnpackShuffleMask(MVT::v8i16, Unpcklwd, /* Lo = */ true,
8279
1.04k
                          /* Unary = */ false);
8280
1.04k
  SmallVector<int, 8> Unpckhwd;
8281
1.04k
  createUnpackShuffleMask(MVT::v8i16, Unpckhwd, /* Lo = */ false,
8282
1.04k
                          /* Unary = */ false);
8283
1.04k
  bool IsUnpackwdMask = (isTargetShuffleEquivalent(Mask, Unpcklwd) ||
8284
1.03k
                         isTargetShuffleEquivalent(Mask, Unpckhwd));
8285
1.04k
  return IsUnpackwdMask;
8286
1.04k
}
8287
8288
/// \brief Get a 4-lane 8-bit shuffle immediate for a mask.
8289
///
8290
/// This helper function produces an 8-bit shuffle immediate corresponding to
8291
/// the ubiquitous shuffle encoding scheme used in x86 instructions for
8292
/// shuffling 4 lanes. It can be used with most of the PSHUF instructions for
8293
/// example.
8294
///
8295
/// NB: We rely heavily on "undef" masks preserving the input lane.
8296
25.7k
static unsigned getV4X86ShuffleImm(ArrayRef<int> Mask) {
8297
25.7k
  assert(Mask.size() == 4 && "Only 4-lane shuffle masks");
8298
25.7k
  assert(Mask[0] >= -1 && Mask[0] < 4 && "Out of bound mask element!");
8299
25.7k
  assert(Mask[1] >= -1 && Mask[1] < 4 && "Out of bound mask element!");
8300
25.7k
  assert(Mask[2] >= -1 && Mask[2] < 4 && "Out of bound mask element!");
8301
25.7k
  assert(Mask[3] >= -1 && Mask[3] < 4 && "Out of bound mask element!");
8302
25.7k
8303
25.7k
  unsigned Imm = 0;
8304
25.7k
  Imm |= (Mask[0] < 0 ? 0 : Mask[0]) << 0;
8305
25.7k
  Imm |= (Mask[1] < 0 ? 1 : Mask[1]) << 2;
8306
25.7k
  Imm |= (Mask[2] < 0 ? 2 : Mask[2]) << 4;
8307
25.7k
  Imm |= (Mask[3] < 0 ? 3 : Mask[3]) << 6;
8308
25.7k
  return Imm;
8309
25.7k
}
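Aside (not part of the coverage listing): the immediate built above packs one 2-bit lane index per element, with undef lanes falling back to their identity position. A small standalone C++ sketch of that packing and two worked values follows; the helper name packShuffleImm is an assumption for illustration only.

// Illustrative sketch: pack a 4-lane shuffle mask into the 8-bit immediate
// used by PSHUFD/SHUFPS-style instructions; undef (-1) keeps the identity lane.
#include <array>
#include <cstdio>

static unsigned packShuffleImm(const std::array<int, 4> &Mask) {
  unsigned Imm = 0;
  for (int i = 0; i < 4; ++i)
    Imm |= (unsigned)(Mask[i] < 0 ? i : Mask[i]) << (2 * i);
  return Imm;
}

int main() {
  const std::array<int, 4> Reverse = {3, 2, 1, 0};
  const std::array<int, 4> SwapLowUndefHigh = {1, 0, -1, -1};
  std::printf("0x%02X\n", packShuffleImm(Reverse));          // 0x1B
  std::printf("0x%02X\n", packShuffleImm(SwapLowUndefHigh)); // 0xE1
  return 0;
}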
8310
8311
static SDValue getV4X86ShuffleImm8ForMask(ArrayRef<int> Mask, const SDLoc &DL,
8312
13.7k
                                          SelectionDAG &DAG) {
8313
13.7k
  return DAG.getConstant(getV4X86ShuffleImm(Mask), DL, MVT::i8);
8314
13.7k
}
8315
8316
/// \brief Compute whether each element of a shuffle is zeroable.
8317
///
8318
/// A "zeroable" vector shuffle element is one which can be lowered to zero.
8319
/// Either it is an undef element in the shuffle mask, the element of the input
8320
/// referenced is undef, or the element of the input referenced is known to be
8321
/// zero. Many x86 shuffles can zero lanes cheaply and we often want to handle
8322
/// as many lanes with this technique as possible to simplify the remaining
8323
/// shuffle.
8324
static APInt computeZeroableShuffleElements(ArrayRef<int> Mask,
8325
48.7k
                                            SDValue V1, SDValue V2) {
8326
48.7k
  APInt Zeroable(Mask.size(), 0);
8327
48.7k
  V1 = peekThroughBitcasts(V1);
8328
48.7k
  V2 = peekThroughBitcasts(V2);
8329
48.7k
8330
48.7k
  bool V1IsZero = ISD::isBuildVectorAllZeros(V1.getNode());
8331
48.7k
  bool V2IsZero = ISD::isBuildVectorAllZeros(V2.getNode());
8332
48.7k
8333
48.7k
  int VectorSizeInBits = V1.getValueSizeInBits();
8334
48.7k
  int ScalarSizeInBits = VectorSizeInBits / Mask.size();
8335
48.7k
  assert(!(VectorSizeInBits % ScalarSizeInBits) && "Illegal shuffle mask size");
8336
48.7k
8337
549k
  for (int i = 0, Size = Mask.size(); i < Size; ++i) {
8338
500k
    int M = Mask[i];
8339
500k
    // Handle the easy cases.
8340
500k
    if (M < 0 || (M >= 0 && M < Size && V1IsZero) || (M >= Size && V2IsZero)) {
8341
158k
      Zeroable.setBit(i);
8342
158k
      continue;
8343
158k
    }
8344
341k
8345
341k
    // Determine shuffle input and normalize the mask.
8346
341k
    SDValue V = M < Size ? V1 : V2;
8347
341k
    M %= Size;
8348
341k
8349
341k
    // Currently we can only search BUILD_VECTOR for UNDEF/ZERO elements.
8350
341k
    if (V.getOpcode() != ISD::BUILD_VECTOR)
8351
339k
      continue;
8352
2.07k
8353
2.07k
    // If the BUILD_VECTOR has fewer elements then the bitcasted portion of
8354
2.07k
    // the (larger) source element must be UNDEF/ZERO.
8355
2.07k
    if ((Size % V.getNumOperands()) == 0) {
8356
2.05k
      int Scale = Size / V->getNumOperands();
8357
2.05k
      SDValue Op = V.getOperand(M / Scale);
8358
2.05k
      if (Op.isUndef() || X86::isZeroNode(Op))
8359
198
        Zeroable.setBit(i);
8360
1.85k
      else if (ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(Op)) {
8361
302
        APInt Val = Cst->getAPIntValue();
8362
302
        Val.lshrInPlace((M % Scale) * ScalarSizeInBits);
8363
302
        Val = Val.getLoBits(ScalarSizeInBits);
8364
302
        if (Val == 0)
8365
3
          Zeroable.setBit(i);
8366
1.85k
      } else if (ConstantFPSDNode *Cst = dyn_cast<ConstantFPSDNode>(Op)) {
8367
49
        APInt Val = Cst->getValueAPF().bitcastToAPInt();
8368
49
        Val.lshrInPlace((M % Scale) * ScalarSizeInBits);
8369
49
        Val = Val.getLoBits(ScalarSizeInBits);
8370
49
        if (Val == 0)
8371
3
          Zeroable.setBit(i);
8372
1.85k
      }
8373
2.05k
      continue;
8374
2.05k
    }
8375
23
8376
23
    // If the BUILD_VECTOR has more elements then all the (smaller) source
8377
23
    // elements must be UNDEF or ZERO.
8378
23
    if ((V.getNumOperands() % Size) == 0) {
8379
23
      int Scale = V->getNumOperands() / Size;
8380
23
      bool AllZeroable = true;
8381
73
      for (int j = 0; j < Scale; ++j) {
8382
50
        SDValue Op = V.getOperand((M * Scale) + j);
8383
46
        AllZeroable &= (Op.isUndef() || X86::isZeroNode(Op));
8384
50
      }
8385
23
      if (AllZeroable)
8386
4
        Zeroable.setBit(i);
8387
23
      continue;
8388
23
    }
8389
500k
  }
8390
48.7k
8391
48.7k
  return Zeroable;
8392
48.7k
}
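Aside (not part of the coverage listing): the core of the zeroable computation is a per-element bitset. The standalone C++ sketch below covers only the easy cases handled at the top of the loop (undef mask element, or an element drawn from an all-zeros input); the constant BUILD_VECTOR peeking in the listing is deliberately omitted. The helper name computeZeroable and the example mask are assumptions for illustration.

// Illustrative sketch: zeroable bits for the easy cases only.
#include <cstdio>
#include <vector>

static unsigned computeZeroable(const std::vector<int> &Mask, bool V1IsZero,
                                bool V2IsZero) {
  int Size = (int)Mask.size();
  unsigned Zeroable = 0;
  for (int i = 0; i < Size; ++i) {
    int M = Mask[i];
    if (M < 0 || (M < Size && V1IsZero) || (M >= Size && V2IsZero))
      Zeroable |= 1u << i;
  }
  return Zeroable;
}

int main() {
  // V2 is all zeros: the element taken from V2 (index 4) and the undef
  // element (-1) are zeroable -> bits 1 and 2 -> 0x6.
  std::printf("0x%X\n", computeZeroable({0, 4, -1, 3}, false, true));
  return 0;
}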
8393
8394
// The Shuffle result is as follow:
8395
// 0*a[0]0*a[1]...0*a[n] , n >=0 where a[] elements in a ascending order.
8396
// Each Zeroable's element correspond to a particular Mask's element.
8397
// As described in computeZeroableShuffleElements function.
8398
//
8399
// The function looks for a sub-mask that the nonzero elements are in
8400
// increasing order. If such sub-mask exist. The function returns true.
8401
static bool isNonZeroElementsInOrder(const APInt &Zeroable,
8402
                                     ArrayRef<int> Mask, const EVT &VectorType,
8403
1.06k
                                     bool &IsZeroSideLeft) {
8404
1.06k
  int NextElement = -1;
8405
1.06k
  // Check if the Mask's nonzero elements are in increasing order.
8406
2.56k
  for (int i = 0, e = Mask.size(); i < e; i++) {
8407
2.53k
    // Checks if the mask's zeros elements are built from only zeros.
8408
2.53k
    assert(Mask[i] >= -1 && "Out of bound mask element!");
8409
2.53k
    if (Mask[i] < 0)
8410
38
      return false;
8411
2.49k
    if (Zeroable[i])
8412
238
      continue;
8413
2.25k
    // Find the lowest non zero element
8414
2.25k
    if (NextElement < 0) {
8415
1.05k
      NextElement = Mask[i] != 0 ? VectorType.getVectorNumElements() : 0;
8416
1.05k
      IsZeroSideLeft = NextElement != 0;
8417
1.05k
    }
8418
2.25k
    // Exit if the mask's non zero elements are not in increasing order.
8419
2.25k
    if (NextElement != Mask[i])
8420
998
      return false;
8421
1.25k
    NextElement++;
8422
1.25k
  }
8423
32
  return true;
8424
1.06k
}
8425
8426
/// Try to lower a shuffle with a single PSHUFB of V1 or V2.
8427
static SDValue lowerVectorShuffleWithPSHUFB(const SDLoc &DL, MVT VT,
8428
                                            ArrayRef<int> Mask, SDValue V1,
8429
                                            SDValue V2,
8430
                                            const APInt &Zeroable,
8431
                                            const X86Subtarget &Subtarget,
8432
1.51k
                                            SelectionDAG &DAG) {
8433
1.51k
  int Size = Mask.size();
8434
1.51k
  int LaneSize = 128 / VT.getScalarSizeInBits();
8435
1.51k
  const int NumBytes = VT.getSizeInBits() / 8;
8436
1.51k
  const int NumEltBytes = VT.getScalarSizeInBits() / 8;
8437
1.51k
8438
1.51k
  assert((Subtarget.hasSSSE3() && VT.is128BitVector()) ||
8439
1.51k
         (Subtarget.hasAVX2() && VT.is256BitVector()) ||
8440
1.51k
         (Subtarget.hasBWI() && VT.is512BitVector()));
8441
1.51k
8442
1.51k
  SmallVector<SDValue, 64> PSHUFBMask(NumBytes);
8443
1.51k
  // Sign bit set in i8 mask means zero element.
8444
1.51k
  SDValue ZeroMask = DAG.getConstant(0x80, DL, MVT::i8);
8445
1.51k
8446
1.51k
  SDValue V;
8447
34.0k
  for (int i = 0; i < NumBytes; ++i) {
8448
33.5k
    int M = Mask[i / NumEltBytes];
8449
33.5k
    if (M < 0) {
8450
2.35k
      PSHUFBMask[i] = DAG.getUNDEF(MVT::i8);
8451
2.35k
      continue;
8452
2.35k
    }
8453
31.1k
    if (Zeroable[i / NumEltBytes]) {
8454
5.30k
      PSHUFBMask[i] = ZeroMask;
8455
5.30k
      continue;
8456
5.30k
    }
8457
25.8k
8458
25.8k
    // We can only use a single input of V1 or V2.
8459
25.8k
    SDValue SrcV = (M >= Size ? V2 : V1);
8460
25.8k
    if (V && V != SrcV)
8461
880
      return SDValue();
8462
24.9k
    V = SrcV;
8463
24.9k
    M %= Size;
8464
24.9k
8465
24.9k
    // PSHUFB can't cross lanes, ensure this doesn't happen.
8466
24.9k
    if ((M / LaneSize) != ((i / NumEltBytes) / LaneSize))
8467
140
      return SDValue();
8468
24.8k
8469
24.8k
    M = M % LaneSize;
8470
24.8k
    M = M * NumEltBytes + (i % NumEltBytes);
8471
24.8k
    PSHUFBMask[i] = DAG.getConstant(M, DL, MVT::i8);
8472
24.8k
  }
8473
497
  assert(V && "Failed to find a source input");
8474
497
8475
497
  MVT I8VT = MVT::getVectorVT(MVT::i8, NumBytes);
8476
497
  return DAG.getBitcast(
8477
497
      VT, DAG.getNode(X86ISD::PSHUFB, DL, I8VT, DAG.getBitcast(I8VT, V),
8478
497
                      DAG.getBuildVector(I8VT, DL, PSHUFBMask)));
8479
1.51k
}
8480
8481
static SDValue getMaskNode(SDValue Mask, MVT MaskVT,
8482
                           const X86Subtarget &Subtarget, SelectionDAG &DAG,
8483
                           const SDLoc &dl);
8484
8485
// X86 has dedicated shuffle that can be lowered to VEXPAND
8486
static SDValue lowerVectorShuffleToEXPAND(const SDLoc &DL, MVT VT,
8487
                                          const APInt &Zeroable,
8488
                                          ArrayRef<int> Mask, SDValue &V1,
8489
                                          SDValue &V2, SelectionDAG &DAG,
8490
1.06k
                                          const X86Subtarget &Subtarget) {
8491
1.06k
  bool IsLeftZeroSide = true;
8492
1.06k
  if (!isNonZeroElementsInOrder(Zeroable, Mask, V1.getValueType(),
8493
1.06k
                                IsLeftZeroSide))
8494
1.03k
    return SDValue();
8495
32
  unsigned VEXPANDMask = (~Zeroable).getZExtValue();
8496
32
  MVT IntegerType =
8497
32
      MVT::getIntegerVT(std::max((int)VT.getVectorNumElements(), 8));
8498
32
  SDValue MaskNode = DAG.getConstant(VEXPANDMask, DL, IntegerType);
8499
32
  unsigned NumElts = VT.getVectorNumElements();
8500
32
  assert((NumElts == 4 || NumElts == 8 || NumElts == 16) &&
8501
32
         "Unexpected number of vector elements");
8502
32
  SDValue VMask = getMaskNode(MaskNode, MVT::getVectorVT(MVT::i1, NumElts),
8503
32
                              Subtarget, DAG, DL);
8504
32
  SDValue ZeroVector = getZeroVector(VT, Subtarget, DAG, DL);
8505
32
  SDValue ExpandedVector = IsLeftZeroSide ? V2 : V1;
8506
1.06k
  return DAG.getSelect(DL, VT, VMask,
8507
1.06k
                       DAG.getNode(X86ISD::EXPAND, DL, VT, ExpandedVector),
8508
1.06k
                       ZeroVector);
8509
1.06k
}
8510
8511
static bool matchVectorShuffleWithUNPCK(MVT VT, SDValue &V1, SDValue &V2,
8512
                                        unsigned &UnpackOpcode, bool IsUnary,
8513
                                        ArrayRef<int> TargetMask, SDLoc &DL,
8514
                                        SelectionDAG &DAG,
8515
59.3k
                                        const X86Subtarget &Subtarget) {
8516
59.3k
  int NumElts = VT.getVectorNumElements();
8517
59.3k
8518
59.3k
  bool Undef1 = true, Undef2 = true, Zero1 = true, Zero2 = true;
8519
358k
  for (int i = 0; i != NumElts; i += 2) {
8520
299k
    int M1 = TargetMask[i + 0];
8521
299k
    int M2 = TargetMask[i + 1];
8522
299k
    Undef1 &= (SM_SentinelUndef == M1);
8523
299k
    Undef2 &= (SM_SentinelUndef == M2);
8524
299k
    Zero1 &= isUndefOrZero(M1);
8525
299k
    Zero2 &= isUndefOrZero(M2);
8526
299k
  }
8527
59.3k
  assert(!((Undef1 || Zero1) && (Undef2 || Zero2)) &&
8528
59.3k
         "Zeroable shuffle detected");
8529
59.3k
8530
59.3k
  // Attempt to match the target mask against the unpack lo/hi mask patterns.
8531
59.3k
  SmallVector<int, 64> Unpckl, Unpckh;
8532
59.3k
  createUnpackShuffleMask(VT, Unpckl, /* Lo = */ true, IsUnary);
8533
59.3k
  if (isTargetShuffleEquivalent(TargetMask, Unpckl)) {
8534
6.76k
    UnpackOpcode = X86ISD::UNPCKL;
8535
6.76k
    V2 = (Undef2 ? DAG.getUNDEF(VT) : (IsUnary ? V1 : V2));
8536
6.76k
    V1 = (Undef1 ? DAG.getUNDEF(VT) : V1);
8537
6.76k
    return true;
8538
6.76k
  }
8539
52.5k
8540
52.5k
  createUnpackShuffleMask(VT, Unpckh, /* Lo = */ false, IsUnary);
8541
52.5k
  if (isTargetShuffleEquivalent(TargetMask, Unpckh)) {
8542
1.30k
    UnpackOpcode = X86ISD::UNPCKH;
8543
1.30k
    V2 = (Undef2 ? DAG.getUNDEF(VT) : (IsUnary ? V1 : V2));
8544
1.30k
    V1 = (Undef1 ? DAG.getUNDEF(VT) : V1);
8545
1.30k
    return true;
8546
1.30k
  }
8547
51.2k
8548
51.2k
  // If an unary shuffle, attempt to match as an unpack lo/hi with zero.
8549
51.2k
  if (IsUnary && (Zero1 || Zero2)) {
8550
7.82k
    // Don't bother if we can blend instead.
8551
7.82k
    if ((Subtarget.hasSSE41() || VT == MVT::v2i64 || VT == MVT::v2f64) &&
8552
3.01k
        isSequentialOrUndefOrZeroInRange(TargetMask, 0, NumElts, 0))
8553
1.62k
      return false;
8554
6.19k
8555
6.19k
    bool MatchLo = true, MatchHi = true;
8556
39.5k
    for (int i = 0; (i != NumElts) && (MatchLo || MatchHi); ++i) {
8557
33.4k
      int M = TargetMask[i];
8558
33.4k
8559
33.4k
      // Ignore if the input is known to be zero or the index is undef.
8560
33.4k
      if ((((i & 1) == 0) && Zero1) || (((i & 1) == 1) && Zero2) ||
8561
18.0k
          (M == SM_SentinelUndef))
8562
16.2k
        continue;
8563
17.1k
8564
17.1k
      MatchLo &= (M == Unpckl[i]);
8565
17.1k
      MatchHi &= (M == Unpckh[i]);
8566
17.1k
    }
8567
6.19k
8568
6.19k
    if (MatchLo || MatchHi) {
8569
2.25k
      UnpackOpcode = MatchLo ? X86ISD::UNPCKL : X86ISD::UNPCKH;
8570
2.25k
      V2 = Zero2 ? getZeroVector(VT, Subtarget, DAG, DL) : V1;
8571
2.25k
      V1 = Zero1 ? getZeroVector(VT, Subtarget, DAG, DL) : V1;
8572
2.25k
      return true;
8573
2.25k
    }
8574
47.3k
  }
8575
47.3k
8576
47.3k
  // If a binary shuffle, commute and try again.
8577
47.3k
  if (!IsUnary) {
8578
29.5k
    ShuffleVectorSDNode::commuteMask(Unpckl);
8579
29.5k
    if (isTargetShuffleEquivalent(TargetMask, Unpckl)) {
8580
2
      UnpackOpcode = X86ISD::UNPCKL;
8581
2
      std::swap(V1, V2);
8582
2
      return true;
8583
2
    }
8584
29.5k
8585
29.5k
    ShuffleVectorSDNode::commuteMask(Unpckh);
8586
29.5k
    if (isTargetShuffleEquivalent(TargetMask, Unpckh)) {
8587
0
      UnpackOpcode = X86ISD::UNPCKH;
8588
0
      std::swap(V1, V2);
8589
0
      return true;
8590
0
    }
8591
47.3k
  }
8592
47.3k
8593
47.3k
  return false;
8594
47.3k
}
8595
8596
// X86 has dedicated unpack instructions that can handle specific blend
8597
// operations: UNPCKH and UNPCKL.
8598
static SDValue lowerVectorShuffleWithUNPCK(const SDLoc &DL, MVT VT,
8599
                                           ArrayRef<int> Mask, SDValue V1,
8600
14.9k
                                           SDValue V2, SelectionDAG &DAG) {
8601
14.9k
  SmallVector<int, 8> Unpckl;
8602
14.9k
  createUnpackShuffleMask(VT, Unpckl, /* Lo = */ true, /* Unary = */ false);
8603
14.9k
  if (isShuffleEquivalent(V1, V2, Mask, Unpckl))
8604
3.67k
    return DAG.getNode(X86ISD::UNPCKL, DL, VT, V1, V2);
8605
11.2k
8606
11.2k
  SmallVector<int, 8> Unpckh;
8607
11.2k
  createUnpackShuffleMask(VT, Unpckh, /* Lo = */ false, /* Unary = */ false);
8608
11.2k
  if (isShuffleEquivalent(V1, V2, Mask, Unpckh))
8609
904
    return DAG.getNode(X86ISD::UNPCKH, DL, VT, V1, V2);
8610
10.3k
8611
10.3k
  // Commute and try again.
8612
10.3k
  ShuffleVectorSDNode::commuteMask(Unpckl);
8613
10.3k
  if (isShuffleEquivalent(V1, V2, Mask, Unpckl))
8614
36
    return DAG.getNode(X86ISD::UNPCKL, DL, VT, V2, V1);
8615
10.3k
8616
10.3k
  ShuffleVectorSDNode::commuteMask(Unpckh);
8617
10.3k
  if (isShuffleEquivalent(V1, V2, Mask, Unpckh))
8618
14
    return DAG.getNode(X86ISD::UNPCKH, DL, VT, V2, V1);
8619
10.2k
8620
10.2k
  return SDValue();
8621
10.2k
}
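Aside (not part of the coverage listing): both unpack helpers above compare the shuffle mask against reference UNPCKL/UNPCKH masks produced by createUnpackShuffleMask. The standalone C++ sketch below generates masks of that shape for the simple 128-bit binary case; makeUnpackMask is an assumed illustrative name, not the real helper, and wider types with multiple lanes would repeat the pattern per lane.

// Illustrative sketch: binary unpack lo/hi reference masks for one 128-bit lane.
#include <cstdio>
#include <initializer_list>
#include <vector>

static std::vector<int> makeUnpackMask(int NumElts, int EltsPerLane, bool Lo) {
  std::vector<int> Mask;
  for (int Lane = 0; Lane < NumElts; Lane += EltsPerLane)
    for (int i = 0; i < EltsPerLane / 2; ++i) {
      int Pos = Lane + i + (Lo ? 0 : EltsPerLane / 2);
      Mask.push_back(Pos);           // element from the first input
      Mask.push_back(Pos + NumElts); // matching element from the second input
    }
  return Mask;
}

int main() {
  for (bool Lo : {true, false}) {
    for (int M : makeUnpackMask(4, 4, Lo)) // v4i32: lo = 0 4 1 5, hi = 2 6 3 7
      std::printf(" %d", M);
    std::printf("\n");
  }
  return 0;
}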
8622
8623
/// \brief Try to emit a bitmask instruction for a shuffle.
8624
///
8625
/// This handles cases where we can model a blend exactly as a bitmask due to
8626
/// one of the inputs being zeroable.
8627
static SDValue lowerVectorShuffleAsBitMask(const SDLoc &DL, MVT VT, SDValue V1,
8628
                                           SDValue V2, ArrayRef<int> Mask,
8629
                                           const APInt &Zeroable,
8630
6.58k
                                           SelectionDAG &DAG) {
8631
6.58k
  assert(!VT.isFloatingPoint() && "Floating point types are not supported");
8632
6.58k
  MVT EltVT = VT.getVectorElementType();
8633
6.58k
  SDValue Zero = DAG.getConstant(0, DL, EltVT);
8634
6.58k
  SDValue AllOnes = DAG.getAllOnesConstant(DL, EltVT);
8635
6.58k
  SmallVector<SDValue, 16> VMaskOps(Mask.size(), Zero);
8636
6.58k
  SDValue V;
8637
28.4k
  for (int i = 0, Size = Mask.size(); i < Size; ++i) {
8638
27.7k
    if (Zeroable[i])
8639
9.60k
      continue;
8640
18.1k
    if (Mask[i] % Size != i)
8641
5.10k
      return SDValue(); // Not a blend.
8642
13.0k
    if (!V)
8643
4.64k
      V = Mask[i] < Size ? V1 : V2;
8644
8.36k
    else if (V != (Mask[i] < Size ? V1 : V2))
8645
765
      return SDValue(); // Can only let one input through the mask.
8646
12.2k
8647
12.2k
    VMaskOps[i] = AllOnes;
8648
12.2k
  }
8649
714
  if (!V)
8650
0
    return SDValue(); // No non-zeroable elements!
8651
714
8652
714
  SDValue VMask = DAG.getBuildVector(VT, DL, VMaskOps);
8653
714
  return DAG.getNode(ISD::AND, DL, VT, V, VMask);
8654
714
}
8655
8656
/// \brief Try to emit a blend instruction for a shuffle using bit math.
8657
///
8658
/// This is used as a fallback approach when first class blend instructions are
8659
/// unavailable. Currently it is only suitable for integer vectors, but could
8660
/// be generalized for floating point vectors if desirable.
8661
static SDValue lowerVectorShuffleAsBitBlend(const SDLoc &DL, MVT VT, SDValue V1,
8662
                                            SDValue V2, ArrayRef<int> Mask,
8663
1.33k
                                            SelectionDAG &DAG) {
8664
1.33k
  assert(VT.isInteger() && "Only supports integer vector types!");
8665
1.33k
  MVT EltVT = VT.getVectorElementType();
8666
1.33k
  SDValue Zero = DAG.getConstant(0, DL, EltVT);
8667
1.33k
  SDValue AllOnes = DAG.getAllOnesConstant(DL, EltVT);
8668
1.33k
  SmallVector<SDValue, 16> MaskOps;
8669
5.12k
  for (int i = 0, Size = Mask.size(); i < Size; ++i) {
8670
4.98k
    if (Mask[i] >= 0 && Mask[i] != i && Mask[i] != i + Size)
8671
1.19k
      return SDValue(); // Shuffled input!
8672
3.78k
    MaskOps.push_back(Mask[i] < Size ? AllOnes : Zero);
8673
4.98k
  }
8674
1.33k
8675
142
  SDValue V1Mask = DAG.getBuildVector(VT, DL, MaskOps);
8676
142
  V1 = DAG.getNode(ISD::AND, DL, VT, V1, V1Mask);
8677
142
  // We have to cast V2 around.
8678
142
  MVT MaskVT = MVT::getVectorVT(MVT::i64, VT.getSizeInBits() / 64);
8679
142
  V2 = DAG.getBitcast(VT, DAG.getNode(X86ISD::ANDNP, DL, MaskVT,
8680
142
                                      DAG.getBitcast(MaskVT, V1Mask),
8681
142
                                      DAG.getBitcast(MaskVT, V2)));
8682
142
  return DAG.getNode(ISD::OR, DL, VT, V1, V2);
8683
1.33k
}
8684
8685
static SDValue getVectorMaskingNode(SDValue Op, SDValue Mask,
8686
                                    SDValue PreservedSrc,
8687
                                    const X86Subtarget &Subtarget,
8688
                                    SelectionDAG &DAG);
8689
8690
static bool matchVectorShuffleAsBlend(SDValue V1, SDValue V2,
8691
                                      MutableArrayRef<int> TargetMask,
8692
                                      bool &ForceV1Zero, bool &ForceV2Zero,
8693
33.1k
                                      uint64_t &BlendMask) {
8694
33.1k
  bool V1IsZeroOrUndef =
8695
33.1k
      V1.isUndef() || ISD::isBuildVectorAllZeros(V1.getNode());
8696
33.1k
  bool V2IsZeroOrUndef =
8697
22.5k
      V2.isUndef() || ISD::isBuildVectorAllZeros(V2.getNode());
8698
33.1k
8699
33.1k
  BlendMask = 0;
8700
33.1k
  ForceV1Zero = false, ForceV2Zero = false;
8701
33.1k
  assert(TargetMask.size() <= 64 && "Shuffle mask too big for blend mask");
8702
33.1k
8703
33.1k
  // Attempt to generate the binary blend mask. If an input is zero then
8704
33.1k
  // we can use any lane.
8705
33.1k
  // TODO: generalize the zero matching to any scalar like isShuffleEquivalent.
8706
117k
  for (int i = 0, Size = TargetMask.size(); i < Size; ++i) {
8707
110k
    int M = TargetMask[i];
8708
110k
    if (M == SM_SentinelUndef)
8709
4.23k
      continue;
8710
106k
    if (M == i)
8711
63.2k
      continue;
8712
43.3k
    if (M == i + Size) {
8713
11.2k
      BlendMask |= 1ull << i;
8714
11.2k
      continue;
8715
11.2k
    }
8716
32.0k
    if (M == SM_SentinelZero) {
8717
5.86k
      if (V1IsZeroOrUndef) {
8718
1.33k
        ForceV1Zero = true;
8719
1.33k
        TargetMask[i] = i;
8720
1.33k
        continue;
8721
1.33k
      }
8722
4.53k
      if (V2IsZeroOrUndef) {
8723
4.17k
        ForceV2Zero = true;
8724
4.17k
        BlendMask |= 1ull << i;
8725
4.17k
        TargetMask[i] = i + Size;
8726
4.17k
        continue;
8727
4.17k
      }
8728
26.5k
    }
8729
26.5k
    return false;
8730
26.5k
  }
8731
6.57k
  return true;
8732
33.1k
}
8733
8734
static uint64_t scaleVectorShuffleBlendMask(uint64_t BlendMask, int Size,
8735
4.31k
                                            int Scale) {
8736
4.31k
  uint64_t ScaledMask = 0;
8737
24.2k
  for (int i = 0; i != Size; ++i)
8738
19.9k
    if (BlendMask & (1ull << i))
8739
8.07k
      ScaledMask |= ((1ull << Scale) - 1) << (i * Scale);
8740
4.31k
  return ScaledMask;
8741
4.31k
}
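Aside (not part of the coverage listing): the helper above widens a per-element blend mask when the blend is re-expressed on narrower lanes (for example one v4i64 element becomes two v8i32 dwords, so each mask bit is replicated Scale times). A standalone C++ sketch with one worked value follows; the function name scaleBlendMask is an illustrative assumption.

// Illustrative sketch: replicate each set blend-mask bit Scale times.
#include <cstdint>
#include <cstdio>

static uint64_t scaleBlendMask(uint64_t BlendMask, int Size, int Scale) {
  uint64_t ScaledMask = 0;
  for (int i = 0; i != Size; ++i)
    if (BlendMask & (1ull << i))
      ScaledMask |= ((1ull << Scale) - 1) << (i * Scale);
  return ScaledMask;
}

int main() {
  // A v4i64 blend mask of 0101 scaled by 2 becomes the v8i32 mask 00110011.
  std::printf("0x%02llX\n",
              (unsigned long long)scaleBlendMask(0x5, 4, 2)); // prints 0x33
  return 0;
}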
8742
8743
/// \brief Try to emit a blend instruction for a shuffle.
8744
///
8745
/// This doesn't do any checks for the availability of instructions for blending
8746
/// these values. It relies on the availability of the X86ISD::BLENDI pattern to
8747
/// be matched in the backend with the type given. What it does check for is
8748
/// that the shuffle mask is a blend, or convertible into a blend with zero.
8749
static SDValue lowerVectorShuffleAsBlend(const SDLoc &DL, MVT VT, SDValue V1,
8750
                                         SDValue V2, ArrayRef<int> Original,
8751
                                         const APInt &Zeroable,
8752
                                         const X86Subtarget &Subtarget,
8753
13.6k
                                         SelectionDAG &DAG) {
8754
13.6k
  SmallVector<int, 64> Mask = createTargetShuffleMask(Original, Zeroable);
8755
13.6k
8756
13.6k
  uint64_t BlendMask = 0;
8757
13.6k
  bool ForceV1Zero = false, ForceV2Zero = false;
8758
13.6k
  if (!matchVectorShuffleAsBlend(V1, V2, Mask, ForceV1Zero, ForceV2Zero,
8759
13.6k
                                 BlendMask))
8760
10.5k
    return SDValue();
8761
3.02k
8762
3.02k
  // Create a REAL zero vector - ISD::isBuildVectorAllZeros allows UNDEFs.
8763
3.02k
  if (ForceV1Zero)
8764
181
    V1 = getZeroVector(VT, Subtarget, DAG, DL);
8765
3.02k
  if (ForceV2Zero)
8766
428
    V2 = getZeroVector(VT, Subtarget, DAG, DL);
8767
3.02k
8768
3.02k
  switch (VT.SimpleTy) {
8769
865
  case MVT::v2f64:
8770
865
  case MVT::v4f32:
8771
865
  case MVT::v4f64:
8772
865
  case MVT::v8f32:
8773
865
    return DAG.getNode(X86ISD::BLENDI, DL, VT, V1, V2,
8774
865
                       DAG.getConstant(BlendMask, DL, MVT::i8));
8775
865
8776
279
  case MVT::v4i64:
8777
279
  case MVT::v8i32:
8778
279
    assert(Subtarget.hasAVX2() && "256-bit integer blends require AVX2!");
8779
279
    LLVM_FALLTHROUGH;
8780
1.19k
  case MVT::v2i64:
8781
1.19k
  case MVT::v4i32:
8782
1.19k
    // If we have AVX2 it is faster to use VPBLENDD when the shuffle fits into
8783
1.19k
    // that instruction.
8784
1.19k
    if (Subtarget.hasAVX2()) {
8785
543
      // Scale the blend by the number of 32-bit dwords per element.
8786
543
      int Scale =  VT.getScalarSizeInBits() / 32;
8787
543
      BlendMask = scaleVectorShuffleBlendMask(BlendMask, Mask.size(), Scale);
8788
543
      MVT BlendVT = VT.getSizeInBits() > 128 ? MVT::v8i32 : MVT::v4i32;
8789
543
      V1 = DAG.getBitcast(BlendVT, V1);
8790
543
      V2 = DAG.getBitcast(BlendVT, V2);
8791
543
      return DAG.getBitcast(
8792
543
          VT, DAG.getNode(X86ISD::BLENDI, DL, BlendVT, V1, V2,
8793
543
                          DAG.getConstant(BlendMask, DL, MVT::i8)));
8794
543
    }
8795
650
    LLVM_FALLTHROUGH;
8796
1.03k
  case MVT::v8i16: {
8797
1.03k
    // For integer shuffles we need to expand the mask and cast the inputs to
8798
1.03k
    // v8i16s prior to blending.
8799
1.03k
    int Scale = 8 / VT.getVectorNumElements();
8800
1.03k
    BlendMask = scaleVectorShuffleBlendMask(BlendMask, Mask.size(), Scale);
8801
1.03k
    V1 = DAG.getBitcast(MVT::v8i16, V1);
8802
1.03k
    V2 = DAG.getBitcast(MVT::v8i16, V2);
8803
1.03k
    return DAG.getBitcast(VT,
8804
1.03k
                          DAG.getNode(X86ISD::BLENDI, DL, MVT::v8i16, V1, V2,
8805
1.03k
                                      DAG.getConstant(BlendMask, DL, MVT::i8)));
8806
650
  }
8807
650
8808
156
  case MVT::v16i16: {
8809
156
    assert(Subtarget.hasAVX2() && "256-bit integer blends require AVX2!");
8810
156
    SmallVector<int, 8> RepeatedMask;
8811
156
    if (is128BitLaneRepeatedShuffleMask(MVT::v16i16, Mask, RepeatedMask)) {
8812
59
      // We can lower these with PBLENDW which is mirrored across 128-bit lanes.
8813
59
      assert(RepeatedMask.size() == 8 && "Repeated mask size doesn't match!");
8814
59
      BlendMask = 0;
8815
531
      for (int i = 0; i < 8; ++i)
8816
472
        if (RepeatedMask[i] >= 8)
8817
189
          BlendMask |= 1ull << i;
8818
59
      return DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1, V2,
8819
59
                         DAG.getConstant(BlendMask, DL, MVT::i8));
8820
59
    }
8821
97
    LLVM_FALLTHROUGH;
8822
97
  }
8823
484
  case MVT::v16i8:
8824
484
  case MVT::v32i8: {
8825
484
    assert((VT.is128BitVector() || Subtarget.hasAVX2()) &&
8826
484
           "256-bit byte-blends require AVX2 support!");
8827
484
8828
484
    if (Subtarget.hasBWI() && Subtarget.hasVLX()) {
8829
28
      MVT IntegerType =
8830
28
          MVT::getIntegerVT(std::max((int)VT.getVectorNumElements(), 8));
8831
28
      SDValue MaskNode = DAG.getConstant(BlendMask, DL, IntegerType);
8832
28
      return getVectorMaskingNode(V2, MaskNode, V1, Subtarget, DAG);
8833
28
    }
8834
456
8835
456
    // Attempt to lower to a bitmask if we can. VPAND is faster than VPBLENDVB.
8836
456
    if (SDValue Masked =
8837
456
            lowerVectorShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable, DAG))
8838
43
      return Masked;
8839
413
8840
413
    // Scale the blend by the number of bytes per element.
8841
413
    int Scale = VT.getScalarSizeInBits() / 8;
8842
413
8843
413
    // This form of blend is always done on bytes. Compute the byte vector
8844
413
    // type.
8845
413
    MVT BlendVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
8846
413
8847
413
    // Compute the VSELECT mask. Note that VSELECT is really confusing in the
8848
413
    // mix of LLVM's code generator and the x86 backend. We tell the code
8849
413
    // generator that boolean values in the elements of an x86 vector register
8850
413
    // are -1 for true and 0 for false. We then use the LLVM semantics of 'true'
8851
413
    // mapping a select to operand #1, and 'false' mapping to operand #2. The
8852
413
    // reality in x86 is that vector masks (pre-AVX-512) use only the high bit
8853
413
    // of the element (the remaining are ignored) and 0 in that high bit would
8854
413
    // mean operand #1 while 1 in the high bit would mean operand #2. So while
8855
413
    // the LLVM model for boolean values in vector elements gets the relevant
8856
413
    // bit set, it is set backwards and over constrained relative to x86's
8857
413
    // actual model.
8858
413
    SmallVector<SDValue, 32> VSELECTMask;
8859
12.2k
    for (int i = 0, Size = Mask.size(); i < Size; ++i)
8860
11.8k
      for (int j = 0; j < Scale; ++j)
8861
12.8k
        VSELECTMask.push_back(
8862
584
            Mask[i] < 0 ? DAG.getUNDEF(MVT::i8)
8863
12.2k
                        : DAG.getConstant(Mask[i] < Size ? -1 : 0, DL,
8864
11.8k
                                          MVT::i8));
8865
413
8866
413
    V1 = DAG.getBitcast(BlendVT, V1);
8867
413
    V2 = DAG.getBitcast(BlendVT, V2);
8868
413
    return DAG.getBitcast(
8869
413
        VT,
8870
413
        DAG.getSelect(DL, BlendVT, DAG.getBuildVector(BlendVT, DL, VSELECTMask),
8871
413
                      V1, V2));
8872
413
  }
8873
40
  case MVT::v16f32:
8874
40
  case MVT::v8f64:
8875
40
  case MVT::v8i64:
8876
40
  case MVT::v16i32:
8877
40
  case MVT::v32i16:
8878
40
  case MVT::v64i8: {
8879
40
    MVT IntegerType =
8880
40
        MVT::getIntegerVT(std::max((int)VT.getVectorNumElements(), 8));
8881
40
    SDValue MaskNode = DAG.getConstant(BlendMask, DL, IntegerType);
8882
40
    return getVectorMaskingNode(V2, MaskNode, V1, Subtarget, DAG);
8883
40
  }
8884
0
  default:
8885
0
    llvm_unreachable("Not a supported integer vector type!");
8886
0
  }
8887
0
}
8888
8889
/// \brief Try to lower as a blend of elements from two inputs followed by
8890
/// a single-input permutation.
8891
///
8892
/// This matches the pattern where we can blend elements from two inputs and
8893
/// then reduce the shuffle to a single-input permutation.
8894
static SDValue lowerVectorShuffleAsBlendAndPermute(const SDLoc &DL, MVT VT,
8895
                                                   SDValue V1, SDValue V2,
8896
                                                   ArrayRef<int> Mask,
8897
907
                                                   SelectionDAG &DAG) {
8898
907
  // We build up the blend mask while checking whether a blend is a viable way
8899
907
  // to reduce the shuffle.
8900
907
  SmallVector<int, 32> BlendMask(Mask.size(), -1);
8901
907
  SmallVector<int, 32> PermuteMask(Mask.size(), -1);
8902
907
8903
7.46k
  for (int i = 0, Size = Mask.size(); i < Size; ++i) {
8904
7.13k
    if (Mask[i] < 0)
8905
897
      continue;
8906
6.23k
8907
7.13k
    assert(Mask[i] < Size * 2 && "Shuffle input is out of bounds.");
8908
6.23k
8909
6.23k
    if (BlendMask[Mask[i] % Size] < 0)
8910
4.18k
      BlendMask[Mask[i] % Size] = Mask[i];
8911
2.05k
    else if (BlendMask[Mask[i] % Size] != Mask[i])
8912
580
      return SDValue(); // Can't blend in the needed input!
8913
5.65k
8914
5.65k
    PermuteMask[i] = Mask[i] % Size;
8915
5.65k
  }
8916
907
8917
327
  SDValue V = DAG.getVectorShuffle(VT, DL, V1, V2, BlendMask);
8918
327
  return DAG.getVectorShuffle(VT, DL, V, DAG.getUNDEF(VT), PermuteMask);
8919
907
}
8920
8921
/// \brief Generic routine to decompose a shuffle and blend into independent
8922
/// blends and permutes.
8923
///
8924
/// This matches the extremely common pattern for handling combined
8925
/// shuffle+blend operations on newer X86 ISAs where we have very fast blend
8926
/// operations. It will try to pick the best arrangement of shuffles and
8927
/// blends.
8928
static SDValue lowerVectorShuffleAsDecomposedShuffleBlend(const SDLoc &DL,
8929
                                                          MVT VT, SDValue V1,
8930
                                                          SDValue V2,
8931
                                                          ArrayRef<int> Mask,
8932
1.28k
                                                          SelectionDAG &DAG) {
8933
1.28k
  // Shuffle the input elements into the desired positions in V1 and V2 and
8934
1.28k
  // blend them together.
8935
1.28k
  SmallVector<int, 32> V1Mask(Mask.size(), -1);
8936
1.28k
  SmallVector<int, 32> V2Mask(Mask.size(), -1);
8937
1.28k
  SmallVector<int, 32> BlendMask(Mask.size(), -1);
8938
21.5k
  for (int i = 0, Size = Mask.size(); i < Size; ++i)
8939
20.2k
    if (Mask[i] >= 0 && Mask[i] < Size) {
8940
14.2k
      V1Mask[i] = Mask[i];
8941
14.2k
      BlendMask[i] = i;
8942
20.2k
    } else if (Mask[i] >= Size) {
8943
3.87k
      V2Mask[i] = Mask[i] - Size;
8944
3.87k
      BlendMask[i] = i + Size;
8945
3.87k
    }
8946
1.28k
8947
1.28k
  // Try to lower with the simpler initial blend strategy unless one of the
8948
1.28k
  // input shuffles would be a no-op. We prefer to shuffle inputs as the
8949
1.28k
  // shuffle may be able to fold with a load or other benefit. However, when
8950
1.28k
  // we'll have to do 2x as many shuffles in order to achieve this, blending
8951
1.28k
  // first is a better strategy.
8952
1.28k
  if (!isNoopShuffleMask(V1Mask) && !isNoopShuffleMask(V2Mask))
8953
739
    if (SDValue BlendPerm =
8954
739
            lowerVectorShuffleAsBlendAndPermute(DL, VT, V1, V2, Mask, DAG))
8955
302
      return BlendPerm;
8956
979
8957
979
  V1 = DAG.getVectorShuffle(VT, DL, V1, DAG.getUNDEF(VT), V1Mask);
8958
979
  V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Mask);
8959
979
  return DAG.getVectorShuffle(VT, DL, V1, V2, BlendMask);
8960
979
}
8961
8962
/// \brief Try to lower a vector shuffle as a rotation.
8963
///
8964
/// This is used for support PALIGNR for SSSE3 or VALIGND/Q for AVX512.
8965
static int matchVectorShuffleAsRotate(SDValue &V1, SDValue &V2,
8966
30.4k
                                      ArrayRef<int> Mask) {
8967
30.4k
  int NumElts = Mask.size();
8968
30.4k
8969
30.4k
  // We need to detect various ways of spelling a rotation:
8970
30.4k
  //   [11, 12, 13, 14, 15,  0,  1,  2]
8971
30.4k
  //   [-1, 12, 13, 14, -1, -1,  1, -1]
8972
30.4k
  //   [-1, -1, -1, -1, -1, -1,  1,  2]
8973
30.4k
  //   [ 3,  4,  5,  6,  7,  8,  9, 10]
8974
30.4k
  //   [-1,  4,  5,  6, -1, -1,  9, -1]
8975
30.4k
  //   [-1,  4,  5,  6, -1, -1, -1, -1]
8976
30.4k
  int Rotation = 0;
8977
30.4k
  SDValue Lo, Hi;
8978
55.1k
  for (int i = 0; i < NumElts; ++i) {
8979
54.1k
    int M = Mask[i];
8980
54.1k
    assert((M == SM_SentinelUndef || (0 <= M && M < (2*NumElts))) &&
8981
54.1k
           "Unexpected mask index.");
8982
54.1k
    if (M < 0)
8983
5.74k
      continue;
8984
48.3k
8985
48.3k
    // Determine where a rotated vector would have started.
8986
48.3k
    int StartIdx = i - (M % NumElts);
8987
48.3k
    if (StartIdx == 0)
8988
48.3k
      // The identity rotation isn't interesting, stop.
8989
23.2k
      return -1;
8990
25.0k
8991
25.0k
    // If we found the tail of a vector the rotation must be the missing
8992
25.0k
    // front. If we found the head of a vector, it must be how much of the
8993
25.0k
    // head.
8994
25.0k
    int CandidateRotation = StartIdx < 0 ? -StartIdx : NumElts - StartIdx;
8995
25.0k
8996
25.0k
    if (Rotation == 0)
8997
8.47k
      Rotation = CandidateRotation;
8998
16.6k
    else if (Rotation != CandidateRotation)
8999
16.6k
      // The rotations don't match, so we can't match this mask.
9000
6.03k
      return -1;
9001
19.0k
9002
19.0k
    // Compute which value this mask is pointing at.
9003
19.0k
    SDValue MaskV = M < NumElts ? V1 : V2;
9004
19.0k
9005
19.0k
    // Compute which of the two target values this index should be assigned
9006
19.0k
    // to. This reflects whether the high elements are remaining or the low
9007
19.0k
    // elements are remaining.
9008
19.0k
    SDValue &TargetV = StartIdx < 0 ? Hi : Lo;
9009
19.0k
9010
19.0k
    // Either set up this value if we've not encountered it before, or check
9011
19.0k
    // that it remains consistent.
9012
19.0k
    if (!TargetV)
9013
9.72k
      TargetV = MaskV;
9014
9.31k
    else if (TargetV != MaskV)
9015
9.31k
      // This may be a rotation, but it pulls from the inputs in some
9016
9.31k
      // unsupported interleaving.
9017
85
      return -1;
9018
54.1k
  }
9019
30.4k
9020
30.4k
  // Check that we successfully analyzed the mask, and normalize the results.
9021
1.07k
  assert(Rotation != 0 && "Failed to locate a viable rotation!");
9022
1.07k
  assert((Lo || Hi) && "Failed to find a rotated input vector!");
9023
1.07k
  if (!Lo)
9024
0
    Lo = Hi;
9025
1.07k
  else if (!Hi)
9026
0
    Hi = Lo;
9027
1.07k
9028
1.07k
  V1 = Lo;
9029
1.07k
  V2 = Hi;
9030
1.07k
9031
1.07k
  return Rotation;
9032
30.4k
}
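Aside (not part of the coverage listing): the rotation matcher above infers a single rotation amount from the per-element start offsets; the PALIGNR path that follows then multiplies that element count by the byte width of a lane element. The standalone C++ sketch below keeps only the rotation-amount inference and drops the Lo/Hi input-consistency checks; the name matchRotation and the example mask are assumptions for illustration.

// Illustrative sketch: infer the element rotation a mask encodes, or -1.
#include <cstdio>
#include <vector>

static int matchRotation(const std::vector<int> &Mask) {
  int NumElts = (int)Mask.size();
  int Rotation = 0;
  for (int i = 0; i < NumElts; ++i) {
    int M = Mask[i];
    if (M < 0)
      continue; // undef element, no constraint
    int StartIdx = i - (M % NumElts);
    if (StartIdx == 0)
      return -1; // identity, not an interesting rotation
    int Candidate = StartIdx < 0 ? -StartIdx : NumElts - StartIdx;
    if (Rotation == 0)
      Rotation = Candidate;
    else if (Rotation != Candidate)
      return -1; // elements disagree on the rotation amount
  }
  return Rotation;
}

int main() {
  // The v8i16 example from the comments: rotation by 3 elements,
  // which the byte-rotate path would scale to a 6-byte PALIGNR.
  std::printf("%d\n", matchRotation({11, 12, 13, 14, 15, 0, 1, 2}));
  return 0;
}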

/// \brief Try to lower a vector shuffle as a byte rotation.
///
/// SSSE3 has a generic PALIGNR instruction in x86 that will do an arbitrary
/// byte-rotation of the concatenation of two vectors; pre-SSSE3 can use
/// a PSRLDQ/PSLLDQ/POR pattern to get a similar effect. This routine will
/// try to generically lower a vector shuffle through such a pattern. It
/// does not check for the profitability of lowering either as PALIGNR or
/// PSRLDQ/PSLLDQ/POR, only whether the mask is valid to lower in that form.
/// This matches shuffle vectors that look like:
///
///   v8i16 [11, 12, 13, 14, 15, 0, 1, 2]
///
/// Essentially it concatenates V1 and V2, shifts right by some number of
/// elements, and takes the low elements as the result. Note that while this is
/// specified as a *right shift* because x86 is little-endian, it is a *left
/// rotate* of the vector lanes.
static int matchVectorShuffleAsByteRotate(MVT VT, SDValue &V1, SDValue &V2,
                                          ArrayRef<int> Mask) {
  // Don't accept any shuffles with zero elements.
  if (any_of(Mask, [](int M) { return M == SM_SentinelZero; }))
    return -1;

  // PALIGNR works on 128-bit lanes.
  SmallVector<int, 16> RepeatedMask;
  if (!is128BitLaneRepeatedShuffleMask(VT, Mask, RepeatedMask))
    return -1;

  int Rotation = matchVectorShuffleAsRotate(V1, V2, RepeatedMask);
  if (Rotation <= 0)
    return -1;

  // PALIGNR rotates bytes, so we need to scale the
  // rotation based on how many bytes are in the vector lane.
  int NumElts = RepeatedMask.size();
  int Scale = 16 / NumElts;
  return Rotation * Scale;
}

static SDValue lowerVectorShuffleAsByteRotate(const SDLoc &DL, MVT VT,
                                              SDValue V1, SDValue V2,
                                              ArrayRef<int> Mask,
                                              const X86Subtarget &Subtarget,
                                              SelectionDAG &DAG) {
  assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!");

  SDValue Lo = V1, Hi = V2;
  int ByteRotation = matchVectorShuffleAsByteRotate(VT, Lo, Hi, Mask);
  if (ByteRotation <= 0)
    return SDValue();

  // Cast the inputs to i8 vector of correct length to match PALIGNR or
  // PSLLDQ/PSRLDQ.
  MVT ByteVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
  Lo = DAG.getBitcast(ByteVT, Lo);
  Hi = DAG.getBitcast(ByteVT, Hi);

  // SSSE3 targets can use the palignr instruction.
  if (Subtarget.hasSSSE3()) {
    assert((!VT.is512BitVector() || Subtarget.hasBWI()) &&
           "512-bit PALIGNR requires BWI instructions");
    return DAG.getBitcast(
        VT, DAG.getNode(X86ISD::PALIGNR, DL, ByteVT, Lo, Hi,
                        DAG.getConstant(ByteRotation, DL, MVT::i8)));
  }

  assert(VT.is128BitVector() &&
         "Rotate-based lowering only supports 128-bit lowering!");
  assert(Mask.size() <= 16 &&
         "Can shuffle at most 16 bytes in a 128-bit vector!");
  assert(ByteVT == MVT::v16i8 &&
         "SSE2 rotate lowering only needed for v16i8!");

  // Default SSE2 implementation
  int LoByteShift = 16 - ByteRotation;
  int HiByteShift = ByteRotation;

  SDValue LoShift = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Lo,
                                DAG.getConstant(LoByteShift, DL, MVT::i8));
  SDValue HiShift = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, Hi,
                                DAG.getConstant(HiByteShift, DL, MVT::i8));
  return DAG.getBitcast(VT,
                        DAG.getNode(ISD::OR, DL, MVT::v16i8, LoShift, HiShift));
}
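
To make the SSE2 fallback concrete, here is a minimal standalone sketch (plain C++, not the LLVM API): shift Lo up by 16 - Rotation bytes, shift Hi down by Rotation bytes, and OR the halves together. The self-check reads the same bytes directly out of the 32-byte concatenation, which is what a single PALIGNR would produce.

// Illustrative model of the PSLLDQ/PSRLDQ/POR byte-rotation fallback.
#include <cassert>
#include <cstdint>

static void byteRotateSSE2(const uint8_t Lo[16], const uint8_t Hi[16],
                           int Rotation, uint8_t Out[16]) {
  for (int i = 0; i < 16; ++i) {
    uint8_t L = i >= 16 - Rotation ? Lo[i - (16 - Rotation)] : 0; // PSLLDQ
    uint8_t H = i + Rotation < 16 ? Hi[i + Rotation] : 0;         // PSRLDQ
    Out[i] = L | H;                                               // POR
  }
}

int main() {
  uint8_t Lo[16], Hi[16], Out[16];
  for (int i = 0; i < 16; ++i) {
    Hi[i] = uint8_t(i);        // bytes 0..15 of the concatenation
    Lo[i] = uint8_t(16 + i);   // bytes 16..31 of the concatenation
  }
  const int Rotation = 5;
  byteRotateSSE2(Lo, Hi, Rotation, Out);
  for (int i = 0; i < 16; ++i)
    assert(Out[i] == uint8_t(i + Rotation)); // same bytes PALIGNR would pick
  return 0;
}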

/// \brief Try to lower a vector shuffle as a dword/qword rotation.
///
/// AVX512 has VALIGND/VALIGNQ instructions that will do an arbitrary
/// rotation of the concatenation of two vectors; this routine will
/// try to generically lower a vector shuffle through such a pattern.
///
/// Essentially it concatenates V1 and V2, shifts right by some number of
/// elements, and takes the low elements as the result. Note that while this is
/// specified as a *right shift* because x86 is little-endian, it is a *left
/// rotate* of the vector lanes.
static SDValue lowerVectorShuffleAsRotate(const SDLoc &DL, MVT VT,
                                          SDValue V1, SDValue V2,
                                          ArrayRef<int> Mask,
                                          const X86Subtarget &Subtarget,
                                          SelectionDAG &DAG) {
  assert((VT.getScalarType() == MVT::i32 || VT.getScalarType() == MVT::i64) &&
         "Only 32-bit and 64-bit elements are supported!");

  // 128/256-bit vectors are only supported with VLX.
  assert((Subtarget.hasVLX() || (!VT.is128BitVector() && !VT.is256BitVector()))
         && "VLX required for 128/256-bit vectors");

  SDValue Lo = V1, Hi = V2;
  int Rotation = matchVectorShuffleAsRotate(Lo, Hi, Mask);
  if (Rotation <= 0)
    return SDValue();

  return DAG.getNode(X86ISD::VALIGN, DL, VT, Lo, Hi,
                     DAG.getConstant(Rotation, DL, MVT::i8));
}

/// \brief Try to lower a vector shuffle as a bit shift (shifts in zeros).
///
/// Attempts to match a shuffle mask against the PSLL(W/D/Q/DQ) and
/// PSRL(W/D/Q/DQ) SSE2 and AVX2 logical bit-shift instructions. The function
/// matches elements from one of the input vectors shuffled to the left or
/// right with zeroable elements 'shifted in'. It handles both the strictly
/// bit-wise element shifts and the byte shift across an entire 128-bit double
/// quad word lane.
///
/// PSHL : (little-endian) left bit shift.
/// [ zz, 0, zz,  2 ]
/// [ -1, 4, zz, -1 ]
/// PSRL : (little-endian) right bit shift.
/// [  1, zz,  3, zz]
/// [ -1, -1,  7, zz]
/// PSLLDQ : (little-endian) left byte shift
/// [ zz,  0,  1,  2,  3,  4,  5,  6]
/// [ zz, zz, -1, -1,  2,  3,  4, -1]
/// [ zz, zz, zz, zz, zz, zz, -1,  1]
/// PSRLDQ : (little-endian) right byte shift
/// [  5, 6,  7, zz, zz, zz, zz, zz]
/// [ -1, 5,  6,  7, zz, zz, zz, zz]
/// [  1, 2, -1, -1, -1, -1, zz, zz]
static int matchVectorShuffleAsShift(MVT &ShiftVT, unsigned &Opcode,
                                     unsigned ScalarSizeInBits,
                                     ArrayRef<int> Mask, int MaskOffset,
                                     const APInt &Zeroable,
                                     const X86Subtarget &Subtarget) {
  int Size = Mask.size();
  unsigned SizeInBits = Size * ScalarSizeInBits;

  auto CheckZeros = [&](int Shift, int Scale, bool Left) {
    for (int i = 0; i < Size; i += Scale)
      for (int j = 0; j < Shift; ++j)
        if (!Zeroable[i + j + (Left ? 0 : (Scale - Shift))])
          return false;

    return true;
  };

  auto MatchShift = [&](int Shift, int Scale, bool Left) {
    for (int i = 0; i != Size; i += Scale) {
      unsigned Pos = Left ? i + Shift : i;
      unsigned Low = Left ? i : i + Shift;
      unsigned Len = Scale - Shift;
      if (!isSequentialOrUndefInRange(Mask, Pos, Len, Low + MaskOffset))
        return -1;
    }

    int ShiftEltBits = ScalarSizeInBits * Scale;
    bool ByteShift = ShiftEltBits > 64;
    Opcode = Left ? (ByteShift ? X86ISD::VSHLDQ : X86ISD::VSHLI)
                  : (ByteShift ? X86ISD::VSRLDQ : X86ISD::VSRLI);
    int ShiftAmt = Shift * ScalarSizeInBits / (ByteShift ? 8 : 1);

    // Normalize the scale for byte shifts to still produce an i64 element
    // type.
    Scale = ByteShift ? Scale / 2 : Scale;

    // We need to round trip through the appropriate type for the shift.
    MVT ShiftSVT = MVT::getIntegerVT(ScalarSizeInBits * Scale);
    ShiftVT = ByteShift ? MVT::getVectorVT(MVT::i8, SizeInBits / 8)
                        : MVT::getVectorVT(ShiftSVT, Size / Scale);
    return (int)ShiftAmt;
  };

  // SSE/AVX supports logical shifts up to 64-bit integers - so we can just
  // keep doubling the size of the integer elements up to that. We can
  // then shift the elements of the integer vector by whole multiples of
  // their width within the elements of the larger integer vector. Test each
  // multiple to see if we can find a match with the moved element indices
  // and that the shifted in elements are all zeroable.
  unsigned MaxWidth = ((SizeInBits == 512) && !Subtarget.hasBWI() ? 64 : 128);
  for (int Scale = 2; Scale * ScalarSizeInBits <= MaxWidth; Scale *= 2)
    for (int Shift = 1; Shift != Scale; ++Shift)
      for (bool Left : {true, false})
        if (CheckZeros(Shift, Scale, Left)) {
          int ShiftAmt = MatchShift(Shift, Scale, Left);
          if (0 < ShiftAmt)
            return ShiftAmt;
        }

  // no match
  return -1;
}
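
The outer loops above try every power-of-two regrouping of the elements; the core test at one grouping is simple. The following is a minimal standalone sketch (plain C++, not the LLVM API) of just the full-width case, i.e. the PSLLDQ/PSRLDQ-style whole-vector shift: the shifted-in lanes must be zeroable and the rest must be one sequential run from the single input.

// Illustrative sketch of the byte-shift special case of the matcher above.
// Mask entries >= 0 select that element of the input and -1 is undef;
// Zeroable marks lanes known to be zero.
#include <cassert>
#include <vector>

struct ShiftMatch { bool Left; int Amount; }; // PSLLDQ if Left, else PSRLDQ

static bool matchByteShift(const std::vector<int> &Mask,
                           const std::vector<bool> &Zeroable,
                           ShiftMatch &Out) {
  int Size = (int)Mask.size();
  for (int Shift = 1; Shift < Size; ++Shift)
    for (bool Left : {true, false}) {
      // Shifted-in lanes must be zeroable: the low lanes for a left shift,
      // the high lanes for a right shift (little-endian indexing).
      bool ZerosOk = true;
      for (int j = 0; j < Shift && ZerosOk; ++j)
        ZerosOk = Zeroable[Left ? j : Size - Shift + j];
      if (!ZerosOk)
        continue;
      // Remaining lanes must be a sequential run from the input (or undef).
      bool SeqOk = true;
      for (int i = 0; i < Size - Shift && SeqOk; ++i) {
        int Pos = Left ? i + Shift : i;
        int Low = Left ? i : i + Shift;
        SeqOk = Mask[Pos] < 0 || Mask[Pos] == Low;
      }
      if (SeqOk) {
        Out = {Left, Shift};
        return true;
      }
    }
  return false;
}

int main() {
  // [ zz, 0, 1, 2 ] is a left shift by one element with a zero shifted in.
  std::vector<int> M = {-1, 0, 1, 2};
  std::vector<bool> Z = {true, false, false, false};
  ShiftMatch R;
  assert(matchByteShift(M, Z, R) && R.Left && R.Amount == 1);
  return 0;
}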

static SDValue lowerVectorShuffleAsShift(const SDLoc &DL, MVT VT, SDValue V1,
                                         SDValue V2, ArrayRef<int> Mask,
                                         const APInt &Zeroable,
                                         const X86Subtarget &Subtarget,
                                         SelectionDAG &DAG) {
  int Size = Mask.size();
  assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size");

  MVT ShiftVT;
  SDValue V = V1;
  unsigned Opcode;

  // Try to match shuffle against V1 shift.
  int ShiftAmt = matchVectorShuffleAsShift(
      ShiftVT, Opcode, VT.getScalarSizeInBits(), Mask, 0, Zeroable, Subtarget);

  // If V1 failed, try to match shuffle against V2 shift.
  if (ShiftAmt < 0) {
    ShiftAmt =
        matchVectorShuffleAsShift(ShiftVT, Opcode, VT.getScalarSizeInBits(),
                                  Mask, Size, Zeroable, Subtarget);
    V = V2;
  }

  if (ShiftAmt < 0)
    return SDValue();

  assert(DAG.getTargetLoweringInfo().isTypeLegal(ShiftVT) &&
         "Illegal integer vector type");
  V = DAG.getBitcast(ShiftVT, V);
  V = DAG.getNode(Opcode, DL, ShiftVT, V,
                  DAG.getConstant(ShiftAmt, DL, MVT::i8));
  return DAG.getBitcast(VT, V);
}

// EXTRQ: Extract Len elements from lower half of source, starting at Idx.
// Remainder of lower half result is zero and upper half is all undef.
static bool matchVectorShuffleAsEXTRQ(MVT VT, SDValue &V1, SDValue &V2,
                                      ArrayRef<int> Mask, uint64_t &BitLen,
                                      uint64_t &BitIdx, const APInt &Zeroable) {
  int Size = Mask.size();
  int HalfSize = Size / 2;
  assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size");
  assert(!Zeroable.isAllOnesValue() && "Fully zeroable shuffle mask");

  // Upper half must be undefined.
  if (!isUndefInRange(Mask, HalfSize, HalfSize))
    return false;

  // Determine the extraction length from the part of the
  // lower half that isn't zeroable.
  int Len = HalfSize;
  for (; Len > 0; --Len)
    if (!Zeroable[Len - 1])
      break;
  assert(Len > 0 && "Zeroable shuffle mask");

  // Attempt to match first Len sequential elements from the lower half.
  SDValue Src;
  int Idx = -1;
  for (int i = 0; i != Len; ++i) {
    int M = Mask[i];
    if (M == SM_SentinelUndef)
      continue;
    SDValue &V = (M < Size ? V1 : V2);
    M = M % Size;

    // The extracted elements must start at a valid index and all mask
    // elements must be in the lower half.
    if (i > M || M >= HalfSize)
      return false;

    if (Idx < 0 || (Src == V && Idx == (M - i))) {
      Src = V;
      Idx = M - i;
      continue;
    }
    return false;
  }

  if (!Src || Idx < 0)
    return false;

  assert((Idx + Len) <= HalfSize && "Illegal extraction mask");
  BitLen = (Len * VT.getScalarSizeInBits()) & 0x3f;
  BitIdx = (Idx * VT.getScalarSizeInBits()) & 0x3f;
  V1 = Src;
  return true;
}
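
As a reminder of what the matched BitLen/BitIdx mean, here is a small standalone model (plain C++, not the LLVM API) of EXTRQ on the low 64 bits: extract BitLen bits starting at BitIdx and zero the rest of the low half, with the upper half left undefined. A length field of 0 is interpreted as 64 bits, which is why the matcher above masks BitLen with 0x3f.

// Illustrative model of the EXTRQ field extraction (assumes SSE4A semantics).
#include <cassert>
#include <cstdint>

static uint64_t extrq(uint64_t SrcLo, unsigned BitLen, unsigned BitIdx) {
  unsigned Len = BitLen ? BitLen : 64;       // 0 encodes a full 64-bit field
  uint64_t Field = SrcLo >> BitIdx;          // start at BitIdx
  if (Len < 64)
    Field &= (UINT64_C(1) << Len) - 1;       // keep Len bits, zero the rest
  return Field;                              // upper xmm half stays undef
}

int main() {
  // Extracting 2 x i16 starting at element 1 corresponds to BitLen = 32,
  // BitIdx = 16 as computed by the matcher above.
  uint64_t Lo = 0x4444333322221111ULL;
  assert(extrq(Lo, 32, 16) == 0x0000000033332222ULL);
  return 0;
}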

// INSERTQ: Extract lowest Len elements from lower half of second source and
// insert over first source, starting at Idx.
// { A[0], .., A[Idx-1], B[0], .., B[Len-1], A[Idx+Len], .., UNDEF, ... }
static bool matchVectorShuffleAsINSERTQ(MVT VT, SDValue &V1, SDValue &V2,
                                        ArrayRef<int> Mask, uint64_t &BitLen,
                                        uint64_t &BitIdx) {
  int Size = Mask.size();
  int HalfSize = Size / 2;
  assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size");

  // Upper half must be undefined.
  if (!isUndefInRange(Mask, HalfSize, HalfSize))
    return false;

  for (int Idx = 0; Idx != HalfSize; ++Idx) {
    SDValue Base;

    // Attempt to match first source from mask before insertion point.
    if (isUndefInRange(Mask, 0, Idx)) {
      /* EMPTY */
    } else if (isSequentialOrUndefInRange(Mask, 0, Idx, 0)) {
      Base = V1;
    } else if (isSequentialOrUndefInRange(Mask, 0, Idx, Size)) {
      Base = V2;
    } else {
      continue;
    }

    // Extend the extraction length looking to match both the insertion of
    // the second source and the remaining elements of the first.
    for (int Hi = Idx + 1; Hi <= HalfSize; ++Hi) {
      SDValue Insert;
      int Len = Hi - Idx;

      // Match insertion.
      if (isSequentialOrUndefInRange(Mask, Idx, Len, 0)) {
        Insert = V1;
      } else if (isSequentialOrUndefInRange(Mask, Idx, Len, Size)) {
        Insert = V2;
      } else {
        continue;
      }

      // Match the remaining elements of the lower half.
      if (isUndefInRange(Mask, Hi, HalfSize - Hi)) {
        /* EMPTY */
      } else if ((!Base || (Base == V1)) &&
                 isSequentialOrUndefInRange(Mask, Hi, HalfSize - Hi, Hi)) {
        Base = V1;
      } else if ((!Base || (Base == V2)) &&
                 isSequentialOrUndefInRange(Mask, Hi, HalfSize - Hi,
                                            Size + Hi)) {
        Base = V2;
      } else {
        continue;
      }

      BitLen = (Len * VT.getScalarSizeInBits()) & 0x3f;
      BitIdx = (Idx * VT.getScalarSizeInBits()) & 0x3f;
      V1 = Base;
      V2 = Insert;
      return true;
    }
  }

  return false;
}

/// \brief Try to lower a vector shuffle using SSE4a EXTRQ/INSERTQ.
static SDValue lowerVectorShuffleWithSSE4A(const SDLoc &DL, MVT VT, SDValue V1,
                                           SDValue V2, ArrayRef<int> Mask,
                                           const APInt &Zeroable,
                                           SelectionDAG &DAG) {
  uint64_t BitLen, BitIdx;
  if (matchVectorShuffleAsEXTRQ(VT, V1, V2, Mask, BitLen, BitIdx, Zeroable))
    return DAG.getNode(X86ISD::EXTRQI, DL, VT, V1,
                       DAG.getConstant(BitLen, DL, MVT::i8),
                       DAG.getConstant(BitIdx, DL, MVT::i8));

  if (matchVectorShuffleAsINSERTQ(VT, V1, V2, Mask, BitLen, BitIdx))
    return DAG.getNode(X86ISD::INSERTQI, DL, VT, V1 ? V1 : DAG.getUNDEF(VT),
                       V2 ? V2 : DAG.getUNDEF(VT),
                       DAG.getConstant(BitLen, DL, MVT::i8),
                       DAG.getConstant(BitIdx, DL, MVT::i8));

  return SDValue();
}
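
A companion sketch for the INSERTQ side (plain C++, illustrative only): take the low BitLen bits of the second source and insert them into the first source at BitIdx, leaving the rest of the first source's low 64 bits intact, again with a length field of 0 standing for 64 bits.

// Illustrative model of the INSERTQ field insertion (assumes SSE4A semantics).
#include <cassert>
#include <cstdint>

static uint64_t insertq(uint64_t DstLo, uint64_t SrcLo,
                        unsigned BitLen, unsigned BitIdx) {
  unsigned Len = BitLen ? BitLen : 64;
  uint64_t FieldMask = Len < 64 ? (UINT64_C(1) << Len) - 1 : ~UINT64_C(0);
  uint64_t Field = SrcLo & FieldMask;                 // low Len bits of B
  return (DstLo & ~(FieldMask << BitIdx)) | (Field << BitIdx);
}

int main() {
  // Insert one i16 element (BitLen = 16) from B over element 2 of A
  // (BitIdx = 32), i.e. the v8i16 mask { A0, A1, B0, A3, undef x4 }.
  uint64_t A = 0x4444333322221111ULL, B = 0x00000000000099AAULL;
  assert(insertq(A, B, 16, 32) == 0x444499AA22221111ULL);
  return 0;
}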
9412
9413
/// \brief Lower a vector shuffle as a zero or any extension.
9414
///
9415
/// Given a specific number of elements, element bit width, and extension
9416
/// stride, produce either a zero or any extension based on the available
9417
/// features of the subtarget. The extended elements are consecutive and
9418
/// begin and can start from an offsetted element index in the input; to
9419
/// avoid excess shuffling the offset must either being in the bottom lane
9420
/// or at the start of a higher lane. All extended elements must be from
9421
/// the same lane.
9422
static SDValue lowerVectorShuffleAsSpecificZeroOrAnyExtend(
9423
    const SDLoc &DL, MVT VT, int Scale, int Offset, bool AnyExt, SDValue InputV,
9424
1.63k
    ArrayRef<int> Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG) {
9425
1.63k
  assert(Scale > 1 && "Need a scale to extend.");
9426
1.63k
  int EltBits = VT.getScalarSizeInBits();
9427
1.63k
  int NumElements = VT.getVectorNumElements();
9428
1.63k
  int NumEltsPerLane = 128 / EltBits;
9429
1.63k
  int OffsetLane = Offset / NumEltsPerLane;
9430
1.63k
  assert((EltBits == 8 || EltBits == 16 || EltBits == 32) &&
9431
1.63k
         "Only 8, 16, and 32 bit elements can be extended.");
9432
1.63k
  assert(Scale * EltBits <= 64 && "Cannot zero extend past 64 bits.");
9433
1.63k
  assert(0 <= Offset && "Extension offset must be positive.");
9434
1.63k
  assert((Offset < NumEltsPerLane || Offset % NumEltsPerLane == 0) &&
9435
1.63k
         "Extension offset must be in the first lane or start an upper lane.");
9436
1.63k
9437
1.63k
  // Check that an index is in same lane as the base offset.
9438
767
  auto SafeOffset = [&](int Idx) {
9439
767
    return OffsetLane == (Idx / NumEltsPerLane);
9440
767
  };
9441
1.63k
9442
1.63k
  // Shift along an input so that the offset base moves to the first element.
9443
638
  auto ShuffleOffset = [&](SDValue V) {
9444
638
    if (!Offset)
9445
449
      return V;
9446
189
9447
189
    SmallVector<int, 8> ShMask((unsigned)NumElements, -1);
9448
843
    for (int i = 0; 
i * Scale < NumElements843
;
++i654
) {
9449
654
      int SrcIdx = i + Offset;
9450
654
      ShMask[i] = SafeOffset(SrcIdx) ? 
SrcIdx642
:
-112
;
9451
654
    }
9452
638
    return DAG.getVectorShuffle(VT, DL, V, DAG.getUNDEF(VT), ShMask);
9453
638
  };
9454
1.63k
9455
1.63k
  // Found a valid zext mask! Try various lowering strategies based on the
9456
1.63k
  // input type and available ISA extensions.
9457
1.63k
  if (
Subtarget.hasSSE41()1.63k
) {
9458
685
    // Not worth offsetting 128-bit vectors if scale == 2, a pattern using
9459
685
    // PUNPCK will catch this in a later shuffle match.
9460
685
    if (
Offset && 685
Scale == 2236
&&
VT.is128BitVector()72
)
9461
47
      return SDValue();
9462
638
    MVT ExtVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits * Scale),
9463
638
                                 NumElements / Scale);
9464
638
    InputV = ShuffleOffset(InputV);
9465
638
    InputV = getExtendInVec(X86ISD::VZEXT, DL, ExtVT, InputV, DAG);
9466
638
    return DAG.getBitcast(VT, InputV);
9467
638
  }
9468
950
9469
1.63k
  assert(VT.is128BitVector() && "Only 128-bit vectors can be extended.");
9470
950
9471
950
  // For any extends we can cheat for larger element sizes and use shuffle
9472
950
  // instructions that can fold with a load and/or copy.
9473
950
  if (
AnyExt && 950
EltBits == 32166
) {
9474
59
    int PSHUFDMask[4] = {Offset, -1, SafeOffset(Offset + 1) ? 
Offset + 159
:
-10
,
9475
59
                         -1};
9476
59
    return DAG.getBitcast(
9477
59
        VT, DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32,
9478
59
                        DAG.getBitcast(MVT::v4i32, InputV),
9479
59
                        getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
9480
59
  }
9481
891
  
if (891
AnyExt && 891
EltBits == 16107
&&
Scale > 246
) {
9482
8
    int PSHUFDMask[4] = {Offset / 2, -1,
9483
8
                         SafeOffset(Offset + 1) ? 
(Offset + 1) / 28
:
-10
, -1};
9484
8
    InputV = DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32,
9485
8
                         DAG.getBitcast(MVT::v4i32, InputV),
9486
8
                         getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG));
9487
8
    int PSHUFWMask[4] = {1, -1, -1, -1};
9488
8
    unsigned OddEvenOp = (Offset & 1 ? 
X86ISD::PSHUFLW0
:
X86ISD::PSHUFHW8
);
9489
8
    return DAG.getBitcast(
9490
8
        VT, DAG.getNode(OddEvenOp, DL, MVT::v8i16,
9491
8
                        DAG.getBitcast(MVT::v8i16, InputV),
9492
8
                        getV4X86ShuffleImm8ForMask(PSHUFWMask, DL, DAG)));
9493
8
  }
9494
883
9495
883
  // The SSE4A EXTRQ instruction can efficiently extend the first 2 lanes
9496
883
  // to 64-bits.
9497
883
  
if (883
(Scale * EltBits) == 64 && 883
EltBits < 32180
&&
Subtarget.hasSSE4A()90
) {
9498
12
    assert(NumElements == (int)Mask.size() && "Unexpected shuffle mask size!");
9499
12
    assert(VT.is128BitVector() && "Unexpected vector width!");
9500
12
9501
12
    int LoIdx = Offset * EltBits;
9502
12
    SDValue Lo = DAG.getBitcast(
9503
12
        MVT::v2i64, DAG.getNode(X86ISD::EXTRQI, DL, VT, InputV,
9504
12
                                DAG.getConstant(EltBits, DL, MVT::i8),
9505
12
                                DAG.getConstant(LoIdx, DL, MVT::i8)));
9506
12
9507
12
    if (isUndefInRange(Mask, NumElements / 2, NumElements / 2) ||
9508
8
        !SafeOffset(Offset + 1))
9509
4
      return DAG.getBitcast(VT, Lo);
9510
8
9511
8
    int HiIdx = (Offset + 1) * EltBits;
9512
8
    SDValue Hi = DAG.getBitcast(
9513
8
        MVT::v2i64, DAG.getNode(X86ISD::EXTRQI, DL, VT, InputV,
9514
8
                                DAG.getConstant(EltBits, DL, MVT::i8),
9515
8
                                DAG.getConstant(HiIdx, DL, MVT::i8)));
9516
8
    return DAG.getBitcast(VT,
9517
8
                          DAG.getNode(X86ISD::UNPCKL, DL, MVT::v2i64, Lo, Hi));
9518
8
  }
9519
871
9520
871
  // If this would require more than 2 unpack instructions to expand, use
9521
871
  // pshufb when available. We can only use more than 2 unpack instructions
9522
871
  // when zero extending i8 elements which also makes it easier to use pshufb.
9523
871
  
if (871
Scale > 4 && 871
EltBits == 840
&&
Subtarget.hasSSSE3()40
) {
9524
19
    assert(NumElements == 16 && "Unexpected byte vector width!");
9525
19
    SDValue PSHUFBMask[16];
9526
323
    for (int i = 0; 
i < 16323
;
++i304
) {
9527
304
      int Idx = Offset + (i / Scale);
9528
304
      PSHUFBMask[i] = DAG.getConstant(
9529
304
          (i % Scale == 0 && 
SafeOffset(Idx)38
) ?
Idx38
:
0x80266
, DL, MVT::i8);
9530
304
    }
9531
19
    InputV = DAG.getBitcast(MVT::v16i8, InputV);
9532
19
    return DAG.getBitcast(
9533
19
        VT, DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8, InputV,
9534
19
                        DAG.getBuildVector(MVT::v16i8, DL, PSHUFBMask)));
9535
19
  }
9536
852
9537
852
  // If we are extending from an offset, ensure we start on a boundary that
9538
852
  // we can unpack from.
9539
852
  int AlignToUnpack = Offset % (NumElements / Scale);
9540
852
  if (
AlignToUnpack852
) {
9541
9
    SmallVector<int, 8> ShMask((unsigned)NumElements, -1);
9542
82
    for (int i = AlignToUnpack; 
i < NumElements82
;
++i73
)
9543
73
      ShMask[i - AlignToUnpack] = i;
9544
9
    InputV = DAG.getVectorShuffle(VT, DL, InputV, DAG.getUNDEF(VT), ShMask);
9545
9
    Offset -= AlignToUnpack;
9546
9
  }
9547
852
9548
852
  // Otherwise emit a sequence of unpacks.
9549
1.31k
  do {
9550
1.31k
    unsigned UnpackLoHi = X86ISD::UNPCKL;
9551
1.31k
    if (
Offset >= (NumElements / 2)1.31k
) {
9552
268
      UnpackLoHi = X86ISD::UNPCKH;
9553
268
      Offset -= (NumElements / 2);
9554
268
    }
9555
1.31k
9556
1.31k
    MVT InputVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits), NumElements);
9557
134
    SDValue Ext = AnyExt ? DAG.getUNDEF(InputVT)
9558
1.18k
                         : getZeroVector(InputVT, Subtarget, DAG, DL);
9559
1.31k
    InputV = DAG.getBitcast(InputVT, InputV);
9560
1.31k
    InputV = DAG.getNode(UnpackLoHi, DL, InputVT, InputV, Ext);
9561
1.31k
    Scale /= 2;
9562
1.31k
    EltBits *= 2;
9563
1.31k
    NumElements /= 2;
9564
1.31k
  } while (Scale > 1);
9565
1.63k
  return DAG.getBitcast(VT, InputV);
9566
1.63k
}
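
The do/while loop at the end of lowerVectorShuffleAsSpecificZeroOrAnyExtend widens elements by repeatedly interleaving the input with zeros (or undef for an any-extend) via UNPCKL/UNPCKH. A minimal standalone sketch of that idea (plain C++, not the LLVM API, little-endian host assumed) zero-extends sixteen u8 lanes to four u32 lanes with two rounds of "unpack low with zero", i.e. the Scale = 4 case.

// Illustrative sketch of zero-extension by repeated interleave-with-zero.
#include <cassert>
#include <cstdint>
#include <cstring>
#include <vector>

// Interleave the low-half elements (ElemBytes wide) of In with zero elements,
// modelling a PUNPCKL* of the vector with an all-zero vector.
static std::vector<uint8_t> unpackLoWithZero(const std::vector<uint8_t> &In,
                                             size_t ElemBytes) {
  std::vector<uint8_t> Out(In.size(), 0);
  size_t NumElts = In.size() / ElemBytes;
  for (size_t i = 0; i < NumElts / 2; ++i)
    for (size_t b = 0; b < ElemBytes; ++b)
      Out[2 * i * ElemBytes + b] = In[i * ElemBytes + b];
  return Out;
}

int main() {
  std::vector<uint8_t> V(16);
  for (int i = 0; i < 16; ++i)
    V[i] = uint8_t(0x10 + i);

  // Round 1 models PUNPCKLBW with zero (u8 -> u16), round 2 PUNPCKLWD with
  // zero (u16 -> u32); only the low four input bytes survive, zero-extended.
  std::vector<uint8_t> W = unpackLoWithZero(unpackLoWithZero(V, 1), 2);

  for (int i = 0; i < 4; ++i) {
    uint32_t Lane;
    std::memcpy(&Lane, W.data() + 4 * i, 4);
    assert(Lane == uint32_t(0x10 + i)); // each byte zero-extended to 32 bits
  }
  return 0;
}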
9567
9568
/// \brief Try to lower a vector shuffle as a zero extension on any microarch.
9569
///
9570
/// This routine will try to do everything in its power to cleverly lower
9571
/// a shuffle which happens to match the pattern of a zero extend. It doesn't
9572
/// check for the profitability of this lowering,  it tries to aggressively
9573
/// match this pattern. It will use all of the micro-architectural details it
9574
/// can to emit an efficient lowering. It handles both blends with all-zero
9575
/// inputs to explicitly zero-extend and undef-lanes (sometimes undef due to
9576
/// masking out later).
9577
///
9578
/// The reason we have dedicated lowering for zext-style shuffles is that they
9579
/// are both incredibly common and often quite performance sensitive.
9580
static SDValue lowerVectorShuffleAsZeroOrAnyExtend(
9581
    const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
9582
    const APInt &Zeroable, const X86Subtarget &Subtarget,
9583
19.4k
    SelectionDAG &DAG) {
9584
19.4k
  int Bits = VT.getSizeInBits();
9585
19.4k
  int NumLanes = Bits / 128;
9586
19.4k
  int NumElements = VT.getVectorNumElements();
9587
19.4k
  int NumEltsPerLane = NumElements / NumLanes;
9588
19.4k
  assert(VT.getScalarSizeInBits() <= 32 &&
9589
19.4k
         "Exceeds 32-bit integer zero extension limit");
9590
19.4k
  assert((int)Mask.size() == NumElements && "Unexpected shuffle mask size");
9591
19.4k
9592
19.4k
  // Define a helper function to check a particular ext-scale and lower to it if
9593
19.4k
  // valid.
9594
36.8k
  auto Lower = [&](int Scale) -> SDValue {
9595
36.8k
    SDValue InputV;
9596
36.8k
    bool AnyExt = true;
9597
36.8k
    int Offset = 0;
9598
36.8k
    int Matches = 0;
9599
117k
    for (int i = 0; 
i < NumElements117k
;
++i80.4k
) {
9600
114k
      int M = Mask[i];
9601
114k
      if (M < 0)
9602
21.6k
        continue; // Valid anywhere but doesn't tell us anything.
9603
93.1k
      
if (93.1k
i % Scale != 093.1k
) {
9604
50.1k
        // Each of the extended elements need to be zeroable.
9605
50.1k
        if (!Zeroable[i])
9606
30.5k
          return SDValue();
9607
19.6k
9608
19.6k
        // We no longer are in the anyext case.
9609
19.6k
        AnyExt = false;
9610
19.6k
        continue;
9611
19.6k
      }
9612
43.0k
9613
43.0k
      // Each of the base elements needs to be consecutive indices into the
9614
43.0k
      // same input vector.
9615
43.0k
      
SDValue V = M < NumElements ? 43.0k
V135.1k
:
V27.91k
;
9616
43.0k
      M = M % NumElements;
9617
43.0k
      if (
!InputV43.0k
) {
9618
35.0k
        InputV = V;
9619
35.0k
        Offset = M - (i / Scale);
9620
43.0k
      } else 
if (7.96k
InputV != V7.96k
)
9621
1.32k
        return SDValue(); // Flip-flopping inputs.
9622
41.6k
9623
41.6k
      // Offset must start in the lowest 128-bit lane or at the start of an
9624
41.6k
      // upper lane.
9625
41.6k
      // FIXME: Is it ever worth allowing a negative base offset?
9626
41.6k
      
if (41.6k
!((0 <= Offset && 41.6k
Offset < NumEltsPerLane41.2k
) ||
9627
1.05k
            (Offset % NumEltsPerLane) == 0))
9628
716
        return SDValue();
9629
40.9k
9630
40.9k
      // If we are offsetting, all referenced entries must come from the same
9631
40.9k
      // lane.
9632
40.9k
      
if (40.9k
Offset && 40.9k
(Offset / NumEltsPerLane) != (M / NumEltsPerLane)13.1k
)
9633
51
        return SDValue();
9634
40.9k
9635
40.9k
      
if (40.9k
(M % NumElements) != (Offset + (i / Scale))40.9k
)
9636
1.73k
        return SDValue(); // Non-consecutive strided elements.
9637
39.1k
      Matches++;
9638
39.1k
    }
9639
36.8k
9640
36.8k
    // If we fail to find an input, we have a zero-shuffle which should always
9641
36.8k
    // have already been handled.
9642
36.8k
    // FIXME: Maybe handle this here in case during blending we end up with one?
9643
2.50k
    
if (2.50k
!InputV2.50k
)
9644
0
      return SDValue();
9645
2.50k
9646
2.50k
    // If we are offsetting, don't extend if we only match a single input, we
9647
2.50k
    // can always do better by using a basic PSHUF or PUNPCK.
9648
2.50k
    
if (2.50k
Offset != 0 && 2.50k
Matches < 21.39k
)
9649
870
      return SDValue();
9650
1.63k
9651
1.63k
    return lowerVectorShuffleAsSpecificZeroOrAnyExtend(
9652
1.63k
        DL, VT, Scale, Offset, AnyExt, InputV, Mask, Subtarget, DAG);
9653
1.63k
  };
9654
19.4k
9655
19.4k
  // The widest scale possible for extending is to a 64-bit integer.
9656
19.4k
  assert(Bits % 64 == 0 &&
9657
19.4k
         "The number of bits in a vector must be divisible by 64 on x86!");
9658
19.4k
  int NumExtElements = Bits / 64;
9659
19.4k
9660
19.4k
  // Each iteration, try extending the elements half as much, but into twice as
9661
19.4k
  // many elements.
9662
54.7k
  for (; 
NumExtElements < NumElements54.7k
;
NumExtElements *= 235.2k
) {
9663
36.8k
    assert(NumElements % NumExtElements == 0 &&
9664
36.8k
           "The input vector size must be divisible by the extended size.");
9665
36.8k
    if (SDValue V = Lower(NumElements / NumExtElements))
9666
1.58k
      return V;
9667
36.8k
  }
9668
19.4k
9669
19.4k
  // General extends failed, but 128-bit vectors may be able to use MOVQ.
9670
17.8k
  
if (17.8k
Bits != 12817.8k
)
9671
4.80k
    return SDValue();
9672
13.0k
9673
13.0k
  // Returns one of the source operands if the shuffle can be reduced to a
9674
13.0k
  // MOVQ, copying the lower 64-bits and zero-extending to the upper 64-bits.
9675
13.0k
  
auto CanZExtLowHalf = [&]() 13.0k
{
9676
33.7k
    for (int i = NumElements / 2; 
i != NumElements33.7k
;
++i20.7k
)
9677
29.0k
      
if (29.0k
!Zeroable[i]29.0k
)
9678
8.33k
        return SDValue();
9679
4.70k
    
if (4.70k
isSequentialOrUndefInRange(Mask, 0, NumElements / 2, 0)4.70k
)
9680
0
      return V1;
9681
4.70k
    
if (4.70k
isSequentialOrUndefInRange(Mask, 0, NumElements / 2, NumElements)4.70k
)
9682
0
      return V2;
9683
4.70k
    return SDValue();
9684
4.70k
  };
9685
13.0k
9686
13.0k
  if (SDValue 
V13.0k
= CanZExtLowHalf()) {
9687
0
    V = DAG.getBitcast(MVT::v2i64, V);
9688
0
    V = DAG.getNode(X86ISD::VZEXT_MOVL, DL, MVT::v2i64, V);
9689
0
    return DAG.getBitcast(VT, V);
9690
0
  }
9691
13.0k
9692
13.0k
  // No viable ext lowering found.
9693
13.0k
  return SDValue();
9694
13.0k
}
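
The Lower lambda in lowerVectorShuffleAsZeroOrAnyExtend accepts a mask when every element between the base positions is zeroable and the base positions themselves select consecutive source elements. A minimal standalone sketch of that test (plain C++, not the LLVM API; single input, no offset handling) is:

// Illustrative zext-mask test: -1 is undef, Zeroable marks known-zero lanes.
#include <cassert>
#include <vector>

static bool isZExtMask(const std::vector<int> &Mask,
                       const std::vector<bool> &Zeroable, int Scale) {
  for (int i = 0, e = (int)Mask.size(); i < e; ++i) {
    int M = Mask[i];
    if (M < 0)
      continue; // undef is fine anywhere
    if (i % Scale != 0) {
      if (!Zeroable[i])
        return false; // the high parts of each widened lane must be zero
      continue;
    }
    if (M != i / Scale)
      return false; // base elements must be consecutive from element 0
  }
  return true;
}

int main() {
  // v8i16 -> v4i32 zero extension: [0, zz, 1, zz, 2, zz, 3, zz].
  std::vector<int> Mask = {0, -1, 1, -1, 2, -1, 3, -1};
  std::vector<bool> Zeroable = {false, true, false, true,
                                false, true, false, true};
  assert(isZExtMask(Mask, Zeroable, 2));
  assert(!isZExtMask(Mask, Zeroable, 4));
  return 0;
}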

/// \brief Try to get a scalar value for a specific element of a vector.
///
/// Looks through BUILD_VECTOR and SCALAR_TO_VECTOR nodes to find a scalar.
static SDValue getScalarValueForVectorElement(SDValue V, int Idx,
                                              SelectionDAG &DAG) {
  MVT VT = V.getSimpleValueType();
  MVT EltVT = VT.getVectorElementType();
  V = peekThroughBitcasts(V);

  // If the bitcasts shift the element size, we can't extract an equivalent
  // element from it.
  MVT NewVT = V.getSimpleValueType();
  if (!NewVT.isVector() ||
      NewVT.getScalarSizeInBits() != VT.getScalarSizeInBits())
    return SDValue();

  if (V.getOpcode() == ISD::BUILD_VECTOR ||
      (Idx == 0 && V.getOpcode() == ISD::SCALAR_TO_VECTOR)) {
    // Ensure the scalar operand is the same size as the destination.
    // FIXME: Add support for scalar truncation where possible.
    SDValue S = V.getOperand(Idx);
    if (EltVT.getSizeInBits() == S.getSimpleValueType().getSizeInBits())
      return DAG.getBitcast(EltVT, S);
  }

  return SDValue();
}

/// \brief Helper to test for a load that can be folded with x86 shuffles.
///
/// This is particularly important because the set of instructions varies
/// significantly based on whether the operand is a load or not.
static bool isShuffleFoldableLoad(SDValue V) {
  V = peekThroughBitcasts(V);
  return ISD::isNON_EXTLoad(V.getNode());
}
9731
9732
/// \brief Try to lower insertion of a single element into a zero vector.
9733
///
9734
/// This is a common pattern that we have especially efficient patterns to lower
9735
/// across all subtarget feature sets.
9736
static SDValue lowerVectorShuffleAsElementInsertion(
9737
    const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
9738
    const APInt &Zeroable, const X86Subtarget &Subtarget,
9739
10.9k
    SelectionDAG &DAG) {
9740
10.9k
  MVT ExtVT = VT;
9741
10.9k
  MVT EltVT = VT.getVectorElementType();
9742
10.9k
9743
10.9k
  int V2Index =
9744
18.4k
      find_if(Mask, [&Mask](int M) { return M >= (int)Mask.size(); }) -
9745
10.9k
      Mask.begin();
9746
10.9k
  bool IsV1Zeroable = true;
9747
19.6k
  for (int i = 0, Size = Mask.size(); 
i < Size19.6k
;
++i8.72k
)
9748
18.5k
    
if (18.5k
i != V2Index && 18.5k
!Zeroable[i]12.9k
) {
9749
9.78k
      IsV1Zeroable = false;
9750
9.78k
      break;
9751
9.78k
    }
9752
10.9k
9753
10.9k
  // Check for a single input from a SCALAR_TO_VECTOR node.
9754
10.9k
  // FIXME: All of this should be canonicalized into INSERT_VECTOR_ELT and
9755
10.9k
  // all the smarts here sunk into that routine. However, the current
9756
10.9k
  // lowering of BUILD_VECTOR makes that nearly impossible until the old
9757
10.9k
  // vector shuffle lowering is dead.
9758
10.9k
  SDValue V2S = getScalarValueForVectorElement(V2, Mask[V2Index] - Mask.size(),
9759
10.9k
                                               DAG);
9760
10.9k
  if (
V2S && 10.9k
DAG.getTargetLoweringInfo().isTypeLegal(V2S.getValueType())4.07k
) {
9761
4.06k
    // We need to zext the scalar if it is smaller than an i32.
9762
4.06k
    V2S = DAG.getBitcast(EltVT, V2S);
9763
4.06k
    if (
EltVT == MVT::i8 || 4.06k
EltVT == MVT::i164.03k
) {
9764
139
      // Using zext to expand a narrow element won't work for non-zero
9765
139
      // insertions.
9766
139
      if (!IsV1Zeroable)
9767
139
        return SDValue();
9768
0
9769
0
      // Zero-extend directly to i32.
9770
0
      ExtVT = MVT::v4i32;
9771
0
      V2S = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, V2S);
9772
0
    }
9773
3.93k
    V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, ExtVT, V2S);
9774
10.9k
  } else 
if (6.85k
Mask[V2Index] != (int)Mask.size() || 6.85k
EltVT == MVT::i83.69k
||
9775
6.85k
             
EltVT == MVT::i163.67k
) {
9776
3.34k
    // Either not inserting from the low element of the input or the input
9777
3.34k
    // element size is too small to use VZEXT_MOVL to clear the high bits.
9778
3.34k
    return SDValue();
9779
3.34k
  }
9780
7.43k
9781
7.43k
  
if (7.43k
!IsV1Zeroable7.43k
) {
9782
6.39k
    // If V1 can't be treated as a zero vector we have fewer options to lower
9783
6.39k
    // this. We can't support integer vectors or non-zero targets cheaply, and
9784
6.39k
    // the V1 elements can't be permuted in any way.
9785
6.39k
    assert(VT == ExtVT && "Cannot change extended type when non-zeroable!");
9786
6.39k
    if (
!VT.isFloatingPoint() || 6.39k
V2Index != 02.81k
)
9787
4.47k
      return SDValue();
9788
1.92k
    SmallVector<int, 8> V1Mask(Mask.begin(), Mask.end());
9789
1.92k
    V1Mask[V2Index] = -1;
9790
1.92k
    if (!isNoopShuffleMask(V1Mask))
9791
832
      return SDValue();
9792
1.09k
    // This is essentially a special case blend operation, but if we have
9793
1.09k
    // general purpose blend operations, they are always faster. Bail and let
9794
1.09k
    // the rest of the lowering handle these as blends.
9795
1.09k
    
if (1.09k
Subtarget.hasSSE41()1.09k
)
9796
454
      return SDValue();
9797
639
9798
639
    // Otherwise, use MOVSD or MOVSS.
9799
0
    assert((EltVT == MVT::f32 || EltVT == MVT::f64) &&
9800
639
           "Only two types of floating point element types to handle!");
9801
639
    return DAG.getNode(EltVT == MVT::f32 ? 
X86ISD::MOVSS136
:
X86ISD::MOVSD503
, DL,
9802
6.39k
                       ExtVT, V1, V2);
9803
6.39k
  }
9804
1.03k
9805
1.03k
  // This lowering only works for the low element with floating point vectors.
9806
1.03k
  
if (1.03k
VT.isFloatingPoint() && 1.03k
V2Index != 0336
)
9807
10
    return SDValue();
9808
1.02k
9809
1.02k
  V2 = DAG.getNode(X86ISD::VZEXT_MOVL, DL, ExtVT, V2);
9810
1.02k
  if (ExtVT != VT)
9811
0
    V2 = DAG.getBitcast(VT, V2);
9812
1.02k
9813
1.02k
  if (
V2Index != 01.02k
) {
9814
40
    // If we have 4 or fewer lanes we can cheaply shuffle the element into
9815
40
    // the desired position. Otherwise it is more efficient to do a vector
9816
40
    // shift left. We know that we can do a vector shift left because all
9817
40
    // the inputs are zero.
9818
40
    if (
VT.isFloatingPoint() || 40
VT.getVectorNumElements() <= 440
) {
9819
40
      SmallVector<int, 4> V2Shuffle(Mask.size(), 1);
9820
40
      V2Shuffle[V2Index] = 0;
9821
40
      V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Shuffle);
9822
40
    } else {
9823
0
      V2 = DAG.getBitcast(MVT::v16i8, V2);
9824
0
      V2 = DAG.getNode(
9825
0
          X86ISD::VSHLDQ, DL, MVT::v16i8, V2,
9826
0
          DAG.getConstant(V2Index * EltVT.getSizeInBits() / 8, DL,
9827
0
                          DAG.getTargetLoweringInfo().getScalarShiftAmountTy(
9828
0
                              DAG.getDataLayout(), VT)));
9829
0
      V2 = DAG.getBitcast(VT, V2);
9830
0
    }
9831
40
  }
9832
10.9k
  return V2;
9833
10.9k
}

/// Try to lower broadcast of a single - truncated - integer element,
/// coming from a scalar_to_vector/build_vector node \p V0 with larger elements.
///
/// This assumes we have AVX2.
static SDValue lowerVectorShuffleAsTruncBroadcast(const SDLoc &DL, MVT VT,
                                                  SDValue V0, int BroadcastIdx,
                                                  const X86Subtarget &Subtarget,
                                                  SelectionDAG &DAG) {
  assert(Subtarget.hasAVX2() &&
         "We can only lower integer broadcasts with AVX2!");

  EVT EltVT = VT.getVectorElementType();
  EVT V0VT = V0.getValueType();

  assert(VT.isInteger() && "Unexpected non-integer trunc broadcast!");
  assert(V0VT.isVector() && "Unexpected non-vector vector-sized value!");

  EVT V0EltVT = V0VT.getVectorElementType();
  if (!V0EltVT.isInteger())
    return SDValue();

  const unsigned EltSize = EltVT.getSizeInBits();
  const unsigned V0EltSize = V0EltVT.getSizeInBits();

  // This is only a truncation if the original element type is larger.
  if (V0EltSize <= EltSize)
    return SDValue();

  assert(((V0EltSize % EltSize) == 0) &&
         "Scalar type sizes must all be powers of 2 on x86!");

  const unsigned V0Opc = V0.getOpcode();
  const unsigned Scale = V0EltSize / EltSize;
  const unsigned V0BroadcastIdx = BroadcastIdx / Scale;

  if ((V0Opc != ISD::SCALAR_TO_VECTOR || V0BroadcastIdx != 0) &&
      V0Opc != ISD::BUILD_VECTOR)
    return SDValue();

  SDValue Scalar = V0.getOperand(V0BroadcastIdx);

  // If we're extracting non-least-significant bits, shift so we can truncate.
  // Hopefully, we can fold away the trunc/srl/load into the broadcast.
  // Even if we can't (and !isShuffleFoldableLoad(Scalar)), prefer
  // vpbroadcast+vmovd+shr to vpshufb(m)+vmovd.
  if (const int OffsetIdx = BroadcastIdx % Scale)
    Scalar = DAG.getNode(ISD::SRL, DL, Scalar.getValueType(), Scalar,
            DAG.getConstant(OffsetIdx * EltSize, DL, Scalar.getValueType()));

  return DAG.getNode(X86ISD::VBROADCAST, DL, VT,
                     DAG.getNode(ISD::TRUNCATE, DL, EltVT, Scalar));
}
9887
9888
/// \brief Try to lower broadcast of a single element.
9889
///
9890
/// For convenience, this code also bundles all of the subtarget feature set
9891
/// filtering. While a little annoying to re-dispatch on type here, there isn't
9892
/// a convenient way to factor it out.
9893
static SDValue lowerVectorShuffleAsBroadcast(const SDLoc &DL, MVT VT,
9894
                                             SDValue V1, SDValue V2,
9895
                                             ArrayRef<int> Mask,
9896
                                             const X86Subtarget &Subtarget,
9897
19.6k
                                             SelectionDAG &DAG) {
9898
19.6k
  if (
!((Subtarget.hasSSE3() && 19.6k
VT == MVT::v2f6416.8k
) ||
9899
18.7k
        
(Subtarget.hasAVX() && 18.7k
VT.isFloatingPoint()13.4k
) ||
9900
15.6k
        
(Subtarget.hasAVX2() && 15.6k
VT.isInteger()8.76k
)))
9901
6.88k
    return SDValue();
9902
12.7k
9903
12.7k
  // With MOVDDUP (v2f64) we can broadcast from a register or a load, otherwise
9904
12.7k
  // we can only broadcast from a register with AVX2.
9905
12.7k
  unsigned NumElts = Mask.size();
9906
12.7k
  unsigned Opcode = VT == MVT::v2f64 ? 
X86ISD::MOVDDUP972
:
X86ISD::VBROADCAST11.8k
;
9907
11.8k
  bool BroadcastFromReg = (Opcode == X86ISD::MOVDDUP) || Subtarget.hasAVX2();
9908
12.7k
9909
12.7k
  // Check that the mask is a broadcast.
9910
12.7k
  int BroadcastIdx = -1;
9911
167k
  for (int i = 0; 
i != (int)NumElts167k
;
++i154k
) {
9912
157k
    SmallVector<int, 8> BroadcastMask(NumElts, i);
9913
157k
    if (
isShuffleEquivalent(V1, V2, Mask, BroadcastMask)157k
) {
9914
2.91k
      BroadcastIdx = i;
9915
2.91k
      break;
9916
2.91k
    }
9917
157k
  }
9918
12.7k
9919
12.7k
  if (BroadcastIdx < 0)
9920
9.88k
    return SDValue();
9921
12.7k
  assert(BroadcastIdx < (int)Mask.size() && "We only expect to be called with "
9922
2.91k
                                            "a sorted mask where the broadcast "
9923
2.91k
                                            "comes from V1.");
9924
2.91k
9925
2.91k
  // Go up the chain of (vector) values to find a scalar load that we can
9926
2.91k
  // combine with the broadcast.
9927
2.91k
  SDValue V = V1;
9928
3.23k
  for (;;) {
9929
3.23k
    switch (V.getOpcode()) {
9930
771
    case ISD::BITCAST: {
9931
771
      SDValue VSrc = V.getOperand(0);
9932
771
      MVT SrcVT = VSrc.getSimpleValueType();
9933
771
      if (VT.getScalarSizeInBits() != SrcVT.getScalarSizeInBits())
9934
699
        break;
9935
72
      V = VSrc;
9936
72
      continue;
9937
72
    }
9938
225
    case ISD::CONCAT_VECTORS: {
9939
225
      int OperandSize = Mask.size() / V.getNumOperands();
9940
225
      V = V.getOperand(BroadcastIdx / OperandSize);
9941
225
      BroadcastIdx %= OperandSize;
9942
225
      continue;
9943
72
    }
9944
26
    case ISD::INSERT_SUBVECTOR: {
9945
26
      SDValue VOuter = V.getOperand(0), VInner = V.getOperand(1);
9946
26
      auto ConstantIdx = dyn_cast<ConstantSDNode>(V.getOperand(2));
9947
26
      if (!ConstantIdx)
9948
0
        break;
9949
26
9950
26
      int BeginIdx = (int)ConstantIdx->getZExtValue();
9951
26
      int EndIdx =
9952
26
          BeginIdx + (int)VInner.getSimpleValueType().getVectorNumElements();
9953
26
      if (
BroadcastIdx >= BeginIdx && 26
BroadcastIdx < EndIdx26
) {
9954
26
        BroadcastIdx -= BeginIdx;
9955
26
        V = VInner;
9956
26
      } else {
9957
0
        V = VOuter;
9958
0
      }
9959
771
      continue;
9960
771
    }
9961
2.91k
    }
9962
2.91k
    break;
9963
2.91k
  }
9964
2.91k
9965
2.91k
  // Check if this is a broadcast of a scalar. We special case lowering
9966
2.91k
  // for scalars so that we can more effectively fold with loads.
9967
2.91k
  // First, look through bitcast: if the original value has a larger element
9968
2.91k
  // type than the shuffle, the broadcast element is in essence truncated.
9969
2.91k
  // Make that explicit to ease folding.
9970
2.91k
  
if (2.91k
V.getOpcode() == ISD::BITCAST && 2.91k
VT.isInteger()699
)
9971
375
    
if (SDValue 375
TruncBroadcast375
= lowerVectorShuffleAsTruncBroadcast(
9972
375
            DL, VT, V.getOperand(0), BroadcastIdx, Subtarget, DAG))
9973
74
      return TruncBroadcast;
9974
2.83k
9975
2.83k
  MVT BroadcastVT = VT;
9976
2.83k
9977
2.83k
  // Peek through any bitcast (only useful for loads).
9978
2.83k
  SDValue BC = peekThroughBitcasts(V);
9979
2.83k
9980
2.83k
  // Also check the simpler case, where we can directly reuse the scalar.
9981
2.83k
  if (V.getOpcode() == ISD::BUILD_VECTOR ||
9982
2.83k
      
(V.getOpcode() == ISD::SCALAR_TO_VECTOR && 2.83k
BroadcastIdx == 0343
)) {
9983
343
    V = V.getOperand(BroadcastIdx);
9984
343
9985
343
    // If we can't broadcast from a register, check that the input is a load.
9986
343
    if (
!BroadcastFromReg && 343
!isShuffleFoldableLoad(V)43
)
9987
32
      return SDValue();
9988
2.49k
  } else 
if (2.49k
MayFoldLoad(BC) && 2.49k
!cast<LoadSDNode>(BC)->isVolatile()211
) {
9989
211
    // 32-bit targets need to load i64 as a f64 and then bitcast the result.
9990
211
    if (
!Subtarget.is64Bit() && 211
VT.getScalarType() == MVT::i6453
) {
9991
7
      BroadcastVT = MVT::getVectorVT(MVT::f64, VT.getVectorNumElements());
9992
7
      Opcode = (BroadcastVT.is128BitVector() ? 
X86ISD::MOVDDUP2
:
Opcode5
);
9993
7
    }
9994
211
9995
211
    // If we are broadcasting a load that is only used by the shuffle
9996
211
    // then we can reduce the vector load to the broadcasted scalar load.
9997
211
    LoadSDNode *Ld = cast<LoadSDNode>(BC);
9998
211
    SDValue BaseAddr = Ld->getOperand(1);
9999
211
    EVT SVT = BroadcastVT.getScalarType();
10000
211
    unsigned Offset = BroadcastIdx * SVT.getStoreSize();
10001
211
    SDValue NewAddr = DAG.getMemBasePlusOffset(BaseAddr, Offset, DL);
10002
211
    V = DAG.getLoad(SVT, DL, Ld->getChain(), NewAddr,
10003
211
                    DAG.getMachineFunction().getMachineMemOperand(
10004
211
                        Ld->getMemOperand(), Offset, SVT.getStoreSize()));
10005
211
    DAG.makeEquivalentMemoryOrdering(Ld, V);
10006
2.49k
  } else 
if (2.28k
!BroadcastFromReg2.28k
) {
10007
159
    // We can't broadcast from a vector register.
10008
159
    return SDValue();
10009
2.12k
  } else 
if (2.12k
BroadcastIdx != 02.12k
) {
10010
1.27k
    // We can only broadcast from the zero-element of a vector register,
10011
1.27k
    // but it can be advantageous to broadcast from the zero-element of a
10012
1.27k
    // subvector.
10013
1.27k
    if (
!VT.is256BitVector() && 1.27k
!VT.is512BitVector()1.22k
)
10014
1.20k
      return SDValue();
10015
73
10016
73
    // VPERMQ/VPERMPD can perform the cross-lane shuffle directly.
10017
73
    
if (73
VT == MVT::v4f64 || 73
VT == MVT::v4i6458
)
10018
34
      return SDValue();
10019
39
10020
39
    // Only broadcast the zero-element of a 128-bit subvector.
10021
39
    unsigned EltSize = VT.getScalarSizeInBits();
10022
39
    if (((BroadcastIdx * EltSize) % 128) != 0)
10023
3
      return SDValue();
10024
36
10025
36
    // The shuffle input might have been a bitcast we looked through; look at
10026
36
    // the original input vector.  Emit an EXTRACT_SUBVECTOR of that type; we'll
10027
36
    // later bitcast it to BroadcastVT.
10028
36
    MVT SrcVT = V.getSimpleValueType();
10029
36
    assert(SrcVT.getScalarSizeInBits() == BroadcastVT.getScalarSizeInBits() &&
10030
36
           "Unexpected vector element size");
10031
36
    assert((SrcVT.is256BitVector() || SrcVT.is512BitVector()) &&
10032
36
           "Unexpected vector size");
10033
36
10034
36
    MVT ExtVT = MVT::getVectorVT(SrcVT.getScalarType(), 128 / EltSize);
10035
36
    V = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ExtVT, V,
10036
36
                    DAG.getIntPtrConstant(BroadcastIdx, DL));
10037
36
  }
10038
2.83k
10039
1.40k
  
if (1.40k
Opcode == X86ISD::MOVDDUP && 1.40k
!V.getValueType().isVector()270
)
10040
139
    V = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64,
10041
139
                    DAG.getBitcast(MVT::f64, V));
10042
1.40k
10043
1.40k
  // Bitcast back to the same scalar type as BroadcastVT.
10044
1.40k
  MVT SrcVT = V.getSimpleValueType();
10045
1.40k
  if (
SrcVT.getScalarType() != BroadcastVT.getScalarType()1.40k
) {
10046
16
    assert(SrcVT.getScalarSizeInBits() == BroadcastVT.getScalarSizeInBits() &&
10047
16
           "Unexpected vector element size");
10048
16
    if (
SrcVT.isVector()16
) {
10049
12
      unsigned NumSrcElts = SrcVT.getVectorNumElements();
10050
12
      SrcVT = MVT::getVectorVT(BroadcastVT.getScalarType(), NumSrcElts);
10051
16
    } else {
10052
4
      SrcVT = BroadcastVT.getScalarType();
10053
4
    }
10054
16
    V = DAG.getBitcast(SrcVT, V);
10055
16
  }
10056
1.40k
10057
1.40k
  // 32-bit targets need to load i64 as a f64 and then bitcast the result.
10058
1.40k
  if (
!Subtarget.is64Bit() && 1.40k
SrcVT == MVT::i64363
) {
10059
1
    V = DAG.getBitcast(MVT::f64, V);
10060
1
    unsigned NumBroadcastElts = BroadcastVT.getVectorNumElements();
10061
1
    BroadcastVT = MVT::getVectorVT(MVT::f64, NumBroadcastElts);
10062
1
  }
10063
1.40k
10064
1.40k
  // We only support broadcasting from 128-bit vectors to minimize the
10065
1.40k
  // number of patterns we need to deal with in isel. So extract down to
10066
1.40k
  // 128-bits.
10067
1.40k
  if (SrcVT.getSizeInBits() > 128)
10068
260
    V = extract128BitVector(V, 0, DAG, DL);
10069
1.40k
10070
1.40k
  return DAG.getBitcast(VT, DAG.getNode(Opcode, DL, BroadcastVT, V));
10071
19.6k
}
10072
10073
// Check for whether we can use INSERTPS to perform the shuffle. We only use
10074
// INSERTPS when the V1 elements are already in the correct locations
10075
// because otherwise we can just always use two SHUFPS instructions which
10076
// are much smaller to encode than a SHUFPS and an INSERTPS. We can also
10077
// perform INSERTPS if a single V1 element is out of place and all V2
10078
// elements are zeroable.
10079
static bool matchVectorShuffleAsInsertPS(SDValue &V1, SDValue &V2,
10080
                                         unsigned &InsertPSMask,
10081
                                         const APInt &Zeroable,
10082
                                         ArrayRef<int> Mask,
10083
845
                                         SelectionDAG &DAG) {
10084
845
  assert(V1.getSimpleValueType().is128BitVector() && "Bad operand type!");
10085
845
  assert(V2.getSimpleValueType().is128BitVector() && "Bad operand type!");
10086
845
  assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
10087
845
10088
845
  // Attempt to match INSERTPS with one element from VA or VB being
10089
845
  // inserted into VA (or undef). If successful, V1, V2 and InsertPSMask
10090
845
  // are updated.
10091
845
  auto matchAsInsertPS = [&](SDValue VA, SDValue VB,
10092
1.29k
                             ArrayRef<int> CandidateMask) {
10093
1.29k
    unsigned ZMask = 0;
10094
1.29k
    int VADstIndex = -1;
10095
1.29k
    int VBDstIndex = -1;
10096
1.29k
    bool VAUsedInPlace = false;
10097
1.29k
10098
4.33k
    for (int i = 0; 
i < 44.33k
;
++i3.04k
) {
10099
3.89k
      // Synthesize a zero mask from the zeroable elements (includes undefs).
10100
3.89k
      if (
Zeroable[i]3.89k
) {
10101
943
        ZMask |= 1 << i;
10102
943
        continue;
10103
943
      }
10104
2.95k
10105
2.95k
      // Flag if we use any VA inputs in place.
10106
2.95k
      
if (2.95k
i == CandidateMask[i]2.95k
) {
10107
807
        VAUsedInPlace = true;
10108
807
        continue;
10109
807
      }
10110
2.14k
10111
2.14k
      // We can only insert a single non-zeroable element.
10112
2.14k
      
if (2.14k
VADstIndex >= 0 || 2.14k
VBDstIndex >= 01.84k
)
10113
857
        return false;
10114
1.29k
10115
1.29k
      
if (1.29k
CandidateMask[i] < 41.29k
) {
10116
495
        // VA input out of place for insertion.
10117
495
        VADstIndex = i;
10118
1.29k
      } else {
10119
797
        // VB input for insertion.
10120
797
        VBDstIndex = i;
10121
797
      }
10122
3.89k
    }
10123
1.29k
10124
1.29k
    // Don't bother if we have no (non-zeroable) element for insertion.
10125
435
    
if (435
VADstIndex < 0 && 435
VBDstIndex < 0246
)
10126
0
      return false;
10127
435
10128
435
    // Determine element insertion src/dst indices. The src index is from the
10129
435
    // start of the inserted vector, not the start of the concatenated vector.
10130
435
    unsigned VBSrcIndex = 0;
10131
435
    if (
VADstIndex >= 0435
) {
10132
189
      // If we have a VA input out of place, we use VA as the V2 element
10133
189
      // insertion and don't use the original V2 at all.
10134
189
      VBSrcIndex = CandidateMask[VADstIndex];
10135
189
      VBDstIndex = VADstIndex;
10136
189
      VB = VA;
10137
435
    } else {
10138
246
      VBSrcIndex = CandidateMask[VBDstIndex] - 4;
10139
246
    }
10140
435
10141
435
    // If no V1 inputs are used in place, then the result is created only from
10142
435
    // the zero mask and the V2 insertion - so remove V1 dependency.
10143
435
    if (!VAUsedInPlace)
10144
62
      VA = DAG.getUNDEF(MVT::v4f32);
10145
1.29k
10146
1.29k
    // Update V1, V2 and InsertPSMask accordingly.
10147
1.29k
    V1 = VA;
10148
1.29k
    V2 = VB;
10149
1.29k
10150
1.29k
    // Insert the V2 element into the desired position.
10151
1.29k
    InsertPSMask = VBSrcIndex << 6 | VBDstIndex << 4 | ZMask;
10152
1.29k
    assert((InsertPSMask & ~0xFFu) == 0 && "Invalid mask!");
10153
1.29k
    return true;
10154
1.29k
  };
10155
845
10156
845
  if (matchAsInsertPS(V1, V2, Mask))
10157
398
    return true;
10158
447
10159
447
  // Commute and try again.
10160
447
  SmallVector<int, 4> CommutedMask(Mask.begin(), Mask.end());
10161
447
  ShuffleVectorSDNode::commuteMask(CommutedMask);
10162
447
  if (matchAsInsertPS(V2, V1, CommutedMask))
10163
37
    return true;
10164
410
10165
410
  return false;
10166
410
}

static SDValue lowerVectorShuffleAsInsertPS(const SDLoc &DL, SDValue V1,
                                            SDValue V2, ArrayRef<int> Mask,
                                            const APInt &Zeroable,
                                            SelectionDAG &DAG) {
  assert(V1.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
  assert(V2.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");

  // Attempt to match the insertps pattern.
  unsigned InsertPSMask;
  if (!matchVectorShuffleAsInsertPS(V1, V2, InsertPSMask, Zeroable, Mask, DAG))
    return SDValue();

  // Insert the V2 element into the desired position.
  return DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, V1, V2,
                     DAG.getConstant(InsertPSMask, DL, MVT::i8));
}
10184
10185
/// \brief Try to lower a shuffle as a permute of the inputs followed by an
10186
/// UNPCK instruction.
10187
///
10188
/// This specifically targets cases where we end up with alternating between
10189
/// the two inputs, and so can permute them into something that feeds a single
10190
/// UNPCK instruction. Note that this routine only targets integer vectors
10191
/// because for floating point vectors we have a generalized SHUFPS lowering
10192
/// strategy that handles everything that doesn't *exactly* match an unpack,
10193
/// making this clever lowering unnecessary.
10194
static SDValue lowerVectorShuffleAsPermuteAndUnpack(const SDLoc &DL, MVT VT,
10195
                                                    SDValue V1, SDValue V2,
10196
                                                    ArrayRef<int> Mask,
10197
1.45k
                                                    SelectionDAG &DAG) {
10198
1.45k
  assert(!VT.isFloatingPoint() &&
10199
1.45k
         "This routine only supports integer vectors.");
10200
1.45k
  assert(VT.is128BitVector() &&
10201
1.45k
         "This routine only works on 128-bit vectors.");
10202
1.45k
  assert(!V2.isUndef() &&
10203
1.45k
         "This routine should only be used when blending two inputs.");
10204
1.45k
  assert(Mask.size() >= 2 && "Single element masks are invalid.");
10205
1.45k
10206
1.45k
  int Size = Mask.size();
10207
1.45k
10208
1.45k
  int NumLoInputs =
10209
14.6k
      count_if(Mask, [Size](int M) { return M >= 0 && M % Size < Size / 2; });
10210
1.45k
  int NumHiInputs =
10211
14.6k
      count_if(Mask, [Size](int M) { return M % Size >= Size / 2; });
10212
1.45k
10213
1.45k
  bool UnpackLo = NumLoInputs >= NumHiInputs;
10214
1.45k
10215
3.40k
  auto TryUnpack = [&](int ScalarSize, int Scale) {
10216
3.40k
    SmallVector<int, 16> V1Mask((unsigned)Size, -1);
10217
3.40k
    SmallVector<int, 16> V2Mask((unsigned)Size, -1);
10218
3.40k
10219
17.7k
    for (int i = 0; i < Size; ++i) {
10220
16.9k
      if (Mask[i] < 0)
10221
3.51k
        continue;
10222
13.3k
10223
13.3k
      // Each element of the unpack contains Scale elements from this mask.
10224
13.3k
      int UnpackIdx = i / Scale;
10225
13.3k
10226
13.3k
      // We only handle the case where V1 feeds the first slots of the unpack.
10227
13.3k
      // We rely on canonicalization to ensure this is the case.
10228
13.3k
      if ((UnpackIdx % 2 == 0) != (Mask[i] < Size))
10229
2.52k
        return SDValue();
10230
10.8k
10231
10.8k
      // Setup the mask for this input. The indexing is tricky as we have to
10232
10.8k
      // handle the unpack stride.
10233
10.8k
      SmallVectorImpl<int> &VMask = (UnpackIdx % 2 == 0) ? V1Mask : V2Mask;
10234
10.8k
      VMask[(UnpackIdx / 2) * Scale + i % Scale + (UnpackLo ? 0 : Size / 2)] =
10235
16.9k
          Mask[i] % Size;
10236
16.9k
    }
10237
3.40k
10238
3.40k
    // If we will have to shuffle both inputs to use the unpack, check whether
10239
3.40k
    // we can just unpack first and shuffle the result. If so, skip this unpack.
10240
880
    if ((NumLoInputs == 0 || NumHiInputs == 0) && !isNoopShuffleMask(V1Mask) &&
10241
55
        !isNoopShuffleMask(V2Mask))
10242
48
      return SDValue();
10243
832
10244
832
    // Shuffle the inputs into place.
10245
832
    V1 = DAG.getVectorShuffle(VT, DL, V1, DAG.getUNDEF(VT), V1Mask);
10246
832
    V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Mask);
10247
832
10248
832
    // Cast the inputs to the type we will use to unpack them.
10249
832
    MVT UnpackVT = MVT::getVectorVT(MVT::getIntegerVT(ScalarSize), Size / Scale);
10250
832
    V1 = DAG.getBitcast(UnpackVT, V1);
10251
832
    V2 = DAG.getBitcast(UnpackVT, V2);
10252
832
10253
832
    // Unpack the inputs and cast the result back to the desired type.
10254
832
    return DAG.getBitcast(
10255
832
        VT, DAG.getNode(UnpackLo ? X86ISD::UNPCKL : X86ISD::UNPCKH, DL,
10256
3.40k
                        UnpackVT, V1, V2));
10257
3.40k
  };
10258
1.45k
10259
1.45k
  // We try each unpack from the largest to the smallest to try and find one
10260
1.45k
  // that fits this mask.
10261
1.45k
  int OrigScalarSize = VT.getScalarSizeInBits();
10262
4.02k
  for (int ScalarSize = 64; ScalarSize >= OrigScalarSize; ScalarSize /= 2)
10263
3.40k
    if (SDValue Unpack = TryUnpack(ScalarSize, ScalarSize / OrigScalarSize))
10264
832
      return Unpack;
10265
1.45k
10266
1.45k
  // If none of the unpack-rooted lowerings worked (or were profitable) try an
10267
1.45k
  // initial unpack.
10268
621
  if (NumLoInputs == 0 || NumHiInputs == 0) {
10269
122
    assert((NumLoInputs > 0 || NumHiInputs > 0) &&
10270
122
           "We have to have *some* inputs!");
10271
122
    int HalfOffset = NumLoInputs == 0 ? Size / 2 : 0;
10272
122
10273
122
    // FIXME: We could consider the total complexity of the permute of each
10274
122
    // possible unpacking. Or at the least we should consider how many
10275
122
    // half-crossings are created.
10276
122
    // FIXME: We could consider commuting the unpacks.
10277
122
10278
122
    SmallVector<int, 32> PermMask((unsigned)Size, -1);
10279
1.41k
    for (int i = 0; i < Size; ++i) {
10280
1.28k
      if (Mask[i] < 0)
10281
302
        continue;
10282
986
10283
1.28k
      assert(Mask[i] % Size >= HalfOffset && "Found input from wrong half!");
10284
986
10285
986
      PermMask[i] =
10286
986
          2 * ((Mask[i] % Size) - HalfOffset) + (Mask[i] < Size ? 0 : 1);
10287
1.28k
    }
10288
122
    return DAG.getVectorShuffle(
10289
122
        VT, DL, DAG.getNode(NumLoInputs == 0 ? X86ISD::UNPCKH : X86ISD::UNPCKL,
10290
122
                            DL, VT, V1, V2),
10291
122
        DAG.getUNDEF(VT), PermMask);
10292
122
  }
10293
499
10294
499
  return SDValue();
10295
499
}
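// Illustrative sketch (hypothetical mask values, not taken from the routine
// above): a v8i16 shuffle whose mask alternates between the two inputs can be
// served by one UNPCKLWD after permuting each input's used elements into its
// low half. The arrays below spell out one such case by hand.
static const int ExampleAlternatingMask[8] = {0, 8, 2, 10, 4, 12, 6, 14};
static const int ExampleV1PreShuffle[8] = {0, 2, 4, 6, -1, -1, -1, -1};
static const int ExampleV2PreShuffle[8] = {0, 2, 4, 6, -1, -1, -1, -1};
// UNPCKLWD of the two permuted inputs interleaves their low halves
// (<0, 8, 1, 9, 2, 10, 3, 11> in mask terms), which reproduces the original
// alternating pattern with a single unpack.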
10296
10297
/// \brief Handle lowering of 2-lane 64-bit floating point shuffles.
10298
///
10299
/// This is the basis function for the 2-lane 64-bit shuffles as we have full
10300
/// support for floating point shuffles but not integer shuffles. These
10301
/// instructions will incur a domain crossing penalty on some chips though so
10302
/// it is better to avoid lowering through this for integer vectors where
10303
/// possible.
10304
static SDValue lowerV2F64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
10305
                                       const APInt &Zeroable,
10306
                                       SDValue V1, SDValue V2,
10307
                                       const X86Subtarget &Subtarget,
10308
3.09k
                                       SelectionDAG &DAG) {
10309
3.09k
  assert(V1.getSimpleValueType() == MVT::v2f64 && "Bad operand type!");
10310
3.09k
  assert(V2.getSimpleValueType() == MVT::v2f64 && "Bad operand type!");
10311
3.09k
  assert(Mask.size() == 2 && "Unexpected mask size for v2 shuffle!");
10312
3.09k
10313
3.09k
  if (V2.isUndef()) {
10314
1.17k
    // Check for being able to broadcast a single element.
10315
1.17k
    if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(
10316
1.17k
            DL, MVT::v2f64, V1, V2, Mask, Subtarget, DAG))
10317
268
      return Broadcast;
10318
905
10319
905
    // Straight shuffle of a single input vector. Simulate this by using the
10320
905
    // single input as both of the "inputs" to this instruction..
10321
905
    unsigned SHUFPDMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1);
10322
905
10323
905
    if (Subtarget.hasAVX()) {
10324
624
      // If we have AVX, we can use VPERMILPS which will allow folding a load
10325
624
      // into the shuffle.
10326
624
      return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v2f64, V1,
10327
624
                         DAG.getConstant(SHUFPDMask, DL, MVT::i8));
10328
624
    }
10329
281
10330
281
    return DAG.getNode(
10331
281
        X86ISD::SHUFP, DL, MVT::v2f64,
10332
281
        Mask[0] == SM_SentinelUndef ? DAG.getUNDEF(MVT::v2f64) : V1,
10333
281
        Mask[1] == SM_SentinelUndef ? DAG.getUNDEF(MVT::v2f64) : V1,
10334
1.17k
        DAG.getConstant(SHUFPDMask, DL, MVT::i8));
10335
1.17k
  }
10336
3.09k
  assert(Mask[0] >= 0 && Mask[0] < 2 && "Non-canonicalized blend!");
10337
1.91k
  assert(Mask[1] >= 2 && "Non-canonicalized blend!");
10338
1.91k
10339
1.91k
  // If we have a single input, insert that into V1 if we can do so cheaply.
10340
1.91k
  if ((Mask[0] >= 2) + (Mask[1] >= 2) == 1) {
10341
1.91k
    if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
10342
1.91k
            DL, MVT::v2f64, V1, V2, Mask, Zeroable, Subtarget, DAG))
10343
0
      return Insertion;
10344
1.91k
    // Try inverting the insertion since for v2 masks it is easy to do and we
10345
1.91k
    // can't reliably sort the mask one way or the other.
10346
1.91k
    int InverseMask[2] = {Mask[0] < 0 ? -1 : (Mask[0] ^ 2),
10347
1.91k
                          Mask[1] < 0 ? -1 : (Mask[1] ^ 2)};
10348
1.91k
    if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
10349
1.91k
            DL, MVT::v2f64, V2, V1, InverseMask, Zeroable, Subtarget, DAG))
10350
636
      return Insertion;
10351
1.28k
  }
10352
1.28k
10353
1.28k
  // Try to use one of the special instruction patterns to handle two common
10354
1.28k
  // blend patterns if a zero-blend above didn't work.
10355
1.28k
  if (isShuffleEquivalent(V1, V2, Mask, {0, 3}) ||
10356
987
      isShuffleEquivalent(V1, V2, Mask, {1, 3}))
10357
405
    if (SDValue V1S = getScalarValueForVectorElement(V1, Mask[0], DAG))
10358
405
      // We can either use a special instruction to load over the low double or
10359
405
      // to move just the low double.
10360
133
      return DAG.getNode(
10361
133
          isShuffleFoldableLoad(V1S) ? X86ISD::MOVLPD : X86ISD::MOVSD,
10362
405
          DL, MVT::v2f64, V2,
10363
405
          DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64, V1S));
10364
1.15k
10365
1.15k
  if (Subtarget.hasSSE41())
10366
875
    if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v2f64, V1, V2, Mask,
10367
875
                                                  Zeroable, Subtarget, DAG))
10368
163
      return Blend;
10369
987
10370
987
  // Use dedicated unpack instructions for masks that match their pattern.
10371
987
  if (SDValue V =
10372
987
          lowerVectorShuffleWithUNPCK(DL, MVT::v2f64, Mask, V1, V2, DAG))
10373
912
    return V;
10374
75
10375
75
  unsigned SHUFPDMask = (Mask[0] == 1) | (((Mask[1] - 2) == 1) << 1);
10376
75
  return DAG.getNode(X86ISD::SHUFP, DL, MVT::v2f64, V1, V2,
10377
75
                     DAG.getConstant(SHUFPDMask, DL, MVT::i8));
10378
75
}
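// Illustrative sketch (standard SHUFPD imm8 semantics, not part of the
// original source): bit 0 of the immediate picks element 0 or 1 of the first
// source for result lane 0, and bit 1 picks element 0 or 1 of the second
// source for result lane 1. A hypothetical mirror of the computation used in
// the two-input path above:
static unsigned makeShufPDImm(int Lane0Elt, int Lane1Elt) {
  // Lane0Elt/Lane1Elt are the selected element indices (0 or 1) within the
  // first and second source operands respectively.
  return unsigned(Lane0Elt == 1) | (unsigned(Lane1Elt == 1) << 1);
}
// Example: the blend mask <1, 2> selects element 1 of V1 and element 0 of V2,
// so the immediate is makeShufPDImm(1, 0) == 0b01.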
10379
10380
/// \brief Handle lowering of 2-lane 64-bit integer shuffles.
10381
///
10382
/// Tries to lower a 2-lane 64-bit shuffle using shuffle operations provided by
10383
/// the integer unit to minimize domain crossing penalties. However, for blends
10384
/// it falls back to the floating point shuffle operation with appropriate bit
10385
/// casting.
10386
static SDValue lowerV2I64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
10387
                                       const APInt &Zeroable,
10388
                                       SDValue V1, SDValue V2,
10389
                                       const X86Subtarget &Subtarget,
10390
3.64k
                                       SelectionDAG &DAG) {
10391
3.64k
  assert(V1.getSimpleValueType() == MVT::v2i64 && "Bad operand type!");
10392
3.64k
  assert(V2.getSimpleValueType() == MVT::v2i64 && "Bad operand type!");
10393
3.64k
  assert(Mask.size() == 2 && "Unexpected mask size for v2 shuffle!");
10394
3.64k
10395
3.64k
  if (V2.isUndef()) {
10396
1.35k
    // Check for being able to broadcast a single element.
10397
1.35k
    if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(
10398
1.35k
            DL, MVT::v2i64, V1, V2, Mask, Subtarget, DAG))
10399
115
      return Broadcast;
10400
1.24k
10401
1.24k
    // Straight shuffle of a single input vector. For everything from SSE2
10402
1.24k
    // onward this has a single fast instruction with no scary immediates.
10403
1.24k
    // We have to map the mask as it is actually a v4i32 shuffle instruction.
10404
1.24k
    V1 = DAG.getBitcast(MVT::v4i32, V1);
10405
1.24k
    int WidenedMask[4] = {
10406
1.24k
        std::max(Mask[0], 0) * 2, std::max(Mask[0], 0) * 2 + 1,
10407
1.24k
        std::max(Mask[1], 0) * 2, std::max(Mask[1], 0) * 2 + 1};
10408
1.24k
    return DAG.getBitcast(
10409
1.24k
        MVT::v2i64,
10410
1.24k
        DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, V1,
10411
1.24k
                    getV4X86ShuffleImm8ForMask(WidenedMask, DL, DAG)));
10412
1.24k
  }
10413
3.64k
  assert(Mask[0] != -1 && "No undef lanes in multi-input v2 shuffles!");
10414
2.28k
  assert(Mask[1] != -1 && "No undef lanes in multi-input v2 shuffles!");
10415
2.28k
  assert(Mask[0] < 2 && "We sort V1 to be the first input.");
10416
2.28k
  assert(Mask[1] >= 2 && "We sort V2 to be the second input.");
10417
2.28k
10418
2.28k
  // If we have a blend of two same-type PACKUS operations and the blend aligns
10419
2.28k
  // with the low and high halves, we can just merge the PACKUS operations.
10420
2.28k
  // This is particularly important as it lets us merge shuffles that this
10421
2.28k
  // routine itself creates.
10422
2.29k
  auto GetPackNode = [](SDValue V) {
10423
2.29k
    V = peekThroughBitcasts(V);
10424
2.29k
    return V.getOpcode() == X86ISD::PACKUS ? V : SDValue();
10425
2.29k
  };
10426
2.28k
  if (SDValue V1Pack = GetPackNode(V1))
10427
13
    if (SDValue V2Pack = GetPackNode(V2)) {
10428
12
      EVT PackVT = V1Pack.getValueType();
10429
12
      if (PackVT == V2Pack.getValueType())
10430
10
        return DAG.getBitcast(MVT::v2i64,
10431
10
                              DAG.getNode(X86ISD::PACKUS, DL, PackVT,
10432
8
                                          Mask[0] == 0 ? V1Pack.getOperand(0)
10433
2
                                                       : V1Pack.getOperand(1),
10434
6
                                          Mask[1] == 2 ? V2Pack.getOperand(0)
10435
10
                                                       : V2Pack.getOperand(1)));
10436
2.27k
    }
10437
2.27k
10438
2.27k
  // Try to use shift instructions.
10439
2.27k
  if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v2i64, V1, V2, Mask,
10440
2.27k
                                                Zeroable, Subtarget, DAG))
10441
25
    return Shift;
10442
2.25k
10443
2.25k
  // When loading a scalar and then shuffling it into a vector we can often do
10444
2.25k
  // the insertion cheaply.
10445
2.25k
  if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
10446
2.25k
          DL, MVT::v2i64, V1, V2, Mask, Zeroable, Subtarget, DAG))
10447
1
    return Insertion;
10448
2.24k
  // Try inverting the insertion since for v2 masks it is easy to do and we
10449
2.24k
  // can't reliably sort the mask one way or the other.
10450
2.24k
  int InverseMask[2] = {Mask[0] ^ 2, Mask[1] ^ 2};
10451
2.24k
  if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
10452
2.24k
          DL, MVT::v2i64, V2, V1, InverseMask, Zeroable, Subtarget, DAG))
10453
313
    return Insertion;
10454
1.93k
10455
1.93k
  // We have different paths for blend lowering, but they all must use the
10456
1.93k
  // *exact* same predicate.
10457
1.93k
  bool IsBlendSupported = Subtarget.hasSSE41();
10458
1.93k
  if (IsBlendSupported)
10459
1.11k
    if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v2i64, V1, V2, Mask,
10460
1.11k
                                                  Zeroable, Subtarget, DAG))
10461
399
      return Blend;
10462
1.53k
10463
1.53k
  // Use dedicated unpack instructions for masks that match their pattern.
10464
1.53k
  if (SDValue V =
10465
1.53k
          lowerVectorShuffleWithUNPCK(DL, MVT::v2i64, Mask, V1, V2, DAG))
10466
1.10k
    return V;
10467
435
10468
435
  // Try to use byte rotation instructions.
10469
435
  // Its more profitable for pre-SSSE3 to use shuffles/unpacks.
10470
435
  if (Subtarget.hasSSSE3())
10471
292
    if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
10472
292
            DL, MVT::v2i64, V1, V2, Mask, Subtarget, DAG))
10473
65
      return Rotate;
10474
370
10475
370
  // If we have direct support for blends, we should lower by decomposing into
10476
370
  // a permute. That will be faster than the domain cross.
10477
370
  if (IsBlendSupported)
10478
0
    return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v2i64, V1, V2,
10479
0
                                                      Mask, DAG);
10480
370
10481
370
  // We implement this with SHUFPD which is pretty lame because it will likely
10482
370
  // incur 2 cycles of stall for integer vectors on Nehalem and older chips.
10483
370
  // However, all the alternatives are still more cycles and newer chips don't
10484
370
  // have this problem. It would be really nice if x86 had better shuffles here.
10485
370
  V1 = DAG.getBitcast(MVT::v2f64, V1);
10486
370
  V2 = DAG.getBitcast(MVT::v2f64, V2);
10487
370
  return DAG.getBitcast(MVT::v2i64,
10488
370
                        DAG.getVectorShuffle(MVT::v2f64, DL, V1, V2, Mask));
10489
370
}
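// Illustrative sketch (hypothetical values, not part of the original source):
// the single-input v2i64 path above reuses PSHUFD by viewing each 64-bit lane
// as a pair of 32-bit lanes, so each selected quadword expands to its two
// dwords in the widened mask. For example, swapping the two quadwords:
static const int ExampleV2I64Mask[2] = {1, 0};
static const int ExampleWidenedV4I32Mask[4] = {2, 3, 0, 1};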
10490
10491
/// \brief Test whether this can be lowered with a single SHUFPS instruction.
10492
///
10493
/// This is used to disable more specialized lowerings when the shufps lowering
10494
/// will happen to be efficient.
10495
1.22k
static bool isSingleSHUFPSMask(ArrayRef<int> Mask) {
10496
1.22k
  // This routine only handles 128-bit shufps.
10497
1.22k
  assert(Mask.size() == 4 && "Unsupported mask size!");
10498
1.22k
  assert(Mask[0] >= -1 && Mask[0] < 8 && "Out of bound mask element!");
10499
1.22k
  assert(Mask[1] >= -1 && Mask[1] < 8 && "Out of bound mask element!");
10500
1.22k
  assert(Mask[2] >= -1 && Mask[2] < 8 && "Out of bound mask element!");
10501
1.22k
  assert(Mask[3] >= -1 && Mask[3] < 8 && "Out of bound mask element!");
10502
1.22k
10503
1.22k
  // To lower with a single SHUFPS we need to have the low half and high half
10504
1.22k
  // each requiring a single input.
10505
1.22k
  if (Mask[0] >= 0 && Mask[1] >= 0 && (Mask[0] < 4) != (Mask[1] < 4))
10506
609
    return false;
10507
617
  if (Mask[2] >= 0 && Mask[3] >= 0 && (Mask[2] < 4) != (Mask[3] < 4))
10508
122
    return false;
10509
495
10510
495
  return true;
10511
495
}
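// Illustrative sketch (hypothetical masks, not part of the original source):
// SHUFPS builds its low two result lanes from one source and its high two
// lanes from the other, so a mask qualifies only if each half draws from a
// single input.
static const int ExampleSingleSHUFPSMask[4] = {0, 1, 4, 5}; // accepted above
static const int ExampleMixedHalvesMask[4] = {0, 4, 1, 5};  // rejected above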
10512
10513
/// \brief Lower a vector shuffle using the SHUFPS instruction.
10514
///
10515
/// This is a helper routine dedicated to lowering vector shuffles using SHUFPS.
10516
/// It makes no assumptions about whether this is the *best* lowering, it simply
10517
/// uses it.
10518
static SDValue lowerVectorShuffleWithSHUFPS(const SDLoc &DL, MVT VT,
10519
                                            ArrayRef<int> Mask, SDValue V1,
10520
676
                                            SDValue V2, SelectionDAG &DAG) {
10521
676
  SDValue LowV = V1, HighV = V2;
10522
676
  int NewMask[4] = {Mask[0], Mask[1], Mask[2], Mask[3]};
10523
676
10524
2.70k
  int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; });
10525
676
10526
676
  if (NumV2Elements == 1) {
10527
664
    int V2Index = find_if(Mask, [](int M) { return M >= 4; }) - Mask.begin();
10528
233
10529
233
    // Compute the index adjacent to V2Index and in the same half by toggling
10530
233
    // the low bit.
10531
233
    int V2AdjIndex = V2Index ^ 1;
10532
233
10533
233
    if (Mask[V2AdjIndex] < 0) {
10534
18
      // Handles all the cases where we have a single V2 element and an undef.
10535
18
      // This will only ever happen in the high lanes because we commute the
10536
18
      // vector otherwise.
10537
18
      if (V2Index < 2)
10538
3
        std::swap(LowV, HighV);
10539
18
      NewMask[V2Index] -= 4;
10540
233
    } else {
10541
215
      // Handle the case where the V2 element ends up adjacent to a V1 element.
10542
215
      // To make this work, blend them together as the first step.
10543
215
      int V1Index = V2AdjIndex;
10544
215
      int BlendMask[4] = {Mask[V2Index] - 4, 0, Mask[V1Index], 0};
10545
215
      V2 = DAG.getNode(X86ISD::SHUFP, DL, VT, V2, V1,
10546
215
                       getV4X86ShuffleImm8ForMask(BlendMask, DL, DAG));
10547
215
10548
215
      // Now proceed to reconstruct the final blend as we have the necessary
10549
215
      // high or low half formed.
10550
215
      if (V2Index < 2) {
10551
85
        LowV = V2;
10552
85
        HighV = V1;
10553
215
      } else {
10554
130
        HighV = V2;
10555
130
      }
10556
215
      NewMask[V1Index] = 2; // We put the V1 element in V2[2].
10557
215
      NewMask[V2Index] = 0; // We shifted the V2 element into V2[0].
10558
215
    }
10559
676
  } else if (NumV2Elements == 2) {
10560
443
    if (Mask[0] < 4 && Mask[1] < 4) {
10561
395
      // Handle the easy case where we have V1 in the low lanes and V2 in the
10562
395
      // high lanes.
10563
395
      NewMask[2] -= 4;
10564
395
      NewMask[3] -= 4;
10565
443
    } else if (Mask[2] < 4 && Mask[3] < 4) {
10566
1
      // We also handle the reversed case because this utility may get called
10567
1
      // when we detect a SHUFPS pattern but can't easily commute the shuffle to
10568
1
      // arrange things in the right direction.
10569
1
      NewMask[0] -= 4;
10570
1
      NewMask[1] -= 4;
10571
1
      HighV = V1;
10572
1
      LowV = V2;
10573
48
    } else {
10574
47
      // We have a mixture of V1 and V2 in both low and high lanes. Rather than
10575
47
      // trying to place elements directly, just blend them and set up the final
10576
47
      // shuffle to place them.
10577
47
10578
47
      // The first two blend mask elements are for V1, the second two are for
10579
47
      // V2.
10580
47
      int BlendMask[4] = {Mask[0] < 4 ? Mask[0] : Mask[1],
10581
47
                          Mask[2] < 4 ? Mask[2] : Mask[3],
10582
47
                          (Mask[0] >= 4 ? Mask[0] : Mask[1]) - 4,
10583
47
                          (Mask[2] >= 4 ? Mask[2] : Mask[3]) - 4};
10584
47
      V1 = DAG.getNode(X86ISD::SHUFP, DL, VT, V1, V2,
10585
47
                       getV4X86ShuffleImm8ForMask(BlendMask, DL, DAG));
10586
47
10587
47
      // Now we do a normal shuffle of V1 by giving V1 as both operands to
10588
47
      // a blend.
10589
47
      LowV = HighV = V1;
10590
47
      NewMask[0] = Mask[0] < 4 ? 0 : 2;
10591
47
      NewMask[1] = Mask[0] < 4 ? 2 : 0;
10592
47
      NewMask[2] = Mask[2] < 4 ? 1 : 3;
10593
47
      NewMask[3] = Mask[2] < 4 ? 3 : 1;
10594
48
    }
10595
443
  }
10596
676
  return DAG.getNode(X86ISD::SHUFP, DL, VT, LowV, HighV,
10597
676
                     getV4X86ShuffleImm8ForMask(NewMask, DL, DAG));
10598
676
}
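// Illustrative sketch (assumption about the standard SHUFPS imm8 layout, not
// part of the original source): result lanes 0 and 1 come from the first
// operand via immediate bits [1:0] and [3:2], and lanes 2 and 3 from the
// second operand via bits [5:4] and [7:6]. A hypothetical helper mirroring
// getV4X86ShuffleImm8ForMask for a fully-defined 4-element mask:
static unsigned makeShufImm8(const int M[4]) {
  return (M[0] & 0x3) | ((M[1] & 0x3) << 2) | ((M[2] & 0x3) << 4) |
         ((M[3] & 0x3) << 6);
}
// Example: the final NewMask <0, 2, 1, 3> from the "mixture" branch above
// encodes as makeShufImm8 == 0xD8.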
10599
10600
/// \brief Lower 4-lane 32-bit floating point shuffles.
10601
///
10602
/// Uses instructions exclusively from the floating point unit to minimize
10603
/// domain crossing penalties, as these are sufficient to implement all v4f32
10604
/// shuffles.
10605
static SDValue lowerV4F32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
10606
                                       const APInt &Zeroable,
10607
                                       SDValue V1, SDValue V2,
10608
                                       const X86Subtarget &Subtarget,
10609
3.12k
                                       SelectionDAG &DAG) {
10610
3.12k
  assert(V1.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
10611
3.12k
  assert(V2.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
10612
3.12k
  assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
10613
3.12k
10614
12.4k
  int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; });
10615
3.12k
10616
3.12k
  if (NumV2Elements == 0) {
10617
1.20k
    // Check for being able to broadcast a single element.
10618
1.20k
    if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(
10619
1.20k
            DL, MVT::v4f32, V1, V2, Mask, Subtarget, DAG))
10620
51
      return Broadcast;
10621
1.15k
10622
1.15k
    // Use even/odd duplicate instructions for masks that match their pattern.
10623
1.15k
    if (Subtarget.hasSSE3()) {
10624
916
      if (isShuffleEquivalent(V1, V2, Mask, {0, 0, 2, 2}))
10625
91
        return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v4f32, V1);
10626
825
      if (isShuffleEquivalent(V1, V2, Mask, {1, 1, 3, 3}))
10627
322
        return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v4f32, V1);
10628
745
    }
10629
745
10630
745
    if (Subtarget.hasAVX()) {
10631
397
      // If we have AVX, we can use VPERMILPS which will allow folding a load
10632
397
      // into the shuffle.
10633
397
      return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v4f32, V1,
10634
397
                         getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
10635
397
    }
10636
348
10637
348
    // Use MOVLHPS/MOVHLPS to simulate unary shuffles. These are only valid
10638
348
    // in SSE1 because otherwise they are widened to v2f64 and never get here.
10639
348
    if (!Subtarget.hasSSE2()) {
10640
75
      if (isShuffleEquivalent(V1, V2, Mask, {0, 1, 0, 1}))
10641
1
        return DAG.getNode(X86ISD::MOVLHPS, DL, MVT::v4f32, V1, V1);
10642
74
      if (isShuffleEquivalent(V1, V2, Mask, {2, 3, 2, 3}))
10643
11
        return DAG.getNode(X86ISD::MOVHLPS, DL, MVT::v4f32, V1, V1);
10644
336
    }
10645
336
10646
336
    // Otherwise, use a straight shuffle of a single input vector. We pass the
10647
336
    // input vector to both operands to simulate this with a SHUFPS.
10648
336
    return DAG.getNode(X86ISD::SHUFP, DL, MVT::v4f32, V1, V1,
10649
336
                       getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
10650
336
  }
10651
1.91k
10652
1.91k
  // There are special ways we can lower some single-element blends. However, we
10653
1.91k
  // have custom ways we can lower more complex single-element blends below that
10654
1.91k
  // we defer to if both this and BLENDPS fail to match, so restrict this to
10655
1.91k
  // when the V2 input is targeting element 0 of the mask -- that is the fast
10656
1.91k
  // case here.
10657
1.91k
  if (NumV2Elements == 1 && Mask[0] >= 4)
10658
470
    if (SDValue V = lowerVectorShuffleAsElementInsertion(
10659
470
            DL, MVT::v4f32, V1, V2, Mask, Zeroable, Subtarget, DAG))
10660
306
      return V;
10661
1.60k
10662
1.60k
  if (Subtarget.hasSSE41()) {
10663
837
    if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4f32, V1, V2, Mask,
10664
837
                                                  Zeroable, Subtarget, DAG))
10665
304
      return Blend;
10666
533
10667
533
    // Use INSERTPS if we can complete the shuffle efficiently.
10668
533
    if (SDValue V =
10669
533
            lowerVectorShuffleAsInsertPS(DL, V1, V2, Mask, Zeroable, DAG))
10670
177
      return V;
10671
356
10672
356
    if (!isSingleSHUFPSMask(Mask))
10673
168
      if (SDValue BlendPerm = lowerVectorShuffleAsBlendAndPermute(
10674
168
              DL, MVT::v4f32, V1, V2, Mask, DAG))
10675
25
        return BlendPerm;
10676
1.10k
  }
10677
1.10k
10678
1.10k
  // Use low/high mov instructions. These are only valid in SSE1 because
10679
1.10k
  // otherwise they are widened to v2f64 and never get here.
10680
1.10k
  if (!Subtarget.hasSSE2()) {
10681
330
    if (isShuffleEquivalent(V1, V2, Mask, {0, 1, 4, 5}))
10682
108
      return DAG.getNode(X86ISD::MOVLHPS, DL, MVT::v4f32, V1, V2);
10683
222
    if (isShuffleEquivalent(V1, V2, Mask, {2, 3, 6, 7}))
10684
8
      return DAG.getNode(X86ISD::MOVHLPS, DL, MVT::v4f32, V2, V1);
10685
987
  }
10686
987
10687
987
  // Use dedicated unpack instructions for masks that match their pattern.
10688
987
  if (SDValue V =
10689
987
          lowerVectorShuffleWithUNPCK(DL, MVT::v4f32, Mask, V1, V2, DAG))
10690
451
    return V;
10691
536
10692
536
  // Otherwise fall back to a SHUFPS lowering strategy.
10693
536
  return lowerVectorShuffleWithSHUFPS(DL, MVT::v4f32, Mask, V1, V2, DAG);
10694
536
}
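// Illustrative sketch (hypothetical constants, not part of the original
// source): the even/odd duplicate checks in the v4f32 path above match the
// semantics of the SSE3 instructions -- MOVSLDUP duplicates each even-indexed
// element into the adjacent odd lane, and MOVSHDUP does the same for the
// odd-indexed elements.
static const int ExampleMovSLDupMask[4] = {0, 0, 2, 2}; // MOVSLDUP result
static const int ExampleMovSHDupMask[4] = {1, 1, 3, 3}; // MOVSHDUP result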
10695
10696
/// \brief Lower 4-lane i32 vector shuffles.
10697
///
10698
/// We try to handle these with integer-domain shuffles where we can, but for
10699
/// blends we use the floating point domain blend instructions.
10700
static SDValue lowerV4I32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
10701
                                       const APInt &Zeroable,
10702
                                       SDValue V1, SDValue V2,
10703
                                       const X86Subtarget &Subtarget,
10704
5.81k
                                       SelectionDAG &DAG) {
10705
5.81k
  assert(V1.getSimpleValueType() == MVT::v4i32 && "Bad operand type!");
10706
5.81k
  assert(V2.getSimpleValueType() == MVT::v4i32 && "Bad operand type!");
10707
5.81k
  assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
10708
5.81k
10709
5.81k
  // Whenever we can lower this as a zext, that instruction is strictly faster
10710
5.81k
  // than any alternative. It also allows us to fold memory operands into the
10711
5.81k
  // shuffle in many cases.
10712
5.81k
  if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
10713
5.81k
          DL, MVT::v4i32, V1, V2, Mask, Zeroable, Subtarget, DAG))
10714
250
    return ZExt;
10715
5.56k
10716
22.2k
  int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; });
10717
5.56k
10718
5.56k
  if (NumV2Elements == 0) {
10719
2.91k
    // Check for being able to broadcast a single element.
10720
2.91k
    if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(
10721
2.91k
            DL, MVT::v4i32, V1, V2, Mask, Subtarget, DAG))
10722
88
      return Broadcast;
10723
2.82k
10724
2.82k
    // Straight shuffle of a single input vector. For everything from SSE2
10725
2.82k
    // onward this has a single fast instruction with no scary immediates.
10726
2.82k
    // We coerce the shuffle pattern to be compatible with UNPCK instructions
10727
2.82k
    // but we aren't actually going to use the UNPCK instruction because doing
10728
2.82k
    // so prevents folding a load into this instruction or making a copy.
10729
2.82k
    const int UnpackLoMask[] = {0, 0, 1, 1};
10730
2.82k
    const int UnpackHiMask[] = {2, 2, 3, 3};
10731
2.82k
    if (isShuffleEquivalent(V1, V2, Mask, {0, 0, 1, 1}))
10732
50
      Mask = UnpackLoMask;
10733
2.77k
    else if (isShuffleEquivalent(V1, V2, Mask, {2, 2, 3, 3}))
10734
57
      Mask = UnpackHiMask;
10735
2.91k
10736
2.91k
    return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, V1,
10737
2.91k
                       getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
10738
2.91k
  }
10739
2.65k
10740
2.65k
  // Try to use shift instructions.
10741
2.65k
  if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v4i32, V1, V2, Mask,
10742
2.65k
                                                Zeroable, Subtarget, DAG))
10743
214
    return Shift;
10744
2.43k
10745
2.43k
  // There are special ways we can lower some single-element blends.
10746
2.43k
  if (NumV2Elements == 1)
10747
1.25k
    if (SDValue V = lowerVectorShuffleAsElementInsertion(
10748
1.25k
            DL, MVT::v4i32, V1, V2, Mask, Zeroable, Subtarget, DAG))
10749
333
      return V;
10750
2.10k
10751
2.10k
  // We have different paths for blend lowering, but they all must use the
10752
2.10k
  // *exact* same predicate.
10753
2.10k
  bool IsBlendSupported = Subtarget.hasSSE41();
10754
2.10k
  if (IsBlendSupported)
10755
1.06k
    if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4i32, V1, V2, Mask,
10756
1.06k
                                                  Zeroable, Subtarget, DAG))
10757
515
      return Blend;
10758
1.58k
10759
1.58k
  if (SDValue Masked = lowerVectorShuffleAsBitMask(DL, MVT::v4i32, V1, V2, Mask,
10760
1.58k
                                                   Zeroable, DAG))
10761
38
    return Masked;
10762
1.55k
10763
1.55k
  // Use dedicated unpack instructions for masks that match their pattern.
10764
1.55k
  if (SDValue V =
10765
1.55k
          lowerVectorShuffleWithUNPCK(DL, MVT::v4i32, Mask, V1, V2, DAG))
10766
704
    return V;
10767
846
10768
846
  // Try to use byte rotation instructions.
10769
846
  // Its more profitable for pre-SSSE3 to use shuffles/unpacks.
10770
846
  if (Subtarget.hasSSSE3())
10771
584
    if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
10772
584
            DL, MVT::v4i32, V1, V2, Mask, Subtarget, DAG))
10773
33
      return Rotate;
10774
813
10775
813
  // Assume that a single SHUFPS is faster than an alternative sequence of
10776
813
  // multiple instructions (even if the CPU has a domain penalty).
10777
813
  // If some CPU is harmed by the domain switch, we can fix it in a later pass.
10778
813
  if (!isSingleSHUFPSMask(Mask)) {
10779
535
    // If we have direct support for blends, we should lower by decomposing into
10780
535
    // a permute. That will be faster than the domain cross.
10781
535
    if (IsBlendSupported)
10782
188
      return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v4i32, V1, V2,
10783
188
                                                        Mask, DAG);
10784
347
10785
347
    // Try to lower by permuting the inputs into an unpack instruction.
10786
347
    if (SDValue Unpack = lowerVectorShuffleAsPermuteAndUnpack(
10787
347
            DL, MVT::v4i32, V1, V2, Mask, DAG))
10788
226
      return Unpack;
10789
399
  }
10790
399
10791
399
  // We implement this with SHUFPS because it can blend from two vectors.
10792
399
  // Because we're going to eventually use SHUFPS, we use SHUFPS even to build
10793
399
  // up the inputs, bypassing domain shift penalties that we would incur if we
10794
399
  // directly used PSHUFD on Nehalem and older. For newer chips, this isn't
10795
399
  // relevant.
10796
399
  SDValue CastV1 = DAG.getBitcast(MVT::v4f32, V1);
10797
399
  SDValue CastV2 = DAG.getBitcast(MVT::v4f32, V2);
10798
399
  SDValue ShufPS = DAG.getVectorShuffle(MVT::v4f32, DL, CastV1, CastV2, Mask);
10799
399
  return DAG.getBitcast(MVT::v4i32, ShufPS);
10800
399
}
10801
10802
/// \brief Lowering of single-input v8i16 shuffles is the cornerstone of SSE2
10803
/// shuffle lowering, and the most complex part.
10804
///
10805
/// The lowering strategy is to try to form pairs of input lanes which are
10806
/// targeted at the same half of the final vector, and then use a dword shuffle
10807
/// to place them onto the right half, and finally unpack the paired lanes into
10808
/// their final position.
10809
///
10810
/// The exact breakdown of how to form these dword pairs and align them on the
10811
/// correct sides is really tricky. See the comments within the function for
10812
/// more of the details.
10813
///
10814
/// This code also handles repeated 128-bit lanes of v8i16 shuffles, but each
10815
/// lane must shuffle the *exact* same way. In fact, you must pass a v8 Mask to
10816
/// this routine for it to work correctly. To shuffle a 256-bit or 512-bit i16
10817
/// vector, form the analogous 128-bit 8-element Mask.
10818
static SDValue lowerV8I16GeneralSingleInputVectorShuffle(
10819
    const SDLoc &DL, MVT VT, SDValue V, MutableArrayRef<int> Mask,
10820
2.45k
    const X86Subtarget &Subtarget, SelectionDAG &DAG) {
10821
2.45k
  assert(VT.getVectorElementType() == MVT::i16 && "Bad input type!");
10822
2.45k
  MVT PSHUFDVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() / 2);
10823
2.45k
10824
2.45k
  assert(Mask.size() == 8 && "Shuffle mask length doesn't match!");
10825
2.45k
  MutableArrayRef<int> LoMask = Mask.slice(0, 4);
10826
2.45k
  MutableArrayRef<int> HiMask = Mask.slice(4, 4);
10827
2.45k
10828
2.45k
  SmallVector<int, 4> LoInputs;
10829
9.83k
  copy_if(LoMask, std::back_inserter(LoInputs), [](int M) { return M >= 0; });
10830
2.45k
  std::sort(LoInputs.begin(), LoInputs.end());
10831
2.45k
  LoInputs.erase(std::unique(LoInputs.begin(), LoInputs.end()), LoInputs.end());
10832
2.45k
  SmallVector<int, 4> HiInputs;
10833
9.83k
  copy_if(HiMask, std::back_inserter(HiInputs), [](int M) { return M >= 0; });
10834
2.45k
  std::sort(HiInputs.begin(), HiInputs.end());
10835
2.45k
  HiInputs.erase(std::unique(HiInputs.begin(), HiInputs.end()), HiInputs.end());
10836
2.45k
  int NumLToL =
10837
2.45k
      std::lower_bound(LoInputs.begin(), LoInputs.end(), 4) - LoInputs.begin();
10838
2.45k
  int NumHToL = LoInputs.size() - NumLToL;
10839
2.45k
  int NumLToH =
10840
2.45k
      std::lower_bound(HiInputs.begin(), HiInputs.end(), 4) - HiInputs.begin();
10841
2.45k
  int NumHToH = HiInputs.size() - NumLToH;
10842
2.45k
  MutableArrayRef<int> LToLInputs(LoInputs.data(), NumLToL);
10843
2.45k
  MutableArrayRef<int> LToHInputs(HiInputs.data(), NumLToH);
10844
2.45k
  MutableArrayRef<int> HToLInputs(LoInputs.data() + NumLToL, NumHToL);
10845
2.45k
  MutableArrayRef<int> HToHInputs(HiInputs.data() + NumLToH, NumHToH);
10846
2.45k
10847
2.45k
  // If we are splatting two values from one half - one to each half, then
10848
2.45k
  // we can shuffle that half so each is splatted to a dword, then splat those
10849
2.45k
  // to their respective halves.
10850
2.45k
  auto SplatHalfs = [&](int LoInput, int HiInput, unsigned ShufWOp,
10851
371
                        int DOffset) {
10852
371
    int PSHUFHalfMask[] = {LoInput % 4, LoInput % 4, HiInput % 4, HiInput % 4};
10853
371
    int PSHUFDMask[] = {DOffset + 0, DOffset + 0, DOffset + 1, DOffset + 1};
10854
371
    V = DAG.getNode(ShufWOp, DL, VT, V,
10855
371
                    getV4X86ShuffleImm8ForMask(PSHUFHalfMask, DL, DAG));
10856
371
    V = DAG.getBitcast(PSHUFDVT, V);
10857
371
    V = DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, V,
10858
371
                    getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG));
10859
371
    return DAG.getBitcast(VT, V);
10860
371
  };
10861
2.45k
10862
2.45k
  if (NumLToL == 1 && NumLToH == 1 && (NumHToL + NumHToH) == 0)
10863
340
    return SplatHalfs(LToLInputs[0], LToHInputs[0], X86ISD::PSHUFLW, 0);
10864
2.11k
  if (NumHToL == 1 && NumHToH == 1 && (NumLToL + NumLToH) == 0)
10865
31
    return SplatHalfs(HToLInputs[0], HToHInputs[0], X86ISD::PSHUFHW, 2);
10866
2.08k
10867
2.08k
  // Simplify the 1-into-3 and 3-into-1 cases with a single pshufd. For all
10868
2.08k
  // such inputs we can swap two of the dwords across the half mark and end up
10869
2.08k
  // with <=2 inputs to each half in each half. Once there, we can fall through
10870
2.08k
  // to the generic code below. For example:
10871
2.08k
  //
10872
2.08k
  // Input: [a, b, c, d, e, f, g, h] -PSHUFD[0,2,1,3]-> [a, b, e, f, c, d, g, h]
10873
2.08k
  // Mask:  [0, 1, 2, 7, 4, 5, 6, 3] -----------------> [0, 1, 4, 7, 2, 3, 6, 5]
10874
2.08k
  //
10875
2.08k
  // However in some very rare cases we have a 1-into-3 or 3-into-1 on one half
10876
2.08k
  // and an existing 2-into-2 on the other half. In this case we may have to
10877
2.08k
  // pre-shuffle the 2-into-2 half to avoid turning it into a 3-into-1 or
10878
2.08k
  // 1-into-3 which could cause us to cycle endlessly fixing each side in turn.
10879
2.08k
  // Fortunately, we don't have to handle anything but a 2-into-2 pattern
10880
2.08k
  // because any other situation (including a 3-into-1 or 1-into-3 in the other
10881
2.08k
  // half than the one we target for fixing) will be fixed when we re-enter this
10882
2.08k
  // path. We will also combine away any sequence of PSHUFD instructions that
10883
2.08k
  // result into a single instruction. Here is an example of the tricky case:
10884
2.08k
  //
10885
2.08k
  // Input: [a, b, c, d, e, f, g, h] -PSHUFD[0,2,1,3]-> [a, b, e, f, c, d, g, h]
10886
2.08k
  // Mask:  [3, 7, 1, 0, 2, 7, 3, 5] -THIS-IS-BAD!!!!-> [5, 7, 1, 0, 4, 7, 5, 3]
10887
2.08k
  //
10888
2.08k
  // This now has a 1-into-3 in the high half! Instead, we do two shuffles:
10889
2.08k
  //
10890
2.08k
  // Input: [a, b, c, d, e, f, g, h] PSHUFHW[0,2,1,3]-> [a, b, c, d, e, g, f, h]
10891
2.08k
  // Mask:  [3, 7, 1, 0, 2, 7, 3, 5] -----------------> [3, 7, 1, 0, 2, 7, 3, 6]
10892
2.08k
  //
10893
2.08k
  // Input: [a, b, c, d, e, g, f, h] -PSHUFD[0,2,1,3]-> [a, b, e, g, c, d, f, h]
10894
2.08k
  // Mask:  [3, 7, 1, 0, 2, 7, 3, 6] -----------------> [5, 7, 1, 0, 4, 7, 5, 6]
10895
2.08k
  //
10896
2.08k
  // The result is fine to be handled by the generic logic.
10897
2.08k
  auto balanceSides = [&](ArrayRef<int> AToAInputs, ArrayRef<int> BToAInputs,
10898
2.08k
                          ArrayRef<int> BToBInputs, ArrayRef<int> AToBInputs,
10899
99
                          int AOffset, int BOffset) {
10900
99
    assert((AToAInputs.size() == 3 || AToAInputs.size() == 1) &&
10901
99
           "Must call this with A having 3 or 1 inputs from the A half.");
10902
99
    assert((BToAInputs.size() == 1 || BToAInputs.size() == 3) &&
10903
99
           "Must call this with B having 1 or 3 inputs from the B half.");
10904
99
    assert(AToAInputs.size() + BToAInputs.size() == 4 &&
10905
99
           "Must call this with either 3:1 or 1:3 inputs (summing to 4).");
10906
99
10907
99
    bool ThreeAInputs = AToAInputs.size() == 3;
10908
99
10909
99
    // Compute the index of dword with only one word among the three inputs in
10910
99
    // a half by taking the sum of the half with three inputs and subtracting
10911
99
    // the sum of the actual three inputs. The difference is the remaining
10912
99
    // slot.
10913
99
    int ADWord, BDWord;
10914
99
    int &TripleDWord = ThreeAInputs ? ADWord : BDWord;
10915
99
    int &OneInputDWord = ThreeAInputs ? BDWord : ADWord;
10916
99
    int TripleInputOffset = ThreeAInputs ? AOffset : BOffset;
10917
99
    ArrayRef<int> TripleInputs = ThreeAInputs ? AToAInputs : BToAInputs;
10918
99
    int OneInput = ThreeAInputs ? BToAInputs[0] : AToAInputs[0];
10919
99
    int TripleInputSum = 0 + 1 + 2 + 3 + (4 * TripleInputOffset);
10920
99
    int TripleNonInputIdx =
10921
99
        TripleInputSum - std::accumulate(TripleInputs.begin(), TripleInputs.end(), 0);
10922
99
    TripleDWord = TripleNonInputIdx / 2;
10923
99
10924
99
    // We use xor with one to compute the adjacent DWord to whichever one the
10925
99
    // OneInput is in.
10926
99
    OneInputDWord = (OneInput / 2) ^ 1;
10927
99
10928
99
    // Check for one tricky case: We're fixing a 3<-1 or a 1<-3 shuffle for AToA
10929
99
    // and BToA inputs. If there is also such a problem with the BToB and AToB
10930
99
    // inputs, we don't try to fix it necessarily -- we'll recurse and see it in
10931
99
    // the next pass. However, if we have a 2<-2 in the BToB and AToB inputs, it
10932
99
    // is essential that we don't *create* a 3<-1 as then we might oscillate.
10933
99
    if (BToBInputs.size() == 2 && AToBInputs.size() == 2) {
10934
20
      // Compute how many inputs will be flipped by swapping these DWords. We
10935
20
      // need
10936
20
      // to balance this to ensure we don't form a 3-1 shuffle in the other
10937
20
      // half.
10938
20
      int NumFlippedAToBInputs =
10939
20
          std::count(AToBInputs.begin(), AToBInputs.end(), 2 * ADWord) +
10940
20
          std::count(AToBInputs.begin(), AToBInputs.end(), 2 * ADWord + 1);
10941
20
      int NumFlippedBToBInputs =
10942
20
          std::count(BToBInputs.begin(), BToBInputs.end(), 2 * BDWord) +
10943
20
          std::count(BToBInputs.begin(), BToBInputs.end(), 2 * BDWord + 1);
10944
20
      if ((NumFlippedAToBInputs == 1 &&
10945
7
           (NumFlippedBToBInputs == 0 || NumFlippedBToBInputs == 2)) ||
10946
13
          (NumFlippedBToBInputs == 1 &&
10947
20
           (NumFlippedAToBInputs == 0 || NumFlippedAToBInputs == 2))) {
10948
20
        // We choose whether to fix the A half or B half based on whether that
10949
20
        // half has zero flipped inputs. At zero, we may not be able to fix it
10950
20
        // with that half. We also bias towards fixing the B half because that
10951
20
        // will more commonly be the high half, and we have to bias one way.
10952
20
        auto FixFlippedInputs = [&V, &DL, &Mask, &DAG](int PinnedIdx, int DWord,
10953
20
                                                       ArrayRef<int> Inputs) {
10954
20
          int FixIdx = PinnedIdx ^ 1; // The adjacent slot to the pinned slot.
10955
20
          bool IsFixIdxInput = is_contained(Inputs, PinnedIdx ^ 1);
10956
20
          // Determine whether the free index is in the flipped dword or the
10957
20
          // unflipped dword based on where the pinned index is. We use this bit
10958
20
          // in an xor to conditionally select the adjacent dword.
10959
20
          int FixFreeIdx = 2 * (DWord ^ (PinnedIdx / 2 == DWord));
10960
20
          bool IsFixFreeIdxInput = is_contained(Inputs, FixFreeIdx);
10961
20
          if (IsFixIdxInput == IsFixFreeIdxInput)
10962
13
            FixFreeIdx += 1;
10963
20
          IsFixFreeIdxInput = is_contained(Inputs, FixFreeIdx);
10964
20
          assert(IsFixIdxInput != IsFixFreeIdxInput &&
10965
20
                 "We need to be changing the number of flipped inputs!");
10966
20
          int PSHUFHalfMask[] = {0, 1, 2, 3};
10967
20
          std::swap(PSHUFHalfMask[FixFreeIdx % 4], PSHUFHalfMask[FixIdx % 4]);
10968
20
          V = DAG.getNode(
10969
20
              FixIdx < 4 ? X86ISD::PSHUFLW : X86ISD::PSHUFHW, DL,
10970
20
              MVT::getVectorVT(MVT::i16, V.getValueSizeInBits() / 16), V,
10971
20
              getV4X86ShuffleImm8ForMask(PSHUFHalfMask, DL, DAG));
10972
20
10973
20
          for (int &M : Mask)
10974
160
            if (M >= 0 && M == FixIdx)
10975
4
              M = FixFreeIdx;
10976
156
            else if (M >= 0 && M == FixFreeIdx)
10977
16
              M = FixIdx;
10978
20
        };
10979
20
        if (NumFlippedBToBInputs != 0) {
10980
20
          int BPinnedIdx =
10981
20
              BToAInputs.size() == 3 ? TripleNonInputIdx : OneInput;
10982
20
          FixFlippedInputs(BPinnedIdx, BDWord, BToBInputs);
10983
0
        } else {
10984
0
          assert(NumFlippedAToBInputs != 0 && "Impossible given predicates!");
10985
0
          int APinnedIdx = ThreeAInputs ? TripleNonInputIdx : OneInput;
10986
0
          FixFlippedInputs(APinnedIdx, ADWord, AToBInputs);
10987
0
        }
10988
20
      }
10989
20
    }
10990
99
10991
99
    int PSHUFDMask[] = {0, 1, 2, 3};
10992
99
    PSHUFDMask[ADWord] = BDWord;
10993
99
    PSHUFDMask[BDWord] = ADWord;
10994
99
    V = DAG.getBitcast(
10995
99
        VT,
10996
99
        DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, DAG.getBitcast(PSHUFDVT, V),
10997
99
                    getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
10998
99
10999
99
    // Adjust the mask to match the new locations of A and B.
11000
99
    for (int &M : Mask)
11001
792
      if (M >= 0 && M/2 == ADWord)
11002
156
        M = 2 * BDWord + M % 2;
11003
636
      else if (M >= 0 && M/2 == BDWord)
11004
98
        M = 2 * ADWord + M % 2;
11005
99
11006
99
    // Recurse back into this routine to re-compute state now that this isn't
11007
99
    // a 3 and 1 problem.
11008
99
    return lowerV8I16GeneralSingleInputVectorShuffle(DL, VT, V, Mask, Subtarget,
11009
99
                                                     DAG);
11010
99
  };
11011
2.08k
  if ((NumLToL == 3 && NumHToL == 1) || (NumLToL == 1 && NumHToL == 3))
11012
77
    return balanceSides(LToLInputs, HToLInputs, HToHInputs, LToHInputs, 0, 4);
11013
2.01k
  if ((NumHToH == 3 && NumLToH == 1) || (NumHToH == 1 && NumLToH == 3))
11014
22
    return balanceSides(HToHInputs, LToHInputs, LToLInputs, HToLInputs, 4, 0);
11015
1.98k
11016
1.98k
  // At this point there are at most two inputs to the low and high halves from
11017
1.98k
  // each half. That means the inputs can always be grouped into dwords and
11018
1.98k
  // those dwords can then be moved to the correct half with a dword shuffle.
11019
1.98k
  // We use at most one low and one high word shuffle to collect these paired
11020
1.98k
  // inputs into dwords, and finally a dword shuffle to place them.
11021
1.98k
  int PSHUFLMask[4] = {-1, -1, -1, -1};
11022
1.98k
  int PSHUFHMask[4] = {-1, -1, -1, -1};
11023
1.98k
  int PSHUFDMask[4] = {-1, -1, -1, -1};
11024
1.98k
11025
1.98k
  // First fix the masks for all the inputs that are staying in their
11026
1.98k
  // original halves. This will then dictate the targets of the cross-half
11027
1.98k
  // shuffles.
11028
1.98k
  auto fixInPlaceInputs =
11029
1.98k
      [&PSHUFDMask](ArrayRef<int> InPlaceInputs, ArrayRef<int> IncomingInputs,
11030
1.98k
                    MutableArrayRef<int> SourceHalfMask,
11031
3.97k
                    MutableArrayRef<int> HalfMask, int HalfOffset) {
11032
3.97k
    if (InPlaceInputs.empty())
11033
1.34k
      return;
11034
2.63k
    if (InPlaceInputs.size() == 1) {
11035
803
      SourceHalfMask[InPlaceInputs[0] - HalfOffset] =
11036
803
          InPlaceInputs[0] - HalfOffset;
11037
803
      PSHUFDMask[InPlaceInputs[0] / 2] = InPlaceInputs[0] / 2;
11038
803
      return;
11039
803
    }
11040
1.83k
    if (IncomingInputs.empty()) {
11041
878
      // Just fix all of the in place inputs.
11042
2.96k
      for (int Input : InPlaceInputs) {
11043
2.96k
        SourceHalfMask[Input - HalfOffset] = Input - HalfOffset;
11044
2.96k
        PSHUFDMask[Input / 2] = Input / 2;
11045
2.96k
      }
11046
878
      return;
11047
878
    }
11048
954
11049
1.83k
    assert(InPlaceInputs.size() == 2 && "Cannot handle 3 or 4 inputs!");
11050
954
    SourceHalfMask[InPlaceInputs[0] - HalfOffset] =
11051
954
        InPlaceInputs[0] - HalfOffset;
11052
954
    // Put the second input next to the first so that they are packed into
11053
954
    // a dword. We find the adjacent index by toggling the low bit.
11054
954
    int AdjIndex = InPlaceInputs[0] ^ 1;
11055
954
    SourceHalfMask[AdjIndex - HalfOffset] = InPlaceInputs[1] - HalfOffset;
11056
954
    std::replace(HalfMask.begin(), HalfMask.end(), InPlaceInputs[1], AdjIndex);
11057
954
    PSHUFDMask[AdjIndex / 2] = AdjIndex / 2;
11058
954
  };
11059
1.98k
  fixInPlaceInputs(LToLInputs, HToLInputs, PSHUFLMask, LoMask, 0);
11060
1.98k
  fixInPlaceInputs(HToHInputs, LToHInputs, PSHUFHMask, HiMask, 4);
11061
1.98k
11062
1.98k
  // Now gather the cross-half inputs and place them into a free dword of
11063
1.98k
  // their target half.
11064
1.98k
  // FIXME: This operation could almost certainly be simplified dramatically to
11065
1.98k
  // look more like the 3-1 fixing operation.
11066
1.98k
  auto moveInputsToRightHalf = [&PSHUFDMask](
11067
1.98k
      MutableArrayRef<int> IncomingInputs, ArrayRef<int> ExistingInputs,
11068
1.98k
      MutableArrayRef<int> SourceHalfMask, MutableArrayRef<int> HalfMask,
11069
1.98k
      MutableArrayRef<int> FinalSourceHalfMask, int SourceOffset,
11070
3.97k
      int DestOffset) {
11071
2.44k
    auto isWordClobbered = [](ArrayRef<int> SourceHalfMask, int Word) {
11072
537
      return SourceHalfMask[Word] >= 0 && SourceHalfMask[Word] != Word;
11073
2.44k
    };
11074
3.97k
    auto isDWordClobbered = [&isWordClobbered](ArrayRef<int> SourceHalfMask,
11075
109
                                               int Word) {
11076
109
      int LowWord = Word & ~1;
11077
109
      int HighWord = Word | 1;
11078
109
      return isWordClobbered(SourceHalfMask, LowWord) ||
11079
109
             isWordClobbered(SourceHalfMask, HighWord);
11080
109
    };
11081
3.97k
11082
3.97k
    if (IncomingInputs.empty())
11083
2.14k
      return;
11084
1.82k
11085
1.82k
    if (ExistingInputs.empty()) {
11086
323
      // Map any dwords with inputs from them into the right half.
11087
695
      for (int Input : IncomingInputs) {
11088
695
        // If the source half mask maps over the inputs, turn those into
11089
695
        // swaps and use the swapped lane.
11090
695
        if (isWordClobbered(SourceHalfMask, Input - SourceOffset)) {
11091
18
          if (SourceHalfMask[SourceHalfMask[Input - SourceOffset]] < 0) {
11092
9
            SourceHalfMask[SourceHalfMask[Input - SourceOffset]] =
11093
9
                Input - SourceOffset;
11094
9
            // We have to swap the uses in our half mask in one sweep.
11095
9
            for (int &M : HalfMask)
11096
36
              if (M == SourceHalfMask[Input - SourceOffset] + SourceOffset)
11097
18
                M = Input;
11098
18
              else if (M == Input)
11099
9
                M = SourceHalfMask[Input - SourceOffset] + SourceOffset;
11100
18
          } else {
11101
9
            assert(SourceHalfMask[SourceHalfMask[Input - SourceOffset]] ==
11102
9
                       Input - SourceOffset &&
11103
9
                   "Previous placement doesn't match!");
11104
9
          }
11105
18
          // Note that this correctly re-maps both when we do a swap and when
11106
18
          // we observe the other side of the swap above. We rely on that to
11107
18
          // avoid swapping the members of the input list directly.
11108
18
          Input = SourceHalfMask[Input - SourceOffset] + SourceOffset;
11109
18
        }
11110
695
11111
695
        // Map the input's dword into the correct half.
11112
695
        if (PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] < 0)
11113
470
          PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] = Input / 2;
11114
695
        else
11115
695
          assert(PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] ==
11116
695
                     Input / 2 &&
11117
695
                 "Previous placement doesn't match!");
11118
695
      }
11119
323
11120
323
      // And just directly shift any other-half mask elements to be same-half
11121
323
      // as we will have mirrored the dword containing the element into the
11122
323
      // same position within that half.
11123
323
      for (int &M : HalfMask)
11124
1.29k
        if (M >= SourceOffset && M < SourceOffset + 4) {
11125
1.10k
          M = M - SourceOffset + DestOffset;
11126
1.10k
          assert(M >= 0 && "This should never wrap below zero!");
11127
1.10k
        }
11128
323
      return;
11129
323
    }
11130
1.50k
11131
1.50k
    // Ensure we have the input in a viable dword of its current half. This
11132
1.50k
    // is particularly tricky because the original position may be clobbered
11133
1.50k
    // by inputs being moved and *staying* in that half.
11134
1.50k
    if (IncomingInputs.size() == 1) {
11135
573
      if (isWordClobbered(SourceHalfMask, IncomingInputs[0] - SourceOffset)) {
11136
6
        int InputFixed = find(SourceHalfMask, -1) - std::begin(SourceHalfMask) +
11137
6
                         SourceOffset;
11138
6
        SourceHalfMask[InputFixed - SourceOffset] =
11139
6
            IncomingInputs[0] - SourceOffset;
11140
6
        std::replace(HalfMask.begin(), HalfMask.end(), IncomingInputs[0],
11141
6
                     InputFixed);
11142
6
        IncomingInputs[0] = InputFixed;
11143
6
      }
11144
1.50k
    } else if (IncomingInputs.size() == 2) {
11145
933
      if (IncomingInputs[0] / 2 != IncomingInputs[1] / 2 ||
11146
933
          isDWordClobbered(SourceHalfMask, IncomingInputs[0] - SourceOffset)) {
11147
824
        // We have two non-adjacent or clobbered inputs we need to extract from
11148
824
        // the source half. To do this, we need to map them into some adjacent
11149
824
        // dword slot in the source mask.
11150
824
        int InputsFixed[2] = {IncomingInputs[0] - SourceOffset,
11151
824
                              IncomingInputs[1] - SourceOffset};
11152
824
11153
824
        // If there is a free slot in the source half mask adjacent to one of
11154
824
        // the inputs, place the other input in it. We use (Index XOR 1) to
11155
824
        // compute an adjacent index.
11156
824
        if (!isWordClobbered(SourceHalfMask, InputsFixed[0]) &&
11157
824
            SourceHalfMask[InputsFixed[0] ^ 1] < 0) {
11158
691
          SourceHalfMask[InputsFixed[0]] = InputsFixed[0];
11159
691
          SourceHalfMask[InputsFixed[0] ^ 1] = InputsFixed[1];
11160
691
          InputsFixed[1] = InputsFixed[0] ^ 1;
11161
824
        } else if (!isWordClobbered(SourceHalfMask, InputsFixed[1]) &&
11162
133
                   SourceHalfMask[InputsFixed[1] ^ 1] < 0) {
11163
118
          SourceHalfMask[InputsFixed[1]] = InputsFixed[1];
11164
118
          SourceHalfMask[InputsFixed[1] ^ 1] = InputsFixed[0];
11165
118
          InputsFixed[0] = InputsFixed[1] ^ 1;
11166
133
        } else if (SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1)] < 0 &&
11167
15
                   SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1) + 1] < 0) {
11168
0
          // The two inputs are in the same DWord but it is clobbered and the
11169
0
          // adjacent DWord isn't used at all. Move both inputs to the free
11170
0
          // slot.
11171
0
          SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1)] = InputsFixed[0];
11172
0
          SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1) + 1] = InputsFixed[1];
11173
0
          InputsFixed[0] = 2 * ((InputsFixed[0] / 2) ^ 1);
11174
0
          InputsFixed[1] = 2 * ((InputsFixed[0] / 2) ^ 1) + 1;
11175
15
        } else {
11176
15
          // The only way we hit this point is if there is no clobbering
11177
15
          // (because there are no off-half inputs to this half) and there is no
11178
15
          // free slot adjacent to one of the inputs. In this case, we have to
11179
15
          // swap an input with a non-input.
11180
75
          for (int i = 0; i < 4; ++i)
11181
15
            assert((SourceHalfMask[i] < 0 || SourceHalfMask[i] == i) &&
11182
15
                   "We can't handle any clobbers here!");
11183
15
          assert(InputsFixed[1] != (InputsFixed[0] ^ 1) &&
11184
15
                 "Cannot have adjacent inputs here!");
11185
15
11186
15
          SourceHalfMask[InputsFixed[0] ^ 1] = InputsFixed[1];
11187
15
          SourceHalfMask[InputsFixed[1]] = InputsFixed[0] ^ 1;
11188
15
11189
15
          // We also have to update the final source mask in this case because
11190
15
          // it may need to undo the above swap.
11191
15
          for (int &M : FinalSourceHalfMask)
11192
60
            if (M == (InputsFixed[0] ^ 1) + SourceOffset)
11193
21
              M = InputsFixed[1] + SourceOffset;
11194
39
            else if (M == InputsFixed[1] + SourceOffset)
11195
9
              M = (InputsFixed[0] ^ 1) + SourceOffset;
11196
133
11197
133
          InputsFixed[1] = InputsFixed[0] ^ 1;
11198
133
        }
11199
824
11200
824
        // Point everything at the fixed inputs.
11201
824
        for (int &M : HalfMask)
11202
3.29k
          if (M == IncomingInputs[0])
11203
824
            M = InputsFixed[0] + SourceOffset;
11204
2.47k
          else if (M == IncomingInputs[1])
11205
824
            M = InputsFixed[1] + SourceOffset;
11206
824
11207
824
        IncomingInputs[0] = InputsFixed[0] + SourceOffset;
11208
824
        IncomingInputs[1] = InputsFixed[1] + SourceOffset;
11209
824
      }
11210
0
    } else {
11211
0
      llvm_unreachable("Unhandled input size!");
11212
933
    }
11213
1.50k
11214
1.50k
    // Now hoist the DWord down to the right half.
11215
1.50k
    
int FreeDWord = (PSHUFDMask[DestOffset / 2] < 0 ? 1.50k
0171
:
11.33k
) + DestOffset / 2;
11216
1.50k
    assert(PSHUFDMask[FreeDWord] < 0 && "DWord not free");
11217
1.50k
    PSHUFDMask[FreeDWord] = IncomingInputs[0] / 2;
11218
1.50k
    for (int &M : HalfMask)
11219
6.02k
      for (int Input : IncomingInputs)
11220
9.75k
        
if (9.75k
M == Input9.75k
)
11221
2.62k
          M = FreeDWord * 2 + Input % 2;
11222
3.97k
  };
11223
1.98k
  moveInputsToRightHalf(HToLInputs, LToLInputs, PSHUFHMask, LoMask, HiMask,
11224
1.98k
                        /*SourceOffset*/ 4, /*DestOffset*/ 0);
11225
1.98k
  moveInputsToRightHalf(LToHInputs, HToHInputs, PSHUFLMask, HiMask, LoMask,
11226
1.98k
                        /*SourceOffset*/ 0, /*DestOffset*/ 4);
11227
1.98k
11228
1.98k
  // Now enact all the shuffles we've computed to move the inputs into their
11229
1.98k
  // target half.
11230
1.98k
  if (!isNoopShuffleMask(PSHUFLMask))
11231
739
    V = DAG.getNode(X86ISD::PSHUFLW, DL, VT, V,
11232
739
                    getV4X86ShuffleImm8ForMask(PSHUFLMask, DL, DAG));
11233
1.98k
  if (!isNoopShuffleMask(PSHUFHMask))
11234
758
    V = DAG.getNode(X86ISD::PSHUFHW, DL, VT, V,
11235
758
                    getV4X86ShuffleImm8ForMask(PSHUFHMask, DL, DAG));
11236
1.98k
  if (!isNoopShuffleMask(PSHUFDMask))
11237
1.52k
    V = DAG.getBitcast(
11238
1.52k
        VT,
11239
1.52k
        DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, DAG.getBitcast(PSHUFDVT, V),
11240
1.52k
                    getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
11241
1.98k
11242
1.98k
  // At this point, each half should contain all its inputs, and we can then
11243
1.98k
  // just shuffle them into their final position.
11244
1.98k
  assert(count_if(LoMask, [](int M) { return M >= 4; }) == 0 &&
11245
1.98k
         "Failed to lift all the high half inputs to the low mask!");
11246
1.98k
  assert(count_if(HiMask, [](int M) { return M >= 0 && M < 4; }) == 0 &&
11247
1.98k
         "Failed to lift all the low half inputs to the high mask!");
11248
1.98k
11249
1.98k
  // Do a half shuffle for the low mask.
11250
1.98k
  if (!isNoopShuffleMask(LoMask))
11251
1.21k
    V = DAG.getNode(X86ISD::PSHUFLW, DL, VT, V,
11252
1.21k
                    getV4X86ShuffleImm8ForMask(LoMask, DL, DAG));
11253
1.98k
11254
1.98k
  // Do a half shuffle with the high mask after shifting its values down.
11255
1.98k
  for (int &M : HiMask)
11256
7.95k
    if (M >= 0)
11257
3.78k
      M -= 4;
11258
1.98k
  if (!isNoopShuffleMask(HiMask))
11259
893
    V = DAG.getNode(X86ISD::PSHUFHW, DL, VT, V,
11260
893
                    getV4X86ShuffleImm8ForMask(HiMask, DL, DAG));
11261
2.45k
11262
2.45k
  return V;
11263
2.45k
}
11264
11265
/// Helper to form a PSHUFB-based shuffle+blend, opportunistically avoiding the
11266
/// blend if only one input is used.
11267
static SDValue lowerVectorShuffleAsBlendOfPSHUFBs(
11268
    const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
11269
    const APInt &Zeroable, SelectionDAG &DAG, bool &V1InUse,
11270
1.73k
    bool &V2InUse) {
11271
1.73k
  SDValue V1Mask[16];
11272
1.73k
  SDValue V2Mask[16];
11273
1.73k
  V1InUse = false;
11274
1.73k
  V2InUse = false;
11275
1.73k
11276
1.73k
  int Size = Mask.size();
11277
1.73k
  int Scale = 16 / Size;
11278
29.4k
  for (int i = 0; 
i < 1629.4k
;
++i27.7k
) {
11279
27.7k
    if (
Mask[i / Scale] < 027.7k
) {
11280
10.8k
      V1Mask[i] = V2Mask[i] = DAG.getUNDEF(MVT::i8);
11281
27.7k
    } else {
11282
16.8k
      const int ZeroMask = 0x80;
11283
13.5k
      int V1Idx = Mask[i / Scale] < Size ? Mask[i / Scale] * Scale + i % Scale
11284
3.39k
                                          : ZeroMask;
11285
16.8k
      int V2Idx = Mask[i / Scale] < Size
11286
13.5k
                      ? ZeroMask
11287
3.39k
                      : (Mask[i / Scale] - Size) * Scale + i % Scale;
11288
16.8k
      if (Zeroable[i / Scale])
11289
99
        V1Idx = V2Idx = ZeroMask;
11290
16.8k
      V1Mask[i] = DAG.getConstant(V1Idx, DL, MVT::i8);
11291
16.8k
      V2Mask[i] = DAG.getConstant(V2Idx, DL, MVT::i8);
11292
16.8k
      V1InUse |= (ZeroMask != V1Idx);
11293
16.8k
      V2InUse |= (ZeroMask != V2Idx);
11294
16.8k
    }
11295
27.7k
  }
11296
1.73k
11297
1.73k
  if (V1InUse)
11298
1.72k
    V1 = DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8,
11299
1.72k
                     DAG.getBitcast(MVT::v16i8, V1),
11300
1.72k
                     DAG.getBuildVector(MVT::v16i8, DL, V1Mask));
11301
1.73k
  if (V2InUse)
11302
590
    V2 = DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8,
11303
590
                     DAG.getBitcast(MVT::v16i8, V2),
11304
590
                     DAG.getBuildVector(MVT::v16i8, DL, V2Mask));
11305
1.73k
11306
1.73k
  // If we need shuffled inputs from both, blend the two.
11307
1.73k
  SDValue V;
11308
1.73k
  if (V1InUse && V2InUse)
11309
584
    V = DAG.getNode(ISD::OR, DL, MVT::v16i8, V1, V2);
11310
1.73k
  else
11311
1.15k
    V = V1InUse ? V1 : V2;
11312
1.73k
11313
1.73k
  // Cast the result back to the correct type.
11314
1.73k
  return DAG.getBitcast(VT, V);
11315
1.73k
}
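The routine listed above encodes one PSHUFB control byte per output lane, with 0x80 forcing a zeroed lane. A simplified standalone sketch of that per-byte computation, assuming plain C++ arrays instead of SDValues, only the v16i8 case (Scale == 1), and the Zeroable handling omitted; names are hypothetical:

#include <array>
#include <cstdint>
#include <cstdio>

// Build the two 16-entry PSHUFB control vectors for a v16i8 shuffle mask.
// 0x80 zeroes the destination byte, mirroring the ZeroMask convention above.
// Mask entries in [0,16) pick from the first source, [16,32) from the second,
// and -1 means undef.
static void buildPSHUFBControls(const std::array<int, 16> &Mask,
                                std::array<uint8_t, 16> &V1Ctl,
                                std::array<uint8_t, 16> &V2Ctl) {
  const uint8_t ZeroByte = 0x80;
  for (int i = 0; i < 16; ++i) {
    int M = Mask[i];
    if (M < 0) {                 // undef lane: either control value works
      V1Ctl[i] = V2Ctl[i] = ZeroByte;
    } else if (M < 16) {         // byte comes from the first source
      V1Ctl[i] = static_cast<uint8_t>(M);
      V2Ctl[i] = ZeroByte;
    } else {                     // byte comes from the second source
      V1Ctl[i] = ZeroByte;
      V2Ctl[i] = static_cast<uint8_t>(M - 16);
    }
  }
}

int main() {
  // Interleave the low bytes of two vectors: <0,16,1,17,...,7,23>.
  std::array<int, 16> Mask;
  for (int i = 0; i < 16; ++i)
    Mask[i] = (i % 2 == 0) ? i / 2 : 16 + i / 2;
  std::array<uint8_t, 16> A, B;
  buildPSHUFBControls(Mask, A, B);
  for (int i = 0; i < 16; ++i)
    std::printf("%2d: V1=0x%02x V2=0x%02x\n", i, A[i], B[i]);
  return 0;
}

When both controls end up used, the lowered sequence ORs the two PSHUFB results, which is why lanes zeroed in one control are exactly the lanes selected in the other.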
11316
11317
/// \brief Generic lowering of 8-lane i16 shuffles.
11318
///
11319
/// This handles both single-input shuffles and combined shuffle/blends with
11320
/// two inputs. The single input shuffles are immediately delegated to
11321
/// a dedicated lowering routine.
11322
///
11323
/// The blends are lowered in one of three fundamental ways. If there are few
11324
/// enough inputs, it delegates to a basic UNPCK-based strategy. If the shuffle
11325
/// of the input is significantly cheaper when lowered as an interleaving of
11326
/// the two inputs, try to interleave them. Otherwise, blend the low and high
11327
/// halves of the inputs separately (making them have relatively few inputs)
11328
/// and then concatenate them.
11329
static SDValue lowerV8I16VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
11330
                                       const APInt &Zeroable,
11331
                                       SDValue V1, SDValue V2,
11332
                                       const X86Subtarget &Subtarget,
11333
4.47k
                                       SelectionDAG &DAG) {
11334
4.47k
  assert(V1.getSimpleValueType() == MVT::v8i16 && "Bad operand type!");
11335
4.47k
  assert(V2.getSimpleValueType() == MVT::v8i16 && "Bad operand type!");
11336
4.47k
  assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
11337
4.47k
11338
4.47k
  // Whenever we can lower this as a zext, that instruction is strictly faster
11339
4.47k
  // than any alternative.
11340
4.47k
  if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
11341
4.47k
          DL, MVT::v8i16, V1, V2, Mask, Zeroable, Subtarget, DAG))
11342
370
    return ZExt;
11343
4.10k
11344
32.8k
  
int NumV2Inputs = count_if(Mask, [](int M) 4.10k
{ return M >= 8; }32.8k
);
11345
4.10k
11346
4.10k
  if (
NumV2Inputs == 04.10k
) {
11347
2.40k
    // Check for being able to broadcast a single element.
11348
2.40k
    if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(
11349
2.40k
            DL, MVT::v8i16, V1, V2, Mask, Subtarget, DAG))
11350
154
      return Broadcast;
11351
2.25k
11352
2.25k
    // Try to use shift instructions.
11353
2.25k
    
if (SDValue 2.25k
Shift2.25k
= lowerVectorShuffleAsShift(DL, MVT::v8i16, V1, V1, Mask,
11354
2.25k
                                                  Zeroable, Subtarget, DAG))
11355
220
      return Shift;
11356
2.03k
11357
2.03k
    // Use dedicated unpack instructions for masks that match their pattern.
11358
2.03k
    
if (SDValue 2.03k
V2.03k
=
11359
2.03k
            lowerVectorShuffleWithUNPCK(DL, MVT::v8i16, Mask, V1, V2, DAG))
11360
26
      return V;
11361
2.00k
11362
2.00k
    // Try to use byte rotation instructions.
11363
2.00k
    
if (SDValue 2.00k
Rotate2.00k
= lowerVectorShuffleAsByteRotate(DL, MVT::v8i16, V1, V1,
11364
2.00k
                                                        Mask, Subtarget, DAG))
11365
46
      return Rotate;
11366
1.96k
11367
1.96k
    // Make a copy of the mask so it can be modified.
11368
1.96k
    SmallVector<int, 8> MutableMask(Mask.begin(), Mask.end());
11369
1.96k
    return lowerV8I16GeneralSingleInputVectorShuffle(DL, MVT::v8i16, V1,
11370
1.96k
                                                     MutableMask, Subtarget,
11371
1.96k
                                                     DAG);
11372
1.96k
  }
11373
1.70k
11374
4.10k
  assert(llvm::any_of(Mask, [](int M) { return M >= 0 && M < 8; }) &&
11375
1.70k
         "All single-input shuffles should be canonicalized to be V1-input "
11376
1.70k
         "shuffles.");
11377
1.70k
11378
1.70k
  // Try to use shift instructions.
11379
1.70k
  if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v8i16, V1, V2, Mask,
11380
1.70k
                                                Zeroable, Subtarget, DAG))
11381
70
    return Shift;
11382
1.63k
11383
1.63k
  // See if we can use SSE4A Extraction / Insertion.
11384
1.63k
  if (Subtarget.hasSSE4A())
11385
51
    if (SDValue V = lowerVectorShuffleWithSSE4A(DL, MVT::v8i16, V1, V2, Mask,
11386
51
                                                Zeroable, DAG))
11387
35
      return V;
11388
1.59k
11389
1.59k
  // There are special ways we can lower some single-element blends.
11390
1.59k
  if (NumV2Inputs == 1)
11391
532
    if (SDValue V = lowerVectorShuffleAsElementInsertion(
11392
532
            DL, MVT::v8i16, V1, V2, Mask, Zeroable, Subtarget, DAG))
11393
0
      return V;
11394
1.59k
11395
1.59k
  // We have different paths for blend lowering, but they all must use the
11396
1.59k
  // *exact* same predicate.
11397
1.59k
  bool IsBlendSupported = Subtarget.hasSSE41();
11398
1.59k
  if (IsBlendSupported)
11399
995
    if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8i16, V1, V2, Mask,
11400
995
                                                  Zeroable, Subtarget, DAG))
11401
386
      return Blend;
11402
1.21k
11403
1.21k
  if (SDValue Masked = lowerVectorShuffleAsBitMask(DL, MVT::v8i16, V1, V2, Mask,
11404
1.21k
                                                   Zeroable, DAG))
11405
44
    return Masked;
11406
1.16k
11407
1.16k
  // Use dedicated unpack instructions for masks that match their pattern.
11408
1.16k
  if (SDValue V =
11409
1.16k
          lowerVectorShuffleWithUNPCK(DL, MVT::v8i16, Mask, V1, V2, DAG))
11410
457
    return V;
11411
709
11412
709
  // Try to use byte rotation instructions.
11413
709
  if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
11414
709
          DL, MVT::v8i16, V1, V2, Mask, Subtarget, DAG))
11415
97
    return Rotate;
11416
612
11417
612
  
if (SDValue 612
BitBlend612
=
11418
612
          lowerVectorShuffleAsBitBlend(DL, MVT::v8i16, V1, V2, Mask, DAG))
11419
54
    return BitBlend;
11420
558
11421
558
  // Try to lower by permuting the inputs into an unpack instruction.
11422
558
  if (SDValue Unpack = lowerVectorShuffleAsPermuteAndUnpack(DL, MVT::v8i16, V1,
11423
558
                                                            V2, Mask, DAG))
11424
355
    return Unpack;
11425
203
11426
203
  // If we can't directly blend but can use PSHUFB, that will be better as it
11427
203
  // can both shuffle and set up the inefficient blend.
11428
203
  if (!IsBlendSupported && Subtarget.hasSSSE3()) {
11429
8
    bool V1InUse, V2InUse;
11430
8
    return lowerVectorShuffleAsBlendOfPSHUFBs(DL, MVT::v8i16, V1, V2, Mask,
11431
8
                                              Zeroable, DAG, V1InUse, V2InUse);
11432
8
  }
11433
195
11434
195
  // We can always bit-blend if we have to so the fallback strategy is to
11435
195
  // decompose into single-input permutes and blends.
11436
195
  return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v8i16, V1, V2,
11437
195
                                                    Mask, DAG);
11438
195
}
11439
11440
/// \brief Check whether a compaction lowering can be done by dropping even
11441
/// elements and compute how many times even elements must be dropped.
11442
///
11443
/// This handles shuffles which take every Nth element where N is a power of
11444
/// two. Example shuffle masks:
11445
///
11446
///  N = 1:  0,  2,  4,  6,  8, 10, 12, 14,  0,  2,  4,  6,  8, 10, 12, 14
11447
///  N = 1:  0,  2,  4,  6,  8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
11448
///  N = 2:  0,  4,  8, 12,  0,  4,  8, 12,  0,  4,  8, 12,  0,  4,  8, 12
11449
///  N = 2:  0,  4,  8, 12, 16, 20, 24, 28,  0,  4,  8, 12, 16, 20, 24, 28
11450
///  N = 3:  0,  8,  0,  8,  0,  8,  0,  8,  0,  8,  0,  8,  0,  8,  0,  8
11451
///  N = 3:  0,  8, 16, 24,  0,  8, 16, 24,  0,  8, 16, 24,  0,  8, 16, 24
11452
///
11453
/// Any of these lanes can of course be undef.
11454
///
11455
/// This routine only supports N <= 3.
11456
/// FIXME: Evaluate whether either AVX or AVX-512 have any opportunities here
11457
/// for larger N.
11458
///
11459
/// \returns N above, or the number of times even elements must be dropped if
11460
/// there is such a number. Otherwise returns zero.
11461
static int canLowerByDroppingEvenElements(ArrayRef<int> Mask,
11462
232
                                          bool IsSingleInput) {
11463
232
  // The modulus for the shuffle vector entries is based on whether this is
11464
232
  // a single input or not.
11465
232
  int ShuffleModulus = Mask.size() * (IsSingleInput ? 1 : 2);
11466
232
  assert(isPowerOf2_32((uint32_t)ShuffleModulus) &&
11467
232
         "We should only be called with masks with a power-of-2 size!");
11468
232
11469
232
  uint64_t ModMask = (uint64_t)ShuffleModulus - 1;
11470
232
11471
232
  // We track whether the input is viable for all power-of-2 strides 2^1, 2^2,
11472
232
  // and 2^3 simultaneously. This is because we may have ambiguity with
11473
232
  // partially undef inputs.
11474
232
  bool ViableForN[3] = {true, true, true};
11475
232
11476
1.78k
  for (int i = 0, e = Mask.size(); i < e; ++i) {
11477
1.69k
    // Ignore undef lanes, we'll optimistically collapse them to the pattern we
11478
1.69k
    // want.
11479
1.69k
    if (Mask[i] < 0)
11480
523
      continue;
11481
1.16k
11482
1.16k
    bool IsAnyViable = false;
11483
4.67k
    for (unsigned j = 0; 
j != array_lengthof(ViableForN)4.67k
;
++j3.50k
)
11484
3.50k
      
if (3.50k
ViableForN[j]3.50k
) {
11485
1.90k
        uint64_t N = j + 1;
11486
1.90k
11487
1.90k
        // The shuffle mask must be equal to (i * 2^N) % M.
11488
1.90k
        if ((uint64_t)Mask[i] == (((uint64_t)i << N) & ModMask))
11489
1.29k
          IsAnyViable = true;
11490
1.90k
        else
11491
605
          ViableForN[j] = false;
11492
3.50k
      }
11493
1.16k
    // Early exit if we exhaust the possible powers of two.
11494
1.16k
    if (!IsAnyViable)
11495
141
      break;
11496
1.69k
  }
11497
232
11498
686
  for (unsigned j = 0; j != array_lengthof(ViableForN); ++j)
11499
545
    if (ViableForN[j])
11500
91
      return j + 1;
11501
232
11502
232
  // Return 0 as there is no viable power of two.
11503
141
  return 0;
11504
232
}
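The comment block above describes the even-drop test with worked mask examples. A minimal re-implementation on plain containers, assuming the same power-of-two mask sizes and keeping the same viability bookkeeping (the early exit is omitted for brevity; names are hypothetical):

#include <cstdint>
#include <cstdio>
#include <vector>

// Return N in [1,3] if taking every (2^N)-th element reproduces the mask
// (undef entries, encoded as -1, match anything), or 0 otherwise.
// SingleInput mirrors the IsSingleInput flag above: it halves the modulus.
static int dropEvenElementsFactor(const std::vector<int> &Mask, bool SingleInput) {
  uint64_t Modulus = Mask.size() * (SingleInput ? 1 : 2);
  uint64_t ModMask = Modulus - 1;      // Mask.size() is a power of two here.
  bool Viable[3] = {true, true, true};
  for (size_t i = 0; i < Mask.size(); ++i) {
    if (Mask[i] < 0)
      continue;
    for (unsigned j = 0; j < 3; ++j)
      if (Viable[j] && (uint64_t)Mask[i] != ((i << (j + 1)) & ModMask))
        Viable[j] = false;
  }
  for (unsigned j = 0; j < 3; ++j)
    if (Viable[j])
      return j + 1;
  return 0;
}

int main() {
  // The N = 2 single-input example from the comment block above.
  std::vector<int> Mask = {0, 4, 8, 12, 0, 4, 8, 12, 0, 4, 8, 12, 0, 4, 8, 12};
  std::printf("N = %d\n", dropEvenElementsFactor(Mask, /*SingleInput=*/true));
  return 0;
}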
11505
11506
/// \brief Generic lowering of v16i8 shuffles.
11507
///
11508
/// This is a hybrid strategy to lower v16i8 vectors. It first attempts to
11509
/// detect any complexity reducing interleaving. If that doesn't help, it uses
11510
/// UNPCK to spread the i8 elements across two i16-element vectors, and uses
11511
/// the existing lowering for v8i16 blends on each half, finally PACK-ing them
11512
/// back together.
11513
static SDValue lowerV16I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
11514
                                       const APInt &Zeroable,
11515
                                       SDValue V1, SDValue V2,
11516
                                       const X86Subtarget &Subtarget,
11517
4.49k
                                       SelectionDAG &DAG) {
11518
4.49k
  assert(V1.getSimpleValueType() == MVT::v16i8 && "Bad operand type!");
11519
4.49k
  assert(V2.getSimpleValueType() == MVT::v16i8 && "Bad operand type!");
11520
4.49k
  assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
11521
4.49k
11522
4.49k
  // Try to use shift instructions.
11523
4.49k
  if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v16i8, V1, V2, Mask,
11524
4.49k
                                                Zeroable, Subtarget, DAG))
11525
244
    return Shift;
11526
4.25k
11527
4.25k
  // Try to use byte rotation instructions.
11528
4.25k
  
if (SDValue 4.25k
Rotate4.25k
= lowerVectorShuffleAsByteRotate(
11529
4.25k
          DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG))
11530
112
    return Rotate;
11531
4.13k
11532
4.13k
  // Try to use a zext lowering.
11533
4.13k
  if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
11534
4.13k
          DL, MVT::v16i8, V1, V2, Mask, Zeroable, Subtarget, DAG))
11535
766
    return ZExt;
11536
3.37k
11537
3.37k
  // See if we can use SSE4A Extraction / Insertion.
11538
3.37k
  if (Subtarget.hasSSE4A())
11539
57
    if (SDValue V = lowerVectorShuffleWithSSE4A(DL, MVT::v16i8, V1, V2, Mask,
11540
57
                                                Zeroable, DAG))
11541
27
      return V;
11542
3.34k
11543
53.5k
  int NumV2Elements = count_if(Mask, [](int M) { return M >= 16; });
11544
3.34k
11545
3.34k
  // For single-input shuffles, there are some nicer lowering tricks we can use.
11546
3.34k
  if (NumV2Elements == 0) {
11547
1.71k
    // Check for being able to broadcast a single element.
11548
1.71k
    if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(
11549
1.71k
            DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG))
11550
101
      return Broadcast;
11551
1.61k
11552
1.61k
    // Check whether we can widen this to an i16 shuffle by duplicating bytes.
11553
1.61k
    // Notably, this handles splat and partial-splat shuffles more efficiently.
11554
1.61k
    // However, it only makes sense if the pre-duplication shuffle simplifies
11555
1.61k
    // things significantly. Currently, this means we need to be able to
11556
1.61k
    // express the pre-duplication shuffle as an i16 shuffle.
11557
1.61k
    //
11558
1.61k
    // FIXME: We should check for other patterns which can be widened into an
11559
1.61k
    // i16 shuffle as well.
11560
1.61k
    auto canWidenViaDuplication = [](ArrayRef<int> Mask) {
11561
5.19k
      for (int i = 0; i < 16; i += 2)
11562
4.83k
        if (Mask[i] >= 0 && Mask[i + 1] >= 0 && Mask[i] != Mask[i + 1])
11563
1.24k
          return false;
11564
1.61k
11565
364
      return true;
11566
1.61k
    };
11567
1.61k
    auto tryToWidenViaDuplication = [&]() -> SDValue {
11568
1.61k
      if (!canWidenViaDuplication(Mask))
11569
1.24k
        return SDValue();
11570
364
      SmallVector<int, 4> LoInputs;
11571
364
      copy_if(Mask, std::back_inserter(LoInputs),
11572
5.82k
              [](int M) 
{ return M >= 0 && 5.82k
M < 85.02k
; });
11573
364
      std::sort(LoInputs.begin(), LoInputs.end());
11574
364
      LoInputs.erase(std::unique(LoInputs.begin(), LoInputs.end()),
11575
364
                     LoInputs.end());
11576
364
      SmallVector<int, 4> HiInputs;
11577
5.82k
      copy_if(Mask, std::back_inserter(HiInputs), [](int M) { return M >= 8; });
11578
364
      std::sort(HiInputs.begin(), HiInputs.end());
11579
364
      HiInputs.erase(std::unique(HiInputs.begin(), HiInputs.end()),
11580
364
                     HiInputs.end());
11581
364
11582
364
      bool TargetLo = LoInputs.size() >= HiInputs.size();
11583
364
      ArrayRef<int> InPlaceInputs = TargetLo ? LoInputs : HiInputs;
11584
364
      ArrayRef<int> MovingInputs = TargetLo ? HiInputs : LoInputs;
11585
364
11586
364
      int PreDupI16Shuffle[] = {-1, -1, -1, -1, -1, -1, -1, -1};
11587
364
      SmallDenseMap<int, int, 8> LaneMap;
11588
1.05k
      for (int I : InPlaceInputs) {
11589
1.05k
        PreDupI16Shuffle[I/2] = I/2;
11590
1.05k
        LaneMap[I] = I;
11591
1.05k
      }
11592
364
      int j = TargetLo ? 0 : 4, je = j + 4;
11593
427
      for (int i = 0, ie = MovingInputs.size(); i < ie; ++i) {
11594
74
        // Check if j is already a shuffle of this input. This happens when
11595
74
        // there are two adjacent bytes after we move the low one.
11596
74
        if (PreDupI16Shuffle[j] != MovingInputs[i] / 2) {
11597
74
          // If we haven't yet mapped the input, search for a slot into which
11598
74
          // we can map it.
11599
191
          while (j < je && PreDupI16Shuffle[j] >= 0)
11600
117
            ++j;
11601
74
11602
74
          if (j == je)
11603
74
            // We can't place the inputs into a single half with a simple i16 shuffle, so bail.
11604
11
            return SDValue();
11605
63
11606
63
          // Map this input with the i16 shuffle.
11607
63
          PreDupI16Shuffle[j] = MovingInputs[i] / 2;
11608
63
        }
11609
74
11610
74
        // Update the lane map based on the mapping we ended up with.
11611
63
        LaneMap[MovingInputs[i]] = 2 * j + MovingInputs[i] % 2;
11612
63
      }
11613
353
      V1 = DAG.getBitcast(
11614
353
          MVT::v16i8,
11615
353
          DAG.getVectorShuffle(MVT::v8i16, DL, DAG.getBitcast(MVT::v8i16, V1),
11616
353
                               DAG.getUNDEF(MVT::v8i16), PreDupI16Shuffle));
11617
353
11618
353
      // Unpack the bytes to form the i16s that will be shuffled into place.
11619
353
      V1 = DAG.getNode(TargetLo ? X86ISD::UNPCKL : X86ISD::UNPCKH, DL,
11620
353
                       MVT::v16i8, V1, V1);
11621
353
11622
353
      int PostDupI16Shuffle[8] = {-1, -1, -1, -1, -1, -1, -1, -1};
11623
6.00k
      for (int i = 0; i < 16; ++i)
11624
5.64k
        if (Mask[i] >= 0) {
11625
4.86k
          int MappedMask = LaneMap[Mask[i]] - (TargetLo ? 0 : 8);
11626
4.86k
          assert(MappedMask < 8 && "Invalid v8 shuffle mask!");
11627
4.86k
          if (PostDupI16Shuffle[i / 2] < 0)
11628
2.71k
            PostDupI16Shuffle[i / 2] = MappedMask;
11629
5.64k
          else
11630
5.64k
            assert(PostDupI16Shuffle[i / 2] == MappedMask &&
11631
5.64k
                   "Conflicting entries in the original shuffle!");
11632
5.64k
        }
11633
353
      return DAG.getBitcast(
11634
353
          MVT::v16i8,
11635
353
          DAG.getVectorShuffle(MVT::v8i16, DL, DAG.getBitcast(MVT::v8i16, V1),
11636
353
                               DAG.getUNDEF(MVT::v8i16), PostDupI16Shuffle));
11637
1.61k
    };
11638
1.61k
    if (SDValue V = tryToWidenViaDuplication())
11639
353
      return V;
11640
2.89k
  }
11641
2.89k
11642
2.89k
  if (SDValue Masked = lowerVectorShuffleAsBitMask(DL, MVT::v16i8, V1, V2, Mask,
11643
2.89k
                                                   Zeroable, DAG))
11644
572
    return Masked;
11645
2.31k
11646
2.31k
  // Use dedicated unpack instructions for masks that match their pattern.
11647
2.31k
  if (SDValue V =
11648
2.31k
          lowerVectorShuffleWithUNPCK(DL, MVT::v16i8, Mask, V1, V2, DAG))
11649
295
    return V;
11650
2.02k
11651
2.02k
  // Check for SSSE3 which lets us lower all v16i8 shuffles much more directly
11652
2.02k
  // with PSHUFB. It is important to do this before we attempt to generate any
11653
2.02k
  // blends but after all of the single-input lowerings. If the single input
11654
2.02k
  // lowerings can find an instruction sequence that is faster than a PSHUFB, we
11655
2.02k
  // want to preserve that and we can DAG combine any longer sequences into
11656
2.02k
  // a PSHUFB in the end. But once we start blending from multiple inputs,
11657
2.02k
  // the complexity of DAG combining bad patterns back into PSHUFB is too high,
11658
2.02k
  // and there are *very* few patterns that would actually be faster than the
11659
2.02k
  // PSHUFB approach because of its ability to zero lanes.
11660
2.02k
  //
11661
2.02k
  // FIXME: The only exceptions to the above are blends which are exact
11662
2.02k
  // interleavings with direct instructions supporting them. We currently don't
11663
2.02k
  // handle those well here.
11664
2.02k
  if (Subtarget.hasSSSE3()) {
11665
1.72k
    bool V1InUse = false;
11666
1.72k
    bool V2InUse = false;
11667
1.72k
11668
1.72k
    SDValue PSHUFB = lowerVectorShuffleAsBlendOfPSHUFBs(
11669
1.72k
        DL, MVT::v16i8, V1, V2, Mask, Zeroable, DAG, V1InUse, V2InUse);
11670
1.72k
11671
1.72k
    // If both V1 and V2 are in use and we can use a direct blend or an unpack,
11672
1.72k
    // do so. This avoids using them to handle blends-with-zero which is
11673
1.72k
    // important as a single pshufb is significantly faster for that.
11674
1.72k
    if (V1InUse && V2InUse) {
11675
576
      if (Subtarget.hasSSE41())
11676
536
        if (SDValue Blend = lowerVectorShuffleAsBlend(
11677
536
                DL, MVT::v16i8, V1, V2, Mask, Zeroable, Subtarget, DAG))
11678
28
          return Blend;
11679
548
11680
548
      // We can use an unpack to do the blending rather than an or in some
11681
548
      // cases. Even though the or may be (very minorly) more efficient, we
11682
548
      // preference this lowering because there are common cases where part of
11683
548
      // the complexity of the shuffles goes away when we do the final blend as
11684
548
      // an unpack.
11685
548
      // FIXME: It might be worth trying to detect if the unpack-feeding
11686
548
      // shuffles will both be pshufb, in which case we shouldn't bother with
11687
548
      // this.
11688
548
      if (SDValue Unpack = lowerVectorShuffleAsPermuteAndUnpack(
11689
548
              DL, MVT::v16i8, V1, V2, Mask, DAG))
11690
373
        return Unpack;
11691
1.32k
    }
11692
1.32k
11693
1.32k
    return PSHUFB;
11694
1.32k
  }
11695
298
11696
298
  // There are special ways we can lower some single-element blends.
11697
298
  if (NumV2Elements == 1)
11698
74
    if (SDValue V = lowerVectorShuffleAsElementInsertion(
11699
74
            DL, MVT::v16i8, V1, V2, Mask, Zeroable, Subtarget, DAG))
11700
0
      return V;
11701
298
11702
298
  if (SDValue BitBlend =
11703
298
          lowerVectorShuffleAsBitBlend(DL, MVT::v16i8, V1, V2, Mask, DAG))
11704
66
    return BitBlend;
11705
232
11706
232
  // Check whether a compaction lowering can be done. This handles shuffles
11707
232
  // which take every Nth element for some even N. See the helper function for
11708
232
  // details.
11709
232
  //
11710
232
  // We special case these as they can be particularly efficiently handled with
11711
232
  // the PACKUSB instruction on x86 and they show up in common patterns of
11712
232
  // rearranging bytes to truncate wide elements.
11713
232
  bool IsSingleInput = V2.isUndef();
11714
232
  if (int NumEvenDrops = canLowerByDroppingEvenElements(Mask, IsSingleInput)) {
11715
91
    // NumEvenDrops is the power of two stride of the elements. Another way of
11716
91
    // thinking about it is that we need to drop the even elements this many
11717
91
    // times to get the original input.
11718
91
11719
91
    // First we need to zero all the dropped bytes.
11720
91
    assert(NumEvenDrops <= 3 &&
11721
91
           "No support for dropping even elements more than 3 times.");
11722
91
    // We use the mask type to pick which bytes are preserved based on how many
11723
91
    // elements are dropped.
11724
91
    MVT MaskVTs[] = { MVT::v8i16, MVT::v4i32, MVT::v2i64 };
11725
91
    SDValue ByteClearMask = DAG.getBitcast(
11726
91
        MVT::v16i8, DAG.getConstant(0xFF, DL, MaskVTs[NumEvenDrops - 1]));
11727
91
    V1 = DAG.getNode(ISD::AND, DL, MVT::v16i8, V1, ByteClearMask);
11728
91
    if (!IsSingleInput)
11729
43
      V2 = DAG.getNode(ISD::AND, DL, MVT::v16i8, V2, ByteClearMask);
11730
91
11731
91
    // Now pack things back together.
11732
91
    V1 = DAG.getBitcast(MVT::v8i16, V1);
11733
91
    V2 = IsSingleInput ? 
V148
:
DAG.getBitcast(MVT::v8i16, V2)43
;
11734
91
    SDValue Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, V1, V2);
11735
122
    for (int i = 1; 
i < NumEvenDrops122
;
++i31
) {
11736
31
      Result = DAG.getBitcast(MVT::v8i16, Result);
11737
31
      Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, Result, Result);
11738
31
    }
11739
91
11740
91
    return Result;
11741
91
  }
11742
141
11743
141
  // Handle multi-input cases by blending single-input shuffles.
11744
141
  if (NumV2Elements > 0)
11745
64
    return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v16i8, V1, V2,
11746
64
                                                      Mask, DAG);
11747
77
11748
77
  // The fallback path for single-input shuffles widens this into two v8i16
11749
77
  // vectors with unpacks, shuffles those, and then pulls them back together
11750
77
  // with a pack.
11751
77
  SDValue V = V1;
11752
77
11753
77
  std::array<int, 8> LoBlendMask = {{-1, -1, -1, -1, -1, -1, -1, -1}};
11754
77
  std::array<int, 8> HiBlendMask = {{-1, -1, -1, -1, -1, -1, -1, -1}};
11755
1.30k
  for (int i = 0; i < 16; ++i)
11756
1.23k
    if (Mask[i] >= 0)
11757
1.23k
      (i < 8 ? LoBlendMask[i] : HiBlendMask[i % 8]) = Mask[i];
11758
77
11759
77
  SDValue VLoHalf, VHiHalf;
11760
77
  // Check if any of the odd lanes in the v16i8 are used. If not, we can mask
11761
77
  // them out and avoid using UNPCK{L,H} to extract the elements of V as
11762
77
  // i16s.
11763
181
  if (none_of(LoBlendMask, [](int M) { return M >= 0 && M % 2 == 1; }) &&
11764
77
      none_of(HiBlendMask, [](int M) { return M >= 0 && M % 2 == 1; })) {
11765
6
    // Use a mask to drop the high bytes.
11766
6
    VLoHalf = DAG.getBitcast(MVT::v8i16, V);
11767
6
    VLoHalf = DAG.getNode(ISD::AND, DL, MVT::v8i16, VLoHalf,
11768
6
                          DAG.getConstant(0x00FF, DL, MVT::v8i16));
11769
6
11770
6
    // This will be a single vector shuffle instead of a blend so nuke VHiHalf.
11771
6
    VHiHalf = DAG.getUNDEF(MVT::v8i16);
11772
6
11773
6
    // Squash the masks to point directly into VLoHalf.
11774
6
    for (int &M : LoBlendMask)
11775
48
      if (M >= 0)
11776
18
        M /= 2;
11777
6
    for (int &M : HiBlendMask)
11778
48
      if (M >= 0)
11779
2
        M /= 2;
11780
77
  } else {
11781
71
    // Otherwise just unpack the low half of V into VLoHalf and the high half into
11782
71
    // VHiHalf so that we can blend them as i16s.
11783
71
    SDValue Zero = getZeroVector(MVT::v16i8, Subtarget, DAG, DL);
11784
71
11785
71
    VLoHalf = DAG.getBitcast(
11786
71
        MVT::v8i16, DAG.getNode(X86ISD::UNPCKL, DL, MVT::v16i8, V, Zero));
11787
71
    VHiHalf = DAG.getBitcast(
11788
71
        MVT::v8i16, DAG.getNode(X86ISD::UNPCKH, DL, MVT::v16i8, V, Zero));
11789
71
  }
11790
4.49k
11791
4.49k
  SDValue LoV = DAG.getVectorShuffle(MVT::v8i16, DL, VLoHalf, VHiHalf, LoBlendMask);
11792
4.49k
  SDValue HiV = DAG.getVectorShuffle(MVT::v8i16, DL, VLoHalf, VHiHalf, HiBlendMask);
11793
4.49k
11794
4.49k
  return DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, LoV, HiV);
11795
4.49k
}
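The compaction path above relies on clearing the odd bytes and then using X86ISD::PACKUS to keep every other byte. A scalar model of one such step, assuming little-endian byte order and plain arrays (illustrative only, not the lowering itself):

#include <array>
#include <cstdint>
#include <cstdio>

// Scalar model of one PACKUS step on two v8i16 inputs: each 16-bit lane is
// saturated to [0,255] and the two results are concatenated.
static std::array<uint8_t, 16> packus(const std::array<uint16_t, 8> &A,
                                      const std::array<uint16_t, 8> &B) {
  auto Sat = [](uint16_t V) -> uint8_t { return V > 0xFF ? 0xFF : (uint8_t)V; };
  std::array<uint8_t, 16> R;
  for (int i = 0; i < 8; ++i) {
    R[i] = Sat(A[i]);
    R[i + 8] = Sat(B[i]);
  }
  return R;
}

int main() {
  // Sixteen bytes 0..15, viewed as eight little-endian i16 lanes.
  std::array<uint8_t, 16> Bytes;
  for (int i = 0; i < 16; ++i)
    Bytes[i] = (uint8_t)i;

  // The "ByteClearMask" AND with 0x00FF per i16 lane zeroes the odd bytes,
  // so each lane holds only its even byte and PACKUS cannot saturate.
  std::array<uint16_t, 8> Lanes;
  for (int i = 0; i < 8; ++i)
    Lanes[i] = (uint16_t)(Bytes[2 * i]); // low (even-indexed) byte survives

  std::array<uint8_t, 16> Packed = packus(Lanes, Lanes);
  for (int i = 0; i < 8; ++i)
    std::printf("%d ", Packed[i]); // prints 0 2 4 6 8 10 12 14
  std::printf("\n");
  return 0;
}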
11796
11797
/// \brief Dispatching routine to lower various 128-bit x86 vector shuffles.
11798
///
11799
/// This routine breaks down the specific type of 128-bit shuffle and
11800
/// dispatches to the lowering routines accordingly.
11801
static SDValue lower128BitVectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
11802
                                        MVT VT, SDValue V1, SDValue V2,
11803
                                        const APInt &Zeroable,
11804
                                        const X86Subtarget &Subtarget,
11805
24.6k
                                        SelectionDAG &DAG) {
11806
24.6k
  switch (VT.SimpleTy) {
11807
3.64k
  case MVT::v2i64:
11808
3.64k
    return lowerV2I64VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
11809
3.09k
  case MVT::v2f64:
11810
3.09k
    return lowerV2F64VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
11811
5.81k
  case MVT::v4i32:
11812
5.81k
    return lowerV4I32VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
11813
3.12k
  case MVT::v4f32:
11814
3.12k
    return lowerV4F32VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
11815
4.47k
  case MVT::v8i16:
11816
4.47k
    return lowerV8I16VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
11817
4.49k
  case MVT::v16i8:
11818
4.49k
    return lowerV16I8VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
11819
24.6k
11820
0
  default:
11821
0
    llvm_unreachable("Unimplemented!");
11822
0
  }
11823
0
}
11824
11825
/// \brief Generic routine to split vector shuffle into half-sized shuffles.
11826
///
11827
/// This routine just extracts two subvectors, shuffles them independently, and
11828
/// then concatenates them back together. This should work effectively with all
11829
/// AVX vector shuffle types.
11830
static SDValue splitAndLowerVectorShuffle(const SDLoc &DL, MVT VT, SDValue V1,
11831
                                          SDValue V2, ArrayRef<int> Mask,
11832
797
                                          SelectionDAG &DAG) {
11833
797
  assert(VT.getSizeInBits() >= 256 &&
11834
797
         "Only for 256-bit or wider vector shuffles!");
11835
797
  assert(V1.getSimpleValueType() == VT && "Bad operand type!");
11836
797
  assert(V2.getSimpleValueType() == VT && "Bad operand type!");
11837
797
11838
797
  ArrayRef<int> LoMask = Mask.slice(0, Mask.size() / 2);
11839
797
  ArrayRef<int> HiMask = Mask.slice(Mask.size() / 2);
11840
797
11841
797
  int NumElements = VT.getVectorNumElements();
11842
797
  int SplitNumElements = NumElements / 2;
11843
797
  MVT ScalarVT = VT.getVectorElementType();
11844
797
  MVT SplitVT = MVT::getVectorVT(ScalarVT, NumElements / 2);
11845
797
11846
797
  // Rather than splitting build-vectors, just build two narrower build
11847
797
  // vectors. This helps shuffling with splats and zeros.
11848
1.59k
  auto SplitVector = [&](SDValue V) {
11849
1.59k
    V = peekThroughBitcasts(V);
11850
1.59k
11851
1.59k
    MVT OrigVT = V.getSimpleValueType();
11852
1.59k
    int OrigNumElements = OrigVT.getVectorNumElements();
11853
1.59k
    int OrigSplitNumElements = OrigNumElements / 2;
11854
1.59k
    MVT OrigScalarVT = OrigVT.getVectorElementType();
11855
1.59k
    MVT OrigSplitVT = MVT::getVectorVT(OrigScalarVT, OrigNumElements / 2);
11856
1.59k
11857
1.59k
    SDValue LoV, HiV;
11858
1.59k
11859
1.59k
    auto *BV = dyn_cast<BuildVectorSDNode>(V);
11860
1.59k
    if (!BV) {
11861
1.56k
      LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OrigSplitVT, V,
11862
1.56k
                        DAG.getIntPtrConstant(0, DL));
11863
1.56k
      HiV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OrigSplitVT, V,
11864
1.56k
                        DAG.getIntPtrConstant(OrigSplitNumElements, DL));
11865
1.59k
    } else {
11866
33
11867
33
      SmallVector<SDValue, 16> LoOps, HiOps;
11868
309
      for (int i = 0; i < OrigSplitNumElements; ++i) {
11869
276
        LoOps.push_back(BV->getOperand(i));
11870
276
        HiOps.push_back(BV->getOperand(i + OrigSplitNumElements));
11871
276
      }
11872
33
      LoV = DAG.getBuildVector(OrigSplitVT, DL, LoOps);
11873
33
      HiV = DAG.getBuildVector(OrigSplitVT, DL, HiOps);
11874
33
    }
11875
1.59k
    return std::make_pair(DAG.getBitcast(SplitVT, LoV),
11876
1.59k
                          DAG.getBitcast(SplitVT, HiV));
11877
1.59k
  };
11878
797
11879
797
  SDValue LoV1, HiV1, LoV2, HiV2;
11880
797
  std::tie(LoV1, HiV1) = SplitVector(V1);
11881
797
  std::tie(LoV2, HiV2) = SplitVector(V2);
11882
797
11883
797
  // Now create two 4-way blends of these half-width vectors.
11884
1.59k
  auto HalfBlend = [&](ArrayRef<int> HalfMask) {
11885
1.59k
    bool UseLoV1 = false, UseHiV1 = false, UseLoV2 = false, UseHiV2 = false;
11886
1.59k
    SmallVector<int, 32> V1BlendMask((unsigned)SplitNumElements, -1);
11887
1.59k
    SmallVector<int, 32> V2BlendMask((unsigned)SplitNumElements, -1);
11888
1.59k
    SmallVector<int, 32> BlendMask((unsigned)SplitNumElements, -1);
11889
30.3k
    for (int i = 0; 
i < SplitNumElements30.3k
;
++i28.7k
) {
11890
28.7k
      int M = HalfMask[i];
11891
28.7k
      if (
M >= NumElements28.7k
) {
11892
2.21k
        if (M >= NumElements + SplitNumElements)
11893
892
          UseHiV2 = true;
11894
2.21k
        else
11895
1.32k
          UseLoV2 = true;
11896
2.21k
        V2BlendMask[i] = M - NumElements;
11897
2.21k
        BlendMask[i] = SplitNumElements + i;
11898
28.7k
      } else if (M >= 0) {
11899
24.5k
        if (M >= SplitNumElements)
11900
10.6k
          UseHiV1 = true;
11901
24.5k
        else
11902
13.9k
          UseLoV1 = true;
11903
26.5k
        V1BlendMask[i] = M;
11904
26.5k
        BlendMask[i] = i;
11905
26.5k
      }
11906
28.7k
    }
11907
1.59k
11908
1.59k
    // Because the lowering happens after all combining takes place, we need to
11909
1.59k
    // manually combine these blend masks as much as possible so that we create
11910
1.59k
    // a minimal number of high-level vector shuffle nodes.
11911
1.59k
11912
1.59k
    // First try just blending the halves of V1 or V2.
11913
1.59k
    if (!UseLoV1 && !UseHiV1 && !UseLoV2 && !UseHiV2)
11914
52
      return DAG.getUNDEF(SplitVT);
11915
1.54k
    if (!UseLoV2 && !UseHiV2)
11916
844
      return DAG.getVectorShuffle(SplitVT, DL, LoV1, HiV1, V1BlendMask);
11917
698
    if (!UseLoV1 && !UseHiV1)
11918
23
      return DAG.getVectorShuffle(SplitVT, DL, LoV2, HiV2, V2BlendMask);
11919
675
11920
675
    SDValue V1Blend, V2Blend;
11921
675
    if (UseLoV1 && UseHiV1) {
11922
38
      V1Blend =
11923
38
        DAG.getVectorShuffle(SplitVT, DL, LoV1, HiV1, V1BlendMask);
11924
675
    } else {
11925
637
      // We only use half of V1 so map the usage down into the final blend mask.
11926
637
      V1Blend = UseLoV1 ? 
LoV1355
:
HiV1282
;
11927
12.9k
      for (int i = 0; 
i < SplitNumElements12.9k
;
++i12.3k
)
11928
12.3k
        
if (12.3k
BlendMask[i] >= 0 && 12.3k
BlendMask[i] < SplitNumElements11.8k
)
11929
12.3k
          
BlendMask[i] = V1BlendMask[i] - (UseLoV1 ? 10.0k
05.24k
:
SplitNumElements4.81k
);
11930
637
    }
11931
675
    if (
UseLoV2 && 675
UseHiV2514
) {
11932
36
      V2Blend =
11933
36
        DAG.getVectorShuffle(SplitVT, DL, LoV2, HiV2, V2BlendMask);
11934
675
    } else {
11935
639
      // We only use half of V2 so map the usage down into the final blend mask.
11936
639
      V2Blend = UseLoV2 ? 
LoV2478
:
HiV2161
;
11937
13.0k
      for (int i = 0; 
i < SplitNumElements13.0k
;
++i12.3k
)
11938
12.3k
        
if (12.3k
BlendMask[i] >= SplitNumElements12.3k
)
11939
12.3k
          
BlendMask[i] = V2BlendMask[i] + (UseLoV2 ? 1.75k
SplitNumElements1.09k
:
0653
);
11940
639
    }
11941
1.59k
    return DAG.getVectorShuffle(SplitVT, DL, V1Blend, V2Blend, BlendMask);
11942
1.59k
  };
11943
797
  SDValue Lo = HalfBlend(LoMask);
11944
797
  SDValue Hi = HalfBlend(HiMask);
11945
797
  return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
11946
797
}
11947
11948
/// \brief Either split a vector in halves or decompose the shuffles and the
11949
/// blend.
11950
///
11951
/// This is provided as a good fallback for many lowerings of non-single-input
11952
/// shuffles with more than one 128-bit lane. In those cases, we want to select
11953
/// between splitting the shuffle into 128-bit components and stitching those
11954
/// back together vs. extracting the single-input shuffles and blending those
11955
/// results.
11956
static SDValue lowerVectorShuffleAsSplitOrBlend(const SDLoc &DL, MVT VT,
11957
                                                SDValue V1, SDValue V2,
11958
                                                ArrayRef<int> Mask,
11959
664
                                                SelectionDAG &DAG) {
11960
664
  assert(!V2.isUndef() && "This routine must not be used to lower single-input "
11961
664
         "shuffles as it could then recurse on itself.");
11962
664
  int Size = Mask.size();
11963
664
11964
664
  // If this can be modeled as a broadcast of two elements followed by a blend,
11965
664
  // prefer that lowering. This is especially important because broadcasts can
11966
664
  // often fold with memory operands.
11967
664
  auto DoBothBroadcast = [&] {
11968
664
    int V1BroadcastIdx = -1, V2BroadcastIdx = -1;
11969
664
    for (int M : Mask)
11970
2.80k
      if (M >= Size) {
11971
774
        if (V2BroadcastIdx < 0)
11972
217
          V2BroadcastIdx = M - Size;
11973
557
        else if (M - Size != V2BroadcastIdx)
11974
26
          return false;
11975
2.03k
      } else if (M >= 0) {
11976
1.36k
        if (V1BroadcastIdx < 0)
11977
651
          V1BroadcastIdx = M;
11978
710
        else if (M != V1BroadcastIdx)
11979
634
          return false;
11980
4
      }
11981
4
    return true;
11982
4
  };
11983
664
  if (DoBothBroadcast())
11984
4
    return lowerVectorShuffleAsDecomposedShuffleBlend(DL, VT, V1, V2, Mask,
11985
4
                                                      DAG);
11986
660
11987
660
  // If the inputs all stem from a single 128-bit lane of each input, then we
11988
660
  // split them rather than blending because the split will decompose to
11989
660
  // unusually few instructions.
11990
660
  int LaneCount = VT.getSizeInBits() / 128;
11991
660
  int LaneSize = Size / LaneCount;
11992
660
  SmallBitVector LaneInputs[2];
11993
660
  LaneInputs[0].resize(LaneCount, false);
11994
660
  LaneInputs[1].resize(LaneCount, false);
11995
17.4k
  for (int i = 0; i < Size; ++i)
11996
16.8k
    if (Mask[i] >= 0)
11997
14.7k
      LaneInputs[Mask[i] / Size][(Mask[i] % Size) / LaneSize] = true;
11998
660
  if (LaneInputs[0].count() <= 1 && LaneInputs[1].count() <= 1)
11999
78
    return splitAndLowerVectorShuffle(DL, VT, V1, V2, Mask, DAG);
12000
582
12001
582
  // Otherwise, just fall back to decomposed shuffles and a blend. This requires
12002
582
  // that the decomposed single-input shuffles don't end up here.
12003
582
  return lowerVectorShuffleAsDecomposedShuffleBlend(DL, VT, V1, V2, Mask, DAG);
12004
582
}
12005
12006
/// \brief Lower a vector shuffle crossing multiple 128-bit lanes as
12007
/// a permutation and blend of those lanes.
12008
///
12009
/// This essentially blends the out-of-lane inputs to each lane into the lane
12010
/// from a permuted copy of the vector. This lowering strategy results in four
12011
/// instructions in the worst case for a single-input cross lane shuffle which
12012
/// is lower than any other fully general cross-lane shuffle strategy I'm aware
12013
/// of. Special cases for each particular shuffle pattern should be handled
12014
/// prior to trying this lowering.
12015
static SDValue lowerVectorShuffleAsLanePermuteAndBlend(const SDLoc &DL, MVT VT,
12016
                                                       SDValue V1, SDValue V2,
12017
                                                       ArrayRef<int> Mask,
12018
                                                       SelectionDAG &DAG,
12019
287
                                                       const X86Subtarget &Subtarget) {
12020
287
  // FIXME: This should probably be generalized for 512-bit vectors as well.
12021
287
  assert(VT.is256BitVector() && "Only for 256-bit vector shuffles!");
12022
287
  int Size = Mask.size();
12023
287
  int LaneSize = Size / 2;
12024
287
12025
287
  // If there are only inputs from one 128-bit lane, splitting will in fact be
12026
287
  // less expensive. The flags track whether the given lane contains an element
12027
287
  // that crosses to another lane.
12028
287
  if (!Subtarget.hasAVX2()) {
12029
59
    bool LaneCrossing[2] = {false, false};
12030
451
    for (int i = 0; i < Size; ++i)
12031
392
      if (Mask[i] >= 0 && (Mask[i] % Size) / LaneSize != i / LaneSize)
12032
160
        LaneCrossing[(Mask[i] % Size) / LaneSize] = true;
12033
59
    if (!LaneCrossing[0] || !LaneCrossing[1])
12034
34
      return splitAndLowerVectorShuffle(DL, VT, V1, V2, Mask, DAG);
12035
228
  } else {
12036
228
    bool LaneUsed[2] = {false, false};
12037
5.33k
    for (int i = 0; i < Size; ++i)
12038
5.10k
      if (Mask[i] >= 0)
12039
4.13k
        LaneUsed[(Mask[i] / LaneSize)] = true;
12040
228
    if (!LaneUsed[0] || !LaneUsed[1])
12041
15
      return splitAndLowerVectorShuffle(DL, VT, V1, V2, Mask, DAG);
12042
238
  }
12043
238
12044
287
  assert(V2.isUndef() &&
12045
238
         "This last part of this routine only works on single input shuffles");
12046
238
12047
238
  SmallVector<int, 32> FlippedBlendMask(Size);
12048
5.11k
  for (int i = 0; i < Size; ++i)
12049
4.87k
    FlippedBlendMask[i] =
12050
4.87k
        Mask[i] < 0 ? -1 : (((Mask[i] % Size) / LaneSize == i / LaneSize)
12051
2.62k
                                ? Mask[i]
12052
1.37k
                                : Mask[i] % LaneSize +
12053
4.87k
                                      (i / LaneSize) * LaneSize + Size);
12054
238
12055
238
  // Flip the vector, and blend the results which should now be in-lane.
12056
238
  MVT PVT = VT.isFloatingPoint() ? MVT::v4f64 : MVT::v4i64;
12057
287
  SDValue Flipped = DAG.getBitcast(PVT, V1);
12058
287
  Flipped = DAG.getVectorShuffle(PVT, DL, Flipped, DAG.getUNDEF(PVT),
12059
287
                                 { 2, 3, 0, 1 });
12060
287
  Flipped = DAG.getBitcast(VT, Flipped);
12061
287
  return DAG.getVectorShuffle(VT, DL, V1, Flipped, FlippedBlendMask);
12062
287
}
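The FlippedBlendMask computed above redirects cross-lane elements to a lane-swapped copy of the input. A minimal standalone sketch of that mask rewrite on plain vectors (hypothetical helper name, no LLVM types):

#include <cstdio>
#include <vector>

// Build the blend mask used after flipping the two 128-bit lanes of a
// single-input 256-bit shuffle: in-lane elements keep their index, while
// cross-lane elements are redirected to the flipped copy (offset by Size).
static std::vector<int> flippedBlendMask(const std::vector<int> &Mask) {
  int Size = (int)Mask.size();
  int LaneSize = Size / 2;
  std::vector<int> Out(Size);
  for (int i = 0; i < Size; ++i)
    Out[i] = Mask[i] < 0
                 ? -1
                 : ((Mask[i] % Size) / LaneSize == i / LaneSize
                        ? Mask[i]
                        : Mask[i] % LaneSize + (i / LaneSize) * LaneSize + Size);
  return Out;
}

int main() {
  // v8i32 mask <4,5,6,7,0,1,2,3>: every element crosses lanes, so the blend
  // reads everything from the flipped copy (indices 8..15).
  std::vector<int> M = {4, 5, 6, 7, 0, 1, 2, 3};
  for (int V : flippedBlendMask(M))
    std::printf("%d ", V);
  std::printf("\n"); // prints 8 9 10 11 12 13 14 15
  return 0;
}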
12063
12064
/// \brief Handle lowering 2-lane 128-bit shuffles.
12065
static SDValue lowerV2X128VectorShuffle(const SDLoc &DL, MVT VT, SDValue V1,
12066
                                        SDValue V2, ArrayRef<int> Mask,
12067
                                        const APInt &Zeroable,
12068
                                        const X86Subtarget &Subtarget,
12069
2.90k
                                        SelectionDAG &DAG) {
12070
2.90k
  // With AVX2, use VPERMQ/VPERMPD for unary shuffles to allow memory folding.
12071
2.90k
  if (Subtarget.hasAVX2() && V2.isUndef())
12072
1.38k
    return SDValue();
12073
1.52k
12074
1.52k
  SmallVector<int, 4> WidenedMask;
12075
1.52k
  if (!canWidenShuffleElements(Mask, WidenedMask))
12076
899
    return SDValue();
12077
626
12078
626
  // TODO: If minimizing size and one of the inputs is a zero vector and the
12079
626
  // the zero vector has only one use, we could use a VPERM2X128 to save the
12080
626
  // instruction bytes needed to explicitly generate the zero vector.
12081
626
12082
626
  // Blends are faster and handle all the non-lane-crossing cases.
12083
626
  if (SDValue Blend = lowerVectorShuffleAsBlend(DL, VT, V1, V2, Mask,
12084
626
                                                Zeroable, Subtarget, DAG))
12085
110
    return Blend;
12086
516
12087
516
  bool IsLowZero = (Zeroable & 0x3) == 0x3;
12088
516
  bool IsHighZero = (Zeroable & 0xc) == 0xc;
12089
516
12090
516
  // If either input operand is a zero vector, use VPERM2X128 because its mask
12091
516
  // allows us to replace the zero input with an implicit zero.
12092
516
  if (!IsLowZero && !IsHighZero) {
12093
489
    // Check for patterns which can be matched with a single insert of a 128-bit
12094
489
    // subvector.
12095
489
    bool OnlyUsesV1 = isShuffleEquivalent(V1, V2, Mask, {0, 1, 0, 1});
12096
489
    if (OnlyUsesV1 || isShuffleEquivalent(V1, V2, Mask, {0, 1, 4, 5})) {
12097
190
12098
190
      // With AVX1, use vperm2f128 (below) to allow load folding. Otherwise,
12099
190
      // this will likely become vinsertf128 which can't fold a 256-bit memop.
12100
190
      if (!isa<LoadSDNode>(peekThroughBitcasts(V1))) {
12101
163
        MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(),
12102
163
                                     VT.getVectorNumElements() / 2);
12103
163
        SDValue LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V1,
12104
163
                                  DAG.getIntPtrConstant(0, DL));
12105
163
        SDValue HiV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT,
12106
163
                                  OnlyUsesV1 ? 
V152
:
V2111
,
12107
163
                                  DAG.getIntPtrConstant(0, DL));
12108
163
        return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LoV, HiV);
12109
163
      }
12110
353
    }
12111
489
  }
12112
353
12113
353
  // Otherwise form a 128-bit permutation. After accounting for undefs,
12114
353
  // convert the 64-bit shuffle mask selection values into 128-bit
12115
353
  // selection bits by dividing the indexes by 2 and shifting into positions
12116
353
  // defined by a vperm2*128 instruction's immediate control byte.
12117
353
12118
353
  // The immediate permute control byte looks like this:
12119
353
  //    [1:0] - select 128 bits from sources for low half of destination
12120
353
  //    [2]   - ignore
12121
353
  //    [3]   - zero low half of destination
12122
353
  //    [5:4] - select 128 bits from sources for high half of destination
12123
353
  //    [6]   - ignore
12124
353
  //    [7]   - zero high half of destination
12125
353
12126
516
  assert(WidenedMask[0] >= 0 && WidenedMask[1] >= 0 && "Undef half?");
12127
353
12128
353
  unsigned PermMask = 0;
12129
353
  PermMask |= IsLowZero  ? 0x08 : (WidenedMask[0] << 0);
12130
353
  PermMask |= IsHighZero ? 0x80 : (WidenedMask[1] << 4);
12131
353
12132
353
  // Check the immediate mask and replace unused sources with undef.
12133
353
  if (
(PermMask & 0x0a) != 0x00 && 353
(PermMask & 0xa0) != 0x0021
)
12134
17
    V1 = DAG.getUNDEF(VT);
12135
353
  if (
(PermMask & 0x0a) != 0x02 && 353
(PermMask & 0xa0) != 0x20349
)
12136
72
    V2 = DAG.getUNDEF(VT);
12137
2.90k
12138
2.90k
  return DAG.getNode(X86ISD::VPERM2X128, DL, VT, V1, V2,
12139
2.90k
                     DAG.getConstant(PermMask, DL, MVT::i8));
12140
2.90k
}
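The immediate control byte described in the comments above can be illustrated with a tiny standalone helper, assuming the same bit layout (bits [1:0] and [5:4] select source halves, bits [3] and [7] zero a half; helper name is hypothetical):

#include <cstdio>

// Assemble a VPERM2X128-style immediate from two 128-bit-half selectors
// (values 0-3 over the concatenated sources) and per-half zero flags.
static unsigned perm2x128Imm(int LoSel, int HiSel, bool ZeroLo, bool ZeroHi) {
  unsigned Imm = 0;
  Imm |= ZeroLo ? 0x08u : (unsigned)LoSel;
  Imm |= ZeroHi ? 0x80u : ((unsigned)HiSel << 4);
  return Imm;
}

int main() {
  // <0, 1, 4, 5>: low half of V1 and low half of V2 -> imm 0x20.
  std::printf("0x%02x\n", perm2x128Imm(/*LoSel=*/0, /*HiSel=*/2, false, false));
  // Low half of V1 with the upper half zeroed -> imm 0x80.
  std::printf("0x%02x\n", perm2x128Imm(0, 0, /*ZeroLo=*/false, /*ZeroHi=*/true));
  return 0;
}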
12141
12142
/// \brief Lower a vector shuffle by first fixing the 128-bit lanes and then
12143
/// shuffling each lane.
12144
///
12145
/// This will only succeed when the result of fixing the 128-bit lanes results
12146
/// in a single-input non-lane-crossing shuffle with a repeating shuffle mask in
12147
/// each 128-bit lanes. This handles many cases where we can quickly blend away
12148
/// the lane crosses early and then use simpler shuffles within each lane.
12149
///
12150
/// FIXME: It might be worthwhile at some point to support this without
12151
/// requiring the 128-bit lane-relative shuffles to be repeating, but currently
12152
/// in x86 only floating point has interesting non-repeating shuffles, and even
12153
/// those are still *marginally* more expensive.
12154
static SDValue lowerVectorShuffleByMerging128BitLanes(
12155
    const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
12156
983
    const X86Subtarget &Subtarget, SelectionDAG &DAG) {
12157
983
  assert(!V2.isUndef() && "This is only useful with multiple inputs.");
12158
983
12159
983
  int Size = Mask.size();
12160
983
  int LaneSize = 128 / VT.getScalarSizeInBits();
12161
983
  int NumLanes = Size / LaneSize;
12162
983
  assert(NumLanes > 1 && "Only handles 256-bit and wider shuffles.");
12163
983
12164
983
  // See if we can build a hypothetical 128-bit lane-fixing shuffle mask. Also
12165
983
  // check whether the in-128-bit lane shuffles share a repeating pattern.
12166
983
  SmallVector<int, 4> Lanes((unsigned)NumLanes, -1);
12167
983
  SmallVector<int, 4> InLaneMask((unsigned)LaneSize, -1);
12168
8.37k
  for (int i = 0; i < Size; ++i) {
12169
8.29k
    if (Mask[i] < 0)
12170
708
      continue;
12171
7.58k
12172
7.58k
    int j = i / LaneSize;
12173
7.58k
12174
7.58k
    if (
Lanes[j] < 07.58k
) {
12175
1.22k
      // First entry we've seen for this lane.
12176
1.22k
      Lanes[j] = Mask[i] / LaneSize;
12177
7.58k
    } else if (Lanes[j] != Mask[i] / LaneSize) {
12178
886
      // This doesn't match the lane selected previously!
12179
886
      return SDValue();
12180
886
    }
12181
6.69k
12182
6.69k
    // Check that within each lane we have a consistent shuffle mask.
12183
6.69k
    int k = i % LaneSize;
12184
6.69k
    if (
InLaneMask[k] < 06.69k
) {
12185
5.26k
      InLaneMask[k] = Mask[i] % LaneSize;
12186
6.69k
    } else 
if (1.42k
InLaneMask[k] != Mask[i] % LaneSize1.42k
) {
12187
13
      // This doesn't fit a repeating in-lane mask.
12188
13
      return SDValue();
12189
13
    }
12190
8.29k
  }
12191
983
12192
983
  // First shuffle the lanes into place.
12193
84
  MVT LaneVT = MVT::getVectorVT(VT.isFloatingPoint() ? MVT::f64 : MVT::i64,
12194
84
                                VT.getSizeInBits() / 64);
12195
84
  SmallVector<int, 8> LaneMask((unsigned)NumLanes * 2, -1);
12196
252
  for (int i = 0; i < NumLanes; ++i)
12197
168
    if (Lanes[i] >= 0) {
12198
168
      LaneMask[2 * i + 0] = 2*Lanes[i] + 0;
12199
168
      LaneMask[2 * i + 1] = 2*Lanes[i] + 1;
12200
168
    }
12201
84
12202
84
  V1 = DAG.getBitcast(LaneVT, V1);
12203
84
  V2 = DAG.getBitcast(LaneVT, V2);
12204
84
  SDValue LaneShuffle = DAG.getVectorShuffle(LaneVT, DL, V1, V2, LaneMask);
12205
84
12206
84
  // Cast it back to the type we actually want.
12207
84
  LaneShuffle = DAG.getBitcast(VT, LaneShuffle);
12208
84
12209
84
  // Now do a simple shuffle that isn't lane crossing.
12210
84
  SmallVector<int, 8> NewMask((unsigned)Size, -1);
12211
1.04k
  for (int i = 0; i < Size; ++i)
12212
960
    
    if (Mask[i] >= 0)
12213
952
      NewMask[i] = (i / LaneSize) * LaneSize + Mask[i] % LaneSize;
12214
84
  assert(!is128BitLaneCrossingShuffleMask(VT, NewMask) &&
12215
84
         "Must not introduce lane crosses at this point!");
12216
84
12217
84
  return DAG.getVectorShuffle(VT, DL, LaneShuffle, DAG.getUNDEF(VT), NewMask);
12218
983
}
12219
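Aside (illustrative only, not from the listed source): the Lanes/InLaneMask bookkeeping above is easiest to see on a simple v8f32 mask such as <4,5,6,7,0,1,2,3>: each destination 128-bit lane reads entirely from the other source lane, and the in-lane pattern <0,1,2,3> repeats, so a lane swap followed by a non-crossing shuffle reproduces the mask. A standalone sketch of that decomposition (without the consistency checks the real routine performs, and for a single input):

#include <array>
#include <cassert>

int main() {
  const int LaneSize = 4;                           // v8f32: 4 elements per lane
  std::array<int, 8> Mask = {4, 5, 6, 7, 0, 1, 2, 3};
  std::array<int, 2> Lanes = {-1, -1};              // source lane per dest lane
  std::array<int, 4> InLaneMask = {-1, -1, -1, -1}; // repeating in-lane mask
  for (int i = 0; i < 8; ++i) {
    Lanes[i / LaneSize] = Mask[i] / LaneSize;
    InLaneMask[i % LaneSize] = Mask[i] % LaneSize;
  }
  // Destination lane 0 pulls from source lane 1 and vice versa, and the
  // per-lane pattern is the identity, so the lane-fixing shuffle does all
  // the cross-lane work.
  assert(Lanes[0] == 1 && Lanes[1] == 0);
  assert(InLaneMask == (std::array<int, 4>{0, 1, 2, 3}));
  return 0;
}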
12220
/// Lower shuffles where an entire half of a 256 or 512-bit vector is UNDEF.
12221
/// This allows for fast cases such as subvector extraction/insertion
12222
/// or shuffling smaller vector types which can lower more efficiently.
12223
static SDValue lowerVectorShuffleWithUndefHalf(const SDLoc &DL, MVT VT,
12224
                                               SDValue V1, SDValue V2,
12225
                                               ArrayRef<int> Mask,
12226
                                               const X86Subtarget &Subtarget,
12227
12.5k
                                               SelectionDAG &DAG) {
12228
12.5k
  assert((VT.is256BitVector() || VT.is512BitVector()) &&
12229
12.5k
         "Expected 256-bit or 512-bit vector");
12230
12.5k
12231
12.5k
  unsigned NumElts = VT.getVectorNumElements();
12232
12.5k
  unsigned HalfNumElts = NumElts / 2;
12233
12.5k
  MVT HalfVT = MVT::getVectorVT(VT.getVectorElementType(), HalfNumElts);
12234
12.5k
12235
12.5k
  bool UndefLower = isUndefInRange(Mask, 0, HalfNumElts);
12236
12.5k
  bool UndefUpper = isUndefInRange(Mask, HalfNumElts, HalfNumElts);
12237
12.5k
  if (!UndefLower && !UndefUpper)
12238
10.6k
    return SDValue();
12239
1.90k
12240
1.90k
  // Upper half is undef and lower half is whole upper subvector.
12241
1.90k
  // e.g. vector_shuffle <4, 5, 6, 7, u, u, u, u> or <2, 3, u, u>
12242
1.90k
  
  if (UndefUpper &&
12243
1.90k
      isSequentialOrUndefInRange(Mask, 0, HalfNumElts, HalfNumElts)) {
12244
95
    SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V1,
12245
95
                             DAG.getIntPtrConstant(HalfNumElts, DL));
12246
95
    return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), Hi,
12247
95
                       DAG.getIntPtrConstant(0, DL));
12248
95
  }
12249
1.81k
12250
1.81k
  // Lower half is undef and upper half is whole lower subvector.
12251
1.81k
  // e.g. vector_shuffle <u, u, u, u, 0, 1, 2, 3> or <u, u, 0, 1>
12252
1.81k
  
  if (UndefLower &&
12253
1.81k
      isSequentialOrUndefInRange(Mask, HalfNumElts, HalfNumElts, 0)) {
12254
18
    SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V1,
12255
18
                             DAG.getIntPtrConstant(0, DL));
12256
18
    return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), Hi,
12257
18
                       DAG.getIntPtrConstant(HalfNumElts, DL));
12258
18
  }
12259
1.79k
12260
1.79k
  // If the shuffle only uses two of the four halves of the input operands,
12261
1.79k
  // then extract them and perform the 'half' shuffle at half width.
12262
1.79k
  // e.g. vector_shuffle <X, X, X, X, u, u, u, u> or <X, X, u, u>
12263
1.79k
  int HalfIdx1 = -1, HalfIdx2 = -1;
12264
1.79k
  SmallVector<int, 8> HalfMask(HalfNumElts);
12265
1.79k
  unsigned Offset = UndefLower ? HalfNumElts : 0;
12266
14.2k
  for (unsigned i = 0; i != HalfNumElts; ++i) {
12267
12.5k
    int M = Mask[i + Offset];
12268
12.5k
    if (M < 0) {
12269
4.59k
      HalfMask[i] = M;
12270
4.59k
      continue;
12271
4.59k
    }
12272
7.95k
12273
7.95k
    // Determine which of the 4 half vectors this element is from.
12274
7.95k
    // i.e. 0 = Lower V1, 1 = Upper V1, 2 = Lower V2, 3 = Upper V2.
12275
7.95k
    int HalfIdx = M / HalfNumElts;
12276
7.95k
12277
7.95k
    // Determine the element index into its half vector source.
12278
7.95k
    int HalfElt = M % HalfNumElts;
12279
7.95k
12280
7.95k
    // We can shuffle with up to 2 half vectors, set the new 'half'
12281
7.95k
    // shuffle mask accordingly.
12282
7.95k
    if (HalfIdx1 < 0 || HalfIdx1 == HalfIdx) {
12283
5.01k
      HalfMask[i] = HalfElt;
12284
5.01k
      HalfIdx1 = HalfIdx;
12285
5.01k
      continue;
12286
5.01k
    }
12287
2.94k
    
    if (HalfIdx2 < 0 || HalfIdx2 == HalfIdx) {
12288
2.81k
      HalfMask[i] = HalfElt + HalfNumElts;
12289
2.81k
      HalfIdx2 = HalfIdx;
12290
2.81k
      continue;
12291
2.81k
    }
12292
123
12293
123
    // Too many half vectors referenced.
12294
123
    return SDValue();
12295
123
  }
12296
1.66k
  assert(HalfMask.size() == HalfNumElts && "Unexpected shuffle mask length");
12297
1.66k
12298
1.66k
  // Only shuffle the halves of the inputs when useful.
12299
1.66k
  int NumLowerHalves =
12300
1.66k
      (HalfIdx1 == 0 || HalfIdx1 == 2) + (HalfIdx2 == 0 || HalfIdx2 == 2);
12301
1.66k
  int NumUpperHalves =
12302
1.66k
      (HalfIdx1 == 1 || HalfIdx1 == 3) + (HalfIdx2 == 1 || HalfIdx2 == 3);
12303
1.66k
12304
1.66k
  // uuuuXXXX - don't extract uppers just to insert again.
12305
1.66k
  if (UndefLower && NumUpperHalves != 0)
12306
91
    return SDValue();
12307
1.57k
12308
1.57k
  // XXXXuuuu - don't extract both uppers, instead shuffle and then extract.
12309
1.57k
  
  if (UndefUpper && NumUpperHalves == 2)
12310
8
    return SDValue();
12311
1.57k
12312
1.57k
  // AVX2 - XXXXuuuu - always extract lowers.
12313
1.57k
  
  if (Subtarget.hasAVX2() && !(UndefUpper && NumUpperHalves == 0)) {
12314
887
    // AVX2 supports efficient immediate 64-bit element cross-lane shuffles.
12315
887
    if (VT == MVT::v4f64 || VT == MVT::v4i64)
12316
384
      return SDValue();
12317
503
    // AVX2 supports variable 32-bit element cross-lane shuffles.
12318
503
    
    if (VT == MVT::v8f32 || VT == MVT::v8i32) {
12319
204
      // XXXXuuuu - don't extract lowers and uppers.
12320
204
      if (UndefUpper && NumLowerHalves != 0 && NumUpperHalves != 0)
12321
167
        return SDValue();
12322
1.01k
    }
12323
887
  }
12324
1.01k
12325
1.01k
  // AVX512 - XXXXuuuu - always extract lowers.
12326
1.01k
  
  if (VT.is512BitVector() && !(UndefUpper && NumUpperHalves == 0))
12327
4
    return SDValue();
12328
1.01k
12329
1.01k
  
  auto GetHalfVector = [&](int HalfIdx) {
12330
2.03k
    if (HalfIdx < 0)
12331
676
      return DAG.getUNDEF(HalfVT);
12332
1.35k
    
    SDValue V = (HalfIdx < 2 ? V1 : V2);
12333
2.03k
    HalfIdx = (HalfIdx % 2) * HalfNumElts;
12334
2.03k
    return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V,
12335
2.03k
                       DAG.getIntPtrConstant(HalfIdx, DL));
12336
2.03k
  };
12337
12.5k
12338
12.5k
  SDValue Half1 = GetHalfVector(HalfIdx1);
12339
12.5k
  SDValue Half2 = GetHalfVector(HalfIdx2);
12340
12.5k
  SDValue V = DAG.getVectorShuffle(HalfVT, DL, Half1, Half2, HalfMask);
12341
12.5k
  return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), V,
12342
12.5k
                     DAG.getIntPtrConstant(Offset, DL));
12343
12.5k
}
12344
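Aside (illustrative, not part of the listed source): with NumElts = 8 the four candidate halves are numbered 0 = lower V1, 1 = upper V1, 2 = lower V2, 3 = upper V2, so a mask element M maps to half M / HalfNumElts at offset M % HalfNumElts, which is exactly the HalfIdx/HalfElt split above. A tiny standalone sketch:

#include <cassert>

int main() {
  const int HalfNumElts = 4;     // halves of a v8f32 shuffle
  int M = 9;                     // element 9 of the concatenated <V1, V2>
  int HalfIdx = M / HalfNumElts; // 2 -> lower half of V2
  int HalfElt = M % HalfNumElts; // 1 -> element 1 within that half
  assert(HalfIdx == 2 && HalfElt == 1);
  return 0;
}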
12345
/// \brief Test whether the specified input (0 or 1) is in-place blended by the
12346
/// given mask.
12347
///
12348
/// This returns true if the elements from a particular input are already in the
12349
/// slot required by the given mask and require no permutation.
12350
234
static bool isShuffleMaskInputInPlace(int Input, ArrayRef<int> Mask) {
12351
234
  assert((Input == 0 || Input == 1) && "Only two inputs to shuffles.");
12352
234
  int Size = Mask.size();
12353
568
  for (int i = 0; i < Size; ++i)
12354
552
    
    if (Mask[i] >= 0 && Mask[i] / Size == Input && Mask[i] % Size != i)
12355
218
      return false;
12356
234
12357
16
  return true;
12358
234
}
12359
12360
/// Handle case where shuffle sources are coming from the same 128-bit lane and
12361
/// every lane can be represented as the same repeating mask - allowing us to
12362
/// shuffle the sources with the repeating shuffle and then permute the result
12363
/// to the destination lanes.
12364
static SDValue lowerShuffleAsRepeatedMaskAndLanePermute(
12365
    const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
12366
3.26k
    const X86Subtarget &Subtarget, SelectionDAG &DAG) {
12367
3.26k
  int NumElts = VT.getVectorNumElements();
12368
3.26k
  int NumLanes = VT.getSizeInBits() / 128;
12369
3.26k
  int NumLaneElts = NumElts / NumLanes;
12370
3.26k
12371
3.26k
  // On AVX2 we may be able to just shuffle the lowest elements and then
12372
3.26k
  // broadcast the result.
12373
3.26k
  if (Subtarget.hasAVX2()) {
12374
8.93k
    for (unsigned BroadcastSize : {16, 32, 64}) {
12375
8.93k
      if (BroadcastSize <= VT.getScalarSizeInBits())
12376
2.55k
        continue;
12377
6.38k
      int NumBroadcastElts = BroadcastSize / VT.getScalarSizeInBits();
12378
6.38k
12379
6.38k
      // Attempt to match a repeating pattern every NumBroadcastElts,
12380
6.38k
      // accounting for UNDEFs but only references the lowest 128-bit
12381
6.38k
      // lane of the inputs.
12382
6.38k
      auto FindRepeatingBroadcastMask = [&](SmallVectorImpl<int> &RepeatMask) {
12383
15.1k
        for (int i = 0; i != NumElts; i += NumBroadcastElts)
12384
48.7k
          
          for (int j = 0; j != NumBroadcastElts; ++j) {
12385
39.9k
            int M = Mask[i + j];
12386
39.9k
            if (M < 0)
12387
6.62k
              continue;
12388
33.3k
            int &R = RepeatMask[j];
12389
33.3k
            if (0 != ((M % NumElts) / NumLaneElts))
12390
1.52k
              return false;
12391
31.8k
            
            if (0 <= R && R != M)
12392
4.83k
              return false;
12393
26.9k
            R = M;
12394
26.9k
          }
12395
21
        return true;
12396
6.38k
      };
12397
6.38k
12398
6.38k
      SmallVector<int, 8> RepeatMask((unsigned)NumElts, -1);
12399
6.38k
      if (!FindRepeatingBroadcastMask(RepeatMask))
12400
6.35k
        continue;
12401
21
12402
21
      // Shuffle the (lowest) repeated elements in place for broadcast.
12403
21
      SDValue RepeatShuf = DAG.getVectorShuffle(VT, DL, V1, V2, RepeatMask);
12404
21
12405
21
      // Shuffle the actual broadcast.
12406
21
      SmallVector<int, 8> BroadcastMask((unsigned)NumElts, -1);
12407
137
      for (int i = 0; i != NumElts; i += NumBroadcastElts)
12408
380
        
        for (int j = 0; j != NumBroadcastElts; ++j)
12409
264
          BroadcastMask[i + j] = j;
12410
8.93k
      return DAG.getVectorShuffle(VT, DL, RepeatShuf, DAG.getUNDEF(VT),
12411
8.93k
                                  BroadcastMask);
12412
8.93k
    }
12413
2.98k
  }
12414
3.24k
12415
3.24k
  // Bail if the shuffle mask doesn't cross 128-bit lanes.
12416
3.24k
  
  if (!is128BitLaneCrossingShuffleMask(VT, Mask))
12417
1.52k
    return SDValue();
12418
1.71k
12419
1.71k
  // Bail if we already have a repeated lane shuffle mask.
12420
1.71k
  SmallVector<int, 8> RepeatedShuffleMask;
12421
1.71k
  if (is128BitLaneRepeatedShuffleMask(VT, Mask, RepeatedShuffleMask))
12422
0
    return SDValue();
12423
1.71k
12424
1.71k
  // On AVX2 targets we can permute 256-bit vectors as 64-bit sub-lanes
12425
1.71k
  // (with PERMQ/PERMPD), otherwise we can only permute whole 128-bit lanes.
12426
1.71k
  
  int SubLaneScale = Subtarget.hasAVX2() && VT.is256BitVector() ? 2 : 1;
12427
1.71k
  int NumSubLanes = NumLanes * SubLaneScale;
12428
1.71k
  int NumSubLaneElts = NumLaneElts / SubLaneScale;
12429
1.71k
12430
1.71k
  // Check that all the sources are coming from the same lane and see if we can
12431
1.71k
  // form a repeating shuffle mask (local to each sub-lane). At the same time,
12432
1.71k
  // determine the source sub-lane for each destination sub-lane.
12433
1.71k
  int TopSrcSubLane = -1;
12434
1.71k
  SmallVector<int, 8> Dst2SrcSubLanes((unsigned)NumSubLanes, -1);
12435
1.71k
  SmallVector<int, 8> RepeatedSubLaneMasks[2] = {
12436
1.71k
      SmallVector<int, 8>((unsigned)NumSubLaneElts, SM_SentinelUndef),
12437
1.71k
      SmallVector<int, 8>((unsigned)NumSubLaneElts, SM_SentinelUndef)};
12438
1.71k
12439
4.96k
  for (int DstSubLane = 0; DstSubLane != NumSubLanes; ++DstSubLane) {
12440
4.53k
    // Extract the sub-lane mask, check that it all comes from the same lane
12441
4.53k
    // and normalize the mask entries to come from the first lane.
12442
4.53k
    int SrcLane = -1;
12443
4.53k
    SmallVector<int, 8> SubLaneMask((unsigned)NumSubLaneElts, -1);
12444
25.9k
    for (int Elt = 0; Elt != NumSubLaneElts; ++Elt) {
12445
22.4k
      int M = Mask[(DstSubLane * NumSubLaneElts) + Elt];
12446
22.4k
      if (M < 0)
12447
2.36k
        continue;
12448
20.0k
      int Lane = (M % NumElts) / NumLaneElts;
12449
20.0k
      if ((0 <= SrcLane) && (SrcLane != Lane))
12450
1.01k
        return SDValue();
12451
19.0k
      SrcLane = Lane;
12452
19.0k
      int LocalM = (M % NumLaneElts) + (M < NumElts ? 0 : NumElts);
12453
22.4k
      SubLaneMask[Elt] = LocalM;
12454
22.4k
    }
12455
4.53k
12456
4.53k
    // Whole sub-lane is UNDEF.
12457
3.51k
    
    if (SrcLane < 0)
12458
486
      continue;
12459
3.03k
12460
3.03k
    // Attempt to match against the candidate repeated sub-lane masks.
12461
4.18k
    
    for (int SubLane = 0; SubLane != SubLaneScale; ++SubLane) {
12462
3.92k
      auto MatchMasks = [NumSubLaneElts](ArrayRef<int> M1, ArrayRef<int> M2) {
12463
20.0k
        for (int i = 0; i != NumSubLaneElts; ++i) {
12464
17.2k
          if (M1[i] < 0 || M2[i] < 0)
12465
10.4k
            continue;
12466
6.79k
          
          if (M1[i] != M2[i])
12467
1.15k
            return false;
12468
17.2k
        }
12469
2.76k
        return true;
12470
3.92k
      };
12471
3.92k
12472
3.92k
      auto &RepeatedSubLaneMask = RepeatedSubLaneMasks[SubLane];
12473
3.92k
      if (!MatchMasks(SubLaneMask, RepeatedSubLaneMask))
12474
1.15k
        continue;
12475
2.76k
12476
2.76k
      // Merge the sub-lane mask into the matching repeated sub-lane mask.
12477
18.4k
      
      for (int i = 0; i != NumSubLaneElts; ++i) {
12478
15.7k
        int M = SubLaneMask[i];
12479
15.7k
        if (M < 0)
12480
691
          continue;
12481
15.7k
        assert((RepeatedSubLaneMask[i] < 0 || RepeatedSubLaneMask[i] == M) &&
12482
15.0k
               "Unexpected mask element");
12483
15.0k
        RepeatedSubLaneMask[i] = M;
12484
15.0k
      }
12485
3.92k
12486
3.92k
      // Track the top most source sub-lane - by setting the remaining to UNDEF
12487
3.92k
      // we can greatly simplify shuffle matching.
12488
3.92k
      int SrcSubLane = (SrcLane * SubLaneScale) + SubLane;
12489
3.92k
      TopSrcSubLane = std::max(TopSrcSubLane, SrcSubLane);
12490
3.92k
      Dst2SrcSubLanes[DstSubLane] = SrcSubLane;
12491
3.92k
      break;
12492
3.92k
    }
12493
3.03k
12494
3.03k
    // Bail if we failed to find a matching repeated sub-lane mask.
12495
3.03k
    if (Dst2SrcSubLanes[DstSubLane] < 0)
12496
261
      return SDValue();
12497
4.53k
  }
12498
437
  assert(0 <= TopSrcSubLane && TopSrcSubLane < NumSubLanes &&
12499
437
         "Unexpected source lane");
12500
437
12501
437
  // Create a repeating shuffle mask for the entire vector.
12502
437
  SmallVector<int, 8> RepeatedMask((unsigned)NumElts, -1);
12503
1.58k
  for (int SubLane = 0; SubLane <= TopSrcSubLane; ++SubLane) {
12504
1.15k
    int Lane = SubLane / SubLaneScale;
12505
1.15k
    auto &RepeatedSubLaneMask = RepeatedSubLaneMasks[SubLane % SubLaneScale];
12506
4.72k
    for (int Elt = 0; Elt != NumSubLaneElts; ++Elt) {
12507
3.57k
      int M = RepeatedSubLaneMask[Elt];
12508
3.57k
      if (M < 0)
12509
443
        continue;
12510
3.13k
      int Idx = (SubLane * NumSubLaneElts) + Elt;
12511
3.13k
      RepeatedMask[Idx] = M + (Lane * NumLaneElts);
12512
3.13k
    }
12513
1.15k
  }
12514
437
  SDValue RepeatedShuffle = DAG.getVectorShuffle(VT, DL, V1, V2, RepeatedMask);
12515
437
12516
437
  // Shuffle each source sub-lane to its destination.
12517
437
  SmallVector<int, 8> SubLaneMask((unsigned)NumElts, -1);
12518
2.04k
  for (int i = 0; i != NumElts; i += NumSubLaneElts) {
12519
1.60k
    int SrcSubLane = Dst2SrcSubLanes[i / NumSubLaneElts];
12520
1.60k
    if (SrcSubLane < 0)
12521
370
      continue;
12522
5.62k
    
    for (int j = 0; j != NumSubLaneElts; ++j)
12523
4.39k
      SubLaneMask[i + j] = j + (SrcSubLane * NumSubLaneElts);
12524
1.60k
  }
12525
437
12526
437
  return DAG.getVectorShuffle(VT, DL, RepeatedShuffle, DAG.getUNDEF(VT),
12527
437
                              SubLaneMask);
12528
3.26k
}
12529
12530
static bool matchVectorShuffleWithSHUFPD(MVT VT, SDValue &V1, SDValue &V2,
12531
                                         unsigned &ShuffleImm,
12532
1.05k
                                         ArrayRef<int> Mask) {
12533
1.05k
  int NumElts = VT.getVectorNumElements();
12534
1.05k
  assert(VT.getScalarSizeInBits() == 64 &&
12535
1.05k
         (NumElts == 2 || NumElts == 4 || NumElts == 8) &&
12536
1.05k
         "Unexpected data type for VSHUFPD");
12537
1.05k
12538
1.05k
  // Mask for V8F64: 0/1,  8/9,  2/3,  10/11, 4/5, ..
12539
1.05k
  // Mask for V4F64; 0/1,  4/5,  2/3,  6/7..
12540
1.05k
  ShuffleImm = 0;
12541
1.05k
  bool ShufpdMask = true;
12542
1.05k
  bool CommutableMask = true;
12543
6.03k
  for (int i = 0; i < NumElts; ++i) {
12544
5.04k
    if (Mask[i] == SM_SentinelUndef)
12545
70
      continue;
12546
4.97k
    
    if (Mask[i] < 0)
12547
67
      return false;
12548
4.90k
    int Val = (i & 6) + NumElts * (i & 1);
12549
4.90k
    int CommutVal = (i & 0xe) + NumElts * ((i & 1) ^ 1);
12550
4.90k
    if (Mask[i] < Val || Mask[i] > Val + 1)
12551
2.75k
      ShufpdMask = false;
12552
4.90k
    if (Mask[i] < CommutVal || Mask[i] > CommutVal + 1)
12553
4.25k
      CommutableMask = false;
12554
5.04k
    ShuffleImm |= (Mask[i] % 2) << i;
12555
5.04k
  }
12556
1.05k
12557
989
  
  if (ShufpdMask)
12558
257
    return true;
12559
732
  
  if (CommutableMask) {
12560
1
    std::swap(V1, V2);
12561
1
    return true;
12562
1
  }
12563
731
12564
731
  return false;
12565
731
}
12566
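Aside (a worked example, not part of the listed source): SHUFPD lets element i choose only between the two doubles of its own 128-bit pair, alternating V1 and V2 across elements, and bit i of the immediate records Mask[i] % 2. For the v4f64 mask {0, 5, 2, 7} this matches and yields immediate 0b1010. A standalone sketch of the same computation:

#include <array>
#include <cassert>

int main() {
  const int NumElts = 4;                    // v4f64
  std::array<int, 4> Mask = {0, 5, 2, 7};
  unsigned ShuffleImm = 0;
  bool ShufpdMask = true;
  for (int i = 0; i < NumElts; ++i) {
    int Val = (i & 6) + NumElts * (i & 1);  // the pair element i may pick from
    if (Mask[i] < Val || Mask[i] > Val + 1)
      ShufpdMask = false;
    ShuffleImm |= (Mask[i] % 2) << i;
  }
  assert(ShufpdMask && ShuffleImm == 0xA);
  return 0;
}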
12567
static SDValue lowerVectorShuffleWithSHUFPD(const SDLoc &DL, MVT VT,
12568
                                            ArrayRef<int> Mask, SDValue V1,
12569
302
                                            SDValue V2, SelectionDAG &DAG) {
12570
302
  assert((VT == MVT::v2f64 || VT == MVT::v4f64 || VT == MVT::v8f64)&&
12571
302
         "Unexpected data type for VSHUFPD");
12572
302
12573
302
  unsigned Immediate = 0;
12574
302
  if (!matchVectorShuffleWithSHUFPD(VT, V1, V2, Immediate, Mask))
12575
210
    return SDValue();
12576
92
12577
92
  return DAG.getNode(X86ISD::SHUFP, DL, VT, V1, V2,
12578
92
                     DAG.getConstant(Immediate, DL, MVT::i8));
12579
92
}
12580
12581
static SDValue lowerVectorShuffleWithPERMV(const SDLoc &DL, MVT VT,
12582
                                           ArrayRef<int> Mask, SDValue V1,
12583
891
                                           SDValue V2, SelectionDAG &DAG) {
12584
891
  MVT MaskEltVT = MVT::getIntegerVT(VT.getScalarSizeInBits());
12585
891
  MVT MaskVecVT = MVT::getVectorVT(MaskEltVT, VT.getVectorNumElements());
12586
891
12587
891
  SDValue MaskNode = getConstVector(Mask, MaskVecVT, DAG, DL, true);
12588
891
  if (V2.isUndef())
12589
216
    return DAG.getNode(X86ISD::VPERMV, DL, VT, MaskNode, V1);
12590
675
12591
675
  return DAG.getNode(X86ISD::VPERMV3, DL, VT, V1, MaskNode, V2);
12592
675
}
12593
12594
/// \brief Handle lowering of 4-lane 64-bit floating point shuffles.
12595
///
12596
/// Also ends up handling lowering of 4-lane 64-bit integer shuffles when AVX2
12597
/// isn't available.
12598
static SDValue lowerV4F64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
12599
                                       const APInt &Zeroable,
12600
                                       SDValue V1, SDValue V2,
12601
                                       const X86Subtarget &Subtarget,
12602
1.43k
                                       SelectionDAG &DAG) {
12603
1.43k
  assert(V1.getSimpleValueType() == MVT::v4f64 && "Bad operand type!");
12604
1.43k
  assert(V2.getSimpleValueType() == MVT::v4f64 && "Bad operand type!");
12605
1.43k
  assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
12606
1.43k
12607
1.43k
  if (SDValue V = lowerV2X128VectorShuffle(DL, MVT::v4f64, V1, V2, Mask,
12608
1.43k
                                           Zeroable, Subtarget, DAG))
12609
423
    return V;
12610
1.00k
12611
1.00k
  
  if (V2.isUndef()) {
12612
525
    // Check for being able to broadcast a single element.
12613
525
    if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(
12614
525
            DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
12615
94
      return Broadcast;
12616
431
12617
431
    // Use low duplicate instructions for masks that match their pattern.
12618
431
    
if (431
isShuffleEquivalent(V1, V2, Mask, {0, 0, 2, 2})431
)
12619
74
      return DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v4f64, V1);
12620
357
12621
357
    
    if (!is128BitLaneCrossingShuffleMask(MVT::v4f64, Mask)) {
12622
102
      // Non-half-crossing single input shuffles can be lowered with an
12623
102
      // interleaved permutation.
12624
102
      unsigned VPERMILPMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1) |
12625
102
                              ((Mask[2] == 3) << 2) | ((Mask[3] == 3) << 3);
12626
102
      return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v4f64, V1,
12627
102
                         DAG.getConstant(VPERMILPMask, DL, MVT::i8));
12628
102
    }
12629
255
12630
255
    // With AVX2 we have direct support for this permutation.
12631
255
    
    if (Subtarget.hasAVX2())
12632
197
      return DAG.getNode(X86ISD::VPERMI, DL, MVT::v4f64, V1,
12633
197
                         getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
12634
58
12635
58
    // Try to create an in-lane repeating shuffle mask and then shuffle the
12636
58
    // results into the target lanes.
12637
58
    
    if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
12638
58
            DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
12639
38
      return V;
12640
20
12641
20
    // Otherwise, fall back.
12642
20
    return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v4f64, V1, V2, Mask,
12643
20
                                                   DAG, Subtarget);
12644
20
  }
12645
484
12646
484
  // Use dedicated unpack instructions for masks that match their pattern.
12647
484
  
  if (SDValue V =
12648
484
          lowerVectorShuffleWithUNPCK(DL, MVT::v4f64, Mask, V1, V2, DAG))
12649
160
    return V;
12650
324
12651
324
  
  if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4f64, V1, V2, Mask,
12652
324
                                                Zeroable, Subtarget, DAG))
12653
152
    return Blend;
12654
172
12655
172
  // Check if the blend happens to exactly fit that of SHUFPD.
12656
172
  
  if (SDValue Op =
12657
172
      lowerVectorShuffleWithSHUFPD(DL, MVT::v4f64, Mask, V1, V2, DAG))
12658
61
    return Op;
12659
111
12660
111
  // Try to create an in-lane repeating shuffle mask and then shuffle the
12661
111
  // results into the target lanes.
12662
111
  
  if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
12663
111
          DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
12664
24
    return V;
12665
87
12666
87
  // Try to simplify this by merging 128-bit lanes to enable a lane-based
12667
87
  // shuffle. However, if we have AVX2 and either inputs are already in place,
12668
87
  // we will be able to shuffle even across lanes the other input in a single
12669
87
  // instruction so skip this pattern.
12670
87
  
  if (!(Subtarget.hasAVX2() && (isShuffleMaskInputInPlace(0, Mask) ||
12671
48
                                isShuffleMaskInputInPlace(1, Mask))))
12672
80
    
    if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
12673
80
            DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
12674
16
      return Result;
12675
71
  // If we have VLX support, we can use VEXPAND.
12676
71
  
  if (Subtarget.hasVLX())
12677
29
    
    if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v4f64, Zeroable, Mask,
12678
29
                                               V1, V2, DAG, Subtarget))
12679
2
      return V;
12680
69
12681
69
  // If we have AVX2 then we always want to lower with a blend because at v4 we
12682
69
  // can fully permute the elements.
12683
69
  
  if (Subtarget.hasAVX2())
12684
38
    return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v4f64, V1, V2,
12685
38
                                                      Mask, DAG);
12686
31
12687
31
  // Otherwise fall back on generic lowering.
12688
31
  return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v4f64, V1, V2, Mask, DAG);
12689
31
}
12690
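Aside (a worked example, not from the listed source): in the non-lane-crossing single-input path above, the v4f64 mask {1, 0, 3, 2} produces VPERMILPMask = 1 | (0 << 1) | (1 << 2) | (0 << 3) = 0b0101, i.e. a VPERMILPD that swaps the two doubles inside each 128-bit lane. A standalone sketch:

#include <array>
#include <cassert>

int main() {
  std::array<int, 4> Mask = {1, 0, 3, 2};   // swap within each 128-bit lane
  unsigned VPERMILPMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1) |
                          ((Mask[2] == 3) << 2) | ((Mask[3] == 3) << 3);
  assert(VPERMILPMask == 0x5);
  return 0;
}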
12691
/// \brief Handle lowering of 4-lane 64-bit integer shuffles.
12692
///
12693
/// This routine is only called when we have AVX2 and thus a reasonable
12694
/// instruction set for v4i64 shuffling..
12695
static SDValue lowerV4I64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
12696
                                       const APInt &Zeroable,
12697
                                       SDValue V1, SDValue V2,
12698
                                       const X86Subtarget &Subtarget,
12699
1.47k
                                       SelectionDAG &DAG) {
12700
1.47k
  assert(V1.getSimpleValueType() == MVT::v4i64 && "Bad operand type!");
12701
1.47k
  assert(V2.getSimpleValueType() == MVT::v4i64 && "Bad operand type!");
12702
1.47k
  assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
12703
1.47k
  assert(Subtarget.hasAVX2() && "We can only lower v4i64 with AVX2!");
12704
1.47k
12705
1.47k
  if (SDValue V = lowerV2X128VectorShuffle(DL, MVT::v4i64, V1, V2, Mask,
12706
1.47k
                                           Zeroable, Subtarget, DAG))
12707
203
    return V;
12708
1.27k
12709
1.27k
  
if (SDValue 1.27k
Blend1.27k
= lowerVectorShuffleAsBlend(DL, MVT::v4i64, V1, V2, Mask,
12710
1.27k
                                                Zeroable, Subtarget, DAG))
12711
113
    return Blend;
12712
1.16k
12713
1.16k
  // Check for being able to broadcast a single element.
12714
1.16k
  
  if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v4i64, V1, V2,
12715
1.16k
                                                        Mask, Subtarget, DAG))
12716
98
    return Broadcast;
12717
1.06k
12718
1.06k
  
  if (V2.isUndef()) {
12719
906
    // When the shuffle is mirrored between the 128-bit lanes of the unit, we
12720
906
    // can use lower latency instructions that will operate on both lanes.
12721
906
    SmallVector<int, 2> RepeatedMask;
12722
906
    if (is128BitLaneRepeatedShuffleMask(MVT::v4i64, Mask, RepeatedMask)) {
12723
28
      SmallVector<int, 4> PSHUFDMask;
12724
28
      scaleShuffleMask<int>(2, RepeatedMask, PSHUFDMask);
12725
28
      return DAG.getBitcast(
12726
28
          MVT::v4i64,
12727
28
          DAG.getNode(X86ISD::PSHUFD, DL, MVT::v8i32,
12728
28
                      DAG.getBitcast(MVT::v8i32, V1),
12729
28
                      getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
12730
28
    }
12731
878
12732
878
    // AVX2 provides a direct instruction for permuting a single input across
12733
878
    // lanes.
12734
878
    return DAG.getNode(X86ISD::VPERMI, DL, MVT::v4i64, V1,
12735
878
                       getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
12736
878
  }
12737
156
12738
156
  // Try to use shift instructions.
12739
156
  
  if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v4i64, V1, V2, Mask,
12740
156
                                                Zeroable, Subtarget, DAG))
12741
14
    return Shift;
12742
142
12743
142
  // If we have VLX support, we can use VALIGN or VEXPAND.
12744
142
  
  if (Subtarget.hasVLX()) {
12745
63
    if (SDValue Rotate = lowerVectorShuffleAsRotate(DL, MVT::v4i64, V1, V2,
12746
63
                                                    Mask, Subtarget, DAG))
12747
8
      return Rotate;
12748
55
12749
55
    
    if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v4i64, Zeroable, Mask,
12750
55
                                               V1, V2, DAG, Subtarget))
12751
2
      return V;
12752
132
  }
12753
132
12754
132
  // Try to use PALIGNR.
12755
132
  
  if (SDValue Rotate = lowerVectorShuffleAsByteRotate(DL, MVT::v4i64, V1, V2,
12756
132
                                                      Mask, Subtarget, DAG))
12757
0
    return Rotate;
12758
132
12759
132
  // Use dedicated unpack instructions for masks that match their pattern.
12760
132
  
  if (SDValue V =
12761
132
          lowerVectorShuffleWithUNPCK(DL, MVT::v4i64, Mask, V1, V2, DAG))
12762
60
    return V;
12763
72
12764
72
  // Try to simplify this by merging 128-bit lanes to enable a lane-based
12765
72
  // shuffle. However, if we have AVX2 and either inputs are already in place,
12766
72
  // we will be able to shuffle even across lanes the other input in a single
12767
72
  // instruction so skip this pattern.
12768
72
  
  if (!isShuffleMaskInputInPlace(0, Mask) &&
12769
70
      !isShuffleMaskInputInPlace(1, Mask))
12770
63
    
    if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
12771
63
            DL, MVT::v4i64, V1, V2, Mask, Subtarget, DAG))
12772
8
      return Result;
12773
64
12774
64
  // Otherwise fall back on generic blend lowering.
12775
64
  return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v4i64, V1, V2,
12776
64
                                                    Mask, DAG);
12777
64
}
12778
12779
/// \brief Handle lowering of 8-lane 32-bit floating point shuffles.
12780
///
12781
/// Also ends up handling lowering of 8-lane 32-bit integer shuffles when AVX2
12782
/// isn't available.
12783
static SDValue lowerV8F32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
12784
                                       const APInt &Zeroable,
12785
                                       SDValue V1, SDValue V2,
12786
                                       const X86Subtarget &Subtarget,
12787
1.19k
                                       SelectionDAG &DAG) {
12788
1.19k
  assert(V1.getSimpleValueType() == MVT::v8f32 && "Bad operand type!");
12789
1.19k
  assert(V2.getSimpleValueType() == MVT::v8f32 && "Bad operand type!");
12790
1.19k
  assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
12791
1.19k
12792
1.19k
  if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8f32, V1, V2, Mask,
12793
1.19k
                                                Zeroable, Subtarget, DAG))
12794
178
    return Blend;
12795
1.01k
12796
1.01k
  // Check for being able to broadcast a single element.
12797
1.01k
  
  if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v8f32, V1, V2,
12798
1.01k
                                                        Mask, Subtarget, DAG))
12799
57
    return Broadcast;
12800
955
12801
955
  // If the shuffle mask is repeated in each 128-bit lane, we have many more
12802
955
  // options to efficiently lower the shuffle.
12803
955
  SmallVector<int, 4> RepeatedMask;
12804
955
  if (is128BitLaneRepeatedShuffleMask(MVT::v8f32, Mask, RepeatedMask)) {
12805
455
    assert(RepeatedMask.size() == 4 &&
12806
455
           "Repeated masks must be half the mask width!");
12807
455
12808
455
    // Use even/odd duplicate instructions for masks that match their pattern.
12809
455
    if (isShuffleEquivalent(V1, V2, RepeatedMask, {0, 0, 2, 2}))
12810
58
      return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v8f32, V1);
12811
397
    
    if (isShuffleEquivalent(V1, V2, RepeatedMask, {1, 1, 3, 3}))
12812
59
      return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v8f32, V1);
12813
338
12814
338
    
    if (V2.isUndef())
12815
153
      return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v8f32, V1,
12816
153
                         getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
12817
185
12818
185
    // Use dedicated unpack instructions for masks that match their pattern.
12819
185
    
    if (SDValue V =
12820
185
            lowerVectorShuffleWithUNPCK(DL, MVT::v8f32, Mask, V1, V2, DAG))
12821
97
      return V;
12822
88
12823
88
    // Otherwise, fall back to a SHUFPS sequence. Here it is important that we
12824
88
    // have already handled any direct blends.
12825
88
    return lowerVectorShuffleWithSHUFPS(DL, MVT::v8f32, RepeatedMask, V1, V2, DAG);
12826
88
  }
12827
500
12828
500
  // Try to create an in-lane repeating shuffle mask and then shuffle the
12829
500
  // results into the target lanes.
12830
500
  
  if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
12831
500
          DL, MVT::v8f32, V1, V2, Mask, Subtarget, DAG))
12832
92
    return V;
12833
408
12834
408
  // If we have a single input shuffle with different shuffle patterns in the
12835
408
  // two 128-bit lanes use the variable mask to VPERMILPS.
12836
408
  
  if (V2.isUndef()) {
12837
244
    SDValue VPermMask = getConstVector(Mask, MVT::v8i32, DAG, DL, true);
12838
244
    if (!is128BitLaneCrossingShuffleMask(MVT::v8f32, Mask))
12839
107
      return DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v8f32, V1, VPermMask);
12840
137
12841
137
    
    if (Subtarget.hasAVX2())
12842
98
      return DAG.getNode(X86ISD::VPERMV, DL, MVT::v8f32, VPermMask, V1);
12843
39
12844
39
    // Otherwise, fall back.
12845
39
    return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v8f32, V1, V2, Mask,
12846
39
                                                   DAG, Subtarget);
12847
39
  }
12848
164
12849
164
  // Try to simplify this by merging 128-bit lanes to enable a lane-based
12850
164
  // shuffle.
12851
164
  
  if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
12852
164
          DL, MVT::v8f32, V1, V2, Mask, Subtarget, DAG))
12853
27
    return Result;
12854
137
  // If we have VLX support, we can use VEXPAND.
12855
137
  
  if (Subtarget.hasVLX())
12856
50
    
    if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v8f32, Zeroable, Mask,
12857
50
                                               V1, V2, DAG, Subtarget))
12858
6
      return V;
12859
131
12860
131
  // For non-AVX512 if the Mask is of 16bit elements in lane then try to split
12861
131
  // since after split we get a more efficient code using vpunpcklwd and
12862
131
  // vpunpckhwd instrs than vblend.
12863
131
  
  if (!Subtarget.hasAVX512() && isUnpackWdShuffleMask(Mask, MVT::v8f32))
12864
8
    
    if (SDValue V = lowerVectorShuffleAsSplitOrBlend(DL, MVT::v8f32, V1, V2,
12865
8
                                                     Mask, DAG))
12866
8
      return V;
12867
123
12868
123
  // If we have AVX2 then we always want to lower with a blend because at v8 we
12869
123
  // can fully permute the elements.
12870
123
  
  if (Subtarget.hasAVX2())
12871
78
    return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v8f32, V1, V2,
12872
78
                                                      Mask, DAG);
12873
45
12874
45
  // Otherwise fall back on generic lowering.
12875
45
  return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v8f32, V1, V2, Mask, DAG);
12876
45
}
12877
12878
/// \brief Handle lowering of 8-lane 32-bit integer shuffles.
12879
///
12880
/// This routine is only called when we have AVX2 and thus a reasonable
12881
/// instruction set for v8i32 shuffling..
12882
static SDValue lowerV8I32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
12883
                                       const APInt &Zeroable,
12884
                                       SDValue V1, SDValue V2,
12885
                                       const X86Subtarget &Subtarget,
12886
993
                                       SelectionDAG &DAG) {
12887
993
  assert(V1.getSimpleValueType() == MVT::v8i32 && "Bad operand type!");
12888
993
  assert(V2.getSimpleValueType() == MVT::v8i32 && "Bad operand type!");
12889
993
  assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
12890
993
  assert(Subtarget.hasAVX2() && "We can only lower v8i32 with AVX2!");
12891
993
12892
993
  // Whenever we can lower this as a zext, that instruction is strictly faster
12893
993
  // than any alternative. It also allows us to fold memory operands into the
12894
993
  // shuffle in many cases.
12895
993
  if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
12896
993
          DL, MVT::v8i32, V1, V2, Mask, Zeroable, Subtarget, DAG))
12897
17
    return ZExt;
12898
976
12899
976
  // For non-AVX512 if the Mask is of 16bit elements in lane then try to split
12900
976
  // since after split we get a more efficient code than vblend by using
12901
976
  // vpunpcklwd and vpunpckhwd instrs.
12902
976
  
  if (isUnpackWdShuffleMask(Mask, MVT::v8i32) && !V2.isUndef() &&
12903
6
      !Subtarget.hasAVX512())
12904
5
    
    if (SDValue V =
12905
5
            lowerVectorShuffleAsSplitOrBlend(DL, MVT::v8i32, V1, V2, Mask, DAG))
12906
5
      return V;
12907
971
12908
971
  
  if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8i32, V1, V2, Mask,
12909
971
                                                Zeroable, Subtarget, DAG))
12910
124
    return Blend;
12911
847
12912
847
  // Check for being able to broadcast a single element.
12913
847
  
  if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v8i32, V1, V2,
12914
847
                                                        Mask, Subtarget, DAG))
12915
58
    return Broadcast;
12916
789
12917
789
  // If the shuffle mask is repeated in each 128-bit lane we can use more
12918
789
  // efficient instructions that mirror the shuffles across the two 128-bit
12919
789
  // lanes.
12920
789
  SmallVector<int, 4> RepeatedMask;
12921
789
  bool Is128BitLaneRepeatedShuffle =
12922
789
      is128BitLaneRepeatedShuffleMask(MVT::v8i32, Mask, RepeatedMask);
12923
789
  if (Is128BitLaneRepeatedShuffle) {
12924
388
    assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!");
12925
388
    if (V2.isUndef())
12926
300
      return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v8i32, V1,
12927
300
                         getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
12928
88
12929
88
    // Use dedicated unpack instructions for masks that match their pattern.
12930
88
    
    if (SDValue V =
12931
88
            lowerVectorShuffleWithUNPCK(DL, MVT::v8i32, Mask, V1, V2, DAG))
12932
30
      return V;
12933
459
  }
12934
459
12935
459
  // Try to use shift instructions.
12936
459
  
  if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v8i32, V1, V2, Mask,
12937
459
                                                Zeroable, Subtarget, DAG))
12938
10
    return Shift;
12939
449
12940
449
  // If we have VLX support, we can use VALIGN or EXPAND.
12941
449
  
  if (Subtarget.hasVLX()) {
12942
186
    if (SDValue Rotate = lowerVectorShuffleAsRotate(DL, MVT::v8i32, V1, V2,
12943
186
                                                    Mask, Subtarget, DAG))
12944
9
      return Rotate;
12945
177
12946
177
    
    if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v8i32, Zeroable, Mask,
12947
177
                                               V1, V2, DAG, Subtarget))
12948
2
      return V;
12949
438
  }
12950
438
12951
438
  // Try to use byte rotation instructions.
12952
438
  
  if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
12953
438
          DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG))
12954
14
    return Rotate;
12955
424
12956
424
  // Try to create an in-lane repeating shuffle mask and then shuffle the
12957
424
  // results into the target lanes.
12958
424
  
  if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
12959
424
          DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG))
12960
197
    return V;
12961
227
12962
227
  // If the shuffle patterns aren't repeated but it is a single input, directly
12963
227
  // generate a cross-lane VPERMD instruction.
12964
227
  
  if (V2.isUndef()) {
12965
127
    SDValue VPermMask = getConstVector(Mask, MVT::v8i32, DAG, DL, true);
12966
127
    return DAG.getNode(X86ISD::VPERMV, DL, MVT::v8i32, VPermMask, V1);
12967
127
  }
12968
100
12969
100
  // Assume that a single SHUFPS is faster than an alternative sequence of
12970
100
  // multiple instructions (even if the CPU has a domain penalty).
12971
100
  // If some CPU is harmed by the domain switch, we can fix it in a later pass.
12972
100
  
  if (Is128BitLaneRepeatedShuffle && isSingleSHUFPSMask(RepeatedMask)) {
12973
19
    SDValue CastV1 = DAG.getBitcast(MVT::v8f32, V1);
12974
19
    SDValue CastV2 = DAG.getBitcast(MVT::v8f32, V2);
12975
19
    SDValue ShufPS = lowerVectorShuffleWithSHUFPS(DL, MVT::v8f32, RepeatedMask,
12976
19
                                                  CastV1, CastV2, DAG);
12977
19
    return DAG.getBitcast(MVT::v8i32, ShufPS);
12978
19
  }
12979
81
12980
81
  // Try to simplify this by merging 128-bit lanes to enable a lane-based
12981
81
  // shuffle.
12982
81
  
  if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
12983
81
          DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG))
12984
13
    return Result;
12985
68
12986
68
  // Otherwise fall back on generic blend lowering.
12987
68
  return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v8i32, V1, V2,
12988
68
                                                    Mask, DAG);
12989
68
}
12990
12991
/// \brief Handle lowering of 16-lane 16-bit integer shuffles.
12992
///
12993
/// This routine is only called when we have AVX2 and thus a reasonable
12994
/// instruction set for v16i16 shuffling..
12995
static SDValue lowerV16I16VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
12996
                                        const APInt &Zeroable,
12997
                                        SDValue V1, SDValue V2,
12998
                                        const X86Subtarget &Subtarget,
12999
1.32k
                                        SelectionDAG &DAG) {
13000
1.32k
  assert(V1.getSimpleValueType() == MVT::v16i16 && "Bad operand type!");
13001
1.32k
  assert(V2.getSimpleValueType() == MVT::v16i16 && "Bad operand type!");
13002
1.32k
  assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
13003
1.32k
  assert(Subtarget.hasAVX2() && "We can only lower v16i16 with AVX2!");
13004
1.32k
13005
1.32k
  // Whenever we can lower this as a zext, that instruction is strictly faster
13006
1.32k
  // than any alternative. It also allows us to fold memory operands into the
13007
1.32k
  // shuffle in many cases.
13008
1.32k
  if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
13009
1.32k
          DL, MVT::v16i16, V1, V2, Mask, Zeroable, Subtarget, DAG))
13010
56
    return ZExt;
13011
1.26k
13012
1.26k
  // Check for being able to broadcast a single element.
13013
1.26k
  
  if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v16i16, V1, V2,
13014
1.26k
                                                        Mask, Subtarget, DAG))
13015
96
    return Broadcast;
13016
1.17k
13017
1.17k
  
  if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v16i16, V1, V2, Mask,
13018
1.17k
                                                Zeroable, Subtarget, DAG))
13019
156
    return Blend;
13020
1.01k
13021
1.01k
  // Use dedicated unpack instructions for masks that match their pattern.
13022
1.01k
  
  if (SDValue V =
13023
1.01k
          lowerVectorShuffleWithUNPCK(DL, MVT::v16i16, Mask, V1, V2, DAG))
13024
50
    return V;
13025
967
13026
967
  // Try to use shift instructions.
13027
967
  
  if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v16i16, V1, V2, Mask,
13028
967
                                                Zeroable, Subtarget, DAG))
13029
64
    return Shift;
13030
903
13031
903
  // Try to use byte rotation instructions.
13032
903
  
  if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
13033
903
          DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG))
13034
39
    return Rotate;
13035
864
13036
864
  // Try to create an in-lane repeating shuffle mask and then shuffle the
13037
864
  // results into the target lanes.
13038
864
  
  if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
13039
864
          DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG))
13040
38
    return V;
13041
826
13042
826
  
  if (V2.isUndef()) {
13043
537
    // There are no generalized cross-lane shuffle operations available on i16
13044
537
    // element types.
13045
537
    if (is128BitLaneCrossingShuffleMask(MVT::v16i16, Mask))
13046
137
      return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v16i16, V1, V2,
13047
137
                                                     Mask, DAG, Subtarget);
13048
400
13049
400
    SmallVector<int, 8> RepeatedMask;
13050
400
    if (is128BitLaneRepeatedShuffleMask(MVT::v16i16, Mask, RepeatedMask)) {
13051
350
      // As this is a single-input shuffle, the repeated mask should be
13052
350
      // a strictly valid v8i16 mask that we can pass through to the v8i16
13053
350
      // lowering to handle even the v16 case.
13054
350
      return lowerV8I16GeneralSingleInputVectorShuffle(
13055
350
          DL, MVT::v16i16, V1, RepeatedMask, Subtarget, DAG);
13056
350
    }
13057
339
  }
13058
339
13059
339
  
  if (SDValue PSHUFB = lowerVectorShuffleWithPSHUFB(
13060
339
          DL, MVT::v16i16, Mask, V1, V2, Zeroable, Subtarget, DAG))
13061
52
    return PSHUFB;
13062
287
13063
287
  // AVX512BWVL can lower to VPERMW.
13064
287
  
  if (Subtarget.hasBWI() && Subtarget.hasVLX())
13065
147
    return lowerVectorShuffleWithPERMV(DL, MVT::v16i16, Mask, V1, V2, DAG);
13066
140
13067
140
  // Try to simplify this by merging 128-bit lanes to enable a lane-based
13068
140
  // shuffle.
13069
140
  
  if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
13070
140
          DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG))
13071
6
    return Result;
13072
134
13073
134
  // Otherwise fall back on generic lowering.
13074
134
  return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v16i16, V1, V2, Mask, DAG);
13075
134
}
13076
13077
/// \brief Handle lowering of 32-lane 8-bit integer shuffles.
13078
///
13079
/// This routine is only called when we have AVX2 and thus a reasonable
13080
/// instruction set for v32i8 shuffling..
13081
static SDValue lowerV32I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
13082
                                       const APInt &Zeroable,
13083
                                       SDValue V1, SDValue V2,
13084
                                       const X86Subtarget &Subtarget,
13085
1.82k
                                       SelectionDAG &DAG) {
13086
1.82k
  assert(V1.getSimpleValueType() == MVT::v32i8 && "Bad operand type!");
13087
1.82k
  assert(V2.getSimpleValueType() == MVT::v32i8 && "Bad operand type!");
13088
1.82k
  assert(Mask.size() == 32 && "Unexpected mask size for v32 shuffle!");
13089
1.82k
  assert(Subtarget.hasAVX2() && "We can only lower v32i8 with AVX2!");
13090
1.82k
13091
1.82k
  // Whenever we can lower this as a zext, that instruction is strictly faster
13092
1.82k
  // than any alternative. It also allows us to fold memory operands into the
13093
1.82k
  // shuffle in many cases.
13094
1.82k
  if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
13095
1.82k
          DL, MVT::v32i8, V1, V2, Mask, Zeroable, Subtarget, DAG))
13096
115
    return ZExt;
13097
1.70k
13098
1.70k
  // Check for being able to broadcast a single element.
13099
1.70k
  
  if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v32i8, V1, V2,
13100
1.70k
                                                        Mask, Subtarget, DAG))
13101
118
    return Broadcast;
13102
1.58k
13103
1.58k
  
  if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v32i8, V1, V2, Mask,
13104
1.58k
                                                Zeroable, Subtarget, DAG))
13105
359
    return Blend;
13106
1.23k
13107
1.23k
  // Use dedicated unpack instructions for masks that match their pattern.
13108
1.23k
  
  if (SDValue V =
13109
1.23k
          lowerVectorShuffleWithUNPCK(DL, MVT::v32i8, Mask, V1, V2, DAG))
13110
57
    return V;
13111
1.17k
13112
1.17k
  // Try to use shift instructions.
13113
1.17k
  
  if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v32i8, V1, V2, Mask,
13114
1.17k
                                                Zeroable, Subtarget, DAG))
13115
74
    return Shift;
13116
1.09k
13117
1.09k
  // Try to use byte rotation instructions.
13118
1.09k
  
  if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
13119
1.09k
          DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG))
13120
68
    return Rotate;
13121
1.03k
13122
1.03k
  // Try to create an in-lane repeating shuffle mask and then shuffle the
13123
1.03k
  // results into the target lanes.
13124
1.03k
  
  if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
13125
1.03k
          DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG))
13126
67
    return V;
13127
964
13128
964
  // There are no generalized cross-lane shuffle operations available on i8
13129
964
  // element types.
13130
964
  
  if (V2.isUndef() && is128BitLaneCrossingShuffleMask(MVT::v32i8, Mask))
13131
91
    return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v32i8, V1, V2, Mask,
13132
91
                                                   DAG, Subtarget);
13133
873
13134
873
  
  if (SDValue PSHUFB = lowerVectorShuffleWithPSHUFB(
13135
873
          DL, MVT::v32i8, Mask, V1, V2, Zeroable, Subtarget, DAG))
13136
418
    return PSHUFB;
13137
455
13138
455
  // Try to simplify this by merging 128-bit lanes to enable a lane-based
13139
455
  // shuffle.
13140
455
  
  if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
13141
455
          DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG))
13142
14
    return Result;
13143
441
13144
441
  // Otherwise fall back on generic lowering.
13145
441
  return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v32i8, V1, V2, Mask, DAG);
13146
441
}
13147
13148
/// \brief High-level routine to lower various 256-bit x86 vector shuffles.
13149
///
13150
/// This routine either breaks down the specific type of a 256-bit x86 vector
13151
/// shuffle or splits it into two 128-bit shuffles and fuses the results back
13152
/// together based on the available instructions.
13153
static SDValue lower256BitVectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
13154
                                        MVT VT, SDValue V1, SDValue V2,
13155
                                        const APInt &Zeroable,
13156
                                        const X86Subtarget &Subtarget,
13157
10.1k
                                        SelectionDAG &DAG) {
13158
10.1k
  // If we have a single input to the zero element, insert that into V1 if we
13159
10.1k
  // can do so cheaply.
13160
10.1k
  int NumElts = VT.getVectorNumElements();
13161
139k
  int NumV2Elements = count_if(Mask, [NumElts](int M) { return M >= NumElts; });
13162
10.1k
13163
10.1k
  if (NumV2Elements == 1 && Mask[0] >= NumElts)
13164
234
    
    if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
13165
234
            DL, VT, V1, V2, Mask, Zeroable, Subtarget, DAG))
13166
64
      return Insertion;
13167
10.0k
13168
10.0k
  // Handle special cases where the lower or upper half is UNDEF.
13169
10.0k
  
  if (SDValue V =
13170
10.0k
          lowerVectorShuffleWithUndefHalf(DL, VT, V1, V2, Mask, Subtarget, DAG))
13171
1.06k
    return V;
13172
8.99k
13173
8.99k
  // There is a really nice hard cut-over between AVX1 and AVX2 that means we
13174
8.99k
  // can check for those subtargets here and avoid much of the subtarget
13175
8.99k
  // querying in the per-vector-type lowering routines. With AVX1 we have
13176
8.99k
  // essentially *zero* ability to manipulate a 256-bit vector with integer
13177
8.99k
  // types. Since we'll use floating point types there eventually, just
13178
8.99k
  // immediately cast everything to a float and operate entirely in that domain.
13179
8.99k
  
  if (VT.isInteger() && !Subtarget.hasAVX2()) {
13180
754
    int ElementBits = VT.getScalarSizeInBits();
13181
754
    if (ElementBits < 32) {
13182
443
      // No floating point type available, if we can't use the bit operations
13183
443
      // for masking/blending then decompose into 128-bit vectors.
13184
443
      if (SDValue V =
13185
443
              lowerVectorShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable, DAG))
13186
17
        return V;
13187
426
      
      if (SDValue V = lowerVectorShuffleAsBitBlend(DL, VT, V1, V2, Mask, DAG))
13188
22
        return V;
13189
404
      return splitAndLowerVectorShuffle(DL, VT, V1, V2, Mask, DAG);
13190
404
    }
13191
311
13192
311
    MVT FpVT = MVT::getVectorVT(MVT::getFloatingPointVT(ElementBits),
13193
311
                                VT.getVectorNumElements());
13194
311
    V1 = DAG.getBitcast(FpVT, V1);
13195
311
    V2 = DAG.getBitcast(FpVT, V2);
13196
311
    return DAG.getBitcast(VT, DAG.getVectorShuffle(FpVT, DL, V1, V2, Mask));
13197
311
  }
13198
8.23k
13199
8.23k
  switch (VT.SimpleTy) {
13200
1.43k
  case MVT::v4f64:
13201
1.43k
    return lowerV4F64VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
13202
1.47k
  case MVT::v4i64:
13203
1.47k
    return lowerV4I64VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
13204
1.19k
  case MVT::v8f32:
13205
1.19k
    return lowerV8F32VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
13206
993
  case MVT::v8i32:
13207
993
    return lowerV8I32VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
13208
1.32k
  case MVT::v16i16:
13209
1.32k
    return lowerV16I16VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
13210
1.82k
  case MVT::v32i8:
13211
1.82k
    return lowerV32I8VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
13212
8.23k
13213
0
  default:
13214
0
    llvm_unreachable("Not a valid 256-bit x86 vector type!");
13215
0
  }
13216
0
}
13217
13218
/// \brief Try to lower a vector shuffle as a 128-bit shuffles.
13219
static SDValue lowerV4X128VectorShuffle(const SDLoc &DL, MVT VT,
13220
                                        ArrayRef<int> Mask, SDValue V1,
13221
956
                                        SDValue V2, SelectionDAG &DAG) {
13222
956
  assert(VT.getScalarSizeInBits() == 64 &&
13223
956
         "Unexpected element type size for 128bit shuffle.");
13224
956
13225
956
  // To handle 256 bit vector requires VLX and most probably
13226
956
  // function lowerV2X128VectorShuffle() is better solution.
13227
956
  assert(VT.is512BitVector() && "Unexpected vector size for 512bit shuffle.");
13228
956
13229
956
  SmallVector<int, 4> WidenedMask;
13230
956
  if (!canWidenShuffleElements(Mask, WidenedMask))
13231
661
    return SDValue();
13232
295
13233
295
  // Check for patterns which can be matched with a single insert of a 256-bit
13234
295
  // subvector.
13235
295
  bool OnlyUsesV1 = isShuffleEquivalent(V1, V2, Mask,
13236
295
                                        {0, 1, 2, 3, 0, 1, 2, 3});
13237
295
  if (OnlyUsesV1 || isShuffleEquivalent(V1, V2, Mask,
13238
295
                                        {0, 1, 2, 3, 8, 9, 10, 11})) {
13239
25
    MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 4);
13240
25
    SDValue LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V1,
13241
25
                              DAG.getIntPtrConstant(0, DL));
13242
25
    SDValue HiV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT,
13243
25
                              OnlyUsesV1 ? V1 : V2,
13244
25
                              DAG.getIntPtrConstant(0, DL));
13245
25
    return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LoV, HiV);
13246
25
  }
13247
270
13248
295
  assert(WidenedMask.size() == 4);
13249
270
13250
270
  // See if this is an insertion of the lower 128-bits of V2 into V1.
13251
270
  bool IsInsert = true;
13252
270
  int V2Index = -1;
13253
676
  for (int i = 0; i < 4; ++i) {
13254
652
    assert(WidenedMask[i] >= -1);
13255
652
    if (WidenedMask[i] < 0)
13256
4
      continue;
13257
648
13258
648
    // Make sure all V1 subvectors are in place.
13259
648
    if (WidenedMask[i] < 4) {
13260
484
      if (WidenedMask[i] != i) {
13261
108
        IsInsert = false;
13262
108
        break;
13263
108
      }
13264
164
    } else {
13265
164
      // Make sure we only have a single V2 index and it's the lowest 128-bits.
13266
164
      if (V2Index >= 0 || WidenedMask[i] != 4) {
13267
138
        IsInsert = false;
13268
138
        break;
13269
138
      }
13270
26
      V2Index = i;
13271
26
    }
13272
652
  }
13273
270
  if (IsInsert && V2Index >= 0) {
13274
24
    MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 2);
13275
24
    SDValue Subvec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V2,
13276
24
                                 DAG.getIntPtrConstant(0, DL));
13277
24
    return insert128BitVector(V1, Subvec, V2Index * 2, DAG, DL);
13278
24
  }
13279
246
13280
246
  // Try to lower to vshuf64x2/vshuf32x4.
13281
246
  SDValue Ops[2] = {DAG.getUNDEF(VT), DAG.getUNDEF(VT)};
13282
246
  unsigned PermMask = 0;
13283
246
  // Ensure elements came from the same Op.
13284
1.22k
  for (int i = 0; i < 4; ++i) {
13285
984
    assert(WidenedMask[i] >= -1);
13286
984
    if (WidenedMask[i] < 0)
13287
10
      continue;
13288
974
13289
974
    SDValue Op = WidenedMask[i] >= 4 ? V2 : V1;
13290
974
    unsigned OpIndex = i / 2;
13291
974
    if (Ops[OpIndex].isUndef())
13292
488
      Ops[OpIndex] = Op;
13293
486
    else if (Ops[OpIndex] != Op)
13294
2
      return SDValue();
13295
972
13296
972
    // Convert the 128-bit shuffle mask selection values into 128-bit selection
13297
972
    // bits defined by a vshuf64x2 instruction's immediate control byte.
13298
972
    PermMask |= (WidenedMask[i] % 4) << (i * 2);
13299
972
  }
13300
246
13301
244
  return DAG.getNode(X86ISD::SHUF128, DL, VT, Ops[0], Ops[1],
13302
244
                     DAG.getConstant(PermMask, DL, MVT::i8));
13303
956
}
13304
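As an illustrative aside (not part of the coverage listing), the 8-bit immediate that the loop above feeds into X86ISD::SHUF128 can be sketched as standalone C++. The helper name buildShuf128Imm and the sample mask are hypothetical; only the PermMask arithmetic mirrors the loop above.

#include <cassert>
#include <cstdint>

// Widened-mask indices 0-3 pick 128-bit lanes of the first operand, 4-7 pick
// lanes of the second; each result position contributes two immediate bits.
static uint8_t buildShuf128Imm(const int WidenedMask[4]) {
  uint8_t PermMask = 0;
  for (int i = 0; i < 4; ++i) {
    if (WidenedMask[i] < 0)
      continue;                                   // undef lane: bits stay zero
    assert(WidenedMask[i] < 8 && "expected indices into a 4+4 element mask");
    PermMask |= (WidenedMask[i] % 4) << (i * 2);  // two bits per 128-bit lane
  }
  return PermMask;
}

int main() {
  int WidenedMask[4] = {0, 1, 4, 5};  // low half of V1, then low half of V2
  return buildShuf128Imm(WidenedMask) == 0x44 ? 0 : 1;
}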
13305
/// \brief Handle lowering of 8-lane 64-bit floating point shuffles.
13306
static SDValue lowerV8F64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
13307
                                       const APInt &Zeroable,
13308
                                       SDValue V1, SDValue V2,
13309
                                       const X86Subtarget &Subtarget,
13310
425
                                       SelectionDAG &DAG) {
13311
425
  assert(V1.getSimpleValueType() == MVT::v8f64 && "Bad operand type!");
13312
425
  assert(V2.getSimpleValueType() == MVT::v8f64 && "Bad operand type!");
13313
425
  assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
13314
425
13315
425
  if (V2.isUndef()) {
13316
201
    // Use low duplicate instructions for masks that match their pattern.
13317
201
    if (isShuffleEquivalent(V1, V2, Mask, {0, 0, 2, 2, 4, 4, 6, 6}))
13318
31
      return DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v8f64, V1);
13319
170
13320
170
    if (!is128BitLaneCrossingShuffleMask(MVT::v8f64, Mask)) {
13321
58
      // Non-half-crossing single input shuffles can be lowered with an
13322
58
      // interleaved permutation.
13323
58
      unsigned VPERMILPMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1) |
13324
58
                              ((Mask[2] == 3) << 2) | ((Mask[3] == 3) << 3) |
13325
58
                              ((Mask[4] == 5) << 4) | ((Mask[5] == 5) << 5) |
13326
58
                              ((Mask[6] == 7) << 6) | ((Mask[7] == 7) << 7);
13327
58
      return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v8f64, V1,
13328
58
                         DAG.getConstant(VPERMILPMask, DL, MVT::i8));
13329
58
    }
13330
112
13331
112
    SmallVector<int, 4> RepeatedMask;
13332
112
    if (is256BitLaneRepeatedShuffleMask(MVT::v8f64, Mask, RepeatedMask))
13333
44
      return DAG.getNode(X86ISD::VPERMI, DL, MVT::v8f64, V1,
13334
44
                         getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
13335
292
  }
13336
292
13337
292
  if (SDValue Shuf128 =
13338
292
          lowerV4X128VectorShuffle(DL, MVT::v8f64, Mask, V1, V2, DAG))
13339
82
    return Shuf128;
13340
210
13341
210
  if (SDValue Unpck =
13342
210
          lowerVectorShuffleWithUNPCK(DL, MVT::v8f64, Mask, V1, V2, DAG))
13343
80
    return Unpck;
13344
130
13345
130
  // Check if the blend happens to exactly fit that of SHUFPD.
13346
130
  if (SDValue Op =
13347
130
      lowerVectorShuffleWithSHUFPD(DL, MVT::v8f64, Mask, V1, V2, DAG))
13348
31
    return Op;
13349
99
13350
99
  if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v8f64, Zeroable, Mask, V1,
13351
99
                                             V2, DAG, Subtarget))
13352
4
    return V;
13353
95
13354
95
  if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8f64, V1, V2, Mask,
13355
95
                                                Zeroable, Subtarget, DAG))
13356
7
    return Blend;
13357
88
13358
88
  return lowerVectorShuffleWithPERMV(DL, MVT::v8f64, Mask, V1, V2, DAG);
13359
88
}
13360
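A standalone sketch (assumed, not LLVM API) of the VPERMILPI immediate assembled in the single-input, non-lane-crossing path of lowerV8F64VectorShuffle above: bit i is set exactly when mask element i selects the odd member of its 64-bit pair, i.e. when Mask[i] equals 1, 1, 3, 3, 5, 5, 7, 7 for i = 0..7.

#include <cstdint>

static uint8_t buildVPermilPDImm(const int Mask[8]) {
  uint8_t Imm = 0;
  for (int i = 0; i < 8; ++i)
    if (Mask[i] == (i | 1))        // odd element of the pair {2k, 2k+1}
      Imm |= 1u << i;
  return Imm;
}

int main() {
  int Mask[8] = {1, 0, 3, 2, 5, 4, 7, 6};  // swap within every 128-bit pair
  return buildVPermilPDImm(Mask) == 0x55 ? 0 : 1;
}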
13361
/// \brief Handle lowering of 16-lane 32-bit floating point shuffles.
13362
static SDValue lowerV16F32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
13363
                                        const APInt &Zeroable,
13364
                                        SDValue V1, SDValue V2,
13365
                                        const X86Subtarget &Subtarget,
13366
253
                                        SelectionDAG &DAG) {
13367
253
  assert(V1.getSimpleValueType() == MVT::v16f32 && "Bad operand type!");
13368
253
  assert(V2.getSimpleValueType() == MVT::v16f32 && "Bad operand type!");
13369
253
  assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
13370
253
13371
253
  // If the shuffle mask is repeated in each 128-bit lane, we have many more
13372
253
  // options to efficiently lower the shuffle.
13373
253
  SmallVector<int, 4> RepeatedMask;
13374
253
  if (is128BitLaneRepeatedShuffleMask(MVT::v16f32, Mask, RepeatedMask)) {
13375
189
    assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!");
13376
189
13377
189
    // Use even/odd duplicate instructions for masks that match their pattern.
13378
189
    if (isShuffleEquivalent(V1, V2, RepeatedMask, {0, 0, 2, 2}))
13379
31
      return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v16f32, V1);
13380
158
    if (isShuffleEquivalent(V1, V2, RepeatedMask, {1, 1, 3, 3}))
13381
31
      return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v16f32, V1);
13382
127
13383
127
    if (V2.isUndef())
13384
32
      return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v16f32, V1,
13385
32
                         getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
13386
95
13387
95
    // Use dedicated unpack instructions for masks that match their pattern.
13388
95
    if (SDValue Unpck =
13389
95
            lowerVectorShuffleWithUNPCK(DL, MVT::v16f32, Mask, V1, V2, DAG))
13390
66
      return Unpck;
13391
29
13392
29
    if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v16f32, V1, V2, Mask,
13393
29
                                                  Zeroable, Subtarget, DAG))
13394
6
      return Blend;
13395
23
13396
23
    // Otherwise, fall back to a SHUFPS sequence.
13397
23
    return lowerVectorShuffleWithSHUFPS(DL, MVT::v16f32, RepeatedMask, V1, V2, DAG);
13398
23
  }
13399
64
13400
64
  // If we have a single input shuffle with different shuffle patterns in the
13401
64
  // 128-bit lanes and don't lane cross, use variable mask VPERMILPS.
13402
64
  if (V2.isUndef() &&
13403
64
      !is128BitLaneCrossingShuffleMask(MVT::v16f32, Mask)) {
13404
22
    SDValue VPermMask = getConstVector(Mask, MVT::v16i32, DAG, DL, true);
13405
22
    return DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v16f32, V1, VPermMask);
13406
22
  }
13407
42
13408
42
  // If we have AVX512F support, we can use VEXPAND.
13409
42
  if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v16f32, Zeroable, Mask,
13410
42
                                             V1, V2, DAG, Subtarget))
13411
8
    return V;
13412
34
13413
34
  return lowerVectorShuffleWithPERMV(DL, MVT::v16f32, Mask, V1, V2, DAG);
13414
34
}
13415
13416
/// \brief Handle lowering of 8-lane 64-bit integer shuffles.
13417
static SDValue lowerV8I64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
13418
                                       const APInt &Zeroable,
13419
                                       SDValue V1, SDValue V2,
13420
                                       const X86Subtarget &Subtarget,
13421
664
                                       SelectionDAG &DAG) {
13422
664
  assert(V1.getSimpleValueType() == MVT::v8i64 && "Bad operand type!");
13423
664
  assert(V2.getSimpleValueType() == MVT::v8i64 && "Bad operand type!");
13424
664
  assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
13425
664
13426
664
  if (SDValue Shuf128 =
13427
664
          lowerV4X128VectorShuffle(DL, MVT::v8i64, Mask, V1, V2, DAG))
13428
211
    return Shuf128;
13429
453
13430
453
  if (V2.isUndef()) {
13431
144
    // When the shuffle is mirrored between the 128-bit lanes of the unit, we
13432
144
    // can use lower latency instructions that will operate on all four
13433
144
    // 128-bit lanes.
13434
144
    SmallVector<int, 2> Repeated128Mask;
13435
144
    if (is128BitLaneRepeatedShuffleMask(MVT::v8i64, Mask, Repeated128Mask)) {
13436
8
      SmallVector<int, 4> PSHUFDMask;
13437
8
      scaleShuffleMask<int>(2, Repeated128Mask, PSHUFDMask);
13438
8
      return DAG.getBitcast(
13439
8
          MVT::v8i64,
13440
8
          DAG.getNode(X86ISD::PSHUFD, DL, MVT::v16i32,
13441
8
                      DAG.getBitcast(MVT::v16i32, V1),
13442
8
                      getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
13443
8
    }
13444
136
13445
136
    SmallVector<int, 4> Repeated256Mask;
13446
136
    if (is256BitLaneRepeatedShuffleMask(MVT::v8i64, Mask, Repeated256Mask))
13447
48
      return DAG.getNode(X86ISD::VPERMI, DL, MVT::v8i64, V1,
13448
48
                         getV4X86ShuffleImm8ForMask(Repeated256Mask, DL, DAG));
13449
397
  }
13450
397
13451
397
  // Try to use shift instructions.
13452
397
  if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v8i64, V1, V2, Mask,
13453
397
                                                Zeroable, Subtarget, DAG))
13454
4
    return Shift;
13455
393
13456
393
  // Try to use VALIGN.
13457
393
  if (SDValue Rotate = lowerVectorShuffleAsRotate(DL, MVT::v8i64, V1, V2,
13458
393
                                                  Mask, Subtarget, DAG))
13459
25
    return Rotate;
13460
368
13461
368
  // Try to use PALIGNR.
13462
368
  if (SDValue Rotate = lowerVectorShuffleAsByteRotate(DL, MVT::v8i64, V1, V2,
13463
368
                                                      Mask, Subtarget, DAG))
13464
0
    return Rotate;
13465
368
13466
368
  if (SDValue Unpck =
13467
368
          lowerVectorShuffleWithUNPCK(DL, MVT::v8i64, Mask, V1, V2, DAG))
13468
22
    return Unpck;
13469
346
  // If we have AVX512F support, we can use VEXPAND.
13470
346
  if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v8i64, Zeroable, Mask, V1,
13471
346
                                             V2, DAG, Subtarget))
13472
4
    return V;
13473
342
13474
342
  if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8i64, V1, V2, Mask,
13475
342
                                                Zeroable, Subtarget, DAG))
13476
9
    return Blend;
13477
333
13478
333
  return lowerVectorShuffleWithPERMV(DL, MVT::v8i64, Mask, V1, V2, DAG);
13479
333
}
13480
13481
/// \brief Handle lowering of 16-lane 32-bit integer shuffles.
13482
static SDValue lowerV16I32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
13483
                                        const APInt &Zeroable,
13484
                                        SDValue V1, SDValue V2,
13485
                                        const X86Subtarget &Subtarget,
13486
375
                                        SelectionDAG &DAG) {
13487
375
  assert(V1.getSimpleValueType() == MVT::v16i32 && "Bad operand type!");
13488
375
  assert(V2.getSimpleValueType() == MVT::v16i32 && "Bad operand type!");
13489
375
  assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
13490
375
13491
375
  // Whenever we can lower this as a zext, that instruction is strictly faster
13492
375
  // than any alternative. It also allows us to fold memory operands into the
13493
375
  // shuffle in many cases.
13494
375
  if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
13495
375
          DL, MVT::v16i32, V1, V2, Mask, Zeroable, Subtarget, DAG))
13496
2
    return ZExt;
13497
373
13498
373
  // If the shuffle mask is repeated in each 128-bit lane we can use more
13499
373
  // efficient instructions that mirror the shuffles across the four 128-bit
13500
373
  // lanes.
13501
373
  SmallVector<int, 4> RepeatedMask;
13502
373
  bool Is128BitLaneRepeatedShuffle =
13503
373
      is128BitLaneRepeatedShuffleMask(MVT::v16i32, Mask, RepeatedMask);
13504
373
  if (Is128BitLaneRepeatedShuffle) {
13505
108
    assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!");
13506
108
    if (V2.isUndef())
13507
54
      return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v16i32, V1,
13508
54
                         getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
13509
54
13510
54
    // Use dedicated unpack instructions for masks that match their pattern.
13511
54
    if (SDValue V =
13512
54
            lowerVectorShuffleWithUNPCK(DL, MVT::v16i32, Mask, V1, V2, DAG))
13513
22
      return V;
13514
297
  }
13515
297
13516
297
  // Try to use shift instructions.
13517
297
  if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v16i32, V1, V2, Mask,
13518
297
                                                Zeroable, Subtarget, DAG))
13519
8
    return Shift;
13520
289
13521
289
  // Try to use VALIGN.
13522
289
  if (SDValue Rotate = lowerVectorShuffleAsRotate(DL, MVT::v16i32, V1, V2,
13523
289
                                                  Mask, Subtarget, DAG))
13524
8
    return Rotate;
13525
281
13526
281
  // Try to use byte rotation instructions.
13527
281
  if (Subtarget.hasBWI())
13528
41
    if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
13529
41
            DL, MVT::v16i32, V1, V2, Mask, Subtarget, DAG))
13530
1
      return Rotate;
13531
280
13532
280
  // Assume that a single SHUFPS is faster than using a permv shuffle.
13533
280
  // If some CPU is harmed by the domain switch, we can fix it in a later pass.
13534
280
  if (Is128BitLaneRepeatedShuffle && isSingleSHUFPSMask(RepeatedMask)) {
13535
10
    SDValue CastV1 = DAG.getBitcast(MVT::v16f32, V1);
13536
10
    SDValue CastV2 = DAG.getBitcast(MVT::v16f32, V2);
13537
10
    SDValue ShufPS = lowerVectorShuffleWithSHUFPS(DL, MVT::v16f32, RepeatedMask,
13538
10
                                                  CastV1, CastV2, DAG);
13539
10
    return DAG.getBitcast(MVT::v16i32, ShufPS);
13540
10
  }
13541
270
  // If we have AVX512F support, we can use VEXPAND.
13542
270
  if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v16i32, Zeroable, Mask,
13543
270
                                             V1, V2, DAG, Subtarget))
13544
4
    return V;
13545
266
13546
266
  if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v16i32, V1, V2, Mask,
13547
266
                                                Zeroable, Subtarget, DAG))
13548
7
    return Blend;
13549
259
  return lowerVectorShuffleWithPERMV(DL, MVT::v16i32, Mask, V1, V2, DAG);
13550
259
}
13551
13552
/// \brief Handle lowering of 32-lane 16-bit integer shuffles.
13553
static SDValue lowerV32I16VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
13554
                                        const APInt &Zeroable,
13555
                                        SDValue V1, SDValue V2,
13556
                                        const X86Subtarget &Subtarget,
13557
119
                                        SelectionDAG &DAG) {
13558
119
  assert(V1.getSimpleValueType() == MVT::v32i16 && "Bad operand type!");
13559
119
  assert(V2.getSimpleValueType() == MVT::v32i16 && "Bad operand type!");
13560
119
  assert(Mask.size() == 32 && "Unexpected mask size for v32 shuffle!");
13561
119
  assert(Subtarget.hasBWI() && "We can only lower v32i16 with AVX-512-BWI!");
13562
119
13563
119
  // Whenever we can lower this as a zext, that instruction is strictly faster
13564
119
  // than any alternative. It also allows us to fold memory operands into the
13565
119
  // shuffle in many cases.
13566
119
  if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
13567
119
          DL, MVT::v32i16, V1, V2, Mask, Zeroable, Subtarget, DAG))
13568
2
    return ZExt;
13569
117
13570
117
  // Use dedicated unpack instructions for masks that match their pattern.
13571
117
  if (SDValue V =
13572
117
          lowerVectorShuffleWithUNPCK(DL, MVT::v32i16, Mask, V1, V2, DAG))
13573
16
    return V;
13574
101
13575
101
  // Try to use shift instructions.
13576
101
  if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v32i16, V1, V2, Mask,
13577
101
                                                Zeroable, Subtarget, DAG))
13578
19
    return Shift;
13579
82
13580
82
  // Try to use byte rotation instructions.
13581
82
  if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
13582
82
          DL, MVT::v32i16, V1, V2, Mask, Subtarget, DAG))
13583
2
    return Rotate;
13584
80
13585
80
  if (V2.isUndef()) {
13586
73
    SmallVector<int, 8> RepeatedMask;
13587
73
    if (is128BitLaneRepeatedShuffleMask(MVT::v32i16, Mask, RepeatedMask)) {
13588
49
      // As this is a single-input shuffle, the repeated mask should be
13589
49
      // a strictly valid v8i16 mask that we can pass through to the v8i16
13590
49
      // lowering to handle even the v32 case.
13591
49
      return lowerV8I16GeneralSingleInputVectorShuffle(
13592
49
          DL, MVT::v32i16, V1, RepeatedMask, Subtarget, DAG);
13593
49
    }
13594
31
  }
13595
31
13596
31
  if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v32i16, V1, V2, Mask,
13597
31
                                                Zeroable, Subtarget, DAG))
13598
4
    return Blend;
13599
27
13600
27
  return lowerVectorShuffleWithPERMV(DL, MVT::v32i16, Mask, V1, V2, DAG);
13601
27
}
13602
13603
/// \brief Handle lowering of 64-lane 8-bit integer shuffles.
13604
static SDValue lowerV64I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
13605
                                       const APInt &Zeroable,
13606
                                       SDValue V1, SDValue V2,
13607
                                       const X86Subtarget &Subtarget,
13608
368
                                       SelectionDAG &DAG) {
13609
368
  assert(V1.getSimpleValueType() == MVT::v64i8 && "Bad operand type!");
13610
368
  assert(V2.getSimpleValueType() == MVT::v64i8 && "Bad operand type!");
13611
368
  assert(Mask.size() == 64 && "Unexpected mask size for v64 shuffle!");
13612
368
  assert(Subtarget.hasBWI() && "We can only lower v64i8 with AVX-512-BWI!");
13613
368
13614
368
  // Whenever we can lower this as a zext, that instruction is strictly faster
13615
368
  // than any alternative. It also allows us to fold memory operands into the
13616
368
  // shuffle in many cases.
13617
368
  if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
13618
368
          DL, MVT::v64i8, V1, V2, Mask, Zeroable, Subtarget, DAG))
13619
10
    return ZExt;
13620
358
13621
358
  // Use dedicated unpack instructions for masks that match their pattern.
13622
358
  if (SDValue V =
13623
358
          lowerVectorShuffleWithUNPCK(DL, MVT::v64i8, Mask, V1, V2, DAG))
13624
23
    return V;
13625
335
13626
335
  // Try to use shift instructions.
13627
335
  if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v64i8, V1, V2, Mask,
13628
335
                                                Zeroable, Subtarget, DAG))
13629
25
    return Shift;
13630
310
13631
310
  // Try to use byte rotation instructions.
13632
310
  if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
13633
310
          DL, MVT::v64i8, V1, V2, Mask, Subtarget, DAG))
13634
5
    return Rotate;
13635
305
13636
305
  if (SDValue PSHUFB = lowerVectorShuffleWithPSHUFB(
13637
305
          DL, MVT::v64i8, Mask, V1, V2, Zeroable, Subtarget, DAG))
13638
27
    return PSHUFB;
13639
278
13640
278
  // VBMI can use VPERMV/VPERMV3 byte shuffles.
13641
278
  if (Subtarget.hasVBMI())
13642
3
    return lowerVectorShuffleWithPERMV(DL, MVT::v64i8, Mask, V1, V2, DAG);
13643
275
13644
275
  // Try to create an in-lane repeating shuffle mask and then shuffle the
13645
275
  // results into the target lanes.
13646
275
  if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
13647
275
          DL, MVT::v64i8, V1, V2, Mask, Subtarget, DAG))
13648
2
    return V;
13649
273
13650
273
  if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v64i8, V1, V2, Mask,
13651
273
                                                Zeroable, Subtarget, DAG))
13652
7
    return Blend;
13653
266
13654
266
  // FIXME: Implement direct support for this type!
13655
266
  return splitAndLowerVectorShuffle(DL, MVT::v64i8, V1, V2, Mask, DAG);
13656
266
}
13657
13658
/// \brief High-level routine to lower various 512-bit x86 vector shuffles.
13659
///
13660
/// This routine either breaks down the specific type of a 512-bit x86 vector
13661
/// shuffle or splits it into two 256-bit shuffles and fuses the results back
13662
/// together based on the available instructions.
13663
static SDValue lower512BitVectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
13664
                                        MVT VT, SDValue V1, SDValue V2,
13665
                                        const APInt &Zeroable,
13666
                                        const X86Subtarget &Subtarget,
13667
2.46k
                                        SelectionDAG &DAG) {
13668
2.46k
  assert(Subtarget.hasAVX512() &&
13669
2.46k
         "Cannot lower 512-bit vectors w/ basic ISA!");
13670
2.46k
13671
2.46k
  // If we have a single input to the zero element, insert that into V1 if we
13672
2.46k
  // can do so cheaply.
13673
2.46k
  int NumElts = Mask.size();
13674
51.7k
  int NumV2Elements = count_if(Mask, [NumElts](int M) { return M >= NumElts; });
13675
2.46k
13676
2.46k
  if (NumV2Elements == 1 && Mask[0] >= NumElts)
13677
23
    if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
13678
23
            DL, VT, V1, V2, Mask, Zeroable, Subtarget, DAG))
13679
14
      return Insertion;
13680
2.45k
13681
2.45k
  // Handle special cases where the lower or upper half is UNDEF.
13682
2.45k
  if (SDValue V =
13683
2.45k
        lowerVectorShuffleWithUndefHalf(DL, VT, V1, V2, Mask, Subtarget, DAG))
13684
68
    return V;
13685
2.38k
13686
2.38k
  // Check for being able to broadcast a single element.
13687
2.38k
  if (SDValue Broadcast =
13688
2.38k
          lowerVectorShuffleAsBroadcast(DL, VT, V1, V2, Mask, Subtarget, DAG))
13689
182
    return Broadcast;
13690
2.20k
13691
2.20k
  // Dispatch to each element type for lowering. If we don't have support for
13692
2.20k
  // specific element type shuffles at 512 bits, immediately split them and
13693
2.20k
  // lower them. Each lowering routine of a given type is allowed to assume that
13694
2.20k
  // the requisite ISA extensions for that element type are available.
13695
2.20k
  switch (VT.SimpleTy) {
13696
425
  case MVT::v8f64:
13697
425
    return lowerV8F64VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
13698
253
  case MVT::v16f32:
13699
253
    return lowerV16F32VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
13700
664
  case MVT::v8i64:
13701
664
    return lowerV8I64VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
13702
375
  case MVT::v16i32:
13703
375
    return lowerV16I32VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
13704
119
  case MVT::v32i16:
13705
119
    return lowerV32I16VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
13706
368
  case MVT::v64i8:
13707
368
    return lowerV64I8VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
13708
2.20k
13709
0
  default:
13710
0
    llvm_unreachable("Not a valid 512-bit x86 vector type!");
13711
0
  }
13712
0
}
13713
13714
// Lower vXi1 vector shuffles.
13715
// There is no dedicated instruction on AVX-512 that shuffles the masks.
13716
// The only way to shuffle bits is to sign-extend the mask vector to SIMD
13717
// vector, shuffle and then truncate it back.
13718
static SDValue lower1BitVectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
13719
                                      MVT VT, SDValue V1, SDValue V2,
13720
                                      const X86Subtarget &Subtarget,
13721
716
                                      SelectionDAG &DAG) {
13722
716
  assert(Subtarget.hasAVX512() &&
13723
716
         "Cannot lower 512-bit vectors w/o basic ISA!");
13724
716
  MVT ExtVT;
13725
716
  switch (VT.SimpleTy) {
13726
0
  default:
13727
0
    llvm_unreachable("Expected a vector of i1 elements");
13728
2
  case MVT::v2i1:
13729
2
    ExtVT = MVT::v2i64;
13730
2
    break;
13731
3
  case MVT::v4i1:
13732
3
    ExtVT = MVT::v4i32;
13733
3
    break;
13734
238
  case MVT::v8i1:
13735
238
    ExtVT = MVT::v8i64; // Take 512-bit type, more shuffles on KNL
13736
238
    break;
13737
217
  case MVT::v16i1:
13738
217
    ExtVT = MVT::v16i32;
13739
217
    break;
13740
3
  case MVT::v32i1:
13741
3
    ExtVT = MVT::v32i16;
13742
3
    break;
13743
253
  case MVT::v64i1:
13744
253
    ExtVT = MVT::v64i8;
13745
253
    break;
13746
716
  }
13747
716
13748
716
  if (ISD::isBuildVectorAllZeros(V1.getNode()))
13749
3
    V1 = getZeroVector(ExtVT, Subtarget, DAG, DL);
13750
713
  else if (ISD::isBuildVectorAllOnes(V1.getNode()))
13751
2
    V1 = getOnesVector(ExtVT, DAG, DL);
13752
713
  else
13753
711
    V1 = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, V1);
13754
716
13755
716
  if (V2.isUndef())
13756
12
    V2 = DAG.getUNDEF(ExtVT);
13757
704
  else if (ISD::isBuildVectorAllZeros(V2.getNode()))
13758
4
    V2 = getZeroVector(ExtVT, Subtarget, DAG, DL);
13759
700
  else if (ISD::isBuildVectorAllOnes(V2.getNode()))
13760
0
    V2 = getOnesVector(ExtVT, DAG, DL);
13761
700
  else
13762
700
    V2 = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, V2);
13763
716
13764
716
  SDValue Shuffle = DAG.getVectorShuffle(ExtVT, DL, V1, V2, Mask);
13765
716
  // i1 was sign extended, so we can use X86ISD::CVT2MASK.
13766
716
  int NumElems = VT.getVectorNumElements();
13767
716
  if ((Subtarget.hasBWI() && (NumElems >= 32)) ||
13768
460
      (Subtarget.hasDQI() && (NumElems < 32)))
13769
278
    return DAG.getNode(X86ISD::CVT2MASK, DL, VT, Shuffle);
13770
438
13771
438
  return DAG.getNode(ISD::TRUNCATE, DL, VT, Shuffle);
13772
438
}
13773
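An illustrative, self-contained sketch of the strategy the comments above describe for vXi1 shuffles, modeled on plain arrays rather than SDNodes (shuffleMaskBits is a hypothetical helper): widen each mask bit to a full element, shuffle the wide elements, then truncate back to bits.

#include <array>
#include <cstddef>
#include <cstdint>

template <std::size_t N>
std::array<uint8_t, N> shuffleMaskBits(const std::array<uint8_t, N> &V1,
                                       const std::array<uint8_t, N> &V2,
                                       const std::array<int, N> &Mask) {
  // "Sign extend" each bit to an i8 element: 1 -> 0xFF, 0 -> 0x00.
  std::array<int8_t, 2 * N> Wide{};
  for (std::size_t i = 0; i < N; ++i) {
    Wide[i] = V1[i] ? -1 : 0;
    Wide[N + i] = V2[i] ? -1 : 0;
  }
  // Shuffle the wide elements, then "truncate" each one back to a single bit.
  std::array<uint8_t, N> Out{};
  for (std::size_t i = 0; i < N; ++i)
    Out[i] = Mask[i] < 0 ? 0 : static_cast<uint8_t>(Wide[Mask[i]] & 1);
  return Out;
}

int main() {
  std::array<uint8_t, 4> V1{1, 0, 1, 0}, V2{0, 0, 1, 1};
  std::array<int, 4> Mask{3, 2, 5, 4};  // <V1[3], V1[2], V2[1], V2[0]>
  return shuffleMaskBits(V1, V2, Mask) == std::array<uint8_t, 4>{0, 1, 0, 0} ? 0 : 1;
}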
13774
/// Helper function that returns true if the shuffle mask should be
13775
/// commuted to improve canonicalization.
13776
81.6k
static bool canonicalizeShuffleMaskWithCommute(ArrayRef<int> Mask) {
13777
81.6k
  int NumElements = Mask.size();
13778
81.6k
13779
81.6k
  int NumV1Elements = 0, NumV2Elements = 0;
13780
81.6k
  for (int M : Mask)
13781
737k
    if (M < 0)
13782
114k
      continue;
13783
623k
    else if (M < NumElements)
13784
413k
      ++NumV1Elements;
13785
623k
    else
13786
209k
      ++NumV2Elements;
13787
81.6k
13788
81.6k
  // Commute the shuffle as needed such that more elements come from V1 than
13789
81.6k
  // V2. This allows us to match the shuffle pattern strictly on how many
13790
81.6k
  // elements come from V1 without handling the symmetric cases.
13791
81.6k
  if (NumV2Elements > NumV1Elements)
13792
3.17k
    return true;
13793
78.4k
13794
81.6k
  assert(NumV1Elements > 0 && "No V1 indices");
13795
78.4k
13796
78.4k
  if (NumV2Elements == 0)
13797
17.9k
    return false;
13798
60.5k
13799
60.5k
  // When the number of V1 and V2 elements are the same, try to minimize the
13800
60.5k
  // number of uses of V2 in the low half of the vector. When that is tied,
13801
60.5k
  // ensure that the sum of indices for V1 is equal to or lower than the sum
13802
60.5k
  // indices for V2. When those are equal, try to ensure that the number of odd
13803
60.5k
  // indices for V1 is lower than the number of odd indices for V2.
13804
60.5k
  if (NumV1Elements == NumV2Elements) {
13805
46.4k
    int LowV1Elements = 0, LowV2Elements = 0;
13806
46.4k
    for (int M : Mask.slice(0, NumElements / 2))
13807
174k
      if (M >= NumElements)
13808
43.1k
        ++LowV2Elements;
13809
131k
      else if (M >= 0)
13810
116k
        ++LowV1Elements;
13811
46.4k
    if (LowV2Elements > LowV1Elements)
13812
2.73k
      return true;
13813
43.7k
    if (LowV2Elements == LowV1Elements) {
13814
17.3k
      int SumV1Indices = 0, SumV2Indices = 0;
13815
180k
      for (int i = 0, Size = Mask.size(); i < Size; ++i)
13816
163k
        if (Mask[i] >= NumElements)
13817
68.4k
          SumV2Indices += i;
13818
94.8k
        else if (Mask[i] >= 0)
13819
68.4k
          SumV1Indices += i;
13820
17.3k
      if (SumV2Indices < SumV1Indices)
13821
674
        return true;
13822
16.6k
      if (SumV2Indices == SumV1Indices) {
13823
583
        int NumV1OddIndices = 0, NumV2OddIndices = 0;
13824
3.39k
        for (int i = 0, Size = Mask.size(); i < Size; ++i)
13825
2.80k
          if (Mask[i] >= NumElements)
13826
1.34k
            NumV2OddIndices += i % 2;
13827
1.46k
          else if (Mask[i] >= 0)
13828
1.34k
            NumV1OddIndices += i % 2;
13829
583
        if (NumV2OddIndices < NumV1OddIndices)
13830
0
          return true;
13831
57.0k
      }
13832
17.3k
    }
13833
46.4k
  }
13834
57.0k
13835
57.0k
  return false;
13836
57.0k
}
13837
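A tiny standalone check (hypothetical helper, covering only the first rule above) of the commute heuristic: a mask that draws more elements from V2 than from V1 should be commuted; ties fall through to the later low-half, index-sum, and odd-index comparisons.

#include <vector>

static bool shouldCommute(const std::vector<int> &Mask) {
  int NumElements = static_cast<int>(Mask.size());
  int NumV1 = 0, NumV2 = 0;
  for (int M : Mask) {
    if (M < 0)
      continue;            // undef elements count for neither side
    if (M < NumElements)
      ++NumV1;
    else
      ++NumV2;
  }
  return NumV2 > NumV1;
}

int main() {
  // A <4 x i32> shuffle taking three elements from V2 (indices >= 4).
  return shouldCommute({4, 5, 6, 0}) ? 0 : 1;
}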
13838
/// \brief Top-level lowering for x86 vector shuffles.
13839
///
13840
/// This handles decomposition, canonicalization, and lowering of all x86
13841
/// vector shuffles. Most of the specific lowering strategies are encapsulated
13842
/// above in helper routines. The canonicalization attempts to widen shuffles
13843
/// to involve fewer lanes of wider elements, consolidate symmetric patterns
13844
/// s.t. only one of the two inputs needs to be tested, etc.
13845
static SDValue lowerVectorShuffle(SDValue Op, const X86Subtarget &Subtarget,
13846
48.7k
                                  SelectionDAG &DAG) {
13847
48.7k
  ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
13848
48.7k
  ArrayRef<int> Mask = SVOp->getMask();
13849
48.7k
  SDValue V1 = Op.getOperand(0);
13850
48.7k
  SDValue V2 = Op.getOperand(1);
13851
48.7k
  MVT VT = Op.getSimpleValueType();
13852
48.7k
  int NumElements = VT.getVectorNumElements();
13853
48.7k
  SDLoc DL(Op);
13854
48.7k
  bool Is1BitVector = (VT.getVectorElementType() == MVT::i1);
13855
48.7k
13856
48.7k
  assert((VT.getSizeInBits() != 64 || Is1BitVector) &&
13857
48.7k
         "Can't lower MMX shuffles");
13858
48.7k
13859
48.7k
  bool V1IsUndef = V1.isUndef();
13860
48.7k
  bool V2IsUndef = V2.isUndef();
13861
48.7k
  if (V1IsUndef && V2IsUndef)
13862
0
    return DAG.getUNDEF(VT);
13863
48.7k
13864
48.7k
  // When we create a shuffle node we put the UNDEF node as the second operand,
13865
48.7k
  // but in some cases the first operand may be transformed to UNDEF.
13866
48.7k
  // In this case we should just commute the node.
13867
48.7k
  if (V1IsUndef)
13868
0
    return DAG.getCommutedVectorShuffle(*SVOp);
13869
48.7k
13870
48.7k
  // Check for non-undef masks pointing at an undef vector and make the masks
13871
48.7k
  // undef as well. This makes it easier to match the shuffle based solely on
13872
48.7k
  // the mask.
13873
48.7k
  if (V2IsUndef)
13874
21.7k
    for (int M : Mask)
13875
210k
      if (M >= NumElements) {
13876
0
        SmallVector<int, 8> NewMask(Mask.begin(), Mask.end());
13877
0
        for (int &M : NewMask)
13878
0
          if (M >= NumElements)
13879
0
            M = -1;
13880
21.7k
        return DAG.getVectorShuffle(VT, DL, V1, V2, NewMask);
13881
21.7k
      }
13882
48.7k
13883
48.7k
  // Check for illegal shuffle mask element index values.
13884
48.7k
  int MaskUpperLimit = Mask.size() * (V2IsUndef ? 1 : 2); (void)MaskUpperLimit;
13885
48.7k
  assert(llvm::all_of(Mask,
13886
48.7k
                      [&](int M) { return -1 <= M && M < MaskUpperLimit; }) &&
13887
48.7k
         "Out of bounds shuffle index");
13888
48.7k
13889
48.7k
  // We actually see shuffles that are entirely re-arrangements of a set of
13890
48.7k
  // zero inputs. This mostly happens while decomposing complex shuffles into
13891
48.7k
  // simple ones. Directly lower these as a buildvector of zeros.
13892
48.7k
  APInt Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
13893
48.7k
  if (Zeroable.isAllOnesValue())
13894
47
    return getZeroVector(VT, Subtarget, DAG, DL);
13895
48.6k
13896
48.6k
  // Try to collapse shuffles into using a vector type with fewer elements but
13897
48.6k
  // wider element types. We cap this to not form integers or floating point
13898
48.6k
  // elements wider than 64 bits, but it might be interesting to form i128
13899
48.6k
  // integers to handle flipping the low and high halves of AVX 256-bit vectors.
13900
48.6k
  SmallVector<int, 16> WidenedMask;
13901
48.6k
  if (VT.getScalarSizeInBits() < 64 && !Is1BitVector &&
13902
48.6k
      canWidenShuffleElements(Mask, WidenedMask)) {
13903
7.16k
    MVT NewEltVT = VT.isFloatingPoint()
13904
1.18k
                       ? MVT::getFloatingPointVT(VT.getScalarSizeInBits() * 2)
13905
5.98k
                       : MVT::getIntegerVT(VT.getScalarSizeInBits() * 2);
13906
7.16k
    MVT NewVT = MVT::getVectorVT(NewEltVT, VT.getVectorNumElements() / 2);
13907
7.16k
    // Make sure that the new vector type is legal. For example, v2f64 isn't
13908
7.16k
    // legal on SSE1.
13909
7.16k
    if (DAG.getTargetLoweringInfo().isTypeLegal(NewVT)) {
13910
7.02k
      V1 = DAG.getBitcast(NewVT, V1);
13911
7.02k
      V2 = DAG.getBitcast(NewVT, V2);
13912
7.02k
      return DAG.getBitcast(
13913
7.02k
          VT, DAG.getVectorShuffle(NewVT, DL, V1, V2, WidenedMask));
13914
7.02k
    }
13915
41.6k
  }
13916
41.6k
13917
41.6k
  // Commute the shuffle if it will improve canonicalization.
13918
41.6k
  if (canonicalizeShuffleMaskWithCommute(Mask))
13919
3.72k
    return DAG.getCommutedVectorShuffle(*SVOp);
13920
37.9k
13921
37.9k
  // For each vector width, delegate to a specialized lowering routine.
13922
37.9k
  if (VT.is128BitVector())
13923
24.6k
    return lower128BitVectorShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget,
13924
24.6k
                                    DAG);
13925
13.3k
13926
13.3k
  if (VT.is256BitVector())
13927
10.1k
    return lower256BitVectorShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget,
13928
10.1k
                                    DAG);
13929
3.18k
13930
3.18k
  if (VT.is512BitVector())
13931
2.46k
    return lower512BitVectorShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget,
13932
2.46k
                                    DAG);
13933
716
13934
716
  if (Is1BitVector)
13935
716
    return lower1BitVectorShuffle(DL, Mask, VT, V1, V2, Subtarget, DAG);
13936
0
13937
0
  llvm_unreachable("Unimplemented!");
13938
0
}
13939
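The widening step above (canWidenShuffleElements plus the bitcasts) can be illustrated with a standalone sketch; widenShuffleMask is a hypothetical helper that, unlike the real routine, simply rejects masks containing undef entries.

#include <optional>
#include <vector>

static std::optional<std::vector<int>>
widenShuffleMask(const std::vector<int> &Mask) {
  std::vector<int> Widened;
  for (std::size_t i = 0; i + 1 < Mask.size(); i += 2) {
    // Each pair must address an even/odd element pair of the wider type.
    if (Mask[i] % 2 != 0 || Mask[i + 1] != Mask[i] + 1)
      return std::nullopt;
    Widened.push_back(Mask[i] / 2);
  }
  return Widened;
}

int main() {
  // A v8i16 mask <2,3,0,1,6,7,4,5> widens to the v4i32 mask <1,0,3,2>.
  auto W = widenShuffleMask({2, 3, 0, 1, 6, 7, 4, 5});
  return (W && *W == std::vector<int>{1, 0, 3, 2}) ? 0 : 1;
}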
13940
/// \brief Try to lower a VSELECT instruction to a vector shuffle.
13941
static SDValue lowerVSELECTtoVectorShuffle(SDValue Op,
13942
                                           const X86Subtarget &Subtarget,
13943
2.81k
                                           SelectionDAG &DAG) {
13944
2.81k
  SDValue Cond = Op.getOperand(0);
13945
2.81k
  SDValue LHS = Op.getOperand(1);
13946
2.81k
  SDValue RHS = Op.getOperand(2);
13947
2.81k
  SDLoc dl(Op);
13948
2.81k
  MVT VT = Op.getSimpleValueType();
13949
2.81k
13950
2.81k
  if (!ISD::isBuildVectorOfConstantSDNodes(Cond.getNode()))
13951
2.46k
    return SDValue();
13952
354
  auto *CondBV = cast<BuildVectorSDNode>(Cond);
13953
354
13954
354
  // Only non-legal VSELECTs reach this lowering, convert those into generic
13955
354
  // shuffles and re-use the shuffle lowering path for blends.
13956
354
  SmallVector<int, 32> Mask;
13957
2.18k
  for (int i = 0, Size = VT.getVectorNumElements(); i < Size; ++i) {
13958
1.83k
    SDValue CondElt = CondBV->getOperand(i);
13959
1.83k
    Mask.push_back(
13960
1.82k
        isa<ConstantSDNode>(CondElt) ? i + (isNullConstant(CondElt) ? Size : 0)
13961
8
                                     : -1);
13962
1.83k
  }
13963
2.81k
  return DAG.getVectorShuffle(VT, dl, LHS, RHS, Mask);
13964
2.81k
}
13965
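An illustrative sketch (hypothetical helper, not LLVM API) of the mask construction in the loop above: a constant condition element selects LHS (index i) when non-zero and RHS (index i + Size) when zero; a non-constant element would become undef (-1), which this simplified version omits.

#include <vector>

static std::vector<int> blendMaskFromCond(const std::vector<int> &CondBits) {
  const int Size = static_cast<int>(CondBits.size());
  std::vector<int> Mask;
  for (int i = 0; i < Size; ++i)
    Mask.push_back(CondBits[i] ? i : i + Size);  // true -> LHS lane, false -> RHS lane
  return Mask;
}

int main() {
  // select <1,0,0,1>, LHS, RHS  ==>  shuffle mask <0, 5, 6, 3>
  return blendMaskFromCond({1, 0, 0, 1}) == std::vector<int>({0, 5, 6, 3}) ? 0 : 1;
}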
13966
19.7k
SDValue X86TargetLowering::LowerVSELECT(SDValue Op, SelectionDAG &DAG) const {
13967
19.7k
  // A vselect where all conditions and data are constants can be optimized into
13968
19.7k
  // a single vector load by SelectionDAGLegalize::ExpandBUILD_VECTOR().
13969
19.7k
  if (ISD::isBuildVectorOfConstantSDNodes(Op.getOperand(0).getNode()) &&
13970
3.90k
      ISD::isBuildVectorOfConstantSDNodes(Op.getOperand(1).getNode()) &&
13971
181
      ISD::isBuildVectorOfConstantSDNodes(Op.getOperand(2).getNode()))
13972
179
    return SDValue();
13973
19.5k
13974
19.5k
  // If this VSELECT has a vector of i1 as a mask, it will be directly matched
13975
19.5k
  // with patterns on the mask registers on AVX-512.
13976
19.5k
  if (Op->getOperand(0).getValueType().getScalarSizeInBits() == 1)
13977
16.7k
    return Op;
13978
2.81k
13979
2.81k
  // Try to lower this to a blend-style vector shuffle. This can handle all
13980
2.81k
  // constant condition cases.
13981
2.81k
  if (SDValue BlendOp = lowerVSELECTtoVectorShuffle(Op, Subtarget, DAG))
13982
354
    return BlendOp;
13983
2.46k
13984
2.46k
  // Variable blends are only legal from SSE4.1 onward.
13985
2.46k
  if (!Subtarget.hasSSE41())
13986
771
    return SDValue();
13987
1.69k
13988
1.69k
  SDLoc dl(Op);
13989
1.69k
  MVT VT = Op.getSimpleValueType();
13990
1.69k
13991
1.69k
  // If the VSELECT is on a 512-bit type, we have to convert a non-i1 condition
13992
1.69k
  // into an i1 condition so that we can use the mask-based 512-bit blend
13993
1.69k
  // instructions.
13994
1.69k
  if (VT.getSizeInBits() == 512) {
13995
4
    SDValue Cond = Op.getOperand(0);
13996
4
    // The vNi1 condition case should be handled above as it can be trivially
13997
4
    // lowered.
13998
4
    assert(Cond.getValueType().getScalarSizeInBits() ==
13999
4
               VT.getScalarSizeInBits() &&
14000
4
           "Should have a size-matched integer condition!");
14001
4
    // Build a mask by testing the condition against itself (tests for zero).
14002
4
    MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
14003
4
    SDValue Mask = DAG.getNode(X86ISD::TESTM, dl, MaskVT, Cond, Cond);
14004
4
    // Now return a new VSELECT using the mask.
14005
4
    return DAG.getSelect(dl, VT, Mask, Op.getOperand(1), Op.getOperand(2));
14006
4
  }
14007
1.68k
14008
1.68k
  // Only some types will be legal on some subtargets. If we can emit a legal
14009
1.68k
  // VSELECT-matching blend, return Op, but if we need to expand, return
14010
1.68k
  // a null value.
14011
1.68k
  switch (VT.SimpleTy) {
14012
1.66k
  default:
14013
1.66k
    // Most of the vector types have blends past SSE4.1.
14014
1.66k
    return Op;
14015
1.68k
14016
1
  case MVT::v32i8:
14017
1
    // The byte blends for AVX vectors were introduced only in AVX2.
14018
1
    if (Subtarget.hasAVX2())
14019
0
      return Op;
14020
1
14021
1
    return SDValue();
14022
1
14023
21
  case MVT::v8i16:
14024
21
  case MVT::v16i16:
14025
21
    // AVX-512 BWI and VLX features support VSELECT with i16 elements.
14026
21
    if (Subtarget.hasBWI() && Subtarget.hasVLX())
14027
0
      return Op;
14028
21
14029
21
    // FIXME: We should custom lower this by fixing the condition and using i8
14030
21
    // blends.
14031
21
    return SDValue();
14032
19.7k
  }
14033
19.7k
}
14034
14035
14.6k
static SDValue LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op, SelectionDAG &DAG) {
14036
14.6k
  MVT VT = Op.getSimpleValueType();
14037
14.6k
  SDLoc dl(Op);
14038
14.6k
14039
14.6k
  if (!Op.getOperand(0).getSimpleValueType().is128BitVector())
14040
0
    return SDValue();
14041
14.6k
14042
14.6k
  if (VT.getSizeInBits() == 8) {
14043
2.41k
    SDValue Extract = DAG.getNode(X86ISD::PEXTRB, dl, MVT::i32,
14044
2.41k
                                  Op.getOperand(0), Op.getOperand(1));
14045
2.41k
    SDValue Assert  = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Extract,
14046
2.41k
                                  DAG.getValueType(VT));
14047
2.41k
    return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert);
14048
2.41k
  }
14049
12.2k
14050
12.2k
  if (VT == MVT::f32) {
14051
3.12k
    // EXTRACTPS outputs to a GPR32 register which will require a movd to copy
14052
3.12k
    // the result back to FR32 register. It's only worth matching if the
14053
3.12k
    // result has a single use which is a store or a bitcast to i32.  And in
14054
3.12k
    // the case of a store, it's not worth it if the index is a constant 0,
14055
3.12k
    // because a MOVSSmr can be used instead, which is smaller and faster.
14056
3.12k
    if (!Op.hasOneUse())
14057
249
      return SDValue();
14058
2.88k
    SDNode *User = *Op.getNode()->use_begin();
14059
2.88k
    if ((User->getOpcode() != ISD::STORE ||
14060
366
         isNullConstant(Op.getOperand(1))) &&
14061
2.71k
        (User->getOpcode() != ISD::BITCAST ||
14062
8
         User->getValueType(0) != MVT::i32))
14063
2.70k
      return SDValue();
14064
178
    SDValue Extract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
14065
178
                                  DAG.getBitcast(MVT::v4i32, Op.getOperand(0)),
14066
178
                                  Op.getOperand(1));
14067
178
    return DAG.getBitcast(MVT::f32, Extract);
14068
178
  }
14069
9.10k
14070
9.10k
  if (VT == MVT::i32 || VT == MVT::i64) {
14071
6.53k
    // ExtractPS/pextrq works with constant index.
14072
6.53k
    if (isa<ConstantSDNode>(Op.getOperand(1)))
14073
6.53k
      return Op;
14074
2.57k
  }
14075
2.57k
14076
2.57k
  return SDValue();
14077
2.57k
}
14078
14079
/// Extract one bit from mask vector, like v16i1 or v8i1.
14080
/// AVX-512 feature.
14081
SDValue
14082
4.93k
X86TargetLowering::ExtractBitFromMaskVector(SDValue Op, SelectionDAG &DAG) const {
14083
4.93k
  SDValue Vec = Op.getOperand(0);
14084
4.93k
  SDLoc dl(Vec);
14085
4.93k
  MVT VecVT = Vec.getSimpleValueType();
14086
4.93k
  SDValue Idx = Op.getOperand(1);
14087
4.93k
  MVT EltVT = Op.getSimpleValueType();
14088
4.93k
14089
4.93k
  assert((VecVT.getVectorNumElements() <= 16 || Subtarget.hasBWI()) &&
14090
4.93k
         "Unexpected vector type in ExtractBitFromMaskVector");
14091
4.93k
14092
4.93k
  // variable index can't be handled in mask registers,
14093
4.93k
  // extend vector to VR512/128
14094
4.93k
  if (!isa<ConstantSDNode>(Idx)) {
14095
13
    unsigned NumElts = VecVT.getVectorNumElements();
14096
13
    // Extending v8i1/v16i1 to 512-bit get better performance on KNL
14097
13
    // than extending to 128/256bit.
14098
13
    unsigned VecSize = (NumElts <= 4 ? 128 : 512);
14099
13
    MVT ExtVT = MVT::getVectorVT(MVT::getIntegerVT(VecSize/NumElts), NumElts);
14100
13
    SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND, dl, ExtVT, Vec);
14101
13
    SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl,
14102
13
                              ExtVT.getVectorElementType(), Ext, Idx);
14103
13
    return DAG.getNode(ISD::TRUNCATE, dl, EltVT, Elt);
14104
13
  }
14105
4.92k
14106
4.92k
  unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
14107
4.92k
  if ((!Subtarget.hasDQI() && (VecVT.getVectorNumElements() == 8)) ||
14108
4.92k
      (VecVT.getVectorNumElements() < 8)) {
14109
1.77k
    // Use kshiftlw/rw instruction.
14110
1.77k
    VecVT = MVT::v16i1;
14111
1.77k
    Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, VecVT,
14112
1.77k
                      DAG.getUNDEF(VecVT),
14113
1.77k
                      Vec,
14114
1.77k
                      DAG.getIntPtrConstant(0, dl));
14115
1.77k
  }
14116
4.92k
  unsigned MaxSift = VecVT.getVectorNumElements() - 1;
14117
4.92k
  if (MaxSift - IdxVal)
14118
4.72k
    Vec = DAG.getNode(X86ISD::KSHIFTL, dl, VecVT, Vec,
14119
4.72k
                      DAG.getConstant(MaxSift - IdxVal, dl, MVT::i8));
14120
4.93k
  Vec = DAG.getNode(X86ISD::KSHIFTR, dl, VecVT, Vec,
14121
4.93k
                    DAG.getConstant(MaxSift, dl, MVT::i8));
14122
4.93k
  return DAG.getNode(X86ISD::VEXTRACT, dl, Op.getSimpleValueType(), Vec,
14123
4.93k
                     DAG.getIntPtrConstant(0, dl));
14124
4.93k
}
14125
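A scalar analogy (illustrative only; extractMaskBit is a hypothetical helper) for the KSHIFTL/KSHIFTR pair above: shift the requested mask bit up to the top of a 16-element mask register, then logically shift it back down to bit 0 so everything else is cleared.

#include <cstdint>

static uint16_t extractMaskBit(uint16_t Mask, unsigned IdxVal) {
  const unsigned MaxShift = 15;                                        // 16 x i1 mask
  uint16_t Tmp = static_cast<uint16_t>(Mask << (MaxShift - IdxVal));   // KSHIFTL
  return static_cast<uint16_t>(Tmp >> MaxShift);                       // KSHIFTR
}

int main() {
  return extractMaskBit(0x0020, 5) == 1 ? 0 : 1;  // bit 5 is set
}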
14126
SDValue
14127
X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
14128
31.4k
                                           SelectionDAG &DAG) const {
14129
31.4k
  SDLoc dl(Op);
14130
31.4k
  SDValue Vec = Op.getOperand(0);
14131
31.4k
  MVT VecVT = Vec.getSimpleValueType();
14132
31.4k
  SDValue Idx = Op.getOperand(1);
14133
31.4k
14134
31.4k
  if (VecVT.getVectorElementType() == MVT::i1)
14135
4.93k
    return ExtractBitFromMaskVector(Op, DAG);
14136
26.5k
14137
26.5k
  if (!isa<ConstantSDNode>(Idx)) {
14138
597
    // It's more profitable to go through memory (1 cycle throughput)
14139
597
    // than using VMOVD + VPERMV/PSHUFB sequence ( 2/3 cycles throughput)
14140
597
    // IACA tool was used to get performance estimation
14141
597
    // (https://software.intel.com/en-us/articles/intel-architecture-code-analyzer)
14142
597
    //
14143
597
    // example : extractelement <16 x i8> %a, i32 %i
14144
597
    //
14145
597
    // Block Throughput: 3.00 Cycles
14146
597
    // Throughput Bottleneck: Port5
14147
597
    //
14148
597
    // | Num Of |   Ports pressure in cycles  |    |
14149
597
    // |  Uops  |  0  - DV  |  5  |  6  |  7  |    |
14150
597
    // ---------------------------------------------
14151
597
    // |   1    |           | 1.0 |     |     | CP | vmovd xmm1, edi
14152
597
    // |   1    |           | 1.0 |     |     | CP | vpshufb xmm0, xmm0, xmm1
14153
597
    // |   2    | 1.0       | 1.0 |     |     | CP | vpextrb eax, xmm0, 0x0
14154
597
    // Total Num Of Uops: 4
14155
597
    //
14156
597
    //
14157
597
    // Block Throughput: 1.00 Cycles
14158
597
    // Throughput Bottleneck: PORT2_AGU, PORT3_AGU, Port4
14159
597
    //
14160
597
    // |    |  Ports pressure in cycles   |  |
14161
597
    // |Uops| 1 | 2 - D  |3 -  D  | 4 | 5 |  |
14162
597
    // ---------------------------------------------------------
14163
597
    // |2^  |   | 0.5    | 0.5    |1.0|   |CP| vmovaps xmmword ptr [rsp-0x18], xmm0
14164
597
    // |1   |0.5|        |        |   |0.5|  | lea rax, ptr [rsp-0x18]
14165
597
    // |1   |   |0.5, 0.5|0.5, 0.5|   |   |CP| mov al, byte ptr [rdi+rax*1]
14166
597
    // Total Num Of Uops: 4
14167
597
14168
597
    return SDValue();
14169
597
  }
14170
25.9k
14171
25.9k
  unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
14172
25.9k
14173
25.9k
  // If this is a 256-bit vector result, first extract the 128-bit vector and
14174
25.9k
  // then extract the element from the 128-bit vector.
14175
25.9k
  if (VecVT.is256BitVector() || VecVT.is512BitVector()) {
14176
3.50k
    // Get the 128-bit vector.
14177
3.50k
    Vec = extract128BitVector(Vec, IdxVal, DAG, dl);
14178
3.50k
    MVT EltVT = VecVT.getVectorElementType();
14179
3.50k
14180
3.50k
    unsigned ElemsPerChunk = 128 / EltVT.getSizeInBits();
14181
3.50k
    assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2");
14182
3.50k
14183
3.50k
    // Find IdxVal modulo ElemsPerChunk. Since ElemsPerChunk is a power of 2
14184
3.50k
    // this can be done with a mask.
14185
3.50k
    IdxVal &= ElemsPerChunk - 1;
14186
3.50k
    return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, Op.getValueType(), Vec,
14187
3.50k
                       DAG.getConstant(IdxVal, dl, MVT::i32));
14188
3.50k
  }
14189
22.4k
14190
25.9k
  assert(VecVT.is128BitVector() && "Unexpected vector length");
14191
22.4k
14192
22.4k
  MVT VT = Op.getSimpleValueType();
14193
22.4k
14194
22.4k
  if (VT.getSizeInBits() == 16) {
14195
574
    // If IdxVal is 0, it's cheaper to do a move instead of a pextrw, unless
14196
574
    // we're going to zero extend the register or fold the store (SSE41 only).
14197
574
    if (IdxVal == 0 && !MayFoldIntoZeroExtend(Op) &&
14198
164
        !(Subtarget.hasSSE41() && MayFoldIntoStore(Op)))
14199
96
      return DAG.getNode(ISD::TRUNCATE, dl, MVT::i16,
14200
96
                         DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
14201
96
                                     DAG.getBitcast(MVT::v4i32, Vec), Idx));
14202
478
14203
478
    // Transform it so it match pextrw which produces a 32-bit result.
14204
478
    SDValue Extract = DAG.getNode(X86ISD::PEXTRW, dl, MVT::i32,
14205
478
                                  Op.getOperand(0), Op.getOperand(1));
14206
478
    SDValue Assert  = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Extract,
14207
478
                                  DAG.getValueType(VT));
14208
478
    return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert);
14209
478
  }
14210
21.8k
14211
21.8k
  if (Subtarget.hasSSE41())
14212
14.6k
    if (SDValue Res = LowerEXTRACT_VECTOR_ELT_SSE4(Op, DAG))
14213
9.12k
      return Res;
14214
12.7k
14215
12.7k
  // TODO: We only extract a single element from v16i8, we can probably afford
14216
12.7k
  // to be more aggressive here before using the default approach of spilling to
14217
12.7k
  // stack.
14218
12.7k
  if (VT.getSizeInBits() == 8 && Op->isOnlyUserOf(Vec.getNode())) {
14219
34
    // Extract either the lowest i32 or any i16, and extract the sub-byte.
14220
34
    int DWordIdx = IdxVal / 4;
14221
34
    if (DWordIdx == 0) {
14222
15
      SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
14223
15
                                DAG.getBitcast(MVT::v4i32, Vec),
14224
15
                                DAG.getIntPtrConstant(DWordIdx, dl));
14225
15
      int ShiftVal = (IdxVal % 4) * 8;
14226
15
      if (ShiftVal != 0)
14227
7
        Res = DAG.getNode(ISD::SRL, dl, MVT::i32, Res,
14228
7
                          DAG.getConstant(ShiftVal, dl, MVT::i32));
14229
15
      return DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
14230
15
    }
14231
19
14232
19
    int WordIdx = IdxVal / 2;
14233
19
    SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16,
14234
19
                              DAG.getBitcast(MVT::v8i16, Vec),
14235
19
                              DAG.getIntPtrConstant(WordIdx, dl));
14236
19
    int ShiftVal = (IdxVal % 2) * 8;
14237
19
    if (ShiftVal != 0)
14238
5
      Res = DAG.getNode(ISD::SRL, dl, MVT::i16, Res,
14239
5
                        DAG.getConstant(ShiftVal, dl, MVT::i16));
14240
34
    return DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
14241
34
  }
14242
12.7k
14243
12.7k
  if (VT.getSizeInBits() == 32) {
14244
6.26k
    if (IdxVal == 0)
14245
5.18k
      return Op;
14246
1.08k
14247
1.08k
    // SHUFPS the element to the lowest double word, then movss.
14248
1.08k
    int Mask[4] = { static_cast<int>(IdxVal), -1, -1, -1 };
14249
1.08k
    Vec = DAG.getVectorShuffle(VecVT, dl, Vec, DAG.getUNDEF(VecVT), Mask);
14250
1.08k
    return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec,
14251
1.08k
                       DAG.getIntPtrConstant(0, dl));
14252
1.08k
  }
14253
6.43k
14254
6.43k
  if (VT.getSizeInBits() == 64) {
14255
5.91k
    // FIXME: .td only matches this for <2 x f64>, not <2 x i64> on 32b
14256
5.91k
    // FIXME: seems like this should be unnecessary if mov{h,l}pd were taught
14257
5.91k
    //        to match extract_elt for f64.
14258
5.91k
    if (IdxVal == 0)
14259
5.23k
      return Op;
14260
682
14261
682
    // UNPCKHPD the element to the lowest double word, then movsd.
14262
682
    // Note if the lower 64 bits of the result of the UNPCKHPD is then stored
14263
682
    // to a f64mem, the whole operation is folded into a single MOVHPDmr.
14264
682
    int Mask[2] = { 1, -1 };
14265
682
    Vec = DAG.getVectorShuffle(VecVT, dl, Vec, DAG.getUNDEF(VecVT), Mask);
14266
682
    return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec,
14267
682
                       DAG.getIntPtrConstant(0, dl));
14268
682
  }
14269
517
14270
517
  return SDValue();
14271
517
}
14272
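An illustrative sketch of the v16i8 fast path above (extract the containing 32-bit element, shift, truncate), written against a plain byte array; extractByte is hypothetical and the memcpy-based load assumes the little-endian layout x86 provides.

#include <cstdint>
#include <cstring>

static uint8_t extractByte(const uint8_t (&Vec)[16], unsigned IdxVal) {
  uint32_t DWord;
  std::memcpy(&DWord, Vec + (IdxVal / 4) * 4, sizeof(DWord));  // EXTRACT_VECTOR_ELT (i32)
  unsigned ShiftVal = (IdxVal % 4) * 8;
  return static_cast<uint8_t>(DWord >> ShiftVal);              // SRL + TRUNCATE
}

int main() {
  uint8_t Vec[16] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
  return extractByte(Vec, 6) == 6 ? 0 : 1;
}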
14273
/// Insert one bit to mask vector, like v16i1 or v8i1.
14274
/// AVX-512 feature.
14275
SDValue
14276
891
X86TargetLowering::InsertBitToMaskVector(SDValue Op, SelectionDAG &DAG) const {
14277
891
  SDLoc dl(Op);
14278
891
  SDValue Vec = Op.getOperand(0);
14279
891
  SDValue Elt = Op.getOperand(1);
14280
891
  SDValue Idx = Op.getOperand(2);
14281
891
  MVT VecVT = Vec.getSimpleValueType();
14282
891
14283
891
  if (!isa<ConstantSDNode>(Idx)) {
14284
0
    // Non constant index. Extend source and destination,
14285
0
    // insert element and then truncate the result.
14286
0
    MVT ExtVecVT = (VecVT == MVT::v8i1 ?  MVT::v8i64 : MVT::v16i32);
14287
0
    MVT ExtEltVT = (VecVT == MVT::v8i1 ?  MVT::i64 : MVT::i32);
14288
0
    SDValue ExtOp = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, ExtVecVT,
14289
0
      DAG.getNode(ISD::ZERO_EXTEND, dl, ExtVecVT, Vec),
14290
0
      DAG.getNode(ISD::ZERO_EXTEND, dl, ExtEltVT, Elt), Idx);
14291
0
    return DAG.getNode(ISD::TRUNCATE, dl, VecVT, ExtOp);
14292
0
  }
14293
891
14294
891
  unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
14295
891
  SDValue EltInVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, Elt);
14296
891
  unsigned NumElems = VecVT.getVectorNumElements();
14297
891
14298
891
  if(Vec.isUndef()) {
14299
6
    if (IdxVal)
14300
0
      EltInVec = DAG.getNode(X86ISD::KSHIFTL, dl, VecVT, EltInVec,
14301
0
                             DAG.getConstant(IdxVal, dl, MVT::i8));
14302
6
    return EltInVec;
14303
6
  }
14304
885
14305
885
  // Insertion of one bit into first position
14306
885
  if (IdxVal == 0) {
14307
185
    // Clean top bits of vector.
14308
185
    EltInVec = DAG.getNode(X86ISD::KSHIFTL, dl, VecVT, EltInVec,
14309
185
                           DAG.getConstant(NumElems - 1, dl, MVT::i8));
14310
185
    EltInVec = DAG.getNode(X86ISD::KSHIFTR, dl, VecVT, EltInVec,
14311
185
                           DAG.getConstant(NumElems - 1, dl, MVT::i8));
14312
185
    // Clean the first bit in source vector.
14313
185
    Vec = DAG.getNode(X86ISD::KSHIFTR, dl, VecVT, Vec,
14314
185
                      DAG.getConstant(1 , dl, MVT::i8));
14315
185
    Vec = DAG.getNode(X86ISD::KSHIFTL, dl, VecVT, Vec,
14316
185
                      DAG.getConstant(1, dl, MVT::i8));
14317
185
14318
185
    return DAG.getNode(ISD::OR, dl, VecVT, Vec, EltInVec);
14319
185
  }
14320
700
  // Insertion of one bit into last position
14321
700
  if (IdxVal == NumElems -1) {
14322
10
    // Move the bit to the last position inside the vector.
14323
10
    EltInVec = DAG.getNode(X86ISD::KSHIFTL, dl, VecVT, EltInVec,
14324
10
                           DAG.getConstant(IdxVal, dl, MVT::i8));
14325
10
    // Clean the last bit in the source vector.
14326
10
    Vec = DAG.getNode(X86ISD::KSHIFTL, dl, VecVT, Vec,
14327
10
                           DAG.getConstant(1, dl, MVT::i8));
14328
10
    Vec = DAG.getNode(X86ISD::KSHIFTR, dl, VecVT, Vec,
14329
10
                           DAG.getConstant(1 , dl, MVT::i8));
14330
10
14331
10
    return DAG.getNode(ISD::OR, dl, VecVT, Vec, EltInVec);
14332
10
  }
14333
690
14334
690
  // Use shuffle to insert element.
14335
690
  SmallVector<int, 64> MaskVec(NumElems);
14336
22.0k
  for (unsigned i = 0; i != NumElems; ++i)
14337
21.3k
    MaskVec[i] = (i == IdxVal) ? NumElems : i;
14338
891
14339
891
  return DAG.getVectorShuffle(VecVT, dl, Vec, EltInVec, MaskVec);
14340
891
}
14341
14342
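The IdxVal == 0 path above is easiest to follow as plain bit arithmetic on the mask register. A rough scalar sketch, using a uint16_t in place of a v16i1 mask (illustration only; the helper name is made up and this is not part of X86ISelLowering.cpp):

#include <cstdint>

// Insert a single bit into position 0 of a 16-bit mask, mirroring the
// KSHIFTL/KSHIFTR + OR sequence used for v16i1 above.
uint16_t insertBit0(uint16_t Vec, uint16_t Elt) {
  uint16_t EltInVec = (uint16_t)(Elt << 15) >> 15; // keep only bit 0 of Elt
  Vec = (uint16_t)(Vec >> 1) << 1;                 // clear bit 0 of Vec
  return Vec | EltInVec;                           // merge the new bit in
}
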
SDValue X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
                                                  SelectionDAG &DAG) const {
  MVT VT = Op.getSimpleValueType();
  MVT EltVT = VT.getVectorElementType();
  unsigned NumElts = VT.getVectorNumElements();

  if (EltVT == MVT::i1)
    return InsertBitToMaskVector(Op, DAG);

  SDLoc dl(Op);
  SDValue N0 = Op.getOperand(0);
  SDValue N1 = Op.getOperand(1);
  SDValue N2 = Op.getOperand(2);
  if (!isa<ConstantSDNode>(N2))
    return SDValue();
  auto *N2C = cast<ConstantSDNode>(N2);
  unsigned IdxVal = N2C->getZExtValue();

  bool IsZeroElt = X86::isZeroNode(N1);
  bool IsAllOnesElt = VT.isInteger() && llvm::isAllOnesConstant(N1);

  // If we are inserting an element, see if we can do this more efficiently with
  // a blend shuffle with a rematerializable vector than a costly integer
  // insertion.
  if ((IsZeroElt || IsAllOnesElt) && Subtarget.hasSSE41() &&
      16 <= EltVT.getSizeInBits()) {
    SmallVector<int, 8> BlendMask;
    for (unsigned i = 0; i != NumElts; ++i)
      BlendMask.push_back(i == IdxVal ? i + NumElts : i);
    SDValue CstVector = IsZeroElt ? getZeroVector(VT, Subtarget, DAG, dl)
                                  : getOnesVector(VT, DAG, dl);
    return DAG.getVectorShuffle(VT, dl, N0, CstVector, BlendMask);
  }

  // If the vector is wider than 128 bits, extract the 128-bit subvector, insert
  // into that, and then insert the subvector back into the result.
  if (VT.is256BitVector() || VT.is512BitVector()) {
    // With a 256-bit vector, we can insert into the zero element efficiently
    // using a blend if we have AVX or AVX2 and the right data type.
    if (VT.is256BitVector() && IdxVal == 0) {
      // TODO: It is worthwhile to cast integer to floating point and back
      // and incur a domain crossing penalty if that's what we'll end up
      // doing anyway after extracting to a 128-bit vector.
      if ((Subtarget.hasAVX() && (EltVT == MVT::f64 || EltVT == MVT::f32)) ||
          (Subtarget.hasAVX2() && EltVT == MVT::i32)) {
        SDValue N1Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, N1);
        N2 = DAG.getIntPtrConstant(1, dl);
        return DAG.getNode(X86ISD::BLENDI, dl, VT, N0, N1Vec, N2);
      }
    }

    // Get the desired 128-bit vector chunk.
    SDValue V = extract128BitVector(N0, IdxVal, DAG, dl);

    // Insert the element into the desired chunk.
    unsigned NumEltsIn128 = 128 / EltVT.getSizeInBits();
    assert(isPowerOf2_32(NumEltsIn128));
    // Since NumEltsIn128 is a power of 2 we can use mask instead of modulo.
    unsigned IdxIn128 = IdxVal & (NumEltsIn128 - 1);

    V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, V.getValueType(), V, N1,
                    DAG.getConstant(IdxIn128, dl, MVT::i32));

    // Insert the changed part back into the bigger vector
    return insert128BitVector(N0, V, IdxVal, DAG, dl);
  }
  assert(VT.is128BitVector() && "Only 128-bit vector types should be left!");

  // Transform it so it matches pinsr{b,w} which expects a GR32 as its second
  // argument. SSE41 required for pinsrb.
  if (VT == MVT::v8i16 || (VT == MVT::v16i8 && Subtarget.hasSSE41())) {
    unsigned Opc;
    if (VT == MVT::v8i16) {
      assert(Subtarget.hasSSE2() && "SSE2 required for PINSRW");
      Opc = X86ISD::PINSRW;
    } else {
      assert(VT == MVT::v16i8 && "PINSRB requires v16i8 vector");
      assert(Subtarget.hasSSE41() && "SSE41 required for PINSRB");
      Opc = X86ISD::PINSRB;
    }

    if (N1.getValueType() != MVT::i32)
      N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, N1);
    if (N2.getValueType() != MVT::i32)
      N2 = DAG.getIntPtrConstant(IdxVal, dl);
    return DAG.getNode(Opc, dl, VT, N0, N1, N2);
  }

  if (Subtarget.hasSSE41()) {
    if (EltVT == MVT::f32) {
      // Bits [7:6] of the constant are the source select. This will always be
      //   zero here. The DAG Combiner may combine an extract_elt index into
      //   these bits. For example (insert (extract, 3), 2) could be matched by
      //   putting the '3' into bits [7:6] of X86ISD::INSERTPS.
      // Bits [5:4] of the constant are the destination select. This is the
      //   value of the incoming immediate.
      // Bits [3:0] of the constant are the zero mask. The DAG Combiner may
      //   combine either bitwise AND or insert of float 0.0 to set these bits.

      bool MinSize = DAG.getMachineFunction().getFunction()->optForMinSize();
      if (IdxVal == 0 && (!MinSize || !MayFoldLoad(N1))) {
        // If this is an insertion of 32-bits into the low 32-bits of
        // a vector, we prefer to generate a blend with immediate rather
        // than an insertps. Blends are simpler operations in hardware and so
        // will always have equal or better performance than insertps.
        // But if optimizing for size and there's a load folding opportunity,
        // generate insertps because blendps does not have a 32-bit memory
        // operand form.
        N2 = DAG.getIntPtrConstant(1, dl);
        N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1);
        return DAG.getNode(X86ISD::BLENDI, dl, VT, N0, N1, N2);
      }
      N2 = DAG.getIntPtrConstant(IdxVal << 4, dl);
      // Create this as a scalar to vector.
      N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1);
      return DAG.getNode(X86ISD::INSERTPS, dl, VT, N0, N1, N2);
    }

    // PINSR* works with constant index.
    if (EltVT == MVT::i32 || EltVT == MVT::i64)
      return Op;
  }

  return SDValue();
}

static SDValue LowerSCALAR_TO_VECTOR(SDValue Op, const X86Subtarget &Subtarget,
                                     SelectionDAG &DAG) {
  SDLoc dl(Op);
  MVT OpVT = Op.getSimpleValueType();

  // It's always cheaper to replace an xor+movd with xorps, and it simplifies
  // further combines.
  if (X86::isZeroNode(Op.getOperand(0)))
    return getZeroVector(OpVT, Subtarget, DAG, dl);

  // If this is a 256-bit vector result, first insert into a 128-bit
  // vector and then insert into the 256-bit vector.
  if (!OpVT.is128BitVector()) {
    // Insert into a 128-bit vector.
    unsigned SizeFactor = OpVT.getSizeInBits() / 128;
    MVT VT128 = MVT::getVectorVT(OpVT.getVectorElementType(),
                                 OpVT.getVectorNumElements() / SizeFactor);

    Op = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT128, Op.getOperand(0));

    // Insert the 128-bit vector.
    return insert128BitVector(DAG.getUNDEF(OpVT), Op, 0, DAG, dl);
  }
  assert(OpVT.is128BitVector() && "Expected an SSE type!");

  // Pass through a v4i32 SCALAR_TO_VECTOR as that's what we use in tblgen.
  if (OpVT == MVT::v4i32)
    return Op;

  SDValue AnyExt = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Op.getOperand(0));
  return DAG.getBitcast(
      OpVT, DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, AnyExt));
}

// Lower a node with an EXTRACT_SUBVECTOR opcode.  This may result in
// a simple subregister reference or explicit instructions to grab
// upper bits of a vector.
static SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, const X86Subtarget &Subtarget,
                                      SelectionDAG &DAG) {
  SDLoc dl(Op);
  SDValue In = Op.getOperand(0);
  SDValue Idx = Op.getOperand(1);
  MVT ResVT = Op.getSimpleValueType();

  // When v1i1 is legal a scalarization of a vselect with a vXi1 Cond
  // would result with: v1i1 = extract_subvector(vXi1, idx).
  // Lower these into extract_vector_elt which is already selectable.
  assert(ResVT == MVT::v1i1);
  assert(Subtarget.hasAVX512() &&
         "Boolean EXTRACT_SUBVECTOR requires AVX512");

  MVT EltVT = ResVT.getVectorElementType();
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
  MVT LegalVT =
      (TLI.getTypeToTransformTo(*DAG.getContext(), EltVT)).getSimpleVT();
  SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, LegalVT, In, Idx);
  return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, ResVT, Res);
}

// Lower a node with an INSERT_SUBVECTOR opcode.  This may result in a
// simple superregister reference or explicit instructions to insert
// the upper bits of a vector.
static SDValue LowerINSERT_SUBVECTOR(SDValue Op, const X86Subtarget &Subtarget,
                                     SelectionDAG &DAG) {
  assert(Op.getSimpleValueType().getVectorElementType() == MVT::i1);

  return insert1BitVector(Op, DAG, Subtarget);
}

// Returns the appropriate wrapper opcode for a global reference.
unsigned X86TargetLowering::getGlobalWrapperKind(const GlobalValue *GV) const {
  // References to absolute symbols are never PC-relative.
  if (GV && GV->isAbsoluteSymbolRef())
    return X86ISD::Wrapper;

  CodeModel::Model M = getTargetMachine().getCodeModel();
  if (Subtarget.isPICStyleRIPRel() &&
      (M == CodeModel::Small || M == CodeModel::Kernel))
    return X86ISD::WrapperRIP;

  return X86ISD::Wrapper;
}

// ConstantPool, JumpTable, GlobalAddress, and ExternalSymbol are lowered as
// their target counterpart wrapped in the X86ISD::Wrapper node. Suppose N is
// one of the above mentioned nodes. It has to be wrapped because otherwise
// Select(N) returns N. So the raw TargetGlobalAddress nodes, etc. can only
// be used to form addressing mode. These wrapped nodes will be selected
// into MOV32ri.
SDValue
X86TargetLowering::LowerConstantPool(SDValue Op, SelectionDAG &DAG) const {
  ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op);

  // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
  // global base reg.
  unsigned char OpFlag = Subtarget.classifyLocalReference(nullptr);

  auto PtrVT = getPointerTy(DAG.getDataLayout());
  SDValue Result = DAG.getTargetConstantPool(
      CP->getConstVal(), PtrVT, CP->getAlignment(), CP->getOffset(), OpFlag);
  SDLoc DL(CP);
  Result = DAG.getNode(getGlobalWrapperKind(), DL, PtrVT, Result);
  // With PIC, the address is actually $g + Offset.
  if (OpFlag) {
    Result =
        DAG.getNode(ISD::ADD, DL, PtrVT,
                    DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), Result);
  }

  return Result;
}

SDValue X86TargetLowering::LowerJumpTable(SDValue Op, SelectionDAG &DAG) const {
  JumpTableSDNode *JT = cast<JumpTableSDNode>(Op);

  // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
  // global base reg.
  unsigned char OpFlag = Subtarget.classifyLocalReference(nullptr);

  auto PtrVT = getPointerTy(DAG.getDataLayout());
  SDValue Result = DAG.getTargetJumpTable(JT->getIndex(), PtrVT, OpFlag);
  SDLoc DL(JT);
  Result = DAG.getNode(getGlobalWrapperKind(), DL, PtrVT, Result);

  // With PIC, the address is actually $g + Offset.
  if (OpFlag)
    Result =
        DAG.getNode(ISD::ADD, DL, PtrVT,
                    DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), Result);

  return Result;
}

SDValue
X86TargetLowering::LowerExternalSymbol(SDValue Op, SelectionDAG &DAG) const {
  const char *Sym = cast<ExternalSymbolSDNode>(Op)->getSymbol();

  // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
  // global base reg.
  const Module *Mod = DAG.getMachineFunction().getFunction()->getParent();
  unsigned char OpFlag = Subtarget.classifyGlobalReference(nullptr, *Mod);

  auto PtrVT = getPointerTy(DAG.getDataLayout());
  SDValue Result = DAG.getTargetExternalSymbol(Sym, PtrVT, OpFlag);

  SDLoc DL(Op);
  Result = DAG.getNode(getGlobalWrapperKind(), DL, PtrVT, Result);

  // With PIC, the address is actually $g + Offset.
  if (isPositionIndependent() && !Subtarget.is64Bit()) {
    Result =
        DAG.getNode(ISD::ADD, DL, PtrVT,
                    DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), Result);
  }

  // For symbols that require a load from a stub to get the address, emit the
  // load.
  if (isGlobalStubReference(OpFlag))
    Result = DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), Result,
                         MachinePointerInfo::getGOT(DAG.getMachineFunction()));

  return Result;
}

SDValue
X86TargetLowering::LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const {
  // Create the TargetBlockAddressAddress node.
  unsigned char OpFlags =
    Subtarget.classifyBlockAddressReference();
  const BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress();
  int64_t Offset = cast<BlockAddressSDNode>(Op)->getOffset();
  SDLoc dl(Op);
  auto PtrVT = getPointerTy(DAG.getDataLayout());
  SDValue Result = DAG.getTargetBlockAddress(BA, PtrVT, Offset, OpFlags);
  Result = DAG.getNode(getGlobalWrapperKind(), dl, PtrVT, Result);

  // With PIC, the address is actually $g + Offset.
  if (isGlobalRelativeToPICBase(OpFlags)) {
    Result = DAG.getNode(ISD::ADD, dl, PtrVT,
                         DAG.getNode(X86ISD::GlobalBaseReg, dl, PtrVT), Result);
  }

  return Result;
}

SDValue X86TargetLowering::LowerGlobalAddress(const GlobalValue *GV,
                                              const SDLoc &dl, int64_t Offset,
                                              SelectionDAG &DAG) const {
  // Create the TargetGlobalAddress node, folding in the constant
  // offset if it is legal.
  unsigned char OpFlags = Subtarget.classifyGlobalReference(GV);
  CodeModel::Model M = DAG.getTarget().getCodeModel();
  auto PtrVT = getPointerTy(DAG.getDataLayout());
  SDValue Result;
  if (OpFlags == X86II::MO_NO_FLAG &&
      X86::isOffsetSuitableForCodeModel(Offset, M)) {
    // A direct static reference to a global.
    Result = DAG.getTargetGlobalAddress(GV, dl, PtrVT, Offset);
    Offset = 0;
  } else {
    Result = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, OpFlags);
  }

  Result = DAG.getNode(getGlobalWrapperKind(GV), dl, PtrVT, Result);

  // With PIC, the address is actually $g + Offset.
  if (isGlobalRelativeToPICBase(OpFlags)) {
    Result = DAG.getNode(ISD::ADD, dl, PtrVT,
                         DAG.getNode(X86ISD::GlobalBaseReg, dl, PtrVT), Result);
  }

  // For globals that require a load from a stub to get the address, emit the
  // load.
  if (isGlobalStubReference(OpFlags))
    Result = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Result,
                         MachinePointerInfo::getGOT(DAG.getMachineFunction()));

  // If there was a non-zero offset that we didn't fold, create an explicit
  // addition for it.
  if (Offset != 0)
    Result = DAG.getNode(ISD::ADD, dl, PtrVT, Result,
                         DAG.getConstant(Offset, dl, PtrVT));

  return Result;
}

SDValue
X86TargetLowering::LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const {
  const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
  int64_t Offset = cast<GlobalAddressSDNode>(Op)->getOffset();
  return LowerGlobalAddress(GV, SDLoc(Op), Offset, DAG);
}

static SDValue
GetTLSADDR(SelectionDAG &DAG, SDValue Chain, GlobalAddressSDNode *GA,
           SDValue *InFlag, const EVT PtrVT, unsigned ReturnReg,
           unsigned char OperandFlags, bool LocalDynamic = false) {
  MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
  SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
  SDLoc dl(GA);
  SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
                                           GA->getValueType(0),
                                           GA->getOffset(),
                                           OperandFlags);

  X86ISD::NodeType CallType = LocalDynamic ? X86ISD::TLSBASEADDR
                                           : X86ISD::TLSADDR;

  if (InFlag) {
    SDValue Ops[] = { Chain, TGA, *InFlag };
    Chain = DAG.getNode(CallType, dl, NodeTys, Ops);
  } else {
    SDValue Ops[] = { Chain, TGA };
    Chain = DAG.getNode(CallType, dl, NodeTys, Ops);
  }

  // TLSADDR will be codegen'ed as call. Inform MFI that function has calls.
  MFI.setAdjustsStack(true);
  MFI.setHasCalls(true);

  SDValue Flag = Chain.getValue(1);
  return DAG.getCopyFromReg(Chain, dl, ReturnReg, PtrVT, Flag);
}

// Lower ISD::GlobalTLSAddress using the "general dynamic" model, 32 bit
static SDValue
LowerToTLSGeneralDynamicModel32(GlobalAddressSDNode *GA, SelectionDAG &DAG,
                                const EVT PtrVT) {
  SDValue InFlag;
  SDLoc dl(GA);  // ? function entry point might be better
  SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, X86::EBX,
                                   DAG.getNode(X86ISD::GlobalBaseReg,
                                               SDLoc(), PtrVT), InFlag);
  InFlag = Chain.getValue(1);

  return GetTLSADDR(DAG, Chain, GA, &InFlag, PtrVT, X86::EAX, X86II::MO_TLSGD);
}

// Lower ISD::GlobalTLSAddress using the "general dynamic" model, 64 bit
static SDValue
LowerToTLSGeneralDynamicModel64(GlobalAddressSDNode *GA, SelectionDAG &DAG,
                                const EVT PtrVT) {
  return GetTLSADDR(DAG, DAG.getEntryNode(), GA, nullptr, PtrVT,
                    X86::RAX, X86II::MO_TLSGD);
}

static SDValue LowerToTLSLocalDynamicModel(GlobalAddressSDNode *GA,
                                           SelectionDAG &DAG,
                                           const EVT PtrVT,
                                           bool is64Bit) {
  SDLoc dl(GA);

  // Get the start address of the TLS block for this module.
  X86MachineFunctionInfo *MFI = DAG.getMachineFunction()
      .getInfo<X86MachineFunctionInfo>();
  MFI->incNumLocalDynamicTLSAccesses();

  SDValue Base;
  if (is64Bit) {
    Base = GetTLSADDR(DAG, DAG.getEntryNode(), GA, nullptr, PtrVT, X86::RAX,
                      X86II::MO_TLSLD, /*LocalDynamic=*/true);
  } else {
    SDValue InFlag;
    SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, X86::EBX,
        DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), InFlag);
    InFlag = Chain.getValue(1);
    Base = GetTLSADDR(DAG, Chain, GA, &InFlag, PtrVT, X86::EAX,
                      X86II::MO_TLSLDM, /*LocalDynamic=*/true);
  }

  // Note: the CleanupLocalDynamicTLSPass will remove redundant computations
  // of Base.

  // Build x@dtpoff.
  unsigned char OperandFlags = X86II::MO_DTPOFF;
  unsigned WrapperKind = X86ISD::Wrapper;
  SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
                                           GA->getValueType(0),
                                           GA->getOffset(), OperandFlags);
  SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA);

  // Add x@dtpoff with the base.
  return DAG.getNode(ISD::ADD, dl, PtrVT, Offset, Base);
}

// Lower ISD::GlobalTLSAddress using the "initial exec" or "local exec" model.
static SDValue LowerToTLSExecModel(GlobalAddressSDNode *GA, SelectionDAG &DAG,
                                   const EVT PtrVT, TLSModel::Model model,
                                   bool is64Bit, bool isPIC) {
  SDLoc dl(GA);

  // Get the Thread Pointer, which is %gs:0 (32-bit) or %fs:0 (64-bit).
  Value *Ptr = Constant::getNullValue(Type::getInt8PtrTy(*DAG.getContext(),
                                                         is64Bit ? 257 : 256));

  SDValue ThreadPointer =
      DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), DAG.getIntPtrConstant(0, dl),
                  MachinePointerInfo(Ptr));

  unsigned char OperandFlags = 0;
  // Most TLS accesses are not RIP relative, even on x86-64.  One exception is
  // initialexec.
  unsigned WrapperKind = X86ISD::Wrapper;
  if (model == TLSModel::LocalExec) {
    OperandFlags = is64Bit ? X86II::MO_TPOFF : X86II::MO_NTPOFF;
  } else if (model == TLSModel::InitialExec) {
    if (is64Bit) {
      OperandFlags = X86II::MO_GOTTPOFF;
      WrapperKind = X86ISD::WrapperRIP;
    } else {
      OperandFlags = isPIC ? X86II::MO_GOTNTPOFF : X86II::MO_INDNTPOFF;
    }
  } else {
    llvm_unreachable("Unexpected model");
  }

  // emit "addl x@ntpoff,%eax" (local exec)
  // or "addl x@indntpoff,%eax" (initial exec)
  // or "addl x@gotntpoff(%ebx) ,%eax" (initial exec, 32-bit pic)
  SDValue TGA =
      DAG.getTargetGlobalAddress(GA->getGlobal(), dl, GA->getValueType(0),
                                 GA->getOffset(), OperandFlags);
  SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA);

  if (model == TLSModel::InitialExec) {
    if (isPIC && !is64Bit) {
      Offset = DAG.getNode(ISD::ADD, dl, PtrVT,
                           DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT),
                           Offset);
    }

    Offset = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Offset,
                         MachinePointerInfo::getGOT(DAG.getMachineFunction()));
  }

  // The address of the thread local variable is the add of the thread
  // pointer with the offset of the variable.
  return DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, Offset);
}

SDValue
X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const {

  GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);

  if (DAG.getTarget().Options.EmulatedTLS)
    return LowerToTLSEmulatedModel(GA, DAG);

  const GlobalValue *GV = GA->getGlobal();
  auto PtrVT = getPointerTy(DAG.getDataLayout());
  bool PositionIndependent = isPositionIndependent();

  if (Subtarget.isTargetELF()) {
    TLSModel::Model model = DAG.getTarget().getTLSModel(GV);
    switch (model) {
      case TLSModel::GeneralDynamic:
        if (Subtarget.is64Bit())
          return LowerToTLSGeneralDynamicModel64(GA, DAG, PtrVT);
        return LowerToTLSGeneralDynamicModel32(GA, DAG, PtrVT);
      case TLSModel::LocalDynamic:
        return LowerToTLSLocalDynamicModel(GA, DAG, PtrVT,
                                           Subtarget.is64Bit());
      case TLSModel::InitialExec:
      case TLSModel::LocalExec:
        return LowerToTLSExecModel(GA, DAG, PtrVT, model, Subtarget.is64Bit(),
                                   PositionIndependent);
    }
    llvm_unreachable("Unknown TLS model.");
  }

  if (Subtarget.isTargetDarwin()) {
    // Darwin only has one model of TLS.  Lower to that.
    unsigned char OpFlag = 0;
    unsigned WrapperKind = Subtarget.isPICStyleRIPRel() ?
                           X86ISD::WrapperRIP : X86ISD::Wrapper;

    // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
    // global base reg.
    bool PIC32 = PositionIndependent && !Subtarget.is64Bit();
    if (PIC32)
      OpFlag = X86II::MO_TLVP_PIC_BASE;
    else
      OpFlag = X86II::MO_TLVP;
    SDLoc DL(Op);
    SDValue Result = DAG.getTargetGlobalAddress(GA->getGlobal(), DL,
                                                GA->getValueType(0),
                                                GA->getOffset(), OpFlag);
    SDValue Offset = DAG.getNode(WrapperKind, DL, PtrVT, Result);

    // With PIC32, the address is actually $g + Offset.
    if (PIC32)
      Offset = DAG.getNode(ISD::ADD, DL, PtrVT,
                           DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT),
                           Offset);

    // Lowering the machine isd will make sure everything is in the right
    // location.
    SDValue Chain = DAG.getEntryNode();
    SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
    Chain = DAG.getCALLSEQ_START(Chain, 0, 0, DL);
    SDValue Args[] = { Chain, Offset };
    Chain = DAG.getNode(X86ISD::TLSCALL, DL, NodeTys, Args);
    Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(0, DL, true),
                               DAG.getIntPtrConstant(0, DL, true),
                               Chain.getValue(1), DL);

    // TLSCALL will be codegen'ed as call. Inform MFI that function has calls.
    MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
    MFI.setAdjustsStack(true);

    // And our return value (tls address) is in the standard call return value
    // location.
    unsigned Reg = Subtarget.is64Bit() ? X86::RAX : X86::EAX;
    return DAG.getCopyFromReg(Chain, DL, Reg, PtrVT, Chain.getValue(1));
  }

  if (Subtarget.isTargetKnownWindowsMSVC() ||
      Subtarget.isTargetWindowsItanium() ||
      Subtarget.isTargetWindowsGNU()) {
    // Just use the implicit TLS architecture
    // Need to generate something similar to:
    //   mov     rdx, qword [gs:abs 58H]; Load pointer to ThreadLocalStorage
    //                                  ; from TEB
    //   mov     ecx, dword [rel _tls_index]: Load index (from C runtime)
    //   mov     rcx, qword [rdx+rcx*8]
    //   mov     eax, .tls$:tlsvar
    //   [rax+rcx] contains the address
    // Windows 64bit: gs:0x58
    // Windows 32bit: fs:__tls_array

    SDLoc dl(GA);
    SDValue Chain = DAG.getEntryNode();

    // Get the Thread Pointer, which is %fs:__tls_array (32-bit) or
    // %gs:0x58 (64-bit). On MinGW, __tls_array is not available, so directly
    // use its literal value of 0x2C.
    Value *Ptr = Constant::getNullValue(Subtarget.is64Bit()
                                        ? Type::getInt8PtrTy(*DAG.getContext(),
                                                             256)
                                        : Type::getInt32PtrTy(*DAG.getContext(),
                                                              257));

    SDValue TlsArray = Subtarget.is64Bit()
                           ? DAG.getIntPtrConstant(0x58, dl)
                           : (Subtarget.isTargetWindowsGNU()
                                  ? DAG.getIntPtrConstant(0x2C, dl)
                                  : DAG.getExternalSymbol("_tls_array", PtrVT));

    SDValue ThreadPointer =
        DAG.getLoad(PtrVT, dl, Chain, TlsArray, MachinePointerInfo(Ptr));

    SDValue res;
    if (GV->getThreadLocalMode() == GlobalVariable::LocalExecTLSModel) {
      res = ThreadPointer;
    } else {
      // Load the _tls_index variable
      SDValue IDX = DAG.getExternalSymbol("_tls_index", PtrVT);
      if (Subtarget.is64Bit())
        IDX = DAG.getExtLoad(ISD::ZEXTLOAD, dl, PtrVT, Chain, IDX,
                             MachinePointerInfo(), MVT::i32);
      else
        IDX = DAG.getLoad(PtrVT, dl, Chain, IDX, MachinePointerInfo());

      auto &DL = DAG.getDataLayout();
      SDValue Scale =
          DAG.getConstant(Log2_64_Ceil(DL.getPointerSize()), dl, PtrVT);
      IDX = DAG.getNode(ISD::SHL, dl, PtrVT, IDX, Scale);

      res = DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, IDX);
    }

    res = DAG.getLoad(PtrVT, dl, Chain, res, MachinePointerInfo());

    // Get the offset of start of .tls section
    SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
                                             GA->getValueType(0),
                                             GA->getOffset(), X86II::MO_SECREL);
    SDValue Offset = DAG.getNode(X86ISD::Wrapper, dl, PtrVT, TGA);

    // The address of the thread local variable is the add of the thread
    // pointer with the offset of the variable.
    return DAG.getNode(ISD::ADD, dl, PtrVT, res, Offset);
  }

  llvm_unreachable("TLS not implemented for this target.");
}

/// Lower SRA_PARTS and friends, which return two i32 values
/// and take a 2 x i32 value to shift plus a shift amount.
static SDValue LowerShiftParts(SDValue Op, SelectionDAG &DAG) {
  assert(Op.getNumOperands() == 3 && "Not a double-shift!");
  MVT VT = Op.getSimpleValueType();
  unsigned VTBits = VT.getSizeInBits();
  SDLoc dl(Op);
  bool isSRA = Op.getOpcode() == ISD::SRA_PARTS;
  SDValue ShOpLo = Op.getOperand(0);
  SDValue ShOpHi = Op.getOperand(1);
  SDValue ShAmt  = Op.getOperand(2);
  // X86ISD::SHLD and X86ISD::SHRD have defined overflow behavior but the
  // generic ISD nodes haven't. Insert an AND to be safe, it's optimized away
  // during isel.
  SDValue SafeShAmt = DAG.getNode(ISD::AND, dl, MVT::i8, ShAmt,
                                  DAG.getConstant(VTBits - 1, dl, MVT::i8));
  SDValue Tmp1 = isSRA ? DAG.getNode(ISD::SRA, dl, VT, ShOpHi,
                                     DAG.getConstant(VTBits - 1, dl, MVT::i8))
                       : DAG.getConstant(0, dl, VT);

  SDValue Tmp2, Tmp3;
  if (Op.getOpcode() == ISD::SHL_PARTS) {
    Tmp2 = DAG.getNode(X86ISD::SHLD, dl, VT, ShOpHi, ShOpLo, ShAmt);
    Tmp3 = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, SafeShAmt);
  } else {
    Tmp2 = DAG.getNode(X86ISD::SHRD, dl, VT, ShOpLo, ShOpHi, ShAmt);
    Tmp3 = DAG.getNode(isSRA ? ISD::SRA : ISD::SRL, dl, VT, ShOpHi, SafeShAmt);
  }

  // If the shift amount is larger than or equal to the width of a part, we
  // can't rely on the results of shld/shrd. Insert a test and select the
  // appropriate values for large shift amounts.
  SDValue AndNode = DAG.getNode(ISD::AND, dl, MVT::i8, ShAmt,
                                DAG.getConstant(VTBits, dl, MVT::i8));
  SDValue Cond = DAG.getNode(X86ISD::CMP, dl, MVT::i32,
                             AndNode, DAG.getConstant(0, dl, MVT::i8));

  SDValue Hi, Lo;
  SDValue CC = DAG.getConstant(X86::COND_NE, dl, MVT::i8);
  SDValue Ops0[4] = { Tmp2, Tmp3, CC, Cond };
  SDValue Ops1[4] = { Tmp3, Tmp1, CC, Cond };

  if (Op.getOpcode() == ISD::SHL_PARTS) {
    Hi = DAG.getNode(X86ISD::CMOV, dl, VT, Ops0);
    Lo = DAG.getNode(X86ISD::CMOV, dl, VT, Ops1);
  } else {
    Lo = DAG.getNode(X86ISD::CMOV, dl, VT, Ops0);
    Hi = DAG.getNode(X86ISD::CMOV, dl, VT, Ops1);
  }

  SDValue Ops[2] = { Lo, Hi };
  return DAG.getMergeValues(Ops, dl);
}

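For reference, the same split-shift idea in scalar form: a 64-bit left shift built from two 32-bit halves, with a compare/select standing in for the out-of-range case. A minimal sketch in plain C++ (illustration only; the helper name is made up and this is not part of this file):

#include <cstdint>

// (Hi:Lo) << Amt for Amt in [0, 63], using only 32-bit shifts, mirroring the
// SHLD + SHL + CMOV pattern emitted by LowerShiftParts for SHL_PARTS.
void shl64Parts(uint32_t Lo, uint32_t Hi, unsigned Amt,
                uint32_t &OutLo, uint32_t &OutHi) {
  unsigned Safe = Amt & 31;                       // the "SafeShAmt" AND above
  uint32_t Tmp2 = (Hi << Safe) |                  // double-precision shift (shld)
                  (Safe ? (Lo >> (32 - Safe)) : 0);
  uint32_t Tmp3 = Lo << Safe;                     // plain 32-bit shift of Lo
  if (Amt & 32) {                                 // the select on (Amt & 32) != 0
    OutHi = Tmp3;                                 // high word takes the shifted Lo
    OutLo = 0;                                    // low word becomes zero
  } else {
    OutHi = Tmp2;
    OutLo = Tmp3;
  }
}
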
SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op,
                                           SelectionDAG &DAG) const {
  SDValue Src = Op.getOperand(0);
  MVT SrcVT = Src.getSimpleValueType();
  MVT VT = Op.getSimpleValueType();
  SDLoc dl(Op);

  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
  if (SrcVT.isVector()) {
    if (SrcVT == MVT::v2i32 && VT == MVT::v2f64) {
      return DAG.getNode(X86ISD::CVTSI2P, dl, VT,
                         DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Src,
                                     DAG.getUNDEF(SrcVT)));
    }
    if (SrcVT.getVectorElementType() == MVT::i1) {
      if (SrcVT == MVT::v2i1 && TLI.isTypeLegal(SrcVT))
        return DAG.getNode(ISD::SINT_TO_FP, dl, Op.getValueType(),
                           DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v2i64, Src));
      MVT IntegerVT = MVT::getVectorVT(MVT::i32, SrcVT.getVectorNumElements());
      return DAG.getNode(ISD::SINT_TO_FP, dl, Op.getValueType(),
                         DAG.getNode(ISD::SIGN_EXTEND, dl, IntegerVT, Src));
    }
    return SDValue();
  }

  assert(SrcVT <= MVT::i64 && SrcVT >= MVT::i16 &&
         "Unknown SINT_TO_FP to lower!");

  // These are really Legal; return the operand so the caller accepts it as
  // Legal.
  if (SrcVT == MVT::i32 && isScalarFPTypeInSSEReg(Op.getValueType()))
    return Op;
  if (SrcVT == MVT::i64 && isScalarFPTypeInSSEReg(Op.getValueType()) &&
      Subtarget.is64Bit()) {
    return Op;
  }

  SDValue ValueToStore = Op.getOperand(0);
  if (SrcVT == MVT::i64 && isScalarFPTypeInSSEReg(Op.getValueType()) &&
      !Subtarget.is64Bit())
    // Bitcasting to f64 here allows us to do a single 64-bit store from
    // an SSE register, avoiding the store forwarding penalty that would come
    // with two 32-bit stores.
    ValueToStore = DAG.getBitcast(MVT::f64, ValueToStore);

  unsigned Size = SrcVT.getSizeInBits()/8;
  MachineFunction &MF = DAG.getMachineFunction();
  auto PtrVT = getPointerTy(MF.getDataLayout());
  int SSFI = MF.getFrameInfo().CreateStackObject(Size, Size, false);
  SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
  SDValue Chain = DAG.getStore(
      DAG.getEntryNode(), dl, ValueToStore, StackSlot,
      MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI));
  return BuildFILD(Op, SrcVT, Chain, StackSlot, DAG);
}

SDValue X86TargetLowering::BuildFILD(SDValue Op, EVT SrcVT, SDValue Chain,
                                     SDValue StackSlot,
                                     SelectionDAG &DAG) const {
  // Build the FILD
  SDLoc DL(Op);
  SDVTList Tys;
  bool useSSE = isScalarFPTypeInSSEReg(Op.getValueType());
  if (useSSE)
    Tys = DAG.getVTList(MVT::f64, MVT::Other, MVT::Glue);
  else
    Tys = DAG.getVTList(Op.getValueType(), MVT::Other);

  unsigned ByteSize = SrcVT.getSizeInBits()/8;

  FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(StackSlot);
  MachineMemOperand *MMO;
  if (FI) {
    int SSFI = FI->getIndex();
    MMO = DAG.getMachineFunction().getMachineMemOperand(
        MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI),
        MachineMemOperand::MOLoad, ByteSize, ByteSize);
  } else {
    MMO = cast<LoadSDNode>(StackSlot)->getMemOperand();
    StackSlot = StackSlot.getOperand(1);
  }
  SDValue Ops[] = { Chain, StackSlot, DAG.getValueType(SrcVT) };
  SDValue Result = DAG.getMemIntrinsicNode(useSSE ? X86ISD::FILD_FLAG :
                                           X86ISD::FILD, DL,
                                           Tys, Ops, SrcVT, MMO);

  if (useSSE) {
    Chain = Result.getValue(1);
    SDValue InFlag = Result.getValue(2);

    // FIXME: Currently the FST is flagged to the FILD_FLAG. This
    // shouldn't be necessary except that RFP cannot be live across
    // multiple blocks. When stackifier is fixed, they can be uncoupled.
    MachineFunction &MF = DAG.getMachineFunction();
    unsigned SSFISize = Op.getValueSizeInBits()/8;
    int SSFI = MF.getFrameInfo().CreateStackObject(SSFISize, SSFISize, false);
    auto PtrVT = getPointerTy(MF.getDataLayout());
    SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
    Tys = DAG.getVTList(MVT::Other);
    SDValue Ops[] = {
      Chain, Result, StackSlot, DAG.getValueType(Op.getValueType()), InFlag
    };
    MachineMemOperand *MMO = DAG.getMachineFunction().getMachineMemOperand(
        MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI),
        MachineMemOperand::MOStore, SSFISize, SSFISize);

    Chain = DAG.getMemIntrinsicNode(X86ISD::FST, DL, Tys,
                                    Ops, Op.getValueType(), MMO);
    Result = DAG.getLoad(
        Op.getValueType(), DL, Chain, StackSlot,
        MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI));
  }

  return Result;
}

/// 64-bit unsigned integer to double expansion.
SDValue X86TargetLowering::LowerUINT_TO_FP_i64(SDValue Op,
                                               SelectionDAG &DAG) const {
  // This algorithm is not obvious. Here is what we're trying to output:
  /*
     movq       %rax,  %xmm0
     punpckldq  (c0),  %xmm0  // c0: (uint4){ 0x43300000U, 0x45300000U, 0U, 0U }
     subpd      (c1),  %xmm0  // c1: (double2){ 0x1.0p52, 0x1.0p52 * 0x1.0p32 }
     #ifdef __SSE3__
       haddpd   %xmm0, %xmm0
     #else
       pshufd   $0x4e, %xmm0, %xmm1
       addpd    %xmm1, %xmm0
     #endif
  */

  SDLoc dl(Op);
  LLVMContext *Context = DAG.getContext();

  // Build some magic constants.
  static const uint32_t CV0[] = { 0x43300000, 0x45300000, 0, 0 };
  Constant *C0 = ConstantDataVector::get(*Context, CV0);
  auto PtrVT = getPointerTy(DAG.getDataLayout());
  SDValue CPIdx0 = DAG.getConstantPool(C0, PtrVT, 16);

  SmallVector<Constant*,2> CV1;
  CV1.push_back(
    ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble(),
                                      APInt(64, 0x4330000000000000ULL))));
  CV1.push_back(
    ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble(),
                                      APInt(64, 0x4530000000000000ULL))));
  Constant *C1 = ConstantVector::get(CV1);
  SDValue CPIdx1 = DAG.getConstantPool(C1, PtrVT, 16);

  // Load the 64-bit value into an XMM register.
  SDValue XR1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64,
                            Op.getOperand(0));
  SDValue CLod0 =
      DAG.getLoad(MVT::v4i32, dl, DAG.getEntryNode(), CPIdx0,
                  MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
                  /* Alignment = */ 16);
  SDValue Unpck1 =
      getUnpackl(DAG, dl, MVT::v4i32, DAG.getBitcast(MVT::v4i32, XR1), CLod0);

  SDValue CLod1 =
      DAG.getLoad(MVT::v2f64, dl, CLod0.getValue(1), CPIdx1,
                  MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
                  /* Alignment = */ 16);
  SDValue XR2F = DAG.getBitcast(MVT::v2f64, Unpck1);
  // TODO: Are there any fast-math-flags to propagate here?
  SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, XR2F, CLod1);
  SDValue Result;

  if (Subtarget.hasSSE3()) {
    // FIXME: The 'haddpd' instruction may be slower than 'movhlps + addsd'.
    Result = DAG.getNode(X86ISD::FHADD, dl, MVT::v2f64, Sub, Sub);
  } else {
    SDValue S2F = DAG.getBitcast(MVT::v4i32, Sub);
    SDValue Shuffle = DAG.getVectorShuffle(MVT::v4i32, dl, S2F, S2F, {2,3,0,1});
    Result = DAG.getNode(ISD::FADD, dl, MVT::v2f64,
                         DAG.getBitcast(MVT::v2f64, Shuffle), Sub);
  }

  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Result,
                     DAG.getIntPtrConstant(0, dl));
}

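The magic constants are easier to see in scalar form: each 32-bit half of the input is planted into the mantissa of a double whose exponent already encodes its weight, and subtracting that exponent's bias leaves the exact value of the half. A rough scalar equivalent of the expansion above (illustration only; the helper name is made up and the memcpy bit-casts stand in for the vector bitcasts):

#include <cstdint>
#include <cstring>

// u64 -> f64 without a signed conversion, mirroring the 0x43300000/0x45300000
// constant-pool trick used by LowerUINT_TO_FP_i64.
double u64ToF64(uint64_t X) {
  const double P52 = 4503599627370496.0;     // 2^52
  const double P84 = P52 * 4294967296.0;     // 2^84 (= 2^52 * 2^32)
  uint64_t LoBits = (X & 0xFFFFFFFFull) | 0x4330000000000000ull; // 2^52 + lo32
  uint64_t HiBits = (X >> 32)           | 0x4530000000000000ull; // 2^84 + hi32*2^32
  double Lo, Hi;
  std::memcpy(&Lo, &LoBits, sizeof(double));
  std::memcpy(&Hi, &HiBits, sizeof(double));
  // Subtract the same biases the subpd removes (c1 above), then add the halves.
  return (Lo - P52) + (Hi - P84);
}
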
/// 32-bit unsigned integer to float expansion.
SDValue X86TargetLowering::LowerUINT_TO_FP_i32(SDValue Op,
                                               SelectionDAG &DAG) const {
  SDLoc dl(Op);
  // FP constant to bias correct the final result.
  SDValue Bias = DAG.getConstantFP(BitsToDouble(0x4330000000000000ULL), dl,
                                   MVT::f64);

  // Load the 32-bit value into an XMM register.
  SDValue Load = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32,
                             Op.getOperand(0));

  // Zero out the upper parts of the register.
  Load = getShuffleVectorZeroOrUndef(Load, 0, true, Subtarget, DAG);

  Load = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
                     DAG.getBitcast(MVT::v2f64, Load),
                     DAG.getIntPtrConstant(0, dl));

  // Or the load with the bias.
  SDValue Or = DAG.getNode(
      ISD::OR, dl, MVT::v2i64,
      DAG.getBitcast(MVT::v2i64,
                     DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f64, Load)),
      DAG.getBitcast(MVT::v2i64,
                     DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f64, Bias)));
  Or =
      DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
                  DAG.getBitcast(MVT::v2f64, Or), DAG.getIntPtrConstant(0, dl));

  // Subtract the bias.
  // TODO: Are there any fast-math-flags to propagate here?
  SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::f64, Or, Bias);

  // Handle final rounding.
  MVT DestVT = Op.getSimpleValueType();

  if (DestVT.bitsLT(MVT::f64))
    return DAG.getNode(ISD::FP_ROUND, dl, DestVT, Sub,
                       DAG.getIntPtrConstant(0, dl));
  if (DestVT.bitsGT(MVT::f64))
    return DAG.getNode(ISD::FP_EXTEND, dl, DestVT, Sub);

  // Handle final rounding.
  return Sub;
}

static SDValue lowerUINT_TO_FP_v2i32(SDValue Op, SelectionDAG &DAG,
                                     const X86Subtarget &Subtarget, SDLoc &DL) {
  if (Op.getSimpleValueType() != MVT::v2f64)
    return SDValue();

  SDValue N0 = Op.getOperand(0);
  assert(N0.getSimpleValueType() == MVT::v2i32 && "Unexpected input type");

  // Legalize to v4i32 type.
  N0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4i32, N0,
                   DAG.getUNDEF(MVT::v2i32));

  if (Subtarget.hasAVX512())
    return DAG.getNode(X86ISD::CVTUI2P, DL, MVT::v2f64, N0);

  // Same implementation as VectorLegalizer::ExpandUINT_TO_FLOAT,
  // but using v2i32 to v2f64 with X86ISD::CVTSI2P.
  SDValue HalfWord = DAG.getConstant(16, DL, MVT::v4i32);
  SDValue HalfWordMask = DAG.getConstant(0x0000FFFF, DL, MVT::v4i32);

  // Two to the power of half-word-size.
  SDValue TWOHW = DAG.getConstantFP(1 << 16, DL, MVT::v2f64);

  // Clear upper part of LO, lower HI.
  SDValue HI = DAG.getNode(ISD::SRL, DL, MVT::v4i32, N0, HalfWord);
  SDValue LO = DAG.getNode(ISD::AND, DL, MVT::v4i32, N0, HalfWordMask);

  SDValue fHI = DAG.getNode(X86ISD::CVTSI2P, DL, MVT::v2f64, HI);
          fHI = DAG.getNode(ISD::FMUL, DL, MVT::v2f64, fHI, TWOHW);
  SDValue fLO = DAG.getNode(X86ISD::CVTSI2P, DL, MVT::v2f64, LO);

  // Add the two halves.
  return DAG.getNode(ISD::FADD, DL, MVT::v2f64, fHI, fLO);
}

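A scalar sketch of the non-AVX-512 path above, in plain C++ (illustration only; the helper name is made up): both halves are below 2^16, so each signed convert is exact and the final double add loses nothing.

#include <cstdint>

// u32 -> f64 using two signed conversions, mirroring the halfword split in
// lowerUINT_TO_FP_v2i32.
double u32ToF64ViaHalves(uint32_t V) {
  double FHi = (double)(int32_t)(V >> 16) * 65536.0; // high half, scaled by 2^16
  double FLo = (double)(int32_t)(V & 0xFFFF);        // low half
  return FHi + FLo;
}
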
static SDValue lowerUINT_TO_FP_vXi32(SDValue Op, SelectionDAG &DAG,
                                     const X86Subtarget &Subtarget) {
  // The algorithm is the following:
  // #ifdef __SSE4_1__
  //     uint4 lo = _mm_blend_epi16( v, (uint4) 0x4b000000, 0xaa);
  //     uint4 hi = _mm_blend_epi16( _mm_srli_epi32(v,16),
  //                                 (uint4) 0x53000000, 0xaa);
  // #else
  //     uint4 lo = (v & (uint4) 0xffff) | (uint4) 0x4b000000;
  //     uint4 hi = (v >> 16) | (uint4) 0x53000000;
  // #endif
  //     float4 fhi = (float4) hi - (0x1.0p39f + 0x1.0p23f);
  //     return (float4) lo + fhi;

  // We shouldn't use it when unsafe-fp-math is enabled though: we might later
  // reassociate the two FADDs, and if we do that, the algorithm fails
  // spectacularly (PR24512).
  // FIXME: If we ever have some kind of Machine FMF, this should be marked
  // as non-fast and always be enabled. Why isn't SDAG FMF enough? Because
  // there's also the MachineCombiner reassociations happening on Machine IR.
  if (DAG.getTarget().Options.UnsafeFPMath)
    return SDValue();

  SDLoc DL(Op);
  SDValue V = Op->getOperand(0);
  MVT VecIntVT = V.getSimpleValueType();
  bool Is128 = VecIntVT == MVT::v4i32;
  MVT VecFloatVT = Is128 ? MVT::v4f32 : MVT::v8f32;
  // If we convert to something else than the supported type, e.g., to v4f64,
  // abort early.
  if (VecFloatVT != Op->getSimpleValueType(0))
    return SDValue();

  assert((VecIntVT == MVT::v4i32 || VecIntVT == MVT::v8i32) &&
         "Unsupported custom type");

  // In the #ifdef/#else code, we have in common:
  // - The vector of constants:
  // -- 0x4b000000
  // -- 0x53000000
  // - A shift:
  // -- v >> 16

  // Create the splat vector for 0x4b000000.
  SDValue VecCstLow = DAG.getConstant(0x4b000000, DL, VecIntVT);
  // Create the splat vector for 0x53000000.
  SDValue VecCstHigh = DAG.getConstant(0x53000000, DL, VecIntVT);

  // Create the right shift.
  SDValue VecCstShift = DAG.getConstant(16, DL, VecIntVT);
  SDValue HighShift = DAG.getNode(ISD::SRL, DL, VecIntVT, V, VecCstShift);

  SDValue Low, High;
  if (Subtarget.hasSSE41()) {
    MVT VecI16VT = Is128 ? MVT::v8i16 : MVT::v16i16;
    //     uint4 lo = _mm_blend_epi16( v, (uint4) 0x4b000000, 0xaa);
    SDValue VecCstLowBitcast = DAG.getBitcast(VecI16VT, VecCstLow);
    SDValue VecBitcast = DAG.getBitcast(VecI16VT, V);
    // Low will be bitcasted right away, so do not bother bitcasting back to its
    // original type.
    Low = DAG.getNode(X86ISD::BLENDI, DL, VecI16VT, VecBitcast,
                      VecCstLowBitcast, DAG.getConstant(0xaa, DL, MVT::i32));
    //     uint4 hi = _mm_blend_epi16( _mm_srli_epi32(v,16),
    //                                 (uint4) 0x53000000, 0xaa);
    SDValue VecCstHighBitcast = DAG.getBitcast(VecI16VT, VecCstHigh);
    SDValue VecShiftBitcast = DAG.getBitcast(VecI16VT, HighShift);
    // High will be bitcasted right away, so do not bother bitcasting back to
    // its original type.
    High = DAG.getNode(X86ISD::BLENDI, DL, VecI16VT, VecShiftBitcast,
                       VecCstHighBitcast, DAG.getConstant(0xaa, DL, MVT::i32));
  } else {
    SDValue VecCstMask = DAG.getConstant(0xffff, DL, VecIntVT);
    //     uint4 lo = (v & (uint4) 0xffff) | (uint4) 0x4b000000;
    SDValue LowAnd = DAG.getNode(ISD::AND, DL, VecIntVT, V, VecCstMask);
    Low = DAG.getNode(ISD::OR, DL, VecIntVT, LowAnd, VecCstLow);

    //     uint4 hi = (v >> 16) | (uint4) 0x53000000;
    High = DAG.getNode(ISD::OR, DL, VecIntVT, HighShift, VecCstHigh);
  }

  // Create the vector constant for -(0x1.0p39f + 0x1.0p23f).
  SDValue VecCstFAdd = DAG.getConstantFP(
      APFloat(APFloat::IEEEsingle(), APInt(32, 0xD3000080)), DL, VecFloatVT);

  //     float4 fhi = (float4) hi - (0x1.0p39f + 0x1.0p23f);
  SDValue HighBitcast = DAG.getBitcast(VecFloatVT, High);
  // TODO: Are there any fast-math-flags to propagate here?
  SDValue FHigh =
      DAG.getNode(ISD::FADD, DL, VecFloatVT, HighBitcast, VecCstFAdd);
  //     return (float4) lo + fhi;
  SDValue LowBitcast = DAG.getBitcast(VecFloatVT, Low);
  return DAG.getNode(ISD::FADD, DL, VecFloatVT, LowBitcast, FHigh);
}

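Scalar sketch of the same recipe for a single lane (illustration only; the helper name is made up and C++17 hex-float literals are assumed): lo carries the low 16 bits in the mantissa of 2^23, hi carries the high 16 bits in the mantissa of 2^39, and subtracting (2^39 + 2^23) from hi cancels both exponent biases before the final add.

#include <cstdint>
#include <cstring>

// u32 -> f32 per the lo/hi blend algorithm in lowerUINT_TO_FP_vXi32.
float u32ToF32(uint32_t V) {
  uint32_t LoBits = (V & 0xFFFFu) | 0x4B000000u;  // float bits of 2^23 + lo16
  uint32_t HiBits = (V >> 16)     | 0x53000000u;  // float bits of 2^39 + hi16*2^16
  float Lo, Hi;
  std::memcpy(&Lo, &LoBits, sizeof(float));
  std::memcpy(&Hi, &HiBits, sizeof(float));
  float FHi = Hi - (0x1.0p39f + 0x1.0p23f);       // exact: removes both biases
  return Lo + FHi;                                // single rounding at the end
}
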
SDValue X86TargetLowering::lowerUINT_TO_FP_vec(SDValue Op,
                                               SelectionDAG &DAG) const {
  SDValue N0 = Op.getOperand(0);
  MVT SrcVT = N0.getSimpleValueType();
  SDLoc dl(Op);

  if (SrcVT.getVectorElementType() == MVT::i1) {
    if (SrcVT == MVT::v2i1)
      return DAG.getNode(ISD::UINT_TO_FP, dl, Op.getValueType(),
                         DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v2i64, N0));
    MVT IntegerVT = MVT::getVectorVT(MVT::i32, SrcVT.getVectorNumElements());
    return DAG.getNode(ISD::UINT_TO_FP, dl, Op.getValueType(),
                       DAG.getNode(ISD::ZERO_EXTEND, dl, IntegerVT, N0));
  }

  switch (SrcVT.SimpleTy) {
  default:
    llvm_unreachable("Custom UINT_TO_FP is not supported!");
  case MVT::v4i8:
  case MVT::v4i16:
  case MVT::v8i8:
  case MVT::v8i16: {
    MVT NVT = MVT::getVectorVT(MVT::i32, SrcVT.getVectorNumElements());
    return DAG.getNode(ISD::SINT_TO_FP, dl, Op.getValueType(),
                       DAG.getNode(ISD::ZERO_EXTEND, dl, NVT, N0));
  }
  case MVT::v2i32:
    return lowerUINT_TO_FP_v2i32(Op, DAG, Subtarget, dl);
  case MVT::v4i32:
  case MVT::v8i32:
    return lowerUINT_TO_FP_vXi32(Op, DAG, Subtarget);
  case MVT::v16i8:
  case MVT::v16i16:
    assert(Subtarget.hasAVX512());
    return DAG.getNode(ISD::UINT_TO_FP, dl, Op.getValueType(),
                       DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v16i32, N0));
  }
}

SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op,
15449
1.07k
                                           SelectionDAG &DAG) const {
15450
1.07k
  SDValue N0 = Op.getOperand(0);
15451
1.07k
  SDLoc dl(Op);
15452
1.07k
  auto PtrVT = getPointerTy(DAG.getDataLayout());
15453
1.07k
15454
1.07k
  // Since UINT_TO_FP is legal (it's marked custom), dag combiner won't
15455
1.07k
  // optimize it to a SINT_TO_FP when the sign bit is known zero. Perform
15456
1.07k
  // the optimization here.
15457
1.07k
  if (DAG.SignBitIsZero(N0))
15458
0
    return DAG.getNode(ISD::SINT_TO_FP, dl, Op.getValueType(), N0);
15459
1.07k
15460
1.07k
  
if (1.07k
Op.getSimpleValueType().isVector()1.07k
)
15461
150
    return lowerUINT_TO_FP_vec(Op, DAG);
15462
927
15463
927
  MVT SrcVT = N0.getSimpleValueType();
15464
927
  MVT DstVT = Op.getSimpleValueType();
15465
927
15466
927
  if (Subtarget.hasAVX512() && isScalarFPTypeInSSEReg(DstVT) &&
15467
927
      (SrcVT == MVT::i32 || (SrcVT == MVT::i64 && Subtarget.is64Bit()))) {
15468
610
    // Conversions from unsigned i32 to f32/f64 are legal,
15469
610
    // using VCVTUSI2SS/SD.  Same for i64 in 64-bit mode.
15470
610
    return Op;
15471
610
  }
15472
317
15473
317
  if (SrcVT == MVT::i64 && DstVT == MVT::f64 && X86ScalarSSEf64)
15474
53
    return LowerUINT_TO_FP_i64(Op, DAG);
15475
264
  if (SrcVT == MVT::i32 && X86ScalarSSEf64)
15476
13
    return LowerUINT_TO_FP_i32(Op, DAG);
15477
251
  if (Subtarget.is64Bit() && SrcVT == MVT::i64 && DstVT == MVT::f32)
15478
78
    return SDValue();
15479
173
15480
173
  // Make a 64-bit buffer, and use it to build an FILD.
15481
173
  SDValue StackSlot = DAG.CreateStackTemporary(MVT::i64);
15482
173
  if (SrcVT == MVT::i32) {
15483
65
    SDValue OffsetSlot = DAG.getMemBasePlusOffset(StackSlot, 4, dl);
15484
65
    SDValue Store1 = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0),
15485
65
                                  StackSlot, MachinePointerInfo());
15486
65
    SDValue Store2 = DAG.getStore(Store1, dl, DAG.getConstant(0, dl, MVT::i32),
15487
65
                                  OffsetSlot, MachinePointerInfo());
15488
65
    SDValue Fild = BuildFILD(Op, MVT::i64, Store2, StackSlot, DAG);
15489
65
    return Fild;
15490
65
  }
15491
108
15492
173
  assert(SrcVT == MVT::i64 && "Unexpected type in UINT_TO_FP");
15493
108
  SDValue ValueToStore = Op.getOperand(0);
15494
108
  if (isScalarFPTypeInSSEReg(Op.getValueType()) && !Subtarget.is64Bit())
15495
108
    // Bitcasting to f64 here allows us to do a single 64-bit store from
15496
108
    // an SSE register, avoiding the store forwarding penalty that would come
15497
108
    // with two 32-bit stores.
15498
50
    ValueToStore = DAG.getBitcast(MVT::f64, ValueToStore);
15499
1.07k
  SDValue Store = DAG.getStore(DAG.getEntryNode(), dl, ValueToStore, StackSlot,
15500
1.07k
                               MachinePointerInfo());
15501
1.07k
  // For i64 source, we need to add the appropriate power of 2 if the input
15502
1.07k
  // was negative.  This is the same as the optimization in
15503
1.07k
  // DAGTypeLegalizer::ExpandIntOp_UNIT_TO_FP, and for it to be safe here,
15504
1.07k
  // we must be careful to do the computation in x87 extended precision, not
15505
1.07k
  // in SSE. (The generic code can't know it's OK to do this, or how to.)
15506
1.07k
  int SSFI = cast<FrameIndexSDNode>(StackSlot)->getIndex();
15507
1.07k
  MachineMemOperand *MMO = DAG.getMachineFunction().getMachineMemOperand(
15508
1.07k
      MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI),
15509
1.07k
      MachineMemOperand::MOLoad, 8, 8);
15510
1.07k
15511
1.07k
  SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
15512
1.07k
  SDValue Ops[] = { Store, StackSlot, DAG.getValueType(MVT::i64) };
15513
1.07k
  SDValue Fild = DAG.getMemIntrinsicNode(X86ISD::FILD, dl, Tys, Ops,
15514
1.07k
                                         MVT::i64, MMO);
15515
1.07k
15516
1.07k
  APInt FF(32, 0x5F800000ULL);
15517
1.07k
15518
1.07k
  // Check whether the sign bit is set.
15519
1.07k
  SDValue SignSet = DAG.getSetCC(
15520
1.07k
      dl, getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::i64),
15521
1.07k
      Op.getOperand(0), DAG.getConstant(0, dl, MVT::i64), ISD::SETLT);
15522
1.07k
15523
1.07k
  // Build a 64 bit pair (0, FF) in the constant pool, with FF in the lo bits.
15524
1.07k
  SDValue FudgePtr = DAG.getConstantPool(
15525
1.07k
      ConstantInt::get(*DAG.getContext(), FF.zext(64)), PtrVT);
15526
1.07k
15527
1.07k
  // Get a pointer to FF if the sign bit was set, or to 0 otherwise.
15528
1.07k
  SDValue Zero = DAG.getIntPtrConstant(0, dl);
15529
1.07k
  SDValue Four = DAG.getIntPtrConstant(4, dl);
15530
1.07k
  SDValue Offset = DAG.getSelect(dl, Zero.getValueType(), SignSet, Zero, Four);
15531
1.07k
  FudgePtr = DAG.getNode(ISD::ADD, dl, PtrVT, FudgePtr, Offset);
15532
1.07k
15533
1.07k
  // Load the value out, extending it from f32 to f80.
15534
1.07k
  // FIXME: Avoid the extend by constructing the right constant pool?
15535
1.07k
  SDValue Fudge = DAG.getExtLoad(
15536
1.07k
      ISD::EXTLOAD, dl, MVT::f80, DAG.getEntryNode(), FudgePtr,
15537
1.07k
      MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), MVT::f32,
15538
1.07k
      /* Alignment = */ 4);
15539
1.07k
  // Extend everything to 80 bits to force it to be done on x87.
15540
1.07k
  // TODO: Are there any fast-math-flags to propagate here?
15541
1.07k
  SDValue Add = DAG.getNode(ISD::FADD, dl, MVT::f80, Fild, Fudge);
15542
1.07k
  return DAG.getNode(ISD::FP_ROUND, dl, DstVT, Add,
15543
1.07k
                     DAG.getIntPtrConstant(0, dl));
15544
1.07k
}
15545
15546
// If the given FP_TO_SINT (IsSigned) or FP_TO_UINT (!IsSigned) operation
15547
// is legal, or has an fp128 or f16 source (which needs to be promoted to f32),
15548
// just return an <SDValue(), SDValue()> pair.
15549
// Otherwise it is assumed to be a conversion from one of f32, f64 or f80
15550
// to i16, i32 or i64, and we lower it to a legal sequence.
15551
// If lowered to the final integer result we return a <result, SDValue()> pair.
15552
// Otherwise we lower it to a sequence ending with a FIST, return a
15553
// <FIST, StackSlot> pair, and the caller is responsible for loading
15554
// the final integer result from StackSlot.
15555
std::pair<SDValue,SDValue>
15556
X86TargetLowering::FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG,
15557
1.91k
                                   bool IsSigned, bool IsReplace) const {
15558
1.91k
  SDLoc DL(Op);
15559
1.91k
15560
1.91k
  EVT DstTy = Op.getValueType();
15561
1.91k
  EVT TheVT = Op.getOperand(0).getValueType();
15562
1.91k
  auto PtrVT = getPointerTy(DAG.getDataLayout());
15563
1.91k
15564
1.91k
  if (TheVT != MVT::f32 && TheVT != MVT::f64 && TheVT != MVT::f80) {
15565
22
    // f16 must be promoted before using the lowering in this routine.
15566
22
    // fp128 does not use this lowering.
15567
22
    return std::make_pair(SDValue(), SDValue());
15568
22
  }
15569
1.89k
15570
1.89k
  // If using FIST to compute an unsigned i64, we'll need some fixup
15571
1.89k
  // to handle values above the maximum signed i64.  A FIST is always
15572
1.89k
  // used for the 32-bit subtarget, but also for f80 on a 64-bit target.
15573
1.89k
  bool UnsignedFixup = !IsSigned &&
15574
256
                       DstTy == MVT::i64 &&
15575
196
                       (!Subtarget.is64Bit() ||
15576
196
                        !isScalarFPTypeInSSEReg(TheVT));
15577
1.89k
15578
1.89k
  if (!IsSigned && DstTy != MVT::i64 && !Subtarget.hasAVX512()) {
15579
46
    // Replace the fp-to-uint32 operation with an fp-to-sint64 FIST.
15580
46
    // The low 32 bits of the fist result will have the correct uint32 result.
15581
46
    assert(DstTy == MVT::i32 && "Unexpected FP_TO_UINT");
15582
46
    DstTy = MVT::i64;
15583
46
  }
15584
1.89k
15585
1.89k
  assert(DstTy.getSimpleVT() <= MVT::i64 &&
15586
1.89k
         DstTy.getSimpleVT() >= MVT::i16 &&
15587
1.89k
         "Unknown FP_TO_INT to lower!");
15588
1.89k
15589
1.89k
  // These are really Legal.
15590
1.89k
  if (DstTy == MVT::i32 &&
15591
493
      isScalarFPTypeInSSEReg(Op.getOperand(0).getValueType()))
15592
383
    return std::make_pair(SDValue(), SDValue());
15593
1.50k
  if (Subtarget.is64Bit() &&
15594
1.16k
      DstTy == MVT::i64 &&
15595
1.14k
      isScalarFPTypeInSSEReg(Op.getOperand(0).getValueType()))
15596
1.11k
    return std::make_pair(SDValue(), SDValue());
15597
397
15598
397
  // We lower FP->int64 into FISTP64 followed by a load from a temporary
15599
397
  // stack slot.
15600
397
  MachineFunction &MF = DAG.getMachineFunction();
15601
397
  unsigned MemSize = DstTy.getSizeInBits()/8;
15602
397
  int SSFI = MF.getFrameInfo().CreateStackObject(MemSize, MemSize, false);
15603
397
  SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
15604
397
15605
397
  unsigned Opc;
15606
397
  switch (DstTy.getSimpleVT().SimpleTy) {
15607
0
  default: llvm_unreachable("Invalid FP_TO_SINT to lower!");
15608
2
  case MVT::i16: Opc = X86ISD::FP_TO_INT16_IN_MEM; break;
15609
110
  case MVT::i32: Opc = X86ISD::FP_TO_INT32_IN_MEM; break;
15610
285
  case MVT::i64: Opc = X86ISD::FP_TO_INT64_IN_MEM; break;
15611
397
  }
15612
397
15613
397
  SDValue Chain = DAG.getEntryNode();
15614
397
  SDValue Value = Op.getOperand(0);
15615
397
  SDValue Adjust; // 0x0 or 0x80000000, for result sign bit adjustment.
15616
397
15617
397
  if (UnsignedFixup) {
15618
112
    //
15619
112
    // Conversion to unsigned i64 is implemented with a select,
15620
112
    // depending on whether the source value fits in the range
15621
112
    // of a signed i64.  Let Thresh be the FP equivalent of
15622
112
    // 0x8000000000000000ULL.
15623
112
    //
15624
112
    //  Adjust i32 = (Value < Thresh) ? 0 : 0x80000000;
15625
112
    //  FistSrc    = (Value < Thresh) ? Value : (Value - Thresh);
15626
112
    //  Fist-to-mem64 FistSrc
15627
112
    //  Add 0 or 0x800...0ULL to the 64-bit result, which is equivalent
15628
112
    //  to XOR'ing the high 32 bits with Adjust.
15629
112
    //
15630
112
    // Being a power of 2, Thresh is exactly representable in all FP formats.
15631
112
    // For X87 we'd like to use the smallest FP type for this constant, but
15632
112
    // for DAG type consistency we have to match the FP operand type.
15633
112
15634
112
    APFloat Thresh(APFloat::IEEEsingle(), APInt(32, 0x5f000000));
15635
112
    LLVM_ATTRIBUTE_UNUSED APFloat::opStatus Status = APFloat::opOK;
15636
112
    bool LosesInfo = false;
15637
112
    if (TheVT == MVT::f64)
15638
112
      // The rounding mode is irrelevant as the conversion should be exact.
15639
38
      Status = Thresh.convert(APFloat::IEEEdouble(), APFloat::rmNearestTiesToEven,
15640
38
                              &LosesInfo);
15641
74
    else if (TheVT == MVT::f80)
15642
10
      Status = Thresh.convert(APFloat::x87DoubleExtended(),
15643
10
                              APFloat::rmNearestTiesToEven, &LosesInfo);
15644
112
15645
112
    assert(Status == APFloat::opOK && !LosesInfo &&
15646
112
           "FP conversion should have been exact");
15647
112
15648
112
    SDValue ThreshVal = DAG.getConstantFP(Thresh, DL, TheVT);
15649
112
15650
112
    SDValue Cmp = DAG.getSetCC(DL,
15651
112
                               getSetCCResultType(DAG.getDataLayout(),
15652
112
                                                  *DAG.getContext(), TheVT),
15653
112
                               Value, ThreshVal, ISD::SETLT);
15654
112
    Adjust = DAG.getSelect(DL, MVT::i32, Cmp,
15655
112
                           DAG.getConstant(0, DL, MVT::i32),
15656
112
                           DAG.getConstant(0x80000000, DL, MVT::i32));
15657
112
    SDValue Sub = DAG.getNode(ISD::FSUB, DL, TheVT, Value, ThreshVal);
15658
112
    Cmp = DAG.getSetCC(DL, getSetCCResultType(DAG.getDataLayout(),
15659
112
                                              *DAG.getContext(), TheVT),
15660
112
                       Value, ThreshVal, ISD::SETLT);
15661
112
    Value = DAG.getSelect(DL, TheVT, Cmp, Value, Sub);
15662
112
  }
15663
397
15664
397
  // FIXME This causes a redundant load/store if the SSE-class value is already
15665
397
  // in memory, such as if it is on the callstack.
15666
397
  if (isScalarFPTypeInSSEReg(TheVT)) {
15667
135
    assert(DstTy == MVT::i64 && "Invalid FP_TO_SINT to lower!");
15668
135
    Chain = DAG.getStore(Chain, DL, Value, StackSlot,
15669
135
                         MachinePointerInfo::getFixedStack(MF, SSFI));
15670
135
    SDVTList Tys = DAG.getVTList(Op.getOperand(0).getValueType(), MVT::Other);
15671
135
    SDValue Ops[] = {
15672
135
      Chain, StackSlot, DAG.getValueType(TheVT)
15673
135
    };
15674
135
15675
135
    MachineMemOperand *MMO =
15676
135
        MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(MF, SSFI),
15677
135
                                MachineMemOperand::MOLoad, MemSize, MemSize);
15678
135
    Value = DAG.getMemIntrinsicNode(X86ISD::FLD, DL, Tys, Ops, DstTy, MMO);
15679
135
    Chain = Value.getValue(1);
15680
135
    SSFI = MF.getFrameInfo().CreateStackObject(MemSize, MemSize, false);
15681
135
    StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
15682
135
  }
15683
397
15684
397
  MachineMemOperand *MMO =
15685
397
      MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(MF, SSFI),
15686
397
                              MachineMemOperand::MOStore, MemSize, MemSize);
15687
397
15688
397
  if (UnsignedFixup) {
15689
112
15690
112
    // Insert the FIST, load its result as two i32's,
15691
112
    // and XOR the high i32 with Adjust.
15692
112
15693
112
    SDValue FistOps[] = { Chain, Value, StackSlot };
15694
112
    SDValue FIST = DAG.getMemIntrinsicNode(Opc, DL, DAG.getVTList(MVT::Other),
15695
112
                                           FistOps, DstTy, MMO);
15696
112
15697
112
    SDValue Low32 =
15698
112
        DAG.getLoad(MVT::i32, DL, FIST, StackSlot, MachinePointerInfo());
15699
112
    SDValue HighAddr = DAG.getMemBasePlusOffset(StackSlot, 4, DL);
15700
112
15701
112
    SDValue High32 =
15702
112
        DAG.getLoad(MVT::i32, DL, FIST, HighAddr, MachinePointerInfo());
15703
112
    High32 = DAG.getNode(ISD::XOR, DL, MVT::i32, High32, Adjust);
15704
112
15705
112
    if (Subtarget.is64Bit()) {
15706
2
      // Join High32 and Low32 into a 64-bit result.
15707
2
      // (High32 << 32) | Low32
15708
2
      Low32 = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, Low32);
15709
2
      High32 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64, High32);
15710
2
      High32 = DAG.getNode(ISD::SHL, DL, MVT::i64, High32,
15711
2
                           DAG.getConstant(32, DL, MVT::i8));
15712
2
      SDValue Result = DAG.getNode(ISD::OR, DL, MVT::i64, High32, Low32);
15713
2
      return std::make_pair(Result, SDValue());
15714
2
    }
15715
110
15716
110
    SDValue ResultOps[] = { Low32, High32 };
15717
110
15718
110
    SDValue pair = IsReplace
15719
110
      ? DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, ResultOps)
15720
0
      : DAG.getMergeValues(ResultOps, DL);
15721
112
    return std::make_pair(pair, SDValue());
15722
0
  } else {
15723
285
    // Build the FP_TO_INT*_IN_MEM
15724
285
    SDValue Ops[] = { Chain, Value, StackSlot };
15725
285
    SDValue FIST = DAG.getMemIntrinsicNode(Opc, DL, DAG.getVTList(MVT::Other),
15726
285
                                           Ops, DstTy, MMO);
15727
285
    return std::make_pair(FIST, StackSlot);
15728
285
  }
15729
0
}
15730
15731
static SDValue LowerAVXExtend(SDValue Op, SelectionDAG &DAG,
15732
384
                              const X86Subtarget &Subtarget) {
15733
384
  MVT VT = Op->getSimpleValueType(0);
15734
384
  SDValue In = Op->getOperand(0);
15735
384
  MVT InVT = In.getSimpleValueType();
15736
384
  SDLoc dl(Op);
15737
384
15738
384
  if (VT.is512BitVector() || InVT.getVectorElementType() == MVT::i1)
15739
107
    return DAG.getNode(ISD::ZERO_EXTEND, dl, VT, In);
15740
277
15741
277
  // Optimize vectors in AVX mode:
15742
277
  //
15743
277
  //   v8i16 -> v8i32
15744
277
  //   Use vpunpcklwd for 4 lower elements  v8i16 -> v4i32.
15745
277
  //   Use vpunpckhwd for 4 upper elements  v8i16 -> v4i32.
15746
277
  //   Concat upper and lower parts.
15747
277
  //
15748
277
  //   v4i32 -> v4i64
15749
277
  //   Use vpunpckldq for 4 lower elements  v4i32 -> v2i64.
15750
277
  //   Use vpunpckhdq for 4 upper elements  v4i32 -> v2i64.
15751
277
  //   Concat upper and lower parts.
15752
277
  //
15753
277
15754
277
  if (((VT != MVT::v16i16) || (InVT != MVT::v16i8)) &&
15755
207
      ((VT != MVT::v8i32) || (InVT != MVT::v8i16)) &&
15756
34
      ((VT != MVT::v4i64) || (InVT != MVT::v4i32)))
15757
0
    return SDValue();
15758
277
15759
277
  if (Subtarget.hasInt256())
15760
269
    return DAG.getNode(X86ISD::VZEXT, dl, VT, In);
15761
8
15762
8
  SDValue ZeroVec = getZeroVector(InVT, Subtarget, DAG, dl);
15763
8
  SDValue Undef = DAG.getUNDEF(InVT);
15764
8
  bool NeedZero = Op.getOpcode() == ISD::ZERO_EXTEND;
15765
8
  SDValue OpLo = getUnpackl(DAG, dl, InVT, In, NeedZero ? ZeroVec : Undef);
15766
8
  SDValue OpHi = getUnpackh(DAG, dl, InVT, In, NeedZero ? ZeroVec : Undef);
15767
384
15768
384
  MVT HVT = MVT::getVectorVT(VT.getVectorElementType(),
15769
384
                             VT.getVectorNumElements()/2);
15770
384
15771
384
  OpLo = DAG.getBitcast(HVT, OpLo);
15772
384
  OpHi = DAG.getBitcast(HVT, OpHi);
15773
384
15774
384
  return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi);
15775
384
}
15776
15777
static  SDValue LowerZERO_EXTEND_AVX512(SDValue Op,
15778
395
                  const X86Subtarget &Subtarget, SelectionDAG &DAG) {
15779
395
  MVT VT = Op->getSimpleValueType(0);
15780
395
  SDValue In = Op->getOperand(0);
15781
395
  MVT InVT = In.getSimpleValueType();
15782
395
  SDLoc DL(Op);
15783
395
  unsigned NumElts = VT.getVectorNumElements();
15784
395
15785
395
  if (VT.is512BitVector() && InVT.getVectorElementType() != MVT::i1 &&
15786
319
      (NumElts == 8 || NumElts == 16 || Subtarget.hasBWI()))
15787
319
    return DAG.getNode(X86ISD::VZEXT, DL, VT, In);
15788
76
15789
76
  if (InVT.getVectorElementType() != MVT::i1)
15790
0
    return SDValue();
15791
76
15792
76
  // Extend VT if the target is 256 or 128bit vector and VLX is not supported.
15793
76
  MVT ExtVT = VT;
15794
76
  if (!VT.is512BitVector() && !Subtarget.hasVLX())
15795
12
    ExtVT = MVT::getVectorVT(MVT::getIntegerVT(512/NumElts), NumElts);
15796
76
15797
76
  SDValue One =
15798
76
   DAG.getConstant(APInt(ExtVT.getScalarSizeInBits(), 1), DL, ExtVT);
15799
76
  SDValue Zero =
15800
76
   DAG.getConstant(APInt::getNullValue(ExtVT.getScalarSizeInBits()), DL, ExtVT);
15801
76
15802
76
  SDValue SelectedVal = DAG.getSelect(DL, ExtVT, In, One, Zero);
15803
76
  if (VT == ExtVT)
15804
64
    return SelectedVal;
15805
12
  return DAG.getNode(X86ISD::VTRUNC, DL, VT, SelectedVal);
15806
12
}
15807
15808
static SDValue LowerANY_EXTEND(SDValue Op, const X86Subtarget &Subtarget,
15809
205
                               SelectionDAG &DAG) {
15810
205
  if (Subtarget.hasFp256())
15811
205
    if (SDValue Res = LowerAVXExtend(Op, DAG, Subtarget))
15812
205
      return Res;
15813
0
15814
0
  return SDValue();
15815
0
}
15816
15817
static SDValue LowerZERO_EXTEND(SDValue Op, const X86Subtarget &Subtarget,
15818
574
                                SelectionDAG &DAG) {
15819
574
  SDLoc DL(Op);
15820
574
  MVT VT = Op.getSimpleValueType();
15821
574
  SDValue In = Op.getOperand(0);
15822
574
  MVT SVT = In.getSimpleValueType();
15823
574
15824
574
  if (VT.is512BitVector() || SVT.getVectorElementType() == MVT::i1)
15825
395
    return LowerZERO_EXTEND_AVX512(Op, Subtarget, DAG);
15826
179
15827
179
  if (Subtarget.hasFp256())
15828
179
    if (SDValue Res = LowerAVXExtend(Op, DAG, Subtarget))
15829
179
      return Res;
15830
0
15831
179
  assert(!VT.is256BitVector() || !SVT.is128BitVector() ||
15832
0
         VT.getVectorNumElements() != SVT.getVectorNumElements());
15833
0
  return SDValue();
15834
0
}
15835
15836
/// Helper to recursively truncate vector elements in half with PACKSS.
15837
/// It makes use of the fact that vector comparison results will be all-zeros
15838
/// or all-ones to use (vXi8 PACKSS(vYi16, vYi16)) instead of matching types.
15839
/// AVX2 (Int256) sub-targets require extra shuffling as the PACKSS operates
15840
/// within each 128-bit lane.
15841
static SDValue truncateVectorCompareWithPACKSS(EVT DstVT, SDValue In,
15842
                                               const SDLoc &DL,
15843
                                               SelectionDAG &DAG,
15844
569
                                               const X86Subtarget &Subtarget) {
15845
569
  // Requires SSE2 but AVX512 has fast truncate.
15846
569
  if (!Subtarget.hasSSE2() || Subtarget.hasAVX512())
15847
0
    return SDValue();
15848
569
15849
569
  EVT SrcVT = In.getValueType();
15850
569
15851
569
  // No truncation required, we might get here due to recursive calls.
15852
569
  if (SrcVT == DstVT)
15853
54
    return In;
15854
515
15855
515
  // We only support vector truncation to 128bits or greater from a
15856
515
  // 256bits or greater source.
15857
515
  if ((DstVT.getSizeInBits() % 128) != 0)
15858
0
    return SDValue();
15859
515
  if ((SrcVT.getSizeInBits() % 256) != 0)
15860
0
    return SDValue();
15861
515
15862
515
  unsigned NumElems = SrcVT.getVectorNumElements();
15863
515
  assert(DstVT.getVectorNumElements() == NumElems && "Illegal truncation");
15864
515
  assert(SrcVT.getSizeInBits() > DstVT.getSizeInBits() && "Illegal truncation");
15865
515
15866
515
  EVT PackedSVT =
15867
515
      EVT::getIntegerVT(*DAG.getContext(), SrcVT.getScalarSizeInBits() / 2);
15868
515
15869
515
  // Extract lower/upper subvectors.
15870
515
  unsigned NumSubElts = NumElems / 2;
15871
515
  unsigned SrcSizeInBits = SrcVT.getSizeInBits();
15872
515
  SDValue Lo = extractSubVector(In, 0 * NumSubElts, DAG, DL, SrcSizeInBits / 2);
15873
515
  SDValue Hi = extractSubVector(In, 1 * NumSubElts, DAG, DL, SrcSizeInBits / 2);
15874
515
15875
515
  // 256bit -> 128bit truncate - PACKSS lower/upper 128-bit subvectors.
15876
515
  if (SrcVT.is256BitVector()) {
15877
364
    Lo = DAG.getBitcast(MVT::v8i16, Lo);
15878
364
    Hi = DAG.getBitcast(MVT::v8i16, Hi);
15879
364
    SDValue Res = DAG.getNode(X86ISD::PACKSS, DL, MVT::v16i8, Lo, Hi);
15880
364
    return DAG.getBitcast(DstVT, Res);
15881
364
  }
15882
151
15883
151
  // AVX2: 512bit -> 256bit truncate - PACKSS lower/upper 256-bit subvectors.
15884
151
  // AVX2: 512bit -> 128bit truncate - PACKSS(PACKSS, PACKSS).
15885
151
  if (SrcVT.is512BitVector() && Subtarget.hasInt256()) {
15886
41
    Lo = DAG.getBitcast(MVT::v16i16, Lo);
15887
41
    Hi = DAG.getBitcast(MVT::v16i16, Hi);
15888
41
    SDValue Res = DAG.getNode(X86ISD::PACKSS, DL, MVT::v32i8, Lo, Hi);
15889
41
15890
41
    // 256-bit PACKSS(ARG0, ARG1) leaves us with ((LO0,LO1),(HI0,HI1)),
15891
41
    // so we need to shuffle to get ((LO0,HI0),(LO1,HI1)).
15892
41
    Res = DAG.getBitcast(MVT::v4i64, Res);
15893
41
    Res = DAG.getVectorShuffle(MVT::v4i64, DL, Res, Res, {0, 2, 1, 3});
15894
41
15895
41
    if (DstVT.is256BitVector())
15896
29
      return DAG.getBitcast(DstVT, Res);
15897
12
15898
12
    // If 512bit -> 128bit truncate another stage.
15899
12
    EVT PackedVT = EVT::getVectorVT(*DAG.getContext(), PackedSVT, NumElems);
15900
12
    Res = DAG.getBitcast(PackedVT, Res);
15901
12
    return truncateVectorCompareWithPACKSS(DstVT, Res, DL, DAG, Subtarget);
15902
12
  }
15903
110
15904
110
  // Recursively pack lower/upper subvectors, concat result and pack again.
15905
151
  assert(SrcVT.getSizeInBits() >= 512 && "Expected 512-bit vector or greater");
15906
110
  EVT PackedVT = EVT::getVectorVT(*DAG.getContext(), PackedSVT, NumElems / 2);
15907
110
  Lo = truncateVectorCompareWithPACKSS(PackedVT, Lo, DL, DAG, Subtarget);
15908
110
  Hi = truncateVectorCompareWithPACKSS(PackedVT, Hi, DL, DAG, Subtarget);
15909
110
15910
110
  PackedVT = EVT::getVectorVT(*DAG.getContext(), PackedSVT, NumElems);
15911
110
  SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, DL, PackedVT, Lo, Hi);
15912
110
  return truncateVectorCompareWithPACKSS(DstVT, Res, DL, DAG, Subtarget);
15913
110
}
15914
15915
static SDValue LowerTruncateVecI1(SDValue Op, SelectionDAG &DAG,
15916
2.04k
                                  const X86Subtarget &Subtarget) {
15917
2.04k
15918
2.04k
  SDLoc DL(Op);
15919
2.04k
  MVT VT = Op.getSimpleValueType();
15920
2.04k
  SDValue In = Op.getOperand(0);
15921
2.04k
  MVT InVT = In.getSimpleValueType();
15922
2.04k
15923
2.04k
  assert(VT.getVectorElementType() == MVT::i1 && "Unexpected vector type.");
15924
2.04k
15925
2.04k
  // Shift LSB to MSB and use VPMOVB/W2M or TESTD/Q.
15926
2.04k
  unsigned ShiftInx = InVT.getScalarSizeInBits() - 1;
15927
2.04k
  if (InVT.getScalarSizeInBits() <= 16) {
15928
1.43k
    if (Subtarget.hasBWI()) {
15929
128
      // legal, will go to VPMOVB2M, VPMOVW2M
15930
128
      // Shift packed bytes not supported natively, bitcast to word
15931
128
      MVT ExtVT = MVT::getVectorVT(MVT::i16, InVT.getSizeInBits()/16);
15932
128
      SDValue  ShiftNode = DAG.getNode(ISD::SHL, DL, ExtVT,
15933
128
                                       DAG.getBitcast(ExtVT, In),
15934
128
                                       DAG.getConstant(ShiftInx, DL, ExtVT));
15935
128
      ShiftNode = DAG.getBitcast(InVT, ShiftNode);
15936
128
      return DAG.getNode(X86ISD::CVT2MASK, DL, VT, ShiftNode);
15937
128
    }
15938
1.30k
    // Use TESTD/Q, extended vector to packed dword/qword.
15939
1.43k
    assert((InVT.is256BitVector() || InVT.is128BitVector()) &&
15940
1.30k
           "Unexpected vector type.");
15941
1.30k
    unsigned NumElts = InVT.getVectorNumElements();
15942
1.30k
    MVT ExtVT = MVT::getVectorVT(MVT::getIntegerVT(512/NumElts), NumElts);
15943
1.30k
    In = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, In);
15944
1.30k
    InVT = ExtVT;
15945
1.30k
    ShiftInx = InVT.getScalarSizeInBits() - 1;
15946
1.30k
  }
15947
2.04k
15948
1.91k
  SDValue  ShiftNode = DAG.getNode(ISD::SHL, DL, InVT, In,
15949
1.91k
                                   DAG.getConstant(ShiftInx, DL, InVT));
15950
1.91k
  return DAG.getNode(X86ISD::TESTM, DL, VT, ShiftNode, ShiftNode);
15951
2.04k
}
15952
15953
3.49k
SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const {
15954
3.49k
  SDLoc DL(Op);
15955
3.49k
  MVT VT = Op.getSimpleValueType();
15956
3.49k
  SDValue In = Op.getOperand(0);
15957
3.49k
  MVT InVT = In.getSimpleValueType();
15958
3.49k
15959
3.49k
  if (VT == MVT::i1) {
15960
0
    assert((InVT.isInteger() && (InVT.getSizeInBits() <= 64)) &&
15961
0
           "Invalid scalar TRUNCATE operation");
15962
0
    if (InVT.getSizeInBits() >= 32)
15963
0
      return SDValue();
15964
0
    In = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, In);
15965
0
    return DAG.getNode(ISD::TRUNCATE, DL, VT, In);
15966
0
  }
15967
3.49k
  assert(VT.getVectorNumElements() == InVT.getVectorNumElements() &&
15968
3.49k
         "Invalid TRUNCATE operation");
15969
3.49k
15970
3.49k
  if (VT.getVectorElementType() == MVT::i1)
15971
2.04k
    return LowerTruncateVecI1(Op, DAG, Subtarget);
15972
1.45k
15973
1.45k
  // vpmovqb/w/d, vpmovdb/w, vpmovwb
15974
1.45k
  if (Subtarget.hasAVX512()) {
15975
954
    // word to byte only under BWI
15976
954
    if (InVT == MVT::v16i16 && !Subtarget.hasBWI()) // v16i16 -> v16i8
15977
89
      return DAG.getNode(X86ISD::VTRUNC, DL, VT,
15978
89
                         getExtendInVec(X86ISD::VSEXT, DL, MVT::v16i32, In, DAG));
15979
865
    return DAG.getNode(X86ISD::VTRUNC, DL, VT, In);
15980
865
  }
15981
500
15982
500
  // Truncate with PACKSS if we are truncating a vector zero/all-bits result.
15983
500
  if (InVT.getScalarSizeInBits() == DAG.ComputeNumSignBits(In))
15984
1
    if (SDValue V = truncateVectorCompareWithPACKSS(VT, In, DL, DAG, Subtarget))
15985
1
      return V;
15986
499
15987
499
  if ((VT == MVT::v4i32) && (InVT == MVT::v4i64)) {
15988
167
    // On AVX2, v4i64 -> v4i32 becomes VPERMD.
15989
167
    if (Subtarget.hasInt256()) {
15990
133
      static const int ShufMask[] = {0, 2, 4, 6, -1, -1, -1, -1};
15991
133
      In = DAG.getBitcast(MVT::v8i32, In);
15992
133
      In = DAG.getVectorShuffle(MVT::v8i32, DL, In, In, ShufMask);
15993
133
      return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, In,
15994
133
                         DAG.getIntPtrConstant(0, DL));
15995
133
    }
15996
34
15997
34
    SDValue OpLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
15998
34
                               DAG.getIntPtrConstant(0, DL));
15999
34
    SDValue OpHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
16000
34
                               DAG.getIntPtrConstant(2, DL));
16001
34
    OpLo = DAG.getBitcast(MVT::v4i32, OpLo);
16002
34
    OpHi = DAG.getBitcast(MVT::v4i32, OpHi);
16003
34
    static const int ShufMask[] = {0, 2, 4, 6};
16004
34
    return DAG.getVectorShuffle(VT, DL, OpLo, OpHi, ShufMask);
16005
34
  }
16006
332
16007
332
  if ((VT == MVT::v8i16) && (InVT == MVT::v8i32)) {
16008
212
    // On AVX2, v8i32 -> v8i16 becomes PSHUFB.
16009
212
    if (Subtarget.hasInt256()) {
16010
176
      In = DAG.getBitcast(MVT::v32i8, In);
16011
176
16012
176
      // The PSHUFB mask:
16013
176
      static const int ShufMask1[] = { 0,  1,  4,  5,  8,  9, 12, 13,
16014
176
                                      -1, -1, -1, -1, -1, -1, -1, -1,
16015
176
                                      16, 17, 20, 21, 24, 25, 28, 29,
16016
176
                                      -1, -1, -1, -1, -1, -1, -1, -1 };
16017
176
      In = DAG.getVectorShuffle(MVT::v32i8, DL, In, In, ShufMask1);
16018
176
      In = DAG.getBitcast(MVT::v4i64, In);
16019
176
16020
176
      static const int ShufMask2[] = {0,  2,  -1,  -1};
16021
176
      In = DAG.getVectorShuffle(MVT::v4i64, DL,  In, In, ShufMask2);
16022
176
      In = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
16023
176
                       DAG.getIntPtrConstant(0, DL));
16024
176
      return DAG.getBitcast(VT, In);
16025
176
    }
16026
36
16027
36
    SDValue OpLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i32, In,
16028
36
                               DAG.getIntPtrConstant(0, DL));
16029
36
16030
36
    SDValue OpHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i32, In,
16031
36
                               DAG.getIntPtrConstant(4, DL));
16032
36
16033
36
    OpLo = DAG.getBitcast(MVT::v16i8, OpLo);
16034
36
    OpHi = DAG.getBitcast(MVT::v16i8, OpHi);
16035
36
16036
36
    // The PSHUFB mask:
16037
36
    static const int ShufMask1[] = {0,  1,  4,  5,  8,  9, 12, 13,
16038
36
                                   -1, -1, -1, -1, -1, -1, -1, -1};
16039
36
16040
36
    OpLo = DAG.getVectorShuffle(MVT::v16i8, DL, OpLo, OpLo, ShufMask1);
16041
36
    OpHi = DAG.getVectorShuffle(MVT::v16i8, DL, OpHi, OpHi, ShufMask1);
16042
36
16043
36
    OpLo = DAG.getBitcast(MVT::v4i32, OpLo);
16044
36
    OpHi = DAG.getBitcast(MVT::v4i32, OpHi);
16045
36
16046
36
    // The MOVLHPS Mask:
16047
36
    static const int ShufMask2[] = {0, 1, 4, 5};
16048
36
    SDValue res = DAG.getVectorShuffle(MVT::v4i32, DL, OpLo, OpHi, ShufMask2);
16049
36
    return DAG.getBitcast(MVT::v8i16, res);
16050
36
  }
16051
120
16052
120
  // Handle truncation of V256 to V128 using shuffles.
16053
120
  if (!VT.is128BitVector() || !InVT.is256BitVector())
16054
0
    return SDValue();
16055
120
16056
120
  assert(Subtarget.hasFp256() && "256-bit vector without AVX!");
16057
120
16058
120
  unsigned NumElems = VT.getVectorNumElements();
16059
120
  MVT NVT = MVT::getVectorVT(VT.getVectorElementType(), NumElems * 2);
16060
120
16061
120
  SmallVector<int, 16> MaskVec(NumElems * 2, -1);
16062
120
  // Prepare truncation shuffle mask
16063
2.04k
  for (unsigned i = 0; i != NumElems; ++i)
16064
1.92k
    MaskVec[i] = i * 2;
16065
3.49k
  In = DAG.getBitcast(NVT, In);
16066
3.49k
  SDValue V = DAG.getVectorShuffle(NVT, DL, In, In, MaskVec);
16067
3.49k
  return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, V,
16068
3.49k
                     DAG.getIntPtrConstant(0, DL));
16069
3.49k
}
16070
16071
1.68k
SDValue X86TargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const {
16072
1.68k
  bool IsSigned = Op.getOpcode() == ISD::FP_TO_SINT;
16073
1.68k
  MVT VT = Op.getSimpleValueType();
16074
1.68k
16075
1.68k
  if (VT.isVector()) {
16076
2
    assert(Subtarget.hasDQI() && Subtarget.hasVLX() && "Requires AVX512DQVL!");
16077
2
    SDValue Src = Op.getOperand(0);
16078
2
    SDLoc dl(Op);
16079
2
    if (VT == MVT::v2i64 && Src.getSimpleValueType() == MVT::v2f32) {
16080
2
      return DAG.getNode(IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI, dl, VT,
16081
2
                         DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src,
16082
2
                                     DAG.getUNDEF(MVT::v2f32)));
16083
2
    }
16084
0
16085
0
    return SDValue();
16086
0
  }
16087
1.68k
16088
1.68k
  assert(!VT.isVector());
16089
1.68k
16090
1.68k
  std::pair<SDValue,SDValue> Vals = FP_TO_INTHelper(Op, DAG,
16091
1.68k
    IsSigned, /*IsReplace=*/ false);
16092
1.68k
  SDValue FIST = Vals.first, StackSlot = Vals.second;
16093
1.68k
  // If FP_TO_INTHelper failed, the node is actually supposed to be Legal.
16094
1.68k
  if (!FIST.getNode())
16095
1.49k
    return Op;
16096
189
16097
189
  if (StackSlot.getNode())
16098
189
    // Load the result.
16099
187
    return DAG.getLoad(VT, SDLoc(Op), FIST, StackSlot, MachinePointerInfo());
16100
2
16101
2
  // The node is the result.
16102
2
  return FIST;
16103
2
}
16104
16105
45
static SDValue LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) {
16106
45
  SDLoc DL(Op);
16107
45
  MVT VT = Op.getSimpleValueType();
16108
45
  SDValue In = Op.getOperand(0);
16109
45
  MVT SVT = In.getSimpleValueType();
16110
45
16111
45
  assert(SVT == MVT::v2f32 && "Only customize MVT::v2f32 type legalization!");
16112
45
16113
45
  return DAG.getNode(X86ISD::VFPEXT, DL, VT,
16114
45
                     DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4f32,
16115
45
                                 In, DAG.getUNDEF(SVT)));
16116
45
}
16117
16118
/// The only differences between FABS and FNEG are the mask and the logic op.
16119
/// FNEG also has a folding opportunity for FNEG(FABS(x)).
16120
529
static SDValue LowerFABSorFNEG(SDValue Op, SelectionDAG &DAG) {
16121
529
  assert((Op.getOpcode() == ISD::FABS || Op.getOpcode() == ISD::FNEG) &&
16122
529
         "Wrong opcode for lowering FABS or FNEG.");
16123
529
16124
529
  bool IsFABS = (Op.getOpcode() == ISD::FABS);
16125
529
16126
529
  // If this is a FABS and it has an FNEG user, bail out to fold the combination
16127
529
  // into an FNABS. We'll lower the FABS after that if it is still in use.
16128
529
  if (IsFABS)
16129
367
    for (SDNode *User : Op->uses())
16130
535
      if (User->getOpcode() == ISD::FNEG)
16131
6
        return Op;
16132
523
16133
523
  SDLoc dl(Op);
16134
523
  MVT VT = Op.getSimpleValueType();
16135
523
16136
523
  bool IsF128 = (VT == MVT::f128);
16137
523
16138
523
  // FIXME: Use function attribute "OptimizeForSize" and/or CodeGenOpt::Level to
16139
523
  // decide if we should generate a 16-byte constant mask when we only need 4 or
16140
523
  // 8 bytes for the scalar case.
16141
523
16142
523
  MVT LogicVT;
16143
523
  MVT EltVT;
16144
523
16145
523
  if (VT.isVector()) {
16146
153
    LogicVT = VT;
16147
153
    EltVT = VT.getVectorElementType();
16148
523
  } else if (IsF128) {
16149
6
    // SSE instructions are used for optimized f128 logical operations.
16150
6
    LogicVT = MVT::f128;
16151
6
    EltVT = VT;
16152
370
  } else {
16153
364
    // There are no scalar bitwise logical SSE/AVX instructions, so we
16154
364
    // generate a 16-byte vector constant and logic op even for the scalar case.
16155
364
    // Using a 16-byte mask allows folding the load of the mask with
16156
364
    // the logic op, so it can save (~4 bytes) on code size.
16157
364
    LogicVT = (VT == MVT::f64) ? MVT::v2f64 : MVT::v4f32;
16158
370
    EltVT = VT;
16159
370
  }
16160
523
16161
523
  unsigned EltBits = EltVT.getSizeInBits();
16162
523
  // For FABS, mask is 0x7f...; for FNEG, mask is 0x80...
16163
523
  APInt MaskElt =
16164
523
    IsFABS ? APInt::getSignedMaxValue(EltBits) : APInt::getSignMask(EltBits);
16165
523
  const fltSemantics &Sem =
16166
238
      EltVT == MVT::f64 ? APFloat::IEEEdouble() :
16167
285
          (IsF128 ? APFloat::IEEEquad() : APFloat::IEEEsingle());
16168
523
  SDValue Mask = DAG.getConstantFP(APFloat(Sem, MaskElt), dl, LogicVT);
16169
523
16170
523
  SDValue Op0 = Op.getOperand(0);
16171
162
  bool IsFNABS = !IsFABS && (Op0.getOpcode() == ISD::FABS);
16172
523
  unsigned LogicOp =
16173
523
    IsFABS ? X86ISD::FAND : IsFNABS ? X86ISD::FOR : X86ISD::FXOR;
16174
523
  SDValue Operand = IsFNABS ? Op0.getOperand(0) : Op0;
16175
523
16176
523
  if (VT.isVector() || IsF128)
16177
159
    return DAG.getNode(LogicOp, dl, LogicVT, Operand, Mask);
16178
364
16179
364
  // For the scalar case extend to a 128-bit vector, perform the logic op,
16180
364
  // and extract the scalar result back out.
16181
364
  Operand = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Operand);
16182
364
  SDValue LogicNode = DAG.getNode(LogicOp, dl, LogicVT, Operand, Mask);
16183
364
  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, LogicNode,
16184
364
                     DAG.getIntPtrConstant(0, dl));
16185
364
}
16186
16187
403
static SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) {
16188
403
  SDValue Mag = Op.getOperand(0);
16189
403
  SDValue Sign = Op.getOperand(1);
16190
403
  SDLoc dl(Op);
16191
403
16192
403
  // If the sign operand is smaller, extend it first.
16193
403
  MVT VT = Op.getSimpleValueType();
16194
403
  if (Sign.getSimpleValueType().bitsLT(VT))
16195
7
    Sign = DAG.getNode(ISD::FP_EXTEND, dl, VT, Sign);
16196
403
16197
403
  // And if it is bigger, shrink it first.
16198
403
  if (Sign.getSimpleValueType().bitsGT(VT))
16199
5
    Sign = DAG.getNode(ISD::FP_ROUND, dl, VT, Sign, DAG.getIntPtrConstant(1, dl));
16200
403
16201
403
  // At this point the operands and the result should have the same
16202
403
  // type, and that won't be f80 since that is not custom lowered.
16203
403
  bool IsF128 = (VT == MVT::f128);
16204
403
  assert((VT == MVT::f64 || VT == MVT::f32 || VT == MVT::f128 ||
16205
403
          VT == MVT::v2f64 || VT == MVT::v4f64 || VT == MVT::v4f32 ||
16206
403
          VT == MVT::v8f32 || VT == MVT::v8f64 || VT == MVT::v16f32) &&
16207
403
         "Unexpected type in LowerFCOPYSIGN");
16208
403
16209
403
  MVT EltVT = VT.getScalarType();
16210
403
  const fltSemantics &Sem =
16211
185
      EltVT == MVT::f64 ? APFloat::IEEEdouble()
16212
218
                        : (IsF128 ? APFloat::IEEEquad() : APFloat::IEEEsingle());
16213
403
16214
403
  // Perform all scalar logic operations as 16-byte vectors because there are no
16215
403
  // scalar FP logic instructions in SSE.
16216
403
  // TODO: This isn't necessary. If we used scalar types, we might avoid some
16217
403
  // unnecessary splats, but we might miss load folding opportunities. Should
16218
403
  // this decision be based on OptimizeForSize?
16219
346
  bool IsFakeVector = !VT.isVector() && !IsF128;
16220
403
  MVT LogicVT = VT;
16221
403
  if (IsFakeVector)
16222
343
    LogicVT = (VT == MVT::f64) ? MVT::v2f64 : MVT::v4f32;
16223
403
16224
403
  // The mask constants are automatically splatted for vector types.
16225
403
  unsigned EltSizeInBits = VT.getScalarSizeInBits();
16226
403
  SDValue SignMask = DAG.getConstantFP(
16227
403
      APFloat(Sem, APInt::getSignMask(EltSizeInBits)), dl, LogicVT);
16228
403
  SDValue MagMask = DAG.getConstantFP(
16229
403
      APFloat(Sem, ~APInt::getSignMask(EltSizeInBits)), dl, LogicVT);
16230
403
16231
403
  // First, clear all bits but the sign bit from the second operand (sign).
16232
403
  if (IsFakeVector)
16233
343
    Sign = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Sign);
16234
403
  SDValue SignBit = DAG.getNode(X86ISD::FAND, dl, LogicVT, Sign, SignMask);
16235
403
16236
403
  // Next, clear the sign bit from the first operand (magnitude).
16237
403
  // TODO: If we had general constant folding for FP logic ops, this check
16238
403
  // wouldn't be necessary.
16239
403
  SDValue MagBits;
16240
403
  if (ConstantFPSDNode *Op0CN = dyn_cast<ConstantFPSDNode>(Mag)) {
16241
177
    APFloat APF = Op0CN->getValueAPF();
16242
177
    APF.clearSign();
16243
177
    MagBits = DAG.getConstantFP(APF, dl, LogicVT);
16244
403
  } else {
16245
226
    // If the magnitude operand wasn't a constant, we need to AND out the sign.
16246
226
    if (IsFakeVector)
16247
166
      Mag = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Mag);
16248
226
    MagBits = DAG.getNode(X86ISD::FAND, dl, LogicVT, Mag, MagMask);
16249
226
  }
16250
403
16251
403
  // OR the magnitude value with the sign bit.
16252
403
  SDValue Or = DAG.getNode(X86ISD::FOR, dl, LogicVT, MagBits, SignBit);
16253
403
  return !IsFakeVector ? Or : DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Or,
16254
343
                                          DAG.getIntPtrConstant(0, dl));
16255
403
}
16256
16257
5
static SDValue LowerFGETSIGN(SDValue Op, SelectionDAG &DAG) {
16258
5
  SDValue N0 = Op.getOperand(0);
16259
5
  SDLoc dl(Op);
16260
5
  MVT VT = Op.getSimpleValueType();
16261
5
16262
5
  MVT OpVT = N0.getSimpleValueType();
16263
5
  assert((OpVT == MVT::f32 || OpVT == MVT::f64) &&
16264
5
         "Unexpected type for FGETSIGN");
16265
5
16266
5
  // Lower ISD::FGETSIGN to (AND (X86ISD::MOVMSK ...) 1).
16267
5
  MVT VecVT = (OpVT == MVT::f32 ? MVT::v4f32 : MVT::v2f64);
16268
5
  SDValue Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, N0);
16269
5
  Res = DAG.getNode(X86ISD::MOVMSK, dl, MVT::i32, Res);
16270
5
  Res = DAG.getZExtOrTrunc(Res, dl, VT);
16271
5
  Res = DAG.getNode(ISD::AND, dl, VT, Res, DAG.getConstant(1, dl, VT));
16272
5
  return Res;
16273
5
}
16274
16275
// Check whether an OR'd tree is PTEST-able.
16276
static SDValue LowerVectorAllZeroTest(SDValue Op, const X86Subtarget &Subtarget,
16277
629
                                      SelectionDAG &DAG) {
16278
629
  assert(Op.getOpcode() == ISD::OR && "Only check OR'd tree.");
16279
629
16280
629
  if (!Subtarget.hasSSE41())
16281
542
    return SDValue();
16282
87
16283
87
  if (!Op->hasOneUse())
16284
0
    return SDValue();
16285
87
16286
87
  SDNode *N = Op.getNode();
16287
87
  SDLoc DL(N);
16288
87
16289
87
  SmallVector<SDValue, 8> Opnds;
16290
87
  DenseMap<SDValue, unsigned> VecInMap;
16291
87
  SmallVector<SDValue, 8> VecIns;
16292
87
  EVT VT = MVT::Other;
16293
87
16294
87
  // Recognize a special case where a vector is casted into wide integer to
16295
87
  // test all 0s.
16296
87
  Opnds.push_back(N->getOperand(0));
16297
87
  Opnds.push_back(N->getOperand(1));
16298
87
16299
224
  for (unsigned Slot = 0, e = Opnds.size(); Slot < e; ++Slot) {
16300
206
    SmallVectorImpl<SDValue>::const_iterator I = Opnds.begin() + Slot;
16301
206
    // BFS traverse all OR'd operands.
16302
206
    if (I->getOpcode() == ISD::OR) {
16303
53
      Opnds.push_back(I->getOperand(0));
16304
53
      Opnds.push_back(I->getOperand(1));
16305
53
      // Re-evaluate the number of nodes to be traversed.
16306
53
      e += 2; // 2 more nodes (LHS and RHS) are pushed.
16307
53
      continue;
16308
53
    }
16309
153
16310
153
    // Quit if a non-EXTRACT_VECTOR_ELT
16311
153
    if (I->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
16312
69
      return SDValue();
16313
84
16314
84
    // Quit if without a constant index.
16315
84
    SDValue Idx = I->getOperand(1);
16316
84
    if (!isa<ConstantSDNode>(Idx))
16317
0
      return SDValue();
16318
84
16319
84
    SDValue ExtractedFromVec = I->getOperand(0);
16320
84
    DenseMap<SDValue, unsigned>::iterator M = VecInMap.find(ExtractedFromVec);
16321
84
    if (M == VecInMap.end()) {
16322
33
      VT = ExtractedFromVec.getValueType();
16323
33
      // Quit if not 128/256-bit vector.
16324
33
      if (!VT.is128BitVector() && !VT.is256BitVector())
16325
0
        return SDValue();
16326
33
      // Quit if not the same type.
16327
33
      if (VecInMap.begin() != VecInMap.end() &&
16328
15
          VT != VecInMap.begin()->first.getValueType())
16329
0
        return SDValue();
16330
33
      M = VecInMap.insert(std::make_pair(ExtractedFromVec, 0)).first;
16331
33
      VecIns.push_back(ExtractedFromVec);
16332
33
    }
16333
84
    M->second |= 1U << cast<ConstantSDNode>(Idx)->getZExtValue();
16334
84
  }
16335
87
16336
18
  assert((VT.is128BitVector() || VT.is256BitVector()) &&
16337
18
         "Not extracted from 128-/256-bit vector.");
16338
18
16339
18
  unsigned FullMask = (1U << VT.getVectorNumElements()) - 1U;
16340
18
16341
18
  for (DenseMap<SDValue, unsigned>::const_iterator
16342
51
        I = VecInMap.begin(), E = VecInMap.end(); I != E; ++I) {
16343
33
    // Quit if not all elements are used.
16344
33
    if (I->second != FullMask)
16345
0
      return SDValue();
16346
33
  }
16347
18
16348
18
  
  MVT TestVT = VT.is128BitVector() ? MVT::v2i64 : MVT::v4i64;
18
16350
18
  // Cast all vectors into TestVT for PTEST.
16351
51
  for (unsigned i = 0, e = VecIns.size(); i < e; ++i)
16352
33
    VecIns[i] = DAG.getBitcast(TestVT, VecIns[i]);
16353
18
16354
18
  // If more than one full vector is evaluated, OR them first before PTEST.
16355
33
  for (unsigned Slot = 0, e = VecIns.size(); e - Slot > 1; Slot += 2, e += 1) {
16356
15
    // Each iteration will OR 2 nodes and append the result until there is only
16357
15
    // 1 node left, i.e. the final OR'd value of all vectors.
16358
15
    SDValue LHS = VecIns[Slot];
16359
15
    SDValue RHS = VecIns[Slot + 1];
16360
15
    VecIns.push_back(DAG.getNode(ISD::OR, DL, TestVT, LHS, RHS));
16361
15
  }
16362
18
16363
18
  return DAG.getNode(X86ISD::PTEST, DL, MVT::i32, VecIns.back(), VecIns.back());
16364
629
}
16365
16366
/// \brief return true if \c Op has a use that doesn't just read flags.
16367
4.19k
static bool hasNonFlagsUse(SDValue Op) {
16368
8.37k
  for (SDNode::use_iterator UI = Op->use_begin(), UE = Op->use_end(); UI != UE;
16369
4.19k
       ++UI) {
16370
4.61k
    SDNode *User = *UI;
16371
4.61k
    unsigned UOpNo = UI.getOperandNo();
16372
4.61k
    if (User->getOpcode() == ISD::TRUNCATE && User->hasOneUse()) {
16373
78
      // Look pass truncate.
16374
78
      UOpNo = User->use_begin().getOperandNo();
16375
78
      User = *User->use_begin();
16376
78
    }
16377
4.61k
16378
4.61k
    if (User->getOpcode() != ISD::BRCOND && User->getOpcode() != ISD::SETCC &&
16379
724
        !(User->getOpcode() == ISD::SELECT && UOpNo == 0))
16380
428
      return true;
16381
4.61k
  }
16382
3.76k
  return false;
16383
4.19k
}
16384
16385
// Emit KTEST instruction for bit vectors on AVX-512
16386
static SDValue EmitKTEST(SDValue Op, SelectionDAG &DAG,
16387
18.7k
                         const X86Subtarget &Subtarget) {
16388
18.7k
  if (Op.getOpcode() == ISD::BITCAST) {
16389
15
    auto hasKTEST = [&](MVT VT) {
16390
15
      unsigned SizeInBits = VT.getSizeInBits();
16391
7
      return (Subtarget.hasDQI() && 
(SizeInBits == 8 || 7
SizeInBits == 163
)) ||
16392
9
        
(Subtarget.hasBWI() && 9
(SizeInBits == 32 || 5
SizeInBits == 643
));
16393
15
    };
16394
40
    SDValue Op0 = Op.getOperand(0);
16395
40
    MVT Op0VT = Op0.getValueType().getSimpleVT();
16396
40
    if (Op0VT.isVector() && Op0VT.getVectorElementType() == MVT::i1 &&
16397
15
        hasKTEST(Op0VT))
16398
8
      return DAG.getNode(X86ISD::KTEST, SDLoc(Op), Op0VT, Op0, Op0);
16399
18.7k
  }
16400
18.7k
  return SDValue();
16401
18.7k
}
16402
16403
/// Emit nodes that will be selected as "test Op0,Op0", or something
16404
/// equivalent.
16405
SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC, const SDLoc &dl,
16406
21.2k
                                    SelectionDAG &DAG) const {
16407
21.2k
  if (Op.getValueType() == MVT::i1) {
16408
0
    SDValue ExtOp = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i8, Op);
16409
0
    return DAG.getNode(X86ISD::CMP, dl, MVT::i32, ExtOp,
16410
0
                       DAG.getConstant(0, dl, MVT::i8));
16411
0
  }
16412
21.2k
  // CF and OF aren't always set the way we want. Determine which
16413
21.2k
  // of these we need.
16414
21.2k
  bool NeedCF = false;
16415
21.2k
  bool NeedOF = false;
16416
21.2k
  switch (X86CC) {
16417
20.6k
  default: break;
16418
0
  case X86::COND_A: case X86::COND_AE:
16419
0
  case X86::COND_B: case X86::COND_BE:
16420
0
    NeedCF = true;
16421
0
    break;
16422
607
  case X86::COND_G: case X86::COND_GE:
16423
607
  case X86::COND_L: case X86::COND_LE:
16424
607
  case X86::COND_O: case X86::COND_NO: {
16425
607
    // Check if we really need to set the
16426
607
    // Overflow flag. If NoSignedWrap is present
16427
607
    // that is not actually needed.
16428
607
    switch (Op->getOpcode()) {
16429
54
    case ISD::ADD:
16430
54
    case ISD::SUB:
16431
54
    case ISD::MUL:
16432
54
    case ISD::SHL:
16433
54
      if (Op.getNode()->getFlags().hasNoSignedWrap())
16434
10
        break;
16435
44
      LLVM_FALLTHROUGH;
16436
597
    default:
16437
597
      NeedOF = true;
16438
597
      break;
16439
607
    }
16440
607
    break;
16441
607
  }
16442
21.2k
  }
16443
21.2k
  // See if we can use the EFLAGS value from the operand instead of
16444
21.2k
  // doing a separate TEST. TEST always sets OF and CF to 0, so unless
16445
21.2k
  // we prove that the arithmetic won't overflow, we can't use OF or CF.
16446
21.2k
  if (Op.getResNo() != 0 || NeedOF || NeedCF) {
16447
609
    // Emit KTEST for bit vectors
16448
609
    if (auto Node = EmitKTEST(Op, DAG, Subtarget))
16449
0
      return Node;
16450
609
    // Emit a CMP with 0, which is the TEST pattern.
16451
609
    return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
16452
609
                       DAG.getConstant(0, dl, Op.getValueType()));
16453
609
  }
16454
20.6k
  unsigned Opcode = 0;
16455
20.6k
  unsigned NumOperands = 0;
16456
20.6k
16457
20.6k
  // Truncate operations may prevent the merge of the SETCC instruction
16458
20.6k
  // and the arithmetic instruction before it. Attempt to truncate the operands
16459
20.6k
  // of the arithmetic instruction and use a reduced bit-width instruction.
16460
20.6k
  bool NeedTruncation = false;
16461
20.6k
  SDValue ArithOp = Op;
16462
20.6k
  if (Op->getOpcode() == ISD::TRUNCATE && Op->hasOneUse()) {
16463
103
    SDValue Arith = Op->getOperand(0);
16464
103
    // Both the trunc and the arithmetic op need to have one user each.
16465
103
    if (Arith->hasOneUse())
16466
60
      switch (Arith.getOpcode()) {
16467
52
        default: break;
16468
8
        case ISD::ADD:
16469
8
        case ISD::SUB:
16470
8
        case ISD::AND:
16471
8
        case ISD::OR:
16472
8
        case ISD::XOR: {
16473
8
          NeedTruncation = true;
16474
8
          ArithOp = Arith;
16475
8
        }
16476
60
      }
16477
103
  }
16478
20.6k
16479
20.6k
  // Sometimes flags can be set either with an AND or with an SRL/SHL
16480
20.6k
  // instruction. SRL/SHL variant should be preferred for masks longer than this
16481
20.6k
  // number of bits.
16482
20.6k
  const int ShiftToAndMaxMaskWidth = 32;
16483
10.4k
  const bool ZeroCheck = (X86CC == X86::COND_E || X86CC == X86::COND_NE);
16484
20.6k
16485
20.6k
  // NOTICE: In the code below we use ArithOp to hold the arithmetic operation
16486
20.6k
  // which may be the result of a CAST.  We use the variable 'Op', which is the
16487
20.6k
  // non-casted variable when we check for possible users.
16488
20.6k
  switch (ArithOp.getOpcode()) {
16489
1.23k
  case ISD::ADD:
16490
1.23k
    // We only want to rewrite this as a target-specific node with attached
16491
1.23k
    // flags if there is a reasonable chance of either using that to do custom
16492
1.23k
    // instructions selection that can fold some of the memory operands, or if
16493
1.23k
    // only the flags are used. If there are other uses, leave the node alone
16494
1.23k
    // and emit a test instruction.
16495
1.23k
    for (SDNode::use_iterator UI = Op.getNode()->use_begin(),
16496
3.69k
         UE = Op.getNode()->use_end(); UI != UE; ++UI)
16497
2.46k
      if (UI->getOpcode() != ISD::CopyToReg &&
16498
1.32k
          UI->getOpcode() != ISD::SETCC &&
16499
82
          UI->getOpcode() != ISD::STORE)
16500
11
        goto default_case;
16501
1.23k
16502
1.22k
    if (ConstantSDNode *C =
16503
1.20k
        dyn_cast<ConstantSDNode>(ArithOp.getOperand(1))) {
16504
1.20k
      // An add of one will be selected as an INC.
16505
1.20k
      if (C->isOne() &&
16506
423
          (!Subtarget.slowIncDec() ||
16507
1.20k
           DAG.getMachineFunction().getFunction()->optForSize())) {
16508
326
        Opcode = X86ISD::INC;
16509
326
        NumOperands = 1;
16510
326
        break;
16511
326
      }
16512
876
16513
876
      // An add of negative one (subtract of one) will be selected as a DEC.
16514
876
      if (C->isAllOnesValue() &&
16515
344
          (!Subtarget.slowIncDec() ||
16516
876
           DAG.getMachineFunction().getFunction()->optForSize())) {
16517
297
        Opcode = X86ISD::DEC;
16518
297
        NumOperands = 1;
16519
297
        break;
16520
297
      }
16521
605
    }
16522
605
16523
605
    // Otherwise use a regular EFLAGS-setting add.
16524
605
    Opcode = X86ISD::ADD;
16525
605
    NumOperands = 2;
16526
605
    break;
16527
256
  case ISD::SHL:
16528
256
  case ISD::SRL:
16529
256
    // If we have a constant logical shift that's only used in a comparison
16530
256
    // against zero turn it into an equivalent AND. This allows turning it into
16531
256
    // a TEST instruction later.
16532
256
    if (ZeroCheck && Op->hasOneUse() &&
16533
256
        isa<ConstantSDNode>(Op->getOperand(1)) && !hasNonFlagsUse(Op)) {
16534
76
      EVT VT = Op.getValueType();
16535
76
      unsigned BitWidth = VT.getSizeInBits();
16536
76
      unsigned ShAmt = Op->getConstantOperandVal(1);
16537
76
      if (ShAmt >= BitWidth) // Avoid undefined shifts.
16538
0
        break;
16539
76
      APInt Mask = ArithOp.getOpcode() == ISD::SRL
16540
73
                       ? APInt::getHighBitsSet(BitWidth, BitWidth - ShAmt)
16541
3
                       : APInt::getLowBitsSet(BitWidth, BitWidth - ShAmt);
16542
76
      if (!Mask.isSignedIntN(ShiftToAndMaxMaskWidth))
16543
24
        break;
16544
52
      Op = DAG.getNode(ISD::AND, dl, VT, Op->getOperand(0),
16545
52
                       DAG.getConstant(Mask, dl, VT));
16546
52
    }
16547
232
    break;
16548
256
16549
4.11k
  case ISD::AND:
16550
4.11k
    // If the primary 'and' result isn't used, don't bother using X86ISD::AND,
16551
4.11k
    // because a TEST instruction will be better. However, AND should be
16552
4.11k
    // preferred if the instruction can be combined into ANDN.
16553
4.11k
    if (!hasNonFlagsUse(Op)) {
16554
3.69k
      SDValue Op0 = ArithOp->getOperand(0);
16555
3.69k
      SDValue Op1 = ArithOp->getOperand(1);
16556
3.69k
      EVT VT = ArithOp.getValueType();
16557
3.66k
      bool isAndn = isBitwiseNot(Op0) || isBitwiseNot(Op1);
16558
3.03k
      bool isLegalAndnType = VT == MVT::i32 || VT == MVT::i64;
16559
3.69k
      bool isProperAndn = isAndn && isLegalAndnType && Subtarget.hasBMI();
16560
3.69k
16561
3.69k
      // If we cannot select an ANDN instruction, check if we can replace
16562
3.69k
      // AND+IMM64 with a shift before giving up. This is possible for masks
16563
3.69k
      // like 0xFF000000 or 0x00FFFFFF and if we care only about the zero flag.
16564
3.69k
      if (!isProperAndn) {
16565
3.66k
        if (!ZeroCheck)
16566
113
          break;
16567
3.55k
16568
3.66k
        assert(!isa<ConstantSDNode>(Op0) && "AND node isn't canonicalized");
16569
3.55k
        auto *CN = dyn_cast<ConstantSDNode>(Op1);
16570
3.55k
        if (!CN)
16571
445
          break;
16572
3.11k
16573
3.11k
        const APInt &Mask = CN->getAPIntValue();
16574
3.11k
        if (Mask.isSignedIntN(ShiftToAndMaxMaskWidth))
16575
3.08k
          break; // Prefer TEST instruction.
16576
23
16577
23
        unsigned BitWidth = Mask.getBitWidth();
16578
23
        unsigned LeadingOnes = Mask.countLeadingOnes();
16579
23
        unsigned TrailingZeros = Mask.countTrailingZeros();
16580
23
16581
23
        if (LeadingOnes + TrailingZeros == BitWidth) {
16582
17
          assert(TrailingZeros < VT.getSizeInBits() &&
16583
17
                 "Shift amount should be less than the type width");
16584
17
          MVT ShTy = getScalarShiftAmountTy(DAG.getDataLayout(), VT);
16585
17
          SDValue ShAmt = DAG.getConstant(TrailingZeros, dl, ShTy);
16586
17
          Op = DAG.getNode(ISD::SRL, dl, VT, Op0, ShAmt);
16587
17
          break;
16588
17
        }
16589
6
16590
6
        unsigned LeadingZeros = Mask.countLeadingZeros();
16591
6
        unsigned TrailingOnes = Mask.countTrailingOnes();
16592
6
16593
6
        if (LeadingZeros + TrailingOnes == BitWidth) {
16594
1
          assert(LeadingZeros < VT.getSizeInBits() &&
16595
1
                 "Shift amount should be less than the type width");
16596
1
          MVT ShTy = getScalarShiftAmountTy(DAG.getDataLayout(), VT);
16597
1
          SDValue ShAmt = DAG.getConstant(LeadingZeros, dl, ShTy);
16598
1
          Op = DAG.getNode(ISD::SHL, dl, VT, Op0, ShAmt);
16599
1
          break;
16600
1
        }
16601
5
16602
5
        break;
16603
5
      }
16604
3.69k
    }
16605
450
    LLVM_FALLTHROUGH;
16606
1.48k
  case ISD::SUB:
16607
1.48k
  case ISD::OR:
16608
1.48k
  case ISD::XOR:
16609
1.48k
    // Similar to ISD::ADD above, check if the uses will preclude useful
16610
1.48k
    // lowering of the target-specific node.
16611
1.48k
    for (SDNode::use_iterator UI = Op.getNode()->use_begin(),
16612
3.25k
           UE = Op.getNode()->use_end(); UI != UE; ++UI)
16613
1.96k
      if (UI->getOpcode() != ISD::CopyToReg &&
16614
1.64k
          UI->getOpcode() != ISD::SETCC &&
16615
314
          UI->getOpcode() != ISD::STORE)
16616
203
        goto default_case;
16617
1.48k
16618
1.48k
    // Otherwise use a regular EFLAGS-setting instruction.
16619
1.28k
    switch (ArithOp.getOpcode()) {
16620
0
    default: llvm_unreachable("unexpected operator!");
16621
308
    case ISD::SUB: Opcode = X86ISD::SUB; break;
16622
34
    case ISD::XOR: Opcode = X86ISD::XOR; break;
16623
311
    case ISD::AND: Opcode = X86ISD::AND; break;
16624
632
    case ISD::OR: {
16625
632
      if (!NeedTruncation && ZeroCheck) {
16626
629
        if (SDValue EFLAGS = LowerVectorAllZeroTest(Op, Subtarget, DAG))
16627
18
          return EFLAGS;
16628
614
      }
16629
614
      Opcode = X86ISD::OR;
16630
614
      break;
16631
614
    }
16632
1.26k
    }
16633
1.26k
16634
1.26k
    NumOperands = 2;
16635
1.26k
    break;
16636
16
  case X86ISD::ADD:
16637
16
  case X86ISD::SUB:
16638
16
  case X86ISD::INC:
16639
16
  case X86ISD::DEC:
16640
16
  case X86ISD::OR:
16641
16
  case X86ISD::XOR:
16642
16
  case X86ISD::AND:
16643
16
    return SDValue(Op.getNode(), 1);
16644
13.9k
  default:
16645
14.1k
  default_case:
16646
14.1k
    break;
16647
20.6k
  }
16648
20.6k
16649
20.6k
  // If we found that truncation is beneficial, perform the truncation and
16650
20.6k
  // update 'Op'.
16651
20.6k
  if (NeedTruncation) {
16652
8
    EVT VT = Op.getValueType();
16653
8
    SDValue WideVal = Op->getOperand(0);
16654
8
    EVT WideVT = WideVal.getValueType();
16655
8
    unsigned ConvertedOp = 0;
16656
8
    // Use a target machine opcode to prevent further DAGCombine
16657
8
    // optimizations that may separate the arithmetic operations
16658
8
    // from the setcc node.
16659
8
    switch (WideVal.getOpcode()) {
16660
0
      default: break;
16661
1
      case ISD::ADD: ConvertedOp = X86ISD::ADD; break;
16662
0
      case ISD::SUB: ConvertedOp = X86ISD::SUB; break;
16663
3
      case ISD::AND: ConvertedOp = X86ISD::AND; break;
16664
2
      case ISD::OR:  ConvertedOp = X86ISD::OR;  break;
16665
2
      case ISD::XOR: ConvertedOp = X86ISD::XOR; break;
16666
8
    }
16667
8
16668
8
    if (ConvertedOp) {
16669
8
      const TargetLowering &TLI = DAG.getTargetLoweringInfo();
16670
8
      if (TLI.isOperationLegal(WideVal.getOpcode(), WideVT)) {
16671
8
        SDValue V0 = DAG.getNode(ISD::TRUNCATE, dl, VT, WideVal.getOperand(0));
16672
8
        SDValue V1 = DAG.getNode(ISD::TRUNCATE, dl, VT, WideVal.getOperand(1));
16673
8
        Op = DAG.getNode(ConvertedOp, dl, VT, V0, V1);
16674
8
      }
16675
8
    }
16676
8
  }
16677
20.6k
16678
20.6k
  if (Opcode == 0) {
16679
18.1k
    // Emit KTEST for bit vectors
16680
18.1k
    if (auto Node = EmitKTEST(Op, DAG, Subtarget))
16681
8
      return Node;
16682
18.1k
16683
18.1k
    // Emit a CMP with 0, which is the TEST pattern.
16684
18.1k
    return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
16685
18.1k
                       DAG.getConstant(0, dl, Op.getValueType()));
16686
18.1k
  }
16687
2.49k
  SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
16688
2.49k
  SmallVector<SDValue, 4> Ops(Op->op_begin(), Op->op_begin() + NumOperands);
16689
2.49k
16690
2.49k
  SDValue New = DAG.getNode(Opcode, dl, VTs, Ops);
16691
2.49k
  DAG.ReplaceAllUsesWith(Op, New);
16692
2.49k
  return SDValue(New.getNode(), 1);
16693
2.49k
}
16694
16695
/// Emit nodes that will be selected as "cmp Op0,Op1", or something
16696
/// equivalent.
16697
SDValue X86TargetLowering::EmitCmp(SDValue Op0, SDValue Op1, unsigned X86CC,
16698
36.9k
                                   const SDLoc &dl, SelectionDAG &DAG) const {
16699
36.9k
  if (isNullConstant(Op1))
16700
17.6k
    return EmitTest(Op0, X86CC, dl, DAG);
16701
19.3k
16702
36.9k
  assert(!(isa<ConstantSDNode>(Op1) && Op0.getValueType() == MVT::i1) &&
16703
19.3k
         "Unexpected comparison operation for MVT::i1 operands");
16704
19.3k
16705
19.3k
  if ((Op0.getValueType() == MVT::i8 || Op0.getValueType() == MVT::i16 ||
16706
19.3k
       Op0.getValueType() == MVT::i32 || Op0.getValueType() == MVT::i64)) {
16707
17.2k
    // Only promote the compare up to I32 if it is a 16 bit operation
16708
17.2k
    // with an immediate.  16 bit immediates are to be avoided.
16709
17.2k
    if ((Op0.getValueType() == MVT::i16 &&
16710
185
         (isa<ConstantSDNode>(Op0) || isa<ConstantSDNode>(Op1))) &&
16711
66
        !DAG.getMachineFunction().getFunction()->optForMinSize() &&
16712
17.2k
        !Subtarget.isAtom()) {
16713
54
      unsigned ExtendOp =
16714
54
          isX86CCUnsigned(X86CC) ? ISD::ZERO_EXTEND : ISD::SIGN_EXTEND;
16715
54
      Op0 = DAG.getNode(ExtendOp, dl, MVT::i32, Op0);
16716
54
      Op1 = DAG.getNode(ExtendOp, dl, MVT::i32, Op1);
16717
54
    }
16718
17.2k
    // Use SUB instead of CMP to enable CSE between SUB and CMP.
16719
17.2k
    SDVTList VTs = DAG.getVTList(Op0.getValueType(), MVT::i32);
16720
17.2k
    SDValue Sub = DAG.getNode(X86ISD::SUB, dl, VTs,
16721
17.2k
                              Op0, Op1);
16722
17.2k
    return SDValue(Sub.getNode(), 1);
16723
17.2k
  }
16724
2.05k
  return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op0, Op1);
16725
2.05k
}
16726
16727
/// Convert a comparison if required by the subtarget.
16728
SDValue X86TargetLowering::ConvertCmpIfNecessary(SDValue Cmp,
16729
73.0k
                                                 SelectionDAG &DAG) const {
16730
73.0k
  // If the subtarget does not support the FUCOMI instruction, floating-point
16731
73.0k
  // comparisons have to be converted.
16732
73.0k
  if (Subtarget.hasCMov() ||
16733
3.87k
      Cmp.getOpcode() != X86ISD::CMP ||
16734
1.60k
      !Cmp.getOperand(0).getValueType().isFloatingPoint() ||
16735
352
      !Cmp.getOperand(1).getValueType().isFloatingPoint())
16736
72.6k
    return Cmp;
16737
352
16738
352
  // The instruction selector will select an FUCOM instruction instead of
16739
352
  // FUCOMI, which writes the comparison result to FPSW instead of EFLAGS. Hence
16740
352
  // build an SDNode sequence that transfers the result from FPSW into EFLAGS:
16741
352
  // (X86sahf (trunc (srl (X86fp_stsw (trunc (X86cmp ...)), 8))))
16742
352
  SDLoc dl(Cmp);
16743
352
  SDValue TruncFPSW = DAG.getNode(ISD::TRUNCATE, dl, MVT::i16, Cmp);
16744
352
  SDValue FNStSW = DAG.getNode(X86ISD::FNSTSW16r, dl, MVT::i16, TruncFPSW);
16745
352
  SDValue Srl = DAG.getNode(ISD::SRL, dl, MVT::i16, FNStSW,
16746
352
                            DAG.getConstant(8, dl, MVT::i8));
16747
352
  SDValue TruncSrl = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Srl);
16748
352
16749
352
  // Some 64-bit targets lack SAHF support, but they do support FCOMI.
16750
352
  assert(Subtarget.hasLAHFSAHF() && "Target doesn't support SAHF or FCOMI?");
16751
352
  return DAG.getNode(X86ISD::SAHF, dl, MVT::i32, TruncSrl);
16752
352
}
16753
16754
/// Check if replacement of SQRT with RSQRT should be disabled.
16755
76
bool X86TargetLowering::isFsqrtCheap(SDValue Op, SelectionDAG &DAG) const {
16756
76
  EVT VT = Op.getValueType();
16757
76
16758
76
  // We never want to use both SQRT and RSQRT instructions for the same input.
16759
76
  if (DAG.getNodeIfExists(X86ISD::FRSQRT, DAG.getVTList(VT), Op))
16760
0
    return false;
16761
76
16762
76
  if (VT.isVector())
16763
39
    return Subtarget.hasFastVectorFSQRT();
16764
37
  return Subtarget.hasFastScalarFSQRT();
16765
37
}
16766
16767
/// The minimum architected relative accuracy is 2^-12. We need one
16768
/// Newton-Raphson step to have a good float result (24 bits of precision).
16769
SDValue X86TargetLowering::getSqrtEstimate(SDValue Op,
16770
                                           SelectionDAG &DAG, int Enabled,
16771
                                           int &RefinementSteps,
16772
                                           bool &UseOneConstNR,
16773
35
                                           bool Reciprocal) const {
16774
35
  EVT VT = Op.getValueType();
16775
35
16776
35
  // SSE1 has rsqrtss and rsqrtps. AVX adds a 256-bit variant for rsqrtps.
16777
35
  // TODO: Add support for AVX512 (v16f32).
16778
35
  // It is likely not profitable to do this for f64 because a double-precision
16779
35
  // rsqrt estimate with refinement on x86 prior to FMA requires at least 16
16780
35
  // instructions: convert to single, rsqrtss, convert back to double, refine
16781
35
  // (3 steps = at least 13 insts). If an 'rsqrtsd' variant was added to the ISA
16782
35
  // along with FMA, this could be a throughput win.
16783
35
  if ((VT == MVT::f32 && Subtarget.hasSSE1()) ||
16784
27
      (VT == MVT::v4f32 && Subtarget.hasSSE1()) ||
16785
35
      (VT == MVT::v8f32 && Subtarget.hasAVX())) {
16786
25
    if (RefinementSteps == ReciprocalEstimate::Unspecified)
16787
23
      RefinementSteps = 1;
16788
25
16789
25
    UseOneConstNR = false;
16790
25
    return DAG.getNode(X86ISD::FRSQRT, SDLoc(Op), VT, Op);
16791
25
  }
16792
10
  return SDValue();
16793
10
}
16794
16795
/// The minimum architected relative accuracy is 2^-12. We need one
16796
/// Newton-Raphson step to have a good float result (24 bits of precision).
16797
SDValue X86TargetLowering::getRecipEstimate(SDValue Op, SelectionDAG &DAG,
16798
                                            int Enabled,
16799
240
                                            int &RefinementSteps) const {
16800
240
  EVT VT = Op.getValueType();
16801
240
16802
240
  // SSE1 has rcpss and rcpps. AVX adds a 256-bit variant for rcpps.
16803
240
  // TODO: Add support for AVX512 (v16f32).
16804
240
  // It is likely not profitable to do this for f64 because a double-precision
16805
240
  // reciprocal estimate with refinement on x86 prior to FMA requires
16806
240
  // 15 instructions: convert to single, rcpss, convert back to double, refine
16807
240
  // (3 steps = 12 insts). If an 'rcpsd' variant was added to the ISA
16808
240
  // along with FMA, this could be a throughput win.
16809
240
16810
240
  if ((VT == MVT::f32 && Subtarget.hasSSE1()) ||
16811
160
      (VT == MVT::v4f32 && Subtarget.hasSSE1()) ||
16812
240
      (VT == MVT::v8f32 && Subtarget.hasAVX())) {
16813
212
    // Enable estimate codegen with 1 refinement step for vector division.
16814
212
    // Scalar division estimates are disabled because they break too much
16815
212
    // real-world code. These defaults are intended to match GCC behavior.
16816
212
    if (VT == MVT::f32 && Enabled == ReciprocalEstimate::Unspecified)
16817
17
      return SDValue();
16818
195
16819
195
    if (RefinementSteps == ReciprocalEstimate::Unspecified)
16820
110
      RefinementSteps = 1;
16821
212
16822
212
    return DAG.getNode(X86ISD::FRCP, SDLoc(Op), VT, Op);
16823
212
  }
16824
28
  return SDValue();
16825
28
}
16826
16827
/// If we have at least two divisions that use the same divisor, convert to
16828
/// multiplication by a reciprocal. This may need to be adjusted for a given
16829
/// CPU if a division's cost is not at least twice the cost of a multiplication.
16830
/// This is because we still need one division to calculate the reciprocal and
16831
/// then we need two multiplies by that reciprocal as replacements for the
16832
/// original divisions.
16833
159
unsigned X86TargetLowering::combineRepeatedFPDivisors() const {
16834
159
  return 2;
16835
159
}
16836
16837
/// Helper for creating a X86ISD::SETCC node.
16838
static SDValue getSETCC(X86::CondCode Cond, SDValue EFLAGS, const SDLoc &dl,
16839
45.8k
                        SelectionDAG &DAG) {
16840
45.8k
  return DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
16841
45.8k
                     DAG.getConstant(Cond, dl, MVT::i8), EFLAGS);
16842
45.8k
}
16843
16844
/// Create a BT (Bit Test) node - Test bit \p BitNo in \p Src and set condition
16845
/// according to equal/not-equal condition code \p CC.
16846
static SDValue getBitTestCondition(SDValue Src, SDValue BitNo, ISD::CondCode CC,
16847
515
                                   const SDLoc &dl, SelectionDAG &DAG) {
16848
515
  // If Src is i8, promote it to i32 with any_extend.  There is no i8 BT
16849
515
  // instruction.  Since the shift amount is in-range-or-undefined, we know
16850
515
  // that doing a bittest on the i32 value is ok.  We extend to i32 because
16851
515
  // the encoding for the i16 version is larger than the i32 version.
16852
515
  // Also promote i16 to i32 for performance / code size reason.
16853
515
  if (Src.getValueType() == MVT::i8 || Src.getValueType() == MVT::i16)
16854
23
    Src = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Src);
16855
515
16856
515
  // See if we can use the 32-bit instruction instead of the 64-bit one for a
16857
515
  // shorter encoding. Since the former takes the modulo 32 of BitNo and the
16858
515
  // latter takes the modulo 64, this is only valid if the 5th bit of BitNo is
16859
515
  // known to be zero.
16860
515
  if (Src.getValueType() == MVT::i64 &&
16861
205
      DAG.MaskedValueIsZero(BitNo, APInt(BitNo.getValueSizeInBits(), 32)))
16862
5
    Src = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Src);
16863
515
16864
515
  // If the operand types disagree, extend the shift amount to match.  Since
16865
515
  // BT ignores high bits (like shifts) we can use anyextend.
16866
515
  if (Src.getValueType() != BitNo.getValueType())
16867
406
    BitNo = DAG.getNode(ISD::ANY_EXTEND, dl, Src.getValueType(), BitNo);
16868
515
16869
515
  SDValue BT = DAG.getNode(X86ISD::BT, dl, MVT::i32, Src, BitNo);
16870
515
  X86::CondCode Cond = CC == ISD::SETEQ ? X86::COND_AE : X86::COND_B;
16871
515
  return getSETCC(Cond, BT, dl , DAG);
16872
515
}
16873
16874
/// Result of 'and' is compared against zero. Change to a BT node if possible.
16875
static SDValue LowerAndToBT(SDValue And, ISD::CondCode CC,
16876
4.02k
                            const SDLoc &dl, SelectionDAG &DAG) {
16877
4.02k
  assert(And.getOpcode() == ISD::AND && "Expected AND node!");
16878
4.02k
  SDValue Op0 = And.getOperand(0);
16879
4.02k
  SDValue Op1 = And.getOperand(1);
16880
4.02k
  if (Op0.getOpcode() == ISD::TRUNCATE)
16881
170
    Op0 = Op0.getOperand(0);
16882
4.02k
  if (Op1.getOpcode() == ISD::TRUNCATE)
16883
8
    Op1 = Op1.getOperand(0);
16884
4.02k
16885
4.02k
  SDValue LHS, RHS;
16886
4.02k
  if (Op1.getOpcode() == ISD::SHL)
16887
197
    std::swap(Op0, Op1);
16888
4.02k
  if (Op0.getOpcode() == ISD::SHL) {
16889
326
    if (isOneConstant(Op0.getOperand(0))) {
16890
326
      // If we looked past a truncate, check that it's only truncating away
16891
326
      // known zeros.
16892
326
      unsigned BitWidth = Op0.getValueSizeInBits();
16893
326
      unsigned AndBitWidth = And.getValueSizeInBits();
16894
326
      if (BitWidth > AndBitWidth) {
16895
0
        KnownBits Known;
16896
0
        DAG.computeKnownBits(Op0, Known);
16897
0
        if (Known.countMinLeadingZeros() < BitWidth - AndBitWidth)
16898
0
          return SDValue();
16899
326
      }
16900
326
      LHS = Op1;
16901
326
      RHS = Op0.getOperand(1);
16902
326
    }
16903
4.02k
  } else if (Op1.getOpcode() == ISD::Constant) {
16904
3.24k
    ConstantSDNode *AndRHS = cast<ConstantSDNode>(Op1);
16905
3.24k
    uint64_t AndRHSVal = AndRHS->getZExtValue();
16906
3.24k
    SDValue AndLHS = Op0;
16907
3.24k
16908
3.24k
    if (AndRHSVal == 1 && AndLHS.getOpcode() == ISD::SRL) {
16909
169
      LHS = AndLHS.getOperand(0);
16910
169
      RHS = AndLHS.getOperand(1);
16911
169
    }
16912
3.24k
16913
3.24k
    // Use BT if the immediate can't be encoded in a TEST instruction.
16914
3.24k
    if (!isUInt<32>(AndRHSVal) && isPowerOf2_64(AndRHSVal)) {
16915
20
      LHS = AndLHS;
16916
20
      RHS = DAG.getConstant(Log2_64_Ceil(AndRHSVal), dl, LHS.getValueType());
16917
20
    }
16918
3.70k
  }
16919
4.02k
16920
4.02k
  if (LHS.getNode())
16921
515
    return getBitTestCondition(LHS, RHS, CC, dl, DAG);
16922
3.51k
16923
3.51k
  return SDValue();
16924
3.51k
}
16925
16926
/// Turns an ISD::CondCode into a value suitable for SSE floating-point mask
16927
/// CMPs.
16928
static int translateX86FSETCC(ISD::CondCode SetCCOpcode, SDValue &Op0,
16929
1.67k
                              SDValue &Op1) {
16930
1.67k
  unsigned SSECC;
16931
1.67k
  bool Swap = false;
16932
1.67k
16933
1.67k
  // SSE Condition code mapping:
16934
1.67k
  //  0 - EQ
16935
1.67k
  //  1 - LT
16936
1.67k
  //  2 - LE
16937
1.67k
  //  3 - UNORD
16938
1.67k
  //  4 - NEQ
16939
1.67k
  //  5 - NLT
16940
1.67k
  //  6 - NLE
16941
1.67k
  //  7 - ORD
16942
1.67k
  switch (SetCCOpcode) {
16943
0
  default: llvm_unreachable("Unexpected SETCC condition");
16944
379
  case ISD::SETOEQ:
16945
379
  case ISD::SETEQ:  SSECC = 0; break;
16946
475
  case ISD::SETOGT:
16947
475
  case ISD::SETGT: Swap = true; LLVM_FALLTHROUGH;
16948
657
  case ISD::SETLT:
16949
657
  case ISD::SETOLT: SSECC = 1; break;
16950
187
  case ISD::SETOGE:
16951
187
  case ISD::SETGE: Swap = true; LLVM_FALLTHROUGH;
16952
229
  case ISD::SETLE:
16953
229
  case ISD::SETOLE: SSECC = 2; break;
16954
242
  case ISD::SETUO:  SSECC = 3; break;
16955
35
  case ISD::SETUNE:
16956
35
  case ISD::SETNE:  SSECC = 4; break;
16957
6
  case ISD::SETULE: Swap = true; LLVM_FALLTHROUGH;
16958
23
  case ISD::SETUGE: SSECC = 5; break;
16959
21
  case ISD::SETULT: Swap = true; LLVM_FALLTHROUGH;
16960
43
  case ISD::SETUGT: SSECC = 6; break;
16961
28
  case ISD::SETO:   SSECC = 7; break;
16962
36
  case ISD::SETUEQ:
16963
36
  case ISD::SETONE: SSECC = 8; break;
16964
1.67k
  }
16965
1.67k
  if (Swap)
16966
689
    std::swap(Op0, Op1);
16967
1.67k
16968
1.67k
  return SSECC;
16969
1.67k
}
16970
16971
/// Break a VSETCC 256-bit integer VSETCC into two new 128 ones and then
16972
/// concatenate the result back.
16973
219
static SDValue Lower256IntVSETCC(SDValue Op, SelectionDAG &DAG) {
16974
219
  MVT VT = Op.getSimpleValueType();
16975
219
16976
219
  assert(VT.is256BitVector() && Op.getOpcode() == ISD::SETCC &&
16977
219
         "Unsupported value type for operation");
16978
219
16979
219
  unsigned NumElems = VT.getVectorNumElements();
16980
219
  SDLoc dl(Op);
16981
219
  SDValue CC = Op.getOperand(2);
16982
219
16983
219
  // Extract the LHS vectors
16984
219
  SDValue LHS = Op.getOperand(0);
16985
219
  SDValue LHS1 = extract128BitVector(LHS, 0, DAG, dl);
16986
219
  SDValue LHS2 = extract128BitVector(LHS, NumElems / 2, DAG, dl);
16987
219
16988
219
  // Extract the RHS vectors
16989
219
  SDValue RHS = Op.getOperand(1);
16990
219
  SDValue RHS1 = extract128BitVector(RHS, 0, DAG, dl);
16991
219
  SDValue RHS2 = extract128BitVector(RHS, NumElems / 2, DAG, dl);
16992
219
16993
219
  // Issue the operation on the smaller types and concatenate the result back
16994
219
  MVT EltVT = VT.getVectorElementType();
16995
219
  MVT NewVT = MVT::getVectorVT(EltVT, NumElems/2);
16996
219
  return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
16997
219
                     DAG.getNode(Op.getOpcode(), dl, NewVT, LHS1, RHS1, CC),
16998
219
                     DAG.getNode(Op.getOpcode(), dl, NewVT, LHS2, RHS2, CC));
16999
219
}
17000
17001
8
static SDValue LowerBoolVSETCC_AVX512(SDValue Op, SelectionDAG &DAG) {
17002
8
  SDValue Op0 = Op.getOperand(0);
17003
8
  SDValue Op1 = Op.getOperand(1);
17004
8
  SDValue CC = Op.getOperand(2);
17005
8
  MVT VT = Op.getSimpleValueType();
17006
8
  SDLoc dl(Op);
17007
8
17008
8
  assert(Op0.getSimpleValueType().getVectorElementType() == MVT::i1 &&
17009
8
         "Unexpected type for boolean compare operation");
17010
8
  ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get();
17011
8
  SDValue NotOp0 = DAG.getNode(ISD::XOR, dl, VT, Op0,
17012
8
                               DAG.getConstant(-1, dl, VT));
17013
8
  SDValue NotOp1 = DAG.getNode(ISD::XOR, dl, VT, Op1,
17014
8
                               DAG.getConstant(-1, dl, VT));
17015
8
  switch (SetCCOpcode) {
17016
0
  default: llvm_unreachable("Unexpected SETCC condition");
17017
4
  case ISD::SETEQ:
17018
4
    // (x == y) -> ~(x ^ y)
17019
4
    return DAG.getNode(ISD::XOR, dl, VT,
17020
4
                       DAG.getNode(ISD::XOR, dl, VT, Op0, Op1),
17021
4
                       DAG.getConstant(-1, dl, VT));
17022
2
  case ISD::SETNE:
17023
2
    // (x != y) -> (x ^ y)
17024
2
    return DAG.getNode(ISD::XOR, dl, VT, Op0, Op1);
17025
1
  case ISD::SETUGT:
17026
1
  case ISD::SETGT:
17027
1
    // (x > y) -> (x & ~y)
17028
1
    return DAG.getNode(ISD::AND, dl, VT, Op0, NotOp1);
17029
1
  case ISD::SETULT:
17030
1
  case ISD::SETLT:
17031
1
    // (x < y) -> (~x & y)
17032
1
    return DAG.getNode(ISD::AND, dl, VT, NotOp0, Op1);
17033
0
  case ISD::SETULE:
17034
0
  case ISD::SETLE:
17035
0
    // (x <= y) -> (~x | y)
17036
0
    return DAG.getNode(ISD::OR, dl, VT, NotOp0, Op1);
17037
0
  case ISD::SETUGE:
17038
0
  case ISD::SETGE:
17039
0
    // (x >=y) -> (x | ~y)
17040
0
    return DAG.getNode(ISD::OR, dl, VT, Op0, NotOp1);
17041
0
  }
17042
0
}
17043
17044
2.01k
static SDValue LowerIntVSETCC_AVX512(SDValue Op, SelectionDAG &DAG) {
17045
2.01k
17046
2.01k
  SDValue Op0 = Op.getOperand(0);
17047
2.01k
  SDValue Op1 = Op.getOperand(1);
17048
2.01k
  SDValue CC = Op.getOperand(2);
17049
2.01k
  MVT VT = Op.getSimpleValueType();
17050
2.01k
  SDLoc dl(Op);
17051
2.01k
17052
2.01k
  assert(VT.getVectorElementType() == MVT::i1 &&
17053
2.01k
         "Cannot set masked compare for this operation");
17054
2.01k
17055
2.01k
  ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get();
17056
2.01k
  unsigned  Opc = 0;
17057
2.01k
  bool Unsigned = false;
17058
2.01k
  bool Swap = false;
17059
2.01k
  unsigned SSECC;
17060
2.01k
  switch (SetCCOpcode) {
17061
0
  default: llvm_unreachable("Unexpected SETCC condition");
17062
294
  case ISD::SETNE:  SSECC = 4; break;
17063
441
  case ISD::SETEQ:  Opc = X86ISD::PCMPEQM; break;
17064
78
  case ISD::SETUGT: SSECC = 6; Unsigned = true; break;
17065
109
  case ISD::SETLT: Swap = true; LLVM_FALLTHROUGH;
17066
531
  case ISD::SETGT:  Opc = X86ISD::PCMPGTM; break;
17067
264
  case ISD::SETULT: SSECC = 1; Unsigned = true; break;
17068
42
  case ISD::SETUGE: SSECC = 5; Unsigned = true; break; //NLT
17069
275
  case ISD::SETGE:  Swap = true; SSECC = 2; break; // LE + swap
17070
44
  case ISD::SETULE: Unsigned = true; LLVM_FALLTHROUGH;
17071
94
  case ISD::SETLE:  SSECC = 2; break;
17072
2.01k
  }
17073
2.01k
17074
2.01k
  if (Swap)
17075
384
    std::swap(Op0, Op1);
17076
2.01k
  if (Opc)
17077
972
    return DAG.getNode(Opc, dl, VT, Op0, Op1);
17078
1.04k
  Opc = Unsigned ? X86ISD::CMPMU : X86ISD::CMPM;
17079
2.01k
  return DAG.getNode(Opc, dl, VT, Op0, Op1,
17080
2.01k
                     DAG.getConstant(SSECC, dl, MVT::i8));
17081
2.01k
}
17082
17083
/// \brief Try to turn a VSETULT into a VSETULE by modifying its second
17084
/// operand \p Op1.  If non-trivial (for example because it's not constant)
17085
/// return an empty value.
17086
static SDValue ChangeVSETULTtoVSETULE(const SDLoc &dl, SDValue Op1,
17087
49
                                      SelectionDAG &DAG) {
17088
49
  BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(Op1.getNode());
17089
49
  if (!BV)
17090
43
    return SDValue();
17091
6
17092
6
  MVT VT = Op1.getSimpleValueType();
17093
6
  MVT EVT = VT.getVectorElementType();
17094
6
  unsigned n = VT.getVectorNumElements();
17095
6
  SmallVector<SDValue, 8> ULTOp1;
17096
6
17097
54
  for (unsigned i = 0; i < n; ++i) {
17098
50
    ConstantSDNode *Elt = dyn_cast<ConstantSDNode>(BV->getOperand(i));
17099
50
    if (!Elt || Elt->isOpaque() || Elt->getSimpleValueType(0) != EVT)
17100
0
      return SDValue();
17101
50
17102
50
    // Avoid underflow.
17103
50
    APInt Val = Elt->getAPIntValue();
17104
50
    if (Val == 0)
17105
2
      return SDValue();
17106
48
17107
48
    ULTOp1.push_back(DAG.getConstant(Val - 1, dl, EVT));
17108
48
  }
17109
6
17110
4
  return DAG.getBuildVector(VT, dl, ULTOp1);
17111
49
}
17112
17113
static SDValue LowerVSETCC(SDValue Op, const X86Subtarget &Subtarget,
17114
7.56k
                           SelectionDAG &DAG) {
17115
7.56k
  SDValue Op0 = Op.getOperand(0);
17116
7.56k
  SDValue Op1 = Op.getOperand(1);
17117
7.56k
  SDValue CC = Op.getOperand(2);
17118
7.56k
  MVT VT = Op.getSimpleValueType();
17119
7.56k
  ISD::CondCode Cond = cast<CondCodeSDNode>(CC)->get();
17120
7.56k
  bool isFP = Op.getOperand(1).getSimpleValueType().isFloatingPoint();
17121
7.56k
  SDLoc dl(Op);
17122
7.56k
17123
7.56k
  if (isFP) {
17124
#ifndef NDEBUG
17125
    MVT EltVT = Op0.getSimpleValueType().getVectorElementType();
17126
    assert(EltVT == MVT::f32 || EltVT == MVT::f64);
17127
#endif
17128
17129
1.14k
    unsigned Opc;
17130
1.14k
    if (Subtarget.hasAVX512() && VT.getVectorElementType() == MVT::i1) {
17131
341
      assert(VT.getVectorNumElements() <= 16);
17132
341
      Opc = X86ISD::CMPM;
17133
1.14k
    } else {
17134
806
      Opc = X86ISD::CMPP;
17135
806
      // The SSE/AVX packed FP comparison nodes are defined with a
17136
806
      // floating-point vector result that matches the operand type. This allows
17137
806
      // them to work with an SSE1 target (integer vector types are not legal).
17138
806
      VT = Op0.getSimpleValueType();
17139
806
    }
17140
1.14k
17141
1.14k
    // In the two cases not handled by SSE compare predicates (SETUEQ/SETONE),
17142
1.14k
    // emit two comparisons and a logic op to tie them together.
17143
1.14k
    // TODO: This can be avoided if Intel (and only Intel as of 2016) AVX is
17144
1.14k
    // available.
17145
1.14k
    SDValue Cmp;
17146
1.14k
    unsigned SSECC = translateX86FSETCC(Cond, Op0, Op1);
17147
1.14k
    if (SSECC == 8) {
17148
32
      // LLVM predicate is SETUEQ or SETONE.
17149
32
      unsigned CC0, CC1;
17150
32
      unsigned CombineOpc;
17151
32
      if (Cond == ISD::SETUEQ) {
17152
10
        CC0 = 3; // UNORD
17153
10
        CC1 = 0; // EQ
17154
10
        CombineOpc = Opc == X86ISD::CMPP ? static_cast<unsigned>(X86ISD::FOR) :
17155
0
                                           static_cast<unsigned>(ISD::OR);
17156
32
      } else {
17157
22
        assert(Cond == ISD::SETONE);
17158
22
        CC0 = 7; // ORD
17159
22
        CC1 = 4; // NEQ
17160
10
        CombineOpc = Opc == X86ISD::CMPP ? static_cast<unsigned>(X86ISD::FAND) :
17161
12
                                           static_cast<unsigned>(ISD::AND);
17162
22
      }
17163
32
17164
32
      SDValue Cmp0 = DAG.getNode(Opc, dl, VT, Op0, Op1,
17165
32
                                 DAG.getConstant(CC0, dl, MVT::i8));
17166
32
      SDValue Cmp1 = DAG.getNode(Opc, dl, VT, Op0, Op1,
17167
32
                                 DAG.getConstant(CC1, dl, MVT::i8));
17168
32
      Cmp = DAG.getNode(CombineOpc, dl, VT, Cmp0, Cmp1);
17169
1.14k
    } else {
17170
1.11k
      // Handle all other FP comparisons here.
17171
1.11k
      Cmp = DAG.getNode(Opc, dl, VT, Op0, Op1,
17172
1.11k
                        DAG.getConstant(SSECC, dl, MVT::i8));
17173
1.11k
    }
17174
1.14k
17175
1.14k
    // If this is SSE/AVX CMPP, bitcast the result back to integer to match the
17176
1.14k
    // result type of SETCC. The bitcast is expected to be optimized away
17177
1.14k
    // during combining/isel.
17178
1.14k
    if (Opc == X86ISD::CMPP)
17179
806
      Cmp = DAG.getBitcast(Op.getSimpleValueType(), Cmp);
17180
1.14k
17181
1.14k
    return Cmp;
17182
1.14k
  }
17183
6.41k
17184
6.41k
  MVT VTOp0 = Op0.getSimpleValueType();
17185
6.41k
  assert(VTOp0 == Op1.getSimpleValueType() &&
17186
6.41k
         "Expected operands with same type!");
17187
6.41k
  assert(VT.getVectorNumElements() == VTOp0.getVectorNumElements() &&
17188
6.41k
         "Invalid number of packed elements for source and destination!");
17189
6.41k
17190
6.41k
  if (VT.is128BitVector() && VTOp0.is256BitVector()) {
17191
1
    // On non-AVX512 targets, a vector of MVT::i1 is promoted by the type
17192
1
    // legalizer to a wider vector type.  In the case of 'vsetcc' nodes, the
17193
1
    // legalizer firstly checks if the first operand in input to the setcc has
17194
1
    // a legal type. If so, then it promotes the return type to that same type.
17195
1
    // Otherwise, the return type is promoted to the 'next legal type' which,
17196
1
    // for a vector of MVT::i1 is always a 128-bit integer vector type.
17197
1
    //
17198
1
    // We reach this code only if the following two conditions are met:
17199
1
    // 1. Both return type and operand type have been promoted to wider types
17200
1
    //    by the type legalizer.
17201
1
    // 2. The original operand type has been promoted to a 256-bit vector.
17202
1
    //
17203
1
    // Note that condition 2. only applies for AVX targets.
17204
1
    SDValue NewOp = DAG.getSetCC(dl, VTOp0, Op0, Op1, Cond);
17205
1
    return DAG.getZExtOrTrunc(NewOp, dl, VT);
17206
1
  }
17207
6.41k
17208
6.41k
  // The non-AVX512 code below works under the assumption that source and
17209
6.41k
  // destination types are the same.
17210
6.41k
  assert((Subtarget.hasAVX512() || (VT == VTOp0)) &&
17211
6.41k
         "Value types for source and destination must be the same!");
17212
6.41k
17213
6.41k
  // Break 256-bit integer vector compare into smaller ones.
17214
6.41k
  if (VT.is256BitVector() && !Subtarget.hasInt256())
17215
219
    return Lower256IntVSETCC(Op, DAG);
17216
6.19k
17217
6.19k
  // Operands are boolean (vectors of i1)
17218
6.19k
  MVT OpVT = Op1.getSimpleValueType();
17219
6.19k
  if (OpVT.getVectorElementType() == MVT::i1)
17220
8
    return LowerBoolVSETCC_AVX512(Op, DAG);
17221
6.18k
17222
6.18k
  // The result is boolean, but operands are int/float
17223
6.18k
  if (VT.getVectorElementType() == MVT::i1) {
17224
2.18k
    // In AVX-512 architecture setcc returns mask with i1 elements,
17225
2.18k
    // But there is no compare instruction for i8 and i16 elements in KNL.
17226
2.18k
    // In this case use SSE compare
17227
2.18k
    bool UseAVX512Inst =
17228
2.18k
      (OpVT.is512BitVector() ||
17229
1.39k
       OpVT.getScalarSizeInBits() >= 32 ||
17230
503
       (Subtarget.hasBWI() && Subtarget.hasVLX()));
17231
2.18k
17232
2.18k
    if (UseAVX512Inst)
17233
2.01k
      return LowerIntVSETCC_AVX512(Op, DAG);
17234
166
17235
166
    return DAG.getNode(ISD::TRUNCATE, dl, VT,
17236
166
                        DAG.getNode(ISD::SETCC, dl, OpVT, Op0, Op1, CC));
17237
166
  }
17238
4.00k
17239
4.00k
  // Lower using XOP integer comparisons.
17240
4.00k
  if ((VT == MVT::v16i8 || VT == MVT::v8i16 ||
17241
4.00k
       VT == MVT::v4i32 || VT == MVT::v2i64) && Subtarget.hasXOP()) {
17242
96
    // Translate compare code to XOP PCOM compare mode.
17243
96
    unsigned CmpMode = 0;
17244
96
    switch (Cond) {
17245
0
    default: llvm_unreachable("Unexpected SETCC condition");
17246
16
    case ISD::SETULT:
17247
16
    case ISD::SETLT: CmpMode = 0x00; break;
17248
16
    case ISD::SETULE:
17249
16
    case ISD::SETLE: CmpMode = 0x01; break;
17250
16
    case ISD::SETUGT:
17251
16
    case ISD::SETGT: CmpMode = 0x02; break;
17252
16
    case ISD::SETUGE:
17253
16
    case ISD::SETGE: CmpMode = 0x03; break;
17254
16
    case ISD::SETEQ: CmpMode = 0x04; break;
17255
16
    case ISD::SETNE: CmpMode = 0x05; break;
17256
96
    }
17257
96
17258
96
    // Are we comparing unsigned or signed integers?
17259
96
    unsigned Opc =
17260
96
        ISD::isUnsignedIntSetCC(Cond) ? X86ISD::VPCOMU : X86ISD::VPCOM;
17261
96
17262
96
    return DAG.getNode(Opc, dl, VT, Op0, Op1,
17263
96
                       DAG.getConstant(CmpMode, dl, MVT::i8));
17264
96
  }
17265
3.90k
17266
3.90k
  // (X & Y) != 0 --> (X & Y) == Y iff Y is power-of-2.
17267
3.90k
  // Revert part of the simplifySetCCWithAnd combine, to avoid an invert.
17268
3.90k
  if (Cond == ISD::SETNE && ISD::isBuildVectorAllZeros(Op1.getNode())) {
17269
246
    SDValue BC0 = peekThroughBitcasts(Op0);
17270
246
    if (BC0.getOpcode() == ISD::AND) {
17271
169
      APInt UndefElts;
17272
169
      SmallVector<APInt, 64> EltBits;
17273
169
      if (getTargetConstantBitsFromNode(BC0.getOperand(1),
17274
169
                                        VT.getScalarSizeInBits(), UndefElts,
17275
169
                                        EltBits, false, false)) {
17276
1.47k
        if (llvm::all_of(EltBits, [](APInt &V) { return V.isPowerOf2(); })) {
17277
169
          Cond = ISD::SETEQ;
17278
169
          Op1 = DAG.getBitcast(VT, BC0.getOperand(1));
17279
169
        }
17280
169
      }
17281
169
    }
17282
246
  }
17283
3.90k
17284
3.90k
  // We are handling one of the integer comparisons here. Since SSE only has
17285
3.90k
  // GT and EQ comparisons for integer, swapping operands and multiple
17286
3.90k
  // operations may be required for some comparisons.
17287
3.90k
  unsigned Opc = (Cond == ISD::SETEQ || Cond == ISD::SETNE) ? X86ISD::PCMPEQ
17288
2.73k
                                                            : X86ISD::PCMPGT;
17289
3.67k
  bool Swap = Cond == ISD::SETLT || Cond == ISD::SETULT ||
17290
3.90k
              Cond == ISD::SETGE || Cond == ISD::SETUGE;
17291
3.90k
  bool Invert = Cond == ISD::SETNE ||
17292
3.76k
                (Cond != ISD::SETEQ && ISD::isTrueWhenEqual(Cond));
17293
3.90k
17294
3.90k
  // If both operands are known non-negative, then an unsigned compare is the
17295
3.90k
  // same as a signed compare and there's no need to flip signbits.
17296
3.90k
  // TODO: We could check for more general simplifications here since we're
17297
3.90k
  // computing known bits.
17298
3.90k
  bool FlipSigns = ISD::isUnsignedIntSetCC(Cond) &&
17299
899
                   !(DAG.SignBitIsZero(Op0) && DAG.SignBitIsZero(Op1));
17300
3.90k
17301
3.90k
  // Special case: Use min/max operations for SETULE/SETUGE
17302
3.90k
  MVT VET = VT.getVectorElementType();
17303
3.90k
  bool HasMinMax =
17304
2.89k
      (Subtarget.hasSSE41() && (VET >= MVT::i8 && VET <= MVT::i32)) ||
17305
2.14k
      (Subtarget.hasSSE2() && (VET == MVT::i8));
17306
3.90k
  bool MinMax = false;
17307
3.90k
  if (HasMinMax) {
17308
1.94k
    switch (Cond) {
17309
1.84k
    default: break;
17310
38
    case ISD::SETULE: Opc = ISD::UMIN; MinMax = true; break;
17311
60
    case ISD::SETUGE: Opc = ISD::UMAX; MinMax = true; break;
17312
1.94k
    }
17313
1.94k
17314
1.94k
    if (MinMax)
17315
98
      Swap = Invert = FlipSigns = false;
17316
1.94k
  }
17317
3.90k
17318
3.90k
  bool HasSubus = Subtarget.hasSSE2() && (VET == MVT::i8 || VET == MVT::i16);
17319
3.90k
  bool Subus = false;
17320
3.90k
  if (!MinMax && HasSubus) {
17321
1.50k
    // As another special case, use PSUBUS[BW] when it's profitable. E.g. for
17322
1.50k
    // Op0 u<= Op1:
17323
1.50k
    //   t = psubus Op0, Op1
17324
1.50k
    //   pcmpeq t, <0..0>
17325
1.50k
    switch (Cond) {
17326
1.35k
    default: break;
17327
101
    case ISD::SETULT: {
17328
101
      // If the comparison is against a constant we can turn this into a
17329
101
      // setule.  With psubus, setule does not require a swap.  This is
17330
101
      // beneficial because the constant in the register is no longer
17331
101
      // destructed as the destination so it can be hoisted out of a loop.
17332
101
      // Only do this pre-AVX since vpcmp* is no longer destructive.
17333
101
      if (Subtarget.hasAVX())
17334
52
        break;
17335
49
      if (SDValue ULEOp1 = ChangeVSETULTtoVSETULE(dl, Op1, DAG)) {
17336
4
        Op1 = ULEOp1;
17337
4
        Subus = true; Invert = false; Swap = false;
17338
4
      }
17339
49
      break;
17340
49
    }
17341
49
    // Psubus is better than flip-sign because it requires no inversion.
17342
20
    case ISD::SETUGE: Subus = true; Invert = false; Swap = true;  break;
17343
20
    case ISD::SETULE: Subus = true; Invert = false; Swap = false; break;
17344
1.50k
    }
17345
1.50k
17346
1.50k
    if (Subus) {
17347
44
      Opc = X86ISD::SUBUS;
17348
44
      FlipSigns = false;
17349
44
    }
17350
1.50k
  }
17351
3.90k
17352
3.90k
  if (Swap)
17353
998
    std::swap(Op0, Op1);
17354
3.90k
17355
3.90k
  // Check that the operation in question is available (most are plain SSE2,
17356
3.90k
  // but PCMPGTQ and PCMPEQQ have different requirements).
17357
3.90k
  if (VT == MVT::v2i64) {
17358
1.14k
    if (Opc == X86ISD::PCMPGT && !Subtarget.hasSSE42()) {
17359
278
      assert(Subtarget.hasSSE2() && "Don't know how to lower!");
17360
278
17361
278
      // First cast everything to the right type.
17362
278
      Op0 = DAG.getBitcast(MVT::v4i32, Op0);
17363
278
      Op1 = DAG.getBitcast(MVT::v4i32, Op1);
17364
278
17365
278
      // Since SSE has no unsigned integer comparisons, we need to flip the sign
17366
278
      // bits of the inputs before performing those operations. The lower
17367
278
      // compare is always unsigned.
17368
278
      SDValue SB;
17369
278
      if (FlipSigns) {
17370
104
        SB = DAG.getConstant(0x80000000U, dl, MVT::v4i32);
17371
278
      } else {
17372
174
        SDValue Sign = DAG.getConstant(0x80000000U, dl, MVT::i32);
17373
174
        SDValue Zero = DAG.getConstant(0x00000000U, dl, MVT::i32);
17374
174
        SB = DAG.getBuildVector(MVT::v4i32, dl, {Sign, Zero, Sign, Zero});
17375
174
      }
17376
278
      Op0 = DAG.getNode(ISD::XOR, dl, MVT::v4i32, Op0, SB);
17377
278
      Op1 = DAG.getNode(ISD::XOR, dl, MVT::v4i32, Op1, SB);
17378
278
17379
278
      // Emulate PCMPGTQ with (hi1 > hi2) | ((hi1 == hi2) & (lo1 > lo2))
17380
278
      SDValue GT = DAG.getNode(X86ISD::PCMPGT, dl, MVT::v4i32, Op0, Op1);
17381
278
      SDValue EQ = DAG.getNode(X86ISD::PCMPEQ, dl, MVT::v4i32, Op0, Op1);
17382
278
17383
278
      // Create masks for only the low parts/high parts of the 64 bit integers.
17384
278
      static const int MaskHi[] = { 1, 1, 3, 3 };
17385
278
      static const int MaskLo[] = { 0, 0, 2, 2 };
17386
278
      SDValue EQHi = DAG.getVectorShuffle(MVT::v4i32, dl, EQ, EQ, MaskHi);
17387
278
      SDValue GTLo = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskLo);
17388
278
      SDValue GTHi = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskHi);
17389
278
17390
278
      SDValue Result = DAG.getNode(ISD::AND, dl, MVT::v4i32, EQHi, GTLo);
17391
278
      Result = DAG.getNode(ISD::OR, dl, MVT::v4i32, Result, GTHi);
17392
278
17393
278
      if (Invert)
17394
96
        Result = DAG.getNOT(dl, Result, MVT::v4i32);
17395
278
17396
278
      return DAG.getBitcast(VT, Result);
17397
278
    }
17398
870
17399
870
    if (Opc == X86ISD::PCMPEQ && !Subtarget.hasSSE41()) {
17400
47
      // If pcmpeqq is missing but pcmpeqd is available synthesize pcmpeqq with
17401
47
      // pcmpeqd + pshufd + pand.
17402
47
      assert(Subtarget.hasSSE2() && !FlipSigns && "Don't know how to lower!");
17403
47
17404
47
      // First cast everything to the right type.
17405
47
      Op0 = DAG.getBitcast(MVT::v4i32, Op0);
17406
47
      Op1 = DAG.getBitcast(MVT::v4i32, Op1);
17407
47
17408
47
      // Do the compare.
17409
47
      SDValue Result = DAG.getNode(Opc, dl, MVT::v4i32, Op0, Op1);
17410
47
17411
47
      // Make sure the lower and upper halves are both all-ones.
17412
47
      static const int Mask[] = { 1, 0, 3, 2 };
17413
47
      SDValue Shuf = DAG.getVectorShuffle(MVT::v4i32, dl, Result, Result, Mask);
17414
47
      Result = DAG.getNode(ISD::AND, dl, MVT::v4i32, Result, Shuf);
17415
47
17416
47
      if (Invert)
17417
3
        Result = DAG.getNOT(dl, Result, MVT::v4i32);
17418
47
17419
47
      return DAG.getBitcast(VT, Result);
17420
47
    }
17421
3.58k
  }
17422
3.58k
17423
3.58k
  // Since SSE has no unsigned integer comparisons, we need to flip the sign
17424
3.58k
  // bits of the inputs before performing those operations.
17425
3.58k
  if (FlipSigns) {
17426
605
    MVT EltVT = VT.getVectorElementType();
17427
605
    SDValue SM = DAG.getConstant(APInt::getSignMask(EltVT.getSizeInBits()), dl,
17428
605
                                 VT);
17429
605
    Op0 = DAG.getNode(ISD::XOR, dl, VT, Op0, SM);
17430
605
    Op1 = DAG.getNode(ISD::XOR, dl, VT, Op1, SM);
17431
605
  }
17432
3.58k
17433
3.58k
  SDValue Result = DAG.getNode(Opc, dl, VT, Op0, Op1);
17434
3.58k
17435
3.58k
  // If the logical-not of the result is required, perform that now.
17436
3.58k
  if (Invert)
17437
660
    Result = DAG.getNOT(dl, Result, VT);
17438
3.58k
17439
3.58k
  if (MinMax)
17440
98
    Result = DAG.getNode(X86ISD::PCMPEQ, dl, VT, Op0, Result);
17441
3.58k
17442
3.58k
  if (Subus)
17443
44
    Result = DAG.getNode(X86ISD::PCMPEQ, dl, VT, Result,
17444
44
                         getZeroVector(VT, Subtarget, DAG, dl));
17445
7.56k
17446
7.56k
  return Result;
17447
7.56k
}
17448
17449
45.2k
SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
17450
45.2k
17451
45.2k
  MVT VT = Op.getSimpleValueType();
17452
45.2k
17453
45.2k
  if (VT.isVector()) return LowerVSETCC(Op, Subtarget, DAG);
17454
37.7k
17455
45.2k
  assert(VT == MVT::i8 && "SetCC type must be 8-bit integer");
17456
37.7k
  SDValue Op0 = Op.getOperand(0);
17457
37.7k
  SDValue Op1 = Op.getOperand(1);
17458
37.7k
  SDLoc dl(Op);
17459
37.7k
  ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
17460
37.7k
17461
37.7k
  // Optimize to BT if possible.
17462
37.7k
  // Lower (X & (1 << N)) == 0 to BT(X, N).
17463
37.7k
  // Lower ((X >>u N) & 1) != 0 to BT(X, N).
17464
37.7k
  // Lower ((X >>s N) & 1) != 0 to BT(X, N).
17465
37.7k
  if (Op0.getOpcode() == ISD::AND && Op0.hasOneUse() && isNullConstant(Op1) &&
17466
37.7k
      (CC == ISD::SETEQ || CC == ISD::SETNE)) {
17467
2.21k
    if (SDValue NewSetCC = LowerAndToBT(Op0, CC, dl, DAG))
17468
393
      return NewSetCC;
17469
37.3k
  }
17470
37.3k
17471
37.3k
  // Look for X == 0, X == 1, X != 0, or X != 1.  We can simplify some forms of
17472
37.3k
  // these.
17473
37.3k
  if ((isOneConstant(Op1) || isNullConstant(Op1)) &&
17474
37.3k
      (CC == ISD::SETEQ || CC == ISD::SETNE)) {
17475
16.8k
17476
16.8k
    // If the input is a setcc, then reuse the input setcc or use a new one with
17477
16.8k
    // the inverted condition.
17478
16.8k
    if (Op0.getOpcode() == X86ISD::SETCC) {
17479
187
      X86::CondCode CCode = (X86::CondCode)Op0.getConstantOperandVal(0);
17480
187
      bool Invert = (CC == ISD::SETNE) ^ isNullConstant(Op1);
17481
187
      if (!Invert)
17482
0
        return Op0;
17483
187
17484
187
      CCode = X86::GetOppositeBranchCondition(CCode);
17485
187
      SDValue SetCC = getSETCC(CCode, Op0.getOperand(1), dl, DAG);
17486
187
      if (VT == MVT::i1)
17487
0
        return DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, SetCC);
17488
187
      return SetCC;
17489
187
    }
17490
16.8k
  }
17491
37.1k
  if (Op0.getValueType() == MVT::i1 && (CC == ISD::SETEQ || CC == ISD::SETNE)) {
17492
0
    if (isOneConstant(Op1)) {
17493
0
      ISD::CondCode NewCC = ISD::getSetCCInverse(CC, true);
17494
0
      return DAG.getSetCC(dl, VT, Op0, DAG.getConstant(0, dl, MVT::i1), NewCC);
17495
0
    }
17496
0
    if (!isNullConstant(Op1)) {
17497
0
      SDValue Xor = DAG.getNode(ISD::XOR, dl, MVT::i1, Op0, Op1);
17498
0
      return DAG.getSetCC(dl, VT, Xor, DAG.getConstant(0, dl, MVT::i1), CC);
17499
0
    }
17500
37.1k
  }
17501
37.1k
17502
37.1k
  bool IsFP = Op1.getSimpleValueType().isFloatingPoint();
17503
37.1k
  X86::CondCode X86CC = TranslateX86CC(CC, dl, IsFP, Op0, Op1, DAG);
17504
37.1k
  if (X86CC == X86::COND_INVALID)
17505
154
    return SDValue();
17506
36.9k
17507
36.9k
  SDValue EFLAGS = EmitCmp(Op0, Op1, X86CC, dl, DAG);
17508
36.9k
  EFLAGS = ConvertCmpIfNecessary(EFLAGS, DAG);
17509
36.9k
  SDValue SetCC = getSETCC(X86CC, EFLAGS, dl, DAG);
17510
36.9k
  if (VT == MVT::i1)
17511
0
    return DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, SetCC);
17512
36.9k
  return SetCC;
17513
36.9k
}
17514
17515
548
SDValue X86TargetLowering::LowerSETCCCARRY(SDValue Op, SelectionDAG &DAG) const {
17516
548
  SDValue LHS = Op.getOperand(0);
17517
548
  SDValue RHS = Op.getOperand(1);
17518
548
  SDValue Carry = Op.getOperand(2);
17519
548
  SDValue Cond = Op.getOperand(3);
17520
548
  SDLoc DL(Op);
17521
548
17522
548
  assert(LHS.getSimpleValueType().isInteger() && "SETCCCARRY is integer only.");
17523
548
  X86::CondCode CC = TranslateIntegerX86CC(cast<CondCodeSDNode>(Cond)->get());
17524
548
17525
548
  // Recreate the carry if needed.
17526
548
  EVT CarryVT = Carry.getValueType();
17527
548
  APInt NegOne = APInt::getAllOnesValue(CarryVT.getScalarSizeInBits());
17528
548
  Carry = DAG.getNode(X86ISD::ADD, DL, DAG.getVTList(CarryVT, MVT::i32),
17529
548
                      Carry, DAG.getConstant(NegOne, DL, CarryVT));
17530
548
17531
548
  SDVTList VTs = DAG.getVTList(LHS.getValueType(), MVT::i32);
17532
548
  SDValue Cmp = DAG.getNode(X86ISD::SBB, DL, VTs, LHS, RHS, Carry.getValue(1));
17533
548
  SDValue SetCC = getSETCC(CC, Cmp.getValue(1), DL, DAG);
17534
548
  if (Op.getSimpleValueType() == MVT::i1)
17535
0
    return DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, SetCC);
17536
548
  return SetCC;
17537
548
}
17538
17539
/// Return true if opcode is a X86 logical comparison.
17540
35.2k
static bool isX86LogicalCmp(SDValue Op) {
17541
35.2k
  unsigned Opc = Op.getOpcode();
17542
35.2k
  if (Opc == X86ISD::CMP || Opc == X86ISD::COMI || Opc == X86ISD::UCOMI ||
17543
19.6k
      Opc == X86ISD::SAHF)
17544
15.9k
    return true;
17545
19.3k
  if (Op.getResNo() == 1 &&
17546
18.5k
      (Opc == X86ISD::ADD || Opc == X86ISD::SUB || Opc == X86ISD::ADC ||
17547
18.5k
       Opc == X86ISD::SBB || Opc == X86ISD::SMUL ||
17548
18.5k
       Opc == X86ISD::INC || Opc == X86ISD::DEC || Opc == X86ISD::OR ||
17549
18.5k
       Opc == X86ISD::XOR || Opc == X86ISD::AND))
17550
18.5k
    return true;
17551
793
17552
793
  if (Op.getResNo() == 2 && Opc == X86ISD::UMUL)
17553
0
    return true;
17554
793
17555
793
  return false;
17556
793
}
17557
17558
3.74k
static bool isTruncWithZeroHighBitsInput(SDValue V, SelectionDAG &DAG) {
17559
3.74k
  if (V.getOpcode() != ISD::TRUNCATE)
17560
3.54k
    return false;
17561
196
17562
196
  SDValue VOp0 = V.getOperand(0);
17563
196
  unsigned InBits = VOp0.getValueSizeInBits();
17564
196
  unsigned Bits = V.getValueSizeInBits();
17565
196
  return DAG.MaskedValueIsZero(VOp0, APInt::getHighBitsSet(InBits,InBits-Bits));
17566
196
}
17567
17568
5.44k
SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
17569
5.44k
  bool AddTest = true;
17570
5.44k
  SDValue Cond  = Op.getOperand(0);
17571
5.44k
  SDValue Op1 = Op.getOperand(1);
17572
5.44k
  SDValue Op2 = Op.getOperand(2);
17573
5.44k
  SDLoc DL(Op);
17574
5.44k
  MVT VT = Op1.getSimpleValueType();
17575
5.44k
  SDValue CC;
17576
5.44k
17577
5.44k
  // Lower FP selects into a CMP/AND/ANDN/OR sequence when the necessary SSE ops
17578
5.44k
  // are available or VBLENDV if AVX is available.
17579
5.44k
  // Otherwise FP cmovs get lowered into a less efficient branch sequence later.
17580
5.44k
  if (Cond.getOpcode() == ISD::SETCC &&
17581
4.77k
      ((Subtarget.hasSSE2() && (VT == MVT::f32 || VT == MVT::f64)) ||
17582
4.77k
       (Subtarget.hasSSE1() && VT == MVT::f32)) &&
17583
5.44k
      VT == Cond.getOperand(0).getSimpleValueType() && Cond->hasOneUse()) {
17584
525
    SDValue CondOp0 = Cond.getOperand(0), CondOp1 = Cond.getOperand(1);
17585
525
    int SSECC = translateX86FSETCC(
17586
525
        cast<CondCodeSDNode>(Cond.getOperand(2))->get(), CondOp0, CondOp1);
17587
525
17588
525
    if (SSECC != 8) {
17589
521
      if (Subtarget.hasAVX512()) {
17590
32
        SDValue Cmp = DAG.getNode(X86ISD::FSETCCM, DL, MVT::v1i1, CondOp0,
17591
32
                                  CondOp1, DAG.getConstant(SSECC, DL, MVT::i8));
17592
32
        assert(!VT.isVector() && "Not a scalar type?");
17593
32
        return DAG.getNode(X86ISD::SELECTS, DL, VT, Cmp, Op1, Op2);
17594
32
      }
17595
489
17596
489
      SDValue Cmp = DAG.getNode(X86ISD::FSETCC, DL, VT, CondOp0, CondOp1,
17597
489
                                DAG.getConstant(SSECC, DL, MVT::i8));
17598
489
17599
489
      // If we have AVX, we can use a variable vector select (VBLENDV) instead
17600
489
      // of 3 logic instructions for size savings and potentially speed.
17601
489
      // Unfortunately, there is no scalar form of VBLENDV.
17602
489
17603
489
      // If either operand is a constant, don't try this. We can expect to
17604
489
      // optimize away at least one of the logic instructions later in that
17605
489
      // case, so that sequence would be faster than a variable blend.
17606
489
17607
489
      // BLENDV was introduced with SSE 4.1, but the 2 register form implicitly
17608
489
      // uses XMM0 as the selection register. That may need just as many
17609
489
      // instructions as the AND/ANDN/OR sequence due to register moves, so
17610
489
      // don't bother.
17611
489
17612
489
      if (Subtarget.hasAVX() &&
17613
489
          !isa<ConstantFPSDNode>(Op1) && !isa<ConstantFPSDNode>(Op2)) {
17614
54
17615
54
        // Convert to vectors, do a VSELECT, and convert back to scalar.
17616
54
        // All of the conversions should be optimized away.
17617
54
17618
54
        MVT VecVT = VT == MVT::f32 ? MVT::v4f32 : MVT::v2f64;
17619
54
        SDValue VOp1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Op1);
17620
54
        SDValue VOp2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Op2);
17621
54
        SDValue VCmp = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Cmp);
17622
54
17623
54
        MVT VCmpVT = VT == MVT::f32 ? MVT::v4i32 : MVT::v2i64;
17624
54
        VCmp = DAG.getBitcast(VCmpVT, VCmp);
17625
54
17626
54
        SDValue VSel = DAG.getSelect(DL, VecVT, VCmp, VOp1, VOp2);
17627
54
17628
54
        return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT,
17629
54
                           VSel, DAG.getIntPtrConstant(0, DL));
17630
54
      }
17631
435
      SDValue AndN = DAG.getNode(X86ISD::FANDN, DL, VT, Cmp, Op2);
17632
435
      SDValue And = DAG.getNode(X86ISD::FAND, DL, VT, Cmp, Op1);
17633
435
      return DAG.getNode(X86ISD::FOR, DL, VT, AndN, And);
17634
435
    }
17635
525
  }
17636
4.92k
17637
4.92k
  // AVX512 fallback is to lower selects of scalar floats to masked moves.
17638
4.92k
  if ((VT == MVT::f64 || VT == MVT::f32) && Subtarget.hasAVX512()) {
17639
19
    SDValue Cmp = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v1i1, Cond);
17640
19
    return DAG.getNode(X86ISD::SELECTS, DL, VT, Cmp, Op1, Op2);
17641
19
  }
17642
4.90k
17643
4.90k
  if (VT.isVector() && VT.getVectorElementType() == MVT::i1) {
17644
22
    SDValue Op1Scalar;
17645
22
    if (ISD::isBuildVectorOfConstantSDNodes(Op1.getNode()))
17646
4
      Op1Scalar = ConvertI1VectorToInteger(Op1, DAG);
17647
18
    else if (Op1.getOpcode() == ISD::BITCAST && Op1.getOperand(0))
17648
4
      Op1Scalar = Op1.getOperand(0);
17649
22
    SDValue Op2Scalar;
17650
22
    if (ISD::isBuildVectorOfConstantSDNodes(Op2.getNode()))
17651
4
      Op2Scalar = ConvertI1VectorToInteger(Op2, DAG);
17652
18
    else if (Op2.getOpcode() == ISD::BITCAST && Op2.getOperand(0))
17653
4
      Op2Scalar = Op2.getOperand(0);
17654
22
    if (Op1Scalar.getNode() && Op2Scalar.getNode()) {
17655
8
      SDValue newSelect = DAG.getSelect(DL, Op1Scalar.getValueType(), Cond,
17656
8
                                        Op1Scalar, Op2Scalar);
17657
8
      if (newSelect.getValueSizeInBits() == VT.getSizeInBits())
17658
8
        return DAG.getBitcast(VT, newSelect);
17659
0
      SDValue ExtVec = DAG.getBitcast(MVT::v8i1, newSelect);
17660
0
      return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, ExtVec,
17661
0
                         DAG.getIntPtrConstant(0, DL));
17662
0
    }
17663
22
  }
17664
4.89k
17665
4.89k
  if (VT == MVT::v4i1 || VT == MVT::v2i1) {
17666
1
    SDValue zeroConst = DAG.getIntPtrConstant(0, DL);
17667
1
    Op1 = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v8i1,
17668
1
                      DAG.getUNDEF(MVT::v8i1), Op1, zeroConst);
17669
1
    Op2 = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v8i1,
17670
1
                      DAG.getUNDEF(MVT::v8i1), Op2, zeroConst);
17671
1
    SDValue newSelect = DAG.getSelect(DL, MVT::v8i1, Cond, Op1, Op2);
17672
1
    return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, newSelect, zeroConst);
17673
1
  }
17674
4.89k
17675
4.89k
  if (Cond.getOpcode() == ISD::SETCC) {
17676
4.24k
    if (SDValue NewCond = LowerSETCC(Cond, DAG)) {
17677
4.22k
      Cond = NewCond;
17678
4.22k
      // If the condition was updated, it's possible that the operands of the
17679
4.22k
      // select were also updated (for example, EmitTest has a RAUW). Refresh
17680
4.22k
      // the local references to the select operands in case they got stale.
17681
4.22k
      Op1 = Op.getOperand(1);
17682
4.22k
      Op2 = Op.getOperand(2);
17683
4.22k
    }
17684
4.24k
  }
17685
4.89k
17686
4.89k
  // (select (x == 0), -1, y) -> (sign_bit (x - 1)) | y
17687
4.89k
  // (select (x == 0), y, -1) -> ~(sign_bit (x - 1)) | y
17688
4.89k
  // (select (x != 0), y, -1) -> (sign_bit (x - 1)) | y
17689
4.89k
  // (select (x != 0), -1, y) -> ~(sign_bit (x - 1)) | y
17690
4.89k
  // (select (and (x , 0x1) == 0), y, (z ^ y) ) -> (-(and (x , 0x1)) & z ) ^ y
17691
4.89k
  // (select (and (x , 0x1) == 0), y, (z | y) ) -> (-(and (x , 0x1)) & z ) | y
17692
4.89k
  if (Cond.getOpcode() == X86ISD::SETCC &&
17693
4.22k
      Cond.getOperand(1).getOpcode() == X86ISD::CMP &&
17694
4.89k
      isNullConstant(Cond.getOperand(1).getOperand(1))) {
17695
1.32k
    SDValue Cmp = Cond.getOperand(1);
17696
1.32k
    unsigned CondCode =
17697
1.32k
        cast<ConstantSDNode>(Cond.getOperand(0))->getZExtValue();
17698
1.32k
17699
1.32k
    if ((isAllOnesConstant(Op1) || isAllOnesConstant(Op2)) &&
17700
1.32k
        (CondCode == X86::COND_E || CondCode == X86::COND_NE)) {
17701
61
      SDValue Y = isAllOnesConstant(Op2) ? Op1 : Op2;
17702
61
      SDValue CmpOp0 = Cmp.getOperand(0);
17703
61
17704
61
      // Apply further optimizations for special cases
17705
61
      // (select (x != 0), -1, 0) -> neg & sbb
17706
61
      // (select (x == 0), 0, -1) -> neg & sbb
17707
61
      if (isNullConstant(Y) &&
17708
61
          (isAllOnesConstant(Op1) == (CondCode == X86::COND_NE))) {
17709
41
        SDVTList VTs = DAG.getVTList(CmpOp0.getValueType(), MVT::i32);
17710
41
        SDValue Zero = DAG.getConstant(0, DL, CmpOp0.getValueType());
17711
41
        SDValue Neg = DAG.getNode(X86ISD::SUB, DL, VTs, Zero, CmpOp0);
17712
41
        SDValue Res = DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(),
17713
41
                                  DAG.getConstant(X86::COND_B, DL, MVT::i8),
17714
41
                                  SDValue(Neg.getNode(), 1));
17715
41
        return Res;
17716
41
      }
17717
20
17718
20
      Cmp = DAG.getNode(X86ISD::CMP, DL, MVT::i32,
17719
20
                        CmpOp0, DAG.getConstant(1, DL, CmpOp0.getValueType()));
17720
20
      Cmp = ConvertCmpIfNecessary(Cmp, DAG);
17721
20
17722
20
      SDValue Res =   // Res = 0 or -1.
17723
20
        DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(),
17724
20
                    DAG.getConstant(X86::COND_B, DL, MVT::i8), Cmp);
17725
20
17726
20
      if (isAllOnesConstant(Op1) != (CondCode == X86::COND_E))
17727
14
        Res = DAG.getNOT(DL, Res, Res.getValueType());
17728
20
17729
20
      if (!isNullConstant(Op2))
17730
20
        Res = DAG.getNode(ISD::OR, DL, Res.getValueType(), Res, Y);
17731
61
      return Res;
17732
1.25k
    } else if (!Subtarget.hasCMov() && CondCode == X86::COND_E &&
17733
68
               Cmp.getOperand(0).getOpcode() == ISD::AND &&
17734
1.25k
               
               isOneConstant(Cmp.getOperand(0).getOperand(1))) {
17735
8
      SDValue CmpOp0 = Cmp.getOperand(0);
17736
8
      SDValue Src1, Src2;
17737
8
      // true if Op2 is XOR or OR operator and one of its operands
17738
8
      // is equal to Op1
17739
8
      // ( a , a op b) || ( b , a op b)
17740
8
      auto isOrXorPattern = [&]() {
17741
8
        if ((Op2.getOpcode() == ISD::XOR || Op2.getOpcode() == ISD::OR) &&
17742
8
            
            (Op2.getOperand(0) == Op1 || Op2.getOperand(1) == Op1)) {
17743
4
          Src1 =
17744
4
              Op2.getOperand(0) == Op1 ? Op2.getOperand(1) : Op2.getOperand(0);
17745
4
          Src2 = Op1;
17746
4
          return true;
17747
4
        }
17748
4
        return false;
17749
4
      };
17750
8
17751
8
      if (isOrXorPattern()) {
17752
4
        SDValue Neg;
17753
4
        unsigned int CmpSz = CmpOp0.getSimpleValueType().getSizeInBits();
17754
4
        // we need mask of all zeros or ones with same size of the other
17755
4
        // operands.
17756
4
        if (CmpSz > VT.getSizeInBits())
17757
0
          Neg = DAG.getNode(ISD::TRUNCATE, DL, VT, CmpOp0);
17758
4
        else if (CmpSz < VT.getSizeInBits())
17759
3
          Neg = DAG.getNode(ISD::AND, DL, VT,
17760
3
              DAG.getNode(ISD::ANY_EXTEND, DL, VT, CmpOp0.getOperand(0)),
17761
3
              DAG.getConstant(1, DL, VT));
17762
4
        else
17763
1
          Neg = CmpOp0;
17764
4
        SDValue Mask = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT),
17765
4
                                   Neg); // -(and (x, 0x1))
17766
4
        SDValue And = DAG.getNode(ISD::AND, DL, VT, Mask, Src1); // Mask & z
17767
4
        return DAG.getNode(Op2.getOpcode(), DL, VT, And, Src2);  // And Op y
17768
4
      }
17769
4.83k
    }
17770
1.32k
  }
17771
4.83k
17772
4.83k
  // Look past (and (setcc_carry (cmp ...)), 1).
17773
4.83k
  
  if (Cond.getOpcode() == ISD::AND &&
17774
247
      Cond.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY &&
17775
0
      isOneConstant(Cond.getOperand(1)))
17776
0
    Cond = Cond.getOperand(0);
17777
4.83k
17778
4.83k
  // If condition flag is set by a X86ISD::CMP, then use it as the condition
17779
4.83k
  // setting operand in place of the X86ISD::SETCC.
17780
4.83k
  unsigned CondOpcode = Cond.getOpcode();
17781
4.83k
  if (CondOpcode == X86ISD::SETCC ||
17782
4.83k
      
      CondOpcode == X86ISD::SETCC_CARRY) {
17783
4.16k
    CC = Cond.getOperand(0);
17784
4.16k
17785
4.16k
    SDValue Cmp = Cond.getOperand(1);
17786
4.16k
    unsigned Opc = Cmp.getOpcode();
17787
4.16k
    MVT VT = Op.getSimpleValueType();
17788
4.16k
17789
4.16k
    bool IllegalFPCMov = false;
17790
4.16k
    if (VT.isFloatingPoint() && !VT.isVector() &&
17791
951
        !isScalarFPTypeInSSEReg(VT))  // FPStack?
17792
708
      IllegalFPCMov = !hasFPCMov(cast<ConstantSDNode>(CC)->getSExtValue());
17793
4.16k
17794
4.16k
    if ((isX86LogicalCmp(Cmp) && !IllegalFPCMov) ||
17795
4.16k
        
        Opc == X86ISD::BT) { // FIXME
17796
4.10k
      Cond = Cmp;
17797
4.10k
      AddTest = false;
17798
4.10k
    }
17799
4.83k
  } else if (CondOpcode == ISD::USUBO || CondOpcode == ISD::SSUBO ||
17800
666
             
             CondOpcode == ISD::UADDO || CondOpcode == ISD::SADDO ||
17801
638
             
             ((CondOpcode == ISD::UMULO || CondOpcode == ISD::SMULO) &&
17802
666
              
              Cond.getOperand(0).getValueType() != MVT::i8)) {
17803
43
    SDValue LHS = Cond.getOperand(0);
17804
43
    SDValue RHS = Cond.getOperand(1);
17805
43
    unsigned X86Opcode;
17806
43
    unsigned X86Cond;
17807
43
    SDVTList VTs;
17808
43
    switch (CondOpcode) {
17809
7
    case ISD::UADDO: X86Opcode = X86ISD::ADD; X86Cond = X86::COND_B; break;
17810
7
    case ISD::SADDO: X86Opcode = X86ISD::ADD; X86Cond = X86::COND_O; break;
17811
7
    case ISD::USUBO: X86Opcode = X86ISD::SUB; X86Cond = X86::COND_B; break;
17812
7
    case ISD::SSUBO: X86Opcode = X86ISD::SUB; X86Cond = X86::COND_O; break;
17813
9
    case ISD::UMULO: X86Opcode = X86ISD::UMUL; X86Cond = X86::COND_O; break;
17814
6
    case ISD::SMULO: X86Opcode = X86ISD::SMUL; X86Cond = X86::COND_O; break;
17815
0
    
    default: llvm_unreachable("unexpected overflowing operator");
17816
43
    }
17817
43
    
    if (CondOpcode == ISD::UMULO)
17818
9
      VTs = DAG.getVTList(LHS.getValueType(), LHS.getValueType(),
17819
9
                          MVT::i32);
17820
43
    else
17821
34
      VTs = DAG.getVTList(LHS.getValueType(), MVT::i32);
17822
43
17823
43
    SDValue X86Op = DAG.getNode(X86Opcode, DL, VTs, LHS, RHS);
17824
43
17825
43
    if (CondOpcode == ISD::UMULO)
17826
9
      Cond = X86Op.getValue(2);
17827
43
    else
17828
34
      Cond = X86Op.getValue(1);
17829
666
17830
666
    CC = DAG.getConstant(X86Cond, DL, MVT::i8);
17831
666
    AddTest = false;
17832
666
  }
17833
4.83k
17834
4.83k
  
  if (AddTest) {
17835
678
    // Look past the truncate if the high bits are known zero.
17836
678
    if (isTruncWithZeroHighBitsInput(Cond, DAG))
17837
128
      Cond = Cond.getOperand(0);
17838
678
17839
678
    // We know the result of AND is compared against zero. Try to match
17840
678
    // it to BT.
17841
678
    if (Cond.getOpcode() == ISD::AND && Cond.hasOneUse()) {
17842
174
      if (SDValue NewSetCC = LowerAndToBT(Cond, ISD::SETNE, DL, DAG)) {
17843
80
        CC = NewSetCC.getOperand(0);
17844
80
        Cond = NewSetCC.getOperand(1);
17845
80
        AddTest = false;
17846
80
      }
17847
174
    }
17848
678
  }
17849
4.83k
17850
4.83k
  if (AddTest) {
17851
598
    CC = DAG.getConstant(X86::COND_NE, DL, MVT::i8);
17852
598
    Cond = EmitTest(Cond, X86::COND_NE, DL, DAG);
17853
598
  }
17854
4.83k
17855
4.83k
  // a <  b ? -1 :  0 -> RES = ~setcc_carry
17856
4.83k
  // a <  b ?  0 : -1 -> RES = setcc_carry
17857
4.83k
  // a >= b ? -1 :  0 -> RES = setcc_carry
17858
4.83k
  // a >= b ?  0 : -1 -> RES = ~setcc_carry
17859
4.83k
  if (Cond.getOpcode() == X86ISD::SUB) {
17860
1.98k
    Cond = ConvertCmpIfNecessary(Cond, DAG);
17861
1.98k
    unsigned CondCode = cast<ConstantSDNode>(CC)->getZExtValue();
17862
1.98k
17863
1.98k
    if ((CondCode == X86::COND_AE || CondCode == X86::COND_B) &&
17864
504
        
        (isAllOnesConstant(Op1) || isAllOnesConstant(Op2)) &&
17865
1.98k
        
        (isNullConstant(Op1) || isNullConstant(Op2))) {
17866
0
      SDValue Res = DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(),
17867
0
                                DAG.getConstant(X86::COND_B, DL, MVT::i8),
17868
0
                                Cond);
17869
0
      if (isAllOnesConstant(Op1) != (CondCode == X86::COND_B))
17870
0
        return DAG.getNOT(DL, Res, Res.getValueType());
17871
0
      return Res;
17872
0
    }
17873
1.98k
  }
17874
4.83k
17875
4.83k
  // X86 doesn't have an i8 cmov. If both operands are the result of a truncate
17876
4.83k
  // widen the cmov and push the truncate through. This avoids introducing a new
17877
4.83k
  // branch during isel and doesn't add any extensions.
17878
4.83k
  
  if (Op.getValueType() == MVT::i8 &&
17879
4.83k
      
      Op1.getOpcode() == ISD::TRUNCATE && Op2.getOpcode() == ISD::TRUNCATE) {
17880
10
    SDValue T1 = Op1.getOperand(0), T2 = Op2.getOperand(0);
17881
10
    if (T1.getValueType() == T2.getValueType() &&
17882
10
        // Blacklist CopyFromReg to avoid partial register stalls.
17883
10
        
        T1.getOpcode() != ISD::CopyFromReg && T2.getOpcode()!=ISD::CopyFromReg){
17884
3
      SDVTList VTs = DAG.getVTList(T1.getValueType(), MVT::Glue);
17885
3
      SDValue Cmov = DAG.getNode(X86ISD::CMOV, DL, VTs, T2, T1, CC, Cond);
17886
3
      return DAG.getNode(ISD::TRUNCATE, DL, Op.getValueType(), Cmov);
17887
3
    }
17888
4.82k
  }
17889
4.82k
17890
4.82k
  // X86ISD::CMOV means set the result (which is operand 1) to the RHS if
17891
4.82k
  // condition is true.
17892
4.82k
  SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::Glue);
17893
4.82k
  SDValue Ops[] = { Op2, Op1, CC, Cond };
17894
4.82k
  return DAG.getNode(X86ISD::CMOV, DL, VTs, Ops);
17895
4.82k
}
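// A minimal standalone sketch (not part of the listed source): the
// "(select (x == 0), -1, y) -> (sign_bit (x - 1)) | y" folds above rely on the
// borrow of "x - 1" being set exactly when x == 0; SETCC_CARRY(COND_B) then
// broadcasts that borrow into an all-zeros/all-ones mask that is OR'd with y.
// Checking the identity for 32-bit unsigned x:
#include <cassert>
#include <cstdint>
int main() {
  for (uint32_t x = 0; x < 256; ++x)
    for (uint32_t y = 0; y < 256; ++y) {
      uint32_t selected = (x == 0) ? 0xFFFFFFFFu : y;    // the original select
      uint32_t borrowMask = (x < 1) ? 0xFFFFFFFFu : 0u;  // what SBB materializes
      assert((borrowMask | y) == selected);
    }
  return 0;
}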
17896
17897
static SDValue LowerSIGN_EXTEND_AVX512(SDValue Op,
17898
                                       const X86Subtarget &Subtarget,
17899
3.35k
                                       SelectionDAG &DAG) {
17900
3.35k
  MVT VT = Op->getSimpleValueType(0);
17901
3.35k
  SDValue In = Op->getOperand(0);
17902
3.35k
  MVT InVT = In.getSimpleValueType();
17903
3.35k
  MVT VTElt = VT.getVectorElementType();
17904
3.35k
  MVT InVTElt = InVT.getVectorElementType();
17905
3.35k
  SDLoc dl(Op);
17906
3.35k
17907
3.35k
  // SKX processor
17908
3.35k
  if ((InVTElt == MVT::i1) &&
17909
1.87k
      
      (((Subtarget.hasBWI() && VTElt.getSizeInBits() <= 16)) ||
17910
1.87k
17911
1.21k
       
       ((Subtarget.hasDQI() && VTElt.getSizeInBits() >= 32))))
17912
3.35k
17913
754
    return DAG.getNode(X86ISD::VSEXT, dl, VT, In);
17914
2.59k
17915
2.59k
  unsigned NumElts = VT.getVectorNumElements();
17916
2.59k
17917
2.59k
  if (VT.is512BitVector() && InVTElt != MVT::i1 &&
17918
2.59k
      
      (NumElts == 8 || NumElts == 16 || Subtarget.hasBWI())) {
17919
1.48k
    if (In.getOpcode() == X86ISD::VSEXT || In.getOpcode() == X86ISD::VZEXT)
17920
0
      return getExtendInVec(In.getOpcode(), dl, VT, In.getOperand(0), DAG);
17921
1.48k
    return getExtendInVec(X86ISD::VSEXT, dl, VT, In, DAG);
17922
1.48k
  }
17923
1.11k
17924
1.11k
  
  if (InVTElt != MVT::i1)
17925
0
    return SDValue();
17926
1.11k
17927
1.11k
  MVT ExtVT = VT;
17928
1.11k
  if (!VT.is512BitVector() && !Subtarget.hasVLX())
17929
164
    ExtVT = MVT::getVectorVT(MVT::getIntegerVT(512/NumElts), NumElts);
17930
1.11k
17931
1.11k
  SDValue V;
17932
1.11k
  if (Subtarget.hasDQI()) {
17933
23
    V = getExtendInVec(X86ISD::VSEXT, dl, ExtVT, In, DAG);
17934
23
    assert(!VT.is512BitVector() && "Unexpected vector type");
17935
1.11k
  } else {
17936
1.09k
    SDValue NegOne = getOnesVector(ExtVT, DAG, dl);
17937
1.09k
    SDValue Zero = getZeroVector(ExtVT, Subtarget, DAG, dl);
17938
1.09k
    V = DAG.getSelect(dl, ExtVT, In, NegOne, Zero);
17939
1.09k
    if (ExtVT == VT)
17940
952
      return V;
17941
164
  }
17942
164
17943
164
  return DAG.getNode(X86ISD::VTRUNC, dl, VT, V);
17944
164
}
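// Illustrative sketch (not from the listed source): when DQI is unavailable the
// code above sign-extends a vXi1 mask by selecting between an all-ones and an
// all-zeros vector, i.e. per lane sign_extend(i1 b) == (b ? -1 : 0). A scalar
// model of that per-lane equivalence:
#include <cassert>
#include <cstdint>
int main() {
  for (int b = 0; b <= 1; ++b) {
    int32_t viaSelect = b ? -1 : 0;               // select of NegOne / Zero
    int32_t viaNegate = -static_cast<int32_t>(b); // equivalent sign extension
    assert(viaSelect == viaNegate);
  }
  return 0;
}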
17945
17946
// Lowering for SIGN_EXTEND_VECTOR_INREG and ZERO_EXTEND_VECTOR_INREG.
17947
// For sign extend this needs to handle all vector sizes and SSE4.1 and
17948
// non-SSE4.1 targets. For zero extend this should only handle inputs of
17949
// MVT::v64i8 when BWI is not supported, but AVX512 is.
17950
static SDValue LowerEXTEND_VECTOR_INREG(SDValue Op,
17951
                                        const X86Subtarget &Subtarget,
17952
434
                                        SelectionDAG &DAG) {
17953
434
  SDValue In = Op->getOperand(0);
17954
434
  MVT VT = Op->getSimpleValueType(0);
17955
434
  MVT InVT = In.getSimpleValueType();
17956
434
  assert(VT.getSizeInBits() == InVT.getSizeInBits());
17957
434
17958
434
  MVT SVT = VT.getVectorElementType();
17959
434
  MVT InSVT = InVT.getVectorElementType();
17960
434
  assert(SVT.getSizeInBits() > InSVT.getSizeInBits());
17961
434
17962
434
  if (SVT != MVT::i64 && SVT != MVT::i32 && SVT != MVT::i16)
17963
0
    return SDValue();
17964
434
  
  if (InSVT != MVT::i32 && InSVT != MVT::i16 && InSVT != MVT::i8)
17965
0
    return SDValue();
17966
434
  
  if (!(VT.is128BitVector() && Subtarget.hasSSE2()) &&
17967
107
      
      !(VT.is256BitVector() && Subtarget.hasInt256()) &&
17968
8
      
      !(VT.is512BitVector() && Subtarget.hasAVX512()))
17969
0
    return SDValue();
17970
434
17971
434
  SDLoc dl(Op);
17972
434
17973
434
  // For 256-bit vectors, we only need the lower (128-bit) half of the input.
17974
434
  // For 512-bit vectors, we need 128-bits or 256-bits.
17975
434
  if (VT.getSizeInBits() > 128) {
17976
107
    // Input needs to be at least the same number of elements as output, and
17977
107
    // at least 128-bits.
17978
107
    int InSize = InSVT.getSizeInBits() * VT.getVectorNumElements();
17979
107
    In = extractSubVector(In, 0, DAG, dl, std::max(InSize, 128));
17980
107
  }
17981
434
17982
434
  assert((Op.getOpcode() != ISD::ZERO_EXTEND_VECTOR_INREG ||
17983
434
          InVT == MVT::v64i8) && "Zero extend only for v64i8 input!");
17984
434
17985
434
  // SSE41 targets can use the pmovsx* instructions directly for 128-bit results,
17986
434
  // so are legal and shouldn't occur here. AVX2/AVX512 pmovsx* instructions still
17987
434
  // need to be handled here for 256/512-bit results.
17988
434
  if (Subtarget.hasInt256()) {
17989
107
    assert(VT.getSizeInBits() > 128 && "Unexpected 128-bit vector extension");
17990
107
    unsigned ExtOpc = Op.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG ?
17991
107
                        
                        X86ISD::VSEXT : X86ISD::VZEXT;
17992
107
    return DAG.getNode(ExtOpc, dl, VT, In);
17993
107
  }
17994
327
17995
327
  // We should only get here for sign extend.
17996
434
  assert(Op.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG &&
17997
327
         "Unexpected opcode!");
17998
327
17999
327
  // pre-SSE41 targets unpack lower lanes and then sign-extend using SRAI.
18000
327
  SDValue Curr = In;
18001
327
  MVT CurrVT = InVT;
18002
327
18003
327
  // As SRAI is only available on i16/i32 types, we expand only up to i32
18004
327
  // and handle i64 separately.
18005
785
  while (CurrVT != VT && CurrVT.getVectorElementType() != MVT::i32) {
18006
458
    Curr = DAG.getNode(X86ISD::UNPCKL, dl, CurrVT, DAG.getUNDEF(CurrVT), Curr);
18007
458
    MVT CurrSVT = MVT::getIntegerVT(CurrVT.getScalarSizeInBits() * 2);
18008
458
    CurrVT = MVT::getVectorVT(CurrSVT, CurrVT.getVectorNumElements() / 2);
18009
458
    Curr = DAG.getBitcast(CurrVT, Curr);
18010
458
  }
18011
327
18012
327
  SDValue SignExt = Curr;
18013
327
  if (CurrVT != InVT) {
18014
291
    unsigned SignExtShift =
18015
291
        CurrVT.getScalarSizeInBits() - InSVT.getSizeInBits();
18016
291
    SignExt = DAG.getNode(X86ISD::VSRAI, dl, CurrVT, Curr,
18017
291
                          DAG.getConstant(SignExtShift, dl, MVT::i8));
18018
291
  }
18019
327
18020
327
  if (CurrVT == VT)
18021
207
    return SignExt;
18022
120
18023
120
  
  if (VT == MVT::v2i64 && CurrVT == MVT::v4i32) {
18024
120
    SDValue Sign = DAG.getNode(X86ISD::VSRAI, dl, CurrVT, Curr,
18025
120
                               DAG.getConstant(31, dl, MVT::i8));
18026
120
    SDValue Ext = DAG.getVectorShuffle(CurrVT, dl, SignExt, Sign, {0, 4, 1, 5});
18027
120
    return DAG.getBitcast(VT, Ext);
18028
120
  }
18029
0
18030
0
  return SDValue();
18031
0
}
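// Illustrative sketch (not from the listed source): the pre-SSE4.1 path above
// sign-extends by unpacking each narrow element into the high half of a wider
// lane and then arithmetic-shifting it back down (UNPCKL + VSRAI). A scalar
// model of the i8 -> i16 case, assuming two's-complement arithmetic shifts:
#include <cassert>
#include <cstdint>
int main() {
  for (int v = -128; v <= 127; ++v) {
    uint16_t highHalf =
        static_cast<uint16_t>(static_cast<uint8_t>(v)) << 8;   // "unpack" into the high half
    int16_t extended =
        static_cast<int16_t>(static_cast<int16_t>(highHalf) >> 8); // SRAI by 8 recovers the sign
    assert(extended == static_cast<int16_t>(v));
  }
  return 0;
}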
18032
18033
static SDValue LowerSIGN_EXTEND(SDValue Op, const X86Subtarget &Subtarget,
18034
3.62k
                                SelectionDAG &DAG) {
18035
3.62k
  MVT VT = Op->getSimpleValueType(0);
18036
3.62k
  SDValue In = Op->getOperand(0);
18037
3.62k
  MVT InVT = In.getSimpleValueType();
18038
3.62k
  SDLoc dl(Op);
18039
3.62k
18040
3.62k
  if (VT.is512BitVector() || InVT.getVectorElementType() == MVT::i1)
18041
3.35k
    return LowerSIGN_EXTEND_AVX512(Op, Subtarget, DAG);
18042
272
18043
272
  
  if ((VT != MVT::v4i64 || InVT != MVT::v4i32) &&
18044
212
      
      (VT != MVT::v8i32 || InVT != MVT::v8i16) &&
18045
140
      
      (VT != MVT::v16i16 || InVT != MVT::v16i8))
18046
0
    return SDValue();
18047
272
18048
272
  
  if (Subtarget.hasInt256())
18049
262
    return DAG.getNode(X86ISD::VSEXT, dl, VT, In);
18050
10
18051
10
  // Optimize vectors in AVX mode
18052
10
  // Sign extend  v8i16 to v8i32 and
18053
10
  //              v4i32 to v4i64
18054
10
  //
18055
10
  // Divide input vector into two parts
18056
10
  // for v4i32 the shuffle mask will be { 0, 1, -1, -1} {2, 3, -1, -1}
18057
10
  // use vpmovsx instruction to extend v4i32 -> v2i64; v8i16 -> v4i32
18058
10
  // concat the vectors to original VT
18059
10
18060
10
  unsigned NumElems = InVT.getVectorNumElements();
18061
10
  SDValue Undef = DAG.getUNDEF(InVT);
18062
10
18063
10
  SmallVector<int,8> ShufMask1(NumElems, -1);
18064
36
  for (unsigned i = 0; i != NumElems/2; ++i)
18065
26
    ShufMask1[i] = i;
18066
10
18067
10
  SDValue OpLo = DAG.getVectorShuffle(InVT, dl, In, Undef, ShufMask1);
18068
10
18069
10
  SmallVector<int,8> ShufMask2(NumElems, -1);
18070
36
  for (unsigned i = 0; i != NumElems/2; ++i)
18071
26
    ShufMask2[i] = i + NumElems/2;
18072
3.62k
18073
3.62k
  SDValue OpHi = DAG.getVectorShuffle(InVT, dl, In, Undef, ShufMask2);
18074
3.62k
18075
3.62k
  MVT HalfVT = MVT::getVectorVT(VT.getVectorElementType(),
18076
3.62k
                                VT.getVectorNumElements() / 2);
18077
3.62k
18078
3.62k
  OpLo = DAG.getSignExtendVectorInReg(OpLo, dl, HalfVT);
18079
3.62k
  OpHi = DAG.getSignExtendVectorInReg(OpHi, dl, HalfVT);
18080
3.62k
18081
3.62k
  return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi);
18082
3.62k
}
18083
18084
// Lower truncating store. We need a special lowering to vXi1 vectors
18085
static SDValue LowerTruncatingStore(SDValue StOp, const X86Subtarget &Subtarget,
18086
819
                                    SelectionDAG &DAG) {
18087
819
  StoreSDNode *St = cast<StoreSDNode>(StOp.getNode());
18088
819
  SDLoc dl(St);
18089
819
  EVT MemVT = St->getMemoryVT();
18090
819
  assert(St->isTruncatingStore() && "We only custom truncating store.");
18091
819
  assert(MemVT.isVector() && MemVT.getVectorElementType() == MVT::i1 &&
18092
819
         "Expected truncstore of i1 vector");
18093
819
18094
819
  SDValue Op = St->getValue();
18095
819
  MVT OpVT = Op.getValueType().getSimpleVT();
18096
819
  unsigned NumElts = OpVT.getVectorNumElements();
18097
819
  if ((Subtarget.hasVLX() && Subtarget.hasBWI() && Subtarget.hasDQI()) ||
18098
819
      
      NumElts == 16) {
18099
0
    // Truncate and store - everything is legal
18100
0
    Op = DAG.getNode(ISD::TRUNCATE, dl, MemVT, Op);
18101
0
    if (MemVT.getSizeInBits() < 8)
18102
0
      Op = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v8i1,
18103
0
                       DAG.getUNDEF(MVT::v8i1), Op,
18104
0
                       DAG.getIntPtrConstant(0, dl));
18105
0
    return DAG.getStore(St->getChain(), dl, Op, St->getBasePtr(),
18106
0
                        St->getMemOperand());
18107
0
  }
18108
819
18109
819
  // A subset, assume that we have only AVX-512F
18110
819
  
  if (NumElts <= 8) {
18111
42
    if (NumElts < 8) {
18112
42
      // Extend to 8-elts vector
18113
42
      MVT ExtVT = MVT::getVectorVT(OpVT.getScalarType(), 8);
18114
42
      Op = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ExtVT,
18115
42
                        DAG.getUNDEF(ExtVT), Op, DAG.getIntPtrConstant(0, dl));
18116
42
    }
18117
42
    Op = DAG.getNode(ISD::TRUNCATE, dl, MVT::v8i1, Op);
18118
42
    return DAG.getStore(St->getChain(), dl, Op, St->getBasePtr(),
18119
42
                        St->getMemOperand());
18120
42
  }
18121
777
  // v32i8
18122
819
  assert(OpVT == MVT::v32i8 && "Unexpected operand type");
18123
777
  // Divide the vector into 2 parts and store each part separately
18124
777
  SDValue Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v16i8, Op,
18125
777
                            DAG.getIntPtrConstant(0, dl));
18126
777
  Lo = DAG.getNode(ISD::TRUNCATE, dl, MVT::v16i1, Lo);
18127
777
  SDValue BasePtr = St->getBasePtr();
18128
777
  SDValue StLo = DAG.getStore(St->getChain(), dl, Lo, BasePtr,
18129
777
                              St->getMemOperand());
18130
777
  SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v16i8, Op,
18131
777
                            DAG.getIntPtrConstant(16, dl));
18132
777
  Hi = DAG.getNode(ISD::TRUNCATE, dl, MVT::v16i1, Hi);
18133
777
18134
777
  SDValue BasePtrHi =
18135
777
    DAG.getNode(ISD::ADD, dl, BasePtr.getValueType(), BasePtr,
18136
777
                DAG.getConstant(2, dl, BasePtr.getValueType()));
18137
777
18138
777
  SDValue StHi = DAG.getStore(St->getChain(), dl, Hi,
18139
777
                              BasePtrHi, St->getMemOperand());
18140
777
  return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, StLo, StHi);
18141
777
}
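// Illustrative sketch (not from the listed source): for v32i8 the truncating
// store above packs the lanes into two v16i1 halves and writes the upper half
// 2 bytes past the base pointer. A standalone model of that memory layout,
// assuming a little-endian target as on x86:
#include <cassert>
#include <cstdint>
#include <cstring>
int main() {
  uint8_t lanes[32];
  for (int i = 0; i < 32; ++i) lanes[i] = (i % 3 == 0) ? 1 : 0;

  uint8_t mem[4] = {};
  uint16_t lo = 0, hi = 0;
  for (int i = 0; i < 16; ++i) {
    lo |= static_cast<uint16_t>(lanes[i] & 1) << i;
    hi |= static_cast<uint16_t>(lanes[16 + i] & 1) << i;
  }
  std::memcpy(mem, &lo, 2);     // store of the low v16i1 at BasePtr
  std::memcpy(mem + 2, &hi, 2); // store of the high v16i1 at BasePtr + 2

  for (int i = 0; i < 32; ++i)  // every lane ends up as bit i of the 32-bit mask
    assert(((mem[i / 8] >> (i % 8)) & 1) == (lanes[i] & 1));
  return 0;
}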
18142
18143
static SDValue LowerExtended1BitVectorLoad(SDValue Op,
18144
                                           const X86Subtarget &Subtarget,
18145
250
                                           SelectionDAG &DAG) {
18146
250
18147
250
  LoadSDNode *Ld = cast<LoadSDNode>(Op.getNode());
18148
250
  SDLoc dl(Ld);
18149
250
  EVT MemVT = Ld->getMemoryVT();
18150
250
  assert(MemVT.isVector() && MemVT.getScalarType() == MVT::i1 &&
18151
250
         "Expected i1 vector load");
18152
250
  unsigned ExtOpcode = Ld->getExtensionType() == ISD::ZEXTLOAD ?
18153
250
    
    ISD::ZERO_EXTEND : ISD::SIGN_EXTEND;
18154
250
  MVT VT = Op.getValueType().getSimpleVT();
18155
250
  unsigned NumElts = VT.getVectorNumElements();
18156
250
18157
250
  if ((Subtarget.hasBWI() && NumElts >= 32) ||
18158
236
      
      (Subtarget.hasDQI() && NumElts < 16) ||
18159
250
      
      NumElts == 16) {
18160
46
    // Load and extend - everything is legal
18161
46
    if (NumElts < 8) {
18162
4
      SDValue Load = DAG.getLoad(MVT::v8i1, dl, Ld->getChain(),
18163
4
                                 Ld->getBasePtr(),
18164
4
                                 Ld->getMemOperand());
18165
4
      // Replace chain users with the new chain.
18166
4
      assert(Load->getNumValues() == 2 && "Loads must carry a chain!");
18167
4
      DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), Load.getValue(1));
18168
4
      MVT ExtVT = MVT::getVectorVT(VT.getScalarType(), 8);
18169
4
      SDValue ExtVec = DAG.getNode(ExtOpcode, dl, ExtVT, Load);
18170
4
18171
4
      return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, ExtVec,
18172
4
                                   DAG.getIntPtrConstant(0, dl));
18173
4
    }
18174
42
    SDValue Load = DAG.getLoad(MemVT, dl, Ld->getChain(),
18175
42
                               Ld->getBasePtr(),
18176
42
                               Ld->getMemOperand());
18177
42
    // Replace chain users with the new chain.
18178
42
    assert(Load->getNumValues() == 2 && "Loads must carry a chain!");
18179
42
    DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), Load.getValue(1));
18180
42
18181
42
    // Finally, do a normal sign-extend to the desired register.
18182
42
    return DAG.getNode(ExtOpcode, dl, Op.getValueType(), Load);
18183
42
  }
18184
204
18185
204
  
  if (NumElts <= 8) {
18186
126
    // A subset, assume that we have only AVX-512F
18187
126
    unsigned NumBitsToLoad = 8;
18188
126
    MVT TypeToLoad = MVT::getIntegerVT(NumBitsToLoad);
18189
126
    SDValue Load = DAG.getLoad(TypeToLoad, dl, Ld->getChain(),
18190
126
                              Ld->getBasePtr(),
18191
126
                              Ld->getMemOperand());
18192
126
    // Replace chain users with the new chain.
18193
126
    assert(Load->getNumValues() == 2 && "Loads must carry a chain!");
18194
126
    DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), Load.getValue(1));
18195
126
18196
126
    MVT MaskVT = MVT::getVectorVT(MVT::i1, NumBitsToLoad);
18197
126
    SDValue BitVec = DAG.getBitcast(MaskVT, Load);
18198
126
18199
126
    if (NumElts == 8)
18200
12
      return DAG.getNode(ExtOpcode, dl, VT, BitVec);
18201
114
18202
114
      // We still need to take care of v4i1 and v2i1 here.
18203
114
18204
114
    MVT ExtVT = MVT::getVectorVT(VT.getScalarType(), 8);
18205
114
    SDValue ExtVec = DAG.getNode(ExtOpcode, dl, ExtVT, BitVec);
18206
114
    return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, ExtVec,
18207
114
                        DAG.getIntPtrConstant(0, dl));
18208
114
  }
18209
78
18210
204
  assert(VT == MVT::v32i8 && "Unexpected extload type");
18211
78
18212
78
  SmallVector<SDValue, 2> Chains;
18213
78
18214
78
  SDValue BasePtr = Ld->getBasePtr();
18215
78
  SDValue LoadLo = DAG.getLoad(MVT::v16i1, dl, Ld->getChain(),
18216
78
                               Ld->getBasePtr(),
18217
78
                               Ld->getMemOperand());
18218
78
  Chains.push_back(LoadLo.getValue(1));
18219
78
18220
78
  SDValue BasePtrHi =
18221
78
    DAG.getNode(ISD::ADD, dl, BasePtr.getValueType(), BasePtr,
18222
78
                DAG.getConstant(2, dl, BasePtr.getValueType()));
18223
78
18224
78
  SDValue LoadHi = DAG.getLoad(MVT::v16i1, dl, Ld->getChain(),
18225
78
                               BasePtrHi,
18226
78
                               Ld->getMemOperand());
18227
78
  Chains.push_back(LoadHi.getValue(1));
18228
78
  SDValue NewChain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Chains);
18229
78
  DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), NewChain);
18230
78
18231
78
  SDValue Lo = DAG.getNode(ExtOpcode, dl, MVT::v16i8, LoadLo);
18232
78
  SDValue Hi = DAG.getNode(ExtOpcode, dl, MVT::v16i8, LoadHi);
18233
78
  return DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v32i8, Lo, Hi);
18234
78
}
18235
18236
// Lower vector extended loads using a shuffle. If SSSE3 is not available we
18237
// may emit an illegal shuffle but the expansion is still better than scalar
18238
// code. We generate X86ISD::VSEXT for SEXTLOADs if it's available, otherwise
18239
// we'll emit a shuffle and an arithmetic shift.
18240
// FIXME: Is the expansion actually better than scalar code? It doesn't seem so.
18241
// TODO: It is possible to support ZExt by zeroing the undef values during
18242
// the shuffle phase or after the shuffle.
18243
static SDValue LowerExtendedLoad(SDValue Op, const X86Subtarget &Subtarget,
18244
1.96k
                                 SelectionDAG &DAG) {
18245
1.96k
  MVT RegVT = Op.getSimpleValueType();
18246
1.96k
  assert(RegVT.isVector() && "We only custom lower vector sext loads.");
18247
1.96k
  assert(RegVT.isInteger() &&
18248
1.96k
         "We only custom lower integer vector sext loads.");
18249
1.96k
18250
1.96k
  // Nothing useful we can do without SSE2 shuffles.
18251
1.96k
  assert(Subtarget.hasSSE2() && "We only custom lower sext loads with SSE2.");
18252
1.96k
18253
1.96k
  LoadSDNode *Ld = cast<LoadSDNode>(Op.getNode());
18254
1.96k
  SDLoc dl(Ld);
18255
1.96k
  EVT MemVT = Ld->getMemoryVT();
18256
1.96k
  if (MemVT.getScalarType() == MVT::i1)
18257
250
    return LowerExtended1BitVectorLoad(Op, Subtarget, DAG);
18258
1.71k
18259
1.71k
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
18260
1.71k
  unsigned RegSz = RegVT.getSizeInBits();
18261
1.71k
18262
1.71k
  ISD::LoadExtType Ext = Ld->getExtensionType();
18263
1.71k
18264
1.71k
  assert((Ext == ISD::EXTLOAD || Ext == ISD::SEXTLOAD)
18265
1.71k
         && "Only anyext and sext are currently implemented.");
18266
1.71k
  assert(MemVT != RegVT && "Cannot extend to the same type");
18267
1.71k
  assert(MemVT.isVector() && "Must load a vector from memory");
18268
1.71k
18269
1.71k
  unsigned NumElems = RegVT.getVectorNumElements();
18270
1.71k
  unsigned MemSz = MemVT.getSizeInBits();
18271
1.71k
  assert(RegSz > MemSz && "Register size must be greater than the mem size");
18272
1.71k
18273
1.71k
  if (Ext == ISD::SEXTLOAD && RegSz == 256 && !Subtarget.hasInt256()) {
18274
20
    // The only way in which we have a legal 256-bit vector result but not the
18275
20
    // integer 256-bit operations needed to directly lower a sextload is if we
18276
20
    // have AVX1 but not AVX2. In that case, we can always emit a sextload to
18277
20
    // a 128-bit vector and a normal sign_extend to 256-bits that should get
18278
20
    // correctly legalized. We do this late to allow the canonical form of
18279
20
    // sextload to persist throughout the rest of the DAG combiner -- it wants
18280
20
    // to fold together any extensions it can, and so will fuse a sign_extend
18281
20
    // of an sextload into a sextload targeting a wider value.
18282
20
    SDValue Load;
18283
20
    if (MemSz == 128) {
18284
0
      // Just switch this to a normal load.
18285
0
      assert(TLI.isTypeLegal(MemVT) && "If the memory type is a 128-bit type, "
18286
0
                                       "it must be a legal 128-bit vector "
18287
0
                                       "type!");
18288
0
      Load = DAG.getLoad(MemVT, dl, Ld->getChain(), Ld->getBasePtr(),
18289
0
                         Ld->getPointerInfo(), Ld->getAlignment(),
18290
0
                         Ld->getMemOperand()->getFlags());
18291
20
    } else {
18292
20
      assert(MemSz < 128 &&
18293
20
             "Can't extend a type wider than 128 bits to a 256 bit vector!");
18294
20
      // Do an sext load to a 128-bit vector type. We want to use the same
18295
20
      // number of elements, but elements half as wide. This will end up being
18296
20
      // recursively lowered by this routine, but will succeed as we definitely
18297
20
      // have all the necessary features if we're using AVX1.
18298
20
      EVT HalfEltVT =
18299
20
          EVT::getIntegerVT(*DAG.getContext(), RegVT.getScalarSizeInBits() / 2);
18300
20
      EVT HalfVecVT = EVT::getVectorVT(*DAG.getContext(), HalfEltVT, NumElems);
18301
20
      Load =
18302
20
          DAG.getExtLoad(Ext, dl, HalfVecVT, Ld->getChain(), Ld->getBasePtr(),
18303
20
                         Ld->getPointerInfo(), MemVT, Ld->getAlignment(),
18304
20
                         Ld->getMemOperand()->getFlags());
18305
20
    }
18306
20
18307
20
    // Replace chain users with the new chain.
18308
20
    assert(Load->getNumValues() == 2 && "Loads must carry a chain!");
18309
20
    DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), Load.getValue(1));
18310
20
18311
20
    // Finally, do a normal sign-extend to the desired register.
18312
20
    return DAG.getSExtOrTrunc(Load, dl, RegVT);
18313
20
  }
18314
1.69k
18315
1.69k
  // All sizes must be a power of two.
18316
1.71k
  assert(isPowerOf2_32(RegSz * MemSz * NumElems) &&
18317
1.69k
         "Non-power-of-two elements are not custom lowered!");
18318
1.69k
18319
1.69k
  // Attempt to load the original value using scalar loads.
18320
1.69k
  // Find the largest scalar type that divides the total loaded size.
18321
1.69k
  MVT SclrLoadTy = MVT::i8;
18322
10.1k
  for (MVT Tp : MVT::integer_valuetypes()) {
18323
10.1k
    if (TLI.isTypeLegal(Tp) && ((MemSz % Tp.getSizeInBits()) == 0)) {
18324
5.41k
      SclrLoadTy = Tp;
18325
5.41k
    }
18326
10.1k
  }
18327
1.69k
18328
1.69k
  // On 32bit systems, we can't save 64bit integers. Try bitcasting to F64.
18329
1.69k
  if (TLI.isTypeLegal(MVT::f64) && SclrLoadTy.getSizeInBits() < 64 &&
18330
1.17k
      (64 <= MemSz))
18331
375
    SclrLoadTy = MVT::f64;
18332
1.69k
18333
1.69k
  // Calculate the number of scalar loads that we need to perform
18334
1.69k
  // in order to load our vector from memory.
18335
1.69k
  unsigned NumLoads = MemSz / SclrLoadTy.getSizeInBits();
18336
1.69k
18337
1.69k
  assert((Ext != ISD::SEXTLOAD || NumLoads == 1) &&
18338
1.69k
         "Can only lower sext loads with a single scalar load!");
18339
1.69k
18340
1.69k
  unsigned loadRegZize = RegSz;
18341
1.69k
  if (Ext == ISD::SEXTLOAD && RegSz >= 256)
18342
0
    loadRegZize = 128;
18343
1.69k
18344
1.69k
  // Represent our vector as a sequence of elements which are the
18345
1.69k
  // largest scalar that we can load.
18346
1.69k
  EVT LoadUnitVecVT = EVT::getVectorVT(
18347
1.69k
      *DAG.getContext(), SclrLoadTy, loadRegZize / SclrLoadTy.getSizeInBits());
18348
1.69k
18349
1.69k
  // Represent the data using the same element type that is stored in
18350
1.69k
  // memory. In practice, we ''widen'' MemVT.
18351
1.69k
  EVT WideVecVT =
18352
1.69k
      EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(),
18353
1.69k
                       loadRegZize / MemVT.getScalarSizeInBits());
18354
1.69k
18355
1.69k
  assert(WideVecVT.getSizeInBits() == LoadUnitVecVT.getSizeInBits() &&
18356
1.69k
         "Invalid vector type");
18357
1.69k
18358
1.69k
  // We can't shuffle using an illegal type.
18359
1.69k
  assert(TLI.isTypeLegal(WideVecVT) &&
18360
1.69k
         "We only lower types that form legal widened vector types");
18361
1.69k
18362
1.69k
  SmallVector<SDValue, 8> Chains;
18363
1.69k
  SDValue Ptr = Ld->getBasePtr();
18364
1.69k
  SDValue Increment = DAG.getConstant(SclrLoadTy.getSizeInBits() / 8, dl,
18365
1.69k
                                      TLI.getPointerTy(DAG.getDataLayout()));
18366
1.69k
  SDValue Res = DAG.getUNDEF(LoadUnitVecVT);
18367
1.69k
18368
3.38k
  for (unsigned i = 0; i < NumLoads; ++i) {
18369
1.69k
    // Perform a single load.
18370
1.69k
    SDValue ScalarLoad =
18371
1.69k
        DAG.getLoad(SclrLoadTy, dl, Ld->getChain(), Ptr, Ld->getPointerInfo(),
18372
1.69k
                    Ld->getAlignment(), Ld->getMemOperand()->getFlags());
18373
1.69k
    Chains.push_back(ScalarLoad.getValue(1));
18374
1.69k
    // Create the first element type using SCALAR_TO_VECTOR in order to avoid
18375
1.69k
    // another round of DAGCombining.
18376
1.69k
    if (i == 0)
18377
1.69k
      Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LoadUnitVecVT, ScalarLoad);
18378
1.69k
    else
18379
0
      Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, LoadUnitVecVT, Res,
18380
0
                        ScalarLoad, DAG.getIntPtrConstant(i, dl));
18381
1.69k
18382
1.69k
    Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr, Increment);
18383
1.69k
  }
18384
1.69k
18385
1.69k
  SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Chains);
18386
1.69k
18387
1.69k
  // Bitcast the loaded value to a vector of the original element type, in
18388
1.69k
  // the size of the target vector type.
18389
1.69k
  SDValue SlicedVec = DAG.getBitcast(WideVecVT, Res);
18390
1.69k
  unsigned SizeRatio = RegSz / MemSz;
18391
1.69k
18392
1.69k
  if (Ext == ISD::SEXTLOAD) {
18393
180
    // If we have SSE4.1, we can directly emit a VSEXT node.
18394
180
    if (Subtarget.hasSSE41()) {
18395
0
      SDValue Sext = getExtendInVec(X86ISD::VSEXT, dl, RegVT, SlicedVec, DAG);
18396
0
      DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), TF);
18397
0
      return Sext;
18398
0
    }
18399
180
18400
180
    // Otherwise we'll use SIGN_EXTEND_VECTOR_INREG to sign extend the lowest
18401
180
    // lanes.
18402
0
    assert(TLI.isOperationLegalOrCustom(ISD::SIGN_EXTEND_VECTOR_INREG, RegVT) &&
18403
180
           "We can't implement a sext load without SIGN_EXTEND_VECTOR_INREG!");
18404
180
18405
180
    SDValue Shuff = DAG.getSignExtendVectorInReg(SlicedVec, dl, RegVT);
18406
180
    DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), TF);
18407
180
    return Shuff;
18408
180
  }
18409
1.51k
18410
1.51k
  // Redistribute the loaded elements into the different locations.
18411
1.51k
  SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1);
18412
8.59k
  for (unsigned i = 0; i != NumElems; ++i)
18413
7.08k
    ShuffleVec[i * SizeRatio] = i;
18414
1.96k
18415
1.96k
  SDValue Shuff = DAG.getVectorShuffle(WideVecVT, dl, SlicedVec,
18416
1.96k
                                       DAG.getUNDEF(WideVecVT), ShuffleVec);
18417
1.96k
18418
1.96k
  // Bitcast to the requested type.
18419
1.96k
  Shuff = DAG.getBitcast(RegVT, Shuff);
18420
1.96k
  DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), TF);
18421
1.96k
  return Shuff;
18422
1.96k
}
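// Illustrative sketch (not from the listed source): after the scalar load(s)
// above, the ShuffleVec mask spreads element i to lane i * SizeRatio of the
// wide vector, so on a little-endian target each value lands in the low part
// of its widened lane before the final bitcast. A standalone model for an
// any-extending v4i8 -> v4i32 load:
#include <array>
#include <cassert>
#include <cstdint>
int main() {
  constexpr unsigned NumElems = 4, SizeRatio = 4; // i8 element -> i32 element
  std::array<uint8_t, 4> loaded{0x11, 0x22, 0x33, 0x44};
  std::array<int, 16> shuffleMask;
  shuffleMask.fill(-1);                           // -1 == undef lane
  for (unsigned i = 0; i != NumElems; ++i)
    shuffleMask[i * SizeRatio] = static_cast<int>(i);

  std::array<uint8_t, 16> wide{};                 // pretend the undef lanes are 0
  for (unsigned i = 0; i != 16; ++i)
    if (shuffleMask[i] >= 0)
      wide[i] = loaded[static_cast<unsigned>(shuffleMask[i])];

  for (unsigned i = 0; i != NumElems; ++i)        // low byte of each i32 lane holds the value
    assert(wide[i * SizeRatio] == loaded[i]);
  return 0;
}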
18423
18424
/// Return true if node is an ISD::AND or ISD::OR of two X86ISD::SETCC nodes
18425
/// each of which has no other use apart from the AND / OR.
18426
90.4k
static bool isAndOrOfSetCCs(SDValue Op, unsigned &Opc) {
18427
90.4k
  Opc = Op.getOpcode();
18428
90.4k
  if (Opc != ISD::OR && Opc != ISD::AND)
18429
32.0k
    return false;
18430
58.3k
  return (Op.getOperand(0).getOpcode() == X86ISD::SETCC &&
18431
378
          Op.getOperand(0).hasOneUse() &&
18432
378
          Op.getOperand(1).getOpcode() == X86ISD::SETCC &&
18433
353
          Op.getOperand(1).hasOneUse());
18434
90.4k
}
18435
18436
/// Return true if node is an ISD::XOR of a X86ISD::SETCC and 1 and that the
18437
/// SETCC node has a single use.
18438
33.6k
static bool isXor1OfSetCC(SDValue Op) {
18439
33.6k
  if (Op.getOpcode() != ISD::XOR)
18440
33.6k
    return false;
18441
0
  
  if (isOneConstant(Op.getOperand(1)))
18442
0
    return Op.getOperand(0).getOpcode() == X86ISD::SETCC &&
18443
0
           Op.getOperand(0).hasOneUse();
18444
0
  return false;
18445
0
}
18446
18447
33.9k
SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const {
18448
33.9k
  bool addTest = true;
18449
33.9k
  SDValue Chain = Op.getOperand(0);
18450
33.9k
  SDValue Cond  = Op.getOperand(1);
18451
33.9k
  SDValue Dest  = Op.getOperand(2);
18452
33.9k
  SDLoc dl(Op);
18453
33.9k
  SDValue CC;
18454
33.9k
  bool Inverted = false;
18455
33.9k
18456
33.9k
  if (Cond.getOpcode() == ISD::SETCC) {
18457
31.0k
    // Check for setcc([su]{add,sub,mul}o == 0).
18458
31.0k
    if (cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETEQ &&
18459
12.3k
        isNullConstant(Cond.getOperand(1)) &&
18460
9.23k
        Cond.getOperand(0).getResNo() == 1 &&
18461
0
        (Cond.getOperand(0).getOpcode() == ISD::SADDO ||
18462
0
         Cond.getOperand(0).getOpcode() == ISD::UADDO ||
18463
0
         Cond.getOperand(0).getOpcode() == ISD::SSUBO ||
18464
0
         Cond.getOperand(0).getOpcode() == ISD::USUBO ||
18465
0
         Cond.getOperand(0).getOpcode() == ISD::SMULO ||
18466
31.0k
         
         Cond.getOperand(0).getOpcode() == ISD::UMULO)) {
18467
0
      Inverted = true;
18468
0
      Cond = Cond.getOperand(0);
18469
31.0k
    } else {
18470
31.0k
      if (SDValue NewCond = LowerSETCC(Cond, DAG))
18471
30.9k
        Cond = NewCond;
18472
31.0k
    }
18473
31.0k
  }
18474
#if 0
18475
  // FIXME: LowerXALUO doesn't handle these!!
18476
  else if (Cond.getOpcode() == X86ISD::ADD  ||
18477
           Cond.getOpcode() == X86ISD::SUB  ||
18478
           Cond.getOpcode() == X86ISD::SMUL ||
18479
           Cond.getOpcode() == X86ISD::UMUL)
18480
    Cond = LowerXALUO(Cond, DAG);
18481
#endif
18482
18483
33.9k
  // Look past (and (setcc_carry (cmp ...)), 1).
18484
33.9k
  if (Cond.getOpcode() == ISD::AND &&
18485
1.58k
      Cond.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY &&
18486
0
      isOneConstant(Cond.getOperand(1)))
18487
0
    Cond = Cond.getOperand(0);
18488
33.9k
18489
33.9k
  // If condition flag is set by a X86ISD::CMP, then use it as the condition
18490
33.9k
  // setting operand in place of the X86ISD::SETCC.
18491
33.9k
  unsigned CondOpcode = Cond.getOpcode();
18492
33.9k
  if (CondOpcode == X86ISD::SETCC ||
18493
33.9k
      
      CondOpcode == X86ISD::SETCC_CARRY) {
18494
31.1k
    CC = Cond.getOperand(0);
18495
31.1k
18496
31.1k
    SDValue Cmp = Cond.getOperand(1);
18497
31.1k
    unsigned Opc = Cmp.getOpcode();
18498
31.1k
    // FIXME: WHY THE SPECIAL CASING OF LogicalCmp??
18499
31.1k
    if (isX86LogicalCmp(Cmp) || Opc == X86ISD::BT) {
18500
30.7k
      Cond = Cmp;
18501
30.7k
      addTest = false;
18502
31.1k
    } else {
18503
405
      switch (cast<ConstantSDNode>(CC)->getZExtValue()) {
18504
405
      default: break;
18505
0
      case X86::COND_O:
18506
0
      case X86::COND_B:
18507
0
        // These can only come from an arithmetic instruction with overflow,
18508
0
        // e.g. SADDO, UADDO.
18509
0
        Cond = Cond.getOperand(1);
18510
0
        addTest = false;
18511
0
        break;
18512
33.9k
      }
18513
33.9k
    }
18514
31.1k
  }
18515
33.9k
  CondOpcode = Cond.getOpcode();
18516
33.9k
  if (CondOpcode == ISD::UADDO || CondOpcode == ISD::SADDO ||
18517
33.9k
      
      CondOpcode == ISD::USUBO || CondOpcode == ISD::SSUBO ||
18518
33.9k
      
      ((CondOpcode == ISD::UMULO || CondOpcode == ISD::SMULO) &&
18519
33.9k
       
       Cond.getOperand(0).getValueType() != MVT::i8)) {
18520
14
    SDValue LHS = Cond.getOperand(0);
18521
14
    SDValue RHS = Cond.getOperand(1);
18522
14
    unsigned X86Opcode;
18523
14
    unsigned X86Cond;
18524
14
    SDVTList VTs;
18525
14
    // Keep this in sync with LowerXALUO, otherwise we might create redundant
18526
14
    // instructions that can't be removed afterwards (i.e. X86ISD::ADD and
18527
14
    // X86ISD::INC).
18528
14
    switch (CondOpcode) {
18529
4
    case ISD::UADDO: X86Opcode = X86ISD::ADD; X86Cond = X86::COND_B; break;
18530
6
    case ISD::SADDO:
18531
6
      if (isOneConstant(RHS)) {
18532
1
          X86Opcode = X86ISD::INC; X86Cond = X86::COND_O;
18533
1
          break;
18534
1
        }
18535
5
      X86Opcode = X86ISD::ADD; X86Cond = X86::COND_O; break;
18536
1
    case ISD::USUBO: X86Opcode = X86ISD::SUB; X86Cond = X86::COND_B; break;
18537
1
    case ISD::SSUBO:
18538
1
      if (isOneConstant(RHS)) {
18539
0
          X86Opcode = X86ISD::DEC; X86Cond = X86::COND_O;
18540
0
          break;
18541
0
        }
18542
1
      X86Opcode = X86ISD::SUB; X86Cond = X86::COND_O; break;
18543
1
    case ISD::UMULO: X86Opcode = X86ISD::UMUL; X86Cond = X86::COND_O; break;
18544
1
    case ISD::SMULO: X86Opcode = X86ISD::SMUL; X86Cond = X86::COND_O; break;
18545
0
    
    default: llvm_unreachable("unexpected overflowing operator");
18546
14
    }
18547
14
    
    if (Inverted)
18548
0
      X86Cond = X86::GetOppositeBranchCondition((X86::CondCode)X86Cond);
18549
14
    if (CondOpcode == ISD::UMULO)
18550
1
      VTs = DAG.getVTList(LHS.getValueType(), LHS.getValueType(),
18551
1
                          MVT::i32);
18552
14
    else
18553
13
      VTs = DAG.getVTList(LHS.getValueType(), MVT::i32);
18554
14
18555
14
    SDValue X86Op = DAG.getNode(X86Opcode, dl, VTs, LHS, RHS);
18556
14
18557
14
    if (CondOpcode == ISD::UMULO)
18558
1
      Cond = X86Op.getValue(2);
18559
14
    else
18560
13
      Cond = X86Op.getValue(1);
18561
14
18562
14
    CC = DAG.getConstant(X86Cond, dl, MVT::i8);
18563
14
    addTest = false;
18564
33.9k
  } else {
18565
33.9k
    unsigned CondOpc;
18566
33.9k
    if (Cond.hasOneUse() && isAndOrOfSetCCs(Cond, CondOpc)) {
18567
0
      SDValue Cmp = Cond.getOperand(0).getOperand(1);
18568
0
      if (CondOpc == ISD::OR) {
18569
0
        // Also, recognize the pattern generated by an FCMP_UNE. We can emit
18570
0
        // two branches instead of an explicit OR instruction with a
18571
0
        // separate test.
18572
0
        if (Cmp == Cond.getOperand(1).getOperand(1) &&
18573
0
            
            isX86LogicalCmp(Cmp)) {
18574
0
          CC = Cond.getOperand(0).getOperand(0);
18575
0
          Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
18576
0
                              Chain, Dest, CC, Cmp);
18577
0
          CC = Cond.getOperand(1).getOperand(0);
18578
0
          Cond = Cmp;
18579
0
          addTest = false;
18580
0
        }
18581
0
      } else { // ISD::AND
18582
0
        // Also, recognize the pattern generated by an FCMP_OEQ. We can emit
18583
0
        // two branches instead of an explicit AND instruction with a
18584
0
        // separate test. However, we only do this if this block doesn't
18585
0
        // have a fall-through edge, because this requires an explicit
18586
0
        // jmp when the condition is false.
18587
0
        if (Cmp == Cond.getOperand(1).getOperand(1) &&
18588
0
            isX86LogicalCmp(Cmp) &&
18589
0
            
            Op.getNode()->hasOneUse()) {
18590
0
          X86::CondCode CCode =
18591
0
            (X86::CondCode)Cond.getOperand(0).getConstantOperandVal(0);
18592
0
          CCode = X86::GetOppositeBranchCondition(CCode);
18593
0
          CC = DAG.getConstant(CCode, dl, MVT::i8);
18594
0
          SDNode *User = *Op.getNode()->use_begin();
18595
0
          // Look for an unconditional branch following this conditional branch.
18596
0
          // We need this because we need to reverse the successors in order
18597
0
          // to implement FCMP_OEQ.
18598
0
          if (User->getOpcode() == ISD::BR) {
18599
0
            SDValue FalseBB = User->getOperand(1);
18600
0
            SDNode *NewBR =
18601
0
              DAG.UpdateNodeOperands(User, User->getOperand(0), Dest);
18602
0
            assert(NewBR == User);
18603
0
            (void)NewBR;
18604
0
            Dest = FalseBB;
18605
0
18606
0
            Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
18607
0
                                Chain, Dest, CC, Cmp);
18608
0
            X86::CondCode CCode =
18609
0
              (X86::CondCode)Cond.getOperand(1).getConstantOperandVal(0);
18610
0
            CCode = X86::GetOppositeBranchCondition(CCode);
18611
0
            CC = DAG.getConstant(CCode, dl, MVT::i8);
18612
0
            Cond = Cmp;
18613
0
            addTest = false;
18614
0
          }
18615
0
        }
18616
0
      }
18617
33.9k
    } else if (Cond.hasOneUse() && isXor1OfSetCC(Cond)) {
18618
0
      // Recognize for xorb (setcc), 1 patterns. The xor inverts the condition.
18619
0
      // It should be transformed during dag combiner except when the condition
18620
0
      // is set by an arithmetic-with-overflow node.
18621
0
      X86::CondCode CCode =
18622
0
        (X86::CondCode)Cond.getOperand(0).getConstantOperandVal(0);
18623
0
      CCode = X86::GetOppositeBranchCondition(CCode);
18624
0
      CC = DAG.getConstant(CCode, dl, MVT::i8);
18625
0
      Cond = Cond.getOperand(0).getOperand(1);
18626
0
      addTest = false;
18627
33.9k
    } else if (Cond.getOpcode() == ISD::SETCC &&
18628
33.9k
               
               cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETOEQ) {
18629
84
      // For FCMP_OEQ, we can emit
18630
84
      // two branches instead of an explicit AND instruction with a
18631
84
      // separate test. However, we only do this if this block doesn't
18632
84
      // have a fall-through edge, because this requires an explicit
18633
84
      // jmp when the condition is false.
18634
84
      if (Op.getNode()->hasOneUse()) {
18635
84
        SDNode *User = *Op.getNode()->use_begin();
18636
84
        // Look for an unconditional branch following this conditional branch.
18637
84
        // We need this because we need to reverse the successors in order
18638
84
        // to implement FCMP_OEQ.
18639
84
        if (User->getOpcode() == ISD::BR) {
18640
84
          SDValue FalseBB = User->getOperand(1);
18641
84
          SDNode *NewBR =
18642
84
            DAG.UpdateNodeOperands(User, User->getOperand(0), Dest);
18643
84
          assert(NewBR == User);
18644
84
          (void)NewBR;
18645
84
          Dest = FalseBB;
18646
84
18647
84
          SDValue Cmp = DAG.getNode(X86ISD::CMP, dl, MVT::i32,
18648
84
                                    Cond.getOperand(0), Cond.getOperand(1));
18649
84
          Cmp = ConvertCmpIfNecessary(Cmp, DAG);
18650
84
          CC = DAG.getConstant(X86::COND_NE, dl, MVT::i8);
18651
84
          Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
18652
84
                              Chain, Dest, CC, Cmp);
18653
84
          CC = DAG.getConstant(X86::COND_P, dl, MVT::i8);
18654
84
          Cond = Cmp;
18655
84
          addTest = false;
18656
84
        }
18657
84
      }
18658
33.9k
    } else if (Cond.getOpcode() == ISD::SETCC &&
18659
33.8k
               
               cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETUNE) {
18660
53
      // For FCMP_UNE, we can emit
18661
53
      // two branches instead of an explicit AND instruction with a
18662
53
      // separate test. However, we only do this if this block doesn't
18663
53
      // have a fall-through edge, because this requires an explicit
18664
53
      // jmp when the condition is false.
18665
53
      if (Op.getNode()->hasOneUse()) {
18666
53
        SDNode *User = *Op.getNode()->use_begin();
18667
53
        // Look for an unconditional branch following this conditional branch.
18668
53
        // We need this because we need to reverse the successors in order
18669
53
        // to implement FCMP_UNE.
18670
53
        if (User->getOpcode() == ISD::BR) {
18671
53
          SDValue FalseBB = User->getOperand(1);
18672
53
          SDNode *NewBR =
18673
53
            DAG.UpdateNodeOperands(User, User->getOperand(0), Dest);
18674
53
          assert(NewBR == User);
18675
53
          (void)NewBR;
18676
53
18677
53
          SDValue Cmp = DAG.getNode(X86ISD::CMP, dl, MVT::i32,
18678
53
                                    Cond.getOperand(0), Cond.getOperand(1));
18679
53
          Cmp = ConvertCmpIfNecessary(Cmp, DAG);
18680
53
          CC = DAG.getConstant(X86::COND_NE, dl, MVT::i8);
18681
53
          Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
18682
53
                              Chain, Dest, CC, Cmp);
18683
53
          CC = DAG.getConstant(X86::COND_NP, dl, MVT::i8);
18684
53
          Cond = Cmp;
18685
53
          addTest = false;
18686
53
          Dest = FalseBB;
18687
53
        }
18688
53
      }
18689
33.9k
    }
18690
33.9k
  }
18691
33.9k
18692
33.9k
  
  if (addTest) {
18693
3.06k
    // Look past the truncate if the high bits are known zero.
18694
3.06k
    if (isTruncWithZeroHighBitsInput(Cond, DAG))
18695
67
        Cond = Cond.getOperand(0);
18696
3.06k
18697
3.06k
    // We know the result of AND is compared against zero. Try to match
18698
3.06k
    // it to BT.
18699
3.06k
    if (Cond.getOpcode() == ISD::AND && Cond.hasOneUse()) {
18700
1.64k
      if (SDValue NewSetCC = LowerAndToBT(Cond, ISD::SETNE, dl, DAG)) {
18701
42
        CC = NewSetCC.getOperand(0);
18702
42
        Cond = NewSetCC.getOperand(1);
18703
42
        addTest = false;
18704
42
      }
18705
1.64k
    }
18706
3.06k
  }
18707
33.9k
18708
33.9k
  if (addTest) {
18709
3.02k
    X86::CondCode X86Cond = Inverted ? X86::COND_E : X86::COND_NE;
18710
3.02k
    CC = DAG.getConstant(X86Cond, dl, MVT::i8);
18711
3.02k
    Cond = EmitTest(Cond, X86Cond, dl, DAG);
18712
3.02k
  }
18713
33.9k
  Cond = ConvertCmpIfNecessary(Cond, DAG);
18714
33.9k
  return DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
18715
33.9k
                     Chain, Dest, CC, Cond);
18716
33.9k
}
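// Illustrative sketch (not from the listed source): the FCMP_OEQ / FCMP_UNE
// handling above rewrites "ordered equal" into two conditional branches, one on
// COND_NE (ordered and not equal) and one on COND_P (unordered), instead of
// AND'ing two setcc results. A scalar model of that branching logic:
#include <cassert>
#include <cmath>
#include <limits>
static bool fcmpOeqViaTwoBranches(float a, float b) {
  bool unordered = std::isnan(a) || std::isnan(b);  // what COND_P observes (PF)
  bool orderedNotEqual = !unordered && !(a == b);   // what COND_NE observes (ZF == 0)
  if (orderedNotEqual) return false;                // first branch to the false block
  if (unordered) return false;                      // second branch to the false block
  return true;
}
int main() {
  float qnan = std::numeric_limits<float>::quiet_NaN();
  assert(fcmpOeqViaTwoBranches(1.0f, 1.0f));
  assert(!fcmpOeqViaTwoBranches(1.0f, 2.0f));
  assert(!fcmpOeqViaTwoBranches(qnan, 1.0f));
  return 0;
}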
18717
18718
// Lower dynamic stack allocation to _alloca call for Cygwin/Mingw targets.
18719
// Calls to _alloca are needed to probe the stack when allocating more than 4k
18720
// bytes in one go. Touching the stack at 4K increments is necessary to ensure
18721
// that the guard pages used by the OS virtual memory manager are allocated in
18722
// correct sequence.
18723
SDValue
18724
X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
18725
256
                                           SelectionDAG &DAG) const {
18726
256
  MachineFunction &MF = DAG.getMachineFunction();
18727
256
  bool SplitStack = MF.shouldSplitStack();
18728
256
  bool EmitStackProbe = !getStackProbeSymbolName(MF).empty();
18729
90
  bool Lower = (Subtarget.isOSWindows() && !Subtarget.isTargetMachO()) ||
18730
256
               
               SplitStack || EmitStackProbe;
18731
256
  SDLoc dl(Op);
18732
256
18733
256
  // Get the inputs.
18734
256
  SDNode *Node = Op.getNode();
18735
256
  SDValue Chain = Op.getOperand(0);
18736
256
  SDValue Size  = Op.getOperand(1);
18737
256
  unsigned Align = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue();
18738
256
  EVT VT = Node->getValueType(0);
18739
256
18740
256
  // Chain the dynamic stack allocation so that it doesn't modify the stack
18741
256
  // pointer when other instructions are using the stack.
18742
256
  Chain = DAG.getCALLSEQ_START(Chain, 0, 0, dl);
18743
256
18744
256
  bool Is64Bit = Subtarget.is64Bit();
18745
256
  MVT SPTy = getPointerTy(DAG.getDataLayout());
18746
256
18747
256
  SDValue Result;
18748
256
  if (!Lower) {
18749
162
    const TargetLowering &TLI = DAG.getTargetLoweringInfo();
18750
162
    unsigned SPReg = TLI.getStackPointerRegisterToSaveRestore();
18751
162
    assert(SPReg && "Target cannot require DYNAMIC_STACKALLOC expansion and"
18752
162
                    " not tell us which reg is the stack pointer!");
18753
162
18754
162
    SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, VT);
18755
162
    Chain = SP.getValue(1);
18756
162
    const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();
18757
162
    unsigned StackAlign = TFI.getStackAlignment();
18758
162
    Result = DAG.getNode(ISD::SUB, dl, VT, SP, Size); // Value
18759
162
    if (Align > StackAlign)
18760
14
      Result = DAG.getNode(ISD::AND, dl, VT, Result,
18761
14
                         DAG.getConstant(-(uint64_t)Align, dl, VT));
18762
162
    Chain = DAG.getCopyToReg(Chain, dl, SPReg, Result); // Output chain
18763
256
  } else if (SplitStack) {
18764
6
    MachineRegisterInfo &MRI = MF.getRegInfo();
18765
6
18766
6
    if (Is64Bit) {
18767
4
      // The 64 bit implementation of segmented stacks needs to clobber both r10
18768
4
      // r11. This makes it impossible to use it along with nested parameters.
18769
4
      const Function *F = MF.getFunction();
18770
4
      for (const auto &A : F->args()) {
18771
4
        if (A.hasNestAttr())
18772
0
          report_fatal_error("Cannot use segmented stacks with functions that "
18773
0
                             "have nested arguments.");
18774
6
      }
18775
4
    }
18776
6
18777
6
    const TargetRegisterClass *AddrRegClass = getRegClassFor(SPTy);
18778
6
    unsigned Vreg = MRI.createVirtualRegister(AddrRegClass);
18779
6
    Chain = DAG.getCopyToReg(Chain, dl, Vreg, Size);
18780
6
    Result = DAG.getNode(X86ISD::SEG_ALLOCA, dl, SPTy, Chain,
18781
6
                                DAG.getRegister(Vreg, SPTy));
18782
94
  } else {
18783
88
    SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
18784
88
    Chain = DAG.getNode(X86ISD::WIN_ALLOCA, dl, NodeTys, Chain, Size);
18785
88
    MF.getInfo<X86MachineFunctionInfo>()->setHasWinAlloca(true);
18786
88
18787
88
    const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
18788
88
    unsigned SPReg = RegInfo->getStackRegister();
18789
88
    SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, SPTy);
18790
88
    Chain = SP.getValue(1);
18791
88
18792
88
    if (Align) {
18793
5
      SP = DAG.getNode(ISD::AND, dl, VT, SP.getValue(0),
18794
5
                       DAG.getConstant(-(uint64_t)Align, dl, VT));
18795
5
      Chain = DAG.getCopyToReg(Chain, dl, SPReg, SP);
18796
5
    }
18797
94
18798
94
    Result = SP;
18799
94
  }
18800
256
18801
256
  Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(0, dl, true),
18802
256
                             DAG.getIntPtrConstant(0, dl, true), SDValue(), dl);
18803
256
18804
256
  SDValue Ops[2] = {Result, Chain};
18805
256
  return DAG.getMergeValues(Ops, dl);
18806
256
}
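// A small standalone sketch (not from the listed source): both allocation paths
// above realign the new stack pointer by AND'ing with -Align, which rounds an
// address down to the requested power-of-two boundary:
#include <cassert>
#include <cstdint>
int main() {
  const uint64_t Align = 32;              // must be a power of two
  for (uint64_t sp = 1024; sp < 1088; ++sp) {
    uint64_t aligned = sp & ~(Align - 1); // same bit pattern as sp & -(uint64_t)Align
    assert(aligned % Align == 0 && aligned <= sp && sp - aligned < Align);
  }
  return 0;
}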
18807
18808
84
SDValue X86TargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const {
18809
84
  MachineFunction &MF = DAG.getMachineFunction();
18810
84
  auto PtrVT = getPointerTy(MF.getDataLayout());
18811
84
  X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
18812
84
18813
84
  const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
18814
84
  SDLoc DL(Op);
18815
84
18816
84
  if (!Subtarget.is64Bit() ||
18817
84
      
      Subtarget.isCallingConvWin64(MF.getFunction()->getCallingConv())) {
18818
43
    // vastart just stores the address of the VarArgsFrameIndex slot into the
18819
43
    // memory location argument.
18820
43
    SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT);
18821
43
    return DAG.getStore(Op.getOperand(0), DL, FR, Op.getOperand(1),
18822
43
                        MachinePointerInfo(SV));
18823
43
  }
18824
41
18825
41
  // __va_list_tag:
18826
41
  //   gp_offset         (0 - 6 * 8)
18827
41
  //   fp_offset         (48 - 48 + 8 * 16)
18828
41
  //   overflow_arg_area (point to parameters coming in memory).
18829
41
  //   reg_save_area
18830
41
  SmallVector<SDValue, 8> MemOps;
18831
41
  SDValue FIN = Op.getOperand(1);
18832
41
  // Store gp_offset
18833
41
  SDValue Store = DAG.getStore(
18834
41
      Op.getOperand(0), DL,
18835
41
      DAG.getConstant(FuncInfo->getVarArgsGPOffset(), DL, MVT::i32), FIN,
18836
41
      MachinePointerInfo(SV));
18837
41
  MemOps.push_back(Store);
18838
41
18839
41
  // Store fp_offset
18840
41
  FIN = DAG.getMemBasePlusOffset(FIN, 4, DL);
18841
41
  Store = DAG.getStore(
18842
41
      Op.getOperand(0), DL,
18843
41
      DAG.getConstant(FuncInfo->getVarArgsFPOffset(), DL, MVT::i32), FIN,
18844
41
      MachinePointerInfo(SV, 4));
18845
41
  MemOps.push_back(Store);
18846
41
18847
41
  // Store ptr to overflow_arg_area
18848
41
  FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getIntPtrConstant(4, DL));
18849
41
  SDValue OVFIN = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT);
18850
41
  Store =
18851
41
      DAG.getStore(Op.getOperand(0), DL, OVFIN, FIN, MachinePointerInfo(SV, 8));
18852
41
  MemOps.push_back(Store);
18853
41
18854
41
  // Store ptr to reg_save_area.
18855
41
  FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getIntPtrConstant(
18856
41
      Subtarget.isTarget64BitLP64() ? 8 : 4, DL));
18857
41
  SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(), PtrVT);
18858
41
  Store = DAG.getStore(
18859
41
      Op.getOperand(0), DL, RSFIN, FIN,
18860
41
      MachinePointerInfo(SV, Subtarget.isTarget64BitLP64() ? 16 : 12));
18861
84
  MemOps.push_back(Store);
18862
84
  return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps);
18863
84
}
18864
18865
3
SDValue X86TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const {
18866
3
  assert(Subtarget.is64Bit() &&
18867
3
         "LowerVAARG only handles 64-bit va_arg!");
18868
3
  assert(Op.getNumOperands() == 4);
18869
3
18870
3
  MachineFunction &MF = DAG.getMachineFunction();
18871
3
  if (Subtarget.isCallingConvWin64(MF.getFunction()->getCallingConv()))
18872
3
    // The Win64 ABI uses char* instead of a structure.
18873
3
    return DAG.expandVAArg(Op.getNode());
18874
0
18875
0
  SDValue Chain = Op.getOperand(0);
18876
0
  SDValue SrcPtr = Op.getOperand(1);
18877
0
  const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
18878
0
  unsigned Align = Op.getConstantOperandVal(3);
18879
0
  SDLoc dl(Op);
18880
0
18881
0
  EVT ArgVT = Op.getNode()->getValueType(0);
18882
0
  Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
18883
0
  uint32_t ArgSize = DAG.getDataLayout().getTypeAllocSize(ArgTy);
18884
0
  uint8_t ArgMode;
18885
0
18886
0
  // Decide which area this value should be read from.
18887
0
  // TODO: Implement the AMD64 ABI in its entirety. This simple
18888
0
  // selection mechanism works only for the basic types.
18889
0
  if (ArgVT == MVT::f80) {
18890
0
    llvm_unreachable("va_arg for f80 not yet implemented");
18891
0
  } else if (ArgVT.isFloatingPoint() && ArgSize <= 16 /*bytes*/) {
18892
0
    ArgMode = 2;  // Argument passed in XMM register. Use fp_offset.
18893
0
  } else if (ArgVT.isInteger() && ArgSize <= 32 /*bytes*/) {
18894
0
    ArgMode = 1;  // Argument passed in GPR64 register(s). Use gp_offset.
18895
0
  } else {
18896
0
    llvm_unreachable("Unhandled argument type in LowerVAARG");
18897
0
  }
18898
0
18899
0
  if (ArgMode == 2) {
18900
0
    // Sanity Check: Make sure using fp_offset makes sense.
18901
0
    assert(!Subtarget.useSoftFloat() &&
18902
0
           !(MF.getFunction()->hasFnAttribute(Attribute::NoImplicitFloat)) &&
18903
0
           Subtarget.hasSSE1());
18904
0
  }
18905
3
18906
3
  // Insert VAARG_64 node into the DAG
18907
3
  // VAARG_64 returns two values: Variable Argument Address, Chain
18908
3
  SDValue InstOps[] = {Chain, SrcPtr, DAG.getConstant(ArgSize, dl, MVT::i32),
18909
3
                       DAG.getConstant(ArgMode, dl, MVT::i8),
18910
3
                       DAG.getConstant(Align, dl, MVT::i32)};
18911
3
  SDVTList VTs = DAG.getVTList(getPointerTy(DAG.getDataLayout()), MVT::Other);
18912
3
  SDValue VAARG = DAG.getMemIntrinsicNode(X86ISD::VAARG_64, dl,
18913
3
                                          VTs, InstOps, MVT::i64,
18914
3
                                          MachinePointerInfo(SV),
18915
3
                                          /*Align=*/0,
18916
3
                                          /*Volatile=*/false,
18917
3
                                          /*ReadMem=*/true,
18918
3
                                          /*WriteMem=*/true);
18919
3
  Chain = VAARG.getValue(1);
18920
3
18921
3
  // Load the next argument and return it
18922
3
  return DAG.getLoad(ArgVT, dl, Chain, VAARG, MachinePointerInfo());
18923
3
}
18924
18925
static SDValue LowerVACOPY(SDValue Op, const X86Subtarget &Subtarget,
18926
22
                           SelectionDAG &DAG) {
18927
22
  // X86-64 va_list is a struct { i32, i32, i8*, i8* }, except on Windows,
18928
22
  // where a va_list is still an i8*.
18929
22
  assert(Subtarget.is64Bit() && "This code only handles 64-bit va_copy!");
18930
22
  if (Subtarget.isCallingConvWin64(
18931
22
        DAG.getMachineFunction().getFunction()->getCallingConv()))
18932
22
    // Probably a Win64 va_copy.
18933
4
    return DAG.expandVACopy(Op.getNode());
18934
18
18935
18
  SDValue Chain = Op.getOperand(0);
18936
18
  SDValue DstPtr = Op.getOperand(1);
18937
18
  SDValue SrcPtr = Op.getOperand(2);
18938
18
  const Value *DstSV = cast<SrcValueSDNode>(Op.getOperand(3))->getValue();
18939
18
  const Value *SrcSV = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
18940
18
  SDLoc DL(Op);
18941
18
18942
18
  return DAG.getMemcpy(Chain, DL, DstPtr, SrcPtr,
18943
18
                       DAG.getIntPtrConstant(24, DL), 8, /*isVolatile*/false,
18944
18
                       false, false,
18945
18
                       MachinePointerInfo(DstSV), MachinePointerInfo(SrcSV));
18946
18
}
18947
18948
/// Handle vector element shifts where the shift amount is a constant.
18949
/// Takes immediate version of shift as input.
18950
static SDValue getTargetVShiftByConstNode(unsigned Opc, const SDLoc &dl, MVT VT,
18951
                                          SDValue SrcOp, uint64_t ShiftAmt,
18952
9.86k
                                          SelectionDAG &DAG) {
18953
9.86k
  MVT ElementType = VT.getVectorElementType();
18954
9.86k
18955
9.86k
  // Bitcast the source vector to the output type, this is mainly necessary for
18956
9.86k
  // vXi8/vXi64 shifts.
18957
9.86k
  if (VT != SrcOp.getSimpleValueType())
18958
1.32k
    SrcOp = DAG.getBitcast(VT, SrcOp);
18959
9.86k
18960
9.86k
  // Fold this packed shift into its first operand if ShiftAmt is 0.
18961
9.86k
  if (ShiftAmt == 0)
18962
57
    return SrcOp;
18963
9.80k
18964
9.80k
  // Check for ShiftAmt >= element width
18965
9.80k
  if (ShiftAmt >= ElementType.getSizeInBits()) {
18966
2
    if (Opc == X86ISD::VSRAI)
18967
2
      ShiftAmt = ElementType.getSizeInBits() - 1;
18968
2
    else
18969
0
      return DAG.getConstant(0, dl, VT);
18970
9.80k
  }
18971
9.80k
18972
9.80k
  assert((Opc == X86ISD::VSHLI || Opc == X86ISD::VSRLI || Opc == X86ISD::VSRAI)
18973
9.80k
         && "Unknown target vector shift-by-constant node");
18974
9.80k
18975
9.80k
  // Fold this packed vector shift into a build vector if SrcOp is a
18976
9.80k
  // vector of Constants or UNDEFs.
18977
9.80k
  if (ISD::isBuildVectorOfConstantSDNodes(SrcOp.getNode())) {
18978
31
    SmallVector<SDValue, 8> Elts;
18979
31
    unsigned NumElts = SrcOp->getNumOperands();
18980
31
    ConstantSDNode *ND;
18981
31
18982
31
    switch(Opc) {
18983
0
    default: llvm_unreachable("Unknown opcode!");
18984
8
    case X86ISD::VSHLI:
18985
44
      for (unsigned i=0; i!=NumElts; ++i) {
18986
36
        SDValue CurrentOp = SrcOp->getOperand(i);
18987
36
        if (CurrentOp->isUndef()) {
18988
5
          Elts.push_back(CurrentOp);
18989
5
          continue;
18990
5
        }
18991
31
        ND = cast<ConstantSDNode>(CurrentOp);
18992
31
        const APInt &C = ND->getAPIntValue();
18993
31
        Elts.push_back(DAG.getConstant(C.shl(ShiftAmt), dl, ElementType));
18994
31
      }
18995
8
      break;
18996
13
    case X86ISD::VSRLI:
18997
73
      for (unsigned i=0; i!=NumElts; ++i) {
18998
60
        SDValue CurrentOp = SrcOp->getOperand(i);
18999
60
        if (CurrentOp->isUndef()) {
19000
11
          Elts.push_back(CurrentOp);
19001
11
          continue;
19002
11
        }
19003
49
        ND = cast<ConstantSDNode>(CurrentOp);
19004
49
        const APInt &C = ND->getAPIntValue();
19005
49
        Elts.push_back(DAG.getConstant(C.lshr(ShiftAmt), dl, ElementType));
19006
49
      }
19007
13
      break;
19008
10
    case X86ISD::VSRAI:
19009
74
      for (unsigned i=0; i!=NumElts; ++i) {
19010
64
        SDValue CurrentOp = SrcOp->getOperand(i);
19011
64
        if (CurrentOp->isUndef()) {
19012
16
          Elts.push_back(CurrentOp);
19013
16
          continue;
19014
16
        }
19015
48
        ND = cast<ConstantSDNode>(CurrentOp);
19016
48
        const APInt &C = ND->getAPIntValue();
19017
48
        Elts.push_back(DAG.getConstant(C.ashr(ShiftAmt), dl, ElementType));
19018
48
      }
19019
10
      break;
19020
31
    }
19021
31
19022
31
    return DAG.getBuildVector(VT, dl, Elts);
19023
31
  }
19024
9.77k
19025
9.77k
  return DAG.getNode(Opc, dl, VT, SrcOp,
19026
9.77k
                     DAG.getConstant(ShiftAmt, dl, MVT::i8));
19027
9.77k
}
19028
19029
/// Handle vector element shifts where the shift amount may or may not be a
19030
/// constant. Takes immediate version of shift as input.
19031
static SDValue getTargetVShiftNode(unsigned Opc, const SDLoc &dl, MVT VT,
19032
                                   SDValue SrcOp, SDValue ShAmt,
19033
                                   const X86Subtarget &Subtarget,
19034
1.19k
                                   SelectionDAG &DAG) {
19035
1.19k
  MVT SVT = ShAmt.getSimpleValueType();
19036
1.19k
  assert((SVT == MVT::i32 || SVT == MVT::i64) && "Unexpected value type!");
19037
1.19k
19038
1.19k
  // Catch shift-by-constant.
19039
1.19k
  if (ConstantSDNode *CShAmt = dyn_cast<ConstantSDNode>(ShAmt))
19040
483
    return getTargetVShiftByConstNode(Opc, dl, VT, SrcOp,
19041
483
                                      CShAmt->getZExtValue(), DAG);
19042
708
19043
708
  // Change opcode to non-immediate version
19044
708
  switch (Opc) {
19045
0
    default: llvm_unreachable("Unknown target vector shift node");
19046
219
    case X86ISD::VSHLI: Opc = X86ISD::VSHL; break;
19047
411
    case X86ISD::VSRLI: Opc = X86ISD::VSRL; break;
19048
78
    case X86ISD::VSRAI: Opc = X86ISD::VSRA; break;
19049
708
  }
19050
708
19051
708
  // Need to build a vector containing shift amount.
19052
708
  // SSE/AVX packed shifts only use the lower 64-bit of the shift count.
19053
708
  // +=================+============+=======================================+
19054
708
  // | ShAmt is        | HasSSE4.1? | Construct ShAmt vector as             |
19055
708
  // +=================+============+=======================================+
19056
708
  // | i64             | Yes, No    | Use ShAmt as lowest elt               |
19057
708
  // | i32             | Yes        | zero-extend in-reg                    |
19058
708
  // | (i32 zext(i16)) | Yes        | zero-extend in-reg                    |
19059
708
  // | i16/i32         | No         | v4i32 build_vector(ShAmt, 0, ud, ud)) |
19060
708
  // +=================+============+=======================================+
19061
708
19062
708
  if (SVT == MVT::i64)
19063
494
    ShAmt = DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(ShAmt), MVT::v2i64, ShAmt);
19064
214
  else if (Subtarget.hasSSE41() && ShAmt.getOpcode() == ISD::ZERO_EXTEND &&
19065
214
           ShAmt.getOperand(0).getSimpleValueType() == MVT::i16) {
19066
78
    ShAmt = ShAmt.getOperand(0);
19067
78
    ShAmt = DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(ShAmt), MVT::v8i16, ShAmt);
19068
78
    ShAmt = DAG.getZeroExtendVectorInReg(ShAmt, SDLoc(ShAmt), MVT::v2i64);
19069
214
  } else if (Subtarget.hasSSE41() &&
19070
136
             ShAmt.getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
19071
75
    ShAmt = DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(ShAmt), MVT::v4i32, ShAmt);
19072
75
    ShAmt = DAG.getZeroExtendVectorInReg(ShAmt, SDLoc(ShAmt), MVT::v2i64);
19073
136
  } else {
19074
61
    SDValue ShOps[4] = {ShAmt, DAG.getConstant(0, dl, SVT),
19075
61
                        DAG.getUNDEF(SVT), DAG.getUNDEF(SVT)};
19076
61
    ShAmt = DAG.getBuildVector(MVT::v4i32, dl, ShOps);
19077
61
  }
19078
1.19k
19079
1.19k
  // The return type has to be a 128-bit type with the same element
19080
1.19k
  // type as the input type.
19081
1.19k
  MVT EltVT = VT.getVectorElementType();
19082
1.19k
  MVT ShVT = MVT::getVectorVT(EltVT, 128/EltVT.getSizeInBits());
19083
1.19k
19084
1.19k
  ShAmt = DAG.getBitcast(ShVT, ShAmt);
19085
1.19k
  return DAG.getNode(Opc, dl, VT, SrcOp, ShAmt);
19086
1.19k
}
19087
19088
/// \brief Return Mask with the necessary casting or extending
19089
/// for \p Mask according to \p MaskVT when lowering masking intrinsics
19090
static SDValue getMaskNode(SDValue Mask, MVT MaskVT,
19091
                           const X86Subtarget &Subtarget, SelectionDAG &DAG,
19092
1.27k
                           const SDLoc &dl) {
19093
1.27k
19094
1.27k
  if (isAllOnesConstant(Mask))
19095
32
    return DAG.getTargetConstant(1, dl, MaskVT);
19096
1.24k
  if (X86::isZeroNode(Mask))
19097
3
    return DAG.getTargetConstant(0, dl, MaskVT);
19098
1.23k
19099
1.23k
  if (MaskVT.bitsGT(Mask.getSimpleValueType())) {
19100
36
    // Mask should be extended
19101
36
    Mask = DAG.getNode(ISD::ANY_EXTEND, dl,
19102
36
                       MVT::getIntegerVT(MaskVT.getSizeInBits()), Mask);
19103
36
  }
19104
1.23k
19105
1.23k
  if (Mask.getSimpleValueType() == MVT::i64 && Subtarget.is32Bit()) {
19106
5
    if (MaskVT == MVT::v64i1) {
19107
3
      assert(Subtarget.hasBWI() && "Expected AVX512BW target!");
19108
3
      // In case 32bit mode, bitcast i64 is illegal, extend/split it.
19109
3
      SDValue Lo, Hi;
19110
3
      Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Mask,
19111
3
                          DAG.getConstant(0, dl, MVT::i32));
19112
3
      Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Mask,
19113
3
                          DAG.getConstant(1, dl, MVT::i32));
19114
3
19115
3
      Lo = DAG.getBitcast(MVT::v32i1, Lo);
19116
3
      Hi = DAG.getBitcast(MVT::v32i1, Hi);
19117
3
19118
3
      return DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, Lo, Hi);
19119
0
    } else {
19120
2
      // MaskVT require < 64bit. Truncate mask (should succeed in any case),
19121
2
      // and bitcast.
19122
2
      MVT TruncVT = MVT::getIntegerVT(MaskVT.getSizeInBits());
19123
2
      return DAG.getBitcast(MaskVT,
19124
2
                            DAG.getNode(ISD::TRUNCATE, dl, TruncVT, Mask));
19125
2
    }
19126
1.23k
19127
1.23k
  } else {
19128
1.23k
    MVT BitcastVT = MVT::getVectorVT(MVT::i1,
19129
1.23k
                                     Mask.getSimpleValueType().getSizeInBits());
19130
1.23k
    // In case when MaskVT equals v2i1 or v4i1, low 2 or 4 elements
19131
1.23k
    // are extracted by EXTRACT_SUBVECTOR.
19132
1.23k
    return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MaskVT,
19133
1.23k
                       DAG.getBitcast(BitcastVT, Mask),
19134
1.23k
                       DAG.getIntPtrConstant(0, dl));
19135
1.23k
  }
19136
0
}
19137
19138
/// \brief Return (and \p Op, \p Mask) for compare instructions or
19139
/// (vselect \p Mask, \p Op, \p PreservedSrc) for others along with the
19140
/// necessary casting or extending for \p Mask when lowering masking intrinsics
19141
static SDValue getVectorMaskingNode(SDValue Op, SDValue Mask,
19142
                  SDValue PreservedSrc,
19143
                  const X86Subtarget &Subtarget,
19144
1.99k
                  SelectionDAG &DAG) {
19145
1.99k
  MVT VT = Op.getSimpleValueType();
19146
1.99k
  MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
19147
1.99k
  unsigned OpcodeSelect = ISD::VSELECT;
19148
1.99k
  SDLoc dl(Op);
19149
1.99k
19150
1.99k
  if (isAllOnesConstant(Mask))
19151
948
    return Op;
19152
1.05k
19153
1.05k
  SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
19154
1.05k
19155
1.05k
  switch (Op.getOpcode()) {
19156
924
  default: break;
19157
0
  case X86ISD::PCMPEQM:
19158
0
  case X86ISD::PCMPGTM:
19159
0
  case X86ISD::CMPM:
19160
0
  case X86ISD::CMPMU:
19161
0
    return DAG.getNode(ISD::AND, dl, VT, Op, VMask);
19162
6
  case X86ISD::VFPCLASS:
19163
6
    case X86ISD::VFPCLASSS:
19164
6
    return DAG.getNode(ISD::OR, dl, VT, Op, VMask);
19165
120
  case X86ISD::VTRUNC:
19166
120
  case X86ISD::VTRUNCS:
19167
120
  case X86ISD::VTRUNCUS:
19168
120
  case X86ISD::CVTPS2PH:
19169
120
    // We can't use ISD::VSELECT here because it is not always "Legal"
19170
120
    // for the destination type. For example vpmovqb requires only AVX512
19172
120
    // and a vselect that can operate on byte element type requires BWI.
19172
120
    OpcodeSelect = X86ISD::SELECT;
19173
120
    break;
19174
1.04k
  }
19175
1.04k
  if (PreservedSrc.isUndef())
19176
2
    PreservedSrc = getZeroVector(VT, Subtarget, DAG, dl);
19177
1.99k
  return DAG.getNode(OpcodeSelect, dl, VT, VMask, Op, PreservedSrc);
19178
1.99k
}
19179
19180
/// \brief Creates an SDNode for a predicated scalar operation.
19181
/// \returns (X86vselect \p Mask, \p Op, \p PreservedSrc).
19182
/// The mask is coming as MVT::i8 and it should be transformed
19183
/// to MVT::v1i1 while lowering masking intrinsics.
19184
/// The main difference between ScalarMaskingNode and VectorMaskingNode is using
19185
/// "X86select" instead of "vselect". We just can't create the "vselect" node
19186
/// for a scalar instruction.
19187
static SDValue getScalarMaskingNode(SDValue Op, SDValue Mask,
19188
                                    SDValue PreservedSrc,
19189
                                    const X86Subtarget &Subtarget,
19190
184
                                    SelectionDAG &DAG) {
19191
184
19192
184
  if (auto *MaskConst = dyn_cast<ConstantSDNode>(Mask))
19193
69
    if (MaskConst->getZExtValue() & 0x1)
19194
64
      return Op;
19195
120
19196
120
  MVT VT = Op.getSimpleValueType();
19197
120
  SDLoc dl(Op);
19198
120
19199
120
  SDValue IMask = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v1i1, Mask);
19200
120
  if (Op.getOpcode() == X86ISD::FSETCCM ||
19201
116
      Op.getOpcode() == X86ISD::FSETCCM_RND)
19202
7
    return DAG.getNode(ISD::AND, dl, VT, Op, IMask);
19203
113
  if (Op.getOpcode() == X86ISD::VFPCLASSS)
19204
2
    return DAG.getNode(ISD::OR, dl, VT, Op, IMask);
19205
111
19206
111
  if (PreservedSrc.isUndef())
19207
0
    PreservedSrc = getZeroVector(VT, Subtarget, DAG, dl);
19208
184
  return DAG.getNode(X86ISD::SELECTS, dl, VT, IMask, Op, PreservedSrc);
19209
184
}
19210
19211
5
static int getSEHRegistrationNodeSize(const Function *Fn) {
19212
5
  if (!Fn->hasPersonalityFn())
19213
0
    report_fatal_error(
19214
0
        "querying registration node size for function without personality");
19215
5
  // The RegNodeSize is 6 32-bit words for SEH and 4 for C++ EH. See
19216
5
  // WinEHStatePass for the full struct definition.
19217
5
  switch (classifyEHPersonality(Fn->getPersonalityFn())) {
19218
5
  case EHPersonality::MSVC_X86SEH: return 24;
19219
0
  case EHPersonality::MSVC_CXX: return 16;
19220
0
  default: break;
19221
0
  }
19222
0
  report_fatal_error(
19223
0
      "can only recover FP for 32-bit MSVC EH personality functions");
19224
0
}
19225
19226
/// When the MSVC runtime transfers control to us, either to an outlined
19227
/// function or when returning to a parent frame after catching an exception, we
19228
/// recover the parent frame pointer by doing arithmetic on the incoming EBP.
19229
/// Here's the math:
19230
///   RegNodeBase = EntryEBP - RegNodeSize
19231
///   ParentFP = RegNodeBase - ParentFrameOffset
19232
/// Subtracting RegNodeSize takes us to the offset of the registration node, and
19233
/// subtracting the offset (negative on x86) takes us back to the parent FP.
19234
static SDValue recoverFramePointer(SelectionDAG &DAG, const Function *Fn,
19235
5
                                   SDValue EntryEBP) {
19236
5
  MachineFunction &MF = DAG.getMachineFunction();
19237
5
  SDLoc dl;
19238
5
19239
5
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
19240
5
  MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
19241
5
19242
5
  // It's possible that the parent function no longer has a personality function
19243
5
  // if the exceptional code was optimized away, in which case we just return
19244
5
  // the incoming EBP.
19245
5
  if (!Fn->hasPersonalityFn())
19246
0
    return EntryEBP;
19247
5
19248
5
  // Get an MCSymbol that will ultimately resolve to the frame offset of the EH
19249
5
  // registration, or the .set_setframe offset.
19250
5
  MCSymbol *OffsetSym =
19251
5
      MF.getMMI().getContext().getOrCreateParentFrameOffsetSymbol(
19252
5
          GlobalValue::dropLLVMManglingEscape(Fn->getName()));
19253
5
  SDValue OffsetSymVal = DAG.getMCSymbol(OffsetSym, PtrVT);
19254
5
  SDValue ParentFrameOffset =
19255
5
      DAG.getNode(ISD::LOCAL_RECOVER, dl, PtrVT, OffsetSymVal);
19256
5
19257
5
  // Return EntryEBP + ParentFrameOffset for x64. This adjusts from RSP after
19258
5
  // prologue to RBP in the parent function.
19259
5
  const X86Subtarget &Subtarget =
19260
5
      static_cast<const X86Subtarget &>(DAG.getSubtarget());
19261
5
  if (Subtarget.is64Bit())
19262
0
    return DAG.getNode(ISD::ADD, dl, PtrVT, EntryEBP, ParentFrameOffset);
19263
5
19264
5
  int RegNodeSize = getSEHRegistrationNodeSize(Fn);
19265
5
  // RegNodeBase = EntryEBP - RegNodeSize
19266
5
  // ParentFP = RegNodeBase - ParentFrameOffset
19267
5
  SDValue RegNodeBase = DAG.getNode(ISD::SUB, dl, PtrVT, EntryEBP,
19268
5
                                    DAG.getConstant(RegNodeSize, dl, PtrVT));
19269
5
  return DAG.getNode(ISD::SUB, dl, PtrVT, RegNodeBase, ParentFrameOffset);
19270
5
}
19271
19272
static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget &Subtarget,
19273
14.0k
                                       SelectionDAG &DAG) {
19274
14.0k
  // Helper to detect if the operand is CUR_DIRECTION rounding mode.
19275
310
  auto isRoundModeCurDirection = [](SDValue Rnd) {
19276
310
    if (!isa<ConstantSDNode>(Rnd))
19277
0
      return false;
19278
310
19279
310
    unsigned Round = cast<ConstantSDNode>(Rnd)->getZExtValue();
19280
310
    return Round == X86::STATIC_ROUNDING::CUR_DIRECTION;
19281
310
  };
19282
14.0k
19283
14.0k
  SDLoc dl(Op);
19284
14.0k
  unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
19285
14.0k
  MVT VT = Op.getSimpleValueType();
19286
14.0k
  const IntrinsicData* IntrData = getIntrinsicWithoutChain(IntNo);
19287
14.0k
  if (IntrData) {
19288
7.92k
    switch(IntrData->Type) {
19289
371
    case INTR_TYPE_1OP:
19290
371
      return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1));
19291
3.72k
    case INTR_TYPE_2OP:
19292
3.72k
      return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1),
19293
3.72k
        Op.getOperand(2));
19294
1.04k
    case INTR_TYPE_3OP:
19295
1.04k
      return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1),
19296
1.04k
        Op.getOperand(2), Op.getOperand(3));
19297
144
    case INTR_TYPE_4OP:
19298
144
      return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1),
19299
144
        Op.getOperand(2), Op.getOperand(3), Op.getOperand(4));
19300
26
    case INTR_TYPE_1OP_MASK_RM: {
19301
26
      SDValue Src = Op.getOperand(1);
19302
26
      SDValue PassThru = Op.getOperand(2);
19303
26
      SDValue Mask = Op.getOperand(3);
19304
26
      SDValue RoundingMode;
19305
26
      // We always add rounding mode to the Node.
19306
26
      // If the rounding mode is not specified, we add the
19307
26
      // "current direction" mode.
19308
26
      if (Op.getNumOperands() == 4)
19309
8
        RoundingMode =
19310
8
          DAG.getConstant(X86::STATIC_ROUNDING::CUR_DIRECTION, dl, MVT::i32);
19311
26
      else
19312
18
        RoundingMode = Op.getOperand(4);
19313
26
      assert(IntrData->Opc1 == 0 && "Unexpected second opcode!");
19314
26
      return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src,
19315
26
                                              RoundingMode),
19316
26
                                  Mask, PassThru, Subtarget, DAG);
19317
7.92k
    }
19318
399
    case INTR_TYPE_1OP_MASK: {
19319
399
      SDValue Src = Op.getOperand(1);
19320
399
      SDValue PassThru = Op.getOperand(2);
19321
399
      SDValue Mask = Op.getOperand(3);
19322
399
      // We add rounding mode to the Node when
19323
399
      //   - RM Opcode is specified and
19324
399
      //   - RM is not "current direction".
19325
399
      unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
19326
399
      if (IntrWithRoundingModeOpcode != 0) {
19327
49
        SDValue Rnd = Op.getOperand(4);
19328
49
        if (!isRoundModeCurDirection(Rnd)) {
19329
31
          return getVectorMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,
19330
31
                                      dl, Op.getValueType(),
19331
31
                                      Src, Rnd),
19332
31
                                      Mask, PassThru, Subtarget, DAG);
19333
31
        }
19334
368
      }
19335
368
      return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src),
19336
368
                                  Mask, PassThru, Subtarget, DAG);
19337
368
    }
19338
25
    case INTR_TYPE_SCALAR_MASK: {
19339
25
      SDValue Src1 = Op.getOperand(1);
19340
25
      SDValue Src2 = Op.getOperand(2);
19341
25
      SDValue passThru = Op.getOperand(3);
19342
25
      SDValue Mask = Op.getOperand(4);
19343
25
      unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
19344
25
      if (IntrWithRoundingModeOpcode != 0) {
19345
17
        SDValue Rnd = Op.getOperand(5);
19346
17
        if (!isRoundModeCurDirection(Rnd))
19347
6
          return getScalarMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,
19348
6
                                                  dl, VT, Src1, Src2, Rnd),
19349
6
                                      Mask, passThru, Subtarget, DAG);
19350
19
      }
19351
19
      return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2),
19352
19
                                  Mask, passThru, Subtarget, DAG);
19353
19
    }
19354
65
    case INTR_TYPE_SCALAR_MASK_RM: {
19355
65
      SDValue Src1 = Op.getOperand(1);
19356
65
      SDValue Src2 = Op.getOperand(2);
19357
65
      SDValue Src0 = Op.getOperand(3);
19358
65
      SDValue Mask = Op.getOperand(4);
19359
65
      // There are 2 kinds of intrinsics in this group:
19360
65
      // (1) With suppress-all-exceptions (sae) or rounding mode- 6 operands
19361
65
      // (2) With rounding mode and sae - 7 operands.
19362
65
      if (Op.getNumOperands() == 6) {
19363
57
        SDValue Sae  = Op.getOperand(5);
19364
57
        return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2,
19365
57
                                                Sae),
19366
57
                                    Mask, Src0, Subtarget, DAG);
19367
57
      }
19368
65
      assert(Op.getNumOperands() == 7 && "Unexpected intrinsic form");
19369
8
      SDValue RoundingMode  = Op.getOperand(5);
19370
8
      SDValue Sae  = Op.getOperand(6);
19371
8
      return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2,
19372
8
                                              RoundingMode, Sae),
19373
8
                                  Mask, Src0, Subtarget, DAG);
19374
8
    }
19375
418
    case INTR_TYPE_2OP_MASK:
19376
418
    case INTR_TYPE_2OP_IMM8_MASK: {
19377
418
      SDValue Src1 = Op.getOperand(1);
19378
418
      SDValue Src2 = Op.getOperand(2);
19379
418
      SDValue PassThru = Op.getOperand(3);
19380
418
      SDValue Mask = Op.getOperand(4);
19381
418
19382
418
      if (IntrData->Type == INTR_TYPE_2OP_IMM8_MASK)
19383
30
        Src2 = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Src2);
19384
418
19385
418
      // We specify 2 possible opcodes for intrinsics with rounding modes.
19386
418
      // First, we check if the intrinsic may have non-default rounding mode,
19387
418
      // (IntrData->Opc1 != 0), then we check the rounding mode operand.
19388
418
      unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
19389
418
      if (IntrWithRoundingModeOpcode != 0) {
19390
86
        SDValue Rnd = Op.getOperand(5);
19391
86
        if (!isRoundModeCurDirection(Rnd)) {
19392
58
          return getVectorMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,
19393
58
                                      dl, Op.getValueType(),
19394
58
                                      Src1, Src2, Rnd),
19395
58
                                      Mask, PassThru, Subtarget, DAG);
19396
58
        }
19397
360
      }
19398
360
      // TODO: Intrinsics should have fast-math-flags to propagate.
19399
360
      return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,Src1,Src2),
19400
360
                                  Mask, PassThru, Subtarget, DAG);
19401
360
    }
19402
47
    case INTR_TYPE_2OP_MASK_RM: {
19403
47
      SDValue Src1 = Op.getOperand(1);
19404
47
      SDValue Src2 = Op.getOperand(2);
19405
47
      SDValue PassThru = Op.getOperand(3);
19406
47
      SDValue Mask = Op.getOperand(4);
19407
47
      // We specify 2 possible modes for intrinsics, with/without rounding
19408
47
      // modes.
19409
47
      // First, we check if the intrinsic have rounding mode (6 operands),
19410
47
      // if not, we set rounding mode to "current".
19411
47
      SDValue Rnd;
19412
47
      if (Op.getNumOperands() == 6)
19413
14
        Rnd = Op.getOperand(5);
19414
47
      else
19415
33
        Rnd = DAG.getConstant(X86::STATIC_ROUNDING::CUR_DIRECTION, dl, MVT::i32);
19416
47
      return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
19417
47
                                              Src1, Src2, Rnd),
19418
47
                                  Mask, PassThru, Subtarget, DAG);
19419
360
    }
19420
8
    case INTR_TYPE_3OP_SCALAR_MASK_RM: {
19421
8
      SDValue Src1 = Op.getOperand(1);
19422
8
      SDValue Src2 = Op.getOperand(2);
19423
8
      SDValue Src3 = Op.getOperand(3);
19424
8
      SDValue PassThru = Op.getOperand(4);
19425
8
      SDValue Mask = Op.getOperand(5);
19426
8
      SDValue Sae  = Op.getOperand(6);
19427
8
19428
8
      return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1,
19429
8
                                              Src2, Src3, Sae),
19430
8
                                  Mask, PassThru, Subtarget, DAG);
19431
360
    }
19432
12
    case INTR_TYPE_3OP_MASK_RM: {
19433
12
      SDValue Src1 = Op.getOperand(1);
19434
12
      SDValue Src2 = Op.getOperand(2);
19435
12
      SDValue Imm = Op.getOperand(3);
19436
12
      SDValue PassThru = Op.getOperand(4);
19437
12
      SDValue Mask = Op.getOperand(5);
19438
12
      // We specify 2 possible modes for intrinsics, with/without rounding
19439
12
      // modes.
19440
12
      // First, we check if the intrinsic have rounding mode (7 operands),
19441
12
      // if not, we set rounding mode to "current".
19442
12
      SDValue Rnd;
19443
12
      if (Op.getNumOperands() == 7)
19444
4
        Rnd = Op.getOperand(6);
19445
12
      else
19446
8
        Rnd = DAG.getConstant(X86::STATIC_ROUNDING::CUR_DIRECTION, dl, MVT::i32);
19447
12
      return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
19448
12
                                              Src1, Src2, Imm, Rnd),
19449
12
                                  Mask, PassThru, Subtarget, DAG);
19450
360
    }
19451
31
    case INTR_TYPE_3OP_IMM8_MASK:
19452
31
    case INTR_TYPE_3OP_MASK: {
19453
31
      SDValue Src1 = Op.getOperand(1);
19454
31
      SDValue Src2 = Op.getOperand(2);
19455
31
      SDValue Src3 = Op.getOperand(3);
19456
31
      SDValue PassThru = Op.getOperand(4);
19457
31
      SDValue Mask = Op.getOperand(5);
19458
31
19459
31
      if (IntrData->Type == INTR_TYPE_3OP_IMM8_MASK)
19460
31
        Src3 = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Src3);
19461
31
19462
31
      // We specify 2 possible opcodes for intrinsics with rounding modes.
19463
31
      // First, we check if the intrinsic may have non-default rounding mode,
19464
31
      // (IntrData->Opc1 != 0), then we check the rounding mode operand.
19465
31
      unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
19466
31
      if (IntrWithRoundingModeOpcode != 0) {
19467
0
        SDValue Rnd = Op.getOperand(6);
19468
0
        if (!isRoundModeCurDirection(Rnd)) {
19469
0
          return getVectorMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,
19470
0
                                      dl, Op.getValueType(),
19471
0
                                      Src1, Src2, Src3, Rnd),
19472
0
                                      Mask, PassThru, Subtarget, DAG);
19473
0
        }
19474
31
      }
19475
31
      return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
19476
31
                                              Src1, Src2, Src3),
19477
31
                                  Mask, PassThru, Subtarget, DAG);
19478
31
    }
19479
102
    case VPERM_2OP_MASK : {
19480
102
      SDValue Src1 = Op.getOperand(1);
19481
102
      SDValue Src2 = Op.getOperand(2);
19482
102
      SDValue PassThru = Op.getOperand(3);
19483
102
      SDValue Mask = Op.getOperand(4);
19484
102
19485
102
      // Swap Src1 and Src2 in the node creation
19486
102
      return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,Src2, Src1),
19487
102
                                  Mask, PassThru, Subtarget, DAG);
19488
31
    }
19489
334
    case VPERM_3OP_MASKZ:
19490
334
    case VPERM_3OP_MASK:{
19491
334
      MVT VT = Op.getSimpleValueType();
19492
334
      // Src2 is the PassThru
19493
334
      SDValue Src1 = Op.getOperand(1);
19494
334
      // PassThru needs to be the same type as the destination in order
19495
334
      // to pattern match correctly.
19496
334
      SDValue Src2 = DAG.getBitcast(VT, Op.getOperand(2));
19497
334
      SDValue Src3 = Op.getOperand(3);
19498
334
      SDValue Mask = Op.getOperand(4);
19499
334
      SDValue PassThru = SDValue();
19500
334
19501
334
      // set PassThru element
19502
334
      if (IntrData->Type == VPERM_3OP_MASKZ)
19503
153
        PassThru = getZeroVector(VT, Subtarget, DAG, dl);
19504
334
      else
19505
181
        PassThru = Src2;
19506
334
19507
334
      // Swap Src1 and Src2 in the node creation
19508
334
      return getVectorMaskingNode(DAG.getNode(IntrData->Opc0,
19509
334
                                              dl, Op.getValueType(),
19510
334
                                              Src2, Src1, Src3),
19511
334
                                  Mask, PassThru, Subtarget, DAG);
19512
334
    }
19513
230
    case FMA_OP_MASK3:
19514
230
    case FMA_OP_MASKZ:
19515
230
    case FMA_OP_MASK: {
19516
230
      SDValue Src1 = Op.getOperand(1);
19517
230
      SDValue Src2 = Op.getOperand(2);
19518
230
      SDValue Src3 = Op.getOperand(3);
19519
230
      SDValue Mask = Op.getOperand(4);
19520
230
      MVT VT = Op.getSimpleValueType();
19521
230
      SDValue PassThru = SDValue();
19522
230
19523
230
      // set PassThru element
19524
230
      if (IntrData->Type == FMA_OP_MASKZ)
19525
24
        PassThru = getZeroVector(VT, Subtarget, DAG, dl);
19526
206
      else if (IntrData->Type == FMA_OP_MASK3)
19527
60
        PassThru = Src3;
19528
206
      else
19529
146
        PassThru = Src1;
19530
230
19531
230
      // We specify 2 possible opcodes for intrinsics with rounding modes.
19532
230
      // First, we check if the intrinsic may have non-default rounding mode,
19533
230
      // (IntrData->Opc1 != 0), then we check the rounding mode operand.
19534
230
      unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
19535
230
      if (IntrWithRoundingModeOpcode != 0) {
19536
108
        SDValue Rnd = Op.getOperand(5);
19537
108
        if (!isRoundModeCurDirection(Rnd))
19538
56
          return getVectorMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,
19539
56
                                                  dl, Op.getValueType(),
19540
56
                                                  Src1, Src2, Src3, Rnd),
19541
56
                                      Mask, PassThru, Subtarget, DAG);
19542
174
      }
19543
174
      return getVectorMaskingNode(DAG.getNode(IntrData->Opc0,
19544
174
                                              dl, Op.getValueType(),
19545
174
                                              Src1, Src2, Src3),
19546
174
                                  Mask, PassThru, Subtarget, DAG);
19547
174
    }
19548
59
    case FMA_OP_SCALAR_MASK:
19549
59
    case FMA_OP_SCALAR_MASK3:
19550
59
    case FMA_OP_SCALAR_MASKZ: {
19551
59
      SDValue Src1 = Op.getOperand(1);
19552
59
      SDValue Src2 = Op.getOperand(2);
19553
59
      SDValue Src3 = Op.getOperand(3);
19554
59
      SDValue Mask = Op.getOperand(4);
19555
59
      MVT VT = Op.getSimpleValueType();
19556
59
      SDValue PassThru = SDValue();
19557
59
19558
59
      // set PassThru element
19559
59
      if (IntrData->Type == FMA_OP_SCALAR_MASKZ)
19560
11
        PassThru = getZeroVector(VT, Subtarget, DAG, dl);
19561
48
      else if (IntrData->Type == FMA_OP_SCALAR_MASK3)
19562
27
        PassThru = Src3;
19563
48
      else
19564
21
        PassThru = Src1;
19565
59
19566
59
      SDValue Rnd = Op.getOperand(5);
19567
59
      return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl,
19568
59
                                              Op.getValueType(), Src1, Src2,
19569
59
                                              Src3, Rnd),
19570
59
                                  Mask, PassThru, Subtarget, DAG);
19571
59
    }
19572
60
    case IFMA_OP_MASKZ:
19573
60
    case IFMA_OP_MASK: {
19574
60
      SDValue Src1 = Op.getOperand(1);
19575
60
      SDValue Src2 = Op.getOperand(2);
19576
60
      SDValue Src3 = Op.getOperand(3);
19577
60
      SDValue Mask = Op.getOperand(4);
19578
60
      MVT VT = Op.getSimpleValueType();
19579
60
      SDValue PassThru = Src1;
19580
60
19581
60
      // set PassThru element
19582
60
      if (IntrData->Type == IFMA_OP_MASKZ)
19583
28
        PassThru = getZeroVector(VT, Subtarget, DAG, dl);
19584
60
19585
60
      // Note we need to swizzle the operands to pass the multiply operands
19586
60
      // first.
19587
60
      return getVectorMaskingNode(DAG.getNode(IntrData->Opc0,
19588
60
                                              dl, Op.getValueType(),
19589
60
                                              Src2, Src3, Src1),
19590
60
                                  Mask, PassThru, Subtarget, DAG);
19591
60
    }
19592
114
    case TERLOG_OP_MASK:
19593
114
    case TERLOG_OP_MASKZ: {
19594
114
      SDValue Src1 = Op.getOperand(1);
19595
114
      SDValue Src2 = Op.getOperand(2);
19596
114
      SDValue Src3 = Op.getOperand(3);
19597
114
      SDValue Src4 = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op.getOperand(4));
19598
114
      SDValue Mask = Op.getOperand(5);
19599
114
      MVT VT = Op.getSimpleValueType();
19600
114
      SDValue PassThru = Src1;
19601
114
      // Set PassThru element.
19602
114
      if (IntrData->Type == TERLOG_OP_MASKZ)
19603
39
        PassThru = getZeroVector(VT, Subtarget, DAG, dl);
19604
114
19605
114
      return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
19606
114
                                              Src1, Src2, Src3, Src4),
19607
114
                                  Mask, PassThru, Subtarget, DAG);
19608
114
    }
19609
4
    case CVTPD2PS:
19610
4
      // ISD::FP_ROUND has a second argument that indicates if the truncation
19611
4
      // does not change the value. Set it to 0 since it can change.
19612
4
      return DAG.getNode(IntrData->Opc0, dl, VT, Op.getOperand(1),
19613
4
                         DAG.getIntPtrConstant(0, dl));
19614
4
    case CVTPD2PS_MASK: {
19615
4
      SDValue Src = Op.getOperand(1);
19616
4
      SDValue PassThru = Op.getOperand(2);
19617
4
      SDValue Mask = Op.getOperand(3);
19618
4
      // We add rounding mode to the Node when
19619
4
      //   - RM Opcode is specified and
19620
4
      //   - RM is not "current direction".
19621
4
      unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
19622
4
      if (IntrWithRoundingModeOpcode != 0) {
19623
2
        SDValue Rnd = Op.getOperand(4);
19624
2
        if (!isRoundModeCurDirection(Rnd)) {
19625
1
          return getVectorMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,
19626
1
                                      dl, Op.getValueType(),
19627
1
                                      Src, Rnd),
19628
1
                                      Mask, PassThru, Subtarget, DAG);
19629
1
        }
19630
3
      }
19631
4
      assert(IntrData->Opc0 == ISD::FP_ROUND && "Unexpected opcode!");
19632
3
      // ISD::FP_ROUND has a second argument that indicates if the truncation
19633
3
      // does not change the value. Set it to 0 since it can change.
19634
3
      return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src,
19635
3
                                              DAG.getIntPtrConstant(0, dl)),
19636
3
                                  Mask, PassThru, Subtarget, DAG);
19637
3
    }
19638
12
    case FPCLASS: {
19639
12
      // FPclass intrinsics with mask
19640
12
       SDValue Src1 = Op.getOperand(1);
19641
12
       MVT VT = Src1.getSimpleValueType();
19642
12
       MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
19643
12
       SDValue Imm = Op.getOperand(2);
19644
12
       SDValue Mask = Op.getOperand(3);
19645
12
       MVT BitcastVT = MVT::getVectorVT(MVT::i1,
19646
12
                                     Mask.getSimpleValueType().getSizeInBits());
19647
12
       SDValue FPclass = DAG.getNode(IntrData->Opc0, dl, MaskVT, Src1, Imm);
19648
12
       SDValue FPclassMask = getVectorMaskingNode(FPclass, Mask,
19649
12
                                                 DAG.getTargetConstant(0, dl, MaskVT),
19650
12
                                                 Subtarget, DAG);
19651
12
       SDValue Res = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, BitcastVT,
19652
12
                                 DAG.getUNDEF(BitcastVT), FPclassMask,
19653
12
                                 DAG.getIntPtrConstant(0, dl));
19654
12
       return DAG.getBitcast(Op.getValueType(), Res);
19655
3
    }
19656
4
    case FPCLASSS: {
19657
4
      SDValue Src1 = Op.getOperand(1);
19658
4
      SDValue Imm = Op.getOperand(2);
19659
4
      SDValue Mask = Op.getOperand(3);
19660
4
      SDValue FPclass = DAG.getNode(IntrData->Opc0, dl, MVT::v1i1, Src1, Imm);
19661
4
      SDValue FPclassMask = getScalarMaskingNode(FPclass, Mask,
19662
4
        DAG.getTargetConstant(0, dl, MVT::i1), Subtarget, DAG);
19663
4
      return DAG.getNode(X86ISD::VEXTRACT, dl, MVT::i8, FPclassMask,
19664
4
                         DAG.getIntPtrConstant(0, dl));
19665
3
    }
19666
96
    case CMP_MASK:
19667
96
    case CMP_MASK_CC: {
19668
96
      // Comparison intrinsics with masks.
19669
96
      // Example of transformation:
19670
96
      // (i8 (int_x86_avx512_mask_pcmpeq_q_128
19671
96
      //             (v2i64 %a), (v2i64 %b), (i8 %mask))) ->
19672
96
      // (i8 (bitcast
19673
96
      //   (v8i1 (insert_subvector undef,
19674
96
      //           (v2i1 (and (PCMPEQM %a, %b),
19675
96
      //                      (extract_subvector
19676
96
      //                         (v8i1 (bitcast %mask)), 0))), 0))))
19677
96
      MVT VT = Op.getOperand(1).getSimpleValueType();
19678
96
      MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
19679
96
      SDValue Mask = Op.getOperand((IntrData->Type == CMP_MASK_CC) ? 4 : 3);
19680
96
      MVT BitcastVT = MVT::getVectorVT(MVT::i1,
19681
96
                                       Mask.getSimpleValueType().getSizeInBits());
19682
96
      SDValue Cmp;
19683
96
      if (IntrData->Type == CMP_MASK_CC) {
19684
36
        SDValue CC = Op.getOperand(3);
19685
36
        CC = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, CC);
19686
36
        // We specify 2 possible opcodes for intrinsics with rounding modes.
19687
36
        // First, we check if the intrinsic may have non-default rounding mode,
19688
36
        // (IntrData->Opc1 != 0), then we check the rounding mode operand.
19689
36
        if (IntrData->Opc1 != 0) {
19690
28
          SDValue Rnd = Op.getOperand(5);
19691
28
          if (!isRoundModeCurDirection(Rnd))
19692
21
            Cmp = DAG.getNode(IntrData->Opc1, dl, MaskVT, Op.getOperand(1),
19693
21
                              Op.getOperand(2), CC, Rnd);
19694
28
        }
19695
36
        //default rounding mode
19696
36
        if(!Cmp.getNode())
19697
15
            Cmp = DAG.getNode(IntrData->Opc0, dl, MaskVT, Op.getOperand(1),
19698
15
                              Op.getOperand(2), CC);
19699
36
19700
96
      } else {
19701
60
        assert(IntrData->Type == CMP_MASK && "Unexpected intrinsic type!");
19702
60
        Cmp = DAG.getNode(IntrData->Opc0, dl, MaskVT, Op.getOperand(1),
19703
60
                          Op.getOperand(2));
19704
60
      }
19705
96
      SDValue CmpMask = getVectorMaskingNode(Cmp, Mask,
19706
96
                                             DAG.getTargetConstant(0, dl,
19707
96
                                                                   MaskVT),
19708
96
                                             Subtarget, DAG);
19709
96
      SDValue Res = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, BitcastVT,
19710
96
                                DAG.getUNDEF(BitcastVT), CmpMask,
19711
96
                                DAG.getIntPtrConstant(0, dl));
19712
96
      return DAG.getBitcast(Op.getValueType(), Res);
19713
96
    }
19714
11
    case CMP_MASK_SCALAR_CC: {
19715
11
      SDValue Src1 = Op.getOperand(1);
19716
11
      SDValue Src2 = Op.getOperand(2);
19717
11
      SDValue CC = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op.getOperand(3));
19718
11
      SDValue Mask = Op.getOperand(4);
19719
11
19720
11
      SDValue Cmp;
19721
11
      if (IntrData->Opc1 != 0) {
19722
11
        SDValue Rnd = Op.getOperand(5);
19723
11
        if (!isRoundModeCurDirection(Rnd))
19724
5
          Cmp = DAG.getNode(IntrData->Opc1, dl, MVT::v1i1, Src1, Src2, CC, Rnd);
19725
11
      }
19726
11
      //default rounding mode
19727
11
      if(!Cmp.getNode())
19728
6
        Cmp = DAG.getNode(IntrData->Opc0, dl, MVT::v1i1, Src1, Src2, CC);
19729
11
19730
11
      SDValue CmpMask = getScalarMaskingNode(Cmp, Mask,
19731
11
                                             DAG.getTargetConstant(0, dl,
19732
11
                                                                   MVT::i1),
19733
11
                                             Subtarget, DAG);
19734
11
      return DAG.getNode(X86ISD::VEXTRACT, dl, MVT::i8, CmpMask,
19735
11
                         DAG.getIntPtrConstant(0, dl));
19736
96
    }
19737
201
    case COMI: { // Comparison intrinsics
19738
201
      ISD::CondCode CC = (ISD::CondCode)IntrData->Opc1;
19739
201
      SDValue LHS = Op.getOperand(1);
19740
201
      SDValue RHS = Op.getOperand(2);
19741
201
      SDValue Comi = DAG.getNode(IntrData->Opc0, dl, MVT::i32, LHS, RHS);
19742
201
      SDValue InvComi = DAG.getNode(IntrData->Opc0, dl, MVT::i32, RHS, LHS);
19743
201
      SDValue SetCC;
19744
201
      switch (CC) {
19745
100
      case ISD::SETEQ: { // (ZF = 0 and PF = 0)
19746
100
        SetCC = getSETCC(X86::COND_E, Comi, dl, DAG);
19747
100
        SDValue SetNP = getSETCC(X86::COND_NP, Comi, dl, DAG);
19748
100
        SetCC = DAG.getNode(ISD::AND, dl, MVT::i8, SetCC, SetNP);
19749
100
        break;
19750
201
      }
19751
20
      case ISD::SETNE: { // (ZF = 1 or PF = 1)
19752
20
        SetCC = getSETCC(X86::COND_NE, Comi, dl, DAG);
19753
20
        SDValue SetP = getSETCC(X86::COND_P, Comi, dl, DAG);
19754
20
        SetCC = DAG.getNode(ISD::OR, dl, MVT::i8, SetCC, SetP);
19755
20
        break;
19756
201
      }
19757
20
      case ISD::SETGT: // (CF = 0 and ZF = 0)
19758
20
        SetCC = getSETCC(X86::COND_A, Comi, dl, DAG);
19759
20
        break;
19760
20
      case ISD::SETLT: { // The condition is opposite to GT. Swap the operands.
19761
20
        SetCC = getSETCC(X86::COND_A, InvComi, dl, DAG);
19762
20
        break;
19763
201
      }
19764
21
      case ISD::SETGE: // CF = 0
19765
21
        SetCC = getSETCC(X86::COND_AE, Comi, dl, DAG);
19766
21
        break;
19767
20
      case ISD::SETLE: // The condition is opposite to GE. Swap the operands.
19768
20
        SetCC = getSETCC(X86::COND_AE, InvComi, dl, DAG);
19769
20
        break;
19770
0
      default:
19771
0
        llvm_unreachable("Unexpected illegal condition!");
19772
201
      }
19773
201
      return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
19774
201
    }
19775
9
    case COMI_RM: { // Comparison intrinsics with Sae
19776
9
      SDValue LHS = Op.getOperand(1);
19777
9
      SDValue RHS = Op.getOperand(2);
19778
9
      unsigned CondVal = cast<ConstantSDNode>(Op.getOperand(3))->getZExtValue();
19779
9
      SDValue Sae = Op.getOperand(4);
19780
9
19781
9
      SDValue FCmp;
19782
9
      if (isRoundModeCurDirection(Sae))
19783
5
        FCmp = DAG.getNode(X86ISD::FSETCCM, dl, MVT::v1i1, LHS, RHS,
19784
5
                           DAG.getConstant(CondVal, dl, MVT::i8));
19785
9
      else
19786
4
        FCmp = DAG.getNode(X86ISD::FSETCCM_RND, dl, MVT::v1i1, LHS, RHS,
19787
4
                           DAG.getConstant(CondVal, dl, MVT::i8), Sae);
19788
9
      return DAG.getNode(X86ISD::VEXTRACT, dl, MVT::i32, FCmp,
19789
9
                         DAG.getIntPtrConstant(0, dl));
19790
201
    }
19791
279
    case VSHIFT:
19792
279
      return getTargetVShiftNode(IntrData->Opc0, dl, Op.getSimpleValueType(),
19793
279
                                 Op.getOperand(1), Op.getOperand(2), Subtarget,
19794
279
                                 DAG);
19795
12
    case COMPRESS_EXPAND_IN_REG: {
19796
12
      SDValue Mask = Op.getOperand(3);
19797
12
      SDValue DataToCompress = Op.getOperand(1);
19798
12
      SDValue PassThru = Op.getOperand(2);
19799
12
      if (isAllOnesConstant(Mask)) // return data as is
19800
2
        return Op.getOperand(1);
19801
10
19802
10
      return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
19803
10
                                              DataToCompress),
19804
10
                                  Mask, PassThru, Subtarget, DAG);
19805
10
    }
19806
6
    case BROADCASTM: {
19807
6
      SDValue Mask = Op.getOperand(1);
19808
6
      MVT MaskVT = MVT::getVectorVT(MVT::i1,
19809
6
                                    Mask.getSimpleValueType().getSizeInBits());
19810
6
      Mask = DAG.getBitcast(MaskVT, Mask);
19811
6
      return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Mask);
19812
10
    }
19813
5
    case KUNPCK: {
19814
5
      MVT VT = Op.getSimpleValueType();
19815
5
      MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getSizeInBits()/2);
19816
5
19817
5
      SDValue Src1 = getMaskNode(Op.getOperand(1), MaskVT, Subtarget, DAG, dl);
19818
5
      SDValue Src2 = getMaskNode(Op.getOperand(2), MaskVT, Subtarget, DAG, dl);
19819
5
      // Arguments should be swapped.
19820
5
      SDValue Res = DAG.getNode(IntrData->Opc0, dl,
19821
5
                                MVT::getVectorVT(MVT::i1, VT.getSizeInBits()),
19822
5
                                Src2, Src1);
19823
5
      return DAG.getBitcast(VT, Res);
19824
10
    }
19825
9
    case MASK_BINOP: {
19826
9
      MVT VT = Op.getSimpleValueType();
19827
9
      MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getSizeInBits());
19828
9
19829
9
      SDValue Src1 = getMaskNode(Op.getOperand(1), MaskVT, Subtarget, DAG, dl);
19830
9
      SDValue Src2 = getMaskNode(Op.getOperand(2), MaskVT, Subtarget, DAG, dl);
19831
9
      SDValue Res = DAG.getNode(IntrData->Opc0, dl, MaskVT, Src1, Src2);
19832
9
      return DAG.getBitcast(VT, Res);
19833
10
    }
19834
47
    case FIXUPIMMS:
19835
47
    case FIXUPIMMS_MASKZ:
19836
47
    case FIXUPIMM:
19837
47
    case FIXUPIMM_MASKZ:{
19838
47
      SDValue Src1 = Op.getOperand(1);
19839
47
      SDValue Src2 = Op.getOperand(2);
19840
47
      SDValue Src3 = Op.getOperand(3);
19841
47
      SDValue Imm = Op.getOperand(4);
19842
47
      SDValue Mask = Op.getOperand(5);
19843
29
      SDValue Passthru = (IntrData->Type == FIXUPIMM || IntrData->Type == FIXUPIMMS ) ?
19844
47
                                         Src1 : getZeroVector(VT, Subtarget, DAG, dl);
19845
47
      // We specify 2 possible modes for intrinsics, with/without rounding
19846
47
      // modes.
19847
47
      // First, we check if the intrinsic have rounding mode (7 operands),
19848
47
      // if not, we set rounding mode to "current".
19849
47
      SDValue Rnd;
19850
47
      if (Op.getNumOperands() == 7)
19851
24
        Rnd = Op.getOperand(6);
19852
47
      else
19853
23
        Rnd = DAG.getConstant(X86::STATIC_ROUNDING::CUR_DIRECTION, dl, MVT::i32);
19854
47
      if (IntrData->Type == FIXUPIMM || IntrData->Type == FIXUPIMM_MASKZ)
19855
35
        return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
19856
35
                                                Src1, Src2, Src3, Imm, Rnd),
19857
35
                                    Mask, Passthru, Subtarget, DAG);
19858
47
      else // Scalar - FIXUPIMMS, FIXUPIMMS_MASKZ
19859
12
        return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
19860
12
                                       Src1, Src2, Src3, Imm, Rnd),
19861
12
                                    Mask, Passthru, Subtarget, DAG);
19862
0
    }
19863
16
    case CONVERT_TO_MASK: {
19864
16
      MVT SrcVT = Op.getOperand(1).getSimpleValueType();
19865
16
      MVT MaskVT = MVT::getVectorVT(MVT::i1, SrcVT.getVectorNumElements());
19866
16
      MVT BitcastVT = MVT::getVectorVT(MVT::i1, VT.getSizeInBits());
19867
16
19868
16
      SDValue CvtMask = DAG.getNode(IntrData->Opc0, dl, MaskVT,
19869
16
                                    Op.getOperand(1));
19870
16
      SDValue Res = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, BitcastVT,
19871
16
                                DAG.getUNDEF(BitcastVT), CvtMask,
19872
16
                                DAG.getIntPtrConstant(0, dl));
19873
16
      return DAG.getBitcast(Op.getValueType(), Res);
19874
0
    }
19875
0
    case BRCST32x2_TO_VEC: {
19876
0
      SDValue Src = Op.getOperand(1);
19877
0
      SDValue PassThru = Op.getOperand(2);
19878
0
      SDValue Mask = Op.getOperand(3);
19879
0
19880
0
      assert((VT.getScalarType() == MVT::i32 ||
19881
0
              VT.getScalarType() == MVT::f32) && "Unexpected type!");
19882
0
      //bitcast Src to packed 64
19883
0
      MVT ScalarVT = VT.getScalarType() == MVT::i32 ? MVT::i64 : MVT::f64;
19884
0
      MVT BitcastVT = MVT::getVectorVT(ScalarVT, Src.getValueSizeInBits()/64);
19885
0
      Src = DAG.getBitcast(BitcastVT, Src);
19886
0
      MVT ResVT = MVT::getVectorVT(ScalarVT, VT.getSizeInBits()/64);
19887
0
      SDValue Res = DAG.getNode(IntrData->Opc0, dl, ResVT, Src);
19888
0
      Res = DAG.getBitcast(VT, Res);
19889
0
19890
0
      return getVectorMaskingNode(Res, Mask, PassThru, Subtarget, DAG);
19891
0
    }
19892
0
    default:
19893
0
      break;
19894
6.17k
    }
19895
6.17k
  }
19896
6.17k
19897
6.17k
  switch (IntNo) {
19898
5.74k
  default: return SDValue();    // Don't custom lower most intrinsics.
19899
6.17k
19900
96
  case Intrinsic::x86_avx2_permd:
19901
96
  case Intrinsic::x86_avx2_permps:
19902
96
    // Operands intentionally swapped. Mask is last operand to intrinsic,
19903
96
    // but second operand for node/instruction.
19904
96
    return DAG.getNode(X86ISD::VPERMV, dl, Op.getValueType(),
19905
96
                       Op.getOperand(2), Op.getOperand(1));
19906
96
19907
96
  // ptest and testp intrinsics. The intrinsics these come from are designed to
19908
96
  // return an integer value, not just an instruction so lower it to the ptest
19909
96
  // or testp pattern and a setcc for the result.
19910
181
  case Intrinsic::x86_sse41_ptestz:
19911
181
  case Intrinsic::x86_sse41_ptestc:
19912
181
  case Intrinsic::x86_sse41_ptestnzc:
19913
181
  case Intrinsic::x86_avx_ptestz_256:
19914
181
  case Intrinsic::x86_avx_ptestc_256:
19915
181
  case Intrinsic::x86_avx_ptestnzc_256:
19916
181
  case Intrinsic::x86_avx_vtestz_ps:
19917
181
  case Intrinsic::x86_avx_vtestc_ps:
19918
181
  case Intrinsic::x86_avx_vtestnzc_ps:
19919
181
  case Intrinsic::x86_avx_vtestz_pd:
19920
181
  case Intrinsic::x86_avx_vtestc_pd:
19921
181
  case Intrinsic::x86_avx_vtestnzc_pd:
19922
181
  case Intrinsic::x86_avx_vtestz_ps_256:
19923
181
  case Intrinsic::x86_avx_vtestc_ps_256:
19924
181
  case Intrinsic::x86_avx_vtestnzc_ps_256:
19925
181
  case Intrinsic::x86_avx_vtestz_pd_256:
19926
181
  case Intrinsic::x86_avx_vtestc_pd_256:
19927
181
  case Intrinsic::x86_avx_vtestnzc_pd_256: {
19928
181
    bool IsTestPacked = false;
19929
181
    X86::CondCode X86CC;
19930
181
    switch (IntNo) {
19931
0
    default: llvm_unreachable("Bad fallthrough in Intrinsic lowering.");
19932
16
    case Intrinsic::x86_avx_vtestz_ps:
19933
16
    case Intrinsic::x86_avx_vtestz_pd:
19934
16
    case Intrinsic::x86_avx_vtestz_ps_256:
19935
16
    case Intrinsic::x86_avx_vtestz_pd_256:
19936
16
      IsTestPacked = true;
19937
16
      LLVM_FALLTHROUGH;
19938
40
    case Intrinsic::x86_sse41_ptestz:
19939
40
    case Intrinsic::x86_avx_ptestz_256:
19940
40
      // ZF = 1
19941
40
      X86CC = X86::COND_E;
19942
40
      break;
19943
76
    case Intrinsic::x86_avx_vtestc_ps:
19944
76
    case Intrinsic::x86_avx_vtestc_pd:
19945
76
    case Intrinsic::x86_avx_vtestc_ps_256:
19946
76
    case Intrinsic::x86_avx_vtestc_pd_256:
19947
76
      IsTestPacked = true;
19948
76
      LLVM_FALLTHROUGH;
19949
112
    case Intrinsic::x86_sse41_ptestc:
19950
112
    case Intrinsic::x86_avx_ptestc_256:
19951
112
      // CF = 1
19952
112
      X86CC = X86::COND_B;
19953
112
      break;
19954
16
    case Intrinsic::x86_avx_vtestnzc_ps:
19955
16
    case Intrinsic::x86_avx_vtestnzc_pd:
19956
16
    case Intrinsic::x86_avx_vtestnzc_ps_256:
19957
16
    case Intrinsic::x86_avx_vtestnzc_pd_256:
19958
16
      IsTestPacked = true;
19959
16
      LLVM_FALLTHROUGH;
19960
29
    case Intrinsic::x86_sse41_ptestnzc:
19961
29
    case Intrinsic::x86_avx_ptestnzc_256:
19962
29
      // ZF and CF = 0
19963
29
      X86CC = X86::COND_A;
19964
29
      break;
19965
181
    }
19966
181
19967
181
    SDValue LHS = Op.getOperand(1);
19968
181
    SDValue RHS = Op.getOperand(2);
19969
181
    unsigned TestOpc = IsTestPacked ? X86ISD::TESTP : X86ISD::PTEST;
19970
181
    SDValue Test = DAG.getNode(TestOpc, dl, MVT::i32, LHS, RHS);
19971
181
    SDValue SetCC = getSETCC(X86CC, Test, dl, DAG);
19972
181
    return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
19973
181
  }
19974
2
  case Intrinsic::x86_avx512_kortestz_w:
19975
2
  case Intrinsic::x86_avx512_kortestc_w: {
19976
2
    X86::CondCode X86CC =
19977
2
        (IntNo == Intrinsic::x86_avx512_kortestz_w) ? X86::COND_E : X86::COND_B;
19978
2
    SDValue LHS = DAG.getBitcast(MVT::v16i1, Op.getOperand(1));
19979
2
    SDValue RHS = DAG.getBitcast(MVT::v16i1, Op.getOperand(2));
19980
2
    SDValue Test = DAG.getNode(X86ISD::KORTEST, dl, MVT::i32, LHS, RHS);
19981
2
    SDValue SetCC = getSETCC(X86CC, Test, dl, DAG);
19982
2
    return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
19983
2
  }
19984
2
19985
1
  case Intrinsic::x86_avx512_knot_w: {
19986
1
    SDValue LHS = DAG.getBitcast(MVT::v16i1, Op.getOperand(1));
19987
1
    SDValue RHS = DAG.getConstant(1, dl, MVT::v16i1);
19988
1
    SDValue Res = DAG.getNode(ISD::XOR, dl, MVT::v16i1, LHS, RHS);
19989
1
    return DAG.getBitcast(MVT::i16, Res);
19990
2
  }
19991
2
19992
2
  case Intrinsic::x86_avx512_kandn_w: {
19993
2
    SDValue LHS = DAG.getBitcast(MVT::v16i1, Op.getOperand(1));
19994
2
    // Invert LHS for the not.
19995
2
    LHS = DAG.getNode(ISD::XOR, dl, MVT::v16i1, LHS,
19996
2
                      DAG.getConstant(1, dl, MVT::v16i1));
19997
2
    SDValue RHS = DAG.getBitcast(MVT::v16i1, Op.getOperand(2));
19998
2
    SDValue Res = DAG.getNode(ISD::AND, dl, MVT::v16i1, LHS, RHS);
19999
2
    return DAG.getBitcast(MVT::i16, Res);
20000
2
  }
20001
2
20002
2
  case Intrinsic::x86_avx512_kxnor_w: {
20003
2
    SDValue LHS = DAG.getBitcast(MVT::v16i1, Op.getOperand(1));
20004
2
    SDValue RHS = DAG.getBitcast(MVT::v16i1, Op.getOperand(2));
20005
2
    SDValue Res = DAG.getNode(ISD::XOR, dl, MVT::v16i1, LHS, RHS);
20006
2
    // Invert result for the not.
20007
2
    Res = DAG.getNode(ISD::XOR, dl, MVT::v16i1, Res,
20008
2
                      DAG.getConstant(1, dl, MVT::v16i1));
20009
2
    return DAG.getBitcast(MVT::i16, Res);
20010
2
  }
20011
2
20012
50
  case Intrinsic::x86_sse42_pcmpistria128:
20013
50
  case Intrinsic::x86_sse42_pcmpestria128:
20014
50
  case Intrinsic::x86_sse42_pcmpistric128:
20015
50
  case Intrinsic::x86_sse42_pcmpestric128:
20016
50
  case Intrinsic::x86_sse42_pcmpistrio128:
20017
50
  case Intrinsic::x86_sse42_pcmpestrio128:
20018
50
  case Intrinsic::x86_sse42_pcmpistris128:
20019
50
  case Intrinsic::x86_sse42_pcmpestris128:
20020
50
  case Intrinsic::x86_sse42_pcmpistriz128:
20021
50
  case Intrinsic::x86_sse42_pcmpestriz128: {
20022
50
    unsigned Opcode;
20023
50
    X86::CondCode X86CC;
20024
50
    switch (IntNo) {
20025
0
    default: llvm_unreachable("Impossible intrinsic"); // Can't reach here.
20026
5
    case Intrinsic::x86_sse42_pcmpistria128:
20027
5
      Opcode = X86ISD::PCMPISTRI;
20028
5
      X86CC = X86::COND_A;
20029
5
      break;
20030
5
    case Intrinsic::x86_sse42_pcmpestria128:
20031
5
      Opcode = X86ISD::PCMPESTRI;
20032
5
      X86CC = X86::COND_A;
20033
5
      break;
20034
5
    case Intrinsic::x86_sse42_pcmpistric128:
20035
5
      Opcode = X86ISD::PCMPISTRI;
20036
5
      X86CC = X86::COND_B;
20037
5
      break;
20038
5
    case Intrinsic::x86_sse42_pcmpestric128:
20039
5
      Opcode = X86ISD::PCMPESTRI;
20040
5
      X86CC = X86::COND_B;
20041
5
      break;
20042
5
    case Intrinsic::x86_sse42_pcmpistrio128:
20043
5
      Opcode = X86ISD::PCMPISTRI;
20044
5
      X86CC = X86::COND_O;
20045
5
      break;
20046
5
    case Intrinsic::x86_sse42_pcmpestrio128:
20047
5
      Opcode = X86ISD::PCMPESTRI;
20048
5
      X86CC = X86::COND_O;
20049
5
      break;
20050
5
    case Intrinsic::x86_sse42_pcmpistris128:
20051
5
      Opcode = X86ISD::PCMPISTRI;
20052
5
      X86CC = X86::COND_S;
20053
5
      break;
20054
5
    case Intrinsic::x86_sse42_pcmpestris128:
20055
5
      Opcode = X86ISD::PCMPESTRI;
20056
5
      X86CC = X86::COND_S;
20057
5
      break;
20058
5
    case Intrinsic::x86_sse42_pcmpistriz128:
20059
5
      Opcode = X86ISD::PCMPISTRI;
20060
5
      X86CC = X86::COND_E;
20061
5
      break;
20062
5
    case Intrinsic::x86_sse42_pcmpestriz128:
20063
5
      Opcode = X86ISD::PCMPESTRI;
20064
5
      X86CC = X86::COND_E;
20065
5
      break;
20066
50
    }
20067
50
    SmallVector<SDValue, 5> NewOps(Op->op_begin()+1, Op->op_end());
20068
50
    SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
20069
50
    SDValue PCMP = DAG.getNode(Opcode, dl, VTs, NewOps);
20070
50
    SDValue SetCC = getSETCC(X86CC, SDValue(PCMP.getNode(), 1), dl, DAG);
20071
50
    return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
20072
50
  }
20073
50
20074
52
  case Intrinsic::x86_sse42_pcmpistri128:
20075
52
  case Intrinsic::x86_sse42_pcmpestri128: {
20076
52
    unsigned Opcode;
20077
52
    if (IntNo == Intrinsic::x86_sse42_pcmpistri128)
20078
26
      Opcode = X86ISD::PCMPISTRI;
20079
52
    else
20080
26
      Opcode = X86ISD::PCMPESTRI;
20081
52
20082
52
    SmallVector<SDValue, 5> NewOps(Op->op_begin()+1, Op->op_end());
20083
52
    SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
20084
52
    return DAG.getNode(Opcode, dl, VTs, NewOps);
20085
52
  }
20086
52
20087
2
  case Intrinsic::eh_sjlj_lsda: {
20088
2
    MachineFunction &MF = DAG.getMachineFunction();
20089
2
    const TargetLowering &TLI = DAG.getTargetLoweringInfo();
20090
2
    MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
20091
2
    auto &Context = MF.getMMI().getContext();
20092
2
    MCSymbol *S = Context.getOrCreateSymbol(Twine("GCC_except_table") +
20093
2
                                            Twine(MF.getFunctionNumber()));
20094
2
    return DAG.getNode(X86ISD::Wrapper, dl, VT, DAG.getMCSymbol(S, PtrVT));
20095
52
  }
20096
52
20097
30
  case Intrinsic::x86_seh_lsda: {
20098
30
    // Compute the symbol for the LSDA. We know it'll get emitted later.
20099
30
    MachineFunction &MF = DAG.getMachineFunction();
20100
30
    SDValue Op1 = Op.getOperand(1);
20101
30
    auto *Fn = cast<Function>(cast<GlobalAddressSDNode>(Op1)->getGlobal());
20102
30
    MCSymbol *LSDASym = MF.getMMI().getContext().getOrCreateLSDASymbol(
20103
30
        GlobalValue::dropLLVMManglingEscape(Fn->getName()));
20104
30
20105
30
    // Generate a simple absolute symbol reference. This intrinsic is only
20106
30
    // supported on 32-bit Windows, which isn't PIC.
20107
30
    SDValue Result = DAG.getMCSymbol(LSDASym, VT);
20108
30
    return DAG.getNode(X86ISD::Wrapper, dl, VT, Result);
20109
52
  }
20110
52
20111
5
  case Intrinsic::x86_seh_recoverfp: {
20112
5
    SDValue FnOp = Op.getOperand(1);
20113
5
    SDValue IncomingFPOp = Op.getOperand(2);
20114
5
    GlobalAddressSDNode *GSD = dyn_cast<GlobalAddressSDNode>(FnOp);
20115
5
    auto *Fn = dyn_cast_or_null<Function>(GSD ? GSD->getGlobal() : nullptr);
20116
5
    if (!Fn)
20117
0
      report_fatal_error(
20118
0
          "llvm.x86.seh.recoverfp must take a function as the first argument");
20119
5
    return recoverFramePointer(DAG, Fn, IncomingFPOp);
20120
5
  }
20121
5
20122
5
  case Intrinsic::localaddress: {
20123
5
    // Returns one of the stack, base, or frame pointer registers, depending on
20124
5
    // which is used to reference local variables.
20125
5
    MachineFunction &MF = DAG.getMachineFunction();
20126
5
    const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
20127
5
    unsigned Reg;
20128
5
    if (RegInfo->hasBasePointer(MF))
20129
0
      Reg = RegInfo->getBaseRegister();
20130
5
    else // This function handles the SP or FP case.
20131
5
      Reg = RegInfo->getPtrSizedFrameRegister(MF);
20132
5
    return DAG.getCopyFromReg(DAG.getEntryNode(), dl, Reg, VT);
20133
0
  }
20134
14.0k
  }
20135
14.0k
}
20136
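A user-level sketch of what feeds the ptest/testp lowering above: the SSE4.1/AVX test builtins return an int flag, which is what the PTEST/TESTP node plus the SETCC materializes. This is an assumption-laden example (wrapper name is illustrative, requires <immintrin.h> and SSE4.1), not code from this file.

#include <immintrin.h>
// Lowered through Intrinsic::x86_sse41_ptestz -> X86ISD::PTEST + SETCC(COND_E).
int all_masked_bits_zero(__m128i Val, __m128i Mask) {
  return _mm_testz_si128(Val, Mask);   // 1 iff (Val & Mask) == 0, i.e. ZF set
}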
20137
static SDValue getAVX2GatherNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
20138
                                 SDValue Src, SDValue Mask, SDValue Base,
20139
                                 SDValue Index, SDValue ScaleOp, SDValue Chain,
20140
174
                                 const X86Subtarget &Subtarget) {
20141
174
  SDLoc dl(Op);
20142
174
  auto *C = dyn_cast<ConstantSDNode>(ScaleOp);
20143
174
  // Scale must be constant.
20144
174
  if (!C)
20145
0
    return SDValue();
20146
174
  SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl, MVT::i8);
20147
174
  EVT MaskVT = Mask.getValueType();
20148
174
  SDVTList VTs = DAG.getVTList(Op.getValueType(), MaskVT, MVT::Other);
20149
174
  SDValue Disp = DAG.getTargetConstant(0, dl, MVT::i32);
20150
174
  SDValue Segment = DAG.getRegister(0, MVT::i32);
20151
174
  // If source is undef or we know it won't be used, use a zero vector
20152
174
  // to break register dependency.
20153
174
  // TODO: use undef instead and let ExecutionDepsFix deal with it?
20154
174
  if (Src.isUndef() || ISD::isBuildVectorAllOnes(Mask.getNode()))
20155
44
    Src = getZeroVector(Op.getSimpleValueType(), Subtarget, DAG, dl);
20156
174
  SDValue Ops[] = {Src, Base, Scale, Index, Disp, Segment, Mask, Chain};
20157
174
  SDNode *Res = DAG.getMachineNode(Opc, dl, VTs, Ops);
20158
174
  SDValue RetOps[] = { SDValue(Res, 0), SDValue(Res, 2) };
20159
174
  return DAG.getMergeValues(RetOps, dl);
20160
174
}
20161
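A usage sketch of the kind of call that reaches getAVX2GatherNode above; it assumes <immintrin.h> and a target compiled with -mavx2, and the function name is illustrative only.

#include <immintrin.h>
// Eight 32-bit loads from Base + Idx[i]*4; the scale operand must be a
// constant (1, 2, 4 or 8), which is why the lowering bails on a non-constant.
__m256i gather8(const int *Base, __m256i Idx) {
  return _mm256_i32gather_epi32(Base, Idx, 4);
}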
20162
static SDValue getGatherNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
20163
                              SDValue Src, SDValue Mask, SDValue Base,
20164
                              SDValue Index, SDValue ScaleOp, SDValue Chain,
20165
48
                              const X86Subtarget &Subtarget) {
20166
48
  SDLoc dl(Op);
20167
48
  auto *C = dyn_cast<ConstantSDNode>(ScaleOp);
20168
48
  // Scale must be constant.
20169
48
  if (!C)
20170
2
    return SDValue();
20171
46
  SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl, MVT::i8);
20172
46
  MVT MaskVT = MVT::getVectorVT(MVT::i1,
20173
46
                             Index.getSimpleValueType().getVectorNumElements());
20174
46
20175
46
  SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
20176
46
  SDVTList VTs = DAG.getVTList(Op.getValueType(), MaskVT, MVT::Other);
20177
46
  SDValue Disp = DAG.getTargetConstant(0, dl, MVT::i32);
20178
46
  SDValue Segment = DAG.getRegister(0, MVT::i32);
20179
46
  // If source is undef or we know it won't be used, use a zero vector
20180
46
  // to break register dependency.
20181
46
  // TODO: use undef instead and let ExecutionDepsFix deal with it?
20182
46
  if (Src.isUndef() || ISD::isBuildVectorAllOnes(VMask.getNode()))
20183
13
    Src = getZeroVector(Op.getSimpleValueType(), Subtarget, DAG, dl);
20184
48
  SDValue Ops[] = {Src, VMask, Base, Scale, Index, Disp, Segment, Chain};
20185
48
  SDNode *Res = DAG.getMachineNode(Opc, dl, VTs, Ops);
20186
48
  SDValue RetOps[] = { SDValue(Res, 0), SDValue(Res, 2) };
20187
48
  return DAG.getMergeValues(RetOps, dl);
20188
48
}
20189
20190
static SDValue getScatterNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
20191
                               SDValue Src, SDValue Mask, SDValue Base,
20192
                               SDValue Index, SDValue ScaleOp, SDValue Chain,
20193
49
                               const X86Subtarget &Subtarget) {
20194
49
  SDLoc dl(Op);
20195
49
  auto *C = dyn_cast<ConstantSDNode>(ScaleOp);
20196
49
  // Scale must be constant.
20197
49
  if (!C)
20198
0
    return SDValue();
20199
49
  SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl, MVT::i8);
20200
49
  SDValue Disp = DAG.getTargetConstant(0, dl, MVT::i32);
20201
49
  SDValue Segment = DAG.getRegister(0, MVT::i32);
20202
49
  MVT MaskVT = MVT::getVectorVT(MVT::i1,
20203
49
                             Index.getSimpleValueType().getVectorNumElements());
20204
49
20205
49
  SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
20206
49
  SDVTList VTs = DAG.getVTList(MaskVT, MVT::Other);
20207
49
  SDValue Ops[] = {Base, Scale, Index, Disp, Segment, VMask, Src, Chain};
20208
49
  SDNode *Res = DAG.getMachineNode(Opc, dl, VTs, Ops);
20209
49
  return SDValue(Res, 1);
20210
49
}
20211
20212
static SDValue getPrefetchNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
20213
                               SDValue Mask, SDValue Base, SDValue Index,
20214
                               SDValue ScaleOp, SDValue Chain,
20215
4
                               const X86Subtarget &Subtarget) {
20216
4
  SDLoc dl(Op);
20217
4
  auto *C = dyn_cast<ConstantSDNode>(ScaleOp);
20218
4
  // Scale must be constant.
20219
4
  if (!C)
20220
0
    return SDValue();
20221
4
  SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl, MVT::i8);
20222
4
  SDValue Disp = DAG.getTargetConstant(0, dl, MVT::i32);
20223
4
  SDValue Segment = DAG.getRegister(0, MVT::i32);
20224
4
  MVT MaskVT =
20225
4
    MVT::getVectorVT(MVT::i1, Index.getSimpleValueType().getVectorNumElements());
20226
4
  SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
20227
4
  SDValue Ops[] = {VMask, Base, Scale, Index, Disp, Segment, Chain};
20228
4
  SDNode *Res = DAG.getMachineNode(Opc, dl, MVT::Other, Ops);
20229
4
  return SDValue(Res, 0);
20230
4
}
20231
20232
/// Handles the lowering of builtin intrinsics that return the value
20233
/// of the extended control register.
20234
static void getExtendedControlRegister(SDNode *N, const SDLoc &DL,
20235
                                       SelectionDAG &DAG,
20236
                                       const X86Subtarget &Subtarget,
20237
2
                                       SmallVectorImpl<SDValue> &Results) {
20238
2
  assert(N->getNumOperands() == 3 && "Unexpected number of operands!");
20239
2
  SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
20240
2
  SDValue LO, HI;
20241
2
20242
2
  // The ECX register is used to select the index of the XCR register to
20243
2
  // return.
20244
2
  SDValue Chain =
20245
2
      DAG.getCopyToReg(N->getOperand(0), DL, X86::ECX, N->getOperand(2));
20246
2
  SDNode *N1 = DAG.getMachineNode(X86::XGETBV, DL, Tys, Chain);
20247
2
  Chain = SDValue(N1, 0);
20248
2
20249
2
  // Reads the content of XCR and returns it in registers EDX:EAX.
20250
2
  if (Subtarget.is64Bit()) {
20251
1
    LO = DAG.getCopyFromReg(Chain, DL, X86::RAX, MVT::i64, SDValue(N1, 1));
20252
1
    HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::RDX, MVT::i64,
20253
1
                            LO.getValue(2));
20254
2
  } else {
20255
1
    LO = DAG.getCopyFromReg(Chain, DL, X86::EAX, MVT::i32, SDValue(N1, 1));
20256
1
    HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::EDX, MVT::i32,
20257
1
                            LO.getValue(2));
20258
1
  }
20259
2
  Chain = HI.getValue(1);
20260
2
20261
2
  if (Subtarget.is64Bit()) {
20262
1
    // Merge the two 32-bit values into a 64-bit one.
20263
1
    SDValue Tmp = DAG.getNode(ISD::SHL, DL, MVT::i64, HI,
20264
1
                              DAG.getConstant(32, DL, MVT::i8));
20265
1
    Results.push_back(DAG.getNode(ISD::OR, DL, MVT::i64, LO, Tmp));
20266
1
    Results.push_back(Chain);
20267
1
    return;
20268
1
  }
20269
1
20270
1
  // Use a buildpair to merge the two 32-bit values into a 64-bit one.
20271
1
  SDValue Ops[] = { LO, HI };
20272
1
  SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Ops);
20273
1
  Results.push_back(Pair);
20274
1
  Results.push_back(Chain);
20275
1
}
20276
20277
/// Handles the lowering of builtin intrinsics that read performance monitor
20278
/// counters (x86_rdpmc).
20279
static void getReadPerformanceCounter(SDNode *N, const SDLoc &DL,
20280
                                      SelectionDAG &DAG,
20281
                                      const X86Subtarget &Subtarget,
20282
2
                                      SmallVectorImpl<SDValue> &Results) {
20283
2
  assert(N->getNumOperands() == 3 && "Unexpected number of operands!");
20284
2
  SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
20285
2
  SDValue LO, HI;
20286
2
20287
2
  // The ECX register is used to select the index of the performance counter
20288
2
  // to read.
20289
2
  SDValue Chain = DAG.getCopyToReg(N->getOperand(0), DL, X86::ECX,
20290
2
                                   N->getOperand(2));
20291
2
  SDValue rd = DAG.getNode(X86ISD::RDPMC_DAG, DL, Tys, Chain);
20292
2
20293
2
  // Reads the content of a 64-bit performance counter and returns it in the
20294
2
  // registers EDX:EAX.
20295
2
  if (Subtarget.is64Bit()) {
20296
1
    LO = DAG.getCopyFromReg(rd, DL, X86::RAX, MVT::i64, rd.getValue(1));
20297
1
    HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::RDX, MVT::i64,
20298
1
                            LO.getValue(2));
20299
2
  } else {
20300
1
    LO = DAG.getCopyFromReg(rd, DL, X86::EAX, MVT::i32, rd.getValue(1));
20301
1
    HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::EDX, MVT::i32,
20302
1
                            LO.getValue(2));
20303
1
  }
20304
2
  Chain = HI.getValue(1);
20305
2
20306
2
  if (Subtarget.is64Bit()) {
20307
1
    // The EAX register is loaded with the low-order 32 bits. The EDX register
20308
1
    // is loaded with the supported high-order bits of the counter.
20309
1
    SDValue Tmp = DAG.getNode(ISD::SHL, DL, MVT::i64, HI,
20310
1
                              DAG.getConstant(32, DL, MVT::i8));
20311
1
    Results.push_back(DAG.getNode(ISD::OR, DL, MVT::i64, LO, Tmp));
20312
1
    Results.push_back(Chain);
20313
1
    return;
20314
1
  }
20315
1
20316
1
  // Use a buildpair to merge the two 32-bit values into a 64-bit one.
20317
1
  SDValue Ops[] = { LO, HI };
20318
1
  SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Ops);
20319
1
  Results.push_back(Pair);
20320
1
  Results.push_back(Chain);
20321
1
}
20322
20323
/// Handles the lowering of builtin intrinsics that read the time stamp counter
20324
/// (x86_rdtsc and x86_rdtscp). This function is also used to custom lower
20325
/// READCYCLECOUNTER nodes.
20326
static void getReadTimeStampCounter(SDNode *N, const SDLoc &DL, unsigned Opcode,
20327
                                    SelectionDAG &DAG,
20328
                                    const X86Subtarget &Subtarget,
20329
10
                                    SmallVectorImpl<SDValue> &Results) {
20330
10
  SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
20331
10
  SDValue rd = DAG.getNode(Opcode, DL, Tys, N->getOperand(0));
20332
10
  SDValue LO, HI;
20333
10
20334
10
  // The processor's time-stamp counter (a 64-bit MSR) is stored into the
20335
10
  // EDX:EAX registers. EDX is loaded with the high-order 32 bits of the MSR
20336
10
  // and the EAX register is loaded with the low-order 32 bits.
20337
10
  if (Subtarget.is64Bit()) {
20338
7
    LO = DAG.getCopyFromReg(rd, DL, X86::RAX, MVT::i64, rd.getValue(1));
20339
7
    HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::RDX, MVT::i64,
20340
7
                            LO.getValue(2));
20341
10
  } else {
20342
3
    LO = DAG.getCopyFromReg(rd, DL, X86::EAX, MVT::i32, rd.getValue(1));
20343
3
    HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::EDX, MVT::i32,
20344
3
                            LO.getValue(2));
20345
3
  }
20346
10
  SDValue Chain = HI.getValue(1);
20347
10
20348
10
  if (Opcode == X86ISD::RDTSCP_DAG) {
20349
2
    assert(N->getNumOperands() == 3 && "Unexpected number of operands!");
20350
2
20351
2
    // Instruction RDTSCP loads the IA32:TSC_AUX_MSR (address C000_0103H) into
20352
2
    // the ECX register. Add 'ecx' explicitly to the chain.
20353
2
    SDValue ecx = DAG.getCopyFromReg(Chain, DL, X86::ECX, MVT::i32,
20354
2
                                     HI.getValue(2));
20355
2
    // Explicitly store the content of ECX at the location passed in input
20356
2
    // to the 'rdtscp' intrinsic.
20357
2
    Chain = DAG.getStore(ecx.getValue(1), DL, ecx, N->getOperand(2),
20358
2
                         MachinePointerInfo());
20359
2
  }
20360
10
20361
10
  if (Subtarget.is64Bit()) {
20362
7
    // The EDX register is loaded with the high-order 32 bits of the MSR, and
20363
7
    // the EAX register is loaded with the low-order 32 bits.
20364
7
    SDValue Tmp = DAG.getNode(ISD::SHL, DL, MVT::i64, HI,
20365
7
                              DAG.getConstant(32, DL, MVT::i8));
20366
7
    Results.push_back(DAG.getNode(ISD::OR, DL, MVT::i64, LO, Tmp));
20367
7
    Results.push_back(Chain);
20368
7
    return;
20369
7
  }
20370
3
20371
3
  // Use a buildpair to merge the two 32-bit values into a 64-bit one.
20372
3
  SDValue Ops[] = { LO, HI };
20373
3
  SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Ops);
20374
3
  Results.push_back(Pair);
20375
3
  Results.push_back(Chain);
20376
3
}
20377
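The three helpers above (XGETBV, RDPMC, RDTSC/RDTSCP) finish the same way: on 64-bit targets the two 32-bit halves delivered in EDX:EAX are combined with a shift and an OR, otherwise a BUILD_PAIR is emitted. A minimal sketch of that merge, with an illustrative function name:

#include <cstdint>
// Equivalent of the SHL-by-32 + OR emitted by the DAG nodes above.
static inline uint64_t mergeEdxEax(uint32_t Lo, uint32_t Hi) {
  return (uint64_t(Hi) << 32) | Lo;
}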
20378
static SDValue LowerREADCYCLECOUNTER(SDValue Op, const X86Subtarget &Subtarget,
20379
5
                                     SelectionDAG &DAG) {
20380
5
  SmallVector<SDValue, 2> Results;
20381
5
  SDLoc DL(Op);
20382
5
  getReadTimeStampCounter(Op.getNode(), DL, X86ISD::RDTSC_DAG, DAG, Subtarget,
20383
5
                          Results);
20384
5
  return DAG.getMergeValues(Results, DL);
20385
5
}
20386
20387
30
static SDValue MarkEHRegistrationNode(SDValue Op, SelectionDAG &DAG) {
20388
30
  MachineFunction &MF = DAG.getMachineFunction();
20389
30
  SDValue Chain = Op.getOperand(0);
20390
30
  SDValue RegNode = Op.getOperand(2);
20391
30
  WinEHFuncInfo *EHInfo = MF.getWinEHFuncInfo();
20392
30
  if (!EHInfo)
20393
0
    report_fatal_error("EH registrations only live in functions using WinEH");
20394
30
20395
30
  // Cast the operand to an alloca, and remember the frame index.
20396
30
  auto *FINode = dyn_cast<FrameIndexSDNode>(RegNode);
20397
30
  if (!FINode)
20398
0
    report_fatal_error("llvm.x86.seh.ehregnode expects a static alloca");
20399
30
  EHInfo->EHRegNodeFrameIndex = FINode->getIndex();
20400
30
20401
30
  // Return the chain operand without making any DAG nodes.
20402
30
  return Chain;
20403
30
}
20404
20405
2
static SDValue MarkEHGuard(SDValue Op, SelectionDAG &DAG) {
20406
2
  MachineFunction &MF = DAG.getMachineFunction();
20407
2
  SDValue Chain = Op.getOperand(0);
20408
2
  SDValue EHGuard = Op.getOperand(2);
20409
2
  WinEHFuncInfo *EHInfo = MF.getWinEHFuncInfo();
20410
2
  if (!EHInfo)
20411
0
    report_fatal_error("EHGuard only live in functions using WinEH");
20412
2
20413
2
  // Cast the operand to an alloca, and remember the frame index.
20414
2
  auto *FINode = dyn_cast<FrameIndexSDNode>(EHGuard);
20415
2
  if (!FINode)
20416
0
    report_fatal_error("llvm.x86.seh.ehguard expects a static alloca");
20417
2
  EHInfo->EHGuardFrameIndex = FINode->getIndex();
20418
2
20419
2
  // Return the chain operand without making any DAG nodes.
20420
2
  return Chain;
20421
2
}
20422
20423
/// Emit Truncating Store with signed or unsigned saturation.
20424
static SDValue
20425
EmitTruncSStore(bool SignedSat, SDValue Chain, const SDLoc &Dl, SDValue Val,
20426
                SDValue Ptr, EVT MemVT, MachineMemOperand *MMO,
20427
48
                SelectionDAG &DAG) {
20428
48
20429
48
  SDVTList VTs = DAG.getVTList(MVT::Other);
20430
48
  SDValue Undef = DAG.getUNDEF(Ptr.getValueType());
20431
48
  SDValue Ops[] = { Chain, Val, Ptr, Undef };
20432
48
  return SignedSat ?
20433
19
    DAG.getTargetMemSDNode<TruncSStoreSDNode>(VTs, Ops, Dl, MemVT, MMO) :
20434
29
    DAG.getTargetMemSDNode<TruncUSStoreSDNode>(VTs, Ops, Dl, MemVT, MMO);
20435
48
}
20436
20437
/// Emit Masked Truncating Store with signed or unsigned saturation.
20438
static SDValue
20439
EmitMaskedTruncSStore(bool SignedSat, SDValue Chain, const SDLoc &Dl,
20440
                      SDValue Val, SDValue Ptr, SDValue Mask, EVT MemVT,
20441
38
                      MachineMemOperand *MMO, SelectionDAG &DAG) {
20442
38
20443
38
  SDVTList VTs = DAG.getVTList(MVT::Other);
20444
38
  SDValue Ops[] = { Chain, Ptr, Mask, Val };
20445
38
  return SignedSat ?
20446
19
    DAG.getTargetMemSDNode<MaskedTruncSStoreSDNode>(VTs, Ops, Dl, MemVT, MMO) :
20447
19
    DAG.getTargetMemSDNode<MaskedTruncUSStoreSDNode>(VTs, Ops, Dl, MemVT, MMO);
20448
38
}
20449
20450
static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget &Subtarget,
20451
1.58k
                                      SelectionDAG &DAG) {
20452
1.58k
  unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
20453
1.58k
20454
1.58k
  const IntrinsicData *IntrData = getIntrinsicWithChain(IntNo);
20455
1.58k
  if (!IntrData) {
20456
1.13k
    switch (IntNo) {
20457
30
    case llvm::Intrinsic::x86_seh_ehregnode:
20458
30
      return MarkEHRegistrationNode(Op, DAG);
20459
2
    case llvm::Intrinsic::x86_seh_ehguard:
20460
2
      return MarkEHGuard(Op, DAG);
20461
12
    case llvm::Intrinsic::x86_flags_read_u32:
20462
12
    case llvm::Intrinsic::x86_flags_read_u64:
20463
12
    case llvm::Intrinsic::x86_flags_write_u32:
20464
12
    case llvm::Intrinsic::x86_flags_write_u64: {
20465
12
      // We need a frame pointer because this will get lowered to a PUSH/POP
20466
12
      // sequence.
20467
12
      MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
20468
12
      MFI.setHasCopyImplyingStackAdjustment(true);
20469
12
      // Don't do anything here, we will expand these intrinsics out later
20470
12
      // during ExpandISelPseudos in EmitInstrWithCustomInserter.
20471
12
      return SDValue();
20472
12
    }
20473
32
    case Intrinsic::x86_lwpins32:
20474
32
    case Intrinsic::x86_lwpins64: {
20475
32
      SDLoc dl(Op);
20476
32
      SDValue Chain = Op->getOperand(0);
20477
32
      SDVTList VTs = DAG.getVTList(MVT::i32, MVT::Other);
20478
32
      SDValue LwpIns =
20479
32
          DAG.getNode(X86ISD::LWPINS, dl, VTs, Chain, Op->getOperand(2),
20480
32
                      Op->getOperand(3), Op->getOperand(4));
20481
32
      SDValue SetCC = getSETCC(X86::COND_B, LwpIns.getValue(0), dl, DAG);
20482
32
      SDValue Result = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i8, SetCC);
20483
32
      return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), Result,
20484
32
                         LwpIns.getValue(1));
20485
1.06k
    }
20486
1.06k
    }
20487
1.06k
    return SDValue();
20488
1.06k
  }
20489
449
20490
449
  SDLoc dl(Op);
20491
449
  switch(IntrData->Type) {
20492
0
  default: llvm_unreachable("Unknown Intrinsic Type");
20493
22
  case RDSEED:
20494
22
  case RDRAND: {
20495
22
    // Emit the node with the right value type.
20496
22
    SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::Glue, MVT::Other);
20497
22
    SDValue Result = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(0));
20498
22
20499
22
    // If the value returned by RDRAND/RDSEED was valid (CF=1), return 1.
20500
22
    // Otherwise return the value from Rand, which is always 0, casted to i32.
20501
22
    SDValue Ops[] = { DAG.getZExtOrTrunc(Result, dl, Op->getValueType(1)),
20502
22
                      DAG.getConstant(1, dl, Op->getValueType(1)),
20503
22
                      DAG.getConstant(X86::COND_B, dl, MVT::i32),
20504
22
                      SDValue(Result.getNode(), 1) };
20505
22
    SDValue isValid = DAG.getNode(X86ISD::CMOV, dl,
20506
22
                                  DAG.getVTList(Op->getValueType(1), MVT::Glue),
20507
22
                                  Ops);
20508
22
20509
22
    // Return { result, isValid, chain }.
20510
22
    return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), Result, isValid,
20511
22
                       SDValue(Result.getNode(), 2));
20512
22
  }
20513
174
  case GATHER_AVX2: {
20514
174
    SDValue Chain = Op.getOperand(0);
20515
174
    SDValue Src   = Op.getOperand(2);
20516
174
    SDValue Base  = Op.getOperand(3);
20517
174
    SDValue Index = Op.getOperand(4);
20518
174
    SDValue Mask  = Op.getOperand(5);
20519
174
    SDValue Scale = Op.getOperand(6);
20520
174
    return getAVX2GatherNode(IntrData->Opc0, Op, DAG, Src, Mask, Base, Index,
20521
174
                             Scale, Chain, Subtarget);
20522
22
  }
20523
48
  case GATHER: {
20524
48
  //gather(v1, mask, index, base, scale);
20525
48
    SDValue Chain = Op.getOperand(0);
20526
48
    SDValue Src   = Op.getOperand(2);
20527
48
    SDValue Base  = Op.getOperand(3);
20528
48
    SDValue Index = Op.getOperand(4);
20529
48
    SDValue Mask  = Op.getOperand(5);
20530
48
    SDValue Scale = Op.getOperand(6);
20531
48
    return getGatherNode(IntrData->Opc0, Op, DAG, Src, Mask, Base, Index, Scale,
20532
48
                         Chain, Subtarget);
20533
22
  }
20534
49
  case SCATTER: {
20535
49
  //scatter(base, mask, index, v1, scale);
20536
49
    SDValue Chain = Op.getOperand(0);
20537
49
    SDValue Base  = Op.getOperand(2);
20538
49
    SDValue Mask  = Op.getOperand(3);
20539
49
    SDValue Index = Op.getOperand(4);
20540
49
    SDValue Src   = Op.getOperand(5);
20541
49
    SDValue Scale = Op.getOperand(6);
20542
49
    return getScatterNode(IntrData->Opc0, Op, DAG, Src, Mask, Base, Index,
20543
49
                          Scale, Chain, Subtarget);
20544
22
  }
20545
4
  case PREFETCH: {
20546
4
    SDValue Hint = Op.getOperand(6);
20547
4
    unsigned HintVal = cast<ConstantSDNode>(Hint)->getZExtValue();
20548
4
    assert((HintVal == 2 || HintVal == 3) &&
20549
4
           "Wrong prefetch hint in intrinsic: should be 2 or 3");
20550
4
    unsigned Opcode = (HintVal == 2 ? IntrData->Opc1 : IntrData->Opc0);
20551
4
    SDValue Chain = Op.getOperand(0);
20552
4
    SDValue Mask  = Op.getOperand(2);
20553
4
    SDValue Index = Op.getOperand(3);
20554
4
    SDValue Base  = Op.getOperand(4);
20555
4
    SDValue Scale = Op.getOperand(5);
20556
4
    return getPrefetchNode(Opcode, Op, DAG, Mask, Base, Index, Scale, Chain,
20557
4
                           Subtarget);
20558
22
  }
20559
22
  // Read Time Stamp Counter (RDTSC) and Processor ID (RDTSCP).
20560
2
  case RDTSC: {
20561
2
    SmallVector<SDValue, 2> Results;
20562
2
    getReadTimeStampCounter(Op.getNode(), dl, IntrData->Opc0, DAG, Subtarget,
20563
2
                            Results);
20564
2
    return DAG.getMergeValues(Results, dl);
20565
22
  }
20566
22
  // Read Performance Monitoring Counters.
20567
1
  case RDPMC: {
20568
1
    SmallVector<SDValue, 2> Results;
20569
1
    getReadPerformanceCounter(Op.getNode(), dl, DAG, Subtarget, Results);
20570
1
    return DAG.getMergeValues(Results, dl);
20571
22
  }
20572
22
  // Get Extended Control Register.
20573
1
  case XGETBV: {
20574
1
    SmallVector<SDValue, 2> Results;
20575
1
    getExtendedControlRegister(Op.getNode(), dl, DAG, Subtarget, Results);
20576
1
    return DAG.getMergeValues(Results, dl);
20577
22
  }
20578
22
  // XTEST intrinsics.
20579
1
  case XTEST: {
20580
1
    SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::Other);
20581
1
    SDValue InTrans = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(0));
20582
1
20583
1
    SDValue SetCC = getSETCC(X86::COND_NE, InTrans, dl, DAG);
20584
1
    SDValue Ret = DAG.getNode(ISD::ZERO_EXTEND, dl, Op->getValueType(0), SetCC);
20585
1
    return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(),
20586
1
                       Ret, SDValue(InTrans.getNode(), 1));
20587
22
  }
20588
22
  // ADC/ADCX/SBB
20589
22
  case ADX: {
20590
22
    SDVTList CFVTs = DAG.getVTList(Op->getValueType(0), MVT::i32);
20591
22
    SDVTList VTs = DAG.getVTList(Op.getOperand(3)->getValueType(0), MVT::i32);
20592
22
    SDValue GenCF = DAG.getNode(X86ISD::ADD, dl, CFVTs, Op.getOperand(2),
20593
22
                                DAG.getConstant(-1, dl, MVT::i8));
20594
22
    SDValue Res = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(3),
20595
22
                              Op.getOperand(4), GenCF.getValue(1));
20596
22
    SDValue Store = DAG.getStore(Op.getOperand(0), dl, Res.getValue(0),
20597
22
                                 Op.getOperand(5), MachinePointerInfo());
20598
22
    SDValue SetCC = getSETCC(X86::COND_B, Res.getValue(1), dl, DAG);
20599
22
    SDValue Results[] = { SetCC, Store };
20600
22
    return DAG.getMergeValues(Results, dl);
20601
22
  }
20602
5
  case COMPRESS_TO_MEM: {
20603
5
    SDValue Mask = Op.getOperand(4);
20604
5
    SDValue DataToCompress = Op.getOperand(3);
20605
5
    SDValue Addr = Op.getOperand(2);
20606
5
    SDValue Chain = Op.getOperand(0);
20607
5
    MVT VT = DataToCompress.getSimpleValueType();
20608
5
20609
5
    MemIntrinsicSDNode *MemIntr = dyn_cast<MemIntrinsicSDNode>(Op);
20610
5
    assert(MemIntr && "Expected MemIntrinsicSDNode!");
20611
5
20612
5
    if (isAllOnesConstant(Mask)) // return just a store
20613
1
      return DAG.getStore(Chain, dl, DataToCompress, Addr,
20614
1
                          MemIntr->getMemOperand());
20615
4
20616
4
    MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
20617
4
    SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
20618
4
20619
4
    return DAG.getMaskedStore(Chain, dl, DataToCompress, Addr, VMask, VT,
20620
4
                              MemIntr->getMemOperand(),
20621
4
                              false /* truncating */, true /* compressing */);
20622
4
  }
20623
114
  case TRUNCATE_TO_MEM_VI8:
20624
114
  case TRUNCATE_TO_MEM_VI16:
20625
114
  case TRUNCATE_TO_MEM_VI32: {
20626
114
    SDValue Mask = Op.getOperand(4);
20627
114
    SDValue DataToTruncate = Op.getOperand(3);
20628
114
    SDValue Addr = Op.getOperand(2);
20629
114
    SDValue Chain = Op.getOperand(0);
20630
114
20631
114
    MemIntrinsicSDNode *MemIntr = dyn_cast<MemIntrinsicSDNode>(Op);
20632
114
    assert(MemIntr && "Expected MemIntrinsicSDNode!");
20633
114
20634
114
    EVT MemVT  = MemIntr->getMemoryVT();
20635
114
20636
114
    uint16_t TruncationOp = IntrData->Opc0;
20637
114
    switch (TruncationOp) {
20638
38
    case X86ISD::VTRUNC: {
20639
38
      if (isAllOnesConstant(Mask)) // return just a truncate store
20640
19
        return DAG.getTruncStore(Chain, dl, DataToTruncate, Addr, MemVT,
20641
19
                                 MemIntr->getMemOperand());
20642
19
20643
19
      MVT MaskVT = MVT::getVectorVT(MVT::i1, MemVT.getVectorNumElements());
20644
19
      SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
20645
19
20646
19
      return DAG.getMaskedStore(Chain, dl, DataToTruncate, Addr, VMask, MemVT,
20647
19
                                MemIntr->getMemOperand(), true /* truncating */);
20648
19
    }
20649
76
    case X86ISD::VTRUNCUS:
20650
76
    case X86ISD::VTRUNCS: {
20651
76
      bool IsSigned = (TruncationOp == X86ISD::VTRUNCS);
20652
76
      if (isAllOnesConstant(Mask))
20653
38
        return EmitTruncSStore(IsSigned, Chain, dl, DataToTruncate, Addr, MemVT,
20654
38
                               MemIntr->getMemOperand(), DAG);
20655
38
20656
38
      MVT MaskVT = MVT::getVectorVT(MVT::i1, MemVT.getVectorNumElements());
20657
38
      SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
20658
38
20659
38
      return EmitMaskedTruncSStore(IsSigned, Chain, dl, DataToTruncate, Addr,
20660
38
                                   VMask, MemVT, MemIntr->getMemOperand(), DAG);
20661
38
    }
20662
0
    default:
20663
0
      llvm_unreachable("Unsupported truncstore intrinsic");
20664
0
    }
20665
0
  }
20666
0
20667
6
  case EXPAND_FROM_MEM: {
20668
6
    SDValue Mask = Op.getOperand(4);
20669
6
    SDValue PassThru = Op.getOperand(3);
20670
6
    SDValue Addr = Op.getOperand(2);
20671
6
    SDValue Chain = Op.getOperand(0);
20672
6
    MVT VT = Op.getSimpleValueType();
20673
6
20674
6
    MemIntrinsicSDNode *MemIntr = dyn_cast<MemIntrinsicSDNode>(Op);
20675
6
    assert(MemIntr && "Expected MemIntrinsicSDNode!");
20676
6
20677
6
    if (isAllOnesConstant(Mask)) // Return a regular (unmasked) vector load.
20678
2
      return DAG.getLoad(VT, dl, Chain, Addr, MemIntr->getMemOperand());
20679
4
    if (X86::isZeroNode(Mask))
20680
0
      return DAG.getUNDEF(VT);
20681
4
20682
4
    MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
20683
4
    SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
20684
4
    return DAG.getMaskedLoad(VT, dl, Chain, Addr, VMask, PassThru, VT,
20685
4
                             MemIntr->getMemOperand(), ISD::NON_EXTLOAD,
20686
4
                             true /* expanding */);
20687
4
  }
20688
1.58k
  }
20689
1.58k
}
20690
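A usage sketch for the RDRAND/RDSEED case above: the builtin's int return value is exactly the CF-derived validity that the lowering materializes with a CMOV. Assumes <immintrin.h> and -mrdrnd; the wrapper name is illustrative, not from this file.

#include <immintrin.h>
int next_random(unsigned int *Out) {
  return _rdrand32_step(Out);   // 1 if the hardware produced a value (CF = 1), else 0
}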
20691
SDValue X86TargetLowering::LowerRETURNADDR(SDValue Op,
20692
27
                                           SelectionDAG &DAG) const {
20693
27
  MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
20694
27
  MFI.setReturnAddressIsTaken(true);
20695
27
20696
27
  if (verifyReturnAddressArgumentIsConstant(Op, DAG))
20697
0
    return SDValue();
20698
27
20699
27
  unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
20700
27
  SDLoc dl(Op);
20701
27
  EVT PtrVT = getPointerTy(DAG.getDataLayout());
20702
27
20703
27
  if (Depth > 0) {
20704
4
    SDValue FrameAddr = LowerFRAMEADDR(Op, DAG);
20705
4
    const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
20706
4
    SDValue Offset = DAG.getConstant(RegInfo->getSlotSize(), dl, PtrVT);
20707
4
    return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(),
20708
4
                       DAG.getNode(ISD::ADD, dl, PtrVT, FrameAddr, Offset),
20709
4
                       MachinePointerInfo());
20710
4
  }
20711
23
20712
23
  // Just load the return address.
20713
23
  SDValue RetAddrFI = getReturnAddressFrameIndex(DAG);
20714
23
  return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), RetAddrFI,
20715
23
                     MachinePointerInfo());
20716
23
}
20717
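LowerRETURNADDR above serves __builtin_return_address: depth 0 loads the slot next to the frame pointer, while a positive depth first walks frames through LowerFRAMEADDR. A small sketch, assuming a Clang/GCC-style build:

void *my_return_address()      { return __builtin_return_address(0); } // Depth == 0 path
void *callers_return_address() { return __builtin_return_address(1); } // Depth > 0 path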
20718
SDValue X86TargetLowering::LowerADDROFRETURNADDR(SDValue Op,
20719
6
                                                 SelectionDAG &DAG) const {
20720
6
  DAG.getMachineFunction().getFrameInfo().setReturnAddressIsTaken(true);
20721
6
  return getReturnAddressFrameIndex(DAG);
20722
6
}
20723
20724
35
SDValue X86TargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const {
20725
35
  MachineFunction &MF = DAG.getMachineFunction();
20726
35
  MachineFrameInfo &MFI = MF.getFrameInfo();
20727
35
  X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
20728
35
  const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
20729
35
  EVT VT = Op.getValueType();
20730
35
20731
35
  MFI.setFrameAddressIsTaken(true);
20732
35
20733
35
  if (MF.getTarget().getMCAsmInfo()->usesWindowsCFI()) {
20734
4
    // Depth > 0 makes no sense on targets which use Windows unwind codes.  It
20735
4
    // is not possible to crawl up the stack without looking at the unwind codes
20736
4
    // simultaneously.
20737
4
    int FrameAddrIndex = FuncInfo->getFAIndex();
20738
4
    if (!FrameAddrIndex) {
20739
4
      // Set up a frame object for the return address.
20740
4
      unsigned SlotSize = RegInfo->getSlotSize();
20741
4
      FrameAddrIndex = MF.getFrameInfo().CreateFixedObject(
20742
4
          SlotSize, /*Offset=*/0, /*IsImmutable=*/false);
20743
4
      FuncInfo->setFAIndex(FrameAddrIndex);
20744
4
    }
20745
4
    return DAG.getFrameIndex(FrameAddrIndex, VT);
20746
4
  }
20747
31
20748
31
  unsigned FrameReg =
20749
31
      RegInfo->getPtrSizedFrameRegister(DAG.getMachineFunction());
20750
31
  SDLoc dl(Op);  // FIXME probably not meaningful
20751
31
  unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
20752
31
  assert(((FrameReg == X86::RBP && VT == MVT::i64) ||
20753
31
          (FrameReg == X86::EBP && VT == MVT::i32)) &&
20754
31
         "Invalid Frame Register!");
20755
31
  SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, VT);
20756
53
  while (Depth--)
20757
22
    FrameAddr = DAG.getLoad(VT, dl, DAG.getEntryNode(), FrameAddr,
20758
22
                            MachinePointerInfo());
20759
35
  return FrameAddr;
20760
35
}
20761
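Correspondingly, LowerFRAMEADDR backs __builtin_frame_address; a nonzero depth becomes the chain of loads in the while (Depth--) loop above. Sketch only, and walking more than one frame assumes frame pointers are kept:

void *this_frame()    { return __builtin_frame_address(0); }
void *callers_frame() { return __builtin_frame_address(1); }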
20762
// FIXME? Maybe this could be a TableGen attribute on some registers and
20763
// this table could be generated automatically from RegInfo.
20764
unsigned X86TargetLowering::getRegisterByName(const char* RegName, EVT VT,
20765
13
                                              SelectionDAG &DAG) const {
20766
13
  const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();
20767
13
  const MachineFunction &MF = DAG.getMachineFunction();
20768
13
20769
13
  unsigned Reg = StringSwitch<unsigned>(RegName)
20770
13
                       .Case("esp", X86::ESP)
20771
13
                       .Case("rsp", X86::RSP)
20772
13
                       .Case("ebp", X86::EBP)
20773
13
                       .Case("rbp", X86::RBP)
20774
13
                       .Default(0);
20775
13
20776
13
  if (Reg == X86::EBP || Reg == X86::RBP) {
20777
5
    if (!TFI.hasFP(MF))
20778
1
      report_fatal_error("register " + StringRef(RegName) +
20779
1
                         " is allocatable: function has no frame pointer");
20780
#ifndef NDEBUG
20781
    else {
20782
      const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
20783
      unsigned FrameReg =
20784
          RegInfo->getPtrSizedFrameRegister(DAG.getMachineFunction());
20785
      assert((FrameReg == X86::EBP || FrameReg == X86::RBP) &&
20786
             "Invalid Frame Register!");
20787
    }
20788
#endif
20789
  }
20790
12
20791
12
  if (Reg)
20792
8
    return Reg;
20793
4
20794
4
  report_fatal_error("Invalid register name global variable");
20795
4
}
20796
20797
SDValue X86TargetLowering::LowerFRAME_TO_ARGS_OFFSET(SDValue Op,
20798
2
                                                     SelectionDAG &DAG) const {
20799
2
  const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
20800
2
  return DAG.getIntPtrConstant(2 * RegInfo->getSlotSize(), SDLoc(Op));
20801
2
}
20802
20803
unsigned X86TargetLowering::getExceptionPointerRegister(
20804
438
    const Constant *PersonalityFn) const {
20805
438
  if (classifyEHPersonality(PersonalityFn) == EHPersonality::CoreCLR)
20806
3
    
return Subtarget.isTarget64BitLP64() ? 3
X86::RDX3
:
X86::EDX0
;
20807
435
20808
435
  
return Subtarget.isTarget64BitLP64() ? 435
X86::RAX255
:
X86::EAX180
;
20809
438
}
20810
20811
unsigned X86TargetLowering::getExceptionSelectorRegister(
20812
216
    const Constant *PersonalityFn) const {
20813
216
  // Funclet personalities don't use selectors (the runtime does the selection).
20814
216
  assert(!isFuncletEHPersonality(classifyEHPersonality(PersonalityFn)));
20815
216
  return Subtarget.isTarget64BitLP64() ? X86::RDX : X86::EDX;
20816
216
}
20817
20818
7
bool X86TargetLowering::needsFixedCatchObjects() const {
20819
7
  return Subtarget.isTargetWin64();
20820
7
}
20821
20822
6
SDValue X86TargetLowering::LowerEH_RETURN(SDValue Op, SelectionDAG &DAG) const {
20823
6
  SDValue Chain     = Op.getOperand(0);
20824
6
  SDValue Offset    = Op.getOperand(1);
20825
6
  SDValue Handler   = Op.getOperand(2);
20826
6
  SDLoc dl      (Op);
20827
6
20828
6
  EVT PtrVT = getPointerTy(DAG.getDataLayout());
20829
6
  const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
20830
6
  unsigned FrameReg = RegInfo->getFrameRegister(DAG.getMachineFunction());
20831
6
  assert(((FrameReg == X86::RBP && PtrVT == MVT::i64) ||
20832
6
          (FrameReg == X86::EBP && PtrVT == MVT::i32)) &&
20833
6
         "Invalid Frame Register!");
20834
6
  SDValue Frame = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, PtrVT);
20835
6
  unsigned StoreAddrReg = (PtrVT == MVT::i64) ? X86::RCX : X86::ECX;
20836
6
20837
6
  SDValue StoreAddr = DAG.getNode(ISD::ADD, dl, PtrVT, Frame,
20838
6
                                 DAG.getIntPtrConstant(RegInfo->getSlotSize(),
20839
6
                                                       dl));
20840
6
  StoreAddr = DAG.getNode(ISD::ADD, dl, PtrVT, StoreAddr, Offset);
20841
6
  Chain = DAG.getStore(Chain, dl, Handler, StoreAddr, MachinePointerInfo());
20842
6
  Chain = DAG.getCopyToReg(Chain, dl, StoreAddrReg, StoreAddr);
20843
6
20844
6
  return DAG.getNode(X86ISD::EH_RETURN, dl, MVT::Other, Chain,
20845
6
                     DAG.getRegister(StoreAddrReg, PtrVT));
20846
6
}
20847
20848
SDValue X86TargetLowering::lowerEH_SJLJ_SETJMP(SDValue Op,
20849
10
                                               SelectionDAG &DAG) const {
20850
10
  SDLoc DL(Op);
20851
10
  // If the subtarget is not 64bit, we may need the global base reg
20852
10
  // after isel expand pseudo, i.e., after CGBR pass ran.
20853
10
  // Therefore, ask for the GlobalBaseReg now, so that the pass
20854
10
  // inserts the code for us in case we need it.
20855
10
  // Otherwise, we will end up in a situation where we will
20856
10
  // reference a virtual register that is not defined!
20857
10
  if (!Subtarget.is64Bit()) {
20858
4
    const X86InstrInfo *TII = Subtarget.getInstrInfo();
20859
4
    (void)TII->getGlobalBaseReg(&DAG.getMachineFunction());
20860
4
  }
20861
10
  return DAG.getNode(X86ISD::EH_SJLJ_SETJMP, DL,
20862
10
                     DAG.getVTList(MVT::i32, MVT::Other),
20863
10
                     Op.getOperand(0), Op.getOperand(1));
20864
10
}
20865
20866
SDValue X86TargetLowering::lowerEH_SJLJ_LONGJMP(SDValue Op,
20867
4
                                                SelectionDAG &DAG) const {
20868
4
  SDLoc DL(Op);
20869
4
  return DAG.getNode(X86ISD::EH_SJLJ_LONGJMP, DL, MVT::Other,
20870
4
                     Op.getOperand(0), Op.getOperand(1));
20871
4
}
20872
20873
SDValue X86TargetLowering::lowerEH_SJLJ_SETUP_DISPATCH(SDValue Op,
20874
2
                                                       SelectionDAG &DAG) const {
20875
2
  SDLoc DL(Op);
20876
2
  return DAG.getNode(X86ISD::EH_SJLJ_SETUP_DISPATCH, DL, MVT::Other,
20877
2
                     Op.getOperand(0));
20878
2
}
20879
20880
0
static SDValue LowerADJUST_TRAMPOLINE(SDValue Op, SelectionDAG &DAG) {
20881
0
  return Op.getOperand(0);
20882
0
}
20883
20884
SDValue X86TargetLowering::LowerINIT_TRAMPOLINE(SDValue Op,
20885
2
                                                SelectionDAG &DAG) const {
20886
2
  SDValue Root = Op.getOperand(0);
20887
2
  SDValue Trmp = Op.getOperand(1); // trampoline
20888
2
  SDValue FPtr = Op.getOperand(2); // nested function
20889
2
  SDValue Nest = Op.getOperand(3); // 'nest' parameter value
20890
2
  SDLoc dl (Op);
20891
2
20892
2
  const Value *TrmpAddr = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
20893
2
  const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
20894
2
20895
2
  if (Subtarget.is64Bit()) {
20896
1
    SDValue OutChains[6];
20897
1
20898
1
    // Large code-model.
20899
1
    const unsigned char JMP64r  = 0xFF; // 64-bit jmp through register opcode.
20900
1
    const unsigned char MOV64ri = 0xB8; // X86::MOV64ri opcode.
20901
1
20902
1
    const unsigned char N86R10 = TRI->getEncodingValue(X86::R10) & 0x7;
20903
1
    const unsigned char N86R11 = TRI->getEncodingValue(X86::R11) & 0x7;
20904
1
20905
1
    const unsigned char REX_WB = 0x40 | 0x08 | 0x01; // REX prefix
20906
1
20907
1
    // Load the pointer to the nested function into R11.
20908
1
    unsigned OpCode = ((MOV64ri | N86R11) << 8) | REX_WB; // movabsq r11
20909
1
    SDValue Addr = Trmp;
20910
1
    OutChains[0] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, dl, MVT::i16),
20911
1
                                Addr, MachinePointerInfo(TrmpAddr));
20912
1
20913
1
    Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
20914
1
                       DAG.getConstant(2, dl, MVT::i64));
20915
1
    OutChains[1] =
20916
1
        DAG.getStore(Root, dl, FPtr, Addr, MachinePointerInfo(TrmpAddr, 2),
20917
1
                     /* Alignment = */ 2);
20918
1
20919
1
    // Load the 'nest' parameter value into R10.
20920
1
    // R10 is specified in X86CallingConv.td
20921
1
    OpCode = ((MOV64ri | N86R10) << 8) | REX_WB; // movabsq r10
20922
1
    Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
20923
1
                       DAG.getConstant(10, dl, MVT::i64));
20924
1
    OutChains[2] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, dl, MVT::i16),
20925
1
                                Addr, MachinePointerInfo(TrmpAddr, 10));
20926
1
20927
1
    Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
20928
1
                       DAG.getConstant(12, dl, MVT::i64));
20929
1
    OutChains[3] =
20930
1
        DAG.getStore(Root, dl, Nest, Addr, MachinePointerInfo(TrmpAddr, 12),
20931
1
                     /* Alignment = */ 2);
20932
1
20933
1
    // Jump to the nested function.
20934
1
    OpCode = (JMP64r << 8) | REX_WB; // jmpq *...
20935
1
    Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
20936
1
                       DAG.getConstant(20, dl, MVT::i64));
20937
1
    OutChains[4] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, dl, MVT::i16),
20938
1
                                Addr, MachinePointerInfo(TrmpAddr, 20));
20939
1
20940
1
    unsigned char ModRM = N86R11 | (4 << 3) | (3 << 6); // ...r11
20941
1
    Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
20942
1
                       DAG.getConstant(22, dl, MVT::i64));
20943
1
    OutChains[5] = DAG.getStore(Root, dl, DAG.getConstant(ModRM, dl, MVT::i8),
20944
1
                                Addr, MachinePointerInfo(TrmpAddr, 22));
20945
1
20946
1
    return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains);
20947
0
  } else {
20948
1
    const Function *Func =
20949
1
      cast<Function>(cast<SrcValueSDNode>(Op.getOperand(5))->getValue());
20950
1
    CallingConv::ID CC = Func->getCallingConv();
20951
1
    unsigned NestReg;
20952
1
20953
1
    switch (CC) {
20954
0
    default:
20955
0
      llvm_unreachable("Unsupported calling convention");
20956
1
    case CallingConv::C:
20957
1
    case CallingConv::X86_StdCall: {
20958
1
      // Pass 'nest' parameter in ECX.
20959
1
      // Must be kept in sync with X86CallingConv.td
20960
1
      NestReg = X86::ECX;
20961
1
20962
1
      // Check that ECX wasn't needed by an 'inreg' parameter.
20963
1
      FunctionType *FTy = Func->getFunctionType();
20964
1
      const AttributeList &Attrs = Func->getAttributes();
20965
1
20966
1
      if (!Attrs.isEmpty() && !Func->isVarArg()) {
20967
1
        unsigned InRegCount = 0;
20968
1
        unsigned Idx = 1;
20969
1
20970
1
        for (FunctionType::param_iterator I = FTy->param_begin(),
20971
4
             E = FTy->param_end(); I != E; ++I, ++Idx)
20972
3
          if (Attrs.hasAttribute(Idx, Attribute::InReg)) {
20973
0
            auto &DL = DAG.getDataLayout();
20974
0
            // FIXME: should only count parameters that are lowered to integers.
20975
0
            InRegCount += (DL.getTypeSizeInBits(*I) + 31) / 32;
20976
0
          }
20977
1
20978
1
        if (InRegCount > 2) {
20979
0
          report_fatal_error("Nest register in use - reduce number of inreg"
20980
0
                             " parameters!");
20981
0
        }
20982
1
      }
20983
1
      break;
20984
1
    }
20985
0
    case CallingConv::X86_FastCall:
20986
0
    case CallingConv::X86_ThisCall:
20987
0
    case CallingConv::Fast:
20988
0
      // Pass 'nest' parameter in EAX.
20989
0
      // Must be kept in sync with X86CallingConv.td
20990
0
      NestReg = X86::EAX;
20991
0
      break;
20992
1
    }
20993
1
20994
1
    SDValue OutChains[4];
20995
1
    SDValue Addr, Disp;
20996
1
20997
1
    Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
20998
1
                       DAG.getConstant(10, dl, MVT::i32));
20999
1
    Disp = DAG.getNode(ISD::SUB, dl, MVT::i32, FPtr, Addr);
21000
1
21001
1
    // This is storing the opcode for MOV32ri.
21002
1
    const unsigned char MOV32ri = 0xB8; // X86::MOV32ri's opcode byte.
21003
1
    const unsigned char N86Reg = TRI->getEncodingValue(NestReg) & 0x7;
21004
1
    OutChains[0] =
21005
1
        DAG.getStore(Root, dl, DAG.getConstant(MOV32ri | N86Reg, dl, MVT::i8),
21006
1
                     Trmp, MachinePointerInfo(TrmpAddr));
21007
1
21008
1
    Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
21009
1
                       DAG.getConstant(1, dl, MVT::i32));
21010
1
    OutChains[1] =
21011
1
        DAG.getStore(Root, dl, Nest, Addr, MachinePointerInfo(TrmpAddr, 1),
21012
1
                     /* Alignment = */ 1);
21013
1
21014
1
    const unsigned char JMP = 0xE9; // jmp <32bit dst> opcode.
21015
1
    Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
21016
1
                       DAG.getConstant(5, dl, MVT::i32));
21017
1
    OutChains[2] = DAG.getStore(Root, dl, DAG.getConstant(JMP, dl, MVT::i8),
21018
1
                                Addr, MachinePointerInfo(TrmpAddr, 5),
21019
1
                                /* Alignment = */ 1);
21020
1
21021
1
    Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
21022
1
                       DAG.getConstant(6, dl, MVT::i32));
21023
1
    OutChains[3] =
21024
1
        DAG.getStore(Root, dl, Disp, Addr, MachinePointerInfo(TrmpAddr, 6),
21025
1
                     /* Alignment = */ 1);
21026
1
21027
1
    return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains);
21028
1
  }
21029
2
}
21030
21031
SDValue X86TargetLowering::LowerFLT_ROUNDS_(SDValue Op,
21032
0
                                            SelectionDAG &DAG) const {
21033
0
  /*
21034
0
   The rounding mode is in bits 11:10 of FPSR, and has the following
21035
0
   settings:
21036
0
     00 Round to nearest
21037
0
     01 Round to -inf
21038
0
     10 Round to +inf
21039
0
     11 Round to 0
21040
0
21041
0
  FLT_ROUNDS, on the other hand, expects the following:
21042
0
    -1 Undefined
21043
0
     0 Round to 0
21044
0
     1 Round to nearest
21045
0
     2 Round to +inf
21046
0
     3 Round to -inf
21047
0
21048
0
  To perform the conversion, we do:
21049
0
    (((((FPSR & 0x800) >> 11) | ((FPSR & 0x400) >> 9)) + 1) & 3)
21050
0
  */
21051
0
21052
0
  MachineFunction &MF = DAG.getMachineFunction();
21053
0
  const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();
21054
0
  unsigned StackAlignment = TFI.getStackAlignment();
21055
0
  MVT VT = Op.getSimpleValueType();
21056
0
  SDLoc DL(Op);
21057
0
21058
0
  // Save FP Control Word to stack slot
21059
0
  int SSFI = MF.getFrameInfo().CreateStackObject(2, StackAlignment, false);
21060
0
  SDValue StackSlot =
21061
0
      DAG.getFrameIndex(SSFI, getPointerTy(DAG.getDataLayout()));
21062
0
21063
0
  MachineMemOperand *MMO =
21064
0
      MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(MF, SSFI),
21065
0
                              MachineMemOperand::MOStore, 2, 2);
21066
0
21067
0
  SDValue Ops[] = { DAG.getEntryNode(), StackSlot };
21068
0
  SDValue Chain = DAG.getMemIntrinsicNode(X86ISD::FNSTCW16m, DL,
21069
0
                                          DAG.getVTList(MVT::Other),
21070
0
                                          Ops, MVT::i16, MMO);
21071
0
21072
0
  // Load FP Control Word from stack slot
21073
0
  SDValue CWD =
21074
0
      DAG.getLoad(MVT::i16, DL, Chain, StackSlot, MachinePointerInfo());
21075
0
21076
0
  // Transform as necessary
21077
0
  SDValue CWD1 =
21078
0
    DAG.getNode(ISD::SRL, DL, MVT::i16,
21079
0
                DAG.getNode(ISD::AND, DL, MVT::i16,
21080
0
                            CWD, DAG.getConstant(0x800, DL, MVT::i16)),
21081
0
                DAG.getConstant(11, DL, MVT::i8));
21082
0
  SDValue CWD2 =
21083
0
    DAG.getNode(ISD::SRL, DL, MVT::i16,
21084
0
                DAG.getNode(ISD::AND, DL, MVT::i16,
21085
0
                            CWD, DAG.getConstant(0x400, DL, MVT::i16)),
21086
0
                DAG.getConstant(9, DL, MVT::i8));
21087
0
21088
0
  SDValue RetVal =
21089
0
    DAG.getNode(ISD::AND, DL, MVT::i16,
21090
0
                DAG.getNode(ISD::ADD, DL, MVT::i16,
21091
0
                            DAG.getNode(ISD::OR, DL, MVT::i16, CWD1, CWD2),
21092
0
                            DAG.getConstant(1, DL, MVT::i16)),
21093
0
                DAG.getConstant(3, DL, MVT::i16));
21094
0
21095
0
  return DAG.getNode((VT.getSizeInBits() < 16 ?
21096
0
                      ISD::TRUNCATE : ISD::ZERO_EXTEND), DL, VT, RetVal);
21097
0
}
21098
21099
// Split an unary integer op into 2 half sized ops.
21100
91
static SDValue LowerVectorIntUnary(SDValue Op, SelectionDAG &DAG) {
21101
91
  MVT VT = Op.getSimpleValueType();
21102
91
  unsigned NumElems = VT.getVectorNumElements();
21103
91
  unsigned SizeInBits = VT.getSizeInBits();
21104
91
21105
91
  // Extract the Lo/Hi vectors
21106
91
  SDLoc dl(Op);
21107
91
  SDValue Src = Op.getOperand(0);
21108
91
  SDValue Lo = extractSubVector(Src, 0, DAG, dl, SizeInBits / 2);
21109
91
  SDValue Hi = extractSubVector(Src, NumElems / 2, DAG, dl, SizeInBits / 2);
21110
91
21111
91
  MVT EltVT = VT.getVectorElementType();
21112
91
  MVT NewVT = MVT::getVectorVT(EltVT, NumElems / 2);
21113
91
  return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
21114
91
                     DAG.getNode(Op.getOpcode(), dl, NewVT, Lo),
21115
91
                     DAG.getNode(Op.getOpcode(), dl, NewVT, Hi));
21116
91
}
21117
21118
// Decompose 256-bit ops into smaller 128-bit ops.
21119
67
static SDValue Lower256IntUnary(SDValue Op, SelectionDAG &DAG) {
21120
67
  assert(Op.getSimpleValueType().is256BitVector() &&
21121
67
         Op.getSimpleValueType().isInteger() &&
21122
67
         "Only handle AVX 256-bit vector integer operation");
21123
67
  return LowerVectorIntUnary(Op, DAG);
21124
67
}
21125
21126
// Decompose 512-bit ops into smaller 256-bit ops.
21127
8
static SDValue Lower512IntUnary(SDValue Op, SelectionDAG &DAG) {
21128
8
  assert(Op.getSimpleValueType().is512BitVector() &&
21129
8
         Op.getSimpleValueType().isInteger() &&
21130
8
         "Only handle AVX 512-bit vector integer operation");
21131
8
  return LowerVectorIntUnary(Op, DAG);
21132
8
}
21133
21134
/// \brief Lower a vector CTLZ using native supported vector CTLZ instruction.
21135
//
21136
// i8/i16 vector implemented using dword LZCNT vector instruction
21137
// ( sub(trunc(lzcnt(zext32(x)))) ). In case zext32(x) is illegal,
21138
// split the vector, perform the operation on its Lo and Hi parts and
21139
// concatenate the results.
21140
60
static SDValue LowerVectorCTLZ_AVX512CDI(SDValue Op, SelectionDAG &DAG) {
21141
60
  assert(Op.getOpcode() == ISD::CTLZ);
21142
60
  SDLoc dl(Op);
21143
60
  MVT VT = Op.getSimpleValueType();
21144
60
  MVT EltVT = VT.getVectorElementType();
21145
60
  unsigned NumElems = VT.getVectorNumElements();
21146
60
21147
60
  assert((EltVT == MVT::i8 || EltVT == MVT::i16) &&
21148
60
          "Unsupported element type");
21149
60
21150
60
  // Split the vector; its Lo and Hi parts will be handled in the next iteration.
21151
60
  if (16 < NumElems)
21152
16
    return LowerVectorIntUnary(Op, DAG);
21153
44
21154
44
  MVT NewVT = MVT::getVectorVT(MVT::i32, NumElems);
21155
44
  assert((NewVT.is256BitVector() || NewVT.is512BitVector()) &&
21156
44
          "Unsupported value type for operation");
21157
44
21158
44
  // Use native supported vector instruction vplzcntd.
21159
44
  Op = DAG.getNode(ISD::ZERO_EXTEND, dl, NewVT, Op.getOperand(0));
21160
44
  SDValue CtlzNode = DAG.getNode(ISD::CTLZ, dl, NewVT, Op);
21161
44
  SDValue TruncNode = DAG.getNode(ISD::TRUNCATE, dl, VT, CtlzNode);
21162
44
  SDValue Delta = DAG.getConstant(32 - EltVT.getSizeInBits(), dl, VT);
21163
44
21164
44
  return DAG.getNode(ISD::SUB, dl, VT, TruncNode, Delta);
21165
44
}
21166
21167
// Lower CTLZ using a PSHUFB lookup table implementation.
21168
static SDValue LowerVectorCTLZInRegLUT(SDValue Op, const SDLoc &DL,
21169
                                       const X86Subtarget &Subtarget,
21170
114
                                       SelectionDAG &DAG) {
21171
114
  MVT VT = Op.getSimpleValueType();
21172
114
  int NumElts = VT.getVectorNumElements();
21173
114
  int NumBytes = NumElts * (VT.getScalarSizeInBits() / 8);
21174
114
  MVT CurrVT = MVT::getVectorVT(MVT::i8, NumBytes);
21175
114
21176
114
  // Per-nibble leading zero PSHUFB lookup table.
21177
114
  const int LUT[16] = {/* 0 */ 4, /* 1 */ 3, /* 2 */ 2, /* 3 */ 2,
21178
114
                       /* 4 */ 1, /* 5 */ 1, /* 6 */ 1, /* 7 */ 1,
21179
114
                       /* 8 */ 0, /* 9 */ 0, /* a */ 0, /* b */ 0,
21180
114
                       /* c */ 0, /* d */ 0, /* e */ 0, /* f */ 0};
21181
114
21182
114
  SmallVector<SDValue, 64> LUTVec;
21183
2.57k
  for (int i = 0; i < NumBytes; ++i)
21184
2.46k
    LUTVec.push_back(DAG.getConstant(LUT[i % 16], DL, MVT::i8));
21185
114
  SDValue InRegLUT = DAG.getBuildVector(CurrVT, DL, LUTVec);
21186
114
21187
114
  // Begin by bitcasting the input to byte vector, then split those bytes
21188
114
  // into lo/hi nibbles and use the PSHUFB LUT to perform CLTZ on each of them.
21189
114
  // If the hi input nibble is zero then we add both results together, otherwise
21190
114
  // we just take the hi result (by masking the lo result to zero before the
21191
114
  // add).
21192
114
  SDValue Op0 = DAG.getBitcast(CurrVT, Op.getOperand(0));
21193
114
  SDValue Zero = getZeroVector(CurrVT, Subtarget, DAG, DL);
21194
114
21195
114
  SDValue NibbleMask = DAG.getConstant(0xF, DL, CurrVT);
21196
114
  SDValue NibbleShift = DAG.getConstant(0x4, DL, CurrVT);
21197
114
  SDValue Lo = DAG.getNode(ISD::AND, DL, CurrVT, Op0, NibbleMask);
21198
114
  SDValue Hi = DAG.getNode(ISD::SRL, DL, CurrVT, Op0, NibbleShift);
21199
114
  SDValue HiZ = DAG.getSetCC(DL, CurrVT, Hi, Zero, ISD::SETEQ);
21200
114
21201
114
  Lo = DAG.getNode(X86ISD::PSHUFB, DL, CurrVT, InRegLUT, Lo);
21202
114
  Hi = DAG.getNode(X86ISD::PSHUFB, DL, CurrVT, InRegLUT, Hi);
21203
114
  Lo = DAG.getNode(ISD::AND, DL, CurrVT, Lo, HiZ);
21204
114
  SDValue Res = DAG.getNode(ISD::ADD, DL, CurrVT, Lo, Hi);
21205
114
21206
114
  // Merge result back from vXi8 back to VT, working on the lo/hi halves
21207
114
  // of the current vector width in the same way we did for the nibbles.
21208
114
  // If the upper half of the input element is zero then add the halves'
21209
114
  // leading zero counts together, otherwise just use the upper half's.
21210
114
  // Double the width of the result until we are at target width.
21211
278
  while (CurrVT != VT) {
21212
164
    int CurrScalarSizeInBits = CurrVT.getScalarSizeInBits();
21213
164
    int CurrNumElts = CurrVT.getVectorNumElements();
21214
164
    MVT NextSVT = MVT::getIntegerVT(CurrScalarSizeInBits * 2);
21215
164
    MVT NextVT = MVT::getVectorVT(NextSVT, CurrNumElts / 2);
21216
164
    SDValue Shift = DAG.getConstant(CurrScalarSizeInBits, DL, NextVT);
21217
164
21218
164
    // Check if the upper half of the input element is zero.
21219
164
    SDValue HiZ = DAG.getSetCC(DL, CurrVT, DAG.getBitcast(CurrVT, Op0),
21220
164
                               DAG.getBitcast(CurrVT, Zero), ISD::SETEQ);
21221
164
    HiZ = DAG.getBitcast(NextVT, HiZ);
21222
164
21223
164
    // Move the upper/lower halves to the lower bits as we'll be extending to
21224
164
    // NextVT. Mask the lower result to zero if HiZ is true and add the results
21225
164
    // together.
21226
164
    SDValue ResNext = Res = DAG.getBitcast(NextVT, Res);
21227
164
    SDValue R0 = DAG.getNode(ISD::SRL, DL, NextVT, ResNext, Shift);
21228
164
    SDValue R1 = DAG.getNode(ISD::SRL, DL, NextVT, HiZ, Shift);
21229
164
    R1 = DAG.getNode(ISD::AND, DL, NextVT, ResNext, R1);
21230
164
    Res = DAG.getNode(ISD::ADD, DL, NextVT, R0, R1);
21231
164
    CurrVT = NextVT;
21232
164
  }
21233
114
21234
114
  return Res;
21235
114
}
21236
21237
static SDValue LowerVectorCTLZ(SDValue Op, const SDLoc &DL,
21238
                               const X86Subtarget &Subtarget,
21239
182
                               SelectionDAG &DAG) {
21240
182
  MVT VT = Op.getSimpleValueType();
21241
182
21242
182
  if (Subtarget.hasCDI())
21243
60
    return LowerVectorCTLZ_AVX512CDI(Op, DAG);
21244
122
21245
122
  // Decompose 256-bit ops into smaller 128-bit ops.
21246
122
  if (VT.is256BitVector() && !Subtarget.hasInt256())
21247
8
    return Lower256IntUnary(Op, DAG);
21248
114
21249
114
  // Decompose 512-bit ops into smaller 256-bit ops.
21250
114
  if (VT.is512BitVector() && !Subtarget.hasBWI())
21251
0
    return Lower512IntUnary(Op, DAG);
21252
114
21253
114
  assert(Subtarget.hasSSSE3() && "Expected SSSE3 support for PSHUFB");
21254
114
  return LowerVectorCTLZInRegLUT(Op, DL, Subtarget, DAG);
21255
114
}
21256
21257
static SDValue LowerCTLZ(SDValue Op, const X86Subtarget &Subtarget,
21258
657
                         SelectionDAG &DAG) {
21259
657
  MVT VT = Op.getSimpleValueType();
21260
657
  MVT OpVT = VT;
21261
657
  unsigned NumBits = VT.getSizeInBits();
21262
657
  SDLoc dl(Op);
21263
657
  unsigned Opc = Op.getOpcode();
21264
657
21265
657
  if (VT.isVector())
21266
182
    return LowerVectorCTLZ(Op, dl, Subtarget, DAG);
21267
475
21268
475
  Op = Op.getOperand(0);
21269
475
  if (VT == MVT::i8) {
21270
7
    // Zero extend to i32 since there is not an i8 bsr.
21271
7
    OpVT = MVT::i32;
21272
7
    Op = DAG.getNode(ISD::ZERO_EXTEND, dl, OpVT, Op);
21273
7
  }
21274
475
21275
475
  // Issue a bsr (scan bits in reverse) which also sets EFLAGS.
21276
475
  SDVTList VTs = DAG.getVTList(OpVT, MVT::i32);
21277
475
  Op = DAG.getNode(X86ISD::BSR, dl, VTs, Op);
21278
475
21279
475
  if (Opc == ISD::CTLZ) {
21280
11
    // If src is zero (i.e. bsr sets ZF), returns NumBits.
21281
11
    SDValue Ops[] = {
21282
11
      Op,
21283
11
      DAG.getConstant(NumBits + NumBits - 1, dl, OpVT),
21284
11
      DAG.getConstant(X86::COND_E, dl, MVT::i8),
21285
11
      Op.getValue(1)
21286
11
    };
21287
11
    Op = DAG.getNode(X86ISD::CMOV, dl, OpVT, Ops);
21288
11
  }
21289
475
21290
475
  // Finally xor with NumBits-1.
21291
475
  Op = DAG.getNode(ISD::XOR, dl, OpVT, Op,
21292
475
                   DAG.getConstant(NumBits - 1, dl, OpVT));
21293
475
21294
475
  if (VT == MVT::i8)
21295
7
    Op = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op);
21296
657
  return Op;
21297
657
}
21298
21299
171
static SDValue LowerCTTZ(SDValue Op, SelectionDAG &DAG) {
21300
171
  MVT VT = Op.getSimpleValueType();
21301
171
  unsigned NumBits = VT.getScalarSizeInBits();
21302
171
  SDLoc dl(Op);
21303
171
21304
171
  if (VT.isVector()) {
21305
170
    SDValue N0 = Op.getOperand(0);
21306
170
    SDValue Zero = DAG.getConstant(0, dl, VT);
21307
170
21308
170
    // lsb(x) = (x & -x)
21309
170
    SDValue LSB = DAG.getNode(ISD::AND, dl, VT, N0,
21310
170
                              DAG.getNode(ISD::SUB, dl, VT, Zero, N0));
21311
170
21312
170
    // cttz_undef(x) = (width - 1) - ctlz(lsb)
21313
170
    if (Op.getOpcode() == ISD::CTTZ_ZERO_UNDEF) {
21314
12
      SDValue WidthMinusOne = DAG.getConstant(NumBits - 1, dl, VT);
21315
12
      return DAG.getNode(ISD::SUB, dl, VT, WidthMinusOne,
21316
12
                         DAG.getNode(ISD::CTLZ, dl, VT, LSB));
21317
12
    }
21318
158
21319
158
    // cttz(x) = ctpop(lsb - 1)
21320
158
    SDValue One = DAG.getConstant(1, dl, VT);
21321
158
    return DAG.getNode(ISD::CTPOP, dl, VT,
21322
158
                       DAG.getNode(ISD::SUB, dl, VT, LSB, One));
21323
158
  }
21324
1
21325
171
  assert(Op.getOpcode() == ISD::CTTZ &&
21326
1
         "Only scalar CTTZ requires custom lowering");
21327
1
21328
1
  // Issue a bsf (scan bits forward) which also sets EFLAGS.
21329
1
  SDVTList VTs = DAG.getVTList(VT, MVT::i32);
21330
1
  Op = DAG.getNode(X86ISD::BSF, dl, VTs, Op.getOperand(0));
21331
1
21332
1
  // If src is zero (i.e. bsf sets ZF), returns NumBits.
21333
1
  SDValue Ops[] = {
21334
1
    Op,
21335
1
    DAG.getConstant(NumBits, dl, VT),
21336
1
    DAG.getConstant(X86::COND_E, dl, MVT::i8),
21337
1
    Op.getValue(1)
21338
1
  };
21339
1
  return DAG.getNode(X86ISD::CMOV, dl, VT, Ops);
21340
1
}
21341
21342
/// Break a 256-bit integer operation into two new 128-bit ones and then
21343
/// concatenate the result back.
21344
1.03k
static SDValue Lower256IntArith(SDValue Op, SelectionDAG &DAG) {
21345
1.03k
  MVT VT = Op.getSimpleValueType();
21346
1.03k
21347
1.03k
  assert(VT.is256BitVector() && VT.isInteger() &&
21348
1.03k
         "Unsupported value type for operation");
21349
1.03k
21350
1.03k
  unsigned NumElems = VT.getVectorNumElements();
21351
1.03k
  SDLoc dl(Op);
21352
1.03k
21353
1.03k
  // Extract the LHS vectors
21354
1.03k
  SDValue LHS = Op.getOperand(0);
21355
1.03k
  SDValue LHS1 = extract128BitVector(LHS, 0, DAG, dl);
21356
1.03k
  SDValue LHS2 = extract128BitVector(LHS, NumElems / 2, DAG, dl);
21357
1.03k
21358
1.03k
  // Extract the RHS vectors
21359
1.03k
  SDValue RHS = Op.getOperand(1);
21360
1.03k
  SDValue RHS1 = extract128BitVector(RHS, 0, DAG, dl);
21361
1.03k
  SDValue RHS2 = extract128BitVector(RHS, NumElems / 2, DAG, dl);
21362
1.03k
21363
1.03k
  MVT EltVT = VT.getVectorElementType();
21364
1.03k
  MVT NewVT = MVT::getVectorVT(EltVT, NumElems/2);
21365
1.03k
21366
1.03k
  return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
21367
1.03k
                     DAG.getNode(Op.getOpcode(), dl, NewVT, LHS1, RHS1),
21368
1.03k
                     DAG.getNode(Op.getOpcode(), dl, NewVT, LHS2, RHS2));
21369
1.03k
}
21370
21371
/// Break a 512-bit integer operation into two new 256-bit ones and then
21372
/// concatenate the result back.
21373
2
static SDValue Lower512IntArith(SDValue Op, SelectionDAG &DAG) {
21374
2
  MVT VT = Op.getSimpleValueType();
21375
2
21376
2
  assert(VT.is512BitVector() && VT.isInteger() &&
21377
2
         "Unsupported value type for operation");
21378
2
21379
2
  unsigned NumElems = VT.getVectorNumElements();
21380
2
  SDLoc dl(Op);
21381
2
21382
2
  // Extract the LHS vectors
21383
2
  SDValue LHS = Op.getOperand(0);
21384
2
  SDValue LHS1 = extract256BitVector(LHS, 0, DAG, dl);
21385
2
  SDValue LHS2 = extract256BitVector(LHS, NumElems / 2, DAG, dl);
21386
2
21387
2
  // Extract the RHS vectors
21388
2
  SDValue RHS = Op.getOperand(1);
21389
2
  SDValue RHS1 = extract256BitVector(RHS, 0, DAG, dl);
21390
2
  SDValue RHS2 = extract256BitVector(RHS, NumElems / 2, DAG, dl);
21391
2
21392
2
  MVT EltVT = VT.getVectorElementType();
21393
2
  MVT NewVT = MVT::getVectorVT(EltVT, NumElems/2);
21394
2
21395
2
  return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
21396
2
                     DAG.getNode(Op.getOpcode(), dl, NewVT, LHS1, RHS1),
21397
2
                     DAG.getNode(Op.getOpcode(), dl, NewVT, LHS2, RHS2));
21398
2
}
21399
21400
409
static SDValue LowerADD_SUB(SDValue Op, SelectionDAG &DAG) {
21401
409
  MVT VT = Op.getSimpleValueType();
21402
409
  if (VT.getScalarType() == MVT::i1)
21403
20
    return DAG.getNode(ISD::XOR, SDLoc(Op), VT,
21404
20
                       Op.getOperand(0), Op.getOperand(1));
21405
409
  assert(Op.getSimpleValueType().is256BitVector() &&
21406
389
         Op.getSimpleValueType().isInteger() &&
21407
389
         "Only handle AVX 256-bit vector integer operation");
21408
389
  return Lower256IntArith(Op, DAG);
21409
389
}
21410
21411
11
static SDValue LowerABS(SDValue Op, SelectionDAG &DAG) {
21412
11
  assert(Op.getSimpleValueType().is256BitVector() &&
21413
11
         Op.getSimpleValueType().isInteger() &&
21414
11
         "Only handle AVX 256-bit vector integer operation");
21415
11
  return Lower256IntUnary(Op, DAG);
21416
11
}
21417
21418
184
static SDValue LowerMINMAX(SDValue Op, SelectionDAG &DAG) {
21419
184
  assert(Op.getSimpleValueType().is256BitVector() &&
21420
184
         Op.getSimpleValueType().isInteger() &&
21421
184
         "Only handle AVX 256-bit vector integer operation");
21422
184
  return Lower256IntArith(Op, DAG);
21423
184
}
21424
21425
static SDValue LowerMUL(SDValue Op, const X86Subtarget &Subtarget,
21426
413
                        SelectionDAG &DAG) {
21427
413
  SDLoc dl(Op);
21428
413
  MVT VT = Op.getSimpleValueType();
21429
413
21430
413
  if (VT.getScalarType() == MVT::i1)
21431
10
    return DAG.getNode(ISD::AND, dl, VT, Op.getOperand(0), Op.getOperand(1));
21432
403
21433
403
  // Decompose 256-bit ops into smaller 128-bit ops.
21434
403
  if (VT.is256BitVector() && !Subtarget.hasInt256())
21435
30
    return Lower256IntArith(Op, DAG);
21436
373
21437
373
  SDValue A = Op.getOperand(0);
21438
373
  SDValue B = Op.getOperand(1);
21439
373
21440
373
  // Lower v16i8/v32i8/v64i8 mul as sign-extension to v8i16/v16i16/v32i16
21441
373
  // vector pairs, multiply and truncate.
21442
373
  if (VT == MVT::v16i8 || VT == MVT::v32i8 || VT == MVT::v64i8) {
21443
136
    if (Subtarget.hasInt256()) {
21444
86
      // For 512-bit vectors, split into 256-bit vectors to allow the
21445
86
      // sign-extension to occur.
21446
86
      if (VT == MVT::v64i8)
21447
2
        return Lower512IntArith(Op, DAG);
21448
84
21449
84
      // For 256-bit vectors, split into 128-bit vectors to allow the
21450
84
      // sign-extension to occur. We don't need this on AVX512BW as we can
21451
84
      // safely sign-extend to v32i16.
21452
84
      if (VT == MVT::v32i8 && !Subtarget.hasBWI())
21453
20
        return Lower256IntArith(Op, DAG);
21454
64
21455
64
      MVT ExVT = MVT::getVectorVT(MVT::i16, VT.getVectorNumElements());
21456
64
      return DAG.getNode(
21457
64
          ISD::TRUNCATE, dl, VT,
21458
64
          DAG.getNode(ISD::MUL, dl, ExVT,
21459
64
                      DAG.getNode(ISD::SIGN_EXTEND, dl, ExVT, A),
21460
64
                      DAG.getNode(ISD::SIGN_EXTEND, dl, ExVT, B)));
21461
64
    }
21462
50
21463
0
    assert(VT == MVT::v16i8 &&
21464
50
           "Pre-AVX2 support only supports v16i8 multiplication");
21465
50
    MVT ExVT = MVT::v8i16;
21466
50
21467
50
    // Extract the lo parts and sign extend to i16
21468
50
    SDValue ALo, BLo;
21469
50
    if (Subtarget.hasSSE41()) {
21470
34
      ALo = DAG.getSignExtendVectorInReg(A, dl, ExVT);
21471
34
      BLo = DAG.getSignExtendVectorInReg(B, dl, ExVT);
21472
50
    } else {
21473
16
      const int ShufMask[] = {-1, 0, -1, 1, -1, 2, -1, 3,
21474
16
                              -1, 4, -1, 5, -1, 6, -1, 7};
21475
16
      ALo = DAG.getVectorShuffle(VT, dl, A, A, ShufMask);
21476
16
      BLo = DAG.getVectorShuffle(VT, dl, B, B, ShufMask);
21477
16
      ALo = DAG.getBitcast(ExVT, ALo);
21478
16
      BLo = DAG.getBitcast(ExVT, BLo);
21479
16
      ALo = DAG.getNode(ISD::SRA, dl, ExVT, ALo, DAG.getConstant(8, dl, ExVT));
21480
16
      BLo = DAG.getNode(ISD::SRA, dl, ExVT, BLo, DAG.getConstant(8, dl, ExVT));
21481
16
    }
21482
50
21483
50
    // Extract the hi parts and sign extend to i16
21484
50
    SDValue AHi, BHi;
21485
50
    if (Subtarget.hasSSE41()) {
21486
34
      const int ShufMask[] = {8,  9,  10, 11, 12, 13, 14, 15,
21487
34
                              -1, -1, -1, -1, -1, -1, -1, -1};
21488
34
      AHi = DAG.getVectorShuffle(VT, dl, A, A, ShufMask);
21489
34
      BHi = DAG.getVectorShuffle(VT, dl, B, B, ShufMask);
21490
34
      AHi = DAG.getSignExtendVectorInReg(AHi, dl, ExVT);
21491
34
      BHi = DAG.getSignExtendVectorInReg(BHi, dl, ExVT);
21492
50
    } else {
21493
16
      const int ShufMask[] = {-1, 8,  -1, 9,  -1, 10, -1, 11,
21494
16
                              -1, 12, -1, 13, -1, 14, -1, 15};
21495
16
      AHi = DAG.getVectorShuffle(VT, dl, A, A, ShufMask);
21496
16
      BHi = DAG.getVectorShuffle(VT, dl, B, B, ShufMask);
21497
16
      AHi = DAG.getBitcast(ExVT, AHi);
21498
16
      BHi = DAG.getBitcast(ExVT, BHi);
21499
16
      AHi = DAG.getNode(ISD::SRA, dl, ExVT, AHi, DAG.getConstant(8, dl, ExVT));
21500
16
      BHi = DAG.getNode(ISD::SRA, dl, ExVT, BHi, DAG.getConstant(8, dl, ExVT));
21501
16
    }
21502
136
21503
136
    // Multiply, mask the lower 8bits of the lo/hi results and pack
21504
136
    SDValue RLo = DAG.getNode(ISD::MUL, dl, ExVT, ALo, BLo);
21505
136
    SDValue RHi = DAG.getNode(ISD::MUL, dl, ExVT, AHi, BHi);
21506
136
    RLo = DAG.getNode(ISD::AND, dl, ExVT, RLo, DAG.getConstant(255, dl, ExVT));
21507
136
    RHi = DAG.getNode(ISD::AND, dl, ExVT, RHi, DAG.getConstant(255, dl, ExVT));
21508
136
    return DAG.getNode(X86ISD::PACKUS, dl, VT, RLo, RHi);
21509
136
  }
21510
237
21511
237
  // Lower v4i32 mul as 2x shuffle, 2x pmuludq, 2x shuffle.
21512
237
  if (VT == MVT::v4i32) {
21513
52
    assert(Subtarget.hasSSE2() && !Subtarget.hasSSE41() &&
21514
52
           "Should not custom lower when pmuldq is available!");
21515
52
21516
52
    // Extract the odd parts.
21517
52
    static const int UnpackMask[] = { 1, -1, 3, -1 };
21518
52
    SDValue Aodds = DAG.getVectorShuffle(VT, dl, A, A, UnpackMask);
21519
52
    SDValue Bodds = DAG.getVectorShuffle(VT, dl, B, B, UnpackMask);
21520
52
21521
52
    // Multiply the even parts.
21522
52
    SDValue Evens = DAG.getNode(X86ISD::PMULUDQ, dl, MVT::v2i64, A, B);
21523
52
    // Now multiply odd parts.
21524
52
    SDValue Odds = DAG.getNode(X86ISD::PMULUDQ, dl, MVT::v2i64, Aodds, Bodds);
21525
52
21526
52
    Evens = DAG.getBitcast(VT, Evens);
21527
52
    Odds = DAG.getBitcast(VT, Odds);
21528
52
21529
52
    // Merge the two vectors back together with a shuffle. This expands into 2
21530
52
    // shuffles.
21531
52
    static const int ShufMask[] = { 0, 4, 2, 6 };
21532
52
    return DAG.getVectorShuffle(VT, dl, Evens, Odds, ShufMask);
21533
52
  }
21534
185
21535
237
  assert((VT == MVT::v2i64 || VT == MVT::v4i64 || VT == MVT::v8i64) &&
21536
185
         "Only know how to lower V2I64/V4I64/V8I64 multiply");
21537
185
21538
185
  // 32-bit vector types used for MULDQ/MULUDQ.
21539
185
  MVT MulVT = MVT::getVectorVT(MVT::i32, VT.getSizeInBits() / 32);
21540
185
21541
185
  // MULDQ returns the 64-bit result of the signed multiplication of the lower
21542
185
  // 32-bits. We can lower with this if the sign bits stretch that far.
21543
185
  if (Subtarget.hasSSE41() && DAG.ComputeNumSignBits(A) > 32 &&
21544
185
      DAG.ComputeNumSignBits(B) > 32) {
21545
15
    return DAG.getNode(X86ISD::PMULDQ, dl, VT, DAG.getBitcast(MulVT, A),
21546
15
                       DAG.getBitcast(MulVT, B));
21547
15
  }
21548
170
21549
170
  //  Ahi = psrlqi(a, 32);
21550
170
  //  Bhi = psrlqi(b, 32);
21551
170
  //
21552
170
  //  AloBlo = pmuludq(a, b);
21553
170
  //  AloBhi = pmuludq(a, Bhi);
21554
170
  //  AhiBlo = pmuludq(Ahi, b);
21555
170
  //
21556
170
  //  Hi = psllqi(AloBhi + AhiBlo, 32);
21557
170
  //  return AloBlo + Hi;
21558
170
  APInt LowerBitsMask = APInt::getLowBitsSet(64, 32);
21559
170
  bool ALoIsZero = DAG.MaskedValueIsZero(A, LowerBitsMask);
21560
170
  bool BLoIsZero = DAG.MaskedValueIsZero(B, LowerBitsMask);
21561
170
21562
170
  APInt UpperBitsMask = APInt::getHighBitsSet(64, 32);
21563
170
  bool AHiIsZero = DAG.MaskedValueIsZero(A, UpperBitsMask);
21564
170
  bool BHiIsZero = DAG.MaskedValueIsZero(B, UpperBitsMask);
21565
170
21566
170
  // Bit cast to 32-bit vectors for MULUDQ.
21567
170
  SDValue Alo = DAG.getBitcast(MulVT, A);
21568
170
  SDValue Blo = DAG.getBitcast(MulVT, B);
21569
170
21570
170
  SDValue Zero = getZeroVector(VT, Subtarget, DAG, dl);
21571
170
21572
170
  // Only multiply lo/hi halves that aren't known to be zero.
21573
170
  SDValue AloBlo = Zero;
21574
170
  if (!ALoIsZero && !BLoIsZero)
21575
163
    AloBlo = DAG.getNode(X86ISD::PMULUDQ, dl, VT, Alo, Blo);
21576
170
21577
170
  SDValue AloBhi = Zero;
21578
170
  if (!ALoIsZero && !BHiIsZero) {
21579
83
    SDValue Bhi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, B, 32, DAG);
21580
83
    Bhi = DAG.getBitcast(MulVT, Bhi);
21581
83
    AloBhi = DAG.getNode(X86ISD::PMULUDQ, dl, VT, Alo, Bhi);
21582
83
  }
21583
170
21584
170
  SDValue AhiBlo = Zero;
21585
170
  if (!AHiIsZero && !BLoIsZero) {
21586
119
    SDValue Ahi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, A, 32, DAG);
21587
119
    Ahi = DAG.getBitcast(MulVT, Ahi);
21588
119
    AhiBlo = DAG.getNode(X86ISD::PMULUDQ, dl, VT, Ahi, Blo);
21589
119
  }
21590
413
21591
413
  SDValue Hi = DAG.getNode(ISD::ADD, dl, VT, AloBhi, AhiBlo);
21592
413
  Hi = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, Hi, 32, DAG);
21593
413
21594
413
  return DAG.getNode(ISD::ADD, dl, VT, AloBlo, Hi);
21595
413
}
21596
21597
static SDValue LowerMULH(SDValue Op, const X86Subtarget &Subtarget,
21598
52
                         SelectionDAG &DAG) {
21599
52
  SDLoc dl(Op);
21600
52
  MVT VT = Op.getSimpleValueType();
21601
52
21602
52
  // Decompose 256-bit ops into smaller 128-bit ops.
21603
52
  if (VT.is256BitVector() && !Subtarget.hasInt256())
21604
8
    return Lower256IntArith(Op, DAG);
21605
44
21606
44
  // Only i8 vectors should need custom lowering after this.
21607
52
  assert((VT == MVT::v16i8 || (VT == MVT::v32i8 && Subtarget.hasInt256())) &&
21608
44
         "Unsupported vector type");
21609
44
21610
44
  // Lower v16i8/v32i8 as extension to v8i16/v16i16 vector pairs, multiply,
21611
44
  // logical shift down the upper half and pack back to i8.
21612
44
  SDValue A = Op.getOperand(0);
21613
44
  SDValue B = Op.getOperand(1);
21614
44
21615
44
  // With SSE41 we can use sign/zero extend, but for pre-SSE41 we unpack
21616
44
  // and then ashr/lshr the upper bits down to the lower bits before multiply.
21617
44
  unsigned Opcode = Op.getOpcode();
21618
44
  unsigned ExShift = (ISD::MULHU == Opcode ? ISD::SRL : ISD::SRA);
21619
44
  unsigned ExAVX = (ISD::MULHU == Opcode ? ISD::ZERO_EXTEND : ISD::SIGN_EXTEND);
21620
44
21621
44
  // AVX2 implementations - extend xmm subvectors to ymm.
21622
44
  if (Subtarget.hasInt256()) {
21623
24
    unsigned NumElems = VT.getVectorNumElements();
21624
24
    SDValue Lo = DAG.getIntPtrConstant(0, dl);
21625
24
    SDValue Hi = DAG.getIntPtrConstant(NumElems / 2, dl);
21626
24
21627
24
    if (VT == MVT::v32i8) {
21628
16
      if (Subtarget.hasBWI()) {
21629
4
        SDValue ExA = DAG.getNode(ExAVX, dl, MVT::v32i16, A);
21630
4
        SDValue ExB = DAG.getNode(ExAVX, dl, MVT::v32i16, B);
21631
4
        SDValue Mul = DAG.getNode(ISD::MUL, dl, MVT::v32i16, ExA, ExB);
21632
4
        Mul = DAG.getNode(ISD::SRL, dl, MVT::v32i16, Mul,
21633
4
                          DAG.getConstant(8, dl, MVT::v32i16));
21634
4
        // The ymm variant of PACKUS treats the 128-bit lanes separately, so
21635
4
        // before using PACKUS we need to permute the inputs to the correct
21636
4
        // lo/hi xmm lane.
21637
4
        const int Mask[] = { 0,  1,  2,  3,  4,  5,  6,  7,
21638
4
                            16, 17, 18, 19, 20, 21, 22, 23,
21639
4
                             8,  9, 10, 11, 12, 13, 14, 15,
21640
4
                            24, 25, 26, 27, 28, 29, 30, 31};
21641
4
        Mul = DAG.getVectorShuffle(MVT::v32i16, dl, Mul, Mul, Mask);
21642
4
        Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v16i16, Mul, Lo);
21643
4
        Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v16i16, Mul, Hi);
21644
4
        return DAG.getNode(X86ISD::PACKUS, dl, VT, Lo, Hi);
21645
4
      }
21646
12
      SDValue ALo = extract128BitVector(A, 0, DAG, dl);
21647
12
      SDValue BLo = extract128BitVector(B, 0, DAG, dl);
21648
12
      SDValue AHi = extract128BitVector(A, NumElems / 2, DAG, dl);
21649
12
      SDValue BHi = extract128BitVector(B, NumElems / 2, DAG, dl);
21650
12
      ALo = DAG.getNode(ExAVX, dl, MVT::v16i16, ALo);
21651
12
      BLo = DAG.getNode(ExAVX, dl, MVT::v16i16, BLo);
21652
12
      AHi = DAG.getNode(ExAVX, dl, MVT::v16i16, AHi);
21653
12
      BHi = DAG.getNode(ExAVX, dl, MVT::v16i16, BHi);
21654
12
      Lo = DAG.getNode(ISD::SRL, dl, MVT::v16i16,
21655
12
                       DAG.getNode(ISD::MUL, dl, MVT::v16i16, ALo, BLo),
21656
12
                       DAG.getConstant(8, dl, MVT::v16i16));
21657
12
      Hi = DAG.getNode(ISD::SRL, dl, MVT::v16i16,
21658
12
                       DAG.getNode(ISD::MUL, dl, MVT::v16i16, AHi, BHi),
21659
12
                       DAG.getConstant(8, dl, MVT::v16i16));
21660
12
      // The ymm variant of PACKUS treats the 128-bit lanes separately, so before
21661
12
      // using PACKUS we need to permute the inputs to the correct lo/hi xmm lane.
21662
12
      const int LoMask[] = {0,  1,  2,  3,  4,  5,  6,  7,
21663
12
                            16, 17, 18, 19, 20, 21, 22, 23};
21664
12
      const int HiMask[] = {8,  9,  10, 11, 12, 13, 14, 15,
21665
12
                            24, 25, 26, 27, 28, 29, 30, 31};
21666
12
      return DAG.getNode(X86ISD::PACKUS, dl, VT,
21667
12
                         DAG.getVectorShuffle(MVT::v16i16, dl, Lo, Hi, LoMask),
21668
12
                         DAG.getVectorShuffle(MVT::v16i16, dl, Lo, Hi, HiMask));
21669
12
    }
21670
8
21671
8
    SDValue ExA = DAG.getNode(ExAVX, dl, MVT::v16i16, A);
21672
8
    SDValue ExB = DAG.getNode(ExAVX, dl, MVT::v16i16, B);
21673
8
    SDValue Mul = DAG.getNode(ISD::MUL, dl, MVT::v16i16, ExA, ExB);
21674
8
    SDValue MulH = DAG.getNode(ISD::SRL, dl, MVT::v16i16, Mul,
21675
8
                               DAG.getConstant(8, dl, MVT::v16i16));
21676
8
    Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v8i16, MulH, Lo);
21677
8
    Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v8i16, MulH, Hi);
21678
8
    return DAG.getNode(X86ISD::PACKUS, dl, VT, Lo, Hi);
21679
8
  }
21680
20
21681
44
  assert(VT == MVT::v16i8 &&
21682
20
         "Pre-AVX2 support only supports v16i8 multiplication");
21683
20
  MVT ExVT = MVT::v8i16;
21684
20
  unsigned ExSSE41 = (ISD::MULHU == Opcode ? X86ISD::VZEXT : X86ISD::VSEXT);
21685
20
21686
20
  // Extract the lo parts and zero/sign extend to i16.
21687
20
  SDValue ALo, BLo;
21688
20
  if (Subtarget.hasSSE41()) {
21689
16
    ALo = getExtendInVec(ExSSE41, dl, ExVT, A, DAG);
21690
16
    BLo = getExtendInVec(ExSSE41, dl, ExVT, B, DAG);
21691
20
  } else {
21692
4
    const int ShufMask[] = {-1, 0, -1, 1, -1, 2, -1, 3,
21693
4
                            -1, 4, -1, 5, -1, 6, -1, 7};
21694
4
    ALo = DAG.getVectorShuffle(VT, dl, A, A, ShufMask);
21695
4
    BLo = DAG.getVectorShuffle(VT, dl, B, B, ShufMask);
21696
4
    ALo = DAG.getBitcast(ExVT, ALo);
21697
4
    BLo = DAG.getBitcast(ExVT, BLo);
21698
4
    ALo = DAG.getNode(ExShift, dl, ExVT, ALo, DAG.getConstant(8, dl, ExVT));
21699
4
    BLo = DAG.getNode(ExShift, dl, ExVT, BLo, DAG.getConstant(8, dl, ExVT));
21700
4
  }
21701
20
21702
20
  // Extract the hi parts and zero/sign extend to i16.
21703
20
  SDValue AHi, BHi;
21704
20
  if (Subtarget.hasSSE41()) {
21705
16
    const int ShufMask[] = {8,  9,  10, 11, 12, 13, 14, 15,
21706
16
                            -1, -1, -1, -1, -1, -1, -1, -1};
21707
16
    AHi = DAG.getVectorShuffle(VT, dl, A, A, ShufMask);
21708
16
    BHi = DAG.getVectorShuffle(VT, dl, B, B, ShufMask);
21709
16
    AHi = getExtendInVec(ExSSE41, dl, ExVT, AHi, DAG);
21710
16
    BHi = getExtendInVec(ExSSE41, dl, ExVT, BHi, DAG);
21711
20
  } else {
21712
4
    const int ShufMask[] = {-1, 8,  -1, 9,  -1, 10, -1, 11,
21713
4
                            -1, 12, -1, 13, -1, 14, -1, 15};
21714
4
    AHi = DAG.getVectorShuffle(VT, dl, A, A, ShufMask);
21715
4
    BHi = DAG.getVectorShuffle(VT, dl, B, B, ShufMask);
21716
4
    AHi = DAG.getBitcast(ExVT, AHi);
21717
4
    BHi = DAG.getBitcast(ExVT, BHi);
21718
4
    AHi = DAG.getNode(ExShift, dl, ExVT, AHi, DAG.getConstant(8, dl, ExVT));
21719
4
    BHi = DAG.getNode(ExShift, dl, ExVT, BHi, DAG.getConstant(8, dl, ExVT));
21720
4
  }
21721
52
21722
52
  // Multiply, lshr the upper 8bits to the lower 8bits of the lo/hi results and
21723
52
  // pack back to v16i8.
21724
52
  SDValue RLo = DAG.getNode(ISD::MUL, dl, ExVT, ALo, BLo);
21725
52
  SDValue RHi = DAG.getNode(ISD::MUL, dl, ExVT, AHi, BHi);
21726
52
  RLo = DAG.getNode(ISD::SRL, dl, ExVT, RLo, DAG.getConstant(8, dl, ExVT));
21727
52
  RHi = DAG.getNode(ISD::SRL, dl, ExVT, RHi, DAG.getConstant(8, dl, ExVT));
21728
52
  return DAG.getNode(X86ISD::PACKUS, dl, VT, RLo, RHi);
21729
52
}
21730
21731
3
SDValue X86TargetLowering::LowerWin64_i128OP(SDValue Op, SelectionDAG &DAG) const {
21732
3
  assert(Subtarget.isTargetWin64() && "Unexpected target");
21733
3
  EVT VT = Op.getValueType();
21734
3
  assert(VT.isInteger() && VT.getSizeInBits() == 128 &&
21735
3
         "Unexpected return type for lowering");
21736
3
21737
3
  RTLIB::Libcall LC;
21738
3
  bool isSigned;
21739
3
  switch (Op->getOpcode()) {
21740
0
  default: llvm_unreachable("Unexpected request for libcall!");
21741
0
  case ISD::SDIV:      isSigned = true;  LC = RTLIB::SDIV_I128;    break;
21742
0
  case ISD::UDIV:      isSigned = false; LC = RTLIB::UDIV_I128;    break;
21743
3
  case ISD::SREM:      isSigned = true;  LC = RTLIB::SREM_I128;    break;
21744
0
  case ISD::UREM:      isSigned = false; LC = RTLIB::UREM_I128;    break;
21745
0
  case ISD::SDIVREM:   isSigned = true;  LC = RTLIB::SDIVREM_I128; break;
21746
0
  case ISD::UDIVREM:   isSigned = false; LC = RTLIB::UDIVREM_I128; break;
21747
3
  }
21748
3
21749
3
  SDLoc dl(Op);
21750
3
  SDValue InChain = DAG.getEntryNode();
21751
3
21752
3
  TargetLowering::ArgListTy Args;
21753
3
  TargetLowering::ArgListEntry Entry;
21754
9
  for (unsigned i = 0, e = Op->getNumOperands(); i != e; ++i) {
21755
6
    EVT ArgVT = Op->getOperand(i).getValueType();
21756
6
    assert(ArgVT.isInteger() && ArgVT.getSizeInBits() == 128 &&
21757
6
           "Unexpected argument type for lowering");
21758
6
    SDValue StackPtr = DAG.CreateStackTemporary(ArgVT, 16);
21759
6
    Entry.Node = StackPtr;
21760
6
    InChain = DAG.getStore(InChain, dl, Op->getOperand(i), StackPtr,
21761
6
                           MachinePointerInfo(), /* Alignment = */ 16);
21762
6
    Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
21763
6
    Entry.Ty = PointerType::get(ArgTy,0);
21764
6
    Entry.IsSExt = false;
21765
6
    Entry.IsZExt = false;
21766
6
    Args.push_back(Entry);
21767
6
  }
21768
3
21769
3
  SDValue Callee = DAG.getExternalSymbol(getLibcallName(LC),
21770
3
                                         getPointerTy(DAG.getDataLayout()));
21771
3
21772
3
  TargetLowering::CallLoweringInfo CLI(DAG);
21773
3
  CLI.setDebugLoc(dl)
21774
3
      .setChain(InChain)
21775
3
      .setLibCallee(
21776
3
          getLibcallCallingConv(LC),
21777
3
          static_cast<EVT>(MVT::v2i64).getTypeForEVT(*DAG.getContext()), Callee,
21778
3
          std::move(Args))
21779
3
      .setInRegister()
21780
3
      .setSExtResult(isSigned)
21781
3
      .setZExtResult(!isSigned);
21782
3
21783
3
  std::pair<SDValue, SDValue> CallInfo = LowerCallTo(CLI);
21784
3
  return DAG.getBitcast(VT, CallInfo.first);
21785
3
}
21786
21787
static SDValue LowerMUL_LOHI(SDValue Op, const X86Subtarget &Subtarget,
21788
54
                             SelectionDAG &DAG) {
21789
54
  SDValue Op0 = Op.getOperand(0), Op1 = Op.getOperand(1);
21790
54
  MVT VT = Op0.getSimpleValueType();
21791
54
  SDLoc dl(Op);
21792
54
21793
54
  // Decompose 256-bit ops into smaller 128-bit ops.
21794
54
  if (VT.is256BitVector() && !Subtarget.hasInt256()) {
21795
4
    unsigned Opcode = Op.getOpcode();
21796
4
    unsigned NumElems = VT.getVectorNumElements();
21797
4
    MVT HalfVT = MVT::getVectorVT(VT.getScalarType(), NumElems / 2);
21798
4
    SDValue Lo0 = extract128BitVector(Op0, 0, DAG, dl);
21799
4
    SDValue Lo1 = extract128BitVector(Op1, 0, DAG, dl);
21800
4
    SDValue Hi0 = extract128BitVector(Op0, NumElems / 2, DAG, dl);
21801
4
    SDValue Hi1 = extract128BitVector(Op1, NumElems / 2, DAG, dl);
21802
4
    SDValue Lo = DAG.getNode(Opcode, dl, DAG.getVTList(HalfVT, HalfVT), Lo0, Lo1);
21803
4
    SDValue Hi = DAG.getNode(Opcode, dl, DAG.getVTList(HalfVT, HalfVT), Hi0, Hi1);
21804
4
    SDValue Ops[] = {
21805
4
      DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo.getValue(0), Hi.getValue(0)),
21806
4
      DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo.getValue(1), Hi.getValue(1))
21807
4
    };
21808
4
    return DAG.getMergeValues(Ops, dl);
21809
4
  }
21810
50
21811
54
  assert((VT == MVT::v4i32 && Subtarget.hasSSE2()) ||
21812
50
         (VT == MVT::v8i32 && Subtarget.hasInt256()) ||
21813
50
         (VT == MVT::v16i32 && Subtarget.hasAVX512()));
21814
50
21815
50
  int NumElts = VT.getVectorNumElements();
21816
50
21817
50
  // PMULxD operations multiply each even value (starting at 0) of LHS with
21818
50
  // the related value of RHS and produce a widen result.
21819
50
  // E.g., PMULUDQ <4 x i32> <a|b|c|d>, <4 x i32> <e|f|g|h>
21820
50
  // => <2 x i64> <ae|cg>
21821
50
  //
21822
50
  // In other words, to have all the results, we need to perform two PMULxD:
21823
50
  // 1. one with the even values.
21824
50
  // 2. one with the odd values.
21825
50
  // To achieve #2, we need to place the odd values at an even position.
21826
50
  //
21827
50
  // Place the odd value at an even position (basically, shift all values 1
21828
50
  // step to the left):
21829
50
  const int Mask[] = {1, -1, 3, -1, 5, -1, 7, -1, 9, -1, 11, -1, 13, -1, 15, -1};
21830
50
  // <a|b|c|d> => <b|undef|d|undef>
21831
50
  SDValue Odd0 = DAG.getVectorShuffle(VT, dl, Op0, Op0,
21832
50
                                      makeArrayRef(&Mask[0], NumElts));
21833
50
  // <e|f|g|h> => <f|undef|h|undef>
21834
50
  SDValue Odd1 = DAG.getVectorShuffle(VT, dl, Op1, Op1,
21835
50
                                      makeArrayRef(&Mask[0], NumElts));
21836
50
21837
50
  // Emit two multiplies, one for the lower 2 ints and one for the higher 2
21838
50
  // ints.
21839
50
  MVT MulVT = MVT::getVectorVT(MVT::i64, NumElts / 2);
21840
50
  bool IsSigned = Op->getOpcode() == ISD::SMUL_LOHI;
21841
50
  unsigned Opcode =
21842
50
      (!IsSigned || !Subtarget.hasSSE41()) ? X86ISD::PMULUDQ : X86ISD::PMULDQ;
21843
50
  // PMULUDQ <4 x i32> <a|b|c|d>, <4 x i32> <e|f|g|h>
21844
50
  // => <2 x i64> <ae|cg>
21845
50
  SDValue Mul1 = DAG.getBitcast(VT, DAG.getNode(Opcode, dl, MulVT, Op0, Op1));
21846
50
  // PMULUDQ <4 x i32> <b|undef|d|undef>, <4 x i32> <f|undef|h|undef>
21847
50
  // => <2 x i64> <bf|dh>
21848
50
  SDValue Mul2 = DAG.getBitcast(VT, DAG.getNode(Opcode, dl, MulVT, Odd0, Odd1));
21849
50
21850
50
  // Shuffle it back into the right order.
21851
50
  SmallVector<int, 16> HighMask(NumElts);
21852
50
  SmallVector<int, 16> LowMask(NumElts);
21853
378
  for (int i = 0; i != NumElts; ++i) {
21854
328
    HighMask[i] = (i / 2) * 2 + ((i % 2) * NumElts) + 1;
21855
328
    LowMask[i] = (i / 2) * 2 + ((i % 2) * NumElts);
21856
328
  }
21857
50
21858
50
  SDValue Highs = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, HighMask);
21859
50
  SDValue Lows = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, LowMask);
21860
50
21861
50
  // If we have a signed multiply but no PMULDQ fix up the high parts of a
21862
50
  // unsigned multiply.
21863
50
  if (IsSigned && !Subtarget.hasSSE41()) {
21864
3
    SDValue ShAmt = DAG.getConstant(
21865
3
        31, dl,
21866
3
        DAG.getTargetLoweringInfo().getShiftAmountTy(VT, DAG.getDataLayout()));
21867
3
    SDValue T1 = DAG.getNode(ISD::AND, dl, VT,
21868
3
                             DAG.getNode(ISD::SRA, dl, VT, Op0, ShAmt), Op1);
21869
3
    SDValue T2 = DAG.getNode(ISD::AND, dl, VT,
21870
3
                             DAG.getNode(ISD::SRA, dl, VT, Op1, ShAmt), Op0);
21871
3
21872
3
    SDValue Fixup = DAG.getNode(ISD::ADD, dl, VT, T1, T2);
21873
3
    Highs = DAG.getNode(ISD::SUB, dl, VT, Highs, Fixup);
21874
3
  }
21875
54
21876
54
  // The first result of MUL_LOHI is actually the low value, followed by the
21877
54
  // high value.
21878
54
  SDValue Ops[] = {Lows, Highs};
21879
54
  return DAG.getMergeValues(Ops, dl);
21880
54
}
21881
21882
// Return true if the required (according to Opcode) shift-imm form is natively
21883
// supported by the Subtarget
21884
static bool SupportedVectorShiftWithImm(MVT VT, const X86Subtarget &Subtarget,
21885
14.4k
                                        unsigned Opcode) {
21886
14.4k
  if (VT.getScalarSizeInBits() < 16)
21887
2.23k
    return false;
21888
12.1k
21889
12.1k
  if (VT.is512BitVector() && Subtarget.hasAVX512() &&
21890
2.61k
      (VT.getScalarSizeInBits() > 16 || Subtarget.hasBWI()))
21891
2.61k
    return true;
21892
9.55k
21893
9.55k
  bool LShift = (VT.is128BitVector() && Subtarget.hasSSE2()) ||
21894
2.64k
                (VT.is256BitVector() && Subtarget.hasInt256());
21895
9.55k
21896
9.06k
  bool AShift = LShift && (Subtarget.hasAVX512() ||
21897
9.06k
                           (VT != MVT::v2i64 && VT != MVT::v4i64));
21898
9.55k
  return (Opcode == ISD::SRA) ? AShift : LShift;
21899
14.4k
}
21900
21901
// The shift amount is a variable, but it is the same for all vector lanes.
21902
// These instructions are defined together with shift-immediate.
21903
static
21904
bool SupportedVectorShiftWithBaseAmnt(MVT VT, const X86Subtarget &Subtarget,
21905
4.68k
                                      unsigned Opcode) {
21906
4.68k
  return SupportedVectorShiftWithImm(VT, Subtarget, Opcode);
21907
4.68k
}
21908
21909
// Return true if the required (according to Opcode) variable-shift form is
21910
// natively supported by the Subtarget
21911
static bool SupportedVectorVarShift(MVT VT, const X86Subtarget &Subtarget,
21912
3.97k
                                    unsigned Opcode) {
21913
3.97k
21914
3.97k
  if (!Subtarget.hasInt256() || VT.getScalarSizeInBits() < 16)
21915
1.51k
    return false;
21916
2.46k
21917
2.46k
  // vXi16 supported only on AVX-512, BWI
21918
2.46k
  if (VT.getScalarSizeInBits() == 16 && !Subtarget.hasBWI())
21919
180
    return false;
21920
2.28k
21921
2.28k
  if (Subtarget.hasAVX512())
21922
1.29k
    return true;
21923
988
21924
988
  bool LShift = VT.is128BitVector() || VT.is256BitVector();
21925
988
  bool AShift = LShift && VT != MVT::v2i64 && VT != MVT::v4i64;
21926
988
  return (Opcode == ISD::SRA) ? AShift : LShift;
21927
3.97k
}
21928
21929
static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG,
21930
13.7k
                                         const X86Subtarget &Subtarget) {
21931
13.7k
  MVT VT = Op.getSimpleValueType();
21932
13.7k
  SDLoc dl(Op);
21933
13.7k
  SDValue R = Op.getOperand(0);
21934
13.7k
  SDValue Amt = Op.getOperand(1);
21935
13.7k
21936
5.70k
  unsigned X86Opc = (Op.getOpcode() == ISD::SHL) ? X86ISD::VSHLI :
21937
8.03k
    (Op.getOpcode() == ISD::SRL) ? X86ISD::VSRLI : X86ISD::VSRAI;
21938
13.7k
21939
217
  auto ArithmeticShiftRight64 = [&](uint64_t ShiftAmt) {
21940
217
    assert((VT == MVT::v2i64 || VT == MVT::v4i64) && "Unexpected SRA type");
21941
217
    MVT ExVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() * 2);
21942
217
    SDValue Ex = DAG.getBitcast(ExVT, R);
21943
217
21944
217
    // ashr(R, 63) === cmp_slt(R, 0)
21945
217
    if (ShiftAmt == 63 && Subtarget.hasSSE42()) {
21946
26
      assert((VT != MVT::v4i64 || Subtarget.hasInt256()) &&
21947
26
             "Unsupported PCMPGT op");
21948
26
      return DAG.getNode(X86ISD::PCMPGT, dl, VT,
21949
26
                         getZeroVector(VT, Subtarget, DAG, dl), R);
21950
26
    }
21951
191
21952
191
    if (ShiftAmt >= 32) {
21953
133
      // Splat sign to upper i32 dst, and SRA upper i32 src to lower i32.
21954
133
      SDValue Upper =
21955
133
          getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Ex, 31, DAG);
21956
133
      SDValue Lower = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Ex,
21957
133
                                                 ShiftAmt - 32, DAG);
21958
133
      if (VT == MVT::v2i64)
21959
132
        Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower, {5, 1, 7, 3});
21960
133
      if (VT == MVT::v4i64)
21961
1
        Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower,
21962
1
                                  {9, 1, 11, 3, 13, 5, 15, 7});
21963
191
    } else {
21964
58
      // SRA upper i32, SHL whole i64 and select lower i32.
21965
58
      SDValue Upper = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Ex,
21966
58
                                                 ShiftAmt, DAG);
21967
58
      SDValue Lower =
21968
58
          getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, R, ShiftAmt, DAG);
21969
58
      Lower = DAG.getBitcast(ExVT, Lower);
21970
58
      if (VT == MVT::v2i64)
21971
54
        Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower, {4, 1, 6, 3});
21972
58
      if (VT == MVT::v4i64)
21973
4
        Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower,
21974
4
                                  {8, 1, 10, 3, 12, 5, 14, 7});
21975
58
    }
21976
217
    return DAG.getBitcast(VT, Ex);
21977
217
  };
21978
13.7k
21979
13.7k
  // Optimize shl/srl/sra with constant shift amount.
21980
13.7k
  if (auto *BVAmt = dyn_cast<BuildVectorSDNode>(Amt)) {
21981
10.1k
    if (auto *ShiftConst = BVAmt->getConstantSplatNode()) {
21982
9.17k
      uint64_t ShiftAmt = ShiftConst->getZExtValue();
21983
9.17k
21984
9.17k
      if (SupportedVectorShiftWithImm(VT, Subtarget, Op.getOpcode()))
21985
7.20k
        return getTargetVShiftByConstNode(X86Opc, dl, VT, R, ShiftAmt, DAG);
21986
1.96k
21987
1.96k
      // i64 SRA needs to be performed as partial shifts.
21988
1.96k
      if (((!Subtarget.hasXOP() && VT == MVT::v2i64) ||
21989
1.78k
           (Subtarget.hasInt256() && VT == MVT::v4i64)) &&
21990
198
          Op.getOpcode() == ISD::SRA)
21991
198
        return ArithmeticShiftRight64(ShiftAmt);
21992
1.76k
21993
1.76k
      if (VT == MVT::v16i8 ||
21994
774
          (Subtarget.hasInt256() && VT == MVT::v32i8) ||
21995
1.76k
          VT == MVT::v64i8) {
21996
1.54k
        unsigned NumElts = VT.getVectorNumElements();
21997
1.54k
        MVT ShiftVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
21998
1.54k
21999
1.54k
        // Simple i8 add case
22000
1.54k
        if (Op.getOpcode() == ISD::SHL && ShiftAmt == 1)
22001
119
          return DAG.getNode(ISD::ADD, dl, VT, R, R);
22002
1.43k
22003
1.43k
        // ashr(R, 7)  === cmp_slt(R, 0)
22004
1.43k
        if (Op.getOpcode() == ISD::SRA && ShiftAmt == 7) {
22005
39
          SDValue Zeros = getZeroVector(VT, Subtarget, DAG, dl);
22006
39
          if (VT.is512BitVector()) {
22007
1
            assert(VT == MVT::v64i8 && "Unexpected element type!");
22008
1
            SDValue CMP = DAG.getNode(X86ISD::PCMPGTM, dl, MVT::v64i1, Zeros, R);
22009
1
            return DAG.getNode(ISD::SIGN_EXTEND, dl, VT, CMP);
22010
1
          }
22011
38
          return DAG.getNode(X86ISD::PCMPGT, dl, VT, Zeros, R);
22012
38
        }
22013
1.39k
22014
1.39k
        // XOP can shift v16i8 directly instead of as shift v8i16 + mask.
22015
1.39k
        if (VT == MVT::v16i8 && Subtarget.hasXOP())
22016
13
          return SDValue();
22017
1.37k
22018
1.37k
        if (Op.getOpcode() == ISD::SHL) {
22019
322
          // Make a large shift.
22020
322
          SDValue SHL = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, ShiftVT,
22021
322
                                                   R, ShiftAmt, DAG);
22022
322
          SHL = DAG.getBitcast(VT, SHL);
22023
322
          // Zero out the rightmost bits.
22024
322
          return DAG.getNode(ISD::AND, dl, VT, SHL,
22025
322
                             DAG.getConstant(uint8_t(-1U << ShiftAmt), dl, VT));
22026
322
        }
22027
1.05k
        if (Op.getOpcode() == ISD::SRL) {
22028
1.00k
          // Make a large shift.
22029
1.00k
          SDValue SRL = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ShiftVT,
22030
1.00k
                                                   R, ShiftAmt, DAG);
22031
1.00k
          SRL = DAG.getBitcast(VT, SRL);
22032
1.00k
          // Zero out the leftmost bits.
22033
1.00k
          return DAG.getNode(ISD::AND, dl, VT, SRL,
22034
1.00k
                             DAG.getConstant(uint8_t(-1U) >> ShiftAmt, dl, VT));
22035
1.00k
        }
22036
55
        if (Op.getOpcode() == ISD::SRA) {
22037
55
          // ashr(R, Amt) === sub(xor(lshr(R, Amt), Mask), Mask)
22038
55
          SDValue Res = DAG.getNode(ISD::SRL, dl, VT, R, Amt);
22039
55
22040
55
          SDValue Mask = DAG.getConstant(128 >> ShiftAmt, dl, VT);
22041
55
          Res = DAG.getNode(ISD::XOR, dl, VT, Res, Mask);
22042
55
          Res = DAG.getNode(ISD::SUB, dl, VT, Res, Mask);
22043
55
          return Res;
22044
55
        }
22045
0
        llvm_unreachable("Unknown shift opcode.");
22046
0
      }
22047
9.17k
    }
22048
10.1k
  }
22049
4.78k
22050
4.78k
  // Special case in 32-bit mode, where i64 is expanded into high and low parts.
22051
4.78k
  // TODO: Replace constant extraction with getTargetConstantBitsFromNode.
22052
4.78k
  if (!Subtarget.is64Bit() && !Subtarget.hasXOP() &&
22053
682
      (VT == MVT::v2i64 || (Subtarget.hasInt256() && VT == MVT::v4i64) ||
22054
4.78k
       (Subtarget.hasAVX512() && VT == MVT::v8i64))) {
22055
309
22056
309
    // AVX1 targets maybe extracting a 128-bit vector from a 256-bit constant.
22057
309
    unsigned SubVectorScale = 1;
22058
309
    if (Amt.getOpcode() == ISD::EXTRACT_SUBVECTOR) {
22059
40
      SubVectorScale =
22060
40
          Amt.getOperand(0).getValueSizeInBits() / Amt.getValueSizeInBits();
22061
40
      Amt = Amt.getOperand(0);
22062
40
    }
22063
309
22064
309
    // Peek through any splat that was introduced for i64 shift vectorization.
22065
309
    int SplatIndex = -1;
22066
309
    if (ShuffleVectorSDNode *SVN = dyn_cast<ShuffleVectorSDNode>(Amt.getNode()))
22067
89
      if (SVN->isSplat()) {
22068
89
        SplatIndex = SVN->getSplatIndex();
22069
89
        Amt = Amt.getOperand(0);
22070
89
        assert(SplatIndex < (int)VT.getVectorNumElements() &&
22071
89
               "Splat shuffle referencing second operand");
22072
89
      }
22073
309
22074
309
    if (Amt.getOpcode() != ISD::BITCAST ||
22075
158
        Amt.getOperand(0).getOpcode() != ISD::BUILD_VECTOR)
22076
157
      return SDValue();
22077
152
22078
152
    Amt = Amt.getOperand(0);
22079
152
    unsigned Ratio = Amt.getSimpleValueType().getVectorNumElements() /
22080
152
                     (SubVectorScale * VT.getVectorNumElements());
22081
152
    unsigned RatioInLog2 = Log2_32_Ceil(Ratio);
22082
152
    uint64_t ShiftAmt = 0;
22083
152
    unsigned BaseOp = (SplatIndex < 0 ? 0 : SplatIndex * Ratio);
22084
440
    for (unsigned i = 0; i != Ratio; ++i) {
22085
296
      ConstantSDNode *C = dyn_cast<ConstantSDNode>(Amt.getOperand(i + BaseOp));
22086
296
      if (!C)
22087
8
        return SDValue();
22088
288
      // 6 == Log2(64)
22089
288
      ShiftAmt |= C->getZExtValue() << (i * (1 << (6 - RatioInLog2)));
22090
288
    }
22091
152
22092
152
    // Check remaining shift amounts (if not a splat).
22093
144
    if (SplatIndex < 0) {
22094
361
      for (unsigned i = Ratio; i != Amt.getNumOperands(); i += Ratio) {
22095
270
        uint64_t ShAmt = 0;
22096
810
        for (unsigned j = 0; j != Ratio; ++j) {
22097
540
          ConstantSDNode *C = dyn_cast<ConstantSDNode>(Amt.getOperand(i + j));
22098
540
          if (!C)
22099
0
            return SDValue();
22100
540
          // 6 == Log2(64)
22101
540
          ShAmt |= C->getZExtValue() << (j * (1 << (6 - RatioInLog2)));
22102
540
        }
22103
270
        if (ShAmt != ShiftAmt)
22104
31
          return SDValue();
22105
270
      }
22106
122
    }
22107
144
22108
113
    if (SupportedVectorShiftWithImm(VT, Subtarget, Op.getOpcode()))
22109
94
      return getTargetVShiftByConstNode(X86Opc, dl, VT, R, ShiftAmt, DAG);
22110
19
22111
19
    if (Op.getOpcode() == ISD::SRA)
22112
19
      return ArithmeticShiftRight64(ShiftAmt);
22113
4.47k
  }
22114
4.47k
22115
4.47k
  return SDValue();
22116
4.47k
}
22117
22118
static SDValue LowerScalarVariableShift(SDValue Op, SelectionDAG &DAG,
22119
4.68k
                                        const X86Subtarget &Subtarget) {
22120
4.68k
  MVT VT = Op.getSimpleValueType();
22121
4.68k
  SDLoc dl(Op);
22122
4.68k
  SDValue R = Op.getOperand(0);
22123
4.68k
  SDValue Amt = Op.getOperand(1);
22124
4.68k
22125
1.68k
  unsigned X86OpcI = (Op.getOpcode() == ISD::SHL) ? X86ISD::VSHLI :
22126
2.99k
    (Op.getOpcode() == ISD::SRL) ? X86ISD::VSRLI : X86ISD::VSRAI;
22127
4.68k
22128
1.68k
  unsigned X86OpcV = (Op.getOpcode() == ISD::SHL) ? X86ISD::VSHL :
22129
2.99k
    (Op.getOpcode() == ISD::SRL) ? X86ISD::VSRL : X86ISD::VSRA;
22130
4.68k
22131
4.68k
  if (SupportedVectorShiftWithBaseAmnt(VT, Subtarget, Op.getOpcode())) {
22132
3.86k
    SDValue BaseShAmt;
22133
3.86k
    MVT EltVT = VT.getVectorElementType();
22134
3.86k
22135
3.86k
    if (BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(Amt)) {
22136
800
      // Check if this build_vector node is doing a splat.
22137
800
      // If so, then set BaseShAmt equal to the splat value.
22138
800
      BaseShAmt = BV->getSplatValue();
22139
800
      if (BaseShAmt && BaseShAmt.isUndef())
22140
0
        BaseShAmt = SDValue();
22141
3.86k
    } else {
22142
3.06k
      if (Amt.getOpcode() == ISD::EXTRACT_SUBVECTOR)
22143
156
        Amt = Amt.getOperand(0);
22144
3.06k
22145
3.06k
      ShuffleVectorSDNode *SVN = dyn_cast<ShuffleVectorSDNode>(Amt);
22146
3.06k
      if (SVN && SVN->isSplat()) {
22147
671
        unsigned SplatIdx = (unsigned)SVN->getSplatIndex();
22148
671
        SDValue InVec = Amt.getOperand(0);
22149
671
        if (InVec.getOpcode() == ISD::BUILD_VECTOR) {
22150
0
          assert((SplatIdx < InVec.getSimpleValueType().getVectorNumElements()) &&
22151
0
                 "Unexpected shuffle index found!");
22152
0
          BaseShAmt = InVec.getOperand(SplatIdx);
22153
671
        } else if (InVec.getOpcode() == ISD::INSERT_VECTOR_ELT) {
22154
0
           if (ConstantSDNode *C =
22155
0
               dyn_cast<ConstantSDNode>(InVec.getOperand(2))) {
22156
0
             if (C->getZExtValue() == SplatIdx)
22157
0
               BaseShAmt = InVec.getOperand(1);
22158
0
           }
22159
671
        }
22160
671
22161
671
        if (!BaseShAmt)
22162
671
          // Avoid introducing an extract element from a shuffle.
22163
671
          BaseShAmt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, InVec,
22164
671
                                  DAG.getIntPtrConstant(SplatIdx, dl));
22165
671
      }
22166
3.06k
    }
22167
3.86k
22168
3.86k
    if (BaseShAmt.getNode()) {
22169
704
      assert(EltVT.bitsLE(MVT::i64) && "Unexpected element type!");
22170
704
      if (EltVT != MVT::i64 && EltVT.bitsGT(MVT::i32))
22171
0
        BaseShAmt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, BaseShAmt);
22172
704
      else if (EltVT.bitsLT(MVT::i32))
22173
97
        BaseShAmt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, BaseShAmt);
22174
704
22175
704
      return getTargetVShiftNode(X86OpcI, dl, VT, R, BaseShAmt, Subtarget, DAG);
22176
704
    }
22177
3.97k
  }
22178
3.97k
22179
3.97k
  // Special case in 32-bit mode, where i64 is expanded into high and low parts.
22180
3.97k
  if (!Subtarget.is64Bit() && VT == MVT::v2i64 &&
22181
73
      Amt.getOpcode() == ISD::BITCAST &&
22182
3.97k
      Amt.getOperand(0).getOpcode() == ISD::BUILD_VECTOR) {
22183
26
    Amt = Amt.getOperand(0);
22184
26
    unsigned Ratio = Amt.getSimpleValueType().getVectorNumElements() /
22185
26
                     VT.getVectorNumElements();
22186
26
    std::vector<SDValue> Vals(Ratio);
22187
98
    for (unsigned i = 0; i != Ratio; ++i)
22188
72
      Vals[i] = Amt.getOperand(i);
22189
32
    for (unsigned i = Ratio; i != Amt.getNumOperands(); i += Ratio) {
22190
42
      for (unsigned j = 0; j != Ratio; ++j)
22191
36
        if (Vals[j] != Amt.getOperand(i + j))
22192
20
          return SDValue();
22193
26
    }
22194
26
22195
6
    if (SupportedVectorShiftWithBaseAmnt(VT, Subtarget, Op.getOpcode()))
22196
6
      return DAG.getNode(X86OpcV, dl, VT, R, Op.getOperand(1));
22197
3.95k
  }
22198
3.95k
  return SDValue();
22199
3.95k
}
22200
22201
static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget,
22202
13.7k
                          SelectionDAG &DAG) {
22203
13.7k
  MVT VT = Op.getSimpleValueType();
22204
13.7k
  SDLoc dl(Op);
22205
13.7k
  SDValue R = Op.getOperand(0);
22206
13.7k
  SDValue Amt = Op.getOperand(1);
22207
13.7k
  bool ConstantAmt = ISD::isBuildVectorOfConstantSDNodes(Amt.getNode());
22208
13.7k
22209
13.7k
  assert(VT.isVector() && "Custom lowering only for vector shifts!");
22210
13.7k
  assert(Subtarget.hasSSE2() && "Only custom lower when we have SSE2!");
22211
13.7k
22212
13.7k
  if (SDValue V = LowerScalarImmediateShift(Op, DAG, Subtarget))
22213
9.05k
    return V;
22214
4.68k
22215
4.68k
  if (SDValue V = LowerScalarVariableShift(Op, DAG, Subtarget))
22216
710
    return V;
22217
3.97k
22218
3.97k
  if (SupportedVectorVarShift(VT, Subtarget, Op.getOpcode()))
22219
2.26k
    return Op;
22220
1.70k
22221
1.70k
  // XOP has 128-bit variable logical/arithmetic shifts.
22222
1.70k
  // +ve/-ve Amt = shift left/right.
22223
1.70k
  if (Subtarget.hasXOP() &&
22224
218
      (VT == MVT::v2i64 || VT == MVT::v4i32 ||
22225
1.70k
       VT == MVT::v8i16 || VT == MVT::v16i8)) {
22226
151
    if (Op.getOpcode() == ISD::SRL || Op.getOpcode() == ISD::SRA) {
22227
102
      SDValue Zero = getZeroVector(VT, Subtarget, DAG, dl);
22228
102
      Amt = DAG.getNode(ISD::SUB, dl, VT, Zero, Amt);
22229
102
    }
22230
151
    if (Op.getOpcode() == ISD::SHL || Op.getOpcode() == ISD::SRL)
22231
95
      return DAG.getNode(X86ISD::VPSHL, dl, VT, R, Amt);
22232
56
    if (Op.getOpcode() == ISD::SRA)
22233
56
      return DAG.getNode(X86ISD::VPSHA, dl, VT, R, Amt);
22234
1.55k
  }
22235
1.55k
22236
1.55k
  // 2i64 vector logical shifts can efficiently avoid scalarization - do the
22237
1.55k
  // shifts per-lane and then shuffle the partial results back together.
22238
1.55k
  if (VT == MVT::v2i64 && Op.getOpcode() != ISD::SRA) {
22239
255
    // Splat the shift amounts so the scalar shifts above will catch it.
22240
255
    SDValue Amt0 = DAG.getVectorShuffle(VT, dl, Amt, Amt, {0, 0});
22241
255
    SDValue Amt1 = DAG.getVectorShuffle(VT, dl, Amt, Amt, {1, 1});
22242
255
    SDValue R0 = DAG.getNode(Op->getOpcode(), dl, VT, R, Amt0);
22243
255
    SDValue R1 = DAG.getNode(Op->getOpcode(), dl, VT, R, Amt1);
22244
255
    return DAG.getVectorShuffle(VT, dl, R0, R1, {0, 3});
22245
255
  }
22246
1.30k
22247
1.30k
  // i64 vector arithmetic shift can be emulated with the transform:
22248
1.30k
  // M = lshr(SIGN_MASK, Amt)
22249
1.30k
  // ashr(R, Amt) === sub(xor(lshr(R, Amt), M), M)
22250
1.30k
  if ((VT == MVT::v2i64 || (VT == MVT::v4i64 && Subtarget.hasInt256())) &&
22251
1.30k
      Op.getOpcode() == ISD::SRA) {
22252
75
    SDValue S = DAG.getConstant(APInt::getSignMask(64), dl, VT);
22253
75
    SDValue M = DAG.getNode(ISD::SRL, dl, VT, S, Amt);
22254
75
    R = DAG.getNode(ISD::SRL, dl, VT, R, Amt);
22255
75
    R = DAG.getNode(ISD::XOR, dl, VT, R, M);
22256
75
    R = DAG.getNode(ISD::SUB, dl, VT, R, M);
22257
75
    return R;
22258
75
  }
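Illustrative aside (not part of the reported source): the comment above states the identity used to emulate a 64-bit arithmetic shift with logical shifts, M = lshr(SIGN_MASK, Amt) and ashr(R, Amt) == sub(xor(lshr(R, Amt), M), M). A minimal scalar C++ sketch of that identity, with hypothetical names:

#include <cassert>
#include <cstdint>

// Emulate an arithmetic shift right using only logical shifts, per the
// transform in the comment above.
static int64_t AshrViaLshr(int64_t R, unsigned Amt) {
  uint64_t M = UINT64_C(0x8000000000000000) >> Amt; // lshr(SIGN_MASK, Amt)
  uint64_t L = static_cast<uint64_t>(R) >> Amt;     // lshr(R, Amt)
  return static_cast<int64_t>((L ^ M) - M);         // sub(xor(L, M), M)
}

int main() {
  // Reference uses the compiler's signed >>, which is arithmetic on the
  // x86 compilers this file targets (implementation-defined pre-C++20).
  for (unsigned Amt = 0; Amt != 64; ++Amt) {
    assert(AshrViaLshr(-12345LL, Amt) == (-12345LL >> Amt));
    assert(AshrViaLshr(12345LL, Amt) == (12345LL >> Amt));
  }
}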
22259
1.22k
22260
1.22k
  // If possible, lower this packed shift into a vector multiply instead of
22261
1.22k
  // expanding it into a sequence of scalar shifts.
22262
1.22k
  // Do this only if the vector shift count is a constant build_vector.
22263
1.22k
  if (ConstantAmt && Op.getOpcode() == ISD::SHL &&
22264
291
      (VT == MVT::v8i16 || VT == MVT::v4i32 ||
22265
1.22k
       (Subtarget.hasInt256() && VT == MVT::v16i16))) {
22266
193
    SmallVector<SDValue, 8> Elts;
22267
193
    MVT SVT = VT.getVectorElementType();
22268
193
    unsigned SVTBits = SVT.getSizeInBits();
22269
193
    APInt One(SVTBits, 1);
22270
193
    unsigned NumElems = VT.getVectorNumElements();
22271
193
22272
1.70k
    for (unsigned i=0; i !=NumElems; ++i) {
22273
1.50k
      SDValue Op = Amt->getOperand(i);
22274
1.50k
      if (Op->isUndef()) {
22275
9
        Elts.push_back(Op);
22276
9
        continue;
22277
9
      }
22278
1.49k
22279
1.49k
      ConstantSDNode *ND = cast<ConstantSDNode>(Op);
22280
1.49k
      APInt C(SVTBits, ND->getAPIntValue().getZExtValue());
22281
1.49k
      uint64_t ShAmt = C.getZExtValue();
22282
1.49k
      if (ShAmt >= SVTBits) {
22283
5
        Elts.push_back(DAG.getUNDEF(SVT));
22284
5
        continue;
22285
5
      }
22286
1.49k
      Elts.push_back(DAG.getConstant(One.shl(ShAmt), dl, SVT));
22287
1.49k
    }
22288
193
    SDValue BV = DAG.getBuildVector(VT, dl, Elts);
22289
193
    return DAG.getNode(ISD::MUL, dl, VT, R, BV);
22290
193
  }
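Illustrative aside (not part of the reported source): the block above turns a constant-amount vector SHL into a single vector multiply, because shifting each element left by C is the same as multiplying it by 2^C. A tiny scalar check of that equivalence:

#include <cassert>
#include <cstdint>

int main() {
  // Per element, shl x, C == mul x, (1 << C); the lowering builds the
  // powers of two as a constant build_vector and emits ISD::MUL.
  uint32_t X[4] = {1u, 7u, 0x80000000u, 12345u};
  unsigned C[4] = {0, 3, 1, 31};
  for (int i = 0; i != 4; ++i)
    assert((X[i] << C[i]) == X[i] * (UINT32_C(1) << C[i]));
}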
22291
1.03k
22292
1.03k
  // Lower SHL with variable shift amount.
22293
1.03k
  if (VT == MVT::v4i32 && Op->getOpcode() == ISD::SHL) {
22294
44
    Op = DAG.getNode(ISD::SHL, dl, VT, Amt, DAG.getConstant(23, dl, VT));
22295
44
22296
44
    Op = DAG.getNode(ISD::ADD, dl, VT, Op,
22297
44
                     DAG.getConstant(0x3f800000U, dl, VT));
22298
44
    Op = DAG.getBitcast(MVT::v4f32, Op);
22299
44
    Op = DAG.getNode(ISD::FP_TO_SINT, dl, VT, Op);
22300
44
    return DAG.getNode(ISD::MUL, dl, VT, Op, R);
22301
44
  }
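Illustrative aside (not part of the reported source): the v4i32 variable-SHL path above builds 2^Amt per lane by forming an IEEE-754 single whose exponent field is Amt + 127 ((Amt << 23) + 0x3f800000), bitcasting, converting back to integer, and multiplying. A scalar C++ sketch of that trick, with hypothetical names:

#include <cassert>
#include <cstdint>
#include <cstring>

// Build 2^Amt (0 <= Amt < 32) via the float-exponent trick used above.
static uint32_t PowerOfTwoViaFloat(uint32_t Amt) {
  uint32_t Bits = (Amt << 23) + 0x3f800000u; // exponent field = Amt + 127
  float F;
  std::memcpy(&F, &Bits, sizeof(F));         // the "bitcast to v4f32" step
  return static_cast<uint32_t>(F);           // the "fp_to_sint" step
}

int main() {
  for (uint32_t Amt = 0; Amt != 32; ++Amt)
    assert(PowerOfTwoViaFloat(Amt) == (UINT32_C(1) << Amt));
}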
22302
991
22303
991
  // If possible, lower this shift as a sequence of two shifts by
22304
991
  // constant plus a MOVSS/MOVSD/PBLEND instead of scalarizing it.
22305
991
  // Example:
22306
991
  //   (v4i32 (srl A, (build_vector < X, Y, Y, Y>)))
22307
991
  //
22308
991
  // Could be rewritten as:
22309
991
  //   (v4i32 (MOVSS (srl A, <Y,Y,Y,Y>), (srl A, <X,X,X,X>)))
22310
991
  //
22311
991
  // The advantage is that the two shifts from the example would be
22312
991
  // lowered as X86ISD::VSRLI nodes. This would be cheaper than scalarizing
22313
991
  // the vector shift into four scalar shifts plus four pairs of vector
22314
991
  // insert/extract.
22315
991
  if (ConstantAmt && (VT == MVT::v8i16 || VT == MVT::v4i32)) {
22316
92
    unsigned TargetOpcode = X86ISD::MOVSS;
22317
92
    bool CanBeSimplified;
22318
92
    // The splat value for the first packed shift (the 'X' from the example).
22319
92
    SDValue Amt1 = Amt->getOperand(0);
22320
92
    // The splat value for the second packed shift (the 'Y' from the example).
22321
92
    SDValue Amt2 = (VT == MVT::v4i32) ? Amt->getOperand(1) : Amt->getOperand(2);
22322
92
22323
92
    // See if it is possible to replace this node with a sequence of
22324
92
    // two shifts followed by a MOVSS/MOVSD/PBLEND.
22325
92
    if (VT == MVT::v4i32) {
22326
49
      // Check if it is legal to use a MOVSS.
22327
49
      CanBeSimplified = Amt2 == Amt->getOperand(2) &&
22328
6
                        Amt2 == Amt->getOperand(3);
22329
49
      if (!CanBeSimplified) {
22330
45
        // Otherwise, check if we can still simplify this node using a MOVSD.
22331
45
        CanBeSimplified = Amt1 == Amt->getOperand(1) &&
22332
6
                          Amt->getOperand(2) == Amt->getOperand(3);
22333
45
        TargetOpcode = X86ISD::MOVSD;
22334
45
        Amt2 = Amt->getOperand(2);
22335
45
      }
22336
92
    } else {
22337
43
      // Do similar checks for the case where the machine value type
22338
43
      // is MVT::v8i16.
22339
43
      CanBeSimplified = Amt1 == Amt->getOperand(1);
22340
85
      for (unsigned i=3; i != 8 && CanBeSimplified; ++i)
22341
42
        CanBeSimplified = Amt2 == Amt->getOperand(i);
22342
43
22343
43
      if (!CanBeSimplified) {
22344
37
        TargetOpcode = X86ISD::MOVSD;
22345
37
        CanBeSimplified = true;
22346
37
        Amt2 = Amt->getOperand(4);
22347
123
        for (unsigned i=0; i != 4 && CanBeSimplified; ++i)
22348
86
          CanBeSimplified = Amt1 == Amt->getOperand(i);
22349
61
        for (unsigned j=4; j != 8 && CanBeSimplified; ++j)
22350
24
          CanBeSimplified = Amt2 == Amt->getOperand(j);
22351
37
      }
22352
43
    }
22353
92
22354
92
    if (CanBeSimplified && isa<ConstantSDNode>(Amt1) &&
22355
92
        isa<ConstantSDNode>(Amt2)) {
22356
20
      // Replace this node with two shifts followed by a MOVSS/MOVSD/PBLEND.
22357
20
      MVT CastVT = MVT::v4i32;
22358
20
      SDValue Splat1 =
22359
20
          DAG.getConstant(cast<ConstantSDNode>(Amt1)->getAPIntValue(), dl, VT);
22360
20
      SDValue Shift1 = DAG.getNode(Op->getOpcode(), dl, VT, R, Splat1);
22361
20
      SDValue Splat2 =
22362
20
          DAG.getConstant(cast<ConstantSDNode>(Amt2)->getAPIntValue(), dl, VT);
22363
20
      SDValue Shift2 = DAG.getNode(Op->getOpcode(), dl, VT, R, Splat2);
22364
20
      SDValue BitCast1 = DAG.getBitcast(CastVT, Shift1);
22365
20
      SDValue BitCast2 = DAG.getBitcast(CastVT, Shift2);
22366
20
      if (TargetOpcode == X86ISD::MOVSD)
22367
10
        return DAG.getBitcast(VT, DAG.getVectorShuffle(CastVT, dl, BitCast1,
22368
10
                                                       BitCast2, {0, 1, 6, 7}));
22369
10
      return DAG.getBitcast(VT, DAG.getVectorShuffle(CastVT, dl, BitCast1,
22370
10
                                                     BitCast2, {0, 5, 6, 7}));
22371
10
    }
22372
92
  }
22373
971
22374
971
  // v4i32 Non Uniform Shifts.
22375
971
  // If the shift amount is constant we can shift each lane using the SSE2
22376
971
  // immediate shifts, else we need to zero-extend each lane to the lower i64
22377
971
  // and shift using the SSE2 variable shifts.
22378
971
  // The separate results can then be blended together.
22379
971
  if (VT == MVT::v4i32) {
22380
113
    unsigned Opc = Op.getOpcode();
22381
113
    SDValue Amt0, Amt1, Amt2, Amt3;
22382
113
    if (ConstantAmt) {
22383
41
      Amt0 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {0, 0, 0, 0});
22384
41
      Amt1 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {1, 1, 1, 1});
22385
41
      Amt2 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {2, 2, 2, 2});
22386
41
      Amt3 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {3, 3, 3, 3});
22387
113
    } else {
22388
72
      // ISD::SHL is handled above but we include it here for completeness.
22389
72
      switch (Opc) {
22390
0
      default:
22391
0
        llvm_unreachable("Unknown target vector shift node");
22392
0
      case ISD::SHL:
22393
0
        Opc = X86ISD::VSHL;
22394
0
        break;
22395
43
      case ISD::SRL:
22396
43
        Opc = X86ISD::VSRL;
22397
43
        break;
22398
29
      case ISD::SRA:
22399
29
        Opc = X86ISD::VSRA;
22400
29
        break;
22401
72
      }
22402
72
      // The SSE2 shifts use the lower i64 as the same shift amount for
22403
72
      // all lanes and the upper i64 is ignored. These shuffle masks
22404
72
      // optimally zero-extend each lanes on SSE2/SSE41/AVX targets.
22405
72
      SDValue Z = getZeroVector(VT, Subtarget, DAG, dl);
22406
72
      Amt0 = DAG.getVectorShuffle(VT, dl, Amt, Z, {0, 4, -1, -1});
22407
72
      Amt1 = DAG.getVectorShuffle(VT, dl, Amt, Z, {1, 5, -1, -1});
22408
72
      Amt2 = DAG.getVectorShuffle(VT, dl, Amt, Z, {2, 6, -1, -1});
22409
72
      Amt3 = DAG.getVectorShuffle(VT, dl, Amt, Z, {3, 7, -1, -1});
22410
72
    }
22411
113
22412
113
    SDValue R0 = DAG.getNode(Opc, dl, VT, R, Amt0);
22413
113
    SDValue R1 = DAG.getNode(Opc, dl, VT, R, Amt1);
22414
113
    SDValue R2 = DAG.getNode(Opc, dl, VT, R, Amt2);
22415
113
    SDValue R3 = DAG.getNode(Opc, dl, VT, R, Amt3);
22416
113
    SDValue R02 = DAG.getVectorShuffle(VT, dl, R0, R2, {0, -1, 6, -1});
22417
113
    SDValue R13 = DAG.getVectorShuffle(VT, dl, R1, R3, {-1, 1, -1, 7});
22418
113
    return DAG.getVectorShuffle(VT, dl, R02, R13, {0, 5, 2, 7});
22419
858
  }
22420
858
22421
858
  // It's worth extending once and using the vXi16/vXi32 shifts for smaller
22422
858
  // types, but without AVX512 the extra overheads to get from vXi8 to vXi32
22423
858
  // make the existing SSE solution better.
22424
858
  if ((Subtarget.hasInt256() && VT == MVT::v8i16) ||
22425
817
      (Subtarget.hasAVX512() && VT == MVT::v16i16) ||
22426
779
      (Subtarget.hasAVX512() && VT == MVT::v16i8) ||
22427
858
      (Subtarget.hasBWI() && VT == MVT::v32i8)) {
22428
149
    MVT EvtSVT = (VT == MVT::v32i8 ? MVT::i16 : MVT::i32);
22429
149
    MVT ExtVT = MVT::getVectorVT(EvtSVT, VT.getVectorNumElements());
22430
149
    unsigned ExtOpc =
22431
149
        Op.getOpcode() == ISD::SRA ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
22432
149
    R = DAG.getNode(ExtOpc, dl, ExtVT, R);
22433
149
    Amt = DAG.getNode(ISD::ANY_EXTEND, dl, ExtVT, Amt);
22434
149
    return DAG.getNode(ISD::TRUNCATE, dl, VT,
22435
149
                       DAG.getNode(Op.getOpcode(), dl, ExtVT, R, Amt));
22436
149
  }
22437
709
22438
709
  if (VT == MVT::v16i8 ||
22439
582
      (VT == MVT::v32i8 && Subtarget.hasInt256() && !Subtarget.hasXOP()) ||
22440
709
      (VT == MVT::v64i8 && Subtarget.hasBWI())) {
22441
224
    MVT ExtVT = MVT::getVectorVT(MVT::i16, VT.getVectorNumElements() / 2);
22442
224
    unsigned ShiftOpcode = Op->getOpcode();
22443
224
22444
831
    auto SignBitSelect = [&](MVT SelVT, SDValue Sel, SDValue V0, SDValue V1) {
22445
831
      if (VT.is512BitVector()) {
22446
60
        // On AVX512BW targets we make use of the fact that VSELECT lowers
22447
60
        // to a masked blend which selects bytes based just on the sign bit
22448
60
        // extracted to a mask.
22449
60
        MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
22450
60
        V0 = DAG.getBitcast(VT, V0);
22451
60
        V1 = DAG.getBitcast(VT, V1);
22452
60
        Sel = DAG.getBitcast(VT, Sel);
22453
60
        Sel = DAG.getNode(X86ISD::CVT2MASK, dl, MaskVT, Sel);
22454
60
        return DAG.getBitcast(SelVT, DAG.getSelect(dl, VT, Sel, V0, V1));
22455
771
      } else if (Subtarget.hasSSE41()) {
22456
630
        // On SSE41 targets we make use of the fact that VSELECT lowers
22457
630
        // to PBLENDVB which selects bytes based just on the sign bit.
22458
630
        V0 = DAG.getBitcast(VT, V0);
22459
630
        V1 = DAG.getBitcast(VT, V1);
22460
630
        Sel = DAG.getBitcast(VT, Sel);
22461
630
        return DAG.getBitcast(SelVT, DAG.getSelect(dl, VT, Sel, V0, V1));
22462
630
      }
22463
141
      // On pre-SSE41 targets we test for the sign bit by comparing to
22464
141
      // zero - a negative value will set all bits of the lanes to true
22465
141
      // and VSELECT uses that in its OR(AND(V0,C),AND(V1,~C)) lowering.
22466
141
      SDValue Z = getZeroVector(SelVT, Subtarget, DAG, dl);
22467
141
      SDValue C = DAG.getNode(X86ISD::PCMPGT, dl, SelVT, Z, Sel);
22468
141
      return DAG.getSelect(dl, SelVT, C, V0, V1);
22469
141
    };
22470
224
22471
224
    // Turn 'a' into a mask suitable for VSELECT: a = a << 5;
22472
224
    // We can safely do this using i16 shifts as we're only interested in
22473
224
    // the 3 lower bits of each byte.
22474
224
    Amt = DAG.getBitcast(ExtVT, Amt);
22475
224
    Amt = DAG.getNode(ISD::SHL, dl, ExtVT, Amt, DAG.getConstant(5, dl, ExtVT));
22476
224
    Amt = DAG.getBitcast(VT, Amt);
22477
224
22478
224
    if (Op->getOpcode() == ISD::SHL || Op->getOpcode() == ISD::SRL) {
22479
171
      // r = VSELECT(r, shift(r, 4), a);
22480
171
      SDValue M =
22481
171
          DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(4, dl, VT));
22482
171
      R = SignBitSelect(VT, Amt, M, R);
22483
171
22484
171
      // a += a
22485
171
      Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
22486
171
22487
171
      // r = VSELECT(r, shift(r, 2), a);
22488
171
      M = DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(2, dl, VT));
22489
171
      R = SignBitSelect(VT, Amt, M, R);
22490
171
22491
171
      // a += a
22492
171
      Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
22493
171
22494
171
      // return VSELECT(r, shift(r, 1), a);
22495
171
      M = DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(1, dl, VT));
22496
171
      R = SignBitSelect(VT, Amt, M, R);
22497
171
      return R;
22498
171
    }
22499
53
22500
53
    if (Op->getOpcode() == ISD::SRA) {
22501
53
      // For SRA we need to unpack each byte to the higher byte of a i16 vector
22502
53
      // so we can correctly sign extend. We don't care what happens to the
22503
53
      // lower byte.
22504
53
      SDValue ALo = DAG.getNode(X86ISD::UNPCKL, dl, VT, DAG.getUNDEF(VT), Amt);
22505
53
      SDValue AHi = DAG.getNode(X86ISD::UNPCKH, dl, VT, DAG.getUNDEF(VT), Amt);
22506
53
      SDValue RLo = DAG.getNode(X86ISD::UNPCKL, dl, VT, DAG.getUNDEF(VT), R);
22507
53
      SDValue RHi = DAG.getNode(X86ISD::UNPCKH, dl, VT, DAG.getUNDEF(VT), R);
22508
53
      ALo = DAG.getBitcast(ExtVT, ALo);
22509
53
      AHi = DAG.getBitcast(ExtVT, AHi);
22510
53
      RLo = DAG.getBitcast(ExtVT, RLo);
22511
53
      RHi = DAG.getBitcast(ExtVT, RHi);
22512
53
22513
53
      // r = VSELECT(r, shift(r, 4), a);
22514
53
      SDValue MLo = DAG.getNode(ShiftOpcode, dl, ExtVT, RLo,
22515
53
                                DAG.getConstant(4, dl, ExtVT));
22516
53
      SDValue MHi = DAG.getNode(ShiftOpcode, dl, ExtVT, RHi,
22517
53
                                DAG.getConstant(4, dl, ExtVT));
22518
53
      RLo = SignBitSelect(ExtVT, ALo, MLo, RLo);
22519
53
      RHi = SignBitSelect(ExtVT, AHi, MHi, RHi);
22520
53
22521
53
      // a += a
22522
53
      ALo = DAG.getNode(ISD::ADD, dl, ExtVT, ALo, ALo);
22523
53
      AHi = DAG.getNode(ISD::ADD, dl, ExtVT, AHi, AHi);
22524
53
22525
53
      // r = VSELECT(r, shift(r, 2), a);
22526
53
      MLo = DAG.getNode(ShiftOpcode, dl, ExtVT, RLo,
22527
53
                        DAG.getConstant(2, dl, ExtVT));
22528
53
      MHi = DAG.getNode(ShiftOpcode, dl, ExtVT, RHi,
22529
53
                        DAG.getConstant(2, dl, ExtVT));
22530
53
      RLo = SignBitSelect(ExtVT, ALo, MLo, RLo);
22531
53
      RHi = SignBitSelect(ExtVT, AHi, MHi, RHi);
22532
53
22533
53
      // a += a
22534
53
      ALo = DAG.getNode(ISD::ADD, dl, ExtVT, ALo, ALo);
22535
53
      AHi = DAG.getNode(ISD::ADD, dl, ExtVT, AHi, AHi);
22536
53
22537
53
      // r = VSELECT(r, shift(r, 1), a);
22538
53
      MLo = DAG.getNode(ShiftOpcode, dl, ExtVT, RLo,
22539
53
                        DAG.getConstant(1, dl, ExtVT));
22540
53
      MHi = DAG.getNode(ShiftOpcode, dl, ExtVT, RHi,
22541
53
                        DAG.getConstant(1, dl, ExtVT));
22542
53
      RLo = SignBitSelect(ExtVT, ALo, MLo, RLo);
22543
53
      RHi = SignBitSelect(ExtVT, AHi, MHi, RHi);
22544
53
22545
53
      // Logical shift the result back to the lower byte, leaving a zero upper
22546
53
      // byte
22547
53
      // meaning that we can safely pack with PACKUSWB.
22548
53
      RLo =
22549
53
          DAG.getNode(ISD::SRL, dl, ExtVT, RLo, DAG.getConstant(8, dl, ExtVT));
22550
53
      RHi =
22551
53
          DAG.getNode(ISD::SRL, dl, ExtVT, RHi, DAG.getConstant(8, dl, ExtVT));
22552
53
      return DAG.getNode(X86ISD::PACKUS, dl, VT, RLo, RHi);
22553
53
    }
22554
485
  }
22555
485
22556
485
  if (Subtarget.hasInt256() && !Subtarget.hasXOP() && VT == MVT::v16i16) {
22557
22
    MVT ExtVT = MVT::v8i32;
22558
22
    SDValue Z = getZeroVector(VT, Subtarget, DAG, dl);
22559
22
    SDValue ALo = DAG.getNode(X86ISD::UNPCKL, dl, VT, Amt, Z);
22560
22
    SDValue AHi = DAG.getNode(X86ISD::UNPCKH, dl, VT, Amt, Z);
22561
22
    SDValue RLo = DAG.getNode(X86ISD::UNPCKL, dl, VT, Z, R);
22562
22
    SDValue RHi = DAG.getNode(X86ISD::UNPCKH, dl, VT, Z, R);
22563
22
    ALo = DAG.getBitcast(ExtVT, ALo);
22564
22
    AHi = DAG.getBitcast(ExtVT, AHi);
22565
22
    RLo = DAG.getBitcast(ExtVT, RLo);
22566
22
    RHi = DAG.getBitcast(ExtVT, RHi);
22567
22
    SDValue Lo = DAG.getNode(Op.getOpcode(), dl, ExtVT, RLo, ALo);
22568
22
    SDValue Hi = DAG.getNode(Op.getOpcode(), dl, ExtVT, RHi, AHi);
22569
22
    Lo = DAG.getNode(ISD::SRL, dl, ExtVT, Lo, DAG.getConstant(16, dl, ExtVT));
22570
22
    Hi = DAG.getNode(ISD::SRL, dl, ExtVT, Hi, DAG.getConstant(16, dl, ExtVT));
22571
22
    return DAG.getNode(X86ISD::PACKUS, dl, VT, Lo, Hi);
22572
22
  }
22573
463
22574
463
  if (VT == MVT::v8i16) {
22575
92
    unsigned ShiftOpcode = Op->getOpcode();
22576
92
22577
92
    // If we have a constant shift amount, the non-SSE41 path is best as
22578
92
    // avoiding bitcasts make it easier to constant fold and reduce to PBLENDW.
22579
92
    bool UseSSE41 = Subtarget.hasSSE41() &&
22580
52
                    !ISD::isBuildVectorOfConstantSDNodes(Amt.getNode());
22581
92
22582
368
    auto SignBitSelect = [&](SDValue Sel, SDValue V0, SDValue V1) {
22583
368
      // On SSE41 targets we make use of the fact that VSELECT lowers
22584
368
      // to PBLENDVB which selects bytes based just on the sign bit.
22585
368
      if (UseSSE41) {
22586
140
        MVT ExtVT = MVT::getVectorVT(MVT::i8, VT.getVectorNumElements() * 2);
22587
140
        V0 = DAG.getBitcast(ExtVT, V0);
22588
140
        V1 = DAG.getBitcast(ExtVT, V1);
22589
140
        Sel = DAG.getBitcast(ExtVT, Sel);
22590
140
        return DAG.getBitcast(VT, DAG.getSelect(dl, ExtVT, Sel, V0, V1));
22591
140
      }
22592
228
      // On pre-SSE41 targets we splat the sign bit - a negative value will
22593
228
      // set all bits of the lanes to true and VSELECT uses that in
22594
228
      // its OR(AND(V0,C),AND(V1,~C)) lowering.
22595
228
      SDValue C =
22596
228
          DAG.getNode(ISD::SRA, dl, VT, Sel, DAG.getConstant(15, dl, VT));
22597
228
      return DAG.getSelect(dl, VT, C, V0, V1);
22598
228
    };
22599
92
22600
92
    // Turn 'a' into a mask suitable for VSELECT: a = a << 12;
22601
92
    if (UseSSE41) {
22602
35
      // On SSE41 targets we need to replicate the shift mask in both
22603
35
      // bytes for PBLENDVB.
22604
35
      Amt = DAG.getNode(
22605
35
          ISD::OR, dl, VT,
22606
35
          DAG.getNode(ISD::SHL, dl, VT, Amt, DAG.getConstant(4, dl, VT)),
22607
35
          DAG.getNode(ISD::SHL, dl, VT, Amt, DAG.getConstant(12, dl, VT)));
22608
92
    } else {
22609
57
      Amt = DAG.getNode(ISD::SHL, dl, VT, Amt, DAG.getConstant(12, dl, VT));
22610
57
    }
22611
92
22612
92
    // r = VSELECT(r, shift(r, 8), a);
22613
92
    SDValue M = DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(8, dl, VT));
22614
92
    R = SignBitSelect(Amt, M, R);
22615
92
22616
92
    // a += a
22617
92
    Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
22618
92
22619
92
    // r = VSELECT(r, shift(r, 4), a);
22620
92
    M = DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(4, dl, VT));
22621
92
    R = SignBitSelect(Amt, M, R);
22622
92
22623
92
    // a += a
22624
92
    Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
22625
92
22626
92
    // r = VSELECT(r, shift(r, 2), a);
22627
92
    M = DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(2, dl, VT));
22628
92
    R = SignBitSelect(Amt, M, R);
22629
92
22630
92
    // a += a
22631
92
    Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
22632
92
22633
92
    // return VSELECT(r, shift(r, 1), a);
22634
92
    M = DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(1, dl, VT));
22635
92
    R = SignBitSelect(Amt, M, R);
22636
92
    return R;
22637
92
  }
22638
371
22639
371
  // Decompose 256-bit shifts into smaller 128-bit shifts.
22640
371
  if (VT.is256BitVector())
22641
371
    return Lower256IntArith(Op, DAG);
22642
0
22643
0
  return SDValue();
22644
0
}
22645
22646
static SDValue LowerRotate(SDValue Op, const X86Subtarget &Subtarget,
22647
472
                           SelectionDAG &DAG) {
22648
472
  MVT VT = Op.getSimpleValueType();
22649
472
  SDLoc DL(Op);
22650
472
  SDValue R = Op.getOperand(0);
22651
472
  SDValue Amt = Op.getOperand(1);
22652
472
  unsigned Opcode = Op.getOpcode();
22653
472
  unsigned EltSizeInBits = VT.getScalarSizeInBits();
22654
472
22655
472
  if (Subtarget.hasAVX512()) {
22656
196
    // Attempt to rotate by immediate.
22657
196
    APInt UndefElts;
22658
196
    SmallVector<APInt, 16> EltBits;
22659
196
    if (getTargetConstantBitsFromNode(Amt, EltSizeInBits, UndefElts, EltBits)) {
22660
128
      if (!UndefElts && llvm::all_of(EltBits, [EltBits](APInt &V) {
22661
778
            return EltBits[0] == V;
22662
128
          })) {
22663
65
        unsigned Op = (Opcode == ISD::ROTL ? X86ISD::VROTLI : X86ISD::VROTRI);
22664
65
        uint64_t RotateAmt = EltBits[0].urem(EltSizeInBits);
22665
65
        return DAG.getNode(Op, DL, VT, R,
22666
65
                           DAG.getConstant(RotateAmt, DL, MVT::i8));
22667
65
      }
22668
131
    }
22669
131
22670
131
    // Else, fall-back on VPROLV/VPRORV.
22671
131
    return Op;
22672
131
  }
22673
276
22674
472
  assert(VT.isVector() && "Custom lowering only for vector rotates!");
22675
276
  assert(Subtarget.hasXOP() && "XOP support required for vector rotates!");
22676
276
  assert((Opcode == ISD::ROTL) && "Only ROTL supported");
22677
276
22678
276
  // XOP has 128-bit vector variable + immediate rotates.
22679
276
  // +ve/-ve Amt = rotate left/right - just need to handle ISD::ROTL.
22680
276
22681
276
  // Split 256-bit integers.
22682
276
  if (VT.is256BitVector())
22683
32
    return Lower256IntArith(Op, DAG);
22684
244
22685
276
  assert(VT.is128BitVector() && "Only rotate 128-bit vectors!");
22686
244
22687
244
  // Attempt to rotate by immediate.
22688
244
  if (auto *BVAmt = dyn_cast<BuildVectorSDNode>(Amt)) {
22689
105
    if (auto *RotateConst = BVAmt->getConstantSplatNode()) {
22690
51
      uint64_t RotateAmt = RotateConst->getAPIntValue().getZExtValue();
22691
51
      assert(RotateAmt < EltSizeInBits && "Rotation out of range");
22692
51
      return DAG.getNode(X86ISD::VROTLI, DL, VT, R,
22693
51
                         DAG.getConstant(RotateAmt, DL, MVT::i8));
22694
51
    }
22695
193
  }
22696
193
22697
193
  // Use general rotate by variable (per-element).
22698
193
  return Op;
22699
193
}
22700
22701
2.30k
static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) {
22702
2.30k
  // Lower the "add/sub/mul with overflow" instruction into a regular ins plus
22703
2.30k
  // a "setcc" instruction that checks the overflow flag. The "brcond" lowering
22704
2.30k
  // looks for this combo and may remove the "setcc" instruction if the "setcc"
22705
2.30k
  // has only one use.
22706
2.30k
  SDNode *N = Op.getNode();
22707
2.30k
  SDValue LHS = N->getOperand(0);
22708
2.30k
  SDValue RHS = N->getOperand(1);
22709
2.30k
  unsigned BaseOp = 0;
22710
2.30k
  X86::CondCode Cond;
22711
2.30k
  SDLoc DL(Op);
22712
2.30k
  switch (Op.getOpcode()) {
22713
0
  default: llvm_unreachable("Unknown ovf instruction!");
22714
43
  case ISD::SADDO:
22715
43
    // A subtract of one will be selected as a INC. Note that INC doesn't
22716
43
    // set CF, so we can't do this for UADDO.
22717
43
    if (isOneConstant(RHS)) {
22718
11
      BaseOp = X86ISD::INC;
22719
11
      Cond = X86::COND_O;
22720
11
      break;
22721
11
    }
22722
32
    BaseOp = X86ISD::ADD;
22723
32
    Cond = X86::COND_O;
22724
32
    break;
22725
1.32k
  case ISD::UADDO:
22726
1.32k
    BaseOp = X86ISD::ADD;
22727
1.32k
    Cond = X86::COND_B;
22728
1.32k
    break;
22729
16
  case ISD::SSUBO:
22730
16
    // A subtract of one will be selected as a DEC. Note that DEC doesn't
22731
16
    // set CF, so we can't do this for USUBO.
22732
16
    if (isOneConstant(RHS)) {
22733
1
      BaseOp = X86ISD::DEC;
22734
1
      Cond = X86::COND_O;
22735
1
      break;
22736
1
    }
22737
15
    BaseOp = X86ISD::SUB;
22738
15
    Cond = X86::COND_O;
22739
15
    break;
22740
866
  case ISD::USUBO:
22741
866
    BaseOp = X86ISD::SUB;
22742
866
    Cond = X86::COND_B;
22743
866
    break;
22744
20
  case ISD::SMULO:
22745
20
    BaseOp = N->getValueType(0) == MVT::i8 ? X86ISD::SMUL8 : X86ISD::SMUL;
22746
20
    Cond = X86::COND_O;
22747
20
    break;
22748
37
  case ISD::UMULO: { // i64, i8 = umulo lhs, rhs --> i64, i64, i32 umul lhs,rhs
22749
37
    if (N->getValueType(0) == MVT::i8) {
22750
3
      BaseOp = X86ISD::UMUL8;
22751
3
      Cond = X86::COND_O;
22752
3
      break;
22753
3
    }
22754
34
    SDVTList VTs = DAG.getVTList(N->getValueType(0), N->getValueType(0),
22755
34
                                 MVT::i32);
22756
34
    SDValue Sum = DAG.getNode(X86ISD::UMUL, DL, VTs, LHS, RHS);
22757
34
22758
34
    SDValue SetCC = getSETCC(X86::COND_O, SDValue(Sum.getNode(), 2), DL, DAG);
22759
34
22760
34
    if (N->getValueType(1) == MVT::i1)
22761
0
      SetCC = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, SetCC);
22762
43
22763
43
    return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Sum, SetCC);
22764
43
  }
22765
2.27k
  }
22766
2.27k
22767
2.27k
  // Also sets EFLAGS.
22768
2.27k
  SDVTList VTs = DAG.getVTList(N->getValueType(0), MVT::i32);
22769
2.27k
  SDValue Sum = DAG.getNode(BaseOp, DL, VTs, LHS, RHS);
22770
2.27k
22771
2.27k
  SDValue SetCC = getSETCC(Cond, SDValue(Sum.getNode(), 1), DL, DAG);
22772
2.27k
22773
2.27k
  if (N->getValueType(1) == MVT::i1)
22774
0
    SetCC = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, SetCC);
22775
2.30k
22776
2.30k
  return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Sum, SetCC);
22777
2.30k
}
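Illustrative aside (not part of the reported source): LowerXALUO maps the *.with.overflow nodes onto a flag-setting x86 arithmetic op followed by a SETcc of the appropriate condition (OF for signed, CF for unsigned). A hedged C++ illustration of the source-level pattern that typically reaches this path; __builtin_add_overflow is the Clang/GCC builtin that is normally lowered through llvm.sadd/uadd.with.overflow:

#include <cstdint>
#include <cstdio>

int main() {
  int32_t A = 0x7fffffff, B = 1, Sum;
  // Signed case: typically selected as ADD (sets EFLAGS) + SETO (COND_O).
  bool Overflowed = __builtin_add_overflow(A, B, &Sum);

  uint32_t UA = 0xffffffffu, UB = 1, USum;
  // Unsigned case: ADD + SETB, i.e. it tests the carry flag (COND_B).
  bool Carried = __builtin_add_overflow(UA, UB, &USum);

  std::printf("sum=%d of=%d usum=%u cf=%d\n", Sum, Overflowed, USum, Carried);
}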
22778
22779
/// Returns true if the operand type is exactly twice the native width, and
22780
/// the corresponding cmpxchg8b or cmpxchg16b instruction is available.
22781
/// Used to know whether to use cmpxchg8/16b when expanding atomic operations
22782
/// (otherwise we leave them alone to become __sync_fetch_and_... calls).
22783
1.28k
bool X86TargetLowering::needsCmpXchgNb(Type *MemType) const {
22784
1.28k
  unsigned OpWidth = MemType->getPrimitiveSizeInBits();
22785
1.28k
22786
1.28k
  if (OpWidth == 64)
22787
543
    return !Subtarget.is64Bit(); // FIXME this should be Subtarget.hasCmpxchg8b
22788
738
  else if (OpWidth == 128)
22789
28
    return Subtarget.hasCmpxchg16b();
22790
738
  else
22791
710
    return false;
22792
0
}
22793
22794
508
bool X86TargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const {
22795
508
  return needsCmpXchgNb(SI->getValueOperand()->getType());
22796
508
}
22797
22798
// Note: this turns large loads into lock cmpxchg8b/16b.
22799
// FIXME: On 32 bits x86, fild/movq might be faster than lock cmpxchg8b.
22800
TargetLowering::AtomicExpansionKind
22801
451
X86TargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const {
22802
451
  auto PTy = cast<PointerType>(LI->getPointerOperandType());
22803
43
  return needsCmpXchgNb(PTy->getElementType()) ? AtomicExpansionKind::CmpXChg
22804
408
                                               : AtomicExpansionKind::None;
22805
451
}
22806
22807
TargetLowering::AtomicExpansionKind
22808
2.52k
X86TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
22809
2.52k
  unsigned NativeWidth = Subtarget.is64Bit() ? 64 : 32;
22810
2.52k
  Type *MemType = AI->getType();
22811
2.52k
22812
2.52k
  // If the operand is too big, we must see if cmpxchg8/16b is available
22813
2.52k
  // and default to library calls otherwise.
22814
2.52k
  if (MemType->getPrimitiveSizeInBits() > NativeWidth) {
22815
313
    return needsCmpXchgNb(MemType) ? AtomicExpansionKind::CmpXChg
22816
9
                                   : AtomicExpansionKind::None;
22817
322
  }
22818
2.19k
22819
2.19k
  AtomicRMWInst::BinOp Op = AI->getOperation();
22820
2.19k
  switch (Op) {
22821
0
  default:
22822
0
    llvm_unreachable("Unknown atomic operation");
22823
1.32k
  case AtomicRMWInst::Xchg:
22824
1.32k
  case AtomicRMWInst::Add:
22825
1.32k
  case AtomicRMWInst::Sub:
22826
1.32k
    // It's better to use xadd, xsub or xchg for these in all cases.
22827
1.32k
    return AtomicExpansionKind::None;
22828
763
  case AtomicRMWInst::Or:
22829
763
  case AtomicRMWInst::And:
22830
763
  case AtomicRMWInst::Xor:
22831
763
    // If the atomicrmw's result isn't actually used, we can just add a "lock"
22832
763
    // prefix to a normal instruction for these operations.
22833
665
    return !AI->use_empty() ? AtomicExpansionKind::CmpXChg
22834
98
                            : AtomicExpansionKind::None;
22835
112
  case AtomicRMWInst::Nand:
22836
112
  case AtomicRMWInst::Max:
22837
112
  case AtomicRMWInst::Min:
22838
112
  case AtomicRMWInst::UMax:
22839
112
  case AtomicRMWInst::UMin:
22840
112
    // These always require a non-trivial set of data operations on x86. We must
22841
112
    // use a cmpxchg loop.
22842
112
    return AtomicExpansionKind::CmpXChg;
22843
0
  }
22844
0
}
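Illustrative aside (not part of the reported source): the Or/And/Xor case above only forces a cmpxchg expansion when the atomicrmw result is used; a discarded result lets the RMW stay as a single lock-prefixed instruction. A small C++ sketch of the two shapes, assuming an x86 target:

#include <atomic>
#include <cstdio>

std::atomic<unsigned> Flags{0};

void SetBitDiscard() {
  // Result unused: can be selected as a single `lock or` instruction.
  Flags.fetch_or(0x4u, std::memory_order_seq_cst);
}

unsigned SetBitUse() {
  // Result used: expanded to a cmpxchg loop, since `lock or` does not
  // return the previous value.
  return Flags.fetch_or(0x8u, std::memory_order_seq_cst);
}

int main() {
  SetBitDiscard();
  std::printf("old=%u now=%u\n", SetBitUse(), Flags.load());
}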
22845
22846
LoadInst *
22847
14
X86TargetLowering::lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const {
22848
14
  unsigned NativeWidth = Subtarget.is64Bit() ? 64 : 32;
22849
14
  Type *MemType = AI->getType();
22850
14
  // Accesses larger than the native width are turned into cmpxchg/libcalls, so
22851
14
  // there is no benefit in turning such RMWs into loads, and it is actually
22852
14
  // harmful as it introduces a mfence.
22853
14
  if (MemType->getPrimitiveSizeInBits() > NativeWidth)
22854
3
    return nullptr;
22855
11
22856
11
  auto Builder = IRBuilder<>(AI);
22857
11
  Module *M = Builder.GetInsertBlock()->getParent()->getParent();
22858
11
  auto SSID = AI->getSyncScopeID();
22859
11
  // We must restrict the ordering to avoid generating loads with Release or
22860
11
  // ReleaseAcquire orderings.
22861
11
  auto Order = AtomicCmpXchgInst::getStrongestFailureOrdering(AI->getOrdering());
22862
11
  auto Ptr = AI->getPointerOperand();
22863
11
22864
11
  // Before the load we need a fence. Here is an example lifted from
22865
11
  // http://www.hpl.hp.com/techreports/2012/HPL-2012-68.pdf showing why a fence
22866
11
  // is required:
22867
11
  // Thread 0:
22868
11
  //   x.store(1, relaxed);
22869
11
  //   r1 = y.fetch_add(0, release);
22870
11
  // Thread 1:
22871
11
  //   y.fetch_add(42, acquire);
22872
11
  //   r2 = x.load(relaxed);
22873
11
  // r1 = r2 = 0 is impossible, but becomes possible if the idempotent rmw is
22874
11
  // lowered to just a load without a fence. A mfence flushes the store buffer,
22875
11
  // making the optimization clearly correct.
22876
11
  // FIXME: it is required if isReleaseOrStronger(Order) but it is not clear
22877
11
  // otherwise, we might be able to be more aggressive on relaxed idempotent
22878
11
  // rmw. In practice, they do not look useful, so we don't try to be
22879
11
  // especially clever.
22880
11
  if (SSID == SyncScope::SingleThread)
22881
11
    // FIXME: we could just insert an X86ISD::MEMBARRIER here, except we are at
22882
11
    // the IR level, so we must wrap it in an intrinsic.
22883
0
    return nullptr;
22884
11
22885
11
  if (!Subtarget.hasMFence())
22886
11
    // FIXME: it might make sense to use a locked operation here but on a
22887
11
    // different cache-line to prevent cache-line bouncing. In practice it
22888
11
    // is probably a small win, and x86 processors without mfence are rare
22889
11
    // enough that we do not bother.
22890
0
    return nullptr;
22891
11
22892
11
  Function *MFence =
22893
11
      llvm::Intrinsic::getDeclaration(M, Intrinsic::x86_sse2_mfence);
22894
11
  Builder.CreateCall(MFence, {});
22895
11
22896
11
  // Finally we can emit the atomic load.
22897
11
  LoadInst *Loaded = Builder.CreateAlignedLoad(Ptr,
22898
11
          AI->getType()->getPrimitiveSizeInBits());
22899
11
  Loaded->setAtomic(Order, SSID);
22900
11
  AI->replaceAllUsesWith(Loaded);
22901
11
  AI->eraseFromParent();
22902
11
  return Loaded;
22903
11
}
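Illustrative aside (not part of the reported source): a sketch of the rewrite lowerIdempotentRMWIntoFencedLoad performs. An idempotent RMW such as fetch_add(0) that is only used for its returned value becomes an mfence followed by an atomic load, for the ordering reasons discussed in the comment above. Hypothetical names, assuming x86 with SSE2:

#include <atomic>

std::atomic<long> Counter{0};

// Source-level shape that reaches this hook: a no-op RMW used as a read.
long ReadViaIdempotentRMW() {
  return Counter.fetch_add(0, std::memory_order_seq_cst);
}

// Roughly what the IR becomes: a full fence (mfence on x86), then a plain
// atomic load with the ordering clamped to a legal load ordering.
long ReadViaFencedLoad() {
  std::atomic_thread_fence(std::memory_order_seq_cst);
  return Counter.load(std::memory_order_seq_cst);
}

int main() {
  Counter.store(42);
  return ReadViaIdempotentRMW() == ReadViaFencedLoad() ? 0 : 1;
}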
22904
22905
static SDValue LowerATOMIC_FENCE(SDValue Op, const X86Subtarget &Subtarget,
22906
114
                                 SelectionDAG &DAG) {
22907
114
  SDLoc dl(Op);
22908
114
  AtomicOrdering FenceOrdering = static_cast<AtomicOrdering>(
22909
114
    cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue());
22910
114
  SyncScope::ID FenceSSID = static_cast<SyncScope::ID>(
22911
114
    cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue());
22912
114
22913
114
  // The only fence that needs an instruction is a sequentially-consistent
22914
114
  // cross-thread fence.
22915
114
  if (FenceOrdering == AtomicOrdering::SequentiallyConsistent &&
22916
114
      FenceSSID == SyncScope::System) {
22917
25
    if (Subtarget.hasMFence())
22918
19
      return DAG.getNode(X86ISD::MFENCE, dl, MVT::Other, Op.getOperand(0));
22919
6
22920
6
    SDValue Chain = Op.getOperand(0);
22921
6
    SDValue Zero = DAG.getConstant(0, dl, MVT::i32);
22922
6
    SDValue Ops[] = {
22923
6
      DAG.getRegister(X86::ESP, MVT::i32),     // Base
22924
6
      DAG.getTargetConstant(1, dl, MVT::i8),   // Scale
22925
6
      DAG.getRegister(0, MVT::i32),            // Index
22926
6
      DAG.getTargetConstant(0, dl, MVT::i32),  // Disp
22927
6
      DAG.getRegister(0, MVT::i32),            // Segment.
22928
6
      Zero,
22929
6
      Chain
22930
6
    };
22931
6
    SDNode *Res = DAG.getMachineNode(X86::OR32mrLocked, dl, MVT::Other, Ops);
22932
6
    return SDValue(Res, 0);
22933
6
  }
22934
89
22935
89
  // MEMBARRIER is a compiler barrier; it codegens to a no-op.
22936
89
  return DAG.getNode(X86ISD::MEMBARRIER, dl, MVT::Other, Op.getOperand(0));
22937
89
}
22938
22939
static SDValue LowerCMP_SWAP(SDValue Op, const X86Subtarget &Subtarget,
22940
1.47k
                             SelectionDAG &DAG) {
22941
1.47k
  MVT T = Op.getSimpleValueType();
22942
1.47k
  SDLoc DL(Op);
22943
1.47k
  unsigned Reg = 0;
22944
1.47k
  unsigned size = 0;
22945
1.47k
  switch(T.SimpleTy) {
22946
0
  default: llvm_unreachable("Invalid value type!");
22947
324
  case MVT::i8:  Reg = X86::AL;  size = 1; break;
22948
388
  case MVT::i16: Reg = X86::AX;  size = 2; break;
22949
467
  case MVT::i32: Reg = X86::EAX; size = 4; break;
22950
300
  case MVT::i64:
22951
300
    assert(Subtarget.is64Bit() && "Node not type legal!");
22952
300
    Reg = X86::RAX; size = 8;
22953
300
    break;
22954
1.47k
  }
22955
1.47k
  SDValue cpIn = DAG.getCopyToReg(Op.getOperand(0), DL, Reg,
22956
1.47k
                                  Op.getOperand(2), SDValue());
22957
1.47k
  SDValue Ops[] = { cpIn.getValue(0),
22958
1.47k
                    Op.getOperand(1),
22959
1.47k
                    Op.getOperand(3),
22960
1.47k
                    DAG.getTargetConstant(size, DL, MVT::i8),
22961
1.47k
                    cpIn.getValue(1) };
22962
1.47k
  SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
22963
1.47k
  MachineMemOperand *MMO = cast<AtomicSDNode>(Op)->getMemOperand();
22964
1.47k
  SDValue Result = DAG.getMemIntrinsicNode(X86ISD::LCMPXCHG_DAG, DL, Tys,
22965
1.47k
                                           Ops, T, MMO);
22966
1.47k
22967
1.47k
  SDValue cpOut =
22968
1.47k
    DAG.getCopyFromReg(Result.getValue(0), DL, Reg, T, Result.getValue(1));
22969
1.47k
  SDValue EFLAGS = DAG.getCopyFromReg(cpOut.getValue(1), DL, X86::EFLAGS,
22970
1.47k
                                      MVT::i32, cpOut.getValue(2));
22971
1.47k
  SDValue Success = getSETCC(X86::COND_E, EFLAGS, DL, DAG);
22972
1.47k
22973
1.47k
  DAG.ReplaceAllUsesOfValueWith(Op.getValue(0), cpOut);
22974
1.47k
  DAG.ReplaceAllUsesOfValueWith(Op.getValue(1), Success);
22975
1.47k
  DAG.ReplaceAllUsesOfValueWith(Op.getValue(2), EFLAGS.getValue(1));
22976
1.47k
  return SDValue();
22977
1.47k
}
22978
22979
static SDValue LowerBITCAST(SDValue Op, const X86Subtarget &Subtarget,
22980
521
                            SelectionDAG &DAG) {
22981
521
  MVT SrcVT = Op.getOperand(0).getSimpleValueType();
22982
521
  MVT DstVT = Op.getSimpleValueType();
22983
521
22984
521
  if (SrcVT == MVT::v2i32 || SrcVT == MVT::v4i16 || SrcVT == MVT::v8i8 ||
22985
521
      SrcVT == MVT::i64) {
22986
521
    assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
22987
521
    if (DstVT != MVT::f64)
22988
521
      // This conversion needs to be expanded.
22989
362
      return SDValue();
22990
159
22991
159
    SDValue Op0 = Op->getOperand(0);
22992
159
    SmallVector<SDValue, 16> Elts;
22993
159
    SDLoc dl(Op);
22994
159
    unsigned NumElts;
22995
159
    MVT SVT;
22996
159
    if (SrcVT.isVector()) {
22997
22
      NumElts = SrcVT.getVectorNumElements();
22998
22
      SVT = SrcVT.getVectorElementType();
22999
22
23000
22
      // Widen the vector in input in the case of MVT::v2i32.
23001
22
      // Example: from MVT::v2i32 to MVT::v4i32.
23002
122
      for (unsigned i = 0, e = NumElts; i != e; ++i)
23003
100
        Elts.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, SVT, Op0,
23004
100
                                   DAG.getIntPtrConstant(i, dl)));
23005
159
    } else {
23006
137
      assert(SrcVT == MVT::i64 && !Subtarget.is64Bit() &&
23007
137
             "Unexpected source type in LowerBITCAST");
23008
137
      Elts.push_back(DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Op0,
23009
137
                                 DAG.getIntPtrConstant(0, dl)));
23010
137
      Elts.push_back(DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Op0,
23011
137
                                 DAG.getIntPtrConstant(1, dl)));
23012
137
      NumElts = 2;
23013
137
      SVT = MVT::i32;
23014
137
    }
23015
521
    // Explicitly mark the extra elements as Undef.
23016
521
    Elts.append(NumElts, DAG.getUNDEF(SVT));
23017
521
23018
521
    EVT NewVT = EVT::getVectorVT(*DAG.getContext(), SVT, NumElts * 2);
23019
521
    SDValue BV = DAG.getBuildVector(NewVT, dl, Elts);
23020
521
    SDValue ToV2F64 = DAG.getBitcast(MVT::v2f64, BV);
23021
521
    return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, ToV2F64,
23022
521
                       DAG.getIntPtrConstant(0, dl));
23023
521
  }
23024
0
23025
521
  assert(Subtarget.is64Bit() && !Subtarget.hasSSE2() &&
23026
0
         Subtarget.hasMMX() && "Unexpected custom BITCAST");
23027
0
  assert((DstVT == MVT::i64 ||
23028
0
          (DstVT.isVector() && DstVT.getSizeInBits()==64)) &&
23029
0
         "Unexpected custom BITCAST");
23030
0
  // i64 <=> MMX conversions are Legal.
23031
0
  if (SrcVT==MVT::i64 && DstVT.isVector())
23032
0
    return Op;
23033
0
  if (DstVT==MVT::i64 && SrcVT.isVector())
23034
0
    return Op;
23035
0
  // MMX <=> MMX conversions are Legal.
23036
0
  if (SrcVT.isVector() && DstVT.isVector())
23037
0
    return Op;
23038
0
  // All other conversions need to be expanded.
23039
0
  return SDValue();
23040
0
}
23041
23042
/// Compute the horizontal sum of bytes in V for the elements of VT.
23043
///
23044
/// Requires V to be a byte vector and VT to be an integer vector type with
23045
/// wider elements than V's type. The width of the elements of VT determines
23046
/// how many bytes of V are summed horizontally to produce each element of the
23047
/// result.
23048
static SDValue LowerHorizontalByteSum(SDValue V, MVT VT,
23049
                                      const X86Subtarget &Subtarget,
23050
169
                                      SelectionDAG &DAG) {
23051
169
  SDLoc DL(V);
23052
169
  MVT ByteVecVT = V.getSimpleValueType();
23053
169
  MVT EltVT = VT.getVectorElementType();
23054
169
  assert(ByteVecVT.getVectorElementType() == MVT::i8 &&
23055
169
         "Expected value to have byte element type.");
23056
169
  assert(EltVT != MVT::i8 &&
23057
169
         "Horizontal byte sum only makes sense for wider elements!");
23058
169
  unsigned VecSize = VT.getSizeInBits();
23059
169
  assert(ByteVecVT.getSizeInBits() == VecSize && "Cannot change vector size!");
23060
169
23061
169
  // PSADBW instruction horizontally add all bytes and leave the result in i64
23062
169
  // chunks, thus directly computes the pop count for v2i64 and v4i64.
23063
169
  if (EltVT == MVT::i64) {
23064
59
    SDValue Zeros = getZeroVector(ByteVecVT, Subtarget, DAG, DL);
23065
59
    MVT SadVecVT = MVT::getVectorVT(MVT::i64, VecSize / 64);
23066
59
    V = DAG.getNode(X86ISD::PSADBW, DL, SadVecVT, V, Zeros);
23067
59
    return DAG.getBitcast(VT, V);
23068
59
  }
23069
110
23070
110
  if (EltVT == MVT::i32) {
23071
53
    // We unpack the low half and high half into i32s interleaved with zeros so
23072
53
    // that we can use PSADBW to horizontally sum them. The most useful part of
23073
53
    // this is that it lines up the results of two PSADBW instructions to be
23074
53
    // two v2i64 vectors which concatenated are the 4 population counts. We can
23075
53
    // then use PACKUSWB to shrink and concatenate them into a v4i32 again.
23076
53
    SDValue Zeros = getZeroVector(VT, Subtarget, DAG, DL);
23077
53
    SDValue V32 = DAG.getBitcast(VT, V);
23078
53
    SDValue Low = DAG.getNode(X86ISD::UNPCKL, DL, VT, V32, Zeros);
23079
53
    SDValue High = DAG.getNode(X86ISD::UNPCKH, DL, VT, V32, Zeros);
23080
53
23081
53
    // Do the horizontal sums into two v2i64s.
23082
53
    Zeros = getZeroVector(ByteVecVT, Subtarget, DAG, DL);
23083
53
    MVT SadVecVT = MVT::getVectorVT(MVT::i64, VecSize / 64);
23084
53
    Low = DAG.getNode(X86ISD::PSADBW, DL, SadVecVT,
23085
53
                      DAG.getBitcast(ByteVecVT, Low), Zeros);
23086
53
    High = DAG.getNode(X86ISD::PSADBW, DL, SadVecVT,
23087
53
                       DAG.getBitcast(ByteVecVT, High), Zeros);
23088
53
23089
53
    // Merge them together.
23090
53
    MVT ShortVecVT = MVT::getVectorVT(MVT::i16, VecSize / 16);
23091
53
    V = DAG.getNode(X86ISD::PACKUS, DL, ByteVecVT,
23092
53
                    DAG.getBitcast(ShortVecVT, Low),
23093
53
                    DAG.getBitcast(ShortVecVT, High));
23094
53
23095
53
    return DAG.getBitcast(VT, V);
23096
53
  }
23097
57
23098
57
  // The only element type left is i16.
23099
110
  assert(EltVT == MVT::i16 && "Unknown how to handle type");
23100
57
23101
57
  // To obtain pop count for each i16 element starting from the pop count for
23102
57
  // i8 elements, shift the i16s left by 8, sum as i8s, and then shift as i16s
23103
57
  // right by 8. It is important to shift as i16s as i8 vector shift isn't
23104
57
  // directly supported.
23105
57
  SDValue ShifterV = DAG.getConstant(8, DL, VT);
23106
57
  SDValue Shl = DAG.getNode(ISD::SHL, DL, VT, DAG.getBitcast(VT, V), ShifterV);
23107
57
  V = DAG.getNode(ISD::ADD, DL, ByteVecVT, DAG.getBitcast(ByteVecVT, Shl),
23108
57
                  DAG.getBitcast(ByteVecVT, V));
23109
57
  return DAG.getNode(ISD::SRL, DL, VT, DAG.getBitcast(VT, V), ShifterV);
23110
57
}
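Illustrative aside (not part of the reported source): PSADBW against an all-zeros operand leaves, in each 64-bit lane, the sum of that lane's eight bytes, which is why the code above can use it directly for the i64 and i32 element cases. A scalar sketch of that per-lane byte sum:

#include <cassert>
#include <cstdint>

// Sum of the eight bytes of one 64-bit lane, i.e. what PSADBW against a
// zero vector produces in that lane.
static uint64_t HorizontalByteSum(uint64_t Lane) {
  uint64_t Sum = 0;
  for (int i = 0; i != 8; ++i)
    Sum += (Lane >> (8 * i)) & 0xFF;
  return Sum;
}

int main() {
  assert(HorizontalByteSum(0x0101010101010101ULL) == 8);
  assert(HorizontalByteSum(0xFF00000000000000ULL) == 255);
}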
23111
23112
static SDValue LowerVectorCTPOPInRegLUT(SDValue Op, const SDLoc &DL,
23113
                                        const X86Subtarget &Subtarget,
23114
189
                                        SelectionDAG &DAG) {
23115
189
  MVT VT = Op.getSimpleValueType();
23116
189
  MVT EltVT = VT.getVectorElementType();
23117
189
  unsigned VecSize = VT.getSizeInBits();
23118
189
23119
189
  // Implement a lookup table in register by using an algorithm based on:
23120
189
  // http://wm.ite.pl/articles/sse-popcount.html
23121
189
  //
23122
189
  // The general idea is that every lower byte nibble in the input vector is an
23123
189
  // index into a in-register pre-computed pop count table. We then split up the
23124
189
  // input vector in two new ones: (1) a vector with only the shifted-right
23125
189
  // higher nibbles for each byte and (2) a vector with the lower nibbles (and
23126
189
  // masked out higher ones) for each byte. PSHUFB is used separately with both
23127
189
  // to index the in-register table. Next, both are added and the result is a
23128
189
  // i8 vector where each element contains the pop count for input byte.
23129
189
  //
23130
189
  // To obtain the pop count for elements != i8, we follow up with the same
23131
189
  // approach and use additional tricks as described below.
23132
189
  //
23133
189
  const int LUT[16] = {/* 0 */ 0, /* 1 */ 1, /* 2 */ 1, /* 3 */ 2,
23134
189
                       /* 4 */ 1, /* 5 */ 2, /* 6 */ 2, /* 7 */ 3,
23135
189
                       /* 8 */ 1, /* 9 */ 2, /* a */ 2, /* b */ 3,
23136
189
                       /* c */ 2, /* d */ 3, /* e */ 3, /* f */ 4};
23137
189
23138
189
  int NumByteElts = VecSize / 8;
23139
189
  MVT ByteVecVT = MVT::getVectorVT(MVT::i8, NumByteElts);
23140
189
  SDValue In = DAG.getBitcast(ByteVecVT, Op);
23141
189
  SmallVector<SDValue, 64> LUTVec;
23142
5.66k
  for (int i = 0; i < NumByteElts; ++i)
23143
5.47k
    LUTVec.push_back(DAG.getConstant(LUT[i % 16], DL, MVT::i8));
23144
189
  SDValue InRegLUT = DAG.getBuildVector(ByteVecVT, DL, LUTVec);
23145
189
  SDValue M0F = DAG.getConstant(0x0F, DL, ByteVecVT);
23146
189
23147
189
  // High nibbles
23148
189
  SDValue FourV = DAG.getConstant(4, DL, ByteVecVT);
23149
189
  SDValue HighNibbles = DAG.getNode(ISD::SRL, DL, ByteVecVT, In, FourV);
23150
189
23151
189
  // Low nibbles
23152
189
  SDValue LowNibbles = DAG.getNode(ISD::AND, DL, ByteVecVT, In, M0F);
23153
189
23154
189
  // The input vector is used as the shuffle mask that index elements into the
23155
189
  // LUT. After counting low and high nibbles, add the vector to obtain the
23156
189
  // final pop count per i8 element.
23157
189
  SDValue HighPopCnt =
23158
189
      DAG.getNode(X86ISD::PSHUFB, DL, ByteVecVT, InRegLUT, HighNibbles);
23159
189
  SDValue LowPopCnt =
23160
189
      DAG.getNode(X86ISD::PSHUFB, DL, ByteVecVT, InRegLUT, LowNibbles);
23161
189
  SDValue PopCnt = DAG.getNode(ISD::ADD, DL, ByteVecVT, HighPopCnt, LowPopCnt);
23162
189
23163
189
  if (EltVT == MVT::i8)
23164
56
    return PopCnt;
23165
133
23166
133
  return LowerHorizontalByteSum(PopCnt, VT, Subtarget, DAG);
23167
133
}
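Illustrative aside (not part of the reported source): a scalar version of the in-register LUT scheme described in the comment above. Each byte is split into a low and a high nibble, each nibble indexes a 16-entry pop-count table (the PSHUFB lookups in the vector code), and the two results are added:

#include <cassert>
#include <cstdint>

static const uint8_t LUT[16] = {0, 1, 1, 2, 1, 2, 2, 3,
                                1, 2, 2, 3, 2, 3, 3, 4};

// Population count of one byte via two nibble table lookups.
static unsigned PopCountByte(uint8_t B) {
  return LUT[B & 0xF] + LUT[B >> 4];
}

int main() {
  for (unsigned B = 0; B != 256; ++B)
    assert(PopCountByte(static_cast<uint8_t>(B)) ==
           static_cast<unsigned>(__builtin_popcount(B)));
}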
23168
23169
static SDValue LowerVectorCTPOPBitmath(SDValue Op, const SDLoc &DL,
23170
                                       const X86Subtarget &Subtarget,
23171
46
                                       SelectionDAG &DAG) {
23172
46
  MVT VT = Op.getSimpleValueType();
23173
46
  assert(VT.is128BitVector() &&
23174
46
         "Only 128-bit vector bitmath lowering supported.");
23175
46
23176
46
  int VecSize = VT.getSizeInBits();
23177
46
  MVT EltVT = VT.getVectorElementType();
23178
46
  int Len = EltVT.getSizeInBits();
23179
46
23180
46
  // This is the vectorized version of the "best" algorithm from
23181
46
  // http://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetParallel
23182
46
  // with a minor tweak to use a series of adds + shifts instead of vector
23183
46
  // multiplications. Implemented for all integer vector types. We only use
23184
46
  // this when we don't have SSSE3 which allows a LUT-based lowering that is
23185
46
  // much faster, even faster than using native popcnt instructions.
23186
46
23187
138
  auto GetShift = [&](unsigned OpCode, SDValue V, int Shifter) {
23188
138
    MVT VT = V.getSimpleValueType();
23189
138
    SDValue ShifterV = DAG.getConstant(Shifter, DL, VT);
23190
138
    return DAG.getNode(OpCode, DL, VT, V, ShifterV);
23191
138
  };
23192
184
  auto GetMask = [&](SDValue V, APInt Mask) {
23193
184
    MVT VT = V.getSimpleValueType();
23194
184
    SDValue MaskV = DAG.getConstant(Mask, DL, VT);
23195
184
    return DAG.getNode(ISD::AND, DL, VT, V, MaskV);
23196
184
  };
23197
46
23198
46
  // We don't want to incur the implicit masks required to SRL vNi8 vectors on
23199
46
  // x86, so set the SRL type to have elements at least i16 wide. This is
23200
46
  // correct because all of our SRLs are followed immediately by a mask anyways
23201
46
  // that handles any bits that sneak into the high bits of the byte elements.
23202
46
  MVT SrlVT = Len > 8 ? VT : MVT::getVectorVT(MVT::i16, VecSize / 16);
23203
46
23204
46
  SDValue V = Op;
23205
46
23206
46
  // v = v - ((v >> 1) & 0x55555555...)
23207
46
  SDValue Srl =
23208
46
      DAG.getBitcast(VT, GetShift(ISD::SRL, DAG.getBitcast(SrlVT, V), 1));
23209
46
  SDValue And = GetMask(Srl, APInt::getSplat(Len, APInt(8, 0x55)));
23210
46
  V = DAG.getNode(ISD::SUB, DL, VT, V, And);
23211
46
23212
46
  // v = (v & 0x33333333...) + ((v >> 2) & 0x33333333...)
23213
46
  SDValue AndLHS = GetMask(V, APInt::getSplat(Len, APInt(8, 0x33)));
23214
46
  Srl = DAG.getBitcast(VT, GetShift(ISD::SRL, DAG.getBitcast(SrlVT, V), 2));
23215
46
  SDValue AndRHS = GetMask(Srl, APInt::getSplat(Len, APInt(8, 0x33)));
23216
46
  V = DAG.getNode(ISD::ADD, DL, VT, AndLHS, AndRHS);
23217
46
23218
46
  // v = (v + (v >> 4)) & 0x0F0F0F0F...
23219
46
  Srl = DAG.getBitcast(VT, GetShift(ISD::SRL, DAG.getBitcast(SrlVT, V), 4));
23220
46
  SDValue Add = DAG.getNode(ISD::ADD, DL, VT, V, Srl);
23221
46
  V = GetMask(Add, APInt::getSplat(Len, APInt(8, 0x0F)));
23222
46
23223
46
  // At this point, V contains the byte-wise population count, and we are
23224
46
  // merely doing a horizontal sum if necessary to get the wider element
23225
46
  // counts.
23226
46
  if (EltVT == MVT::i8)
23227
10
    return V;
23228
36
23229
36
  return LowerHorizontalByteSum(
23230
36
      DAG.getBitcast(MVT::getVectorVT(MVT::i8, VecSize / 8), V), VT, Subtarget,
23231
36
      DAG);
23232
36
}
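Illustrative aside (not part of the reported source): a scalar version of the adds-and-shifts population count that LowerVectorCTPOPBitmath vectorizes. After the three mask/add steps each byte holds its own pop count; the vector code then does the horizontal byte sum with PSADBW or shifts, while this sketch keeps the classic multiply for brevity:

#include <cassert>
#include <cstdint>

static unsigned PopCount32(uint32_t V) {
  V = V - ((V >> 1) & 0x55555555u);                 // 2-bit field sums
  V = (V & 0x33333333u) + ((V >> 2) & 0x33333333u); // 4-bit field sums
  V = (V + (V >> 4)) & 0x0F0F0F0Fu;                 // per-byte pop counts
  return (V * 0x01010101u) >> 24;                   // horizontal byte sum
}

int main() {
  assert(PopCount32(0) == 0);
  assert(PopCount32(0xFFFFFFFFu) == 32);
  assert(PopCount32(0x80000001u) == 2);
}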
23233
23234
// Please ensure that any codegen change from LowerVectorCTPOP is reflected in
23235
// updated cost models in X86TTIImpl::getIntrinsicInstrCost.
23236
static SDValue LowerVectorCTPOP(SDValue Op, const X86Subtarget &Subtarget,
23237
270
                                SelectionDAG &DAG) {
23238
270
  MVT VT = Op.getSimpleValueType();
23239
270
  assert((VT.is512BitVector() || VT.is256BitVector() || VT.is128BitVector()) &&
23240
270
         "Unknown CTPOP type to handle");
23241
270
  SDLoc DL(Op.getNode());
23242
270
  SDValue Op0 = Op.getOperand(0);
23243
270
23244
270
  // TRUNC(CTPOP(ZEXT(X))) to make use of vXi32/vXi64 VPOPCNT instructions.
23245
270
  if (Subtarget.hasVPOPCNTDQ()) {
23246
26
    if (VT == MVT::v8i16) {
23247
3
      Op = DAG.getNode(X86ISD::VZEXT, DL, MVT::v8i64, Op0);
23248
3
      Op = DAG.getNode(ISD::CTPOP, DL, MVT::v8i64, Op);
23249
3
      return DAG.getNode(X86ISD::VTRUNC, DL, VT, Op);
23250
3
    }
23251
23
    if (VT == MVT::v16i8 || VT == MVT::v16i16) {
23252
12
      Op = DAG.getNode(X86ISD::VZEXT, DL, MVT::v16i32, Op0);
23253
12
      Op = DAG.getNode(ISD::CTPOP, DL, MVT::v16i32, Op);
23254
12
      return DAG.getNode(X86ISD::VTRUNC, DL, VT, Op);
23255
12
    }
23256
255
  }
23257
255
23258
255
  if (!Subtarget.hasSSSE3()) {
23259
46
    // We can't use the fast LUT approach, so fall back on vectorized bitmath.
23260
46
    assert(VT.is128BitVector() && "Only 128-bit vectors supported in SSE!");
23261
46
    return LowerVectorCTPOPBitmath(Op0, DL, Subtarget, DAG);
23262
46
  }
23263
209
23264
209
  // Decompose 256-bit ops into smaller 128-bit ops.
23265
209
  if (VT.is256BitVector() && !Subtarget.hasInt256())
23266
12
    return Lower256IntUnary(Op, DAG);
23267
197
23268
197
  // Decompose 512-bit ops into smaller 256-bit ops.
23269
197
  if (VT.is512BitVector() && !Subtarget.hasBWI())
23270
8
    return Lower512IntUnary(Op, DAG);
23271
189
23272
189
  return LowerVectorCTPOPInRegLUT(Op0, DL, Subtarget, DAG);
23273
189
}
23274
23275
static SDValue LowerCTPOP(SDValue Op, const X86Subtarget &Subtarget,
23276
270
                          SelectionDAG &DAG) {
23277
270
  assert(Op.getSimpleValueType().isVector() &&
23278
270
         "We only do custom lowering for vector population count.");
23279
270
  return LowerVectorCTPOP(Op, Subtarget, DAG);
23280
270
}
23281
23282
96
static SDValue LowerBITREVERSE_XOP(SDValue Op, SelectionDAG &DAG) {
23283
96
  MVT VT = Op.getSimpleValueType();
23284
96
  SDValue In = Op.getOperand(0);
23285
96
  SDLoc DL(Op);
23286
96
23287
96
  // For scalars, it's still beneficial to transfer to/from the SIMD unit to
23288
96
  // perform the BITREVERSE.
23289
96
  if (!VT.isVector()) {
23290
8
    MVT VecVT = MVT::getVectorVT(VT, 128 / VT.getSizeInBits());
23291
8
    SDValue Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, In);
23292
8
    Res = DAG.getNode(ISD::BITREVERSE, DL, VecVT, Res);
23293
8
    return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Res,
23294
8
                       DAG.getIntPtrConstant(0, DL));
23295
8
  }
23296
88
23297
88
  int NumElts = VT.getVectorNumElements();
23298
88
  int ScalarSizeInBytes = VT.getScalarSizeInBits() / 8;
23299
88
23300
88
  // Decompose 256-bit ops into smaller 128-bit ops.
23301
88
  if (VT.is256BitVector())
23302
24
    return Lower256IntUnary(Op, DAG);
23303
64
23304
88
  assert(VT.is128BitVector() &&
23305
64
         "Only 128-bit vector bitreverse lowering supported.");
23306
64
23307
64
  // VPPERM reverses the bits of a byte with the permute Op (2 << 5), and we
23308
64
  // perform the BSWAP in the shuffle.
23309
64
  // It's best to shuffle using the second operand as this will implicitly allow
23310
64
  // memory folding for multiple vectors.
23311
64
  SmallVector<SDValue, 16> MaskElts;
23312
544
  for (int i = 0; i != NumElts; ++i) {
23313
1.50k
    for (int j = ScalarSizeInBytes - 1; j >= 0; --j) {
23314
1.02k
      int SourceByte = 16 + (i * ScalarSizeInBytes) + j;
23315
1.02k
      int PermuteByte = SourceByte | (2 << 5);
23316
1.02k
      MaskElts.push_back(DAG.getConstant(PermuteByte, DL, MVT::i8));
23317
1.02k
    }
23318
480
  }
23319
96
23320
96
  SDValue Mask = DAG.getBuildVector(MVT::v16i8, DL, MaskElts);
23321
96
  SDValue Res = DAG.getBitcast(MVT::v16i8, In);
23322
96
  Res = DAG.getNode(X86ISD::VPPERM, DL, MVT::v16i8, DAG.getUNDEF(MVT::v16i8),
23323
96
                    Res, Mask);
23324
96
  return DAG.getBitcast(VT, Res);
23325
96
}
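For orientation, the control bytes built in the loop above can be restated as a small host-side sketch (a hypothetical helper shown only to make the byte layout explicit): each element's bytes are visited in reverse order (the implicit BSWAP), taken from the second source (offset 16), and tagged with the permute op (2 << 5) that bit-reverses the selected byte.

#include <cstdint>
#include <vector>

// Sketch of the VPPERM control-byte construction used in the lowering above.
static std::vector<uint8_t> BuildBitReverseMask(int NumElts, int ScalarSizeInBytes) {
  std::vector<uint8_t> MaskElts;
  for (int i = 0; i != NumElts; ++i)
    for (int j = ScalarSizeInBytes - 1; j >= 0; --j) {
      int SourceByte = 16 + (i * ScalarSizeInBytes) + j; // byte from the second source
      MaskElts.push_back(uint8_t(SourceByte | (2 << 5)));// op 2 = reverse bits of byte
    }
  return MaskElts;
}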
23326
23327
static SDValue LowerBITREVERSE(SDValue Op, const X86Subtarget &Subtarget,
23328
204
                               SelectionDAG &DAG) {
23329
204
  if (Subtarget.hasXOP())
23330
96
    return LowerBITREVERSE_XOP(Op, DAG);
23331
108
23332
204
  assert(Subtarget.hasSSSE3() && "SSSE3 required for BITREVERSE");
23333
108
23334
108
  MVT VT = Op.getSimpleValueType();
23335
108
  SDValue In = Op.getOperand(0);
23336
108
  SDLoc DL(Op);
23337
108
23338
108
  unsigned NumElts = VT.getVectorNumElements();
23339
108
  assert(VT.getScalarType() == MVT::i8 &&
23340
108
         "Only byte vector BITREVERSE supported");
23341
108
23342
108
  // Decompose 256-bit ops into smaller 128-bit ops on pre-AVX2.
23343
108
  if (VT.is256BitVector() && !Subtarget.hasInt256())
23344
12
    return Lower256IntUnary(Op, DAG);
23345
96
23346
96
  // Perform BITREVERSE using PSHUFB lookups. Each byte is split into
23347
96
  // two nibbles and a PSHUFB lookup to find the bitreverse of each
23348
96
  // 0-15 value (moved to the other nibble).
23349
96
  SDValue NibbleMask = DAG.getConstant(0xF, DL, VT);
23350
96
  SDValue Lo = DAG.getNode(ISD::AND, DL, VT, In, NibbleMask);
23351
96
  SDValue Hi = DAG.getNode(ISD::SRL, DL, VT, In, DAG.getConstant(4, DL, VT));
23352
96
23353
96
  const int LoLUT[16] = {
23354
96
      /* 0 */ 0x00, /* 1 */ 0x80, /* 2 */ 0x40, /* 3 */ 0xC0,
23355
96
      /* 4 */ 0x20, /* 5 */ 0xA0, /* 6 */ 0x60, /* 7 */ 0xE0,
23356
96
      /* 8 */ 0x10, /* 9 */ 0x90, /* a */ 0x50, /* b */ 0xD0,
23357
96
      /* c */ 0x30, /* d */ 0xB0, /* e */ 0x70, /* f */ 0xF0};
23358
96
  const int HiLUT[16] = {
23359
96
      /* 0 */ 0x00, /* 1 */ 0x08, /* 2 */ 0x04, /* 3 */ 0x0C,
23360
96
      /* 4 */ 0x02, /* 5 */ 0x0A, /* 6 */ 0x06, /* 7 */ 0x0E,
23361
96
      /* 8 */ 0x01, /* 9 */ 0x09, /* a */ 0x05, /* b */ 0x0D,
23362
96
      /* c */ 0x03, /* d */ 0x0B, /* e */ 0x07, /* f */ 0x0F};
23363
96
23364
96
  SmallVector<SDValue, 16> LoMaskElts, HiMaskElts;
23365
2.20k
  for (unsigned i = 0; i < NumElts; ++i) {
23366
2.11k
    LoMaskElts.push_back(DAG.getConstant(LoLUT[i % 16], DL, MVT::i8));
23367
2.11k
    HiMaskElts.push_back(DAG.getConstant(HiLUT[i % 16], DL, MVT::i8));
23368
2.11k
  }
23369
204
23370
204
  SDValue LoMask = DAG.getBuildVector(VT, DL, LoMaskElts);
23371
204
  SDValue HiMask = DAG.getBuildVector(VT, DL, HiMaskElts);
23372
204
  Lo = DAG.getNode(X86ISD::PSHUFB, DL, VT, LoMask, Lo);
23373
204
  Hi = DAG.getNode(X86ISD::PSHUFB, DL, VT, HiMask, Hi);
23374
204
  return DAG.getNode(ISD::OR, DL, VT, Lo, Hi);
23375
204
}
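The PSHUFB-based path above is the vector form of a simple per-nibble table lookup. A minimal scalar sketch of the same idea (illustrative only, not taken from the source):

#include <cstdint>

// Reverse the bits of one byte the way the PSHUFB LUTs do: look up the
// bit-reversed low and high nibbles separately, then OR the two halves.
static uint8_t ReverseByte(uint8_t B) {
  static const uint8_t RevNibble[16] = {0x0, 0x8, 0x4, 0xC, 0x2, 0xA, 0x6, 0xE,
                                        0x1, 0x9, 0x5, 0xD, 0x3, 0xB, 0x7, 0xF};
  uint8_t Lo = B & 0xF;  // ends up bit-reversed in the high nibble of the result
  uint8_t Hi = B >> 4;   // ends up bit-reversed in the low nibble of the result
  return uint8_t((RevNibble[Lo] << 4) | RevNibble[Hi]);
}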
23376
23377
252
static SDValue lowerAtomicArithWithLOCK(SDValue N, SelectionDAG &DAG) {
23378
252
  unsigned NewOpc = 0;
23379
252
  switch (N->getOpcode()) {
23380
84
  case ISD::ATOMIC_LOAD_ADD:
23381
84
    NewOpc = X86ISD::LADD;
23382
84
    break;
23383
70
  case ISD::ATOMIC_LOAD_SUB:
23384
70
    NewOpc = X86ISD::LSUB;
23385
70
    break;
23386
34
  case ISD::ATOMIC_LOAD_OR:
23387
34
    NewOpc = X86ISD::LOR;
23388
34
    break;
23389
32
  case ISD::ATOMIC_LOAD_XOR:
23390
32
    NewOpc = X86ISD::LXOR;
23391
32
    break;
23392
32
  case ISD::ATOMIC_LOAD_AND:
23393
32
    NewOpc = X86ISD::LAND;
23394
32
    break;
23395
0
  default:
23396
0
    llvm_unreachable("Unknown ATOMIC_LOAD_ opcode");
23397
252
  }
23398
252
23399
252
  MachineMemOperand *MMO = cast<MemSDNode>(N)->getMemOperand();
23400
252
  return DAG.getMemIntrinsicNode(
23401
252
      NewOpc, SDLoc(N), DAG.getVTList(MVT::i32, MVT::Other),
23402
252
      {N->getOperand(0), N->getOperand(1), N->getOperand(2)},
23403
252
      /*MemVT=*/N->getSimpleValueType(0), MMO);
23404
252
}
23405
23406
/// Lower atomic_load_ops into LOCK-prefixed operations.
23407
static SDValue lowerAtomicArith(SDValue N, SelectionDAG &DAG,
23408
1.53k
                                const X86Subtarget &Subtarget) {
23409
1.53k
  SDValue Chain = N->getOperand(0);
23410
1.53k
  SDValue LHS = N->getOperand(1);
23411
1.53k
  SDValue RHS = N->getOperand(2);
23412
1.53k
  unsigned Opc = N->getOpcode();
23413
1.53k
  MVT VT = N->getSimpleValueType(0);
23414
1.53k
  SDLoc DL(N);
23415
1.53k
23416
1.53k
  // We can lower atomic_load_add into LXADD. However, any other atomicrmw op
23417
1.53k
  // can only be lowered when the result is unused.  They should have already
23418
1.53k
  // been transformed into a cmpxchg loop in AtomicExpand.
23419
1.53k
  if (N->hasAnyUseOfValue(0)) {
23420
1.29k
    // Handle (atomic_load_sub p, v) as (atomic_load_add p, -v), to be able to
23421
1.29k
    // select LXADD if LOCK_SUB can't be selected.
23422
1.29k
    if (Opc == ISD::ATOMIC_LOAD_SUB) {
23423
232
      AtomicSDNode *AN = cast<AtomicSDNode>(N.getNode());
23424
232
      RHS = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), RHS);
23425
232
      return DAG.getAtomic(ISD::ATOMIC_LOAD_ADD, DL, VT, Chain, LHS,
23426
232
                           RHS, AN->getMemOperand());
23427
232
    }
23428
0
    assert(Opc == ISD::ATOMIC_LOAD_ADD &&
23429
1.06k
           "Used AtomicRMW ops other than Add should have been expanded!");
23430
1.06k
    return N;
23431
1.06k
  }
23432
242
23433
242
  SDValue LockOp = lowerAtomicArithWithLOCK(N, DAG);
23434
242
  // RAUW the chain, but don't worry about the result, as it's unused.
23435
242
  assert(!N->hasAnyUseOfValue(0));
23436
242
  DAG.ReplaceAllUsesOfValueWith(N.getValue(1), LockOp.getValue(1));
23437
242
  return SDValue();
23438
242
}
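The sub-as-negated-add rewrite above has a direct scalar analogue. The sketch below (hypothetical helper name, standard <atomic> only) shows the identity the lowering relies on:

#include <atomic>
#include <cstdint>

// fetch_sub(v) expressed as fetch_add(0 - v); unsigned wrap-around yields the
// two's-complement negation, which is what lets an XADD-style instruction be
// selected when a locked SUB form cannot be.
static uint32_t FetchSubViaAdd(std::atomic<uint32_t> &Addr, uint32_t V) {
  return Addr.fetch_add(0u - V);
}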
23439
23440
815
static SDValue LowerATOMIC_STORE(SDValue Op, SelectionDAG &DAG) {
23441
815
  SDNode *Node = Op.getNode();
23442
815
  SDLoc dl(Node);
23443
815
  EVT VT = cast<AtomicSDNode>(Node)->getMemoryVT();
23444
815
23445
815
  // Convert seq_cst store -> xchg
23446
815
  // Convert wide store -> swap (-> cmpxchg8b/cmpxchg16b)
23447
815
  // FIXME: On 32-bit, store -> fist or movq would be more efficient
23448
815
  //        (The only way to get a 16-byte store is cmpxchg16b)
23449
815
  // FIXME: 16-byte ATOMIC_SWAP isn't actually hooked up at the moment.
23450
815
  if (cast<AtomicSDNode>(Node)->getOrdering() ==
23451
815
          AtomicOrdering::SequentiallyConsistent ||
23452
815
      !DAG.getTargetLoweringInfo().isTypeLegal(VT)) {
23453
125
    SDValue Swap = DAG.getAtomic(ISD::ATOMIC_SWAP, dl,
23454
125
                                 cast<AtomicSDNode>(Node)->getMemoryVT(),
23455
125
                                 Node->getOperand(0),
23456
125
                                 Node->getOperand(1), Node->getOperand(2),
23457
125
                                 cast<AtomicSDNode>(Node)->getMemOperand());
23458
125
    return Swap.getValue(1);
23459
125
  }
23460
690
  // Other atomic stores have a simple pattern.
23461
690
  return Op;
23462
690
}
23463
23464
2.54k
static SDValue LowerADDSUBCARRY(SDValue Op, SelectionDAG &DAG) {
23465
2.54k
  SDNode *N = Op.getNode();
23466
2.54k
  MVT VT = N->getSimpleValueType(0);
23467
2.54k
23468
2.54k
  // Let legalize expand this if it isn't a legal type yet.
23469
2.54k
  if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
23470
0
    return SDValue();
23471
2.54k
23472
2.54k
  SDVTList VTs = DAG.getVTList(VT, MVT::i32);
23473
2.54k
  SDLoc DL(N);
23474
2.54k
23475
2.54k
  // Set the carry flag.
23476
2.54k
  SDValue Carry = Op.getOperand(2);
23477
2.54k
  EVT CarryVT = Carry.getValueType();
23478
2.54k
  APInt NegOne = APInt::getAllOnesValue(CarryVT.getScalarSizeInBits());
23479
2.54k
  Carry = DAG.getNode(X86ISD::ADD, DL, DAG.getVTList(CarryVT, MVT::i32),
23480
2.54k
                      Carry, DAG.getConstant(NegOne, DL, CarryVT));
23481
2.54k
23482
2.54k
  unsigned Opc = Op.getOpcode() == ISD::ADDCARRY ? X86ISD::ADC : X86ISD::SBB;
23483
2.54k
  SDValue Sum = DAG.getNode(Opc, DL, VTs, Op.getOperand(0),
23484
2.54k
                            Op.getOperand(1), Carry.getValue(1));
23485
2.54k
23486
2.54k
  SDValue SetCC = getSETCC(X86::COND_B, Sum.getValue(1), DL, DAG);
23487
2.54k
  if (N->getValueType(1) == MVT::i1)
23488
0
    SetCC = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, SetCC);
23489
2.54k
23490
2.54k
  return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Sum, SetCC);
23491
2.54k
}
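As a plain-integer model of what ADDCARRY computes (a sketch for orientation, independent of the DAG node types): the lowering first materializes the incoming carry in EFLAGS by adding it to an all-ones constant, then emits ADC, which performs exactly the computation below.

#include <cstdint>

// Model of ISD::ADDCARRY: word + word + incoming carry, with an outgoing carry.
static uint64_t AddCarry(uint64_t A, uint64_t B, bool CarryIn, bool &CarryOut) {
  uint64_t Partial = A + B;                    // may wrap: first carry source
  uint64_t Sum = Partial + (CarryIn ? 1 : 0);  // may wrap: second carry source
  CarryOut = (Partial < A) || (Sum < Partial);
  return Sum;
}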
23492
23493
static SDValue LowerFSINCOS(SDValue Op, const X86Subtarget &Subtarget,
23494
2
                            SelectionDAG &DAG) {
23495
2
  assert(Subtarget.isTargetDarwin() && Subtarget.is64Bit());
23496
2
23497
2
  // For MacOSX, we want to call an alternative entry point: __sincos_stret,
23498
2
  // which returns the values as { float, float } (in XMM0) or
23499
2
  // { double, double } (which is returned in XMM0, XMM1).
23500
2
  SDLoc dl(Op);
23501
2
  SDValue Arg = Op.getOperand(0);
23502
2
  EVT ArgVT = Arg.getValueType();
23503
2
  Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
23504
2
23505
2
  TargetLowering::ArgListTy Args;
23506
2
  TargetLowering::ArgListEntry Entry;
23507
2
23508
2
  Entry.Node = Arg;
23509
2
  Entry.Ty = ArgTy;
23510
2
  Entry.IsSExt = false;
23511
2
  Entry.IsZExt = false;
23512
2
  Args.push_back(Entry);
23513
2
23514
2
  bool isF64 = ArgVT == MVT::f64;
23515
2
  // Only optimize x86_64 for now. i386 is a bit messy. For f32,
23516
2
  // the small struct {f32, f32} is returned in (eax, edx). For f64,
23517
2
  // the results are returned via SRet in memory.
23518
2
  const char *LibcallName = isF64 ? "__sincos_stret" : "__sincosf_stret";
23519
2
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
23520
2
  SDValue Callee =
23521
2
      DAG.getExternalSymbol(LibcallName, TLI.getPointerTy(DAG.getDataLayout()));
23522
2
23523
1
  Type *RetTy = isF64 ? (Type *)StructType::get(ArgTy, ArgTy)
23524
1
                      : (Type *)VectorType::get(ArgTy, 4);
23525
2
23526
2
  TargetLowering::CallLoweringInfo CLI(DAG);
23527
2
  CLI.setDebugLoc(dl)
23528
2
      .setChain(DAG.getEntryNode())
23529
2
      .setLibCallee(CallingConv::C, RetTy, Callee, std::move(Args));
23530
2
23531
2
  std::pair<SDValue, SDValue> CallResult = TLI.LowerCallTo(CLI);
23532
2
23533
2
  if (isF64)
23534
2
    // Returned in xmm0 and xmm1.
23535
1
    return CallResult.first;
23536
1
23537
1
  // Returned in bits 0:31 and 32:64 xmm0.
23538
1
  SDValue SinVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ArgVT,
23539
1
                               CallResult.first, DAG.getIntPtrConstant(0, dl));
23540
1
  SDValue CosVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ArgVT,
23541
1
                               CallResult.first, DAG.getIntPtrConstant(1, dl));
23542
1
  SDVTList Tys = DAG.getVTList(ArgVT, ArgVT);
23543
1
  return DAG.getNode(ISD::MERGE_VALUES, dl, Tys, SinVal, CosVal);
23544
1
}
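Functionally, the fused call computes a sine/cosine pair for one argument. A portable sketch of that contract (using the standard libm functions rather than the Darwin-specific __sincos_stret entry points, purely for illustration):

#include <cmath>
#include <utility>

// Illustration of the value the lowering produces; the real code emits one call
// to __sincos_stret / __sincosf_stret and unpacks the {sin, cos} result from
// XMM0 (and XMM1 for the f64 variant).
static std::pair<float, float> SinCosF(float X) {
  return {std::sin(X), std::cos(X)};
}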
23545
23546
/// Widen a vector input to a vector of NVT.  The
23547
/// input vector must have the same element type as NVT.
23548
static SDValue ExtendToType(SDValue InOp, MVT NVT, SelectionDAG &DAG,
23549
171
                            bool FillWithZeroes = false) {
23550
171
  // Check if InOp already has the right width.
23551
171
  MVT InVT = InOp.getSimpleValueType();
23552
171
  if (InVT == NVT)
23553
0
    return InOp;
23554
171
23555
171
  if (InOp.isUndef())
23556
10
    return DAG.getUNDEF(NVT);
23557
161
23558
171
  assert(InVT.getVectorElementType() == NVT.getVectorElementType() &&
23559
161
         "input and widen element type must match");
23560
161
23561
161
  unsigned InNumElts = InVT.getVectorNumElements();
23562
161
  unsigned WidenNumElts = NVT.getVectorNumElements();
23563
161
  assert(WidenNumElts > InNumElts && WidenNumElts % InNumElts == 0 &&
23564
161
         "Unexpected request for vector widening");
23565
161
23566
161
  SDLoc dl(InOp);
23567
161
  if (InOp.getOpcode() == ISD::CONCAT_VECTORS &&
23568
161
      InOp.getNumOperands() == 2) {
23569
3
    SDValue N1 = InOp.getOperand(1);
23570
3
    if ((ISD::isBuildVectorAllZeros(N1.getNode()) && FillWithZeroes) ||
23571
3
        N1.isUndef()) {
23572
3
      InOp = InOp.getOperand(0);
23573
3
      InVT = InOp.getSimpleValueType();
23574
3
      InNumElts = InVT.getVectorNumElements();
23575
3
    }
23576
3
  }
23577
161
  if (ISD::isBuildVectorOfConstantSDNodes(InOp.getNode()) ||
23578
161
      ISD::isBuildVectorOfConstantFPSDNodes(InOp.getNode())) {
23579
20
    SmallVector<SDValue, 16> Ops;
23580
134
    for (unsigned i = 0; i < InNumElts; ++i)
23581
114
      Ops.push_back(InOp.getOperand(i));
23582
20
23583
20
    EVT EltVT = InOp.getOperand(0).getValueType();
23584
20
23585
13
    SDValue FillVal = FillWithZeroes ? DAG.getConstant(0, dl, EltVT) :
23586
7
      DAG.getUNDEF(EltVT);
23587
158
    for (unsigned i = 0; i < WidenNumElts - InNumElts; ++i)
23588
138
      Ops.push_back(FillVal);
23589
20
    return DAG.getBuildVector(NVT, dl, Ops);
23590
20
  }
23591
141
  SDValue FillVal = FillWithZeroes ? DAG.getConstant(0, dl, NVT) :
23592
89
    DAG.getUNDEF(NVT);
23593
171
  return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, NVT, FillVal,
23594
171
                     InOp, DAG.getIntPtrConstant(0, dl));
23595
171
}
23596
23597
static SDValue LowerMSCATTER(SDValue Op, const X86Subtarget &Subtarget,
23598
156
                             SelectionDAG &DAG) {
23599
156
  assert(Subtarget.hasAVX512() &&
23600
156
         "MGATHER/MSCATTER are supported on AVX-512 arch only");
23601
156
23602
156
  // X86 scatter kills mask register, so its type should be added to
23603
156
  // the list of return values.
23604
156
  // If the "scatter" has 2 return values, it is already handled.
23605
156
  if (Op.getNode()->getNumValues() == 2)
23606
78
    return Op;
23607
78
23608
78
  MaskedScatterSDNode *N = cast<MaskedScatterSDNode>(Op.getNode());
23609
78
  SDValue Src = N->getValue();
23610
78
  MVT VT = Src.getSimpleValueType();
23611
78
  assert(VT.getScalarSizeInBits() >= 32 && "Unsupported scatter op");
23612
78
  SDLoc dl(Op);
23613
78
23614
78
  SDValue NewScatter;
23615
78
  SDValue Index = N->getIndex();
23616
78
  SDValue Mask = N->getMask();
23617
78
  SDValue Chain = N->getChain();
23618
78
  SDValue BasePtr = N->getBasePtr();
23619
78
  MVT MemVT = N->getMemoryVT().getSimpleVT();
23620
78
  MVT IndexVT = Index.getSimpleValueType();
23621
78
  MVT MaskVT = Mask.getSimpleValueType();
23622
78
23623
78
  if (MemVT.getScalarSizeInBits() < VT.getScalarSizeInBits()) {
23624
10
    // The v2i32 value was promoted to v2i64.
23625
10
    // Now we "redo" the type legalizer's work and widen the original
23626
10
    // v2i32 value to v4i32. The original v2i32 is retrieved from v2i64
23627
10
    // with a shuffle.
23628
10
    assert((MemVT == MVT::v2i32 && VT == MVT::v2i64) &&
23629
10
           "Unexpected memory type");
23630
10
    int ShuffleMask[] = {0, 2, -1, -1};
23631
10
    Src = DAG.getVectorShuffle(MVT::v4i32, dl, DAG.getBitcast(MVT::v4i32, Src),
23632
10
                               DAG.getUNDEF(MVT::v4i32), ShuffleMask);
23633
10
    // Now we have 4 elements instead of 2.
23634
10
    // Expand the index.
23635
10
    MVT NewIndexVT = MVT::getVectorVT(IndexVT.getScalarType(), 4);
23636
10
    Index = ExtendToType(Index, NewIndexVT, DAG);
23637
10
23638
10
    // Expand the mask with zeroes
23639
10
    // Mask may be <2 x i64> or <2 x i1> at this moment
23640
10
    assert((MaskVT == MVT::v2i1 || MaskVT == MVT::v2i64) &&
23641
10
           "Unexpected mask type");
23642
10
    MVT ExtMaskVT = MVT::getVectorVT(MaskVT.getScalarType(), 4);
23643
10
    Mask = ExtendToType(Mask, ExtMaskVT, DAG, true);
23644
10
    VT = MVT::v4i32;
23645
10
  }
23646
78
23647
78
  unsigned NumElts = VT.getVectorNumElements();
23648
78
  if (!Subtarget.hasVLX() && !VT.is512BitVector() &&
23649
78
      !Index.getSimpleValueType().is512BitVector()) {
23650
11
    // AVX512F supports only 512-bit vectors. Either the data or the index should
23651
11
    // be 512 bits wide. If both the index and the data are 256-bit here, but
23652
11
    // the vector contains 8 elements, we just sign-extend the index
23653
11
    if (IndexVT == MVT::v8i32)
23654
11
      // Just extend index
23655
1
      Index = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i64, Index);
23656
10
    else {
23657
10
      // The minimal number of elts in scatter is 8
23658
10
      NumElts = 8;
23659
10
      // Index
23660
10
      MVT NewIndexVT = MVT::getVectorVT(IndexVT.getScalarType(), NumElts);
23661
10
      // Use original index here, do not modify the index twice
23662
10
      Index = ExtendToType(N->getIndex(), NewIndexVT, DAG);
23663
10
      if (IndexVT.getScalarType() == MVT::i32)
23664
2
        Index = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i64, Index);
23665
10
23666
10
      // Mask
23667
10
      // At this point we have promoted mask operand
23668
10
      assert(MaskVT.getScalarSizeInBits() >= 32 && "unexpected mask type");
23669
10
      MVT ExtMaskVT = MVT::getVectorVT(MaskVT.getScalarType(), NumElts);
23670
10
      // Use the original mask here, do not modify the mask twice
23671
10
      Mask = ExtendToType(N->getMask(), ExtMaskVT, DAG, true);
23672
10
23673
10
      // The value that should be stored
23674
10
      MVT NewVT = MVT::getVectorVT(VT.getScalarType(), NumElts);
23675
10
      Src = ExtendToType(Src, NewVT, DAG);
23676
10
    }
23677
11
  }
23678
156
  // If the mask is "wide" at this point - truncate it to i1 vector
23679
156
  MVT BitMaskVT = MVT::getVectorVT(MVT::i1, NumElts);
23680
156
  Mask = DAG.getNode(ISD::TRUNCATE, dl, BitMaskVT, Mask);
23681
156
23682
156
  // The mask is killed by scatter, add it to the values
23683
156
  SDVTList VTs = DAG.getVTList(BitMaskVT, MVT::Other);
23684
156
  SDValue Ops[] = {Chain, Src, Mask, BasePtr, Index};
23685
156
  NewScatter = DAG.getMaskedScatter(VTs, N->getMemoryVT(), dl, Ops,
23686
156
                                    N->getMemOperand());
23687
156
  DAG.ReplaceAllUsesWith(Op, SDValue(NewScatter.getNode(), 1));
23688
156
  return SDValue(NewScatter.getNode(), 1);
23689
156
}
23690
23691
static SDValue LowerMLOAD(SDValue Op, const X86Subtarget &Subtarget,
23692
43
                          SelectionDAG &DAG) {
23693
43
23694
43
  MaskedLoadSDNode *N = cast<MaskedLoadSDNode>(Op.getNode());
23695
43
  MVT VT = Op.getSimpleValueType();
23696
43
  MVT ScalarVT = VT.getScalarType();
23697
43
  SDValue Mask = N->getMask();
23698
43
  SDLoc dl(Op);
23699
43
23700
43
  assert((!N->isExpandingLoad() || Subtarget.hasAVX512()) &&
23701
43
         "Expanding masked load is supported on AVX-512 target only!");
23702
43
23703
43
  assert((!N->isExpandingLoad() || ScalarVT.getSizeInBits() >= 32) &&
23704
43
         "Expanding masked load is supported for 32 and 64-bit types only!");
23705
43
23706
43
  // 4x32, 4x64 and 2x64 vectors of non-expanding loads are legal regardless of
23707
43
  // VLX. These types for exp-loads are handled here.
23708
43
  if (!N->isExpandingLoad() && VT.getVectorNumElements() <= 4)
23709
30
    return Op;
23710
13
23711
43
  assert(Subtarget.hasAVX512() && !Subtarget.hasVLX() && !VT.is512BitVector() &&
23712
13
         "Cannot lower masked load op.");
23713
13
23714
13
  assert((ScalarVT.getSizeInBits() >= 32 ||
23715
13
          (Subtarget.hasBWI() &&
23716
13
              (ScalarVT == MVT::i8 || ScalarVT == MVT::i16))) &&
23717
13
         "Unsupported masked load op.");
23718
13
23719
13
  // This operation is legal for targets with VLX, but without
23720
13
  // VLX the vector should be widened to 512 bit
23721
13
  unsigned NumEltsInWideVec = 512 / VT.getScalarSizeInBits();
23722
13
  MVT WideDataVT = MVT::getVectorVT(ScalarVT, NumEltsInWideVec);
23723
13
  SDValue Src0 = N->getSrc0();
23724
13
  Src0 = ExtendToType(Src0, WideDataVT, DAG);
23725
13
23726
13
  // Mask element has to be i1.
23727
13
  MVT MaskEltTy = Mask.getSimpleValueType().getScalarType();
23728
13
  assert((MaskEltTy == MVT::i1 || VT.getVectorNumElements() <= 4) &&
23729
13
         "We handle 4x32, 4x64 and 2x64 vectors only in this case");
23730
13
23731
13
  MVT WideMaskVT = MVT::getVectorVT(MaskEltTy, NumEltsInWideVec);
23732
13
23733
13
  Mask = ExtendToType(Mask, WideMaskVT, DAG, true);
23734
13
  if (MaskEltTy != MVT::i1)
23735
3
    Mask = DAG.getNode(ISD::TRUNCATE, dl,
23736
3
                       MVT::getVectorVT(MVT::i1, NumEltsInWideVec), Mask);
23737
43
  SDValue NewLoad = DAG.getMaskedLoad(WideDataVT, dl, N->getChain(),
23738
43
                                      N->getBasePtr(), Mask, Src0,
23739
43
                                      N->getMemoryVT(), N->getMemOperand(),
23740
43
                                      N->getExtensionType(),
23741
43
                                      N->isExpandingLoad());
23742
43
23743
43
  SDValue Exract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT,
23744
43
                               NewLoad.getValue(0),
23745
43
                               DAG.getIntPtrConstant(0, dl));
23746
43
  SDValue RetOps[] = {Exract, NewLoad.getValue(1)};
23747
43
  return DAG.getMergeValues(RetOps, dl);
23748
43
}
23749
23750
static SDValue LowerMSTORE(SDValue Op, const X86Subtarget &Subtarget,
23751
15
                           SelectionDAG &DAG) {
23752
15
  MaskedStoreSDNode *N = cast<MaskedStoreSDNode>(Op.getNode());
23753
15
  SDValue DataToStore = N->getValue();
23754
15
  MVT VT = DataToStore.getSimpleValueType();
23755
15
  MVT ScalarVT = VT.getScalarType();
23756
15
  SDValue Mask = N->getMask();
23757
15
  SDLoc dl(Op);
23758
15
23759
15
  assert((!N->isCompressingStore() || Subtarget.hasAVX512()) &&
23760
15
         "Expanding masked load is supported on AVX-512 target only!");
23761
15
23762
15
  assert((!N->isCompressingStore() || ScalarVT.getSizeInBits() >= 32) &&
23763
15
         "Expanding masked load is supported for 32 and 64-bit types only!");
23764
15
23765
15
  // 4x32 and 2x64 vectors of non-compressing stores are legal regardless of VLX.
23766
15
  if (!N->isCompressingStore() && VT.getVectorNumElements() <= 4)
23767
5
    return Op;
23768
10
23769
15
  assert(Subtarget.hasAVX512() && !Subtarget.hasVLX() && !VT.is512BitVector() &&
23770
10
         "Cannot lower masked store op.");
23771
10
23772
10
  assert((ScalarVT.getSizeInBits() >= 32 ||
23773
10
          (Subtarget.hasBWI() &&
23774
10
              (ScalarVT == MVT::i8 || ScalarVT == MVT::i16))) &&
23775
10
          "Unsupported masked store op.");
23776
10
23777
10
  // This operation is legal for targets with VLX, but without
23778
10
  // VLX the vector should be widened to 512 bit
23779
10
  unsigned NumEltsInWideVec = 512/VT.getScalarSizeInBits();
23780
10
  MVT WideDataVT = MVT::getVectorVT(ScalarVT, NumEltsInWideVec);
23781
10
23782
10
  // Mask element has to be i1.
23783
10
  MVT MaskEltTy = Mask.getSimpleValueType().getScalarType();
23784
10
  assert((MaskEltTy == MVT::i1 || VT.getVectorNumElements() <= 4) &&
23785
10
         "We handle 4x32, 4x64 and 2x64 vectors only in this case");
23786
10
23787
10
  MVT WideMaskVT = MVT::getVectorVT(MaskEltTy, NumEltsInWideVec);
23788
10
23789
10
  DataToStore = ExtendToType(DataToStore, WideDataVT, DAG);
23790
10
  Mask = ExtendToType(Mask, WideMaskVT, DAG, true);
23791
10
  if (MaskEltTy != MVT::i1)
23792
4
    Mask = DAG.getNode(ISD::TRUNCATE, dl,
23793
4
                       MVT::getVectorVT(MVT::i1, NumEltsInWideVec), Mask);
23794
15
  return DAG.getMaskedStore(N->getChain(), dl, DataToStore, N->getBasePtr(),
23795
15
                            Mask, N->getMemoryVT(), N->getMemOperand(),
23796
15
                            N->isTruncatingStore(), N->isCompressingStore());
23797
15
}
23798
23799
static SDValue LowerMGATHER(SDValue Op, const X86Subtarget &Subtarget,
23800
229
                            SelectionDAG &DAG) {
23801
229
  assert(Subtarget.hasAVX512() &&
23802
229
         "MGATHER/MSCATTER are supported on AVX-512 arch only");
23803
229
23804
229
  MaskedGatherSDNode *N = cast<MaskedGatherSDNode>(Op.getNode());
23805
229
  SDLoc dl(Op);
23806
229
  MVT VT = Op.getSimpleValueType();
23807
229
  SDValue Index = N->getIndex();
23808
229
  SDValue Mask = N->getMask();
23809
229
  SDValue Src0 = N->getValue();
23810
229
  MVT IndexVT = Index.getSimpleValueType();
23811
229
  MVT MaskVT = Mask.getSimpleValueType();
23812
229
23813
229
  unsigned NumElts = VT.getVectorNumElements();
23814
229
  assert(VT.getScalarSizeInBits() >= 32 && "Unsupported gather op");
23815
229
23816
229
  if (!Subtarget.hasVLX() && !VT.is512BitVector() &&
23817
229
      !Index.getSimpleValueType().is512BitVector()) {
23818
29
    // AVX512F supports only 512-bit vectors. Either the data or the index should
23819
29
    // be 512 bits wide. If both the index and the data are 256-bit here, but
23820
29
    // the vector contains 8 elements, we just sign-extend the index
23821
29
    if (NumElts == 8) {
23822
7
      Index = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i64, Index);
23823
7
      SDValue Ops[] = { N->getOperand(0), N->getOperand(1),  N->getOperand(2),
23824
7
                        N->getOperand(3), Index };
23825
7
      DAG.UpdateNodeOperands(N, Ops);
23826
7
      return Op;
23827
7
    }
23828
22
23829
22
    // Minimal number of elements in Gather
23830
22
    NumElts = 8;
23831
22
    // Index
23832
22
    MVT NewIndexVT = MVT::getVectorVT(IndexVT.getScalarType(), NumElts);
23833
22
    Index = ExtendToType(Index, NewIndexVT, DAG);
23834
22
    if (IndexVT.getScalarType() == MVT::i32)
23835
9
      Index = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i64, Index);
23836
29
23837
29
    // Mask
23838
29
    MVT MaskBitVT = MVT::getVectorVT(MVT::i1, NumElts);
23839
29
    // At this point we have promoted mask operand
23840
29
    assert(MaskVT.getScalarSizeInBits() >= 32 && "unexpected mask type");
23841
29
    MVT ExtMaskVT = MVT::getVectorVT(MaskVT.getScalarType(), NumElts);
23842
29
    Mask = ExtendToType(Mask, ExtMaskVT, DAG, true);
23843
29
    Mask = DAG.getNode(ISD::TRUNCATE, dl, MaskBitVT, Mask);
23844
29
23845
29
    // The pass-through value
23846
29
    MVT NewVT = MVT::getVectorVT(VT.getScalarType(), NumElts);
23847
29
    Src0 = ExtendToType(Src0, NewVT, DAG);
23848
29
23849
29
    SDValue Ops[] = { N->getChain(), Src0, Mask, N->getBasePtr(), Index };
23850
29
    SDValue NewGather = DAG.getMaskedGather(DAG.getVTList(NewVT, MVT::Other),
23851
29
                                            N->getMemoryVT(), dl, Ops,
23852
29
                                            N->getMemOperand());
23853
29
    SDValue Exract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT,
23854
29
                                 NewGather.getValue(0),
23855
29
                                 DAG.getIntPtrConstant(0, dl));
23856
29
    SDValue RetOps[] = {Exract, NewGather.getValue(1)};
23857
29
    return DAG.getMergeValues(RetOps, dl);
23858
29
  }
23859
200
  if (N->getMemoryVT() == MVT::v2i32 && Subtarget.hasVLX()) {
23860
6
    // There is a special case when the return type v2i32 is illegal and
23861
6
    // the type legalizer extended it to v2i64. Without this conversion we end up
23862
6
    // with VPGATHERQQ (reading q-words from the memory) instead of VPGATHERQD.
23863
6
    // In order to avoid this situation, we'll build an X86 specific Gather node
23864
6
    // with index v2i64 and value type v4i32.
23865
6
    assert(VT == MVT::v2i64 && Src0.getValueType() == MVT::v2i64 &&
23866
6
           "Unexpected type in masked gather");
23867
6
    Src0 = DAG.getVectorShuffle(MVT::v4i32, dl,
23868
6
                                DAG.getBitcast(MVT::v4i32, Src0),
23869
6
                                DAG.getUNDEF(MVT::v4i32), { 0, 2, -1, -1 });
23870
6
    // The mask should match the destination type. Extending mask with zeroes
23871
6
    // is not necessary since instruction itself reads only two values from
23872
6
    // memory.
23873
6
    Mask = ExtendToType(Mask, MVT::v4i1, DAG, false);
23874
6
    SDValue Ops[] = { N->getChain(), Src0, Mask, N->getBasePtr(), Index };
23875
6
    SDValue NewGather = DAG.getTargetMemSDNode<X86MaskedGatherSDNode>(
23876
6
      DAG.getVTList(MVT::v4i32, MVT::Other), Ops, dl, N->getMemoryVT(),
23877
6
      N->getMemOperand());
23878
6
23879
6
    SDValue Sext = getExtendInVec(X86ISD::VSEXT, dl, MVT::v2i64,
23880
6
                                  NewGather.getValue(0), DAG);
23881
6
    SDValue RetOps[] = { Sext, NewGather.getValue(1) };
23882
6
    return DAG.getMergeValues(RetOps, dl);
23883
6
  }
23884
194
  if (N->getMemoryVT() == MVT::v2f32 && Subtarget.hasVLX()) {
23885
15
    // This transformation is for optimization only.
23886
15
    // The type legalizer extended mask and index to a 4-element vector
23887
15
    // in order to match requirements of the common gather node - same
23888
15
    // vector width of index and value. X86 Gather node allows mismatch
23889
15
    // of vector width in order to select more optimal instruction at the
23890
15
    // end.
23891
15
    assert(VT == MVT::v4f32 && Src0.getValueType() == MVT::v4f32 &&
23892
15
           "Unexpected type in masked gather");
23893
15
    if (Mask.getOpcode() == ISD::CONCAT_VECTORS &&
23894
6
        ISD::isBuildVectorAllZeros(Mask.getOperand(1).getNode()) &&
23895
6
        Index.getOpcode() == ISD::CONCAT_VECTORS &&
23896
15
        Index.getOperand(1).isUndef()) {
23897
3
      Mask = ExtendToType(Mask.getOperand(0), MVT::v4i1, DAG, false);
23898
3
      Index = Index.getOperand(0);
23899
3
    } else
23900
12
      return Op;
23901
3
    SDValue Ops[] = { N->getChain(), Src0, Mask, N->getBasePtr(), Index };
23902
3
    SDValue NewGather = DAG.getTargetMemSDNode<X86MaskedGatherSDNode>(
23903
3
      DAG.getVTList(MVT::v4f32, MVT::Other), Ops, dl, N->getMemoryVT(),
23904
3
      N->getMemOperand());
23905
3
23906
3
    SDValue RetOps[] = { NewGather.getValue(0), NewGather.getValue(1) };
23907
3
    return DAG.getMergeValues(RetOps, dl);
23908
3
23909
3
  }
23910
179
  return Op;
23911
179
}
23912
23913
SDValue X86TargetLowering::LowerGC_TRANSITION_START(SDValue Op,
23914
8
                                                    SelectionDAG &DAG) const {
23915
8
  // TODO: Eventually, the lowering of these nodes should be informed by or
23916
8
  // deferred to the GC strategy for the function in which they appear. For
23917
8
  // now, however, they must be lowered to something. Since they are logically
23918
8
  // no-ops in the case of a null GC strategy (or a GC strategy which does not
23919
8
  // require special handling for these nodes), lower them as literal NOOPs for
23920
8
  // the time being.
23921
8
  SmallVector<SDValue, 2> Ops;
23922
8
23923
8
  Ops.push_back(Op.getOperand(0));
23924
8
  if (Op->getGluedNode())
23925
2
    Ops.push_back(Op->getOperand(Op->getNumOperands() - 1));
23926
8
23927
8
  SDLoc OpDL(Op);
23928
8
  SDVTList VTs = DAG.getVTList(MVT::Other, MVT::Glue);
23929
8
  SDValue NOOP(DAG.getMachineNode(X86::NOOP, SDLoc(Op), VTs, Ops), 0);
23930
8
23931
8
  return NOOP;
23932
8
}
23933
23934
SDValue X86TargetLowering::LowerGC_TRANSITION_END(SDValue Op,
23935
8
                                                  SelectionDAG &DAG) const {
23936
8
  // TODO: Eventually, the lowering of these nodes should be informed by or
23937
8
  // deferred to the GC strategy for the function in which they appear. For
23938
8
  // now, however, they must be lowered to something. Since they are logically
23939
8
  // no-ops in the case of a null GC strategy (or a GC strategy which does not
23940
8
  // require special handling for these nodes), lower them as literal NOOPs for
23941
8
  // the time being.
23942
8
  SmallVector<SDValue, 2> Ops;
23943
8
23944
8
  Ops.push_back(Op.getOperand(0));
23945
8
  if (Op->getGluedNode())
23946
8
    Ops.push_back(Op->getOperand(Op->getNumOperands() - 1));
23947
8
23948
8
  SDLoc OpDL(Op);
23949
8
  SDVTList VTs = DAG.getVTList(MVT::Other, MVT::Glue);
23950
8
  SDValue NOOP(DAG.getMachineNode(X86::NOOP, SDLoc(Op), VTs, Ops), 0);
23951
8
23952
8
  return NOOP;
23953
8
}
23954
23955
/// Provide custom lowering hooks for some operations.
23956
333k
SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
23957
333k
  switch (Op.getOpcode()) {
23958
0
  default: llvm_unreachable("Should not custom lower this!");
23959
114
  case ISD::ATOMIC_FENCE:       return LowerATOMIC_FENCE(Op, Subtarget, DAG);
23960
1.47k
  case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS:
23961
1.47k
    return LowerCMP_SWAP(Op, Subtarget, DAG);
23962
270
  case ISD::CTPOP:              return LowerCTPOP(Op, Subtarget, DAG);
23963
1.53k
  case ISD::ATOMIC_LOAD_ADD:
23964
1.53k
  case ISD::ATOMIC_LOAD_SUB:
23965
1.53k
  case ISD::ATOMIC_LOAD_OR:
23966
1.53k
  case ISD::ATOMIC_LOAD_XOR:
23967
1.53k
  case ISD::ATOMIC_LOAD_AND:    return lowerAtomicArith(Op, DAG, Subtarget);
23968
815
  case ISD::ATOMIC_STORE:       return LowerATOMIC_STORE(Op, DAG);
23969
204
  case ISD::BITREVERSE:         return LowerBITREVERSE(Op, Subtarget, DAG);
23970
52.0k
  case ISD::BUILD_VECTOR:       return LowerBUILD_VECTOR(Op, DAG);
23971
4.73k
  case ISD::CONCAT_VECTORS:     return LowerCONCAT_VECTORS(Op, Subtarget, DAG);
23972
48.7k
  case ISD::VECTOR_SHUFFLE:     return lowerVectorShuffle(Op, Subtarget, DAG);
23973
19.7k
  case ISD::VSELECT:            return LowerVSELECT(Op, DAG);
23974
31.4k
  case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG);
23975
11.7k
  case ISD::INSERT_VECTOR_ELT:  return LowerINSERT_VECTOR_ELT(Op, DAG);
23976
0
  case ISD::EXTRACT_SUBVECTOR:  return LowerEXTRACT_SUBVECTOR(Op,Subtarget,DAG);
23977
2.76k
  case ISD::INSERT_SUBVECTOR:   return LowerINSERT_SUBVECTOR(Op, Subtarget,DAG);
23978
7.26k
  case ISD::SCALAR_TO_VECTOR:   return LowerSCALAR_TO_VECTOR(Op, Subtarget,DAG);
23979
16.4k
  case ISD::ConstantPool:       return LowerConstantPool(Op, DAG);
23980
26.6k
  case ISD::GlobalAddress:      return LowerGlobalAddress(Op, DAG);
23981
373
  case ISD::GlobalTLSAddress:   return LowerGlobalTLSAddress(Op, DAG);
23982
83
  case ISD::ExternalSymbol:     return LowerExternalSymbol(Op, DAG);
23983
19
  case ISD::BlockAddress:       return LowerBlockAddress(Op, DAG);
23984
461
  case ISD::SHL_PARTS:
23985
461
  case ISD::SRA_PARTS:
23986
461
  case ISD::SRL_PARTS:          return LowerShiftParts(Op, DAG);
23987
3.08k
  case ISD::SINT_TO_FP:         return LowerSINT_TO_FP(Op, DAG);
23988
1.07k
  case ISD::UINT_TO_FP:         return LowerUINT_TO_FP(Op, DAG);
23989
3.49k
  case ISD::TRUNCATE:           return LowerTRUNCATE(Op, DAG);
23990
574
  case ISD::ZERO_EXTEND:        return LowerZERO_EXTEND(Op, Subtarget, DAG);
23991
3.62k
  case ISD::SIGN_EXTEND:        return LowerSIGN_EXTEND(Op, Subtarget, DAG);
23992
205
  case ISD::ANY_EXTEND:         return LowerANY_EXTEND(Op, Subtarget, DAG);
23993
434
  case ISD::ZERO_EXTEND_VECTOR_INREG:
23994
434
  case ISD::SIGN_EXTEND_VECTOR_INREG:
23995
434
    return LowerEXTEND_VECTOR_INREG(Op, Subtarget, DAG);
23996
1.68k
  case ISD::FP_TO_SINT:
23997
1.68k
  case ISD::FP_TO_UINT:         return LowerFP_TO_INT(Op, DAG);
23998
45
  case ISD::FP_EXTEND:          return LowerFP_EXTEND(Op, DAG);
23999
1.96k
  case ISD::LOAD:               return LowerExtendedLoad(Op, Subtarget, DAG);
24000
529
  case ISD::FABS:
24001
529
  case ISD::FNEG:               return LowerFABSorFNEG(Op, DAG);
24002
403
  case ISD::FCOPYSIGN:          return LowerFCOPYSIGN(Op, DAG);
24003
5
  case ISD::FGETSIGN:           return LowerFGETSIGN(Op, DAG);
24004
9.92k
  case ISD::SETCC:              return LowerSETCC(Op, DAG);
24005
548
  case ISD::SETCCCARRY:         return LowerSETCCCARRY(Op, DAG);
24006
5.44k
  case ISD::SELECT:             return LowerSELECT(Op, DAG);
24007
33.9k
  case ISD::BRCOND:             return LowerBRCOND(Op, DAG);
24008
557
  case ISD::JumpTable:          return LowerJumpTable(Op, DAG);
24009
84
  case ISD::VASTART:            return LowerVASTART(Op, DAG);
24010
3
  case ISD::VAARG:              return LowerVAARG(Op, DAG);
24011
22
  case ISD::VACOPY:             return LowerVACOPY(Op, Subtarget, DAG);
24012
14.0k
  case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, Subtarget, DAG);
24013
1.58k
  case ISD::INTRINSIC_VOID:
24014
1.58k
  case ISD::INTRINSIC_W_CHAIN:  return LowerINTRINSIC_W_CHAIN(Op, Subtarget, DAG);
24015
27
  case ISD::RETURNADDR:         return LowerRETURNADDR(Op, DAG);
24016
6
  case ISD::ADDROFRETURNADDR:   return LowerADDROFRETURNADDR(Op, DAG);
24017
31
  case ISD::FRAMEADDR:          return LowerFRAMEADDR(Op, DAG);
24018
2
  case ISD::FRAME_TO_ARGS_OFFSET:
24019
2
                                return LowerFRAME_TO_ARGS_OFFSET(Op, DAG);
24020
256
  case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG);
24021
6
  case ISD::EH_RETURN:          return LowerEH_RETURN(Op, DAG);
24022
10
  case ISD::EH_SJLJ_SETJMP:     return lowerEH_SJLJ_SETJMP(Op, DAG);
24023
4
  case ISD::EH_SJLJ_LONGJMP:    return lowerEH_SJLJ_LONGJMP(Op, DAG);
24024
2
  case ISD::EH_SJLJ_SETUP_DISPATCH:
24025
2
    return lowerEH_SJLJ_SETUP_DISPATCH(Op, DAG);
24026
2
  case ISD::INIT_TRAMPOLINE:    return LowerINIT_TRAMPOLINE(Op, DAG);
24027
0
  case ISD::ADJUST_TRAMPOLINE:  return LowerADJUST_TRAMPOLINE(Op, DAG);
24028
0
  case ISD::FLT_ROUNDS_:        return LowerFLT_ROUNDS_(Op, DAG);
24029
657
  case ISD::CTLZ:
24030
657
  case ISD::CTLZ_ZERO_UNDEF:    return LowerCTLZ(Op, Subtarget, DAG);
24031
171
  case ISD::CTTZ:
24032
171
  case ISD::CTTZ_ZERO_UNDEF:    return LowerCTTZ(Op, DAG);
24033
413
  case ISD::MUL:                return LowerMUL(Op, Subtarget, DAG);
24034
52
  case ISD::MULHS:
24035
52
  case ISD::MULHU:              return LowerMULH(Op, Subtarget, DAG);
24036
54
  case ISD::UMUL_LOHI:
24037
54
  case ISD::SMUL_LOHI:          return LowerMUL_LOHI(Op, Subtarget, DAG);
24038
472
  case ISD::ROTL:
24039
472
  case ISD::ROTR:               return LowerRotate(Op, Subtarget, DAG);
24040
13.7k
  case ISD::SRA:
24041
13.7k
  case ISD::SRL:
24042
13.7k
  case ISD::SHL:                return LowerShift(Op, Subtarget, DAG);
24043
2.30k
  case ISD::SADDO:
24044
2.30k
  case ISD::UADDO:
24045
2.30k
  case ISD::SSUBO:
24046
2.30k
  case ISD::USUBO:
24047
2.30k
  case ISD::SMULO:
24048
2.30k
  case ISD::UMULO:              return LowerXALUO(Op, DAG);
24049
5
  case ISD::READCYCLECOUNTER:   return LowerREADCYCLECOUNTER(Op, Subtarget,DAG);
24050
521
  case ISD::BITCAST:            return LowerBITCAST(Op, Subtarget, DAG);
24051
2.54k
  case ISD::ADDCARRY:
24052
2.54k
  case ISD::SUBCARRY:           return LowerADDSUBCARRY(Op, DAG);
24053
409
  case ISD::ADD:
24054
409
  case ISD::SUB:                return LowerADD_SUB(Op, DAG);
24055
184
  case ISD::SMAX:
24056
184
  case ISD::SMIN:
24057
184
  case ISD::UMAX:
24058
184
  case ISD::UMIN:               return LowerMINMAX(Op, DAG);
24059
11
  case ISD::ABS:                return LowerABS(Op, DAG);
24060
2
  case ISD::FSINCOS:            return LowerFSINCOS(Op, Subtarget, DAG);
24061
43
  case ISD::MLOAD:              return LowerMLOAD(Op, Subtarget, DAG);
24062
15
  case ISD::MSTORE:             return LowerMSTORE(Op, Subtarget, DAG);
24063
229
  case ISD::MGATHER:            return LowerMGATHER(Op, Subtarget, DAG);
24064
156
  case ISD::MSCATTER:           return LowerMSCATTER(Op, Subtarget, DAG);
24065
8
  case ISD::GC_TRANSITION_START:
24066
8
                                return LowerGC_TRANSITION_START(Op, DAG);
24067
8
  case ISD::GC_TRANSITION_END:  return LowerGC_TRANSITION_END(Op, DAG);
24068
819
  case ISD::STORE:              return LowerTruncatingStore(Op, Subtarget, DAG);
24069
0
  }
24070
0
}
24071
24072
/// Places new result values for the node in Results (their number
24073
/// and types must exactly match those of the original return values of
24074
/// the node), or leaves Results empty, which indicates that the node is not
24075
/// to be custom lowered after all.
24076
void X86TargetLowering::LowerOperationWrapper(SDNode *N,
24077
                                              SmallVectorImpl<SDValue> &Results,
24078
1.15k
                                              SelectionDAG &DAG) const {
24079
1.15k
  SDValue Res = LowerOperation(SDValue(N, 0), DAG);
24080
1.15k
24081
1.15k
  if (!Res.getNode())
24082
362
    return;
24083
789
24084
1.15k
  assert((N->getNumValues() <= Res->getNumValues()) &&
24085
789
      "Lowering returned the wrong number of results!");
24086
789
24087
789
  // Places new result values based on the N result number.
24088
789
  // In some cases (LowerSINT_TO_FP for example) Res has more result values
24089
789
  // than the original node; the chain should be dropped (last value).
24090
1.57k
  for (unsigned I = 0, E = N->getNumValues(); I != E; ++I)
24091
789
    Results.push_back(Res.getValue(I));
24092
1.15k
}
24093
24094
/// Replace a node with an illegal result type with a new node built out of
24095
/// custom code.
24096
void X86TargetLowering::ReplaceNodeResults(SDNode *N,
24097
                                           SmallVectorImpl<SDValue>&Results,
24098
1.46k
                                           SelectionDAG &DAG) const {
24099
1.46k
  SDLoc dl(N);
24100
1.46k
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
24101
1.46k
  switch (N->getOpcode()) {
24102
0
  default:
24103
0
    llvm_unreachable("Do not know how to custom type legalize this operation!");
24104
45
  case X86ISD::AVG: {
24105
45
    // Legalize types for X86ISD::AVG by expanding vectors.
24106
45
    assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
24107
45
24108
45
    auto InVT = N->getValueType(0);
24109
45
    auto InVTSize = InVT.getSizeInBits();
24110
45
    const unsigned RegSize =
24111
45
        (InVTSize > 128) ? 
((InVTSize > 256) ? 0
5120
:
2560
) :
12845
;
24112
45
    assert((Subtarget.hasBWI() || RegSize < 512) &&
24113
45
           "512-bit vector requires AVX512BW");
24114
45
    assert((Subtarget.hasAVX2() || RegSize < 256) &&
24115
45
           "256-bit vector requires AVX2");
24116
45
24117
45
    auto ElemVT = InVT.getVectorElementType();
24118
45
    auto RegVT = EVT::getVectorVT(*DAG.getContext(), ElemVT,
24119
45
                                  RegSize / ElemVT.getSizeInBits());
24120
45
    assert(RegSize % InVT.getSizeInBits() == 0);
24121
45
    unsigned NumConcat = RegSize / InVT.getSizeInBits();
24122
45
24123
45
    SmallVector<SDValue, 16> Ops(NumConcat, DAG.getUNDEF(InVT));
24124
45
    Ops[0] = N->getOperand(0);
24125
45
    SDValue InVec0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, RegVT, Ops);
24126
45
    Ops[0] = N->getOperand(1);
24127
45
    SDValue InVec1 = DAG.getNode(ISD::CONCAT_VECTORS, dl, RegVT, Ops);
24128
45
24129
45
    SDValue Res = DAG.getNode(X86ISD::AVG, dl, RegVT, InVec0, InVec1);
24130
45
    Results.push_back(DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, InVT, Res,
24131
45
                                  DAG.getIntPtrConstant(0, dl)));
24132
45
    return;
24133
1.46k
  }
24134
1.46k
  // We might have generated v2f32 FMIN/FMAX operations. Widen them to v4f32.
24135
4
  case X86ISD::FMINC:
24136
4
  case X86ISD::FMIN:
24137
4
  case X86ISD::FMAXC:
24138
4
  case X86ISD::FMAX: {
24139
4
    EVT VT = N->getValueType(0);
24140
4
    assert(VT == MVT::v2f32 && "Unexpected type (!= v2f32) on FMIN/FMAX.");
24141
4
    SDValue UNDEF = DAG.getUNDEF(VT);
24142
4
    SDValue LHS = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32,
24143
4
                              N->getOperand(0), UNDEF);
24144
4
    SDValue RHS = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32,
24145
4
                              N->getOperand(1), UNDEF);
24146
4
    Results.push_back(DAG.getNode(N->getOpcode(), dl, MVT::v4f32, LHS, RHS));
24147
4
    return;
24148
4
  }
24149
3
  case ISD::SDIV:
24150
3
  case ISD::UDIV:
24151
3
  case ISD::SREM:
24152
3
  case ISD::UREM:
24153
3
  case ISD::SDIVREM:
24154
3
  case ISD::UDIVREM: {
24155
3
    SDValue V = LowerWin64_i128OP(SDValue(N,0), DAG);
24156
3
    Results.push_back(V);
24157
3
    return;
24158
3
  }
24159
321
  case ISD::FP_TO_SINT:
24160
321
  case ISD::FP_TO_UINT: {
24161
321
    bool IsSigned = N->getOpcode() == ISD::FP_TO_SINT;
24162
321
24163
321
    if (N->getValueType(0) == MVT::v2i32) {
24164
91
      assert((IsSigned || Subtarget.hasAVX512()) &&
24165
91
             "Can only handle signed conversion without AVX512");
24166
91
      assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
24167
91
      SDValue Src = N->getOperand(0);
24168
91
      if (Src.getValueType() == MVT::v2f64) {
24169
53
        SDValue Idx = DAG.getIntPtrConstant(0, dl);
24170
45
        SDValue Res = DAG.getNode(IsSigned ? X86ISD::CVTTP2SI
24171
8
                                           : X86ISD::CVTTP2UI,
24172
53
                                  dl, MVT::v4i32, Src);
24173
53
        Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i32, Res, Idx);
24174
53
        Results.push_back(Res);
24175
53
        return;
24176
53
      }
24177
38
      if (Src.getValueType() == MVT::v2f32) {
24178
14
        SDValue Idx = DAG.getIntPtrConstant(0, dl);
24179
14
        SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src,
24180
14
                                  DAG.getUNDEF(MVT::v2f32));
24181
10
        Res = DAG.getNode(IsSigned ? ISD::FP_TO_SINT
24182
4
                                   : ISD::FP_TO_UINT, dl, MVT::v4i32, Res);
24183
14
        Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i32, Res, Idx);
24184
14
        Results.push_back(Res);
24185
14
        return;
24186
14
      }
24187
24
24188
24
      // The FP_TO_INTHelper below only handles f32/f64/f80 scalar inputs,
24189
24
      // so early out here.
24190
24
      return;
24191
24
    }
24192
230
24193
230
    std::pair<SDValue,SDValue> Vals =
24194
230
        FP_TO_INTHelper(SDValue(N, 0), DAG, IsSigned, /*IsReplace=*/ true);
24195
230
    SDValue FIST = Vals.first, StackSlot = Vals.second;
24196
230
    if (FIST.getNode()) {
24197
208
      EVT VT = N->getValueType(0);
24198
208
      // Return a load from the stack slot.
24199
208
      if (StackSlot.getNode())
24200
98
        Results.push_back(
24201
98
            DAG.getLoad(VT, dl, FIST, StackSlot, MachinePointerInfo()));
24202
208
      else
24203
110
        Results.push_back(FIST);
24204
208
    }
24205
230
    return;
24206
230
  }
24207
6
  case ISD::SINT_TO_FP: {
24208
6
    assert(Subtarget.hasDQI() && Subtarget.hasVLX() && "Requires AVX512DQVL!");
24209
6
    SDValue Src = N->getOperand(0);
24210
6
    if (N->getValueType(0) != MVT::v2f32 || Src.getValueType() != MVT::v2i64)
24211
2
      return;
24212
4
    Results.push_back(DAG.getNode(X86ISD::CVTSI2P, dl, MVT::v4f32, Src));
24213
4
    return;
24214
4
  }
24215
47
  case ISD::UINT_TO_FP: {
24216
47
    assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
24217
47
    EVT VT = N->getValueType(0);
24218
47
    if (VT != MVT::v2f32)
24219
0
      return;
24220
47
    SDValue Src = N->getOperand(0);
24221
47
    EVT SrcVT = Src.getValueType();
24222
47
    if (Subtarget.hasDQI() && Subtarget.hasVLX() && SrcVT == MVT::v2i64) {
24223
2
      Results.push_back(DAG.getNode(X86ISD::CVTUI2P, dl, MVT::v4f32, Src));
24224
2
      return;
24225
2
    }
24226
45
    if (SrcVT != MVT::v2i32)
24227
43
      return;
24228
2
    SDValue ZExtIn = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v2i64, Src);
24229
2
    SDValue VBias =
24230
2
        DAG.getConstantFP(BitsToDouble(0x4330000000000000ULL), dl, MVT::v2f64);
24231
2
    SDValue Or = DAG.getNode(ISD::OR, dl, MVT::v2i64, ZExtIn,
24232
2
                             DAG.getBitcast(MVT::v2i64, VBias));
24233
2
    Or = DAG.getBitcast(MVT::v2f64, Or);
24234
2
    // TODO: Are there any fast-math-flags to propagate here?
24235
2
    SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, Or, VBias);
24236
2
    Results.push_back(DAG.getNode(X86ISD::VFPROUND, dl, MVT::v4f32, Sub));
24237
2
    return;
24238
2
  }
24239
59
  case ISD::FP_ROUND: {
24240
59
    if (!TLI.isTypeLegal(N->getOperand(0).getValueType()))
24241
1
        return;
24242
58
    SDValue V = DAG.getNode(X86ISD::VFPROUND, dl, MVT::v4f32, N->getOperand(0));
24243
58
    Results.push_back(V);
24244
58
    return;
24245
58
  }
24246
136
  case ISD::FP_EXTEND: {
24247
136
    // Right now, only MVT::v2f32 has OperationAction for FP_EXTEND.
24248
136
    // No other ValueType for FP_EXTEND should reach this point.
24249
136
    assert(N->getValueType(0) == MVT::v2f32 &&
24250
136
           "Do not know how to legalize this Node");
24251
136
    return;
24252
58
  }
24253
4
  case ISD::INTRINSIC_W_CHAIN: {
24254
4
    unsigned IntNo = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
24255
4
    switch (IntNo) {
24256
0
    default : llvm_unreachable("Do not know how to custom type "
24257
4
                               "legalize this intrinsic operation!");
24258
1
    case Intrinsic::x86_rdtsc:
24259
1
      return getReadTimeStampCounter(N, dl, X86ISD::RDTSC_DAG, DAG, Subtarget,
24260
1
                                     Results);
24261
1
    case Intrinsic::x86_rdtscp:
24262
1
      return getReadTimeStampCounter(N, dl, X86ISD::RDTSCP_DAG, DAG, Subtarget,
24263
1
                                     Results);
24264
1
    case Intrinsic::x86_rdpmc:
24265
1
      return getReadPerformanceCounter(N, dl, DAG, Subtarget, Results);
24266
4
24267
1
    case Intrinsic::x86_xgetbv:
24268
1
      return getExtendedControlRegister(N, dl, DAG, Subtarget, Results);
24269
0
    }
24270
0
  }
24271
6
  case ISD::INTRINSIC_WO_CHAIN: {
24272
6
    if (SDValue V = LowerINTRINSIC_WO_CHAIN(SDValue(N, 0), Subtarget, DAG))
24273
6
      Results.push_back(V);
24274
6
    return;
24275
0
  }
24276
1
  case ISD::READCYCLECOUNTER: {
24277
1
    return getReadTimeStampCounter(N, dl, X86ISD::RDTSC_DAG, DAG, Subtarget,
24278
1
                                   Results);
24279
0
  }
24280
453
  case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS: {
24281
453
    EVT T = N->getValueType(0);
24282
453
    assert((T == MVT::i64 || T == MVT::i128) && "can only expand cmpxchg pair");
24283
453
    bool Regs64bit = T == MVT::i128;
24284
453
    MVT HalfT = Regs64bit ? MVT::i64 : MVT::i32;
24285
453
    SDValue cpInL, cpInH;
24286
453
    cpInL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(2),
24287
453
                        DAG.getConstant(0, dl, HalfT));
24288
453
    cpInH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(2),
24289
453
                        DAG.getConstant(1, dl, HalfT));
24290
453
    cpInL = DAG.getCopyToReg(N->getOperand(0), dl,
24291
453
                             Regs64bit ? X86::RAX : X86::EAX,
24292
453
                             cpInL, SDValue());
24293
453
    cpInH = DAG.getCopyToReg(cpInL.getValue(0), dl,
24294
453
                             Regs64bit ? X86::RDX : X86::EDX,
24295
453
                             cpInH, cpInL.getValue(1));
24296
453
    SDValue swapInL, swapInH;
24297
453
    swapInL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(3),
24298
453
                          DAG.getConstant(0, dl, HalfT));
24299
453
    swapInH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(3),
24300
453
                          DAG.getConstant(1, dl, HalfT));
24301
453
    swapInH =
24302
453
        DAG.getCopyToReg(cpInH.getValue(0), dl, Regs64bit ? X86::RCX : X86::ECX,
24303
453
                         swapInH, cpInH.getValue(1));
24304
453
    // If the current function needs the base pointer, RBX,
24305
453
    // we shouldn't use cmpxchg directly.
24306
453
    // Indeed the lowering of that instruction will clobber
24307
453
    // that register and since RBX will be a reserved register
24308
453
    // the register allocator will not make sure its value will
24309
453
    // be properly saved and restored around this live-range.
24310
453
    const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
24311
453
    SDValue Result;
24312
453
    SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
24313
453
    unsigned BasePtr = TRI->getBaseRegister();
24314
453
    MachineMemOperand *MMO = cast<AtomicSDNode>(N)->getMemOperand();
24315
453
    if (TRI->hasBasePointer(DAG.getMachineFunction()) &&
24316
453
        (BasePtr == X86::RBX || BasePtr == X86::EBX)) {
24317
2
      // ISel prefers the LCMPXCHG64 variant.
24318
2
      // If that assert breaks, that means it is not the case anymore,
24319
2
      // and we need to teach LCMPXCHG8_SAVE_EBX_DAG how to save RBX,
24320
2
      // not just EBX. This is a matter of accepting i64 input for that
24321
2
      // pseudo, and restoring into the register of the right wide
24322
2
      // in expand pseudo. Everything else should just work.
24323
2
      assert(((Regs64bit == (BasePtr == X86::RBX)) || BasePtr == X86::EBX) &&
24324
2
             "Saving only half of the RBX");
24325
2
      unsigned Opcode = Regs64bit ? X86ISD::LCMPXCHG16_SAVE_RBX_DAG
24326
0
                                  : X86ISD::LCMPXCHG8_SAVE_EBX_DAG;
24327
2
      SDValue RBXSave = DAG.getCopyFromReg(swapInH.getValue(0), dl,
24328
2
                                           Regs64bit ? X86::RBX : X86::EBX,
24329
2
                                           HalfT, swapInH.getValue(1));
24330
2
      SDValue Ops[] = {/*Chain*/ RBXSave.getValue(1), N->getOperand(1), swapInL,
24331
2
                       RBXSave,
24332
2
                       /*Glue*/ RBXSave.getValue(2)};
24333
2
      Result = DAG.getMemIntrinsicNode(Opcode, dl, Tys, Ops, T, MMO);
24334
453
    } else {
24335
451
      unsigned Opcode =
24336
451
          Regs64bit ? X86ISD::LCMPXCHG16_DAG : X86ISD::LCMPXCHG8_DAG;
24337
451
      swapInL = DAG.getCopyToReg(swapInH.getValue(0), dl,
24338
451
                                 Regs64bit ? X86::RBX : X86::EBX, swapInL,
24339
451
                                 swapInH.getValue(1));
24340
451
      SDValue Ops[] = {swapInL.getValue(0), N->getOperand(1),
24341
451
                       swapInL.getValue(1)};
24342
451
      Result = DAG.getMemIntrinsicNode(Opcode, dl, Tys, Ops, T, MMO);
24343
451
    }
24344
453
    SDValue cpOutL = DAG.getCopyFromReg(Result.getValue(0), dl,
24345
453
                                        Regs64bit ? X86::RAX : X86::EAX,
24346
453
                                        HalfT, Result.getValue(1));
24347
453
    SDValue cpOutH = DAG.getCopyFromReg(cpOutL.getValue(1), dl,
24348
453
                                        Regs64bit ? X86::RDX : X86::EDX,
24349
453
                                        HalfT, cpOutL.getValue(2));
24350
453
    SDValue OpsF[] = { cpOutL.getValue(0), cpOutH.getValue(0)};
24351
453
24352
453
    SDValue EFLAGS = DAG.getCopyFromReg(cpOutH.getValue(1), dl, X86::EFLAGS,
24353
453
                                        MVT::i32, cpOutH.getValue(2));
24354
453
    SDValue Success = getSETCC(X86::COND_E, EFLAGS, dl, DAG);
24355
453
    Success = DAG.getZExtOrTrunc(Success, dl, N->getValueType(1));
24356
453
24357
453
    Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, T, OpsF));
24358
453
    Results.push_back(Success);
24359
453
    Results.push_back(EFLAGS.getValue(1));
24360
453
    return;
24361
0
  }
24362
0
  case ISD::ATOMIC_SWAP:
24363
0
  case ISD::ATOMIC_LOAD_ADD:
24364
0
  case ISD::ATOMIC_LOAD_SUB:
24365
0
  case ISD::ATOMIC_LOAD_AND:
24366
0
  case ISD::ATOMIC_LOAD_OR:
24367
0
  case ISD::ATOMIC_LOAD_XOR:
24368
0
  case ISD::ATOMIC_LOAD_NAND:
24369
0
  case ISD::ATOMIC_LOAD_MIN:
24370
0
  case ISD::ATOMIC_LOAD_MAX:
24371
0
  case ISD::ATOMIC_LOAD_UMIN:
24372
0
  case ISD::ATOMIC_LOAD_UMAX:
24373
0
  case ISD::ATOMIC_LOAD: {
24374
0
    // Delegate to generic TypeLegalization. Situations we can really handle
24375
0
    // should have already been dealt with by AtomicExpandPass.cpp.
24376
0
    break;
24377
0
  }
24378
383
  case ISD::BITCAST: {
24379
383
    assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
24380
383
    EVT DstVT = N->getValueType(0);
24381
383
    EVT SrcVT = N->getOperand(0)->getValueType(0);
24382
383
24383
383
    if (SrcVT != MVT::f64 ||
24384
55
        (DstVT != MVT::v2i32 && DstVT != MVT::v4i16 && DstVT != MVT::v8i8))
24385
335
      return;
24386
48
24387
48
    unsigned NumElts = DstVT.getVectorNumElements();
24388
48
    EVT SVT = DstVT.getVectorElementType();
24389
48
    EVT WiderVT = EVT::getVectorVT(*DAG.getContext(), SVT, NumElts * 2);
24390
48
    SDValue Expanded = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
24391
48
                                   MVT::v2f64, N->getOperand(0));
24392
48
    SDValue ToVecInt = DAG.getBitcast(WiderVT, Expanded);
24393
48
24394
48
    if (ExperimentalVectorWideningLegalization) {
24395
9
      // If we are legalizing vectors by widening, we already have the desired
24396
9
      // legal vector type, just return it.
24397
9
      Results.push_back(ToVecInt);
24398
9
      return;
24399
9
    }
24400
39
24401
39
    SmallVector<SDValue, 8> Elts;
24402
241
    for (unsigned i = 0, e = NumElts; i != e; ++i)
24403
202
      Elts.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, SVT,
24404
202
                                   ToVecInt, DAG.getIntPtrConstant(i, dl)));
24405
4
24406
4
    Results.push_back(DAG.getBuildVector(DstVT, dl, Elts));
24407
4
  }
24408
1.46k
  }
24409
1.46k
}
24410
24411
0
const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
24412
0
  switch ((X86ISD::NodeType)Opcode) {
24413
0
  case X86ISD::FIRST_NUMBER:       break;
24414
0
  case X86ISD::BSF:                return "X86ISD::BSF";
24415
0
  case X86ISD::BSR:                return "X86ISD::BSR";
24416
0
  case X86ISD::SHLD:               return "X86ISD::SHLD";
24417
0
  case X86ISD::SHRD:               return "X86ISD::SHRD";
24418
0
  case X86ISD::FAND:               return "X86ISD::FAND";
24419
0
  case X86ISD::FANDN:              return "X86ISD::FANDN";
24420
0
  case X86ISD::FOR:                return "X86ISD::FOR";
24421
0
  case X86ISD::FXOR:               return "X86ISD::FXOR";
24422
0
  case X86ISD::FILD:               return "X86ISD::FILD";
24423
0
  case X86ISD::FILD_FLAG:          return "X86ISD::FILD_FLAG";
24424
0
  case X86ISD::FP_TO_INT16_IN_MEM: return "X86ISD::FP_TO_INT16_IN_MEM";
24425
0
  case X86ISD::FP_TO_INT32_IN_MEM: return "X86ISD::FP_TO_INT32_IN_MEM";
24426
0
  case X86ISD::FP_TO_INT64_IN_MEM: return "X86ISD::FP_TO_INT64_IN_MEM";
24427
0
  case X86ISD::FLD:                return "X86ISD::FLD";
24428
0
  case X86ISD::FST:                return "X86ISD::FST";
24429
0
  case X86ISD::CALL:               return "X86ISD::CALL";
24430
0
  case X86ISD::RDTSC_DAG:          return "X86ISD::RDTSC_DAG";
24431
0
  case X86ISD::RDTSCP_DAG:         return "X86ISD::RDTSCP_DAG";
24432
0
  case X86ISD::RDPMC_DAG:          return "X86ISD::RDPMC_DAG";
24433
0
  case X86ISD::BT:                 return "X86ISD::BT";
24434
0
  case X86ISD::CMP:                return "X86ISD::CMP";
24435
0
  case X86ISD::COMI:               return "X86ISD::COMI";
24436
0
  case X86ISD::UCOMI:              return "X86ISD::UCOMI";
24437
0
  case X86ISD::CMPM:               return "X86ISD::CMPM";
24438
0
  case X86ISD::CMPMU:              return "X86ISD::CMPMU";
24439
0
  case X86ISD::CMPM_RND:           return "X86ISD::CMPM_RND";
24440
0
  case X86ISD::SETCC:              return "X86ISD::SETCC";
24441
0
  case X86ISD::SETCC_CARRY:        return "X86ISD::SETCC_CARRY";
24442
0
  case X86ISD::FSETCC:             return "X86ISD::FSETCC";
24443
0
  case X86ISD::FSETCCM:            return "X86ISD::FSETCCM";
24444
0
  case X86ISD::FSETCCM_RND:        return "X86ISD::FSETCCM_RND";
24445
0
  case X86ISD::CMOV:               return "X86ISD::CMOV";
24446
0
  case X86ISD::BRCOND:             return "X86ISD::BRCOND";
24447
0
  case X86ISD::RET_FLAG:           return "X86ISD::RET_FLAG";
24448
0
  case X86ISD::IRET:               return "X86ISD::IRET";
24449
0
  case X86ISD::REP_STOS:           return "X86ISD::REP_STOS";
24450
0
  case X86ISD::REP_MOVS:           return "X86ISD::REP_MOVS";
24451
0
  case X86ISD::GlobalBaseReg:      return "X86ISD::GlobalBaseReg";
24452
0
  case X86ISD::Wrapper:            return "X86ISD::Wrapper";
24453
0
  case X86ISD::WrapperRIP:         return "X86ISD::WrapperRIP";
24454
0
  case X86ISD::MOVDQ2Q:            return "X86ISD::MOVDQ2Q";
24455
0
  case X86ISD::MMX_MOVD2W:         return "X86ISD::MMX_MOVD2W";
24456
0
  case X86ISD::MMX_MOVW2D:         return "X86ISD::MMX_MOVW2D";
24457
0
  case X86ISD::PEXTRB:             return "X86ISD::PEXTRB";
24458
0
  case X86ISD::PEXTRW:             return "X86ISD::PEXTRW";
24459
0
  case X86ISD::INSERTPS:           return "X86ISD::INSERTPS";
24460
0
  case X86ISD::PINSRB:             return "X86ISD::PINSRB";
24461
0
  case X86ISD::PINSRW:             return "X86ISD::PINSRW";
24462
0
  case X86ISD::PSHUFB:             return "X86ISD::PSHUFB";
24463
0
  case X86ISD::ANDNP:              return "X86ISD::ANDNP";
24464
0
  case X86ISD::BLENDI:             return "X86ISD::BLENDI";
24465
0
  case X86ISD::SHRUNKBLEND:        return "X86ISD::SHRUNKBLEND";
24466
0
  case X86ISD::ADDUS:              return "X86ISD::ADDUS";
24467
0
  case X86ISD::SUBUS:              return "X86ISD::SUBUS";
24468
0
  case X86ISD::HADD:               return "X86ISD::HADD";
24469
0
  case X86ISD::HSUB:               return "X86ISD::HSUB";
24470
0
  case X86ISD::FHADD:              return "X86ISD::FHADD";
24471
0
  case X86ISD::FHSUB:              return "X86ISD::FHSUB";
24472
0
  case X86ISD::CONFLICT:           return "X86ISD::CONFLICT";
24473
0
  case X86ISD::FMAX:               return "X86ISD::FMAX";
24474
0
  case X86ISD::FMAXS:              return "X86ISD::FMAXS";
24475
0
  case X86ISD::FMAX_RND:           return "X86ISD::FMAX_RND";
24476
0
  case X86ISD::FMAXS_RND:          return "X86ISD::FMAXS_RND";
24477
0
  case X86ISD::FMIN:               return "X86ISD::FMIN";
24478
0
  case X86ISD::FMINS:              return "X86ISD::FMINS";
24479
0
  case X86ISD::FMIN_RND:           return "X86ISD::FMIN_RND";
24480
0
  case X86ISD::FMINS_RND:          return "X86ISD::FMINS_RND";
24481
0
  case X86ISD::FMAXC:              return "X86ISD::FMAXC";
24482
0
  case X86ISD::FMINC:              return "X86ISD::FMINC";
24483
0
  case X86ISD::FRSQRT:             return "X86ISD::FRSQRT";
24484
0
  case X86ISD::FRSQRTS:            return "X86ISD::FRSQRTS";
24485
0
  case X86ISD::FRCP:               return "X86ISD::FRCP";
24486
0
  case X86ISD::FRCPS:              return "X86ISD::FRCPS";
24487
0
  case X86ISD::EXTRQI:             return "X86ISD::EXTRQI";
24488
0
  case X86ISD::INSERTQI:           return "X86ISD::INSERTQI";
24489
0
  case X86ISD::TLSADDR:            return "X86ISD::TLSADDR";
24490
0
  case X86ISD::TLSBASEADDR:        return "X86ISD::TLSBASEADDR";
24491
0
  case X86ISD::TLSCALL:            return "X86ISD::TLSCALL";
24492
0
  case X86ISD::EH_SJLJ_SETJMP:     return "X86ISD::EH_SJLJ_SETJMP";
24493
0
  case X86ISD::EH_SJLJ_LONGJMP:    return "X86ISD::EH_SJLJ_LONGJMP";
24494
0
  case X86ISD::EH_SJLJ_SETUP_DISPATCH:
24495
0
    return "X86ISD::EH_SJLJ_SETUP_DISPATCH";
24496
0
  case X86ISD::EH_RETURN:          return "X86ISD::EH_RETURN";
24497
0
  case X86ISD::TC_RETURN:          return "X86ISD::TC_RETURN";
24498
0
  case X86ISD::FNSTCW16m:          return "X86ISD::FNSTCW16m";
24499
0
  case X86ISD::FNSTSW16r:          return "X86ISD::FNSTSW16r";
24500
0
  case X86ISD::LCMPXCHG_DAG:       return "X86ISD::LCMPXCHG_DAG";
24501
0
  case X86ISD::LCMPXCHG8_DAG:      return "X86ISD::LCMPXCHG8_DAG";
24502
0
  case X86ISD::LCMPXCHG16_DAG:     return "X86ISD::LCMPXCHG16_DAG";
24503
0
  case X86ISD::LCMPXCHG8_SAVE_EBX_DAG:
24504
0
    return "X86ISD::LCMPXCHG8_SAVE_EBX_DAG";
24505
0
  case X86ISD::LCMPXCHG16_SAVE_RBX_DAG:
24506
0
    return "X86ISD::LCMPXCHG16_SAVE_RBX_DAG";
24507
0
  case X86ISD::LADD:               return "X86ISD::LADD";
24508
0
  case X86ISD::LSUB:               return "X86ISD::LSUB";
24509
0
  case X86ISD::LOR:                return "X86ISD::LOR";
24510
0
  case X86ISD::LXOR:               return "X86ISD::LXOR";
24511
0
  case X86ISD::LAND:               return "X86ISD::LAND";
24512
0
  case X86ISD::VZEXT_MOVL:         return "X86ISD::VZEXT_MOVL";
24513
0
  case X86ISD::VZEXT_LOAD:         return "X86ISD::VZEXT_LOAD";
24514
0
  case X86ISD::VZEXT:              return "X86ISD::VZEXT";
24515
0
  case X86ISD::VSEXT:              return "X86ISD::VSEXT";
24516
0
  case X86ISD::VTRUNC:             return "X86ISD::VTRUNC";
24517
0
  case X86ISD::VTRUNCS:            return "X86ISD::VTRUNCS";
24518
0
  case X86ISD::VTRUNCUS:           return "X86ISD::VTRUNCUS";
24519
0
  case X86ISD::VTRUNCSTORES:       return "X86ISD::VTRUNCSTORES";
24520
0
  case X86ISD::VTRUNCSTOREUS:      return "X86ISD::VTRUNCSTOREUS";
24521
0
  case X86ISD::VMTRUNCSTORES:      return "X86ISD::VMTRUNCSTORES";
24522
0
  case X86ISD::VMTRUNCSTOREUS:     return "X86ISD::VMTRUNCSTOREUS";
24523
0
  case X86ISD::VFPEXT:             return "X86ISD::VFPEXT";
24524
0
  case X86ISD::VFPEXT_RND:         return "X86ISD::VFPEXT_RND";
24525
0
  case X86ISD::VFPEXTS_RND:        return "X86ISD::VFPEXTS_RND";
24526
0
  case X86ISD::VFPROUND:           return "X86ISD::VFPROUND";
24527
0
  case X86ISD::VFPROUND_RND:       return "X86ISD::VFPROUND_RND";
24528
0
  case X86ISD::VFPROUNDS_RND:      return "X86ISD::VFPROUNDS_RND";
24529
0
  case X86ISD::CVT2MASK:           return "X86ISD::CVT2MASK";
24530
0
  case X86ISD::VSHLDQ:             return "X86ISD::VSHLDQ";
24531
0
  case X86ISD::VSRLDQ:             return "X86ISD::VSRLDQ";
24532
0
  case X86ISD::VSHL:               return "X86ISD::VSHL";
24533
0
  case X86ISD::VSRL:               return "X86ISD::VSRL";
24534
0
  case X86ISD::VSRA:               return "X86ISD::VSRA";
24535
0
  case X86ISD::VSHLI:              return "X86ISD::VSHLI";
24536
0
  case X86ISD::VSRLI:              return "X86ISD::VSRLI";
24537
0
  case X86ISD::VSRAI:              return "X86ISD::VSRAI";
24538
0
  case X86ISD::VSRAV:              return "X86ISD::VSRAV";
24539
0
  case X86ISD::VROTLI:             return "X86ISD::VROTLI";
24540
0
  case X86ISD::VROTRI:             return "X86ISD::VROTRI";
24541
0
  case X86ISD::VPPERM:             return "X86ISD::VPPERM";
24542
0
  case X86ISD::CMPP:               return "X86ISD::CMPP";
24543
0
  case X86ISD::PCMPEQ:             return "X86ISD::PCMPEQ";
24544
0
  case X86ISD::PCMPGT:             return "X86ISD::PCMPGT";
24545
0
  case X86ISD::PCMPEQM:            return "X86ISD::PCMPEQM";
24546
0
  case X86ISD::PCMPGTM:            return "X86ISD::PCMPGTM";
24547
0
  case X86ISD::ADD:                return "X86ISD::ADD";
24548
0
  case X86ISD::SUB:                return "X86ISD::SUB";
24549
0
  case X86ISD::ADC:                return "X86ISD::ADC";
24550
0
  case X86ISD::SBB:                return "X86ISD::SBB";
24551
0
  case X86ISD::SMUL:               return "X86ISD::SMUL";
24552
0
  case X86ISD::UMUL:               return "X86ISD::UMUL";
24553
0
  case X86ISD::SMUL8:              return "X86ISD::SMUL8";
24554
0
  case X86ISD::UMUL8:              return "X86ISD::UMUL8";
24555
0
  case X86ISD::SDIVREM8_SEXT_HREG: return "X86ISD::SDIVREM8_SEXT_HREG";
24556
0
  case X86ISD::UDIVREM8_ZEXT_HREG: return "X86ISD::UDIVREM8_ZEXT_HREG";
24557
0
  case X86ISD::INC:                return "X86ISD::INC";
24558
0
  case X86ISD::DEC:                return "X86ISD::DEC";
24559
0
  case X86ISD::OR:                 return "X86ISD::OR";
24560
0
  case X86ISD::XOR:                return "X86ISD::XOR";
24561
0
  case X86ISD::AND:                return "X86ISD::AND";
24562
0
  case X86ISD::MUL_IMM:            return "X86ISD::MUL_IMM";
24563
0
  case X86ISD::MOVMSK:             return "X86ISD::MOVMSK";
24564
0
  case X86ISD::PTEST:              return "X86ISD::PTEST";
24565
0
  case X86ISD::TESTP:              return "X86ISD::TESTP";
24566
0
  case X86ISD::TESTM:              return "X86ISD::TESTM";
24567
0
  case X86ISD::TESTNM:             return "X86ISD::TESTNM";
24568
0
  case X86ISD::KORTEST:            return "X86ISD::KORTEST";
24569
0
  case X86ISD::KTEST:              return "X86ISD::KTEST";
24570
0
  case X86ISD::KSHIFTL:            return "X86ISD::KSHIFTL";
24571
0
  case X86ISD::KSHIFTR:            return "X86ISD::KSHIFTR";
24572
0
  case X86ISD::PACKSS:             return "X86ISD::PACKSS";
24573
0
  case X86ISD::PACKUS:             return "X86ISD::PACKUS";
24574
0
  case X86ISD::PALIGNR:            return "X86ISD::PALIGNR";
24575
0
  case X86ISD::VALIGN:             return "X86ISD::VALIGN";
24576
0
  case X86ISD::PSHUFD:             return "X86ISD::PSHUFD";
24577
0
  case X86ISD::PSHUFHW:            return "X86ISD::PSHUFHW";
24578
0
  case X86ISD::PSHUFLW:            return "X86ISD::PSHUFLW";
24579
0
  case X86ISD::SHUFP:              return "X86ISD::SHUFP";
24580
0
  case X86ISD::SHUF128:            return "X86ISD::SHUF128";
24581
0
  case X86ISD::MOVLHPS:            return "X86ISD::MOVLHPS";
24582
0
  case X86ISD::MOVHLPS:            return "X86ISD::MOVHLPS";
24583
0
  case X86ISD::MOVLPS:             return "X86ISD::MOVLPS";
24584
0
  case X86ISD::MOVLPD:             return "X86ISD::MOVLPD";
24585
0
  case X86ISD::MOVDDUP:            return "X86ISD::MOVDDUP";
24586
0
  case X86ISD::MOVSHDUP:           return "X86ISD::MOVSHDUP";
24587
0
  case X86ISD::MOVSLDUP:           return "X86ISD::MOVSLDUP";
24588
0
  case X86ISD::MOVSD:              return "X86ISD::MOVSD";
24589
0
  case X86ISD::MOVSS:              return "X86ISD::MOVSS";
24590
0
  case X86ISD::UNPCKL:             return "X86ISD::UNPCKL";
24591
0
  case X86ISD::UNPCKH:             return "X86ISD::UNPCKH";
24592
0
  case X86ISD::VBROADCAST:         return "X86ISD::VBROADCAST";
24593
0
  case X86ISD::VBROADCASTM:        return "X86ISD::VBROADCASTM";
24594
0
  case X86ISD::SUBV_BROADCAST:     return "X86ISD::SUBV_BROADCAST";
24595
0
  case X86ISD::VEXTRACT:           return "X86ISD::VEXTRACT";
24596
0
  case X86ISD::VPERMILPV:          return "X86ISD::VPERMILPV";
24597
0
  case X86ISD::VPERMILPI:          return "X86ISD::VPERMILPI";
24598
0
  case X86ISD::VPERM2X128:         return "X86ISD::VPERM2X128";
24599
0
  case X86ISD::VPERMV:             return "X86ISD::VPERMV";
24600
0
  case X86ISD::VPERMV3:            return "X86ISD::VPERMV3";
24601
0
  case X86ISD::VPERMIV3:           return "X86ISD::VPERMIV3";
24602
0
  case X86ISD::VPERMI:             return "X86ISD::VPERMI";
24603
0
  case X86ISD::VPTERNLOG:          return "X86ISD::VPTERNLOG";
24604
0
  case X86ISD::VFIXUPIMM:          return "X86ISD::VFIXUPIMM";
24605
0
  case X86ISD::VFIXUPIMMS:         return "X86ISD::VFIXUPIMMS";
24606
0
  case X86ISD::VRANGE:             return "X86ISD::VRANGE";
24607
0
  case X86ISD::PMULUDQ:            return "X86ISD::PMULUDQ";
24608
0
  case X86ISD::PMULDQ:             return "X86ISD::PMULDQ";
24609
0
  case X86ISD::PSADBW:             return "X86ISD::PSADBW";
24610
0
  case X86ISD::DBPSADBW:           return "X86ISD::DBPSADBW";
24611
0
  case X86ISD::VASTART_SAVE_XMM_REGS: return "X86ISD::VASTART_SAVE_XMM_REGS";
24612
0
  case X86ISD::VAARG_64:           return "X86ISD::VAARG_64";
24613
0
  case X86ISD::WIN_ALLOCA:         return "X86ISD::WIN_ALLOCA";
24614
0
  case X86ISD::MEMBARRIER:         return "X86ISD::MEMBARRIER";
24615
0
  case X86ISD::MFENCE:             return "X86ISD::MFENCE";
24616
0
  case X86ISD::SEG_ALLOCA:         return "X86ISD::SEG_ALLOCA";
24617
0
  case X86ISD::SAHF:               return "X86ISD::SAHF";
24618
0
  case X86ISD::RDRAND:             return "X86ISD::RDRAND";
24619
0
  case X86ISD::RDSEED:             return "X86ISD::RDSEED";
24620
0
  case X86ISD::VPMADDUBSW:         return "X86ISD::VPMADDUBSW";
24621
0
  case X86ISD::VPMADDWD:           return "X86ISD::VPMADDWD";
24622
0
  case X86ISD::VPSHA:              return "X86ISD::VPSHA";
24623
0
  case X86ISD::VPSHL:              return "X86ISD::VPSHL";
24624
0
  case X86ISD::VPCOM:              return "X86ISD::VPCOM";
24625
0
  case X86ISD::VPCOMU:             return "X86ISD::VPCOMU";
24626
0
  case X86ISD::VPERMIL2:           return "X86ISD::VPERMIL2";
24627
0
  case X86ISD::FMSUB:              return "X86ISD::FMSUB";
24628
0
  case X86ISD::FNMADD:             return "X86ISD::FNMADD";
24629
0
  case X86ISD::FNMSUB:             return "X86ISD::FNMSUB";
24630
0
  case X86ISD::FMADDSUB:           return "X86ISD::FMADDSUB";
24631
0
  case X86ISD::FMSUBADD:           return "X86ISD::FMSUBADD";
24632
0
  case X86ISD::FMADD_RND:          return "X86ISD::FMADD_RND";
24633
0
  case X86ISD::FNMADD_RND:         return "X86ISD::FNMADD_RND";
24634
0
  case X86ISD::FMSUB_RND:          return "X86ISD::FMSUB_RND";
24635
0
  case X86ISD::FNMSUB_RND:         return "X86ISD::FNMSUB_RND";
24636
0
  case X86ISD::FMADDSUB_RND:       return "X86ISD::FMADDSUB_RND";
24637
0
  case X86ISD::FMSUBADD_RND:       return "X86ISD::FMSUBADD_RND";
24638
0
  case X86ISD::FMADDS1_RND:        return "X86ISD::FMADDS1_RND";
24639
0
  case X86ISD::FNMADDS1_RND:       return "X86ISD::FNMADDS1_RND";
24640
0
  case X86ISD::FMSUBS1_RND:        return "X86ISD::FMSUBS1_RND";
24641
0
  case X86ISD::FNMSUBS1_RND:       return "X86ISD::FNMSUBS1_RND";
24642
0
  case X86ISD::FMADDS3_RND:        return "X86ISD::FMADDS3_RND";
24643
0
  case X86ISD::FNMADDS3_RND:       return "X86ISD::FNMADDS3_RND";
24644
0
  case X86ISD::FMSUBS3_RND:        return "X86ISD::FMSUBS3_RND";
24645
0
  case X86ISD::FNMSUBS3_RND:       return "X86ISD::FNMSUBS3_RND";
24646
0
  case X86ISD::VPMADD52H:          return "X86ISD::VPMADD52H";
24647
0
  case X86ISD::VPMADD52L:          return "X86ISD::VPMADD52L";
24648
0
  case X86ISD::VRNDSCALE:          return "X86ISD::VRNDSCALE";
24649
0
  case X86ISD::VRNDSCALES:         return "X86ISD::VRNDSCALES";
24650
0
  case X86ISD::VREDUCE:            return "X86ISD::VREDUCE";
24651
0
  case X86ISD::VREDUCES:           return "X86ISD::VREDUCES";
24652
0
  case X86ISD::VGETMANT:           return "X86ISD::VGETMANT";
24653
0
  case X86ISD::VGETMANTS:          return "X86ISD::VGETMANTS";
24654
0
  case X86ISD::PCMPESTRI:          return "X86ISD::PCMPESTRI";
24655
0
  case X86ISD::PCMPISTRI:          return "X86ISD::PCMPISTRI";
24656
0
  case X86ISD::XTEST:              return "X86ISD::XTEST";
24657
0
  case X86ISD::COMPRESS:           return "X86ISD::COMPRESS";
24658
0
  case X86ISD::EXPAND:             return "X86ISD::EXPAND";
24659
0
  case X86ISD::SELECT:             return "X86ISD::SELECT";
24660
0
  case X86ISD::SELECTS:            return "X86ISD::SELECTS";
24661
0
  case X86ISD::ADDSUB:             return "X86ISD::ADDSUB";
24662
0
  case X86ISD::RCP28:              return "X86ISD::RCP28";
24663
0
  case X86ISD::RCP28S:             return "X86ISD::RCP28S";
24664
0
  case X86ISD::EXP2:               return "X86ISD::EXP2";
24665
0
  case X86ISD::RSQRT28:            return "X86ISD::RSQRT28";
24666
0
  case X86ISD::RSQRT28S:           return "X86ISD::RSQRT28S";
24667
0
  case X86ISD::FADD_RND:           return "X86ISD::FADD_RND";
24668
0
  case X86ISD::FADDS_RND:          return "X86ISD::FADDS_RND";
24669
0
  case X86ISD::FSUB_RND:           return "X86ISD::FSUB_RND";
24670
0
  case X86ISD::FSUBS_RND:          return "X86ISD::FSUBS_RND";
24671
0
  case X86ISD::FMUL_RND:           return "X86ISD::FMUL_RND";
24672
0
  case X86ISD::FMULS_RND:          return "X86ISD::FMULS_RND";
24673
0
  case X86ISD::FDIV_RND:           return "X86ISD::FDIV_RND";
24674
0
  case X86ISD::FDIVS_RND:          return "X86ISD::FDIVS_RND";
24675
0
  case X86ISD::FSQRT_RND:          return "X86ISD::FSQRT_RND";
24676
0
  case X86ISD::FSQRTS_RND:         return "X86ISD::FSQRTS_RND";
24677
0
  case X86ISD::FGETEXP_RND:        return "X86ISD::FGETEXP_RND";
24678
0
  case X86ISD::FGETEXPS_RND:       return "X86ISD::FGETEXPS_RND";
24679
0
  case X86ISD::SCALEF:             return "X86ISD::SCALEF";
24680
0
  case X86ISD::SCALEFS:            return "X86ISD::SCALEFS";
24681
0
  case X86ISD::ADDS:               return "X86ISD::ADDS";
24682
0
  case X86ISD::SUBS:               return "X86ISD::SUBS";
24683
0
  case X86ISD::AVG:                return "X86ISD::AVG";
24684
0
  case X86ISD::MULHRS:             return "X86ISD::MULHRS";
24685
0
  case X86ISD::SINT_TO_FP_RND:     return "X86ISD::SINT_TO_FP_RND";
24686
0
  case X86ISD::UINT_TO_FP_RND:     return "X86ISD::UINT_TO_FP_RND";
24687
0
  case X86ISD::CVTTP2SI:           return "X86ISD::CVTTP2SI";
24688
0
  case X86ISD::CVTTP2UI:           return "X86ISD::CVTTP2UI";
24689
0
  case X86ISD::CVTTP2SI_RND:       return "X86ISD::CVTTP2SI_RND";
24690
0
  case X86ISD::CVTTP2UI_RND:       return "X86ISD::CVTTP2UI_RND";
24691
0
  case X86ISD::CVTTS2SI_RND:       return "X86ISD::CVTTS2SI_RND";
24692
0
  case X86ISD::CVTTS2UI_RND:       return "X86ISD::CVTTS2UI_RND";
24693
0
  case X86ISD::CVTSI2P:            return "X86ISD::CVTSI2P";
24694
0
  case X86ISD::CVTUI2P:            return "X86ISD::CVTUI2P";
24695
0
  case X86ISD::VFPCLASS:           return "X86ISD::VFPCLASS";
24696
0
  case X86ISD::VFPCLASSS:          return "X86ISD::VFPCLASSS";
24697
0
  case X86ISD::MULTISHIFT:         return "X86ISD::MULTISHIFT";
24698
0
  case X86ISD::SCALAR_SINT_TO_FP_RND: return "X86ISD::SCALAR_SINT_TO_FP_RND";
24699
0
  case X86ISD::SCALAR_UINT_TO_FP_RND: return "X86ISD::SCALAR_UINT_TO_FP_RND";
24700
0
  case X86ISD::CVTPS2PH:           return "X86ISD::CVTPS2PH";
24701
0
  case X86ISD::CVTPH2PS:           return "X86ISD::CVTPH2PS";
24702
0
  case X86ISD::CVTP2SI:            return "X86ISD::CVTP2SI";
24703
0
  case X86ISD::CVTP2UI:            return "X86ISD::CVTP2UI";
24704
0
  case X86ISD::CVTP2SI_RND:        return "X86ISD::CVTP2SI_RND";
24705
0
  case X86ISD::CVTP2UI_RND:        return "X86ISD::CVTP2UI_RND";
24706
0
  case X86ISD::CVTS2SI_RND:        return "X86ISD::CVTS2SI_RND";
24707
0
  case X86ISD::CVTS2UI_RND:        return "X86ISD::CVTS2UI_RND";
24708
0
  case X86ISD::LWPINS:             return "X86ISD::LWPINS";
24709
0
  case X86ISD::MGATHER:            return "X86ISD::MGATHER";
24710
0
  }
24711
0
  return nullptr;
24712
0
}
24713
24714
/// Return true if the addressing mode represented by AM is legal for this
24715
/// target, for a load/store of the specified type.
24716
bool X86TargetLowering::isLegalAddressingMode(const DataLayout &DL,
24717
                                              const AddrMode &AM, Type *Ty,
24718
                                              unsigned AS,
24719
2.65M
                                              Instruction *I) const {
24720
2.65M
  // X86 supports extremely general addressing modes.
24721
2.65M
  CodeModel::Model M = getTargetMachine().getCodeModel();
24722
2.65M
24723
2.65M
  // X86 allows a sign-extended 32-bit immediate field as a displacement.
24724
2.65M
  if (!X86::isOffsetSuitableForCodeModel(AM.BaseOffs, M, AM.BaseGV != nullptr))
24725
445
    return false;
24726
2.65M
24727
2.65M
  if (AM.BaseGV) {
24728
25.0k
    unsigned GVFlags = Subtarget.classifyGlobalReference(AM.BaseGV);
24729
25.0k
24730
25.0k
    // If a reference to this global requires an extra load, we can't fold it.
24731
25.0k
    if (isGlobalStubReference(GVFlags))
24732
7.84k
      return false;
24733
17.1k
24734
17.1k
    // If BaseGV requires a register for the PIC base, we cannot also have a
24735
17.1k
    // BaseReg specified.
24736
17.1k
    if (AM.HasBaseReg && isGlobalRelativeToPICBase(GVFlags))
24737
40
      return false;
24738
17.1k
24739
17.1k
    // If lower 4G is not available, then we must use rip-relative addressing.
24740
17.1k
    if ((M != CodeModel::Small || isPositionIndependent()) &&
24741
17.1k
        Subtarget.is64Bit() && (AM.BaseOffs || AM.Scale > 1))
24742
1.84k
      return false;
24743
2.64M
  }
24744
2.64M
24745
2.64M
  switch (AM.Scale) {
24746
2.50M
  case 0:
24747
2.50M
  case 1:
24748
2.50M
  case 2:
24749
2.50M
  case 4:
24750
2.50M
  case 8:
24751
2.50M
    // These scales always work.
24752
2.50M
    break;
24753
69
  case 3:
24754
69
  case 5:
24755
69
  case 9:
24756
69
    // These scales are formed with basereg+scalereg.  Only accept if there is
24757
69
    // no basereg yet.
24758
69
    if (AM.HasBaseReg)
24759
18
      return false;
24760
51
    break;
24761
135k
  default:  // Other stuff never works.
24762
135k
    return false;
24763
2.50M
  }
24764
2.50M
24765
2.50M
  return true;
24766
2.50M
}
24767
24768
22.2k
bool X86TargetLowering::isVectorShiftByScalarCheap(Type *Ty) const {
24769
22.2k
  unsigned Bits = Ty->getScalarSizeInBits();
24770
22.2k
24771
22.2k
  // 8-bit shifts are always expensive, but versions with a scalar amount aren't
24772
22.2k
  // particularly cheaper than those without.
24773
22.2k
  if (Bits == 8)
24774
3.35k
    return false;
24775
18.8k
24776
18.8k
  // On AVX2 there are new vpsllv[dq] instructions (and other shifts), that make
24777
18.8k
  // variable shifts just as cheap as scalar ones.
24778
18.8k
  if (Subtarget.hasInt256() && (Bits == 32 || Bits == 64))
24779
7.79k
    return false;
24780
11.0k
24781
11.0k
  // Otherwise, it's significantly cheaper to shift by a scalar amount than by a
24782
11.0k
  // fully general vector.
24783
11.0k
  return true;
24784
11.0k
}
24785
24786
24.9k
bool X86TargetLowering::isTruncateFree(Type *Ty1, Type *Ty2) const {
24787
24.9k
  if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
24788
189
    return false;
24789
24.7k
  unsigned NumBits1 = Ty1->getPrimitiveSizeInBits();
24790
24.7k
  unsigned NumBits2 = Ty2->getPrimitiveSizeInBits();
24791
24.7k
  return NumBits1 > NumBits2;
24792
24.7k
}
24793
24794
59
bool X86TargetLowering::allowTruncateForTailCall(Type *Ty1, Type *Ty2) const {
24795
59
  if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
24796
0
    return false;
24797
59
24798
59
  if (!isTypeLegal(EVT::getEVT(Ty1)))
24799
48
    return false;
24800
11
24801
59
  assert(Ty1->getPrimitiveSizeInBits() <= 64 && "i128 is probably not a noop");
24802
11
24803
11
  // Assuming the caller doesn't have a zeroext or signext return parameter,
24804
11
  // truncation all the way down to i1 is valid.
24805
11
  return true;
24806
11
}
24807
24808
94.0k
bool X86TargetLowering::isLegalICmpImmediate(int64_t Imm) const {
24809
94.0k
  return isInt<32>(Imm);
24810
94.0k
}
24811
24812
14.0k
bool X86TargetLowering::isLegalAddImmediate(int64_t Imm) const {
24813
14.0k
  // Can also use sub to handle negated immediates.
24814
14.0k
  return isInt<32>(Imm);
24815
14.0k
}
24816
24817
27.6k
bool X86TargetLowering::isTruncateFree(EVT VT1, EVT VT2) const {
24818
27.6k
  if (!VT1.isInteger() || !VT2.isInteger())
24819
0
    return false;
24820
27.6k
  unsigned NumBits1 = VT1.getSizeInBits();
24821
27.6k
  unsigned NumBits2 = VT2.getSizeInBits();
24822
27.6k
  return NumBits1 > NumBits2;
24823
27.6k
}
24824
24825
18.5k
bool X86TargetLowering::isZExtFree(Type *Ty1, Type *Ty2) const {
24826
18.5k
  // x86-64 implicitly zero-extends 32-bit results in 64-bit registers.
24827
18.5k
  return Ty1->isIntegerTy(32) && Ty2->isIntegerTy(64) && Subtarget.is64Bit();
24828
18.5k
}
24829
24830
95.6k
bool X86TargetLowering::isZExtFree(EVT VT1, EVT VT2) const {
24831
95.6k
  // x86-64 implicitly zero-extends 32-bit results in 64-bit registers.
24832
95.6k
  return VT1 == MVT::i32 && VT2 == MVT::i64 && Subtarget.is64Bit();
24833
95.6k
}
24834
24835
65.8k
bool X86TargetLowering::isZExtFree(SDValue Val, EVT VT2) const {
24836
65.8k
  EVT VT1 = Val.getValueType();
24837
65.8k
  if (isZExtFree(VT1, VT2))
24838
0
    return true;
24839
65.8k
24840
65.8k
  if (Val.getOpcode() != ISD::LOAD)
24841
52.4k
    return false;
24842
13.3k
24843
13.3k
  if (!VT1.isSimple() || !VT1.isInteger() ||
24844
13.3k
      !VT2.isSimple() || !VT2.isInteger())
24845
439
    return false;
24846
12.9k
24847
12.9k
  switch (VT1.getSimpleVT().SimpleTy) {
24848
4.01k
  default: break;
24849
8.93k
  case MVT::i8:
24850
8.93k
  case MVT::i16:
24851
8.93k
  case MVT::i32:
24852
8.93k
    // X86 has 8, 16, and 32-bit zero-extending loads.
24853
8.93k
    return true;
24854
4.01k
  }
24855
4.01k
24856
4.01k
  return false;
24857
4.01k
}
24858
24859
1.27k
bool X86TargetLowering::isVectorLoadExtDesirable(SDValue) const { return true; }
24860
24861
bool
24862
14.1k
X86TargetLowering::isFMAFasterThanFMulAndFAdd(EVT VT) const {
24863
14.1k
  if (!Subtarget.hasAnyFMA())
24864
9.56k
    return false;
24865
4.55k
24866
4.55k
  VT = VT.getScalarType();
24867
4.55k
24868
4.55k
  if (!VT.isSimple())
24869
0
    return false;
24870
4.55k
24871
4.55k
  switch (VT.getSimpleVT().SimpleTy) {
24872
4.41k
  case MVT::f32:
24873
4.41k
  case MVT::f64:
24874
4.41k
    return true;
24875
143
  default:
24876
143
    break;
24877
143
  }
24878
143
24879
143
  return false;
24880
143
}
24881
24882
1.52k
bool X86TargetLowering::isNarrowingProfitable(EVT VT1, EVT VT2) const {
24883
1.52k
  // i16 instructions are longer (0x66 prefix) and potentially slower.
24884
1.22k
  return !(VT1 == MVT::i32 && VT2 == MVT::i16);
24885
1.52k
}
24886
24887
/// Targets can use this to indicate that they only support *some*
24888
/// VECTOR_SHUFFLE operations, those with specific masks.
24889
/// By default, if a target supports the VECTOR_SHUFFLE node, all mask values
24890
/// are assumed to be legal.
24891
6.50k
bool X86TargetLowering::isShuffleMaskLegal(ArrayRef<int> M, EVT VT) const {
24892
6.50k
  if (!VT.isSimple())
24893
6
    return false;
24894
6.50k
24895
6.50k
  // Not for i1 vectors
24896
6.50k
  if (VT.getSimpleVT().getScalarType() == MVT::i1)
24897
12
    return false;
24898
6.48k
24899
6.48k
  // Very little shuffling can be done for 64-bit vectors right now.
24900
6.48k
  if (VT.getSimpleVT().getSizeInBits() == 64)
24901
4
    return false;
24902
6.48k
24903
6.48k
  // We only care that the types being shuffled are legal. The lowering can
24904
6.48k
  // handle any possible shuffle mask that results.
24905
6.48k
  return isTypeLegal(VT.getSimpleVT());
24906
6.48k
}
24907
24908
bool
24909
X86TargetLowering::isVectorClearMaskLegal(const SmallVectorImpl<int> &Mask,
24910
1.00k
                                          EVT VT) const {
24911
1.00k
  // Just delegate to the generic legality, clear masks aren't special.
24912
1.00k
  return isShuffleMaskLegal(Mask, VT);
24913
1.00k
}
24914
24915
//===----------------------------------------------------------------------===//
24916
//                           X86 Scheduler Hooks
24917
//===----------------------------------------------------------------------===//
24918
24919
/// Utility function to emit xbegin specifying the start of an RTM region.
24920
static MachineBasicBlock *emitXBegin(MachineInstr &MI, MachineBasicBlock *MBB,
24921
2
                                     const TargetInstrInfo *TII) {
24922
2
  DebugLoc DL = MI.getDebugLoc();
24923
2
24924
2
  const BasicBlock *BB = MBB->getBasicBlock();
24925
2
  MachineFunction::iterator I = ++MBB->getIterator();
24926
2
24927
2
  // For the v = xbegin(), we generate
24928
2
  //
24929
2
  // thisMBB:
24930
2
  //  xbegin sinkMBB
24931
2
  //
24932
2
  // mainMBB:
24933
2
  //  s0 = -1
24934
2
  //
24935
2
  // fallBB:
24936
2
  //  eax = # XABORT_DEF
24937
2
  //  s1 = eax
24938
2
  //
24939
2
  // sinkMBB:
24940
2
  //  v = phi(s0/mainBB, s1/fallBB)
24941
2
24942
2
  MachineBasicBlock *thisMBB = MBB;
24943
2
  MachineFunction *MF = MBB->getParent();
24944
2
  MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB);
24945
2
  MachineBasicBlock *fallMBB = MF->CreateMachineBasicBlock(BB);
24946
2
  MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
24947
2
  MF->insert(I, mainMBB);
24948
2
  MF->insert(I, fallMBB);
24949
2
  MF->insert(I, sinkMBB);
24950
2
24951
2
  // Transfer the remainder of BB and its successor edges to sinkMBB.
24952
2
  sinkMBB->splice(sinkMBB->begin(), MBB,
24953
2
                  std::next(MachineBasicBlock::iterator(MI)), MBB->end());
24954
2
  sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);
24955
2
24956
2
  MachineRegisterInfo &MRI = MF->getRegInfo();
24957
2
  unsigned DstReg = MI.getOperand(0).getReg();
24958
2
  const TargetRegisterClass *RC = MRI.getRegClass(DstReg);
24959
2
  unsigned mainDstReg = MRI.createVirtualRegister(RC);
24960
2
  unsigned fallDstReg = MRI.createVirtualRegister(RC);
24961
2
24962
2
  // thisMBB:
24963
2
  //  xbegin fallMBB
24964
2
  //  # fallthrough to mainMBB
24965
2
  //  # abort to fallMBB
24966
2
  BuildMI(thisMBB, DL, TII->get(X86::XBEGIN_4)).addMBB(fallMBB);
24967
2
  thisMBB->addSuccessor(mainMBB);
24968
2
  thisMBB->addSuccessor(fallMBB);
24969
2
24970
2
  // mainMBB:
24971
2
  //  mainDstReg := -1
24972
2
  BuildMI(mainMBB, DL, TII->get(X86::MOV32ri), mainDstReg).addImm(-1);
24973
2
  BuildMI(mainMBB, DL, TII->get(X86::JMP_1)).addMBB(sinkMBB);
24974
2
  mainMBB->addSuccessor(sinkMBB);
24975
2
24976
2
  // fallMBB:
24977
2
  //  ; pseudo instruction to model hardware's definition from XABORT
24978
2
  //  EAX := XABORT_DEF
24979
2
  //  fallDstReg := EAX
24980
2
  BuildMI(fallMBB, DL, TII->get(X86::XABORT_DEF));
24981
2
  BuildMI(fallMBB, DL, TII->get(TargetOpcode::COPY), fallDstReg)
24982
2
      .addReg(X86::EAX);
24983
2
  fallMBB->addSuccessor(sinkMBB);
24984
2
24985
2
  // sinkMBB:
24986
2
  //  DstReg := phi(mainDstReg/mainBB, fallDstReg/fallBB)
24987
2
  BuildMI(*sinkMBB, sinkMBB->begin(), DL, TII->get(X86::PHI), DstReg)
24988
2
      .addReg(mainDstReg).addMBB(mainMBB)
24989
2
      .addReg(fallDstReg).addMBB(fallMBB);
24990
2
24991
2
  MI.eraseFromParent();
24992
2
  return sinkMBB;
24993
2
}
24994
24995
// FIXME: When we get size specific XMM0 registers, i.e. XMM0_V16I8
24996
// or XMM0_V32I8 in AVX all of this code can be replaced with that
24997
// in the .td file.
24998
static MachineBasicBlock *emitPCMPSTRM(MachineInstr &MI, MachineBasicBlock *BB,
24999
52
                                       const TargetInstrInfo *TII) {
25000
52
  unsigned Opc;
25001
52
  switch (MI.getOpcode()) {
25002
0
  default: llvm_unreachable("illegal opcode!");
25003
6
  case X86::PCMPISTRM128REG:  Opc = X86::PCMPISTRM128rr;  break;
25004
9
  case X86::VPCMPISTRM128REG: Opc = X86::VPCMPISTRM128rr; break;
25005
3
  case X86::PCMPISTRM128MEM:  Opc = X86::PCMPISTRM128rm;  break;
25006
8
  case X86::VPCMPISTRM128MEM: Opc = X86::VPCMPISTRM128rm; break;
25007
6
  case X86::PCMPESTRM128REG:  Opc = X86::PCMPESTRM128rr;  break;
25008
9
  case X86::VPCMPESTRM128REG: Opc = X86::VPCMPESTRM128rr; break;
25009
3
  case X86::PCMPESTRM128MEM:  Opc = X86::PCMPESTRM128rm;  break;
25010
8
  case X86::VPCMPESTRM128MEM: Opc = X86::VPCMPESTRM128rm; break;
25011
52
  }
25012
52
25013
52
  DebugLoc dl = MI.getDebugLoc();
25014
52
  MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(Opc));
25015
52
25016
52
  unsigned NumArgs = MI.getNumOperands();
25017
400
  for (unsigned i = 1; i < NumArgs; ++i) {
25018
348
    MachineOperand &Op = MI.getOperand(i);
25019
348
    if (!(Op.isReg() && Op.isImplicit()))
25020
244
      MIB.add(Op);
25021
348
  }
25022
52
  if (MI.hasOneMemOperand())
25023
22
    MIB->setMemRefs(MI.memoperands_begin(), MI.memoperands_end());
25024
52
25025
52
  BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), MI.getOperand(0).getReg())
25026
52
      .addReg(X86::XMM0);
25027
52
25028
52
  MI.eraseFromParent();
25029
52
  return BB;
25030
52
}
25031
25032
// FIXME: Custom handling because TableGen doesn't support multiple implicit
25033
// defs in an instruction pattern
25034
static MachineBasicBlock *emitPCMPSTRI(MachineInstr &MI, MachineBasicBlock *BB,
25035
102
                                       const TargetInstrInfo *TII) {
25036
102
  unsigned Opc;
25037
102
  switch (MI.getOpcode()) {
25038
0
  default: llvm_unreachable("illegal opcode!");
25039
21
  case X86::PCMPISTRIREG:  Opc = X86::PCMPISTRIrr;  break;
25040
19
  case X86::VPCMPISTRIREG: Opc = X86::VPCMPISTRIrr; break;
25041
3
  case X86::PCMPISTRIMEM:  Opc = X86::PCMPISTRIrm;  break;
25042
8
  case X86::VPCMPISTRIMEM: Opc = X86::VPCMPISTRIrm; break;
25043
21
  case X86::PCMPESTRIREG:  Opc = X86::PCMPESTRIrr;  break;
25044
19
  case X86::VPCMPESTRIREG: Opc = X86::VPCMPESTRIrr; break;
25045
3
  case X86::PCMPESTRIMEM:  Opc = X86::PCMPESTRIrm;  break;
25046
8
  case X86::VPCMPESTRIMEM: Opc = X86::VPCMPESTRIrm; break;
25047
102
  }
25048
102
25049
102
  DebugLoc dl = MI.getDebugLoc();
25050
102
  MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(Opc));
25051
102
25052
102
  unsigned NumArgs = MI.getNumOperands(); // remove the results
25053
700
  for (unsigned i = 1; i < NumArgs; ++i) {
25054
598
    MachineOperand &Op = MI.getOperand(i);
25055
598
    if (!(Op.isReg() && Op.isImplicit()))
25056
394
      MIB.add(Op);
25057
598
  }
25058
102
  if (MI.hasOneMemOperand())
25059
22
    MIB->setMemRefs(MI.memoperands_begin(), MI.memoperands_end());
25060
102
25061
102
  BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), MI.getOperand(0).getReg())
25062
102
      .addReg(X86::ECX);
25063
102
25064
102
  MI.eraseFromParent();
25065
102
  return BB;
25066
102
}
25067
25068
static MachineBasicBlock *emitWRPKRU(MachineInstr &MI, MachineBasicBlock *BB,
25069
1
                                     const X86Subtarget &Subtarget) {
25070
1
  DebugLoc dl = MI.getDebugLoc();
25071
1
  const TargetInstrInfo *TII = Subtarget.getInstrInfo();
25072
1
25073
1
  // insert input VAL into EAX
25074
1
  BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), X86::EAX)
25075
1
      .addReg(MI.getOperand(0).getReg());
25076
1
  // insert zero to ECX
25077
1
  BuildMI(*BB, MI, dl, TII->get(X86::MOV32r0), X86::ECX);
25078
1
25079
1
  // insert zero to EDX
25080
1
  BuildMI(*BB, MI, dl, TII->get(X86::MOV32r0), X86::EDX);
25081
1
25082
1
  // insert WRPKRU instruction
25083
1
  BuildMI(*BB, MI, dl, TII->get(X86::WRPKRUr));
25084
1
25085
1
  MI.eraseFromParent(); // The pseudo is gone now.
25086
1
  return BB;
25087
1
}
25088
25089
static MachineBasicBlock *emitRDPKRU(MachineInstr &MI, MachineBasicBlock *BB,
25090
1
                                     const X86Subtarget &Subtarget) {
25091
1
  DebugLoc dl = MI.getDebugLoc();
25092
1
  const TargetInstrInfo *TII = Subtarget.getInstrInfo();
25093
1
25094
1
  // insert zero to ECX
25095
1
  BuildMI(*BB, MI, dl, TII->get(X86::MOV32r0), X86::ECX);
25096
1
25097
1
  // insert RDPKRU instruction
25098
1
  BuildMI(*BB, MI, dl, TII->get(X86::RDPKRUr));
25099
1
  BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), MI.getOperand(0).getReg())
25100
1
      .addReg(X86::EAX);
25101
1
25102
1
  MI.eraseFromParent(); // The pseudo is gone now.
25103
1
  return BB;
25104
1
}
25105
25106
static MachineBasicBlock *emitMonitor(MachineInstr &MI, MachineBasicBlock *BB,
25107
                                      const X86Subtarget &Subtarget,
25108
18
                                      unsigned Opc) {
25109
18
  DebugLoc dl = MI.getDebugLoc();
25110
18
  const TargetInstrInfo *TII = Subtarget.getInstrInfo();
25111
18
  // Address into RAX/EAX, other two args into ECX, EDX.
25112
18
  unsigned MemOpc = Subtarget.is64Bit() ? X86::LEA64r : X86::LEA32r;
25113
18
  unsigned MemReg = Subtarget.is64Bit() ? X86::RAX : X86::EAX;
25114
18
  MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(MemOpc), MemReg);
25115
108
  for (int i = 0; i < X86::AddrNumOperands; ++i)
25116
90
    MIB.add(MI.getOperand(i));
25117
18
25118
18
  unsigned ValOps = X86::AddrNumOperands;
25119
18
  BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), X86::ECX)
25120
18
      .addReg(MI.getOperand(ValOps).getReg());
25121
18
  BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), X86::EDX)
25122
18
      .addReg(MI.getOperand(ValOps + 1).getReg());
25123
18
25124
18
  // The instruction doesn't actually take any operands though.
25125
18
  BuildMI(*BB, MI, dl, TII->get(Opc));
25126
18
25127
18
  MI.eraseFromParent(); // The pseudo is gone now.
25128
18
  return BB;
25129
18
}
25130
25131
static MachineBasicBlock *emitClzero(MachineInstr *MI, MachineBasicBlock *BB,
25132
2
                                      const X86Subtarget &Subtarget) {
25133
2
  DebugLoc dl = MI->getDebugLoc();
25134
2
  const TargetInstrInfo *TII = Subtarget.getInstrInfo();
25135
2
  // Address into RAX/EAX
25136
2
  unsigned MemOpc = Subtarget.is64Bit() ? X86::LEA64r : X86::LEA32r;
25137
2
  unsigned MemReg = Subtarget.is64Bit() ? X86::RAX : X86::EAX;
25138
2
  MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(MemOpc), MemReg);
25139
12
  for (int i = 0; i < X86::AddrNumOperands; ++i)
25140
10
    MIB.add(MI->getOperand(i));
25141
2
25142
2
  // The instruction doesn't actually take any operands though.
25143
2
  BuildMI(*BB, MI, dl, TII->get(X86::CLZEROr));
25144
2
25145
2
  MI->eraseFromParent(); // The pseudo is gone now.
25146
2
  return BB;
25147
2
}
25148
25149
25150
25151
MachineBasicBlock *
25152
X86TargetLowering::EmitVAARG64WithCustomInserter(MachineInstr &MI,
25153
0
                                                 MachineBasicBlock *MBB) const {
25154
0
  // Emit va_arg instruction on X86-64.
25155
0
25156
0
  // Operands to this pseudo-instruction:
25157
0
  // 0  ) Output        : destination address (reg)
25158
0
  // 1-5) Input         : va_list address (addr, i64mem)
25159
0
  // 6  ) ArgSize       : Size (in bytes) of vararg type
25160
0
  // 7  ) ArgMode       : 0=overflow only, 1=use gp_offset, 2=use fp_offset
25161
0
  // 8  ) Align         : Alignment of type
25162
0
  // 9  ) EFLAGS (implicit-def)
25163
0
25164
0
  assert(MI.getNumOperands() == 10 && "VAARG_64 should have 10 operands!");
25165
0
  static_assert(X86::AddrNumOperands == 5,
25166
0
                "VAARG_64 assumes 5 address operands");
25167
0
25168
0
  unsigned DestReg = MI.getOperand(0).getReg();
25169
0
  MachineOperand &Base = MI.getOperand(1);
25170
0
  MachineOperand &Scale = MI.getOperand(2);
25171
0
  MachineOperand &Index = MI.getOperand(3);
25172
0
  MachineOperand &Disp = MI.getOperand(4);
25173
0
  MachineOperand &Segment = MI.getOperand(5);
25174
0
  unsigned ArgSize = MI.getOperand(6).getImm();
25175
0
  unsigned ArgMode = MI.getOperand(7).getImm();
25176
0
  unsigned Align = MI.getOperand(8).getImm();
25177
0
25178
0
  // Memory Reference
25179
0
  assert(MI.hasOneMemOperand() && "Expected VAARG_64 to have one memoperand");
25180
0
  MachineInstr::mmo_iterator MMOBegin = MI.memoperands_begin();
25181
0
  MachineInstr::mmo_iterator MMOEnd = MI.memoperands_end();
25182
0
25183
0
  // Machine Information
25184
0
  const TargetInstrInfo *TII = Subtarget.getInstrInfo();
25185
0
  MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
25186
0
  const TargetRegisterClass *AddrRegClass = getRegClassFor(MVT::i64);
25187
0
  const TargetRegisterClass *OffsetRegClass = getRegClassFor(MVT::i32);
25188
0
  DebugLoc DL = MI.getDebugLoc();
25189
0
25190
0
  // struct va_list {
25191
0
  //   i32   gp_offset
25192
0
  //   i32   fp_offset
25193
0
  //   i64   overflow_area (address)
25194
0
  //   i64   reg_save_area (address)
25195
0
  // }
25196
0
  // sizeof(va_list) = 24
25197
0
  // alignment(va_list) = 8
25198
0
25199
0
  unsigned TotalNumIntRegs = 6;
25200
0
  unsigned TotalNumXMMRegs = 8;
25201
0
  bool UseGPOffset = (ArgMode == 1);
25202
0
  bool UseFPOffset = (ArgMode == 2);
25203
0
  unsigned MaxOffset = TotalNumIntRegs * 8 +
25204
0
                       (UseFPOffset ? TotalNumXMMRegs * 16 : 0);
25205
0
25206
0
  /* Align ArgSize to a multiple of 8 */
25207
0
  unsigned ArgSizeA8 = (ArgSize + 7) & ~7;
25208
0
  bool NeedsAlign = (Align > 8);
25209
0
25210
0
  MachineBasicBlock *thisMBB = MBB;
25211
0
  MachineBasicBlock *overflowMBB;
25212
0
  MachineBasicBlock *offsetMBB;
25213
0
  MachineBasicBlock *endMBB;
25214
0
25215
0
  unsigned OffsetDestReg = 0;    // Argument address computed by offsetMBB
25216
0
  unsigned OverflowDestReg = 0;  // Argument address computed by overflowMBB
25217
0
  unsigned OffsetReg = 0;
25218
0
25219
0
  if (!UseGPOffset && !UseFPOffset) {
25220
0
    // If we only pull from the overflow region, we don't create a branch.
25221
0
    // We don't need to alter control flow.
25222
0
    OffsetDestReg = 0; // unused
25223
0
    OverflowDestReg = DestReg;
25224
0
25225
0
    offsetMBB = nullptr;
25226
0
    overflowMBB = thisMBB;
25227
0
    endMBB = thisMBB;
25228
0
  } else {
25229
0
    // First emit code to check if gp_offset (or fp_offset) is below the bound.
25230
0
    // If so, pull the argument from reg_save_area. (branch to offsetMBB)
25231
0
    // If not, pull from overflow_area. (branch to overflowMBB)
25232
0
    //
25233
0
    //       thisMBB
25234
0
    //         |     .
25235
0
    //         |        .
25236
0
    //     offsetMBB   overflowMBB
25237
0
    //         |        .
25238
0
    //         |     .
25239
0
    //        endMBB
25240
0
25241
0
    // Registers for the PHI in endMBB
25242
0
    OffsetDestReg = MRI.createVirtualRegister(AddrRegClass);
25243
0
    OverflowDestReg = MRI.createVirtualRegister(AddrRegClass);
25244
0
25245
0
    const BasicBlock *LLVM_BB = MBB->getBasicBlock();
25246
0
    MachineFunction *MF = MBB->getParent();
25247
0
    overflowMBB = MF->CreateMachineBasicBlock(LLVM_BB);
25248
0
    offsetMBB = MF->CreateMachineBasicBlock(LLVM_BB);
25249
0
    endMBB = MF->CreateMachineBasicBlock(LLVM_BB);
25250
0
25251
0
    MachineFunction::iterator MBBIter = ++MBB->getIterator();
25252
0
25253
0
    // Insert the new basic blocks
25254
0
    MF->insert(MBBIter, offsetMBB);
25255
0
    MF->insert(MBBIter, overflowMBB);
25256
0
    MF->insert(MBBIter, endMBB);
25257
0
25258
0
    // Transfer the remainder of MBB and its successor edges to endMBB.
25259
0
    endMBB->splice(endMBB->begin(), thisMBB,
25260
0
                   std::next(MachineBasicBlock::iterator(MI)), thisMBB->end());
25261
0
    endMBB->transferSuccessorsAndUpdatePHIs(thisMBB);
25262
0
25263
0
    // Make offsetMBB and overflowMBB successors of thisMBB
25264
0
    thisMBB->addSuccessor(offsetMBB);
25265
0
    thisMBB->addSuccessor(overflowMBB);
25266
0
25267
0
    // endMBB is a successor of both offsetMBB and overflowMBB
25268
0
    offsetMBB->addSuccessor(endMBB);
25269
0
    overflowMBB->addSuccessor(endMBB);
25270
0
25271
0
    // Load the offset value into a register
25272
0
    OffsetReg = MRI.createVirtualRegister(OffsetRegClass);
25273
0
    BuildMI(thisMBB, DL, TII->get(X86::MOV32rm), OffsetReg)
25274
0
        .add(Base)
25275
0
        .add(Scale)
25276
0
        .add(Index)
25277
0
        .addDisp(Disp, UseFPOffset ? 4 : 0)
25278
0
        .add(Segment)
25279
0
        .setMemRefs(MMOBegin, MMOEnd);
25280
0
25281
0
    // Check if there is enough room left to pull this argument.
25282
0
    BuildMI(thisMBB, DL, TII->get(X86::CMP32ri))
25283
0
      .addReg(OffsetReg)
25284
0
      .addImm(MaxOffset + 8 - ArgSizeA8);
25285
0
25286
0
    // Branch to "overflowMBB" if offset >= max
25287
0
    // Fall through to "offsetMBB" otherwise
25288
0
    BuildMI(thisMBB, DL, TII->get(X86::GetCondBranchFromCond(X86::COND_AE)))
25289
0
      .addMBB(overflowMBB);
25290
0
  }
25291
0
25292
0
  // In offsetMBB, emit code to use the reg_save_area.
25293
0
  if (offsetMBB) {
25294
0
    assert(OffsetReg != 0);
25295
0
25296
0
    // Read the reg_save_area address.
25297
0
    unsigned RegSaveReg = MRI.createVirtualRegister(AddrRegClass);
25298
0
    BuildMI(offsetMBB, DL, TII->get(X86::MOV64rm), RegSaveReg)
25299
0
        .add(Base)
25300
0
        .add(Scale)
25301
0
        .add(Index)
25302
0
        .addDisp(Disp, 16)
25303
0
        .add(Segment)
25304
0
        .setMemRefs(MMOBegin, MMOEnd);
25305
0
25306
0
    // Zero-extend the offset
25307
0
    unsigned OffsetReg64 = MRI.createVirtualRegister(AddrRegClass);
25308
0
      BuildMI(offsetMBB, DL, TII->get(X86::SUBREG_TO_REG), OffsetReg64)
25309
0
        .addImm(0)
25310
0
        .addReg(OffsetReg)
25311
0
        .addImm(X86::sub_32bit);
25312
0
25313
0
    // Add the offset to the reg_save_area to get the final address.
25314
0
    BuildMI(offsetMBB, DL, TII->get(X86::ADD64rr), OffsetDestReg)
25315
0
      .addReg(OffsetReg64)
25316
0
      .addReg(RegSaveReg);
25317
0
25318
0
    // Compute the offset for the next argument
25319
0
    unsigned NextOffsetReg = MRI.createVirtualRegister(OffsetRegClass);
25320
0
    BuildMI(offsetMBB, DL, TII->get(X86::ADD32ri), NextOffsetReg)
25321
0
      .addReg(OffsetReg)
25322
0
      .addImm(UseFPOffset ? 16 : 8);
25323
0
25324
0
    // Store it back into the va_list.
25325
0
    BuildMI(offsetMBB, DL, TII->get(X86::MOV32mr))
25326
0
        .add(Base)
25327
0
        .add(Scale)
25328
0
        .add(Index)
25329
0
        .addDisp(Disp, UseFPOffset ? 4 : 0)
25330
0
        .add(Segment)
25331
0
        .addReg(NextOffsetReg)
25332
0
        .setMemRefs(MMOBegin, MMOEnd);
25333
0
25334
0
    // Jump to endMBB
25335
0
    BuildMI(offsetMBB, DL, TII->get(X86::JMP_1))
25336
0
      .addMBB(endMBB);
25337
0
  }
25338
0
25339
0
  //
25340
0
  // Emit code to use overflow area
25341
0
  //
25342
0
25343
0
  // Load the overflow_area address into a register.
25344
0
  unsigned OverflowAddrReg = MRI.createVirtualRegister(AddrRegClass);
25345
0
  BuildMI(overflowMBB, DL, TII->get(X86::MOV64rm), OverflowAddrReg)
25346
0
      .add(Base)
25347
0
      .add(Scale)
25348
0
      .add(Index)
25349
0
      .addDisp(Disp, 8)
25350
0
      .add(Segment)
25351
0
      .setMemRefs(MMOBegin, MMOEnd);
25352
0
25353
0
  // If we need to align it, do so. Otherwise, just copy the address
25354
0
  // to OverflowDestReg.
25355
0
  if (NeedsAlign) {
25356
0
    // Align the overflow address
25357
0
    assert(isPowerOf2_32(Align) && "Alignment must be a power of 2");
25358
0
    unsigned TmpReg = MRI.createVirtualRegister(AddrRegClass);
25359
0
25360
0
    // aligned_addr = (addr + (align-1)) & ~(align-1)
25361
0
    BuildMI(overflowMBB, DL, TII->get(X86::ADD64ri32), TmpReg)
25362
0
      .addReg(OverflowAddrReg)
25363
0
      .addImm(Align-1);
25364
0
25365
0
    BuildMI(overflowMBB, DL, TII->get(X86::AND64ri32), OverflowDestReg)
25366
0
      .addReg(TmpReg)
25367
0
      .addImm(~(uint64_t)(Align-1));
25368
0
  } else {
25369
0
    BuildMI(overflowMBB, DL, TII->get(TargetOpcode::COPY), OverflowDestReg)
25370
0
      .addReg(OverflowAddrReg);
25371
0
  }
25372
0
25373
0
  // Compute the next overflow address after this argument.
25374
0
  // (the overflow address should be kept 8-byte aligned)
25375
0
  unsigned NextAddrReg = MRI.createVirtualRegister(AddrRegClass);
25376
0
  BuildMI(overflowMBB, DL, TII->get(X86::ADD64ri32), NextAddrReg)
25377
0
    .addReg(OverflowDestReg)
25378
0
    .addImm(ArgSizeA8);
25379
0
25380
0
  // Store the new overflow address.
25381
0
  BuildMI(overflowMBB, DL, TII->get(X86::MOV64mr))
25382
0
      .add(Base)
25383
0
      .add(Scale)
25384
0
      .add(Index)
25385
0
      .addDisp(Disp, 8)
25386
0
      .add(Segment)
25387
0
      .addReg(NextAddrReg)
25388
0
      .setMemRefs(MMOBegin, MMOEnd);
25389
0
25390
0
  // If we branched, emit the PHI to the front of endMBB.
25391
0
  if (offsetMBB) {
25392
0
    BuildMI(*endMBB, endMBB->begin(), DL,
25393
0
            TII->get(X86::PHI), DestReg)
25394
0
      .addReg(OffsetDestReg).addMBB(offsetMBB)
25395
0
      .addReg(OverflowDestReg).addMBB(overflowMBB);
25396
0
  }
25397
0
25398
0
  // Erase the pseudo instruction
25399
0
  MI.eraseFromParent();
25400
0
25401
0
  return endMBB;
25402
0
}
25403
25404
MachineBasicBlock *X86TargetLowering::EmitVAStartSaveXMMRegsWithCustomInserter(
25405
37
    MachineInstr &MI, MachineBasicBlock *MBB) const {
25406
37
  // Emit code to save XMM registers to the stack. The ABI says that the
25407
37
  // number of registers to save is given in %al, so it's theoretically
25408
37
  // possible to do an indirect jump trick to avoid saving all of them,
25409
37
  // however this code takes a simpler approach and just executes all
25410
37
  // of the stores if %al is non-zero. It's less code, and it's probably
25411
37
  // easier on the hardware branch predictor, and stores aren't all that
25412
37
  // expensive anyway.
25413
37
25414
37
  // Create the new basic blocks. One block contains all the XMM stores,
25415
37
  // and one block is the final destination regardless of whether any
25416
37
  // stores were performed.
25417
37
  const BasicBlock *LLVM_BB = MBB->getBasicBlock();
25418
37
  MachineFunction *F = MBB->getParent();
25419
37
  MachineFunction::iterator MBBIter = ++MBB->getIterator();
25420
37
  MachineBasicBlock *XMMSaveMBB = F->CreateMachineBasicBlock(LLVM_BB);
25421
37
  MachineBasicBlock *EndMBB = F->CreateMachineBasicBlock(LLVM_BB);
25422
37
  F->insert(MBBIter, XMMSaveMBB);
25423
37
  F->insert(MBBIter, EndMBB);
25424
37
25425
37
  // Transfer the remainder of MBB and its successor edges to EndMBB.
25426
37
  EndMBB->splice(EndMBB->begin(), MBB,
25427
37
                 std::next(MachineBasicBlock::iterator(MI)), MBB->end());
25428
37
  EndMBB->transferSuccessorsAndUpdatePHIs(MBB);
25429
37
25430
37
  // The original block will now fall through to the XMM save block.
25431
37
  MBB->addSuccessor(XMMSaveMBB);
25432
37
  // The XMMSaveMBB will fall through to the end block.
25433
37
  XMMSaveMBB->addSuccessor(EndMBB);
25434
37
25435
37
  // Now add the instructions.
25436
37
  const TargetInstrInfo *TII = Subtarget.getInstrInfo();
25437
37
  DebugLoc DL = MI.getDebugLoc();
25438
37
25439
37
  unsigned CountReg = MI.getOperand(0).getReg();
25440
37
  int64_t RegSaveFrameIndex = MI.getOperand(1).getImm();
25441
37
  int64_t VarArgsFPOffset = MI.getOperand(2).getImm();
25442
37
25443
37
  if (!Subtarget.isCallingConvWin64(F->getFunction()->getCallingConv())) {
25444
37
    // If %al is 0, branch around the XMM save block.
25445
37
    BuildMI(MBB, DL, TII->get(X86::TEST8rr)).addReg(CountReg).addReg(CountReg);
25446
37
    BuildMI(MBB, DL, TII->get(X86::JE_1)).addMBB(EndMBB);
25447
37
    MBB->addSuccessor(EndMBB);
25448
37
  }
25449
37
25450
37
  // Make sure the last operand is EFLAGS, which gets clobbered by the branch
25451
37
  // that was just emitted, but clearly shouldn't be "saved".
25452
37
  assert((MI.getNumOperands() <= 3 ||
25453
37
          !MI.getOperand(MI.getNumOperands() - 1).isReg() ||
25454
37
          MI.getOperand(MI.getNumOperands() - 1).getReg() == X86::EFLAGS) &&
25455
37
         "Expected last argument to be EFLAGS");
25456
37
  unsigned MOVOpc = Subtarget.hasFp256() ? X86::VMOVAPSmr : X86::MOVAPSmr;
25457
37
  // In the XMM save block, save all the XMM argument registers.
25458
331
  for (int i = 3, e = MI.getNumOperands() - 1; i != e; ++i) {
25459
294
    int64_t Offset = (i - 3) * 16 + VarArgsFPOffset;
25460
294
    MachineMemOperand *MMO = F->getMachineMemOperand(
25461
294
        MachinePointerInfo::getFixedStack(*F, RegSaveFrameIndex, Offset),
25462
294
        MachineMemOperand::MOStore,
25463
294
        /*Size=*/16, /*Align=*/16);
25464
294
    BuildMI(XMMSaveMBB, DL, TII->get(MOVOpc))
25465
294
        .addFrameIndex(RegSaveFrameIndex)
25466
294
        .addImm(/*Scale=*/1)
25467
294
        .addReg(/*IndexReg=*/0)
25468
294
        .addImm(/*Disp=*/Offset)
25469
294
        .addReg(/*Segment=*/0)
25470
294
        .addReg(MI.getOperand(i).getReg())
25471
294
        .addMemOperand(MMO);
25472
294
  }
25473
37
25474
37
  MI.eraseFromParent(); // The pseudo instruction is gone now.
25475
37
25476
37
  return EndMBB;
25477
37
}
25478
25479
// The EFLAGS operand of SelectItr might be missing a kill marker
25480
// because there were multiple uses of EFLAGS, and ISel didn't know
25481
// which to mark. Figure out whether SelectItr should have had a
25482
// kill marker, and set it if it should. Returns the correct kill
25483
// marker value.
25484
static bool checkAndUpdateEFLAGSKill(MachineBasicBlock::iterator SelectItr,
25485
                                     MachineBasicBlock* BB,
25486
999
                                     const TargetRegisterInfo* TRI) {
25487
999
  // Scan forward through BB for a use/def of EFLAGS.
25488
999
  MachineBasicBlock::iterator miI(std::next(SelectItr));
25489
3.06k
  for (MachineBasicBlock::iterator miE = BB->end(); miI != miE; ++miI) {
25490
2.68k
    const MachineInstr& mi = *miI;
25491
2.68k
    if (mi.readsRegister(X86::EFLAGS))
25492
77
      return false;
25493
2.60k
    if (mi.definesRegister(X86::EFLAGS))
25494
539
      break; // Should have kill-flag - update below.
25495
2.68k
  }
25496
999
25497
999
  // If we hit the end of the block, check whether EFLAGS is live into a
25498
999
  // successor.
25499
922
  if (miI == BB->end()) {
25500
383
    for (MachineBasicBlock::succ_iterator sItr = BB->succ_begin(),
25501
383
                                          sEnd = BB->succ_end();
25502
447
         sItr != sEnd; ++sItr) {
25503
64
      MachineBasicBlock* succ = *sItr;
25504
64
      if (succ->isLiveIn(X86::EFLAGS))
25505
0
        return false;
25506
64
    }
25507
383
  }
25508
922
25509
922
  // We found a def, or hit the end of the basic block and EFLAGS wasn't live
25510
922
  // out. SelectMI should have a kill flag on EFLAGS.
25511
922
  SelectItr->addRegisterKilled(X86::EFLAGS, TRI);
25512
922
  return true;
25513
999
}
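The scan above can be read as a small dataflow question: is EFLAGS dead after the select? A simplified standalone model, using plain structs instead of MachineInstr (the type names and the boolean summary of successor live-ins are assumptions for illustration):

#include <cstddef>
#include <vector>

struct Inst {
  bool readsFlags;  // corresponds to mi.readsRegister(X86::EFLAGS)
  bool defsFlags;   // corresponds to mi.definesRegister(X86::EFLAGS)
};

// Returns true when the select should carry a kill flag on EFLAGS: no later
// reader in the block, and either a later def or no successor with EFLAGS
// live-in, matching the logic of checkAndUpdateEFLAGSKill above.
bool flagsKilledAtSelect(const std::vector<Inst> &Block, std::size_t SelectIdx,
                         bool FlagsLiveIntoSomeSuccessor) {
  for (std::size_t i = SelectIdx + 1; i < Block.size(); ++i) {
    if (Block[i].readsFlags)
      return false;            // a later use of EFLAGS: the select does not kill it
    if (Block[i].defsFlags)
      return true;             // redefined before any use: dead after the select
  }
  return !FlagsLiveIntoSomeSuccessor;  // fell off the block end: consult successors
}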
25514
25515
// Return true if it is OK for this CMOV pseudo-opcode to be cascaded
25516
// together with other CMOV pseudo-opcodes into a single basic-block with
25517
// conditional jump around it.
25518
2.29k
static bool isCMOVPseudo(MachineInstr &MI) {
25519
2.29k
  switch (MI.getOpcode()) {
25520
1.34k
  case X86::CMOV_FR32:
25521
1.34k
  case X86::CMOV_FR64:
25522
1.34k
  case X86::CMOV_GR8:
25523
1.34k
  case X86::CMOV_GR16:
25524
1.34k
  case X86::CMOV_GR32:
25525
1.34k
  case X86::CMOV_RFP32:
25526
1.34k
  case X86::CMOV_RFP64:
25527
1.34k
  case X86::CMOV_RFP80:
25528
1.34k
  case X86::CMOV_V2F64:
25529
1.34k
  case X86::CMOV_V2I64:
25530
1.34k
  case X86::CMOV_V4F32:
25531
1.34k
  case X86::CMOV_V4F64:
25532
1.34k
  case X86::CMOV_V4I64:
25533
1.34k
  case X86::CMOV_V16F32:
25534
1.34k
  case X86::CMOV_V8F32:
25535
1.34k
  case X86::CMOV_V8F64:
25536
1.34k
  case X86::CMOV_V8I64:
25537
1.34k
  case X86::CMOV_V8I1:
25538
1.34k
  case X86::CMOV_V16I1:
25539
1.34k
  case X86::CMOV_V32I1:
25540
1.34k
  case X86::CMOV_V64I1:
25541
1.34k
    return true;
25542
1.34k
25543
957
  default:
25544
957
    return false;
25545
0
  }
25546
0
}
25547
25548
// Helper function, which inserts PHI functions into SinkMBB:
25549
//   %Result(i) = phi [ %FalseValue(i), FalseMBB ], [ %TrueValue(i), TrueMBB ],
25550
// where %FalseValue(i) and %TrueValue(i) are taken from the consequent CMOVs
25551
// in [MIItBegin, MIItEnd) range. It returns the last MachineInstrBuilder for
25552
// the last PHI function inserted.
25553
static MachineInstrBuilder createPHIsForCMOVsInSinkBB(
25554
    MachineBasicBlock::iterator MIItBegin, MachineBasicBlock::iterator MIItEnd,
25555
    MachineBasicBlock *TrueMBB, MachineBasicBlock *FalseMBB,
25556
984
    MachineBasicBlock *SinkMBB) {
25557
984
  MachineFunction *MF = TrueMBB->getParent();
25558
984
  const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo();
25559
984
  DebugLoc DL = MIItBegin->getDebugLoc();
25560
984
25561
984
  X86::CondCode CC = X86::CondCode(MIItBegin->getOperand(3).getImm());
25562
984
  X86::CondCode OppCC = X86::GetOppositeBranchCondition(CC);
25563
984
25564
984
  MachineBasicBlock::iterator SinkInsertionPoint = SinkMBB->begin();
25565
984
25566
984
  // As we are creating the PHIs, we have to be careful if there is more than
25567
984
  // one.  Later CMOVs may reference the results of earlier CMOVs, but later
25568
984
  // PHIs have to reference the individual true/false inputs from earlier PHIs.
25569
984
  // That also means that PHI construction must work forward from earlier to
25570
984
  // later, and that the code must maintain a mapping from earlier PHI's
25571
984
  // destination registers, and the registers that went into the PHI.
25572
984
  DenseMap<unsigned, std::pair<unsigned, unsigned>> RegRewriteTable;
25573
984
  MachineInstrBuilder MIB;
25574
984
25575
2.28k
  for (MachineBasicBlock::iterator MIIt = MIItBegin; MIIt != MIItEnd; ++MIIt) {
25576
1.30k
    unsigned DestReg = MIIt->getOperand(0).getReg();
25577
1.30k
    unsigned Op1Reg = MIIt->getOperand(1).getReg();
25578
1.30k
    unsigned Op2Reg = MIIt->getOperand(2).getReg();
25579
1.30k
25580
1.30k
    // If this CMOV we are generating is the opposite condition from
25581
1.30k
    // the jump we generated, then we have to swap the operands for the
25582
1.30k
    // PHI that is going to be generated.
25583
1.30k
    if (MIIt->getOperand(3).getImm() == OppCC)
25584
3
      std::swap(Op1Reg, Op2Reg);
25585
1.30k
25586
1.30k
    if (RegRewriteTable.find(Op1Reg) != RegRewriteTable.end())
25587
4
      Op1Reg = RegRewriteTable[Op1Reg].first;
25588
1.30k
25589
1.30k
    if (RegRewriteTable.find(Op2Reg) != RegRewriteTable.end())
25590
4
      Op2Reg = RegRewriteTable[Op2Reg].second;
25591
1.30k
25592
1.30k
    MIB = BuildMI(*SinkMBB, SinkInsertionPoint, DL, TII->get(X86::PHI), DestReg)
25593
1.30k
              .addReg(Op1Reg)
25594
1.30k
              .addMBB(FalseMBB)
25595
1.30k
              .addReg(Op2Reg)
25596
1.30k
              .addMBB(TrueMBB);
25597
1.30k
25598
1.30k
    // Add this PHI to the rewrite table.
25599
1.30k
    RegRewriteTable[DestReg] = std::make_pair(Op1Reg, Op2Reg);
25600
1.30k
  }
25601
984
25602
984
  return MIB;
25603
984
}
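The RegRewriteTable above exists because a later CMOV in the chain may consume the result of an earlier one, and after lowering that result is only defined by a PHI in the sink block, not on either incoming edge. A minimal sketch of the same renaming with plain maps (the Select/Phi records and the function name are illustrative assumptions, not the listed file's API):

#include <map>
#include <utility>
#include <vector>

struct Select { int Dest, FalseIn, TrueIn; };           // one CMOV of the chain
struct Phi    { int Dest, FromFalseMBB, FromTrueMBB; };  // the PHI it becomes

std::vector<Phi> lowerChainToPhis(const std::vector<Select> &Chain) {
  // Dest -> (value reaching the sink on the FalseMBB edge, on the TrueMBB edge),
  // playing the role of RegRewriteTable above.
  std::map<int, std::pair<int, int>> Rewrite;
  std::vector<Phi> Phis;
  for (const Select &S : Chain) {
    int F = S.FalseIn, T = S.TrueIn;
    // If an input was produced by an earlier select, substitute the value
    // that actually reaches the sink block on the corresponding edge.
    if (auto It = Rewrite.find(F); It != Rewrite.end()) F = It->second.first;
    if (auto It = Rewrite.find(T); It != Rewrite.end()) T = It->second.second;
    Phis.push_back({S.Dest, F, T});
    Rewrite[S.Dest] = {F, T};
  }
  return Phis;
}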
25604
25605
// Lower cascaded selects in form of (SecondCmov (FirstCMOV F, T, cc1), T, cc2).
25606
MachineBasicBlock *
25607
X86TargetLowering::EmitLoweredCascadedSelect(MachineInstr &FirstCMOV,
25608
                                             MachineInstr &SecondCascadedCMOV,
25609
15
                                             MachineBasicBlock *ThisMBB) const {
25610
15
  const TargetInstrInfo *TII = Subtarget.getInstrInfo();
25611
15
  DebugLoc DL = FirstCMOV.getDebugLoc();
25612
15
25613
15
  // We lower cascaded CMOVs such as
25614
15
  //
25615
15
  //   (SecondCascadedCMOV (FirstCMOV F, T, cc1), T, cc2)
25616
15
  //
25617
15
  // to two successive branches.
25618
15
  //
25619
15
  // Without this, we would add a PHI between the two jumps, which ends up
25620
15
  // creating a few copies all around. For instance, for
25621
15
  //
25622
15
  //    (sitofp (zext (fcmp une)))
25623
15
  //
25624
15
  // we would generate:
25625
15
  //
25626
15
  //         ucomiss %xmm1, %xmm0
25627
15
  //         movss  <1.0f>, %xmm0
25628
15
  //         movaps  %xmm0, %xmm1
25629
15
  //         jne     .LBB5_2
25630
15
  //         xorps   %xmm1, %xmm1
25631
15
  // .LBB5_2:
25632
15
  //         jp      .LBB5_4
25633
15
  //         movaps  %xmm1, %xmm0
25634
15
  // .LBB5_4:
25635
15
  //         retq
25636
15
  //
25637
15
  // because this custom-inserter would have generated:
25638
15
  //
25639
15
  //   A
25640
15
  //   | \
25641
15
  //   |  B
25642
15
  //   | /
25643
15
  //   C
25644
15
  //   | \
25645
15
  //   |  D
25646
15
  //   | /
25647
15
  //   E
25648
15
  //
25649
15
  // A: X = ...; Y = ...
25650
15
  // B: empty
25651
15
  // C: Z = PHI [X, A], [Y, B]
25652
15
  // D: empty
25653
15
  // E: PHI [X, C], [Z, D]
25654
15
  //
25655
15
  // If we lower both CMOVs in a single step, we can instead generate:
25656
15
  //
25657
15
  //   A
25658
15
  //   | \
25659
15
  //   |  C
25660
15
  //   | /|
25661
15
  //   |/ |
25662
15
  //   |  |
25663
15
  //   |  D
25664
15
  //   | /
25665
15
  //   E
25666
15
  //
25667
15
  // A: X = ...; Y = ...
25668
15
  // D: empty
25669
15
  // E: PHI [X, A], [X, C], [Y, D]
25670
15
  //
25671
15
  // Which, in our sitofp/fcmp example, gives us something like:
25672
15
  //
25673
15
  //         ucomiss %xmm1, %xmm0
25674
15
  //         movss  <1.0f>, %xmm0
25675
15
  //         jne     .LBB5_4
25676
15
  //         jp      .LBB5_4
25677
15
  //         xorps   %xmm0, %xmm0
25678
15
  // .LBB5_4:
25679
15
  //         retq
25680
15
  //
25681
15
25682
15
  // We lower cascaded CMOV into two successive branches to the same block.
25683
15
  // EFLAGS is used by both, so mark it as live in the second.
25684
15
  const BasicBlock *LLVM_BB = ThisMBB->getBasicBlock();
25685
15
  MachineFunction *F = ThisMBB->getParent();
25686
15
  MachineBasicBlock *FirstInsertedMBB = F->CreateMachineBasicBlock(LLVM_BB);
25687
15
  MachineBasicBlock *SecondInsertedMBB = F->CreateMachineBasicBlock(LLVM_BB);
25688
15
  MachineBasicBlock *SinkMBB = F->CreateMachineBasicBlock(LLVM_BB);
25689
15
25690
15
  MachineFunction::iterator It = ++ThisMBB->getIterator();
25691
15
  F->insert(It, FirstInsertedMBB);
25692
15
  F->insert(It, SecondInsertedMBB);
25693
15
  F->insert(It, SinkMBB);
25694
15
25695
15
  // For a cascaded CMOV, we lower it to two successive branches to
25696
15
  // the same block (SinkMBB).  EFLAGS is used by both, so mark it as live in
25697
15
  // the FirstInsertedMBB.
25698
15
  FirstInsertedMBB->addLiveIn(X86::EFLAGS);
25699
15
25700
15
  // If the EFLAGS register isn't dead in the terminator, then claim that it's
25701
15
  // live into the sink and copy blocks.
25702
15
  const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
25703
15
  if (!SecondCascadedCMOV.killsRegister(X86::EFLAGS) &&
25704
15
      !checkAndUpdateEFLAGSKill(SecondCascadedCMOV, ThisMBB, TRI)) {
25705
3
    SecondInsertedMBB->addLiveIn(X86::EFLAGS);
25706
3
    SinkMBB->addLiveIn(X86::EFLAGS);
25707
3
  }
25708
15
25709
15
  // Transfer the remainder of ThisMBB and its successor edges to SinkMBB.
25710
15
  SinkMBB->splice(SinkMBB->begin(), ThisMBB,
25711
15
                  std::next(MachineBasicBlock::iterator(FirstCMOV)),
25712
15
                  ThisMBB->end());
25713
15
  SinkMBB->transferSuccessorsAndUpdatePHIs(ThisMBB);
25714
15
25715
15
  // Fallthrough block for ThisMBB.
25716
15
  ThisMBB->addSuccessor(FirstInsertedMBB);
25717
15
  // The true block target of the first branch is always SinkMBB.
25718
15
  ThisMBB->addSuccessor(SinkMBB);
25719
15
  // Fallthrough block for FirstInsertedMBB.
25720
15
  FirstInsertedMBB->addSuccessor(SecondInsertedMBB);
25721
15
  // The true block for the branch of FirstInsertedMBB.
25722
15
  FirstInsertedMBB->addSuccessor(SinkMBB);
25723
15
  // This is fallthrough.
25724
15
  SecondInsertedMBB->addSuccessor(SinkMBB);
25725
15
25726
15
  // Create the conditional branch instructions.
25727
15
  X86::CondCode FirstCC = X86::CondCode(FirstCMOV.getOperand(3).getImm());
25728
15
  unsigned Opc = X86::GetCondBranchFromCond(FirstCC);
25729
15
  BuildMI(ThisMBB, DL, TII->get(Opc)).addMBB(SinkMBB);
25730
15
25731
15
  X86::CondCode SecondCC =
25732
15
      X86::CondCode(SecondCascadedCMOV.getOperand(3).getImm());
25733
15
  unsigned Opc2 = X86::GetCondBranchFromCond(SecondCC);
25734
15
  BuildMI(FirstInsertedMBB, DL, TII->get(Opc2)).addMBB(SinkMBB);
25735
15
25736
15
  //  SinkMBB:
25737
15
  //   %Result = phi [ %FalseValue, SecondInsertedMBB ], [ %TrueValue, ThisMBB ]
25738
15
  unsigned DestReg = FirstCMOV.getOperand(0).getReg();
25739
15
  unsigned Op1Reg = FirstCMOV.getOperand(1).getReg();
25740
15
  unsigned Op2Reg = FirstCMOV.getOperand(2).getReg();
25741
15
  MachineInstrBuilder MIB =
25742
15
      BuildMI(*SinkMBB, SinkMBB->begin(), DL, TII->get(X86::PHI), DestReg)
25743
15
          .addReg(Op1Reg)
25744
15
          .addMBB(SecondInsertedMBB)
25745
15
          .addReg(Op2Reg)
25746
15
          .addMBB(ThisMBB);
25747
15
25748
15
  // The second SecondInsertedMBB provides the same incoming value as the
25749
15
  // FirstInsertedMBB (the True operand of the SELECT_CC/CMOV nodes).
25750
15
  MIB.addReg(FirstCMOV.getOperand(2).getReg()).addMBB(FirstInsertedMBB);
25751
15
  // Copy the PHI result to the register defined by the second CMOV.
25752
15
  BuildMI(*SinkMBB, std::next(MachineBasicBlock::iterator(MIB.getInstr())), DL,
25753
15
          TII->get(TargetOpcode::COPY),
25754
15
          SecondCascadedCMOV.getOperand(0).getReg())
25755
15
      .addReg(FirstCMOV.getOperand(0).getReg());
25756
15
25757
15
  // Now remove the CMOVs.
25758
15
  FirstCMOV.eraseFromParent();
25759
15
  SecondCascadedCMOV.eraseFromParent();
25760
15
25761
15
  return SinkMBB;
25762
15
}
25763
25764
MachineBasicBlock *
25765
X86TargetLowering::EmitLoweredSelect(MachineInstr &MI,
25766
999
                                     MachineBasicBlock *ThisMBB) const {
25767
999
  const TargetInstrInfo *TII = Subtarget.getInstrInfo();
25768
999
  DebugLoc DL = MI.getDebugLoc();
25769
999
25770
999
  // To "insert" a SELECT_CC instruction, we actually have to insert the
25771
999
  // diamond control-flow pattern.  The incoming instruction knows the
25772
999
  // destination vreg to set, the condition code register to branch on, the
25773
999
  // true/false values to select between and a branch opcode to use.
25774
999
25775
999
  //  ThisMBB:
25776
999
  //  ...
25777
999
  //   TrueVal = ...
25778
999
  //   cmpTY ccX, r1, r2
25779
999
  //   bCC copy1MBB
25780
999
  //   fallthrough --> FalseMBB
25781
999
25782
999
  // This code lowers all pseudo-CMOV instructions. Generally it lowers these
25783
999
  // as described above, by inserting a BB, and then making a PHI at the join
25784
999
  // point to select the true and false operands of the CMOV in the PHI.
25785
999
  //
25786
999
  // The code also handles two different cases of multiple CMOV opcodes
25787
999
  // in a row.
25788
999
  //
25789
999
  // Case 1:
25790
999
  // In this case, there are multiple CMOVs in a row, all which are based on
25791
999
  // the same condition setting (or the exact opposite condition setting).
25792
999
  // In this case we can lower all the CMOVs using a single inserted BB, and
25793
999
  // then make a number of PHIs at the join point to model the CMOVs. The only
25794
999
  // trickiness here, is that in a case like:
25795
999
  //
25796
999
  // t2 = CMOV cond1 t1, f1
25797
999
  // t3 = CMOV cond1 t2, f2
25798
999
  //
25799
999
  // when rewriting this into PHIs, we have to perform some renaming on the
25800
999
  // temps since you cannot have a PHI operand refer to a PHI result earlier
25801
999
  // in the same block.  The "simple" but wrong lowering would be:
25802
999
  //
25803
999
  // t2 = PHI t1(BB1), f1(BB2)
25804
999
  // t3 = PHI t2(BB1), f2(BB2)
25805
999
  //
25806
999
  // but clearly t2 is not defined in BB1, so that is incorrect. The proper
25807
999
  // renaming is to note that on the path through BB1, t2 is really just a
25808
999
  // copy of t1, and do that renaming, properly generating:
25809
999
  //
25810
999
  // t2 = PHI t1(BB1), f1(BB2)
25811
999
  // t3 = PHI t1(BB1), f2(BB2)
25812
999
  //
25813
999
  // Case 2:
25814
999
  // CMOV ((CMOV F, T, cc1), T, cc2) is checked here and handled by a separate
25815
999
  // function - EmitLoweredCascadedSelect.
25816
999
25817
999
  X86::CondCode CC = X86::CondCode(MI.getOperand(3).getImm());
25818
999
  X86::CondCode OppCC = X86::GetOppositeBranchCondition(CC);
25819
999
  MachineInstr *LastCMOV = &MI;
25820
999
  MachineBasicBlock::iterator NextMIIt =
25821
999
      std::next(MachineBasicBlock::iterator(MI));
25822
999
25823
999
  // Check for case 1, where there are multiple CMOVs with the same condition
25824
999
  // first.  Of the two cases of multiple CMOV lowerings, case 1 reduces the
25825
999
  // number of jumps the most.
25826
999
25827
999
  if (isCMOVPseudo(MI)) {
25828
990
    // See if we have a string of CMOVS with the same condition.
25829
1.30k
    while (NextMIIt != ThisMBB->end() && isCMOVPseudo(*NextMIIt) &&
25830
351
           (NextMIIt->getOperand(3).getImm() == CC ||
25831
1.30k
            NextMIIt->getOperand(3).getImm() == OppCC)) {
25832
317
      LastCMOV = &*NextMIIt;
25833
317
      ++NextMIIt;
25834
317
    }
25835
990
  }
25836
999
25837
999
  // This checks for case 2, but only do this if we didn't already find
25838
999
  // case 1, as indicated by LastCMOV == MI.
25839
999
  if (LastCMOV == &MI && NextMIIt != ThisMBB->end() &&
25840
862
      NextMIIt->getOpcode() == MI.getOpcode() &&
25841
32
      NextMIIt->getOperand(2).getReg() == MI.getOperand(2).getReg() &&
25842
17
      NextMIIt->getOperand(1).getReg() == MI.getOperand(0).getReg() &&
25843
999
      NextMIIt->getOperand(1).isKill()) {
25844
15
    return EmitLoweredCascadedSelect(MI, *NextMIIt, ThisMBB);
25845
15
  }
25846
984
25847
984
  const BasicBlock *LLVM_BB = ThisMBB->getBasicBlock();
25848
984
  MachineFunction *F = ThisMBB->getParent();
25849
984
  MachineBasicBlock *FalseMBB = F->CreateMachineBasicBlock(LLVM_BB);
25850
984
  MachineBasicBlock *SinkMBB = F->CreateMachineBasicBlock(LLVM_BB);
25851
984
25852
984
  MachineFunction::iterator It = ++ThisMBB->getIterator();
25853
984
  F->insert(It, FalseMBB);
25854
984
  F->insert(It, SinkMBB);
25855
984
25856
984
  // If the EFLAGS register isn't dead in the terminator, then claim that it's
25857
984
  // live into the sink and copy blocks.
25858
984
  const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
25859
984
  if (!LastCMOV->killsRegister(X86::EFLAGS) &&
25860
984
      !checkAndUpdateEFLAGSKill(LastCMOV, ThisMBB, TRI)) {
25861
74
    FalseMBB->addLiveIn(X86::EFLAGS);
25862
74
    SinkMBB->addLiveIn(X86::EFLAGS);
25863
74
  }
25864
999
25865
999
  // Transfer the remainder of ThisMBB and its successor edges to SinkMBB.
25866
999
  SinkMBB->splice(SinkMBB->begin(), ThisMBB,
25867
999
                  std::next(MachineBasicBlock::iterator(LastCMOV)),
25868
999
                  ThisMBB->end());
25869
999
  SinkMBB->transferSuccessorsAndUpdatePHIs(ThisMBB);
25870
999
25871
999
  // Fallthrough block for ThisMBB.
25872
999
  ThisMBB->addSuccessor(FalseMBB);
25873
999
  // The true block target of the first (or only) branch is always a SinkMBB.
25874
999
  ThisMBB->addSuccessor(SinkMBB);
25875
999
  // Fallthrough block for FalseMBB.
25876
999
  FalseMBB->addSuccessor(SinkMBB);
25877
999
25878
999
  // Create the conditional branch instruction.
25879
999
  unsigned Opc = X86::GetCondBranchFromCond(CC);
25880
999
  BuildMI(ThisMBB, DL, TII->get(Opc)).addMBB(SinkMBB);
25881
999
25882
999
  //  SinkMBB:
25883
999
  //   %Result = phi [ %FalseValue, FalseMBB ], [ %TrueValue, ThisMBB ]
25884
999
  //  ...
25885
999
  MachineBasicBlock::iterator MIItBegin = MachineBasicBlock::iterator(MI);
25886
999
  MachineBasicBlock::iterator MIItEnd =
25887
999
      std::next(MachineBasicBlock::iterator(LastCMOV));
25888
999
  createPHIsForCMOVsInSinkBB(MIItBegin, MIItEnd, ThisMBB, FalseMBB, SinkMBB);
25889
999
25890
999
  // Now remove the CMOV(s).
25891
999
  ThisMBB->erase(MIItBegin, MIItEnd);
25892
999
25893
999
  return SinkMBB;
25894
999
}
25895
25896
MachineBasicBlock *
25897
X86TargetLowering::EmitLoweredAtomicFP(MachineInstr &MI,
25898
18
                                       MachineBasicBlock *BB) const {
25899
18
  // Combine the following atomic floating-point modification pattern:
25900
18
  //   a.store(reg OP a.load(acquire), release)
25901
18
  // Transform them into:
25902
18
  //   OPss (%gpr), %xmm
25903
18
  //   movss %xmm, (%gpr)
25904
18
  // Or sd equivalent for 64-bit operations.
25905
18
  unsigned MOp, FOp;
25906
18
  switch (MI.getOpcode()) {
25907
0
  default: llvm_unreachable("unexpected instr type for EmitLoweredAtomicFP");
25908
8
  case X86::RELEASE_FADD32mr:
25909
8
    FOp = X86::ADDSSrm;
25910
8
    MOp = X86::MOVSSmr;
25911
8
    break;
25912
10
  case X86::RELEASE_FADD64mr:
25913
10
    FOp = X86::ADDSDrm;
25914
10
    MOp = X86::MOVSDmr;
25915
10
    break;
25916
18
  }
25917
18
  const X86InstrInfo *TII = Subtarget.getInstrInfo();
25918
18
  DebugLoc DL = MI.getDebugLoc();
25919
18
  MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
25920
18
  unsigned ValOpIdx = X86::AddrNumOperands;
25921
18
  unsigned VSrc = MI.getOperand(ValOpIdx).getReg();
25922
18
  MachineInstrBuilder MIB =
25923
18
      BuildMI(*BB, MI, DL, TII->get(FOp),
25924
18
              MRI.createVirtualRegister(MRI.getRegClass(VSrc)))
25925
18
          .addReg(VSrc);
25926
108
  for (int i = 0; i < X86::AddrNumOperands; ++i) {
25927
90
    MachineOperand &Operand = MI.getOperand(i);
25928
90
    // Clear any kill flags on register operands as we'll create a second
25929
90
    // instruction using the same address operands.
25930
90
    if (Operand.isReg())
25931
50
      Operand.setIsKill(false);
25932
90
    MIB.add(Operand);
25933
90
  }
25934
18
  MachineInstr *FOpMI = MIB;
25935
18
  MIB = BuildMI(*BB, MI, DL, TII->get(MOp));
25936
108
  for (int i = 0; i < X86::AddrNumOperands; ++i)
25937
90
    MIB.add(MI.getOperand(i));
25938
18
  MIB.addReg(FOpMI->getOperand(0).getReg(), RegState::Kill);
25939
18
  MI.eraseFromParent(); // The pseudo instruction is gone now.
25940
18
  return BB;
25941
18
}
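For orientation, the pattern this custom inserter targets can be written at the source level. This is a hedged illustration only: whether a given frontend and combine pipeline forms the RELEASE_FADD pseudo for this exact code is not guaranteed, but the shape matches the comment at the top of the function.

#include <atomic>

// a.store(reg OP a.load(acquire), release), the shape matched above.
void faddRelease(std::atomic<float> &A, float Reg) {
  A.store(Reg + A.load(std::memory_order_acquire), std::memory_order_release);
}
// Intended lowering, per the comment above:
//   addss (%gpr), %xmm
//   movss %xmm, (%gpr)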
25942
25943
MachineBasicBlock *
25944
X86TargetLowering::EmitLoweredSegAlloca(MachineInstr &MI,
25945
6
                                        MachineBasicBlock *BB) const {
25946
6
  MachineFunction *MF = BB->getParent();
25947
6
  const TargetInstrInfo *TII = Subtarget.getInstrInfo();
25948
6
  DebugLoc DL = MI.getDebugLoc();
25949
6
  const BasicBlock *LLVM_BB = BB->getBasicBlock();
25950
6
25951
6
  assert(MF->shouldSplitStack());
25952
6
25953
6
  const bool Is64Bit = Subtarget.is64Bit();
25954
6
  const bool IsLP64 = Subtarget.isTarget64BitLP64();
25955
6
25956
6
  const unsigned TlsReg = Is64Bit ? X86::FS : X86::GS;
25957
6
  const unsigned TlsOffset = IsLP64 ? 0x70 : Is64Bit ? 0x40 : 0x30;
25958
6
25959
6
  // BB:
25960
6
  //  ... [Till the alloca]
25961
6
  // If stacklet is not large enough, jump to mallocMBB
25962
6
  //
25963
6
  // bumpMBB:
25964
6
  //  Allocate by subtracting from RSP
25965
6
  //  Jump to continueMBB
25966
6
  //
25967
6
  // mallocMBB:
25968
6
  //  Allocate by call to runtime
25969
6
  //
25970
6
  // continueMBB:
25971
6
  //  ...
25972
6
  //  [rest of original BB]
25973
6
  //
25974
6
25975
6
  MachineBasicBlock *mallocMBB = MF->CreateMachineBasicBlock(LLVM_BB);
25976
6
  MachineBasicBlock *bumpMBB = MF->CreateMachineBasicBlock(LLVM_BB);
25977
6
  MachineBasicBlock *continueMBB = MF->CreateMachineBasicBlock(LLVM_BB);
25978
6
25979
6
  MachineRegisterInfo &MRI = MF->getRegInfo();
25980
6
  const TargetRegisterClass *AddrRegClass =
25981
6
      getRegClassFor(getPointerTy(MF->getDataLayout()));
25982
6
25983
6
  unsigned mallocPtrVReg = MRI.createVirtualRegister(AddrRegClass),
25984
6
           bumpSPPtrVReg = MRI.createVirtualRegister(AddrRegClass),
25985
6
           tmpSPVReg = MRI.createVirtualRegister(AddrRegClass),
25986
6
           SPLimitVReg = MRI.createVirtualRegister(AddrRegClass),
25987
6
           sizeVReg = MI.getOperand(1).getReg(),
25988
6
           physSPReg =
25989
6
               IsLP64 || Subtarget.isTargetNaCl64() ? X86::RSP : X86::ESP;
25990
6
25991
6
  MachineFunction::iterator MBBIter = ++BB->getIterator();
25992
6
25993
6
  MF->insert(MBBIter, bumpMBB);
25994
6
  MF->insert(MBBIter, mallocMBB);
25995
6
  MF->insert(MBBIter, continueMBB);
25996
6
25997
6
  continueMBB->splice(continueMBB->begin(), BB,
25998
6
                      std::next(MachineBasicBlock::iterator(MI)), BB->end());
25999
6
  continueMBB->transferSuccessorsAndUpdatePHIs(BB);
26000
6
26001
6
  // Add code to the main basic block to check if the stack limit has been hit,
26002
6
  // and if so, jump to mallocMBB otherwise to bumpMBB.
26003
6
  BuildMI(BB, DL, TII->get(TargetOpcode::COPY), tmpSPVReg).addReg(physSPReg);
26004
6
  BuildMI(BB, DL, TII->get(IsLP64 ? X86::SUB64rr : X86::SUB32rr), SPLimitVReg)
26005
6
    .addReg(tmpSPVReg).addReg(sizeVReg);
26006
6
  BuildMI(BB, DL, TII->get(IsLP64 ? X86::CMP64mr : X86::CMP32mr))
26007
6
    .addReg(0).addImm(1).addReg(0).addImm(TlsOffset).addReg(TlsReg)
26008
6
    .addReg(SPLimitVReg);
26009
6
  BuildMI(BB, DL, TII->get(X86::JG_1)).addMBB(mallocMBB);
26010
6
26011
6
  // bumpMBB simply decreases the stack pointer, since we know the current
26012
6
  // stacklet has enough space.
26013
6
  BuildMI(bumpMBB, DL, TII->get(TargetOpcode::COPY), physSPReg)
26014
6
    .addReg(SPLimitVReg);
26015
6
  BuildMI(bumpMBB, DL, TII->get(TargetOpcode::COPY), bumpSPPtrVReg)
26016
6
    .addReg(SPLimitVReg);
26017
6
  BuildMI(bumpMBB, DL, TII->get(X86::JMP_1)).addMBB(continueMBB);
26018
6
26019
6
  // Calls into a routine in libgcc to allocate more space from the heap.
26020
6
  const uint32_t *RegMask =
26021
6
      Subtarget.getRegisterInfo()->getCallPreservedMask(*MF, CallingConv::C);
26022
6
  if (IsLP64) {
26023
2
    BuildMI(mallocMBB, DL, TII->get(X86::MOV64rr), X86::RDI)
26024
2
      .addReg(sizeVReg);
26025
2
    BuildMI(mallocMBB, DL, TII->get(X86::CALL64pcrel32))
26026
2
      .addExternalSymbol("__morestack_allocate_stack_space")
26027
2
      .addRegMask(RegMask)
26028
2
      .addReg(X86::RDI, RegState::Implicit)
26029
2
      .addReg(X86::RAX, RegState::ImplicitDefine);
26030
6
  } else if (Is64Bit) {
26031
2
    BuildMI(mallocMBB, DL, TII->get(X86::MOV32rr), X86::EDI)
26032
2
      .addReg(sizeVReg);
26033
2
    BuildMI(mallocMBB, DL, TII->get(X86::CALL64pcrel32))
26034
2
      .addExternalSymbol("__morestack_allocate_stack_space")
26035
2
      .addRegMask(RegMask)
26036
2
      .addReg(X86::EDI, RegState::Implicit)
26037
2
      .addReg(X86::EAX, RegState::ImplicitDefine);
26038
4
  } else {
26039
2
    BuildMI(mallocMBB, DL, TII->get(X86::SUB32ri), physSPReg).addReg(physSPReg)
26040
2
      .addImm(12);
26041
2
    BuildMI(mallocMBB, DL, TII->get(X86::PUSH32r)).addReg(sizeVReg);
26042
2
    BuildMI(mallocMBB, DL, TII->get(X86::CALLpcrel32))
26043
2
      .addExternalSymbol("__morestack_allocate_stack_space")
26044
2
      .addRegMask(RegMask)
26045
2
      .addReg(X86::EAX, RegState::ImplicitDefine);
26046
2
  }
26047
6
26048
6
  if (!Is64Bit)
26049
2
    BuildMI(mallocMBB, DL, TII->get(X86::ADD32ri), physSPReg).addReg(physSPReg)
26050
2
      .addImm(16);
26051
6
26052
6
  BuildMI(mallocMBB, DL, TII->get(TargetOpcode::COPY), mallocPtrVReg)
26053
6
    .addReg(IsLP64 ? X86::RAX : X86::EAX);
26054
6
  BuildMI(mallocMBB, DL, TII->get(X86::JMP_1)).addMBB(continueMBB);
26055
6
26056
6
  // Set up the CFG correctly.
26057
6
  BB->addSuccessor(bumpMBB);
26058
6
  BB->addSuccessor(mallocMBB);
26059
6
  mallocMBB->addSuccessor(continueMBB);
26060
6
  bumpMBB->addSuccessor(continueMBB);
26061
6
26062
6
  // Take care of the PHI nodes.
26063
6
  BuildMI(*continueMBB, continueMBB->begin(), DL, TII->get(X86::PHI),
26064
6
          MI.getOperand(0).getReg())
26065
6
      .addReg(mallocPtrVReg)
26066
6
      .addMBB(mallocMBB)
26067
6
      .addReg(bumpSPPtrVReg)
26068
6
      .addMBB(bumpMBB);
26069
6
26070
6
  // Delete the original pseudo instruction.
26071
6
  MI.eraseFromParent();
26072
6
26073
6
  // And we're done.
26074
6
  return continueMBB;
26075
6
}
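The diamond built above (bumpMBB vs. mallocMBB joining in continueMBB) amounts to a bounded stack bump with a runtime fallback. A high-level model under stated assumptions: the stack-limit value is read from a TLS slot in the real code, the heap stand-in replaces the __morestack_allocate_stack_space call, and the unsigned comparison approximates the emitted CMP/JG:

#include <cstddef>
#include <cstdint>
#include <new>

// Stand-in for the libgcc routine called from mallocMBB above.
static void *allocateFromRuntime(std::size_t Size) { return ::operator new(Size); }

// If the current stacklet has room, allocate by moving the stack pointer
// (bumpMBB); otherwise fall back to the runtime (mallocMBB).
void *segAlloca(std::uintptr_t &SP, std::uintptr_t StackLimit, std::size_t Size) {
  std::uintptr_t Bumped = SP - Size;        // SUBrr of the size from the copied SP
  if (Bumped >= StackLimit) {               // CMP against the TLS stack limit
    SP = Bumped;                            // bumpMBB: SP takes the new value
    return reinterpret_cast<void *>(Bumped);
  }
  return allocateFromRuntime(Size);         // mallocMBB path
}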
26076
26077
MachineBasicBlock *
26078
X86TargetLowering::EmitLoweredCatchRet(MachineInstr &MI,
26079
61
                                       MachineBasicBlock *BB) const {
26080
61
  MachineFunction *MF = BB->getParent();
26081
61
  const TargetInstrInfo &TII = *Subtarget.getInstrInfo();
26082
61
  MachineBasicBlock *TargetMBB = MI.getOperand(0).getMBB();
26083
61
  DebugLoc DL = MI.getDebugLoc();
26084
61
26085
61
  assert(!isAsynchronousEHPersonality(
26086
61
             classifyEHPersonality(MF->getFunction()->getPersonalityFn())) &&
26087
61
         "SEH does not use catchret!");
26088
61
26089
61
  // Only 32-bit EH needs to worry about manually restoring stack pointers.
26090
61
  if (!Subtarget.is32Bit())
26091
43
    return BB;
26092
18
26093
18
  // C++ EH creates a new target block to hold the restore code, and wires up
26094
18
  // the new block to the return destination with a normal JMP_4.
26095
18
  MachineBasicBlock *RestoreMBB =
26096
18
      MF->CreateMachineBasicBlock(BB->getBasicBlock());
26097
18
  assert(BB->succ_size() == 1);
26098
18
  MF->insert(std::next(BB->getIterator()), RestoreMBB);
26099
18
  RestoreMBB->transferSuccessorsAndUpdatePHIs(BB);
26100
18
  BB->addSuccessor(RestoreMBB);
26101
18
  MI.getOperand(0).setMBB(RestoreMBB);
26102
18
26103
18
  auto RestoreMBBI = RestoreMBB->begin();
26104
18
  BuildMI(*RestoreMBB, RestoreMBBI, DL, TII.get(X86::EH_RESTORE));
26105
18
  BuildMI(*RestoreMBB, RestoreMBBI, DL, TII.get(X86::JMP_4)).addMBB(TargetMBB);
26106
18
  return BB;
26107
18
}
26108
26109
MachineBasicBlock *
26110
X86TargetLowering::EmitLoweredCatchPad(MachineInstr &MI,
26111
104
                                       MachineBasicBlock *BB) const {
26112
104
  MachineFunction *MF = BB->getParent();
26113
104
  const Constant *PerFn = MF->getFunction()->getPersonalityFn();
26114
104
  bool IsSEH = isAsynchronousEHPersonality(classifyEHPersonality(PerFn));
26115
104
  // Only 32-bit SEH requires special handling for catchpad.
26116
104
  if (IsSEH && Subtarget.is32Bit()) {
26117
16
    const TargetInstrInfo &TII = *Subtarget.getInstrInfo();
26118
16
    DebugLoc DL = MI.getDebugLoc();
26119
16
    BuildMI(*BB, MI, DL, TII.get(X86::EH_RESTORE));
26120
16
  }
26121
104
  MI.eraseFromParent();
26122
104
  return BB;
26123
104
}
26124
26125
MachineBasicBlock *
26126
X86TargetLowering::EmitLoweredTLSAddr(MachineInstr &MI,
26127
35
                                      MachineBasicBlock *BB) const {
26128
35
  // So, here we replace TLSADDR with the sequence:
26129
35
  // adjust_stackdown -> TLSADDR -> adjust_stackup.
26130
35
  // We need this because TLSADDR is lowered into calls
26131
35
  // inside MC, therefore without the two markers shrink-wrapping
26132
35
  // may push the prologue/epilogue past them.
26133
35
  const TargetInstrInfo &TII = *Subtarget.getInstrInfo();
26134
35
  DebugLoc DL = MI.getDebugLoc();
26135
35
  MachineFunction &MF = *BB->getParent();
26136
35
26137
35
  // Emit CALLSEQ_START right before the instruction.
26138
35
  unsigned AdjStackDown = TII.getCallFrameSetupOpcode();
26139
35
  MachineInstrBuilder CallseqStart =
26140
35
    BuildMI(MF, DL, TII.get(AdjStackDown)).addImm(0).addImm(0).addImm(0);
26141
35
  BB->insert(MachineBasicBlock::iterator(MI), CallseqStart);
26142
35
26143
35
  // Emit CALLSEQ_END right after the instruction.
26144
35
  // We don't call erase from parent because we want to keep the
26145
35
  // original instruction around.
26146
35
  unsigned AdjStackUp = TII.getCallFrameDestroyOpcode();
26147
35
  MachineInstrBuilder CallseqEnd =
26148
35
    BuildMI(MF, DL, TII.get(AdjStackUp)).addImm(0).addImm(0);
26149
35
  BB->insertAfter(MachineBasicBlock::iterator(MI), CallseqEnd);
26150
35
26151
35
  return BB;
26152
35
}
26153
26154
MachineBasicBlock *
26155
X86TargetLowering::EmitLoweredTLSCall(MachineInstr &MI,
26156
53
                                      MachineBasicBlock *BB) const {
26157
53
  // This is pretty easy.  We're taking the value that we received from
26158
53
  // our load from the relocation, sticking it in either RDI (x86-64)
26159
53
  // or EAX and doing an indirect call.  The return value will then
26160
53
  // be in the normal return register.
26161
53
  MachineFunction *F = BB->getParent();
26162
53
  const X86InstrInfo *TII = Subtarget.getInstrInfo();
26163
53
  DebugLoc DL = MI.getDebugLoc();
26164
53
26165
53
  assert(Subtarget.isTargetDarwin() && "Darwin only instr emitted?");
26166
53
  assert(MI.getOperand(3).isGlobal() && "This should be a global");
26167
53
26168
53
  // Get a register mask for the lowered call.
26169
53
  // FIXME: The 32-bit calls have non-standard calling conventions. Use a
26170
53
  // proper register mask.
26171
53
  const uint32_t *RegMask =
26172
53
      Subtarget.is64Bit() ?
26173
52
      Subtarget.getRegisterInfo()->getDarwinTLSCallPreservedMask() :
26174
1
      Subtarget.getRegisterInfo()->getCallPreservedMask(*F, CallingConv::C);
26175
53
  if (Subtarget.is64Bit()) {
26176
52
    MachineInstrBuilder MIB =
26177
52
        BuildMI(*BB, MI, DL, TII->get(X86::MOV64rm), X86::RDI)
26178
52
            .addReg(X86::RIP)
26179
52
            .addImm(0)
26180
52
            .addReg(0)
26181
52
            .addGlobalAddress(MI.getOperand(3).getGlobal(), 0,
26182
52
                              MI.getOperand(3).getTargetFlags())
26183
52
            .addReg(0);
26184
52
    MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL64m));
26185
52
    addDirectMem(MIB, X86::RDI);
26186
52
    MIB.addReg(X86::RAX, RegState::ImplicitDefine).addRegMask(RegMask);
26187
53
  } else if (!isPositionIndependent()) {
26188
1
    MachineInstrBuilder MIB =
26189
1
        BuildMI(*BB, MI, DL, TII->get(X86::MOV32rm), X86::EAX)
26190
1
            .addReg(0)
26191
1
            .addImm(0)
26192
1
            .addReg(0)
26193
1
            .addGlobalAddress(MI.getOperand(3).getGlobal(), 0,
26194
1
                              MI.getOperand(3).getTargetFlags())
26195
1
            .addReg(0);
26196
1
    MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL32m));
26197
1
    addDirectMem(MIB, X86::EAX);
26198
1
    MIB.addReg(X86::EAX, RegState::ImplicitDefine).addRegMask(RegMask);
26199
1
  } else {
26200
0
    MachineInstrBuilder MIB =
26201
0
        BuildMI(*BB, MI, DL, TII->get(X86::MOV32rm), X86::EAX)
26202
0
            .addReg(TII->getGlobalBaseReg(F))
26203
0
            .addImm(0)
26204
0
            .addReg(0)
26205
0
            .addGlobalAddress(MI.getOperand(3).getGlobal(), 0,
26206
0
                              MI.getOperand(3).getTargetFlags())
26207
0
            .addReg(0);
26208
0
    MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL32m));
26209
0
    addDirectMem(MIB, X86::EAX);
26210
0
    MIB.addReg(X86::EAX, RegState::ImplicitDefine).addRegMask(RegMask);
26211
0
  }
26212
53
26213
53
  MI.eraseFromParent(); // The pseudo instruction is gone now.
26214
53
  return BB;
26215
53
}
26216
26217
MachineBasicBlock *
26218
X86TargetLowering::emitEHSjLjSetJmp(MachineInstr &MI,
26219
10
                                    MachineBasicBlock *MBB) const {
26220
10
  DebugLoc DL = MI.getDebugLoc();
26221
10
  MachineFunction *MF = MBB->getParent();
26222
10
  const TargetInstrInfo *TII = Subtarget.getInstrInfo();
26223
10
  const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
26224
10
  MachineRegisterInfo &MRI = MF->getRegInfo();
26225
10
26226
10
  const BasicBlock *BB = MBB->getBasicBlock();
26227
10
  MachineFunction::iterator I = ++MBB->getIterator();
26228
10
26229
10
  // Memory Reference
26230
10
  MachineInstr::mmo_iterator MMOBegin = MI.memoperands_begin();
26231
10
  MachineInstr::mmo_iterator MMOEnd = MI.memoperands_end();
26232
10
26233
10
  unsigned DstReg;
26234
10
  unsigned MemOpndSlot = 0;
26235
10
26236
10
  unsigned CurOp = 0;
26237
10
26238
10
  DstReg = MI.getOperand(CurOp++).getReg();
26239
10
  const TargetRegisterClass *RC = MRI.getRegClass(DstReg);
26240
10
  assert(TRI->isTypeLegalForClass(*RC, MVT::i32) && "Invalid destination!");
26241
10
  (void)TRI;
26242
10
  unsigned mainDstReg = MRI.createVirtualRegister(RC);
26243
10
  unsigned restoreDstReg = MRI.createVirtualRegister(RC);
26244
10
26245
10
  MemOpndSlot = CurOp;
26246
10
26247
10
  MVT PVT = getPointerTy(MF->getDataLayout());
26248
10
  assert((PVT == MVT::i64 || PVT == MVT::i32) &&
26249
10
         "Invalid Pointer Size!");
26250
10
26251
10
  // For v = setjmp(buf), we generate
26252
10
  //
26253
10
  // thisMBB:
26254
10
  //  buf[LabelOffset] = restoreMBB <-- takes address of restoreMBB
26255
10
  //  SjLjSetup restoreMBB
26256
10
  //
26257
10
  // mainMBB:
26258
10
  //  v_main = 0
26259
10
  //
26260
10
  // sinkMBB:
26261
10
  //  v = phi(main, restore)
26262
10
  //
26263
10
  // restoreMBB:
26264
10
  //  if base pointer being used, load it from frame
26265
10
  //  v_restore = 1
26266
10
26267
10
  MachineBasicBlock *thisMBB = MBB;
26268
10
  MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB);
26269
10
  MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
26270
10
  MachineBasicBlock *restoreMBB = MF->CreateMachineBasicBlock(BB);
26271
10
  MF->insert(I, mainMBB);
26272
10
  MF->insert(I, sinkMBB);
26273
10
  MF->push_back(restoreMBB);
26274
10
  restoreMBB->setHasAddressTaken();
26275
10
26276
10
  MachineInstrBuilder MIB;
26277
10
26278
10
  // Transfer the remainder of BB and its successor edges to sinkMBB.
26279
10
  sinkMBB->splice(sinkMBB->begin(), MBB,
26280
10
                  std::next(MachineBasicBlock::iterator(MI)), MBB->end());
26281
10
  sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);
26282
10
26283
10
  // thisMBB:
26284
10
  unsigned PtrStoreOpc = 0;
26285
10
  unsigned LabelReg = 0;
26286
10
  const int64_t LabelOffset = 1 * PVT.getStoreSize();
26287
10
  bool UseImmLabel = (MF->getTarget().getCodeModel() == CodeModel::Small) &&
26288
10
                     !isPositionIndependent();
26289
10
26290
10
  // Prepare IP either in reg or imm.
26291
10
  if (!UseImmLabel) {
26292
6
    PtrStoreOpc = (PVT == MVT::i64) ? X86::MOV64mr : X86::MOV32mr;
26293
6
    const TargetRegisterClass *PtrRC = getRegClassFor(PVT);
26294
6
    LabelReg = MRI.createVirtualRegister(PtrRC);
26295
6
    if (Subtarget.is64Bit()) {
26296
4
      MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::LEA64r), LabelReg)
26297
4
              .addReg(X86::RIP)
26298
4
              .addImm(0)
26299
4
              .addReg(0)
26300
4
              .addMBB(restoreMBB)
26301
4
              .addReg(0);
26302
6
    } else {
26303
2
      const X86InstrInfo *XII = static_cast<const X86InstrInfo*>(TII);
26304
2
      MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::LEA32r), LabelReg)
26305
2
              .addReg(XII->getGlobalBaseReg(MF))
26306
2
              .addImm(0)
26307
2
              .addReg(0)
26308
2
              .addMBB(restoreMBB, Subtarget.classifyBlockAddressReference())
26309
2
              .addReg(0);
26310
2
    }
26311
6
  } else
26312
4
    PtrStoreOpc = (PVT == MVT::i64) ? X86::MOV64mi32 : X86::MOV32mi;
26313
10
  // Store IP
26314
10
  MIB = BuildMI(*thisMBB, MI, DL, TII->get(PtrStoreOpc));
26315
60
  for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
26316
50
    if (i == X86::AddrDisp)
26317
10
      MIB.addDisp(MI.getOperand(MemOpndSlot + i), LabelOffset);
26318
50
    else
26319
40
      MIB.add(MI.getOperand(MemOpndSlot + i));
26320
50
  }
26321
10
  if (!UseImmLabel)
26322
6
    MIB.addReg(LabelReg);
26323
10
  else
26324
4
    MIB.addMBB(restoreMBB);
26325
10
  MIB.setMemRefs(MMOBegin, MMOEnd);
26326
10
  // Setup
26327
10
  MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::EH_SjLj_Setup))
26328
10
          .addMBB(restoreMBB);
26329
10
26330
10
  const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
26331
10
  MIB.addRegMask(RegInfo->getNoPreservedMask());
26332
10
  thisMBB->addSuccessor(mainMBB);
26333
10
  thisMBB->addSuccessor(restoreMBB);
26334
10
26335
10
  // mainMBB:
26336
10
  //  EAX = 0
26337
10
  BuildMI(mainMBB, DL, TII->get(X86::MOV32r0), mainDstReg);
26338
10
  mainMBB->addSuccessor(sinkMBB);
26339
10
26340
10
  // sinkMBB:
26341
10
  BuildMI(*sinkMBB, sinkMBB->begin(), DL,
26342
10
          TII->get(X86::PHI), DstReg)
26343
10
    .addReg(mainDstReg).addMBB(mainMBB)
26344
10
    .addReg(restoreDstReg).addMBB(restoreMBB);
26345
10
26346
10
  // restoreMBB:
26347
10
  if (RegInfo->hasBasePointer(*MF)) {
26348
2
    const bool Uses64BitFramePtr =
26349
1
        Subtarget.isTarget64BitLP64() || Subtarget.isTargetNaCl64();
26350
2
    X86MachineFunctionInfo *X86FI = MF->getInfo<X86MachineFunctionInfo>();
26351
2
    X86FI->setRestoreBasePointer(MF);
26352
2
    unsigned FramePtr = RegInfo->getFrameRegister(*MF);
26353
2
    unsigned BasePtr = RegInfo->getBaseRegister();
26354
2
    unsigned Opm = Uses64BitFramePtr ? X86::MOV64rm : X86::MOV32rm;
26355
2
    addRegOffset(BuildMI(restoreMBB, DL, TII->get(Opm), BasePtr),
26356
2
                 FramePtr, true, X86FI->getRestoreBasePointerOffset())
26357
2
      .setMIFlag(MachineInstr::FrameSetup);
26358
2
  }
26359
10
  BuildMI(restoreMBB, DL, TII->get(X86::MOV32ri), restoreDstReg).addImm(1);
26360
10
  BuildMI(restoreMBB, DL, TII->get(X86::JMP_1)).addMBB(sinkMBB);
26361
10
  restoreMBB->addSuccessor(sinkMBB);
26362
10
26363
10
  MI.eraseFromParent();
26364
10
  return sinkMBB;
26365
10
}
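The CFG described in the comment above produces the usual builtin-setjmp contract: the fall-through path reaches sinkMBB with the value 0 from mainMBB, while a later longjmp lands in restoreMBB and reaches the same PHI with the value 1. Slot 1 of the buffer holds the resume address and slot 2 the stack pointer, matching LabelOffset and SPOffset here and in emitEHSjLjLongJmp below. A small usage sketch, assuming the pseudo is reached via the __builtin_setjmp/__builtin_longjmp builtins (a common but not the only source of these nodes):

// Five pointer-sized slots, the layout the offsets above index into.
static void *Buf[5];

static void jumpBack() { __builtin_longjmp(Buf, 1); }  // lands on the restore path

int setjmpDemo() {
  if (__builtin_setjmp(Buf) == 0) {  // mainMBB path: the PHI input is 0
    jumpBack();
  }
  return 1;                          // restoreMBB path: the PHI input is 1
}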
26366
26367
MachineBasicBlock *
26368
X86TargetLowering::emitEHSjLjLongJmp(MachineInstr &MI,
26369
4
                                     MachineBasicBlock *MBB) const {
26370
4
  DebugLoc DL = MI.getDebugLoc();
26371
4
  MachineFunction *MF = MBB->getParent();
26372
4
  const TargetInstrInfo *TII = Subtarget.getInstrInfo();
26373
4
  MachineRegisterInfo &MRI = MF->getRegInfo();
26374
4
26375
4
  // Memory Reference
26376
4
  MachineInstr::mmo_iterator MMOBegin = MI.memoperands_begin();
26377
4
  MachineInstr::mmo_iterator MMOEnd = MI.memoperands_end();
26378
4
26379
4
  MVT PVT = getPointerTy(MF->getDataLayout());
26380
4
  assert((PVT == MVT::i64 || PVT == MVT::i32) &&
26381
4
         "Invalid Pointer Size!");
26382
4
26383
4
  const TargetRegisterClass *RC =
26384
4
    (PVT == MVT::i64) ? &X86::GR64RegClass : &X86::GR32RegClass;
26385
4
  unsigned Tmp = MRI.createVirtualRegister(RC);
26386
4
  // Since FP is only updated here but NOT referenced, it's treated as GPR.
26387
4
  const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
26388
4
  unsigned FP = (PVT == MVT::i64) ? X86::RBP : X86::EBP;
26389
4
  unsigned SP = RegInfo->getStackRegister();
26390
4
26391
4
  MachineInstrBuilder MIB;
26392
4
26393
4
  const int64_t LabelOffset = 1 * PVT.getStoreSize();
26394
4
  const int64_t SPOffset = 2 * PVT.getStoreSize();
26395
4
26396
4
  unsigned PtrLoadOpc = (PVT == MVT::i64) ? X86::MOV64rm : X86::MOV32rm;
26397
4
  unsigned IJmpOpc = (PVT == MVT::i64) ? X86::JMP64r : X86::JMP32r;
26398
4
26399
4
  // Reload FP
26400
4
  MIB = BuildMI(*MBB, MI, DL, TII->get(PtrLoadOpc), FP);
26401
24
  for (unsigned i = 0; i < X86::AddrNumOperands; ++i)
26402
20
    MIB.add(MI.getOperand(i));
26403
4
  MIB.setMemRefs(MMOBegin, MMOEnd);
26404
4
  // Reload IP
26405
4
  MIB = BuildMI(*MBB, MI, DL, TII->get(PtrLoadOpc), Tmp);
26406
24
  for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
26407
20
    if (i == X86::AddrDisp)
26408
4
      MIB.addDisp(MI.getOperand(i), LabelOffset);
26409
20
    else
26410
16
      MIB.add(MI.getOperand(i));
26411
20
  }
26412
4
  MIB.setMemRefs(MMOBegin, MMOEnd);
26413
4
  // Reload SP
26414
4
  MIB = BuildMI(*MBB, MI, DL, TII->get(PtrLoadOpc), SP);
26415
24
  for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
26416
20
    if (i == X86::AddrDisp)
26417
4
      MIB.addDisp(MI.getOperand(i), SPOffset);
26418
20
    else
26419
16
      MIB.add(MI.getOperand(i));
26420
20
  }
26421
4
  MIB.setMemRefs(MMOBegin, MMOEnd);
26422
4
  // Jump
26423
4
  BuildMI(*MBB, MI, DL, TII->get(IJmpOpc)).addReg(Tmp);
26424
4
26425
4
  MI.eraseFromParent();
26426
4
  return MBB;
26427
4
}
26428
26429
void X86TargetLowering::SetupEntryBlockForSjLj(MachineInstr &MI,
26430
                                               MachineBasicBlock *MBB,
26431
                                               MachineBasicBlock *DispatchBB,
26432
2
                                               int FI) const {
26433
2
  DebugLoc DL = MI.getDebugLoc();
26434
2
  MachineFunction *MF = MBB->getParent();
26435
2
  MachineRegisterInfo *MRI = &MF->getRegInfo();
26436
2
  const X86InstrInfo *TII = Subtarget.getInstrInfo();
26437
2
26438
2
  MVT PVT = getPointerTy(MF->getDataLayout());
26439
2
  assert((PVT == MVT::i64 || PVT == MVT::i32) && "Invalid Pointer Size!");
26440
2
26441
2
  unsigned Op = 0;
26442
2
  unsigned VR = 0;
26443
2
26444
2
  bool UseImmLabel = (MF->getTarget().getCodeModel() == CodeModel::Small) &&
26445
2
                     !isPositionIndependent();
26446
2
26447
2
  if (UseImmLabel) {
26448
1
    Op = (PVT == MVT::i64) ? X86::MOV64mi32 : X86::MOV32mi;
26449
2
  } else {
26450
1
    const TargetRegisterClass *TRC =
26451
1
        (PVT == MVT::i64) ? &X86::GR64RegClass : &X86::GR32RegClass;
26452
1
    VR = MRI->createVirtualRegister(TRC);
26453
1
    Op = (PVT == MVT::i64) ? X86::MOV64mr : X86::MOV32mr;
26454
1
26455
1
    if (Subtarget.is64Bit())
26456
1
      BuildMI(*MBB, MI, DL, TII->get(X86::LEA64r), VR)
26457
1
          .addReg(X86::RIP)
26458
1
          .addImm(1)
26459
1
          .addReg(0)
26460
1
          .addMBB(DispatchBB)
26461
1
          .addReg(0);
26462
1
    else
26463
0
      BuildMI(*MBB, MI, DL, TII->get(X86::LEA32r), VR)
26464
0
          .addReg(0) /* TII->getGlobalBaseReg(MF) */
26465
0
          .addImm(1)
26466
0
          .addReg(0)
26467
0
          .addMBB(DispatchBB, Subtarget.classifyBlockAddressReference())
26468
0
          .addReg(0);
26469
1
  }
26470
2
26471
2
  MachineInstrBuilder MIB = BuildMI(*MBB, MI, DL, TII->get(Op));
26472
2
  addFrameReference(MIB, FI, Subtarget.is64Bit() ? 56 : 36);
26473
2
  if (UseImmLabel)
26474
1
    MIB.addMBB(DispatchBB);
26475
2
  else
26476
1
    MIB.addReg(VR);
26477
2
}
26478
26479
MachineBasicBlock *
26480
X86TargetLowering::EmitSjLjDispatchBlock(MachineInstr &MI,
26481
2
                                         MachineBasicBlock *BB) const {
26482
2
  DebugLoc DL = MI.getDebugLoc();
26483
2
  MachineFunction *MF = BB->getParent();
26484
2
  MachineFrameInfo &MFI = MF->getFrameInfo();
26485
2
  MachineRegisterInfo *MRI = &MF->getRegInfo();
26486
2
  const X86InstrInfo *TII = Subtarget.getInstrInfo();
26487
2
  int FI = MFI.getFunctionContextIndex();
26488
2
26489
2
  // Get a mapping of the call site numbers to all of the landing pads they're
26490
2
  // associated with.
26491
2
  DenseMap<unsigned, SmallVector<MachineBasicBlock *, 2>> CallSiteNumToLPad;
26492
2
  unsigned MaxCSNum = 0;
26493
6
  for (auto &MBB : *MF) {
26494
6
    if (!MBB.isEHPad())
26495
4
      continue;
26496
2
26497
2
    MCSymbol *Sym = nullptr;
26498
2
    for (const auto &MI : MBB) {
26499
2
      if (MI.isDebugValue())
26500
0
        continue;
26501
2
26502
2
      assert(MI.isEHLabel() && "expected EH_LABEL");
26503
2
      Sym = MI.getOperand(0).getMCSymbol();
26504
2
      break;
26505
2
    }
26506
2
26507
2
    if (!MF->hasCallSiteLandingPad(Sym))
26508
0
      continue;
26509
2
26510
2
    for (unsigned CSI : MF->getCallSiteLandingPad(Sym)) {
26511
2
      CallSiteNumToLPad[CSI].push_back(&MBB);
26512
2
      MaxCSNum = std::max(MaxCSNum, CSI);
26513
2
    }
26514
6
  }
26515
2
26516
2
  // Get an ordered list of the machine basic blocks for the jump table.
26517
2
  std::vector<MachineBasicBlock *> LPadList;
26518
2
  SmallPtrSet<MachineBasicBlock *, 32> InvokeBBs;
26519
2
  LPadList.reserve(CallSiteNumToLPad.size());
26520
2
26521
4
  for (unsigned CSI = 1; CSI <= MaxCSNum; ++CSI) {
26522
2
    for (auto &LP : CallSiteNumToLPad[CSI]) {
26523
2
      LPadList.push_back(LP);
26524
2
      InvokeBBs.insert(LP->pred_begin(), LP->pred_end());
26525
2
    }
26526
2
  }
26527
2
26528
2
  assert(!LPadList.empty() &&
26529
2
         "No landing pad destinations for the dispatch jump table!");
26530
2
26531
2
  // Create the MBBs for the dispatch code.
26532
2
26533
2
  // Shove the dispatch's address into the return slot in the function context.
26534
2
  MachineBasicBlock *DispatchBB = MF->CreateMachineBasicBlock();
26535
2
  DispatchBB->setIsEHPad(true);
26536
2
26537
2
  MachineBasicBlock *TrapBB = MF->CreateMachineBasicBlock();
26538
2
  BuildMI(TrapBB, DL, TII->get(X86::TRAP));
26539
2
  DispatchBB->addSuccessor(TrapBB);
26540
2
26541
2
  MachineBasicBlock *DispContBB = MF->CreateMachineBasicBlock();
26542
2
  DispatchBB->addSuccessor(DispContBB);
26543
2
26544
2
  // Insert MBBs.
26545
2
  MF->push_back(DispatchBB);
26546
2
  MF->push_back(DispContBB);
26547
2
  MF->push_back(TrapBB);
26548
2
26549
2
  // Insert code into the entry block that creates and registers the function
26550
2
  // context.
26551
2
  SetupEntryBlockForSjLj(MI, BB, DispatchBB, FI);
26552
2
26553
2
  // Create the jump table and associated information
26554
2
  MachineJumpTableInfo *JTI =
26555
2
      MF->getOrCreateJumpTableInfo(getJumpTableEncoding());
26556
2
  unsigned MJTI = JTI->createJumpTableIndex(LPadList);
26557
2
26558
2
  const X86RegisterInfo &RI = TII->getRegisterInfo();
26559
2
  // Add a register mask with no preserved registers.  This results in all
26560
2
  // registers being marked as clobbered.
26561
2
  if (RI.hasBasePointer(*MF)) {
26562
0
    const bool FPIs64Bit =
26563
0
        Subtarget.isTarget64BitLP64() || Subtarget.isTargetNaCl64();
26564
0
    X86MachineFunctionInfo *MFI = MF->getInfo<X86MachineFunctionInfo>();
26565
0
    MFI->setRestoreBasePointer(MF);
26566
0
26567
0
    unsigned FP = RI.getFrameRegister(*MF);
26568
0
    unsigned BP = RI.getBaseRegister();
26569
0
    unsigned Op = FPIs64Bit ? X86::MOV64rm : X86::MOV32rm;
26570
0
    addRegOffset(BuildMI(DispatchBB, DL, TII->get(Op), BP), FP, true,
26571
0
                 MFI->getRestoreBasePointerOffset())
26572
0
        .addRegMask(RI.getNoPreservedMask());
26573
2
  } else {
26574
2
    BuildMI(DispatchBB, DL, TII->get(X86::NOOP))
26575
2
        .addRegMask(RI.getNoPreservedMask());
26576
2
  }
26577
2
26578
2
  unsigned IReg = MRI->createVirtualRegister(&X86::GR32RegClass);
26579
2
  addFrameReference(BuildMI(DispatchBB, DL, TII->get(X86::MOV32rm), IReg), FI,
26580
2
                    Subtarget.is64Bit() ? 8 : 4);
26581
2
  BuildMI(DispatchBB, DL, TII->get(X86::CMP32ri))
26582
2
      .addReg(IReg)
26583
2
      .addImm(LPadList.size());
26584
2
  BuildMI(DispatchBB, DL, TII->get(X86::JAE_1)).addMBB(TrapBB);
26585
2
26586
2
  BuildMI(DispContBB, DL,
26587
2
          TII->get(Subtarget.is64Bit() ? X86::JMP64m : X86::JMP32m))
26588
2
      .addReg(0)
26589
2
      .addImm(Subtarget.is64Bit() ? 8 : 4)
26590
2
      .addReg(IReg)
26591
2
      .addJumpTableIndex(MJTI)
26592
2
      .addReg(0);
26593
2
26594
2
  // Add the jump table entries as successors to the MBB.
26595
2
  SmallPtrSet<MachineBasicBlock *, 8> SeenMBBs;
26596
2
  for (auto &LP : LPadList)
26597
2
    if (SeenMBBs.insert(LP).second)
26598
2
      DispContBB->addSuccessor(LP);
26599
2
26600
2
  // N.B. the order the invoke BBs are processed in doesn't matter here.
26601
2
  SmallVector<MachineBasicBlock *, 64> MBBLPads;
26602
2
  const MCPhysReg *SavedRegs = MF->getRegInfo().getCalleeSavedRegs();
26603
2
  for (MachineBasicBlock *MBB : InvokeBBs) {
26604
2
    // Remove the landing pad successor from the invoke block and replace it
26605
2
    // with the new dispatch block.
26606
2
    // Keep a copy of Successors since it's modified inside the loop.
26607
2
    SmallVector<MachineBasicBlock *, 8> Successors(MBB->succ_rbegin(),
26608
2
                                                   MBB->succ_rend());
26609
2
    // FIXME: Avoid quadratic complexity.
26610
4
    for (auto MBBS : Successors) {
26611
4
      if (MBBS->isEHPad()) {
26612
2
        MBB->removeSuccessor(MBBS);
26613
2
        MBBLPads.push_back(MBBS);
26614
2
      }
26615
4
    }
26616
2
26617
2
    MBB->addSuccessor(DispatchBB);
26618
2
26619
2
    // Find the invoke call and mark all of the callee-saved registers as
26620
2
    // 'implicit defined' so that they're spilled.  This prevents code from
26621
2
    // moving instructions to before the EH block, where they will never be
26622
2
    // executed.
26623
8
    for (auto &II : reverse(*MBB)) {
26624
8
      if (!II.isCall())
26625
6
        continue;
26626
2
26627
2
      DenseMap<unsigned, bool> DefRegs;
26628
2
      for (auto &MOp : II.operands())
26629
8
        if (MOp.isReg())
26630
4
          DefRegs[MOp.getReg()] = true;
26631
2
26632
2
      MachineInstrBuilder MIB(*MF, &II);
26633
24
      for (unsigned RI = 0; SavedRegs[RI]; ++RI) {
26634
22
        unsigned Reg = SavedRegs[RI];
26635
22
        if (!DefRegs[Reg])
26636
22
          MIB.addReg(Reg, RegState::ImplicitDefine | RegState::Dead);
26637
22
      }
26638
8
26639
8
      break;
26640
8
    }
26641
2
  }
26642
2
26643
2
  // Mark all former landing pads as non-landing pads.  The dispatch is the only
26644
2
  // landing pad now.
26645
2
  for (auto &LP : MBBLPads)
26646
2
    LP->setIsEHPad(false);
26647
2
26648
2
  // The instruction is gone now.
26649
2
  MI.eraseFromParent();
26650
2
  return BB;
26651
2
}
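At runtime the dispatch block built above behaves like a bounds-checked indirect jump: the call-site index stored in the function context selects one landing pad from the jump table, and an out-of-range index traps. A simplified model (the handler table and the function name are illustrative; the real code indexes a MachineJumpTableInfo entry):

#include <cstdlib>
#include <vector>

using LandingPad = void (*)();

void dispatch(unsigned CallSiteIndex, const std::vector<LandingPad> &LPadList) {
  if (CallSiteIndex >= LPadList.size())  // CMP32ri + JAE_1 to TrapBB above
    std::abort();                        // TrapBB: X86::TRAP
  LPadList[CallSiteIndex]();             // DispContBB: JMP through the jump table
}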
26652
26653
MachineBasicBlock *
26654
X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
26655
3.98k
                                               MachineBasicBlock *BB) const {
26656
3.98k
  MachineFunction *MF = BB->getParent();
26657
3.98k
  const TargetInstrInfo *TII = Subtarget.getInstrInfo();
26658
3.98k
  DebugLoc DL = MI.getDebugLoc();
26659
3.98k
26660
3.98k
  switch (MI.getOpcode()) {
26661
0
  default: llvm_unreachable("Unexpected instr type to insert");
26662
0
  case X86::TAILJMPd64:
26663
0
  case X86::TAILJMPr64:
26664
0
  case X86::TAILJMPm64:
26665
0
  case X86::TAILJMPr64_REX:
26666
0
  case X86::TAILJMPm64_REX:
26667
0
    llvm_unreachable("TAILJMP64 would not be touched here.");
26668
1.50k
  case X86::TCRETURNdi64:
26669
1.50k
  case X86::TCRETURNri64:
26670
1.50k
  case X86::TCRETURNmi64:
26671
1.50k
    return BB;
26672
35
  case X86::TLS_addr32:
26673
35
  case X86::TLS_addr64:
26674
35
  case X86::TLS_base_addr32:
26675
35
  case X86::TLS_base_addr64:
26676
35
    return EmitLoweredTLSAddr(MI, BB);
26677
61
  case X86::CATCHRET:
26678
61
    return EmitLoweredCatchRet(MI, BB);
26679
104
  case X86::CATCHPAD:
26680
104
    return EmitLoweredCatchPad(MI, BB);
26681
6
  case X86::SEG_ALLOCA_32:
26682
6
  case X86::SEG_ALLOCA_64:
26683
6
    return EmitLoweredSegAlloca(MI, BB);
26684
53
  case X86::TLSCall_32:
26685
53
  case X86::TLSCall_64:
26686
53
    return EmitLoweredTLSCall(MI, BB);
26687
999
  case X86::CMOV_FR32:
26688
999
  case X86::CMOV_FR64:
26689
999
  case X86::CMOV_FR128:
26690
999
  case X86::CMOV_GR8:
26691
999
  case X86::CMOV_GR16:
26692
999
  case X86::CMOV_GR32:
26693
999
  case X86::CMOV_RFP32:
26694
999
  case X86::CMOV_RFP64:
26695
999
  case X86::CMOV_RFP80:
26696
999
  case X86::CMOV_V2F64:
26697
999
  case X86::CMOV_V2I64:
26698
999
  case X86::CMOV_V4F32:
26699
999
  case X86::CMOV_V4F64:
26700
999
  case X86::CMOV_V4I64:
26701
999
  case X86::CMOV_V16F32:
26702
999
  case X86::CMOV_V8F32:
26703
999
  case X86::CMOV_V8F64:
26704
999
  case X86::CMOV_V8I64:
26705
999
  case X86::CMOV_V8I1:
26706
999
  case X86::CMOV_V16I1:
26707
999
  case X86::CMOV_V32I1:
26708
999
  case X86::CMOV_V64I1:
26709
999
    return EmitLoweredSelect(MI, BB);
26710
999
26711
4
  case X86::RDFLAGS32:
26712
4
  case X86::RDFLAGS64: {
26713
4
    unsigned PushF =
26714
4
        MI.getOpcode() == X86::RDFLAGS32 ? X86::PUSHF32 : X86::PUSHF64;
26715
4
    unsigned Pop = MI.getOpcode() == X86::RDFLAGS32 ? X86::POP32r : X86::POP64r;
26716
4
    MachineInstr *Push = BuildMI(*BB, MI, DL, TII->get(PushF));
26717
4
    // Permit reads of the FLAGS register without it being defined.
26718
4
    // This intrinsic exists to read external processor state in flags, such as
26719
4
    // the trap flag, interrupt flag, and direction flag, none of which are
26720
4
    // modeled by the backend.
26721
4
    Push->getOperand(2).setIsUndef();
26722
4
    BuildMI(*BB, MI, DL, TII->get(Pop), MI.getOperand(0).getReg());
26723
4
26724
4
    MI.eraseFromParent(); // The pseudo is gone now.
26725
4
    return BB;
26726
4
  }
26727
4
26728
2
  case X86::WRFLAGS32:
26729
2
  case X86::WRFLAGS64: {
26730
2
    unsigned Push =
26731
2
        MI.getOpcode() == X86::WRFLAGS32 ? X86::PUSH32r : X86::PUSH64r;
26732
2
    unsigned PopF =
26733
2
        MI.getOpcode() == X86::WRFLAGS32 ? X86::POPF32 : X86::POPF64;
26734
2
    BuildMI(*BB, MI, DL, TII->get(Push)).addReg(MI.getOperand(0).getReg());
26735
2
    BuildMI(*BB, MI, DL, TII->get(PopF));
26736
2
26737
2
    MI.eraseFromParent(); // The pseudo is gone now.
26738
2
    return BB;
26739
2
  }
26740
2
26741
18
  case X86::RELEASE_FADD32mr:
26742
18
  case X86::RELEASE_FADD64mr:
26743
18
    return EmitLoweredAtomicFP(MI, BB);
26744
18
26745
275
  case X86::FP32_TO_INT16_IN_MEM:
26746
275
  case X86::FP32_TO_INT32_IN_MEM:
26747
275
  case X86::FP32_TO_INT64_IN_MEM:
26748
275
  case X86::FP64_TO_INT16_IN_MEM:
26749
275
  case X86::FP64_TO_INT32_IN_MEM:
26750
275
  case X86::FP64_TO_INT64_IN_MEM:
26751
275
  case X86::FP80_TO_INT16_IN_MEM:
26752
275
  case X86::FP80_TO_INT32_IN_MEM:
26753
275
  case X86::FP80_TO_INT64_IN_MEM: {
26754
275
    // Change the floating point control register to use "round towards zero"
26755
275
    // mode when truncating to an integer value.
26756
275
    int CWFrameIdx = MF->getFrameInfo().CreateStackObject(2, 2, false);
26757
275
    addFrameReference(BuildMI(*BB, MI, DL,
26758
275
                              TII->get(X86::FNSTCW16m)), CWFrameIdx);
26759
275
26760
275
    // Load the old value of the high byte of the control word...
26761
275
    unsigned OldCW =
26762
275
      MF->getRegInfo().createVirtualRegister(&X86::GR16RegClass);
26763
275
    addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16rm), OldCW),
26764
275
                      CWFrameIdx);
26765
275
26766
275
    // Set the high part to be round to zero...
26767
275
    addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16mi)), CWFrameIdx)
26768
275
      .addImm(0xC7F);
26769
275
26770
275
    // Reload the modified control word now...
26771
275
    addFrameReference(BuildMI(*BB, MI, DL,
26772
275
                              TII->get(X86::FLDCW16m)), CWFrameIdx);
26773
275
26774
275
    // Restore the memory image of control word to original value
26775
275
    addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16mr)), CWFrameIdx)
26776
275
      .addReg(OldCW);
26777
275
26778
275
    // Get the X86 opcode to use.
26779
275
    unsigned Opc;
26780
275
    switch (MI.getOpcode()) {
26781
0
    default: llvm_unreachable("illegal opcode!");
26782
1
    case X86::FP32_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m32; break;
26783
28
    case X86::FP32_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m32; break;
26784
96
    case X86::FP32_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m32; break;
26785
1
    case X86::FP64_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m64; break;
26786
47
    case X86::FP64_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m64; break;
26787
74
    case X86::FP64_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m64; break;
26788
0
    case X86::FP80_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m80; break;
26789
9
    case X86::FP80_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m80; break;
26790
19
    case X86::FP80_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m80; break;
26791
275
    }
26792
275
26793
275
    X86AddressMode AM = getAddressFromInstr(&MI, 0);
26794
275
    addFullAddress(BuildMI(*BB, MI, DL, TII->get(Opc)), AM)
26795
275
        .addReg(MI.getOperand(X86::AddrNumOperands).getReg());
26796
275
26797
275
    // Reload the original control word now.
26798
275
    addFrameReference(BuildMI(*BB, MI, DL,
26799
275
                              TII->get(X86::FLDCW16m)), CWFrameIdx);
26800
275
26801
275
    MI.eraseFromParent(); // The pseudo instruction is gone now.
26802
275
    return BB;
26803
275
  }
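The FP*_TO_INT*_IN_MEM expansion above saves the x87 control word, forces round-toward-zero, performs the store, and then restores the original control word. As an illustration only (a hedged sketch using the standard <cfenv> interface, not the backend's own code), the same save/switch/convert/restore pattern looks like this:

#include <cfenv>
#include <cmath>

// Sketch: truncating double -> long long conversion done by temporarily
// switching the FP rounding mode, mirroring the FNSTCW/FLDCW sequence above.
long long truncating_convert(double X) {
  const int OldMode = std::fegetround();  // save the current rounding mode
  std::fesetround(FE_TOWARDZERO);         // force round-toward-zero
  long long Result = std::llrint(X);      // conversion now truncates
  std::fesetround(OldMode);               // restore the original mode
  return Result;
}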
26804
275
    // String/text processing lowering.
26805
52
  case X86::PCMPISTRM128REG:
26806
52
  case X86::VPCMPISTRM128REG:
26807
52
  case X86::PCMPISTRM128MEM:
26808
52
  case X86::VPCMPISTRM128MEM:
26809
52
  case X86::PCMPESTRM128REG:
26810
52
  case X86::VPCMPESTRM128REG:
26811
52
  case X86::PCMPESTRM128MEM:
26812
52
  case X86::VPCMPESTRM128MEM:
26813
52
    assert(Subtarget.hasSSE42() &&
26814
52
           "Target must have SSE4.2 or AVX features enabled");
26815
52
    return emitPCMPSTRM(MI, BB, Subtarget.getInstrInfo());
26816
52
26817
52
  // String/text processing lowering.
26818
102
  case X86::PCMPISTRIREG:
26819
102
  case X86::VPCMPISTRIREG:
26820
102
  case X86::PCMPISTRIMEM:
26821
102
  case X86::VPCMPISTRIMEM:
26822
102
  case X86::PCMPESTRIREG:
26823
102
  case X86::VPCMPESTRIREG:
26824
102
  case X86::PCMPESTRIMEM:
26825
102
  case X86::VPCMPESTRIMEM:
26826
102
    assert(Subtarget.hasSSE42() &&
26827
102
           "Target must have SSE4.2 or AVX features enabled");
26828
102
    return emitPCMPSTRI(MI, BB, Subtarget.getInstrInfo());
26829
102
26830
102
  // Thread synchronization.
26831
14
  case X86::MONITOR:
26832
14
    return emitMonitor(MI, BB, Subtarget, X86::MONITORrrr);
26833
4
  case X86::MONITORX:
26834
4
    return emitMonitor(MI, BB, Subtarget, X86::MONITORXrrr);
26835
102
26836
102
  // Cache line zero
26837
2
  case X86::CLZERO:
26838
2
    return emitClzero(&MI, BB, Subtarget);
26839
102
26840
102
  // PKU feature
26841
1
  case X86::WRPKRU:
26842
1
    return emitWRPKRU(MI, BB, Subtarget);
26843
1
  case X86::RDPKRU:
26844
1
    return emitRDPKRU(MI, BB, Subtarget);
26845
102
  // xbegin
26846
2
  case X86::XBEGIN:
26847
2
    return emitXBegin(MI, BB, Subtarget.getInstrInfo());
26848
102
26849
37
  case X86::VASTART_SAVE_XMM_REGS:
26850
37
    return EmitVAStartSaveXMMRegsWithCustomInserter(MI, BB);
26851
102
26852
0
  case X86::VAARG_64:
26853
0
    return EmitVAARG64WithCustomInserter(MI, BB);
26854
102
26855
10
  case X86::EH_SjLj_SetJmp32:
26856
10
  case X86::EH_SjLj_SetJmp64:
26857
10
    return emitEHSjLjSetJmp(MI, BB);
26858
10
26859
4
  case X86::EH_SjLj_LongJmp32:
26860
4
  case X86::EH_SjLj_LongJmp64:
26861
4
    return emitEHSjLjLongJmp(MI, BB);
26862
4
26863
2
  case X86::Int_eh_sjlj_setup_dispatch:
26864
2
    return EmitSjLjDispatchBlock(MI, BB);
26865
4
26866
69
  case TargetOpcode::STATEPOINT:
26867
69
    // As an implementation detail, STATEPOINT shares the STACKMAP format at
26868
69
    // this point in the process.  We diverge later.
26869
69
    return emitPatchPoint(MI, BB);
26870
4
26871
170
  case TargetOpcode::STACKMAP:
26872
170
  case TargetOpcode::PATCHPOINT:
26873
170
    return emitPatchPoint(MI, BB);
26874
170
26875
2
  case TargetOpcode::PATCHABLE_EVENT_CALL:
26876
2
    // Do nothing here, handle in xray instrumentation pass.
26877
2
    return BB;
26878
170
26879
428
  case X86::LCMPXCHG8B: {
26880
428
    const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
26881
428
    // In addition to 4 E[ABCD] registers implied by encoding, CMPXCHG8B
26882
428
    // requires a memory operand. If it happens that current architecture is
26883
428
    // i686 and for current function we need a base pointer
26884
428
    // - which is ESI for i686 - register allocator would not be able to
26885
428
    // allocate registers for an address in form of X(%reg, %reg, Y)
26886
428
    // - there never would be enough unreserved registers during regalloc
26887
428
    // (without the need for base ptr the only option would be X(%edi, %esi, Y).
26888
428
    // We are giving a hand to register allocator by precomputing the address in
26889
428
    // a new vreg using LEA.
26890
428
26891
428
    // If it is not i686 or there is no base pointer - nothing to do here.
26892
428
    if (!Subtarget.is32Bit() || !TRI->hasBasePointer(*MF))
26893
426
      return BB;
26894
2
26895
2
    // Even though this code does not necessarily needs the base pointer to
26896
2
    // be ESI, we check for that. The reason: if this assert fails, there are
26897
2
    // some changes happened in the compiler base pointer handling, which most
26898
2
    // probably have to be addressed somehow here.
26899
428
    assert(TRI->getBaseRegister() == X86::ESI &&
26900
2
           "LCMPXCHG8B custom insertion for i686 is written with X86::ESI as a "
26901
2
           "base pointer in mind");
26902
2
26903
2
    MachineRegisterInfo &MRI = MF->getRegInfo();
26904
2
    MVT SPTy = getPointerTy(MF->getDataLayout());
26905
2
    const TargetRegisterClass *AddrRegClass = getRegClassFor(SPTy);
26906
2
    unsigned computedAddrVReg = MRI.createVirtualRegister(AddrRegClass);
26907
2
26908
2
    X86AddressMode AM = getAddressFromInstr(&MI, 0);
26909
2
    // Regalloc does not need any help when the memory operand of CMPXCHG8B
26910
2
    // does not use index register.
26911
2
    if (AM.IndexReg == X86::NoRegister)
26912
1
      return BB;
26913
1
26914
1
    // After X86TargetLowering::ReplaceNodeResults CMPXCHG8B is glued to its
26915
1
    // four operand definitions that are E[ABCD] registers. We skip them and
26916
1
    // then insert the LEA.
26917
1
    MachineBasicBlock::iterator MBBI(MI);
26918
6
    while (MBBI->definesRegister(X86::EAX) || MBBI->definesRegister(X86::EBX) ||
26919
6
           MBBI->definesRegister(X86::ECX) || MBBI->definesRegister(X86::EDX))
26920
5
      --MBBI;
26921
1
    addFullAddress(
26922
1
        BuildMI(*BB, *MBBI, DL, TII->get(X86::LEA32r), computedAddrVReg), AM);
26923
1
26924
1
    setDirectAddressInInstr(&MI, 0, computedAddrVReg);
26925
1
26926
1
    return BB;
26927
1
  }
26928
23
  case X86::LCMPXCHG16B:
26929
23
    return BB;
26930
2
  case X86::LCMPXCHG8B_SAVE_EBX:
26931
2
  case X86::LCMPXCHG16B_SAVE_RBX: {
26932
2
    unsigned BasePtr =
26933
2
        MI.getOpcode() == X86::LCMPXCHG8B_SAVE_EBX ? X86::EBX : X86::RBX;
26934
2
    if (!BB->isLiveIn(BasePtr))
26935
2
      BB->addLiveIn(BasePtr);
26936
2
    return BB;
26937
0
  }
26938
3.98k
  }
26939
3.98k
}
26940
26941
//===----------------------------------------------------------------------===//
26942
//                           X86 Optimization Hooks
26943
//===----------------------------------------------------------------------===//
26944
26945
void X86TargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
26946
                                                      KnownBits &Known,
26947
                                                      const APInt &DemandedElts,
26948
                                                      const SelectionDAG &DAG,
26949
367k
                                                      unsigned Depth) const {
26950
367k
  unsigned BitWidth = Known.getBitWidth();
26951
367k
  unsigned Opc = Op.getOpcode();
26952
367k
  EVT VT = Op.getValueType();
26953
367k
  assert((Opc >= ISD::BUILTIN_OP_END ||
26954
367k
          Opc == ISD::INTRINSIC_WO_CHAIN ||
26955
367k
          Opc == ISD::INTRINSIC_W_CHAIN ||
26956
367k
          Opc == ISD::INTRINSIC_VOID) &&
26957
367k
         "Should use MaskedValueIsZero if you don't know whether Op"
26958
367k
         " is a target node!");
26959
367k
26960
367k
  Known.resetAll();
26961
367k
  switch (Opc) {
26962
317k
  default: break;
26963
12.0k
  case X86ISD::SETCC:
26964
12.0k
    Known.Zero.setBitsFrom(1);
26965
12.0k
    break;
26966
958
  case X86ISD::MOVMSK: {
26967
958
    unsigned NumLoBits = Op.getOperand(0).getValueType().getVectorNumElements();
26968
958
    Known.Zero.setBitsFrom(NumLoBits);
26969
958
    break;
26970
367k
  }
26971
36.0k
  case X86ISD::VSHLI:
26972
36.0k
  case X86ISD::VSRLI: {
26973
36.0k
    if (auto *ShiftImm = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
26974
36.0k
      if (ShiftImm->getAPIntValue().uge(VT.getScalarSizeInBits())) {
26975
0
        Known.setAllZero();
26976
0
        break;
26977
0
      }
26978
36.0k
26979
36.0k
      DAG.computeKnownBits(Op.getOperand(0), Known, Depth + 1);
26980
36.0k
      unsigned ShAmt = ShiftImm->getZExtValue();
26981
36.0k
      if (Opc == X86ISD::VSHLI) {
26982
22.0k
        Known.Zero <<= ShAmt;
26983
22.0k
        Known.One <<= ShAmt;
26984
22.0k
        // Low bits are known zero.
26985
22.0k
        Known.Zero.setLowBits(ShAmt);
26986
36.0k
      } else {
26987
14.0k
        Known.Zero.lshrInPlace(ShAmt);
26988
14.0k
        Known.One.lshrInPlace(ShAmt);
26989
14.0k
        // High bits are known zero.
26990
14.0k
        Known.Zero.setHighBits(ShAmt);
26991
14.0k
      }
26992
36.0k
    }
26993
36.0k
    break;
26994
36.0k
  }
26995
722
  case X86ISD::VZEXT: {
26996
722
    SDValue N0 = Op.getOperand(0);
26997
722
    unsigned NumElts = VT.getVectorNumElements();
26998
722
26999
722
    EVT SrcVT = N0.getValueType();
27000
722
    unsigned InNumElts = SrcVT.getVectorNumElements();
27001
722
    unsigned InBitWidth = SrcVT.getScalarSizeInBits();
27002
722
    assert(InNumElts >= NumElts && "Illegal VZEXT input");
27003
722
27004
722
    Known = KnownBits(InBitWidth);
27005
722
    APInt DemandedSrcElts = APInt::getLowBitsSet(InNumElts, NumElts);
27006
722
    DAG.computeKnownBits(N0, Known, DemandedSrcElts, Depth + 1);
27007
722
    Known = Known.zext(BitWidth);
27008
722
    Known.Zero.setBitsFrom(InBitWidth);
27009
722
    break;
27010
367k
  }
27011
367k
  }
27012
367k
}
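The VSHLI/VSRLI handling above uses the usual known-bits rule for shifts by a constant: a left shift makes the low ShAmt bits known zero, a logical right shift makes the high ShAmt bits known zero. A minimal scalar sketch of that rule (illustrative only, assuming a 64-bit value and ShAmt < 64, with plain uint64_t masks instead of the KnownBits class):

#include <cstdint>

// Known bits are tracked as two masks: bits known to be zero and bits known
// to be one. A constant shift moves both masks and pins the vacated bits.
struct Known { uint64_t Zero = 0, One = 0; };

Known shiftLeftKnown(Known In, unsigned ShAmt) {
  Known Out;
  Out.Zero = In.Zero << ShAmt;
  Out.One = In.One << ShAmt;
  if (ShAmt)
    Out.Zero |= (1ULL << ShAmt) - 1;   // low bits are now known zero
  return Out;
}

Known logicalShiftRightKnown(Known In, unsigned ShAmt) {
  Known Out;
  Out.Zero = In.Zero >> ShAmt;
  Out.One = In.One >> ShAmt;
  if (ShAmt)
    Out.Zero |= ~0ULL << (64 - ShAmt); // high bits are now known zero
  return Out;
}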
27013
27014
unsigned X86TargetLowering::ComputeNumSignBitsForTargetNode(
27015
    SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG,
27016
10.6k
    unsigned Depth) const {
27017
10.6k
  unsigned VTBits = Op.getScalarValueSizeInBits();
27018
10.6k
  unsigned Opcode = Op.getOpcode();
27019
10.6k
  switch (Opcode) {
27020
154
  case X86ISD::SETCC_CARRY:
27021
154
    // SETCC_CARRY sets the dest to ~0 for true or 0 for false.
27022
154
    return VTBits;
27023
10.6k
27024
104
  case X86ISD::VSEXT: {
27025
104
    SDValue Src = Op.getOperand(0);
27026
104
    unsigned Tmp = DAG.ComputeNumSignBits(Src, Depth + 1);
27027
104
    Tmp += VTBits - Src.getScalarValueSizeInBits();
27028
104
    return Tmp;
27029
10.6k
  }
27030
10.6k
27031
50
  case X86ISD::PACKSS: {
27032
50
    // PACKSS is just a truncation if the sign bits extend to the packed size.
27033
50
    // TODO: Add DemandedElts support.
27034
50
    unsigned SrcBits = Op.getOperand(0).getScalarValueSizeInBits();
27035
50
    unsigned Tmp0 = DAG.ComputeNumSignBits(Op.getOperand(0), Depth + 1);
27036
50
    unsigned Tmp1 = DAG.ComputeNumSignBits(Op.getOperand(1), Depth + 1);
27037
50
    unsigned Tmp = std::min(Tmp0, Tmp1);
27038
50
    if (Tmp > (SrcBits - VTBits))
27039
34
      return Tmp - (SrcBits - VTBits);
27040
16
    return 1;
27041
16
  }
27042
16
27043
805
  case X86ISD::VSHLI: {
27044
805
    SDValue Src = Op.getOperand(0);
27045
805
    unsigned Tmp = DAG.ComputeNumSignBits(Src, Depth + 1);
27046
805
    APInt ShiftVal = cast<ConstantSDNode>(Op.getOperand(1))->getAPIntValue();
27047
805
    if (ShiftVal.uge(VTBits))
27048
0
      return VTBits; // Shifted all bits out --> zero.
27049
805
    if (ShiftVal.uge(Tmp))
27050
805
      return 1; // Shifted all sign bits out --> unknown.
27051
0
    return Tmp - ShiftVal.getZExtValue();
27052
0
  }
27053
0
27054
347
  case X86ISD::VSRAI: {
27055
347
    SDValue Src = Op.getOperand(0);
27056
347
    unsigned Tmp = DAG.ComputeNumSignBits(Src, Depth + 1);
27057
347
    APInt ShiftVal = cast<ConstantSDNode>(Op.getOperand(1))->getAPIntValue();
27058
347
    ShiftVal += Tmp;
27059
347
    return ShiftVal.uge(VTBits) ? VTBits : ShiftVal.getZExtValue();
27060
0
  }
27061
0
27062
2.55k
  case X86ISD::PCMPGT:
27063
2.55k
  case X86ISD::PCMPEQ:
27064
2.55k
  case X86ISD::CMPP:
27065
2.55k
  case X86ISD::VPCOM:
27066
2.55k
  case X86ISD::VPCOMU:
27067
2.55k
    // Vector compares return zero/all-bits result values.
27068
2.55k
    return VTBits;
27069
6.66k
  }
27070
6.66k
27071
6.66k
  // Fallback case.
27072
6.66k
  return 1;
27073
6.66k
}
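The VSHLI/VSRAI cases above follow the standard sign-bit counting rules for immediate shifts: an arithmetic right shift gains ShAmt copies of the sign bit (capped at the bit width), while a left shift loses ShAmt of them. A hedged, standalone sketch of those two rules, assuming a scalar value with VTBits bits:

#include <cstdint>

// "SignBits" is the number of leading copies of the sign bit known so far.
unsigned signBitsAfterAShr(unsigned SignBits, unsigned ShAmt, unsigned VTBits) {
  // An arithmetic right shift duplicates the sign bit ShAmt more times.
  uint64_t Total = uint64_t(SignBits) + ShAmt;
  return Total >= VTBits ? VTBits : unsigned(Total);
}

unsigned signBitsAfterShl(unsigned SignBits, unsigned ShAmt, unsigned VTBits) {
  if (ShAmt >= VTBits)
    return VTBits;        // everything shifted out, the result is zero
  if (ShAmt >= SignBits)
    return 1;             // all known sign bits shifted out, nothing known
  return SignBits - ShAmt; // a left shift discards ShAmt known sign bits
}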
27074
27075
1.70M
SDValue X86TargetLowering::unwrapAddress(SDValue N) const {
27076
1.70M
  if (N->getOpcode() == X86ISD::Wrapper || N->getOpcode() == X86ISD::WrapperRIP)
27077
7.14k
    return N->getOperand(0);
27078
1.70M
  return N;
27079
1.70M
}
27080
27081
/// Returns true (and the GlobalValue and the offset) if the node is a
27082
/// GlobalAddress + offset.
27083
bool X86TargetLowering::isGAPlusOffset(SDNode *N,
27084
                                       const GlobalValue* &GA,
27085
9.44M
                                       int64_t &Offset) const {
27086
9.44M
  if (N->getOpcode() == X86ISD::Wrapper) {
27087
25.2k
    if (isa<GlobalAddressSDNode>(N->getOperand(0))) {
27088
6.49k
      GA = cast<GlobalAddressSDNode>(N->getOperand(0))->getGlobal();
27089
6.49k
      Offset = cast<GlobalAddressSDNode>(N->getOperand(0))->getOffset();
27090
6.49k
      return true;
27091
6.49k
    }
27092
9.43M
  }
27093
9.43M
  return TargetLowering::isGAPlusOffset(N, GA, Offset);
27094
9.43M
}
27095
27096
// Attempt to match a combined shuffle mask against supported unary shuffle
27097
// instructions.
27098
// TODO: Investigate sharing more of this with shuffle lowering.
27099
static bool matchUnaryVectorShuffle(MVT MaskVT, ArrayRef<int> Mask,
27100
                                    bool AllowFloatDomain, bool AllowIntDomain,
27101
                                    SDValue &V1, SDLoc &DL, SelectionDAG &DAG,
27102
                                    const X86Subtarget &Subtarget,
27103
43.1k
                                    unsigned &Shuffle, MVT &SrcVT, MVT &DstVT) {
27104
43.1k
  unsigned NumMaskElts = Mask.size();
27105
43.1k
  unsigned MaskEltSize = MaskVT.getScalarSizeInBits();
27106
43.1k
27107
43.1k
  // Match against a ZERO_EXTEND_VECTOR_INREG/VZEXT instruction.
27108
43.1k
  // TODO: Add 512-bit vector support (split AVX512F and AVX512BW).
27109
43.1k
  if (AllowIntDomain && ((MaskVT.is128BitVector() && Subtarget.hasSSE41()) ||
27110
43.1k
                         (MaskVT.is256BitVector() && Subtarget.hasInt256()))) {
27111
21.0k
    unsigned MaxScale = 64 / MaskEltSize;
27112
61.0k
    for (unsigned Scale = 2; Scale <= MaxScale; Scale *= 2) {
27113
40.0k
      bool Match = true;
27114
40.0k
      unsigned NumDstElts = NumMaskElts / Scale;
27115
85.0k
      for (unsigned i = 0; i != NumDstElts && Match; ++i) {
27116
44.9k
        Match &= isUndefOrEqual(Mask[i * Scale], (int)i);
27117
44.9k
        Match &= isUndefOrZeroInRange(Mask, (i * Scale) + 1, Scale - 1);
27118
44.9k
      }
27119
40.0k
      if (Match) {
27120
50
        unsigned SrcSize = std::max(128u, NumDstElts * MaskEltSize);
27121
48
        MVT ScalarTy = MaskVT.isInteger() ? MaskVT.getScalarType() :
27122
2
                                            MVT::getIntegerVT(MaskEltSize);
27123
50
        SrcVT = MVT::getVectorVT(ScalarTy, SrcSize / MaskEltSize);
27124
50
27125
50
        if (SrcVT.getSizeInBits() != MaskVT.getSizeInBits()) {
27126
6
          V1 = extractSubVector(V1, 0, DAG, DL, SrcSize);
27127
6
          Shuffle = unsigned(X86ISD::VZEXT);
27128
6
        } else
27129
44
          Shuffle = unsigned(ISD::ZERO_EXTEND_VECTOR_INREG);
27130
50
27131
50
        DstVT = MVT::getIntegerVT(Scale * MaskEltSize);
27132
50
        DstVT = MVT::getVectorVT(DstVT, NumDstElts);
27133
50
        return true;
27134
50
      }
27135
40.0k
    }
27136
21.0k
  }
27137
43.1k
27138
43.1k
  // Match against a VZEXT_MOVL instruction, SSE1 only supports 32-bits (MOVSS).
27139
43.0k
  if (((MaskEltSize == 32) || (MaskEltSize == 64 && Subtarget.hasSSE2())) &&
27140
17.6k
      isUndefOrEqual(Mask[0], 0) &&
27141
43.0k
      isUndefOrZeroInRange(Mask, 1, NumMaskElts - 1)) {
27142
1.45k
    Shuffle = X86ISD::VZEXT_MOVL;
27143
1.45k
    SrcVT = DstVT = !Subtarget.hasSSE2() ? MVT::v4f32 : MaskVT;
27144
1.45k
    return true;
27145
1.45k
  }
27146
41.6k
27147
41.6k
  // Check if we have SSE3 which will let us use MOVDDUP etc. The
27148
41.6k
  // instructions are no slower than UNPCKLPD but has the option to
27149
41.6k
  // fold the input operand into even an unaligned memory load.
27150
41.6k
  if (MaskVT.is128BitVector() && Subtarget.hasSSE3() && AllowFloatDomain) {
27151
2.98k
    if (isTargetShuffleEquivalent(Mask, {0, 0})) {
27152
333
      Shuffle = X86ISD::MOVDDUP;
27153
333
      SrcVT = DstVT = MVT::v2f64;
27154
333
      return true;
27155
333
    }
27156
2.65k
    if (isTargetShuffleEquivalent(Mask, {0, 0, 2, 2})) {
27157
98
      Shuffle = X86ISD::MOVSLDUP;
27158
98
      SrcVT = DstVT = MVT::v4f32;
27159
98
      return true;
27160
98
    }
27161
2.55k
    if (isTargetShuffleEquivalent(Mask, {1, 1, 3, 3})) {
27162
341
      Shuffle = X86ISD::MOVSHDUP;
27163
341
      SrcVT = DstVT = MVT::v4f32;
27164
341
      return true;
27165
341
    }
27166
40.8k
  }
27167
40.8k
27168
40.8k
  if (MaskVT.is256BitVector() && AllowFloatDomain) {
27169
1.45k
    assert(Subtarget.hasAVX() && "AVX required for 256-bit vector shuffles");
27170
1.45k
    if (isTargetShuffleEquivalent(Mask, {0, 0, 2, 2})) {
27171
117
      Shuffle = X86ISD::MOVDDUP;
27172
117
      SrcVT = DstVT = MVT::v4f64;
27173
117
      return true;
27174
117
    }
27175
1.33k
    if (isTargetShuffleEquivalent(Mask, {0, 0, 2, 2, 4, 4, 6, 6})) {
27176
70
      Shuffle = X86ISD::MOVSLDUP;
27177
70
      SrcVT = DstVT = MVT::v8f32;
27178
70
      return true;
27179
70
    }
27180
1.26k
    if (isTargetShuffleEquivalent(Mask, {1, 1, 3, 3, 5, 5, 7, 7})) {
27181
71
      Shuffle = X86ISD::MOVSHDUP;
27182
71
      SrcVT = DstVT = MVT::v8f32;
27183
71
      return true;
27184
71
    }
27185
40.5k
  }
27186
40.5k
27187
40.5k
  if (MaskVT.is512BitVector() && AllowFloatDomain) {
27188
516
    assert(Subtarget.hasAVX512() &&
27189
516
           "AVX512 required for 512-bit vector shuffles");
27190
516
    if (isTargetShuffleEquivalent(Mask, {0, 0, 2, 2, 4, 4, 6, 6})) {
27191
43
      Shuffle = X86ISD::MOVDDUP;
27192
43
      SrcVT = DstVT = MVT::v8f64;
27193
43
      return true;
27194
43
    }
27195
473
    if (isTargetShuffleEquivalent(
27196
473
            Mask, {0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14})) {
27197
47
      Shuffle = X86ISD::MOVSLDUP;
27198
47
      SrcVT = DstVT = MVT::v16f32;
27199
47
      return true;
27200
47
    }
27201
426
    if (isTargetShuffleEquivalent(
27202
426
            Mask, {1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15})) {
27203
43
      Shuffle = X86ISD::MOVSHDUP;
27204
43
      SrcVT = DstVT = MVT::v16f32;
27205
43
      return true;
27206
43
    }
27207
40.4k
  }
27208
40.4k
27209
40.4k
  // Attempt to match against broadcast-from-vector.
27210
40.4k
  if (Subtarget.hasAVX2()) {
27211
12.1k
    SmallVector<int, 64> BroadcastMask(NumMaskElts, 0);
27212
12.1k
    if (isTargetShuffleEquivalent(Mask, BroadcastMask)) {
27213
85
      SrcVT = DstVT = MaskVT;
27214
85
      Shuffle = X86ISD::VBROADCAST;
27215
85
      return true;
27216
85
    }
27217
40.3k
  }
27218
40.3k
27219
40.3k
  return false;
27220
40.3k
}
27221
27222
// Attempt to match a combined shuffle mask against supported unary immediate
27223
// permute instructions.
27224
// TODO: Investigate sharing more of this with shuffle lowering.
27225
static bool matchUnaryPermuteVectorShuffle(MVT MaskVT, ArrayRef<int> Mask,
27226
                                           const APInt &Zeroable,
27227
                                           bool AllowFloatDomain,
27228
                                           bool AllowIntDomain,
27229
                                           const X86Subtarget &Subtarget,
27230
                                           unsigned &Shuffle, MVT &ShuffleVT,
27231
40.3k
                                           unsigned &PermuteImm) {
27232
40.3k
  unsigned NumMaskElts = Mask.size();
27233
40.3k
  unsigned InputSizeInBits = MaskVT.getSizeInBits();
27234
40.3k
  unsigned MaskScalarSizeInBits = InputSizeInBits / NumMaskElts;
27235
40.3k
  MVT MaskEltVT = MVT::getIntegerVT(MaskScalarSizeInBits);
27236
40.3k
27237
40.3k
  bool ContainsZeros =
27238
259k
      llvm::any_of(Mask, [](int M) { return M == SM_SentinelZero; });
27239
40.3k
27240
40.3k
  // Handle VPERMI/VPERMILPD vXi64/vXi64 patterns.
27241
40.3k
  if (!ContainsZeros && MaskScalarSizeInBits == 64) {
27242
5.32k
    // Check for lane crossing permutes.
27243
5.32k
    if (is128BitLaneCrossingShuffleMask(MaskEltVT, Mask)) {
27244
1.57k
      // PERMPD/PERMQ permutes within a 256-bit vector (AVX2+).
27245
1.57k
      if (Subtarget.hasAVX2() && MaskVT.is256BitVector()) {
27246
1.22k
        Shuffle = X86ISD::VPERMI;
27247
1.22k
        ShuffleVT = (AllowFloatDomain ? MVT::v4f64 : MVT::v4i64);
27248
1.22k
        PermuteImm = getV4X86ShuffleImm(Mask);
27249
1.22k
        return true;
27250
1.22k
      }
27251
346
      if (Subtarget.hasAVX512() && MaskVT.is512BitVector()) {
27252
315
        SmallVector<int, 4> RepeatedMask;
27253
315
        if (is256BitLaneRepeatedShuffleMask(MVT::v8f64, Mask, RepeatedMask)) {
27254
108
          Shuffle = X86ISD::VPERMI;
27255
108
          ShuffleVT = (AllowFloatDomain ? MVT::v8f64 : MVT::v8i64);
27256
108
          PermuteImm = getV4X86ShuffleImm(RepeatedMask);
27257
108
          return true;
27258
108
        }
27259
5.32k
      }
27260
3.75k
    } else if (AllowFloatDomain && Subtarget.hasAVX()) {
27261
902
      // VPERMILPD can permute with a non-repeating shuffle.
27262
902
      Shuffle = X86ISD::VPERMILPI;
27263
902
      ShuffleVT = MVT::getVectorVT(MVT::f64, Mask.size());
27264
902
      PermuteImm = 0;
27265
3.33k
      for (int i = 0, e = Mask.size(); i != e; ++i) {
27266
2.42k
        int M = Mask[i];
27267
2.42k
        if (M == SM_SentinelUndef)
27268
0
          continue;
27269
2.42k
        assert(((M / 2) == (i / 2)) && "Out of range shuffle mask index");
27270
2.42k
        PermuteImm |= (M & 1) << i;
27271
2.42k
      }
27272
3.75k
      return true;
27273
3.75k
    }
27274
38.1k
  }
27275
38.1k
27276
38.1k
  // Handle PSHUFD/VPERMILPI vXi32/vXf32 repeated patterns.
27277
38.1k
  // AVX introduced the VPERMILPD/VPERMILPS float permutes, before then we
27278
38.1k
  // had to use 2-input SHUFPD/SHUFPS shuffles (not handled here).
27279
38.1k
  if ((MaskScalarSizeInBits == 64 || MaskScalarSizeInBits == 32) &&
27280
38.1k
      !ContainsZeros && (AllowIntDomain || Subtarget.hasAVX())) {
27281
8.55k
    SmallVector<int, 4> RepeatedMask;
27282
8.55k
    if (is128BitLaneRepeatedShuffleMask(MaskEltVT, Mask, RepeatedMask)) {
27283
7.53k
      // Narrow the repeated mask to create 32-bit element permutes.
27284
7.53k
      SmallVector<int, 4> WordMask = RepeatedMask;
27285
7.53k
      if (MaskScalarSizeInBits == 64)
27286
2.11k
        scaleShuffleMask<int>(2, RepeatedMask, WordMask);
27287
7.53k
27288
7.53k
      Shuffle = (AllowIntDomain ? X86ISD::PSHUFD : X86ISD::VPERMILPI);
27289
7.53k
      ShuffleVT = (AllowIntDomain ? MVT::i32 : MVT::f32);
27290
7.53k
      ShuffleVT = MVT::getVectorVT(ShuffleVT, InputSizeInBits / 32);
27291
7.53k
      PermuteImm = getV4X86ShuffleImm(WordMask);
27292
7.53k
      return true;
27293
7.53k
    }
27294
30.5k
  }
27295
30.5k
27296
30.5k
  // Handle PSHUFLW/PSHUFHW vXi16 repeated patterns.
27297
30.5k
  if (!ContainsZeros && AllowIntDomain && MaskScalarSizeInBits == 16) {
27298
9.08k
    SmallVector<int, 4> RepeatedMask;
27299
9.08k
    if (is128BitLaneRepeatedShuffleMask(MaskEltVT, Mask, RepeatedMask)) {
27300
8.51k
      ArrayRef<int> LoMask(Mask.data() + 0, 4);
27301
8.51k
      ArrayRef<int> HiMask(Mask.data() + 4, 4);
27302
8.51k
27303
8.51k
      // PSHUFLW: permute lower 4 elements only.
27304
8.51k
      if (isUndefOrInRange(LoMask, 0, 4) &&
27305
8.51k
          isSequentialOrUndefInRange(HiMask, 0, 4, 4)) {
27306
1.25k
        Shuffle = X86ISD::PSHUFLW;
27307
1.25k
        ShuffleVT = MVT::getVectorVT(MVT::i16, InputSizeInBits / 16);
27308
1.25k
        PermuteImm = getV4X86ShuffleImm(LoMask);
27309
1.25k
        return true;
27310
1.25k
      }
27311
7.25k
27312
7.25k
      // PSHUFHW: permute upper 4 elements only.
27313
7.25k
      if (isUndefOrInRange(HiMask, 4, 8) &&
27314
7.25k
          isSequentialOrUndefInRange(LoMask, 0, 4, 0)) {
27315
583
        // Offset the HiMask so that we can create the shuffle immediate.
27316
583
        int OffsetHiMask[4];
27317
2.91k
        for (int i = 0; i != 4; ++i)
27318
2.33k
          OffsetHiMask[i] = (HiMask[i] < 0 ? HiMask[i] : HiMask[i] - 4);
27319
583
27320
583
        Shuffle = X86ISD::PSHUFHW;
27321
583
        ShuffleVT = MVT::getVectorVT(MVT::i16, InputSizeInBits / 16);
27322
583
        PermuteImm = getV4X86ShuffleImm(OffsetHiMask);
27323
583
        return true;
27324
583
      }
27325
28.7k
    }
27326
9.08k
  }
27327
28.7k
27328
28.7k
  // Attempt to match against byte/bit shifts.
27329
28.7k
  // FIXME: Add 512-bit support.
27330
28.7k
  if (AllowIntDomain && ((MaskVT.is128BitVector() && Subtarget.hasSSE2()) ||
27331
28.7k
                         (MaskVT.is256BitVector() && Subtarget.hasAVX2()))) {
27332
25.8k
    int ShiftAmt = matchVectorShuffleAsShift(ShuffleVT, Shuffle,
27333
25.8k
                                             MaskScalarSizeInBits, Mask,
27334
25.8k
                                             0, Zeroable, Subtarget);
27335
25.8k
    if (0 < ShiftAmt) {
27336
4.42k
      PermuteImm = (unsigned)ShiftAmt;
27337
4.42k
      return true;
27338
4.42k
    }
27339
24.3k
  }
27340
24.3k
27341
24.3k
  return false;
27342
24.3k
}
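Several of the matches above funnel into getV4X86ShuffleImm, which packs a four-element mask into the 8-bit immediate used by PSHUFD/VPERMILPI/PSHUFLW/PSHUFHW: two bits per element. A hedged, standalone sketch of that encoding (the helper name below is hypothetical and only for illustration):

#include <cstdint>

// Packs four 2-bit selectors into a PSHUFD-style immediate:
// element i of the result comes from source element (Imm >> (2*i)) & 3.
uint8_t encodeV4ShuffleImm(const int Mask[4]) {
  uint8_t Imm = 0;
  for (int i = 0; i != 4; ++i) {
    int M = Mask[i] < 0 ? 0 : Mask[i]; // treat undef lanes as element 0
    Imm |= uint8_t(M & 3) << (2 * i);
  }
  return Imm;
}
// Example: the mask {2, 3, 0, 1} encodes as 0b01001110 == 0x4E, the classic
// immediate for swapping the two 64-bit halves of a 128-bit vector.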
27343
27344
// Attempt to match a combined unary shuffle mask against supported binary
27345
// shuffle instructions.
27346
// TODO: Investigate sharing more of this with shuffle lowering.
27347
static bool matchBinaryVectorShuffle(MVT MaskVT, ArrayRef<int> Mask,
27348
                                     bool AllowFloatDomain, bool AllowIntDomain,
27349
                                     SDValue &V1, SDValue &V2, SDLoc &DL,
27350
                                     SelectionDAG &DAG,
27351
                                     const X86Subtarget &Subtarget,
27352
                                     unsigned &Shuffle, MVT &ShuffleVT,
27353
61.9k
                                     bool IsUnary) {
27354
61.9k
  unsigned EltSizeInBits = MaskVT.getScalarSizeInBits();
27355
61.9k
27356
61.9k
  if (MaskVT.is128BitVector()) {
27357
52.8k
    if (isTargetShuffleEquivalent(Mask, {0, 0}) && AllowFloatDomain) {
27358
74
      V2 = V1;
27359
74
      Shuffle = X86ISD::MOVLHPS;
27360
74
      ShuffleVT = MVT::v4f32;
27361
74
      return true;
27362
74
    }
27363
52.7k
    if (isTargetShuffleEquivalent(Mask, {1, 1}) && AllowFloatDomain) {
27364
546
      V2 = V1;
27365
546
      Shuffle = X86ISD::MOVHLPS;
27366
546
      ShuffleVT = MVT::v4f32;
27367
546
      return true;
27368
546
    }
27369
52.1k
    if (isTargetShuffleEquivalent(Mask, {0, 3}) && Subtarget.hasSSE2() &&
27370
52.1k
        (AllowFloatDomain || !Subtarget.hasSSE41())) {
27371
1.28k
      std::swap(V1, V2);
27372
1.28k
      Shuffle = X86ISD::MOVSD;
27373
1.28k
      ShuffleVT = MaskVT;
27374
1.28k
      return true;
27375
1.28k
    }
27376
50.9k
    if (isTargetShuffleEquivalent(Mask, {4, 1, 2, 3}) &&
27377
50.9k
        (AllowFloatDomain || !Subtarget.hasSSE41())) {
27378
595
      Shuffle = X86ISD::MOVSS;
27379
595
      ShuffleVT = MaskVT;
27380
595
      return true;
27381
595
    }
27382
59.4k
  }
27383
59.4k
27384
59.4k
  // Attempt to match against either a unary or binary UNPCKL/UNPCKH shuffle.
27385
59.4k
  
if (59.4k
(MaskVT == MVT::v4f32 && 59.4k
Subtarget.hasSSE1()3.38k
) ||
27386
56.1k
      
(MaskVT.is128BitVector() && 56.1k
Subtarget.hasSSE2()46.9k
) ||
27387
9.17k
      
(MaskVT.is256BitVector() && 9.17k
32 <= EltSizeInBits7.51k
&&
Subtarget.hasAVX()3.19k
) ||
27388
5.98k
      
(MaskVT.is256BitVector() && 5.98k
Subtarget.hasAVX2()4.32k
) ||
27389
59.4k
      
(MaskVT.is512BitVector() && 1.84k
Subtarget.hasAVX512()1.66k
)) {
27390
59.3k
    if (matchVectorShuffleWithUNPCK(MaskVT, V1, V2, Shuffle, IsUnary, Mask, DL,
27391
59.3k
                                    DAG, Subtarget)) {
27392
10.3k
      ShuffleVT = MaskVT;
27393
10.3k
      if (ShuffleVT.is256BitVector() && !Subtarget.hasAVX2())
27394
128
        ShuffleVT = (32 == EltSizeInBits ? MVT::v8f32 : MVT::v4f64);
27395
10.3k
      return true;
27396
10.3k
    }
27397
49.1k
  }
27398
49.1k
27399
49.1k
  return false;
27400
49.1k
}
27401
27402
static bool matchBinaryPermuteVectorShuffle(MVT MaskVT, ArrayRef<int> Mask,
27403
                                            const APInt &Zeroable,
27404
                                            bool AllowFloatDomain,
27405
                                            bool AllowIntDomain,
27406
                                            SDValue &V1, SDValue &V2, SDLoc &DL,
27407
                                            SelectionDAG &DAG,
27408
                                            const X86Subtarget &Subtarget,
27409
                                            unsigned &Shuffle, MVT &ShuffleVT,
27410
49.1k
                                            unsigned &PermuteImm) {
27411
49.1k
  unsigned NumMaskElts = Mask.size();
27412
49.1k
  unsigned EltSizeInBits = MaskVT.getScalarSizeInBits();
27413
49.1k
27414
49.1k
  // Attempt to match against PALIGNR byte rotate.
27415
49.1k
  if (AllowIntDomain && ((MaskVT.is128BitVector() && Subtarget.hasSSSE3()) ||
27416
49.1k
                         (MaskVT.is256BitVector() && Subtarget.hasAVX2()))) {
27417
27.2k
    int ByteRotation = matchVectorShuffleAsByteRotate(MaskVT, V1, V2, Mask);
27418
27.2k
    if (0 < ByteRotation) {
27419
546
      Shuffle = X86ISD::PALIGNR;
27420
546
      ShuffleVT = MVT::getVectorVT(MVT::i8, MaskVT.getSizeInBits() / 8);
27421
546
      PermuteImm = ByteRotation;
27422
546
      return true;
27423
546
    }
27424
48.6k
  }
27425
48.6k
27426
48.6k
  // Attempt to combine to X86ISD::BLENDI.
27427
48.6k
  if ((NumMaskElts <= 8 && ((Subtarget.hasSSE41() && MaskVT.is128BitVector()) ||
27428
30.5k
                            (Subtarget.hasAVX() && MaskVT.is256BitVector()))) ||
27429
48.6k
      (MaskVT == MVT::v16i16 && Subtarget.hasAVX2())) {
27430
19.5k
    uint64_t BlendMask = 0;
27431
19.5k
    bool ForceV1Zero = false, ForceV2Zero = false;
27432
19.5k
    SmallVector<int, 8> TargetMask(Mask.begin(), Mask.end());
27433
19.5k
    if (matchVectorShuffleAsBlend(V1, V2, TargetMask, ForceV1Zero, ForceV2Zero,
27434
19.5k
                                  BlendMask)) {
27435
3.54k
      if (MaskVT == MVT::v16i16) {
27436
149
        // We can only use v16i16 PBLENDW if the lanes are repeated.
27437
149
        SmallVector<int, 8> RepeatedMask;
27438
149
        if (isRepeatedTargetShuffleMask(128, MaskVT, TargetMask,
27439
149
                                        RepeatedMask)) {
27440
97
          assert(RepeatedMask.size() == 8 &&
27441
97
                 "Repeated mask size doesn't match!");
27442
97
          PermuteImm = 0;
27443
873
          for (int i = 0; i < 8; ++i)
27444
776
            if (RepeatedMask[i] >= 8)
27445
374
              PermuteImm |= 1 << i;
27446
97
          V1 = ForceV1Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V1;
27447
97
          V2 = ForceV2Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V2;
27448
97
          Shuffle = X86ISD::BLENDI;
27449
97
          ShuffleVT = MaskVT;
27450
97
          return true;
27451
97
        }
27452
3.39k
      } else {
27453
3.39k
        // Determine a type compatible with X86ISD::BLENDI.
27454
3.39k
        ShuffleVT = MaskVT;
27455
3.39k
        if (Subtarget.hasAVX2()) {
27456
1.09k
          if (ShuffleVT == MVT::v4i64)
27457
141
            ShuffleVT = MVT::v8i32;
27458
957
          else if (ShuffleVT == MVT::v2i64)
27459
125
            ShuffleVT = MVT::v4i32;
27460
3.39k
        } else {
27461
2.29k
          if (ShuffleVT == MVT::v2i64 || ShuffleVT == MVT::v4i32)
27462
1.37k
            ShuffleVT = MVT::v8i16;
27463
925
          else if (ShuffleVT == MVT::v4i64)
27464
0
            ShuffleVT = MVT::v4f64;
27465
925
          else if (ShuffleVT == MVT::v8i32)
27466
0
            ShuffleVT = MVT::v8f32;
27467
2.29k
        }
27468
3.39k
27469
3.39k
        if (!ShuffleVT.isFloatingPoint()) {
27470
2.73k
          int Scale = EltSizeInBits / ShuffleVT.getScalarSizeInBits();
27471
2.73k
          BlendMask =
27472
2.73k
              scaleVectorShuffleBlendMask(BlendMask, NumMaskElts, Scale);
27473
2.73k
          ShuffleVT = MVT::getIntegerVT(EltSizeInBits / Scale);
27474
2.73k
          ShuffleVT = MVT::getVectorVT(ShuffleVT, NumMaskElts * Scale);
27475
2.73k
        }
27476
3.39k
27477
3.39k
        V1 = ForceV1Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V1;
27478
3.39k
        V2 = ForceV2Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V2;
27479
3.39k
        PermuteImm = (unsigned)BlendMask;
27480
3.39k
        Shuffle = X86ISD::BLENDI;
27481
3.39k
        return true;
27482
3.39k
      }
27483
45.1k
    }
27484
19.5k
  }
27485
45.1k
27486
45.1k
  // Attempt to combine to INSERTPS.
27487
45.1k
  if (AllowFloatDomain && EltSizeInBits == 32 && Subtarget.hasSSE41() &&
27488
45.1k
      MaskVT.is128BitVector()) {
27489
1.70k
    if (Zeroable.getBoolValue() &&
27490
1.70k
        matchVectorShuffleAsInsertPS(V1, V2, PermuteImm, Zeroable, Mask, DAG)) {
27491
258
      Shuffle = X86ISD::INSERTPS;
27492
258
      ShuffleVT = MVT::v4f32;
27493
258
      return true;
27494
258
    }
27495
44.8k
  }
27496
44.8k
27497
44.8k
  // Attempt to combine to SHUFPD.
27498
44.8k
  if (AllowFloatDomain && EltSizeInBits == 64 &&
27499
754
      ((MaskVT.is128BitVector() && Subtarget.hasSSE2()) ||
27500
602
       (MaskVT.is256BitVector() && Subtarget.hasAVX()) ||
27501
44.8k
       (MaskVT.is512BitVector() && Subtarget.hasAVX512()))) {
27502
754
    if (matchVectorShuffleWithSHUFPD(MaskVT, V1, V2, PermuteImm, Mask)) {
27503
166
      Shuffle = X86ISD::SHUFP;
27504
166
      ShuffleVT = MVT::getVectorVT(MVT::f64, MaskVT.getSizeInBits() / 64);
27505
166
      return true;
27506
166
    }
27507
44.7k
  }
27508
44.7k
27509
44.7k
  // Attempt to combine to SHUFPS.
27510
44.7k
  if (AllowFloatDomain && EltSizeInBits == 32 &&
27511
3.48k
      ((MaskVT.is128BitVector() && Subtarget.hasSSE1()) ||
27512
1.02k
       (MaskVT.is256BitVector() && Subtarget.hasAVX()) ||
27513
44.7k
       (MaskVT.is512BitVector() && Subtarget.hasAVX512()))) {
27514
3.48k
    SmallVector<int, 4> RepeatedMask;
27515
3.48k
    if (isRepeatedTargetShuffleMask(128, MaskVT, Mask, RepeatedMask)) {
27516
2.61k
      // Match each half of the repeated mask, to determine if its just
27517
2.61k
      // referencing one of the vectors, is zeroable or entirely undef.
27518
5.22k
      auto MatchHalf = [&](unsigned Offset, int &S0, int &S1) {
27519
5.22k
        int M0 = RepeatedMask[Offset];
27520
5.22k
        int M1 = RepeatedMask[Offset + 1];
27521
5.22k
27522
5.22k
        if (isUndefInRange(RepeatedMask, Offset, 2)) {
27523
4
          return DAG.getUNDEF(MaskVT);
27524
5.22k
        } else if (isUndefOrZeroInRange(RepeatedMask, Offset, 2)) {
27525
117
          S0 = (SM_SentinelUndef == M0 ? -1 : 0);
27526
117
          S1 = (SM_SentinelUndef == M1 ? -1 : 1);
27527
117
          return getZeroVector(MaskVT, Subtarget, DAG, DL);
27528
5.10k
        } else if (isUndefOrInRange(M0, 0, 4) && isUndefOrInRange(M1, 0, 4)) {
27529
2.96k
          S0 = (SM_SentinelUndef == M0 ? -1 : M0 & 3);
27530
2.96k
          S1 = (SM_SentinelUndef == M1 ? -1 : M1 & 3);
27531
2.96k
          return V1;
27532
2.13k
        } else if (isUndefOrInRange(M0, 4, 8) && isUndefOrInRange(M1, 4, 8)) {
27533
663
          S0 = (SM_SentinelUndef == M0 ? -1 : M0 & 3);
27534
663
          S1 = (SM_SentinelUndef == M1 ? -1 : M1 & 3);
27535
5.22k
          return V2;
27536
5.22k
        }
27537
1.47k
27538
1.47k
        return SDValue();
27539
1.47k
      };
27540
2.61k
27541
2.61k
      int ShufMask[4] = {-1, -1, -1, -1};
27542
2.61k
      SDValue Lo = MatchHalf(0, ShufMask[0], ShufMask[1]);
27543
2.61k
      SDValue Hi = MatchHalf(2, ShufMask[2], ShufMask[3]);
27544
2.61k
27545
2.61k
      if (Lo && Hi) {
27546
1.27k
        V1 = Lo;
27547
1.27k
        V2 = Hi;
27548
1.27k
        Shuffle = X86ISD::SHUFP;
27549
1.27k
        ShuffleVT = MVT::getVectorVT(MVT::f32, MaskVT.getSizeInBits() / 32);
27550
1.27k
        PermuteImm = getV4X86ShuffleImm(ShufMask);
27551
1.27k
        return true;
27552
1.27k
      }
27553
43.4k
    }
27554
3.48k
  }
27555
43.4k
27556
43.4k
  return false;
27557
43.4k
}
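In the BLENDI path above, PermuteImm acts as a per-element selector: bit i of the immediate picks element i from the second source instead of the first. A small illustrative sketch of that behaviour on plain arrays (assumptions: eight elements and scalar code, not the actual vector instruction):

#include <array>
#include <cstdint>

// Scalar model of an 8-element blend: bit i of Imm set -> take B[i], else A[i].
std::array<int, 8> blend8(const std::array<int, 8> &A,
                          const std::array<int, 8> &B, uint8_t Imm) {
  std::array<int, 8> R{};
  for (int i = 0; i != 8; ++i)
    R[i] = (Imm & (1u << i)) ? B[i] : A[i];
  return R;
}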
27558
27559
/// \brief Combine an arbitrary chain of shuffles into a single instruction if
27560
/// possible.
27561
///
27562
/// This is the leaf of the recursive combine below. When we have found some
27563
/// chain of single-use x86 shuffle instructions and accumulated the combined
27564
/// shuffle mask represented by them, this will try to pattern match that mask
27565
/// into either a single instruction if there is a special purpose instruction
27566
/// for this operation, or into a PSHUFB instruction which is a fully general
27567
/// instruction but should only be used to replace chains over a certain depth.
27568
static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
27569
                                      ArrayRef<int> BaseMask, int Depth,
27570
                                      bool HasVariableMask, SelectionDAG &DAG,
27571
                                      TargetLowering::DAGCombinerInfo &DCI,
27572
85.0k
                                      const X86Subtarget &Subtarget) {
27573
85.0k
  assert(!BaseMask.empty() && "Cannot combine an empty shuffle mask!");
27574
85.0k
  assert((Inputs.size() == 1 || Inputs.size() == 2) &&
27575
85.0k
         "Unexpected number of shuffle inputs!");
27576
85.0k
27577
85.0k
  // Find the inputs that enter the chain. Note that multiple uses are OK
27578
85.0k
  // here, we're not going to remove the operands we find.
27579
85.0k
  bool UnaryShuffle = (Inputs.size() == 1);
27580
85.0k
  SDValue V1 = peekThroughBitcasts(Inputs[0]);
27581
45.0k
  SDValue V2 = (UnaryShuffle ? DAG.getUNDEF(V1.getValueType())
27582
39.9k
                             : peekThroughBitcasts(Inputs[1]));
27583
85.0k
27584
85.0k
  MVT VT1 = V1.getSimpleValueType();
27585
85.0k
  MVT VT2 = V2.getSimpleValueType();
27586
85.0k
  MVT RootVT = Root.getSimpleValueType();
27587
85.0k
  assert(VT1.getSizeInBits() == RootVT.getSizeInBits() &&
27588
85.0k
         VT2.getSizeInBits() == RootVT.getSizeInBits() &&
27589
85.0k
         "Vector size mismatch");
27590
85.0k
27591
85.0k
  SDLoc DL(Root);
27592
85.0k
  SDValue Res;
27593
85.0k
27594
85.0k
  unsigned NumBaseMaskElts = BaseMask.size();
27595
85.0k
  if (NumBaseMaskElts == 1) {
27596
132
    assert(BaseMask[0] == 0 && "Invalid shuffle index found!");
27597
132
    return DAG.getBitcast(RootVT, V1);
27598
132
  }
27599
84.8k
27600
84.8k
  unsigned RootSizeInBits = RootVT.getSizeInBits();
27601
84.8k
  unsigned NumRootElts = RootVT.getVectorNumElements();
27602
84.8k
  unsigned BaseMaskEltSizeInBits = RootSizeInBits / NumBaseMaskElts;
27603
69.0k
  bool FloatDomain = VT1.isFloatingPoint() || VT2.isFloatingPoint() ||
27604
67.3k
                     (RootVT.is256BitVector() && !Subtarget.hasAVX2());
27605
84.8k
27606
84.8k
  // Don't combine if we are a AVX512/EVEX target and the mask element size
27607
84.8k
  // is different from the root element size - this would prevent writemasks
27608
84.8k
  // from being reused.
27609
84.8k
  // TODO - this currently prevents all lane shuffles from occurring.
27610
84.8k
  // TODO - check for writemasks usage instead of always preventing combining.
27611
84.8k
  // TODO - attempt to narrow Mask back to writemask size.
27612
84.8k
  bool IsEVEXShuffle =
27613
82.3k
      RootSizeInBits == 512 || (Subtarget.hasVLX() && RootSizeInBits >= 128);
27614
84.8k
  if (IsEVEXShuffle && (RootVT.getScalarSizeInBits() != BaseMaskEltSizeInBits))
27615
3.84k
    return SDValue();
27616
81.0k
27617
81.0k
  // TODO - handle 128/256-bit lane shuffles of 512-bit vectors.
27618
81.0k
27619
81.0k
  // Handle 128-bit lane shuffles of 256-bit vectors.
27620
81.0k
  // If we have AVX2, prefer to use VPERMQ/VPERMPD for unary shuffles unless
27621
81.0k
  // we need to use the zeroing feature.
27622
81.0k
  // TODO - this should support binary shuffles.
27623
81.0k
  if (UnaryShuffle && RootVT.is256BitVector() && NumBaseMaskElts == 2 &&
27624
410
      !(Subtarget.hasAVX2() && BaseMask[0] >= -1 && BaseMask[1] >= -1) &&
27625
81.0k
      !isSequentialOrUndefOrZeroInRange(BaseMask, 0, 2, 0)) {
27626
137
    if (Depth == 1 && Root.getOpcode() == X86ISD::VPERM2X128)
27627
125
      return SDValue(); // Nothing to do!
27628
12
    MVT ShuffleVT = (FloatDomain ? MVT::v4f64 : MVT::v4i64);
27629
12
    unsigned PermMask = 0;
27630
12
    PermMask |= ((BaseMask[0] < 0 ? 0x8 : (BaseMask[0] & 1)) << 0);
27631
12
    PermMask |= ((BaseMask[1] < 0 ? 0x8 : (BaseMask[1] & 1)) << 4);
27632
137
27633
137
    Res = DAG.getBitcast(ShuffleVT, V1);
27634
137
    DCI.AddToWorklist(Res.getNode());
27635
137
    Res = DAG.getNode(X86ISD::VPERM2X128, DL, ShuffleVT, Res,
27636
137
                      DAG.getUNDEF(ShuffleVT),
27637
137
                      DAG.getConstant(PermMask, DL, MVT::i8));
27638
137
    DCI.AddToWorklist(Res.getNode());
27639
137
    return DAG.getBitcast(RootVT, Res);
27640
137
  }
27641
80.9k
27642
80.9k
  // For masks that have been widened to 128-bit elements or more,
27643
80.9k
  // narrow back down to 64-bit elements.
27644
80.9k
  SmallVector<int, 64> Mask;
27645
80.9k
  if (BaseMaskEltSizeInBits > 64) {
27646
611
    assert((BaseMaskEltSizeInBits % 64) == 0 && "Illegal mask size");
27647
611
    int MaskScale = BaseMaskEltSizeInBits / 64;
27648
611
    scaleShuffleMask<int>(MaskScale, BaseMask, Mask);
27649
80.9k
  } else {
27650
80.2k
    Mask = SmallVector<int, 64>(BaseMask.begin(), BaseMask.end());
27651
80.2k
  }
27652
80.9k
27653
80.9k
  unsigned NumMaskElts = Mask.size();
27654
80.9k
  unsigned MaskEltSizeInBits = RootSizeInBits / NumMaskElts;
27655
80.9k
27656
80.9k
  // Determine the effective mask value type.
27657
80.9k
  FloatDomain &= (32 <= MaskEltSizeInBits);
27658
12.3k
  MVT MaskVT = FloatDomain ? MVT::getFloatingPointVT(MaskEltSizeInBits)
27659
68.5k
                           : MVT::getIntegerVT(MaskEltSizeInBits);
27660
80.9k
  MaskVT = MVT::getVectorVT(MaskVT, NumMaskElts);
27661
80.9k
27662
80.9k
  // Only allow legal mask types.
27663
80.9k
  if (!DAG.getTargetLoweringInfo().isTypeLegal(MaskVT))
27664
133
    return SDValue();
27665
80.7k
27666
80.7k
  // Attempt to match the mask against known shuffle patterns.
27667
80.7k
  MVT ShuffleSrcVT, ShuffleVT;
27668
80.7k
  unsigned Shuffle, PermuteImm;
27669
80.7k
27670
80.7k
  // Which shuffle domains are permitted?
27671
80.7k
  // Permit domain crossing at higher combine depths.
27672
68.5k
  bool AllowFloatDomain = FloatDomain || (Depth > 3);
27673
12.2k
  bool AllowIntDomain = (!FloatDomain || (Depth > 3)) &&
27674
68.7k
                        (!MaskVT.is256BitVector() || Subtarget.hasAVX2());
27675
80.7k
27676
80.7k
  // Determine zeroable mask elements.
27677
80.7k
  APInt Zeroable(NumMaskElts, 0);
27678
809k
  for (unsigned i = 0; i != NumMaskElts; ++i)
27679
728k
    if (isUndefOrZero(Mask[i]))
27680
188k
      Zeroable.setBit(i);
27681
80.7k
27682
80.7k
  if (UnaryShuffle) {
27683
43.1k
    // If we are shuffling a X86ISD::VZEXT_LOAD then we can use the load
27684
43.1k
    // directly if we don't shuffle the lower element and we shuffle the upper
27685
43.1k
    // (zero) elements within themselves.
27686
43.1k
    if (V1.getOpcode() == X86ISD::VZEXT_LOAD &&
27687
43.1k
        
(V1.getScalarValueSizeInBits() % MaskEltSizeInBits) == 061
) {
27688
61
      unsigned Scale = V1.getScalarValueSizeInBits() / MaskEltSizeInBits;
27689
61
      ArrayRef<int> HiMask(Mask.data() + Scale, NumMaskElts - Scale);
27690
61
      if (isSequentialOrUndefInRange(Mask, 0, Scale, 0) &&
27691
61
          isUndefOrZeroOrInRange(HiMask, Scale, NumMaskElts)) {
27692
5
        return DAG.getBitcast(RootVT, V1);
27693
5
      }
27694
43.1k
    }
27695
43.1k
27696
43.1k
    if (matchUnaryVectorShuffle(MaskVT, Mask, AllowFloatDomain, AllowIntDomain,
27697
43.1k
                                V1, DL, DAG, Subtarget, Shuffle, ShuffleSrcVT,
27698
43.1k
                                ShuffleVT)) {
27699
2.74k
      if (Depth == 1 && Root.getOpcode() == Shuffle)
27700
2.46k
        return SDValue(); // Nothing to do!
27701
282
      if (IsEVEXShuffle && (NumRootElts != ShuffleVT.getVectorNumElements()))
27702
2
        return SDValue(); // AVX512 Writemask clash.
27703
280
      Res = DAG.getBitcast(ShuffleSrcVT, V1);
27704
280
      DCI.AddToWorklist(Res.getNode());
27705
280
      Res = DAG.getNode(Shuffle, DL, ShuffleVT, Res);
27706
280
      DCI.AddToWorklist(Res.getNode());
27707
280
      return DAG.getBitcast(RootVT, Res);
27708
280
    }
27709
40.3k
27710
40.3k
    if (matchUnaryPermuteVectorShuffle(MaskVT, Mask, Zeroable, AllowFloatDomain,
27711
40.3k
                                       AllowIntDomain, Subtarget, Shuffle,
27712
40.3k
                                       ShuffleVT, PermuteImm)) {
27713
16.0k
      if (Depth == 1 && Root.getOpcode() == Shuffle)
27714
15.5k
        return SDValue(); // Nothing to do!
27715
426
      if (IsEVEXShuffle && (NumRootElts != ShuffleVT.getVectorNumElements()))
27716
12
        return SDValue(); // AVX512 Writemask clash.
27717
414
      Res = DAG.getBitcast(ShuffleVT, V1);
27718
414
      DCI.AddToWorklist(Res.getNode());
27719
414
      Res = DAG.getNode(Shuffle, DL, ShuffleVT, Res,
27720
414
                        DAG.getConstant(PermuteImm, DL, MVT::i8));
27721
414
      DCI.AddToWorklist(Res.getNode());
27722
414
      return DAG.getBitcast(RootVT, Res);
27723
414
    }
27724
43.1k
  }
27725
61.9k
27726
61.9k
  if (matchBinaryVectorShuffle(MaskVT, Mask, AllowFloatDomain, AllowIntDomain,
27727
61.9k
                               V1, V2, DL, DAG, Subtarget, Shuffle, ShuffleVT,
27728
61.9k
                               UnaryShuffle)) {
27729
12.8k
    if (Depth == 1 && Root.getOpcode() == Shuffle)
27730
11.9k
      return SDValue(); // Nothing to do!
27731
857
    if (IsEVEXShuffle && (NumRootElts != ShuffleVT.getVectorNumElements()))
27732
0
      return SDValue(); // AVX512 Writemask clash.
27733
857
    V1 = DAG.getBitcast(ShuffleVT, V1);
27734
857
    DCI.AddToWorklist(V1.getNode());
27735
857
    V2 = DAG.getBitcast(ShuffleVT, V2);
27736
857
    DCI.AddToWorklist(V2.getNode());
27737
857
    Res = DAG.getNode(Shuffle, DL, ShuffleVT, V1, V2);
27738
857
    DCI.AddToWorklist(Res.getNode());
27739
857
    return DAG.getBitcast(RootVT, Res);
27740
857
  }
27741
49.1k
27742
49.1k
  if (matchBinaryPermuteVectorShuffle(MaskVT, Mask, Zeroable, AllowFloatDomain,
27743
49.1k
                                      AllowIntDomain, V1, V2, DL, DAG,
27744
49.1k
                                      Subtarget, Shuffle, ShuffleVT,
27745
49.1k
                                      PermuteImm)) {
27746
5.73k
    if (Depth == 1 && Root.getOpcode() == Shuffle)
27747
5.31k
      return SDValue(); // Nothing to do!
27748
422
    if (IsEVEXShuffle && (NumRootElts != ShuffleVT.getVectorNumElements()))
27749
26
      return SDValue(); // AVX512 Writemask clash.
27750
396
    V1 = DAG.getBitcast(ShuffleVT, V1);
27751
396
    DCI.AddToWorklist(V1.getNode());
27752
396
    V2 = DAG.getBitcast(ShuffleVT, V2);
27753
396
    DCI.AddToWorklist(V2.getNode());
27754
396
    Res = DAG.getNode(Shuffle, DL, ShuffleVT, V1, V2,
27755
396
                      DAG.getConstant(PermuteImm, DL, MVT::i8));
27756
396
    DCI.AddToWorklist(Res.getNode());
27757
396
    return DAG.getBitcast(RootVT, Res);
27758
396
  }
27759
43.4k
27760
43.4k
  // Typically from here on, we need an integer version of MaskVT.
27761
43.4k
  MVT IntMaskVT = MVT::getIntegerVT(MaskEltSizeInBits);
27762
43.4k
  IntMaskVT = MVT::getVectorVT(IntMaskVT, NumMaskElts);
27763
43.4k
27764
43.4k
  // Annoyingly, SSE4A instructions don't map into the above match helpers.
27765
43.4k
  if (Subtarget.hasSSE4A() && AllowIntDomain && RootSizeInBits == 128) {
27766
278
    uint64_t BitLen, BitIdx;
27767
278
    if (matchVectorShuffleAsEXTRQ(IntMaskVT, V1, V2, Mask, BitLen, BitIdx,
27768
278
                                  Zeroable)) {
27769
59
      if (Depth == 1 && Root.getOpcode() == X86ISD::EXTRQI)
27770
49
        return SDValue(); // Nothing to do!
27771
10
      V1 = DAG.getBitcast(IntMaskVT, V1);
27772
10
      DCI.AddToWorklist(V1.getNode());
27773
10
      Res = DAG.getNode(X86ISD::EXTRQI, DL, IntMaskVT, V1,
27774
10
                        DAG.getConstant(BitLen, DL, MVT::i8),
27775
10
                        DAG.getConstant(BitIdx, DL, MVT::i8));
27776
10
      DCI.AddToWorklist(Res.getNode());
27777
10
      return DAG.getBitcast(RootVT, Res);
27778
10
    }
27779
219
27780
219
    if (matchVectorShuffleAsINSERTQ(IntMaskVT, V1, V2, Mask, BitLen, BitIdx)) {
27781
44
      if (Depth == 1 && Root.getOpcode() == X86ISD::INSERTQI)
27782
40
        return SDValue(); // Nothing to do!
27783
4
      V1 = DAG.getBitcast(IntMaskVT, V1);
27784
4
      DCI.AddToWorklist(V1.getNode());
27785
4
      V2 = DAG.getBitcast(IntMaskVT, V2);
27786
4
      DCI.AddToWorklist(V2.getNode());
27787
4
      Res = DAG.getNode(X86ISD::INSERTQI, DL, IntMaskVT, V1, V2,
27788
4
                        DAG.getConstant(BitLen, DL, MVT::i8),
27789
4
                        DAG.getConstant(BitIdx, DL, MVT::i8));
27790
4
      DCI.AddToWorklist(Res.getNode());
27791
4
      return DAG.getBitcast(RootVT, Res);
27792
4
    }
27793
278
  }
27794
43.3k
27795
43.3k
  // Don't try to re-form single instruction chains under any circumstances now
27796
43.3k
  // that we've done encoding canonicalization for them.
27797
43.3k
  if (Depth < 2)
27798
14.8k
    return SDValue();
27799
28.5k
27800
28.5k
  // Depth threshold above which we can efficiently use variable mask shuffles.
27801
28.5k
  // TODO This should probably be target specific.
27802
28.5k
  bool AllowVariableMask = (Depth >= 3) || HasVariableMask;
27803
28.5k
27804
28.5k
  bool MaskContainsZeros =
27805
238k
      any_of(Mask, [](int M) { return M == SM_SentinelZero; });
27806
28.5k
27807
28.5k
  if (is128BitLaneCrossingShuffleMask(MaskVT, Mask)) {
27808
1.53k
    // If we have a single input lane-crossing shuffle then lower to VPERMV.
27809
1.53k
    if (UnaryShuffle && AllowVariableMask && !MaskContainsZeros &&
27810
400
        ((Subtarget.hasAVX2() &&
27811
379
          (MaskVT == MVT::v8f32 || MaskVT == MVT::v8i32)) ||
27812
382
         (Subtarget.hasAVX512() &&
27813
125
          (MaskVT == MVT::v8f64 || MaskVT == MVT::v8i64 ||
27814
382
           MaskVT == MVT::v16f32 || MaskVT == MVT::v16i32)) ||
27815
374
         (Subtarget.hasBWI() && MaskVT == MVT::v32i16) ||
27816
372
         (Subtarget.hasBWI() && Subtarget.hasVLX() && MaskVT == MVT::v16i16) ||
27817
306
         (Subtarget.hasVBMI() && MaskVT == MVT::v64i8) ||
27818
1.53k
         (Subtarget.hasVBMI() && Subtarget.hasVLX() && MaskVT == MVT::v32i8))) {
27819
98
      SDValue VPermMask = getConstVector(Mask, IntMaskVT, DAG, DL, true);
27820
98
      DCI.AddToWorklist(VPermMask.getNode());
27821
98
      Res = DAG.getBitcast(MaskVT, V1);
27822
98
      DCI.AddToWorklist(Res.getNode());
27823
98
      Res = DAG.getNode(X86ISD::VPERMV, DL, MaskVT, VPermMask, Res);
27824
98
      DCI.AddToWorklist(Res.getNode());
27825
98
      return DAG.getBitcast(RootVT, Res);
27826
98
    }
27827
1.43k
27828
1.43k
    // Lower a unary+zero lane-crossing shuffle as VPERMV3 with a zero
27829
1.43k
    // vector as the second source.
27830
1.43k
    if (UnaryShuffle && AllowVariableMask &&
27831
367
        ((Subtarget.hasAVX512() &&
27832
93
          (MaskVT == MVT::v8f64 || MaskVT == MVT::v8i64 ||
27833
93
           MaskVT == MVT::v16f32 || MaskVT == MVT::v16i32)) ||
27834
363
         (Subtarget.hasVLX() &&
27835
14
          (MaskVT == MVT::v4f64 || MaskVT == MVT::v4i64 ||
27836
363
           MaskVT == MVT::v8f32 || MaskVT == MVT::v8i32)) ||
27837
363
         (Subtarget.hasBWI() && MaskVT == MVT::v32i16) ||
27838
363
         (Subtarget.hasBWI() && Subtarget.hasVLX() && MaskVT == MVT::v16i16) ||
27839
363
         (Subtarget.hasVBMI() && MaskVT == MVT::v64i8) ||
27840
1.43k
         (Subtarget.hasVBMI() && Subtarget.hasVLX() && MaskVT == MVT::v32i8))) {
27841
4
      // Adjust shuffle mask - replace SM_SentinelZero with second source index.
27842
52
      for (unsigned i = 0; 
i != NumMaskElts52
;
++i48
)
27843
48
        
if (48
Mask[i] == SM_SentinelZero48
)
27844
18
          Mask[i] = NumMaskElts + i;
27845
4
27846
4
      SDValue VPermMask = getConstVector(Mask, IntMaskVT, DAG, DL, true);
27847
4
      DCI.AddToWorklist(VPermMask.getNode());
27848
4
      Res = DAG.getBitcast(MaskVT, V1);
27849
4
      DCI.AddToWorklist(Res.getNode());
27850
4
      SDValue Zero = getZeroVector(MaskVT, Subtarget, DAG, DL);
27851
4
      DCI.AddToWorklist(Zero.getNode());
27852
4
      Res = DAG.getNode(X86ISD::VPERMV3, DL, MaskVT, Res, VPermMask, Zero);
27853
4
      DCI.AddToWorklist(Res.getNode());
27854
4
      return DAG.getBitcast(RootVT, Res);
27855
4
    }
27856
1.42k
27857
1.42k
    // If we have a dual input lane-crossing shuffle then lower to VPERMV3.
27858
1.42k
    
if (1.42k
AllowVariableMask && 1.42k
!MaskContainsZeros1.03k
&&
27859
897
        ((Subtarget.hasAVX512() &&
27860
288
          
(MaskVT == MVT::v8f64 || 288
MaskVT == MVT::v8i64286
||
27861
288
           
MaskVT == MVT::v16f32286
||
MaskVT == MVT::v16i32286
)) ||
27862
893
         (Subtarget.hasVLX() &&
27863
94
          
(MaskVT == MVT::v4f64 || 94
MaskVT == MVT::v4i6483
||
27864
893
           
MaskVT == MVT::v8f3283
||
MaskVT == MVT::v8i3252
)) ||
27865
821
         
(Subtarget.hasBWI() && 821
MaskVT == MVT::v32i1630
) ||
27866
819
         
(Subtarget.hasBWI() && 819
Subtarget.hasVLX()28
&&
MaskVT == MVT::v16i1622
) ||
27867
817
         
(Subtarget.hasVBMI() && 817
MaskVT == MVT::v64i84
) ||
27868
1.42k
         
(Subtarget.hasVBMI() && 815
Subtarget.hasVLX()2
&&
MaskVT == MVT::v32i82
))) {
27869
84
      SDValue VPermMask = getConstVector(Mask, IntMaskVT, DAG, DL, true);
27870
84
      DCI.AddToWorklist(VPermMask.getNode());
27871
84
      V1 = DAG.getBitcast(MaskVT, V1);
27872
84
      DCI.AddToWorklist(V1.getNode());
27873
84
      V2 = DAG.getBitcast(MaskVT, V2);
27874
84
      DCI.AddToWorklist(V2.getNode());
27875
84
      Res = DAG.getNode(X86ISD::VPERMV3, DL, MaskVT, V1, VPermMask, V2);
27876
84
      DCI.AddToWorklist(Res.getNode());
27877
84
      return DAG.getBitcast(RootVT, Res);
27878
84
    }
27879
1.34k
    return SDValue();
27880
1.34k
  }
27881
26.9k
27882
26.9k
  // See if we can combine a single input shuffle with zeros to a bit-mask,
27883
26.9k
  // which is much simpler than any shuffle.
27884
26.9k
  
if (26.9k
UnaryShuffle && 26.9k
MaskContainsZeros5.16k
&&
AllowVariableMask1.57k
&&
27885
856
      isSequentialOrUndefOrZeroInRange(Mask, 0, NumMaskElts, 0) &&
27886
26.9k
      
DAG.getTargetLoweringInfo().isTypeLegal(MaskVT)106
) {
27887
106
    APInt Zero = APInt::getNullValue(MaskEltSizeInBits);
27888
106
    APInt AllOnes = APInt::getAllOnesValue(MaskEltSizeInBits);
27889
106
    APInt UndefElts(NumMaskElts, 0);
27890
106
    SmallVector<APInt, 64> EltBits(NumMaskElts, Zero);
27891
1.77k
    for (unsigned i = 0; 
i != NumMaskElts1.77k
;
++i1.66k
) {
27892
1.66k
      int M = Mask[i];
27893
1.66k
      if (
M == SM_SentinelUndef1.66k
) {
27894
0
        UndefElts.setBit(i);
27895
0
        continue;
27896
0
      }
27897
1.66k
      
if (1.66k
M == SM_SentinelZero1.66k
)
27898
602
        continue;
27899
1.06k
      EltBits[i] = AllOnes;
27900
1.06k
    }
27901
106
    SDValue BitMask = getConstVector(EltBits, UndefElts, MaskVT, DAG, DL);
27902
106
    DCI.AddToWorklist(BitMask.getNode());
27903
106
    Res = DAG.getBitcast(MaskVT, V1);
27904
106
    DCI.AddToWorklist(Res.getNode());
27905
106
    unsigned AndOpcode =
27906
106
        FloatDomain ? 
unsigned(X86ISD::FAND)0
:
unsigned(ISD::AND)106
;
27907
106
    Res = DAG.getNode(AndOpcode, DL, MaskVT, Res, BitMask);
27908
106
    DCI.AddToWorklist(Res.getNode());
27909
106
    return DAG.getBitcast(RootVT, Res);
27910
106
  }
27911
26.8k
27912
26.8k
  // If we have a single input shuffle with different shuffle patterns in the
27913
26.8k
  // the 128-bit lanes use the variable mask to VPERMILPS.
27914
26.8k
  // TODO Combine other mask types at higher depths.
27915
26.8k
  
if (26.8k
UnaryShuffle && 26.8k
AllowVariableMask5.06k
&&
!MaskContainsZeros2.45k
&&
27916
1.70k
      
((MaskVT == MVT::v8f32 && 1.70k
Subtarget.hasAVX()6
) ||
27917
26.8k
       
(MaskVT == MVT::v16f32 && 1.69k
Subtarget.hasAVX512()2
))) {
27918
8
    SmallVector<SDValue, 16> VPermIdx;
27919
80
    for (int M : Mask) {
27920
80
      SDValue Idx =
27921
80
          M < 0 ? 
DAG.getUNDEF(MVT::i32)12
:
DAG.getConstant(M % 4, DL, MVT::i32)68
;
27922
80
      VPermIdx.push_back(Idx);
27923
80
    }
27924
8
    SDValue VPermMask = DAG.getBuildVector(IntMaskVT, DL, VPermIdx);
27925
8
    DCI.AddToWorklist(VPermMask.getNode());
27926
8
    Res = DAG.getBitcast(MaskVT, V1);
27927
8
    DCI.AddToWorklist(Res.getNode());
27928
8
    Res = DAG.getNode(X86ISD::VPERMILPV, DL, MaskVT, Res, VPermMask);
27929
8
    DCI.AddToWorklist(Res.getNode());
27930
8
    return DAG.getBitcast(RootVT, Res);
27931
8
  }
27932
26.8k
27933
26.8k
  // With XOP, binary shuffles of 128/256-bit floating point vectors can combine
27934
26.8k
  // to VPERMIL2PD/VPERMIL2PS.
27935
26.8k
  
if (26.8k
AllowVariableMask && 26.8k
Subtarget.hasXOP()21.6k
&&
27936
28
      
(MaskVT == MVT::v2f64 || 28
MaskVT == MVT::v4f6428
||
MaskVT == MVT::v4f3224
||
27937
26.8k
       
MaskVT == MVT::v8f3214
)) {
27938
18
    // VPERMIL2 Operation.
27939
18
    // Bits[3] - Match Bit.
27940
18
    // Bits[2:1] - (Per Lane) PD Shuffle Mask.
27941
18
    // Bits[2:0] - (Per Lane) PS Shuffle Mask.
27942
18
    unsigned NumLanes = MaskVT.getSizeInBits() / 128;
27943
18
    unsigned NumEltsPerLane = NumMaskElts / NumLanes;
27944
18
    SmallVector<int, 8> VPerm2Idx;
27945
18
    unsigned M2ZImm = 0;
27946
88
    for (int M : Mask) {
27947
88
      if (
M == SM_SentinelUndef88
) {
27948
4
        VPerm2Idx.push_back(-1);
27949
4
        continue;
27950
4
      }
27951
84
      
if (84
M == SM_SentinelZero84
) {
27952
16
        M2ZImm = 2;
27953
16
        VPerm2Idx.push_back(8);
27954
16
        continue;
27955
16
      }
27956
68
      int Index = (M % NumEltsPerLane) + ((M / NumMaskElts) * NumEltsPerLane);
27957
68
      Index = (MaskVT.getScalarSizeInBits() == 64 ? 
Index << 112
:
Index56
);
27958
88
      VPerm2Idx.push_back(Index);
27959
88
    }
27960
18
    V1 = DAG.getBitcast(MaskVT, V1);
27961
18
    DCI.AddToWorklist(V1.getNode());
27962
18
    V2 = DAG.getBitcast(MaskVT, V2);
27963
18
    DCI.AddToWorklist(V2.getNode());
27964
18
    SDValue VPerm2MaskOp = getConstVector(VPerm2Idx, IntMaskVT, DAG, DL, true);
27965
18
    DCI.AddToWorklist(VPerm2MaskOp.getNode());
27966
18
    Res = DAG.getNode(X86ISD::VPERMIL2, DL, MaskVT, V1, V2, VPerm2MaskOp,
27967
18
                      DAG.getConstant(M2ZImm, DL, MVT::i8));
27968
18
    DCI.AddToWorklist(Res.getNode());
27969
18
    return DAG.getBitcast(RootVT, Res);
27970
18
  }
27971
26.8k
27972
26.8k
  // If we have 3 or more shuffle instructions or a chain involving a variable
27973
26.8k
  // mask, we can replace them with a single PSHUFB instruction profitably.
27974
26.8k
  // Intel's manuals suggest only using PSHUFB if doing so replacing 5
27975
26.8k
  // instructions, but in practice PSHUFB tends to be *very* fast so we're
27976
26.8k
  // more aggressive.
27977
26.8k
  
if (26.8k
UnaryShuffle && 26.8k
AllowVariableMask5.05k
&&
27978
2.44k
      
((RootVT.is128BitVector() && 2.44k
Subtarget.hasSSSE3()2.18k
) ||
27979
1.37k
       
(RootVT.is256BitVector() && 1.37k
Subtarget.hasAVX2()258
) ||
27980
26.8k
       
(RootVT.is512BitVector() && 1.11k
Subtarget.hasBWI()2
))) {
27981
1.33k
    SmallVector<SDValue, 16> PSHUFBMask;
27982
1.33k
    int NumBytes = RootVT.getSizeInBits() / 8;
27983
1.33k
    int Ratio = NumBytes / NumMaskElts;
27984
26.9k
    for (int i = 0; 
i < NumBytes26.9k
;
++i25.6k
) {
27985
25.6k
      int M = Mask[i / Ratio];
27986
25.6k
      if (
M == SM_SentinelUndef25.6k
) {
27987
168
        PSHUFBMask.push_back(DAG.getUNDEF(MVT::i8));
27988
168
        continue;
27989
168
      }
27990
25.4k
      
if (25.4k
M == SM_SentinelZero25.4k
) {
27991
2.16k
        PSHUFBMask.push_back(DAG.getConstant(255, DL, MVT::i8));
27992
2.16k
        continue;
27993
2.16k
      }
27994
23.2k
      M = Ratio * M + i % Ratio;
27995
23.2k
      assert((M / 16) == (i / 16) && "Lane crossing detected");
27996
23.2k
      PSHUFBMask.push_back(DAG.getConstant(M, DL, MVT::i8));
27997
23.2k
    }
27998
1.33k
    MVT ByteVT = MVT::getVectorVT(MVT::i8, NumBytes);
27999
1.33k
    Res = DAG.getBitcast(ByteVT, V1);
28000
1.33k
    DCI.AddToWorklist(Res.getNode());
28001
1.33k
    SDValue PSHUFBMaskOp = DAG.getBuildVector(ByteVT, DL, PSHUFBMask);
28002
1.33k
    DCI.AddToWorklist(PSHUFBMaskOp.getNode());
28003
1.33k
    Res = DAG.getNode(X86ISD::PSHUFB, DL, ByteVT, Res, PSHUFBMaskOp);
28004
1.33k
    DCI.AddToWorklist(Res.getNode());
28005
1.33k
    return DAG.getBitcast(RootVT, Res);
28006
1.33k
  }
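  // Worked example of the byte-mask scaling above (illustrative only, values
  // assumed): for a 128-bit root with NumMaskElts = 4 (so Ratio = 16 / 4 = 4)
  // and Mask = {3, 1, SM_SentinelZero, 0}, bytes 0-3 map to 4*3+0..3 =
  // {12,13,14,15}, bytes 4-7 to {4,5,6,7}, bytes 8-11 become 255 (zeroed by
  // PSHUFB's high bit), and bytes 12-15 map to {0,1,2,3}.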
  // With XOP, if we have a 128-bit binary input shuffle we can always combine
  // to VPPERM. We match the depth requirement of PSHUFB - VPPERM is never
  // slower than PSHUFB on targets that support both.
  if (AllowVariableMask && RootVT.is128BitVector() && Subtarget.hasXOP()) {
    // VPPERM Mask Operation
    // Bits[4:0] - Byte Index (0 - 31)
    // Bits[7:5] - Permute Operation (0 - Source byte, 4 - ZERO)
    SmallVector<SDValue, 16> VPPERMMask;
    int NumBytes = 16;
    int Ratio = NumBytes / NumMaskElts;
    for (int i = 0; i < NumBytes; ++i) {
      int M = Mask[i / Ratio];
      if (M == SM_SentinelUndef) {
        VPPERMMask.push_back(DAG.getUNDEF(MVT::i8));
        continue;
      }
      if (M == SM_SentinelZero) {
        VPPERMMask.push_back(DAG.getConstant(128, DL, MVT::i8));
        continue;
      }
      M = Ratio * M + i % Ratio;
      VPPERMMask.push_back(DAG.getConstant(M, DL, MVT::i8));
    }
    MVT ByteVT = MVT::v16i8;
    V1 = DAG.getBitcast(ByteVT, V1);
    DCI.AddToWorklist(V1.getNode());
    V2 = DAG.getBitcast(ByteVT, V2);
    DCI.AddToWorklist(V2.getNode());
    SDValue VPPERMMaskOp = DAG.getBuildVector(ByteVT, DL, VPPERMMask);
    DCI.AddToWorklist(VPPERMMaskOp.getNode());
    Res = DAG.getNode(X86ISD::VPPERM, DL, ByteVT, V1, V2, VPPERMMaskOp);
    DCI.AddToWorklist(Res.getNode());
    return DAG.getBitcast(RootVT, Res);
  }

  // Failed to find any combines.
  return SDValue();
}
// Attempt to constant fold all of the constant source ops.
// Returns true if the entire shuffle is folded to a constant.
// TODO: Extend this to merge multiple constant Ops and update the mask.
static SDValue combineX86ShufflesConstants(const SmallVectorImpl<SDValue> &Ops,
                                           ArrayRef<int> Mask, SDValue Root,
                                           bool HasVariableMask,
                                           SelectionDAG &DAG,
                                           TargetLowering::DAGCombinerInfo &DCI,
                                           const X86Subtarget &Subtarget) {
  MVT VT = Root.getSimpleValueType();

  unsigned SizeInBits = VT.getSizeInBits();
  unsigned NumMaskElts = Mask.size();
  unsigned MaskSizeInBits = SizeInBits / NumMaskElts;
  unsigned NumOps = Ops.size();

  // Extract constant bits from each source op.
  bool OneUseConstantOp = false;
  SmallVector<APInt, 16> UndefEltsOps(NumOps);
  SmallVector<SmallVector<APInt, 16>, 16> RawBitsOps(NumOps);
  for (unsigned i = 0; i != NumOps; ++i) {
    SDValue SrcOp = Ops[i];
    OneUseConstantOp |= SrcOp.hasOneUse();
    if (!getTargetConstantBitsFromNode(SrcOp, MaskSizeInBits, UndefEltsOps[i],
                                       RawBitsOps[i]))
      return SDValue();
  }

  // Only fold if at least one of the constants is only used once or
  // the combined shuffle has included a variable mask shuffle, this
  // is to avoid constant pool bloat.
  if (!OneUseConstantOp && !HasVariableMask)
    return SDValue();

  // Shuffle the constant bits according to the mask.
  APInt UndefElts(NumMaskElts, 0);
  APInt ZeroElts(NumMaskElts, 0);
  APInt ConstantElts(NumMaskElts, 0);
  SmallVector<APInt, 8> ConstantBitData(NumMaskElts,
                                        APInt::getNullValue(MaskSizeInBits));
  for (unsigned i = 0; i != NumMaskElts; ++i) {
    int M = Mask[i];
    if (M == SM_SentinelUndef) {
      UndefElts.setBit(i);
      continue;
    } else if (M == SM_SentinelZero) {
      ZeroElts.setBit(i);
      continue;
    }
    assert(0 <= M && M < (int)(NumMaskElts * NumOps));

    unsigned SrcOpIdx = (unsigned)M / NumMaskElts;
    unsigned SrcMaskIdx = (unsigned)M % NumMaskElts;
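    // For example (illustrative): with NumMaskElts == 4 and two source ops,
    // a mask value M == 6 selects element 6 % 4 == 2 of op 6 / 4 == 1, i.e.
    // the third element of the second constant source.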
    auto &SrcUndefElts = UndefEltsOps[SrcOpIdx];
    if (SrcUndefElts[SrcMaskIdx]) {
      UndefElts.setBit(i);
      continue;
    }

    auto &SrcEltBits = RawBitsOps[SrcOpIdx];
    APInt &Bits = SrcEltBits[SrcMaskIdx];
    if (!Bits) {
      ZeroElts.setBit(i);
      continue;
    }

    ConstantElts.setBit(i);
    ConstantBitData[i] = Bits;
  }
  assert((UndefElts | ZeroElts | ConstantElts).isAllOnesValue());

  // Create the constant data.
  MVT MaskSVT;
  if (VT.isFloatingPoint() && (MaskSizeInBits == 32 || MaskSizeInBits == 64))
    MaskSVT = MVT::getFloatingPointVT(MaskSizeInBits);
  else
    MaskSVT = MVT::getIntegerVT(MaskSizeInBits);

  MVT MaskVT = MVT::getVectorVT(MaskSVT, NumMaskElts);

  SDLoc DL(Root);
  SDValue CstOp = getConstVector(ConstantBitData, UndefElts, MaskVT, DAG, DL);
  DCI.AddToWorklist(CstOp.getNode());
  return DAG.getBitcast(VT, CstOp);
}
/// \brief Fully generic combining of x86 shuffle instructions.
///
/// This should be the last combine run over the x86 shuffle instructions. Once
/// they have been fully optimized, this will recursively consider all chains
/// of single-use shuffle instructions, build a generic model of the cumulative
/// shuffle operation, and check for simpler instructions which implement this
/// operation. We use this primarily for two purposes:
///
/// 1) Collapse generic shuffles to specialized single instructions when
///    equivalent. In most cases, this is just an encoding size win, but
///    sometimes we will collapse multiple generic shuffles into a single
///    special-purpose shuffle.
/// 2) Look for sequences of shuffle instructions with 3 or more total
///    instructions, and replace them with the slightly more expensive SSSE3
///    PSHUFB instruction if available. We do this as the last combining step
///    to ensure we avoid using PSHUFB if we can implement the shuffle with
///    a suitable short sequence of other instructions. The PSHUFB will either
///    use a register or have to read from memory and so is slightly (but only
///    slightly) more expensive than the other shuffle instructions.
///
/// Because this is inherently a quadratic operation (for each shuffle in
/// a chain, we recurse up the chain), the depth is limited to 8 instructions.
/// This should never be an issue in practice as the shuffle lowering doesn't
/// produce sequences of more than 8 instructions.
///
/// FIXME: We will currently miss some cases where the redundant shuffling
/// would simplify under the threshold for PSHUFB formation because of
/// combine-ordering. To fix this, we should do the redundant instruction
/// combining in this recursive walk.
static bool combineX86ShufflesRecursively(ArrayRef<SDValue> SrcOps,
                                          int SrcOpIndex, SDValue Root,
                                          ArrayRef<int> RootMask,
                                          ArrayRef<const SDNode*> SrcNodes,
                                          int Depth, bool HasVariableMask,
                                          SelectionDAG &DAG,
                                          TargetLowering::DAGCombinerInfo &DCI,
                                          const X86Subtarget &Subtarget) {
  // Bound the depth of our recursive combine because this is ultimately
  // quadratic in nature.
  if (Depth > 8)
    return false;

  // Directly rip through bitcasts to find the underlying operand.
  SDValue Op = SrcOps[SrcOpIndex];
  Op = peekThroughOneUseBitcasts(Op);

  MVT VT = Op.getSimpleValueType();
  if (!VT.isVector())
    return false; // Bail if we hit a non-vector.

  assert(Root.getSimpleValueType().isVector() &&
         "Shuffles operate on vector types!");
  assert(VT.getSizeInBits() == Root.getSimpleValueType().getSizeInBits() &&
         "Can only combine shuffles of the same vector register size.");

  // Extract target shuffle mask and resolve sentinels and inputs.
  SmallVector<int, 64> OpMask;
  SmallVector<SDValue, 2> OpInputs;
  if (!resolveTargetShuffleInputs(Op, OpInputs, OpMask, DAG))
    return false;

  assert(OpInputs.size() <= 2 && "Too many shuffle inputs");
  SDValue Input0 = (OpInputs.size() > 0 ? OpInputs[0] : SDValue());
  SDValue Input1 = (OpInputs.size() > 1 ? OpInputs[1] : SDValue());

  // Add the inputs to the Ops list, avoiding duplicates.
  SmallVector<SDValue, 16> Ops(SrcOps.begin(), SrcOps.end());

  int InputIdx0 = -1, InputIdx1 = -1;
  for (int i = 0, e = Ops.size(); i < e; ++i) {
    SDValue BC = peekThroughBitcasts(Ops[i]);
    if (Input0 && BC == peekThroughBitcasts(Input0))
      InputIdx0 = i;
    if (Input1 && BC == peekThroughBitcasts(Input1))
      InputIdx1 = i;
  }

  if (Input0 && InputIdx0 < 0) {
    InputIdx0 = SrcOpIndex;
    Ops[SrcOpIndex] = Input0;
  }
  if (Input1 && InputIdx1 < 0) {
    InputIdx1 = Ops.size();
    Ops.push_back(Input1);
  }

  assert(((RootMask.size() > OpMask.size() &&
           RootMask.size() % OpMask.size() == 0) ||
          (OpMask.size() > RootMask.size() &&
           OpMask.size() % RootMask.size() == 0) ||
          OpMask.size() == RootMask.size()) &&
         "The smaller number of elements must divide the larger.");

  // This function can be performance-critical, so we rely on the power-of-2
  // knowledge that we have about the mask sizes to replace div/rem ops with
  // bit-masks and shifts.
  assert(isPowerOf2_32(RootMask.size()) && "Non-power-of-2 shuffle mask sizes");
  assert(isPowerOf2_32(OpMask.size()) && "Non-power-of-2 shuffle mask sizes");
  unsigned RootMaskSizeLog2 = countTrailingZeros(RootMask.size());
  unsigned OpMaskSizeLog2 = countTrailingZeros(OpMask.size());

  unsigned MaskWidth = std::max<unsigned>(OpMask.size(), RootMask.size());
  unsigned RootRatio = std::max<unsigned>(1, OpMask.size() >> RootMaskSizeLog2);
  unsigned OpRatio = std::max<unsigned>(1, RootMask.size() >> OpMaskSizeLog2);
  assert((RootRatio == 1 || OpRatio == 1) &&
         "Must not have a ratio for both incoming and op masks!");

  assert(isPowerOf2_32(MaskWidth) && "Non-power-of-2 shuffle mask sizes");
  assert(isPowerOf2_32(RootRatio) && "Non-power-of-2 shuffle mask sizes");
  assert(isPowerOf2_32(OpRatio) && "Non-power-of-2 shuffle mask sizes");
  unsigned RootRatioLog2 = countTrailingZeros(RootRatio);
  unsigned OpRatioLog2 = countTrailingZeros(OpRatio);

  SmallVector<int, 64> Mask(MaskWidth, SM_SentinelUndef);

  // Merge this shuffle operation's mask into our accumulated mask. Note that
  // this shuffle's mask will be the first applied to the input, followed by the
  // root mask to get us all the way to the root value arrangement. The reason
  // for this order is that we are recursing up the operation chain.
  for (unsigned i = 0; i < MaskWidth; ++i) {
    unsigned RootIdx = i >> RootRatioLog2;
    if (RootMask[RootIdx] < 0) {
      // This is a zero or undef lane, we're done.
      Mask[i] = RootMask[RootIdx];
      continue;
    }

    unsigned RootMaskedIdx =
        RootRatio == 1
            ? RootMask[RootIdx]
            : (RootMask[RootIdx] << RootRatioLog2) + (i & (RootRatio - 1));

    // Just insert the scaled root mask value if it references an input other
    // than the SrcOp we're currently inserting.
    if ((RootMaskedIdx < (SrcOpIndex * MaskWidth)) ||
        (((SrcOpIndex + 1) * MaskWidth) <= RootMaskedIdx)) {
      Mask[i] = RootMaskedIdx;
      continue;
    }

    RootMaskedIdx = RootMaskedIdx & (MaskWidth - 1);
    unsigned OpIdx = RootMaskedIdx >> OpRatioLog2;
    if (OpMask[OpIdx] < 0) {
      // The incoming lanes are zero or undef, it doesn't matter which ones we
      // are using.
      Mask[i] = OpMask[OpIdx];
      continue;
    }

    // Ok, we have non-zero lanes, map them through to one of the Op's inputs.
    unsigned OpMaskedIdx =
        OpRatio == 1
            ? OpMask[OpIdx]
            : (OpMask[OpIdx] << OpRatioLog2) + (RootMaskedIdx & (OpRatio - 1));

    OpMaskedIdx = OpMaskedIdx & (MaskWidth - 1);
    if (OpMask[OpIdx] < (int)OpMask.size()) {
      assert(0 <= InputIdx0 && "Unknown target shuffle input");
      OpMaskedIdx += InputIdx0 * MaskWidth;
    } else {
      assert(0 <= InputIdx1 && "Unknown target shuffle input");
      OpMaskedIdx += InputIdx1 * MaskWidth;
    }

    Mask[i] = OpMaskedIdx;
  }
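  // Worked example of the ratio scaling above (illustrative, values assumed):
  // a root mask of 4 dwords {0, 2, 1, 3} merged with an op mask of 8 words
  // gives MaskWidth = 8, RootRatio = 2 and OpRatio = 1, so each root dword
  // index D expands to word indices 2*D and 2*D+1; with an identity op mask
  // the accumulated mask becomes {0, 1, 4, 5, 2, 3, 6, 7}.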
  // Handle the all undef/zero cases early.
  if (all_of(Mask, [](int Idx) { return Idx == SM_SentinelUndef; })) {
    DCI.CombineTo(Root.getNode(), DAG.getUNDEF(Root.getValueType()));
    return true;
  }
  if (all_of(Mask, [](int Idx) { return Idx < 0; })) {
    // TODO - should we handle the mixed zero/undef case as well? Just returning
    // a zero mask will lose information on undef elements possibly reducing
    // future combine possibilities.
    DCI.CombineTo(Root.getNode(), getZeroVector(Root.getSimpleValueType(),
                                                Subtarget, DAG, SDLoc(Root)));
    return true;
  }

  // Remove unused shuffle source ops.
  resolveTargetShuffleInputsAndMask(Ops, Mask);
  assert(!Ops.empty() && "Shuffle with no inputs detected");

  HasVariableMask |= isTargetShuffleVariableMask(Op.getOpcode());

  // Update the list of shuffle nodes that have been combined so far.
  SmallVector<const SDNode *, 16> CombinedNodes(SrcNodes.begin(),
                                                SrcNodes.end());
  CombinedNodes.push_back(Op.getNode());

  // See if we can recurse into each shuffle source op (if it's a target
  // shuffle). The source op should only be combined if it either has a
  // single use (i.e. current Op) or all its users have already been combined.
  for (int i = 0, e = Ops.size(); i < e; ++i)
    if (Ops[i].getNode()->hasOneUse() ||
        SDNode::areOnlyUsersOf(CombinedNodes, Ops[i].getNode()))
      if (combineX86ShufflesRecursively(Ops, i, Root, Mask, CombinedNodes,
                                        Depth + 1, HasVariableMask, DAG, DCI,
                                        Subtarget))
        return true;

  // Attempt to constant fold all of the constant source ops.
  if (SDValue Cst = combineX86ShufflesConstants(
          Ops, Mask, Root, HasVariableMask, DAG, DCI, Subtarget)) {
    DCI.CombineTo(Root.getNode(), Cst);
    return true;
  }

  // We can only combine unary and binary shuffle mask cases.
  if (Ops.size() > 2)
    return false;

  // Minor canonicalization of the accumulated shuffle mask to make it easier
  // to match below. All this does is detect masks with sequential pairs of
  // elements, and shrink them to the half-width mask. It does this in a loop
  // so it will reduce the size of the mask to the minimal width mask which
  // performs an equivalent shuffle.
  SmallVector<int, 64> WidenedMask;
  while (Mask.size() > 1 && canWidenShuffleElements(Mask, WidenedMask)) {
    Mask = std::move(WidenedMask);
  }

  // Canonicalization of binary shuffle masks to improve pattern matching by
  // commuting the inputs.
  if (Ops.size() == 2 && canonicalizeShuffleMaskWithCommute(Mask)) {
    ShuffleVectorSDNode::commuteMask(Mask);
    std::swap(Ops[0], Ops[1]);
  }

  // Finally, try to combine into a single shuffle instruction.
  if (SDValue Res = combineX86ShuffleChain(
          Ops, Root, Mask, Depth, HasVariableMask, DAG, DCI, Subtarget)) {
    DCI.CombineTo(Root.getNode(), Res, /*AddTo*/ true);
    return true;
  }
  return false;
}
/// \brief Get the PSHUF-style mask from PSHUF node.
///
/// This is a very minor wrapper around getTargetShuffleMask to ease forming v4
/// PSHUF-style masks that can be reused with such instructions.
static SmallVector<int, 4> getPSHUFShuffleMask(SDValue N) {
  MVT VT = N.getSimpleValueType();
  SmallVector<int, 4> Mask;
  SmallVector<SDValue, 2> Ops;
  bool IsUnary;
  bool HaveMask =
      getTargetShuffleMask(N.getNode(), VT, false, Ops, Mask, IsUnary);
  (void)HaveMask;
  assert(HaveMask);

  // If we have more than 128-bits, only the low 128-bits of shuffle mask
  // matter. Check that the upper masks are repeats and remove them.
  if (VT.getSizeInBits() > 128) {
    int LaneElts = 128 / VT.getScalarSizeInBits();
#ifndef NDEBUG
    for (int i = 1, NumLanes = VT.getSizeInBits() / 128; i < NumLanes; ++i)
      for (int j = 0; j < LaneElts; ++j)
        assert(Mask[j] == Mask[i * LaneElts + j] - (LaneElts * i) &&
               "Mask doesn't repeat in high 128-bit lanes!");
#endif
    Mask.resize(LaneElts);
  }

  switch (N.getOpcode()) {
  case X86ISD::PSHUFD:
    return Mask;
  case X86ISD::PSHUFLW:
    Mask.resize(4);
    return Mask;
  case X86ISD::PSHUFHW:
    Mask.erase(Mask.begin(), Mask.begin() + 4);
    for (int &M : Mask)
      M -= 4;
    return Mask;
  default:
    llvm_unreachable("No valid shuffle instruction found!");
  }
}
/// \brief Search for a combinable shuffle across a chain ending in pshufd.
///
/// We walk up the chain and look for a combinable shuffle, skipping over
/// shuffles that we could hoist this shuffle's transformation past without
/// altering anything.
static SDValue
combineRedundantDWordShuffle(SDValue N, MutableArrayRef<int> Mask,
                             SelectionDAG &DAG) {
  assert(N.getOpcode() == X86ISD::PSHUFD &&
         "Called with something other than an x86 128-bit half shuffle!");
  SDLoc DL(N);

  // Walk up a single-use chain looking for a combinable shuffle. Keep a stack
  // of the shuffles in the chain so that we can form a fresh chain to replace
  // this one.
  SmallVector<SDValue, 8> Chain;
  SDValue V = N.getOperand(0);
  for (; V.hasOneUse(); V = V.getOperand(0)) {
    switch (V.getOpcode()) {
    default:
      return SDValue(); // Nothing combined!

    case ISD::BITCAST:
      // Skip bitcasts as we always know the type for the target specific
      // instructions.
      continue;

    case X86ISD::PSHUFD:
      // Found another dword shuffle.
      break;

    case X86ISD::PSHUFLW:
      // Check that the low words (being shuffled) are the identity in the
      // dword shuffle, and the high words are self-contained.
      if (Mask[0] != 0 || Mask[1] != 1 ||
          !(Mask[2] >= 2 && Mask[2] < 4 && Mask[3] >= 2 && Mask[3] < 4))
        return SDValue();

      Chain.push_back(V);
      continue;

    case X86ISD::PSHUFHW:
      // Check that the high words (being shuffled) are the identity in the
      // dword shuffle, and the low words are self-contained.
      if (Mask[2] != 2 || Mask[3] != 3 ||
          !(Mask[0] >= 0 && Mask[0] < 2 && Mask[1] >= 0 && Mask[1] < 2))
        return SDValue();

      Chain.push_back(V);
      continue;

    case X86ISD::UNPCKL:
    case X86ISD::UNPCKH:
      // For either i8 -> i16 or i16 -> i32 unpacks, we can combine a dword
      // shuffle into a preceding word shuffle.
      if (V.getSimpleValueType().getVectorElementType() != MVT::i8 &&
          V.getSimpleValueType().getVectorElementType() != MVT::i16)
        return SDValue();

      // Search for a half-shuffle which we can combine with.
      unsigned CombineOp =
          V.getOpcode() == X86ISD::UNPCKL ? X86ISD::PSHUFLW : X86ISD::PSHUFHW;
      if (V.getOperand(0) != V.getOperand(1) ||
          !V->isOnlyUserOf(V.getOperand(0).getNode()))
        return SDValue();
      Chain.push_back(V);
      V = V.getOperand(0);
      do {
        switch (V.getOpcode()) {
        default:
          return SDValue(); // Nothing to combine.

        case X86ISD::PSHUFLW:
        case X86ISD::PSHUFHW:
          if (V.getOpcode() == CombineOp)
            break;

          Chain.push_back(V);

          LLVM_FALLTHROUGH;
        case ISD::BITCAST:
          V = V.getOperand(0);
          continue;
        }
        break;
      } while (V.hasOneUse());
      break;
    }
    // Break out of the loop if we break out of the switch.
    break;
  }

  if (!V.hasOneUse())
    // We fell out of the loop without finding a viable combining instruction.
    return SDValue();

  // Merge this node's mask and our incoming mask.
  SmallVector<int, 4> VMask = getPSHUFShuffleMask(V);
  for (int &M : Mask)
    M = VMask[M];
  V = DAG.getNode(V.getOpcode(), DL, V.getValueType(), V.getOperand(0),
                  getV4X86ShuffleImm8ForMask(Mask, DL, DAG));

  // Rebuild the chain around this new shuffle.
  while (!Chain.empty()) {
    SDValue W = Chain.pop_back_val();

    if (V.getValueType() != W.getOperand(0).getValueType())
      V = DAG.getBitcast(W.getOperand(0).getValueType(), V);

    switch (W.getOpcode()) {
    default:
      llvm_unreachable("Only PSHUF and UNPCK instructions get here!");

    case X86ISD::UNPCKL:
    case X86ISD::UNPCKH:
      V = DAG.getNode(W.getOpcode(), DL, W.getValueType(), V, V);
      break;

    case X86ISD::PSHUFD:
    case X86ISD::PSHUFLW:
    case X86ISD::PSHUFHW:
      V = DAG.getNode(W.getOpcode(), DL, W.getValueType(), V, W.getOperand(1));
      break;
    }
  }
  if (V.getValueType() != N.getValueType())
    V = DAG.getBitcast(N.getValueType(), V);

  // Return the new chain to replace N.
  return V;
}
/// \brief Search for a combinable shuffle across a chain ending in pshuflw or
/// pshufhw.
///
/// We walk up the chain, skipping shuffles of the other half and looking
/// through shuffles which switch halves trying to find a shuffle of the same
/// pair of dwords.
static bool combineRedundantHalfShuffle(SDValue N, MutableArrayRef<int> Mask,
                                        SelectionDAG &DAG,
                                        TargetLowering::DAGCombinerInfo &DCI) {
  assert(
      (N.getOpcode() == X86ISD::PSHUFLW || N.getOpcode() == X86ISD::PSHUFHW) &&
      "Called with something other than an x86 128-bit half shuffle!");
  SDLoc DL(N);
  unsigned CombineOpcode = N.getOpcode();

  // Walk up a single-use chain looking for a combinable shuffle.
  SDValue V = N.getOperand(0);
  for (; V.hasOneUse(); V = V.getOperand(0)) {
    switch (V.getOpcode()) {
    default:
      return false; // Nothing combined!

    case ISD::BITCAST:
      // Skip bitcasts as we always know the type for the target specific
      // instructions.
      continue;

    case X86ISD::PSHUFLW:
    case X86ISD::PSHUFHW:
      if (V.getOpcode() == CombineOpcode)
        break;

      // Other-half shuffles are no-ops.
      continue;
    }
    // Break out of the loop if we break out of the switch.
    break;
  }

  if (!V.hasOneUse())
    // We fell out of the loop without finding a viable combining instruction.
    return false;

  // Combine away the bottom node as its shuffle will be accumulated into
  // a preceding shuffle.
  DCI.CombineTo(N.getNode(), N.getOperand(0), /*AddTo*/ true);

  // Record the old value.
  SDValue Old = V;

  // Merge this node's mask and our incoming mask (adjusted to account for all
  // the pshufd instructions encountered).
  SmallVector<int, 4> VMask = getPSHUFShuffleMask(V);
  for (int &M : Mask)
    M = VMask[M];
  V = DAG.getNode(V.getOpcode(), DL, MVT::v8i16, V.getOperand(0),
                  getV4X86ShuffleImm8ForMask(Mask, DL, DAG));

  // Check that the shuffles didn't cancel each other out. If not, we need to
  // combine to the new one.
  if (Old != V)
    // Replace the combinable shuffle with the combined one, updating all users
    // so that we re-evaluate the chain here.
    DCI.CombineTo(Old.getNode(), V, /*AddTo*/ true);

  return true;
}
/// \brief Try to combine x86 target specific shuffles.
static SDValue combineTargetShuffle(SDValue N, SelectionDAG &DAG,
                                    TargetLowering::DAGCombinerInfo &DCI,
                                    const X86Subtarget &Subtarget) {
  SDLoc DL(N);
  MVT VT = N.getSimpleValueType();
  SmallVector<int, 4> Mask;

  unsigned Opcode = N.getOpcode();
  switch (Opcode) {
  case X86ISD::PSHUFD:
  case X86ISD::PSHUFLW:
  case X86ISD::PSHUFHW:
    Mask = getPSHUFShuffleMask(N);
    assert(Mask.size() == 4);
    break;
  case X86ISD::UNPCKL: {
    auto Op0 = N.getOperand(0);
    auto Op1 = N.getOperand(1);
    unsigned Opcode0 = Op0.getOpcode();
    unsigned Opcode1 = Op1.getOpcode();

    // Combine X86ISD::UNPCKL with 2 X86ISD::FHADD inputs into a single
    // X86ISD::FHADD. This is generated by UINT_TO_FP v2f64 scalarization.
    // TODO: Add other horizontal operations as required.
    if (VT == MVT::v2f64 && Opcode0 == Opcode1 && Opcode0 == X86ISD::FHADD)
      return DAG.getNode(Opcode0, DL, VT, Op0.getOperand(0), Op1.getOperand(0));

    // Combine X86ISD::UNPCKL and ISD::VECTOR_SHUFFLE into X86ISD::UNPCKH, in
    // which X86ISD::UNPCKL has a ISD::UNDEF operand, and ISD::VECTOR_SHUFFLE
    // moves upper half elements into the lower half part. For example:
    //
    // t2: v16i8 = vector_shuffle<8,9,10,11,12,13,14,15,u,u,u,u,u,u,u,u> t1,
    //     undef:v16i8
    // t3: v16i8 = X86ISD::UNPCKL undef:v16i8, t2
    //
    // will be combined to:
    //
    // t3: v16i8 = X86ISD::UNPCKH undef:v16i8, t1

    // This is only for 128-bit vectors. From SSE4.1 onward this combine may not
    // happen due to advanced instructions.
    if (!VT.is128BitVector())
      return SDValue();

    if (Op0.isUndef() && Opcode1 == ISD::VECTOR_SHUFFLE) {
      ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(Op1.getNode())->getMask();

      unsigned NumElts = VT.getVectorNumElements();
      SmallVector<int, 8> ExpectedMask(NumElts, -1);
      std::iota(ExpectedMask.begin(), ExpectedMask.begin() + NumElts / 2,
                NumElts / 2);

      auto ShufOp = Op1.getOperand(0);
      if (isShuffleEquivalent(Op1, ShufOp, Mask, ExpectedMask))
        return DAG.getNode(X86ISD::UNPCKH, DL, VT, N.getOperand(0), ShufOp);
    }
    return SDValue();
  }
  case X86ISD::BLENDI: {
    SDValue V0 = N->getOperand(0);
    SDValue V1 = N->getOperand(1);
    assert(VT == V0.getSimpleValueType() && VT == V1.getSimpleValueType() &&
           "Unexpected input vector types");

    // Canonicalize a v2f64 blend with a mask of 2 by swapping the vector
    // operands and changing the mask to 1. This saves us a bunch of
    // pattern-matching possibilities related to scalar math ops in SSE/AVX.
    // x86InstrInfo knows how to commute this back after instruction selection
    // if it would help register allocation.

    // TODO: If optimizing for size or a processor that doesn't suffer from
    // partial register update stalls, this should be transformed into a MOVSD
    // instruction because a MOVSD is 1-2 bytes smaller than a BLENDPD.

    if (VT == MVT::v2f64)
      if (auto *Mask = dyn_cast<ConstantSDNode>(N->getOperand(2)))
        if (Mask->getZExtValue() == 2 && !isShuffleFoldableLoad(V0)) {
          SDValue NewMask = DAG.getConstant(1, DL, MVT::i8);
          return DAG.getNode(X86ISD::BLENDI, DL, VT, V1, V0, NewMask);
        }

    return SDValue();
  }
  case X86ISD::MOVSD:
  case X86ISD::MOVSS: {
    SDValue V0 = peekThroughBitcasts(N->getOperand(0));
    SDValue V1 = peekThroughBitcasts(N->getOperand(1));
    bool isZero0 = ISD::isBuildVectorAllZeros(V0.getNode());
    bool isZero1 = ISD::isBuildVectorAllZeros(V1.getNode());
    if (isZero0 && isZero1)
      return SDValue();

    // We often lower to MOVSD/MOVSS from integer as well as native float
    // types; remove unnecessary domain-crossing bitcasts if we can to make it
    // easier to combine shuffles later on. We've already accounted for the
    // domain switching cost when we decided to lower with it.
    bool isFloat = VT.isFloatingPoint();
    bool isFloat0 = V0.getSimpleValueType().isFloatingPoint();
    bool isFloat1 = V1.getSimpleValueType().isFloatingPoint();
    if ((isFloat != isFloat0 || isZero0) && (isFloat != isFloat1 || isZero1)) {
      MVT NewVT = isFloat ? (X86ISD::MOVSD == Opcode ? MVT::v2i64 : MVT::v4i32)
                          : (X86ISD::MOVSD == Opcode ? MVT::v2f64 : MVT::v4f32);
      V0 = DAG.getBitcast(NewVT, V0);
      V1 = DAG.getBitcast(NewVT, V1);
      return DAG.getBitcast(VT, DAG.getNode(Opcode, DL, NewVT, V0, V1));
    }

    return SDValue();
  }
  case X86ISD::INSERTPS: {
    assert(VT == MVT::v4f32 && "INSERTPS ValueType must be MVT::v4f32");
    SDValue Op0 = N.getOperand(0);
    SDValue Op1 = N.getOperand(1);
    SDValue Op2 = N.getOperand(2);
    unsigned InsertPSMask = cast<ConstantSDNode>(Op2)->getZExtValue();
    unsigned SrcIdx = (InsertPSMask >> 6) & 0x3;
    unsigned DstIdx = (InsertPSMask >> 4) & 0x3;
    unsigned ZeroMask = InsertPSMask & 0xF;

    // If we zero out all elements from Op0 then we don't need to reference it.
    if (((ZeroMask | (1u << DstIdx)) == 0xF) && !Op0.isUndef())
      return DAG.getNode(X86ISD::INSERTPS, DL, VT, DAG.getUNDEF(VT), Op1,
                         DAG.getConstant(InsertPSMask, DL, MVT::i8));

    // If we zero out the element from Op1 then we don't need to reference it.
    if ((ZeroMask & (1u << DstIdx)) && !Op1.isUndef())
      return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, DAG.getUNDEF(VT),
                         DAG.getConstant(InsertPSMask, DL, MVT::i8));

    // Attempt to merge insertps Op1 with an inner target shuffle node.
    SmallVector<int, 8> TargetMask1;
    SmallVector<SDValue, 2> Ops1;
    if (setTargetShuffleZeroElements(Op1, TargetMask1, Ops1)) {
      int M = TargetMask1[SrcIdx];
      if (isUndefOrZero(M)) {
        // Zero/UNDEF insertion - zero out element and remove dependency.
        InsertPSMask |= (1u << DstIdx);
        return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, DAG.getUNDEF(VT),
                           DAG.getConstant(InsertPSMask, DL, MVT::i8));
      }
      // Update insertps mask srcidx and reference the source input directly.
      assert(0 <= M && M < 8 && "Shuffle index out of range");
      InsertPSMask = (InsertPSMask & 0x3f) | ((M & 0x3) << 6);
      Op1 = Ops1[M < 4 ? 0 : 1];
      return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, Op1,
                         DAG.getConstant(InsertPSMask, DL, MVT::i8));
    }

    // Attempt to merge insertps Op0 with an inner target shuffle node.
    SmallVector<int, 8> TargetMask0;
    SmallVector<SDValue, 2> Ops0;
    if (!setTargetShuffleZeroElements(Op0, TargetMask0, Ops0))
      return SDValue();

    bool Updated = false;
    bool UseInput00 = false;
    bool UseInput01 = false;
    for (int i = 0; i != 4; ++i) {
      int M = TargetMask0[i];
      if ((InsertPSMask & (1u << i)) || (i == (int)DstIdx)) {
        // No change if element is already zero or the inserted element.
        continue;
      } else if (isUndefOrZero(M)) {
        // If the target mask is undef/zero then we must zero the element.
        InsertPSMask |= (1u << i);
        Updated = true;
        continue;
      }

      // The input vector element must be inline.
      if (M != i && M != (i + 4))
        return SDValue();

      // Determine which inputs of the target shuffle we're using.
      UseInput00 |= (0 <= M && M < 4);
      UseInput01 |= (4 <= M);
    }

    // If we're not using both inputs of the target shuffle then use the
    // referenced input directly.
    if (UseInput00 && !UseInput01) {
      Updated = true;
      Op0 = Ops0[0];
    } else if (!UseInput00 && UseInput01) {
      Updated = true;
      Op0 = Ops0[1];
    }

    if (Updated)
      return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, Op1,
                         DAG.getConstant(InsertPSMask, DL, MVT::i8));

    return SDValue();
  }
  default:
    return SDValue();
  }

  // Nuke no-op shuffles that show up after combining.
  if (isNoopShuffleMask(Mask))
    return DCI.CombineTo(N.getNode(), N.getOperand(0), /*AddTo*/ true);

  // Look for simplifications involving one or two shuffle instructions.
  SDValue V = N.getOperand(0);
  switch (N.getOpcode()) {
  default:
    break;
  case X86ISD::PSHUFLW:
  case X86ISD::PSHUFHW:
    assert(VT.getVectorElementType() == MVT::i16 && "Bad word shuffle type!");

    if (combineRedundantHalfShuffle(N, Mask, DAG, DCI))
      return SDValue(); // We combined away this shuffle, so we're done.

    // See if this reduces to a PSHUFD which is no more expensive and can
    // combine with more operations. Note that it has to at least flip the
    // dwords as otherwise it would have been removed as a no-op.
    if (makeArrayRef(Mask).equals({2, 3, 0, 1})) {
      int DMask[] = {0, 1, 2, 3};
      int DOffset = N.getOpcode() == X86ISD::PSHUFLW ? 0 : 2;
      DMask[DOffset + 0] = DOffset + 1;
      DMask[DOffset + 1] = DOffset + 0;
      MVT DVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() / 2);
      V = DAG.getBitcast(DVT, V);
      DCI.AddToWorklist(V.getNode());
      V = DAG.getNode(X86ISD::PSHUFD, DL, DVT, V,
                      getV4X86ShuffleImm8ForMask(DMask, DL, DAG));
      DCI.AddToWorklist(V.getNode());
      return DAG.getBitcast(VT, V);
    }

    // Look for shuffle patterns which can be implemented as a single unpack.
    // FIXME: This doesn't handle the location of the PSHUFD generically, and
    // only works when we have a PSHUFD followed by two half-shuffles.
    if (Mask[0] == Mask[1] && Mask[2] == Mask[3] &&
        (V.getOpcode() == X86ISD::PSHUFLW ||
         V.getOpcode() == X86ISD::PSHUFHW) &&
        V.getOpcode() != N.getOpcode() &&
        V.hasOneUse()) {
      SDValue D = peekThroughOneUseBitcasts(V.getOperand(0));
      if (D.getOpcode() == X86ISD::PSHUFD && D.hasOneUse()) {
        SmallVector<int, 4> VMask = getPSHUFShuffleMask(V);
        SmallVector<int, 4> DMask = getPSHUFShuffleMask(D);
        int NOffset = N.getOpcode() == X86ISD::PSHUFLW ? 0 : 4;
        int VOffset = V.getOpcode() == X86ISD::PSHUFLW ? 0 : 4;
        int WordMask[8];
        for (int i = 0; i < 4; ++i) {
          WordMask[i + NOffset] = Mask[i] + NOffset;
          WordMask[i + VOffset] = VMask[i] + VOffset;
        }
        // Map the word mask through the DWord mask.
        int MappedMask[8];
        for (int i = 0; i < 8; ++i)
          MappedMask[i] = 2 * DMask[WordMask[i] / 2] + WordMask[i] % 2;
        if (makeArrayRef(MappedMask).equals({0, 0, 1, 1, 2, 2, 3, 3}) ||
            makeArrayRef(MappedMask).equals({4, 4, 5, 5, 6, 6, 7, 7})) {
          // We can replace all three shuffles with an unpack.
          V = DAG.getBitcast(VT, D.getOperand(0));
          DCI.AddToWorklist(V.getNode());
          return DAG.getNode(MappedMask[0] == 0 ? X86ISD::UNPCKL
                                                : X86ISD::UNPCKH,
                             DL, VT, V, V);
        }
      }
    }

    break;

  case X86ISD::PSHUFD:
    if (SDValue NewN = combineRedundantDWordShuffle(N, Mask, DAG))
      return NewN;

    break;
  }

  return SDValue();
}
/// Returns true iff the shuffle node \p N can be replaced with ADDSUB
/// operation. If true is returned then the operands of ADDSUB operation
/// are written to the parameters \p Opnd0 and \p Opnd1.
///
/// We combine shuffle to ADDSUB directly on the abstract vector shuffle nodes
/// so it is easier to generically match. We also insert dummy vector shuffle
/// nodes for the operands which explicitly discard the lanes which are unused
/// by this operation to try to flow through the rest of the combiner
/// the fact that they're unused.
static bool isAddSub(SDNode *N, const X86Subtarget &Subtarget,
                     SDValue &Opnd0, SDValue &Opnd1) {

  EVT VT = N->getValueType(0);
  if ((!Subtarget.hasSSE3() || (VT != MVT::v4f32 && VT != MVT::v2f64)) &&
      (!Subtarget.hasAVX() || (VT != MVT::v8f32 && VT != MVT::v4f64)) &&
      (!Subtarget.hasAVX512() || (VT != MVT::v16f32 && VT != MVT::v8f64)))
    return false;

  // We only handle target-independent shuffles.
  // FIXME: It would be easy and harmless to use the target shuffle mask
  // extraction tool to support more.
  if (N->getOpcode() != ISD::VECTOR_SHUFFLE)
    return false;

  ArrayRef<int> OrigMask = cast<ShuffleVectorSDNode>(N)->getMask();
  SmallVector<int, 16> Mask(OrigMask.begin(), OrigMask.end());

  SDValue V1 = N->getOperand(0);
  SDValue V2 = N->getOperand(1);

  // We require the first shuffle operand to be the FSUB node, and the second to
  // be the FADD node.
  if (V1.getOpcode() == ISD::FADD && V2.getOpcode() == ISD::FSUB) {
    ShuffleVectorSDNode::commuteMask(Mask);
    std::swap(V1, V2);
  } else if (V1.getOpcode() != ISD::FSUB || V2.getOpcode() != ISD::FADD)
    return false;

  // If there are other uses of these operations we can't fold them.
  if (!V1->hasOneUse() || !V2->hasOneUse())
    return false;

  // Ensure that both operations have the same operands. Note that we can
  // commute the FADD operands.
  SDValue LHS = V1->getOperand(0), RHS = V1->getOperand(1);
  if ((V2->getOperand(0) != LHS || V2->getOperand(1) != RHS) &&
      (V2->getOperand(0) != RHS || V2->getOperand(1) != LHS))
    return false;

  // We're looking for blends between FADD and FSUB nodes. We insist on these
  // nodes being lined up in a specific expected pattern.
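  // For example (illustrative): with v4f32, V1 = FSUB(LHS, RHS) and
  // V2 = FADD(LHS, RHS), the mask {0, 5, 2, 7} picks the subtracted lanes 0
  // and 2 from V1 and the added lanes 1 and 3 from V2, which is exactly the
  // lane pattern a single ADDSUB produces.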
  if (!(isShuffleEquivalent(V1, V2, Mask, {0, 3}) ||
        isShuffleEquivalent(V1, V2, Mask, {0, 5, 2, 7}) ||
        isShuffleEquivalent(V1, V2, Mask, {0, 9, 2, 11, 4, 13, 6, 15}) ||
        isShuffleEquivalent(V1, V2, Mask, {0, 17, 2, 19, 4, 21, 6, 23,
                                           8, 25, 10, 27, 12, 29, 14, 31})))
    return false;

  Opnd0 = LHS;
  Opnd1 = RHS;
  return true;
}
/// \brief Try to combine a shuffle into a target-specific add-sub or
28961
/// mul-add-sub node.
28962
static SDValue combineShuffleToAddSubOrFMAddSub(SDNode *N,
28963
                                                const X86Subtarget &Subtarget,
28964
82.4k
                                                SelectionDAG &DAG) {
28965
82.4k
  SDValue Opnd0, Opnd1;
28966
82.4k
  if (!isAddSub(N, Subtarget, Opnd0, Opnd1))
28967
82.4k
    return SDValue();
28968
80
28969
80
  EVT VT = N->getValueType(0);
28970
80
  SDLoc DL(N);
28971
80
28972
80
  // Try to generate X86ISD::FMADDSUB node here.
28973
80
  SDValue Opnd2;
28974
80
  if (isFMAddSub(Subtarget, DAG, Opnd0, Opnd1, Opnd2))
28975
22
    return DAG.getNode(X86ISD::FMADDSUB, DL, VT, Opnd0, Opnd1, Opnd2);
28976
58
28977
58
  // Do not generate X86ISD::ADDSUB node for 512-bit types even though
28978
58
  // the ADDSUB idiom has been successfully recognized. There are no known
28979
58
  // X86 targets with 512-bit ADDSUB instructions!
28980
58
  if (VT.is512BitVector())
28981
2
    return SDValue();
28982
56
28983
56
  return DAG.getNode(X86ISD::ADDSUB, DL, VT, Opnd0, Opnd1);
28984
56
}
28985
28986
// We are looking for a shuffle where both sources are concatenated with undef
28987
// and have a width that is half of the output's width. AVX2 has VPERMD/Q, so
28988
// if we can express this as a single-source shuffle, that's preferable.
28989
static SDValue combineShuffleOfConcatUndef(SDNode *N, SelectionDAG &DAG,
28990
82.5k
                                           const X86Subtarget &Subtarget) {
28991
82.5k
  if (!Subtarget.hasAVX2() || !isa<ShuffleVectorSDNode>(N))
28992
68.6k
    return SDValue();
28993
13.8k
28994
13.8k
  EVT VT = N->getValueType(0);
28995
13.8k
28996
13.8k
  // We only care about shuffles of 128/256-bit vectors of 32/64-bit values.
28997
13.8k
  if (!VT.is128BitVector() && !VT.is256BitVector())
28998
2.72k
    return SDValue();
28999
11.1k
29000
11.1k
  
if (11.1k
VT.getVectorElementType() != MVT::i32 &&
29001
9.07k
      VT.getVectorElementType() != MVT::i64 &&
29002
7.82k
      VT.getVectorElementType() != MVT::f32 &&
29003
6.15k
      VT.getVectorElementType() != MVT::f64)
29004
5.02k
    return SDValue();
29005
6.09k
29006
6.09k
  SDValue N0 = N->getOperand(0);
29007
6.09k
  SDValue N1 = N->getOperand(1);
29008
6.09k
29009
6.09k
  // Check that both sources are concats with undef.
29010
6.09k
  if (N0.getOpcode() != ISD::CONCAT_VECTORS ||
29011
6.09k
      N1.getOpcode() != ISD::CONCAT_VECTORS || N0.getNumOperands() != 2 ||
29012
6.09k
      N1.getNumOperands() != 2 || !N0.getOperand(1).isUndef() ||
29013
7
      !N1.getOperand(1).isUndef())
29014
6.09k
    return SDValue();
29015
7
29016
7
  // Construct the new shuffle mask. Elements from the first source retain their
29017
7
  // index, but elements from the second source no longer need to skip an undef.
29018
7
  SmallVector<int, 8> Mask;
29019
7
  int NumElts = VT.getVectorNumElements();
29020
7
29021
7
  ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
29022
7
  for (int Elt : SVOp->getMask())
29023
40
    Mask.push_back(Elt < NumElts ? Elt : (Elt - NumElts / 2));
29024
82.5k
29025
82.5k
  SDLoc DL(N);
29026
82.5k
  SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, N0.getOperand(0),
29027
82.5k
                               N1.getOperand(0));
29028
82.5k
  return DAG.getVectorShuffle(VT, DL, Concat, DAG.getUNDEF(VT), Mask);
29029
82.5k
}
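Because the second input moves from its own (t2, undef) concat into the upper half of the new (t1, t2) concat, lanes taken from it drop the width of the undef half they used to skip. A standalone C++ sketch of that mask rewrite (illustrative only; names are made up):

// Standalone sketch (not LLVM code) of the mask rewrite performed above.
#include <cstdio>
#include <vector>

static std::vector<int> remapMask(const std::vector<int> &Mask, int NumElts) {
  std::vector<int> Out;
  for (int Elt : Mask)
    Out.push_back(Elt < NumElts ? Elt : Elt - NumElts / 2);
  return Out;
}

int main() {
  // 4-element result; indices 0..3 address (T1, undef), 4..7 address (T2, undef).
  std::vector<int> Mask{0, 1, 4, 5};
  for (int M : remapMask(Mask, 4))
    std::printf("%d ", M); // 0 1 2 3: T2's lanes now follow T1 directly
  std::printf("\n");
  return 0;
}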
29030
29031
/// Eliminate a redundant shuffle of a horizontal math op.
29032
82.4k
static SDValue foldShuffleOfHorizOp(SDNode *N) {
29033
82.4k
  if (N->getOpcode() != ISD::VECTOR_SHUFFLE || !N->getOperand(1).isUndef())
29034
66.4k
    return SDValue();
29035
15.9k
29036
15.9k
  SDValue HOp = N->getOperand(0);
29037
15.9k
  if (HOp.getOpcode() != X86ISD::HADD && HOp.getOpcode() != X86ISD::FHADD &&
29038
15.9k
      HOp.getOpcode() != X86ISD::HSUB && HOp.getOpcode() != X86ISD::FHSUB)
29039
15.9k
    return SDValue();
29040
16
29041
16
  // 128-bit horizontal math instructions are defined to operate on adjacent
29042
16
  // lanes of each operand as:
29043
16
  // v4X32: A[0] + A[1] , A[2] + A[3] , B[0] + B[1] , B[2] + B[3]
29044
16
  // ...similarly for v2f64 and v8i16.
29045
16
  // TODO: 256-bit is not the same because...x86.
29046
16
  if (HOp.getOperand(0) != HOp.getOperand(1) || HOp.getValueSizeInBits() != 128)
29047
0
    return SDValue();
29048
16
29049
16
  // When the operands of a horizontal math op are identical, the low half of
29050
16
  // the result is the same as the high half. If the shuffle is also replicating
29051
16
  // low and high halves, we don't need the shuffle.
29052
16
  // shuffle (hadd X, X), undef, [low half...high half] --> hadd X, X
29053
16
  ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(N)->getMask();
29054
16
  // TODO: Other mask possibilities like {1,1} and {1,0} could be added here,
29055
16
  // but this should be tied to whatever horizontal op matching and shuffle
29056
16
  // canonicalization are producing.
29057
16
  if (isTargetShuffleEquivalent(Mask, { 0, 0 }) ||
29058
12
      isTargetShuffleEquivalent(Mask, { 0, 1, 0, 1 }) ||
29059
4
      isTargetShuffleEquivalent(Mask, { 0, 1, 2, 3, 0, 1, 2, 3 }))
29060
16
    return HOp;
29061
0
29062
0
  return SDValue();
29063
0
}
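A standalone C++ sketch of why the shuffle is redundant when a 128-bit horizontal op has identical operands (illustrative only, not part of this file):

// Standalone sketch (not LLVM code): with identical operands, a 128-bit
// horizontal add produces the same values in its low and high halves, so a
// shuffle that merely replicates one half is a no-op.  v4f32 example.
#include <array>
#include <cstdio>

static std::array<float, 4> hadd4(const std::array<float, 4> &A,
                                  const std::array<float, 4> &B) {
  // HADDPS lane layout: { A0+A1, A2+A3, B0+B1, B2+B3 }
  return {A[0] + A[1], A[2] + A[3], B[0] + B[1], B[2] + B[3]};
}

int main() {
  std::array<float, 4> X{1, 2, 3, 4};
  std::array<float, 4> H = hadd4(X, X);
  // Low half {H0,H1} equals high half {H2,H3}, so the mask {0,1,0,1} changes nothing.
  std::printf("%g %g | %g %g\n", H[0], H[1], H[2], H[3]); // 3 7 | 3 7
  return 0;
}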
29064
29065
static SDValue combineShuffle(SDNode *N, SelectionDAG &DAG,
29066
                              TargetLowering::DAGCombinerInfo &DCI,
29067
83.1k
                              const X86Subtarget &Subtarget) {
29068
83.1k
  SDLoc dl(N);
29069
83.1k
  EVT VT = N->getValueType(0);
29070
83.1k
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
29071
83.1k
  // If we have legalized the vector types, look for blends of FADD and FSUB
29072
83.1k
  // nodes that we can fuse into an ADDSUB node.
29073
83.1k
  if (TLI.isTypeLegal(VT)) {
29074
82.4k
    if (SDValue AddSub = combineShuffleToAddSubOrFMAddSub(N, Subtarget, DAG))
29075
78
      return AddSub;
29076
82.4k
29077
82.4k
    if (SDValue HAddSub = foldShuffleOfHorizOp(N))
29078
16
      return HAddSub;
29079
83.0k
  }
29080
83.0k
29081
83.0k
  // During Type Legalization, when promoting illegal vector types,
29082
83.0k
  // the backend might introduce new shuffle dag nodes and bitcasts.
29083
83.0k
  //
29084
83.0k
  // This code performs the following transformation:
29085
83.0k
  // fold: (shuffle (bitcast (BINOP A, B)), Undef, <Mask>) ->
29086
83.0k
  //       (shuffle (BINOP (bitcast A), (bitcast B)), Undef, <Mask>)
29087
83.0k
  //
29088
83.0k
  // We do this only if both the bitcast and the BINOP dag nodes have
29089
83.0k
  // one use. Also, perform this transformation only if the new binary
29090
83.0k
  // operation is legal. This is to avoid introducing dag nodes that
29091
83.0k
  // potentially need to be further expanded (or custom lowered) into a
29092
83.0k
  // less optimal sequence of dag nodes.
29093
83.0k
  if (!DCI.isBeforeLegalize() && DCI.isBeforeLegalizeOps() &&
29094
7.54k
      N->getOpcode() == ISD::VECTOR_SHUFFLE &&
29095
7.53k
      N->getOperand(0).getOpcode() == ISD::BITCAST &&
29096
83.0k
      N->getOperand(1).isUndef() && N->getOperand(0).hasOneUse()) {
29097
1.31k
    SDValue N0 = N->getOperand(0);
29098
1.31k
    SDValue N1 = N->getOperand(1);
29099
1.31k
29100
1.31k
    SDValue BC0 = N0.getOperand(0);
29101
1.31k
    EVT SVT = BC0.getValueType();
29102
1.31k
    unsigned Opcode = BC0.getOpcode();
29103
1.31k
    unsigned NumElts = VT.getVectorNumElements();
29104
1.31k
29105
1.31k
    if (BC0.hasOneUse() && SVT.isVector() &&
29106
1.23k
        SVT.getVectorNumElements() * 2 == NumElts &&
29107
1.31k
        TLI.isOperationLegal(Opcode, VT)) {
29108
221
      bool CanFold = false;
29109
221
      switch (Opcode) {
29110
181
      default : break;
29111
39
      case ISD::ADD:
29112
39
      case ISD::SUB:
29113
39
      case ISD::MUL:
29114
39
        // isOperationLegal lies for integer ops on floating point types.
29115
39
        CanFold = VT.isInteger();
29116
39
        break;
29117
1
      case ISD::FADD:
29118
1
      case ISD::FSUB:
29119
1
      case ISD::FMUL:
29120
1
        // isOperationLegal lies for floating point ops on integer types.
29121
1
        CanFold = VT.isFloatingPoint();
29122
1
        break;
29123
221
      }
29124
221
29125
221
      unsigned SVTNumElts = SVT.getVectorNumElements();
29126
221
      ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
29127
395
      for (unsigned i = 0, e = SVTNumElts; 
i != e && 395
CanFold356
;
++i174
)
29128
174
        CanFold = SVOp->getMaskElt(i) == (int)(i * 2);
29129
395
      for (unsigned i = SVTNumElts, e = NumElts; 
i != e && 395
CanFold356
;
++i174
)
29130
174
        CanFold = SVOp->getMaskElt(i) < 0;
29131
221
29132
221
      if (CanFold) {
29133
39
        SDValue BC00 = DAG.getBitcast(VT, BC0.getOperand(0));
29134
39
        SDValue BC01 = DAG.getBitcast(VT, BC0.getOperand(1));
29135
39
        SDValue NewBinOp = DAG.getNode(BC0.getOpcode(), dl, VT, BC00, BC01);
29136
39
        return DAG.getVectorShuffle(VT, dl, NewBinOp, N1, SVOp->getMask());
29137
39
      }
29138
82.9k
    }
29139
1.31k
  }
29140
82.9k
29141
82.9k
  // Combine a vector_shuffle that is equal to build_vector load1, load2, load3,
29142
82.9k
  // load4, <0, 1, 2, 3> into a 128-bit load if the load addresses are
29143
82.9k
  // consecutive, non-overlapping, and in the right order.
29144
82.9k
  SmallVector<SDValue, 16> Elts;
29145
113k
  for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i) {
29146
108k
    if (SDValue Elt = getShuffleScalarElt(N, i, DAG, 0)) {
29147
30.1k
      Elts.push_back(Elt);
29148
30.1k
      continue;
29149
30.1k
    }
29150
78.0k
    Elts.clear();
29151
78.0k
    break;
29152
78.0k
  }
29153
82.9k
29154
82.9k
  if (Elts.size() == VT.getVectorNumElements())
29155
4.89k
    if (SDValue LD =
29156
4.89k
            EltsFromConsecutiveLoads(VT, Elts, dl, DAG, Subtarget, true))
29157
448
      return LD;
29158
82.5k
29159
82.5k
  // For AVX2, we sometimes want to combine
29160
82.5k
  // (vector_shuffle <mask> (concat_vectors t1, undef)
29161
82.5k
  //                        (concat_vectors t2, undef))
29162
82.5k
  // Into:
29163
82.5k
  // (vector_shuffle <mask> (concat_vectors t1, t2), undef)
29164
82.5k
  // Since the latter can be efficiently lowered with VPERMD/VPERMQ
29165
82.5k
  if (SDValue ShufConcat = combineShuffleOfConcatUndef(N, DAG, Subtarget))
29166
7
    return ShufConcat;
29167
82.5k
29168
82.5k
  if (isTargetShuffle(N->getOpcode())) {
29169
50.7k
    SDValue Op(N, 0);
29170
50.7k
    if (SDValue Shuffle = combineTargetShuffle(Op, DAG, DCI, Subtarget))
29171
825
      return Shuffle;
29172
49.8k
29173
49.8k
    // Try recursively combining arbitrary sequences of x86 shuffle
29174
49.8k
    // instructions into higher-order shuffles. We do this after combining
29175
49.8k
    // specific PSHUF instruction sequences into their minimal form so that we
29176
49.8k
    // can evaluate how many specialized shuffle instructions are involved in
29177
49.8k
    // a particular chain.
29178
49.8k
    
if (49.8k
combineX86ShufflesRecursively({Op}, 0, Op, {0}, {}, /*Depth*/ 1,
29179
49.8k
                                      /*HasVarMask*/ false, DAG, DCI,
29180
49.8k
                                      Subtarget))
29181
3.55k
      return SDValue(); // This routine will use CombineTo to replace N.
29182
78.1k
  }
29183
78.1k
29184
78.1k
  return SDValue();
29185
78.1k
}
29186
29187
/// Check if a vector extract from a target-specific shuffle of a load can be
29188
/// folded into a single element load.
29189
/// Similar handling for VECTOR_SHUFFLE is performed by DAGCombiner, but
29190
/// shuffles have been custom lowered so we need to handle those here.
29191
static SDValue XFormVExtractWithShuffleIntoLoad(SDNode *N, SelectionDAG &DAG,
29192
48.3k
                                         TargetLowering::DAGCombinerInfo &DCI) {
29193
48.3k
  if (DCI.isBeforeLegalizeOps())
29194
25.9k
    return SDValue();
29195
22.4k
29196
22.4k
  SDValue InVec = N->getOperand(0);
29197
22.4k
  SDValue EltNo = N->getOperand(1);
29198
22.4k
  EVT EltVT = N->getValueType(0);
29199
22.4k
29200
22.4k
  if (!isa<ConstantSDNode>(EltNo))
29201
16
    return SDValue();
29202
22.4k
29203
22.4k
  EVT OriginalVT = InVec.getValueType();
29204
22.4k
29205
22.4k
  // Peek through bitcasts, don't duplicate a load with other uses.
29206
22.4k
  InVec = peekThroughOneUseBitcasts(InVec);
29207
22.4k
29208
22.4k
  EVT CurrentVT = InVec.getValueType();
29209
22.4k
  if (!CurrentVT.isVector() ||
29210
21.8k
      CurrentVT.getVectorNumElements() != OriginalVT.getVectorNumElements())
29211
4.29k
    return SDValue();
29212
18.1k
29213
18.1k
  if (!isTargetShuffle(InVec.getOpcode()))
29214
16.7k
    return SDValue();
29215
1.40k
29216
1.40k
  // Don't duplicate a load with other uses.
29217
1.40k
  
if (1.40k
!InVec.hasOneUse()1.40k
)
29218
111
    return SDValue();
29219
1.28k
29220
1.28k
  SmallVector<int, 16> ShuffleMask;
29221
1.28k
  SmallVector<SDValue, 2> ShuffleOps;
29222
1.28k
  bool UnaryShuffle;
29223
1.28k
  if (!getTargetShuffleMask(InVec.getNode(), CurrentVT.getSimpleVT(), true,
29224
1.28k
                            ShuffleOps, ShuffleMask, UnaryShuffle))
29225
4
    return SDValue();
29226
1.28k
29227
1.28k
  // Select the input vector, guarding against out of range extract vector.
29228
1.28k
  unsigned NumElems = CurrentVT.getVectorNumElements();
29229
1.28k
  int Elt = cast<ConstantSDNode>(EltNo)->getZExtValue();
29230
1.28k
  int Idx = (Elt > (int)NumElems) ? SM_SentinelUndef : ShuffleMask[Elt];
29231
1.28k
29232
1.28k
  if (Idx == SM_SentinelZero)
29233
3
    return EltVT.isInteger() ? DAG.getConstant(0, SDLoc(N), EltVT)
29234
3
                             : DAG.getConstantFP(+0.0, SDLoc(N), EltVT);
29235
1.28k
  
if (1.28k
Idx == SM_SentinelUndef1.28k
)
29236
0
    return DAG.getUNDEF(EltVT);
29237
1.28k
29238
1.28k
  assert(0 <= Idx && Idx < (int)(2 * NumElems) && "Shuffle index out of range");
29239
1.27k
  SDValue LdNode = (Idx < (int)NumElems) ? ShuffleOps[0]
29240
5
                                         : ShuffleOps[1];
29241
1.28k
29242
1.28k
  // If inputs to shuffle are the same for both ops, then allow 2 uses
29243
1.28k
  unsigned AllowedUses =
29244
1.28k
      (ShuffleOps.size() > 1 && ShuffleOps[0] == ShuffleOps[1]) ? 2 : 1;
29245
1.28k
29246
1.28k
  if (
LdNode.getOpcode() == ISD::BITCAST1.28k
) {
29247
79
    // Don't duplicate a load with other uses.
29248
79
    if (!LdNode.getNode()->hasNUsesOfValue(AllowedUses, 0))
29249
46
      return SDValue();
29250
33
29251
33
    AllowedUses = 1; // only allow 1 load use if we have a bitcast
29252
33
    LdNode = LdNode.getOperand(0);
29253
33
  }
29254
1.28k
29255
1.23k
  
if (1.23k
!ISD::isNormalLoad(LdNode.getNode())1.23k
)
29256
1.19k
    return SDValue();
29257
37
29258
37
  LoadSDNode *LN0 = cast<LoadSDNode>(LdNode);
29259
37
29260
37
  if (!LN0 || !LN0->hasNUsesOfValue(AllowedUses, 0) || LN0->isVolatile())
29261
26
    return SDValue();
29262
11
29263
11
  // If there's a bitcast before the shuffle, check if the load type and
29264
11
  // alignment is valid.
29265
11
  unsigned Align = LN0->getAlignment();
29266
11
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
29267
11
  unsigned NewAlign = DAG.getDataLayout().getABITypeAlignment(
29268
11
      EltVT.getTypeForEVT(*DAG.getContext()));
29269
11
29270
11
  if (
NewAlign > Align || 11
!TLI.isOperationLegalOrCustom(ISD::LOAD, EltVT)11
)
29271
0
    return SDValue();
29272
11
29273
11
  // All checks match so transform back to vector_shuffle so that DAG combiner
29274
11
  // can finish the job
29275
11
  SDLoc dl(N);
29276
11
29277
11
  // Create shuffle node taking into account the case that its a unary shuffle
29278
11
  SDValue Shuffle = (UnaryShuffle) ? DAG.getUNDEF(CurrentVT) : ShuffleOps[1];
29279
48.3k
  Shuffle = DAG.getVectorShuffle(CurrentVT, dl, ShuffleOps[0], Shuffle,
29280
48.3k
                                 ShuffleMask);
29281
48.3k
  Shuffle = DAG.getBitcast(OriginalVT, Shuffle);
29282
48.3k
  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, N->getValueType(0), Shuffle,
29283
48.3k
                     EltNo);
29284
48.3k
}
29285
29286
// Try to match patterns such as
29287
// (i16 bitcast (v16i1 x))
29288
// ->
29289
// (i16 movmsk (16i8 sext (v16i1 x)))
29290
// before the illegal vector is scalarized on subtargets that don't have legal
29291
// vxi1 types.
29292
static SDValue combineBitcastvxi1(SelectionDAG &DAG, SDValue BitCast,
29293
19.4k
                                  const X86Subtarget &Subtarget) {
29294
19.4k
  EVT VT = BitCast.getValueType();
29295
19.4k
  SDValue N0 = BitCast.getOperand(0);
29296
19.4k
  EVT VecVT = N0->getValueType(0);
29297
19.4k
29298
19.4k
  if (!VT.isScalarInteger() || !VecVT.isSimple())
29299
14.7k
    return SDValue();
29300
4.72k
29301
4.72k
  // With AVX512 vxi1 types are legal and we prefer using k-regs.
29302
4.72k
  // MOVMSK is supported in SSE2 or later.
29303
4.72k
  if (Subtarget.hasAVX512() || !Subtarget.hasSSE2())
29304
3.41k
    return SDValue();
29305
1.30k
29306
1.30k
  // There are MOVMSK flavors for types v16i8, v32i8, v4f32, v8f32, v4f64 and
29307
1.30k
  // v8f64. So all legal 128-bit and 256-bit vectors are covered except for
29308
1.30k
  // v8i16 and v16i16.
29309
1.30k
  // For these two cases, we can shuffle the upper element bytes to a
29310
1.30k
  // consecutive sequence at the start of the vector and treat the results as
29311
1.30k
  // v16i8 or v32i8, and for v16i8 this is the preferable solution. However,
29312
1.30k
  // for v16i16 this is not the case, because the shuffle is expensive, so we
29313
1.30k
  // avoid sign-extending to this type entirely.
29314
1.30k
  // For example, t0 := (v8i16 sext(v8i1 x)) needs to be shuffled as:
29315
1.30k
  // (v16i8 shuffle <0,2,4,6,8,10,12,14,u,u,...,u> (v16i8 bitcast t0), undef)
29316
1.30k
  MVT SExtVT;
29317
1.30k
  MVT FPCastVT = MVT::INVALID_SIMPLE_VALUE_TYPE;
29318
1.30k
  switch (VecVT.getSimpleVT().SimpleTy) {
29319
1.13k
  default:
29320
1.13k
    return SDValue();
29321
40
  case MVT::v2i1:
29322
40
    SExtVT = MVT::v2i64;
29323
40
    FPCastVT = MVT::v2f64;
29324
40
    break;
29325
49
  case MVT::v4i1:
29326
49
    SExtVT = MVT::v4i32;
29327
49
    FPCastVT = MVT::v4f32;
29328
49
    // For cases such as (i4 bitcast (v4i1 setcc v4i64 v1, v2))
29329
49
    // sign-extend to a 256-bit operation to avoid truncation.
29330
49
    if (N0->getOpcode() == ISD::SETCC &&
29331
24
        N0->getOperand(0)->getValueType(0).is256BitVector() &&
29332
49
        Subtarget.hasAVX()) {
29333
4
      SExtVT = MVT::v4i64;
29334
4
      FPCastVT = MVT::v4f64;
29335
4
    }
29336
49
    break;
29337
45
  case MVT::v8i1:
29338
45
    SExtVT = MVT::v8i16;
29339
45
    // For cases such as (i8 bitcast (v8i1 setcc v8i32 v1, v2)),
29340
45
    // sign-extend to a 256-bit operation to match the compare.
29341
45
    // If the setcc operand is 128-bit, prefer sign-extending to 128-bit over
29342
45
    // 256-bit because the shuffle is cheaper than sign extending the result of
29343
45
    // the compare.
29344
45
    if (N0->getOpcode() == ISD::SETCC &&
29345
22
        N0->getOperand(0)->getValueType(0).is256BitVector() &&
29346
45
        Subtarget.hasAVX()) {
29347
4
      SExtVT = MVT::v8i32;
29348
4
      FPCastVT = MVT::v8f32;
29349
4
    }
29350
45
    break;
29351
29
  case MVT::v16i1:
29352
29
    SExtVT = MVT::v16i8;
29353
29
    // For the case (i16 bitcast (v16i1 setcc v16i16 v1, v2)),
29354
29
    // it is not profitable to sign-extend to 256-bit because this will
29355
29
    // require an extra cross-lane shuffle which is more expensive than
29356
29
    // truncating the result of the compare to 128-bits.
29357
29
    break;
29358
14
  case MVT::v32i1:
29359
14
    SExtVT = MVT::v32i8;
29360
14
    break;
29361
177
  };
29362
177
29363
177
  SDLoc DL(BitCast);
29364
177
  SDValue V = DAG.getSExtOrTrunc(N0, DL, SExtVT);
29365
177
29366
177
  if (SExtVT == MVT::v32i8 && !Subtarget.hasInt256()) {
29367
10
    // Handle pre-AVX2 cases by splitting to two v16i1's.
29368
10
    const TargetLowering &TLI = DAG.getTargetLoweringInfo();
29369
10
    MVT ShiftTy = TLI.getScalarShiftAmountTy(DAG.getDataLayout(), MVT::i32);
29370
10
    SDValue Lo = extract128BitVector(V, 0, DAG, DL);
29371
10
    SDValue Hi = extract128BitVector(V, 16, DAG, DL);
29372
10
    Lo = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Lo);
29373
10
    Hi = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Hi);
29374
10
    Hi = DAG.getNode(ISD::SHL, DL, MVT::i32, Hi,
29375
10
                     DAG.getConstant(16, DL, ShiftTy));
29376
10
    V = DAG.getNode(ISD::OR, DL, MVT::i32, Lo, Hi);
29377
10
    return DAG.getZExtOrTrunc(V, DL, VT);
29378
10
  }
29379
167
29380
167
  if (SExtVT == MVT::v8i16) {
29381
41
    V = DAG.getBitcast(MVT::v16i8, V);
29382
41
    V = DAG.getVectorShuffle(
29383
41
        MVT::v16i8, DL, V, DAG.getUNDEF(MVT::v16i8),
29384
41
        {0, 2, 4, 6, 8, 10, 12, 14, -1, -1, -1, -1, -1, -1, -1, -1});
29385
41
  } else
29386
167
    assert(SExtVT.getScalarType() != MVT::i16 &&
29387
167
           "Vectors of i16 must be shuffled");
29388
167
  if (FPCastVT != MVT::INVALID_SIMPLE_VALUE_TYPE)
29389
93
    V = DAG.getBitcast(FPCastVT, V);
29390
19.4k
  V = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, V);
29391
19.4k
  return DAG.getZExtOrTrunc(V, DL, VT);
29392
19.4k
}
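A standalone C++ sketch of why the sign-extend plus MOVMSK pair can stand in for the (iN bitcast (vNi1 x)) pattern (illustrative only; the helper name is made up):

// Standalone sketch (not LLVM code): sign-extending each i1 lane makes its
// top bit equal to the boolean, and MOVMSK collects exactly those top bits
// into a scalar mask.
#include <array>
#include <cstdint>
#include <cstdio>

static uint32_t movmskModel(const std::array<int8_t, 16> &Lanes) {
  uint32_t Mask = 0;
  for (int I = 0; I != 16; ++I)
    if (Lanes[I] < 0)            // sign bit set
      Mask |= 1u << I;
  return Mask;
}

int main() {
  std::array<int8_t, 16> SExt{};  // v16i8 sext of a v16i1 value
  bool Bits[16] = {true, false, true, true};
  for (int I = 0; I != 16; ++I)
    SExt[I] = Bits[I] ? -1 : 0;   // sext(i1 1) == 0xFF, sext(i1 0) == 0x00
  std::printf("0x%x\n", movmskModel(SExt)); // 0xd == 0b1101
  return 0;
}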
29393
29394
static SDValue combineBitcast(SDNode *N, SelectionDAG &DAG,
29395
                              TargetLowering::DAGCombinerInfo &DCI,
29396
183k
                              const X86Subtarget &Subtarget) {
29397
183k
  SDValue N0 = N->getOperand(0);
29398
183k
  EVT VT = N->getValueType(0);
29399
183k
  EVT SrcVT = N0.getValueType();
29400
183k
29401
183k
  // Try to match patterns such as
29402
183k
  // (i16 bitcast (v16i1 x))
29403
183k
  // ->
29404
183k
  // (i16 movmsk (16i8 sext (v16i1 x)))
29405
183k
  // before the setcc result is scalarized on subtargets that don't have legal
29406
183k
  // vxi1 types.
29407
183k
  if (DCI.isBeforeLegalize())
29408
19.4k
    
if (SDValue 19.4k
V19.4k
= combineBitcastvxi1(DAG, SDValue(N, 0), Subtarget))
29409
177
      return V;
29410
183k
  // Since MMX types are special and don't usually play with other vector types,
29411
183k
  // it's better to handle them early to be sure we emit efficient code by
29412
183k
  // avoiding store-load conversions.
29413
183k
29414
183k
  // Detect bitcasts between i32 to x86mmx low word.
29415
183k
  if (VT == MVT::x86mmx && N0.getOpcode() == ISD::BUILD_VECTOR &&
29416
183k
      SrcVT == MVT::v2i32 && isNullConstant(N0.getOperand(1))) {
29417
29
    SDValue N00 = N0->getOperand(0);
29418
29
    if (N00.getValueType() == MVT::i32)
29419
29
      return DAG.getNode(X86ISD::MMX_MOVW2D, SDLoc(N00), VT, N00);
29420
183k
  }
29421
183k
29422
183k
  // Detect bitcasts between element or subvector extraction to x86mmx.
29423
183k
  if (VT == MVT::x86mmx &&
29424
1.43k
      (N0.getOpcode() == ISD::EXTRACT_VECTOR_ELT ||
29425
1.43k
       N0.getOpcode() == ISD::EXTRACT_SUBVECTOR) &&
29426
183k
      isNullConstant(N0.getOperand(1))) {
29427
12
    SDValue N00 = N0->getOperand(0);
29428
12
    if (N00.getValueType().is128BitVector())
29429
12
      return DAG.getNode(X86ISD::MOVDQ2Q, SDLoc(N00), VT,
29430
12
                         DAG.getBitcast(MVT::v2i64, N00));
29431
183k
  }
29432
183k
29433
183k
  // Detect bitcasts from FP_TO_SINT to x86mmx.
29434
183k
  if (VT == MVT::x86mmx && SrcVT == MVT::v2i32 &&
29435
183k
      N0.getOpcode() == ISD::FP_TO_SINT) {
29436
2
    SDLoc DL(N0);
29437
2
    SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4i32, N0,
29438
2
                              DAG.getUNDEF(MVT::v2i32));
29439
2
    return DAG.getNode(X86ISD::MOVDQ2Q, DL, VT,
29440
2
                       DAG.getBitcast(MVT::v2i64, Res));
29441
2
  }
29442
183k
29443
183k
  // Convert a bitcasted integer logic operation that has one bitcasted
29444
183k
  // floating-point operand into a floating-point logic operation. This may
29445
183k
  // create a load of a constant, but that is cheaper than materializing the
29446
183k
  // constant in an integer register and transferring it to an SSE register or
29447
183k
  // transferring the SSE operand to integer register and back.
29448
183k
  unsigned FPOpcode;
29449
183k
  switch (N0.getOpcode()) {
29450
13.6k
    case ISD::AND: FPOpcode = X86ISD::FAND; break;
29451
5.63k
    case ISD::OR:  FPOpcode = X86ISD::FOR;  break;
29452
4.57k
    case ISD::XOR: FPOpcode = X86ISD::FXOR; break;
29453
159k
    default: return SDValue();
29454
23.8k
  }
29455
23.8k
29456
23.8k
  if (!((Subtarget.hasSSE1() && VT == MVT::f32) ||
29457
23.4k
        (Subtarget.hasSSE2() && VT == MVT::f64)))
29458
23.2k
    return SDValue();
29459
682
29460
682
  SDValue LogicOp0 = N0.getOperand(0);
29461
682
  SDValue LogicOp1 = N0.getOperand(1);
29462
682
  SDLoc DL0(N0);
29463
682
29464
682
  // bitcast(logic(bitcast(X), Y)) --> logic'(X, bitcast(Y))
29465
682
  if (N0.hasOneUse() && LogicOp0.getOpcode() == ISD::BITCAST &&
29466
682
      LogicOp0.hasOneUse() && LogicOp0.getOperand(0).getValueType() == VT &&
29467
682
      !isa<ConstantSDNode>(LogicOp0.getOperand(0))) {
29468
35
    SDValue CastedOp1 = DAG.getBitcast(VT, LogicOp1);
29469
35
    return DAG.getNode(FPOpcode, DL0, VT, LogicOp0.getOperand(0), CastedOp1);
29470
35
  }
29471
647
  // bitcast(logic(X, bitcast(Y))) --> logic'(bitcast(X), Y)
29472
647
  if (N0.hasOneUse() && LogicOp1.getOpcode() == ISD::BITCAST &&
29473
647
      LogicOp1.hasOneUse() && LogicOp1.getOperand(0).getValueType() == VT &&
29474
647
      !isa<ConstantSDNode>(LogicOp1.getOperand(0))) {
29475
1
    SDValue CastedOp0 = DAG.getBitcast(VT, LogicOp0);
29476
1
    return DAG.getNode(FPOpcode, DL0, VT, LogicOp1.getOperand(0), CastedOp0);
29477
1
  }
29478
646
29479
646
  return SDValue();
29480
646
}
29481
29482
// Match a binop + shuffle pyramid that represents a horizontal reduction over
29483
// the elements of a vector.
29484
// Returns the vector that is being reduced on, or SDValue() if a reduction
29485
// was not matched.
29486
93.5k
static SDValue matchBinOpReduction(SDNode *Extract, ISD::NodeType BinOp) {
29487
93.5k
  // The pattern must end in an extract from index 0.
29488
93.5k
  if ((Extract->getOpcode() != ISD::EXTRACT_VECTOR_ELT) ||
29489
93.5k
      !isNullConstant(Extract->getOperand(1)))
29490
64.1k
    return SDValue();
29491
29.4k
29492
29.4k
  unsigned Stages =
29493
29.4k
      Log2_32(Extract->getOperand(0).getValueType().getVectorNumElements());
29494
29.4k
29495
29.4k
  SDValue Op = Extract->getOperand(0);
29496
29.4k
  // At each stage, we're looking for something that looks like:
29497
29.4k
  // %s = shufflevector <8 x i32> %op, <8 x i32> undef,
29498
29.4k
  //                    <8 x i32> <i32 2, i32 3, i32 undef, i32 undef,
29499
29.4k
  //                               i32 undef, i32 undef, i32 undef, i32 undef>
29500
29.4k
  // %a = binop <8 x i32> %op, %s
29501
29.4k
  // Where the mask changes according to the stage. E.g. for a 3-stage pyramid,
29502
29.4k
  // we expect something like:
29503
29.4k
  // <4,5,6,7,u,u,u,u>
29504
29.4k
  // <2,3,u,u,u,u,u,u>
29505
29.4k
  // <1,u,u,u,u,u,u,u>
29506
30.1k
  for (unsigned i = 0; i < Stages; ++i) {
29507
29.7k
    if (Op.getOpcode() != BinOp)
29508
28.5k
      return SDValue();
29509
1.22k
29510
1.22k
    ShuffleVectorSDNode *Shuffle =
29511
1.22k
        dyn_cast<ShuffleVectorSDNode>(Op.getOperand(0).getNode());
29512
1.22k
    if (
Shuffle1.22k
) {
29513
17
      Op = Op.getOperand(1);
29514
1.22k
    } else {
29515
1.21k
      Shuffle = dyn_cast<ShuffleVectorSDNode>(Op.getOperand(1).getNode());
29516
1.21k
      Op = Op.getOperand(0);
29517
1.21k
    }
29518
1.22k
29519
1.22k
    // The first operand of the shuffle should be the same as the other operand
29520
1.22k
    // of the add.
29521
1.22k
    if (!Shuffle || (Shuffle->getOperand(0) != Op))
29522
522
      return SDValue();
29523
705
29524
705
    // Verify the shuffle has the expected (at this stage of the pyramid) mask.
29525
2.56k
    for (int Index = 0, MaskEnd = 1 << i; Index < MaskEnd; ++Index)
29526
1.86k
      if (Shuffle->getMaskElt(Index) != MaskEnd + Index)
29527
0
        return SDValue();
29528
29.7k
  }
29529
29.4k
29530
357
  return Op;
29531
93.5k
}
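A standalone C++ sketch of the shuffle-plus-binop pyramid this matcher recognises (illustrative only; the helper name is made up):

// Standalone sketch (not LLVM code): log2(N) stages, where each stage folds
// the upper half of the live elements onto the lower half, leaving the
// reduction result in lane 0 for the final extractelement.
#include <cstdio>
#include <vector>

static int reducePyramid(std::vector<int> V) {
  for (size_t Half = V.size() / 2; Half >= 1; Half /= 2)
    for (size_t I = 0; I != Half; ++I)
      V[I] += V[I + Half];        // binop(op, shuffle<Half+I, ...>(op))
  return V[0];                    // extractelement ..., i32 0
}

int main() {
  std::vector<int> V{1, 2, 3, 4, 5, 6, 7, 8};
  std::printf("%d\n", reducePyramid(V)); // 36
  return 0;
}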
29532
29533
// Given a select, detect the following pattern:
29534
// 1:    %2 = zext <N x i8> %0 to <N x i32>
29535
// 2:    %3 = zext <N x i8> %1 to <N x i32>
29536
// 3:    %4 = sub nsw <N x i32> %2, %3
29537
// 4:    %5 = icmp sgt <N x i32> %4, [0 x N] or [-1 x N]
29538
// 5:    %6 = sub nsw <N x i32> zeroinitializer, %4
29539
// 6:    %7 = select <N x i1> %5, <N x i32> %4, <N x i32> %6
29540
// This is useful as it is the input into a SAD pattern.
29541
static bool detectZextAbsDiff(const SDValue &Select, SDValue &Op0,
29542
48
                              SDValue &Op1) {
29543
48
  // Check the condition of the select instruction is greater-than.
29544
48
  SDValue SetCC = Select->getOperand(0);
29545
48
  if (SetCC.getOpcode() != ISD::SETCC)
29546
0
    return false;
29547
48
  ISD::CondCode CC = cast<CondCodeSDNode>(SetCC.getOperand(2))->get();
29548
48
  if (CC != ISD::SETGT && CC != ISD::SETLT)
29549
0
    return false;
29550
48
29551
48
  SDValue SelectOp1 = Select->getOperand(1);
29552
48
  SDValue SelectOp2 = Select->getOperand(2);
29553
48
29554
48
  // The following instructions assume SelectOp1 is the subtraction operand
29555
48
  // and SelectOp2 is the negation operand.
29556
48
  // In the case of SETLT this is the other way around.
29557
48
  if (CC == ISD::SETLT)
29558
15
    std::swap(SelectOp1, SelectOp2);
29559
48
29560
48
  // The second operand of the select should be the negation of the first
29561
48
  // operand, which is implemented as 0 - SelectOp1.
29562
48
  if (!(SelectOp2.getOpcode() == ISD::SUB &&
29563
48
        ISD::isBuildVectorAllZeros(SelectOp2.getOperand(0).getNode()) &&
29564
48
        SelectOp2.getOperand(1) == SelectOp1))
29565
0
    return false;
29566
48
29567
48
  // The first operand of SetCC is the first operand of the select, which is the
29568
48
  // difference between the two input vectors.
29569
48
  
if (48
SetCC.getOperand(0) != SelectOp148
)
29570
0
    return false;
29571
48
29572
48
  // In SetLT case, The second operand of the comparison can be either 1 or 0.
29573
48
  APInt SplatVal;
29574
48
  if ((CC == ISD::SETLT) &&
29575
15
      !((ISD::isConstantSplatVector(SetCC.getOperand(1).getNode(), SplatVal) &&
29576
15
         SplatVal.isOneValue()) ||
29577
12
        (ISD::isBuildVectorAllZeros(SetCC.getOperand(1).getNode()))))
29578
0
    return false;
29579
48
29580
48
  // In SetGT case, The second operand of the comparison can be either -1 or 0.
29581
48
  
if (48
(CC == ISD::SETGT) &&
29582
33
      !(ISD::isBuildVectorAllZeros(SetCC.getOperand(1).getNode()) ||
29583
30
        ISD::isBuildVectorAllOnes(SetCC.getOperand(1).getNode())))
29584
0
    return false;
29585
48
29586
48
  // The first operand of the select is the difference between the two input
29587
48
  // vectors.
29588
48
  
if (48
SelectOp1.getOpcode() != ISD::SUB48
)
29589
0
    return false;
29590
48
29591
48
  Op0 = SelectOp1.getOperand(0);
29592
48
  Op1 = SelectOp1.getOperand(1);
29593
48
29594
48
  // Check if the operands of the sub are zero-extended from vectors of i8.
29595
48
  if (Op0.getOpcode() != ISD::ZERO_EXTEND ||
29596
48
      Op0.getOperand(0).getValueType().getVectorElementType() != MVT::i8 ||
29597
48
      Op1.getOpcode() != ISD::ZERO_EXTEND ||
29598
48
      Op1.getOperand(0).getValueType().getVectorElementType() != MVT::i8)
29599
0
    return false;
29600
48
29601
48
  return true;
29602
48
}
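A standalone C++ sketch of the per-element pattern detected here (illustrative only; the helper name is made up):

// Standalone sketch (not LLVM code): zero-extend two i8 lanes to i32,
// subtract, and select between the difference and its negation based on the
// sign test, i.e. |a - b|.
#include <cstdint>
#include <cstdio>

static int32_t zextAbsDiff(uint8_t A, uint8_t B) {
  int32_t Diff = int32_t(A) - int32_t(B); // sub nsw of the two zexts
  int32_t Neg = 0 - Diff;                 // sub nsw zeroinitializer, %diff
  return Diff > -1 ? Diff : Neg;          // icmp sgt ..., -1 plus select
}

int main() {
  std::printf("%d %d\n", zextAbsDiff(10, 250), zextAbsDiff(250, 10)); // 240 240
  return 0;
}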
29603
29604
// Given two zexts of <k x i8> to <k x i32>, create a PSADBW of the inputs
29605
// to these zexts.
29606
static SDValue createPSADBW(SelectionDAG &DAG, const SDValue &Zext0,
29607
48
                            const SDValue &Zext1, const SDLoc &DL) {
29608
48
29609
48
  // Find the appropriate width for the PSADBW.
29610
48
  EVT InVT = Zext0.getOperand(0).getValueType();
29611
48
  unsigned RegSize = std::max(128u, InVT.getSizeInBits());
29612
48
29613
48
  // "Zero-extend" the i8 vectors. This is not a per-element zext, rather we
29614
48
  // fill in the missing vector elements with 0.
29615
48
  unsigned NumConcat = RegSize / InVT.getSizeInBits();
29616
48
  SmallVector<SDValue, 16> Ops(NumConcat, DAG.getConstant(0, DL, InVT));
29617
48
  Ops[0] = Zext0.getOperand(0);
29618
48
  MVT ExtendedVT = MVT::getVectorVT(MVT::i8, RegSize / 8);
29619
48
  SDValue SadOp0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, ExtendedVT, Ops);
29620
48
  Ops[0] = Zext1.getOperand(0);
29621
48
  SDValue SadOp1 = DAG.getNode(ISD::CONCAT_VECTORS, DL, ExtendedVT, Ops);
29622
48
29623
48
  // Actually build the SAD
29624
48
  MVT SadVT = MVT::getVectorVT(MVT::i64, RegSize / 64);
29625
48
  return DAG.getNode(X86ISD::PSADBW, DL, SadVT, SadOp0, SadOp1);
29626
48
}
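A standalone C++ sketch of what a 128-bit PSADBW computes, which is the instruction this helper builds (illustrative only; the helper name is made up):

// Standalone sketch (not LLVM code): for each group of eight byte lanes,
// PSADBW sums the absolute differences of the two inputs and widens the sum
// into one 64-bit lane per group.
#include <array>
#include <cstdint>
#include <cstdio>
#include <cstdlib>

static std::array<uint64_t, 2> psadbw128(const std::array<uint8_t, 16> &A,
                                         const std::array<uint8_t, 16> &B) {
  std::array<uint64_t, 2> R{0, 0};
  for (int I = 0; I != 16; ++I)
    R[I / 8] += std::abs(int(A[I]) - int(B[I]));
  return R;
}

int main() {
  std::array<uint8_t, 16> A{}, B{};
  A[0] = 200; B[0] = 10;   // contributes 190 to the low i64 lane
  A[8] = 5;   B[8] = 9;    // contributes 4 to the high i64 lane
  std::array<uint64_t, 2> R = psadbw128(A, B);
  std::printf("%llu %llu\n", (unsigned long long)R[0], (unsigned long long)R[1]);
  return 0;
}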
29627
29628
// Attempt to replace an all_of/any_of style horizontal reduction with a MOVMSK.
29629
static SDValue combineHorizontalPredicateResult(SDNode *Extract,
29630
                                                SelectionDAG &DAG,
29631
47.7k
                                                const X86Subtarget &Subtarget) {
29632
47.7k
  // Bail without SSE2 or with AVX512VL (which uses predicate registers).
29633
47.7k
  if (!Subtarget.hasSSE2() || Subtarget.hasVLX())
29634
2.61k
    return SDValue();
29635
45.0k
29636
45.0k
  EVT ExtractVT = Extract->getValueType(0);
29637
45.0k
  unsigned BitWidth = ExtractVT.getSizeInBits();
29638
45.0k
  if (ExtractVT != MVT::i64 && ExtractVT != MVT::i32 && ExtractVT != MVT::i16 &&
29639
30.6k
      ExtractVT != MVT::i8)
29640
9.11k
    return SDValue();
29641
35.9k
29642
35.9k
  // Check for OR(any_of) and AND(all_of) horizontal reduction patterns.
29643
35.9k
  for (ISD::NodeType Op : {ISD::OR, ISD::AND}) {
29644
71.8k
    SDValue Match = matchBinOpReduction(Extract, Op);
29645
71.8k
    if (!Match)
29646
71.5k
      continue;
29647
243
29648
243
    // EXTRACT_VECTOR_ELT can require implicit extension of the vector element
29649
243
    // which we can't support here for now.
29650
243
    
if (243
Match.getScalarValueSizeInBits() != BitWidth243
)
29651
44
      continue;
29652
199
29653
199
    // We require AVX2 for PMOVMSKB for v16i16/v32i8;
29654
199
    unsigned MatchSizeInBits = Match.getValueSizeInBits();
29655
199
    if (!(MatchSizeInBits == 128 ||
29656
57
          (MatchSizeInBits == 256 &&
29657
57
           ((Subtarget.hasAVX() && BitWidth >= 32) || Subtarget.hasAVX2()))))
29658
25
      return SDValue();
29659
174
29660
174
    // Don't bother performing this for 2-element vectors.
29661
174
    
if (174
Match.getValueType().getVectorNumElements() <= 2174
)
29662
56
      return SDValue();
29663
118
29664
118
    // Check that we are extracting a reduction of all sign bits.
29665
118
    
if (118
DAG.ComputeNumSignBits(Match) != BitWidth118
)
29666
36
      return SDValue();
29667
82
29668
82
    // For 32/64 bit comparisons use MOVMSKPS/MOVMSKPD, else PMOVMSKB.
29669
82
    MVT MaskVT;
29670
82
    if (64 == BitWidth || 32 == BitWidth)
29671
44
      MaskVT = MVT::getVectorVT(MVT::getFloatingPointVT(BitWidth),
29672
44
                                MatchSizeInBits / BitWidth);
29673
82
    else
29674
38
      MaskVT = MVT::getVectorVT(MVT::i8, MatchSizeInBits / 8);
29675
82
29676
82
    APInt CompareBits;
29677
82
    ISD::CondCode CondCode;
29678
82
    if (
Op == ISD::OR82
) {
29679
41
      // any_of -> MOVMSK != 0
29680
41
      CompareBits = APInt::getNullValue(32);
29681
41
      CondCode = ISD::CondCode::SETNE;
29682
82
    } else {
29683
41
      // all_of -> MOVMSK == ((1 << NumElts) - 1)
29684
41
      CompareBits = APInt::getLowBitsSet(32, MaskVT.getVectorNumElements());
29685
41
      CondCode = ISD::CondCode::SETEQ;
29686
41
    }
29687
71.8k
29688
71.8k
    // Perform the select as i32/i64 and then truncate to avoid partial register
29689
71.8k
    // stalls.
29690
71.8k
    unsigned ResWidth = std::max(BitWidth, 32u);
29691
71.8k
    EVT ResVT = EVT::getIntegerVT(*DAG.getContext(), ResWidth);
29692
71.8k
    SDLoc DL(Extract);
29693
71.8k
    SDValue Zero = DAG.getConstant(0, DL, ResVT);
29694
71.8k
    SDValue Ones = DAG.getAllOnesConstant(DL, ResVT);
29695
71.8k
    SDValue Res = DAG.getBitcast(MaskVT, Match);
29696
71.8k
    Res = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Res);
29697
71.8k
    Res = DAG.getSelectCC(DL, Res, DAG.getConstant(CompareBits, DL, MVT::i32),
29698
71.8k
                          Ones, Zero, CondCode);
29699
71.8k
    return DAG.getSExtOrTrunc(Res, DL, ExtractVT);
29700
71.8k
  }
29701
35.7k
29702
35.7k
  return SDValue();
29703
35.7k
}
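A standalone C++ sketch of the MOVMSK rewrite above (illustrative only; names are made up):

// Standalone sketch (not LLVM code): once the per-element compare results are
// sign-splatted, OR-reducing them is "movmsk != 0" and AND-reducing them is
// "movmsk == all lanes set".
#include <cstdint>
#include <cstdio>

static bool anyOf(uint32_t Movmsk) { return Movmsk != 0; }
static bool allOf(uint32_t Movmsk, unsigned NumElts) {
  return Movmsk == ((1u << NumElts) - 1);
}

int main() {
  unsigned NumElts = 4;          // e.g. a v4i32 compare lowered via MOVMSKPS
  uint32_t Movmsk = 0xB;         // lanes 0, 1, 3 true; lane 2 false
  std::printf("any=%d all=%d\n", anyOf(Movmsk), allOf(Movmsk, NumElts)); // 1 0
  return 0;
}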
29704
29705
static SDValue combineBasicSADPattern(SDNode *Extract, SelectionDAG &DAG,
29706
47.7k
                                      const X86Subtarget &Subtarget) {
29707
47.7k
  // PSADBW is only supported on SSE2 and up.
29708
47.7k
  if (!Subtarget.hasSSE2())
29709
547
    return SDValue();
29710
47.1k
29711
47.1k
  // Verify the type we're extracting from is any integer type above i16.
29712
47.1k
  EVT VT = Extract->getOperand(0).getValueType();
29713
47.1k
  if (!VT.isSimple() || !(VT.getVectorElementType().getSizeInBits() > 16))
29714
25.4k
    return SDValue();
29715
21.7k
29716
21.7k
  unsigned RegSize = 128;
29717
21.7k
  if (Subtarget.hasBWI())
29718
1.95k
    RegSize = 512;
29719
19.7k
  else 
if (19.7k
Subtarget.hasAVX2()19.7k
)
29720
6.98k
    RegSize = 256;
29721
21.7k
29722
21.7k
  // We handle upto v16i* for SSE2 / v32i* for AVX2 / v64i* for AVX512.
29723
21.7k
  // TODO: We should be able to handle larger vectors by splitting them before
29724
21.7k
  // feeding them into several SADs, and then reducing over those.
29725
21.7k
  if (RegSize / VT.getVectorNumElements() < 8)
29726
5
    return SDValue();
29727
21.7k
29728
21.7k
  // Match shuffle + add pyramid.
29729
21.7k
  SDValue Root = matchBinOpReduction(Extract, ISD::ADD);
29730
21.7k
29731
21.7k
  // The operand is expected to be zero extended from i8
29732
21.7k
  // (verified in detectZextAbsDiff).
29733
21.7k
  // In order to convert to i64 and above, additional any/zero/sign
29734
21.7k
  // extend is expected.
29735
21.7k
  // The zero extend from 32 bit has no mathematical effect on the result.
29736
21.7k
  // Also the sign extend is basically zero extend
29737
21.7k
  // (extends the sign bit which is zero).
29738
21.7k
  // So it is correct to skip the sign/zero extend instruction.
29739
21.7k
  if (
Root && 21.7k
(Root.getOpcode() == ISD::SIGN_EXTEND ||
29740
111
    Root.getOpcode() == ISD::ZERO_EXTEND ||
29741
108
    Root.getOpcode() == ISD::ANY_EXTEND))
29742
6
    Root = Root.getOperand(0);
29743
21.7k
29744
21.7k
  // If there was a match, we want Root to be a select that is the root of an
29745
21.7k
  // abs-diff pattern.
29746
21.7k
  if (!Root || (Root.getOpcode() != ISD::VSELECT))
29747
21.7k
    return SDValue();
29748
36
29749
36
  // Check whether we have an abs-diff pattern feeding into the select.
29750
36
  SDValue Zext0, Zext1;
29751
36
  if (!detectZextAbsDiff(Root, Zext0, Zext1))
29752
0
    return SDValue();
29753
36
29754
36
  // Create the SAD instruction.
29755
36
  SDLoc DL(Extract);
29756
36
  SDValue SAD = createPSADBW(DAG, Zext0, Zext1, DL);
29757
36
29758
36
  // If the original vector was wider than 8 elements, sum over the results
29759
36
  // in the SAD vector.
29760
36
  unsigned Stages = Log2_32(VT.getVectorNumElements());
29761
36
  MVT SadVT = SAD.getSimpleValueType();
29762
36
  if (Stages > 3) {
29763
7
    unsigned SadElems = SadVT.getVectorNumElements();
29764
7
29765
17
    for(unsigned i = Stages - 3; i > 0; --i) {
29766
10
      SmallVector<int, 16> Mask(SadElems, -1);
29767
23
      for(unsigned j = 0, MaskEnd = 1 << (i - 1); j < MaskEnd; ++j)
29768
13
        Mask[j] = MaskEnd + j;
29769
10
29770
10
      SDValue Shuffle =
29771
10
          DAG.getVectorShuffle(SadVT, DL, SAD, DAG.getUNDEF(SadVT), Mask);
29772
10
      SAD = DAG.getNode(ISD::ADD, DL, SadVT, SAD, Shuffle);
29773
10
    }
29774
7
  }
29775
47.7k
29776
47.7k
  MVT Type = Extract->getSimpleValueType(0);
29777
47.7k
  unsigned TypeSizeInBits = Type.getSizeInBits();
29778
47.7k
  // Return the lowest TypeSizeInBits bits.
29779
47.7k
  MVT ResVT = MVT::getVectorVT(Type, SadVT.getSizeInBits() / TypeSizeInBits);
29780
47.7k
  SAD = DAG.getNode(ISD::BITCAST, DL, ResVT, SAD);
29781
47.7k
  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, Type, SAD,
29782
47.7k
                     Extract->getOperand(1));
29783
47.7k
}
29784
29785
// Attempt to peek through a target shuffle and extract the scalar from the
29786
// source.
29787
static SDValue combineExtractWithShuffle(SDNode *N, SelectionDAG &DAG,
29788
                                         TargetLowering::DAGCombinerInfo &DCI,
29789
52.1k
                                         const X86Subtarget &Subtarget) {
29790
52.1k
  if (DCI.isBeforeLegalizeOps())
29791
25.9k
    return SDValue();
29792
26.2k
29793
26.2k
  SDValue Src = N->getOperand(0);
29794
26.2k
  SDValue Idx = N->getOperand(1);
29795
26.2k
29796
26.2k
  EVT VT = N->getValueType(0);
29797
26.2k
  EVT SrcVT = Src.getValueType();
29798
26.2k
  EVT SrcSVT = SrcVT.getVectorElementType();
29799
26.2k
  unsigned NumSrcElts = SrcVT.getVectorNumElements();
29800
26.2k
29801
26.2k
  // Don't attempt this for boolean mask vectors or unknown extraction indices.
29802
26.2k
  if (SrcSVT == MVT::i1 || !isa<ConstantSDNode>(Idx))
29803
4.93k
    return SDValue();
29804
21.3k
29805
21.3k
  // Resolve the target shuffle inputs and mask.
29806
21.3k
  SmallVector<int, 16> Mask;
29807
21.3k
  SmallVector<SDValue, 2> Ops;
29808
21.3k
  if (!resolveTargetShuffleInputs(peekThroughBitcasts(Src), Ops, Mask, DAG))
29809
17.7k
    return SDValue();
29810
3.55k
29811
3.55k
  // Attempt to narrow/widen the shuffle mask to the correct size.
29812
3.55k
  
if (3.55k
Mask.size() != NumSrcElts3.55k
) {
29813
1.71k
    if (
(NumSrcElts % Mask.size()) == 01.71k
) {
29814
688
      SmallVector<int, 16> ScaledMask;
29815
688
      int Scale = NumSrcElts / Mask.size();
29816
688
      scaleShuffleMask<int>(Scale, Mask, ScaledMask);
29817
688
      Mask = std::move(ScaledMask);
29818
1.71k
    } else 
if (1.02k
(Mask.size() % NumSrcElts) == 01.02k
) {
29819
1.02k
      SmallVector<int, 16> WidenedMask;
29820
1.50k
      while (Mask.size() > NumSrcElts &&
29821
1.16k
             canWidenShuffleElements(Mask, WidenedMask))
29822
482
        Mask = std::move(WidenedMask);
29823
1.02k
      // TODO - investigate support for wider shuffle masks with known upper
29824
1.02k
      // undef/zero elements for implicit zero-extension.
29825
1.02k
    }
29826
1.71k
  }
29827
3.55k
29828
3.55k
  // Check if narrowing/widening failed.
29829
3.55k
  if (Mask.size() != NumSrcElts)
29830
678
    return SDValue();
29831
2.87k
29832
2.87k
  int SrcIdx = Mask[N->getConstantOperandVal(1)];
29833
2.87k
  SDLoc dl(N);
29834
2.87k
29835
2.87k
  // If the shuffle source element is undef/zero then we can just accept it.
29836
2.87k
  if (SrcIdx == SM_SentinelUndef)
29837
0
    return DAG.getUNDEF(VT);
29838
2.87k
29839
2.87k
  if (SrcIdx == SM_SentinelZero)
29840
2
    return VT.isFloatingPoint() ? DAG.getConstantFP(0.0, dl, VT)
29841
2
                                : DAG.getConstant(0, dl, VT);
29842
2.87k
29843
2.87k
  SDValue SrcOp = Ops[SrcIdx / Mask.size()];
29844
2.87k
  SrcOp = DAG.getBitcast(SrcVT, SrcOp);
29845
2.87k
  SrcIdx = SrcIdx % Mask.size();
29846
2.87k
29847
2.87k
  // We can only extract other elements from 128-bit vectors and in certain
29848
2.87k
  // circumstances, depending on SSE-level.
29849
2.87k
  // TODO: Investigate using extract_subvector for larger vectors.
29850
2.87k
  // TODO: Investigate float/double extraction if it will be just stored.
29851
2.87k
  if ((SrcVT == MVT::v4i32 || SrcVT == MVT::v2i64) &&
29852
2.87k
      ((SrcIdx == 0 && Subtarget.hasSSE2()) || Subtarget.hasSSE41())) {
29853
146
    assert(SrcSVT == VT && "Unexpected extraction type");
29854
146
    return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, SrcSVT, SrcOp,
29855
146
                       DAG.getIntPtrConstant(SrcIdx, dl));
29856
146
  }
29857
2.72k
29858
2.72k
  if ((SrcVT == MVT::v8i16 && Subtarget.hasSSE2()) ||
29859
2.72k
      (SrcVT == MVT::v16i8 && Subtarget.hasSSE41())) {
29860
970
    assert(VT.getSizeInBits() >= SrcSVT.getSizeInBits() &&
29861
970
           "Unexpected extraction type");
29862
970
    unsigned OpCode = (SrcVT == MVT::v8i16 ? X86ISD::PEXTRW : X86ISD::PEXTRB);
29863
970
    SDValue ExtOp = DAG.getNode(OpCode, dl, MVT::i32, SrcOp,
29864
970
                                DAG.getIntPtrConstant(SrcIdx, dl));
29865
970
    SDValue Assert = DAG.getNode(ISD::AssertZext, dl, MVT::i32, ExtOp,
29866
970
                                 DAG.getValueType(SrcSVT));
29867
970
    return DAG.getZExtOrTrunc(Assert, dl, VT);
29868
970
  }
29869
1.75k
29870
1.75k
  return SDValue();
29871
1.75k
}
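A standalone C++ sketch of the peek-through being exploited here (illustrative only; names are made up):

// Standalone sketch (not LLVM code): extracting lane Idx from a shuffle is
// just an extract of Mask[Idx] from the corresponding shuffle input.
#include <cstdio>
#include <vector>

static int extractThroughShuffle(const std::vector<int> &Op0,
                                 const std::vector<int> &Op1,
                                 const std::vector<int> &Mask, unsigned Idx) {
  int SrcIdx = Mask[Idx];
  unsigned N = Op0.size();
  const std::vector<int> &Src = SrcIdx < int(N) ? Op0 : Op1;
  return Src[SrcIdx % N];
}

int main() {
  std::vector<int> A{10, 11, 12, 13}, B{20, 21, 22, 23};
  std::vector<int> Mask{6, 0, 3, 5}; // indices >= 4 address the second input
  std::printf("%d\n", extractThroughShuffle(A, B, Mask, 0)); // 22
  return 0;
}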
29872
29873
/// Detect vector gather/scatter index generation and convert it from being a
29874
/// bunch of shuffles and extracts into a somewhat faster sequence.
29875
/// For i686, the best sequence is apparently storing the value and loading
29876
/// scalars back, while for x64 we should use 64-bit extracts and shifts.
29877
static SDValue combineExtractVectorElt(SDNode *N, SelectionDAG &DAG,
29878
                                       TargetLowering::DAGCombinerInfo &DCI,
29879
48.3k
                                       const X86Subtarget &Subtarget) {
29880
48.3k
  if (SDValue NewOp = XFormVExtractWithShuffleIntoLoad(N, DAG, DCI))
29881
14
    return NewOp;
29882
48.3k
29883
48.3k
  if (SDValue NewOp = combineExtractWithShuffle(N, DAG, DCI, Subtarget))
29884
276
    return NewOp;
29885
48.0k
29886
48.0k
  SDValue InputVector = N->getOperand(0);
29887
48.0k
  SDValue EltIdx = N->getOperand(1);
29888
48.0k
29889
48.0k
  EVT SrcVT = InputVector.getValueType();
29890
48.0k
  EVT VT = N->getValueType(0);
29891
48.0k
  SDLoc dl(InputVector);
29892
48.0k
29893
48.0k
  // Detect mmx extraction of all bits as a i64. It works better as a bitcast.
29894
48.0k
  if (InputVector.getOpcode() == ISD::BITCAST && InputVector.hasOneUse() &&
29895
48.0k
      VT == MVT::i64 && SrcVT == MVT::v1i64 && isNullConstant(EltIdx)) {
29896
318
    SDValue MMXSrc = InputVector.getOperand(0);
29897
318
29898
318
    // The bitcast source is a direct mmx result.
29899
318
    if (MMXSrc.getValueType() == MVT::x86mmx)
29900
315
      return DAG.getBitcast(VT, InputVector);
29901
47.7k
  }
29902
47.7k
29903
47.7k
  // Detect mmx to i32 conversion through a v2i32 elt extract.
29904
47.7k
  if (InputVector.getOpcode() == ISD::BITCAST && InputVector.hasOneUse() &&
29905
47.7k
      VT == MVT::i32 && SrcVT == MVT::v2i32 && isNullConstant(EltIdx)) {
29906
7
    SDValue MMXSrc = InputVector.getOperand(0);
29907
7
29908
7
    // The bitcast source is a direct mmx result.
29909
7
    if (MMXSrc.getValueType() == MVT::x86mmx)
29910
6
      return DAG.getNode(X86ISD::MMX_MOVD2W, dl, MVT::i32, MMXSrc);
29911
47.7k
  }
29912
47.7k
29913
47.7k
  if (VT == MVT::i1 && InputVector.getOpcode() == ISD::BITCAST &&
29914
16
      isa<ConstantSDNode>(EltIdx) &&
29915
47.7k
      isa<ConstantSDNode>(InputVector.getOperand(0))) {
29916
8
    uint64_t ExtractedElt = N->getConstantOperandVal(1);
29917
8
    uint64_t InputValue = InputVector.getConstantOperandVal(0);
29918
8
    uint64_t Res = (InputValue >> ExtractedElt) & 1;
29919
8
    return DAG.getConstant(Res, dl, MVT::i1);
29920
8
  }
29921
47.7k
29922
47.7k
  // Check whether this extract is the root of a sum of absolute differences
29923
47.7k
  // pattern. This has to be done here because we really want it to happen
29924
47.7k
  // pre-legalization,
29925
47.7k
  if (SDValue SAD = combineBasicSADPattern(N, DAG, Subtarget))
29926
36
    return SAD;
29927
47.7k
29928
47.7k
  // Attempt to replace an all_of/any_of horizontal reduction with a MOVMSK.
29929
47.7k
  if (SDValue Cmp = combineHorizontalPredicateResult(N, DAG, Subtarget))
29930
82
    return Cmp;
29931
47.6k
29932
47.6k
  // Only operate on vectors of 4 elements, where the alternative shuffling
29933
47.6k
  // gets to be more expensive.
29934
47.6k
  if (SrcVT != MVT::v4i32)
29935
42.4k
    return SDValue();
29936
5.21k
29937
5.21k
  // Check whether every use of InputVector is an EXTRACT_VECTOR_ELT with a
29938
5.21k
  // single use which is a sign-extend or zero-extend, and all elements are
29939
5.21k
  // used.
29940
5.21k
  SmallVector<SDNode *, 4> Uses;
29941
5.21k
  unsigned ExtractedElements = 0;
29942
5.21k
  for (SDNode::use_iterator UI = InputVector.getNode()->use_begin(),
29943
5.29k
       UE = InputVector.getNode()->use_end(); UI != UE; ++UI) {
29944
5.26k
    if (UI.getUse().getResNo() != InputVector.getResNo())
29945
27
      return SDValue();
29946
5.23k
29947
5.23k
    SDNode *Extract = *UI;
29948
5.23k
    if (Extract->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
29949
793
      return SDValue();
29950
4.44k
29951
4.44k
    if (Extract->getValueType(0) != MVT::i32)
29952
0
      return SDValue();
29953
4.44k
    
if (4.44k
!Extract->hasOneUse()4.44k
)
29954
95
      return SDValue();
29955
4.34k
    
if (4.34k
Extract->use_begin()->getOpcode() != ISD::SIGN_EXTEND &&
29956
4.32k
        Extract->use_begin()->getOpcode() != ISD::ZERO_EXTEND)
29957
4.27k
      return SDValue();
29958
73
    
if (73
!isa<ConstantSDNode>(Extract->getOperand(1))73
)
29959
0
      return SDValue();
29960
73
29961
73
    // Record which element was extracted.
29962
73
    ExtractedElements |= 1 << Extract->getConstantOperandVal(1);
29963
73
    Uses.push_back(Extract);
29964
73
  }
29965
5.21k
29966
5.21k
  // If not all the elements were used, this may not be worthwhile.
29967
28
  if (ExtractedElements != 15)
29968
13
    return SDValue();
29969
15
29970
15
  // Ok, we've now decided to do the transformation.
29971
15
  // If 64-bit shifts are legal, use the extract-shift sequence,
29972
15
  // otherwise bounce the vector off the cache.
29973
15
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
29974
15
  SDValue Vals[4];
29975
15
29976
15
  if (
TLI.isOperationLegal(ISD::SRA, MVT::i64)15
) {
29977
14
    SDValue Cst = DAG.getBitcast(MVT::v2i64, InputVector);
29978
14
    auto &DL = DAG.getDataLayout();
29979
14
    EVT VecIdxTy = DAG.getTargetLoweringInfo().getVectorIdxTy(DL);
29980
14
    SDValue BottomHalf = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Cst,
29981
14
      DAG.getConstant(0, dl, VecIdxTy));
29982
14
    SDValue TopHalf = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Cst,
29983
14
      DAG.getConstant(1, dl, VecIdxTy));
29984
14
29985
14
    SDValue ShAmt = DAG.getConstant(
29986
14
        32, dl, DAG.getTargetLoweringInfo().getShiftAmountTy(MVT::i64, DL));
29987
14
    Vals[0] = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, BottomHalf);
29988
14
    Vals[1] = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32,
29989
14
      DAG.getNode(ISD::SRA, dl, MVT::i64, BottomHalf, ShAmt));
29990
14
    Vals[2] = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, TopHalf);
29991
14
    Vals[3] = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32,
29992
14
      DAG.getNode(ISD::SRA, dl, MVT::i64, TopHalf, ShAmt));
29993
15
  } else {
29994
1
    // Store the value to a temporary stack slot.
29995
1
    SDValue StackPtr = DAG.CreateStackTemporary(SrcVT);
29996
1
    SDValue Ch = DAG.getStore(DAG.getEntryNode(), dl, InputVector, StackPtr,
29997
1
                              MachinePointerInfo());
29998
1
29999
1
    EVT ElementType = SrcVT.getVectorElementType();
30000
1
    unsigned EltSize = ElementType.getSizeInBits() / 8;
30001
1
30002
1
    // Replace each use (extract) with a load of the appropriate element.
30003
5
    for (unsigned i = 0; i < 4; ++i) {
30004
4
      uint64_t Offset = EltSize * i;
30005
4
      auto PtrVT = TLI.getPointerTy(DAG.getDataLayout());
30006
4
      SDValue OffsetVal = DAG.getConstant(Offset, dl, PtrVT);
30007
4
30008
4
      SDValue ScalarAddr =
30009
4
          DAG.getNode(ISD::ADD, dl, PtrVT, StackPtr, OffsetVal);
30010
4
30011
4
      // Load the scalar.
30012
4
      Vals[i] =
30013
4
          DAG.getLoad(ElementType, dl, Ch, ScalarAddr, MachinePointerInfo());
30014
4
    }
30015
1
  }
30016
15
30017
15
  // Replace the extracts
30018
15
  for (SmallVectorImpl<SDNode *>::iterator UI = Uses.begin(),
30019
75
    UE = Uses.end(); UI != UE; ++UI) {
30020
60
    SDNode *Extract = *UI;
30021
60
30022
60
    uint64_t IdxVal = Extract->getConstantOperandVal(1);
30023
60
    DAG.ReplaceAllUsesOfValueWith(SDValue(Extract, 0), Vals[IdxVal]);
30024
60
  }
30025
48.3k
30026
48.3k
  // The replacement was made in place; don't return anything.
30027
48.3k
  return SDValue();
30028
48.3k
}
30029
30030
// TODO - merge with combineExtractVectorElt once it can handle the implicit
30031
// zero-extension of X86ISD::PINSRW/X86ISD::PINSRB in:
30032
// XFormVExtractWithShuffleIntoLoad, combineHorizontalPredicateResult and
30033
// combineBasicSADPattern.
30034
static SDValue combineExtractVectorElt_SSE(SDNode *N, SelectionDAG &DAG,
30035
                                           TargetLowering::DAGCombinerInfo &DCI,
30036
3.83k
                                           const X86Subtarget &Subtarget) {
30037
3.83k
  return combineExtractWithShuffle(N, DAG, DCI, Subtarget);
30038
3.83k
}
30039
30040
/// If a vector select has an operand that is -1 or 0, try to simplify the
30041
/// select to a bitwise logic operation.
30042
/// TODO: Move to DAGCombiner, possibly using TargetLowering::hasAndNot()?
30043
static SDValue
30044
combineVSelectWithAllOnesOrZeros(SDNode *N, SelectionDAG &DAG,
30045
                                 TargetLowering::DAGCombinerInfo &DCI,
30046
31.5k
                                 const X86Subtarget &Subtarget) {
30047
31.5k
  SDValue Cond = N->getOperand(0);
30048
31.5k
  SDValue LHS = N->getOperand(1);
30049
31.5k
  SDValue RHS = N->getOperand(2);
30050
31.5k
  EVT VT = LHS.getValueType();
30051
31.5k
  EVT CondVT = Cond.getValueType();
30052
31.5k
  SDLoc DL(N);
30053
31.5k
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
30054
31.5k
30055
31.5k
  if (N->getOpcode() != ISD::VSELECT)
30056
9.99k
    return SDValue();
30057
21.5k
30058
31.5k
  assert(CondVT.isVector() && "Vector select expects a vector selector!");
30059
21.5k
30060
21.5k
  bool TValIsAllZeros = ISD::isBuildVectorAllZeros(LHS.getNode());
30061
21.5k
  // Check if the first operand is all zeros and Cond type is vXi1.
30062
21.5k
  // This situation only applies to avx512.
30063
21.5k
  if (TValIsAllZeros && Subtarget.hasAVX512() && Cond.hasOneUse() &&
30064
21.5k
      CondVT.getVectorElementType() == MVT::i1) {
30065
15
    // Invert the cond to not(cond) : xor(op,allones)=not(op)
30066
15
    SDValue CondNew = DAG.getNode(ISD::XOR, DL, CondVT, Cond,
30067
15
                                  DAG.getAllOnesConstant(DL, CondVT));
30068
15
    // Vselect cond, op1, op2 = Vselect not(cond), op2, op1
30069
15
    return DAG.getSelect(DL, VT, CondNew, RHS, LHS);
30070
15
  }
30071
21.5k
30072
21.5k
  // To use the condition operand as a bitwise mask, it must have elements that
30073
21.5k
  // are the same size as the select elements. Ie, the condition operand must
30074
21.5k
  // have already been promoted from the IR select condition type <N x i1>.
30075
21.5k
  // Don't check if the types themselves are equal because that excludes
30076
21.5k
  // vector floating-point selects.
30077
21.5k
  if (CondVT.getScalarSizeInBits() != VT.getScalarSizeInBits())
30078
16.1k
    return SDValue();
30079
5.44k
30080
5.44k
  bool TValIsAllOnes = ISD::isBuildVectorAllOnes(LHS.getNode());
30081
5.44k
  bool FValIsAllZeros = ISD::isBuildVectorAllZeros(RHS.getNode());
30082
5.44k
30083
5.44k
  // Try to invert the condition if true value is not all 1s and false value is
30084
5.44k
  // not all 0s.
30085
5.44k
  if (
!TValIsAllOnes && 5.44k
!FValIsAllZeros5.43k
&&
30086
5.44k
      // Check if the selector will be produced by CMPP*/PCMP*.
30087
5.24k
      Cond.getOpcode() == ISD::SETCC &&
30088
5.44k
      // Check if SETCC has already been promoted.
30089
1.05k
      TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT) ==
30090
5.44k
          CondVT) {
30091
1.05k
    bool FValIsAllOnes = ISD::isBuildVectorAllOnes(RHS.getNode());
30092
1.05k
30093
1.05k
    if (
TValIsAllZeros || 1.05k
FValIsAllOnes1.02k
) {
30094
27
      SDValue CC = Cond.getOperand(2);
30095
27
      ISD::CondCode NewCC =
30096
27
          ISD::getSetCCInverse(cast<CondCodeSDNode>(CC)->get(),
30097
27
                               Cond.getOperand(0).getValueType().isInteger());
30098
27
      Cond = DAG.getSetCC(DL, CondVT, Cond.getOperand(0), Cond.getOperand(1),
30099
27
                          NewCC);
30100
27
      std::swap(LHS, RHS);
30101
27
      TValIsAllOnes = FValIsAllOnes;
30102
27
      FValIsAllZeros = TValIsAllZeros;
30103
27
    }
30104
1.05k
  }
30105
5.44k
30106
5.44k
  // Cond value must be 'sign splat' to be converted to a logical op.
30107
5.44k
  if (DAG.ComputeNumSignBits(Cond) != CondVT.getScalarSizeInBits())
30108
2.78k
    return SDValue();
30109
2.65k
30110
2.65k
  // vselect Cond, 111..., 000... -> Cond
30111
2.65k
  
if (2.65k
TValIsAllOnes && 2.65k
FValIsAllZeros10
)
30112
2
    return DAG.getBitcast(VT, Cond);
30113
2.65k
30114
2.65k
  
if (2.65k
!DCI.isBeforeLegalize() && 2.65k
!TLI.isTypeLegal(CondVT)2.54k
)
30115
2
    return SDValue();
30116
2.65k
30117
2.65k
  // vselect Cond, 111..., X -> or Cond, X
30118
2.65k
  
if (2.65k
TValIsAllOnes2.65k
) {
30119
8
    SDValue CastRHS = DAG.getBitcast(CondVT, RHS);
30120
8
    SDValue Or = DAG.getNode(ISD::OR, DL, CondVT, Cond, CastRHS);
30121
8
    return DAG.getBitcast(VT, Or);
30122
8
  }
30123
2.64k
30124
2.64k
  // vselect Cond, X, 000... -> and Cond, X
30125
2.64k
  
if (2.64k
FValIsAllZeros2.64k
) {
30126
212
    SDValue CastLHS = DAG.getBitcast(CondVT, LHS);
30127
212
    SDValue And = DAG.getNode(ISD::AND, DL, CondVT, Cond, CastLHS);
30128
212
    return DAG.getBitcast(VT, And);
30129
212
  }
30130
2.43k
30131
2.43k
  // vselect Cond, 000..., X -> andn Cond, X
30132
2.43k
  
if (2.43k
TValIsAllZeros2.43k
) {
30133
16
    MVT AndNVT = MVT::getVectorVT(MVT::i64, CondVT.getSizeInBits() / 64);
30134
16
    SDValue CastCond = DAG.getBitcast(AndNVT, Cond);
30135
16
    SDValue CastRHS = DAG.getBitcast(AndNVT, RHS);
30136
16
    SDValue AndN = DAG.getNode(X86ISD::ANDNP, DL, AndNVT, CastCond, CastRHS);
30137
16
    return DAG.getBitcast(VT, AndN);
30138
16
  }
30139
2.41k
30140
2.41k
  return SDValue();
30141
2.41k
}
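
These folds work because the condition has been verified to be a per-lane sign splat (all ones or all zeros), so the vselect collapses into plain bitwise logic. A minimal scalar sketch of the identities, assuming a hypothetical 32-bit lane (not part of the original source):

#include <cstdint>

// Mask is 0xFFFFFFFF when the lane is selected, 0 otherwise (the "sign splat").
static uint32_t vselectLane(uint32_t Mask, uint32_t TVal, uint32_t FVal) {
  // vselect Cond, 111..., X -> or Cond, X    (TVal all ones)
  // vselect Cond, X, 000... -> and Cond, X   (FVal all zeros)
  // vselect Cond, 000..., X -> andn Cond, X  (TVal all zeros)
  return (Mask & TVal) | (~Mask & FVal);
}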

static SDValue combineSelectOfTwoConstants(SDNode *N, SelectionDAG &DAG) {
  SDValue Cond = N->getOperand(0);
  SDValue LHS = N->getOperand(1);
  SDValue RHS = N->getOperand(2);
  SDLoc DL(N);

  auto *TrueC = dyn_cast<ConstantSDNode>(LHS);
  auto *FalseC = dyn_cast<ConstantSDNode>(RHS);
  if (!TrueC || !FalseC)
    return SDValue();

  // Don't do this for crazy integer types.
  EVT VT = N->getValueType(0);
  if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
    return SDValue();

  // We're going to use the condition bit in math or logic ops. We could allow
  // this with a wider condition value (post-legalization it becomes an i8),
  // but if nothing is creating selects that late, it doesn't matter.
  if (Cond.getValueType() != MVT::i1)
    return SDValue();

  // A power-of-2 multiply is just a shift. LEA also cheaply handles multiply by
  // 3, 5, or 9 with i32/i64, so those get transformed too.
  // TODO: For constants that overflow or do not differ by power-of-2 or small
  // multiplier, convert to 'and' + 'add'.
  const APInt &TrueVal = TrueC->getAPIntValue();
  const APInt &FalseVal = FalseC->getAPIntValue();
  bool OV;
  APInt Diff = TrueVal.ssub_ov(FalseVal, OV);
  if (OV)
    return SDValue();

  APInt AbsDiff = Diff.abs();
  if (AbsDiff.isPowerOf2() ||
      ((VT == MVT::i32 || VT == MVT::i64) &&
       (AbsDiff == 3 || AbsDiff == 5 || AbsDiff == 9))) {

    // We need a positive multiplier constant for shift/LEA codegen. The 'not'
    // of the condition can usually be folded into a compare predicate, but even
    // without that, the sequence should be cheaper than a CMOV alternative.
    if (TrueVal.slt(FalseVal)) {
      Cond = DAG.getNOT(DL, Cond, MVT::i1);
      std::swap(TrueC, FalseC);
    }

    // select Cond, TC, FC --> (zext(Cond) * (TC - FC)) + FC
    SDValue R = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Cond);

    // Multiply condition by the difference if non-one.
    if (!AbsDiff.isOneValue())
      R = DAG.getNode(ISD::MUL, DL, VT, R, DAG.getConstant(AbsDiff, DL, VT));

    // Add the base if non-zero.
    if (!FalseC->isNullValue())
      R = DAG.getNode(ISD::ADD, DL, VT, R, SDValue(FalseC, 0));

    return R;
  }

  return SDValue();
}
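
The identity used above, written out for plain 32-bit integers (hypothetical helper, not part of the original file). The DAG combine additionally inverts the condition so the multiplier stays positive; with unsigned wraparound the raw identity holds either way:

#include <cassert>
#include <cstdint>

// select Cond, TC, FC --> (zext(Cond) * (TC - FC)) + FC
static uint32_t selectViaMulAdd(bool Cond, uint32_t TC, uint32_t FC) {
  return static_cast<uint32_t>(Cond) * (TC - FC) + FC;
}

int main() {
  assert(selectViaMulAdd(true, 7, 3) == 7);  // diff is 4, i.e. a shift by 2
  assert(selectViaMulAdd(false, 7, 3) == 3);
  return 0;
}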

// If this is a bitcasted op that can be represented as another type, push the
// bitcast to the inputs. This allows more opportunities for pattern
// matching masked instructions. This is called when we know that the operation
// is used as one of the inputs of a vselect.
static bool combineBitcastForMaskedOp(SDValue OrigOp, SelectionDAG &DAG,
                                      TargetLowering::DAGCombinerInfo &DCI) {
  // Make sure we have a bitcast.
  if (OrigOp.getOpcode() != ISD::BITCAST)
    return false;

  SDValue Op = OrigOp.getOperand(0);

  // If the operation is used by anything other than the bitcast, we shouldn't
  // do this combine as that would replicate the operation.
  if (!Op.hasOneUse())
    return false;

  MVT VT = OrigOp.getSimpleValueType();
  MVT EltVT = VT.getVectorElementType();
  SDLoc DL(Op.getNode());

  auto BitcastAndCombineShuffle = [&](unsigned Opcode, SDValue Op0, SDValue Op1,
                                      SDValue Op2) {
    Op0 = DAG.getBitcast(VT, Op0);
    DCI.AddToWorklist(Op0.getNode());
    Op1 = DAG.getBitcast(VT, Op1);
    DCI.AddToWorklist(Op1.getNode());
    DCI.CombineTo(OrigOp.getNode(),
                  DAG.getNode(Opcode, DL, VT, Op0, Op1, Op2));
    return true;
  };

  unsigned Opcode = Op.getOpcode();
  switch (Opcode) {
  case X86ISD::PALIGNR:
    // PALIGNR can be converted to VALIGND/Q for 128-bit vectors.
    if (!VT.is128BitVector())
      return false;
    Opcode = X86ISD::VALIGN;
    LLVM_FALLTHROUGH;
  case X86ISD::VALIGN: {
    if (EltVT != MVT::i32 && EltVT != MVT::i64)
      return false;
    uint64_t Imm = Op.getConstantOperandVal(2);
    MVT OpEltVT = Op.getSimpleValueType().getVectorElementType();
    unsigned ShiftAmt = Imm * OpEltVT.getSizeInBits();
    unsigned EltSize = EltVT.getSizeInBits();
    // Make sure we can represent the same shift with the new VT.
    if ((ShiftAmt % EltSize) != 0)
      return false;
    Imm = ShiftAmt / EltSize;
    return BitcastAndCombineShuffle(Opcode, Op.getOperand(0), Op.getOperand(1),
                                    DAG.getConstant(Imm, DL, MVT::i8));
  }
  case X86ISD::SHUF128: {
    if (EltVT.getSizeInBits() != 32 && EltVT.getSizeInBits() != 64)
      return false;
    // Only change element size, not type.
    if (VT.isInteger() != Op.getSimpleValueType().isInteger())
      return false;
    return BitcastAndCombineShuffle(Opcode, Op.getOperand(0), Op.getOperand(1),
                                    Op.getOperand(2));
  }
  case ISD::INSERT_SUBVECTOR: {
    unsigned EltSize = EltVT.getSizeInBits();
    if (EltSize != 32 && EltSize != 64)
      return false;
    MVT OpEltVT = Op.getSimpleValueType().getVectorElementType();
    // Only change element size, not type.
    if (EltVT.isInteger() != OpEltVT.isInteger())
      return false;
    uint64_t Imm = Op.getConstantOperandVal(2);
    Imm = (Imm * OpEltVT.getSizeInBits()) / EltSize;
    SDValue Op0 = DAG.getBitcast(VT, Op.getOperand(0));
    DCI.AddToWorklist(Op0.getNode());
    // Op1 needs to be bitcasted to a smaller vector with the same element type.
    SDValue Op1 = Op.getOperand(1);
    MVT Op1VT = MVT::getVectorVT(EltVT,
                            Op1.getSimpleValueType().getSizeInBits() / EltSize);
    Op1 = DAG.getBitcast(Op1VT, Op1);
    DCI.AddToWorklist(Op1.getNode());
    DCI.CombineTo(OrigOp.getNode(),
                  DAG.getNode(Opcode, DL, VT, Op0, Op1,
                              DAG.getIntPtrConstant(Imm, DL)));
    return true;
  }
  case X86ISD::SUBV_BROADCAST: {
    unsigned EltSize = EltVT.getSizeInBits();
    if (EltSize != 32 && EltSize != 64)
      return false;
    // Only change element size, not type.
    if (VT.isInteger() != Op.getSimpleValueType().isInteger())
      return false;
    SDValue Op0 = Op.getOperand(0);
    MVT Op0VT = MVT::getVectorVT(EltVT,
                            Op0.getSimpleValueType().getSizeInBits() / EltSize);
    Op0 = DAG.getBitcast(Op0VT, Op.getOperand(0));
    DCI.AddToWorklist(Op0.getNode());
    DCI.CombineTo(OrigOp.getNode(),
                  DAG.getNode(Opcode, DL, VT, Op0));
    return true;
  }
  }

  return false;
}
30312
30313
/// Do target-specific dag combines on SELECT and VSELECT nodes.
30314
static SDValue combineSelect(SDNode *N, SelectionDAG &DAG,
30315
                             TargetLowering::DAGCombinerInfo &DCI,
30316
33.0k
                             const X86Subtarget &Subtarget) {
30317
33.0k
  SDLoc DL(N);
30318
33.0k
  SDValue Cond = N->getOperand(0);
30319
33.0k
  // Get the LHS/RHS of the select.
30320
33.0k
  SDValue LHS = N->getOperand(1);
30321
33.0k
  SDValue RHS = N->getOperand(2);
30322
33.0k
  EVT VT = LHS.getValueType();
30323
33.0k
  EVT CondVT = Cond.getValueType();
30324
33.0k
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
30325
33.0k
30326
33.0k
  // If we have SSE[12] support, try to form min/max nodes. SSE min/max
30327
33.0k
  // instructions match the semantics of the common C idiom x<y?x:y but not
30328
33.0k
  // x<=y?x:y, because of how they handle negative zero (which can be
30329
33.0k
  // ignored in unsafe-math mode).
30330
33.0k
  // We also try to create v2f32 min/max nodes, which we later widen to v4f32.
30331
33.0k
  if (
Cond.getOpcode() == ISD::SETCC && 33.0k
VT.isFloatingPoint()11.5k
&&
30332
33.0k
      
VT != MVT::f802.91k
&&
VT != MVT::f1282.48k
&&
30333
2.47k
      
(TLI.isTypeLegal(VT) || 2.47k
VT == MVT::v2f3265
) &&
30334
2.42k
      (Subtarget.hasSSE2() ||
30335
33.0k
       
(Subtarget.hasSSE1() && 375
VT.getScalarType() == MVT::f3262
))) {
30336
2.09k
    ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
30337
2.09k
30338
2.09k
    unsigned Opcode = 0;
30339
2.09k
    // Check for x CC y ? x : y.
30340
2.09k
    if (DAG.isEqualTo(LHS, Cond.getOperand(0)) &&
30341
2.09k
        
DAG.isEqualTo(RHS, Cond.getOperand(1))421
) {
30342
343
      switch (CC) {
30343
5
      default: break;
30344
16
      case ISD::SETULT:
30345
16
        // Converting this to a min would handle NaNs incorrectly, and swapping
30346
16
        // the operands would cause it to handle comparisons between positive
30347
16
        // and negative zero incorrectly.
30348
16
        if (
!DAG.isKnownNeverNaN(LHS) || 16
!DAG.isKnownNeverNaN(RHS)0
) {
30349
16
          if (!DAG.getTarget().Options.UnsafeFPMath &&
30350
16
              
!(DAG.isKnownNeverZero(LHS) || 16
DAG.isKnownNeverZero(RHS)16
))
30351
16
            break;
30352
0
          std::swap(LHS, RHS);
30353
0
        }
30354
0
        Opcode = X86ISD::FMIN;
30355
0
        break;
30356
18
      case ISD::SETOLE:
30357
18
        // Converting this to a min would handle comparisons between positive
30358
18
        // and negative zero incorrectly.
30359
18
        if (!DAG.getTarget().Options.UnsafeFPMath &&
30360
18
            
!DAG.isKnownNeverZero(LHS)18
&&
!DAG.isKnownNeverZero(RHS)18
)
30361
18
          break;
30362
0
        Opcode = X86ISD::FMIN;
30363
0
        break;
30364
4
      case ISD::SETULE:
30365
4
        // Converting this to a min would handle both negative zeros and NaNs
30366
4
        // incorrectly, but we can swap the operands to fix both.
30367
4
        std::swap(LHS, RHS);
30368
4
        LLVM_FALLTHROUGH;
30369
130
      case ISD::SETOLT:
30370
130
      case ISD::SETLT:
30371
130
      case ISD::SETLE:
30372
130
        Opcode = X86ISD::FMIN;
30373
130
        break;
30374
130
30375
36
      case ISD::SETOGE:
30376
36
        // Converting this to a max would handle comparisons between positive
30377
36
        // and negative zero incorrectly.
30378
36
        if (!DAG.getTarget().Options.UnsafeFPMath &&
30379
36
            
!DAG.isKnownNeverZero(LHS)28
&&
!DAG.isKnownNeverZero(RHS)28
)
30380
28
          break;
30381
8
        Opcode = X86ISD::FMAX;
30382
8
        break;
30383
6
      case ISD::SETUGT:
30384
6
        // Converting this to a max would handle NaNs incorrectly, and swapping
30385
6
        // the operands would cause it to handle comparisons between positive
30386
6
        // and negative zero incorrectly.
30387
6
        if (
!DAG.isKnownNeverNaN(LHS) || 6
!DAG.isKnownNeverNaN(RHS)0
) {
30388
6
          if (!DAG.getTarget().Options.UnsafeFPMath &&
30389
6
              
!(DAG.isKnownNeverZero(LHS) || 6
DAG.isKnownNeverZero(RHS)6
))
30390
6
            break;
30391
0
          std::swap(LHS, RHS);
30392
0
        }
30393
0
        Opcode = X86ISD::FMAX;
30394
0
        break;
30395
5
      case ISD::SETUGE:
30396
5
        // Converting this to a max would handle both negative zeros and NaNs
30397
5
        // incorrectly, but we can swap the operands to fix both.
30398
5
        std::swap(LHS, RHS);
30399
5
        LLVM_FALLTHROUGH;
30400
132
      case ISD::SETOGT:
30401
132
      case ISD::SETGT:
30402
132
      case ISD::SETGE:
30403
132
        Opcode = X86ISD::FMAX;
30404
132
        break;
30405
2.09k
      }
30406
2.09k
    // Check for x CC y ? y : x -- a min/max with reversed arms.
30407
1.74k
    } else 
if (1.74k
DAG.isEqualTo(LHS, Cond.getOperand(1)) &&
30408
1.74k
               
DAG.isEqualTo(RHS, Cond.getOperand(0))153
) {
30409
111
      switch (CC) {
30410
0
      default: break;
30411
6
      case ISD::SETOGE:
30412
6
        // Converting this to a min would handle comparisons between positive
30413
6
        // and negative zero incorrectly, and swapping the operands would
30414
6
        // cause it to handle NaNs incorrectly.
30415
6
        if (!DAG.getTarget().Options.UnsafeFPMath &&
30416
6
            
!(DAG.isKnownNeverZero(LHS) || 6
DAG.isKnownNeverZero(RHS)6
)) {
30417
6
          if (
!DAG.isKnownNeverNaN(LHS) || 6
!DAG.isKnownNeverNaN(RHS)4
)
30418
6
            break;
30419
0
          std::swap(LHS, RHS);
30420
0
        }
30421
0
        Opcode = X86ISD::FMIN;
30422
0
        break;
30423
6
      case ISD::SETUGT:
30424
6
        // Converting this to a min would handle NaNs incorrectly.
30425
6
        if (!DAG.getTarget().Options.UnsafeFPMath &&
30426
6
            
(!DAG.isKnownNeverNaN(LHS) || 6
!DAG.isKnownNeverNaN(RHS)4
))
30427
6
          break;
30428
0
        Opcode = X86ISD::FMIN;
30429
0
        break;
30430
5
      case ISD::SETUGE:
30431
5
        // Converting this to a min would handle both negative zeros and NaNs
30432
5
        // incorrectly, but we can swap the operands to fix both.
30433
5
        std::swap(LHS, RHS);
30434
5
        LLVM_FALLTHROUGH;
30435
42
      case ISD::SETOGT:
30436
42
      case ISD::SETGT:
30437
42
      case ISD::SETGE:
30438
42
        Opcode = X86ISD::FMIN;
30439
42
        break;
30440
42
30441
6
      case ISD::SETULT:
30442
6
        // Converting this to a max would handle NaNs incorrectly.
30443
6
        if (
!DAG.isKnownNeverNaN(LHS) || 6
!DAG.isKnownNeverNaN(RHS)4
)
30444
6
          break;
30445
0
        Opcode = X86ISD::FMAX;
30446
0
        break;
30447
6
      case ISD::SETOLE:
30448
6
        // Converting this to a max would handle comparisons between positive
30449
6
        // and negative zero incorrectly, and swapping the operands would
30450
6
        // cause it to handle NaNs incorrectly.
30451
6
        if (!DAG.getTarget().Options.UnsafeFPMath &&
30452
6
            
!DAG.isKnownNeverZero(LHS)6
&&
!DAG.isKnownNeverZero(RHS)6
) {
30453
6
          if (
!DAG.isKnownNeverNaN(LHS) || 6
!DAG.isKnownNeverNaN(RHS)4
)
30454
6
            break;
30455
0
          std::swap(LHS, RHS);
30456
0
        }
30457
0
        Opcode = X86ISD::FMAX;
30458
0
        break;
30459
5
      case ISD::SETULE:
30460
5
        // Converting this to a max would handle both negative zeros and NaNs
30461
5
        // incorrectly, but we can swap the operands to fix both.
30462
5
        std::swap(LHS, RHS);
30463
5
        LLVM_FALLTHROUGH;
30464
45
      case ISD::SETOLT:
30465
45
      case ISD::SETLT:
30466
45
      case ISD::SETLE:
30467
45
        Opcode = X86ISD::FMAX;
30468
45
        break;
30469
2.09k
      }
30470
2.09k
    }
30471
2.09k
30472
2.09k
    
if (2.09k
Opcode2.09k
)
30473
357
      return DAG.getNode(Opcode, DL, N->getValueType(0), LHS, RHS);
30474
32.6k
  }
30475
32.6k
30476
32.6k
  // v16i8 (select v16i1, v16i8, v16i8) does not have a proper
30477
32.6k
  // lowering on KNL. In this case we convert it to
30478
32.6k
  // v16i8 (select v16i8, v16i8, v16i8) and use AVX instruction.
30479
32.6k
  // The same situation for all 128 and 256-bit vectors of i8 and i16.
30480
32.6k
  // Since SKX these selects have a proper lowering.
30481
32.6k
  
if (32.6k
Subtarget.hasAVX512() && 32.6k
CondVT.isVector()17.7k
&&
30482
16.9k
      CondVT.getVectorElementType() == MVT::i1 &&
30483
15.5k
      
(VT.is128BitVector() || 15.5k
VT.is256BitVector()11.9k
) &&
30484
8.20k
      (VT.getVectorElementType() == MVT::i8 ||
30485
8.20k
       VT.getVectorElementType() == MVT::i16) &&
30486
32.6k
      
!(Subtarget.hasBWI() && 1.79k
Subtarget.hasVLX()1.75k
)) {
30487
49
    Cond = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, Cond);
30488
49
    DCI.AddToWorklist(Cond.getNode());
30489
49
    return DAG.getNode(N->getOpcode(), DL, VT, Cond, LHS, RHS);
30490
49
  }
30491
32.6k
30492
32.6k
  
if (SDValue 32.6k
V32.6k
= combineSelectOfTwoConstants(N, DAG))
30493
119
    return V;
30494
32.4k
30495
32.4k
  // Canonicalize max and min:
30496
32.4k
  // (x > y) ? x : y -> (x >= y) ? x : y
30497
32.4k
  // (x < y) ? x : y -> (x <= y) ? x : y
30498
32.4k
  // This allows use of COND_S / COND_NS (see TranslateX86CC) which eliminates
30499
32.4k
  // the need for an extra compare
30500
32.4k
  // against zero. e.g.
30501
32.4k
  // (x - y) > 0 : (x - y) ? 0 -> (x - y) >= 0 : (x - y) ? 0
30502
32.4k
  // subl   %esi, %edi
30503
32.4k
  // testl  %edi, %edi
30504
32.4k
  // movl   $0, %eax
30505
32.4k
  // cmovgl %edi, %eax
30506
32.4k
  // =>
30507
32.4k
  // xorl   %eax, %eax
30508
32.4k
  // subl   %esi, $edi
30509
32.4k
  // cmovsl %eax, %edi
30510
32.4k
  
if (32.4k
N->getOpcode() == ISD::SELECT && 32.4k
Cond.getOpcode() == ISD::SETCC9.66k
&&
30511
8.26k
      DAG.isEqualTo(LHS, Cond.getOperand(0)) &&
30512
32.4k
      
DAG.isEqualTo(RHS, Cond.getOperand(1))1.61k
) {
30513
1.11k
    ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
30514
1.11k
    switch (CC) {
30515
940
    default: break;
30516
174
    case ISD::SETLT:
30517
174
    case ISD::SETGT: {
30518
174
      ISD::CondCode NewCC = (CC == ISD::SETLT) ? 
ISD::SETLE2
:
ISD::SETGE172
;
30519
174
      Cond = DAG.getSetCC(SDLoc(Cond), Cond.getValueType(),
30520
174
                          Cond.getOperand(0), Cond.getOperand(1), NewCC);
30521
174
      return DAG.getSelect(DL, VT, Cond, LHS, RHS);
30522
32.3k
    }
30523
1.11k
    }
30524
1.11k
  }
30525
32.3k
30526
32.3k
  // Early exit check
30527
32.3k
  
if (32.3k
!TLI.isTypeLegal(VT)32.3k
)
30528
654
    return SDValue();
30529
31.6k
30530
31.6k
  // Match VSELECTs into subs with unsigned saturation.
30531
31.6k
  
if (31.6k
N->getOpcode() == ISD::VSELECT && 31.6k
Cond.getOpcode() == ISD::SETCC21.6k
&&
30532
31.6k
      // psubus is available in SSE2 and AVX2 for i8 and i16 vectors.
30533
2.44k
      
((Subtarget.hasSSE2() && 2.44k
(VT == MVT::v16i8 || 2.43k
VT == MVT::v8i162.28k
)) ||
30534
31.6k
       
(Subtarget.hasAVX2() && 2.09k
(VT == MVT::v32i8 || 1.05k
VT == MVT::v16i161.03k
)))) {
30535
387
    ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
30536
387
30537
387
    // Check if one of the arms of the VSELECT is a zero vector. If it's on the
30538
387
    // left side invert the predicate to simplify logic below.
30539
387
    SDValue Other;
30540
387
    if (
ISD::isBuildVectorAllZeros(LHS.getNode())387
) {
30541
45
      Other = RHS;
30542
45
      CC = ISD::getSetCCInverse(CC, true);
30543
387
    } else 
if (342
ISD::isBuildVectorAllZeros(RHS.getNode())342
) {
30544
80
      Other = LHS;
30545
80
    }
30546
387
30547
387
    if (
Other.getNode() && 387
Other->getNumOperands() == 2125
&&
30548
387
        
DAG.isEqualTo(Other->getOperand(0), Cond.getOperand(0))95
) {
30549
84
      SDValue OpLHS = Other->getOperand(0), OpRHS = Other->getOperand(1);
30550
84
      SDValue CondRHS = Cond->getOperand(1);
30551
84
30552
84
      // Look for a general sub with unsigned saturation first.
30553
84
      // x >= y ? x-y : 0 --> subus x, y
30554
84
      // x >  y ? x-y : 0 --> subus x, y
30555
84
      if (
(CC == ISD::SETUGE || 84
CC == ISD::SETUGT56
) &&
30556
84
          
Other->getOpcode() == ISD::SUB56
&&
DAG.isEqualTo(OpRHS, CondRHS)28
)
30557
28
        return DAG.getNode(X86ISD::SUBUS, DL, VT, OpLHS, OpRHS);
30558
56
30559
56
      
if (auto *56
OpRHSBV56
= dyn_cast<BuildVectorSDNode>(OpRHS))
30560
56
        
if (auto *56
OpRHSConst56
= OpRHSBV->getConstantSplatNode()) {
30561
56
          if (auto *CondRHSBV = dyn_cast<BuildVectorSDNode>(CondRHS))
30562
56
            
if (auto *56
CondRHSConst56
= CondRHSBV->getConstantSplatNode())
30563
56
              // If the RHS is a constant we have to reverse the const
30564
56
              // canonicalization.
30565
56
              // x > C-1 ? x+-C : 0 --> subus x, C
30566
56
              
if (56
CC == ISD::SETUGT && 56
Other->getOpcode() == ISD::ADD28
&&
30567
28
                  CondRHSConst->getAPIntValue() ==
30568
28
                      (-OpRHSConst->getAPIntValue() - 1))
30569
28
                return DAG.getNode(
30570
28
                    X86ISD::SUBUS, DL, VT, OpLHS,
30571
28
                    DAG.getConstant(-OpRHSConst->getAPIntValue(), DL, VT));
30572
28
30573
28
          // Another special case: If C was a sign bit, the sub has been
30574
28
          // canonicalized into a xor.
30575
28
          // FIXME: Would it be better to use computeKnownBits to determine
30576
28
          //        whether it's safe to decanonicalize the xor?
30577
28
          // x s< 0 ? x^C : 0 --> subus x, C
30578
28
          
if (28
CC == ISD::SETLT && 28
Other->getOpcode() == ISD::XOR28
&&
30579
28
              ISD::isBuildVectorAllZeros(CondRHS.getNode()) &&
30580
28
              OpRHSConst->getAPIntValue().isSignMask())
30581
28
            // Note that we have to rebuild the RHS constant here to ensure we
30582
28
            // don't rely on particular values of undef lanes.
30583
28
            return DAG.getNode(
30584
28
                X86ISD::SUBUS, DL, VT, OpLHS,
30585
28
                DAG.getConstant(OpRHSConst->getAPIntValue(), DL, VT));
30586
31.5k
        }
30587
84
    }
30588
387
  }
30589
31.5k
30590
31.5k
  
if (SDValue 31.5k
V31.5k
= combineVSelectWithAllOnesOrZeros(N, DAG, DCI, Subtarget))
30591
253
    return V;
30592
31.3k
30593
31.3k
  // If this is a *dynamic* select (non-constant condition) and we can match
30594
31.3k
  // this node with one of the variable blend instructions, restructure the
30595
31.3k
  // condition so that blends can use the high (sign) bit of each element and
30596
31.3k
  // use SimplifyDemandedBits to simplify the condition operand.
30597
31.3k
  
if (31.3k
N->getOpcode() == ISD::VSELECT && 31.3k
DCI.isBeforeLegalizeOps()21.3k
&&
30598
9.87k
      !DCI.isBeforeLegalize() &&
30599
31.3k
      
!ISD::isBuildVectorOfConstantSDNodes(Cond.getNode())3.56k
) {
30600
1.37k
    unsigned BitWidth = Cond.getScalarValueSizeInBits();
30601
1.37k
30602
1.37k
    // Don't optimize vector selects that map to mask-registers.
30603
1.37k
    if (BitWidth == 1)
30604
117
      return SDValue();
30605
1.25k
30606
1.25k
    // We can only handle the cases where VSELECT is directly legal on the
30607
1.25k
    // subtarget. We custom lower VSELECT nodes with constant conditions and
30608
1.25k
    // this makes it hard to see whether a dynamic VSELECT will correctly
30609
1.25k
    // lower, so we both check the operation's status and explicitly handle the
30610
1.25k
    // cases where a *dynamic* blend will fail even though a constant-condition
30611
1.25k
    // blend could be custom lowered.
30612
1.25k
    // FIXME: We should find a better way to handle this class of problems.
30613
1.25k
    // Potentially, we should combine constant-condition vselect nodes
30614
1.25k
    // pre-legalization into shuffles and not mark as many types as custom
30615
1.25k
    // lowered.
30616
1.25k
    
if (1.25k
!TLI.isOperationLegalOrCustom(ISD::VSELECT, VT)1.25k
)
30617
0
      return SDValue();
30618
1.25k
    // FIXME: We don't support i16-element blends currently. We could and
30619
1.25k
    // should support them by making *all* the bits in the condition be set
30620
1.25k
    // rather than just the high bit and using an i8-element blend.
30621
1.25k
    
if (1.25k
VT.getVectorElementType() == MVT::i161.25k
)
30622
107
      return SDValue();
30623
1.14k
    // Dynamic blending was only available from SSE4.1 onward.
30624
1.14k
    
if (1.14k
VT.is128BitVector() && 1.14k
!Subtarget.hasSSE41()934
)
30625
424
      return SDValue();
30626
725
    // Byte blends are only available in AVX2
30627
725
    
if (725
VT == MVT::v32i8 && 725
!Subtarget.hasAVX2()7
)
30628
1
      return SDValue();
30629
724
    // There are no 512-bit blend instructions that use sign bits.
30630
724
    
if (724
VT.is512BitVector()724
)
30631
4
      return SDValue();
30632
720
30633
724
    assert(BitWidth >= 8 && BitWidth <= 64 && "Invalid mask size");
30634
720
    APInt DemandedMask(APInt::getSignMask(BitWidth));
30635
720
    KnownBits Known;
30636
720
    TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
30637
720
                                          !DCI.isBeforeLegalizeOps());
30638
720
    if (TLI.ShrinkDemandedConstant(Cond, DemandedMask, TLO) ||
30639
720
        
TLI.SimplifyDemandedBits(Cond, DemandedMask, Known, TLO)720
) {
30640
197
      // If we changed the computation somewhere in the DAG, this change will
30641
197
      // affect all users of Cond. Make sure it is fine and update all the nodes
30642
197
      // so that we do not use the generic VSELECT anymore. Otherwise, we may
30643
197
      // perform wrong optimizations as we messed with the actual expectation
30644
197
      // for the vector boolean values.
30645
197
      if (
Cond != TLO.Old197
) {
30646
9
        // Check all uses of the condition operand to check whether it will be
30647
9
        // consumed by non-BLEND instructions. Those may require that all bits
30648
9
        // are set properly.
30649
9
        for (SDNode *U : Cond->uses()) {
30650
9
          // TODO: Add other opcodes eventually lowered into BLEND.
30651
9
          if (U->getOpcode() != ISD::VSELECT)
30652
0
            return SDValue();
30653
9
        }
30654
9
30655
9
        // Update all users of the condition before committing the change, so
30656
9
        // that the VSELECT optimizations that expect the correct vector boolean
30657
9
        // value will not be triggered.
30658
9
        
for (SDNode *U : Cond->uses()) 9
{
30659
9
          SDValue SB = DAG.getNode(X86ISD::SHRUNKBLEND, SDLoc(U),
30660
9
                                   U->getValueType(0), Cond, U->getOperand(1),
30661
9
                                   U->getOperand(2));
30662
9
          DAG.ReplaceAllUsesOfValueWith(SDValue(U, 0), SB);
30663
9
        }
30664
9
        DCI.CommitTargetLoweringOpt(TLO);
30665
9
        return SDValue();
30666
9
      }
30667
188
      // Only Cond (rather than other nodes in the computation chain) was
30668
188
      // changed. Change the condition just for N to keep the opportunity to
30669
188
      // optimize all other users their own way.
30670
188
      SDValue SB = DAG.getNode(X86ISD::SHRUNKBLEND, DL, VT, TLO.New, LHS, RHS);
30671
188
      DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), SB);
30672
188
      return SDValue();
30673
188
    }
30674
1.37k
  }
30675
30.4k
30676
30.4k
  // Look for vselects with LHS/RHS being bitcasted from an operation that
30677
30.4k
  // can be executed on another type. Push the bitcast to the inputs of
30678
30.4k
  // the operation. This exposes opportunities for using masking instructions.
30679
30.4k
  
if (30.4k
N->getOpcode() == ISD::VSELECT && 30.4k
DCI.isAfterLegalizeVectorOps()20.4k
&&
30680
30.4k
      
CondVT.getVectorElementType() == MVT::i18.76k
) {
30681
6.65k
    if (combineBitcastForMaskedOp(LHS, DAG, DCI))
30682
85
      return SDValue(N, 0);
30683
6.57k
    
if (6.57k
combineBitcastForMaskedOp(RHS, DAG, DCI)6.57k
)
30684
0
      return SDValue(N, 0);
30685
30.3k
  }
30686
30.3k
30687
30.3k
  // Custom action for SELECT MMX
30688
30.3k
  
if (30.3k
VT == MVT::x86mmx30.3k
) {
30689
4
    LHS = DAG.getBitcast(MVT::i64, LHS);
30690
4
    RHS = DAG.getBitcast(MVT::i64, RHS);
30691
4
    SDValue newSelect = DAG.getNode(ISD::SELECT, DL, MVT::i64, Cond, LHS, RHS);
30692
4
    return DAG.getBitcast(VT, newSelect);
30693
4
  }
30694
30.3k
30695
30.3k
  return SDValue();
30696
30.3k
}

/// Combine:
///   (brcond/cmov/setcc .., (cmp (atomic_load_add x, 1), 0), COND_S)
/// to:
///   (brcond/cmov/setcc .., (LADD x, 1), COND_LE)
/// i.e., reusing the EFLAGS produced by the LOCKed instruction.
/// Note that this is only legal for some op/cc combinations.
static SDValue combineSetCCAtomicArith(SDValue Cmp, X86::CondCode &CC,
                                       SelectionDAG &DAG) {
  // This combine only operates on CMP-like nodes.
  if (!(Cmp.getOpcode() == X86ISD::CMP ||
        (Cmp.getOpcode() == X86ISD::SUB && !Cmp->hasAnyUseOfValue(0))))
    return SDValue();

  // Can't replace the cmp if it has more uses than the one we're looking at.
  // FIXME: We would like to be able to handle this, but would need to make sure
  // all uses were updated.
  if (!Cmp.hasOneUse())
    return SDValue();

  // This only applies to variations of the common case:
  //   (icmp slt x, 0) -> (icmp sle (add x, 1), 0)
  //   (icmp sge x, 0) -> (icmp sgt (add x, 1), 0)
  //   (icmp sle x, 0) -> (icmp slt (sub x, 1), 0)
  //   (icmp sgt x, 0) -> (icmp sge (sub x, 1), 0)
  // Using the proper condcodes (see below), overflow is checked for.

  // FIXME: We can generalize both constraints:
  // - XOR/OR/AND (if they were made to survive AtomicExpand)
  // - LHS != 1
  // if the result is compared.

  SDValue CmpLHS = Cmp.getOperand(0);
  SDValue CmpRHS = Cmp.getOperand(1);

  if (!CmpLHS.hasOneUse())
    return SDValue();

  unsigned Opc = CmpLHS.getOpcode();
  if (Opc != ISD::ATOMIC_LOAD_ADD && Opc != ISD::ATOMIC_LOAD_SUB)
    return SDValue();

  SDValue OpRHS = CmpLHS.getOperand(2);
  auto *OpRHSC = dyn_cast<ConstantSDNode>(OpRHS);
  if (!OpRHSC)
    return SDValue();

  APInt Addend = OpRHSC->getAPIntValue();
  if (Opc == ISD::ATOMIC_LOAD_SUB)
    Addend = -Addend;

  auto *CmpRHSC = dyn_cast<ConstantSDNode>(CmpRHS);
  if (!CmpRHSC)
    return SDValue();

  APInt Comparison = CmpRHSC->getAPIntValue();

  // If the addend is the negation of the comparison value, then we can do
  // a full comparison by emitting the atomic arithmetic as a locked sub.
  if (Comparison == -Addend) {
    // The CC is fine, but we need to rewrite the LHS of the comparison as an
    // atomic sub.
    auto *AN = cast<AtomicSDNode>(CmpLHS.getNode());
    auto AtomicSub = DAG.getAtomic(
        ISD::ATOMIC_LOAD_SUB, SDLoc(CmpLHS), CmpLHS.getValueType(),
        /*Chain*/ CmpLHS.getOperand(0), /*LHS*/ CmpLHS.getOperand(1),
        /*RHS*/ DAG.getConstant(-Addend, SDLoc(CmpRHS), CmpRHS.getValueType()),
        AN->getMemOperand());
    auto LockOp = lowerAtomicArithWithLOCK(AtomicSub, DAG);
    DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(0),
                                  DAG.getUNDEF(CmpLHS.getValueType()));
    DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(1), LockOp.getValue(1));
    return LockOp;
  }

  // We can handle comparisons with zero in a number of cases by manipulating
  // the CC used.
  if (!Comparison.isNullValue())
    return SDValue();

  if (CC == X86::COND_S && Addend == 1)
    CC = X86::COND_LE;
  else if (CC == X86::COND_NS && Addend == 1)
    CC = X86::COND_G;
  else if (CC == X86::COND_G && Addend == -1)
    CC = X86::COND_GE;
  else if (CC == X86::COND_LE && Addend == -1)
    CC = X86::COND_L;
  else
    return SDValue();

  SDValue LockOp = lowerAtomicArithWithLOCK(CmpLHS, DAG);
  DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(0),
                                DAG.getUNDEF(CmpLHS.getValueType()));
  DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(1), LockOp.getValue(1));
  return LockOp;
}
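
A source-level sketch of the pattern this combine targets, using std::atomic (hypothetical example, not from the original file): when the fetched value is compared against the negated addend, the test is exactly the zero flag of the LOCKed add, so no separate CMP is needed.

#include <atomic>

// (cmp (atomic_load_add x, 1), -1) tests the same condition as the zero flag
// of the LOCKed "add $1, x", so the EFLAGS of the locked instruction can be
// reused for the branch or setcc.
static bool incrementedToZero(std::atomic<int> &Counter) {
  return Counter.fetch_add(1) == -1;
}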

// Check whether a boolean test is testing a boolean value generated by
// X86ISD::SETCC. If so, return the operand of that SETCC and proper condition
// code.
//
// Simplify the following patterns:
// (Op (CMP (SETCC Cond EFLAGS) 1) EQ) or
// (Op (CMP (SETCC Cond EFLAGS) 0) NEQ)
// to (Op EFLAGS Cond)
//
// (Op (CMP (SETCC Cond EFLAGS) 0) EQ) or
// (Op (CMP (SETCC Cond EFLAGS) 1) NEQ)
// to (Op EFLAGS !Cond)
//
// where Op could be BRCOND or CMOV.
//
static SDValue checkBoolTestSetCCCombine(SDValue Cmp, X86::CondCode &CC) {
  // This combine only operates on CMP-like nodes.
  if (!(Cmp.getOpcode() == X86ISD::CMP ||
        (Cmp.getOpcode() == X86ISD::SUB && !Cmp->hasAnyUseOfValue(0))))
    return SDValue();

  // Quit if not used as a boolean value.
  if (CC != X86::COND_E && CC != X86::COND_NE)
    return SDValue();

  // Check CMP operands. One of them should be 0 or 1 and the other should be
  // an SetCC or extended from it.
  SDValue Op1 = Cmp.getOperand(0);
  SDValue Op2 = Cmp.getOperand(1);

  SDValue SetCC;
  const ConstantSDNode* C = nullptr;
  bool needOppositeCond = (CC == X86::COND_E);
  bool checkAgainstTrue = false; // Is it a comparison against 1?

  if ((C = dyn_cast<ConstantSDNode>(Op1)))
    SetCC = Op2;
  else if ((C = dyn_cast<ConstantSDNode>(Op2)))
    SetCC = Op1;
  else // Quit if all operands are not constants.
    return SDValue();

  if (C->getZExtValue() == 1) {
    needOppositeCond = !needOppositeCond;
    checkAgainstTrue = true;
  } else if (C->getZExtValue() != 0)
    // Quit if the constant is neither 0 or 1.
    return SDValue();

  bool truncatedToBoolWithAnd = false;
  // Skip (zext $x), (trunc $x), or (and $x, 1) node.
  while (SetCC.getOpcode() == ISD::ZERO_EXTEND ||
         SetCC.getOpcode() == ISD::TRUNCATE ||
         SetCC.getOpcode() == ISD::AND) {
    if (SetCC.getOpcode() == ISD::AND) {
      int OpIdx = -1;
      if (isOneConstant(SetCC.getOperand(0)))
        OpIdx = 1;
      if (isOneConstant(SetCC.getOperand(1)))
        OpIdx = 0;
      if (OpIdx < 0)
        break;
      SetCC = SetCC.getOperand(OpIdx);
      truncatedToBoolWithAnd = true;
    } else
      SetCC = SetCC.getOperand(0);
  }

  switch (SetCC.getOpcode()) {
  case X86ISD::SETCC_CARRY:
    // Since SETCC_CARRY gives output based on R = CF ? ~0 : 0, it's unsafe to
    // simplify it if the result of SETCC_CARRY is not canonicalized to 0 or 1,
    // i.e. it's a comparison against true but the result of SETCC_CARRY is not
    // truncated to i1 using 'and'.
    if (checkAgainstTrue && !truncatedToBoolWithAnd)
      break;
    assert(X86::CondCode(SetCC.getConstantOperandVal(0)) == X86::COND_B &&
           "Invalid use of SETCC_CARRY!");
    LLVM_FALLTHROUGH;
  case X86ISD::SETCC:
    // Set the condition code or opposite one if necessary.
    CC = X86::CondCode(SetCC.getConstantOperandVal(0));
    if (needOppositeCond)
      CC = X86::GetOppositeBranchCondition(CC);
    return SetCC.getOperand(1);
  case X86ISD::CMOV: {
    // Check whether false/true value has canonical one, i.e. 0 or 1.
    ConstantSDNode *FVal = dyn_cast<ConstantSDNode>(SetCC.getOperand(0));
    ConstantSDNode *TVal = dyn_cast<ConstantSDNode>(SetCC.getOperand(1));
    // Quit if true value is not a constant.
    if (!TVal)
      return SDValue();
    // Quit if false value is not a constant.
    if (!FVal) {
      SDValue Op = SetCC.getOperand(0);
      // Skip 'zext' or 'trunc' node.
      if (Op.getOpcode() == ISD::ZERO_EXTEND ||
          Op.getOpcode() == ISD::TRUNCATE)
        Op = Op.getOperand(0);
      // A special case for rdrand/rdseed, where 0 is set if false cond is
      // found.
      if ((Op.getOpcode() != X86ISD::RDRAND &&
           Op.getOpcode() != X86ISD::RDSEED) || Op.getResNo() != 0)
        return SDValue();
    }
    // Quit if false value is not the constant 0 or 1.
    bool FValIsFalse = true;
    if (FVal && FVal->getZExtValue() != 0) {
      if (FVal->getZExtValue() != 1)
        return SDValue();
      // If FVal is 1, opposite cond is needed.
      needOppositeCond = !needOppositeCond;
      FValIsFalse = false;
    }
    // Quit if TVal is not the constant opposite of FVal.
    if (FValIsFalse && TVal->getZExtValue() != 1)
      return SDValue();
    if (!FValIsFalse && TVal->getZExtValue() != 0)
      return SDValue();
    CC = X86::CondCode(SetCC.getConstantOperandVal(2));
    if (needOppositeCond)
      CC = X86::GetOppositeBranchCondition(CC);
    return SetCC.getOperand(3);
  }
  }

  return SDValue();
}

/// Check whether Cond is an AND/OR of SETCCs off of the same EFLAGS.
/// Match:
///   (X86or (X86setcc) (X86setcc))
///   (X86cmp (and (X86setcc) (X86setcc)), 0)
static bool checkBoolTestAndOrSetCCCombine(SDValue Cond, X86::CondCode &CC0,
                                           X86::CondCode &CC1, SDValue &Flags,
                                           bool &isAnd) {
  if (Cond->getOpcode() == X86ISD::CMP) {
    if (!isNullConstant(Cond->getOperand(1)))
      return false;

    Cond = Cond->getOperand(0);
  }

  isAnd = false;

  SDValue SetCC0, SetCC1;
  switch (Cond->getOpcode()) {
  default: return false;
  case ISD::AND:
  case X86ISD::AND:
    isAnd = true;
    LLVM_FALLTHROUGH;
  case ISD::OR:
  case X86ISD::OR:
    SetCC0 = Cond->getOperand(0);
    SetCC1 = Cond->getOperand(1);
    break;
  };

  // Make sure we have SETCC nodes, using the same flags value.
  if (SetCC0.getOpcode() != X86ISD::SETCC ||
      SetCC1.getOpcode() != X86ISD::SETCC ||
      SetCC0->getOperand(1) != SetCC1->getOperand(1))
    return false;

  CC0 = (X86::CondCode)SetCC0->getConstantOperandVal(0);
  CC1 = (X86::CondCode)SetCC1->getConstantOperandVal(0);
  Flags = SetCC0->getOperand(1);
  return true;
}

// When legalizing carry, we create carries via add X, -1
// If that comes from an actual carry, via setcc, we use the
// carry directly.
static SDValue combineCarryThroughADD(SDValue EFLAGS) {
  if (EFLAGS.getOpcode() == X86ISD::ADD) {
    if (isAllOnesConstant(EFLAGS.getOperand(1))) {
      SDValue Carry = EFLAGS.getOperand(0);
      while (Carry.getOpcode() == ISD::TRUNCATE ||
             Carry.getOpcode() == ISD::ZERO_EXTEND ||
             Carry.getOpcode() == ISD::SIGN_EXTEND ||
             Carry.getOpcode() == ISD::ANY_EXTEND ||
             (Carry.getOpcode() == ISD::AND &&
              isOneConstant(Carry.getOperand(1))))
        Carry = Carry.getOperand(0);
      if (Carry.getOpcode() == X86ISD::SETCC ||
          Carry.getOpcode() == X86ISD::SETCC_CARRY) {
        if (Carry.getConstantOperandVal(0) == X86::COND_B)
          return Carry.getOperand(1);
      }
    }
  }

  return SDValue();
}
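
A scalar note on why this works (hypothetical snippet, not in the original source): legalized carries are materialized as "add X, -1", and that add carries out of the register exactly when the boolean X is non-zero, which is the same bit a preceding SETB already produced.

#include <cstdint>

static bool carryOfAddAllOnes(uint32_t BoolX) {
  // BoolX is 0 or 1; BoolX + 0xFFFFFFFF carries out of 32 bits iff BoolX != 0.
  uint64_t Wide = uint64_t(BoolX) + 0xFFFFFFFFull;
  return (Wide >> 32) != 0;
}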

/// Optimize an EFLAGS definition used according to the condition code \p CC
/// into a simpler EFLAGS value, potentially returning a new \p CC and replacing
/// uses of chain values.
static SDValue combineSetCCEFLAGS(SDValue EFLAGS, X86::CondCode &CC,
                                  SelectionDAG &DAG) {
  if (CC == X86::COND_B)
    if (SDValue Flags = combineCarryThroughADD(EFLAGS))
      return Flags;

  if (SDValue R = checkBoolTestSetCCCombine(EFLAGS, CC))
    return R;
  return combineSetCCAtomicArith(EFLAGS, CC, DAG);
}
31004
31005
/// Optimize X86ISD::CMOV [LHS, RHS, CONDCODE (e.g. X86::COND_NE), CONDVAL]
31006
static SDValue combineCMov(SDNode *N, SelectionDAG &DAG,
31007
                           TargetLowering::DAGCombinerInfo &DCI,
31008
6.38k
                           const X86Subtarget &Subtarget) {
31009
6.38k
  SDLoc DL(N);
31010
6.38k
31011
6.38k
  // If the flag operand isn't dead, don't touch this CMOV.
31012
6.38k
  if (
N->getNumValues() == 2 && 6.38k
!SDValue(N, 1).use_empty()5.20k
)
31013
0
    return SDValue();
31014
6.38k
31015
6.38k
  SDValue FalseOp = N->getOperand(0);
31016
6.38k
  SDValue TrueOp = N->getOperand(1);
31017
6.38k
  X86::CondCode CC = (X86::CondCode)N->getConstantOperandVal(2);
31018
6.38k
  SDValue Cond = N->getOperand(3);
31019
6.38k
31020
6.38k
  if (
CC == X86::COND_E || 6.38k
CC == X86::COND_NE5.29k
) {
31021
3.24k
    switch (Cond.getOpcode()) {
31022
3.23k
    default: break;
31023
12
    case X86ISD::BSR:
31024
12
    case X86ISD::BSF:
31025
12
      // If operand of BSR / BSF are proven never zero, then ZF cannot be set.
31026
12
      if (DAG.isKnownNeverZero(Cond.getOperand(0)))
31027
0
        
return (CC == X86::COND_E) ? 0
FalseOp0
:
TrueOp0
;
31028
6.38k
    }
31029
6.38k
  }
31030
6.38k
31031
6.38k
  // Try to simplify the EFLAGS and condition code operands.
31032
6.38k
  // We can't always do this as FCMOV only supports a subset of X86 cond.
31033
6.38k
  
if (SDValue 6.38k
Flags6.38k
= combineSetCCEFLAGS(Cond, CC, DAG)) {
31034
212
    if (
FalseOp.getValueType() != MVT::f80 || 212
hasFPCMov(CC)16
) {
31035
196
      SDValue Ops[] = {FalseOp, TrueOp, DAG.getConstant(CC, DL, MVT::i8),
31036
196
        Flags};
31037
196
      return DAG.getNode(X86ISD::CMOV, DL, N->getVTList(), Ops);
31038
196
    }
31039
6.19k
  }
31040
6.19k
31041
6.19k
  // If this is a select between two integer constants, try to do some
31042
6.19k
  // optimizations.  Note that the operands are ordered the opposite of SELECT
31043
6.19k
  // operands.
31044
6.19k
  
if (ConstantSDNode *6.19k
TrueC6.19k
= dyn_cast<ConstantSDNode>(TrueOp)) {
31045
1.86k
    if (ConstantSDNode *
FalseC1.86k
= dyn_cast<ConstantSDNode>(FalseOp)) {
31046
1.00k
      // Canonicalize the TrueC/FalseC values so that TrueC (the true value) is
31047
1.00k
      // larger than FalseC (the false value).
31048
1.00k
      if (
TrueC->getAPIntValue().ult(FalseC->getAPIntValue())1.00k
) {
31049
280
        CC = X86::GetOppositeBranchCondition(CC);
31050
280
        std::swap(TrueC, FalseC);
31051
280
        std::swap(TrueOp, FalseOp);
31052
280
      }
31053
1.00k
31054
1.00k
      // Optimize C ? 8 : 0 -> zext(setcc(C)) << 3.  Likewise for any pow2/0.
31055
1.00k
      // This is efficient for any integer data type (including i8/i16) and
31056
1.00k
      // shift amount.
31057
1.00k
      if (
FalseC->getAPIntValue() == 0 && 1.00k
TrueC->getAPIntValue().isPowerOf2()918
) {
31058
203
        Cond = getSETCC(CC, Cond, DL, DAG);
31059
203
31060
203
        // Zero extend the condition if needed.
31061
203
        Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, TrueC->getValueType(0), Cond);
31062
203
31063
203
        unsigned ShAmt = TrueC->getAPIntValue().logBase2();
31064
203
        Cond = DAG.getNode(ISD::SHL, DL, Cond.getValueType(), Cond,
31065
203
                           DAG.getConstant(ShAmt, DL, MVT::i8));
31066
203
        if (N->getNumValues() == 2)  // Dead flag value?
31067
202
          return DCI.CombineTo(N, Cond, SDValue());
31068
1
        return Cond;
31069
1
      }
31070
797
31071
797
      // Optimize Cond ? cst+1 : cst -> zext(setcc(C)+cst.  This is efficient
31072
797
      // for any integer data type, including i8/i16.
31073
797
      
if (797
FalseC->getAPIntValue()+1 == TrueC->getAPIntValue()797
) {
31074
0
        Cond = getSETCC(CC, Cond, DL, DAG);
31075
0
31076
0
        // Zero extend the condition if needed.
31077
0
        Cond = DAG.getNode(ISD::ZERO_EXTEND, DL,
31078
0
                           FalseC->getValueType(0), Cond);
31079
0
        Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
31080
0
                           SDValue(FalseC, 0));
31081
0
31082
0
        if (N->getNumValues() == 2)  // Dead flag value?
31083
0
          return DCI.CombineTo(N, Cond, SDValue());
31084
0
        return Cond;
31085
0
      }
31086
797
31087
797
      // Optimize cases that will turn into an LEA instruction.  This requires
31088
797
      // an i32 or i64 and an efficient multiplier (1, 2, 3, 4, 5, 8, 9).
31089
797
      
if (797
N->getValueType(0) == MVT::i32 || 797
N->getValueType(0) == MVT::i64424
) {
31090
735
        uint64_t Diff = TrueC->getZExtValue()-FalseC->getZExtValue();
31091
735
        if (
N->getValueType(0) == MVT::i32735
)
Diff = (unsigned)Diff373
;
31092
735
31093
735
        bool isFastMultiplier = false;
31094
735
        if (
Diff < 10735
) {
31095
75
          switch ((unsigned char)Diff) {
31096
75
          default: break;
31097
0
          case 1:  // result = add base, cond
31098
0
          case 2:  // result = lea base(    , cond*2)
31099
0
          case 3:  // result = lea base(cond, cond*2)
31100
0
          case 4:  // result = lea base(    , cond*4)
31101
0
          case 5:  // result = lea base(cond, cond*4)
31102
0
          case 8:  // result = lea base(    , cond*8)
31103
0
          case 9:  // result = lea base(cond, cond*8)
31104
0
            isFastMultiplier = true;
31105
0
            break;
31106
735
          }
31107
735
        }
31108
735
31109
735
        
if (735
isFastMultiplier735
) {
31110
0
          APInt Diff = TrueC->getAPIntValue()-FalseC->getAPIntValue();
31111
0
          Cond = getSETCC(CC, Cond, DL ,DAG);
31112
0
          // Zero extend the condition if needed.
31113
0
          Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, FalseC->getValueType(0),
31114
0
                             Cond);
31115
0
          // Scale the condition by the difference.
31116
0
          if (Diff != 1)
31117
0
            Cond = DAG.getNode(ISD::MUL, DL, Cond.getValueType(), Cond,
31118
0
                               DAG.getConstant(Diff, DL, Cond.getValueType()));
31119
0
31120
0
          // Add the base if non-zero.
31121
0
          if (FalseC->getAPIntValue() != 0)
31122
0
            Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
31123
0
                               SDValue(FalseC, 0));
31124
0
          if (N->getNumValues() == 2)  // Dead flag value?
31125
0
            return DCI.CombineTo(N, Cond, SDValue());
31126
0
          return Cond;
31127
0
        }
31128
735
      }
31129
1.00k
    }
31130
1.86k
  }
31131
5.98k
31132
5.98k
  // Handle these cases:
31133
5.98k
  //   (select (x != c), e, c) -> select (x != c), e, x),
31134
5.98k
  //   (select (x == c), c, e) -> select (x == c), x, e)
31135
5.98k
  // where the c is an integer constant, and the "select" is the combination
31136
5.98k
  // of CMOV and CMP.
31137
5.98k
  //
31138
5.98k
  // The rationale for this change is that the conditional-move from a constant
31139
5.98k
  // needs two instructions, however, conditional-move from a register needs
31140
5.98k
  // only one instruction.
31141
5.98k
  //
31142
5.98k
  // CAVEAT: By replacing a constant with a symbolic value, it may obscure
31143
5.98k
  //  some instruction-combining opportunities. This opt needs to be
31144
5.98k
  //  postponed as late as possible.
31145
5.98k
  //
31146
5.98k
  
if (5.98k
!DCI.isBeforeLegalize() && 5.98k
!DCI.isBeforeLegalizeOps()5.98k
) {
31147
5.98k
    // the DCI.xxxx conditions are provided to postpone the optimization as
31148
5.98k
    // late as possible.
31149
5.98k
31150
5.98k
    ConstantSDNode *CmpAgainst = nullptr;
31151
5.98k
    if (
(Cond.getOpcode() == X86ISD::CMP || 5.98k
Cond.getOpcode() == X86ISD::SUB2.59k
) &&
31152
5.46k
        (CmpAgainst = dyn_cast<ConstantSDNode>(Cond.getOperand(1))) &&
31153
5.98k
        
!isa<ConstantSDNode>(Cond.getOperand(0))3.61k
) {
31154
3.61k
31155
3.61k
      if (CC == X86::COND_NE &&
31156
3.61k
          
CmpAgainst == dyn_cast<ConstantSDNode>(FalseOp)1.84k
) {
31157
3
        CC = X86::GetOppositeBranchCondition(CC);
31158
3
        std::swap(TrueOp, FalseOp);
31159
3
      }
31160
3.61k
31161
3.61k
      if (CC == X86::COND_E &&
31162
3.61k
          
CmpAgainst == dyn_cast<ConstantSDNode>(TrueOp)937
) {
31163
43
        SDValue Ops[] = { FalseOp, Cond.getOperand(0),
31164
43
                          DAG.getConstant(CC, DL, MVT::i8), Cond };
31165
43
        return DAG.getNode(X86ISD::CMOV, DL, N->getVTList (), Ops);
31166
43
      }
31167
5.94k
    }
31168
5.98k
  }
31169
5.94k
31170
5.94k
  // Fold and/or of setcc's to double CMOV:
31171
5.94k
  //   (CMOV F, T, ((cc1 | cc2) != 0)) -> (CMOV (CMOV F, T, cc1), T, cc2)
31172
5.94k
  //   (CMOV F, T, ((cc1 & cc2) != 0)) -> (CMOV (CMOV T, F, !cc1), F, !cc2)
31173
5.94k
  //
31174
5.94k
  // This combine lets us generate:
31175
5.94k
  //   cmovcc1 (jcc1 if we don't have CMOV)
31176
5.94k
  //   cmovcc2 (same)
31177
5.94k
  // instead of:
31178
5.94k
  //   setcc1
31179
5.94k
  //   setcc2
31180
5.94k
  //   and/or
31181
5.94k
  //   cmovne (jne if we don't have CMOV)
31182
5.94k
  // When we can't use the CMOV instruction, it might increase branch
31183
5.94k
  // mispredicts.
31184
5.94k
  // When we can use CMOV, or when there is no mispredict, this improves
31185
5.94k
  // throughput and reduces register pressure.
31186
5.94k
  //
31187
5.94k
  
if (5.94k
CC == X86::COND_NE5.94k
) {
31188
1.91k
    SDValue Flags;
31189
1.91k
    X86::CondCode CC0, CC1;
31190
1.91k
    bool isAndSetCC;
31191
1.91k
    if (
checkBoolTestAndOrSetCCCombine(Cond, CC0, CC1, Flags, isAndSetCC)1.91k
) {
31192
21
      if (
isAndSetCC21
) {
31193
15
        std::swap(FalseOp, TrueOp);
31194
15
        CC0 = X86::GetOppositeBranchCondition(CC0);
31195
15
        CC1 = X86::GetOppositeBranchCondition(CC1);
31196
15
      }
31197
21
31198
21
      SDValue LOps[] = {FalseOp, TrueOp, DAG.getConstant(CC0, DL, MVT::i8),
31199
21
        Flags};
31200
21
      SDValue LCMOV = DAG.getNode(X86ISD::CMOV, DL, N->getVTList(), LOps);
31201
21
      SDValue Ops[] = {LCMOV, TrueOp, DAG.getConstant(CC1, DL, MVT::i8), Flags};
31202
21
      SDValue CMOV = DAG.getNode(X86ISD::CMOV, DL, N->getVTList(), Ops);
31203
21
      DAG.ReplaceAllUsesOfValueWith(SDValue(N, 1), SDValue(CMOV.getNode(), 1));
31204
21
      return CMOV;
31205
21
    }
31206
5.92k
  }
31207
5.92k
31208
5.92k
  return SDValue();
31209
5.92k
}
31210
31211
/// Different mul shrinking modes.
31212
enum ShrinkMode { MULS8, MULU8, MULS16, MULU16 };
31213
31214
201
static bool canReduceVMulWidth(SDNode *N, SelectionDAG &DAG, ShrinkMode &Mode) {
31215
201
  EVT VT = N->getOperand(0).getValueType();
31216
201
  if (VT.getScalarSizeInBits() != 32)
31217
119
    return false;
31218
82
31219
201
  assert(N->getNumOperands() == 2 && "NumOperands of Mul are 2");
31220
82
  unsigned SignBits[2] = {1, 1};
31221
82
  bool IsPositive[2] = {false, false};
31222
242
  for (unsigned i = 0; 
i < 2242
;
i++160
) {
31223
162
    SDValue Opd = N->getOperand(i);
31224
162
31225
162
    // DAG.ComputeNumSignBits return 1 for ISD::ANY_EXTEND, so we need to
31226
162
    // compute signbits for it separately.
31227
162
    if (
Opd.getOpcode() == ISD::ANY_EXTEND162
) {
31228
0
      // For anyextend, it is safe to assume an appropriate number of leading
31229
0
      // sign/zero bits.
31230
0
      if (Opd.getOperand(0).getValueType().getVectorElementType() == MVT::i8)
31231
0
        SignBits[i] = 25;
31232
0
      else if (Opd.getOperand(0).getValueType().getVectorElementType() ==
31233
0
               MVT::i16)
31234
0
        SignBits[i] = 17;
31235
0
      else
31236
0
        return false;
31237
0
      IsPositive[i] = true;
31238
162
    } else if (Opd.getOpcode() == ISD::BUILD_VECTOR) {
31239
28
      // All the operands of BUILD_VECTOR need to be int constant.
31240
28
      // Find the smallest value range which all the operands belong to.
31241
28
      SignBits[i] = 32;
31242
28
      IsPositive[i] = true;
31243
96
      for (const SDValue &SubOp : Opd.getNode()->op_values()) {
31244
96
        if (SubOp.isUndef())
31245
4
          continue;
31246
92
        auto *CN = dyn_cast<ConstantSDNode>(SubOp);
31247
92
        if (!CN)
31248
2
          return false;
31249
90
        APInt IntVal = CN->getAPIntValue();
31250
90
        if (IntVal.isNegative())
31251
9
          IsPositive[i] = false;
31252
96
        SignBits[i] = std::min(SignBits[i], IntVal.getNumSignBits());
31253
96
      }
31254
162
    } else {
31255
134
      SignBits[i] = DAG.ComputeNumSignBits(Opd);
31256
134
      if (Opd.getOpcode() == ISD::ZERO_EXTEND)
31257
39
        IsPositive[i] = true;
31258
162
    }
31259
162
  }
31260
82
31261
80
  
  bool AllPositive = IsPositive[0] && IsPositive[1];
31262
80
  unsigned MinSignBits = std::min(SignBits[0], SignBits[1]);
31263
80
  // When ranges are from -128 ~ 127, use MULS8 mode.
31264
80
  if (MinSignBits >= 25)
31265
9
    Mode = MULS8;
31266
80
  // When ranges are from 0 ~ 255, use MULU8 mode.
31267
71
  else if (AllPositive && MinSignBits >= 24)
31268
5
    Mode = MULU8;
31269
71
  // When ranges are from -32768 ~ 32767, use MULS16 mode.
31270
66
  else if (MinSignBits >= 17)
31271
20
    Mode = MULS16;
31272
66
  // When ranges are from 0 ~ 65535, use MULU16 mode.
31273
46
  else if (AllPositive && MinSignBits >= 16)
31274
10
    Mode = MULU16;
31275
46
  else
31276
36
    return false;
31277
44
  return true;
31278
44
}
31279
31280
/// When the operands of vector mul are extended from smaller size values,
31281
/// like i8 and i16, the type of mul may be shrunk to generate more
31282
/// efficient code. Two typical patterns are handled:
31283
/// Pattern1:
31284
///     %2 = sext/zext <N x i8> %1 to <N x i32>
31285
///     %4 = sext/zext <N x i8> %3 to <N x i32>
31286
///  or %4 = build_vector <N x i32> %C1, ..., %CN (%C1..%CN are constants)
31287
///     %5 = mul <N x i32> %2, %4
31288
///
31289
/// Pattern2:
31290
///     %2 = zext/sext <N x i16> %1 to <N x i32>
31291
///     %4 = zext/sext <N x i16> %3 to <N x i32>
31292
///  or %4 = build_vector <N x i32> %C1, ..., %CN (%C1..%CN are constants)
31293
///     %5 = mul <N x i32> %2, %4
31294
///
31295
/// There are four mul shrinking modes:
31296
/// If %2 == sext32(trunc8(%2)), i.e., the scalar value range of %2 is
31297
/// -128 to 127, and the scalar value range of %4 is also -128 to 127,
31298
/// generate pmullw+sext32 for it (MULS8 mode).
31299
/// If %2 == zext32(trunc8(%2)), i.e., the scalar value range of %2 is
31300
/// 0 to 255, and the scalar value range of %4 is also 0 to 255,
31301
/// generate pmullw+zext32 for it (MULU8 mode).
31302
/// If %2 == sext32(trunc16(%2)), i.e., the scalar value range of %2 is
31303
/// -32768 to 32767, and the scalar value range of %4 is also -32768 to 32767,
31304
/// generate pmullw+pmulhw for it (MULS16 mode).
31305
/// If %2 == zext32(trunc16(%2)), i.e., the scalar value range of %2 is
31306
/// 0 to 65535, and the scalar value range of %4 is also 0 to 65535,
31307
/// generate pmullw+pmulhuw for it (MULU16 mode).
31308
static SDValue reduceVMULWidth(SDNode *N, SelectionDAG &DAG,
31309
841
                               const X86Subtarget &Subtarget) {
31310
841
  // Check for legality
31311
841
  // pmullw/pmulhw are not supported by SSE.
31312
841
  if (!Subtarget.hasSSE2())
31313
1
    return SDValue();
31314
840
31315
840
  // Check for profitability
31316
840
  // pmulld is supported since SSE41. It is better to use pmulld
31317
840
  // instead of pmullw+pmulhw, except for subtargets where pmulld is slower than
31318
840
  // the expansion.
31319
840
  bool OptForMinSize = DAG.getMachineFunction().getFunction()->optForMinSize();
31320
840
  if (Subtarget.hasSSE41() && (OptForMinSize || !Subtarget.isPMULLDSlow()))
31321
657
    return SDValue();
31322
183
31323
183
  ShrinkMode Mode;
31324
183
  if (!canReduceVMulWidth(N, DAG, Mode))
31325
151
    return SDValue();
31326
32
31327
32
  SDLoc DL(N);
31328
32
  SDValue N0 = N->getOperand(0);
31329
32
  SDValue N1 = N->getOperand(1);
31330
32
  EVT VT = N->getOperand(0).getValueType();
31331
32
  unsigned RegSize = 128;
31332
32
  MVT OpsVT = MVT::getVectorVT(MVT::i16, RegSize / 16);
31333
32
  EVT ReducedVT =
31334
32
      EVT::getVectorVT(*DAG.getContext(), MVT::i16, VT.getVectorNumElements());
31335
32
  // Shrink the operands of mul.
31336
32
  SDValue NewN0 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, N0);
31337
32
  SDValue NewN1 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, N1);
31338
32
31339
32
  if (VT.getVectorNumElements() >= OpsVT.getVectorNumElements()) {
31340
7
    // Generate the lower part of mul: pmullw. For MULU8/MULS8, only the
31341
7
    // lower part is needed.
31342
7
    SDValue MulLo = DAG.getNode(ISD::MUL, DL, ReducedVT, NewN0, NewN1);
31343
7
    if (Mode == MULU8 || Mode == MULS8) {
31344
3
      return DAG.getNode((Mode == MULU8) ? ISD::ZERO_EXTEND : ISD::SIGN_EXTEND,
31345
3
                         DL, VT, MulLo);
31346
0
    } else {
31347
4
      MVT ResVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() / 2);
31348
4
      // Generate the higher part of mul: pmulhw/pmulhuw. For MULU16/MULS16,
31349
4
      // the higher part is also needed.
31350
4
      SDValue MulHi = DAG.getNode(Mode == MULS16 ? ISD::MULHS : ISD::MULHU, DL,
31351
4
                                  ReducedVT, NewN0, NewN1);
31352
4
31353
4
      // Repack the lower part and higher part result of mul into a wider
31354
4
      // result.
31355
4
      // Generate shuffle functioning as punpcklwd.
31356
4
      SmallVector<int, 16> ShuffleMask(VT.getVectorNumElements());
31357
28
      for (unsigned i = 0; i < VT.getVectorNumElements() / 2; i++) {
31358
24
        ShuffleMask[2 * i] = i;
31359
24
        ShuffleMask[2 * i + 1] = i + VT.getVectorNumElements();
31360
24
      }
31361
4
      SDValue ResLo =
31362
4
          DAG.getVectorShuffle(ReducedVT, DL, MulLo, MulHi, ShuffleMask);
31363
4
      ResLo = DAG.getNode(ISD::BITCAST, DL, ResVT, ResLo);
31364
4
      // Generate shuffle functioning as punpckhwd.
31365
28
      for (unsigned i = 0; i < VT.getVectorNumElements() / 2; i++) {
31366
24
        ShuffleMask[2 * i] = i + VT.getVectorNumElements() / 2;
31367
24
        ShuffleMask[2 * i + 1] = i + VT.getVectorNumElements() * 3 / 2;
31368
24
      }
31369
4
      SDValue ResHi =
31370
4
          DAG.getVectorShuffle(ReducedVT, DL, MulLo, MulHi, ShuffleMask);
31371
4
      ResHi = DAG.getNode(ISD::BITCAST, DL, ResVT, ResHi);
31372
4
      return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, ResLo, ResHi);
31373
4
    }
31374
25
  } else {
31375
25
    // When VT.getVectorNumElements() < OpsVT.getVectorNumElements(), we want
31376
25
    // to legalize the mul explicitly because implicit legalization for type
31377
25
    // <4 x i16> to <4 x i32> sometimes involves unnecessary unpack
31378
25
    // instructions which will not exist when we explicitly legalize it by
31379
25
    // extending <4 x i16> to <8 x i16> (concatenating the <4 x i16> val with
31380
25
    // <4 x i16> undef).
31381
25
    //
31382
25
    // Legalize the operands of mul.
31383
25
    // FIXME: We may be able to handle non-concatenated vectors by insertion.
31384
25
    unsigned ReducedSizeInBits = ReducedVT.getSizeInBits();
31385
25
    if ((RegSize % ReducedSizeInBits) != 0)
31386
6
      return SDValue();
31387
19
31388
19
    SmallVector<SDValue, 16> Ops(RegSize / ReducedSizeInBits,
31389
19
                                 DAG.getUNDEF(ReducedVT));
31390
19
    Ops[0] = NewN0;
31391
19
    NewN0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, OpsVT, Ops);
31392
19
    Ops[0] = NewN1;
31393
19
    NewN1 = DAG.getNode(ISD::CONCAT_VECTORS, DL, OpsVT, Ops);
31394
19
31395
19
    if (Mode == MULU8 || Mode == MULS8) {
31396
5
      // Generate lower part of mul: pmullw. For MULU8/MULS8, only the lower
31397
5
      // part is needed.
31398
5
      SDValue Mul = DAG.getNode(ISD::MUL, DL, OpsVT, NewN0, NewN1);
31399
5
31400
5
      // convert the type of mul result to VT.
31401
5
      MVT ResVT = MVT::getVectorVT(MVT::i32, RegSize / 32);
31402
3
      SDValue Res = DAG.getNode(Mode == MULU8 ? ISD::ZERO_EXTEND_VECTOR_INREG
31403
2
                                              : ISD::SIGN_EXTEND_VECTOR_INREG,
31404
5
                                DL, ResVT, Mul);
31405
5
      return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
31406
5
                         DAG.getIntPtrConstant(0, DL));
31407
0
    } else {
31408
14
      // Generate the lower and higher part of mul: pmulhw/pmulhuw. For
31409
14
      // MULU16/MULS16, both parts are needed.
31410
14
      SDValue MulLo = DAG.getNode(ISD::MUL, DL, OpsVT, NewN0, NewN1);
31411
14
      SDValue MulHi = DAG.getNode(Mode == MULS16 ? ISD::MULHS : ISD::MULHU, DL,
31412
14
                                  OpsVT, NewN0, NewN1);
31413
14
31414
14
      // Repack the lower part and higher part result of mul into a wider
31415
14
      // result. Make sure the type of mul result is VT.
31416
14
      MVT ResVT = MVT::getVectorVT(MVT::i32, RegSize / 32);
31417
14
      SDValue Res = getUnpackl(DAG, DL, OpsVT, MulLo, MulHi);
31418
14
      Res = DAG.getNode(ISD::BITCAST, DL, ResVT, Res);
31419
14
      return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
31420
14
                         DAG.getIntPtrConstant(0, DL));
31421
14
    }
31422
0
  }
31423
841
}
31424
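The four shrinking modes above can be sanity-checked with a scalar model. The following sketch is illustrative only (it is not part of X86ISelLowering.cpp, and the helper name mulu16_lane is made up): it mimics one lane of the MULU16 path, where pmullw and pmulhuw supply the low and high 16 bits of each product and the unpack step recombines them into the full 32-bit result.

#include <cassert>
#include <cstdint>

// Scalar model of one MULU16 lane: both operands' value ranges fit in 16 bits.
static uint32_t mulu16_lane(uint32_t a, uint32_t b) {
  uint16_t la = static_cast<uint16_t>(a), lb = static_cast<uint16_t>(b);
  uint32_t full = static_cast<uint32_t>(la) * lb;
  uint16_t mullo = static_cast<uint16_t>(full);         // what pmullw keeps
  uint16_t mulhi = static_cast<uint16_t>(full >> 16);   // what pmulhuw keeps
  return static_cast<uint32_t>(mullo) |
         (static_cast<uint32_t>(mulhi) << 16);          // punpcklwd-style repack
}

int main() {
  assert(mulu16_lane(65535, 65535) == 65535u * 65535u);
  assert(mulu16_lane(1234, 40000) == 1234u * 40000u);
  return 0;
}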
31425
static SDValue combineMulSpecial(uint64_t MulAmt, SDNode *N, SelectionDAG &DAG,
31426
1.74k
                                 EVT VT, SDLoc DL) {
31427
1.74k
31428
75
  auto combineMulShlAddOrSub = [&](int Mult, int Shift, bool isAdd) {
31429
75
    SDValue Result = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
31430
75
                                 DAG.getConstant(Mult, DL, VT));
31431
75
    Result = DAG.getNode(ISD::SHL, DL, VT, Result,
31432
75
                         DAG.getConstant(Shift, DL, MVT::i8));
31433
75
    Result = DAG.getNode(isAdd ? ISD::ADD : ISD::SUB, DL, VT, Result,
31434
75
                         N->getOperand(0));
31435
75
    return Result;
31436
75
  };
31437
1.74k
31438
32
  auto combineMulMulAddOrSub = [&](bool isAdd) {
31439
32
    SDValue Result = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
31440
32
                                 DAG.getConstant(9, DL, VT));
31441
32
    Result = DAG.getNode(ISD::MUL, DL, VT, Result, DAG.getConstant(3, DL, VT));
31442
32
    Result = DAG.getNode(isAdd ? ISD::ADD : ISD::SUB, DL, VT, Result,
31443
32
                         N->getOperand(0));
31444
32
    return Result;
31445
32
  };
31446
1.74k
31447
1.74k
  switch (MulAmt) {
31448
1.63k
  default:
31449
1.63k
    break;
31450
12
  case 11:
31451
12
    // mul x, 11 => add ((shl (mul x, 5), 1), x)
31452
12
    return combineMulShlAddOrSub(5, 1, /*isAdd*/ true);
31453
10
  case 21:
31454
10
    // mul x, 21 => add ((shl (mul x, 5), 2), x)
31455
10
    return combineMulShlAddOrSub(5, 2, /*isAdd*/ true);
31456
10
  case 22:
31457
10
    // mul x, 22 => add (add ((shl (mul x, 5), 2), x), x)
31458
10
    return DAG.getNode(ISD::ADD, DL, VT, N->getOperand(0),
31459
10
                       combineMulShlAddOrSub(5, 2, /*isAdd*/ true));
31460
11
  case 19:
31461
11
    // mul x, 19 => sub ((shl (mul x, 5), 2), x)
31462
11
    return combineMulShlAddOrSub(5, 2, /*isAdd*/ false);
31463
11
  case 13:
31464
11
    // mul x, 13 => add ((shl (mul x, 3), 2), x)
31465
11
    return combineMulShlAddOrSub(3, 2, /*isAdd*/ true);
31466
10
  case 23:
31467
10
    // mul x, 23 => sub ((shl (mul x, 3), 3), x)
31468
10
    return combineMulShlAddOrSub(3, 3, /*isAdd*/ false);
31469
11
  case 14:
31470
11
    // mul x, 14 => add (add ((shl (mul x, 3), 2), x), x)
31471
11
    return DAG.getNode(ISD::ADD, DL, VT, N->getOperand(0),
31472
11
                       combineMulShlAddOrSub(3, 2, /*isAdd*/ true));
31473
10
  case 26:
31474
10
    // mul x, 26 => sub ((mul (mul x, 9), 3), x)
31475
10
    return combineMulMulAddOrSub(/*isAdd*/ false);
31476
12
  case 28:
31477
12
    // mul x, 28 => add ((mul (mul x, 9), 3), x)
31478
12
    return combineMulMulAddOrSub(/*isAdd*/ true);
31479
10
  case 29:
31480
10
    // mul x, 29 => add (add ((mul (mul x, 9), 3), x), x)
31481
10
    return DAG.getNode(ISD::ADD, DL, VT, N->getOperand(0),
31482
10
                       combineMulMulAddOrSub(/*isAdd*/ true));
31483
10
  case 30:
31484
10
    // mul x, 30 => sub (sub ((shl x, 5), x), x)
31485
10
    return DAG.getNode(
31486
10
        ISD::SUB, DL, VT,
31487
10
        DAG.getNode(ISD::SUB, DL, VT,
31488
10
                    DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
31489
10
                                DAG.getConstant(5, DL, MVT::i8)),
31490
10
                    N->getOperand(0)),
31491
10
        N->getOperand(0));
31492
1.63k
  }
31493
1.63k
  return SDValue();
31494
1.63k
}
31495
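A quick scalar check of a few of the constant-multiply decompositions listed above. This is an illustrative sketch only, not part of the file; the loop bound is arbitrary.

#include <cassert>
#include <cstdint>

int main() {
  for (uint64_t x = 0; x < 1000; ++x) {
    assert(((x * 5) << 1) + x == x * 11);        // case 11: add ((shl (mul x, 5), 1), x)
    assert(((x * 3) << 3) - x == x * 23);        // case 23: sub ((shl (mul x, 3), 3), x)
    assert((x * 9) * 3 + x + x == x * 29);       // case 29: add (add ((mul (mul x, 9), 3), x), x)
  }
  return 0;
}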
31496
/// Optimize a single multiply with constant into two operations in order to
31497
/// implement it with two cheaper instructions, e.g. LEA + SHL, LEA + LEA.
31498
static SDValue combineMul(SDNode *N, SelectionDAG &DAG,
31499
                          TargetLowering::DAGCombinerInfo &DCI,
31500
10.5k
                          const X86Subtarget &Subtarget) {
31501
10.5k
  EVT VT = N->getValueType(0);
31502
10.5k
  if (DCI.isBeforeLegalize() && VT.isVector())
31503
841
    return reduceVMULWidth(N, DAG, Subtarget);
31504
9.74k
31505
9.74k
  
  if (!MulConstantOptimization)
31506
525
    return SDValue();
31507
9.22k
  // An imul is usually smaller than the alternative sequence.
31508
9.22k
  
  if (DAG.getMachineFunction().getFunction()->optForMinSize())
31509
414
    return SDValue();
31510
8.80k
31511
8.80k
  
  if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
31512
2.62k
    return SDValue();
31513
6.18k
31514
6.18k
  
  if (VT != MVT::i64 && VT != MVT::i32)
31515
2.41k
    return SDValue();
31516
3.77k
31517
3.77k
  ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(1));
31518
3.77k
  if (!C)
31519
1.29k
    return SDValue();
31520
2.47k
  uint64_t MulAmt = C->getZExtValue();
31521
2.47k
  if (isPowerOf2_64(MulAmt) || MulAmt == 3 || MulAmt == 5 || MulAmt == 9)
31522
176
    return SDValue();
31523
2.30k
31524
2.30k
  uint64_t MulAmt1 = 0;
31525
2.30k
  uint64_t MulAmt2 = 0;
31526
2.30k
  if ((MulAmt % 9) == 0) {
31527
180
    MulAmt1 = 9;
31528
180
    MulAmt2 = MulAmt / 9;
31529
2.30k
  } else if ((MulAmt % 5) == 0) {
31530
370
    MulAmt1 = 5;
31531
370
    MulAmt2 = MulAmt / 5;
31532
2.12k
  } else if ((MulAmt % 3) == 0) {
31533
795
    MulAmt1 = 3;
31534
795
    MulAmt2 = MulAmt / 3;
31535
795
  }
31536
2.30k
31537
2.30k
  SDLoc DL(N);
31538
2.30k
  SDValue NewMul;
31539
2.30k
  if (MulAmt2 &&
31540
2.30k
      
      (isPowerOf2_64(MulAmt2) || MulAmt2 == 3 || MulAmt2 == 5 || MulAmt2 == 9)){
31541
526
31542
526
    if (isPowerOf2_64(MulAmt2) &&
31543
487
        
        !(N->hasOneUse() && N->use_begin()->getOpcode() == ISD::ADD))
31544
526
      // If second multiplier is pow2, issue it first. We want the multiply by
31545
526
      // 3, 5, or 9 to be folded into the addressing mode unless the lone use
31546
526
      // is an add.
31547
229
      std::swap(MulAmt1, MulAmt2);
31548
526
31549
526
    if (isPowerOf2_64(MulAmt1))
31550
229
      NewMul = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
31551
229
                           DAG.getConstant(Log2_64(MulAmt1), DL, MVT::i8));
31552
526
    else
31553
297
      NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
31554
297
                           DAG.getConstant(MulAmt1, DL, VT));
31555
526
31556
526
    if (isPowerOf2_64(MulAmt2))
31557
258
      NewMul = DAG.getNode(ISD::SHL, DL, VT, NewMul,
31558
258
                           DAG.getConstant(Log2_64(MulAmt2), DL, MVT::i8));
31559
526
    else
31560
268
      NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, NewMul,
31561
268
                           DAG.getConstant(MulAmt2, DL, VT));
31562
2.30k
  } else if (!Subtarget.slowLEA())
31563
1.74k
    NewMul = combineMulSpecial(MulAmt, N, DAG, VT, DL);
31564
2.30k
31565
2.30k
  if (!NewMul) {
31566
1.65k
    assert(MulAmt != 0 &&
31567
1.65k
           MulAmt != (VT == MVT::i64 ? UINT64_MAX : UINT32_MAX) &&
31568
1.65k
           "Both cases that could cause potential overflows should have "
31569
1.65k
           "already been handled.");
31570
1.65k
    int64_t SignMulAmt = C->getSExtValue();
31571
1.65k
    if ((SignMulAmt != INT64_MIN) && (SignMulAmt != INT64_MAX) &&
31572
1.65k
        (SignMulAmt != -INT64_MAX)) {
31573
1.65k
      int NumSign = SignMulAmt > 0 ? 1 : -1;
31574
1.65k
      bool IsPowerOf2_64PlusOne = isPowerOf2_64(NumSign * SignMulAmt - 1);
31575
1.65k
      bool IsPowerOf2_64MinusOne = isPowerOf2_64(NumSign * SignMulAmt + 1);
31576
1.65k
      if (IsPowerOf2_64PlusOne) {
31577
37
        // (mul x, 2^N + 1) => (add (shl x, N), x)
31578
37
        NewMul = DAG.getNode(
31579
37
            ISD::ADD, DL, VT, N->getOperand(0),
31580
37
            DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
31581
37
                        DAG.getConstant(Log2_64(NumSign * SignMulAmt - 1), DL,
31582
37
                                        MVT::i8)));
31583
1.65k
      } else if (IsPowerOf2_64MinusOne) {
31584
133
        // (mul x, 2^N - 1) => (sub (shl x, N), x)
31585
133
        NewMul = DAG.getNode(
31586
133
            ISD::SUB, DL, VT,
31587
133
            DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
31588
133
                        DAG.getConstant(Log2_64(NumSign * SignMulAmt + 1), DL,
31589
133
                                        MVT::i8)),
31590
133
            N->getOperand(0));
31591
133
      }
31592
1.65k
      // To negate, subtract the number from zero
31593
1.65k
      if ((IsPowerOf2_64PlusOne || IsPowerOf2_64MinusOne) && NumSign == -1)
31594
13
        NewMul =
31595
13
            DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), NewMul);
31596
1.65k
    }
31597
1.65k
  }
31598
2.30k
31599
2.30k
  if (NewMul)
31600
2.30k
    // Do not add new nodes to DAG combiner worklist.
31601
813
    DCI.CombineTo(N, NewMul, false);
31602
10.5k
31603
10.5k
  return SDValue();
31604
10.5k
}
31605
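The 2^N +/- 1 fallback and the negate-by-subtracting-from-zero step above can be checked with plain integer arithmetic. Illustrative sketch only, not part of the file; the loop is restricted to non-negative values so the left shifts stay well defined.

#include <cassert>
#include <cstdint>

int main() {
  for (int64_t x = 0; x <= 100; ++x) {
    assert((x << 4) + x == x * 17);              // (mul x, 2^N + 1) => (add (shl x, N), x)
    assert((x << 5) - x == x * 31);              // (mul x, 2^N - 1) => (sub (shl x, N), x)
    assert(0 - ((x << 5) - x) == x * -31);       // negative amount: subtract the result from zero
  }
  return 0;
}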
31606
29.9k
static SDValue combineShiftLeft(SDNode *N, SelectionDAG &DAG) {
31607
29.9k
  SDValue N0 = N->getOperand(0);
31608
29.9k
  SDValue N1 = N->getOperand(1);
31609
29.9k
  ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
31610
29.9k
  EVT VT = N0.getValueType();
31611
29.9k
31612
29.9k
  // fold (shl (and (setcc_c), c1), c2) -> (and setcc_c, (c1 << c2))
31613
29.9k
  // since the result of setcc_c is all zero's or all ones.
31614
29.9k
  if (VT.isInteger() && !VT.isVector() &&
31615
29.9k
      N1C && N0.getOpcode() == ISD::AND &&
31616
29.9k
      N0.getOperand(1).getOpcode() == ISD::Constant) {
31617
1.07k
    SDValue N00 = N0.getOperand(0);
31618
1.07k
    APInt Mask = cast<ConstantSDNode>(N0.getOperand(1))->getAPIntValue();
31619
1.07k
    Mask <<= N1C->getAPIntValue();
31620
1.07k
    bool MaskOK = false;
31621
1.07k
    // We can handle cases concerning bit-widening nodes containing setcc_c if
31622
1.07k
    // we carefully interrogate the mask to make sure we are semantics
31623
1.07k
    // preserving.
31624
1.07k
    // The transform is not safe if the result of C1 << C2 exceeds the bitwidth
31625
1.07k
    // of the underlying setcc_c operation if the setcc_c was zero extended.
31626
1.07k
    // Consider the following example:
31627
1.07k
    //   zext(setcc_c)                 -> i32 0x0000FFFF
31628
1.07k
    //   c1                            -> i32 0x0000FFFF
31629
1.07k
    //   c2                            -> i32 0x00000001
31630
1.07k
    //   (shl (and (setcc_c), c1), c2) -> i32 0x0001FFFE
31631
1.07k
    //   (and setcc_c, (c1 << c2))     -> i32 0x0000FFFE
31632
1.07k
    if (N00.getOpcode() == X86ISD::SETCC_CARRY) {
31633
0
      MaskOK = true;
31634
1.07k
    } else if (N00.getOpcode() == ISD::SIGN_EXTEND &&
31635
1.07k
               N00.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) {
31636
0
      MaskOK = true;
31637
1.07k
    } else if ((N00.getOpcode() == ISD::ZERO_EXTEND ||
31638
1.07k
                N00.getOpcode() == ISD::ANY_EXTEND) &&
31639
1.07k
               N00.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) {
31640
0
      MaskOK = Mask.isIntN(N00.getOperand(0).getValueSizeInBits());
31641
0
    }
31642
1.07k
    if (MaskOK && Mask != 0) {
31643
0
      SDLoc DL(N);
31644
0
      return DAG.getNode(ISD::AND, DL, VT, N00, DAG.getConstant(Mask, DL, VT));
31645
0
    }
31646
29.9k
  }
31647
29.9k
31648
29.9k
  // Hardware support for vector shifts is sparse which makes us scalarize the
31649
29.9k
  // vector operations in many cases. Also, on sandybridge ADD is faster than
31650
29.9k
  // shl.
31651
29.9k
  // (shl V, 1) -> add V,V
31652
29.9k
  
  if (auto *N1BV = dyn_cast<BuildVectorSDNode>(N1))
31653
1.10k
    
    if (auto *N1SplatC = N1BV->getConstantSplatNode()) {
31654
695
      assert(N0.getValueType().isVector() && "Invalid vector shift type");
31655
695
      // We shift all of the values by one. In many cases we do not have
31656
695
      // hardware support for this operation. This is better expressed as an ADD
31657
695
      // of two values.
31658
695
      if (N1SplatC->getAPIntValue() == 1)
31659
29
        return DAG.getNode(ISD::ADD, SDLoc(N), VT, N0, N0);
31660
29.9k
    }
31661
29.9k
31662
29.9k
  return SDValue();
31663
29.9k
}
31664
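The worked example in the comment above can be reproduced directly. Illustrative sketch only, not part of the file: it shows the value mismatch that the SETCC_CARRY and mask-width checks are guarding against.

#include <cassert>
#include <cstdint>

int main() {
  // zext(setcc_c), c1, c2 taken from the worked example in the comment above.
  uint32_t setcc_c = 0x0000FFFFu;
  uint32_t c1 = 0x0000FFFFu, c2 = 1;
  uint32_t original  = (setcc_c & c1) << c2;   // 0x0001FFFE
  uint32_t rewritten = setcc_c & (c1 << c2);   // 0x0000FFFE - differs, so the
                                               // transform is rejected in this case
  assert(original == 0x0001FFFEu && rewritten == 0x0000FFFEu);
  assert(original != rewritten);
  return 0;
}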
31665
4.70k
static SDValue combineShiftRightArithmetic(SDNode *N, SelectionDAG &DAG) {
31666
4.70k
  SDValue N0 = N->getOperand(0);
31667
4.70k
  SDValue N1 = N->getOperand(1);
31668
4.70k
  EVT VT = N0.getValueType();
31669
4.70k
  unsigned Size = VT.getSizeInBits();
31670
4.70k
31671
4.70k
  // fold (ashr (shl, a, [56,48,32,24,16]), SarConst)
31672
4.70k
  // into (shl, (sext (a), [56,48,32,24,16] - SarConst)) or
31673
4.70k
  // into (lshr, (sext (a), SarConst - [56,48,32,24,16]))
31674
4.70k
  // depending on sign of (SarConst - [56,48,32,24,16])
31675
4.70k
31676
4.70k
  // sexts in X86 are MOVs. The MOVs have the same code size
31677
4.70k
  // as above SHIFTs (only SHIFT on 1 has lower code size).
31678
4.70k
  // However the MOVs have 2 advantages to a SHIFT:
31679
4.70k
  // 1. MOVs can write to a register that differs from source
31680
4.70k
  // 2. MOVs accept memory operands
31681
4.70k
31682
4.70k
  if (!VT.isInteger() || VT.isVector() || N1.getOpcode() != ISD::Constant ||
31683
4.70k
      N0.getOpcode() != ISD::SHL || !N0.hasOneUse() ||
31684
856
      N0.getOperand(1).getOpcode() != ISD::Constant)
31685
3.85k
    return SDValue();
31686
856
31687
856
  SDValue N00 = N0.getOperand(0);
31688
856
  SDValue N01 = N0.getOperand(1);
31689
856
  APInt ShlConst = (cast<ConstantSDNode>(N01))->getAPIntValue();
31690
856
  APInt SarConst = (cast<ConstantSDNode>(N1))->getAPIntValue();
31691
856
  EVT CVT = N1.getValueType();
31692
856
31693
856
  if (SarConst.isNegative())
31694
0
    return SDValue();
31695
856
31696
856
  
  for (MVT SVT : MVT::integer_valuetypes()) {
31697
4.93k
    unsigned ShiftSize = SVT.getSizeInBits();
31698
4.93k
    // skipping types without corresponding sext/zext and
31699
4.93k
    // ShlConst that is not one of [56,48,32,24,16]
31700
4.93k
    if (ShiftSize < 8 || ShiftSize > 64 || ShlConst != Size - ShiftSize)
31701
4.87k
      continue;
31702
62
    SDLoc DL(N);
31703
62
    SDValue NN =
31704
62
        DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, N00, DAG.getValueType(SVT));
31705
62
    SarConst = SarConst - (Size - ShiftSize);
31706
62
    if (SarConst == 0)
31707
0
      return NN;
31708
62
    else if (SarConst.isNegative())
31709
7
      return DAG.getNode(ISD::SHL, DL, VT, NN,
31710
7
                         DAG.getConstant(-SarConst, DL, CVT));
31711
62
    else
31712
55
      return DAG.getNode(ISD::SRA, DL, VT, NN,
31713
55
                         DAG.getConstant(SarConst, DL, CVT));
31714
794
  }
31715
794
  return SDValue();
31716
794
}
31717
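A scalar check of the (ashr (shl a, 56), SarConst) rewrite above. Illustrative sketch only, not part of the file: the shift helpers are hand-rolled, and the narrowing casts assume two's-complement wrapping (guaranteed since C++20).

#include <cassert>
#include <cstdint>

// Arithmetic and logical shifts modelled explicitly to avoid relying on
// implementation-defined behaviour of signed shifts.
static int64_t sra(int64_t v, unsigned amt) {
  return v < 0 ? ~(~v >> amt) : v >> amt;
}
static int64_t shl(int64_t v, unsigned amt) {
  return static_cast<int64_t>(static_cast<uint64_t>(v) << amt);  // two's-complement wrap
}

int main() {
  for (int64_t a = -300; a <= 300; ++a) {
    int64_t sext8 = static_cast<int8_t>(a);           // sign_extend_inreg from i8
    assert(sra(shl(a, 56), 58) == sra(sext8, 2));      // SarConst > 56 -> keep an SRA
    assert(sra(shl(a, 56), 54) == shl(sext8, 2));      // SarConst < 56 -> becomes a SHL
  }
  return 0;
}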
31718
24.5k
static SDValue combineShiftRightLogical(SDNode *N, SelectionDAG &DAG) {
31719
24.5k
  SDValue N0 = N->getOperand(0);
31720
24.5k
  SDValue N1 = N->getOperand(1);
31721
24.5k
  EVT VT = N0.getValueType();
31722
24.5k
31723
24.5k
  // Try to improve a sequence of srl (and X, C1), C2 by inverting the order.
31724
24.5k
  // TODO: This is a generic DAG combine that became an x86-only combine to
31725
24.5k
  // avoid shortcomings in other folds such as bswap, bit-test ('bt'), and
31726
24.5k
  // and-not ('andn').
31727
24.5k
  if (N0.getOpcode() != ISD::AND || !N0.hasOneUse())
31728
23.1k
    return SDValue();
31729
1.41k
31730
1.41k
  auto *ShiftC = dyn_cast<ConstantSDNode>(N1);
31731
1.41k
  auto *AndC = dyn_cast<ConstantSDNode>(N0.getOperand(1));
31732
1.41k
  if (!ShiftC || !AndC)
31733
60
    return SDValue();
31734
1.35k
31735
1.35k
  // If we can shrink the constant mask below 8-bits or 32-bits, then this
31736
1.35k
  // transform should reduce code size. It may also enable secondary transforms
31737
1.35k
  // from improved known-bits analysis or instruction selection.
31738
1.35k
  APInt MaskVal = AndC->getAPIntValue();
31739
1.35k
  APInt NewMaskVal = MaskVal.lshr(ShiftC->getAPIntValue());
31740
1.35k
  unsigned OldMaskSize = MaskVal.getMinSignedBits();
31741
1.35k
  unsigned NewMaskSize = NewMaskVal.getMinSignedBits();
31742
1.35k
  if ((OldMaskSize > 8 && NewMaskSize <= 8) ||
31743
1.35k
      (OldMaskSize > 32 && NewMaskSize <= 32)) {
31744
455
    // srl (and X, AndC), ShiftC --> and (srl X, ShiftC), (AndC >> ShiftC)
31745
455
    SDLoc DL(N);
31746
455
    SDValue NewMask = DAG.getConstant(NewMaskVal, DL, VT);
31747
455
    SDValue NewShift = DAG.getNode(ISD::SRL, DL, VT, N0.getOperand(0), N1);
31748
455
    return DAG.getNode(ISD::AND, DL, VT, NewShift, NewMask);
31749
455
  }
31750
900
  return SDValue();
31751
900
}
31752
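The reordering above is a plain bitwise identity and can be checked directly. Illustrative sketch only, not part of the file; the constants are chosen so the mask shrinks below 8 bits after the shift.

#include <cassert>
#include <cstdint>

int main() {
  const uint32_t C1 = 0x0000FF00u;   // mask that shrinks below 8 bits after the shift
  const uint32_t C2 = 8;
  for (uint32_t x = 0; x < 100000; x += 7) {
    // srl (and X, C1), C2 --> and (srl X, C2), (C1 >> C2)
    assert(((x & C1) >> C2) == ((x >> C2) & (C1 >> C2)));
  }
  return 0;
}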
31753
/// \brief Returns a vector of 0s if the node in input is a vector logical
31754
/// shift by a constant amount which is known to be bigger than or equal
31755
/// to the vector element size in bits.
31756
static SDValue performShiftToAllZeros(SDNode *N, SelectionDAG &DAG,
31757
54.0k
                                      const X86Subtarget &Subtarget) {
31758
54.0k
  EVT VT = N->getValueType(0);
31759
54.0k
31760
54.0k
  if (VT != MVT::v2i64 && VT != MVT::v4i32 && VT != MVT::v8i16 &&
31761
51.8k
      (!Subtarget.hasInt256() ||
31762
16.1k
       (VT != MVT::v4i64 && VT != MVT::v8i32 && VT != MVT::v16i16)))
31763
50.8k
    return SDValue();
31764
3.15k
31765
3.15k
  SDValue Amt = N->getOperand(1);
31766
3.15k
  SDLoc DL(N);
31767
3.15k
  if (auto *AmtBV = dyn_cast<BuildVectorSDNode>(Amt))
31768
1.92k
    
    if (auto *AmtSplat = AmtBV->getConstantSplatNode()) {
31769
1.45k
      const APInt &ShiftAmt = AmtSplat->getAPIntValue();
31770
1.45k
      unsigned MaxAmount =
31771
1.45k
        VT.getSimpleVT().getScalarSizeInBits();
31772
1.45k
31773
1.45k
      // SSE2/AVX2 logical shifts always return a vector of 0s
31774
1.45k
      // if the shift amount is bigger than or equal to
31775
1.45k
      // the element size. The constant shift amount will be
31776
1.45k
      // encoded as a 8-bit immediate.
31777
1.45k
      if (ShiftAmt.trunc(8).uge(MaxAmount))
31778
0
        return getZeroVector(VT.getSimpleVT(), Subtarget, DAG, DL);
31779
3.15k
    }
31780
3.15k
31781
3.15k
  return SDValue();
31782
3.15k
}
31783
31784
static SDValue combineShift(SDNode* N, SelectionDAG &DAG,
31785
                            TargetLowering::DAGCombinerInfo &DCI,
31786
59.2k
                            const X86Subtarget &Subtarget) {
31787
59.2k
  if (N->getOpcode() == ISD::SHL)
31788
29.9k
    
    if (SDValue V = combineShiftLeft(N, DAG))
31789
29
      return V;
31790
59.1k
31791
59.1k
  
  if (N->getOpcode() == ISD::SRA)
31792
4.70k
    
    if (SDValue V = combineShiftRightArithmetic(N, DAG))
31793
62
      return V;
31794
59.1k
31795
59.1k
  
  if (N->getOpcode() == ISD::SRL)
31796
24.5k
    
    if (SDValue V = combineShiftRightLogical(N, DAG))
31797
455
      return V;
31798
58.6k
31799
58.6k
  // Try to fold this logical shift into a zero vector.
31800
58.6k
  
  if (N->getOpcode() != ISD::SRA)
31801
54.0k
    
    if (SDValue V = performShiftToAllZeros(N, DAG, Subtarget))
31802
0
      return V;
31803
58.6k
31804
58.6k
  return SDValue();
31805
58.6k
}
31806
31807
static SDValue combineVectorShiftImm(SDNode *N, SelectionDAG &DAG,
31808
                                     TargetLowering::DAGCombinerInfo &DCI,
31809
20.0k
                                     const X86Subtarget &Subtarget) {
31810
20.0k
  unsigned Opcode = N->getOpcode();
31811
20.0k
  assert((X86ISD::VSHLI == Opcode || X86ISD::VSRAI == Opcode ||
31812
20.0k
          X86ISD::VSRLI == Opcode) &&
31813
20.0k
         "Unexpected shift opcode");
31814
12.7k
  bool LogicalShift = X86ISD::VSHLI == Opcode || X86ISD::VSRLI == Opcode;
31815
20.0k
  EVT VT = N->getValueType(0);
31816
20.0k
  SDValue N0 = N->getOperand(0);
31817
20.0k
  SDValue N1 = N->getOperand(1);
31818
20.0k
  unsigned NumBitsPerElt = VT.getScalarSizeInBits();
31819
20.0k
  assert(VT == N0.getValueType() && (NumBitsPerElt % 8) == 0 &&
31820
20.0k
         "Unexpected value type");
31821
20.0k
31822
20.0k
  // Out of range logical bit shifts are guaranteed to be zero.
31823
20.0k
  // Out of range arithmetic bit shifts splat the sign bit.
31824
20.0k
  APInt ShiftVal = cast<ConstantSDNode>(N1)->getAPIntValue();
31825
20.0k
  if (ShiftVal.zextOrTrunc(8).uge(NumBitsPerElt)) {
31826
0
    if (LogicalShift)
31827
0
      return getZeroVector(VT.getSimpleVT(), Subtarget, DAG, SDLoc(N));
31828
0
    else
31829
0
      ShiftVal = NumBitsPerElt - 1;
31830
0
  }
31831
20.0k
31832
20.0k
  // Shift N0 by zero -> N0.
31833
20.0k
  
  if (!ShiftVal)
31834
0
    return N0;
31835
20.0k
31836
20.0k
  // Shift zero -> zero.
31837
20.0k
  
  if (ISD::isBuildVectorAllZeros(N0.getNode()))
31838
260
    return getZeroVector(VT.getSimpleVT(), Subtarget, DAG, SDLoc(N));
31839
19.7k
31840
19.7k
  // fold (VSRLI (VSRAI X, Y), 31) -> (VSRLI X, 31).
31841
19.7k
  // This VSRLI only looks at the sign bit, which is unmodified by VSRAI.
31842
19.7k
  // TODO - support other sra opcodes as needed.
31843
19.7k
  
  if (Opcode == X86ISD::VSRLI && (ShiftVal + 1) == NumBitsPerElt &&
31844
404
      N0.getOpcode() == X86ISD::VSRAI)
31845
0
    return DAG.getNode(X86ISD::VSRLI, SDLoc(N), VT, N0.getOperand(0), N1);
31846
19.7k
31847
19.7k
  // We can decode 'whole byte' logical bit shifts as shuffles.
31848
19.7k
  
  if (LogicalShift && (ShiftVal.getZExtValue() % 8) == 0) {
31849
4.62k
    SDValue Op(N, 0);
31850
4.62k
    if (combineX86ShufflesRecursively({Op}, 0, Op, {0}, {}, /*Depth*/ 1,
31851
4.62k
                                      /*HasVarMask*/ false, DAG, DCI,
31852
4.62k
                                      Subtarget))
31853
127
      return SDValue(); // This routine will use CombineTo to replace N.
31854
19.6k
  }
31855
19.6k
31856
19.6k
  // Constant Folding.
31857
19.6k
  APInt UndefElts;
31858
19.6k
  SmallVector<APInt, 32> EltBits;
31859
19.6k
  if (N->isOnlyUserOf(N0.getNode()) &&
31860
19.6k
      
      getTargetConstantBitsFromNode(N0, NumBitsPerElt, UndefElts, EltBits)) {
31861
75
    assert(EltBits.size() == VT.getVectorNumElements() &&
31862
75
           "Unexpected shift value type");
31863
75
    unsigned ShiftImm = ShiftVal.getZExtValue();
31864
912
    for (APInt &Elt : EltBits) {
31865
912
      if (X86ISD::VSHLI == Opcode)
31866
852
        Elt <<= ShiftImm;
31867
60
      else if (X86ISD::VSRAI == Opcode)
31868
56
        Elt.ashrInPlace(ShiftImm);
31869
60
      else
31870
4
        Elt.lshrInPlace(ShiftImm);
31871
912
    }
31872
75
    return getConstVector(EltBits, UndefElts, VT.getSimpleVT(), DAG, SDLoc(N));
31873
75
  }
31874
19.5k
31875
19.5k
  return SDValue();
31876
19.5k
}
31877
31878
static SDValue combineVectorInsert(SDNode *N, SelectionDAG &DAG,
31879
                                   TargetLowering::DAGCombinerInfo &DCI,
31880
9.75k
                                   const X86Subtarget &Subtarget) {
31881
9.75k
  assert(
31882
9.75k
      ((N->getOpcode() == X86ISD::PINSRB && N->getValueType(0) == MVT::v16i8) ||
31883
9.75k
       (N->getOpcode() == X86ISD::PINSRW &&
31884
9.75k
        N->getValueType(0) == MVT::v8i16)) &&
31885
9.75k
      "Unexpected vector insertion");
31886
9.75k
31887
9.75k
  // Attempt to combine PINSRB/PINSRW patterns to a shuffle.
31888
9.75k
  SDValue Op(N, 0);
31889
9.75k
  combineX86ShufflesRecursively({Op}, 0, Op, {0}, {}, /*Depth*/ 1,
31890
9.75k
                                /*HasVarMask*/ false, DAG, DCI, Subtarget);
31891
9.75k
  return SDValue();
31892
9.75k
}
31893
31894
/// Recognize the distinctive (AND (setcc ...) (setcc ..)) where both setccs
31895
/// reference the same FP CMP, and rewrite for CMPEQSS and friends. Likewise for
31896
/// OR -> CMPNEQSS.
31897
static SDValue combineCompareEqual(SDNode *N, SelectionDAG &DAG,
31898
                                   TargetLowering::DAGCombinerInfo &DCI,
31899
60.4k
                                   const X86Subtarget &Subtarget) {
31900
60.4k
  unsigned opcode;
31901
60.4k
31902
60.4k
  // SSE1 supports CMP{eq|ne}SS, and SSE2 added CMP{eq|ne}SD, but
31903
60.4k
  // we're requiring SSE2 for both.
31904
60.4k
  if (Subtarget.hasSSE2() && isAndOrOfSetCCs(SDValue(N, 0U), opcode)) {
31905
353
    SDValue N0 = N->getOperand(0);
31906
353
    SDValue N1 = N->getOperand(1);
31907
353
    SDValue CMP0 = N0->getOperand(1);
31908
353
    SDValue CMP1 = N1->getOperand(1);
31909
353
    SDLoc DL(N);
31910
353
31911
353
    // The SETCCs should both refer to the same CMP.
31912
353
    if (CMP0.getOpcode() != X86ISD::CMP || CMP0 != CMP1)
31913
314
      return SDValue();
31914
39
31915
39
    SDValue CMP00 = CMP0->getOperand(0);
31916
39
    SDValue CMP01 = CMP0->getOperand(1);
31917
39
    EVT     VT    = CMP00.getValueType();
31918
39
31919
39
    if (VT == MVT::f32 || VT == MVT::f64) {
31920
39
      bool ExpectingFlags = false;
31921
39
      // Check for any users that want flags:
31922
39
      for (SDNode::use_iterator UI = N->use_begin(), UE = N->use_end();
31923
78
           
           !ExpectingFlags && UI != UE; ++UI)
31924
39
        switch (UI->getOpcode()) {
31925
9
        default:
31926
9
        case ISD::BR_CC:
31927
9
        case ISD::BRCOND:
31928
9
        case ISD::SELECT:
31929
9
          ExpectingFlags = true;
31930
9
          break;
31931
30
        case ISD::CopyToReg:
31932
30
        case ISD::SIGN_EXTEND:
31933
30
        case ISD::ZERO_EXTEND:
31934
30
        case ISD::ANY_EXTEND:
31935
30
          break;
31936
39
        }
31937
39
31938
39
      
      if (!ExpectingFlags) {
31939
30
        enum X86::CondCode cc0 = (enum X86::CondCode)N0.getConstantOperandVal(0);
31940
30
        enum X86::CondCode cc1 = (enum X86::CondCode)N1.getConstantOperandVal(0);
31941
30
31942
30
        if (cc1 == X86::COND_E || cc1 == X86::COND_NE) {
31943
0
          X86::CondCode tmp = cc0;
31944
0
          cc0 = cc1;
31945
0
          cc1 = tmp;
31946
0
        }
31947
30
31948
30
        if ((cc0 == X86::COND_E && cc1 == X86::COND_NP) ||
31949
30
            (cc0 == X86::COND_NE && cc1 == X86::COND_P)) {
31950
30
          // FIXME: need symbolic constants for these magic numbers.
31951
30
          // See X86ATTInstPrinter.cpp:printSSECC().
31952
30
          unsigned x86cc = (cc0 == X86::COND_E) ? 0 : 4;
31953
30
          if (Subtarget.hasAVX512()) {
31954
7
            SDValue FSetCC =
31955
7
                DAG.getNode(X86ISD::FSETCCM, DL, MVT::v1i1, CMP00, CMP01,
31956
7
                            DAG.getConstant(x86cc, DL, MVT::i8));
31957
7
            return DAG.getNode(X86ISD::VEXTRACT, DL, N->getSimpleValueType(0),
31958
7
                               FSetCC, DAG.getIntPtrConstant(0, DL));
31959
7
          }
31960
23
          SDValue OnesOrZeroesF = DAG.getNode(X86ISD::FSETCC, DL,
31961
23
                                              CMP00.getValueType(), CMP00, CMP01,
31962
23
                                              DAG.getConstant(x86cc, DL,
31963
23
                                                              MVT::i8));
31964
23
31965
23
          bool is64BitFP = (CMP00.getValueType() == MVT::f64);
31966
23
          MVT IntVT = is64BitFP ? MVT::i64 : MVT::i32;
31967
23
31968
23
          if (is64BitFP && !Subtarget.is64Bit()) {
31969
3
            // On a 32-bit target, we cannot bitcast the 64-bit float to a
31970
3
            // 64-bit integer, since that's not a legal type. Since
31971
3
            // OnesOrZeroesF is all ones or all zeroes, we don't need all the
31972
3
            // bits, but can do this little dance to extract the lowest 32 bits
31973
3
            // and work with those going forward.
31974
3
            SDValue Vector64 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64,
31975
3
                                           OnesOrZeroesF);
31976
3
            SDValue Vector32 = DAG.getBitcast(MVT::v4f32, Vector64);
31977
3
            OnesOrZeroesF = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32,
31978
3
                                        Vector32, DAG.getIntPtrConstant(0, DL));
31979
3
            IntVT = MVT::i32;
31980
3
          }
31981
30
31982
30
          SDValue OnesOrZeroesI = DAG.getBitcast(IntVT, OnesOrZeroesF);
31983
30
          SDValue ANDed = DAG.getNode(ISD::AND, DL, IntVT, OnesOrZeroesI,
31984
30
                                      DAG.getConstant(1, DL, IntVT));
31985
30
          SDValue OneBitOfTruth = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8,
31986
30
                                              ANDed);
31987
30
          return OneBitOfTruth;
31988
30
        }
31989
30
      }
31990
39
    }
31991
353
  }
31992
60.1k
  return SDValue();
31993
60.1k
}
31994
31995
/// Try to fold: (and (xor X, -1), Y) -> (andnp X, Y).
31996
38.9k
static SDValue combineANDXORWithAllOnesIntoANDNP(SDNode *N, SelectionDAG &DAG) {
31997
38.9k
  assert(N->getOpcode() == ISD::AND);
31998
38.9k
31999
38.9k
  EVT VT = N->getValueType(0);
32000
38.9k
  SDValue N0 = N->getOperand(0);
32001
38.9k
  SDValue N1 = N->getOperand(1);
32002
38.9k
  SDLoc DL(N);
32003
38.9k
32004
38.9k
  if (VT != MVT::v2i64 && VT != MVT::v4i64 && VT != MVT::v8i64)
32005
18.4k
    return SDValue();
32006
20.4k
32007
20.4k
  
  if (N0.getOpcode() == ISD::XOR &&
32008
291
      ISD::isBuildVectorAllOnes(N0.getOperand(1).getNode()))
32009
204
    return DAG.getNode(X86ISD::ANDNP, DL, VT, N0.getOperand(0), N1);
32010
20.2k
32011
20.2k
  
  if (N1.getOpcode() == ISD::XOR &&
32012
1.16k
      ISD::isBuildVectorAllOnes(N1.getOperand(1).getNode()))
32013
1.16k
    return DAG.getNode(X86ISD::ANDNP, DL, VT, N1.getOperand(0), N0);
32014
19.0k
32015
19.0k
  return SDValue();
32016
19.0k
}
32017
32018
// On AVX/AVX2 the type v8i1 is legalized to v8i16, which is an XMM sized
32019
// register. In most cases we actually compare or select YMM-sized registers
32020
// and mixing the two types creates horrible code. This method optimizes
32021
// some of the transition sequences.
32022
static SDValue WidenMaskArithmetic(SDNode *N, SelectionDAG &DAG,
32023
                                 TargetLowering::DAGCombinerInfo &DCI,
32024
506
                                 const X86Subtarget &Subtarget) {
32025
506
  EVT VT = N->getValueType(0);
32026
506
  if (!VT.is256BitVector())
32027
0
    return SDValue();
32028
506
32029
506
  assert((N->getOpcode() == ISD::ANY_EXTEND ||
32030
506
          N->getOpcode() == ISD::ZERO_EXTEND ||
32031
506
          N->getOpcode() == ISD::SIGN_EXTEND) && "Invalid Node");
32032
506
32033
506
  SDValue Narrow = N->getOperand(0);
32034
506
  EVT NarrowVT = Narrow->getValueType(0);
32035
506
  if (!NarrowVT.is128BitVector())
32036
102
    return SDValue();
32037
404
32038
404
  
  if (Narrow->getOpcode() != ISD::XOR &&
32039
404
      Narrow->getOpcode() != ISD::AND &&
32040
381
      Narrow->getOpcode() != ISD::OR)
32041
381
    return SDValue();
32042
23
32043
23
  SDValue N0  = Narrow->getOperand(0);
32044
23
  SDValue N1  = Narrow->getOperand(1);
32045
23
  SDLoc DL(Narrow);
32046
23
32047
23
  // The Left side has to be a trunc.
32048
23
  if (N0.getOpcode() != ISD::TRUNCATE)
32049
13
    return SDValue();
32050
10
32051
10
  // The type of the truncated inputs.
32052
10
  EVT WideVT = N0->getOperand(0)->getValueType(0);
32053
10
  if (WideVT != VT)
32054
0
    return SDValue();
32055
10
32056
10
  // The right side has to be a 'trunc' or a constant vector.
32057
10
  bool RHSTrunc = N1.getOpcode() == ISD::TRUNCATE;
32058
10
  ConstantSDNode *RHSConstSplat = nullptr;
32059
10
  if (auto *RHSBV = dyn_cast<BuildVectorSDNode>(N1))
32060
0
    RHSConstSplat = RHSBV->getConstantSplatNode();
32061
10
  if (!RHSTrunc && !RHSConstSplat)
32062
0
    return SDValue();
32063
10
32064
10
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
32065
10
32066
10
  if (!TLI.isOperationLegalOrPromote(Narrow->getOpcode(), WideVT))
32067
0
    return SDValue();
32068
10
32069
10
  // Set N0 and N1 to hold the inputs to the new wide operation.
32070
10
  N0 = N0->getOperand(0);
32071
10
  if (RHSConstSplat) {
32072
0
    N1 = DAG.getNode(ISD::ZERO_EXTEND, DL, WideVT.getVectorElementType(),
32073
0
                     SDValue(RHSConstSplat, 0));
32074
0
    N1 = DAG.getSplatBuildVector(WideVT, DL, N1);
32075
10
  } else if (RHSTrunc) {
32076
10
    N1 = N1->getOperand(0);
32077
10
  }
32078
10
32079
10
  // Generate the wide operation.
32080
10
  SDValue Op = DAG.getNode(Narrow->getOpcode(), DL, WideVT, N0, N1);
32081
10
  unsigned Opcode = N->getOpcode();
32082
10
  switch (Opcode) {
32083
7
  case ISD::ANY_EXTEND:
32084
7
    return Op;
32085
0
  case ISD::ZERO_EXTEND: {
32086
0
    unsigned InBits = NarrowVT.getScalarSizeInBits();
32087
0
    APInt Mask = APInt::getAllOnesValue(InBits);
32088
0
    Mask = Mask.zext(VT.getScalarSizeInBits());
32089
0
    return DAG.getNode(ISD::AND, DL, VT,
32090
0
                       Op, DAG.getConstant(Mask, DL, VT));
32091
10
  }
32092
3
  case ISD::SIGN_EXTEND:
32093
3
    return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT,
32094
3
                       Op, DAG.getValueType(NarrowVT));
32095
0
  default:
32096
0
    llvm_unreachable("Unexpected opcode");
32097
0
  }
32098
0
}
32099
32100
/// If both input operands of a logic op are being cast from floating point
32101
/// types, try to convert this into a floating point logic node to avoid
32102
/// unnecessary moves from SSE to integer registers.
32103
static SDValue convertIntLogicToFPLogic(SDNode *N, SelectionDAG &DAG,
32104
73.3k
                                        const X86Subtarget &Subtarget) {
32105
73.3k
  unsigned FPOpcode = ISD::DELETED_NODE;
32106
73.3k
  if (N->getOpcode() == ISD::AND)
32107
38.9k
    FPOpcode = X86ISD::FAND;
32108
34.3k
  else if (N->getOpcode() == ISD::OR)
32109
21.4k
    FPOpcode = X86ISD::FOR;
32110
12.8k
  else if (N->getOpcode() == ISD::XOR)
32111
12.8k
    FPOpcode = X86ISD::FXOR;
32112
73.3k
32113
73.3k
  assert(FPOpcode != ISD::DELETED_NODE &&
32114
73.3k
         "Unexpected input node for FP logic conversion");
32115
73.3k
32116
73.3k
  EVT VT = N->getValueType(0);
32117
73.3k
  SDValue N0 = N->getOperand(0);
32118
73.3k
  SDValue N1 = N->getOperand(1);
32119
73.3k
  SDLoc DL(N);
32120
73.3k
  if (N0.getOpcode() == ISD::BITCAST && N1.getOpcode() == ISD::BITCAST &&
32121
16.8k
      ((Subtarget.hasSSE1() && VT == MVT::i32) ||
32122
73.3k
       (Subtarget.hasSSE2() && VT == MVT::i64))) {
32123
29
    SDValue N00 = N0.getOperand(0);
32124
29
    SDValue N10 = N1.getOperand(0);
32125
29
    EVT N00Type = N00.getValueType();
32126
29
    EVT N10Type = N10.getValueType();
32127
29
    if (N00Type.isFloatingPoint() && N10Type.isFloatingPoint()) {
32128
29
      SDValue FPLogic = DAG.getNode(FPOpcode, DL, N00Type, N00, N10);
32129
29
      return DAG.getBitcast(VT, FPLogic);
32130
29
    }
32131
73.3k
  }
32132
73.3k
  return SDValue();
32133
73.3k
}
32134
32135
/// If this is a zero/all-bits result that is bitwise-anded with a low bits
32136
/// mask. (Mask == 1 for the x86 lowering of a SETCC + ZEXT), replace the 'and'
32137
/// with a shift-right to eliminate loading the vector constant mask value.
32138
static SDValue combineAndMaskToShift(SDNode *N, SelectionDAG &DAG,
32139
37.5k
                                     const X86Subtarget &Subtarget) {
32140
37.5k
  SDValue Op0 = peekThroughBitcasts(N->getOperand(0));
32141
37.5k
  SDValue Op1 = peekThroughBitcasts(N->getOperand(1));
32142
37.5k
  EVT VT0 = Op0.getValueType();
32143
37.5k
  EVT VT1 = Op1.getValueType();
32144
37.5k
32145
37.5k
  if (VT0 != VT1 || !VT0.isSimple() || !VT0.isInteger())
32146
13.6k
    return SDValue();
32147
23.9k
32148
23.9k
  APInt SplatVal;
32149
23.9k
  if (!ISD::isConstantSplatVector(Op1.getNode(), SplatVal) ||
32150
655
      !SplatVal.isMask())
32151
23.4k
    return SDValue();
32152
430
32153
430
  
if (430
!SupportedVectorShiftWithImm(VT0.getSimpleVT(), Subtarget, ISD::SRL)430
)
32154
231
    return SDValue();
32155
199
32156
199
  unsigned EltBitWidth = VT0.getScalarSizeInBits();
32157
199
  if (EltBitWidth != DAG.ComputeNumSignBits(Op0))
32158
162
    return SDValue();
32159
37
32160
37
  SDLoc DL(N);
32161
37
  unsigned ShiftVal = SplatVal.countTrailingOnes();
32162
37
  SDValue ShAmt = DAG.getConstant(EltBitWidth - ShiftVal, DL, MVT::i8);
32163
37
  SDValue Shift = DAG.getNode(X86ISD::VSRLI, DL, VT0, Op0, ShAmt);
32164
37
  return DAG.getBitcast(N->getValueType(0), Shift);
32165
37
}
32166
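The combine above rests on a simple identity: an element that is either all zeros or all ones, ANDed with a low-bits mask, equals that element logically shifted right by (EltBits - mask width). Illustrative sketch only, not part of the file.

#include <cassert>
#include <cstdint>

int main() {
  const unsigned EltBits = 32;
  for (uint32_t x : {0x00000000u, 0xFFFFFFFFu}) {    // zero/all-bits "setcc" element
    for (unsigned k = 1; k < EltBits; ++k) {
      uint32_t mask = (1u << k) - 1;                 // low-bits mask (k == 1 for SETCC + ZEXT)
      assert((x & mask) == (x >> (EltBits - k)));
    }
  }
  return 0;
}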
32167
static SDValue combineAnd(SDNode *N, SelectionDAG &DAG,
32168
                          TargetLowering::DAGCombinerInfo &DCI,
32169
64.8k
                          const X86Subtarget &Subtarget) {
32170
64.8k
  if (DCI.isBeforeLegalizeOps())
32171
25.8k
    return SDValue();
32172
38.9k
32173
38.9k
  
  if (SDValue R = combineCompareEqual(N, DAG, DCI, Subtarget))
32174
23
    return R;
32175
38.9k
32176
38.9k
  
  if (SDValue FPLogic = convertIntLogicToFPLogic(N, DAG, Subtarget))
32177
1
    return FPLogic;
32178
38.9k
32179
38.9k
  
  if (SDValue R = combineANDXORWithAllOnesIntoANDNP(N, DAG))
32180
1.36k
    return R;
32181
37.5k
32182
37.5k
  
  if (SDValue ShiftRight = combineAndMaskToShift(N, DAG, Subtarget))
32183
37
    return ShiftRight;
32184
37.5k
32185
37.5k
  EVT VT = N->getValueType(0);
32186
37.5k
32187
37.5k
  // Attempt to recursively combine a bitmask AND with shuffles.
32188
37.5k
  if (VT.isVector() && (VT.getScalarSizeInBits() % 8) == 0) {
32189
19.0k
    SDValue Op(N, 0);
32190
19.0k
    if (combineX86ShufflesRecursively({Op}, 0, Op, {0}, {}, /*Depth*/ 1,
32191
19.0k
                                      /*HasVarMask*/ false, DAG, DCI,
32192
19.0k
                                      Subtarget))
32193
83
      return SDValue(); // This routine will use CombineTo to replace N.
32194
37.4k
  }
32195
37.4k
32196
37.4k
  return SDValue();
32197
37.4k
}
32198
32199
// Try to fold:
32200
//   (or (and (m, y), (pandn m, x)))
32201
// into:
32202
//   (vselect m, x, y)
32203
// As a special case, try to fold:
32204
//   (or (and (m, (sub 0, x)), (pandn m, x)))
32205
// into:
32206
//   (sub (xor X, M), M)
32207
static SDValue combineLogicBlendIntoPBLENDV(SDNode *N, SelectionDAG &DAG,
32208
21.4k
                                            const X86Subtarget &Subtarget) {
32209
21.4k
  assert(N->getOpcode() == ISD::OR && "Unexpected Opcode");
32210
21.4k
32211
21.4k
  SDValue N0 = N->getOperand(0);
32212
21.4k
  SDValue N1 = N->getOperand(1);
32213
21.4k
  EVT VT = N->getValueType(0);
32214
21.4k
32215
21.4k
  if (!((VT.is128BitVector() && Subtarget.hasSSE2()) ||
32216
14.1k
        (VT.is256BitVector() && Subtarget.hasInt256())))
32217
13.5k
    return SDValue();
32218
7.98k
32219
7.98k
  // Canonicalize AND to LHS.
32220
7.98k
  
  if (N1.getOpcode() == ISD::AND)
32221
1.48k
    std::swap(N0, N1);
32222
7.98k
32223
7.98k
  // TODO: Attempt to match against AND(XOR(-1,X),Y) as well, waiting for
32224
7.98k
  // ANDNP combine allows other combines to happen that prevent matching.
32225
7.98k
  if (N0.getOpcode() != ISD::AND || N1.getOpcode() != X86ISD::ANDNP)
32226
6.31k
    return SDValue();
32227
1.67k
32228
1.67k
  SDValue Mask = N1.getOperand(0);
32229
1.67k
  SDValue X = N1.getOperand(1);
32230
1.67k
  SDValue Y;
32231
1.67k
  if (N0.getOperand(0) == Mask)
32232
71
    Y = N0.getOperand(1);
32233
1.67k
  if (N0.getOperand(1) == Mask)
32234
1.56k
    Y = N0.getOperand(0);
32235
1.67k
32236
1.67k
  // Check to see if the mask appeared in both the AND and ANDNP.
32237
1.67k
  if (!Y.getNode())
32238
34
    return SDValue();
32239
1.63k
32240
1.63k
  // Validate that X, Y, and Mask are bitcasts, and see through them.
32241
1.63k
  Mask = peekThroughBitcasts(Mask);
32242
1.63k
  X = peekThroughBitcasts(X);
32243
1.63k
  Y = peekThroughBitcasts(Y);
32244
1.63k
32245
1.63k
  EVT MaskVT = Mask.getValueType();
32246
1.63k
  unsigned EltBits = MaskVT.getScalarSizeInBits();
32247
1.63k
32248
1.63k
  // TODO: Attempt to handle floating point cases as well?
32249
1.63k
  if (!MaskVT.isInteger() || DAG.ComputeNumSignBits(Mask) != EltBits)
32250
610
    return SDValue();
32251
1.02k
32252
1.02k
  SDLoc DL(N);
32253
1.02k
32254
1.02k
  // Try to match:
32255
1.02k
  //   (or (and (M, (sub 0, X)), (pandn M, X)))
32256
1.02k
  // which is a special case of vselect:
32257
1.02k
  //   (vselect M, (sub 0, X), X)
32258
1.02k
  // Per:
32259
1.02k
  // http://graphics.stanford.edu/~seander/bithacks.html#ConditionalNegate
32260
1.02k
  // We know that, if fNegate is 0 or 1:
32261
1.02k
  //   (fNegate ? -v : v) == ((v ^ -fNegate) + fNegate)
32262
1.02k
  //
32263
1.02k
  // Here, we have a mask, M (all 1s or 0), and, similarly, we know that:
32264
1.02k
  //   ((M & 1) ? -X : X) == ((X ^ -(M & 1)) + (M & 1))
32265
1.02k
  //   ( M      ? -X : X) == ((X ^   M     ) + (M & 1))
32266
1.02k
  // This lets us transform our vselect to:
32267
1.02k
  //   (add (xor X, M), (and M, 1))
32268
1.02k
  // And further to:
32269
1.02k
  //   (sub (xor X, M), M)
32270
1.02k
  if (X.getValueType() == MaskVT && Y.getValueType() == MaskVT &&
32271
1.02k
      DAG.getTargetLoweringInfo().isOperationLegal(ISD::SUB, MaskVT)) {
32272
981
    auto IsNegV = [](SDNode *N, SDValue V) {
32273
17
      return N->getOpcode() == ISD::SUB && N->getOperand(1) == V &&
32274
17
        ISD::isBuildVectorAllZeros(N->getOperand(0).getNode());
32275
981
    };
32276
498
    SDValue V;
32277
498
    if (IsNegV(Y.getNode(), X))
32278
15
      V = X;
32279
483
    else if (IsNegV(X.getNode(), Y))
32280
2
      V = Y;
32281
498
32282
498
    if (V) {
32283
17
      SDValue SubOp1 = DAG.getNode(ISD::XOR, DL, MaskVT, V, Mask);
32284
17
      SDValue SubOp2 = Mask;
32285
17
32286
17
      // If the negate was on the false side of the select, then
32287
17
      // the operands of the SUB need to be swapped. PR 27251.
32288
17
      // This is because the pattern being matched above is
32289
17
      // (vselect M, (sub (0, X), X)  -> (sub (xor X, M), M)
32290
17
      // but if the pattern matched was
32291
17
      // (vselect M, X, (sub (0, X))), that is really negation of the pattern
32292
17
      // above, -(vselect M, (sub 0, X), X), and therefore the replacement
32293
17
      // pattern also needs to be a negation of the replacement pattern above.
32294
17
      // And -(sub X, Y) is just sub (Y, X), so swapping the operands of the
32295
17
      // sub accomplishes the negation of the replacement pattern.
32296
17
      if (V == Y)
32297
2
         std::swap(SubOp1, SubOp2);
32298
17
32299
17
      SDValue Res = DAG.getNode(ISD::SUB, DL, MaskVT, SubOp1, SubOp2);
32300
17
      return DAG.getBitcast(VT, Res);
32301
17
    }
32302
1.01k
  }
32303
1.01k
32304
1.01k
  // PBLENDVB is only available on SSE 4.1.
32305
1.01k
  
  if (!Subtarget.hasSSE41())
32306
981
    return SDValue();
32307
29
32308
29
  
  MVT BlendVT = (VT == MVT::v4i64) ? MVT::v32i8 : MVT::v16i8;
32309
21.4k
32310
21.4k
  X = DAG.getBitcast(BlendVT, X);
32311
21.4k
  Y = DAG.getBitcast(BlendVT, Y);
32312
21.4k
  Mask = DAG.getBitcast(BlendVT, Mask);
32313
21.4k
  Mask = DAG.getSelect(DL, BlendVT, Mask, Y, X);
32314
21.4k
  return DAG.getBitcast(VT, Mask);
32315
21.4k
}
32316
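The conditional-negate identity cited above ((X ^ M) - M == (M ? -X : X) for an all-zeros/all-ones M) can be checked on scalars. Illustrative sketch only, not part of the file.

#include <cassert>
#include <cstdint>

int main() {
  for (int32_t x = -100; x <= 100; ++x) {
    for (int32_t m : {0, -1}) {                // M is all zeros or all ones
      assert(((x ^ m) - m) == (m ? -x : x));   // (sub (xor X, M), M)
    }
  }
  return 0;
}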
32317
// Helper function for combineOrCmpEqZeroToCtlzSrl
32318
// Transforms:
32319
//   seteq(cmp x, 0)
32320
//   into:
32321
//   srl(ctlz x), log2(bitsize(x))
32322
// Input pattern is checked by caller.
32323
static SDValue lowerX86CmpEqZeroToCtlzSrl(SDValue Op, EVT ExtTy,
32324
40
                                          SelectionDAG &DAG) {
32325
40
  SDValue Cmp = Op.getOperand(1);
32326
40
  EVT VT = Cmp.getOperand(0).getValueType();
32327
40
  unsigned Log2b = Log2_32(VT.getSizeInBits());
32328
40
  SDLoc dl(Op);
32329
40
  SDValue Clz = DAG.getNode(ISD::CTLZ, dl, VT, Cmp->getOperand(0));
32330
40
  // The result of the shift is true or false, and on X86, the 32-bit
32331
40
  // encoding of shr and lzcnt is more desirable.
32332
40
  SDValue Trunc = DAG.getZExtOrTrunc(Clz, dl, MVT::i32);
32333
40
  SDValue Scc = DAG.getNode(ISD::SRL, dl, MVT::i32, Trunc,
32334
40
                            DAG.getConstant(Log2b, dl, VT));
32335
40
  return DAG.getZExtOrTrunc(Scc, dl, ExtTy);
32336
40
}
32337
32338
// Try to transform:
32339
//   zext(or(setcc(eq, (cmp x, 0)), setcc(eq, (cmp y, 0))))
32340
//   into:
32341
//   srl(or(ctlz(x), ctlz(y)), log2(bitsize(x))
32342
// Will also attempt to match more generic cases, eg:
32343
//   zext(or(or(setcc(eq, cmp 0), setcc(eq, cmp 0)), setcc(eq, cmp 0)))
32344
// Only applies if the target supports the FastLZCNT feature.
32345
static SDValue combineOrCmpEqZeroToCtlzSrl(SDNode *N, SelectionDAG &DAG,
32346
                                           TargetLowering::DAGCombinerInfo &DCI,
32347
28.8k
                                           const X86Subtarget &Subtarget) {
32348
28.8k
  if (DCI.isBeforeLegalize() || !Subtarget.getTargetLowering()->isCtlzFast())
32349
28.7k
    return SDValue();
32350
154
32351
154
  
  auto isORCandidate = [](SDValue N) {
32352
104
    return (N->getOpcode() == ISD::OR && N->hasOneUse());
32353
312
  };
32354
154
32355
154
  // Check the zero extend is extending to 32-bit or more. The code generated by
32356
154
  // srl(ctlz) for 16-bit or less variants of the pattern would require extra
32357
154
  // instructions to clear the upper bits.
32358
154
  if (!N->hasOneUse() || !N->getSimpleValueType(0).bitsGE(MVT::i32) ||
32359
150
      !isORCandidate(N->getOperand(0)))
32360
86
    return SDValue();
32361
68
32362
68
  // Check the node matches: setcc(eq, cmp 0)
32363
68
  
  auto isSetCCCandidate = [](SDValue N) {
32364
44
    return N->getOpcode() == X86ISD::SETCC && N->hasOneUse() &&
32365
44
           X86::CondCode(N->getConstantOperandVal(0)) == X86::COND_E &&
32366
44
           N->getOperand(1).getOpcode() == X86ISD::CMP &&
32367
42
           isNullConstant(N->getOperand(1).getOperand(1)) &&
32368
40
           N->getOperand(1).getValueType().bitsGE(MVT::i32);
32369
104
  };
32370
68
32371
68
  SDNode *OR = N->getOperand(0).getNode();
32372
68
  SDValue LHS = OR->getOperand(0);
32373
68
  SDValue RHS = OR->getOperand(1);
32374
68
32375
68
  // Save nodes matching or(or, setcc(eq, cmp 0)).
32376
68
  SmallVector<SDNode *, 2> ORNodes;
32377
76
  while (((isORCandidate(LHS) && isSetCCCandidate(RHS)) ||
32378
76
          (isORCandidate(RHS) && isSetCCCandidate(LHS)))) {
32379
8
    ORNodes.push_back(OR);
32380
8
    OR = (LHS->getOpcode() == ISD::OR) ? LHS.getNode() : RHS.getNode();
32381
8
    LHS = OR->getOperand(0);
32382
8
    RHS = OR->getOperand(1);
32383
8
  }
32384
68
32385
68
  // The last OR node should match or(setcc(eq, cmp 0), setcc(eq, cmp 0)).
32386
68
  if (!(isSetCCCandidate(LHS) && isSetCCCandidate(RHS)) ||
32387
16
      !isORCandidate(SDValue(OR, 0)))
32388
52
    return SDValue();
32389
16
32390
16
  // We have a or(setcc(eq, cmp 0), setcc(eq, cmp 0)) pattern, try to lower it
32391
16
  // to
32392
16
  // or(srl(ctlz),srl(ctlz)).
32393
16
  // The dag combiner can then fold it into:
32394
16
  // srl(or(ctlz, ctlz)).
32395
16
  EVT VT = OR->getValueType(0);
32396
16
  SDValue NewLHS = lowerX86CmpEqZeroToCtlzSrl(LHS, VT, DAG);
32397
16
  SDValue Ret, NewRHS;
32398
16
  if (NewLHS && (NewRHS = lowerX86CmpEqZeroToCtlzSrl(RHS, VT, DAG)))
32399
16
    Ret = DAG.getNode(ISD::OR, SDLoc(OR), VT, NewLHS, NewRHS);
32400
16
32401
16
  if (!Ret)
32402
0
    return SDValue();
32403
16
32404
16
  // Try to lower nodes matching the or(or, setcc(eq, cmp 0)) pattern.
32405
24
  while (ORNodes.size() > 0) {
32406
8
    OR = ORNodes.pop_back_val();
32407
8
    LHS = OR->getOperand(0);
32408
8
    RHS = OR->getOperand(1);
32409
8
    // Swap rhs with lhs to match or(setcc(eq, cmp, 0), or).
32410
8
    if (RHS->getOpcode() == ISD::OR)
32411
2
      std::swap(LHS, RHS);
32412
8
    EVT VT = OR->getValueType(0);
32413
8
    SDValue NewRHS = lowerX86CmpEqZeroToCtlzSrl(RHS, VT, DAG);
32414
8
    if (!NewRHS)
32415
0
      return SDValue();
32416
8
    Ret = DAG.getNode(ISD::OR, SDLoc(OR), VT, Ret, NewRHS);
32417
8
  }
32418
16
32419
16
  if (Ret)
32420
16
    Ret = DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), N->getValueType(0), Ret);
32421
16
32422
16
  return Ret;
32423
28.8k
}
32424
32425
static SDValue combineOr(SDNode *N, SelectionDAG &DAG,
32426
                         TargetLowering::DAGCombinerInfo &DCI,
32427
37.5k
                         const X86Subtarget &Subtarget) {
32428
37.5k
  if (DCI.isBeforeLegalizeOps())
32429
16.0k
    return SDValue();
32430
21.5k
32431
21.5k
  if (SDValue R = combineCompareEqual(N, DAG, DCI, Subtarget))
32432
7
    return R;
32433
21.4k
32434
21.4k
  if (SDValue FPLogic = convertIntLogicToFPLogic(N, DAG, Subtarget))
32435
0
    return FPLogic;
32436
21.4k
32437
21.4k
  if (SDValue R = combineLogicBlendIntoPBLENDV(N, DAG, Subtarget))
32438
46
    return R;
32439
21.4k
32440
21.4k
  SDValue N0 = N->getOperand(0);
32441
21.4k
  SDValue N1 = N->getOperand(1);
32442
21.4k
  EVT VT = N->getValueType(0);
32443
21.4k
32444
21.4k
  if (VT != MVT::i16 && VT != MVT::i32 && VT != MVT::i64)
32445
9.18k
    return SDValue();
32446
12.2k
32447
12.2k
  // fold (or (x << c) | (y >> (64 - c))) ==> (shld64 x, y, c)
32448
12.2k
  bool OptForSize = DAG.getMachineFunction().getFunction()->optForSize();
32449
12.2k
32450
12.2k
  // SHLD/SHRD instructions have lower register pressure, but on some
32451
12.2k
  // platforms they have higher latency than the equivalent
32452
12.2k
  // series of shifts/or that would otherwise be generated.
32453
12.2k
  // Don't fold (or (x << c) | (y >> (64 - c))) if SHLD/SHRD instructions
32454
12.2k
  // have higher latencies and we are not optimizing for size.
32455
12.2k
  if (!OptForSize && Subtarget.isSHLDSlow())
32456
106
    return SDValue();
32457
12.1k
32458
12.1k
  if (N0.getOpcode() == ISD::SRL && N1.getOpcode() == ISD::SHL)
32459
391
    std::swap(N0, N1);
32460
12.1k
  if (N0.getOpcode() != ISD::SHL || N1.getOpcode() != ISD::SRL)
32461
11.4k
    return SDValue();
32462
695
  if (!N0.hasOneUse() || !N1.hasOneUse())
32463
14
    return SDValue();
32464
681
32465
681
  SDValue ShAmt0 = N0.getOperand(1);
32466
681
  if (ShAmt0.getValueType() != MVT::i8)
32467
0
    return SDValue();
32468
681
  SDValue ShAmt1 = N1.getOperand(1);
32469
681
  if (ShAmt1.getValueType() != MVT::i8)
32470
0
    return SDValue();
32471
681
  if (ShAmt0.getOpcode() == ISD::TRUNCATE)
32472
173
    ShAmt0 = ShAmt0.getOperand(0);
32473
681
  if (ShAmt1.getOpcode() == ISD::TRUNCATE)
32474
172
    ShAmt1 = ShAmt1.getOperand(0);
32475
681
32476
681
  SDLoc DL(N);
32477
681
  unsigned Opc = X86ISD::SHLD;
32478
681
  SDValue Op0 = N0.getOperand(0);
32479
681
  SDValue Op1 = N1.getOperand(0);
32480
681
  if (ShAmt0.getOpcode() == ISD::SUB ||
32481
681
      ShAmt0.getOpcode() == ISD::XOR) {
32482
151
    Opc = X86ISD::SHRD;
32483
151
    std::swap(Op0, Op1);
32484
151
    std::swap(ShAmt0, ShAmt1);
32485
151
  }
32486
681
32487
681
  // OR( SHL( X, C ), SRL( Y, 32 - C ) ) -> SHLD( X, Y, C )
32488
681
  // OR( SRL( X, C ), SHL( Y, 32 - C ) ) -> SHRD( X, Y, C )
32489
681
  // OR( SHL( X, C ), SRL( SRL( Y, 1 ), XOR( C, 31 ) ) ) -> SHLD( X, Y, C )
32490
681
  // OR( SRL( X, C ), SHL( SHL( Y, 1 ), XOR( C, 31 ) ) ) -> SHRD( X, Y, C )
32491
681
  unsigned Bits = VT.getSizeInBits();
32492
681
  if (ShAmt1.getOpcode() == ISD::SUB) {
32493
171
    SDValue Sum = ShAmt1.getOperand(0);
32494
171
    if (ConstantSDNode *SumC = dyn_cast<ConstantSDNode>(Sum)) {
32495
171
      SDValue ShAmt1Op1 = ShAmt1.getOperand(1);
32496
171
      if (ShAmt1Op1.getOpcode() == ISD::TRUNCATE)
32497
19
        ShAmt1Op1 = ShAmt1Op1.getOperand(0);
32498
171
      if (SumC->getSExtValue() == Bits && ShAmt1Op1 == ShAmt0)
32499
120
        return DAG.getNode(Opc, DL, VT,
32500
120
                           Op0, Op1,
32501
120
                           DAG.getNode(ISD::TRUNCATE, DL,
32502
120
                                       MVT::i8, ShAmt0));
32503
681
    }
32504
510
  } else if (ConstantSDNode *ShAmt1C = dyn_cast<ConstantSDNode>(ShAmt1)) {
32505
497
    ConstantSDNode *ShAmt0C = dyn_cast<ConstantSDNode>(ShAmt0);
32506
497
    if (ShAmt0C && (ShAmt0C->getSExtValue() + ShAmt1C->getSExtValue()) == Bits)
32507
376
      return DAG.getNode(Opc, DL, VT,
32508
376
                         N0.getOperand(0), N1.getOperand(0),
32509
376
                         DAG.getNode(ISD::TRUNCATE, DL,
32510
376
                                       MVT::i8, ShAmt0));
32511
13
  } else if (ShAmt1.getOpcode() == ISD::XOR) {
32512
13
    SDValue Mask = ShAmt1.getOperand(1);
32513
13
    if (ConstantSDNode *MaskC = dyn_cast<ConstantSDNode>(Mask)) {
32514
13
      unsigned InnerShift = (X86ISD::SHLD == Opc ? ISD::SRL : ISD::SHL);
32515
13
      SDValue ShAmt1Op0 = ShAmt1.getOperand(0);
32516
13
      if (ShAmt1Op0.getOpcode() == ISD::TRUNCATE)
32517
4
        ShAmt1Op0 = ShAmt1Op0.getOperand(0);
32518
13
      if (MaskC->getSExtValue() == (Bits - 1) && ShAmt1Op0 == ShAmt0) {
32519
12
        if (Op1.getOpcode() == InnerShift &&
32520
10
            isa<ConstantSDNode>(Op1.getOperand(1)) &&
32521
12
            Op1.getConstantOperandVal(1) == 1) {
32522
10
          return DAG.getNode(Opc, DL, VT, Op0, Op1.getOperand(0),
32523
10
                             DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, ShAmt0));
32524
10
        }
32525
2
        // Test for ADD( Y, Y ) as an equivalent to SHL( Y, 1 ).
32526
2
        if (InnerShift == ISD::SHL && Op1.getOpcode() == ISD::ADD &&
32527
2
            Op1.getOperand(0) == Op1.getOperand(1)) {
32528
2
          return DAG.getNode(Opc, DL, VT, Op0, Op1.getOperand(0),
32529
2
                     DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, ShAmt0));
32530
2
        }
32531
173
      }
32532
13
    }
32533
510
  }
32534
173
32535
173
  return SDValue();
32536
173
}
32537
32538
/// Generate NEG and CMOV for integer abs.
32539
11.8k
static SDValue combineIntegerAbs(SDNode *N, SelectionDAG &DAG) {
32540
11.8k
  EVT VT = N->getValueType(0);
32541
11.8k
32542
11.8k
  // Since X86 does not have CMOV for 8-bit integer, we don't convert
32543
11.8k
  // 8-bit integer abs to NEG and CMOV.
32544
11.8k
  if (VT.isInteger() && VT.getSizeInBits() == 8)
32545
222
    return SDValue();
32546
11.6k
32547
11.6k
  SDValue N0 = N->getOperand(0);
32548
11.6k
  SDValue N1 = N->getOperand(1);
32549
11.6k
  SDLoc DL(N);
32550
11.6k
32551
11.6k
  // Check pattern of XOR(ADD(X,Y), Y) where Y is SRA(X, size(X)-1)
32552
11.6k
  // and change it to SUB and CMOV.
32553
11.6k
  if (VT.isInteger() && N->getOpcode() == ISD::XOR &&
32554
11.6k
      N0.getOpcode() == ISD::ADD && N0.getOperand(1) == N1 &&
32555
11.6k
      N1.getOpcode() == ISD::SRA && N1.getOperand(0) == N0.getOperand(0)) {
32556
27
    auto *Y1C = dyn_cast<ConstantSDNode>(N1.getOperand(1));
32557
27
    if (Y1C && Y1C->getAPIntValue() == VT.getSizeInBits() - 1) {
32558
27
      // Generate SUB & CMOV.
32559
27
      SDValue Neg = DAG.getNode(X86ISD::SUB, DL, DAG.getVTList(VT, MVT::i32),
32560
27
                                DAG.getConstant(0, DL, VT), N0.getOperand(0));
32561
27
      SDValue Ops[] = {N0.getOperand(0), Neg,
32562
27
                       DAG.getConstant(X86::COND_GE, DL, MVT::i8),
32563
27
                       SDValue(Neg.getNode(), 1)};
32564
27
      return DAG.getNode(X86ISD::CMOV, DL, DAG.getVTList(VT, MVT::Glue), Ops);
32565
27
    }
32566
11.6k
  }
32567
11.6k
  return SDValue();
32568
11.6k
}
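For reference, a standalone scalar sketch (not taken from this file) of the XOR(ADD(X, Y), Y) pattern matched above, with Y = SRA(X, size(X)-1); it is the classic branch-free abs idiom that the generated SUB/CMOV sequence replaces.
#include <cassert>
#include <cstdint>

static int32_t absViaSraAddXor(int32_t X) {
  int32_t Y = X >> 31;                       // sign mask: -1 if X < 0, else 0
  uint32_t Sum = (uint32_t)X + (uint32_t)Y;  // ADD(X, Y)
  return (int32_t)(Sum ^ (uint32_t)Y);       // XOR(ADD(X, Y), Y) == |X|
}

int main() {
  for (int32_t X : {0, 1, -1, 42, -42, INT32_MAX, INT32_MIN + 1})
    assert(absViaSraAddXor(X) == (X < 0 ? -X : X));
  return 0;
}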
32569
32570
/// Try to turn tests against the signbit in the form of:
32571
///   XOR(TRUNCATE(SRL(X, size(X)-1)), 1)
32572
/// into:
32573
///   SETGT(X, -1)
32574
12.9k
static SDValue foldXorTruncShiftIntoCmp(SDNode *N, SelectionDAG &DAG) {
32575
12.9k
  // This is only worth doing if the output type is i8 or i1.
32576
12.9k
  EVT ResultType = N->getValueType(0);
32577
12.9k
  if (ResultType != MVT::i8 && ResultType != MVT::i1)
32578
12.6k
    return SDValue();
32579
325
32580
325
  SDValue N0 = N->getOperand(0);
32581
325
  SDValue N1 = N->getOperand(1);
32582
325
32583
325
  // We should be performing an xor against a truncated shift.
32584
325
  if (N0.getOpcode() != ISD::TRUNCATE || !N0.hasOneUse())
32585
301
    return SDValue();
32586
24
32587
24
  // Make sure we are performing an xor against one.
32588
24
  if (!isOneConstant(N1))
32589
10
    return SDValue();
32590
14
32591
14
  // SetCC on x86 zero extends so only act on this if it's a logical shift.
32592
14
  SDValue Shift = N0.getOperand(0);
32593
14
  if (Shift.getOpcode() != ISD::SRL || !Shift.hasOneUse())
32594
8
    return SDValue();
32595
6
32596
6
  // Make sure we are truncating from one of i16, i32 or i64.
32597
6
  EVT ShiftTy = Shift.getValueType();
32598
6
  if (ShiftTy != MVT::i16 && ShiftTy != MVT::i32 && ShiftTy != MVT::i64)
32599
0
    return SDValue();
32600
6
32601
6
  // Make sure the shift amount extracts the sign bit.
32602
6
  if (!isa<ConstantSDNode>(Shift.getOperand(1)) ||
32603
6
      Shift.getConstantOperandVal(1) != ShiftTy.getSizeInBits() - 1)
32604
0
    return SDValue();
32605
6
32606
6
  // Create a greater-than comparison against -1.
32607
6
  // N.B. Using SETGE against 0 works but we want a canonical looking
32608
6
  // comparison, using SETGT matches up with what TranslateX86CC.
32609
6
  SDLoc DL(N);
32610
6
  SDValue ShiftOp = Shift.getOperand(0);
32611
6
  EVT ShiftOpTy = ShiftOp.getValueType();
32612
6
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
32613
6
  EVT SetCCResultType = TLI.getSetCCResultType(DAG.getDataLayout(),
32614
6
                                               *DAG.getContext(), ResultType);
32615
6
  SDValue Cond = DAG.getSetCC(DL, SetCCResultType, ShiftOp,
32616
6
                              DAG.getConstant(-1, DL, ShiftOpTy), ISD::SETGT);
32617
6
  if (SetCCResultType != ResultType)
32618
0
    Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, ResultType, Cond);
32619
12.9k
  return Cond;
32620
12.9k
}
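A small standalone check (not part of this file) of the scalar equivalence behind the fold above: xor-ing the logically shifted-out sign bit with 1 computes the same predicate as the canonical SETGT(X, -1).
#include <cassert>
#include <cstdint>

int main() {
  for (int32_t X : {0, 1, -1, 123, -123, INT32_MIN, INT32_MAX}) {
    uint32_t XorForm = ((uint32_t)X >> 31) ^ 1u;   // xor(trunc(srl(X, 31)), 1)
    uint32_t SetGt   = (uint32_t)(X > -1);          // setgt(X, -1)
    assert(XorForm == SetGt);
  }
  return 0;
}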
32621
32622
/// Turn vector tests of the signbit in the form of:
32623
///   xor (sra X, elt_size(X)-1), -1
32624
/// into:
32625
///   pcmpgt X, -1
32626
///
32627
/// This should be called before type legalization because the pattern may not
32628
/// persist after that.
32629
static SDValue foldVectorXorShiftIntoCmp(SDNode *N, SelectionDAG &DAG,
32630
20.3k
                                         const X86Subtarget &Subtarget) {
32631
20.3k
  EVT VT = N->getValueType(0);
32632
20.3k
  if (!VT.isSimple())
32633
55
    return SDValue();
32634
20.2k
32635
20.2k
  switch (VT.getSimpleVT().SimpleTy) {
32636
10.9k
  default: return SDValue();
32637
594
  case MVT::v16i8:
32638
594
  case MVT::v8i16:
32639
594
  case MVT::v4i32: if (!Subtarget.hasSSE2()) return SDValue(); break;
32640
7.22k
  case MVT::v2i64: if (!Subtarget.hasSSE42()) return SDValue(); break;
32641
1.51k
  case MVT::v32i8:
32642
1.51k
  case MVT::v16i16:
32643
1.51k
  case MVT::v8i32:
32644
1.51k
  case MVT::v4i64: if (!Subtarget.hasAVX2()) return SDValue(); break;
32645
4.73k
  }
32646
4.73k
32647
4.73k
  // There must be a shift right algebraic before the xor, and the xor must be a
32648
4.73k
  // 'not' operation.
32649
4.73k
  SDValue Shift = N->getOperand(0);
32650
4.73k
  SDValue Ones = N->getOperand(1);
32651
4.73k
  if (Shift.getOpcode() != ISD::SRA || !Shift.hasOneUse() ||
32652
37
      !ISD::isBuildVectorAllOnes(Ones.getNode()))
32653
4.70k
    return SDValue();
32654
33
32655
33
  // The shift should be smearing the sign bit across each vector element.
32656
33
  auto *ShiftBV = dyn_cast<BuildVectorSDNode>(Shift.getOperand(1));
32657
33
  if (!ShiftBV)
32658
0
    return SDValue();
32659
33
32660
33
  EVT ShiftEltTy = Shift.getValueType().getVectorElementType();
32661
33
  auto *ShiftAmt = ShiftBV->getConstantSplatNode();
32662
33
  if (!ShiftAmt || ShiftAmt->getZExtValue() != ShiftEltTy.getSizeInBits() - 1)
32663
0
    return SDValue();
32664
33
32665
33
  // Create a greater-than comparison against -1. We don't use the more obvious
32666
33
  // greater-than-or-equal-to-zero because SSE/AVX don't have that instruction.
32667
33
  return DAG.getNode(X86ISD::PCMPGT, SDLoc(N), VT, Shift.getOperand(0), Ones);
32668
33
}
32669
32670
/// Check if truncation with saturation form type \p SrcVT to \p DstVT
32671
/// is valid for the given \p Subtarget.
32672
static bool isSATValidOnAVX512Subtarget(EVT SrcVT, EVT DstVT,
32673
3.44k
                                        const X86Subtarget &Subtarget) {
32674
3.44k
  if (!Subtarget.hasAVX512())
32675
799
    return false;
32676
2.64k
32677
2.64k
  // FIXME: Scalar type may be supported if we move it to vector register.
32678
2.64k
  if (!SrcVT.isVector() || !SrcVT.isSimple() || SrcVT.getSizeInBits() > 512)
32679
0
    return false;
32680
2.64k
32681
2.64k
  EVT SrcElVT = SrcVT.getScalarType();
32682
2.64k
  EVT DstElVT = DstVT.getScalarType();
32683
2.64k
  if (SrcElVT.getSizeInBits() < 16 || SrcElVT.getSizeInBits() > 64)
32684
1.67k
    return false;
32685
974
  if (DstElVT.getSizeInBits() < 8 || DstElVT.getSizeInBits() > 32)
32686
102
    return false;
32687
872
  if (SrcVT.is512BitVector() || Subtarget.hasVLX())
32688
720
    return SrcElVT.getSizeInBits() >= 32 || Subtarget.hasBWI();
32689
152
  return false;
32690
152
}
32691
32692
/// Detect a pattern of truncation with saturation:
32693
/// (truncate (umin (x, unsigned_max_of_dest_type)) to dest_type).
32694
/// Return the source value to be truncated or SDValue() if the pattern was not
32695
/// matched.
32696
47.7k
static SDValue detectUSatPattern(SDValue In, EVT VT) {
32697
47.7k
  if (In.getOpcode() != ISD::UMIN)
32698
47.6k
    return SDValue();
32699
32
32700
32
  //Saturation with truncation. We truncate from InVT to VT.
32701
47.7k
  assert(In.getScalarValueSizeInBits() > VT.getScalarSizeInBits() &&
32702
32
    "Unexpected types for truncate operation");
32703
32
32704
32
  APInt C;
32705
32
  if (ISD::isConstantSplatVector(In.getOperand(1).getNode(), C)) {
32706
32
    // C should be equal to UINT32_MAX / UINT16_MAX / UINT8_MAX according
32707
32
    // the element size of the destination type.
32708
21
    return C.isMask(VT.getScalarSizeInBits()) ? In.getOperand(0) :
32709
11
      SDValue();
32710
32
  }
32711
0
  return SDValue();
32712
0
}
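As a scalar illustration (standalone, not from this file) of the pattern detectUSatPattern looks for, truncate(umin(x, unsigned-max-of-dest)) behaves as an unsigned saturating truncation, which is what the AVX512 VPMOVUS* down-converts provide per element.
#include <algorithm>
#include <cassert>
#include <cstdint>

static uint8_t truncUSat8(uint32_t X) {
  return (uint8_t)std::min<uint32_t>(X, 0xFFu);   // umin with UINT8_MAX, then truncate
}

int main() {
  assert(truncUSat8(42) == 42);        // in range: unchanged
  assert(truncUSat8(255) == 255);
  assert(truncUSat8(256) == 255);      // clamps instead of wrapping to 0
  assert(truncUSat8(70000) == 255);
  return 0;
}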
32713
32714
/// Detect a pattern of truncation with saturation:
32715
/// (truncate (umin (x, unsigned_max_of_dest_type)) to dest_type).
32716
/// The types should allow to use VPMOVUS* instruction on AVX512.
32717
/// Return the source value to be truncated or SDValue() if the pattern was not
32718
/// matched.
32719
static SDValue detectAVX512USatPattern(SDValue In, EVT VT,
32720
3.43k
                                       const X86Subtarget &Subtarget) {
32721
3.43k
  if (!isSATValidOnAVX512Subtarget(In.getValueType(), VT, Subtarget))
32722
2.73k
    return SDValue();
32723
702
  return detectUSatPattern(In, VT);
32724
702
}
32725
32726
static SDValue
32727
combineTruncateWithUSat(SDValue In, EVT VT, SDLoc &DL, SelectionDAG &DAG,
32728
53.5k
                        const X86Subtarget &Subtarget) {
32729
53.5k
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
32730
53.5k
  if (!TLI.isTypeLegal(In.getValueType()) || !TLI.isTypeLegal(VT))
32731
6.53k
    return SDValue();
32732
47.0k
  if (auto USatVal = detectUSatPattern(In, VT))
32733
11
    if (isSATValidOnAVX512Subtarget(In.getValueType(), VT, Subtarget))
32734
7
      return DAG.getNode(X86ISD::VTRUNCUS, DL, VT, USatVal);
32735
47.0k
  return SDValue();
32736
47.0k
}
32737
32738
/// This function detects the AVG pattern between vectors of unsigned i8/i16,
32739
/// which is c = (a + b + 1) / 2, and replace this operation with the efficient
32740
/// X86ISD::AVG instruction.
32741
static SDValue detectAVGPattern(SDValue In, EVT VT, SelectionDAG &DAG,
32742
                                const X86Subtarget &Subtarget,
32743
57.2k
                                const SDLoc &DL) {
32744
57.2k
  if (!VT.isVector() || !VT.isSimple())
32745
47.5k
    return SDValue();
32746
9.71k
  EVT InVT = In.getValueType();
32747
9.71k
  unsigned NumElems = VT.getVectorNumElements();
32748
9.71k
32749
9.71k
  EVT ScalarVT = VT.getVectorElementType();
32750
9.71k
  if (!((ScalarVT == MVT::i8 || ScalarVT == MVT::i16) &&
32751
3.41k
        isPowerOf2_32(NumElems)))
32752
6.29k
    return SDValue();
32753
3.41k
32754
3.41k
  // InScalarVT is the intermediate type in AVG pattern and it should be greater
32755
3.41k
  // than the original input type (i8/i16).
32756
3.41k
  EVT InScalarVT = InVT.getVectorElementType();
32757
3.41k
  if (InScalarVT.getSizeInBits() <= ScalarVT.getSizeInBits())
32758
0
    return SDValue();
32759
3.41k
32760
3.41k
  if (!Subtarget.hasSSE2())
32761
0
    return SDValue();
32762
3.41k
  if (Subtarget.hasBWI()) {
32763
774
    if (VT.getSizeInBits() > 512)
32764
0
      return SDValue();
32765
2.64k
  } else if (Subtarget.hasAVX2()) {
32766
1.43k
    if (VT.getSizeInBits() > 256)
32767
18
      return SDValue();
32768
1.21k
  } else {
32769
1.21k
    if (VT.getSizeInBits() > 128)
32770
53
      return SDValue();
32771
3.34k
  }
32772
3.34k
32773
3.34k
  // Detect the following pattern:
32774
3.34k
  //
32775
3.34k
  //   %1 = zext <N x i8> %a to <N x i32>
32776
3.34k
  //   %2 = zext <N x i8> %b to <N x i32>
32777
3.34k
  //   %3 = add nuw nsw <N x i32> %1, <i32 1 x N>
32778
3.34k
  //   %4 = add nuw nsw <N x i32> %3, %2
32779
3.34k
  //   %5 = lshr <N x i32> %N, <i32 1 x N>
32780
3.34k
  //   %6 = trunc <N x i32> %5 to <N x i8>
32781
3.34k
  //
32782
3.34k
  // In AVX512, the last instruction can also be a trunc store.
32783
3.34k
32784
3.34k
  
if (3.34k
In.getOpcode() != ISD::SRL3.34k
)
32785
3.04k
    return SDValue();
32786
302
32787
302
  // A lambda checking the given SDValue is a constant vector and each element
32788
302
  // is in the range [Min, Max].
32789
302
  auto IsConstVectorInRange = [](SDValue V, unsigned Min, unsigned Max) {
32790
846
    BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(V);
32791
846
    if (!BV || !BV->isConstant())
32792
66
      return false;
32793
780
    for (SDValue Op : V->ops()) {
32794
13.1k
      ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op);
32795
13.1k
      if (!C)
32796
0
        return false;
32797
13.1k
      uint64_t Val = C->getZExtValue();
32798
13.1k
      if (Val < Min || Val > Max)
32799
3
        return false;
32800
777
    }
32801
777
    return true;
32802
777
  };
32803
302
32804
302
  // Check if each element of the vector is left-shifted by one.
32805
302
  auto LHS = In.getOperand(0);
32806
302
  auto RHS = In.getOperand(1);
32807
302
  if (!IsConstVectorInRange(RHS, 1, 1))
32808
3
    return SDValue();
32809
299
  if (LHS.getOpcode() != ISD::ADD)
32810
18
    return SDValue();
32811
281
32812
281
  // Detect a pattern of a + b + 1 where the order doesn't matter.
32813
281
  SDValue Operands[3];
32814
281
  Operands[0] = LHS.getOperand(0);
32815
281
  Operands[1] = LHS.getOperand(1);
32816
281
32817
281
  // Take care of the case when one of the operands is a constant vector whose
32818
281
  // element is in the range [1, 256].
32819
281
  if (IsConstVectorInRange(Operands[1], 1, ScalarVT == MVT::i8 ? 256 : 65536) &&
32820
248
      Operands[0].getOpcode() == ISD::ZERO_EXTEND &&
32821
281
      Operands[0].getOperand(0).getValueType() == VT) {
32822
33
    // The pattern is detected. Subtract one from the constant vector, then
32823
33
    // demote it and emit X86ISD::AVG instruction.
32824
33
    SDValue VecOnes = DAG.getConstant(1, DL, InVT);
32825
33
    Operands[1] = DAG.getNode(ISD::SUB, DL, InVT, Operands[1], VecOnes);
32826
33
    Operands[1] = DAG.getNode(ISD::TRUNCATE, DL, VT, Operands[1]);
32827
33
    return DAG.getNode(X86ISD::AVG, DL, VT, Operands[0].getOperand(0),
32828
33
                       Operands[1]);
32829
33
  }
32830
248
32831
248
  if (Operands[0].getOpcode() == ISD::ADD)
32832
230
    std::swap(Operands[0], Operands[1]);
32833
18
  else if (Operands[1].getOpcode() != ISD::ADD)
32834
18
    return SDValue();
32835
230
  Operands[2] = Operands[1].getOperand(0);
32836
230
  Operands[1] = Operands[1].getOperand(1);
32837
230
32838
230
  // Now we have three operands of two additions. Check that one of them is a
32839
230
  // constant vector with ones, and the other two are promoted from i8/i16.
32840
263
  for (int i = 0; i < 3; ++i) {
32841
263
    if (!IsConstVectorInRange(Operands[i], 1, 1))
32842
33
      continue;
32843
230
    std::swap(Operands[i], Operands[2]);
32844
230
32845
230
    // Check if Operands[0] and Operands[1] are results of type promotion.
32846
632
    for (int j = 0; j < 2; ++j)
32847
431
      if (Operands[j].getOpcode() != ISD::ZERO_EXTEND ||
32848
402
          Operands[j].getOperand(0).getValueType() != VT)
32849
29
        return SDValue();
32850
230
32851
230
    // The pattern is detected, emit X86ISD::AVG instruction.
32852
201
    return DAG.getNode(X86ISD::AVG, DL, VT, Operands[0].getOperand(0),
32853
201
                       Operands[1].getOperand(0));
32854
263
  }
32855
230
32856
0
  return SDValue();
32857
57.2k
}
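A standalone scalar sketch (not part of this file) of the pattern detected above: widening, adding the two inputs plus one, then shifting right and truncating gives the rounded average that X86ISD::AVG (pavgb/pavgw) computes, with no overflow in the wide intermediate type.
#include <cassert>
#include <cstdint>

static uint8_t avgRoundUp(uint8_t A, uint8_t B) {
  uint32_t Wide = (uint32_t)A + (uint32_t)B + 1;   // zext, add, add 1
  return (uint8_t)(Wide >> 1);                     // lshr 1, truncate
}

int main() {
  assert(avgRoundUp(0, 0) == 0);
  assert(avgRoundUp(1, 2) == 2);         // rounds .5 up
  assert(avgRoundUp(250, 255) == 253);
  assert(avgRoundUp(255, 255) == 255);   // wide add avoids wrap-around
  return 0;
}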
32858
32859
static SDValue combineLoad(SDNode *N, SelectionDAG &DAG,
32860
                           TargetLowering::DAGCombinerInfo &DCI,
32861
276k
                           const X86Subtarget &Subtarget) {
32862
276k
  LoadSDNode *Ld = cast<LoadSDNode>(N);
32863
276k
  EVT RegVT = Ld->getValueType(0);
32864
276k
  EVT MemVT = Ld->getMemoryVT();
32865
276k
  SDLoc dl(Ld);
32866
276k
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
32867
276k
32868
276k
  // For chips with slow 32-byte unaligned loads, break the 32-byte operation
32869
276k
  // into two 16-byte operations. Also split non-temporal aligned loads on
32870
276k
  // pre-AVX2 targets as 32-byte loads will lower to regular temporal loads.
32871
276k
  ISD::LoadExtType Ext = Ld->getExtensionType();
32872
276k
  bool Fast;
32873
276k
  unsigned AddressSpace = Ld->getAddressSpace();
32874
276k
  unsigned Alignment = Ld->getAlignment();
32875
276k
  if (RegVT.is256BitVector() && !DCI.isBeforeLegalizeOps() &&
32876
9.61k
      Ext == ISD::NON_EXTLOAD &&
32877
9.09k
      ((Ld->isNonTemporal() && !Subtarget.hasInt256() && Alignment >= 16) ||
32878
9.03k
       (TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), RegVT,
32879
276k
                               AddressSpace, Alignment, &Fast) && !Fast))) {
32880
79
    unsigned NumElems = RegVT.getVectorNumElements();
32881
79
    if (NumElems < 2)
32882
0
      return SDValue();
32883
79
32884
79
    SDValue Ptr = Ld->getBasePtr();
32885
79
32886
79
    EVT HalfVT = EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(),
32887
79
                                  NumElems/2);
32888
79
    SDValue Load1 =
32889
79
        DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr, Ld->getPointerInfo(),
32890
79
                    Alignment, Ld->getMemOperand()->getFlags());
32891
79
32892
79
    Ptr = DAG.getMemBasePlusOffset(Ptr, 16, dl);
32893
79
    SDValue Load2 =
32894
79
        DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr, Ld->getPointerInfo(),
32895
79
                    std::min(16U, Alignment), Ld->getMemOperand()->getFlags());
32896
79
    SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
32897
79
                             Load1.getValue(1),
32898
79
                             Load2.getValue(1));
32899
79
32900
79
    SDValue NewVec = DAG.getUNDEF(RegVT);
32901
79
    NewVec = insert128BitVector(NewVec, Load1, 0, DAG, dl);
32902
79
    NewVec = insert128BitVector(NewVec, Load2, NumElems / 2, DAG, dl);
32903
79
    return DCI.CombineTo(N, NewVec, TF, true);
32904
79
  }
32905
276k
32906
276k
  return SDValue();
32907
276k
}
32908
32909
/// If V is a build vector of boolean constants and exactly one of those
32910
/// constants is true, return the operand index of that true element.
32911
/// Otherwise, return -1.
32912
895
static int getOneTrueElt(SDValue V) {
32913
895
  // This needs to be a build vector of booleans.
32914
895
  // TODO: Checking for the i1 type matches the IR definition for the mask,
32915
895
  // but the mask check could be loosened to i8 or other types. That might
32916
895
  // also require checking more than 'allOnesValue'; eg, the x86 HW
32917
895
  // instructions only require that the MSB is set for each mask element.
32918
895
  // The ISD::MSTORE comments/definition do not specify how the mask operand
32919
895
  // is formatted.
32920
895
  auto *BV = dyn_cast<BuildVectorSDNode>(V);
32921
895
  if (!BV || BV->getValueType(0).getVectorElementType() != MVT::i1)
32922
785
    return -1;
32923
110
32924
110
  int TrueIndex = -1;
32925
110
  unsigned NumElts = BV->getValueType(0).getVectorNumElements();
32926
494
  for (unsigned i = 0; 
i < NumElts494
;
++i384
) {
32927
434
    const SDValue &Op = BV->getOperand(i);
32928
434
    if (Op.isUndef())
32929
0
      continue;
32930
434
    auto *ConstNode = dyn_cast<ConstantSDNode>(Op);
32931
434
    if (!ConstNode)
32932
0
      return -1;
32933
434
    if (ConstNode->getAPIntValue().isAllOnesValue()) {
32934
144
      // If we already found a one, this is too many.
32935
144
      if (TrueIndex >= 0)
32936
50
        return -1;
32937
94
      TrueIndex = i;
32938
94
    }
32939
434
  }
32940
60
  return TrueIndex;
32941
895
}
32942
32943
/// Given a masked memory load/store operation, return true if it has one mask
32944
/// bit set. If it has one mask bit set, then also return the memory address of
32945
/// the scalar element to load/store, the vector index to insert/extract that
32946
/// scalar element, and the alignment for the scalar memory access.
32947
static bool getParamsForOneTrueMaskedElt(MaskedLoadStoreSDNode *MaskedOp,
32948
                                         SelectionDAG &DAG, SDValue &Addr,
32949
895
                                         SDValue &Index, unsigned &Alignment) {
32950
895
  int TrueMaskElt = getOneTrueElt(MaskedOp->getMask());
32951
895
  if (TrueMaskElt < 0)
32952
851
    return false;
32953
44
32954
44
  // Get the address of the one scalar element that is specified by the mask
32955
44
  // using the appropriate offset from the base pointer.
32956
44
  EVT EltVT = MaskedOp->getMemoryVT().getVectorElementType();
32957
44
  Addr = MaskedOp->getBasePtr();
32958
44
  if (TrueMaskElt != 0) {
32959
36
    unsigned Offset = TrueMaskElt * EltVT.getStoreSize();
32960
36
    Addr = DAG.getMemBasePlusOffset(Addr, Offset, SDLoc(MaskedOp));
32961
36
  }
32962
895
32963
895
  Index = DAG.getIntPtrConstant(TrueMaskElt, SDLoc(MaskedOp));
32964
895
  Alignment = MinAlign(MaskedOp->getAlignment(), EltVT.getStoreSize());
32965
895
  return true;
32966
895
}
32967
32968
/// If exactly one element of the mask is set for a non-extending masked load,
32969
/// it is a scalar load and vector insert.
32970
/// Note: It is expected that the degenerate cases of an all-zeros or all-ones
32971
/// mask have already been optimized in IR, so we don't bother with those here.
32972
static SDValue
32973
reduceMaskedLoadToScalarLoad(MaskedLoadSDNode *ML, SelectionDAG &DAG,
32974
641
                             TargetLowering::DAGCombinerInfo &DCI) {
32975
641
  // TODO: This is not x86-specific, so it could be lifted to DAGCombiner.
32976
641
  // However, some target hooks may need to be added to know when the transform
32977
641
  // is profitable. Endianness would also have to be considered.
32978
641
32979
641
  SDValue Addr, VecIndex;
32980
641
  unsigned Alignment;
32981
641
  if (!getParamsForOneTrueMaskedElt(ML, DAG, Addr, VecIndex, Alignment))
32982
619
    return SDValue();
32983
22
32984
22
  // Load the one scalar element that is specified by the mask using the
32985
22
  // appropriate offset from the base pointer.
32986
22
  SDLoc DL(ML);
32987
22
  EVT VT = ML->getValueType(0);
32988
22
  EVT EltVT = VT.getVectorElementType();
32989
22
  SDValue Load =
32990
22
      DAG.getLoad(EltVT, DL, ML->getChain(), Addr, ML->getPointerInfo(),
32991
22
                  Alignment, ML->getMemOperand()->getFlags());
32992
22
32993
22
  // Insert the loaded element into the appropriate place in the vector.
32994
22
  SDValue Insert = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, ML->getSrc0(),
32995
22
                               Load, VecIndex);
32996
22
  return DCI.CombineTo(ML, Insert, Load.getValue(1), true);
32997
22
}
32998
32999
static SDValue
33000
combineMaskedLoadConstantMask(MaskedLoadSDNode *ML, SelectionDAG &DAG,
33001
148
                              TargetLowering::DAGCombinerInfo &DCI) {
33002
148
  if (!ISD::isBuildVectorOfConstantSDNodes(ML->getMask().getNode()))
33003
106
    return SDValue();
33004
42
33005
42
  SDLoc DL(ML);
33006
42
  EVT VT = ML->getValueType(0);
33007
42
33008
42
  // If we are loading the first and last elements of a vector, it is safe and
33009
42
  // always faster to load the whole vector. Replace the masked load with a
33010
42
  // vector load and select.
33011
42
  unsigned NumElts = VT.getVectorNumElements();
33012
42
  BuildVectorSDNode *MaskBV = cast<BuildVectorSDNode>(ML->getMask());
33013
42
  bool LoadFirstElt = !isNullConstant(MaskBV->getOperand(0));
33014
42
  bool LoadLastElt = !isNullConstant(MaskBV->getOperand(NumElts - 1));
33015
42
  if (LoadFirstElt && LoadLastElt) {
33016
10
    SDValue VecLd = DAG.getLoad(VT, DL, ML->getChain(), ML->getBasePtr(),
33017
10
                                ML->getMemOperand());
33018
10
    SDValue Blend = DAG.getSelect(DL, VT, ML->getMask(), VecLd, ML->getSrc0());
33019
10
    return DCI.CombineTo(ML, Blend, VecLd.getValue(1), true);
33020
10
  }
33021
32
33022
32
  // Convert a masked load with a constant mask into a masked load and a select.
33023
32
  // This allows the select operation to use a faster kind of select instruction
33024
32
  // (for example, vblendvps -> vblendps).
33025
32
33026
32
  // Don't try this if the pass-through operand is already undefined. That would
33027
32
  // cause an infinite loop because that's what we're about to create.
33028
32
  if (ML->getSrc0().isUndef())
33029
26
    return SDValue();
33030
6
33031
6
  // The new masked load has an undef pass-through operand. The select uses the
33032
6
  // original pass-through operand.
33033
6
  SDValue NewML = DAG.getMaskedLoad(VT, DL, ML->getChain(), ML->getBasePtr(),
33034
6
                                    ML->getMask(), DAG.getUNDEF(VT),
33035
6
                                    ML->getMemoryVT(), ML->getMemOperand(),
33036
6
                                    ML->getExtensionType());
33037
6
  SDValue Blend = DAG.getSelect(DL, VT, ML->getMask(), NewML, ML->getSrc0());
33038
6
33039
6
  return DCI.CombineTo(ML, Blend, NewML.getValue(1), true);
33040
6
}
33041
33042
static SDValue combineMaskedLoad(SDNode *N, SelectionDAG &DAG,
33043
                                 TargetLowering::DAGCombinerInfo &DCI,
33044
719
                                 const X86Subtarget &Subtarget) {
33045
719
  MaskedLoadSDNode *Mld = cast<MaskedLoadSDNode>(N);
33046
719
33047
719
  // TODO: Expanding load with constant mask may be optimized as well.
33048
719
  if (Mld->isExpandingLoad())
33049
74
    return SDValue();
33050
645
33051
645
  if (Mld->getExtensionType() == ISD::NON_EXTLOAD) {
33052
641
    if (SDValue ScalarLoad = reduceMaskedLoadToScalarLoad(Mld, DAG, DCI))
33053
22
      return ScalarLoad;
33054
619
    // TODO: Do some AVX512 subsets benefit from this transform?
33055
619
    if (!Subtarget.hasAVX512())
33056
148
      if (SDValue Blend = combineMaskedLoadConstantMask(Mld, DAG, DCI))
33057
16
        return Blend;
33058
607
  }
33059
607
33060
607
  if (Mld->getExtensionType() != ISD::SEXTLOAD)
33061
603
    return SDValue();
33062
4
33063
4
  // Resolve extending loads.
33064
4
  EVT VT = Mld->getValueType(0);
33065
4
  unsigned NumElems = VT.getVectorNumElements();
33066
4
  EVT LdVT = Mld->getMemoryVT();
33067
4
  SDLoc dl(Mld);
33068
4
33069
4
  assert(LdVT != VT && "Cannot extend to the same type");
33070
4
  unsigned ToSz = VT.getScalarSizeInBits();
33071
4
  unsigned FromSz = LdVT.getScalarSizeInBits();
33072
4
  // From/To sizes and ElemCount must be pow of two.
33073
4
  assert (isPowerOf2_32(NumElems * FromSz * ToSz) &&
33074
4
    "Unexpected size for extending masked load");
33075
4
33076
4
  unsigned SizeRatio  = ToSz / FromSz;
33077
4
  assert(SizeRatio * NumElems * FromSz == VT.getSizeInBits());
33078
4
33079
4
  // Create a type on which we perform the shuffle.
33080
4
  EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(),
33081
4
          LdVT.getScalarType(), NumElems*SizeRatio);
33082
4
  assert(WideVecVT.getSizeInBits() == VT.getSizeInBits());
33083
4
33084
4
  // Convert Src0 value.
33085
4
  SDValue WideSrc0 = DAG.getBitcast(WideVecVT, Mld->getSrc0());
33086
4
  if (!Mld->getSrc0().isUndef()) {
33087
4
    SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1);
33088
12
    for (unsigned i = 0; i != NumElems; ++i)
33089
8
      ShuffleVec[i] = i * SizeRatio;
33090
4
33091
4
    // Can't shuffle using an illegal type.
33092
4
    assert(DAG.getTargetLoweringInfo().isTypeLegal(WideVecVT) &&
33093
4
           "WideVecVT should be legal");
33094
4
    WideSrc0 = DAG.getVectorShuffle(WideVecVT, dl, WideSrc0,
33095
4
                                    DAG.getUNDEF(WideVecVT), ShuffleVec);
33096
4
  }
33097
4
33098
4
  // Prepare the new mask.
33099
4
  SDValue NewMask;
33100
4
  SDValue Mask = Mld->getMask();
33101
4
  if (Mask.getValueType() == VT) {
33102
3
    // Mask and original value have the same type.
33103
3
    NewMask = DAG.getBitcast(WideVecVT, Mask);
33104
3
    SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1);
33105
9
    for (unsigned i = 0; i != NumElems; ++i)
33106
6
      ShuffleVec[i] = i * SizeRatio;
33107
9
    for (unsigned i = NumElems; i != NumElems * SizeRatio; ++i)
33108
6
      ShuffleVec[i] = NumElems * SizeRatio;
33109
3
    NewMask = DAG.getVectorShuffle(WideVecVT, dl, NewMask,
33110
3
                                   DAG.getConstant(0, dl, WideVecVT),
33111
3
                                   ShuffleVec);
33112
4
  } else {
33113
1
    assert(Mask.getValueType().getVectorElementType() == MVT::i1);
33114
1
    unsigned WidenNumElts = NumElems*SizeRatio;
33115
1
    unsigned MaskNumElts = VT.getVectorNumElements();
33116
1
    EVT NewMaskVT = EVT::getVectorVT(*DAG.getContext(),  MVT::i1,
33117
1
                                     WidenNumElts);
33118
1
33119
1
    unsigned NumConcat = WidenNumElts / MaskNumElts;
33120
1
    SDValue ZeroVal = DAG.getConstant(0, dl, Mask.getValueType());
33121
1
    SmallVector<SDValue, 16> Ops(NumConcat, ZeroVal);
33122
1
    Ops[0] = Mask;
33123
1
    NewMask = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewMaskVT, Ops);
33124
1
  }
33125
719
33126
719
  SDValue WideLd = DAG.getMaskedLoad(WideVecVT, dl, Mld->getChain(),
33127
719
                                     Mld->getBasePtr(), NewMask, WideSrc0,
33128
719
                                     Mld->getMemoryVT(), Mld->getMemOperand(),
33129
719
                                     ISD::NON_EXTLOAD);
33130
719
  SDValue NewVec = getExtendInVec(X86ISD::VSEXT, dl, VT, WideLd, DAG);
33131
719
  return DCI.CombineTo(N, NewVec, WideLd.getValue(1), true);
33132
719
}
33133
33134
/// If exactly one element of the mask is set for a non-truncating masked store,
33135
/// it is a vector extract and scalar store.
33136
/// Note: It is expected that the degenerate cases of an all-zeros or all-ones
33137
/// mask have already been optimized in IR, so we don't bother with those here.
33138
static SDValue reduceMaskedStoreToScalarStore(MaskedStoreSDNode *MS,
33139
254
                                              SelectionDAG &DAG) {
33140
254
  // TODO: This is not x86-specific, so it could be lifted to DAGCombiner.
33141
254
  // However, some target hooks may need to be added to know when the transform
33142
254
  // is profitable. Endianness would also have to be considered.
33143
254
33144
254
  SDValue Addr, VecIndex;
33145
254
  unsigned Alignment;
33146
254
  if (!getParamsForOneTrueMaskedElt(MS, DAG, Addr, VecIndex, Alignment))
33147
232
    return SDValue();
33148
22
33149
22
  // Extract the one scalar element that is actually being stored.
33150
22
  SDLoc DL(MS);
33151
22
  EVT VT = MS->getValue().getValueType();
33152
22
  EVT EltVT = VT.getVectorElementType();
33153
22
  SDValue Extract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT,
33154
22
                                MS->getValue(), VecIndex);
33155
22
33156
22
  // Store that element at the appropriate offset from the base pointer.
33157
22
  return DAG.getStore(MS->getChain(), DL, Extract, Addr, MS->getPointerInfo(),
33158
22
                      Alignment, MS->getMemOperand()->getFlags());
33159
22
}
33160
33161
static SDValue combineMaskedStore(SDNode *N, SelectionDAG &DAG,
33162
367
                                  const X86Subtarget &Subtarget) {
33163
367
  MaskedStoreSDNode *Mst = cast<MaskedStoreSDNode>(N);
33164
367
33165
367
  if (Mst->isCompressingStore())
33166
87
    return SDValue();
33167
280
33168
280
  if (!Mst->isTruncatingStore()) {
33169
254
    if (SDValue ScalarStore = reduceMaskedStoreToScalarStore(Mst, DAG))
33170
22
      return ScalarStore;
33171
232
33172
232
    // If the mask is checking (0 > X), we're creating a vector with all-zeros
33173
232
    // or all-ones elements based on the sign bits of X. AVX1 masked store only
33174
232
    // cares about the sign bit of each mask element, so eliminate the compare:
33175
232
    // mstore val, ptr, (pcmpgt 0, X) --> mstore val, ptr, X
33176
232
    // Note that by waiting to match an x86-specific PCMPGT node, we're
33177
232
    // eliminating potentially more complex matching of a setcc node which has
33178
232
    // a full range of predicates.
33179
232
    SDValue Mask = Mst->getMask();
33180
232
    if (Mask.getOpcode() == X86ISD::PCMPGT &&
33181
232
        ISD::isBuildVectorAllZeros(Mask.getOperand(0).getNode())) {
33182
3
      assert(Mask.getValueType() == Mask.getOperand(1).getValueType() &&
33183
3
             "Unexpected type for PCMPGT");
33184
3
      return DAG.getMaskedStore(
33185
3
          Mst->getChain(), SDLoc(N), Mst->getValue(), Mst->getBasePtr(),
33186
3
          Mask.getOperand(1), Mst->getMemoryVT(), Mst->getMemOperand());
33187
3
    }
33188
229
33189
229
    // TODO: AVX512 targets should also be able to simplify something like the
33190
229
    // pattern above, but that pattern will be different. It will either need to
33191
229
    // match setcc more generally or match PCMPGTM later (in tablegen?).
33192
229
33193
229
    return SDValue();
33194
229
  }
33195
26
33196
26
  // Resolve truncating stores.
33197
26
  EVT VT = Mst->getValue().getValueType();
33198
26
  unsigned NumElems = VT.getVectorNumElements();
33199
26
  EVT StVT = Mst->getMemoryVT();
33200
26
  SDLoc dl(Mst);
33201
26
33202
26
  assert(StVT != VT && "Cannot truncate to the same type");
33203
26
  unsigned FromSz = VT.getScalarSizeInBits();
33204
26
  unsigned ToSz = StVT.getScalarSizeInBits();
33205
26
33206
26
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
33207
26
33208
26
  // The truncating store is legal in some cases. For example
33209
26
  // vpmovqb, vpmovqw, vpmovqd, vpmovdb, vpmovdw
33210
26
  // are designated for truncate store.
33211
26
  // In this case we don't need any further transformations.
33212
26
  if (TLI.isTruncStoreLegal(VT, StVT))
33213
23
    return SDValue();
33214
3
33215
3
  // From/To sizes and ElemCount must be pow of two.
33216
26
  assert (isPowerOf2_32(NumElems * FromSz * ToSz) &&
33217
3
    "Unexpected size for truncating masked store");
33218
3
  // We are going to use the original vector elt for storing.
33219
3
  // Accumulated smaller vector elements must be a multiple of the store size.
33220
3
  assert (((NumElems * FromSz) % ToSz) == 0 &&
33221
3
          "Unexpected ratio for truncating masked store");
33222
3
33223
3
  unsigned SizeRatio  = FromSz / ToSz;
33224
3
  assert(SizeRatio * NumElems * ToSz == VT.getSizeInBits());
33225
3
33226
3
  // Create a type on which we perform the shuffle.
33227
3
  EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(),
33228
3
          StVT.getScalarType(), NumElems*SizeRatio);
33229
3
33230
3
  assert(WideVecVT.getSizeInBits() == VT.getSizeInBits());
33231
3
33232
3
  SDValue WideVec = DAG.getBitcast(WideVecVT, Mst->getValue());
33233
3
  SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1);
33234
9
  for (unsigned i = 0; i != NumElems; ++i)
33235
6
    ShuffleVec[i] = i * SizeRatio;
33236
3
33237
3
  // Can't shuffle using an illegal type.
33238
3
  assert(DAG.getTargetLoweringInfo().isTypeLegal(WideVecVT) &&
33239
3
         "WideVecVT should be legal");
33240
3
33241
3
  SDValue TruncatedVal = DAG.getVectorShuffle(WideVecVT, dl, WideVec,
33242
3
                                              DAG.getUNDEF(WideVecVT),
33243
3
                                              ShuffleVec);
33244
3
33245
3
  SDValue NewMask;
33246
3
  SDValue Mask = Mst->getMask();
33247
3
  if (Mask.getValueType() == VT) {
33248
3
    // Mask and original value have the same type.
33249
3
    NewMask = DAG.getBitcast(WideVecVT, Mask);
33250
9
    for (unsigned i = 0; i != NumElems; ++i)
33251
6
      ShuffleVec[i] = i * SizeRatio;
33252
9
    for (unsigned i = NumElems; i != NumElems*SizeRatio; ++i)
33253
6
      ShuffleVec[i] = NumElems*SizeRatio;
33254
3
    NewMask = DAG.getVectorShuffle(WideVecVT, dl, NewMask,
33255
3
                                   DAG.getConstant(0, dl, WideVecVT),
33256
3
                                   ShuffleVec);
33257
0
  } else {
33258
0
    assert(Mask.getValueType().getVectorElementType() == MVT::i1);
33259
0
    unsigned WidenNumElts = NumElems*SizeRatio;
33260
0
    unsigned MaskNumElts = VT.getVectorNumElements();
33261
0
    EVT NewMaskVT = EVT::getVectorVT(*DAG.getContext(),  MVT::i1,
33262
0
                                     WidenNumElts);
33263
0
33264
0
    unsigned NumConcat = WidenNumElts / MaskNumElts;
33265
0
    SDValue ZeroVal = DAG.getConstant(0, dl, Mask.getValueType());
33266
0
    SmallVector<SDValue, 16> Ops(NumConcat, ZeroVal);
33267
0
    Ops[0] = Mask;
33268
0
    NewMask = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewMaskVT, Ops);
33269
0
  }
33270
367
33271
367
  return DAG.getMaskedStore(Mst->getChain(), dl, TruncatedVal,
33272
367
                            Mst->getBasePtr(), NewMask, StVT,
33273
367
                            Mst->getMemOperand(), false);
33274
367
}
33275
33276
static SDValue combineStore(SDNode *N, SelectionDAG &DAG,
33277
304k
                            const X86Subtarget &Subtarget) {
33278
304k
  StoreSDNode *St = cast<StoreSDNode>(N);
33279
304k
  EVT VT = St->getValue().getValueType();
33280
304k
  EVT StVT = St->getMemoryVT();
33281
304k
  SDLoc dl(St);
33282
304k
  SDValue StoredVal = St->getOperand(1);
33283
304k
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
33284
304k
33285
304k
  // If we are saving a concatenation of two XMM registers and 32-byte stores
33286
304k
  // are slow, such as on Sandy Bridge, perform two 16-byte stores.
33287
304k
  bool Fast;
33288
304k
  unsigned AddressSpace = St->getAddressSpace();
33289
304k
  unsigned Alignment = St->getAlignment();
33290
304k
  if (VT.is256BitVector() && StVT == VT &&
33291
6.11k
      TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), VT,
33292
6.11k
                             AddressSpace, Alignment, &Fast) &&
33293
304k
      !Fast) {
33294
34
    unsigned NumElems = VT.getVectorNumElements();
33295
34
    if (NumElems < 2)
33296
0
      return SDValue();
33297
34
33298
34
    SDValue Value0 = extract128BitVector(StoredVal, 0, DAG, dl);
33299
34
    SDValue Value1 = extract128BitVector(StoredVal, NumElems / 2, DAG, dl);
33300
34
33301
34
    SDValue Ptr0 = St->getBasePtr();
33302
34
    SDValue Ptr1 = DAG.getMemBasePlusOffset(Ptr0, 16, dl);
33303
34
33304
34
    SDValue Ch0 =
33305
34
        DAG.getStore(St->getChain(), dl, Value0, Ptr0, St->getPointerInfo(),
33306
34
                     Alignment, St->getMemOperand()->getFlags());
33307
34
    SDValue Ch1 =
33308
34
        DAG.getStore(St->getChain(), dl, Value1, Ptr1, St->getPointerInfo(),
33309
34
                     std::min(16U, Alignment), St->getMemOperand()->getFlags());
33310
34
    return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Ch0, Ch1);
33311
34
  }
33312
303k
33313
303k
  // Optimize trunc store (of multiple scalars) to shuffle and store.
33314
303k
  // First, pack all of the elements in one place. Next, store to memory
33315
303k
  // in fewer chunks.
33316
303k
  if (St->isTruncatingStore() && VT.isVector()) {
33317
3.44k
    // Check if we can detect an AVG pattern from the truncation. If yes,
33318
3.44k
    // replace the trunc store by a normal store with the result of X86ISD::AVG
33319
3.44k
    // instruction.
33320
3.44k
    if (SDValue Avg = detectAVGPattern(St->getValue(), St->getMemoryVT(), DAG,
33321
3.44k
                                       Subtarget, dl))
33322
12
      return DAG.getStore(St->getChain(), dl, Avg, St->getBasePtr(),
33323
12
                          St->getPointerInfo(), St->getAlignment(),
33324
12
                          St->getMemOperand()->getFlags());
33325
3.43k
33326
3.43k
    if (SDValue Val =
33327
3.43k
        detectAVX512USatPattern(St->getValue(), St->getMemoryVT(), Subtarget))
33328
10
      return EmitTruncSStore(false /* Unsigned saturation */, St->getChain(),
33329
10
                             dl, Val, St->getBasePtr(),
33330
10
                             St->getMemoryVT(), St->getMemOperand(), DAG);
33331
3.42k
33332
3.42k
    const TargetLowering &TLI = DAG.getTargetLoweringInfo();
33333
3.42k
    unsigned NumElems = VT.getVectorNumElements();
33334
3.42k
    assert(StVT != VT && "Cannot truncate to the same type");
33335
3.42k
    unsigned FromSz = VT.getScalarSizeInBits();
33336
3.42k
    unsigned ToSz = StVT.getScalarSizeInBits();
33337
3.42k
33338
3.42k
    // The truncating store is legal in some cases. For example
33339
3.42k
    // vpmovqb, vpmovqw, vpmovqd, vpmovdb, vpmovdw
33340
3.42k
    // are designated for truncate store.
33341
3.42k
    // In this case we don't need any further transformations.
33342
3.42k
    if (TLI.isTruncStoreLegalOrCustom(VT, StVT))
33343
2.46k
      return SDValue();
33344
958
33345
958
    // From, To sizes and ElemCount must be pow of two
33346
958
    if (!isPowerOf2_32(NumElems * FromSz * ToSz)) return SDValue();
33347
958
    // We are going to use the original vector elt for storing.
33348
958
    // Accumulated smaller vector elements must be a multiple of the store size.
33349
958
    if (0 != (NumElems * FromSz) % ToSz) return SDValue();
33350
958
33351
958
    unsigned SizeRatio  = FromSz / ToSz;
33352
958
33353
958
    assert(SizeRatio * NumElems * ToSz == VT.getSizeInBits());
33354
958
33355
958
    // Create a type on which we perform the shuffle
33356
958
    EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(),
33357
958
            StVT.getScalarType(), NumElems*SizeRatio);
33358
958
33359
958
    assert(WideVecVT.getSizeInBits() == VT.getSizeInBits());
33360
958
33361
958
    SDValue WideVec = DAG.getBitcast(WideVecVT, St->getValue());
33362
958
    SmallVector<int, 8> ShuffleVec(NumElems * SizeRatio, -1);
33363
8.78k
    for (unsigned i = 0; i != NumElems; ++i)
33364
7.82k
      ShuffleVec[i] = i * SizeRatio;
33365
958
33366
958
    // Can't shuffle using an illegal type.
33367
958
    if (!TLI.isTypeLegal(WideVecVT))
33368
220
      return SDValue();
33369
738
33370
738
    SDValue Shuff = DAG.getVectorShuffle(WideVecVT, dl, WideVec,
33371
738
                                         DAG.getUNDEF(WideVecVT),
33372
738
                                         ShuffleVec);
33373
738
    // At this point all of the data is stored at the bottom of the
33374
738
    // register. We now need to save it to mem.
33375
738
33376
738
    // Find the largest store unit
33377
738
    MVT StoreType = MVT::i8;
33378
4.42k
    for (MVT Tp : MVT::integer_valuetypes()) {
33379
4.42k
      if (TLI.isTypeLegal(Tp) && Tp.getSizeInBits() <= NumElems * ToSz)
33380
2.47k
        StoreType = Tp;
33381
4.42k
    }
33382
738
33383
738
    // On 32bit systems, we can't save 64bit integers. Try bitcasting to F64.
33384
738
    if (TLI.isTypeLegal(MVT::f64) && StoreType.getSizeInBits() < 64 &&
33385
411
        (64 <= NumElems * ToSz))
33386
167
      StoreType = MVT::f64;
33387
738
33388
738
    // Bitcast the original vector into a vector of store-size units
33389
738
    EVT StoreVecVT = EVT::getVectorVT(*DAG.getContext(),
33390
738
            StoreType, VT.getSizeInBits()/StoreType.getSizeInBits());
33391
738
    assert(StoreVecVT.getSizeInBits() == VT.getSizeInBits());
33392
738
    SDValue ShuffWide = DAG.getBitcast(StoreVecVT, Shuff);
33393
738
    SmallVector<SDValue, 8> Chains;
33394
738
    SDValue Ptr = St->getBasePtr();
33395
738
33396
738
    // Perform one or more big stores into memory.
33397
1.47k
    for (unsigned i=0, e=(ToSz*NumElems)/StoreType.getSizeInBits(); i!=e; ++i) {
33398
738
      SDValue SubVec = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl,
33399
738
                                   StoreType, ShuffWide,
33400
738
                                   DAG.getIntPtrConstant(i, dl));
33401
738
      SDValue Ch =
33402
738
          DAG.getStore(St->getChain(), dl, SubVec, Ptr, St->getPointerInfo(),
33403
738
                       St->getAlignment(), St->getMemOperand()->getFlags());
33404
738
      Ptr = DAG.getMemBasePlusOffset(Ptr, StoreType.getStoreSize(), dl);
33405
738
      Chains.push_back(Ch);
33406
738
    }
33407
3.44k
33408
3.44k
    return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Chains);
33409
3.44k
  }
33410
300k
33411
300k
  // Turn load->store of MMX types into GPR load/stores.  This avoids clobbering
33412
300k
  // the FP state in cases where an emms may be missing.
33413
300k
  // A preferable solution to the general problem is to figure out the right
33414
300k
  // places to insert EMMS.  This qualifies as a quick hack.
33415
300k
33416
300k
  // Similarly, turn load->store of i64 into double load/stores in 32-bit mode.
33417
300k
  if (VT.getSizeInBits() != 64)
33418
251k
    return SDValue();
33419
49.4k
33420
49.4k
  const Function *F = DAG.getMachineFunction().getFunction();
33421
49.4k
  bool NoImplicitFloatOps = F->hasFnAttribute(Attribute::NoImplicitFloat);
33422
49.4k
  bool F64IsLegal =
33423
49.4k
      !Subtarget.useSoftFloat() && !NoImplicitFloatOps && Subtarget.hasSSE2();
33424
49.4k
  if ((VT.isVector() ||
33425
48.4k
       (VT == MVT::i64 && F64IsLegal && !Subtarget.is64Bit())) &&
33426
1.99k
      isa<LoadSDNode>(St->getValue()) &&
33427
308
      !cast<LoadSDNode>(St->getValue())->isVolatile() &&
33428
49.4k
      St->getChain().hasOneUse() && !St->isVolatile()) {
33429
138
    SDNode* LdVal = St->getValue().getNode();
33430
138
    LoadSDNode *Ld = nullptr;
33431
138
    int TokenFactorIndex = -1;
33432
138
    SmallVector<SDValue, 8> Ops;
33433
138
    SDNode* ChainVal = St->getChain().getNode();
33434
138
    // Must be a store of a load.  We currently handle two cases:  the load
33435
138
    // is a direct child, and it's under an intervening TokenFactor.  It is
33436
138
    // possible to dig deeper under nested TokenFactors.
33437
138
    if (ChainVal == LdVal)
33438
34
      Ld = cast<LoadSDNode>(St->getChain());
33439
104
    else if (St->getValue().hasOneUse() &&
33440
104
             ChainVal->getOpcode() == ISD::TokenFactor) {
33441
162
      for (unsigned i = 0, e = ChainVal->getNumOperands(); i != e; ++i) {
33442
108
        if (ChainVal->getOperand(i).getNode() == LdVal) {
33443
30
          TokenFactorIndex = i;
33444
30
          Ld = cast<LoadSDNode>(St->getValue());
33445
30
        } else
33446
78
          Ops.push_back(ChainVal->getOperand(i));
33447
108
      }
33448
104
    }
33449
138
33450
138
    if (!Ld || !ISD::isNormalLoad(Ld))
33451
86
      return SDValue();
33452
52
33453
52
    // If this is not the MMX case, i.e. we are just turning i64 load/store
33454
52
    // into f64 load/store, avoid the transformation if there are multiple
33455
52
    // uses of the loaded value.
33456
52
    if (!VT.isVector() && !Ld->hasNUsesOfValue(1, 0))
33457
2
      return SDValue();
33458
50
33459
50
    SDLoc LdDL(Ld);
33460
50
    SDLoc StDL(N);
33461
50
    // If we are a 64-bit capable x86, lower to a single movq load/store pair.
33462
50
    // Otherwise, if it's legal to use f64 SSE instructions, use f64 load/store
33463
50
    // pair instead.
33464
50
    if (Subtarget.is64Bit() || F64IsLegal) {
33465
49
      MVT LdVT = Subtarget.is64Bit() ? MVT::i64 : MVT::f64;
33466
49
      SDValue NewLd = DAG.getLoad(LdVT, LdDL, Ld->getChain(), Ld->getBasePtr(),
33467
49
                                  Ld->getPointerInfo(), Ld->getAlignment(),
33468
49
                                  Ld->getMemOperand()->getFlags());
33469
49
      // Make sure new load is placed in same chain order.
33470
49
      SDValue NewChain = DAG.makeEquivalentMemoryOrdering(Ld, NewLd);
33471
49
      if (TokenFactorIndex >= 0) {
33472
26
        Ops.push_back(NewChain);
33473
26
        NewChain = DAG.getNode(ISD::TokenFactor, LdDL, MVT::Other, Ops);
33474
26
      }
33475
49
      return DAG.getStore(NewChain, StDL, NewLd, St->getBasePtr(),
33476
49
                          St->getPointerInfo(), St->getAlignment(),
33477
49
                          St->getMemOperand()->getFlags());
33478
49
    }
33479
1
33480
1
    // Otherwise, lower to two pairs of 32-bit loads / stores.
33481
1
    SDValue LoAddr = Ld->getBasePtr();
33482
1
    SDValue HiAddr = DAG.getMemBasePlusOffset(LoAddr, 4, LdDL);
33483
1
33484
1
    SDValue LoLd = DAG.getLoad(MVT::i32, LdDL, Ld->getChain(), LoAddr,
33485
1
                               Ld->getPointerInfo(), Ld->getAlignment(),
33486
1
                               Ld->getMemOperand()->getFlags());
33487
1
    SDValue HiLd = DAG.getLoad(MVT::i32, LdDL, Ld->getChain(), HiAddr,
33488
1
                               Ld->getPointerInfo().getWithOffset(4),
33489
1
                               MinAlign(Ld->getAlignment(), 4),
33490
1
                               Ld->getMemOperand()->getFlags());
33491
1
    // Make sure new loads are placed in same chain order.
33492
1
    SDValue NewChain = DAG.makeEquivalentMemoryOrdering(Ld, LoLd);
33493
1
    NewChain = DAG.makeEquivalentMemoryOrdering(Ld, HiLd);
33494
1
33495
1
    if (
TokenFactorIndex >= 01
) {
33496
0
      Ops.push_back(NewChain);
33497
0
      NewChain = DAG.getNode(ISD::TokenFactor, LdDL, MVT::Other, Ops);
33498
0
    }
33499
138
33500
138
    LoAddr = St->getBasePtr();
33501
138
    HiAddr = DAG.getMemBasePlusOffset(LoAddr, 4, StDL);
33502
138
33503
138
    SDValue LoSt =
33504
138
        DAG.getStore(NewChain, StDL, LoLd, LoAddr, St->getPointerInfo(),
33505
138
                     St->getAlignment(), St->getMemOperand()->getFlags());
33506
138
    SDValue HiSt = DAG.getStore(
33507
138
        NewChain, StDL, HiLd, HiAddr, St->getPointerInfo().getWithOffset(4),
33508
138
        MinAlign(St->getAlignment(), 4), St->getMemOperand()->getFlags());
33509
138
    return DAG.getNode(ISD::TokenFactor, StDL, MVT::Other, LoSt, HiSt);
33510
138
  }
33511
49.3k
33512
49.3k
  // This is similar to the above case, but here we handle a scalar 64-bit
33513
49.3k
  // integer store that is extracted from a vector on a 32-bit target.
33514
49.3k
  // If we have SSE2, then we can treat it like a floating-point double
33515
49.3k
  // to get past legalization. The execution dependencies fixup pass will
33516
49.3k
  // choose the optimal machine instruction for the store if this really is
33517
49.3k
  // an integer or v2f32 rather than an f64.
33518
49.3k
  
if (49.3k
VT == MVT::i64 && 49.3k
F64IsLegal40.2k
&&
!Subtarget.is64Bit()38.6k
&&
33519
49.3k
      
St->getOperand(1).getOpcode() == ISD::EXTRACT_VECTOR_ELT909
) {
33520
21
    SDValue OldExtract = St->getOperand(1);
33521
21
    SDValue ExtOp0 = OldExtract.getOperand(0);
33522
21
    unsigned VecSize = ExtOp0.getValueSizeInBits();
33523
21
    EVT VecVT = EVT::getVectorVT(*DAG.getContext(), MVT::f64, VecSize / 64);
33524
21
    SDValue BitCast = DAG.getBitcast(VecVT, ExtOp0);
33525
21
    SDValue NewExtract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
33526
21
                                     BitCast, OldExtract.getOperand(1));
33527
21
    return DAG.getStore(St->getChain(), dl, NewExtract, St->getBasePtr(),
33528
21
                        St->getPointerInfo(), St->getAlignment(),
33529
21
                        St->getMemOperand()->getFlags());
33530
21
  }
33531
49.3k
33532
49.3k
  return SDValue();
33533
49.3k
}
33534
33535
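// A minimal standalone sketch (hypothetical helper, not part of the lowering
// above; assumes <cstdint> and <cstring>) of the final fallback: without
// 64-bit GPRs or SSE2 f64 moves, an i64 load/store pair is emulated as two
// 32-bit halves at offsets 0 and +4, which is exactly what the LoAddr/HiAddr
// code builds in DAG form.
static void storeI64AsTwoI32(const void *Src, void *Dst) {
  uint32_t Lo, Hi;
  // Little-endian x86: the low 32 bits live at offset 0, the high at +4.
  std::memcpy(&Lo, static_cast<const char *>(Src) + 0, 4);
  std::memcpy(&Hi, static_cast<const char *>(Src) + 4, 4);
  std::memcpy(static_cast<char *>(Dst) + 0, &Lo, 4);
  std::memcpy(static_cast<char *>(Dst) + 4, &Hi, 4);
}
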
/// Return 'true' if this vector operation is "horizontal"
/// and return the operands for the horizontal operation in LHS and RHS.  A
/// horizontal operation performs the binary operation on successive elements
/// of its first operand, then on successive elements of its second operand,
/// returning the resulting values in a vector.  For example, if
///   A = < float a0, float a1, float a2, float a3 >
/// and
///   B = < float b0, float b1, float b2, float b3 >
/// then the result of doing a horizontal operation on A and B is
///   A horizontal-op B = < a0 op a1, a2 op a3, b0 op b1, b2 op b3 >.
/// In short, LHS and RHS are inspected to see if LHS op RHS is of the form
/// A horizontal-op B, for some already available A and B, and if so then LHS is
/// set to A, RHS to B, and the routine returns 'true'.
/// Note that the binary operation should have the property that if one of the
/// operands is UNDEF then the result is UNDEF.
static bool isHorizontalBinOp(SDValue &LHS, SDValue &RHS, bool IsCommutative) {
  // Look for the following pattern: if
  //   A = < float a0, float a1, float a2, float a3 >
  //   B = < float b0, float b1, float b2, float b3 >
  // and
  //   LHS = VECTOR_SHUFFLE A, B, <0, 2, 4, 6>
  //   RHS = VECTOR_SHUFFLE A, B, <1, 3, 5, 7>
  // then LHS op RHS = < a0 op a1, a2 op a3, b0 op b1, b2 op b3 >
  // which is A horizontal-op B.

  // At least one of the operands should be a vector shuffle.
  if (LHS.getOpcode() != ISD::VECTOR_SHUFFLE &&
      RHS.getOpcode() != ISD::VECTOR_SHUFFLE)
    return false;

  MVT VT = LHS.getSimpleValueType();

  assert((VT.is128BitVector() || VT.is256BitVector()) &&
         "Unsupported vector type for horizontal add/sub");

  // Handle 128 and 256-bit vector lengths. AVX defines horizontal add/sub to
  // operate independently on 128-bit lanes.
  unsigned NumElts = VT.getVectorNumElements();
  unsigned NumLanes = VT.getSizeInBits()/128;
  unsigned NumLaneElts = NumElts / NumLanes;
  assert((NumLaneElts % 2 == 0) &&
         "Vector type should have an even number of elements in each lane");
  unsigned HalfLaneElts = NumLaneElts/2;

  // View LHS in the form
  //   LHS = VECTOR_SHUFFLE A, B, LMask
  // If LHS is not a shuffle then pretend it is the shuffle
  //   LHS = VECTOR_SHUFFLE LHS, undef, <0, 1, ..., N-1>
  // NOTE: in what follows a default initialized SDValue represents an UNDEF of
  // type VT.
  SDValue A, B;
  SmallVector<int, 16> LMask(NumElts);
  if (LHS.getOpcode() == ISD::VECTOR_SHUFFLE) {
    if (!LHS.getOperand(0).isUndef())
      A = LHS.getOperand(0);
    if (!LHS.getOperand(1).isUndef())
      B = LHS.getOperand(1);
    ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(LHS.getNode())->getMask();
    std::copy(Mask.begin(), Mask.end(), LMask.begin());
  } else {
    if (!LHS.isUndef())
      A = LHS;
    for (unsigned i = 0; i != NumElts; ++i)
      LMask[i] = i;
  }

  // Likewise, view RHS in the form
  //   RHS = VECTOR_SHUFFLE C, D, RMask
  SDValue C, D;
  SmallVector<int, 16> RMask(NumElts);
  if (RHS.getOpcode() == ISD::VECTOR_SHUFFLE) {
    if (!RHS.getOperand(0).isUndef())
      C = RHS.getOperand(0);
    if (!RHS.getOperand(1).isUndef())
      D = RHS.getOperand(1);
    ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(RHS.getNode())->getMask();
    std::copy(Mask.begin(), Mask.end(), RMask.begin());
  } else {
    if (!RHS.isUndef())
      C = RHS;
    for (unsigned i = 0; i != NumElts; ++i)
      RMask[i] = i;
  }

  // Check that the shuffles are both shuffling the same vectors.
  if (!(A == C && B == D) && !(A == D && B == C))
    return false;

  // If everything is UNDEF then bail out: it would be better to fold to UNDEF.
  if (!A.getNode() && !B.getNode())
    return false;

  // If A and B occur in reverse order in RHS, then "swap" them (which means
  // rewriting the mask).
  if (A != C)
    ShuffleVectorSDNode::commuteMask(RMask);

  // At this point LHS and RHS are equivalent to
  //   LHS = VECTOR_SHUFFLE A, B, LMask
  //   RHS = VECTOR_SHUFFLE A, B, RMask
  // Check that the masks correspond to performing a horizontal operation.
  for (unsigned l = 0; l != NumElts; l += NumLaneElts) {
    for (unsigned i = 0; i != NumLaneElts; ++i) {
      int LIdx = LMask[i+l], RIdx = RMask[i+l];

      // Ignore any UNDEF components.
      if (LIdx < 0 || RIdx < 0 ||
          (!A.getNode() && (LIdx < (int)NumElts || RIdx < (int)NumElts)) ||
          (!B.getNode() && (LIdx >= (int)NumElts || RIdx >= (int)NumElts)))
        continue;

      // Check that successive elements are being operated on.  If not, this is
      // not a horizontal operation.
      unsigned Src = (i/HalfLaneElts); // each lane is split between srcs
      int Index = 2*(i%HalfLaneElts) + NumElts*Src + l;
      if (!(LIdx == Index && RIdx == Index + 1) &&
          !(IsCommutative && LIdx == Index + 1 && RIdx == Index))
        return false;
    }
  }

  LHS = A.getNode() ? A : B; // If A is 'UNDEF', use B for it.
  RHS = B.getNode() ? B : A; // If B is 'UNDEF', use A for it.
  return true;
}

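// A small self-contained sketch (hypothetical, assumes only <cassert>) that
// evaluates the Index formula above for the v4f32 case and checks that it
// reproduces the masks quoted in the comment: LHS = <0,2,4,6>, RHS = <1,3,5,7>.
static void checkHorizontalMaskFormula() {
  const unsigned NumElts = 4, NumLanes = 1;
  const unsigned NumLaneElts = NumElts / NumLanes;
  const unsigned HalfLaneElts = NumLaneElts / 2;
  const int ExpectedL[4] = {0, 2, 4, 6};
  const int ExpectedR[4] = {1, 3, 5, 7};
  for (unsigned l = 0; l != NumElts; l += NumLaneElts) {
    for (unsigned i = 0; i != NumLaneElts; ++i) {
      unsigned Src = i / HalfLaneElts;                  // first or second source
      int Index = 2 * (i % HalfLaneElts) + NumElts * Src + l;
      assert(ExpectedL[i + l] == Index && ExpectedR[i + l] == Index + 1);
    }
  }
}
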
/// Do target-specific dag combines on floating-point adds/subs.
static SDValue combineFaddFsub(SDNode *N, SelectionDAG &DAG,
                               const X86Subtarget &Subtarget) {
  EVT VT = N->getValueType(0);
  SDValue LHS = N->getOperand(0);
  SDValue RHS = N->getOperand(1);
  bool IsFadd = N->getOpcode() == ISD::FADD;
  assert((IsFadd || N->getOpcode() == ISD::FSUB) && "Wrong opcode");

  // Try to synthesize horizontal add/sub from adds/subs of shuffles.
  if (((Subtarget.hasSSE3() && (VT == MVT::v4f32 || VT == MVT::v2f64)) ||
       (Subtarget.hasFp256() && (VT == MVT::v8f32 || VT == MVT::v4f64))) &&
      isHorizontalBinOp(LHS, RHS, IsFadd)) {
    auto NewOpcode = IsFadd ? X86ISD::FHADD : X86ISD::FHSUB;
    return DAG.getNode(NewOpcode, SDLoc(N), VT, LHS, RHS);
  }
  return SDValue();
}

/// Attempt to pre-truncate inputs to arithmetic ops if it will simplify
/// the codegen.
/// e.g. TRUNC( BINOP( X, Y ) ) --> BINOP( TRUNC( X ), TRUNC( Y ) )
static SDValue combineTruncatedArithmetic(SDNode *N, SelectionDAG &DAG,
                                          const X86Subtarget &Subtarget,
                                          SDLoc &DL) {
  assert(N->getOpcode() == ISD::TRUNCATE && "Wrong opcode");
  SDValue Src = N->getOperand(0);
  unsigned Opcode = Src.getOpcode();
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();

  EVT VT = N->getValueType(0);
  EVT SrcVT = Src.getValueType();

  auto IsRepeatedOpOrFreeTruncation = [VT](SDValue Op0, SDValue Op1) {
    unsigned TruncSizeInBits = VT.getScalarSizeInBits();

    // Repeated operand, so we are only trading one output truncation for
    // one input truncation.
    if (Op0 == Op1)
      return true;

    // See if either operand has been extended from a smaller/equal size to
    // the truncation size, allowing a truncation to combine with the extend.
    unsigned Opcode0 = Op0.getOpcode();
    if ((Opcode0 == ISD::ANY_EXTEND || Opcode0 == ISD::SIGN_EXTEND ||
         Opcode0 == ISD::ZERO_EXTEND) &&
        Op0.getOperand(0).getScalarValueSizeInBits() <= TruncSizeInBits)
      return true;

    unsigned Opcode1 = Op1.getOpcode();
    if ((Opcode1 == ISD::ANY_EXTEND || Opcode1 == ISD::SIGN_EXTEND ||
         Opcode1 == ISD::ZERO_EXTEND) &&
        Op1.getOperand(0).getScalarValueSizeInBits() <= TruncSizeInBits)
      return true;

    // See if either operand is a single use constant which can be constant
    // folded.
    SDValue BC0 = peekThroughOneUseBitcasts(Op0);
    SDValue BC1 = peekThroughOneUseBitcasts(Op1);
    return ISD::isBuildVectorOfConstantSDNodes(BC0.getNode()) ||
           ISD::isBuildVectorOfConstantSDNodes(BC1.getNode());
  };

  auto TruncateArithmetic = [&](SDValue N0, SDValue N1) {
    SDValue Trunc0 = DAG.getNode(ISD::TRUNCATE, DL, VT, N0);
    SDValue Trunc1 = DAG.getNode(ISD::TRUNCATE, DL, VT, N1);
    return DAG.getNode(Opcode, DL, VT, Trunc0, Trunc1);
  };

  // Don't combine if the operation has other uses.
  if (!N->isOnlyUserOf(Src.getNode()))
    return SDValue();

  // Only support vector truncation for now.
  // TODO: i64 scalar math would benefit as well.
  if (!VT.isVector())
    return SDValue();

  // In most cases its only worth pre-truncating if we're only facing the cost
  // of one truncation.
  // i.e. if one of the inputs will constant fold or the input is repeated.
  switch (Opcode) {
  case ISD::AND:
  case ISD::XOR:
  case ISD::OR: {
    SDValue Op0 = Src.getOperand(0);
    SDValue Op1 = Src.getOperand(1);
    if (TLI.isOperationLegalOrPromote(Opcode, VT) &&
        IsRepeatedOpOrFreeTruncation(Op0, Op1))
      return TruncateArithmetic(Op0, Op1);
    break;
  }

  case ISD::MUL:
    // X86 is rubbish at scalar and vector i64 multiplies (until AVX512DQ) - its
    // better to truncate if we have the chance.
    if (SrcVT.getScalarType() == MVT::i64 && TLI.isOperationLegal(Opcode, VT) &&
        !TLI.isOperationLegal(Opcode, SrcVT))
      return TruncateArithmetic(Src.getOperand(0), Src.getOperand(1));
    LLVM_FALLTHROUGH;
  case ISD::ADD: {
    SDValue Op0 = Src.getOperand(0);
    SDValue Op1 = Src.getOperand(1);
    if (TLI.isOperationLegal(Opcode, VT) &&
        IsRepeatedOpOrFreeTruncation(Op0, Op1))
      return TruncateArithmetic(Op0, Op1);
    break;
  }
  }

  return SDValue();
}

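// A minimal sketch (hypothetical, assumes <cstdint>) of why pre-truncating a
// multiply is legal: the low 32 bits of an i64 product depend only on the low
// 32 bits of each operand, so TRUNC(MUL(X, Y)) == MUL(TRUNC(X), TRUNC(Y)).
static bool truncOfMulEqualsMulOfTrunc(uint64_t X, uint64_t Y) {
  uint32_t Narrow = uint32_t(X) * uint32_t(Y); // MUL(TRUNC(X), TRUNC(Y))
  uint32_t Wide = uint32_t(X * Y);             // TRUNC(MUL(X, Y))
  return Narrow == Wide;                       // always true (arithmetic mod 2^32)
}
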
/// Truncate a group of v4i32 into v16i8/v8i16 using X86ISD::PACKUS.
static SDValue
combineVectorTruncationWithPACKUS(SDNode *N, SelectionDAG &DAG,
                                  SmallVector<SDValue, 8> &Regs) {
  assert(Regs.size() > 0 && (Regs[0].getValueType() == MVT::v4i32 ||
                             Regs[0].getValueType() == MVT::v2i64));
  EVT OutVT = N->getValueType(0);
  EVT OutSVT = OutVT.getVectorElementType();
  EVT InVT = Regs[0].getValueType();
  EVT InSVT = InVT.getVectorElementType();
  SDLoc DL(N);

  // First, use mask to unset all bits that won't appear in the result.
  assert((OutSVT == MVT::i8 || OutSVT == MVT::i16) &&
         "OutSVT can only be either i8 or i16.");
  APInt Mask =
      APInt::getLowBitsSet(InSVT.getSizeInBits(), OutSVT.getSizeInBits());
  SDValue MaskVal = DAG.getConstant(Mask, DL, InVT);
  for (auto &Reg : Regs)
    Reg = DAG.getNode(ISD::AND, DL, InVT, MaskVal, Reg);

  MVT UnpackedVT, PackedVT;
  if (OutSVT == MVT::i8) {
    UnpackedVT = MVT::v8i16;
    PackedVT = MVT::v16i8;
  } else {
    UnpackedVT = MVT::v4i32;
    PackedVT = MVT::v8i16;
  }

  // In each iteration, truncate the type by a half size.
  auto RegNum = Regs.size();
  for (unsigned j = 1, e = InSVT.getSizeInBits() / OutSVT.getSizeInBits();
       j < e; j *= 2, RegNum /= 2) {
    for (unsigned i = 0; i < RegNum; i++)
      Regs[i] = DAG.getBitcast(UnpackedVT, Regs[i]);
    for (unsigned i = 0; i < RegNum / 2; i++)
      Regs[i] = DAG.getNode(X86ISD::PACKUS, DL, PackedVT, Regs[i * 2],
                            Regs[i * 2 + 1]);
  }

  // If the type of the result is v8i8, we need do one more X86ISD::PACKUS, and
  // then extract a subvector as the result since v8i8 is not a legal type.
  if (OutVT == MVT::v8i8) {
    Regs[0] = DAG.getNode(X86ISD::PACKUS, DL, PackedVT, Regs[0], Regs[0]);
    Regs[0] = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OutVT, Regs[0],
                          DAG.getIntPtrConstant(0, DL));
    return Regs[0];
  } else if (RegNum > 1) {
    Regs.resize(RegNum);
    return DAG.getNode(ISD::CONCAT_VECTORS, DL, OutVT, Regs);
  } else
    return Regs[0];
}

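// A scalar sketch (hypothetical, assumes <cstdint> and <cstddef>) of the trick
// above: X86ISD::PACKUS saturates, so each input is first masked to the bits
// that survive the truncation; after masking, saturation can never trigger and
// the pack degenerates into a plain truncation of every element.
static void packusTruncate16To8(const uint16_t *In, uint8_t *Out, size_t N) {
  for (size_t i = 0; i != N; ++i) {
    uint16_t Masked = In[i] & 0x00FF;              // the ISD::AND with MaskVal
    Out[i] = Masked > 255 ? 255 : uint8_t(Masked); // PACKUS; never saturates here
  }
}
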
/// Truncate a group of v4i32 into v8i16 using X86ISD::PACKSS.
static SDValue
combineVectorTruncationWithPACKSS(SDNode *N, const X86Subtarget &Subtarget,
                                  SelectionDAG &DAG,
                                  SmallVector<SDValue, 8> &Regs) {
  assert(Regs.size() > 0 && Regs[0].getValueType() == MVT::v4i32);
  EVT OutVT = N->getValueType(0);
  SDLoc DL(N);

  // Shift left by 16 bits, then arithmetic-shift right by 16 bits.
  SDValue ShAmt = DAG.getConstant(16, DL, MVT::i32);
  for (auto &Reg : Regs) {
    Reg = getTargetVShiftNode(X86ISD::VSHLI, DL, MVT::v4i32, Reg, ShAmt,
                              Subtarget, DAG);
    Reg = getTargetVShiftNode(X86ISD::VSRAI, DL, MVT::v4i32, Reg, ShAmt,
                              Subtarget, DAG);
  }

  for (unsigned i = 0, e = Regs.size() / 2; i < e; i++)
    Regs[i] = DAG.getNode(X86ISD::PACKSS, DL, MVT::v8i16, Regs[i * 2],
                          Regs[i * 2 + 1]);

  if (Regs.size() > 2) {
    Regs.resize(Regs.size() / 2);
    return DAG.getNode(ISD::CONCAT_VECTORS, DL, OutVT, Regs);
  } else
    return Regs[0];
}

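// A scalar sketch (hypothetical, assumes <cstdint>, two's-complement arithmetic
// shifts as on x86) of the shift pair used above: VSHLI/VSRAI by 16 replicate
// bit 15 through the upper half of each 32-bit lane, so the value is already in
// i16 range and the signed-saturating PACKSS returns exactly the low 16 bits of
// the original element.
static int16_t packssTruncate32To16(uint32_t X) {
  int32_t SignFilled = int32_t(X << 16) >> 16; // now in [-32768, 32767]
  // PACKSS would clamp to that range; no clamping happens after the shifts.
  return int16_t(SignFilled);
}
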
/// This function transforms truncation from vXi32/vXi64 to vXi8/vXi16 into
/// X86ISD::PACKUS/X86ISD::PACKSS operations. We do it here because after type
/// legalization the truncation will be translated into a BUILD_VECTOR with each
/// element that is extracted from a vector and then truncated, and it is
/// difficult to do this optimization based on them.
static SDValue combineVectorTruncation(SDNode *N, SelectionDAG &DAG,
                                       const X86Subtarget &Subtarget) {
  EVT OutVT = N->getValueType(0);
  if (!OutVT.isVector())
    return SDValue();

  SDValue In = N->getOperand(0);
  if (!In.getValueType().isSimple())
    return SDValue();

  EVT InVT = In.getValueType();
  unsigned NumElems = OutVT.getVectorNumElements();

  // TODO: On AVX2, the behavior of X86ISD::PACKUS is different from that on
  // SSE2, and we need to take care of it specially.
  // AVX512 provides vpmovdb.
  if (!Subtarget.hasSSE2() || Subtarget.hasAVX2())
    return SDValue();

  EVT OutSVT = OutVT.getVectorElementType();
  EVT InSVT = InVT.getVectorElementType();
  if (!((InSVT == MVT::i32 || InSVT == MVT::i64) &&
        (OutSVT == MVT::i8 || OutSVT == MVT::i16) && isPowerOf2_32(NumElems) &&
        NumElems >= 8))
    return SDValue();

  // SSSE3's pshufb results in less instructions in the cases below.
  if (Subtarget.hasSSSE3() && NumElems == 8 &&
      ((OutSVT == MVT::i8 && InSVT != MVT::i64) ||
       (InSVT == MVT::i32 && OutSVT == MVT::i16)))
    return SDValue();

  SDLoc DL(N);

  // Split a long vector into vectors of legal type.
  unsigned RegNum = InVT.getSizeInBits() / 128;
  SmallVector<SDValue, 8> SubVec(RegNum);
  unsigned NumSubRegElts = 128 / InSVT.getSizeInBits();
  EVT SubRegVT = EVT::getVectorVT(*DAG.getContext(), InSVT, NumSubRegElts);

  for (unsigned i = 0; i < RegNum; i++)
    SubVec[i] = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubRegVT, In,
                            DAG.getIntPtrConstant(i * NumSubRegElts, DL));

  // SSE2 provides PACKUS for only 2 x v8i16 -> v16i8 and SSE4.1 provides PACKUS
  // for 2 x v4i32 -> v8i16. For SSSE3 and below, we need to use PACKSS to
  // truncate 2 x v4i32 to v8i16.
  if (Subtarget.hasSSE41() || OutSVT == MVT::i8)
    return combineVectorTruncationWithPACKUS(N, DAG, SubVec);
  else if (InSVT == MVT::i32)
    return combineVectorTruncationWithPACKSS(N, Subtarget, DAG, SubVec);
  else
    return SDValue();
}

/// This function transforms vector truncation of 'all or none' bits values.
/// vXi16/vXi32/vXi64 to vXi8/vXi16/vXi32 into X86ISD::PACKSS operations.
static SDValue combineVectorSignBitsTruncation(SDNode *N, SDLoc &DL,
                                               SelectionDAG &DAG,
                                               const X86Subtarget &Subtarget) {
  // Requires SSE2 but AVX512 has fast truncate.
  if (!Subtarget.hasSSE2() || Subtarget.hasAVX512())
    return SDValue();

  if (!N->getValueType(0).isVector() || !N->getValueType(0).isSimple())
    return SDValue();

  SDValue In = N->getOperand(0);
  if (!In.getValueType().isSimple())
    return SDValue();

  MVT VT = N->getValueType(0).getSimpleVT();
  MVT SVT = VT.getScalarType();

  MVT InVT = In.getValueType().getSimpleVT();
  MVT InSVT = InVT.getScalarType();

  // Use PACKSS if the input is a splatted sign bit.
  // e.g. Comparison result, sext_in_reg, etc.
  unsigned NumSignBits = DAG.ComputeNumSignBits(In);
  if (NumSignBits != InSVT.getSizeInBits())
    return SDValue();

  // Check we have a truncation suited for PACKSS.
  if (!VT.is128BitVector() && !VT.is256BitVector())
    return SDValue();
  if (SVT != MVT::i8 && SVT != MVT::i16 && SVT != MVT::i32)
    return SDValue();
  if (InSVT != MVT::i16 && InSVT != MVT::i32 && InSVT != MVT::i64)
    return SDValue();

  return truncateVectorCompareWithPACKSS(VT, In, DL, DAG, Subtarget);
}

static SDValue combineTruncate(SDNode *N, SelectionDAG &DAG,
                               const X86Subtarget &Subtarget) {
  EVT VT = N->getValueType(0);
  SDValue Src = N->getOperand(0);
  SDLoc DL(N);

  // Attempt to pre-truncate inputs to arithmetic ops instead.
  if (SDValue V = combineTruncatedArithmetic(N, DAG, Subtarget, DL))
    return V;

  // Try to detect AVG pattern first.
  if (SDValue Avg = detectAVGPattern(Src, VT, DAG, Subtarget, DL))
    return Avg;

  // Try to combine truncation with unsigned saturation.
  if (SDValue Val = combineTruncateWithUSat(Src, VT, DL, DAG, Subtarget))
    return Val;

  // The bitcast source is a direct mmx result.
  // Detect bitcasts between i32 to x86mmx
  if (Src.getOpcode() == ISD::BITCAST && VT == MVT::i32) {
    SDValue BCSrc = Src.getOperand(0);
    if (BCSrc.getValueType() == MVT::x86mmx)
      return DAG.getNode(X86ISD::MMX_MOVD2W, DL, MVT::i32, BCSrc);
  }

  // Try to truncate extended sign bits with PACKSS.
  if (SDValue V = combineVectorSignBitsTruncation(N, DL, DAG, Subtarget))
    return V;

  return combineVectorTruncation(N, DAG, Subtarget);
}

/// Returns the negated value if the node \p N flips sign of FP value.
///
/// FP-negation node may have different forms: FNEG(x) or FXOR (x, 0x80000000).
/// AVX512F does not have FXOR, so FNEG is lowered as
/// (bitcast (xor (bitcast x), (bitcast ConstantFP(0x80000000)))).
/// In this case we go though all bitcasts.
static SDValue isFNEG(SDNode *N) {
  if (N->getOpcode() == ISD::FNEG)
    return N->getOperand(0);

  SDValue Op = peekThroughBitcasts(SDValue(N, 0));
  if (Op.getOpcode() != X86ISD::FXOR && Op.getOpcode() != ISD::XOR)
    return SDValue();

  SDValue Op1 = peekThroughBitcasts(Op.getOperand(1));
  if (!Op1.getValueType().isFloatingPoint())
    return SDValue();

  SDValue Op0 = peekThroughBitcasts(Op.getOperand(0));

  unsigned EltBits = Op1.getScalarValueSizeInBits();
  auto isSignMask = [&](const ConstantFP *C) {
    return C->getValueAPF().bitcastToAPInt() == APInt::getSignMask(EltBits);
  };

  // There is more than one way to represent the same constant on
  // the different X86 targets. The type of the node may also depend on size.
  //  - load scalar value and broadcast
  //  - BUILD_VECTOR node
  //  - load from a constant pool.
  // We check all variants here.
  if (Op1.getOpcode() == X86ISD::VBROADCAST) {
    if (auto *C = getTargetConstantFromNode(Op1.getOperand(0)))
      if (isSignMask(cast<ConstantFP>(C)))
        return Op0;
  } else if (BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(Op1)) {
    if (ConstantFPSDNode *CN = BV->getConstantFPSplatNode())
      if (isSignMask(CN->getConstantFPValue()))
        return Op0;
  } else if (auto *C = getTargetConstantFromNode(Op1)) {
    if (C->getType()->isVectorTy()) {
      if (auto *SplatV = C->getSplatValue())
        if (isSignMask(cast<ConstantFP>(SplatV)))
          return Op0;
    } else if (auto *FPConst = dyn_cast<ConstantFP>(C))
      if (isSignMask(FPConst))
        return Op0;
  }
  return SDValue();
}

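// A small sketch (hypothetical, assumes <cstdint> and <cstring>) of the
// identity isFNEG relies on: XOR-ing a float's bit pattern with the sign mask
// 0x80000000 (APInt::getSignMask(32)) flips only the sign bit, i.e. computes
// fneg of the value.
static float fnegViaXor(float F) {
  uint32_t Bits;
  std::memcpy(&Bits, &F, sizeof(Bits));
  Bits ^= 0x80000000u;           // the FXOR/XOR with the sign-mask constant
  std::memcpy(&F, &Bits, sizeof(F));
  return F;                      // sign flipped; payload bits untouched
}
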
/// Do target-specific dag combines on floating point negations.
static SDValue combineFneg(SDNode *N, SelectionDAG &DAG,
                           const X86Subtarget &Subtarget) {
  EVT OrigVT = N->getValueType(0);
  SDValue Arg = isFNEG(N);
  assert(Arg.getNode() && "N is expected to be an FNEG node");

  EVT VT = Arg.getValueType();
  EVT SVT = VT.getScalarType();
  SDLoc DL(N);

  // Let legalize expand this if it isn't a legal type yet.
  if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
    return SDValue();

  // If we're negating a FMUL node on a target with FMA, then we can avoid the
  // use of a constant by performing (-0 - A*B) instead.
  // FIXME: Check rounding control flags as well once it becomes available.
  if (Arg.getOpcode() == ISD::FMUL && (SVT == MVT::f32 || SVT == MVT::f64) &&
      Arg->getFlags().hasNoSignedZeros() && Subtarget.hasAnyFMA()) {
    SDValue Zero = DAG.getConstantFP(0.0, DL, VT);
    SDValue NewNode = DAG.getNode(X86ISD::FNMSUB, DL, VT, Arg.getOperand(0),
                                  Arg.getOperand(1), Zero);
    return DAG.getBitcast(OrigVT, NewNode);
  }

  // If we're negating an FMA node, then we can adjust the
  // instruction to include the extra negation.
  unsigned NewOpcode = 0;
  if (Arg.hasOneUse()) {
    switch (Arg.getOpcode()) {
    case ISD::FMA:             NewOpcode = X86ISD::FNMSUB;       break;
    case X86ISD::FMSUB:        NewOpcode = X86ISD::FNMADD;       break;
    case X86ISD::FNMADD:       NewOpcode = X86ISD::FMSUB;        break;
    case X86ISD::FNMSUB:       NewOpcode = ISD::FMA;             break;
    case X86ISD::FMADD_RND:    NewOpcode = X86ISD::FNMSUB_RND;   break;
    case X86ISD::FMSUB_RND:    NewOpcode = X86ISD::FNMADD_RND;   break;
    case X86ISD::FNMADD_RND:   NewOpcode = X86ISD::FMSUB_RND;    break;
    case X86ISD::FNMSUB_RND:   NewOpcode = X86ISD::FMADD_RND;    break;
    // We can't handle scalar intrinsic node here because it would only
    // invert one element and not the whole vector. But we could try to handle
    // a negation of the lower element only.
    }
  }
  if (NewOpcode)
    return DAG.getBitcast(OrigVT, DAG.getNode(NewOpcode, DL, VT,
                                              Arg.getNode()->ops()));

  return SDValue();
}

static SDValue lowerX86FPLogicOp(SDNode *N, SelectionDAG &DAG,
                                 const X86Subtarget &Subtarget) {
  MVT VT = N->getSimpleValueType(0);
  // If we have integer vector types available, use the integer opcodes.
  if (VT.isVector() && Subtarget.hasSSE2()) {
    SDLoc dl(N);

    MVT IntVT = MVT::getVectorVT(MVT::i64, VT.getSizeInBits() / 64);

    SDValue Op0 = DAG.getBitcast(IntVT, N->getOperand(0));
    SDValue Op1 = DAG.getBitcast(IntVT, N->getOperand(1));
    unsigned IntOpcode;
    switch (N->getOpcode()) {
    default: llvm_unreachable("Unexpected FP logic op");
    case X86ISD::FOR: IntOpcode = ISD::OR; break;
    case X86ISD::FXOR: IntOpcode = ISD::XOR; break;
    case X86ISD::FAND: IntOpcode = ISD::AND; break;
    case X86ISD::FANDN: IntOpcode = X86ISD::ANDNP; break;
    }
    SDValue IntOp = DAG.getNode(IntOpcode, dl, IntVT, Op0, Op1);
    return DAG.getBitcast(VT, IntOp);
  }
  return SDValue();
}

static SDValue combineXor(SDNode *N, SelectionDAG &DAG,
                          TargetLowering::DAGCombinerInfo &DCI,
                          const X86Subtarget &Subtarget) {
  if (SDValue Cmp = foldVectorXorShiftIntoCmp(N, DAG, Subtarget))
    return Cmp;

  if (DCI.isBeforeLegalizeOps())
    return SDValue();

  if (SDValue RV = foldXorTruncShiftIntoCmp(N, DAG))
    return RV;

  if (Subtarget.hasCMov())
    if (SDValue RV = combineIntegerAbs(N, DAG))
      return RV;

  if (SDValue FPLogic = convertIntLogicToFPLogic(N, DAG, Subtarget))
    return FPLogic;

  if (isFNEG(N))
    return combineFneg(N, DAG, Subtarget);
  return SDValue();
}

static bool isNullFPScalarOrVectorConst(SDValue V) {
  return isNullFPConstant(V) || ISD::isBuildVectorAllZeros(V.getNode());
}

/// If a value is a scalar FP zero or a vector FP zero (potentially including
/// undefined elements), return a zero constant that may be used to fold away
/// that value. In the case of a vector, the returned constant will not contain
/// undefined elements even if the input parameter does. This makes it suitable
/// to be used as a replacement operand with operations (eg, bitwise-and) where
/// an undef should not propagate.
static SDValue getNullFPConstForNullVal(SDValue V, SelectionDAG &DAG,
                                        const X86Subtarget &Subtarget) {
  if (!isNullFPScalarOrVectorConst(V))
    return SDValue();

  if (V.getValueType().isVector())
    return getZeroVector(V.getSimpleValueType(), Subtarget, DAG, SDLoc(V));

  return V;
}

static SDValue combineFAndFNotToFAndn(SDNode *N, SelectionDAG &DAG,
                                      const X86Subtarget &Subtarget) {
  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);
  EVT VT = N->getValueType(0);
  SDLoc DL(N);

  // Vector types are handled in combineANDXORWithAllOnesIntoANDNP().
  if (!((VT == MVT::f32 && Subtarget.hasSSE1()) ||
        (VT == MVT::f64 && Subtarget.hasSSE2())))
    return SDValue();

  auto isAllOnesConstantFP = [](SDValue V) {
    auto *C = dyn_cast<ConstantFPSDNode>(V);
    return C && C->getConstantFPValue()->isAllOnesValue();
  };

  // fand (fxor X, -1), Y --> fandn X, Y
  if (N0.getOpcode() == X86ISD::FXOR && isAllOnesConstantFP(N0.getOperand(1)))
    return DAG.getNode(X86ISD::FANDN, DL, VT, N0.getOperand(0), N1);

  // fand X, (fxor Y, -1) --> fandn Y, X
  if (N1.getOpcode() == X86ISD::FXOR && isAllOnesConstantFP(N1.getOperand(1)))
    return DAG.getNode(X86ISD::FANDN, DL, VT, N1.getOperand(0), N0);

  return SDValue();
}

/// Do target-specific dag combines on X86ISD::FAND nodes.
static SDValue combineFAnd(SDNode *N, SelectionDAG &DAG,
                           const X86Subtarget &Subtarget) {
  // FAND(0.0, x) -> 0.0
  if (SDValue V = getNullFPConstForNullVal(N->getOperand(0), DAG, Subtarget))
    return V;

  // FAND(x, 0.0) -> 0.0
  if (SDValue V = getNullFPConstForNullVal(N->getOperand(1), DAG, Subtarget))
    return V;

  if (SDValue V = combineFAndFNotToFAndn(N, DAG, Subtarget))
    return V;

  return lowerX86FPLogicOp(N, DAG, Subtarget);
}

/// Do target-specific dag combines on X86ISD::FANDN nodes.
static SDValue combineFAndn(SDNode *N, SelectionDAG &DAG,
                            const X86Subtarget &Subtarget) {
  // FANDN(0.0, x) -> x
  if (isNullFPScalarOrVectorConst(N->getOperand(0)))
    return N->getOperand(1);

  // FANDN(x, 0.0) -> 0.0
  if (SDValue V = getNullFPConstForNullVal(N->getOperand(1), DAG, Subtarget))
    return V;

  return lowerX86FPLogicOp(N, DAG, Subtarget);
}

/// Do target-specific dag combines on X86ISD::FOR and X86ISD::FXOR nodes.
static SDValue combineFOr(SDNode *N, SelectionDAG &DAG,
                          const X86Subtarget &Subtarget) {
  assert(N->getOpcode() == X86ISD::FOR || N->getOpcode() == X86ISD::FXOR);

  // F[X]OR(0.0, x) -> x
  if (isNullFPScalarOrVectorConst(N->getOperand(0)))
    return N->getOperand(1);

  // F[X]OR(x, 0.0) -> x
  if (isNullFPScalarOrVectorConst(N->getOperand(1)))
    return N->getOperand(0);

  if (isFNEG(N))
    if (SDValue NewVal = combineFneg(N, DAG, Subtarget))
      return NewVal;

  return lowerX86FPLogicOp(N, DAG, Subtarget);
}

/// Do target-specific dag combines on X86ISD::FMIN and X86ISD::FMAX nodes.
static SDValue combineFMinFMax(SDNode *N, SelectionDAG &DAG) {
  assert(N->getOpcode() == X86ISD::FMIN || N->getOpcode() == X86ISD::FMAX);

  // Only perform optimizations if UnsafeMath is used.
  if (!DAG.getTarget().Options.UnsafeFPMath)
    return SDValue();

  // If we run in unsafe-math mode, then convert the FMAX and FMIN nodes
  // into FMINC and FMAXC, which are Commutative operations.
  unsigned NewOp = 0;
  switch (N->getOpcode()) {
    default: llvm_unreachable("unknown opcode");
    case X86ISD::FMIN:  NewOp = X86ISD::FMINC; break;
    case X86ISD::FMAX:  NewOp = X86ISD::FMAXC; break;
  }

  return DAG.getNode(NewOp, SDLoc(N), N->getValueType(0),
                     N->getOperand(0), N->getOperand(1));
}

static SDValue combineFMinNumFMaxNum(SDNode *N, SelectionDAG &DAG,
                                     const X86Subtarget &Subtarget) {
  if (Subtarget.useSoftFloat())
    return SDValue();

  // TODO: Check for global or instruction-level "nnan". In that case, we
  //       should be able to lower to FMAX/FMIN alone.
  // TODO: If an operand is already known to be a NaN or not a NaN, this
  //       should be an optional swap and FMAX/FMIN.

  EVT VT = N->getValueType(0);
  if (!((Subtarget.hasSSE1() && (VT == MVT::f32 || VT == MVT::v4f32)) ||
        (Subtarget.hasSSE2() && (VT == MVT::f64 || VT == MVT::v2f64)) ||
        (Subtarget.hasAVX() && (VT == MVT::v8f32 || VT == MVT::v4f64))))
    return SDValue();

  // This takes at least 3 instructions, so favor a library call when operating
  // on a scalar and minimizing code size.
  if (!VT.isVector() && DAG.getMachineFunction().getFunction()->optForMinSize())
    return SDValue();

  SDValue Op0 = N->getOperand(0);
  SDValue Op1 = N->getOperand(1);
  SDLoc DL(N);
  EVT SetCCType = DAG.getTargetLoweringInfo().getSetCCResultType(
      DAG.getDataLayout(), *DAG.getContext(), VT);

  // There are 4 possibilities involving NaN inputs, and these are the required
  // outputs:
  //                   Op1
  //               Num     NaN
  //            ----------------
  //       Num  |  Max  |  Op0 |
  // Op0        ----------------
  //       NaN  |  Op1  |  NaN |
  //            ----------------
  //
  // The SSE FP max/min instructions were not designed for this case, but rather
  // to implement:
  //   Min = Op1 < Op0 ? Op1 : Op0
  //   Max = Op1 > Op0 ? Op1 : Op0
  //
  // So they always return Op0 if either input is a NaN. However, we can still
  // use those instructions for fmaxnum by selecting away a NaN input.

  // If either operand is NaN, the 2nd source operand (Op0) is passed through.
  auto MinMaxOp = N->getOpcode() == ISD::FMAXNUM ? X86ISD::FMAX : X86ISD::FMIN;
  SDValue MinOrMax = DAG.getNode(MinMaxOp, DL, VT, Op1, Op0);
  SDValue IsOp0Nan = DAG.getSetCC(DL, SetCCType, Op0, Op0, ISD::SETUO);

  // If Op0 is a NaN, select Op1. Otherwise, select the max. If both operands
  // are NaN, the NaN value of Op1 is the result.
  return DAG.getSelect(DL, VT, IsOp0Nan, Op1, MinOrMax);
}

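// A scalar sketch (hypothetical, assumes <cmath>) of the lowering above: an
// SSE-style max that returns its *second* source whenever either input is NaN,
// followed by a select on "is Op0 NaN", reproduces the fmaxnum table.
static float sseStyleMax(float Src1, float Src2) {
  // maxss/maxps semantics: a NaN in either operand selects the second source.
  if (std::isnan(Src1) || std::isnan(Src2))
    return Src2;
  return Src1 > Src2 ? Src1 : Src2;
}
static float fmaxnumViaSSEMax(float Op0, float Op1) {
  float MinOrMax = sseStyleMax(Op1, Op0);      // FMAX(Op1, Op0): NaN -> Op0
  bool IsOp0Nan = Op0 != Op0;                  // SETUO on (Op0, Op0)
  return IsOp0Nan ? Op1 : MinOrMax;            // the final DAG.getSelect
}
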
/// Do target-specific dag combines on X86ISD::ANDNP nodes.
static SDValue combineAndnp(SDNode *N, SelectionDAG &DAG,
                            TargetLowering::DAGCombinerInfo &DCI,
                            const X86Subtarget &Subtarget) {
  // ANDNP(0, x) -> x
  if (ISD::isBuildVectorAllZeros(N->getOperand(0).getNode()))
    return N->getOperand(1);

  // ANDNP(x, 0) -> 0
  if (ISD::isBuildVectorAllZeros(N->getOperand(1).getNode()))
    return getZeroVector(N->getSimpleValueType(0), Subtarget, DAG, SDLoc(N));

  EVT VT = N->getValueType(0);

  // Attempt to recursively combine a bitmask ANDNP with shuffles.
  if (VT.isVector() && (VT.getScalarSizeInBits() % 8) == 0) {
    SDValue Op(N, 0);
    if (combineX86ShufflesRecursively({Op}, 0, Op, {0}, {}, /*Depth*/ 1,
                                      /*HasVarMask*/ false, DAG, DCI,
                                      Subtarget))
      return SDValue(); // This routine will use CombineTo to replace N.
  }

  return SDValue();
}

static SDValue combineBT(SDNode *N, SelectionDAG &DAG,
                         TargetLowering::DAGCombinerInfo &DCI) {
  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);

  // BT ignores high bits in the bit index operand.
  unsigned BitWidth = N1.getValueSizeInBits();
  APInt DemandedMask = APInt::getLowBitsSet(BitWidth, Log2_32(BitWidth));
  if (SDValue DemandedN1 = DAG.GetDemandedBits(N1, DemandedMask))
    return DAG.getNode(X86ISD::BT, SDLoc(N), MVT::i32, N0, DemandedN1);

  return SDValue();
}

static SDValue combineSignExtendInReg(SDNode *N, SelectionDAG &DAG,
                                      const X86Subtarget &Subtarget) {
  EVT VT = N->getValueType(0);
  if (!VT.isVector())
    return SDValue();

  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);
  EVT ExtraVT = cast<VTSDNode>(N1)->getVT();
  SDLoc dl(N);

  // The SIGN_EXTEND_INREG to v4i64 is expensive operation on the
  // both SSE and AVX2 since there is no sign-extended shift right
  // operation on a vector with 64-bit elements.
  //(sext_in_reg (v4i64 anyext (v4i32 x )), ExtraVT) ->
  // (v4i64 sext (v4i32 sext_in_reg (v4i32 x , ExtraVT)))
  if (VT == MVT::v4i64 && (N0.getOpcode() == ISD::ANY_EXTEND ||
      N0.getOpcode() == ISD::SIGN_EXTEND)) {
    SDValue N00 = N0.getOperand(0);

    // EXTLOAD has a better solution on AVX2,
    // it may be replaced with X86ISD::VSEXT node.
    if (N00.getOpcode() == ISD::LOAD && Subtarget.hasInt256())
      if (!ISD::isNormalLoad(N00.getNode()))
        return SDValue();

    if (N00.getValueType() == MVT::v4i32 && ExtraVT.getSizeInBits() < 128) {
      SDValue Tmp = DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, MVT::v4i32,
                                N00, N1);
      return DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i64, Tmp);
    }
  }
  return SDValue();
}

/// sext(add_nsw(x, C)) --> add(sext(x), C_sext)
/// zext(add_nuw(x, C)) --> add(zext(x), C_zext)
/// Promoting a sign/zero extension ahead of a no overflow 'add' exposes
/// opportunities to combine math ops, use an LEA, or use a complex addressing
/// mode. This can eliminate extend, add, and shift instructions.
static SDValue promoteExtBeforeAdd(SDNode *Ext, SelectionDAG &DAG,
                                   const X86Subtarget &Subtarget) {
  if (Ext->getOpcode() != ISD::SIGN_EXTEND &&
      Ext->getOpcode() != ISD::ZERO_EXTEND)
    return SDValue();

  // TODO: This should be valid for other integer types.
  EVT VT = Ext->getValueType(0);
  if (VT != MVT::i64)
    return SDValue();

  SDValue Add = Ext->getOperand(0);
  if (Add.getOpcode() != ISD::ADD)
    return SDValue();

  bool Sext = Ext->getOpcode() == ISD::SIGN_EXTEND;
  bool NSW = Add->getFlags().hasNoSignedWrap();
  bool NUW = Add->getFlags().hasNoUnsignedWrap();

  // We need an 'add nsw' feeding into the 'sext' or 'add nuw' feeding
  // into the 'zext'
  if ((Sext && !NSW) || (!Sext && !NUW))
    return SDValue();

  // Having a constant operand to the 'add' ensures that we are not increasing
  // the instruction count because the constant is extended for free below.
  // A constant operand can also become the displacement field of an LEA.
  auto *AddOp1 = dyn_cast<ConstantSDNode>(Add.getOperand(1));
  if (!AddOp1)
    return SDValue();

  // Don't make the 'add' bigger if there's no hope of combining it with some
  // other 'add' or 'shl' instruction.
  // TODO: It may be profitable to generate simpler LEA instructions in place
  // of single 'add' instructions, but the cost model for selecting an LEA
  // currently has a high threshold.
  bool HasLEAPotential = false;
  for (auto *User : Ext->uses()) {
    if (User->getOpcode() == ISD::ADD || User->getOpcode() == ISD::SHL) {
      HasLEAPotential = true;
      break;
    }
  }
  if (!HasLEAPotential)
    return SDValue();

  // Everything looks good, so pull the '{s|z}ext' ahead of the 'add'.
  int64_t AddConstant = Sext ? AddOp1->getSExtValue() : AddOp1->getZExtValue();
  SDValue AddOp0 = Add.getOperand(0);
  SDValue NewExt = DAG.getNode(Ext->getOpcode(), SDLoc(Ext), VT, AddOp0);
  SDValue NewConstant = DAG.getConstant(AddConstant, SDLoc(Add), VT);

  // The wider add is guaranteed to not wrap because both operands are
  // sign-extended.
  SDNodeFlags Flags;
  Flags.setNoSignedWrap(NSW);
  Flags.setNoUnsignedWrap(NUW);
  return DAG.getNode(ISD::ADD, SDLoc(Add), VT, NewExt, NewConstant, Flags);
}

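// A small sketch (hypothetical, assumes <cstdint>) of the identity used above:
// when a 32-bit add is known not to wrap (the 'nsw' flag), sign-extending the
// sum equals summing the sign-extended operands, so the extend can be hoisted.
static bool sextOfNswAddEqualsAddOfSext(int32_t X, int32_t C) {
  int64_t WideSum = int64_t(X) + int64_t(C);   // add(sext(x), C_sext)
  bool NoSignedWrap = WideSum >= INT32_MIN && WideSum <= INT32_MAX;
  if (!NoSignedWrap)
    return true;                               // the combine would not fire
  return int64_t(int32_t(X + C)) == WideSum;   // sext(add_nsw(x, C))
}
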
/// (i8,i32 {s/z}ext ({s/u}divrem (i8 x, i8 y)) ->
/// (i8,i32 ({s/u}divrem_sext_hreg (i8 x, i8 y)
/// This exposes the {s/z}ext to the sdivrem lowering, so that it directly
/// extends from AH (which we otherwise need to do contortions to access).
static SDValue getDivRem8(SDNode *N, SelectionDAG &DAG) {
  SDValue N0 = N->getOperand(0);
  auto OpcodeN = N->getOpcode();
  auto OpcodeN0 = N0.getOpcode();
  if (!((OpcodeN == ISD::SIGN_EXTEND && OpcodeN0 == ISD::SDIVREM) ||
        (OpcodeN == ISD::ZERO_EXTEND && OpcodeN0 == ISD::UDIVREM)))
    return SDValue();

  EVT VT = N->getValueType(0);
  EVT InVT = N0.getValueType();
  if (N0.getResNo() != 1 || InVT != MVT::i8 || VT != MVT::i32)
    return SDValue();

  SDVTList NodeTys = DAG.getVTList(MVT::i8, VT);
  auto DivRemOpcode = OpcodeN0 == ISD::SDIVREM ? X86ISD::SDIVREM8_SEXT_HREG
                                               : X86ISD::UDIVREM8_ZEXT_HREG;
  SDValue R = DAG.getNode(DivRemOpcode, SDLoc(N), NodeTys, N0.getOperand(0),
                          N0.getOperand(1));
  DAG.ReplaceAllUsesOfValueWith(N0.getValue(0), R.getValue(0));
  return R.getValue(1);
}

// If we face {ANY,SIGN,ZERO}_EXTEND that is applied to a CMOV with constant
// operands and the result of CMOV is not used anywhere else - promote CMOV
// itself instead of promoting its result. This could be beneficial, because:
//     1) X86TargetLowering::EmitLoweredSelect later can do merging of two
//        (or more) pseudo-CMOVs only when they go one-after-another and
//        getting rid of result extension code after CMOV will help that.
//     2) Promotion of constant CMOV arguments is free, hence the
//        {ANY,SIGN,ZERO}_EXTEND will just be deleted.
//     3) 16-bit CMOV encoding is 4 bytes, 32-bit CMOV is 3-byte, so this
//        promotion is also good in terms of code-size.
//        (64-bit CMOV is 4-bytes, that's why we don't do 32-bit => 64-bit
//         promotion).
static SDValue combineToExtendCMOV(SDNode *Extend, SelectionDAG &DAG) {
  SDValue CMovN = Extend->getOperand(0);
  if (CMovN.getOpcode() != X86ISD::CMOV)
    return SDValue();

  EVT TargetVT = Extend->getValueType(0);
  unsigned ExtendOpcode = Extend->getOpcode();
  SDLoc DL(Extend);

  EVT VT = CMovN.getValueType();
  SDValue CMovOp0 = CMovN.getOperand(0);
  SDValue CMovOp1 = CMovN.getOperand(1);

  bool DoPromoteCMOV =
      (VT == MVT::i16 && (TargetVT == MVT::i32 || TargetVT == MVT::i64)) &&
      CMovN.hasOneUse() &&
      (isa<ConstantSDNode>(CMovOp0.getNode()) &&
       isa<ConstantSDNode>(CMovOp1.getNode()));

  if (!DoPromoteCMOV)
    return SDValue();

  CMovOp0 = DAG.getNode(ExtendOpcode, DL, TargetVT, CMovOp0);
  CMovOp1 = DAG.getNode(ExtendOpcode, DL, TargetVT, CMovOp1);

  return DAG.getNode(X86ISD::CMOV, DL, TargetVT, CMovOp0, CMovOp1,
                     CMovN.getOperand(2), CMovN.getOperand(3));
}

// Convert (vXiY *ext(vXi1 bitcast(iX))) to extend_in_reg(broadcast(iX)).
// This is more or less the reverse of combineBitcastvxi1.
static SDValue
combineToExtendBoolVectorInReg(SDNode *N, SelectionDAG &DAG,
                               TargetLowering::DAGCombinerInfo &DCI,
                               const X86Subtarget &Subtarget) {
  unsigned Opcode = N->getOpcode();
  if (Opcode != ISD::SIGN_EXTEND && Opcode != ISD::ZERO_EXTEND &&
      Opcode != ISD::ANY_EXTEND)
    return SDValue();
  if (!DCI.isBeforeLegalizeOps())
    return SDValue();
  if (!Subtarget.hasSSE2() || Subtarget.hasAVX512())
    return SDValue();

  SDValue N0 = N->getOperand(0);
  EVT VT = N->getValueType(0);
  EVT SVT = VT.getScalarType();
  EVT InSVT = N0.getValueType().getScalarType();
  unsigned EltSizeInBits = SVT.getSizeInBits();

  // Input type must be extending a bool vector (bit-casted from a scalar
  // integer) to legal integer types.
  if (!VT.isVector())
    return SDValue();
  if (SVT != MVT::i64 && SVT != MVT::i32 && SVT != MVT::i16 && SVT != MVT::i8)
    return SDValue();
  if (InSVT != MVT::i1 || N0.getOpcode() != ISD::BITCAST)
    return SDValue();

  SDValue N00 = N0.getOperand(0);
  EVT SclVT = N0.getOperand(0).getValueType();
  if (!SclVT.isScalarInteger())
    return SDValue();

  SDLoc DL(N);
  SDValue Vec;
  SmallVector<int, 32> ShuffleMask;
  unsigned NumElts = VT.getVectorNumElements();
  assert(NumElts == SclVT.getSizeInBits() && "Unexpected bool vector size");

  // Broadcast the scalar integer to the vector elements.
  if (NumElts > EltSizeInBits) {
    // If the scalar integer is greater than the vector element size, then we
    // must split it down into sub-sections for broadcasting. For example:
    //   i16 -> v16i8 (i16 -> v8i16 -> v16i8) with 2 sub-sections.
    //   i32 -> v32i8 (i32 -> v8i32 -> v32i8) with 4 sub-sections.
    assert((NumElts % EltSizeInBits) == 0 && "Unexpected integer scale");
    unsigned Scale = NumElts / EltSizeInBits;
    EVT BroadcastVT =
        EVT::getVectorVT(*DAG.getContext(), SclVT, EltSizeInBits);
    Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, BroadcastVT, N00);
    Vec = DAG.getBitcast(VT, Vec);

    for (unsigned i = 0; i != Scale; ++i)
      ShuffleMask.append(EltSizeInBits, i);
  } else {
    // For smaller scalar integers, we can simply any-extend it to the vector
    // element size (we don't care about the upper bits) and broadcast it to all
    // elements.
    SDValue Scl = DAG.getAnyExtOrTrunc(N00, DL, SVT);
    Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT, Scl);
    ShuffleMask.append(NumElts, 0);
  }
  Vec = DAG.getVectorShuffle(VT, DL, Vec, Vec, ShuffleMask);

  // Now, mask the relevant bit in each element.
  SmallVector<SDValue, 32> Bits;
  for (unsigned i = 0; i != NumElts; ++i) {
    int BitIdx = (i % EltSizeInBits);
    APInt Bit = APInt::getBitsSet(EltSizeInBits, BitIdx, BitIdx + 1);
    Bits.push_back(DAG.getConstant(Bit, DL, SVT));
  }
  SDValue BitMask = DAG.getBuildVector(VT, DL, Bits);
  Vec = DAG.getNode(ISD::AND, DL, VT, Vec, BitMask);

  // Compare against the bitmask and extend the result.
  EVT CCVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1, NumElts);
  Vec = DAG.getSetCC(DL, CCVT, Vec, BitMask, ISD::SETEQ);
  Vec = DAG.getSExtOrTrunc(Vec, DL, VT);

  // For SEXT, this is now done, otherwise shift the result down for
  // zero-extension.
  if (Opcode == ISD::SIGN_EXTEND)
    return Vec;
  return DAG.getNode(ISD::SRL, DL, VT, Vec,
                     DAG.getConstant(EltSizeInBits - 1, DL, VT));
}

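// A scalar sketch (hypothetical, assumes <cstdint> only) of the expansion
// above: broadcast the source bits to every lane, AND each lane with its own
// single bit, and compare the result back against that bit; equality means the
// bit was set, which the SETEQ + sign-extend turns into an all-ones (-1) lane.
static void expandBoolMaskToBytes(uint8_t Bits, int8_t Out[8]) {
  for (unsigned i = 0; i != 8; ++i) {
    uint8_t LaneBit = uint8_t(1u << i);        // the per-lane BitMask constant
    bool Set = (Bits & LaneBit) == LaneBit;    // the ISD::AND + SETEQ
    Out[i] = Set ? int8_t(-1) : int8_t(0);     // sign-extended i1 result
  }
}
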
/// Convert a SEXT or ZEXT of a vector to a SIGN_EXTEND_VECTOR_INREG or
34617
/// ZERO_EXTEND_VECTOR_INREG, this requires the splitting (or concatenating
34618
/// with UNDEFs) of the input to vectors of the same size as the target type
34619
/// which then extends the lowest elements.
34620
static SDValue combineToExtendVectorInReg(SDNode *N, SelectionDAG &DAG,
34621
                                          TargetLowering::DAGCombinerInfo &DCI,
34622
32.6k
                                          const X86Subtarget &Subtarget) {
34623
32.6k
  unsigned Opcode = N->getOpcode();
34624
32.6k
  if (Opcode != ISD::SIGN_EXTEND && Opcode != ISD::ZERO_EXTEND)
34625
15.2k
    return SDValue();
34626
17.4k
  if (!DCI.isBeforeLegalizeOps())
34627
4.41k
    return SDValue();
34628
13.0k
  if (!Subtarget.hasSSE2())
34629
1.04k
    return SDValue();
34630
12.0k
34631
12.0k
  SDValue N0 = N->getOperand(0);
34632
12.0k
  EVT VT = N->getValueType(0);
34633
12.0k
  EVT SVT = VT.getScalarType();
34634
12.0k
  EVT InVT = N0.getValueType();
34635
12.0k
  EVT InSVT = InVT.getScalarType();
34636
12.0k
34637
12.0k
  // Input type must be a vector and we must be extending legal integer types.
34638
12.0k
  if (!VT.isVector())
34639
9.37k
    return SDValue();
34640
2.64k
  if (SVT != MVT::i64 && SVT != MVT::i32 && SVT != MVT::i16)
34641
216
    return SDValue();
34642
2.43k
  if (InSVT != MVT::i32 && InSVT != MVT::i16 && InSVT != MVT::i8)
34643
460
    return SDValue();
34644
1.97k
34645
1.97k
  // On AVX2+ targets, if the input/output types are both legal then we will be
34646
1.97k
  // able to use SIGN_EXTEND/ZERO_EXTEND directly.
34647
1.97k
  if (Subtarget.hasInt256() && DAG.getTargetLoweringInfo().isTypeLegal(VT) &&
34648
938
      DAG.getTargetLoweringInfo().isTypeLegal(InVT))
34649
573
    return SDValue();
34650
1.39k
34651
1.39k
  SDLoc DL(N);
34652
1.39k
34653
1.68k
  auto ExtendVecSize = [&DAG](const SDLoc &DL, SDValue N, unsigned Size) {
34654
1.68k
    EVT InVT = N.getValueType();
34655
1.68k
    EVT OutVT = EVT::getVectorVT(*DAG.getContext(), InVT.getScalarType(),
34656
1.68k
                                 Size / InVT.getScalarSizeInBits());
34657
1.68k
    SmallVector<SDValue, 8> Opnds(Size / InVT.getSizeInBits(),
34658
1.68k
                                  DAG.getUNDEF(InVT));
34659
1.68k
    Opnds[0] = N;
34660
1.68k
    return DAG.getNode(ISD::CONCAT_VECTORS, DL, OutVT, Opnds);
34661
1.68k
  };
34662
1.39k
34663
1.39k
  // If target-size is less than 128-bits, extend to a type that would extend
34664
1.39k
  // to 128 bits, extend that and extract the original target vector.
34665
1.39k
  if (VT.getSizeInBits() < 128 && !(128 % VT.getSizeInBits())) {
34666
85
    unsigned Scale = 128 / VT.getSizeInBits();
34667
85
    EVT ExVT =
34668
85
        EVT::getVectorVT(*DAG.getContext(), SVT, 128 / SVT.getSizeInBits());
34669
85
    SDValue Ex = ExtendVecSize(DL, N0, Scale * InVT.getSizeInBits());
34670
85
    SDValue SExt = DAG.getNode(Opcode, DL, ExVT, Ex);
34671
85
    return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, SExt,
34672
85
                       DAG.getIntPtrConstant(0, DL));
34673
85
  }
34674
1.31k
34675
1.31k
  // If target-size is 128-bits (or 256-bits on AVX2 target), then convert to
34676
1.31k
  // ISD::*_EXTEND_VECTOR_INREG which ensures lowering to X86ISD::V*EXT.
34677
1.31k
  // Also use this if we don't have SSE41 to allow the legalizer do its job.
34678
1.31k
  if (!Subtarget.hasSSE41() || VT.is128BitVector() ||
34679
405
      (VT.is256BitVector() && Subtarget.hasInt256()) ||
34680
1.31k
      (VT.is512BitVector() && Subtarget.hasAVX512())) {
34681
1.07k
    SDValue ExOp = ExtendVecSize(DL, N0, VT.getSizeInBits());
34682
1.07k
    return Opcode == ISD::SIGN_EXTEND
34683
462
               ? DAG.getSignExtendVectorInReg(ExOp, DL, VT)
34684
612
               : DAG.getZeroExtendVectorInReg(ExOp, DL, VT);
34685
1.07k
  }
34686
238
34687
238
  auto SplitAndExtendInReg = [&](unsigned SplitSize) {
34688
200
    unsigned NumVecs = VT.getSizeInBits() / SplitSize;
34689
200
    unsigned NumSubElts = SplitSize / SVT.getSizeInBits();
34690
200
    EVT SubVT = EVT::getVectorVT(*DAG.getContext(), SVT, NumSubElts);
34691
200
    EVT InSubVT = EVT::getVectorVT(*DAG.getContext(), InSVT, NumSubElts);
34692
200
34693
200
    SmallVector<SDValue, 8> Opnds;
34694
730
    for (unsigned i = 0, Offset = 0; i != NumVecs; ++i, Offset += NumSubElts) {
34695
530
      SDValue SrcVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, InSubVT, N0,
34696
530
                                   DAG.getIntPtrConstant(Offset, DL));
34697
530
      SrcVec = ExtendVecSize(DL, SrcVec, SplitSize);
34698
530
      SrcVec = Opcode == ISD::SIGN_EXTEND
34699
240
                   ? DAG.getSignExtendVectorInReg(SrcVec, DL, SubVT)
34700
290
                   : DAG.getZeroExtendVectorInReg(SrcVec, DL, SubVT);
34701
530
      Opnds.push_back(SrcVec);
34702
530
    }
34703
200
    return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Opnds);
34704
200
  };
34705
238
34706
238
  // On pre-AVX2 targets, split into 128-bit nodes of
34707
238
  // ISD::*_EXTEND_VECTOR_INREG.
34708
238
  if (!Subtarget.hasInt256() && !(VT.getSizeInBits() % 128))
34709
169
    return SplitAndExtendInReg(128);
34710
69
34711
69
  // On pre-AVX512 targets, split into 256-bit nodes of
34712
69
  // ISD::*_EXTEND_VECTOR_INREG.
34713
69
  if (!Subtarget.hasAVX512() && !(VT.getSizeInBits() % 256))
34714
31
    return SplitAndExtendInReg(256);
34715
38
34716
38
  return SDValue();
34717
38
}
34718
34719
static SDValue combineSext(SDNode *N, SelectionDAG &DAG,
34720
                           TargetLowering::DAGCombinerInfo &DCI,
34721
3.78k
                           const X86Subtarget &Subtarget) {
34722
3.78k
  SDValue N0 = N->getOperand(0);
34723
3.78k
  EVT VT = N->getValueType(0);
34724
3.78k
  EVT InVT = N0.getValueType();
34725
3.78k
  SDLoc DL(N);
34726
3.78k
34727
3.78k
  if (SDValue DivRem8 = getDivRem8(N, DAG))
34728
1
    return DivRem8;
34729
3.77k
34730
3.77k
  if (SDValue NewCMov = combineToExtendCMOV(N, DAG))
34731
7
    return NewCMov;
34732
3.77k
34733
3.77k
  if (!DCI.isBeforeLegalizeOps()) {
34734
780
    if (InVT == MVT::i1) {
34735
0
      SDValue Zero = DAG.getConstant(0, DL, VT);
34736
0
      SDValue AllOnes = DAG.getAllOnesConstant(DL, VT);
34737
0
      return DAG.getSelect(DL, VT, N0, AllOnes, Zero);
34738
0
    }
34739
780
    return SDValue();
34740
780
  }
34741
2.99k
34742
2.99k
  if (InVT == MVT::i1 && N0.getOpcode() == ISD::XOR &&
34743
2.99k
      isAllOnesConstant(N0.getOperand(1)) && N0.hasOneUse()) {
34744
11
    // Invert and sign-extend a boolean is the same as zero-extend and subtract
34745
11
    // 1 because 0 becomes -1 and 1 becomes 0. The subtract is efficiently
34746
11
    // lowered with an LEA or a DEC. This is the same as: select Bool, 0, -1.
34747
11
    // sext (xor Bool, -1) --> sub (zext Bool), 1
34748
11
    SDValue Zext = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, N0.getOperand(0));
34749
11
    return DAG.getNode(ISD::SUB, DL, VT, Zext, DAG.getConstant(1, DL, VT));
34750
11
  }
34751
2.98k
34752
2.98k
  if (SDValue V = combineToExtendVectorInReg(N, DAG, DCI, Subtarget))
34753
601
    return V;
34754
2.38k
34755
2.38k
  if (SDValue V = combineToExtendBoolVectorInReg(N, DAG, DCI, Subtarget))
34756
48
    return V;
34757
2.33k
34758
2.33k
  if (Subtarget.hasAVX() && VT.is256BitVector())
34759
233
    if (SDValue R = WidenMaskArithmetic(N, DAG, DCI, Subtarget))
34760
3
      return R;
34761
2.32k
34762
2.32k
  if (SDValue NewAdd = promoteExtBeforeAdd(N, DAG, Subtarget))
34763
39
    return NewAdd;
34764
2.29k
34765
2.29k
  return SDValue();
34766
2.29k
}
34767
34768
static SDValue combineFMA(SDNode *N, SelectionDAG &DAG,
34769
2.27k
                          const X86Subtarget &Subtarget) {
34770
2.27k
  SDLoc dl(N);
34771
2.27k
  EVT VT = N->getValueType(0);
34772
2.27k
34773
2.27k
  // Let legalize expand this if it isn't a legal type yet.
34774
2.27k
  if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
34775
154
    return SDValue();
34776
2.11k
34777
2.11k
  EVT ScalarVT = VT.getScalarType();
34778
2.11k
  if ((ScalarVT != MVT::f32 && ScalarVT != MVT::f64) || !Subtarget.hasAnyFMA())
34779
170
    return SDValue();
34780
1.94k
34781
1.94k
  SDValue A = N->getOperand(0);
34782
1.94k
  SDValue B = N->getOperand(1);
34783
1.94k
  SDValue C = N->getOperand(2);
34784
1.94k
34785
5.80k
  auto invertIfNegative = [](SDValue &V) {
34786
5.80k
    if (SDValue NegVal = isFNEG(V.getNode())) {
34787
648
      V = NegVal;
34788
648
      return true;
34789
648
    }
34790
5.16k
    return false;
34791
5.16k
  };
34792
1.94k
34793
1.94k
  // Do not convert the passthru input of scalar intrinsics.
34794
1.94k
  // FIXME: We could allow negations of the lower element only.
34795
1.92k
  bool NegA = N->getOpcode() != X86ISD::FMADDS1_RND && invertIfNegative(A);
34796
1.94k
  bool NegB = invertIfNegative(B);
34797
1.94k
  bool NegC = N->getOpcode() != X86ISD::FMADDS3_RND && invertIfNegative(C);
34798
1.94k
34799
1.94k
  // Negative multiplication when NegA xor NegB
34800
1.94k
  bool NegMul = (NegA != NegB);
34801
1.94k
  bool HasNeg = NegA || NegB || NegC;
34802
1.94k
34803
1.94k
  unsigned NewOpcode;
34804
1.94k
  if (!NegMul)
34805
1.58k
    NewOpcode = (!NegC) ? unsigned(ISD::FMA) : unsigned(X86ISD::FMSUB);
34806
1.94k
  else
34807
360
    NewOpcode = (!NegC) ? X86ISD::FNMADD : X86ISD::FNMSUB;
34808
1.94k
34809
1.94k
  // For FMA, we risk reconstructing the node we started with.
34810
1.94k
  // In order to avoid this, we check for negation or opcode change. If
34811
1.94k
  // one of the two happened, then it is a new node and we return it.
34812
1.94k
  if (N->getOpcode() == ISD::FMA) {
34813
1.88k
    if (HasNeg || NewOpcode != N->getOpcode())
34814
542
      return DAG.getNode(NewOpcode, dl, VT, A, B, C);
34815
1.34k
    return SDValue();
34816
1.34k
  }
34817
65
34818
65
  if (N->getOpcode() == X86ISD::FMADD_RND) {
34819
30
    switch (NewOpcode) {
34820
24
    case ISD::FMA:       NewOpcode = X86ISD::FMADD_RND; break;
34821
2
    case X86ISD::FMSUB:  NewOpcode = X86ISD::FMSUB_RND; break;
34822
4
    case X86ISD::FNMADD: NewOpcode = X86ISD::FNMADD_RND; break;
34823
0
    case X86ISD::FNMSUB: NewOpcode = X86ISD::FNMSUB_RND; break;
34824
65
    }
34825
35
  } else if (N->getOpcode() == X86ISD::FMADDS1_RND) {
34826
28
    switch (NewOpcode) {
34827
28
    case ISD::FMA:       NewOpcode = X86ISD::FMADDS1_RND; break;
34828
0
    case X86ISD::FMSUB:  NewOpcode = X86ISD::FMSUBS1_RND; break;
34829
0
    case X86ISD::FNMADD: NewOpcode = X86ISD::FNMADDS1_RND; break;
34830
0
    case X86ISD::FNMSUB: NewOpcode = X86ISD::FNMSUBS1_RND; break;
34831
35
    }
34832
7
  } else if (N->getOpcode() == X86ISD::FMADDS3_RND) {
34833
7
    switch (NewOpcode) {
34834
7
    case ISD::FMA:       NewOpcode = X86ISD::FMADDS3_RND; break;
34835
0
    case X86ISD::FMSUB:  NewOpcode = X86ISD::FMSUBS3_RND; break;
34836
0
    case X86ISD::FNMADD: NewOpcode = X86ISD::FNMADDS3_RND; break;
34837
0
    case X86ISD::FNMSUB: NewOpcode = X86ISD::FNMSUBS3_RND; break;
34838
7
    }
34839
0
  } else {
34840
0
    llvm_unreachable("Unexpected opcode!");
34841
35
  }
34842
65
34843
65
  // Only return the node if the opcode was changed or one of the
34844
65
  // operands was negated. If not, we'll just recreate the same node.
34845
65
  if (HasNeg || NewOpcode != N->getOpcode())
34846
6
    return DAG.getNode(NewOpcode, dl, VT, A, B, C, N->getOperand(3));
34847
59
34848
59
  return SDValue();
34849
59
}
34850
34851
static SDValue combineZext(SDNode *N, SelectionDAG &DAG,
34852
                           TargetLowering::DAGCombinerInfo &DCI,
34853
29.8k
                           const X86Subtarget &Subtarget) {
34854
29.8k
  // (i32 zext (and (i8  x86isd::setcc_carry), 1)) ->
34855
29.8k
  //           (and (i32 x86isd::setcc_carry), 1)
34856
29.8k
  // This eliminates the zext. This transformation is necessary because
34857
29.8k
  // ISD::SETCC is always legalized to i8.
34858
29.8k
  SDLoc dl(N);
34859
29.8k
  SDValue N0 = N->getOperand(0);
34860
29.8k
  EVT VT = N->getValueType(0);
34861
29.8k
34862
29.8k
  if (N0.getOpcode() == ISD::AND &&
34863
1.82k
      N0.hasOneUse() &&
34864
29.8k
      N0.getOperand(0).hasOneUse()) {
34865
1.39k
    SDValue N00 = N0.getOperand(0);
34866
1.39k
    if (N00.getOpcode() == X86ISD::SETCC_CARRY) {
34867
113
      if (!isOneConstant(N0.getOperand(1)))
34868
0
        return SDValue();
34869
113
      return DAG.getNode(ISD::AND, dl, VT,
34870
113
                         DAG.getNode(X86ISD::SETCC_CARRY, dl, VT,
34871
113
                                     N00.getOperand(0), N00.getOperand(1)),
34872
113
                         DAG.getConstant(1, dl, VT));
34873
113
    }
34874
1.39k
  }
34875
29.7k
34876
29.7k
  if (N0.getOpcode() == ISD::TRUNCATE &&
34877
0
      N0.hasOneUse() &&
34878
29.7k
      N0.getOperand(0).hasOneUse()) {
34879
0
    SDValue N00 = N0.getOperand(0);
34880
0
    if (N00.getOpcode() == X86ISD::SETCC_CARRY) {
34881
0
      return DAG.getNode(ISD::AND, dl, VT,
34882
0
                         DAG.getNode(X86ISD::SETCC_CARRY, dl, VT,
34883
0
                                     N00.getOperand(0), N00.getOperand(1)),
34884
0
                         DAG.getConstant(1, dl, VT));
34885
0
    }
34886
29.7k
  }
34887
29.7k
34888
29.7k
  if (SDValue NewCMov = combineToExtendCMOV(N, DAG))
34889
68
    return NewCMov;
34890
29.7k
34891
29.7k
  if (SDValue V = combineToExtendVectorInReg(N, DAG, DCI, Subtarget))
34892
758
    return V;
34893
28.9k
34894
28.9k
  if (SDValue V = combineToExtendBoolVectorInReg(N, DAG, DCI, Subtarget))
34895
66
    return V;
34896
28.8k
34897
28.8k
  if (VT.is256BitVector())
34898
273
    if (SDValue R = WidenMaskArithmetic(N, DAG, DCI, Subtarget))
34899
7
      return R;
34900
28.8k
34901
28.8k
  if (SDValue DivRem8 = getDivRem8(N, DAG))
34902
4
    return DivRem8;
34903
28.8k
34904
28.8k
  if (SDValue NewAdd = promoteExtBeforeAdd(N, DAG, Subtarget))
34905
6
    return NewAdd;
34906
28.8k
34907
28.8k
  if (SDValue R = combineOrCmpEqZeroToCtlzSrl(N, DAG, DCI, Subtarget))
34908
16
    return R;
34909
28.8k
34910
28.8k
  return SDValue();
34911
28.8k
}
34912
34913
/// Try to map a 128-bit or larger integer comparison to vector instructions
34914
/// before type legalization splits it up into chunks.
34915
static SDValue combineVectorSizedSetCCEquality(SDNode *SetCC, SelectionDAG &DAG,
34916
48.3k
                                               const X86Subtarget &Subtarget) {
34917
48.3k
  ISD::CondCode CC = cast<CondCodeSDNode>(SetCC->getOperand(2))->get();
34918
48.3k
  assert((CC == ISD::SETNE || CC == ISD::SETEQ) && "Bad comparison predicate");
34919
48.3k
34920
48.3k
  // We're looking for an oversized integer equality comparison, but ignore a
34921
48.3k
  // comparison with zero because that gets special treatment in EmitTest().
34922
48.3k
  SDValue X = SetCC->getOperand(0);
34923
48.3k
  SDValue Y = SetCC->getOperand(1);
34924
48.3k
  EVT OpVT = X.getValueType();
34925
48.3k
  unsigned OpSize = OpVT.getSizeInBits();
34926
48.3k
  if (!OpVT.isScalarInteger() || OpSize < 128 || isNullConstant(Y))
34927
48.3k
    return SDValue();
34928
71
34929
71
  // TODO: Use PXOR + PTEST for SSE4.1 or later?
34930
71
  // TODO: Add support for AVX-512.
34931
71
  EVT VT = SetCC->getValueType(0);
34932
71
  SDLoc DL(SetCC);
34933
71
  if ((OpSize == 128 && Subtarget.hasSSE2()) ||
34934
71
      (OpSize == 256 && Subtarget.hasAVX2())) {
34935
68
    EVT VecVT = OpSize == 128 ? MVT::v16i8 : MVT::v32i8;
34936
68
    SDValue VecX = DAG.getBitcast(VecVT, X);
34937
68
    SDValue VecY = DAG.getBitcast(VecVT, Y);
34938
68
34939
68
    // If all bytes match (bitmask is 0x(FFFF)FFFF), that's equality.
34940
68
    // setcc i128 X, Y, eq --> setcc (pmovmskb (pcmpeqb X, Y)), 0xFFFF, eq
34941
68
    // setcc i128 X, Y, ne --> setcc (pmovmskb (pcmpeqb X, Y)), 0xFFFF, ne
34942
68
    // setcc i256 X, Y, eq --> setcc (vpmovmskb (vpcmpeqb X, Y)), 0xFFFFFFFF, eq
34943
68
    // setcc i256 X, Y, ne --> setcc (vpmovmskb (vpcmpeqb X, Y)), 0xFFFFFFFF, ne
34944
68
    SDValue Cmp = DAG.getNode(X86ISD::PCMPEQ, DL, VecVT, VecX, VecY);
34945
68
    SDValue MovMsk = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Cmp);
34946
68
    SDValue FFFFs = DAG.getConstant(OpSize == 128 ? 0xFFFF : 0xFFFFFFFF, DL,
34947
68
                                    MVT::i32);
34948
68
    return DAG.getSetCC(DL, VT, MovMsk, FFFFs, CC);
34949
68
  }
34950
3
34951
3
  return SDValue();
34952
3
}
34953
34954
static SDValue combineSetCC(SDNode *N, SelectionDAG &DAG,
34955
86.0k
                            const X86Subtarget &Subtarget) {
34956
86.0k
  ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
34957
86.0k
  SDValue LHS = N->getOperand(0);
34958
86.0k
  SDValue RHS = N->getOperand(1);
34959
86.0k
  EVT VT = N->getValueType(0);
34960
86.0k
  SDLoc DL(N);
34961
86.0k
34962
86.0k
  if (CC == ISD::SETNE || CC == ISD::SETEQ) {
34963
48.4k
    EVT OpVT = LHS.getValueType();
34964
48.4k
    // 0-x == y --> x+y == 0
34965
48.4k
    // 0-x != y --> x+y != 0
34966
48.4k
    if (LHS.getOpcode() == ISD::SUB && isNullConstant(LHS.getOperand(0)) &&
34967
48.4k
        LHS.hasOneUse()) {
34968
1
      SDValue Add = DAG.getNode(ISD::ADD, DL, OpVT, RHS, LHS.getOperand(1));
34969
1
      return DAG.getSetCC(DL, VT, Add, DAG.getConstant(0, DL, OpVT), CC);
34970
1
    }
34971
48.4k
    // x == 0-y --> x+y == 0
34972
48.4k
    // x != 0-y --> x+y != 0
34973
48.4k
    if (RHS.getOpcode() == ISD::SUB && isNullConstant(RHS.getOperand(0)) &&
34974
48.4k
        RHS.hasOneUse()) {
34975
8
      SDValue Add = DAG.getNode(ISD::ADD, DL, OpVT, LHS, RHS.getOperand(1));
34976
8
      return DAG.getSetCC(DL, VT, Add, DAG.getConstant(0, DL, OpVT), CC);
34977
8
    }
34978
48.3k
34979
48.3k
    if (SDValue V = combineVectorSizedSetCCEquality(N, DAG, Subtarget))
34980
68
      return V;
34981
85.9k
  }
34982
85.9k
34983
85.9k
  if (VT.getScalarType() == MVT::i1 &&
34984
85.9k
      (CC == ISD::SETNE || CC == ISD::SETEQ || ISD::isSignedIntSetCC(CC))) {
34985
30.3k
    bool IsSEXT0 =
34986
30.3k
        (LHS.getOpcode() == ISD::SIGN_EXTEND) &&
34987
136
        (LHS.getOperand(0).getValueType().getScalarType() == MVT::i1);
34988
30.3k
    bool IsVZero1 = ISD::isBuildVectorAllZeros(RHS.getNode());
34989
30.3k
34990
30.3k
    if (!IsSEXT0 || !IsVZero1) {
34991
30.3k
      // Swap the operands and update the condition code.
34992
30.3k
      std::swap(LHS, RHS);
34993
30.3k
      CC = ISD::getSetCCSwappedOperands(CC);
34994
30.3k
34995
30.3k
      IsSEXT0 = (LHS.getOpcode() == ISD::SIGN_EXTEND) &&
34996
116
                (LHS.getOperand(0).getValueType().getScalarType() == MVT::i1);
34997
30.3k
      IsVZero1 = ISD::isBuildVectorAllZeros(RHS.getNode());
34998
30.3k
    }
34999
30.3k
35000
30.3k
    if (IsSEXT0 && IsVZero1) {
35001
14
      assert(VT == LHS.getOperand(0).getValueType() &&
35002
14
             "Uexpected operand type");
35003
14
      if (CC == ISD::SETGT)
35004
1
        return DAG.getConstant(0, DL, VT);
35005
13
      if (CC == ISD::SETLE)
35006
2
        return DAG.getConstant(1, DL, VT);
35007
11
      if (CC == ISD::SETEQ || CC == ISD::SETGE)
35008
4
        return DAG.getNOT(DL, LHS.getOperand(0), VT);
35009
7
35010
0
      assert((CC == ISD::SETNE || CC == ISD::SETLT) &&
35011
7
             "Unexpected condition code!");
35012
7
      return LHS.getOperand(0);
35013
7
    }
35014
30.3k
  }
35015
85.9k
35016
85.9k
  // For an SSE1-only target, lower a comparison of v4f32 to X86ISD::CMPP early
35017
85.9k
  // to avoid scalarization via legalization because v4i32 is not a legal type.
35018
85.9k
  if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() && VT == MVT::v4i32 &&
35019
30
      LHS.getValueType() == MVT::v4f32)
35020
28
    return LowerVSETCC(SDValue(N, 0), Subtarget, DAG);
35021
85.9k
35022
85.9k
  return SDValue();
35023
85.9k
}
35024
35025
924
static SDValue combineGatherScatter(SDNode *N, SelectionDAG &DAG) {
35026
924
  SDLoc DL(N);
35027
924
  // Gather and Scatter instructions use k-registers for masks. The type of
35028
924
  // the masks is v*i1. So the mask will be truncated anyway.
35029
924
  // The SIGN_EXTEND_INREG may be dropped.
35030
924
  SDValue Mask = N->getOperand(2);
35031
924
  if (Mask.getOpcode() == ISD::SIGN_EXTEND_INREG) {
35032
21
    SmallVector<SDValue, 5> NewOps(N->op_begin(), N->op_end());
35033
21
    NewOps[2] = Mask.getOperand(0);
35034
21
    DAG.UpdateNodeOperands(N, NewOps);
35035
21
  }
35036
924
  return SDValue();
35037
924
}
35038
35039
// Optimize  RES = X86ISD::SETCC CONDCODE, EFLAG_INPUT
35040
static SDValue combineX86SetCC(SDNode *N, SelectionDAG &DAG,
35041
11.2k
                               const X86Subtarget &Subtarget) {
35042
11.2k
  SDLoc DL(N);
35043
11.2k
  X86::CondCode CC = X86::CondCode(N->getConstantOperandVal(0));
35044
11.2k
  SDValue EFLAGS = N->getOperand(1);
35045
11.2k
35046
11.2k
  // Try to simplify the EFLAGS and condition code operands.
35047
11.2k
  if (SDValue Flags = combineSetCCEFLAGS(EFLAGS, CC, DAG))
35048
9
    return getSETCC(CC, Flags, DL, DAG);
35049
11.2k
35050
11.2k
  return SDValue();
35051
11.2k
}
35052
35053
/// Optimize branch condition evaluation.
35054
static SDValue combineBrCond(SDNode *N, SelectionDAG &DAG,
35055
36.5k
                             const X86Subtarget &Subtarget) {
35056
36.5k
  SDLoc DL(N);
35057
36.5k
  SDValue EFLAGS = N->getOperand(3);
35058
36.5k
  X86::CondCode CC = X86::CondCode(N->getConstantOperandVal(2));
35059
36.5k
35060
36.5k
  // Try to simplify the EFLAGS and condition code operands.
35061
36.5k
  // Make sure to not keep references to operands, as combineSetCCEFLAGS can
35062
36.5k
  // RAUW them under us.
35063
36.5k
  if (SDValue Flags = combineSetCCEFLAGS(EFLAGS, CC, DAG)) {
35064
2.24k
    SDValue Cond = DAG.getConstant(CC, DL, MVT::i8);
35065
2.24k
    return DAG.getNode(X86ISD::BRCOND, DL, N->getVTList(), N->getOperand(0),
35066
2.24k
                       N->getOperand(1), Cond, Flags);
35067
2.24k
  }
35068
34.3k
35069
34.3k
  return SDValue();
35070
34.3k
}
35071
35072
static SDValue combineVectorCompareAndMaskUnaryOp(SDNode *N,
35073
6.66k
                                                  SelectionDAG &DAG) {
35074
6.66k
  // Take advantage of vector comparisons producing 0 or -1 in each lane to
35075
6.66k
  // optimize away operation when it's from a constant.
35076
6.66k
  //
35077
6.66k
  // The general transformation is:
35078
6.66k
  //    UNARYOP(AND(VECTOR_CMP(x,y), constant)) -->
35079
6.66k
  //       AND(VECTOR_CMP(x,y), constant2)
35080
6.66k
  //    constant2 = UNARYOP(constant)
35081
6.66k
35082
6.66k
  // Early exit if this isn't a vector operation, the operand of the
35083
6.66k
  // unary operation isn't a bitwise AND, or if the sizes of the operations
35084
6.66k
  // aren't the same.
35085
6.66k
  EVT VT = N->getValueType(0);
35086
6.66k
  if (!VT.isVector() || N->getOperand(0)->getOpcode() != ISD::AND ||
35087
52
      N->getOperand(0)->getOperand(0)->getOpcode() != ISD::SETCC ||
35088
14
      VT.getSizeInBits() != N->getOperand(0)->getValueType(0).getSizeInBits())
35089
6.65k
    return SDValue();
35090
10
35091
10
  // Now check that the other operand of the AND is a constant. We could
35092
10
  // make the transformation for non-constant splats as well, but it's unclear
35093
10
  // that would be a benefit as it would not eliminate any operations, just
35094
10
  // perform one more step in scalar code before moving to the vector unit.
35095
10
  if (BuildVectorSDNode *BV =
35096
10
          dyn_cast<BuildVectorSDNode>(N->getOperand(0)->getOperand(1))) {
35097
10
    // Bail out if the vector isn't a constant.
35098
10
    if (!BV->isConstant())
35099
0
      return SDValue();
35100
10
35101
10
    // Everything checks out. Build up the new and improved node.
35102
10
    SDLoc DL(N);
35103
10
    EVT IntVT = BV->getValueType(0);
35104
10
    // Create a new constant of the appropriate type for the transformed
35105
10
    // DAG.
35106
10
    SDValue SourceConst = DAG.getNode(N->getOpcode(), DL, VT, SDValue(BV, 0));
35107
10
    // The AND node needs bitcasts to/from an integer vector type around it.
35108
10
    SDValue MaskConst = DAG.getBitcast(IntVT, SourceConst);
35109
10
    SDValue NewAnd = DAG.getNode(ISD::AND, DL, IntVT,
35110
10
                                 N->getOperand(0)->getOperand(0), MaskConst);
35111
10
    SDValue Res = DAG.getBitcast(VT, NewAnd);
35112
10
    return Res;
35113
10
  }
35114
0
35115
0
  return SDValue();
35116
0
}
35117
35118
static SDValue combineUIntToFP(SDNode *N, SelectionDAG &DAG,
35119
2.21k
                               const X86Subtarget &Subtarget) {
35120
2.21k
  SDValue Op0 = N->getOperand(0);
35121
2.21k
  EVT VT = N->getValueType(0);
35122
2.21k
  EVT InVT = Op0.getValueType();
35123
2.21k
  EVT InSVT = InVT.getScalarType();
35124
2.21k
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
35125
2.21k
35126
2.21k
  // UINT_TO_FP(vXi8) -> SINT_TO_FP(ZEXT(vXi8 to vXi32))
35127
2.21k
  // UINT_TO_FP(vXi16) -> SINT_TO_FP(ZEXT(vXi16 to vXi32))
35128
2.21k
  if (InVT.isVector() && (InSVT == MVT::i8 || InSVT == MVT::i16)) {
35129
298
    SDLoc dl(N);
35130
298
    EVT DstVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
35131
298
                                 InVT.getVectorNumElements());
35132
298
    SDValue P = DAG.getNode(ISD::ZERO_EXTEND, dl, DstVT, Op0);
35133
298
35134
298
    if (TLI.isOperationLegal(ISD::UINT_TO_FP, DstVT))
35135
129
      return DAG.getNode(ISD::UINT_TO_FP, dl, VT, P);
35136
169
35137
169
    return DAG.getNode(ISD::SINT_TO_FP, dl, VT, P);
35138
169
  }
35139
1.91k
35140
1.91k
  // Since UINT_TO_FP is legal (it's marked custom), dag combiner won't
35141
1.91k
  // optimize it to a SINT_TO_FP when the sign bit is known zero. Perform
35142
1.91k
  // the optimization here.
35143
1.91k
  if (DAG.SignBitIsZero(Op0))
35144
186
    return DAG.getNode(ISD::SINT_TO_FP, SDLoc(N), VT, Op0);
35145
1.72k
35146
1.72k
  return SDValue();
35147
1.72k
}
35148
35149
static SDValue combineSIntToFP(SDNode *N, SelectionDAG &DAG,
35150
6.66k
                               const X86Subtarget &Subtarget) {
35151
6.66k
  // First try to optimize away the conversion entirely when it's
35152
6.66k
  // conditionally from a constant. Vectors only.
35153
6.66k
  if (SDValue Res = combineVectorCompareAndMaskUnaryOp(N, DAG))
35154
10
    return Res;
35155
6.65k
35156
6.65k
  // Now move on to more general possibilities.
35157
6.65k
  SDValue Op0 = N->getOperand(0);
35158
6.65k
  EVT VT = N->getValueType(0);
35159
6.65k
  EVT InVT = Op0.getValueType();
35160
6.65k
  EVT InSVT = InVT.getScalarType();
35161
6.65k
35162
6.65k
  // SINT_TO_FP(vXi1) -> SINT_TO_FP(SEXT(vXi1 to vXi32))
35163
6.65k
  // SINT_TO_FP(vXi8) -> SINT_TO_FP(SEXT(vXi8 to vXi32))
35164
6.65k
  // SINT_TO_FP(vXi16) -> SINT_TO_FP(SEXT(vXi16 to vXi32))
35165
6.65k
  if (InVT.isVector() &&
35166
3.67k
      (InSVT == MVT::i8 || InSVT == MVT::i16 ||
35167
6.65k
       (InSVT == MVT::i1 && !DAG.getTargetLoweringInfo().isTypeLegal(InVT)))) {
35168
345
    SDLoc dl(N);
35169
345
    EVT DstVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
35170
345
                                 InVT.getVectorNumElements());
35171
345
    SDValue P = DAG.getNode(ISD::SIGN_EXTEND, dl, DstVT, Op0);
35172
345
    return DAG.getNode(ISD::SINT_TO_FP, dl, VT, P);
35173
345
  }
35174
6.30k
35175
6.30k
  // Without AVX512DQ we only support i64 to float scalar conversion. For both
35176
6.30k
  // vectors and scalars, see if we know that the upper bits are all the sign
35177
6.30k
  // bit, in which case we can truncate the input to i32 and convert from that.
35178
6.30k
  if (InVT.getScalarSizeInBits() > 32 && !Subtarget.hasDQI()) {
35179
1.73k
    unsigned BitWidth = InVT.getScalarSizeInBits();
35180
1.73k
    unsigned NumSignBits = DAG.ComputeNumSignBits(Op0);
35181
1.73k
    if (NumSignBits >= (BitWidth - 31)) {
35182
30
      EVT TruncVT = EVT::getIntegerVT(*DAG.getContext(), 32);
35183
30
      if (InVT.isVector())
35184
24
        TruncVT = EVT::getVectorVT(*DAG.getContext(), TruncVT,
35185
24
                                   InVT.getVectorNumElements());
35186
30
      SDLoc dl(N);
35187
30
      SDValue Trunc = DAG.getNode(ISD::TRUNCATE, dl, TruncVT, Op0);
35188
30
      return DAG.getNode(ISD::SINT_TO_FP, dl, VT, Trunc);
35189
30
    }
35190
6.27k
  }
35191
6.27k
35192
6.27k
  // Transform (SINT_TO_FP (i64 ...)) into an x87 operation if we have
35193
6.27k
  // a 32-bit target where SSE doesn't support i64->FP operations.
35194
6.27k
  if (!Subtarget.useSoftFloat() && Op0.getOpcode() == ISD::LOAD) {
35195
1.23k
    LoadSDNode *Ld = cast<LoadSDNode>(Op0.getNode());
35196
1.23k
    EVT LdVT = Ld->getValueType(0);
35197
1.23k
35198
1.23k
    // This transformation is not supported if the result type is f16 or f128.
35199
1.23k
    if (VT == MVT::f16 || VT == MVT::f128)
35200
11
      return SDValue();
35201
1.22k
35202
1.22k
    if (!Ld->isVolatile() && !VT.isVector() &&
35203
1.22k
        ISD::isNON_EXTLoad(Op0.getNode()) && Op0.hasOneUse() &&
35204
1.22k
        !Subtarget.is64Bit() && LdVT == MVT::i64) {
35205
19
      SDValue FILDChain = Subtarget.getTargetLowering()->BuildFILD(
35206
19
          SDValue(N, 0), LdVT, Ld->getChain(), Op0, DAG);
35207
19
      DAG.ReplaceAllUsesOfValueWith(Op0.getValue(1), FILDChain.getValue(1));
35208
19
      return FILDChain;
35209
19
    }
35210
6.24k
  }
35211
6.24k
  return SDValue();
35212
6.24k
}
35213
35214
2.52k
static SDValue combineSBB(SDNode *N, SelectionDAG &DAG) {
35215
2.52k
  if (SDValue Flags = combineCarryThroughADD(N->getOperand(2))) {
35216
926
    MVT VT = N->getSimpleValueType(0);
35217
926
    SDVTList VTs = DAG.getVTList(VT, MVT::i32);
35218
926
    return DAG.getNode(X86ISD::SBB, SDLoc(N), VTs,
35219
926
                       N->getOperand(0), N->getOperand(1),
35220
926
                       Flags);
35221
926
  }
35222
1.60k
35223
1.60k
  return SDValue();
35224
1.60k
}
35225
35226
// Optimize RES, EFLAGS = X86ISD::ADC LHS, RHS, EFLAGS
35227
static SDValue combineADC(SDNode *N, SelectionDAG &DAG,
35228
6.64k
                          X86TargetLowering::DAGCombinerInfo &DCI) {
35229
6.64k
  // If the LHS and RHS of the ADC node are zero, then it can't overflow and
35230
6.64k
  // the result is either zero or one (depending on the input carry bit).
35231
6.64k
  // Strength reduce this down to a "set on carry" aka SETCC_CARRY&1.
35232
6.64k
  if (X86::isZeroNode(N->getOperand(0)) &&
35233
9
      X86::isZeroNode(N->getOperand(1)) &&
35234
6.64k
      // We don't have a good way to replace an EFLAGS use, so only do this when
35235
6.64k
      // dead right now.
35236
6.64k
      SDValue(N, 1).use_empty()) {
35237
3
    SDLoc DL(N);
35238
3
    EVT VT = N->getValueType(0);
35239
3
    SDValue CarryOut = DAG.getConstant(0, DL, N->getValueType(1));
35240
3
    SDValue Res1 = DAG.getNode(ISD::AND, DL, VT,
35241
3
                               DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
35242
3
                                           DAG.getConstant(X86::COND_B, DL,
35243
3
                                                           MVT::i8),
35244
3
                                           N->getOperand(2)),
35245
3
                               DAG.getConstant(1, DL, VT));
35246
3
    return DCI.CombineTo(N, Res1, CarryOut);
35247
3
  }
35248
6.63k
35249
6.63k
  if (SDValue Flags = combineCarryThroughADD(N->getOperand(2))) {
35250
2.16k
    MVT VT = N->getSimpleValueType(0);
35251
2.16k
    SDVTList VTs = DAG.getVTList(VT, MVT::i32);
35252
2.16k
    return DAG.getNode(X86ISD::ADC, SDLoc(N), VTs,
35253
2.16k
                       N->getOperand(0), N->getOperand(1),
35254
2.16k
                       Flags);
35255
2.16k
  }
35256
4.47k
35257
4.47k
  return SDValue();
35258
4.47k
}
35259
35260
/// Materialize "setb reg" as "sbb reg,reg", since it produces an all-ones bit
35261
/// which is more useful than 0/1 in some cases.
35262
116
static SDValue materializeSBB(SDNode *N, SDValue EFLAGS, SelectionDAG &DAG) {
35263
116
  SDLoc DL(N);
35264
116
  // "Condition code B" is also known as "the carry flag" (CF).
35265
116
  SDValue CF = DAG.getConstant(X86::COND_B, DL, MVT::i8);
35266
116
  SDValue SBB = DAG.getNode(X86ISD::SETCC_CARRY, DL, MVT::i8, CF, EFLAGS);
35267
116
  MVT VT = N->getSimpleValueType(0);
35268
116
  if (VT == MVT::i8)
35269
116
    return DAG.getNode(ISD::AND, DL, VT, SBB, DAG.getConstant(1, DL, VT));
35270
0
35271
116
  assert(VT == MVT::i1 && "Unexpected type for SETCC node");
35272
0
  return DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, SBB);
35273
0
}
35274
35275
/// If this is an add or subtract where one operand is produced by a cmp+setcc,
35276
/// then try to convert it to an ADC or SBB. This replaces TEST+SET+{ADD/SUB}
35277
/// with CMP+{ADC, SBB}.
35278
280k
static SDValue combineAddOrSubToADCOrSBB(SDNode *N, SelectionDAG &DAG) {
35279
280k
  bool IsSub = N->getOpcode() == ISD::SUB;
35280
280k
  SDValue X = N->getOperand(0);
35281
280k
  SDValue Y = N->getOperand(1);
35282
280k
35283
280k
  // If this is an add, canonicalize a zext operand to the RHS.
35284
280k
  // TODO: Incomplete? What if both sides are zexts?
35285
280k
  if (!IsSub && X.getOpcode() == ISD::ZERO_EXTEND &&
35286
839
      Y.getOpcode() != ISD::ZERO_EXTEND)
35287
597
    std::swap(X, Y);
35288
280k
35289
280k
  // Look through a one-use zext.
35290
280k
  bool PeekedThroughZext = false;
35291
280k
  if (Y.getOpcode() == ISD::ZERO_EXTEND && Y.hasOneUse()) {
35292
2.08k
    Y = Y.getOperand(0);
35293
2.08k
    PeekedThroughZext = true;
35294
2.08k
  }
35295
280k
35296
280k
  // If this is an add, canonicalize a setcc operand to the RHS.
35297
280k
  // TODO: Incomplete? What if both sides are setcc?
35298
280k
  // TODO: Should we allow peeking through a zext of the other operand?
35299
280k
  if (!IsSub && !PeekedThroughZext && X.getOpcode() == X86ISD::SETCC &&
35300
6
      Y.getOpcode() != X86ISD::SETCC)
35301
5
    std::swap(X, Y);
35302
280k
35303
280k
  if (Y.getOpcode() != X86ISD::SETCC || !Y.hasOneUse())
35304
280k
    return SDValue();
35305
325
35306
325
  SDLoc DL(N);
35307
325
  EVT VT = N->getValueType(0);
35308
325
  X86::CondCode CC = (X86::CondCode)Y.getConstantOperandVal(0);
35309
325
35310
325
  // If X is -1 or 0, then we have an opportunity to avoid constants required in
35311
325
  // the general case below.
35312
325
  auto *ConstantX = dyn_cast<ConstantSDNode>(X);
35313
325
  if (ConstantX) {
35314
144
    if ((!IsSub && CC == X86::COND_AE && ConstantX->isAllOnesValue()) ||
35315
144
        (IsSub && CC == X86::COND_B && ConstantX->isNullValue())) {
35316
9
      // This is a complicated way to get -1 or 0 from the carry flag:
35317
9
      // -1 + SETAE --> -1 + (!CF) --> CF ? -1 : 0 --> SBB %eax, %eax
35318
9
      //  0 - SETB  -->  0 -  (CF) --> CF ? -1 : 0 --> SBB %eax, %eax
35319
9
      return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
35320
9
                         DAG.getConstant(X86::COND_B, DL, MVT::i8),
35321
9
                         Y.getOperand(1));
35322
9
    }
35323
135
35324
135
    if ((!IsSub && CC == X86::COND_BE && ConstantX->isAllOnesValue()) ||
35325
135
        (IsSub && CC == X86::COND_A && ConstantX->isNullValue())) {
35326
3
      SDValue EFLAGS = Y->getOperand(1);
35327
3
      if (EFLAGS.getOpcode() == X86ISD::SUB && EFLAGS.hasOneUse() &&
35328
3
          EFLAGS.getValueType().isInteger() &&
35329
3
          !isa<ConstantSDNode>(EFLAGS.getOperand(1))) {
35330
3
        // Swap the operands of a SUB, and we have the same pattern as above.
35331
3
        // -1 + SETBE (SUB A, B) --> -1 + SETAE (SUB B, A) --> SUB + SBB
35332
3
        //  0 - SETA  (SUB A, B) -->  0 - SETB  (SUB B, A) --> SUB + SBB
35333
3
        SDValue NewSub = DAG.getNode(
35334
3
            X86ISD::SUB, SDLoc(EFLAGS), EFLAGS.getNode()->getVTList(),
35335
3
            EFLAGS.getOperand(1), EFLAGS.getOperand(0));
35336
3
        SDValue NewEFLAGS = SDValue(NewSub.getNode(), EFLAGS.getResNo());
35337
3
        return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
35338
3
                           DAG.getConstant(X86::COND_B, DL, MVT::i8),
35339
3
                           NewEFLAGS);
35340
3
      }
35341
313
    }
35342
144
  }
35343
313
35344
313
  if (CC == X86::COND_B) {
35345
62
    // X + SETB Z --> X + (mask SBB Z, Z)
35346
62
    // X - SETB Z --> X - (mask SBB Z, Z)
35347
62
    // TODO: Produce ADC/SBB here directly and avoid SETCC_CARRY?
35348
62
    SDValue SBB = materializeSBB(Y.getNode(), Y.getOperand(1), DAG);
35349
62
    if (SBB.getValueSizeInBits() != VT.getSizeInBits())
35350
59
      SBB = DAG.getZExtOrTrunc(SBB, DL, VT);
35351
62
    return DAG.getNode(IsSub ? ISD::SUB : ISD::ADD, DL, VT, X, SBB);
35352
62
  }
35353
251
35354
251
  if (CC == X86::COND_A) {
35355
97
    SDValue EFLAGS = Y->getOperand(1);
35356
97
    // Try to convert COND_A into COND_B in an attempt to facilitate
35357
97
    // materializing "setb reg".
35358
97
    //
35359
97
    // Do not flip "e > c", where "c" is a constant, because Cmp instruction
35360
97
    // cannot take an immediate as its first operand.
35361
97
    //
35362
97
    if (EFLAGS.getOpcode() == X86ISD::SUB && EFLAGS.hasOneUse() &&
35363
54
        EFLAGS.getValueType().isInteger() &&
35364
97
        !isa<ConstantSDNode>(EFLAGS.getOperand(1))) {
35365
54
      SDValue NewSub = DAG.getNode(X86ISD::SUB, SDLoc(EFLAGS),
35366
54
                                   EFLAGS.getNode()->getVTList(),
35367
54
                                   EFLAGS.getOperand(1), EFLAGS.getOperand(0));
35368
54
      SDValue NewEFLAGS = SDValue(NewSub.getNode(), EFLAGS.getResNo());
35369
54
      SDValue SBB = materializeSBB(Y.getNode(), NewEFLAGS, DAG);
35370
54
      if (SBB.getValueSizeInBits() != VT.getSizeInBits())
35371
54
        SBB = DAG.getZExtOrTrunc(SBB, DL, VT);
35372
54
      return DAG.getNode(IsSub ? ISD::SUB : ISD::ADD, DL, VT, X, SBB);
35373
54
    }
35374
197
  }
35375
197
35376
197
  if (CC != X86::COND_E && CC != X86::COND_NE)
35377
55
    return SDValue();
35378
142
35379
142
  SDValue Cmp = Y.getOperand(1);
35380
142
  if (Cmp.getOpcode() != X86ISD::CMP || !Cmp.hasOneUse() ||
35381
72
      !X86::isZeroNode(Cmp.getOperand(1)) ||
35382
72
      !Cmp.getOperand(0).getValueType().isInteger())
35383
70
    return SDValue();
35384
72
35385
72
  SDValue Z = Cmp.getOperand(0);
35386
72
  EVT ZVT = Z.getValueType();
35387
72
35388
72
  // If X is -1 or 0, then we have an opportunity to avoid constants required in
35389
72
  // the general case below.
35390
72
  if (ConstantX) {
35391
39
    // 'neg' sets the carry flag when Z != 0, so create 0 or -1 using 'sbb' with
35392
39
    // fake operands:
35393
39
    //  0 - (Z != 0) --> sbb %eax, %eax, (neg Z)
35394
39
    // -1 + (Z == 0) --> sbb %eax, %eax, (neg Z)
35395
39
    if ((IsSub && CC == X86::COND_NE && ConstantX->isNullValue()) ||
35396
39
        (!IsSub && CC == X86::COND_E && ConstantX->isAllOnesValue())) {
35397
19
      SDValue Zero = DAG.getConstant(0, DL, ZVT);
35398
19
      SDVTList X86SubVTs = DAG.getVTList(ZVT, MVT::i32);
35399
19
      SDValue Neg = DAG.getNode(X86ISD::SUB, DL, X86SubVTs, Zero, Z);
35400
19
      return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
35401
19
                         DAG.getConstant(X86::COND_B, DL, MVT::i8),
35402
19
                         SDValue(Neg.getNode(), 1));
35403
19
    }
35404
20
35405
20
    // cmp with 1 sets the carry flag when Z == 0, so create 0 or -1 using 'sbb'
35406
20
    // with fake operands:
35407
20
    //  0 - (Z == 0) --> sbb %eax, %eax, (cmp Z, 1)
35408
20
    // -1 + (Z != 0) --> sbb %eax, %eax, (cmp Z, 1)
35409
20
    if ((IsSub && CC == X86::COND_E && ConstantX->isNullValue()) ||
35410
20
        (!IsSub && CC == X86::COND_NE && ConstantX->isAllOnesValue())) {
35411
14
      SDValue One = DAG.getConstant(1, DL, ZVT);
35412
14
      SDValue Cmp1 = DAG.getNode(X86ISD::CMP, DL, MVT::i32, Z, One);
35413
14
      return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
35414
14
                         DAG.getConstant(X86::COND_B, DL, MVT::i8), Cmp1);
35415
14
    }
35416
39
  }
35417
39
35418
39
  // (cmp Z, 1) sets the carry flag if Z is 0.
35419
39
  SDValue One = DAG.getConstant(1, DL, ZVT);
35420
39
  SDValue Cmp1 = DAG.getNode(X86ISD::CMP, DL, MVT::i32, Z, One);
35421
39
35422
39
  // Add the flags type for ADC/SBB nodes.
35423
39
  SDVTList VTs = DAG.getVTList(VT, MVT::i32);
35424
39
35425
39
  // X - (Z != 0) --> sub X, (zext(setne Z, 0)) --> adc X, -1, (cmp Z, 1)
35426
39
  // X + (Z != 0) --> add X, (zext(setne Z, 0)) --> sbb X, -1, (cmp Z, 1)
35427
39
  if (CC == X86::COND_NE)
35428
26
    return DAG.getNode(IsSub ? X86ISD::ADC : X86ISD::SBB, DL, VTs, X,
35429
26
                       DAG.getConstant(-1ULL, DL, VT), Cmp1);
35430
13
35431
13
  // X - (Z == 0) --> sub X, (zext(sete  Z, 0)) --> sbb X, 0, (cmp Z, 1)
35432
13
  // X + (Z == 0) --> add X, (zext(sete  Z, 0)) --> adc X, 0, (cmp Z, 1)
35433
13
  return DAG.getNode(IsSub ? X86ISD::SBB : X86ISD::ADC, DL, VTs, X,
35434
280k
                     DAG.getConstant(0, DL, VT), Cmp1);
35435
280k
}
35436
35437
static SDValue combineLoopMAddPattern(SDNode *N, SelectionDAG &DAG,
35438
1.13k
                                      const X86Subtarget &Subtarget) {
35439
1.13k
  if (!Subtarget.hasSSE2())
35440
0
    return SDValue();
35441
1.13k
35442
1.13k
  SDValue MulOp = N->getOperand(0);
35443
1.13k
  SDValue Phi = N->getOperand(1);
35444
1.13k
35445
1.13k
  if (MulOp.getOpcode() != ISD::MUL)
35446
1.11k
    std::swap(MulOp, Phi);
35447
1.13k
  if (MulOp.getOpcode() != ISD::MUL)
35448
1.11k
    return SDValue();
35449
18
35450
18
  ShrinkMode Mode;
35451
18
  if (!canReduceVMulWidth(MulOp.getNode(), DAG, Mode) || Mode == MULU16)
35452
10
    return SDValue();
35453
8
35454
8
  EVT VT = N->getValueType(0);
35455
8
35456
8
  unsigned RegSize = 128;
35457
8
  if (Subtarget.hasBWI())
35458
2
    RegSize = 512;
35459
6
  else if (Subtarget.hasAVX2())
35460
4
    RegSize = 256;
35461
8
  unsigned VectorSize = VT.getVectorNumElements() * 16;
35462
8
  // If the vector size is less than 128, or greater than the supported RegSize,
35463
8
  // do not use PMADD.
35464
8
  if (VectorSize < 128 || VectorSize > RegSize)
35465
1
    return SDValue();
35466
7
35467
7
  SDLoc DL(N);
35468
7
  EVT ReducedVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16,
35469
7
                                   VT.getVectorNumElements());
35470
7
  EVT MAddVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
35471
7
                                VT.getVectorNumElements() / 2);
35472
7
35473
7
  // Shrink the operands of mul.
35474
7
  SDValue N0 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, MulOp->getOperand(0));
35475
7
  SDValue N1 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, MulOp->getOperand(1));
35476
7
35477
7
  // Madd vector size is half of the original vector size
35478
7
  SDValue Madd = DAG.getNode(X86ISD::VPMADDWD, DL, MAddVT, N0, N1);
35479
7
  // Fill the rest of the output with 0
35480
7
  SDValue Zero = getZeroVector(Madd.getSimpleValueType(), Subtarget, DAG, DL);
35481
7
  SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Madd, Zero);
35482
7
  return DAG.getNode(ISD::ADD, DL, VT, Concat, Phi);
35483
7
}
35484
35485
static SDValue combineLoopSADPattern(SDNode *N, SelectionDAG &DAG,
35486
1.14k
                                     const X86Subtarget &Subtarget) {
35487
1.14k
  if (!Subtarget.hasSSE2())
35488
0
    return SDValue();
35489
1.14k
35490
1.14k
  SDLoc DL(N);
35491
1.14k
  EVT VT = N->getValueType(0);
35492
1.14k
  SDValue Op0 = N->getOperand(0);
35493
1.14k
  SDValue Op1 = N->getOperand(1);
35494
1.14k
35495
1.14k
  // TODO: There's nothing special about i32, any integer type above i16 should
35496
1.14k
  // work just as well.
35497
1.14k
  if (!VT.isVector() || !VT.isSimple() ||
35498
1.14k
      !(VT.getVectorElementType() == MVT::i32))
35499
0
    return SDValue();
35500
1.14k
35501
1.14k
  unsigned RegSize = 128;
35502
1.14k
  if (Subtarget.hasBWI())
35503
9
    RegSize = 512;
35504
1.13k
  else if (Subtarget.hasAVX2())
35505
412
    RegSize = 256;
35506
1.14k
35507
1.14k
  // We only handle v16i32 for SSE2 / v32i32 for AVX2 / v64i32 for AVX512.
35508
1.14k
  // TODO: We should be able to handle larger vectors by splitting them before
35509
1.14k
  // feeding them into several SADs, and then reducing over those.
35510
1.14k
  if (VT.getSizeInBits() / 4 > RegSize)
35511
8
    return SDValue();
35512
1.13k
35513
1.13k
  // We know N is a reduction add, which means one of its operands is a phi.
35514
1.13k
  // To match SAD, we need the other operand to be a vector select.
35515
1.13k
  SDValue SelectOp, Phi;
35516
1.13k
  if (Op0.getOpcode() == ISD::VSELECT) {
35517
12
    SelectOp = Op0;
35518
12
    Phi = Op1;
35519
1.13k
  } else if (Op1.getOpcode() == ISD::VSELECT) {
35520
0
    SelectOp = Op1;
35521
0
    Phi = Op0;
35522
0
  } else
35523
1.12k
    return SDValue();
35524
12
35525
12
  // Check whether we have an abs-diff pattern feeding into the select.
35526
12
  if (!detectZextAbsDiff(SelectOp, Op0, Op1))
35527
0
    return SDValue();
35528
12
35529
12
  // SAD pattern detected. Now build a SAD instruction and an addition for
35530
12
  // reduction. Note that the number of elements of the result of SAD is less
35531
12
  // than the number of elements of its input. Therefore, we could only update
35532
12
  // part of elements in the reduction vector.
35533
12
  SDValue Sad = createPSADBW(DAG, Op0, Op1, DL);
35534
12
35535
12
  // The output of PSADBW is a vector of i64.
35536
12
  // We need to turn the vector of i64 into a vector of i32.
35537
12
  // If the reduction vector is at least as wide as the psadbw result, just
35538
12
  // bitcast. If it's narrower, truncate - the high i32 of each i64 is zero
35539
12
  // anyway.
35540
12
  MVT ResVT = MVT::getVectorVT(MVT::i32, Sad.getValueSizeInBits() / 32);
35541
12
  if (VT.getSizeInBits() >= ResVT.getSizeInBits())
35542
8
    Sad = DAG.getNode(ISD::BITCAST, DL, ResVT, Sad);
35543
12
  else
35544
4
    Sad = DAG.getNode(ISD::TRUNCATE, DL, VT, Sad);
35545
12
35546
12
  if (VT.getSizeInBits() > ResVT.getSizeInBits()) {
35547
8
    // Fill the upper elements with zero to match the add width.
35548
8
    SDValue Zero = DAG.getConstant(0, DL, VT);
35549
8
    Sad = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, Zero, Sad,
35550
8
                      DAG.getIntPtrConstant(0, DL));
35551
8
  }
35552
1.14k
35553
1.14k
  return DAG.getNode(ISD::ADD, DL, VT, Sad, Phi);
35554
1.14k
}
35555
35556
/// Convert vector increment or decrement to sub/add with an all-ones constant:
35557
/// add X, <1, 1...> --> sub X, <-1, -1...>
35558
/// sub X, <1, 1...> --> add X, <-1, -1...>
35559
/// The all-ones vector constant can be materialized using a pcmpeq instruction
35560
/// that is commonly recognized as an idiom (has no register dependency), so
35561
/// that's better/smaller than loading a splat 1 constant.
35562
281k
static SDValue combineIncDecVector(SDNode *N, SelectionDAG &DAG) {
35563
281k
  assert((N->getOpcode() == ISD::ADD || N->getOpcode() == ISD::SUB) &&
35564
281k
         "Unexpected opcode for increment/decrement transform");
35565
281k
35566
281k
  // Pseudo-legality check: getOnesVector() expects one of these types, so bail
35567
281k
  // out and wait for legalization if we have an unsupported vector length.
35568
281k
  EVT VT = N->getValueType(0);
35569
281k
  if (!VT.is128BitVector() && !VT.is256BitVector() && !VT.is512BitVector())
35570
259k
    return SDValue();
35571
21.9k
35572
21.9k
  SDNode *N1 = N->getOperand(1).getNode();
35573
21.9k
  APInt SplatVal;
35574
21.9k
  if (!ISD::isConstantSplatVector(N1, SplatVal) ||
35575
2.08k
      !SplatVal.isOneValue())
35576
21.4k
    return SDValue();
35577
516
35578
516
  SDValue AllOnesVec = getOnesVector(VT, DAG, SDLoc(N));
35579
516
  unsigned NewOpcode = N->getOpcode() == ISD::ADD ? ISD::SUB : ISD::ADD;
35580
281k
  return DAG.getNode(NewOpcode, SDLoc(N), VT, N->getOperand(0), AllOnesVec);
35581
281k
}
35582
35583
static SDValue combineAdd(SDNode *N, SelectionDAG &DAG,
35584
261k
                          const X86Subtarget &Subtarget) {
35585
261k
  const SDNodeFlags Flags = N->getFlags();
35586
261k
  if (Flags.hasVectorReduction()) {
35587
1.14k
    if (SDValue Sad = combineLoopSADPattern(N, DAG, Subtarget))
35588
12
      return Sad;
35589
1.13k
    if (SDValue MAdd = combineLoopMAddPattern(N, DAG, Subtarget))
35590
7
      return MAdd;
35591
261k
  }
35592
261k
  EVT VT = N->getValueType(0);
35593
261k
  SDValue Op0 = N->getOperand(0);
35594
261k
  SDValue Op1 = N->getOperand(1);
35595
261k
35596
261k
  // Try to synthesize horizontal adds from adds of shuffles.
35597
261k
  if (((Subtarget.hasSSSE3() && (VT == MVT::v8i16 || VT == MVT::v4i32)) ||
35598
257k
       (Subtarget.hasInt256() && (VT == MVT::v16i16 || VT == MVT::v8i32))) &&
35599
6.40k
      isHorizontalBinOp(Op0, Op1, true))
35600
54
    return DAG.getNode(X86ISD::HADD, SDLoc(N), VT, Op0, Op1);
35601
261k
35602
261k
  if (SDValue V = combineIncDecVector(N, DAG))
35603
350
    return V;
35604
261k
35605
261k
  return combineAddOrSubToADCOrSBB(N, DAG);
35606
261k
}
35607
35608
static SDValue combineSub(SDNode *N, SelectionDAG &DAG,
35609
19.9k
                          const X86Subtarget &Subtarget) {
35610
19.9k
  SDValue Op0 = N->getOperand(0);
35611
19.9k
  SDValue Op1 = N->getOperand(1);
35612
19.9k
35613
19.9k
  // X86 can't encode an immediate LHS of a sub. See if we can push the
35614
19.9k
  // negation into a preceding instruction.
35615
19.9k
  if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op0)) {
35616
6.15k
    // If the RHS of the sub is a XOR with one use and a constant, invert the
35617
6.15k
    // immediate. Then add one to the LHS of the sub so we can turn
35618
6.15k
    // X-Y -> X+~Y+1, saving one register.
35619
6.15k
    if (Op1->hasOneUse() && Op1.getOpcode() == ISD::XOR &&
35620
6.15k
        isa<ConstantSDNode>(Op1.getOperand(1))) {
35621
28
      APInt XorC = cast<ConstantSDNode>(Op1.getOperand(1))->getAPIntValue();
35622
28
      EVT VT = Op0.getValueType();
35623
28
      SDValue NewXor = DAG.getNode(ISD::XOR, SDLoc(Op1), VT,
35624
28
                                   Op1.getOperand(0),
35625
28
                                   DAG.getConstant(~XorC, SDLoc(Op1), VT));
35626
28
      return DAG.getNode(ISD::ADD, SDLoc(N), VT, NewXor,
35627
28
                         DAG.getConstant(C->getAPIntValue() + 1, SDLoc(N), VT));
35628
28
    }
35629
19.9k
  }
35630
19.9k
35631
19.9k
  // Try to synthesize horizontal subs from subs of shuffles.
35632
19.9k
  EVT VT = N->getValueType(0);
35633
19.9k
  if (((Subtarget.hasSSSE3() && (VT == MVT::v8i16 || VT == MVT::v4i32)) ||
35634
18.6k
       (Subtarget.hasInt256() && (VT == MVT::v16i16 || VT == MVT::v8i32))) &&
35635
1.94k
      isHorizontalBinOp(Op0, Op1, false))
35636
20
    return DAG.getNode(X86ISD::HSUB, SDLoc(N), VT, Op0, Op1);
35637
19.9k
35638
19.9k
  if (SDValue V = combineIncDecVector(N, DAG))
35639
166
    return V;
35640
19.7k
35641
19.7k
  return combineAddOrSubToADCOrSBB(N, DAG);
35642
19.7k
}
35643
35644
static SDValue combineVSZext(SDNode *N, SelectionDAG &DAG,
35645
                             TargetLowering::DAGCombinerInfo &DCI,
35646
11.6k
                             const X86Subtarget &Subtarget) {
35647
11.6k
  if (DCI.isBeforeLegalize())
35648
2.37k
    return SDValue();
35649
9.30k
35650
9.30k
  SDLoc DL(N);
35651
9.30k
  unsigned Opcode = N->getOpcode();
35652
9.30k
  MVT VT = N->getSimpleValueType(0);
35653
9.30k
  MVT SVT = VT.getVectorElementType();
35654
9.30k
  unsigned NumElts = VT.getVectorNumElements();
35655
9.30k
  unsigned EltSizeInBits = SVT.getSizeInBits();
35656
9.30k
35657
9.30k
  SDValue Op = N->getOperand(0);
35658
9.30k
  MVT OpVT = Op.getSimpleValueType();
35659
9.30k
  MVT OpEltVT = OpVT.getVectorElementType();
35660
9.30k
  unsigned OpEltSizeInBits = OpEltVT.getSizeInBits();
35661
9.30k
  unsigned InputBits = OpEltSizeInBits * NumElts;
35662
9.30k
35663
9.30k
  // Perform any constant folding.
35664
9.30k
  // FIXME: Reduce constant pool usage and don't fold when OptSize is enabled.
35665
9.30k
  APInt UndefElts;
35666
9.30k
  SmallVector<APInt, 64> EltBits;
35667
9.30k
  if (getTargetConstantBitsFromNode(Op, OpEltSizeInBits, UndefElts, EltBits)) {
35668
395
    APInt Undefs(NumElts, 0);
35669
395
    SmallVector<APInt, 4> Vals(NumElts, APInt(EltSizeInBits, 0));
35670
395
    bool IsZEXT =
35671
343
        (Opcode == X86ISD::VZEXT) || (Opcode == ISD::ZERO_EXTEND_VECTOR_INREG);
35672
6.47k
    for (unsigned i = 0; i != NumElts; ++i) {
35673
6.08k
      if (UndefElts[i]) {
35674
0
        Undefs.setBit(i);
35675
0
        continue;
35676
0
      }
35677
6.08k
      
Vals[i] = IsZEXT ? 6.08k
EltBits[i].zextOrTrunc(EltSizeInBits)1.03k
35678
5.05k
                       : EltBits[i].sextOrTrunc(EltSizeInBits);
35679
6.08k
    }
35680
395
    return getConstVector(Vals, Undefs, VT, DAG, DL);
35681
395
  }
35682
8.90k
35683
8.90k
  // (vzext (bitcast (vzext (x)) -> (vzext x)
35684
8.90k
  // TODO: (vsext (bitcast (vsext (x)) -> (vsext x)
35685
8.90k
  SDValue V = peekThroughBitcasts(Op);
35686
8.90k
  if (Opcode == X86ISD::VZEXT && V != Op && V.getOpcode() == X86ISD::VZEXT) {
35687
0
    MVT InnerVT = V.getSimpleValueType();
35688
0
    MVT InnerEltVT = InnerVT.getVectorElementType();
35689
0
35690
0
    // If the element sizes match exactly, we can just do one larger vzext. This
35691
0
    // is always an exact type match as vzext operates on integer types.
35692
0
    if (OpEltVT == InnerEltVT) {
35693
0
      assert(OpVT == InnerVT && "Types must match for vzext!");
35694
0
      return DAG.getNode(X86ISD::VZEXT, DL, VT, V.getOperand(0));
35695
0
    }
35696
0
35697
0
    // The only other way we can combine them is if only a single element of the
35698
0
    // inner vzext is used in the input to the outer vzext.
35699
0
    if (InnerEltVT.getSizeInBits() < InputBits)
35700
0
      return SDValue();
35701
0
35702
0
    // In this case, the inner vzext is completely dead because we're going to
35703
0
    // only look at bits inside of the low element. Just do the outer vzext on
35704
0
    // a bitcast of the input to the inner.
35705
0
    return DAG.getNode(X86ISD::VZEXT, DL, VT, DAG.getBitcast(OpVT, V));
35706
0
  }
35707
8.90k
35708
8.90k
  // Check if we can bypass extracting and re-inserting an element of an input
35709
8.90k
  // vector. Essentially:
35710
8.90k
  // (bitcast (sclr2vec (ext_vec_elt x))) -> (bitcast x)
35711
8.90k
  // TODO: Add X86ISD::VSEXT support
35712
8.90k
  if (Opcode == X86ISD::VZEXT &&
35713
1.52k
      V.getOpcode() == ISD::SCALAR_TO_VECTOR &&
35714
0
      V.getOperand(0).getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
35715
8.90k
      V.getOperand(0).getSimpleValueType().getSizeInBits() == InputBits) {
35716
0
    SDValue ExtractedV = V.getOperand(0);
35717
0
    SDValue OrigV = ExtractedV.getOperand(0);
35718
0
    if (isNullConstant(ExtractedV.getOperand(1))) {
35719
0
        MVT OrigVT = OrigV.getSimpleValueType();
35720
0
        // Extract a subvector if necessary...
35721
0
        if (OrigVT.getSizeInBits() > OpVT.getSizeInBits()) {
35722
0
          int Ratio = OrigVT.getSizeInBits() / OpVT.getSizeInBits();
35723
0
          OrigVT = MVT::getVectorVT(OrigVT.getVectorElementType(),
35724
0
                                    OrigVT.getVectorNumElements() / Ratio);
35725
0
          OrigV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OrigVT, OrigV,
35726
0
                              DAG.getIntPtrConstant(0, DL));
35727
0
        }
35728
0
        Op = DAG.getBitcast(OpVT, OrigV);
35729
0
        return DAG.getNode(X86ISD::VZEXT, DL, VT, Op);
35730
0
      }
35731
8.90k
  }
35732
8.90k
35733
8.90k
  return SDValue();
35734
8.90k
}
35735
35736
/// Canonicalize (LSUB p, 1) -> (LADD p, -1).
35737
static SDValue combineLockSub(SDNode *N, SelectionDAG &DAG,
35738
70
                                  const X86Subtarget &Subtarget) {
35739
70
  SDValue Chain = N->getOperand(0);
35740
70
  SDValue LHS = N->getOperand(1);
35741
70
  SDValue RHS = N->getOperand(2);
35742
70
  MVT VT = RHS.getSimpleValueType();
35743
70
  SDLoc DL(N);
35744
70
35745
70
  auto *C = dyn_cast<ConstantSDNode>(RHS);
35746
70
  if (!C || C->getZExtValue() != 1)
35747
33
    return SDValue();
35748
37
35749
37
  RHS = DAG.getConstant(-1, DL, VT);
35750
37
  MachineMemOperand *MMO = cast<MemSDNode>(N)->getMemOperand();
35751
37
  return DAG.getMemIntrinsicNode(X86ISD::LADD, DL,
35752
37
                                 DAG.getVTList(MVT::i32, MVT::Other),
35753
37
                                 {Chain, LHS, RHS}, VT, MMO);
35754
37
}
35755
35756
static SDValue combineTestM(SDNode *N, SelectionDAG &DAG,
35757
2.39k
                            const X86Subtarget &Subtarget) {
35758
2.39k
  SDValue Op0 = N->getOperand(0);
35759
2.39k
  SDValue Op1 = N->getOperand(1);
35760
2.39k
35761
2.39k
  MVT VT = N->getSimpleValueType(0);
35762
2.39k
  SDLoc DL(N);
35763
2.39k
35764
2.39k
  // TEST (AND a, b) ,(AND a, b) -> TEST a, b
35765
2.39k
  if (Op0 == Op1 && Op1->getOpcode() == ISD::AND)
35766
4
    return DAG.getNode(X86ISD::TESTM, DL, VT, Op0->getOperand(0),
35767
4
                       Op0->getOperand(1));
35768
2.39k
35769
2.39k
  // TEST op0, BUILD_VECTOR(all_zero) -> BUILD_VECTOR(all_zero)
35770
2.39k
  // TEST BUILD_VECTOR(all_zero), op1 -> BUILD_VECTOR(all_zero)
35771
2.39k
  if (ISD::isBuildVectorAllZeros(Op0.getNode()) ||
35772
2.13k
      ISD::isBuildVectorAllZeros(Op1.getNode()))
35773
260
    return getZeroVector(VT, Subtarget, DAG, DL);
35774
2.13k
35775
2.13k
  return SDValue();
35776
2.13k
}
35777
35778
static SDValue combineVectorCompare(SDNode *N, SelectionDAG &DAG,
35779
9.40k
                                    const X86Subtarget &Subtarget) {
35780
9.40k
  MVT VT = N->getSimpleValueType(0);
35781
9.40k
  SDLoc DL(N);
35782
9.40k
35783
9.40k
  if (N->getOperand(0) == N->getOperand(1)) {
35784
0
    if (N->getOpcode() == X86ISD::PCMPEQ)
35785
0
      return getOnesVector(VT, DAG, DL);
35786
0
    if (N->getOpcode() == X86ISD::PCMPGT)
35787
0
      return getZeroVector(VT, Subtarget, DAG, DL);
35788
9.40k
  }
35789
9.40k
35790
9.40k
  return SDValue();
35791
9.40k
}
35792
35793
static SDValue combineInsertSubvector(SDNode *N, SelectionDAG &DAG,
35794
                                      TargetLowering::DAGCombinerInfo &DCI,
35795
11.8k
                                      const X86Subtarget &Subtarget) {
35796
11.8k
  if (DCI.isBeforeLegalizeOps())
35797
16
    return SDValue();
35798
11.8k
35799
11.8k
  MVT OpVT = N->getSimpleValueType(0);
35800
11.8k
35801
11.8k
  // Early out for mask vectors.
35802
11.8k
  if (OpVT.getVectorElementType() == MVT::i1)
35803
1.37k
    return SDValue();
35804
10.5k
35805
10.5k
  SDLoc dl(N);
35806
10.5k
  SDValue Vec = N->getOperand(0);
35807
10.5k
  SDValue SubVec = N->getOperand(1);
35808
10.5k
  SDValue Idx = N->getOperand(2);
35809
10.5k
35810
10.5k
  unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
35811
10.5k
  MVT SubVecVT = SubVec.getSimpleValueType();
35812
10.5k
35813
10.5k
  if (ISD::isBuildVectorAllZeros(Vec.getNode())) {
35814
138
    // Inserting zeros into zeros is a nop.
35815
138
    if (ISD::isBuildVectorAllZeros(SubVec.getNode()))
35816
6
      return Vec;
35817
132
35818
132
    // If we're inserting into a zero vector and then into a larger zero vector,
35819
132
    // just insert into the larger zero vector directly.
35820
132
    if (SubVec.getOpcode() == ISD::INSERT_SUBVECTOR &&
35821
132
        ISD::isBuildVectorAllZeros(SubVec.getOperand(0).getNode())) {
35822
10
      unsigned Idx2Val = SubVec.getConstantOperandVal(2);
35823
10
      return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT, Vec,
35824
10
                         SubVec.getOperand(1),
35825
10
                         DAG.getIntPtrConstant(IdxVal + Idx2Val, dl));
35826
10
    }
35827
10.4k
  }
35828
10.4k
35829
10.4k
  // If this is an insert of an extract, combine to a shuffle. Don't do this
35830
10.4k
  // if the insert or extract can be represented with a subregister operation.
35831
10.4k
  if (SubVec.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
35832
630
      SubVec.getOperand(0).getSimpleValueType() == OpVT &&
35833
10.4k
      (IdxVal != 0 || !Vec.isUndef())) {
35834
484
    int ExtIdxVal = SubVec.getConstantOperandVal(1);
35835
484
    if (ExtIdxVal != 0) {
35836
139
      int VecNumElts = OpVT.getVectorNumElements();
35837
139
      int SubVecNumElts = SubVecVT.getVectorNumElements();
35838
139
      SmallVector<int, 64> Mask(VecNumElts);
35839
139
      // First create an identity shuffle mask.
35840
8.50k
      for (int i = 0; i != VecNumElts; ++i)
35841
8.36k
        Mask[i] = i;
35842
139
      // Now insert the extracted portion.
35843
4.32k
      for (int i = 0; i != SubVecNumElts; ++i)
35844
4.18k
        Mask[i + IdxVal] = i + ExtIdxVal + VecNumElts;
35845
139
35846
139
      return DAG.getVectorShuffle(OpVT, dl, Vec, SubVec.getOperand(0), Mask);
35847
139
    }
35848
10.3k
  }
35849
10.3k
35850
10.3k
  // Fold two 16-byte or 32-byte subvector loads into one 32-byte or 64-byte
35851
10.3k
  // load:
35852
10.3k
  // (insert_subvector (insert_subvector undef, (load16 addr), 0),
35853
10.3k
  //                   (load16 addr + 16), Elts/2)
35854
10.3k
  // --> load32 addr
35855
10.3k
  // or:
35856
10.3k
  // (insert_subvector (insert_subvector undef, (load32 addr), 0),
35857
10.3k
  //                   (load32 addr + 32), Elts/2)
35858
10.3k
  // --> load64 addr
35859
10.3k
  // or a 16-byte or 32-byte broadcast:
35860
10.3k
  // (insert_subvector (insert_subvector undef, (load16 addr), 0),
35861
10.3k
  //                   (load16 addr), Elts/2)
35862
10.3k
  // --> X86SubVBroadcast(load16 addr)
35863
10.3k
  // or:
35864
10.3k
  // (insert_subvector (insert_subvector undef, (load32 addr), 0),
35865
10.3k
  //                   (load32 addr), Elts/2)
35866
10.3k
  // --> X86SubVBroadcast(load32 addr)
35867
10.3k
  if ((IdxVal == OpVT.getVectorNumElements() / 2) &&
35868
4.69k
      Vec.getOpcode() == ISD::INSERT_SUBVECTOR &&
35869
10.3k
      OpVT.getSizeInBits() == SubVecVT.getSizeInBits() * 2) {
35870
3.90k
    auto *Idx2 = dyn_cast<ConstantSDNode>(Vec.getOperand(2));
35871
3.90k
    if (Idx2 && Idx2->getZExtValue() == 0) {
35872
3.90k
      SDValue SubVec2 = Vec.getOperand(1);
35873
3.90k
      // If needed, look through bitcasts to get to the load.
35874
3.90k
      if (auto *FirstLd = dyn_cast<LoadSDNode>(peekThroughBitcasts(SubVec2))) {
35875
656
        bool Fast;
35876
656
        unsigned Alignment = FirstLd->getAlignment();
35877
656
        unsigned AS = FirstLd->getAddressSpace();
35878
656
        const X86TargetLowering *TLI = Subtarget.getTargetLowering();
35879
656
        if (TLI->allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(),
35880
656
                                    OpVT, AS, Alignment, &Fast) && Fast) {
35881
630
          SDValue Ops[] = {SubVec2, SubVec};
35882
630
          if (SDValue Ld = EltsFromConsecutiveLoads(OpVT, Ops, dl, DAG,
35883
630
                                                    Subtarget, false))
35884
31
            return Ld;
35885
3.87k
        }
35886
656
      }
35887
3.87k
      // If lower/upper loads are the same and the only users of the load, then
35888
3.87k
      // lower to a VBROADCASTF128/VBROADCASTI128/etc.
35889
3.87k
      if (auto *Ld = dyn_cast<LoadSDNode>(peekThroughOneUseBitcasts(SubVec2)))
35890
625
        if (SubVec2 == SubVec && ISD::isNormalLoad(Ld) &&
35891
494
            SDNode::areOnlyUsersOf({N, Vec.getNode()}, SubVec2.getNode()))
35892
408
          return DAG.getNode(X86ISD::SUBV_BROADCAST, dl, OpVT, SubVec);
35893
3.47k
35894
3.47k
      // If this is subv_broadcast insert into both halves, use a larger
35895
3.47k
      // subv_broadcast.
35896
3.47k
      if (SubVec.getOpcode() == X86ISD::SUBV_BROADCAST && SubVec == SubVec2)
35897
92
        return DAG.getNode(X86ISD::SUBV_BROADCAST, dl, OpVT,
35898
92
                           SubVec.getOperand(0));
35899
3.37k
35900
3.37k
      // If we're inserting all zeros into the upper half, change this to
35901
3.37k
      // an insert into an all zeros vector. We will match this to a move
35902
3.37k
      // with implicit upper bit zeroing during isel.
35903
3.37k
      if (ISD::isBuildVectorAllZeros(SubVec.getNode()))
35904
68
        return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT,
35905
68
                           getZeroVector(OpVT, Subtarget, DAG, dl), SubVec2,
35906
68
                           Vec.getOperand(2));
35907
3.31k
35908
3.31k
      // If we are inserting into both halves of the vector, the starting
35909
3.31k
      // vector should be undef. If it isn't, make it so. Only do this if the
35910
3.31k
      // early insert has no other uses.
35911
3.31k
      // TODO: Should this be a generic DAG combine?
35912
3.31k
      if (!Vec.getOperand(0).isUndef() && Vec.hasOneUse()) {
35913
4
        Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT, DAG.getUNDEF(OpVT),
35914
4
                          SubVec2, Vec.getOperand(2));
35915
4
        DCI.AddToWorklist(Vec.getNode());
35916
4
        return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT, Vec, SubVec, Idx);
35917
4
35918
4
      }
35919
9.74k
    }
35920
3.90k
  }
35921
9.74k
35922
9.74k
  return SDValue();
35923
9.74k
}
35924
35925
static SDValue combineExtractSubvector(SDNode *N, SelectionDAG &DAG,
35926
                                       TargetLowering::DAGCombinerInfo &DCI,
35927
33.3k
                                       const X86Subtarget &Subtarget) {
35928
33.3k
  if (DCI.isBeforeLegalizeOps())
35929
14.3k
    return SDValue();
35930
18.9k
35931
18.9k
  MVT OpVT = N->getSimpleValueType(0);
35932
18.9k
  SDValue InVec = N->getOperand(0);
35933
18.9k
  unsigned IdxVal = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
35934
18.9k
35935
18.9k
  if (ISD::isBuildVectorAllZeros(InVec.getNode()))
35936
520
    return getZeroVector(OpVT, Subtarget, DAG, SDLoc(N));
35937
18.3k
35938
18.3k
  if (ISD::isBuildVectorAllOnes(InVec.getNode())) {
35939
2
    if (OpVT.getScalarType() == MVT::i1)
35940
0
      return DAG.getConstant(1, SDLoc(N), OpVT);
35941
2
    return getOnesVector(OpVT, DAG, SDLoc(N));
35942
2
  }
35943
18.3k
35944
18.3k
  if (InVec.getOpcode() == ISD::BUILD_VECTOR)
35945
598
    return DAG.getBuildVector(
35946
598
        OpVT, SDLoc(N),
35947
598
        InVec.getNode()->ops().slice(IdxVal, OpVT.getVectorNumElements()));
35948
17.7k
35949
17.7k
  return SDValue();
35950
17.7k
}
35951
35952
SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
35953
2.16M
                                             DAGCombinerInfo &DCI) const {
35954
2.16M
  SelectionDAG &DAG = DCI.DAG;
35955
2.16M
  switch (N->getOpcode()) {
35956
387k
  default: break;
35957
48.3k
  case ISD::EXTRACT_VECTOR_ELT:
35958
48.3k
    return combineExtractVectorElt(N, DAG, DCI, Subtarget);
35959
3.83k
  case X86ISD::PEXTRW:
35960
3.83k
  case X86ISD::PEXTRB:
35961
3.83k
    return combineExtractVectorElt_SSE(N, DAG, DCI, Subtarget);
35962
11.8k
  case ISD::INSERT_SUBVECTOR:
35963
11.8k
    return combineInsertSubvector(N, DAG, DCI, Subtarget);
35964
33.3k
  case ISD::EXTRACT_SUBVECTOR:
35965
33.3k
    return combineExtractSubvector(N, DAG, DCI, Subtarget);
35966
33.0k
  case ISD::VSELECT:
35967
33.0k
  case ISD::SELECT:
35968
33.0k
  case X86ISD::SHRUNKBLEND: return combineSelect(N, DAG, DCI, Subtarget);
35969
183k
  case ISD::BITCAST:        return combineBitcast(N, DAG, DCI, Subtarget);
35970
6.38k
  case X86ISD::CMOV:        return combineCMov(N, DAG, DCI, Subtarget);
35971
261k
  case ISD::ADD:            return combineAdd(N, DAG, Subtarget);
35972
19.9k
  case ISD::SUB:            return combineSub(N, DAG, Subtarget);
35973
2.52k
  case X86ISD::SBB:         return combineSBB(N, DAG);
35974
6.64k
  case X86ISD::ADC:         return combineADC(N, DAG, DCI);
35975
10.5k
  case ISD::MUL:            return combineMul(N, DAG, DCI, Subtarget);
35976
59.2k
  case ISD::SHL:
35977
59.2k
  case ISD::SRA:
35978
59.2k
  case ISD::SRL:            return combineShift(N, DAG, DCI, Subtarget);
35979
64.8k
  case ISD::AND:            return combineAnd(N, DAG, DCI, Subtarget);
35980
37.5k
  case ISD::OR:             return combineOr(N, DAG, DCI, Subtarget);
35981
20.3k
  case ISD::XOR:            return combineXor(N, DAG, DCI, Subtarget);
35982
276k
  case ISD::LOAD:           return combineLoad(N, DAG, DCI, Subtarget);
35983
719
  case ISD::MLOAD:          return combineMaskedLoad(N, DAG, DCI, Subtarget);
35984
304k
  case ISD::STORE:          return combineStore(N, DAG, Subtarget);
35985
367
  case ISD::MSTORE:         return combineMaskedStore(N, DAG, Subtarget);
35986
6.66k
  case ISD::SINT_TO_FP:     return combineSIntToFP(N, DAG, Subtarget);
35987
2.21k
  case ISD::UINT_TO_FP:     return combineUIntToFP(N, DAG, Subtarget);
35988
13.0k
  case ISD::FADD:
35989
13.0k
  case ISD::FSUB:           return combineFaddFsub(N, DAG, Subtarget);
35990
556
  case ISD::FNEG:           return combineFneg(N, DAG, Subtarget);
35991
54.0k
  case ISD::TRUNCATE:       return combineTruncate(N, DAG, Subtarget);
35992
3.18k
  case X86ISD::ANDNP:       return combineAndnp(N, DAG, DCI, Subtarget);
35993
1.63k
  case X86ISD::FAND:        return combineFAnd(N, DAG, Subtarget);
35994
441
  case X86ISD::FANDN:       return combineFAndn(N, DAG, Subtarget);
35995
1.26k
  case X86ISD::FXOR:
35996
1.26k
  case X86ISD::FOR:         return combineFOr(N, DAG, Subtarget);
35997
977
  case X86ISD::FMIN:
35998
977
  case X86ISD::FMAX:        return combineFMinFMax(N, DAG);
35999
136
  case ISD::FMINNUM:
36000
136
  case ISD::FMAXNUM:        return combineFMinNumFMaxNum(N, DAG, Subtarget);
36001
790
  case X86ISD::BT:          return combineBT(N, DAG, DCI);
36002
29.8k
  case ISD::ANY_EXTEND:
36003
29.8k
  case ISD::ZERO_EXTEND:    return combineZext(N, DAG, DCI, Subtarget);
36004
3.78k
  case ISD::SIGN_EXTEND:    return combineSext(N, DAG, DCI, Subtarget);
36005
1.67k
  case ISD::SIGN_EXTEND_INREG: return combineSignExtendInReg(N, DAG, Subtarget);
36006
86.0k
  case ISD::SETCC:          return combineSetCC(N, DAG, Subtarget);
36007
11.2k
  case X86ISD::SETCC:       return combineX86SetCC(N, DAG, Subtarget);
36008
36.5k
  case X86ISD::BRCOND:      return combineBrCond(N, DAG, Subtarget);
36009
20.0k
  case X86ISD::VSHLI:
36010
20.0k
  case X86ISD::VSRAI:
36011
20.0k
  case X86ISD::VSRLI:
36012
20.0k
    return combineVectorShiftImm(N, DAG, DCI, Subtarget);
36013
11.6k
  case ISD::SIGN_EXTEND_VECTOR_INREG:
36014
11.6k
  case ISD::ZERO_EXTEND_VECTOR_INREG:
36015
11.6k
  case X86ISD::VSEXT:
36016
11.6k
  case X86ISD::VZEXT:       return combineVSZext(N, DAG, DCI, Subtarget);
36017
9.75k
  case X86ISD::PINSRB:
36018
9.75k
  case X86ISD::PINSRW:      return combineVectorInsert(N, DAG, DCI, Subtarget);
36019
83.1k
  case X86ISD::SHUFP:       // Handle all target specific shuffles
36020
83.1k
  case X86ISD::INSERTPS:
36021
83.1k
  case X86ISD::EXTRQI:
36022
83.1k
  case X86ISD::INSERTQI:
36023
83.1k
  case X86ISD::PALIGNR:
36024
83.1k
  case X86ISD::VSHLDQ:
36025
83.1k
  case X86ISD::VSRLDQ:
36026
83.1k
  case X86ISD::BLENDI:
36027
83.1k
  case X86ISD::UNPCKH:
36028
83.1k
  case X86ISD::UNPCKL:
36029
83.1k
  case X86ISD::MOVHLPS:
36030
83.1k
  case X86ISD::MOVLHPS:
36031
83.1k
  case X86ISD::PSHUFB:
36032
83.1k
  case X86ISD::PSHUFD:
36033
83.1k
  case X86ISD::PSHUFHW:
36034
83.1k
  case X86ISD::PSHUFLW:
36035
83.1k
  case X86ISD::MOVSHDUP:
36036
83.1k
  case X86ISD::MOVSLDUP:
36037
83.1k
  case X86ISD::MOVDDUP:
36038
83.1k
  case X86ISD::MOVSS:
36039
83.1k
  case X86ISD::MOVSD:
36040
83.1k
  case X86ISD::VPPERM:
36041
83.1k
  case X86ISD::VPERMI:
36042
83.1k
  case X86ISD::VPERMV:
36043
83.1k
  case X86ISD::VPERMV3:
36044
83.1k
  case X86ISD::VPERMIV3:
36045
83.1k
  case X86ISD::VPERMIL2:
36046
83.1k
  case X86ISD::VPERMILPI:
36047
83.1k
  case X86ISD::VPERMILPV:
36048
83.1k
  case X86ISD::VPERM2X128:
36049
83.1k
  case X86ISD::VZEXT_MOVL:
36050
83.1k
  case ISD::VECTOR_SHUFFLE: return combineShuffle(N, DAG, DCI,Subtarget);
36051
2.27k
  case X86ISD::FMADD_RND:
36052
2.27k
  case X86ISD::FMADDS1_RND:
36053
2.27k
  case X86ISD::FMADDS3_RND:
36054
2.27k
  case ISD::FMA:            return combineFMA(N, DAG, Subtarget);
36055
924
  case ISD::MGATHER:
36056
924
  case ISD::MSCATTER:       return combineGatherScatter(N, DAG);
36057
70
  case X86ISD::LSUB:        return combineLockSub(N, DAG, Subtarget);
36058
2.39k
  case X86ISD::TESTM:       return combineTestM(N, DAG, Subtarget);
36059
9.40k
  case X86ISD::PCMPEQ:
36060
9.40k
  case X86ISD::PCMPGT:      return combineVectorCompare(N, DAG, Subtarget);
36061
387k
  }
36062
387k
36063
387k
  return SDValue();
36064
387k
}
36065
36066
/// Return true if the target has native support for the specified value type
36067
/// and it is 'desirable' to use the type for the given node type. e.g. On x86
36068
/// i16 is legal, but undesirable since i16 instruction encodings are longer and
36069
/// some i16 instructions are slow.
36070
308k
bool X86TargetLowering::isTypeDesirableForOp(unsigned Opc, EVT VT) const {
36071
308k
  if (!isTypeLegal(VT))
36072
757
    return false;
36073
307k
  if (VT != MVT::i16)
36074
292k
    return true;
36075
14.5k
36076
14.5k
  switch (Opc) {
36077
3.97k
  default:
36078
3.97k
    return true;
36079
10.5k
  case ISD::LOAD:
36080
10.5k
  case ISD::SIGN_EXTEND:
36081
10.5k
  case ISD::ZERO_EXTEND:
36082
10.5k
  case ISD::ANY_EXTEND:
36083
10.5k
  case ISD::SHL:
36084
10.5k
  case ISD::SRL:
36085
10.5k
  case ISD::SUB:
36086
10.5k
  case ISD::ADD:
36087
10.5k
  case ISD::MUL:
36088
10.5k
  case ISD::AND:
36089
10.5k
  case ISD::OR:
36090
10.5k
  case ISD::XOR:
36091
10.5k
    return false;
36092
0
  }
36093
0
}
36094
36095
/// This function checks if any of the users of EFLAGS copies the EFLAGS. We
36096
/// know that the code that lowers COPY of EFLAGS has to use the stack, and if
36097
/// we don't adjust the stack we clobber the first frame index.
36098
/// See X86InstrInfo::copyPhysReg.
36099
73.2k
static bool hasCopyImplyingStackAdjustment(const MachineFunction &MF) {
36100
73.2k
  const MachineRegisterInfo &MRI = MF.getRegInfo();
36101
73.2k
  return any_of(MRI.reg_instructions(X86::EFLAGS),
36102
234k
                [](const MachineInstr &RI) { return RI.isCopy(); });
36103
73.2k
}
36104
36105
73.2k
void X86TargetLowering::finalizeLowering(MachineFunction &MF) const {
36106
73.2k
  if (hasCopyImplyingStackAdjustment(MF)) {
36107
43
    MachineFrameInfo &MFI = MF.getFrameInfo();
36108
43
    MFI.setHasCopyImplyingStackAdjustment(true);
36109
43
  }
36110
73.2k
36111
73.2k
  TargetLoweringBase::finalizeLowering(MF);
36112
73.2k
}
36113
36114
/// This method queries the target whether it is beneficial for dag combiner to
36115
/// promote the specified node. If true, it should return the desired promotion
36116
/// type by reference.
36117
3.65k
bool X86TargetLowering::IsDesirableToPromoteOp(SDValue Op, EVT &PVT) const {
36118
3.65k
  EVT VT = Op.getValueType();
36119
3.65k
  if (VT != MVT::i16)
36120
0
    return false;
36121
3.65k
36122
3.65k
  bool Promote = false;
36123
3.65k
  bool Commute = false;
36124
3.65k
  switch (Op.getOpcode()) {
36125
1.42k
  default: break;
36126
104
  case ISD::SIGN_EXTEND:
36127
104
  case ISD::ZERO_EXTEND:
36128
104
  case ISD::ANY_EXTEND:
36129
104
    Promote = true;
36130
104
    break;
36131
829
  case ISD::SHL:
36132
829
  case ISD::SRL: {
36133
829
    SDValue N0 = Op.getOperand(0);
36134
829
    // Look out for (store (shl (load), x)).
36135
829
    if (MayFoldLoad(N0) && MayFoldIntoStore(Op))
36136
2
      return false;
36137
827
    Promote = true;
36138
827
    break;
36139
827
  }
36140
1.21k
  case ISD::ADD:
36141
1.21k
  case ISD::MUL:
36142
1.21k
  case ISD::AND:
36143
1.21k
  case ISD::OR:
36144
1.21k
  case ISD::XOR:
36145
1.21k
    Commute = true;
36146
1.21k
    LLVM_FALLTHROUGH;
36147
1.29k
  case ISD::SUB: {
36148
1.29k
    SDValue N0 = Op.getOperand(0);
36149
1.29k
    SDValue N1 = Op.getOperand(1);
36150
1.29k
    if (!Commute && MayFoldLoad(N1))
36151
0
      return false;
36152
1.29k
    // Avoid disabling potential load folding opportunities.
36153
1.29k
    if (MayFoldLoad(N0) && (!isa<ConstantSDNode>(N1) || MayFoldIntoStore(Op)))
36154
33
      return false;
36155
1.26k
    if (MayFoldLoad(N1) && (!isa<ConstantSDNode>(N0) || MayFoldIntoStore(Op)))
36156
20
      return false;
36157
1.24k
    Promote = true;
36158
1.24k
  }
36159
3.65k
  }
36160
3.65k
36161
3.59k
  PVT = MVT::i32;
36162
3.59k
  return Promote;
36163
3.65k
}
36164
36165
bool X86TargetLowering::
36166
    isDesirableToCombineBuildVectorToShuffleTruncate(
36167
290
        ArrayRef<int> ShuffleMask, EVT SrcVT, EVT TruncVT) const {
36168
290
36169
290
  assert(SrcVT.getVectorNumElements() == ShuffleMask.size() &&
36170
290
         "Element count mismatch");
36171
290
  assert(
36172
290
      Subtarget.getTargetLowering()->isShuffleMaskLegal(ShuffleMask, SrcVT) &&
36173
290
      "Shuffle Mask expected to be legal");
36174
290
36175
290
  // For 32-bit elements VPERMD is better than shuffle+truncate.
36176
290
  // TODO: After we improve lowerBuildVector, add exception for VPERMW.
36177
290
  if (SrcVT.getScalarSizeInBits() == 32 || !Subtarget.hasAVX2())
36178
98
    return false;
36179
192
36180
192
  if (is128BitLaneCrossingShuffleMask(SrcVT.getSimpleVT(), ShuffleMask))
36181
0
    return false;
36182
192
36183
192
  return true;
36184
192
}
36185
36186
//===----------------------------------------------------------------------===//
36187
//                           X86 Inline Assembly Support
36188
//===----------------------------------------------------------------------===//
36189
36190
// Helper to match a string separated by whitespace.
36191
1.39k
static bool matchAsm(StringRef S, ArrayRef<const char *> Pieces) {
36192
1.39k
  S = S.substr(S.find_first_not_of(" \t")); // Skip leading whitespace.
36193
1.39k
36194
1.44k
  for (StringRef Piece : Pieces) {
36195
1.44k
    if (!S.startswith(Piece)) // Check if the piece matches.
36196
1.35k
      return false;
36197
91
36198
91
    S = S.substr(Piece.size());
36199
91
    StringRef::size_type Pos = S.find_first_not_of(" \t");
36200
91
    if (Pos == 0) // We matched a prefix.
36201
8
      return false;
36202
83
36203
83
    S = S.substr(Pos);
36204
83
  }
36205
1.39k
36206
30
  return S.empty();
36207
1.39k
}
36208
36209
11
static bool clobbersFlagRegisters(const SmallVector<StringRef, 4> &AsmPieces) {
36210
11
36211
11
  if (AsmPieces.size() == 3 || AsmPieces.size() == 4) {
36212
11
    if (std::count(AsmPieces.begin(), AsmPieces.end(), "~{cc}") &&
36213
11
        std::count(AsmPieces.begin(), AsmPieces.end(), "~{flags}") &&
36214
11
        std::count(AsmPieces.begin(), AsmPieces.end(), "~{fpsr}")) {
36215
11
36216
11
      if (AsmPieces.size() == 3)
36217
1
        return true;
36218
10
      else if (std::count(AsmPieces.begin(), AsmPieces.end(), "~{dirflag}"))
36219
10
        return true;
36220
0
    }
36221
11
  }
36222
0
  return false;
36223
0
}
36224
36225
7.06k
bool X86TargetLowering::ExpandInlineAsm(CallInst *CI) const {
36226
7.06k
  InlineAsm *IA = cast<InlineAsm>(CI->getCalledValue());
36227
7.06k
36228
7.06k
  const std::string &AsmStr = IA->getAsmString();
36229
7.06k
36230
7.06k
  IntegerType *Ty = dyn_cast<IntegerType>(CI->getType());
36231
7.06k
  if (!Ty || Ty->getBitWidth() % 16 != 0)
36232
6.75k
    return false;
36233
312
36234
312
  // TODO: should remove alternatives from the asmstring: "foo {a|b}" -> "foo a"
36235
312
  SmallVector<StringRef, 4> AsmPieces;
36236
312
  SplitString(AsmStr, AsmPieces, ";\n");
36237
312
36238
312
  switch (AsmPieces.size()) {
36239
71
  default: return false;
36240
237
  case 1:
36241
237
    // FIXME: this should verify that we are targeting a 486 or better.  If not,
36242
237
    // we will turn this bswap into something that will be lowered to logical
36243
237
    // ops instead of emitting the bswap asm.  For now, we don't support 486 or
36244
237
    // lower so don't worry about this.
36245
237
    // bswap $0
36246
237
    if (matchAsm(AsmPieces[0], {"bswap", "$0"}) ||
36247
230
        matchAsm(AsmPieces[0], {"bswapl", "$0"}) ||
36248
230
        matchAsm(AsmPieces[0], {"bswapq", "$0"}) ||
36249
230
        matchAsm(AsmPieces[0], {"bswap", "${0:q}"}) ||
36250
226
        matchAsm(AsmPieces[0], {"bswapl", "${0:q}"}) ||
36251
237
        matchAsm(AsmPieces[0], {"bswapq", "${0:q}"})) {
36252
15
      // No need to check constraints, nothing other than the equivalent of
36253
15
      // "=r,0" would be valid here.
36254
15
      return IntrinsicLowering::LowerToByteSwap(CI);
36255
15
    }
36256
222
36257
222
    // rorw $$8, ${0:w}  -->  llvm.bswap.i16
36258
222
    if (CI->getType()->isIntegerTy(16) &&
36259
14
        IA->getConstraintString().compare(0, 5, "=r,0,") == 0 &&
36260
9
        (matchAsm(AsmPieces[0], {"rorw", "$$8,", "${0:w}"}) ||
36261
222
         matchAsm(AsmPieces[0], {"rolw", "$$8,", "${0:w}"}))) {
36262
9
      AsmPieces.clear();
36263
9
      StringRef ConstraintsStr = IA->getConstraintString();
36264
9
      SplitString(StringRef(ConstraintsStr).substr(5), AsmPieces, ",");
36265
9
      array_pod_sort(AsmPieces.begin(), AsmPieces.end());
36266
9
      if (clobbersFlagRegisters(AsmPieces))
36267
9
        return IntrinsicLowering::LowerToByteSwap(CI);
36268
213
    }
36269
213
    break;
36270
4
  case 3:
36271
4
    if (CI->getType()->isIntegerTy(32) &&
36272
4
        IA->getConstraintString().compare(0, 5, "=r,0,") == 0 &&
36273
2
        matchAsm(AsmPieces[0], {"rorw", "$$8,", "${0:w}"}) &&
36274
2
        matchAsm(AsmPieces[1], {"rorl", "$$16,", "$0"}) &&
36275
4
        matchAsm(AsmPieces[2], {"rorw", "$$8,", "${0:w}"})) {
36276
2
      AsmPieces.clear();
36277
2
      StringRef ConstraintsStr = IA->getConstraintString();
36278
2
      SplitString(StringRef(ConstraintsStr).substr(5), AsmPieces, ",");
36279
2
      array_pod_sort(AsmPieces.begin(), AsmPieces.end());
36280
2
      if (clobbersFlagRegisters(AsmPieces))
36281
2
        return IntrinsicLowering::LowerToByteSwap(CI);
36282
2
    }
36283
2
36284
2
    if (CI->getType()->isIntegerTy(64)) {
36285
0
      InlineAsm::ConstraintInfoVector Constraints = IA->ParseConstraints();
36286
0
      if (Constraints.size() >= 2 &&
36287
0
          Constraints[0].Codes.size() == 1 && Constraints[0].Codes[0] == "A" &&
36288
0
          Constraints[1].Codes.size() == 1 && Constraints[1].Codes[0] == "0") {
36289
0
        // bswap %eax / bswap %edx / xchgl %eax, %edx  -> llvm.bswap.i64
36290
0
        if (matchAsm(AsmPieces[0], {"bswap", "%eax"}) &&
36291
0
            matchAsm(AsmPieces[1], {"bswap", "%edx"}) &&
36292
0
            matchAsm(AsmPieces[2], {"xchgl", "%eax,", "%edx"}))
36293
0
          return IntrinsicLowering::LowerToByteSwap(CI);
36294
2
      }
36295
0
    }
36296
2
    break;
36297
215
  }
36298
215
  return false;
36299
215
}
36300
36301
/// Given a constraint letter, return the type of constraint for this target.
36302
X86TargetLowering::ConstraintType
36303
243k
X86TargetLowering::getConstraintType(StringRef Constraint) const {
36304
243k
  if (Constraint.size() == 1) {
36305
11.6k
    switch (Constraint[0]) {
36306
7.68k
    case 'R':
36307
7.68k
    case 'q':
36308
7.68k
    case 'Q':
36309
7.68k
    case 'f':
36310
7.68k
    case 't':
36311
7.68k
    case 'u':
36312
7.68k
    case 'y':
36313
7.68k
    case 'x':
36314
7.68k
    case 'v':
36315
7.68k
    case 'Y':
36316
7.68k
    case 'l':
36317
7.68k
    case 'k': // AVX512 masking registers.
36318
7.68k
      return C_RegisterClass;
36319
31
    case 'a':
36320
31
    case 'b':
36321
31
    case 'c':
36322
31
    case 'd':
36323
31
    case 'S':
36324
31
    case 'D':
36325
31
    case 'A':
36326
31
      return C_Register;
36327
80
    case 'I':
36328
80
    case 'J':
36329
80
    case 'K':
36330
80
    case 'L':
36331
80
    case 'M':
36332
80
    case 'N':
36333
80
    case 'G':
36334
80
    case 'C':
36335
80
    case 'e':
36336
80
    case 'Z':
36337
80
      return C_Other;
36338
3.80k
    default:
36339
3.80k
      break;
36340
243k
    }
36341
243k
  }
36342
231k
  else if (Constraint.size() == 2) {
36343
64
    switch (Constraint[0]) {
36344
0
    default:
36345
0
      break;
36346
64
    case 'Y':
36347
64
      switch (Constraint[1]) {
36348
0
      default:
36349
0
        break;
36350
8
      case 'z':
36351
8
      case '0':
36352
8
        return C_Register;
36353
56
      case 'i':
36354
56
      case 'm':
36355
56
      case 'k':
36356
56
      case 't':
36357
56
      case '2':
36358
56
        return C_RegisterClass;
36359
235k
      }
36360
231k
    }
36361
231k
  }
36362
235k
  return TargetLowering::getConstraintType(Constraint);
36363
235k
}
36364
36365
/// Examine constraint type and operand type and determine a weight value.
36366
/// This object must already have been set up with the operand type
36367
/// and the current alternative constraint selected.
36368
TargetLowering::ConstraintWeight
36369
  X86TargetLowering::getSingleConstraintMatchWeight(
36370
741
    AsmOperandInfo &info, const char *constraint) const {
36371
741
  ConstraintWeight weight = CW_Invalid;
36372
741
  Value *CallOperandVal = info.CallOperandVal;
36373
741
    // If we don't have a value, we can't do a match,
36374
741
    // but allow it at the lowest weight.
36375
741
  if (!CallOperandVal)
36376
204
    return CW_Default;
36377
537
  Type *type = CallOperandVal->getType();
36378
537
  // Look at the constraint type.
36379
537
  switch (*constraint) {
36380
501
  default:
36381
501
    weight = TargetLowering::getSingleConstraintMatchWeight(info, constraint);
36382
501
    LLVM_FALLTHROUGH;
36383
513
  case 'R':
36384
513
  case 'q':
36385
513
  case 'Q':
36386
513
  case 'a':
36387
513
  case 'b':
36388
513
  case 'c':
36389
513
  case 'd':
36390
513
  case 'S':
36391
513
  case 'D':
36392
513
  case 'A':
36393
513
    if (CallOperandVal->getType()->isIntegerTy())
36394
285
      weight = CW_SpecificReg;
36395
513
    break;
36396
0
  case 'f':
36397
0
  case 't':
36398
0
  case 'u':
36399
0
    if (type->isFloatingPointTy())
36400
0
      weight = CW_SpecificReg;
36401
0
    break;
36402
3
  case 'y':
36403
3
    if (type->isX86_MMXTy() && Subtarget.hasMMX())
36404
0
      weight = CW_SpecificReg;
36405
3
    break;
36406
0
  case 'Y': {
36407
0
    unsigned Size = StringRef(constraint).size();
36408
0
    // Pick 'i' as the next char as 'Yi' and 'Y' are synonymous, when matching 'Y'
36409
0
    char NextChar = Size == 2 ? constraint[1] : 'i';
36410
0
    if (Size > 2)
36411
0
      break;
36412
0
    switch (NextChar) {
36413
0
      default:
36414
0
        return CW_Invalid;
36415
0
      // XMM0
36416
0
      case 'z':
36417
0
      case '0':
36418
0
        if ((type->getPrimitiveSizeInBits() == 128) && Subtarget.hasSSE1())
36419
0
          return CW_SpecificReg;
36420
0
        return CW_Invalid;
36421
0
      // Conditional OpMask regs (AVX512)
36422
0
      case 'k':
36423
0
        if ((type->getPrimitiveSizeInBits() == 64) && Subtarget.hasAVX512())
36424
0
          return CW_Register;
36425
0
        return CW_Invalid;
36426
0
      // Any MMX reg
36427
0
      case 'm':
36428
0
        if (type->isX86_MMXTy() && Subtarget.hasMMX())
36429
0
          return weight;
36430
0
        return CW_Invalid;
36431
0
      // Any SSE reg when ISA >= SSE2, same as 'Y'
36432
0
      case 'i':
36433
0
      case 't':
36434
0
      case '2':
36435
0
        if (!Subtarget.hasSSE2())
36436
0
          return CW_Invalid;
36437
0
        break;
36438
0
    }
36439
0
    // Fall through (handle "Y" constraint).
36440
0
    LLVM_FALLTHROUGH;
36441
0
  }
36442
0
  case 'v':
36443
0
    if (
(type->getPrimitiveSizeInBits() == 512) && 0
Subtarget.hasAVX512()0
)
36444
0
      weight = CW_Register;
36445
0
    LLVM_FALLTHROUGH;
36446
3
  case 'x':
36447
3
    if (((type->getPrimitiveSizeInBits() == 128) && Subtarget.hasSSE1()) ||
36448
3
        ((type->getPrimitiveSizeInBits() == 256) && Subtarget.hasFp256()))
36449
0
      weight = CW_Register;
36450
3
    break;
36451
0
  case 'k':
36452
0
    // Enable conditional vector operations using %k<#> registers.
36453
0
    if ((type->getPrimitiveSizeInBits() == 64) && Subtarget.hasAVX512())
36454
0
      weight = CW_Register;
36455
0
    break;
36456
3
  case 'I':
36457
3
    if (ConstantInt *C = dyn_cast<ConstantInt>(info.CallOperandVal)) {
36458
3
      if (C->getZExtValue() <= 31)
36459
3
        weight = CW_Constant;
36460
3
    }
36461
3
    break;
36462
3
  case 'J':
36463
3
    if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
36464
3
      if (C->getZExtValue() <= 63)
36465
3
        weight = CW_Constant;
36466
3
    }
36467
3
    break;
36468
3
  case 'K':
36469
3
    if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
36470
3
      if ((C->getSExtValue() >= -0x80) && (C->getSExtValue() <= 0x7f))
36471
3
        weight = CW_Constant;
36472
3
    }
36473
3
    break;
36474
0
  case 'L':
36475
0
    if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
36476
0
      if ((C->getZExtValue() == 0xff) || (C->getZExtValue() == 0xffff))
36477
0
        weight = CW_Constant;
36478
0
    }
36479
0
    break;
36480
0
  case 'M':
36481
0
    if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
36482
0
      if (C->getZExtValue() <= 3)
36483
0
        weight = CW_Constant;
36484
0
    }
36485
0
    break;
36486
3
  case 'N':
36487
3
    if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
36488
3
      if (C->getZExtValue() <= 0xff)
36489
3
        weight = CW_Constant;
36490
3
    }
36491
3
    break;
36492
0
  case 'G':
36493
0
  case 'C':
36494
0
    if (isa<ConstantFP>(CallOperandVal)) {
36495
0
      weight = CW_Constant;
36496
0
    }
36497
0
    break;
36498
3
  case 'e':
36499
3
    if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
36500
3
      if ((C->getSExtValue() >= -0x80000000LL) &&
36501
3
          (C->getSExtValue() <= 0x7fffffffLL))
36502
3
        weight = CW_Constant;
36503
3
    }
36504
3
    break;
36505
3
  case 'Z':
36506
3
    if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
36507
3
      if (C->getZExtValue() <= 0xffffffff)
36508
3
        weight = CW_Constant;
36509
3
    }
36510
501
    break;
36511
537
  }
36512
537
  return weight;
36513
537
}
36514
36515
/// Try to replace an X constraint, which matches anything, with another that
36516
/// has more specific requirements based on the type of the corresponding
36517
/// operand.
36518
const char *X86TargetLowering::
36519
63
LowerXConstraint(EVT ConstraintVT) const {
36520
63
  // FP X constraints get lowered to SSE1/2 registers if available, otherwise
36521
63
  // 'f' like normal targets.
36522
63
  if (ConstraintVT.isFloatingPoint()) {
36523
24
    if (Subtarget.hasSSE2())
36524
12
      return "Y";
36525
12
    if (Subtarget.hasSSE1())
36526
0
      return "x";
36527
51
  }
36528
51
36529
51
  return TargetLowering::LowerXConstraint(ConstraintVT);
36530
51
}
36531
36532
/// Lower the specified operand into the Ops vector.
36533
/// If it is invalid, don't add anything to Ops.
36534
void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op,
36535
                                                     std::string &Constraint,
36536
                                                     std::vector<SDValue>&Ops,
36537
156
                                                     SelectionDAG &DAG) const {
36538
156
  SDValue Result;
36539
156
36540
156
  // Only support length 1 constraints for now.
36541
156
  if (Constraint.length() > 1) return;
36542
156
36543
156
  char ConstraintLetter = Constraint[0];
36544
156
  switch (ConstraintLetter) {
36545
26
  default: break;
36546
3
  case 'I':
36547
3
    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
36548
3
      if (C->getZExtValue() <= 31) {
36549
3
        Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
36550
3
                                       Op.getValueType());
36551
3
        break;
36552
3
      }
36553
0
    }
36554
0
    return;
36555
5
  case 'J':
36556
5
    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
36557
5
      if (C->getZExtValue() <= 63) {
36558
5
        Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
36559
5
                                       Op.getValueType());
36560
5
        break;
36561
5
      }
36562
0
    }
36563
0
    return;
36564
2
  case 'K':
36565
2
    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
36566
2
      if (isInt<8>(C->getSExtValue())) {
36567
2
        Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
36568
2
                                       Op.getValueType());
36569
2
        break;
36570
2
      }
36571
0
    }
36572
0
    return;
36573
2
  case 'L':
36574
2
    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
36575
2
      if (C->getZExtValue() == 0xff || C->getZExtValue() == 0xffff ||
36576
2
          (Subtarget.is64Bit() && C->getZExtValue() == 0xffffffff)) {
36577
2
        Result = DAG.getTargetConstant(C->getSExtValue(), SDLoc(Op),
36578
2
                                       Op.getValueType());
36579
2
        break;
36580
2
      }
36581
0
    }
36582
0
    return;
36583
1
  case 'M':
36584
1
    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
36585
1
      if (C->getZExtValue() <= 3) {
36586
1
        Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
36587
1
                                       Op.getValueType());
36588
1
        break;
36589
1
      }
36590
0
    }
36591
0
    return;
36592
5
  case 'N':
36593
5
    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
36594
4
      if (C->getZExtValue() <= 255) {
36595
4
        Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
36596
4
                                       Op.getValueType());
36597
4
        break;
36598
4
      }
36599
1
    }
36600
1
    return;
36601
1
  case 'O':
36602
1
    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
36603
1
      if (C->getZExtValue() <= 127) {
36604
1
        Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
36605
1
                                       Op.getValueType());
36606
1
        break;
36607
1
      }
36608
0
    }
36609
0
    return;
36610
3
  case 'e': {
36611
3
    // 32-bit signed value
36612
3
    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
36613
3
      if (ConstantInt::isValueValidForType(Type::getInt32Ty(*DAG.getContext()),
36614
3
                                           C->getSExtValue())) {
36615
3
        // Widen to 64 bits here to get it sign extended.
36616
3
        Result = DAG.getTargetConstant(C->getSExtValue(), SDLoc(Op), MVT::i64);
36617
3
        break;
36618
3
      }
36619
0
    // FIXME gcc accepts some relocatable values here too, but only in certain
36620
0
    // memory models; it's complicated.
36621
0
    }
36622
0
    return;
36623
0
  }
36624
3
  case 'Z': {
36625
3
    // 32-bit unsigned value
36626
3
    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
36627
3
      if (ConstantInt::isValueValidForType(Type::getInt32Ty(*DAG.getContext()),
36628
3
                                           C->getZExtValue())) {
36629
3
        Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
36630
3
                                       Op.getValueType());
36631
3
        break;
36632
3
      }
36633
0
    }
36634
0
    // FIXME gcc accepts some relocatable values here too, but only in certain
36635
0
    // memory models; it's complicated.
36636
0
    return;
36637
0
  }
36638
105
  case 'i': {
36639
105
    // Literal immediates are always ok.
36640
105
    if (ConstantSDNode *CST = dyn_cast<ConstantSDNode>(Op)) {
36641
54
      // Widen to 64 bits here to get it sign extended.
36642
54
      Result = DAG.getTargetConstant(CST->getSExtValue(), SDLoc(Op), MVT::i64);
36643
54
      break;
36644
54
    }
36645
51
36646
51
    // In any sort of PIC mode addresses need to be computed at runtime by
36647
51
    // adding in a register or some sort of table lookup.  These can't
36648
51
    // be used as immediates.
36649
51
    if (Subtarget.isPICStyleGOT() || Subtarget.isPICStyleStubPIC())
36650
2
      return;
36651
49
36652
49
    // If we are in non-pic codegen mode, we allow the address of a global (with
36653
49
    // an optional displacement) to be used with 'i'.
36654
49
    GlobalAddressSDNode *GA = nullptr;
36655
49
    int64_t Offset = 0;
36656
49
36657
49
    // Match either (GA), (GA+C), (GA+C1+C2), etc.
36658
53
    while (1) {
36659
53
      if ((GA = dyn_cast<GlobalAddressSDNode>(Op))) {
36660
28
        Offset += GA->getOffset();
36661
28
        break;
36662
25
      } else if (Op.getOpcode() == ISD::ADD) {
36663
4
        if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
36664
4
          Offset += C->getZExtValue();
36665
4
          Op = Op.getOperand(0);
36666
4
          continue;
36667
4
        }
36668
21
      } else if (Op.getOpcode() == ISD::SUB) {
36669
0
        if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
36670
0
          Offset += -C->getZExtValue();
36671
0
          Op = Op.getOperand(0);
36672
0
          continue;
36673
0
        }
36674
21
      }
36675
21
36676
21
      // Otherwise, this isn't something we can handle, reject it.
36677
21
      return;
36678
21
    }
36679
49
36680
28
    const GlobalValue *GV = GA->getGlobal();
36681
28
    // If we require an extra load to get this address, as in PIC mode, we
36682
28
    // can't accept it.
36683
28
    if (isGlobalStubReference(Subtarget.classifyGlobalReference(GV)))
36684
1
      return;
36685
27
36686
27
    Result = DAG.getTargetGlobalAddress(GV, SDLoc(Op),
36687
27
                                        GA->getValueType(0), Offset);
36688
27
    break;
36689
27
  }
36690
131
  }
36691
131
36692
131
  if (Result.getNode()) {
36693
105
    Ops.push_back(Result);
36694
105
    return;
36695
105
  }
36696
26
  return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
36697
26
}
36698
36699
/// Check if \p RC is a general purpose register class.
36700
/// I.e., GR* or one of their variant.
36701
514
static bool isGRClass(const TargetRegisterClass &RC) {
36702
514
  return RC.hasSuperClassEq(&X86::GR8RegClass) ||
36703
509
         RC.hasSuperClassEq(&X86::GR16RegClass) ||
36704
13
         RC.hasSuperClassEq(&X86::GR32RegClass) ||
36705
13
         RC.hasSuperClassEq(&X86::GR64RegClass) ||
36706
6
         RC.hasSuperClassEq(&X86::LOW32_ADDR_ACCESS_RBPRegClass);
36707
514
}
36708
36709
/// Check if \p RC is a vector register class.
36710
/// I.e., FR* / VR* or one of their variant.
36711
1
static bool isFRClass(const TargetRegisterClass &RC) {
36712
1
  return RC.hasSuperClassEq(&X86::FR32XRegClass) ||
36713
0
         RC.hasSuperClassEq(&X86::FR64XRegClass) ||
36714
0
         RC.hasSuperClassEq(&X86::VR128XRegClass) ||
36715
0
         RC.hasSuperClassEq(&X86::VR256XRegClass) ||
36716
0
         RC.hasSuperClassEq(&X86::VR512RegClass);
36717
1
}
36718
36719
std::pair<unsigned, const TargetRegisterClass *>
36720
X86TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
36721
                                                StringRef Constraint,
36722
92.0k
                                                MVT VT) const {
36723
92.0k
  // First, see if this is a constraint that directly corresponds to an LLVM
36724
92.0k
  // register class.
36725
92.0k
  if (Constraint.size() == 1) {
36726
2.41k
    // GCC Constraint Letters
36727
2.41k
    switch (Constraint[0]) {
36728
8
    default: break;
36729
2.41k
      // TODO: Slight differences here in allocation order and leaving
36730
2.41k
      // RIP in the class. Do they matter any more here than they do
36731
2.41k
      // in the normal allocation?
36732
0
    case 'k':
36733
0
      if (Subtarget.hasAVX512()) {
36734
0
        //  Only supported in AVX512 or later.
36735
0
        switch (VT.SimpleTy) {
36736
0
        default: break;
36737
0
        case MVT::i32:
36738
0
          return std::make_pair(0U, &X86::VK32RegClass);
36739
0
        case MVT::i16:
36740
0
          return std::make_pair(0U, &X86::VK16RegClass);
36741
0
        case MVT::i8:
36742
0
          return std::make_pair(0U, &X86::VK8RegClass);
36743
0
        case MVT::i1:
36744
0
          return std::make_pair(0U, &X86::VK1RegClass);
36745
0
        case MVT::i64:
36746
0
          return std::make_pair(0U, &X86::VK64RegClass);
36747
0
        }
36748
0
      }
36749
0
      break;
36750
21
    case 'q':   // GENERAL_REGS in 64-bit mode, Q_REGS in 32-bit mode.
36751
21
      if (Subtarget.is64Bit()) {
36752
11
        if (VT == MVT::i32 || VT == MVT::f32)
36753
2
          return std::make_pair(0U, &X86::GR32RegClass);
36754
9
        if (VT == MVT::i16)
36755
0
          return std::make_pair(0U, &X86::GR16RegClass);
36756
9
        if (VT == MVT::i8 || VT == MVT::i1)
36757
2
          return std::make_pair(0U, &X86::GR8RegClass);
36758
7
        if (VT == MVT::i64 || VT == MVT::f64)
36759
7
          return std::make_pair(0U, &X86::GR64RegClass);
36760
0
        break;
36761
0
      }
36762
10
      LLVM_FALLTHROUGH;
36763
10
      // 32-bit fallthrough
36764
16
    case 'Q':   // Q_REGS
36765
16
      if (
VT == MVT::i32 || 16
VT == MVT::f324
)
36766
13
        return std::make_pair(0U, &X86::GR32_ABCDRegClass);
3
      if (VT == MVT::i16)
)
36768
2
        return std::make_pair(0U, &X86::GR16_ABCDRegClass);
36769
1
      if (VT == MVT::i8 || VT == MVT::i1)
36770
1
        return std::make_pair(0U, &X86::GR8_ABCD_LRegClass);
36771
0
      if (VT == MVT::i64)
36772
0
        return std::make_pair(0U, &X86::GR64_ABCDRegClass);
36773
0
      break;
36774
478
    case 'r':   // GENERAL_REGS
36775
478
    case 'l':   // INDEX_REGS
36776
478
      if (VT == MVT::i8 || VT == MVT::i1)
36777
7
        return std::make_pair(0U, &X86::GR8RegClass);
36778
471
      if (VT == MVT::i16)
36779
3
        return std::make_pair(0U, &X86::GR16RegClass);
36780
468
      if (VT == MVT::i32 || VT == MVT::f32 || !Subtarget.is64Bit())
36781
363
        return std::make_pair(0U, &X86::GR32RegClass);
36782
105
      return std::make_pair(0U, &X86::GR64RegClass);
36783
3
    case 'R':   // LEGACY_REGS
36784
3
      if (VT == MVT::i8 || VT == MVT::i1)
36785
0
        return std::make_pair(0U, &X86::GR8_NOREXRegClass);
36786
3
      if (VT == MVT::i16)
36787
0
        return std::make_pair(0U, &X86::GR16_NOREXRegClass);
36788
3
      if (VT == MVT::i32 || !Subtarget.is64Bit())
36789
2
        return std::make_pair(0U, &X86::GR32_NOREXRegClass);
36790
1
      return std::make_pair(0U, &X86::GR64_NOREXRegClass);
36791
12
    case 'f':  // FP Stack registers.
36792
12
      // If SSE is enabled for this VT, use f80 to ensure the isel moves the
36793
12
      // value to the correct fpstack register class.
36794
12
      if (VT == MVT::f32 && !isScalarFPTypeInSSEReg(VT))
36795
0
        return std::make_pair(0U, &X86::RFP32RegClass);
36796
12
      if (VT == MVT::f64 && !isScalarFPTypeInSSEReg(VT))
36797
11
        return std::make_pair(0U, &X86::RFP64RegClass);
36798
1
      return std::make_pair(0U, &X86::RFP80RegClass);
36799
117
    case 'y':   // MMX_REGS if MMX allowed.
36800
117
      if (!Subtarget.hasMMX()) break;
36801
117
      return std::make_pair(0U, &X86::VR64RegClass);
36802
17
    case 'Y':   // SSE_REGS if SSE2 allowed
36803
17
      if (!Subtarget.hasSSE2()) break;
36804
17
      LLVM_FALLTHROUGH;
36805
1.77k
    case 'v':
36806
1.77k
    case 'x':   // SSE_REGS if SSE1 allowed or AVX_REGS if AVX allowed
36807
1.77k
      if (!Subtarget.hasSSE1()) break;
36808
1.77k
      bool VConstraint = (Constraint[0] == 'v');
36809
1.77k
36810
1.77k
      switch (VT.SimpleTy) {
36811
0
      default: break;
36812
1.77k
      // Scalar SSE types.
36813
20
      case MVT::f32:
36814
20
      case MVT::i32:
36815
20
        if (VConstraint && Subtarget.hasAVX512() && Subtarget.hasVLX())
36816
0
          return std::make_pair(0U, &X86::FR32XRegClass);
36817
20
        return std::make_pair(0U, &X86::FR32RegClass);
36818
99
      case MVT::f64:
36819
99
      case MVT::i64:
36820
99
        if (VConstraint && Subtarget.hasVLX())
36821
18
          return std::make_pair(0U, &X86::FR64XRegClass);
36822
81
        return std::make_pair(0U, &X86::FR64RegClass);
36823
81
      // TODO: Handle f128 and i128 in FR128RegClass after it is tested well.
36824
81
      // Vector types.
36825
1.50k
      case MVT::v16i8:
36826
1.50k
      case MVT::v8i16:
36827
1.50k
      case MVT::v4i32:
36828
1.50k
      case MVT::v2i64:
36829
1.50k
      case MVT::v4f32:
36830
1.50k
      case MVT::v2f64:
36831
1.50k
        if (VConstraint && Subtarget.hasVLX())
36832
20
          return std::make_pair(0U, &X86::VR128XRegClass);
36833
1.48k
        return std::make_pair(0U, &X86::VR128RegClass);
36834
1.48k
      // AVX types.
36835
126
      case MVT::v32i8:
36836
126
      case MVT::v16i16:
36837
126
      case MVT::v8i32:
36838
126
      case MVT::v4i64:
36839
126
      case MVT::v8f32:
36840
126
      case MVT::v4f64:
36841
126
        if (VConstraint && Subtarget.hasVLX())
36842
50
          return std::make_pair(0U, &X86::VR256XRegClass);
36843
76
        return std::make_pair(0U, &X86::VR256RegClass);
36844
27
      case MVT::v8f64:
36845
27
      case MVT::v16f32:
36846
27
      case MVT::v16i32:
36847
27
      case MVT::v8i64:
36848
27
        return std::make_pair(0U, &X86::VR512RegClass);
36849
0
      }
36850
0
      break;
36851
2.41k
    }
36852
89.5k
  } else if (Constraint.size() == 2 && Constraint[0] == 'Y') {
36853
16
    switch (Constraint[1]) {
36854
0
    default:
36855
0
      break;
36856
13
    case 'i':
36857
13
    case 't':
36858
13
    case '2':
36859
13
      return getRegForInlineAsmConstraint(TRI, "Y", VT);
36860
1
    case 'm':
36861
1
      if (!Subtarget.hasMMX()) break;
36862
1
      return std::make_pair(0U, &X86::VR64RegClass);
36863
2
    case 'z':
36864
2
    case '0':
36865
2
      if (!Subtarget.hasSSE1()) break;
36866
2
      return std::make_pair(X86::XMM0, &X86::VR128RegClass);
36867
0
    case 'k':
36868
0
      // This register class doesn't allocate k0 for masked vector operation.
36869
0
      if (Subtarget.hasAVX512()) { // Only supported in AVX512.
36870
0
        switch (VT.SimpleTy) {
36871
0
        default: break;
36872
0
        case MVT::i32:
36873
0
          return std::make_pair(0U, &X86::VK32WMRegClass);
36874
0
        case MVT::i16:
36875
0
          return std::make_pair(0U, &X86::VK16WMRegClass);
36876
0
        case MVT::i8:
36877
0
          return std::make_pair(0U, &X86::VK8WMRegClass);
36878
0
        case MVT::i1:
36879
0
          return std::make_pair(0U, &X86::VK1WMRegClass);
36880
0
        case MVT::i64:
36881
0
          return std::make_pair(0U, &X86::VK64WMRegClass);
36882
0
        }
36883
0
      }
36884
0
      break;
36885
89.5k
    }
36886
89.5k
  }
36887
89.5k
36888
89.5k
  // Use the default implementation in TargetLowering to convert the register
36889
89.5k
  // constraint into a member of a register class.
36890
89.5k
  std::pair<unsigned, const TargetRegisterClass*> Res;
36891
89.5k
  Res = TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
36892
89.5k
36893
89.5k
  // Not found as a standard register?
36894
89.5k
  if (!Res.second) {
36895
22.3k
    // Map st(0) -> st(7) -> ST0
36896
22.3k
    if (Constraint.size() == 7 && Constraint[0] == '{' &&
36897
8.73k
        tolower(Constraint[1]) == 's' &&
36898
18
        tolower(Constraint[2]) == 't' &&
36899
18
        Constraint[3] == '(' &&
36900
18
        (Constraint[4] >= '0' && Constraint[4] <= '7') &&
36901
18
        Constraint[5] == ')' &&
36902
22.3k
        Constraint[6] == '}') {
36903
18
36904
18
      Res.first = X86::FP0+Constraint[4]-'0';
36905
18
      Res.second = &X86::RFP80RegClass;
36906
18
      return Res;
36907
18
    }
36908
22.2k
36909
22.2k
    // GCC allows "st(0)" to be called just plain "st".
36910
22.2k
    if (StringRef("{st}").equals_lower(Constraint)) {
36911
48
      Res.first = X86::FP0;
36912
48
      Res.second = &X86::RFP80RegClass;
36913
48
      return Res;
36914
48
    }
36915
22.2k
36916
22.2k
    // flags -> EFLAGS
36917
22.2k
    if (StringRef("{flags}").equals_lower(Constraint)) {
36918
8.71k
      Res.first = X86::EFLAGS;
36919
8.71k
      Res.second = &X86::CCRRegClass;
36920
8.71k
      return Res;
36921
8.71k
    }
36922
13.5k
36923
13.5k
    // 'A' means [ER]AX + [ER]DX.
36924
13.5k
    if (Constraint == "A") {
36925
8
      if (Subtarget.is64Bit()) {
36926
1
        Res.first = X86::RAX;
36927
1
        Res.second = &X86::GR64_ADRegClass;
36928
8
      } else {
36929
7
        assert((Subtarget.is32Bit() || Subtarget.is16Bit()) &&
36930
7
               "Expecting 64, 32 or 16 bit subtarget");
36931
7
        Res.first = X86::EAX;
36932
7
        Res.second = &X86::GR32_ADRegClass;
36933
7
      }
36934
8
      return Res;
36935
8
    }
36936
13.5k
    return Res;
36937
13.5k
  }
36938
67.2k
36939
67.2k
  // Otherwise, check to see if this is a register class of the wrong value
36940
67.2k
  // type.  For example, we want to map "{ax},i32" -> {eax}, we don't want it to
36941
67.2k
  // turn into {ax},{dx}.
36942
67.2k
  // MVT::Other is used to specify clobber names.
36943
67.2k
  if (TRI->isTypeLegalForClass(*Res.second, VT) || VT == MVT::Other)
36944
66.7k
    return Res;   // Correct type already, nothing to do.
36945
514
36946
514
  // Get a matching integer of the correct size. i.e. "ax" with MVT::32 should
36947
514
  // return "eax". This should even work for things like getting 64bit integer
36948
514
  // registers when given an f64 type.
36949
514
  const TargetRegisterClass *Class = Res.second;
36950
514
  // The generic code will match the first register class that contains the
36951
514
  // given register. Thus, based on the ordering of the tablegened file,
36952
514
  // the "plain" GR classes might not come first.
36953
514
  // Therefore, use a helper method.
36954
514
  if (isGRClass(*Class)) {
36955
513
    unsigned Size = VT.getSizeInBits();
36956
513
    if (Size == 1) Size = 8;
36957
513
    unsigned DestReg = getX86SubSuperRegisterOrZero(Res.first, Size);
36958
513
    if (DestReg > 0) {
36959
512
      bool is64Bit = Subtarget.is64Bit();
36960
512
      const TargetRegisterClass *RC =
36961
13
          Size == 8 ? 
(is64Bit ? 13
&X86::GR8RegClass11
:
&X86::GR8_NOREXRegClass2
)
36962
499
        : Size == 16 ? (is64Bit ? &X86::GR16RegClass : &X86::GR16_NOREXRegClass)
36963
496
        : Size == 32 ? (is64Bit ? &X86::GR32RegClass : &X86::GR32_NOREXRegClass)
36964
63
        : &X86::GR64RegClass;
36965
512
      if (RC->contains(DestReg))
36966
510
        Res = std::make_pair(DestReg, RC);
36967
513
    } else {
36968
1
      // No register found/type mismatch.
36969
1
      Res.first = 0;
36970
1
      Res.second = nullptr;
36971
1
    }
36972
514
  } else if (isFRClass(*Class)) {
36973
1
    // Handle references to XMM physical registers that got mapped into the
36974
1
    // wrong class.  This can happen with constraints like {xmm0} where the
36975
1
    // target independent register mapper will just pick the first match it can
36976
1
    // find, ignoring the required type.
36977
1
36978
1
    // TODO: Handle f128 and i128 in FR128RegClass after it is tested well.
36979
1
    if (VT == MVT::f32 || VT == MVT::i32)
36980
0
      Res.second = &X86::FR32RegClass;
36981
1
    else 
if (1
VT == MVT::f64 || 1
VT == MVT::i641
)
36982
1
      Res.second = &X86::FR64RegClass;
36983
0
    else 
if (0
TRI->isTypeLegalForClass(X86::VR128RegClass, VT)0
)
36984
0
      Res.second = &X86::VR128RegClass;
36985
0
    else 
if (0
TRI->isTypeLegalForClass(X86::VR256RegClass, VT)0
)
36986
0
      Res.second = &X86::VR256RegClass;
36987
0
    else 
if (0
TRI->isTypeLegalForClass(X86::VR512RegClass, VT)0
)
36988
0
      Res.second = &X86::VR512RegClass;
36989
0
    else {
36990
0
      // Type mismatch and not a clobber: Return an error;
36991
0
      Res.first = 0;
36992
0
      Res.second = nullptr;
36993
0
    }
36994
1
  }
36995
92.0k
36996
92.0k
  return Res;
36997
92.0k
}
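
As a hedged user-level illustration (not part of X86ISelLowering.cpp) of the 'A' constraint handled above: on a 32-bit subtarget it names the EDX:EAX pair (GR32_AD), and on a 64-bit subtarget it resolves to RAX (GR64_AD). The helper name below is invented for the example and assumes GCC/Clang inline-asm syntax on a 32-bit x86 target.

// Hypothetical example, assuming a 32-bit x86 target.
static inline unsigned long long read_tsc(void) {
  unsigned long long tsc;
  // "=A" binds the 64-bit result to the EDX:EAX register pair, which is the
  // GR32_AD mapping chosen by the 'A' constraint code above.
  __asm__ volatile("rdtsc" : "=A"(tsc));
  return tsc;
}

The same lookup also fixes size mismatches for plain GR constraints, e.g. an i32 operand tied to "{ax}" is widened to EAX rather than being split into the {ax},{dx} pair.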

int X86TargetLowering::getScalingFactorCost(const DataLayout &DL,
                                            const AddrMode &AM, Type *Ty,
                                            unsigned AS) const {
  // Scaling factors are not free at all.
  // An indexed folded instruction, i.e., inst (reg1, reg2, scale),
  // will take 2 allocations in the out-of-order engine instead of 1
  // for plain addressing mode, i.e., inst (reg1).
  // E.g.,
  // vaddps (%rsi,%rdx), %ymm0, %ymm1
  // requires two allocations (one for the load, one for the computation),
  // whereas:
  // vaddps (%rsi), %ymm0, %ymm1
  // requires just 1 allocation, i.e., freeing allocations for other operations
  // and having fewer micro operations to execute.
  //
  // For some X86 architectures, this is even worse because, for instance for
  // stores, the complex addressing mode forces the instruction to use the
  // "load" ports instead of the dedicated "store" port.
  // E.g., on Haswell:
  // vmovaps %ymm1, (%r8, %rdi) can use port 2 or 3.
  // vmovaps %ymm1, (%r8) can use port 2, 3, or 7.
  if (isLegalAddressingMode(DL, AM, Ty, AS))
    // Scale represents reg2 * scale, thus account for 1
    // as soon as we use a second register.
    return AM.Scale != 0;
  return -1;
}
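
As a rough illustration (not from this file), the loop below typically lets the load fold into a scaled addressing mode; under the cost model above that mode is charged 1 (AM.Scale != 0), while a plain single-register address costs 0. The function name is invented for the example.

// Hypothetical example; the exact addressing mode chosen depends on LSR and
// instruction selection, so the asm in the comment is only indicative.
float sum(const float *p, long n) {
  float s = 0.0f;
  for (long i = 0; i < n; ++i)
    s += p[i];  // the load may fold as, e.g., addss (%rdi,%rcx,4), %xmm0
  return s;
}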

bool X86TargetLowering::isIntDivCheap(EVT VT, AttributeList Attr) const {
  // Integer division on x86 is expensive. However, when aggressively optimizing
  // for code size, we prefer to use a div instruction, as it is usually smaller
  // than the alternative sequence.
  // The exception to this is vector division. Since x86 doesn't have vector
  // integer division, leaving the division as-is is a loss even in terms of
  // size, because it will have to be scalarized, while the alternative code
  // sequence can be performed in vector form.
  bool OptSize =
      Attr.hasAttribute(AttributeList::FunctionIndex, Attribute::MinSize);
  return OptSize && !VT.isVector();
}
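
A hedged example of the MinSize case (not from this file), assuming Clang's minsize attribute: a scalar remainder is left as a single div/idiv because that is shorter, whereas a vector divide would still have to be scalarized, which is why vector types are excluded above. The function name is invented for the example.

// Hypothetical example; 'minsize' mirrors building the function at -Oz.
__attribute__((minsize)) unsigned last_digit(unsigned x) {
  return x % 10;  // kept as a div under MinSize instead of a multiply/shift sequence
}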

void X86TargetLowering::initializeSplitCSR(MachineBasicBlock *Entry) const {
  if (!Subtarget.is64Bit())
    return;

  // Update IsSplitCSR in X86MachineFunctionInfo.
  X86MachineFunctionInfo *AFI =
    Entry->getParent()->getInfo<X86MachineFunctionInfo>();
  AFI->setIsSplitCSR(true);
}

void X86TargetLowering::insertCopiesSplitCSR(
    MachineBasicBlock *Entry,
    const SmallVectorImpl<MachineBasicBlock *> &Exits) const {
  const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
  const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent());
  if (!IStart)
    return;

  const TargetInstrInfo *TII = Subtarget.getInstrInfo();
  MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo();
  MachineBasicBlock::iterator MBBI = Entry->begin();
  for (const MCPhysReg *I = IStart; *I; ++I) {
    const TargetRegisterClass *RC = nullptr;
    if (X86::GR64RegClass.contains(*I))
      RC = &X86::GR64RegClass;
    else
      llvm_unreachable("Unexpected register class in CSRsViaCopy!");

    unsigned NewVR = MRI->createVirtualRegister(RC);
    // Create copy from CSR to a virtual register.
    // FIXME: this currently does not emit CFI pseudo-instructions, it works
    // fine for CXX_FAST_TLS since the C++-style TLS access functions should be
    // nounwind. If we want to generalize this later, we may need to emit
    // CFI pseudo-instructions.
    assert(Entry->getParent()->getFunction()->hasFnAttribute(
               Attribute::NoUnwind) &&
           "Function should be nounwind in insertCopiesSplitCSR!");
    Entry->addLiveIn(*I);
    BuildMI(*Entry, MBBI, DebugLoc(), TII->get(TargetOpcode::COPY), NewVR)
        .addReg(*I);

    // Insert the copy-back instructions right before the terminator.
    for (auto *Exit : Exits)
      BuildMI(*Exit, Exit->getFirstTerminator(), DebugLoc(),
              TII->get(TargetOpcode::COPY), *I)
          .addReg(NewVR);
  }
}

bool X86TargetLowering::supportSwiftError() const {
  return Subtarget.is64Bit();
}

/// Returns the name of the symbol used to emit stack probes or the empty
/// string if not applicable.
StringRef X86TargetLowering::getStackProbeSymbolName(MachineFunction &MF) const {
  // If the function specifically requests stack probes, emit them.
  if (MF.getFunction()->hasFnAttribute("probe-stack"))
    return MF.getFunction()->getFnAttribute("probe-stack").getValueAsString();

  // Generally, if we aren't on Windows, the platform ABI does not include
  // support for stack probes, so don't emit them.
  if (!Subtarget.isOSWindows() || Subtarget.isTargetMachO())
    return "";

  // We need a stack probe to conform to the Windows ABI. Choose the right
  // symbol.
  if (Subtarget.is64Bit())
    return Subtarget.isTargetCygMing() ? "___chkstk_ms" : "__chkstk";
  return Subtarget.isTargetCygMing() ? "_alloca" : "_chkstk";
}
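
A minimal sketch (not from this file), assuming the usual llvm::Function API: attaching the "probe-stack" string attribute makes getStackProbeSymbolName return that value verbatim, before any OS-based defaulting. The helper and the symbol name below are invented for illustration.

#include "llvm/IR/Function.h"

// Hypothetical helper; "__my_stack_probe" is an illustrative symbol name.
void requestCustomStackProbe(llvm::Function &F) {
  // getStackProbeSymbolName() checks this attribute first, so the named
  // symbol is used for stack probes on any target OS.
  F.addFnAttr("probe-stack", "__my_stack_probe");
}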