Coverage Report

Created: 2017-10-03 07:32

/Users/buildslave/jenkins/sharedspace/clang-stage2-coverage-R@2/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
Line
Count
Source
1
//===- InstCombineCalls.cpp -----------------------------------------------===//
2
//
3
//                     The LLVM Compiler Infrastructure
4
//
5
// This file is distributed under the University of Illinois Open Source
6
// License. See LICENSE.TXT for details.
7
//
8
//===----------------------------------------------------------------------===//
9
//
10
// This file implements the visitCall and visitInvoke functions.
11
//
12
//===----------------------------------------------------------------------===//
13
14
#include "InstCombineInternal.h"
15
#include "llvm/ADT/APFloat.h"
16
#include "llvm/ADT/APInt.h"
17
#include "llvm/ADT/ArrayRef.h"
18
#include "llvm/ADT/None.h"
19
#include "llvm/ADT/STLExtras.h"
20
#include "llvm/ADT/SmallVector.h"
21
#include "llvm/ADT/Statistic.h"
22
#include "llvm/ADT/Twine.h"
23
#include "llvm/Analysis/InstructionSimplify.h"
24
#include "llvm/Analysis/MemoryBuiltins.h"
25
#include "llvm/Analysis/ValueTracking.h"
26
#include "llvm/IR/BasicBlock.h"
27
#include "llvm/IR/CallSite.h"
28
#include "llvm/IR/Constant.h"
29
#include "llvm/IR/DataLayout.h"
30
#include "llvm/IR/DerivedTypes.h"
31
#include "llvm/IR/Function.h"
32
#include "llvm/IR/GlobalVariable.h"
33
#include "llvm/IR/InstrTypes.h"
34
#include "llvm/IR/Instruction.h"
35
#include "llvm/IR/Instructions.h"
36
#include "llvm/IR/IntrinsicInst.h"
37
#include "llvm/IR/Intrinsics.h"
38
#include "llvm/IR/LLVMContext.h"
39
#include "llvm/IR/Metadata.h"
40
#include "llvm/IR/PatternMatch.h"
41
#include "llvm/IR/Statepoint.h"
42
#include "llvm/IR/Type.h"
43
#include "llvm/IR/Value.h"
44
#include "llvm/IR/ValueHandle.h"
45
#include "llvm/Support/Casting.h"
46
#include "llvm/Support/Debug.h"
47
#include "llvm/Support/KnownBits.h"
48
#include "llvm/Support/MathExtras.h"
49
#include "llvm/Transforms/Utils/Local.h"
50
#include "llvm/Transforms/Utils/SimplifyLibCalls.h"
51
#include <algorithm>
52
#include <cassert>
53
#include <cstdint>
54
#include <cstring>
55
#include <vector>
56
57
using namespace llvm;
58
using namespace PatternMatch;
59
60
#define DEBUG_TYPE "instcombine"
61
62
STATISTIC(NumSimplified, "Number of library calls simplified");
63
64
static cl::opt<unsigned> UnfoldElementAtomicMemcpyMaxElements(
65
    "unfold-element-atomic-memcpy-max-elements",
66
    cl::init(16),
67
    cl::desc("Maximum number of elements in atomic memcpy the optimizer is "
68
             "allowed to unfold"));
69
70
/// Return the specified type promoted as it would be to pass through a va_arg
71
/// area.
72
4
static Type *getPromotedType(Type *Ty) {
73
4
  if (IntegerType* ITy = dyn_cast<IntegerType>(Ty)) {
74
3
    if (ITy->getBitWidth() < 32)
75
1
      return Type::getInt32Ty(Ty->getContext());
76
3
  }
77
3
  return Ty;
78
3
}
79
80
/// Return a constant boolean vector that has true elements in all positions
81
/// where the input constant data vector has an element with the sign bit set.
82
26
static Constant *getNegativeIsTrueBoolVec(ConstantDataVector *V) {
83
26
  SmallVector<Constant *, 32> BoolVec;
84
26
  IntegerType *BoolTy = Type::getInt1Ty(V->getContext());
85
180
  for (unsigned I = 0, E = V->getNumElements(); I != E; ++I) {
86
154
    Constant *Elt = V->getElementAsConstant(I);
87
154
    assert((isa<ConstantInt>(Elt) || isa<ConstantFP>(Elt)) &&
88
154
           "Unexpected constant data vector element type");
89
154
    bool Sign = V->getElementType()->isIntegerTy()
90
136
                    ? cast<ConstantInt>(Elt)->isNegative()
91
18
                    : cast<ConstantFP>(Elt)->isNegative();
92
154
    BoolVec.push_back(ConstantInt::get(BoolTy, Sign));
93
154
  }
94
26
  return ConstantVector::get(BoolVec);
95
26
}
96
97
Instruction *InstCombiner::SimplifyElementUnorderedAtomicMemCpy(
98
4
    ElementUnorderedAtomicMemCpyInst *AMI) {
99
4
  // Try to unfold this intrinsic into a sequence of explicit atomic loads and
100
4
  // stores.
101
4
  // First check that number of elements is compile time constant.
102
4
  auto *LengthCI = dyn_cast<ConstantInt>(AMI->getLength());
103
4
  if (!LengthCI)
104
0
    return nullptr;
105
4
106
4
  // Check that there are not too many elements.
107
4
  uint64_t LengthInBytes = LengthCI->getZExtValue();
108
4
  uint32_t ElementSizeInBytes = AMI->getElementSizeInBytes();
109
4
  uint64_t NumElements = LengthInBytes / ElementSizeInBytes;
110
4
  if (NumElements >= UnfoldElementAtomicMemcpyMaxElements)
111
1
    return nullptr;
112
3
113
3
  // Only expand if there are elements to copy.
114
3
  if (NumElements > 0) {
115
3
    // Don't unfold into illegal integers
116
3
    uint64_t ElementSizeInBits = ElementSizeInBytes * 8;
117
3
    if (!getDataLayout().isLegalInteger(ElementSizeInBits))
118
1
      return nullptr;
119
2
120
2
    // Cast source and destination to the correct type. Intrinsic input
121
2
    // arguments are usually represented as i8*. Often operands will be
122
2
    // explicitly cast to i8* and we can just strip those casts instead of
123
2
    // inserting new ones. However it's easier to rely on other InstCombine
124
2
    // rules which will cover trivial cases anyway.
125
2
    Value *Src = AMI->getRawSource();
126
2
    Value *Dst = AMI->getRawDest();
127
2
    Type *ElementPointerType =
128
2
        Type::getIntNPtrTy(AMI->getContext(), ElementSizeInBits,
129
2
                           Src->getType()->getPointerAddressSpace());
130
2
131
2
    Value *SrcCasted = Builder.CreatePointerCast(Src, ElementPointerType,
132
2
                                                 "memcpy_unfold.src_casted");
133
2
    Value *DstCasted = Builder.CreatePointerCast(Dst, ElementPointerType,
134
2
                                                 "memcpy_unfold.dst_casted");
135
2
136
10
    for (uint64_t i = 0; i < NumElements; ++i) {
137
8
      // Get current element addresses
138
8
      ConstantInt *ElementIdxCI =
139
8
          ConstantInt::get(AMI->getContext(), APInt(64, i));
140
8
      Value *SrcElementAddr =
141
8
          Builder.CreateGEP(SrcCasted, ElementIdxCI, "memcpy_unfold.src_addr");
142
8
      Value *DstElementAddr =
143
8
          Builder.CreateGEP(DstCasted, ElementIdxCI, "memcpy_unfold.dst_addr");
144
8
145
8
      // Load from the source. Transfer alignment information and mark load as
146
8
      // unordered atomic.
147
8
      LoadInst *Load = Builder.CreateLoad(SrcElementAddr, "memcpy_unfold.val");
148
8
      Load->setOrdering(AtomicOrdering::Unordered);
149
8
      // We know the alignment of the first element. It is also guaranteed by
150
8
      // the verifier that the element size is less than or equal to the first
151
8
      // element's alignment and both of these values are powers of two. This
152
8
      // means that all subsequent accesses are at least element size aligned.
153
8
      // TODO: We can infer better alignment but there is no evidence that this
154
8
      // will matter.
155
2
      Load->setAlignment(i == 0 ? AMI->getParamAlignment(1)
156
6
                                : ElementSizeInBytes);
157
8
      Load->setDebugLoc(AMI->getDebugLoc());
158
8
159
8
      // Store loaded value via unordered atomic store.
160
8
      StoreInst *Store = Builder.CreateStore(Load, DstElementAddr);
161
8
      Store->setOrdering(AtomicOrdering::Unordered);
162
2
      Store->setAlignment(i == 0 ? AMI->getParamAlignment(0)
163
6
                                 : ElementSizeInBytes);
164
8
      Store->setDebugLoc(AMI->getDebugLoc());
165
8
    }
166
3
  }
167
3
168
3
  // Set the number of elements of the copy to 0, it will be deleted on the
169
3
  // next iteration.
170
2
  AMI->setLength(Constant::getNullValue(LengthCI->getType()));
171
2
  return AMI;
172
4
}
173
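The transform above replaces a constant-length element-wise atomic memcpy with NumElements individual unordered-atomic loads and stores. A minimal standalone sketch of the same idea over plain memory, assuming a 32-bit element type and using relaxed std::atomic accesses as a stand-in for unordered atomics; the helper name is illustrative, not LLVM API:

#include <atomic>
#include <cstddef>
#include <cstdint>

// Copy NumElements elements, each with its own atomic load and store,
// mirroring the per-element loads/stores emitted by the unfolding above.
void unfoldElementAtomicMemCpy(std::atomic<uint32_t> *Dst,
                               const std::atomic<uint32_t> *Src,
                               std::size_t NumElements) {
  for (std::size_t I = 0; I != NumElements; ++I)
    Dst[I].store(Src[I].load(std::memory_order_relaxed),
                 std::memory_order_relaxed);
}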
174
154k
Instruction *InstCombiner::SimplifyMemTransfer(MemIntrinsic *MI) {
175
154k
  unsigned DstAlign = getKnownAlignment(MI->getArgOperand(0), DL, MI, &AC, &DT);
176
154k
  unsigned SrcAlign = getKnownAlignment(MI->getArgOperand(1), DL, MI, &AC, &DT);
177
154k
  unsigned MinAlign = std::min(DstAlign, SrcAlign);
178
154k
  unsigned CopyAlign = MI->getAlignment();
179
154k
180
154k
  if (CopyAlign < MinAlign) {
181
98
    MI->setAlignment(ConstantInt::get(MI->getAlignmentType(), MinAlign, false));
182
98
    return MI;
183
98
  }
184
154k
185
154k
  // If MemCpyInst length is 1/2/4/8 bytes then replace memcpy with
186
154k
  // load/store.
187
154k
  ConstantInt *MemOpLength = dyn_cast<ConstantInt>(MI->getArgOperand(2));
188
154k
  if (!MemOpLength) return nullptr;
189
148k
190
148k
  // Source and destination pointer types are always "i8*" for intrinsic.  See
191
148k
  // if the size is something we can handle with a single primitive load/store.
192
148k
  // A single load+store correctly handles overlapping memory in the memmove
193
148k
  // case.
194
148k
  uint64_t Size = MemOpLength->getLimitedValue();
195
148k
  assert(Size && "0-sized memory transferring should be removed already.");
196
148k
197
148k
  if (Size > 8 || (Size&(Size-1)))
198
147k
    return nullptr;  // If not 1/2/4/8 bytes, exit.
199
1.04k
200
1.04k
  // Use an integer load+store unless we can find something better.
201
1.04k
  unsigned SrcAddrSp =
202
1.04k
    cast<PointerType>(MI->getArgOperand(1)->getType())->getAddressSpace();
203
1.04k
  unsigned DstAddrSp =
204
1.04k
    cast<PointerType>(MI->getArgOperand(0)->getType())->getAddressSpace();
205
1.04k
206
1.04k
  IntegerType* IntType = IntegerType::get(MI->getContext(), Size<<3);
207
1.04k
  Type *NewSrcPtrTy = PointerType::get(IntType, SrcAddrSp);
208
1.04k
  Type *NewDstPtrTy = PointerType::get(IntType, DstAddrSp);
209
1.04k
210
1.04k
  // If the memcpy has metadata describing the members, see if we can get the
211
1.04k
  // TBAA tag describing our copy.
212
1.04k
  MDNode *CopyMD = nullptr;
213
1.04k
  if (MDNode *M = MI->getMetadata(LLVMContext::MD_tbaa_struct)) {
214
469
    if (M->getNumOperands() == 3 && M->getOperand(0) &&
215
143
        mdconst::hasa<ConstantInt>(M->getOperand(0)) &&
216
143
        mdconst::extract<ConstantInt>(M->getOperand(0))->isZero() &&
217
143
        M->getOperand(1) &&
218
143
        mdconst::hasa<ConstantInt>(M->getOperand(1)) &&
219
143
        mdconst::extract<ConstantInt>(M->getOperand(1))->getValue() ==
220
143
        Size &&
221
469
        M->getOperand(2) && isa<MDNode>(M->getOperand(2)))
222
141
      CopyMD = cast<MDNode>(M->getOperand(2));
223
469
  }
224
1.04k
225
1.04k
  // If the memcpy/memmove provides better alignment info than we can
226
1.04k
  // infer, use it.
227
1.04k
  SrcAlign = std::max(SrcAlign, CopyAlign);
228
1.04k
  DstAlign = std::max(DstAlign, CopyAlign);
229
1.04k
230
1.04k
  Value *Src = Builder.CreateBitCast(MI->getArgOperand(1), NewSrcPtrTy);
231
1.04k
  Value *Dest = Builder.CreateBitCast(MI->getArgOperand(0), NewDstPtrTy);
232
1.04k
  LoadInst *L = Builder.CreateLoad(Src, MI->isVolatile());
233
1.04k
  L->setAlignment(SrcAlign);
234
1.04k
  if (CopyMD)
235
141
    L->setMetadata(LLVMContext::MD_tbaa, CopyMD);
236
1.04k
  MDNode *LoopMemParallelMD =
237
1.04k
    MI->getMetadata(LLVMContext::MD_mem_parallel_loop_access);
238
1.04k
  if (LoopMemParallelMD)
239
1
    L->setMetadata(LLVMContext::MD_mem_parallel_loop_access, LoopMemParallelMD);
240
1.04k
241
1.04k
  StoreInst *S = Builder.CreateStore(L, Dest, MI->isVolatile());
242
1.04k
  S->setAlignment(DstAlign);
243
1.04k
  if (CopyMD)
244
141
    S->setMetadata(LLVMContext::MD_tbaa, CopyMD);
245
1.04k
  if (LoopMemParallelMD)
246
1
    S->setMetadata(LLVMContext::MD_mem_parallel_loop_access, LoopMemParallelMD);
247
154k
248
154k
  // Set the size of the copy to 0, it will be deleted on the next iteration.
249
154k
  MI->setArgOperand(2, Constant::getNullValue(MemOpLength->getType()));
250
154k
  return MI;
251
154k
}
252
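The fold above only converts the transfer into a single load/store pair when the constant length is exactly 1, 2, 4, or 8 bytes; that is what the Size > 8 || (Size&(Size-1)) test rejects. A minimal standalone sketch of that check, with an illustrative helper name:

#include <cstdint>

// True only for 1, 2, 4, or 8: at most eight bytes and a power of two.
bool isSingleLoadStoreSize(uint64_t Size) {
  return Size != 0 && Size <= 8 && (Size & (Size - 1)) == 0;
}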
253
305k
Instruction *InstCombiner::SimplifyMemSet(MemSetInst *MI) {
254
305k
  unsigned Alignment = getKnownAlignment(MI->getDest(), DL, MI, &AC, &DT);
255
305k
  if (MI->getAlignment() < Alignment) {
256
3.38k
    MI->setAlignment(ConstantInt::get(MI->getAlignmentType(),
257
3.38k
                                             Alignment, false));
258
3.38k
    return MI;
259
3.38k
  }
260
301k
261
301k
  // Extract the length and alignment and fill if they are constant.
262
301k
  ConstantInt *LenC = dyn_cast<ConstantInt>(MI->getLength());
263
301k
  ConstantInt *FillC = dyn_cast<ConstantInt>(MI->getValue());
264
301k
  if (!LenC || !FillC || !FillC->getType()->isIntegerTy(8))
265
62.0k
    return nullptr;
266
239k
  uint64_t Len = LenC->getLimitedValue();
267
239k
  Alignment = MI->getAlignment();
268
239k
  assert(Len && "0-sized memory setting should be removed already.");
269
239k
270
239k
  // memset(s,c,n) -> store s, c (for n=1,2,4,8)
271
239k
  if (Len <= 8 && isPowerOf2_32((uint32_t)Len)) {
272
371
    Type *ITy = IntegerType::get(MI->getContext(), Len*8);  // n=1 -> i8.
273
371
274
371
    Value *Dest = MI->getDest();
275
371
    unsigned DstAddrSp = cast<PointerType>(Dest->getType())->getAddressSpace();
276
371
    Type *NewDstPtrTy = PointerType::get(ITy, DstAddrSp);
277
371
    Dest = Builder.CreateBitCast(Dest, NewDstPtrTy);
278
371
279
371
    // Alignment 0 is identity for alignment 1 for memset, but not store.
280
371
    if (Alignment == 0) Alignment = 1;
281
371
282
371
    // Extract the fill value and store.
283
371
    uint64_t Fill = FillC->getZExtValue()*0x0101010101010101ULL;
284
371
    StoreInst *S = Builder.CreateStore(ConstantInt::get(ITy, Fill), Dest,
285
371
                                       MI->isVolatile());
286
371
    S->setAlignment(Alignment);
287
371
288
371
    // Set the size of the copy to 0, it will be deleted on the next iteration.
289
371
    MI->setLength(Constant::getNullValue(LenC->getType()));
290
371
    return MI;
291
371
  }
292
239k
293
239k
  return nullptr;
294
239k
}
295
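In the memset fold above, the i8 fill value is replicated into every byte of the wider integer store by multiplying with 0x0101010101010101; the store of width Len*8 then keeps only the low bytes. A minimal standalone sketch of that splat, with an illustrative helper name:

#include <cassert>
#include <cstdint>

uint64_t splatFillByte(uint8_t Fill, unsigned Len) {
  assert(Len == 1 || Len == 2 || Len == 4 || Len == 8);
  // Multiplying by 0x0101010101010101 copies the byte into all eight byte
  // positions of the 64-bit value.
  return uint64_t(Fill) * 0x0101010101010101ULL;
}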
296
static Value *simplifyX86immShift(const IntrinsicInst &II,
297
395
                                  InstCombiner::BuilderTy &Builder) {
298
395
  bool LogicalShift = false;
299
395
  bool ShiftLeft = false;
300
395
301
395
  switch (II.getIntrinsicID()) {
302
0
  default: llvm_unreachable("Unexpected intrinsic!");
303
139
  case Intrinsic::x86_sse2_psra_d:
304
139
  case Intrinsic::x86_sse2_psra_w:
305
139
  case Intrinsic::x86_sse2_psrai_d:
306
139
  case Intrinsic::x86_sse2_psrai_w:
307
139
  case Intrinsic::x86_avx2_psra_d:
308
139
  case Intrinsic::x86_avx2_psra_w:
309
139
  case Intrinsic::x86_avx2_psrai_d:
310
139
  case Intrinsic::x86_avx2_psrai_w:
311
139
  case Intrinsic::x86_avx512_psra_q_128:
312
139
  case Intrinsic::x86_avx512_psrai_q_128:
313
139
  case Intrinsic::x86_avx512_psra_q_256:
314
139
  case Intrinsic::x86_avx512_psrai_q_256:
315
139
  case Intrinsic::x86_avx512_psra_d_512:
316
139
  case Intrinsic::x86_avx512_psra_q_512:
317
139
  case Intrinsic::x86_avx512_psra_w_512:
318
139
  case Intrinsic::x86_avx512_psrai_d_512:
319
139
  case Intrinsic::x86_avx512_psrai_q_512:
320
139
  case Intrinsic::x86_avx512_psrai_w_512:
321
139
    LogicalShift = false; ShiftLeft = false;
322
139
    break;
323
136
  case Intrinsic::x86_sse2_psrl_d:
324
136
  case Intrinsic::x86_sse2_psrl_q:
325
136
  case Intrinsic::x86_sse2_psrl_w:
326
136
  case Intrinsic::x86_sse2_psrli_d:
327
136
  case Intrinsic::x86_sse2_psrli_q:
328
136
  case Intrinsic::x86_sse2_psrli_w:
329
136
  case Intrinsic::x86_avx2_psrl_d:
330
136
  case Intrinsic::x86_avx2_psrl_q:
331
136
  case Intrinsic::x86_avx2_psrl_w:
332
136
  case Intrinsic::x86_avx2_psrli_d:
333
136
  case Intrinsic::x86_avx2_psrli_q:
334
136
  case Intrinsic::x86_avx2_psrli_w:
335
136
  case Intrinsic::x86_avx512_psrl_d_512:
336
136
  case Intrinsic::x86_avx512_psrl_q_512:
337
136
  case Intrinsic::x86_avx512_psrl_w_512:
338
136
  case Intrinsic::x86_avx512_psrli_d_512:
339
136
  case Intrinsic::x86_avx512_psrli_q_512:
340
136
  case Intrinsic::x86_avx512_psrli_w_512:
341
136
    LogicalShift = true; ShiftLeft = false;
342
136
    break;
343
120
  case Intrinsic::x86_sse2_psll_d:
344
120
  case Intrinsic::x86_sse2_psll_q:
345
120
  case Intrinsic::x86_sse2_psll_w:
346
120
  case Intrinsic::x86_sse2_pslli_d:
347
120
  case Intrinsic::x86_sse2_pslli_q:
348
120
  case Intrinsic::x86_sse2_pslli_w:
349
120
  case Intrinsic::x86_avx2_psll_d:
350
120
  case Intrinsic::x86_avx2_psll_q:
351
120
  case Intrinsic::x86_avx2_psll_w:
352
120
  case Intrinsic::x86_avx2_pslli_d:
353
120
  case Intrinsic::x86_avx2_pslli_q:
354
120
  case Intrinsic::x86_avx2_pslli_w:
355
120
  case Intrinsic::x86_avx512_psll_d_512:
356
120
  case Intrinsic::x86_avx512_psll_q_512:
357
120
  case Intrinsic::x86_avx512_psll_w_512:
358
120
  case Intrinsic::x86_avx512_pslli_d_512:
359
120
  case Intrinsic::x86_avx512_pslli_q_512:
360
120
  case Intrinsic::x86_avx512_pslli_w_512:
361
120
    LogicalShift = true; ShiftLeft = true;
362
120
    break;
363
395
  }
364
395
  assert((LogicalShift || !ShiftLeft) && "Only logical shifts can shift left");
365
395
366
395
  // Simplify if count is constant.
367
395
  auto Arg1 = II.getArgOperand(1);
368
395
  auto CAZ = dyn_cast<ConstantAggregateZero>(Arg1);
369
395
  auto CDV = dyn_cast<ConstantDataVector>(Arg1);
370
395
  auto CInt = dyn_cast<ConstantInt>(Arg1);
371
395
  if (!CAZ && !CDV && !CInt)
372
132
    return nullptr;
373
263
374
263
  APInt Count(64, 0);
375
263
  if (CDV) {
376
107
    // SSE2/AVX2 uses all the first 64-bits of the 128-bit vector
377
107
    // operand to compute the shift amount.
378
107
    auto VT = cast<VectorType>(CDV->getType());
379
107
    unsigned BitWidth = VT->getElementType()->getPrimitiveSizeInBits();
380
107
    assert((64 % BitWidth) == 0 && "Unexpected packed shift size");
381
107
    unsigned NumSubElts = 64 / BitWidth;
382
107
383
107
    // Concatenate the sub-elements to create the 64-bit value.
384
377
    for (unsigned i = 0; i != NumSubElts; ++i) {
385
270
      unsigned SubEltIdx = (NumSubElts - 1) - i;
386
270
      auto SubElt = cast<ConstantInt>(CDV->getElementAsConstant(SubEltIdx));
387
270
      Count <<= BitWidth;
388
270
      Count |= SubElt->getValue().zextOrTrunc(64);
389
270
    }
390
107
  }
391
156
  else if (CInt)
392
129
    Count = CInt->getValue();
393
263
394
263
  auto Vec = II.getArgOperand(0);
395
263
  auto VT = cast<VectorType>(Vec->getType());
396
263
  auto SVT = VT->getElementType();
397
263
  unsigned VWidth = VT->getNumElements();
398
263
  unsigned BitWidth = SVT->getPrimitiveSizeInBits();
399
263
400
263
  // If shift-by-zero then just return the original value.
401
263
  if (Count.isNullValue())
402
71
    return Vec;
403
192
404
192
  // Handle cases when Shift >= BitWidth.
405
192
  if (Count.uge(BitWidth)) {
406
96
    // If LogicalShift - just return zero.
407
96
    if (LogicalShift)
408
72
      return ConstantAggregateZero::get(VT);
409
24
410
24
    // If ArithmeticShift - clamp Shift to (BitWidth - 1).
411
24
    Count = APInt(64, BitWidth - 1);
412
24
  }
413
192
414
192
  // Get a constant vector of the same type as the first operand.
415
120
  auto ShiftAmt = ConstantInt::get(SVT, Count.zextOrTrunc(BitWidth));
416
120
  auto ShiftVec = Builder.CreateVectorSplat(VWidth, ShiftAmt);
417
120
418
120
  if (ShiftLeft)
419
30
    return Builder.CreateShl(Vec, ShiftVec);
420
90
421
90
  if (LogicalShift)
422
30
    return Builder.CreateLShr(Vec, ShiftVec);
423
60
424
60
  return Builder.CreateAShr(Vec, ShiftVec);
425
60
}
426
427
// Attempt to simplify AVX2 per-element shift intrinsics to a generic IR shift.
428
// Unlike the generic IR shifts, the intrinsics have defined behaviour for out
429
// of range shift amounts (logical - set to zero, arithmetic - splat sign bit).
430
static Value *simplifyX86varShift(const IntrinsicInst &II,
431
126
                                  InstCombiner::BuilderTy &Builder) {
432
126
  bool LogicalShift = false;
433
126
  bool ShiftLeft = false;
434
126
435
126
  switch (II.getIntrinsicID()) {
436
0
  default: llvm_unreachable("Unexpected intrinsic!");
437
36
  case Intrinsic::x86_avx2_psrav_d:
438
36
  case Intrinsic::x86_avx2_psrav_d_256:
439
36
  case Intrinsic::x86_avx512_psrav_q_128:
440
36
  case Intrinsic::x86_avx512_psrav_q_256:
441
36
  case Intrinsic::x86_avx512_psrav_d_512:
442
36
  case Intrinsic::x86_avx512_psrav_q_512:
443
36
  case Intrinsic::x86_avx512_psrav_w_128:
444
36
  case Intrinsic::x86_avx512_psrav_w_256:
445
36
  case Intrinsic::x86_avx512_psrav_w_512:
446
36
    LogicalShift = false;
447
36
    ShiftLeft = false;
448
36
    break;
449
45
  case Intrinsic::x86_avx2_psrlv_d:
450
45
  case Intrinsic::x86_avx2_psrlv_d_256:
451
45
  case Intrinsic::x86_avx2_psrlv_q:
452
45
  case Intrinsic::x86_avx2_psrlv_q_256:
453
45
  case Intrinsic::x86_avx512_psrlv_d_512:
454
45
  case Intrinsic::x86_avx512_psrlv_q_512:
455
45
  case Intrinsic::x86_avx512_psrlv_w_128:
456
45
  case Intrinsic::x86_avx512_psrlv_w_256:
457
45
  case Intrinsic::x86_avx512_psrlv_w_512:
458
45
    LogicalShift = true;
459
45
    ShiftLeft = false;
460
45
    break;
461
45
  case Intrinsic::x86_avx2_psllv_d:
462
45
  case Intrinsic::x86_avx2_psllv_d_256:
463
45
  case Intrinsic::x86_avx2_psllv_q:
464
45
  case Intrinsic::x86_avx2_psllv_q_256:
465
45
  case Intrinsic::x86_avx512_psllv_d_512:
466
45
  case Intrinsic::x86_avx512_psllv_q_512:
467
45
  case Intrinsic::x86_avx512_psllv_w_128:
468
45
  case Intrinsic::x86_avx512_psllv_w_256:
469
45
  case Intrinsic::x86_avx512_psllv_w_512:
470
45
    LogicalShift = true;
471
45
    ShiftLeft = true;
472
45
    break;
473
126
  }
474
126
  assert((LogicalShift || !ShiftLeft) && "Only logical shifts can shift left");
475
126
476
126
  // Simplify if all shift amounts are constant/undef.
477
126
  auto *CShift = dyn_cast<Constant>(II.getArgOperand(1));
478
126
  if (!CShift)
479
0
    return nullptr;
480
126
481
126
  auto Vec = II.getArgOperand(0);
482
126
  auto VT = cast<VectorType>(II.getType());
483
126
  auto SVT = VT->getVectorElementType();
484
126
  int NumElts = VT->getNumElements();
485
126
  int BitWidth = SVT->getIntegerBitWidth();
486
126
487
126
  // Collect each element's shift amount.
488
126
  // We also collect special cases: UNDEF = -1, OUT-OF-RANGE = BitWidth.
489
126
  bool AnyOutOfRange = false;
490
126
  SmallVector<int, 8> ShiftAmts;
491
1.49k
  for (int I = 0; I < NumElts; ++I) {
492
1.37k
    auto *CElt = CShift->getAggregateElement(I);
493
1.37k
    if (CElt && isa<UndefValue>(CElt)) {
494
67
      ShiftAmts.push_back(-1);
495
67
      continue;
496
67
    }
497
1.30k
498
1.30k
    auto *COp = dyn_cast_or_null<ConstantInt>(CElt);
499
1.30k
    if (!COp)
500
0
      return nullptr;
501
1.30k
502
1.30k
    // Handle out of range shifts.
503
1.30k
    // If LogicalShift - set to BitWidth (special case).
504
1.30k
    // If ArithmeticShift - set to (BitWidth - 1) (sign splat).
505
1.30k
    APInt ShiftVal = COp->getValue();
506
1.30k
    if (ShiftVal.uge(BitWidth)) {
507
284
      AnyOutOfRange = LogicalShift;
508
284
      ShiftAmts.push_back(LogicalShift ? BitWidth : BitWidth - 1);
509
284
      continue;
510
284
    }
511
1.02k
512
1.02k
    ShiftAmts.push_back((int)ShiftVal.getZExtValue());
513
1.02k
  }
514
126
515
126
  // If all elements out of range or UNDEF, return vector of zeros/undefs.
516
126
  // ArithmeticShift should only hit this if they are all UNDEF.
517
325
  auto OutOfRange = [&](int Idx) { return (Idx < 0) || (BitWidth <= Idx); };
518
126
  if (all_of(ShiftAmts, OutOfRange)) {
519
18
    SmallVector<Constant *, 8> ConstantVec;
520
196
    for (int Idx : ShiftAmts) {
521
196
      if (Idx < 0) {
522
26
        ConstantVec.push_back(UndefValue::get(SVT));
523
196
      } else {
524
170
        assert(LogicalShift && "Logical shift expected");
525
170
        ConstantVec.push_back(ConstantInt::getNullValue(SVT));
526
170
      }
527
196
    }
528
18
    return ConstantVector::get(ConstantVec);
529
18
  }
530
108
531
108
  // We can't handle only some out of range values with generic logical shifts.
532
108
  if (AnyOutOfRange)
533
18
    return nullptr;
534
90
535
90
  // Build the shift amount constant vector.
536
90
  SmallVector<Constant *, 8> ShiftVecAmts;
537
980
  for (int Idx : ShiftAmts) {
538
980
    if (Idx < 0)
539
41
      ShiftVecAmts.push_back(UndefValue::get(SVT));
540
980
    else
541
939
      ShiftVecAmts.push_back(ConstantInt::get(SVT, Idx));
542
980
  }
543
90
  auto ShiftVec = ConstantVector::get(ShiftVecAmts);
544
90
545
90
  if (ShiftLeft)
546
27
    return Builder.CreateShl(Vec, ShiftVec);
547
63
548
63
  if (LogicalShift)
549
27
    return Builder.CreateLShr(Vec, ShiftVec);
550
36
551
36
  return Builder.CreateAShr(Vec, ShiftVec);
552
36
}
553
554
static Value *simplifyX86muldq(const IntrinsicInst &II,
555
42
                               InstCombiner::BuilderTy &Builder) {
556
42
  Value *Arg0 = II.getArgOperand(0);
557
42
  Value *Arg1 = II.getArgOperand(1);
558
42
  Type *ResTy = II.getType();
559
42
  assert(Arg0->getType()->getScalarSizeInBits() == 32 &&
560
42
         Arg1->getType()->getScalarSizeInBits() == 32 &&
561
42
         ResTy->getScalarSizeInBits() == 64 && "Unexpected muldq/muludq types");
562
42
563
42
  // muldq/muludq(undef, undef) -> zero (matches generic mul behavior)
564
42
  if (isa<UndefValue>(Arg0) || isa<UndefValue>(Arg1))
565
12
    return ConstantAggregateZero::get(ResTy);
566
30
567
30
  // Constant folding.
568
30
  // PMULDQ  = (mul(vXi64 sext(shuffle<0,2,..>(Arg0)),
569
30
  //                vXi64 sext(shuffle<0,2,..>(Arg1))))
570
30
  // PMULUDQ = (mul(vXi64 zext(shuffle<0,2,..>(Arg0)),
571
30
  //                vXi64 zext(shuffle<0,2,..>(Arg1))))
572
30
  if (!isa<Constant>(Arg0) || !isa<Constant>(Arg1))
573
24
    return nullptr;
574
6
575
6
  unsigned NumElts = ResTy->getVectorNumElements();
576
6
  assert(Arg0->getType()->getVectorNumElements() == (2 * NumElts) &&
577
6
         Arg1->getType()->getVectorNumElements() == (2 * NumElts) &&
578
6
         "Unexpected muldq/muludq types");
579
6
580
6
  unsigned IntrinsicID = II.getIntrinsicID();
581
6
  bool IsSigned = (Intrinsic::x86_sse41_pmuldq == IntrinsicID ||
582
5
                   Intrinsic::x86_avx2_pmul_dq == IntrinsicID ||
583
4
                   Intrinsic::x86_avx512_pmul_dq_512 == IntrinsicID);
584
6
585
6
  SmallVector<unsigned, 16> ShuffleMask;
586
34
  for (unsigned i = 0; i != NumElts; ++i)
587
28
    ShuffleMask.push_back(i * 2);
588
6
589
6
  auto *LHS = Builder.CreateShuffleVector(Arg0, Arg0, ShuffleMask);
590
6
  auto *RHS = Builder.CreateShuffleVector(Arg1, Arg1, ShuffleMask);
591
6
592
6
  if (IsSigned) {
593
3
    LHS = Builder.CreateSExt(LHS, ResTy);
594
3
    RHS = Builder.CreateSExt(RHS, ResTy);
595
6
  } else {
596
3
    LHS = Builder.CreateZExt(LHS, ResTy);
597
3
    RHS = Builder.CreateZExt(RHS, ResTy);
598
3
  }
599
42
600
42
  return Builder.CreateMul(LHS, RHS);
601
42
}
602
603
51
static Value *simplifyX86pack(IntrinsicInst &II, bool IsSigned) {
604
51
  Value *Arg0 = II.getArgOperand(0);
605
51
  Value *Arg1 = II.getArgOperand(1);
606
51
  Type *ResTy = II.getType();
607
51
608
51
  // Fast all undef handling.
609
51
  if (isa<UndefValue>(Arg0) && isa<UndefValue>(Arg1))
610
12
    return UndefValue::get(ResTy);
611
39
612
39
  Type *ArgTy = Arg0->getType();
613
39
  unsigned NumLanes = ResTy->getPrimitiveSizeInBits() / 128;
614
39
  unsigned NumDstElts = ResTy->getVectorNumElements();
615
39
  unsigned NumSrcElts = ArgTy->getVectorNumElements();
616
39
  assert(NumDstElts == (2 * NumSrcElts) && "Unexpected packing types");
617
39
618
39
  unsigned NumDstEltsPerLane = NumDstElts / NumLanes;
619
39
  unsigned NumSrcEltsPerLane = NumSrcElts / NumLanes;
620
39
  unsigned DstScalarSizeInBits = ResTy->getScalarSizeInBits();
621
39
  assert(ArgTy->getScalarSizeInBits() == (2 * DstScalarSizeInBits) &&
622
39
         "Unexpected packing types");
623
39
624
39
  // Constant folding.
625
39
  auto *Cst0 = dyn_cast<Constant>(Arg0);
626
39
  auto *Cst1 = dyn_cast<Constant>(Arg1);
627
39
  if (!Cst0 || !Cst1)
628
21
    return nullptr;
629
18
630
18
  SmallVector<Constant *, 32> Vals;
631
60
  for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
632
602
    for (unsigned Elt = 0; Elt != NumDstEltsPerLane; ++Elt) {
633
560
      unsigned SrcIdx = Lane * NumSrcEltsPerLane + Elt % NumSrcEltsPerLane;
634
560
      auto *Cst = (Elt >= NumSrcEltsPerLane) ? Cst1 : Cst0;
635
560
      auto *COp = Cst->getAggregateElement(SrcIdx);
636
560
      if (COp && isa<UndefValue>(COp)) {
637
294
        Vals.push_back(UndefValue::get(ResTy->getScalarType()));
638
294
        continue;
639
294
      }
640
266
641
266
      auto *CInt = dyn_cast_or_null<ConstantInt>(COp);
642
266
      if (!CInt)
643
0
        return nullptr;
644
266
645
266
      APInt Val = CInt->getValue();
646
266
      assert(Val.getBitWidth() == ArgTy->getScalarSizeInBits() &&
647
266
             "Unexpected constant bitwidth");
648
266
649
266
      if (IsSigned) {
650
96
        // PACKSS: Truncate signed value with signed saturation.
651
96
        // Source values less than dst minint are saturated to minint.
652
96
        // Source values greater than dst maxint are saturated to maxint.
653
96
        if (Val.isSignedIntN(DstScalarSizeInBits))
654
88
          Val = Val.trunc(DstScalarSizeInBits);
655
8
        else if (Val.isNegative())
656
4
          Val = APInt::getSignedMinValue(DstScalarSizeInBits);
657
8
        else
658
4
          Val = APInt::getSignedMaxValue(DstScalarSizeInBits);
659
266
      } else {
660
170
        // PACKUS: Truncate signed value with unsigned saturation.
661
170
        // Source values less than zero are saturated to zero.
662
170
        // Source values greater than dst maxuint are saturated to maxuint.
663
170
        if (Val.isIntN(DstScalarSizeInBits))
664
118
          Val = Val.trunc(DstScalarSizeInBits);
665
52
        else if (Val.isNegative())
666
42
          Val = APInt::getNullValue(DstScalarSizeInBits);
667
52
        else
668
10
          Val = APInt::getAllOnesValue(DstScalarSizeInBits);
669
170
      }
670
560
671
560
      Vals.push_back(ConstantInt::get(ResTy->getScalarType(), Val));
672
560
    }
673
42
  }
674
18
675
18
  return ConstantVector::get(Vals);
676
51
}
677
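The PACKSS/PACKUS constant folding above truncates each source element from 2*N bits to N bits with signed or unsigned saturation. A minimal standalone sketch of both saturation rules for the 32-bit-to-16-bit case (PACKSSDW/PACKUSDW); the helper names are illustrative:

#include <algorithm>
#include <cstdint>

int16_t packssSaturate(int32_t V) {
  // Signed saturation: clamp to [INT16_MIN, INT16_MAX], then truncate.
  return static_cast<int16_t>(
      std::min<int32_t>(std::max<int32_t>(V, INT16_MIN), INT16_MAX));
}

uint16_t packusSaturate(int32_t V) {
  // Unsigned saturation: clamp to [0, UINT16_MAX], then truncate.
  return static_cast<uint16_t>(
      std::min<int32_t>(std::max<int32_t>(V, 0), UINT16_MAX));
}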
678
47
static Value *simplifyX86movmsk(const IntrinsicInst &II) {
679
47
  Value *Arg = II.getArgOperand(0);
680
47
  Type *ResTy = II.getType();
681
47
  Type *ArgTy = Arg->getType();
682
47
683
47
  // movmsk(undef) -> zero as we must ensure the upper bits are zero.
684
47
  if (isa<UndefValue>(Arg))
685
7
    return Constant::getNullValue(ResTy);
686
40
687
40
  // We can't easily peek through x86_mmx types.
688
40
  if (!ArgTy->isVectorTy())
689
6
    return nullptr;
690
34
691
34
  auto *C = dyn_cast<Constant>(Arg);
692
34
  if (!C)
693
22
    return nullptr;
694
12
695
12
  // Extract signbits of the vector input and pack into integer result.
696
12
  APInt Result(ResTy->getPrimitiveSizeInBits(), 0);
697
144
  for (unsigned I = 0, E = ArgTy->getVectorNumElements(); I != E; ++I) {
698
132
    auto *COp = C->getAggregateElement(I);
699
132
    if (!COp)
700
0
      return nullptr;
701
132
    if (isa<UndefValue>(COp))
702
7
      continue;
703
125
704
125
    auto *CInt = dyn_cast<ConstantInt>(COp);
705
125
    auto *CFp = dyn_cast<ConstantFP>(COp);
706
125
    if (!CInt && !CFp)
707
0
      return nullptr;
708
125
709
125
    if ((CInt && CInt->isNegative()) || (CFp && CFp->isNegative()))
710
24
      Result.setBit(I);
711
132
  }
712
12
713
12
  return Constant::getIntegerValue(ResTy, Result);
714
47
}
715
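The MOVMSK fold above packs the sign bit of element I into bit I of the scalar result and leaves all higher bits zero. A minimal standalone sketch for a vector of 32-bit integers; the helper name is illustrative:

#include <cstddef>
#include <cstdint>
#include <vector>

uint32_t movmskFold(const std::vector<int32_t> &Elts) {
  uint32_t Result = 0;
  for (std::size_t I = 0; I != Elts.size(); ++I)
    if (Elts[I] < 0)                 // sign bit of element I is set
      Result |= uint32_t(1) << I;    // -> set bit I of the result
  return Result;
}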
716
static Value *simplifyX86insertps(const IntrinsicInst &II,
717
15
                                  InstCombiner::BuilderTy &Builder) {
718
15
  auto *CInt = dyn_cast<ConstantInt>(II.getArgOperand(2));
719
15
  if (!CInt)
720
1
    return nullptr;
721
14
722
14
  VectorType *VecTy = cast<VectorType>(II.getType());
723
14
  assert(VecTy->getNumElements() == 4 && "insertps with wrong vector type");
724
14
725
14
  // The immediate permute control byte looks like this:
726
14
  //    [3:0] - zero mask for each 32-bit lane
727
14
  //    [5:4] - select one 32-bit destination lane
728
14
  //    [7:6] - select one 32-bit source lane
729
14
730
14
  uint8_t Imm = CInt->getZExtValue();
731
14
  uint8_t ZMask = Imm & 0xf;
732
14
  uint8_t DestLane = (Imm >> 4) & 0x3;
733
14
  uint8_t SourceLane = (Imm >> 6) & 0x3;
734
14
735
14
  ConstantAggregateZero *ZeroVector = ConstantAggregateZero::get(VecTy);
736
14
737
14
  // If all zero mask bits are set, this was just a weird way to
738
14
  // generate a zero vector.
739
14
  if (ZMask == 0xf)
740
2
    return ZeroVector;
741
12
742
12
  // Initialize by passing all of the first source bits through.
743
12
  uint32_t ShuffleMask[4] = { 0, 1, 2, 3 };
744
12
745
12
  // We may replace the second operand with the zero vector.
746
12
  Value *V1 = II.getArgOperand(1);
747
12
748
12
  if (ZMask) {
749
4
    // If the zero mask is being used with a single input or the zero mask
750
4
    // overrides the destination lane, this is a shuffle with the zero vector.
751
4
    if ((II.getArgOperand(0) == II.getArgOperand(1)) ||
752
4
        (ZMask & (1 << DestLane))) {
753
3
      V1 = ZeroVector;
754
3
      // We may still move 32-bits of the first source vector from one lane
755
3
      // to another.
756
3
      ShuffleMask[DestLane] = SourceLane;
757
3
      // The zero mask may override the previous insert operation.
758
15
      for (unsigned i = 0; i < 4; ++i)
759
12
        if ((ZMask >> i) & 0x1)
760
5
          ShuffleMask[i] = i + 4;
761
4
    } else {
762
1
      // TODO: Model this case as 2 shuffles or a 'logical and' plus shuffle?
763
1
      return nullptr;
764
1
    }
765
8
  } else {
766
8
    // Replace the selected destination lane with the selected source lane.
767
8
    ShuffleMask[DestLane] = SourceLane + 4;
768
8
  }
769
12
770
11
  return Builder.CreateShuffleVector(II.getArgOperand(0), V1, ShuffleMask);
771
15
}
772
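The INSERTPS immediate decoded above packs three fields into one byte: bits [3:0] are a per-lane zero mask, bits [5:4] select the destination lane, and bits [7:6] select the source lane. A minimal standalone sketch of that decoding, with illustrative names:

#include <cstdint>

struct InsertPSControl {
  uint8_t ZMask;      // [3:0] zero mask, one bit per 32-bit lane
  uint8_t DestLane;   // [5:4] destination lane selector
  uint8_t SourceLane; // [7:6] source lane selector
};

InsertPSControl decodeInsertPSImm(uint8_t Imm) {
  return {static_cast<uint8_t>(Imm & 0xf),
          static_cast<uint8_t>((Imm >> 4) & 0x3),
          static_cast<uint8_t>((Imm >> 6) & 0x3)};
}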
773
/// Attempt to simplify SSE4A EXTRQ/EXTRQI instructions using constant folding
774
/// or conversion to a shuffle vector.
775
static Value *simplifyX86extrq(IntrinsicInst &II, Value *Op0,
776
                               ConstantInt *CILength, ConstantInt *CIIndex,
777
35
                               InstCombiner::BuilderTy &Builder) {
778
6
  auto LowConstantHighUndef = [&](uint64_t Val) {
779
6
    Type *IntTy64 = Type::getInt64Ty(II.getContext());
780
6
    Constant *Args[] = {ConstantInt::get(IntTy64, Val),
781
6
                        UndefValue::get(IntTy64)};
782
6
    return ConstantVector::get(Args);
783
6
  };
784
35
785
35
  // See if we're dealing with constant values.
786
35
  Constant *C0 = dyn_cast<Constant>(Op0);
787
35
  ConstantInt *CI0 =
788
8
      C0 ? dyn_cast_or_null<ConstantInt>(C0->getAggregateElement((unsigned)0))
789
27
         : nullptr;
790
35
791
35
  // Attempt to constant fold.
792
35
  if (CILength && CIIndex) {
793
20
    // From AMD documentation: "The bit index and field length are each six
794
20
    // bits in length; other bits of the field are ignored."
795
20
    APInt APIndex = CIIndex->getValue().zextOrTrunc(6);
796
20
    APInt APLength = CILength->getValue().zextOrTrunc(6);
797
20
798
20
    unsigned Index = APIndex.getZExtValue();
799
20
800
20
    // From AMD documentation: "a value of zero in the field length is
801
20
    // defined as length of 64".
802
20
    unsigned Length = APLength == 0 ? 
642
:
APLength.getZExtValue()18
;
803
20
804
20
    // From AMD documentation: "If the sum of the bit index + length field
805
20
    // is greater than 64, the results are undefined".
806
20
    unsigned End = Index + Length;
807
20
808
20
    // Note that both field index and field length are 8-bit quantities.
809
20
    // Since variables 'Index' and 'Length' are unsigned values
810
20
    // obtained from zero-extending field index and field length
811
20
    // respectively, their sum should never wrap around.
812
20
    if (End > 64)
813
1
      return UndefValue::get(II.getType());
814
19
815
19
    // If we are inserting whole bytes, we can convert this to a shuffle.
816
19
    // Lowering can recognize EXTRQI shuffle masks.
817
19
    
if (19
(Length % 8) == 0 && 19
(Index % 8) == 011
) {
818
5
      // Convert bit indices to byte indices.
819
5
      Length /= 8;
820
5
      Index /= 8;
821
5
822
5
      Type *IntTy8 = Type::getInt8Ty(II.getContext());
823
5
      Type *IntTy32 = Type::getInt32Ty(II.getContext());
824
5
      VectorType *ShufTy = VectorType::get(IntTy8, 16);
825
5
826
5
      SmallVector<Constant *, 16> ShuffleMask;
827
27
      for (int i = 0; i != (int)Length; ++i)
828
22
        ShuffleMask.push_back(
829
22
            Constant::getIntegerValue(IntTy32, APInt(32, i + Index)));
830
23
      for (int i = Length; i != 8; ++i)
831
18
        ShuffleMask.push_back(
832
18
            Constant::getIntegerValue(IntTy32, APInt(32, i + 16)));
833
45
      for (int i = 8; i != 16; ++i)
834
40
        ShuffleMask.push_back(UndefValue::get(IntTy32));
835
5
836
5
      Value *SV = Builder.CreateShuffleVector(
837
5
          Builder.CreateBitCast(Op0, ShufTy),
838
5
          ConstantAggregateZero::get(ShufTy), ConstantVector::get(ShuffleMask));
839
5
      return Builder.CreateBitCast(SV, II.getType());
840
5
    }
841
14
842
14
    // Constant Fold - shift Index'th bit to lowest position and mask off
843
14
    // Length bits.
844
14
    if (CI0) {
845
5
      APInt Elt = CI0->getValue();
846
5
      Elt.lshrInPlace(Index);
847
5
      Elt = Elt.zextOrTrunc(Length);
848
5
      return LowConstantHighUndef(Elt.getZExtValue());
849
5
    }
850
9
851
9
    // If we were an EXTRQ call, we'll save registers if we convert to EXTRQI.
852
9
    
if (9
II.getIntrinsicID() == Intrinsic::x86_sse4a_extrq9
) {
853
1
      Value *Args[] = {Op0, CILength, CIIndex};
854
1
      Module *M = II.getModule();
855
1
      Value *F = Intrinsic::getDeclaration(M, Intrinsic::x86_sse4a_extrqi);
856
1
      return Builder.CreateCall(F, Args);
857
1
    }
858
23
  }
859
23
860
23
  // Constant Fold - extraction from zero is always {zero, undef}.
861
23
  
if (23
CI0 && 23
CI0->isZero()1
)
862
1
    return LowConstantHighUndef(0);
863
22
864
22
  return nullptr;
865
22
}
866
867
/// Attempt to simplify SSE4A INSERTQ/INSERTQI instructions using constant
868
/// folding or conversion to a shuffle vector.
869
static Value *simplifyX86insertq(IntrinsicInst &II, Value *Op0, Value *Op1,
870
                                 APInt APLength, APInt APIndex,
871
34
                                 InstCombiner::BuilderTy &Builder) {
872
34
  // From AMD documentation: "The bit index and field length are each six bits
873
34
  // in length; other bits of the field are ignored."
874
34
  APIndex = APIndex.zextOrTrunc(6);
875
34
  APLength = APLength.zextOrTrunc(6);
876
34
877
34
  // Attempt to constant fold.
878
34
  unsigned Index = APIndex.getZExtValue();
879
34
880
34
  // From AMD documentation: "a value of zero in the field length is
881
34
  // defined as length of 64".
882
34
  unsigned Length = APLength == 0 ? 64 : APLength.getZExtValue();
883
34
884
34
  // From AMD documentation: "If the sum of the bit index + length field
885
34
  // is greater than 64, the results are undefined".
886
34
  unsigned End = Index + Length;
887
34
888
34
  // Note that both field index and field length are 8-bit quantities.
889
34
  // Since variables 'Index' and 'Length' are unsigned values
890
34
  // obtained from zero-extending field index and field length
891
34
  // respectively, their sum should never wrap around.
892
34
  if (End > 64)
893
3
    return UndefValue::get(II.getType());
894
31
895
31
  // If we are inserting whole bytes, we can convert this to a shuffle.
896
31
  // Lowering can recognize INSERTQI shuffle masks.
897
31
  if ((Length % 8) == 0 && (Index % 8) == 0) {
898
4
    // Convert bit indices to byte indices.
899
4
    Length /= 8;
900
4
    Index /= 8;
901
4
902
4
    Type *IntTy8 = Type::getInt8Ty(II.getContext());
903
4
    Type *IntTy32 = Type::getInt32Ty(II.getContext());
904
4
    VectorType *ShufTy = VectorType::get(IntTy8, 16);
905
4
906
4
    SmallVector<Constant *, 16> ShuffleMask;
907
8
    for (int i = 0; i != (int)Index; ++i)
908
4
      ShuffleMask.push_back(Constant::getIntegerValue(IntTy32, APInt(32, i)));
909
26
    for (int i = 0; i != (int)Length; ++i)
910
22
      ShuffleMask.push_back(
911
22
          Constant::getIntegerValue(IntTy32, APInt(32, i + 16)));
912
10
    for (int i = Index + Length; i != 8; ++i)
913
6
      ShuffleMask.push_back(Constant::getIntegerValue(IntTy32, APInt(32, i)));
914
36
    for (int i = 8; i != 16; ++i)
915
32
      ShuffleMask.push_back(UndefValue::get(IntTy32));
916
4
917
4
    Value *SV = Builder.CreateShuffleVector(Builder.CreateBitCast(Op0, ShufTy),
918
4
                                            Builder.CreateBitCast(Op1, ShufTy),
919
4
                                            ConstantVector::get(ShuffleMask));
920
4
    return Builder.CreateBitCast(SV, II.getType());
921
4
  }
922
27
923
27
  // See if we're dealing with constant values.
924
27
  Constant *C0 = dyn_cast<Constant>(Op0);
925
27
  Constant *C1 = dyn_cast<Constant>(Op1);
926
27
  ConstantInt *CI00 =
927
3
      C0 ? dyn_cast_or_null<ConstantInt>(C0->getAggregateElement((unsigned)0))
928
24
         : nullptr;
929
27
  ConstantInt *CI10 =
930
14
      C1 ? dyn_cast_or_null<ConstantInt>(C1->getAggregateElement((unsigned)0))
931
13
         : nullptr;
932
27
933
27
  // Constant Fold - insert bottom Length bits starting at the Index'th bit.
934
27
  if (
CI00 && 27
CI103
) {
935
3
    APInt V00 = CI00->getValue();
936
3
    APInt V10 = CI10->getValue();
937
3
    APInt Mask = APInt::getLowBitsSet(64, Length).shl(Index);
938
3
    V00 = V00 & ~Mask;
939
3
    V10 = V10.zextOrTrunc(Length).zextOrTrunc(64).shl(Index);
940
3
    APInt Val = V00 | V10;
941
3
    Type *IntTy64 = Type::getInt64Ty(II.getContext());
942
3
    Constant *Args[] = {ConstantInt::get(IntTy64, Val.getZExtValue()),
943
3
                        UndefValue::get(IntTy64)};
944
3
    return ConstantVector::get(Args);
945
3
  }
946
24
947
24
  // If we were an INSERTQ call, we'll save demanded elements if we convert to
948
24
  // INSERTQI.
949
24
  if (II.getIntrinsicID() == Intrinsic::x86_sse4a_insertq) {
950
2
    Type *IntTy8 = Type::getInt8Ty(II.getContext());
951
2
    Constant *CILength = ConstantInt::get(IntTy8, Length, false);
952
2
    Constant *CIIndex = ConstantInt::get(IntTy8, Index, false);
953
2
954
2
    Value *Args[] = {Op0, Op1, CILength, CIIndex};
955
2
    Module *M = II.getModule();
956
2
    Value *F = Intrinsic::getDeclaration(M, Intrinsic::x86_sse4a_insertqi);
957
2
    return Builder.CreateCall(F, Args);
958
2
  }
959
22
960
22
  return nullptr;
961
22
}
962
963
/// Attempt to convert pshufb* to shufflevector if the mask is constant.
964
static Value *simplifyX86pshufb(const IntrinsicInst &II,
965
58
                                InstCombiner::BuilderTy &Builder) {
966
58
  Constant *V = dyn_cast<Constant>(II.getArgOperand(1));
967
58
  if (!V)
968
7
    return nullptr;
969
51
970
51
  auto *VecTy = cast<VectorType>(II.getType());
971
51
  auto *MaskEltTy = Type::getInt32Ty(II.getContext());
972
51
  unsigned NumElts = VecTy->getNumElements();
973
51
  assert((NumElts == 16 || NumElts == 32 || NumElts == 64) &&
974
51
         "Unexpected number of elements in shuffle mask!");
975
51
976
51
  // Construct a shuffle mask from constant integers or UNDEFs.
977
51
  Constant *Indexes[64] = {nullptr};
978
51
979
51
  // Each byte in the shuffle control mask forms an index to permute the
980
51
  // corresponding byte in the destination operand.
981
1.95k
  for (unsigned I = 0; I < NumElts; ++I) {
982
1.90k
    Constant *COp = V->getAggregateElement(I);
983
1.90k
    if (!COp || (!isa<UndefValue>(COp) && !isa<ConstantInt>(COp)))
984
0
      return nullptr;
985
1.90k
986
1.90k
    if (isa<UndefValue>(COp)) {
987
140
      Indexes[I] = UndefValue::get(MaskEltTy);
988
140
      continue;
989
140
    }
990
1.76k
991
1.76k
    int8_t Index = cast<ConstantInt>(COp)->getValue().getZExtValue();
992
1.76k
993
1.76k
    // If the most significant bit (bit[7]) of each byte of the shuffle
994
1.76k
    // control mask is set, then zero is written in the result byte.
995
1.76k
    // The zero vector is in the right-hand side of the resulting
996
1.76k
    // shufflevector.
997
1.76k
998
1.76k
    // The value of each index for the high 128-bit lane is the least
999
1.76k
    // significant 4 bits of the respective shuffle control byte.
1000
1.76k
    Index = ((Index < 0) ? NumElts : Index & 0x0F) + (I & 0xF0);
1001
1.90k
    Indexes[I] = ConstantInt::get(MaskEltTy, Index);
1002
1.90k
  }
1003
51
1004
51
  auto ShuffleMask = ConstantVector::get(makeArrayRef(Indexes, NumElts));
1005
51
  auto V1 = II.getArgOperand(0);
1006
51
  auto V2 = Constant::getNullValue(VecTy);
1007
51
  return Builder.CreateShuffleVector(V1, V2, ShuffleMask);
1008
58
}
1009
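The PSHUFB fold above models the instruction's per-byte rule: a control byte with bit 7 set writes zero, otherwise its low four bits index into the 16-byte source lane. A minimal standalone sketch of the 128-bit case; the helper name is illustrative:

#include <array>
#include <cstdint>

std::array<uint8_t, 16> pshufb128(const std::array<uint8_t, 16> &Src,
                                  const std::array<uint8_t, 16> &Ctrl) {
  std::array<uint8_t, 16> Dst{};
  for (unsigned I = 0; I != 16; ++I)
    Dst[I] = (Ctrl[I] & 0x80) ? uint8_t(0)           // bit 7 set -> zero
                              : Src[Ctrl[I] & 0x0F]; // else low 4 bits index
  return Dst;
}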
1010
/// Attempt to convert vpermilvar* to shufflevector if the mask is constant.
1011
static Value *simplifyX86vpermilvar(const IntrinsicInst &II,
1012
39
                                    InstCombiner::BuilderTy &Builder) {
1013
39
  Constant *V = dyn_cast<Constant>(II.getArgOperand(1));
1014
39
  if (!V)
1015
11
    return nullptr;
1016
28
1017
28
  auto *VecTy = cast<VectorType>(II.getType());
1018
28
  auto *MaskEltTy = Type::getInt32Ty(II.getContext());
1019
28
  unsigned NumElts = VecTy->getVectorNumElements();
1020
28
  bool IsPD = VecTy->getScalarType()->isDoubleTy();
1021
28
  unsigned NumLaneElts = IsPD ? 2 : 4;
1022
28
  assert(NumElts == 16 || NumElts == 8 || NumElts == 4 || NumElts == 2);
1023
28
1024
28
  // Construct a shuffle mask from constant integers or UNDEFs.
1025
28
  Constant *Indexes[16] = {nullptr};
1026
28
1027
28
  // The intrinsics only read one or two bits, clear the rest.
1028
214
  for (unsigned I = 0; I < NumElts; ++I) {
1029
186
    Constant *COp = V->getAggregateElement(I);
1030
186
    if (!COp || (!isa<UndefValue>(COp) && !isa<ConstantInt>(COp)))
1031
0
      return nullptr;
1032
186
1033
186
    if (isa<UndefValue>(COp)) {
1034
22
      Indexes[I] = UndefValue::get(MaskEltTy);
1035
22
      continue;
1036
22
    }
1037
164
1038
164
    APInt Index = cast<ConstantInt>(COp)->getValue();
1039
164
    Index = Index.zextOrTrunc(32).getLoBits(2);
1040
164
1041
164
    // The PD variants uses bit 1 to select per-lane element index, so
1042
164
    // shift down to convert to generic shuffle mask index.
1043
164
    if (IsPD)
1044
53
      Index.lshrInPlace(1);
1045
186
1046
186
    // The _256 variants are a bit trickier since the mask bits always index
1047
186
    // into the corresponding 128 half. In order to convert to a generic
1048
186
    // shuffle, we have to make that explicit.
1049
186
    Index += APInt(32, (I / NumLaneElts) * NumLaneElts);
1050
186
1051
186
    Indexes[I] = ConstantInt::get(MaskEltTy, Index);
1052
186
  }
1053
28
1054
28
  auto ShuffleMask = ConstantVector::get(makeArrayRef(Indexes, NumElts));
1055
28
  auto V1 = II.getArgOperand(0);
1056
28
  auto V2 = UndefValue::get(V1->getType());
1057
28
  return Builder.CreateShuffleVector(V1, V2, ShuffleMask);
1058
39
}
1059
1060
/// Attempt to convert vpermd/vpermps to shufflevector if the mask is constant.
1061
static Value *simplifyX86vpermv(const IntrinsicInst &II,
1062
124
                                InstCombiner::BuilderTy &Builder) {
1063
124
  auto *V = dyn_cast<Constant>(II.getArgOperand(1));
1064
124
  if (!V)
1065
3
    return nullptr;
1066
121
1067
121
  auto *VecTy = cast<VectorType>(II.getType());
1068
121
  auto *MaskEltTy = Type::getInt32Ty(II.getContext());
1069
121
  unsigned Size = VecTy->getNumElements();
1070
121
  assert((Size == 4 || Size == 8 || Size == 16 || Size == 32 || Size == 64) &&
1071
121
         "Unexpected shuffle mask size");
1072
121
1073
121
  // Construct a shuffle mask from constant integers or UNDEFs.
1074
121
  Constant *Indexes[64] = {nullptr};
1075
121
1076
2.11k
  for (unsigned I = 0; I < Size; ++I) {
1077
1.99k
    Constant *COp = V->getAggregateElement(I);
1078
1.99k
    if (!COp || (!isa<UndefValue>(COp) && !isa<ConstantInt>(COp)))
1079
0
      return nullptr;
1080
1.99k
1081
1.99k
    if (isa<UndefValue>(COp)) {
1082
31
      Indexes[I] = UndefValue::get(MaskEltTy);
1083
31
      continue;
1084
31
    }
1085
1.96k
1086
1.96k
    uint32_t Index = cast<ConstantInt>(COp)->getZExtValue();
1087
1.96k
    Index &= Size - 1;
1088
1.96k
    Indexes[I] = ConstantInt::get(MaskEltTy, Index);
1089
1.96k
  }
1090
121
1091
121
  auto ShuffleMask = ConstantVector::get(makeArrayRef(Indexes, Size));
1092
121
  auto V1 = II.getArgOperand(0);
1093
121
  auto V2 = UndefValue::get(VecTy);
1094
121
  return Builder.CreateShuffleVector(V1, V2, ShuffleMask);
1095
124
}
1096
1097
/// Decode XOP integer vector comparison intrinsics.
1098
static Value *simplifyX86vpcom(const IntrinsicInst &II,
1099
                               InstCombiner::BuilderTy &Builder,
1100
16
                               bool IsSigned) {
1101
16
  if (auto *CInt = dyn_cast<ConstantInt>(II.getArgOperand(2))) {
1102
16
    uint64_t Imm = CInt->getZExtValue() & 0x7;
1103
16
    VectorType *VecTy = cast<VectorType>(II.getType());
1104
16
    CmpInst::Predicate Pred = ICmpInst::BAD_ICMP_PREDICATE;
1105
16
1106
16
    switch (Imm) {
1107
2
    case 0x0:
1108
2
      Pred = IsSigned ? ICmpInst::ICMP_SLT : ICmpInst::ICMP_ULT;
1109
2
      break;
1110
2
    case 0x1:
1111
2
      Pred = IsSigned ? ICmpInst::ICMP_SLE : ICmpInst::ICMP_ULE;
1112
2
      break;
1113
2
    case 0x2:
1114
2
      Pred = IsSigned ? ICmpInst::ICMP_SGT : ICmpInst::ICMP_UGT;
1115
2
      break;
1116
2
    case 0x3:
1117
2
      Pred = IsSigned ? ICmpInst::ICMP_SGE : ICmpInst::ICMP_UGE;
1118
2
      break;
1119
2
    case 0x4:
1120
2
      Pred = ICmpInst::ICMP_EQ; break;
1121
2
    case 0x5:
1122
2
      Pred = ICmpInst::ICMP_NE; break;
1123
2
    case 0x6:
1124
2
      return ConstantInt::getSigned(VecTy, 0); // FALSE
1125
2
    case 0x7:
1126
2
      return ConstantInt::getSigned(VecTy, -1); // TRUE
1127
12
    }
1128
12
1129
12
    if (Value *Cmp = Builder.CreateICmp(Pred, II.getArgOperand(0),
1130
12
                                        II.getArgOperand(1)))
1131
12
      return Builder.CreateSExtOrTrunc(Cmp, VecTy);
1132
0
  }
1133
0
  return nullptr;
1134
0
}
1135
1136
// Emit a select instruction and appropriate bitcasts to help simplify
1137
// masked intrinsics.
1138
static Value *emitX86MaskSelect(Value *Mask, Value *Op0, Value *Op1,
1139
128
                                InstCombiner::BuilderTy &Builder) {
1140
128
  unsigned VWidth = Op0->getType()->getVectorNumElements();
1141
128
1142
128
  // If the mask is all ones we don't need the select. But we need to check
1143
128
  // only the bit thats will be used in case VWidth is less than 8.
1144
128
  if (auto *C = dyn_cast<ConstantInt>(Mask))
1145
64
    if (C->getValue().zextOrTrunc(VWidth).isAllOnesValue())
1146
64
      return Op0;
1147
64
1148
64
  auto *MaskTy = VectorType::get(Builder.getInt1Ty(),
1149
64
                         cast<IntegerType>(Mask->getType())->getBitWidth());
1150
64
  Mask = Builder.CreateBitCast(Mask, MaskTy);
1151
64
1152
64
  // If we have less than 8 elements, then the starting mask was an i8 and
1153
64
  // we need to extract down to the right number of elements.
1154
64
  if (VWidth < 8) {
1155
8
    uint32_t Indices[4];
1156
40
    for (unsigned i = 0; i != VWidth; ++i)
1157
32
      Indices[i] = i;
1158
8
    Mask = Builder.CreateShuffleVector(Mask, Mask,
1159
8
                                       makeArrayRef(Indices, VWidth),
1160
8
                                       "extract");
1161
8
  }
1162
128
1163
128
  return Builder.CreateSelect(Mask, Op0, Op1);
1164
128
}
1165
1166
1.13k
static Value *simplifyMinnumMaxnum(const IntrinsicInst &II) {
1167
1.13k
  Value *Arg0 = II.getArgOperand(0);
1168
1.13k
  Value *Arg1 = II.getArgOperand(1);
1169
1.13k
1170
1.13k
  // fmin(x, x) -> x
1171
1.13k
  if (Arg0 == Arg1)
1172
4
    return Arg0;
1173
1.13k
1174
1.13k
  const auto *C1 = dyn_cast<ConstantFP>(Arg1);
1175
1.13k
1176
1.13k
  // fmin(x, nan) -> x
1177
1.13k
  if (C1 && C1->isNaN())
1178
7
    return Arg0;
1179
1.12k
1180
1.12k
  // This is the value because if undef were NaN, we would return the other
1181
1.12k
  // value and cannot return a NaN unless both operands are.
1182
1.12k
  //
1183
1.12k
  // fmin(undef, x) -> x
1184
1.12k
  
if (1.12k
isa<UndefValue>(Arg0)1.12k
)
1185
2
    return Arg1;
1186
1.12k
1187
1.12k
  // fmin(x, undef) -> x
1188
1.12k
  
if (1.12k
isa<UndefValue>(Arg1)1.12k
)
1189
2
    return Arg0;
1190
1.12k
1191
1.12k
  Value *X = nullptr;
1192
1.12k
  Value *Y = nullptr;
1193
1.12k
  if (II.getIntrinsicID() == Intrinsic::minnum) {
1194
42
    // fmin(x, fmin(x, y)) -> fmin(x, y)
1195
42
    // fmin(y, fmin(x, y)) -> fmin(x, y)
1196
42
    if (match(Arg1, m_FMin(m_Value(X), m_Value(Y)))) {
1197
4
      if (Arg0 == X || Arg0 == Y)
1198
2
        return Arg1;
1199
40
    }
1200
40
1201
40
    // fmin(fmin(x, y), x) -> fmin(x, y)
1202
40
    // fmin(fmin(x, y), y) -> fmin(x, y)
1203
40
    if (match(Arg0, m_FMin(m_Value(X), m_Value(Y)))) {
1204
2
      if (Arg1 == X || Arg1 == Y)
1205
0
        return Arg0;
1206
40
    }
1207
40
1208
40
    // TODO: fmin(nnan x, inf) -> x
1209
40
    // TODO: fmin(nnan ninf x, flt_max) -> x
1210
40
    
if (40
C1 && 40
C1->isInfinity()5
) {
1211
3
      // fmin(x, -inf) -> -inf
1212
3
      if (C1->isNegative())
1213
1
        return Arg1;
1214
1.12k
    }
1215
0
  } else {
1216
1.08k
    assert(II.getIntrinsicID() == Intrinsic::maxnum);
1217
1.08k
    // fmax(x, fmax(x, y)) -> fmax(x, y)
1218
1.08k
    // fmax(y, fmax(x, y)) -> fmax(x, y)
1219
1.08k
    if (match(Arg1, m_FMax(m_Value(X), m_Value(Y)))) {
1220
4
      if (Arg0 == X || Arg0 == Y)
1221
2
        return Arg1;
1222
1.08k
    }
1223
1.08k
1224
1.08k
    // fmax(fmax(x, y), x) -> fmax(x, y)
1225
1.08k
    // fmax(fmax(x, y), y) -> fmax(x, y)
1226
1.08k
    if (match(Arg0, m_FMax(m_Value(X), m_Value(Y)))) {
1227
2
      if (Arg1 == X || Arg1 == Y)
1228
0
        return Arg0;
1229
1.08k
    }
1230
1.08k
1231
1.08k
    // TODO: fmax(nnan x, -inf) -> x
1232
1.08k
    // TODO: fmax(nnan ninf x, -flt_max) -> x
1233
1.08k
    
if (1.08k
C1 && 1.08k
C1->isInfinity()5
) {
1234
3
      // fmax(x, inf) -> inf
1235
3
      if (!C1->isNegative())
1236
1
        return Arg1;
1237
1.11k
    }
1238
1.08k
  }
1239
1.11k
  return nullptr;
1240
1.11k
}
1241
1242
500
static bool maskIsAllOneOrUndef(Value *Mask) {
1243
500
  auto *ConstMask = dyn_cast<Constant>(Mask);
1244
500
  if (!ConstMask)
1245
489
    return false;
1246
11
  if (ConstMask->isAllOnesValue() || isa<UndefValue>(ConstMask))
1247
2
    return true;
1248
15
  for (unsigned I = 0, E = ConstMask->getType()->getVectorNumElements(); I != E;
1249
9
       ++I) {
1250
14
    if (auto *MaskElt = ConstMask->getAggregateElement(I))
1251
14
      if (MaskElt->isAllOnesValue() || isa<UndefValue>(MaskElt))
1252
6
        continue;
1253
8
    return false;
1254
8
  }
1255
1
  return true;
1256
500
}
1257
1258
static Value *simplifyMaskedLoad(const IntrinsicInst &II,
1259
500
                                 InstCombiner::BuilderTy &Builder) {
1260
500
  // If the mask is all ones or undefs, this is a plain vector load of the 1st
1261
500
  // argument.
1262
500
  if (maskIsAllOneOrUndef(II.getArgOperand(2))) {
1263
3
    Value *LoadPtr = II.getArgOperand(0);
1264
3
    unsigned Alignment = cast<ConstantInt>(II.getArgOperand(1))->getZExtValue();
1265
3
    return Builder.CreateAlignedLoad(LoadPtr, Alignment, "unmaskedload");
1266
3
  }
1267
497
1268
497
  return nullptr;
1269
497
}
1270
1271
489
static Instruction *simplifyMaskedStore(IntrinsicInst &II, InstCombiner &IC) {
1272
489
  auto *ConstMask = dyn_cast<Constant>(II.getArgOperand(3));
1273
489
  if (!ConstMask)
1274
477
    return nullptr;
1275
12
1276
12
  // If the mask is all zeros, this instruction does nothing.
1277
12
  
  if (ConstMask->isNullValue())
1278
2
    return IC.eraseInstFromFunction(II);
1279
10
1280
10
  // If the mask is all ones, this is a plain vector store of the 1st argument.
1281
10
  
  if (ConstMask->isAllOnesValue()) {
1282
2
    Value *StorePtr = II.getArgOperand(1);
1283
2
    unsigned Alignment = cast<ConstantInt>(II.getArgOperand(2))->getZExtValue();
1284
2
    return new StoreInst(II.getArgOperand(0), StorePtr, false, Alignment);
1285
2
  }
1286
8
1287
8
  return nullptr;
1288
8
}
1289
1290
224
static Instruction *simplifyMaskedGather(IntrinsicInst &II, InstCombiner &IC) {
1291
224
  // If the mask is all zeros, return the "passthru" argument of the gather.
1292
224
  auto *ConstMask = dyn_cast<Constant>(II.getArgOperand(2));
1293
224
  if (ConstMask && ConstMask->isNullValue())
1294
1
    return IC.replaceInstUsesWith(II, II.getArgOperand(3));
1295
223
1296
223
  return nullptr;
1297
223
}
1298
1299
105
static Instruction *simplifyMaskedScatter(IntrinsicInst &II, InstCombiner &IC) {
1300
105
  // If the mask is all zeros, a scatter does nothing.
1301
105
  auto *ConstMask = dyn_cast<Constant>(II.getArgOperand(3));
1302
105
  if (ConstMask && ConstMask->isNullValue())
1303
1
    return IC.eraseInstFromFunction(II);
1304
104
1305
104
  return nullptr;
1306
104
}
1307
1308
59.7k
static Instruction *foldCttzCtlz(IntrinsicInst &II, InstCombiner &IC) {
1309
59.7k
  assert((II.getIntrinsicID() == Intrinsic::cttz ||
1310
59.7k
          II.getIntrinsicID() == Intrinsic::ctlz) &&
1311
59.7k
         "Expected cttz or ctlz intrinsic");
1312
59.7k
  Value *Op0 = II.getArgOperand(0);
1313
59.7k
1314
59.7k
  KnownBits Known = IC.computeKnownBits(Op0, 0, &II);
1315
59.7k
1316
59.7k
  // Create a mask for bits above (ctlz) or below (cttz) the first known one.
1317
59.7k
  bool IsTZ = II.getIntrinsicID() == Intrinsic::cttz;
1318
8.66k
  unsigned PossibleZeros = IsTZ ? Known.countMaxTrailingZeros()
1319
51.0k
                                : Known.countMaxLeadingZeros();
1320
8.66k
  unsigned DefiniteZeros = IsTZ ? Known.countMinTrailingZeros()
1321
51.0k
                                : Known.countMinLeadingZeros();
1322
59.7k
1323
59.7k
  // If all bits above (ctlz) or below (cttz) the first known one are known
1324
59.7k
  // zero, this value is constant.
1325
59.7k
  // FIXME: This should be in InstSimplify because we're replacing an
1326
59.7k
  // instruction with a constant.
1327
59.7k
  if (PossibleZeros == DefiniteZeros) {
1328
4
    auto *C = ConstantInt::get(Op0->getType(), DefiniteZeros);
1329
4
    return IC.replaceInstUsesWith(II, C);
1330
4
  }
1331
59.7k
1332
59.7k
  // If the input to cttz/ctlz is known to be non-zero,
1333
59.7k
  // then change the 'ZeroIsUndef' parameter to 'true'
1334
59.7k
  // because we know the zero behavior can't affect the result.
1335
59.7k
  
  if (!Known.One.isNullValue() ||
1336
59.6k
      isKnownNonZero(Op0, IC.getDataLayout(), 0, &IC.getAssumptionCache(), &II,
1337
59.7k
                     &IC.getDominatorTree())) {
1338
104
    if (!match(II.getArgOperand(1), m_One())) {
1339
4
      II.setOperand(1, IC.Builder.getTrue());
1340
4
      return &II;
1341
4
    }
1342
59.7k
  }
1343
59.7k
1344
59.7k
  // Add range metadata since known bits can't completely reflect what we know.
1345
59.7k
  // TODO: Handle splat vectors.
1346
59.7k
  auto *IT = dyn_cast<IntegerType>(Op0->getType());
1347
59.7k
  if (IT && IT->getBitWidth() != 1 && !II.getMetadata(LLVMContext::MD_range)) {
1348
4.13k
    Metadata *LowAndHigh[] = {
1349
4.13k
        ConstantAsMetadata::get(ConstantInt::get(IT, DefiniteZeros)),
1350
4.13k
        ConstantAsMetadata::get(ConstantInt::get(IT, PossibleZeros + 1))};
1351
4.13k
    II.setMetadata(LLVMContext::MD_range,
1352
4.13k
                   MDNode::get(II.getContext(), LowAndHigh));
1353
4.13k
    return &II;
1354
4.13k
  }
1355
55.5k
1356
55.5k
  return nullptr;
1357
55.5k
}
1358
1359
268
static Instruction *foldCtpop(IntrinsicInst &II, InstCombiner &IC) {
1360
268
  assert(II.getIntrinsicID() == Intrinsic::ctpop &&
1361
268
         "Expected ctpop intrinsic");
1362
268
  Value *Op0 = II.getArgOperand(0);
1363
268
  // FIXME: Try to simplify vectors of integers.
1364
268
  auto *IT = dyn_cast<IntegerType>(Op0->getType());
1365
268
  if (!IT)
1366
176
    return nullptr;
1367
92
1368
92
  unsigned BitWidth = IT->getBitWidth();
1369
92
  KnownBits Known(BitWidth);
1370
92
  IC.computeKnownBits(Op0, Known, 0, &II);
1371
92
1372
92
  unsigned MinCount = Known.countMinPopulation();
1373
92
  unsigned MaxCount = Known.countMaxPopulation();
1374
92
1375
92
  // Add range metadata since known bits can't completely reflect what we know.
1376
92
  if (IT->getBitWidth() != 1 && !II.getMetadata(LLVMContext::MD_range)) {
1377
14
    Metadata *LowAndHigh[] = {
1378
14
        ConstantAsMetadata::get(ConstantInt::get(IT, MinCount)),
1379
14
        ConstantAsMetadata::get(ConstantInt::get(IT, MaxCount + 1))};
1380
14
    II.setMetadata(LLVMContext::MD_range,
1381
14
                   MDNode::get(II.getContext(), LowAndHigh));
1382
14
    return &II;
1383
14
  }
1384
78
1385
78
  return nullptr;
1386
78
}
1387
1388
// TODO: If the x86 backend knew how to convert a bool vector mask back to an
1389
// XMM register mask efficiently, we could transform all x86 masked intrinsics
1390
// to LLVM masked intrinsics and remove the x86 masked intrinsic defs.
1391
12
static Instruction *simplifyX86MaskedLoad(IntrinsicInst &II, InstCombiner &IC) {
1392
12
  Value *Ptr = II.getOperand(0);
1393
12
  Value *Mask = II.getOperand(1);
1394
12
  Constant *ZeroVec = Constant::getNullValue(II.getType());
1395
12
1396
12
  // Special case a zero mask since that's not a ConstantDataVector.
1397
12
  // This masked load instruction creates a zero vector.
1398
12
  if (isa<ConstantAggregateZero>(Mask))
1399
1
    return IC.replaceInstUsesWith(II, ZeroVec);
1400
11
1401
11
  auto *ConstMask = dyn_cast<ConstantDataVector>(Mask);
1402
11
  if (!ConstMask)
1403
1
    return nullptr;
1404
10
1405
10
  // The mask is constant. Convert this x86 intrinsic to the LLVM intrinsic
1406
10
  // to allow target-independent optimizations.
1407
10
1408
10
  // First, cast the x86 intrinsic scalar pointer to a vector pointer to match
1409
10
  // the LLVM intrinsic definition for the pointer argument.
1410
10
  unsigned AddrSpace = cast<PointerType>(Ptr->getType())->getAddressSpace();
1411
10
  PointerType *VecPtrTy = PointerType::get(II.getType(), AddrSpace);
1412
10
  Value *PtrCast = IC.Builder.CreateBitCast(Ptr, VecPtrTy, "castvec");
1413
10
1414
10
  // Second, convert the x86 XMM integer vector mask to a vector of bools based
1415
10
  // on each element's most significant bit (the sign bit).
1416
10
  Constant *BoolMask = getNegativeIsTrueBoolVec(ConstMask);
1417
10
1418
10
  // The pass-through vector for an x86 masked load is a zero vector.
1419
10
  CallInst *NewMaskedLoad =
1420
10
      IC.Builder.CreateMaskedLoad(PtrCast, 1, BoolMask, ZeroVec);
1421
10
  return IC.replaceInstUsesWith(II, NewMaskedLoad);
1422
10
}
1423
1424
// TODO: If the x86 backend knew how to convert a bool vector mask back to an
1425
// XMM register mask efficiently, we could transform all x86 masked intrinsics
1426
// to LLVM masked intrinsics and remove the x86 masked intrinsic defs.
1427
13
static bool simplifyX86MaskedStore(IntrinsicInst &II, InstCombiner &IC) {
1428
13
  Value *Ptr = II.getOperand(0);
1429
13
  Value *Mask = II.getOperand(1);
1430
13
  Value *Vec = II.getOperand(2);
1431
13
1432
13
  // Special case a zero mask since that's not a ConstantDataVector:
1433
13
  // this masked store instruction does nothing.
1434
13
  if (isa<ConstantAggregateZero>(Mask)) {
1435
2
    IC.eraseInstFromFunction(II);
1436
2
    return true;
1437
2
  }
1438
11
1439
11
  // The SSE2 version is too weird (eg, unaligned but non-temporal) to do
1440
11
  // anything else at this level.
1441
11
  
  if (II.getIntrinsicID() == Intrinsic::x86_sse2_maskmov_dqu)
1442
0
    return false;
1443
11
1444
11
  auto *ConstMask = dyn_cast<ConstantDataVector>(Mask);
1445
11
  if (!ConstMask)
1446
1
    return false;
1447
10
1448
10
  // The mask is constant. Convert this x86 intrinsic to the LLVM intrinsic
1449
10
  // to allow target-independent optimizations.
1450
10
1451
10
  // First, cast the x86 intrinsic scalar pointer to a vector pointer to match
1452
10
  // the LLVM intrinsic definition for the pointer argument.
1453
10
  unsigned AddrSpace = cast<PointerType>(Ptr->getType())->getAddressSpace();
1454
10
  PointerType *VecPtrTy = PointerType::get(Vec->getType(), AddrSpace);
1455
10
  Value *PtrCast = IC.Builder.CreateBitCast(Ptr, VecPtrTy, "castvec");
1456
10
1457
10
  // Second, convert the x86 XMM integer vector mask to a vector of bools based
1458
10
  // on each element's most significant bit (the sign bit).
1459
10
  Constant *BoolMask = getNegativeIsTrueBoolVec(ConstMask);
1460
10
1461
10
  IC.Builder.CreateMaskedStore(Vec, PtrCast, 1, BoolMask);
1462
10
1463
10
  // 'Replace uses' doesn't work for stores. Erase the original masked store.
1464
10
  IC.eraseInstFromFunction(II);
1465
10
  return true;
1466
10
}
1467
1468
// Constant fold llvm.amdgcn.fmed3 intrinsics for standard inputs.
1469
//
1470
// A single NaN input is folded to minnum, so we rely on that folding for
1471
// handling NaNs.
1472
static APFloat fmed3AMDGCN(const APFloat &Src0, const APFloat &Src1,
1473
6
                           const APFloat &Src2) {
1474
6
  APFloat Max3 = maxnum(maxnum(Src0, Src1), Src2);
1475
6
1476
6
  APFloat::cmpResult Cmp0 = Max3.compare(Src0);
1477
6
  assert(Cmp0 != APFloat::cmpUnordered && "nans handled separately");
1478
6
  if (Cmp0 == APFloat::cmpEqual)
1479
2
    return maxnum(Src1, Src2);
1480
4
1481
4
  APFloat::cmpResult Cmp1 = Max3.compare(Src1);
1482
4
  assert(Cmp1 != APFloat::cmpUnordered && "nans handled separately");
1483
4
  if (Cmp1 == APFloat::cmpEqual)
1484
2
    return maxnum(Src0, Src2);
1485
2
1486
2
  return maxnum(Src0, Src1);
1487
2
}
1488
1489
// Returns true iff the 2 intrinsics have the same operands, limiting the
1490
// comparison to the first NumOperands.
1491
static bool haveSameOperands(const IntrinsicInst &I, const IntrinsicInst &E,
1492
12
                             unsigned NumOperands) {
1493
12
  assert(I.getNumArgOperands() >= NumOperands && "Not enough operands");
1494
12
  assert(E.getNumArgOperands() >= NumOperands && "Not enough operands");
1495
30
  for (unsigned i = 0; i < NumOperands; i++)
1496
20
    
    if (I.getArgOperand(i) != E.getArgOperand(i))
1497
2
      return false;
1498
10
  return true;
1499
12
}
1500
1501
// Remove trivially empty start/end intrinsic ranges, i.e. a start
1502
// immediately followed by an end (ignoring debuginfo or other
1503
// start/end intrinsics in between). As this handles only the most trivial
1504
// cases, tracking the nesting level is not needed:
1505
//
1506
//   call @llvm.foo.start(i1 0) ; &I
1507
//   call @llvm.foo.start(i1 0)
1508
//   call @llvm.foo.end(i1 0) ; This one will not be skipped: it will be removed
1509
//   call @llvm.foo.end(i1 0)
1510
static bool removeTriviallyEmptyRange(IntrinsicInst &I, unsigned StartID,
1511
858k
                                      unsigned EndID, InstCombiner &IC) {
1512
858k
  assert(I.getIntrinsicID() == StartID &&
1513
858k
         "Start intrinsic does not have expected ID");
1514
858k
  BasicBlock::iterator BI(I), BE(I.getParent()->end());
1515
893k
  for (++BI; BI != BE; ++BI) {
1516
893k
    if (auto *E = dyn_cast<IntrinsicInst>(BI)) {
1517
85.0k
      if (isa<DbgInfoIntrinsic>(E) || E->getIntrinsicID() == StartID)
1518
34.6k
        continue;
1519
50.3k
      
if (50.3k
E->getIntrinsicID() == EndID &&
1520
50.3k
          
haveSameOperands(I, *E, E->getNumArgOperands())12
) {
1521
10
        IC.eraseInstFromFunction(*E);
1522
10
        IC.eraseInstFromFunction(I);
1523
10
        return true;
1524
10
      }
1525
858k
    }
1526
858k
    break;
1527
858k
  }
1528
858k
1529
858k
  return false;
1530
858k
}
1531
1532
// Convert NVVM intrinsics to target-generic LLVM code where possible.
1533
2.95M
static Instruction *SimplifyNVVMIntrinsic(IntrinsicInst *II, InstCombiner &IC) {
1534
2.95M
  // Each NVVM intrinsic we can simplify can be replaced with one of:
1535
2.95M
  //
1536
2.95M
  //  * an LLVM intrinsic,
1537
2.95M
  //  * an LLVM cast operation,
1538
2.95M
  //  * an LLVM binary operation, or
1539
2.95M
  //  * ad-hoc LLVM IR for the particular operation.
1540
2.95M
1541
2.95M
  // Some transformations are only valid when the module's
1542
2.95M
  // flush-denormals-to-zero (ftz) setting is true/false, whereas other
1543
2.95M
  // transformations are valid regardless of the module's ftz setting.
1544
2.95M
  enum FtzRequirementTy {
1545
2.95M
    FTZ_Any,       // Any ftz setting is ok.
1546
2.95M
    FTZ_MustBeOn,  // Transformation is valid only if ftz is on.
1547
2.95M
    FTZ_MustBeOff, // Transformation is valid only if ftz is off.
1548
2.95M
  };
1549
2.95M
  // Classes of NVVM intrinsics that can't be replaced one-to-one with a
1550
2.95M
  // target-generic intrinsic, cast op, or binary op but that we can nonetheless
1551
2.95M
  // simplify.
1552
2.95M
  enum SpecialCase {
1553
2.95M
    SPC_Reciprocal,
1554
2.95M
  };
1555
2.95M
1556
2.95M
  // SimplifyAction is a poor-man's variant (plus an additional flag) that
1557
2.95M
  // represents how to replace an NVVM intrinsic with target-generic LLVM IR.
1558
2.95M
  struct SimplifyAction {
1559
2.95M
    // Invariant: At most one of these Optionals has a value.
1560
2.95M
    Optional<Intrinsic::ID> IID;
1561
2.95M
    Optional<Instruction::CastOps> CastOp;
1562
2.95M
    Optional<Instruction::BinaryOps> BinaryOp;
1563
2.95M
    Optional<SpecialCase> Special;
1564
2.95M
1565
2.95M
    FtzRequirementTy FtzRequirement = FTZ_Any;
1566
2.95M
1567
2.95M
    SimplifyAction() = default;
1568
2.95M
1569
2.95M
    SimplifyAction(Intrinsic::ID IID, FtzRequirementTy FtzReq)
1570
56
        : IID(IID), FtzRequirement(FtzReq) {}
1571
2.95M
1572
2.95M
    // Cast operations don't have anything to do with FTZ, so we skip that
1573
2.95M
    // argument.
1574
32
    SimplifyAction(Instruction::CastOps CastOp) : CastOp(CastOp) {}
1575
2.95M
1576
2.95M
    SimplifyAction(Instruction::BinaryOps BinaryOp, FtzRequirementTy FtzReq)
1577
18
        : BinaryOp(BinaryOp), FtzRequirement(FtzReq) {}
1578
2.95M
1579
2.95M
    SimplifyAction(SpecialCase Special, FtzRequirementTy FtzReq)
1580
4
        : Special(Special), FtzRequirement(FtzReq) {}
1581
2.95M
  };
1582
2.95M
1583
2.95M
  // Try to generate a SimplifyAction describing how to replace our
1584
2.95M
  // IntrinsicInstr with target-generic LLVM IR.
1585
2.95M
  const SimplifyAction Action = [II]() -> SimplifyAction {
1586
2.95M
    switch (II->getIntrinsicID()) {
1587
2.95M
1588
2.95M
    // NVVM intrinsics that map directly to LLVM intrinsics.
1589
2
    case Intrinsic::nvvm_ceil_d:
1590
2
      return {Intrinsic::ceil, FTZ_Any};
1591
2
    case Intrinsic::nvvm_ceil_f:
1592
2
      return {Intrinsic::ceil, FTZ_MustBeOff};
1593
2
    case Intrinsic::nvvm_ceil_ftz_f:
1594
2
      return {Intrinsic::ceil, FTZ_MustBeOn};
1595
2
    case Intrinsic::nvvm_fabs_d:
1596
2
      return {Intrinsic::fabs, FTZ_Any};
1597
2
    case Intrinsic::nvvm_fabs_f:
1598
2
      return {Intrinsic::fabs, FTZ_MustBeOff};
1599
2
    case Intrinsic::nvvm_fabs_ftz_f:
1600
2
      return {Intrinsic::fabs, FTZ_MustBeOn};
1601
2
    case Intrinsic::nvvm_floor_d:
1602
2
      return {Intrinsic::floor, FTZ_Any};
1603
2
    case Intrinsic::nvvm_floor_f:
1604
2
      return {Intrinsic::floor, FTZ_MustBeOff};
1605
2
    case Intrinsic::nvvm_floor_ftz_f:
1606
2
      return {Intrinsic::floor, FTZ_MustBeOn};
1607
2
    case Intrinsic::nvvm_fma_rn_d:
1608
2
      return {Intrinsic::fma, FTZ_Any};
1609
2
    case Intrinsic::nvvm_fma_rn_f:
1610
2
      return {Intrinsic::fma, FTZ_MustBeOff};
1611
2
    case Intrinsic::nvvm_fma_rn_ftz_f:
1612
2
      return {Intrinsic::fma, FTZ_MustBeOn};
1613
2
    case Intrinsic::nvvm_fmax_d:
1614
2
      return {Intrinsic::maxnum, FTZ_Any};
1615
2
    case Intrinsic::nvvm_fmax_f:
1616
2
      return {Intrinsic::maxnum, FTZ_MustBeOff};
1617
2
    case Intrinsic::nvvm_fmax_ftz_f:
1618
2
      return {Intrinsic::maxnum, FTZ_MustBeOn};
1619
2
    case Intrinsic::nvvm_fmin_d:
1620
2
      return {Intrinsic::minnum, FTZ_Any};
1621
2
    case Intrinsic::nvvm_fmin_f:
1622
2
      return {Intrinsic::minnum, FTZ_MustBeOff};
1623
2
    case Intrinsic::nvvm_fmin_ftz_f:
1624
2
      return {Intrinsic::minnum, FTZ_MustBeOn};
1625
2
    case Intrinsic::nvvm_round_d:
1626
2
      return {Intrinsic::round, FTZ_Any};
1627
2
    case Intrinsic::nvvm_round_f:
1628
2
      return {Intrinsic::round, FTZ_MustBeOff};
1629
2
    case Intrinsic::nvvm_round_ftz_f:
1630
2
      return {Intrinsic::round, FTZ_MustBeOn};
1631
2
    case Intrinsic::nvvm_sqrt_rn_d:
1632
2
      return {Intrinsic::sqrt, FTZ_Any};
1633
2
    case Intrinsic::nvvm_sqrt_f:
1634
2
      // nvvm_sqrt_f is a special case.  For  most intrinsics, foo_ftz_f is the
1635
2
      // ftz version, and foo_f is the non-ftz version.  But nvvm_sqrt_f adopts
1636
2
      // the ftz-ness of the surrounding code.  sqrt_rn_f and sqrt_rn_ftz_f are
1637
2
      // the versions with explicit ftz-ness.
1638
2
      return {Intrinsic::sqrt, FTZ_Any};
1639
2
    case Intrinsic::nvvm_sqrt_rn_f:
1640
2
      return {Intrinsic::sqrt, FTZ_MustBeOff};
1641
2
    case Intrinsic::nvvm_sqrt_rn_ftz_f:
1642
2
      return {Intrinsic::sqrt, FTZ_MustBeOn};
1643
2
    case Intrinsic::nvvm_trunc_d:
1644
2
      return {Intrinsic::trunc, FTZ_Any};
1645
2
    case Intrinsic::nvvm_trunc_f:
1646
2
      return {Intrinsic::trunc, FTZ_MustBeOff};
1647
2
    case Intrinsic::nvvm_trunc_ftz_f:
1648
2
      return {Intrinsic::trunc, FTZ_MustBeOn};
1649
2.95M
1650
2.95M
    // NVVM intrinsics that map to LLVM cast operations.
1651
2.95M
    //
1652
2.95M
    // Note that llvm's target-generic conversion operators correspond to the rz
1653
2.95M
    // (round to zero) versions of the nvvm conversion intrinsics, even though
1654
2.95M
    // most everything else here uses the rn (round to nearest even) nvvm ops.
1655
8
    case Intrinsic::nvvm_d2i_rz:
1656
8
    case Intrinsic::nvvm_f2i_rz:
1657
8
    case Intrinsic::nvvm_d2ll_rz:
1658
8
    case Intrinsic::nvvm_f2ll_rz:
1659
8
      return {Instruction::FPToSI};
1660
8
    case Intrinsic::nvvm_d2ui_rz:
1661
8
    case Intrinsic::nvvm_f2ui_rz:
1662
8
    case Intrinsic::nvvm_d2ull_rz:
1663
8
    case Intrinsic::nvvm_f2ull_rz:
1664
8
      return {Instruction::FPToUI};
1665
8
    case Intrinsic::nvvm_i2d_rz:
1666
8
    case Intrinsic::nvvm_i2f_rz:
1667
8
    case Intrinsic::nvvm_ll2d_rz:
1668
8
    case Intrinsic::nvvm_ll2f_rz:
1669
8
      return {Instruction::SIToFP};
1670
8
    case Intrinsic::nvvm_ui2d_rz:
1671
8
    case Intrinsic::nvvm_ui2f_rz:
1672
8
    case Intrinsic::nvvm_ull2d_rz:
1673
8
    case Intrinsic::nvvm_ull2f_rz:
1674
8
      return {Instruction::UIToFP};
1675
8
1676
8
    // NVVM intrinsics that map to LLVM binary ops.
1677
2
    case Intrinsic::nvvm_add_rn_d:
1678
2
      return {Instruction::FAdd, FTZ_Any};
1679
2
    case Intrinsic::nvvm_add_rn_f:
1680
2
      return {Instruction::FAdd, FTZ_MustBeOff};
1681
2
    case Intrinsic::nvvm_add_rn_ftz_f:
1682
2
      return {Instruction::FAdd, FTZ_MustBeOn};
1683
2
    case Intrinsic::nvvm_mul_rn_d:
1684
2
      return {Instruction::FMul, FTZ_Any};
1685
2
    case Intrinsic::nvvm_mul_rn_f:
1686
2
      return {Instruction::FMul, FTZ_MustBeOff};
1687
2
    case Intrinsic::nvvm_mul_rn_ftz_f:
1688
2
      return {Instruction::FMul, FTZ_MustBeOn};
1689
2
    case Intrinsic::nvvm_div_rn_d:
1690
2
      return {Instruction::FDiv, FTZ_Any};
1691
2
    case Intrinsic::nvvm_div_rn_f:
1692
2
      return {Instruction::FDiv, FTZ_MustBeOff};
1693
2
    case Intrinsic::nvvm_div_rn_ftz_f:
1694
2
      return {Instruction::FDiv, FTZ_MustBeOn};
1695
8
1696
8
    // The remainder of cases are NVVM intrinsics that map to LLVM idioms, but
1697
8
    // need special handling.
1698
8
    //
1699
8
    // We seem to be missing intrinsics for rcp.approx.{ftz.}f32, which is just
1700
8
    // as well.
1701
0
    case Intrinsic::nvvm_rcp_rn_d:
1702
0
      return {SPC_Reciprocal, FTZ_Any};
1703
2
    case Intrinsic::nvvm_rcp_rn_f:
1704
2
      return {SPC_Reciprocal, FTZ_MustBeOff};
1705
2
    case Intrinsic::nvvm_rcp_rn_ftz_f:
1706
2
      return {SPC_Reciprocal, FTZ_MustBeOn};
1707
8
1708
8
    // We do not currently simplify intrinsics that give an approximate answer.
1709
8
    // These include:
1710
8
    //
1711
8
    //   - nvvm_cos_approx_{f,ftz_f}
1712
8
    //   - nvvm_ex2_approx_{d,f,ftz_f}
1713
8
    //   - nvvm_lg2_approx_{d,f,ftz_f}
1714
8
    //   - nvvm_sin_approx_{f,ftz_f}
1715
8
    //   - nvvm_sqrt_approx_{f,ftz_f}
1716
8
    //   - nvvm_rsqrt_approx_{d,f,ftz_f}
1717
8
    //   - nvvm_div_approx_{ftz_d,ftz_f,f}
1718
8
    //   - nvvm_rcp_approx_ftz_d
1719
8
    //
1720
8
    // Ideally we'd encode them as e.g. "fast call @llvm.cos", where "fast"
1721
8
    // means that fastmath is enabled in the intrinsic.  Unfortunately only
1722
8
    // binary operators (currently) have a fastmath bit in SelectionDAG, so this
1723
8
    // information gets lost and we can't select on it.
1724
8
    //
1725
8
    // TODO: div and rcp are lowered to a binary op, so these we could in theory
1726
8
    // lower them to "fast fdiv".
1727
8
1728
2.95M
    default:
1729
2.95M
      return {};
1730
0
    }
1731
0
  }();
1732
2.95M
1733
2.95M
  // If Action.FtzRequirementTy is not satisfied by the module's ftz state, we
1734
2.95M
  // can bail out now.  (Notice that in the case that IID is not an NVVM
1735
2.95M
  // intrinsic, we don't have to look up any module metadata, as
1736
2.95M
  // FtzRequirementTy will be FTZ_Any.)
1737
2.95M
  if (Action.FtzRequirement != FTZ_Any) {
1738
52
    bool FtzEnabled =
1739
52
        II->getFunction()->getFnAttribute("nvptx-f32ftz").getValueAsString() ==
1740
52
        "true";
1741
52
1742
52
    if (FtzEnabled != (Action.FtzRequirement == FTZ_MustBeOn))
1743
26
      return nullptr;
1744
2.95M
  }
1745
2.95M
1746
2.95M
  // Simplify to target-generic intrinsic.
1747
2.95M
  
  if (Action.IID) {
1748
38
    SmallVector<Value *, 4> Args(II->arg_operands());
1749
38
    // All the target-generic intrinsics currently of interest to us have one
1750
38
    // type argument, equal to that of the nvvm intrinsic's argument.
1751
38
    Type *Tys[] = {II->getArgOperand(0)->getType()};
1752
38
    return CallInst::Create(
1753
38
        Intrinsic::getDeclaration(II->getModule(), *Action.IID, Tys), Args);
1754
38
  }
1755
2.95M
1756
2.95M
  // Simplify to target-generic binary op.
1757
2.95M
  
  if (Action.BinaryOp)
1758
12
    return BinaryOperator::Create(*Action.BinaryOp, II->getArgOperand(0),
1759
12
                                  II->getArgOperand(1), II->getName());
1760
2.95M
1761
2.95M
  // Simplify to target-generic cast op.
1762
2.95M
  
  if (Action.CastOp)
1763
32
    return CastInst::Create(*Action.CastOp, II->getArgOperand(0), II->getType(),
1764
32
                            II->getName());
1765
2.95M
1766
2.95M
  // All that's left are the special cases.
1767
2.95M
  
  if (!Action.Special)
1768
2.95M
    return nullptr;
1769
2
1770
2
  switch (*Action.Special) {
1771
2
  case SPC_Reciprocal:
1772
2
    // Simplify reciprocal.
1773
2
    return BinaryOperator::Create(
1774
2
        Instruction::FDiv, ConstantFP::get(II->getArgOperand(0)->getType(), 1),
1775
2
        II->getArgOperand(0), II->getName());
1776
0
  }
1777
0
  
  llvm_unreachable("All SpecialCase enumerators should be handled in switch.");
1778
0
}
1779
1780
1.73k
Instruction *InstCombiner::visitVAStartInst(VAStartInst &I) {
1781
1.73k
  removeTriviallyEmptyRange(I, Intrinsic::vastart, Intrinsic::vaend, *this);
1782
1.73k
  return nullptr;
1783
1.73k
}
1784
1785
773
Instruction *InstCombiner::visitVACopyInst(VACopyInst &I) {
1786
773
  removeTriviallyEmptyRange(I, Intrinsic::vacopy, Intrinsic::vaend, *this);
1787
773
  return nullptr;
1788
773
}
1789
1790
/// CallInst simplification. This mostly only handles folding of intrinsic
1791
/// instructions. For normal calls, it allows visitCallSite to do the heavy
1792
/// lifting.
1793
28.0M
Instruction *InstCombiner::visitCallInst(CallInst &CI) {
1794
28.0M
  auto Args = CI.arg_operands();
1795
28.0M
  if (Value *V = SimplifyCall(&CI, CI.getCalledValue(), Args.begin(),
1796
28.0M
                              Args.end(), SQ.getWithInstruction(&CI)))
1797
14
    return replaceInstUsesWith(CI, V);
1798
28.0M
1799
28.0M
  
  if (isFreeCall(&CI, &TLI))
1800
398k
    return visitFree(CI);
1801
27.6M
1802
27.6M
  // If the caller function is nounwind, mark the call as nounwind, even if the
1803
27.6M
  // callee isn't.
1804
27.6M
  
  if (CI.getFunction()->doesNotThrow() && !CI.doesNotThrow()) {
1805
109k
    CI.setDoesNotThrow();
1806
109k
    return &CI;
1807
109k
  }
1808
27.5M
1809
27.5M
  IntrinsicInst *II = dyn_cast<IntrinsicInst>(&CI);
1810
27.5M
  if (!II) return visitCallSite(&CI);
1811
2.95M
1812
2.95M
  // Intrinsics cannot occur in an invoke, so handle them here instead of in
1813
2.95M
  // visitCallSite.
1814
2.95M
  
  if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(II)) {
1815
461k
    bool Changed = false;
1816
461k
1817
461k
    // memmove/cpy/set of zero bytes is a noop.
1818
461k
    if (Constant *NumBytes = dyn_cast<Constant>(MI->getLength())) {
1819
393k
      if (NumBytes->isNullValue())
1820
1.42k
        return eraseInstFromFunction(CI);
1821
392k
1822
392k
      
      if (ConstantInt *CI = dyn_cast<ConstantInt>(NumBytes))
1823
392k
        if (CI->getZExtValue() == 1) {
1824
101
          // Replace the instruction with just byte operations.  We would
1825
101
          // transform other cases to loads/stores, but we don't know if
1826
101
          // alignment is sufficient.
1827
101
        }
1828
393k
    }
1829
461k
1830
461k
    // No other transformations apply to volatile transfers.
1831
459k
    
    if (MI->isVolatile())
1832
20
      return nullptr;
1833
459k
1834
459k
    // If we have a memmove and the source operation is a constant global,
1835
459k
    // then the source and dest pointers can't alias, so we can change this
1836
459k
    // into a call to memcpy.
1837
459k
    
    if (MemMoveInst *MMI = dyn_cast<MemMoveInst>(MI)) {
1838
3.10k
      if (GlobalVariable *GVSrc = dyn_cast<GlobalVariable>(MMI->getSource()))
1839
12
        
        if (GVSrc->isConstant()) {
1840
4
          Module *M = CI.getModule();
1841
4
          Intrinsic::ID MemCpyID = Intrinsic::memcpy;
1842
4
          Type *Tys[3] = { CI.getArgOperand(0)->getType(),
1843
4
                           CI.getArgOperand(1)->getType(),
1844
4
                           CI.getArgOperand(2)->getType() };
1845
4
          CI.setCalledFunction(Intrinsic::getDeclaration(M, MemCpyID, Tys));
1846
4
          Changed = true;
1847
4
        }
1848
3.10k
    }
1849
459k
1850
459k
    if (MemTransferInst *MTI = dyn_cast<MemTransferInst>(MI)) {
1851
154k
      // memmove(x,x,size) -> noop.
1852
154k
      if (MTI->getSource() == MTI->getDest())
1853
4
        return eraseInstFromFunction(CI);
1854
459k
    }
1855
459k
1856
459k
    // If we can determine a pointer alignment that is bigger than currently
1857
459k
    // set, update the alignment.
1858
459k
    
    if (isa<MemTransferInst>(MI)) {
1859
154k
      if (Instruction *I = SimplifyMemTransfer(MI))
1860
1.13k
        return I;
1861
305k
    } else if (MemSetInst *MSI = dyn_cast<MemSetInst>(MI)) {
1862
305k
      if (Instruction *I = SimplifyMemSet(MSI))
1863
3.75k
        return I;
1864
454k
    }
1865
454k
1866
454k
    
    if (Changed) return II;
1867
2.95M
  }
1868
2.95M
1869
2.95M
  
  if (auto *AMI = dyn_cast<ElementUnorderedAtomicMemCpyInst>(II)) {
1870
7
    if (Constant *C = dyn_cast<Constant>(AMI->getLength()))
1871
7
      
      if (C->isNullValue())
1872
3
        return eraseInstFromFunction(*AMI);
1873
4
1874
4
    
    if (Instruction *I = SimplifyElementUnorderedAtomicMemCpy(AMI))
1875
2
      return I;
1876
2.95M
  }
1877
2.95M
1878
2.95M
  
  if (Instruction *I = SimplifyNVVMIntrinsic(II, *this))
1879
84
    return I;
1880
2.95M
1881
2.95M
  auto SimplifyDemandedVectorEltsLow = [this](Value *Op, unsigned Width,
1882
468
                                              unsigned DemandedWidth) {
1883
468
    APInt UndefElts(Width, 0);
1884
468
    APInt DemandedElts = APInt::getLowBitsSet(Width, DemandedWidth);
1885
468
    return SimplifyDemandedVectorElts(Op, DemandedElts, UndefElts);
1886
468
  };
1887
2.95M
1888
2.95M
  switch (II->getIntrinsicID()) {
1889
1.87M
  default: break;
1890
91.1k
  case Intrinsic::objectsize:
1891
91.1k
    if (ConstantInt *N =
1892
91.1k
            lowerObjectSizeCall(II, DL, &TLI, /*MustSucceed=*/false))
1893
627
      return replaceInstUsesWith(CI, N);
1894
90.5k
    return nullptr;
1895
90.5k
1896
3.50k
  case Intrinsic::bswap: {
1897
3.50k
    Value *IIOperand = II->getArgOperand(0);
1898
3.50k
    Value *X = nullptr;
1899
3.50k
1900
3.50k
    // TODO should this be in InstSimplify?
1901
3.50k
    // bswap(bswap(x)) -> x
1902
3.50k
    if (match(IIOperand, m_BSwap(m_Value(X))))
1903
1
      return replaceInstUsesWith(CI, X);
1904
3.50k
1905
3.50k
    // bswap(trunc(bswap(x))) -> trunc(lshr(x, c))
1906
3.50k
    
    if (match(IIOperand, m_Trunc(m_BSwap(m_Value(X))))) {
1907
2
      unsigned C = X->getType()->getPrimitiveSizeInBits() -
1908
2
        IIOperand->getType()->getPrimitiveSizeInBits();
1909
2
      Value *CV = ConstantInt::get(X->getType(), C);
1910
2
      Value *V = Builder.CreateLShr(X, CV);
1911
2
      return new TruncInst(V, IIOperand->getType());
1912
2
    }
1913
3.50k
    break;
1914
3.50k
  }
1915
3.50k
1916
159
  case Intrinsic::bitreverse: {
1917
159
    Value *IIOperand = II->getArgOperand(0);
1918
159
    Value *X = nullptr;
1919
159
1920
159
    // TODO should this be in InstSimplify?
1921
159
    // bitreverse(bitreverse(x)) -> x
1922
159
    if (match(IIOperand, m_BitReverse(m_Value(X))))
1923
2
      return replaceInstUsesWith(CI, X);
1924
157
    break;
1925
157
  }
1926
157
1927
500
  case Intrinsic::masked_load:
1928
500
    if (Value *SimplifiedMaskedOp = simplifyMaskedLoad(*II, Builder))
1929
3
      return replaceInstUsesWith(CI, SimplifiedMaskedOp);
1930
497
    break;
1931
489
  case Intrinsic::masked_store:
1932
489
    return simplifyMaskedStore(*II, *this);
1933
224
  case Intrinsic::masked_gather:
1934
224
    return simplifyMaskedGather(*II, *this);
1935
105
  case Intrinsic::masked_scatter:
1936
105
    return simplifyMaskedScatter(*II, *this);
1937
497
1938
8
  case Intrinsic::powi:
1939
8
    if (ConstantInt *Power = dyn_cast<ConstantInt>(II->getArgOperand(1))) {
1940
3
      // powi(x, 0) -> 1.0
1941
3
      if (Power->isZero())
1942
1
        return replaceInstUsesWith(CI, ConstantFP::get(CI.getType(), 1.0));
1943
2
      // powi(x, 1) -> x
1944
2
      
      if (Power->isOne())
1945
1
        return replaceInstUsesWith(CI, II->getArgOperand(0));
1946
1
      // powi(x, -1) -> 1/x
1947
1
      
      if (Power->isMinusOne())
1948
1
        return BinaryOperator::CreateFDiv(ConstantFP::get(CI.getType(), 1.0),
1949
1
                                          II->getArgOperand(0));
1950
5
    }
1951
5
    break;
1952
5
1953
59.7k
  case Intrinsic::cttz:
1954
59.7k
  case Intrinsic::ctlz:
1955
59.7k
    if (auto *I = foldCttzCtlz(*II, *this))
1956
4.14k
      return I;
1957
55.5k
    break;
1958
55.5k
1959
268
  case Intrinsic::ctpop:
1960
268
    if (auto *I = foldCtpop(*II, *this))
1961
14
      return I;
1962
254
    break;
1963
254
1964
19.6k
  case Intrinsic::uadd_with_overflow:
1965
19.6k
  case Intrinsic::sadd_with_overflow:
1966
19.6k
  case Intrinsic::umul_with_overflow:
1967
19.6k
  case Intrinsic::smul_with_overflow:
1968
19.6k
    if (isa<Constant>(II->getArgOperand(0)) &&
1969
19.6k
        
        !isa<Constant>(II->getArgOperand(1))) {
1970
295
      // Canonicalize constants into the RHS.
1971
295
      Value *LHS = II->getArgOperand(0);
1972
295
      II->setArgOperand(0, II->getArgOperand(1));
1973
295
      II->setArgOperand(1, LHS);
1974
295
      return II;
1975
295
    }
1976
19.3k
    
    LLVM_FALLTHROUGH;
1977
19.3k
1978
19.6k
  case Intrinsic::usub_with_overflow:
1979
19.6k
  case Intrinsic::ssub_with_overflow: {
1980
19.6k
    OverflowCheckFlavor OCF =
1981
19.6k
        IntrinsicIDToOverflowCheckFlavor(II->getIntrinsicID());
1982
19.6k
    assert(OCF != OCF_INVALID && "unexpected!");
1983
19.6k
1984
19.6k
    Value *OperationResult = nullptr;
1985
19.6k
    Constant *OverflowResult = nullptr;
1986
19.6k
    if (OptimizeOverflowCheck(OCF, II->getArgOperand(0), II->getArgOperand(1),
1987
19.6k
                              *II, OperationResult, OverflowResult))
1988
295
      return CreateOverflowTuple(II, OperationResult, OverflowResult);
1989
19.3k
1990
19.3k
    break;
1991
19.3k
  }
1992
19.3k
1993
1.14k
  case Intrinsic::minnum:
1994
1.14k
  case Intrinsic::maxnum: {
1995
1.14k
    Value *Arg0 = II->getArgOperand(0);
1996
1.14k
    Value *Arg1 = II->getArgOperand(1);
1997
1.14k
    // Canonicalize constants to the RHS.
1998
1.14k
    if (isa<ConstantFP>(Arg0) && !isa<ConstantFP>(Arg1)) {
1999
8
      II->setArgOperand(0, Arg1);
2000
8
      II->setArgOperand(1, Arg0);
2001
8
      return II;
2002
8
    }
2003
1.13k
    
    if (Value *V = simplifyMinnumMaxnum(*II))
2004
21
      return replaceInstUsesWith(*II, V);
2005
1.11k
    break;
2006
1.11k
  }
2007
117
  case Intrinsic::fmuladd: {
2008
117
    // Canonicalize fast fmuladd to the separate fmul + fadd.
2009
117
    if (II->hasUnsafeAlgebra()) {
2010
4
      BuilderTy::FastMathFlagGuard Guard(Builder);
2011
4
      Builder.setFastMathFlags(II->getFastMathFlags());
2012
4
      Value *Mul = Builder.CreateFMul(II->getArgOperand(0),
2013
4
                                      II->getArgOperand(1));
2014
4
      Value *Add = Builder.CreateFAdd(Mul, II->getArgOperand(2));
2015
4
      Add->takeName(II);
2016
4
      return replaceInstUsesWith(*II, Add);
2017
4
    }
2018
113
2019
113
    
    LLVM_FALLTHROUGH;
2020
113
  }
2021
490
  case Intrinsic::fma: {
2022
490
    Value *Src0 = II->getArgOperand(0);
2023
490
    Value *Src1 = II->getArgOperand(1);
2024
490
2025
490
    // Canonicalize constants into the RHS.
2026
490
    if (isa<Constant>(Src0) && !isa<Constant>(Src1)) {
2027
21
      II->setArgOperand(0, Src1);
2028
21
      II->setArgOperand(1, Src0);
2029
21
      std::swap(Src0, Src1);
2030
21
    }
2031
490
2032
490
    Value *LHS = nullptr;
2033
490
    Value *RHS = nullptr;
2034
490
2035
490
    // fma fneg(x), fneg(y), z -> fma x, y, z
2036
490
    if (match(Src0, m_FNeg(m_Value(LHS))) &&
2037
490
        
        match(Src1, m_FNeg(m_Value(RHS)))) {
2038
7
      II->setArgOperand(0, LHS);
2039
7
      II->setArgOperand(1, RHS);
2040
7
      return II;
2041
7
    }
2042
483
2043
483
    // fma fabs(x), fabs(x), z -> fma x, x, z
2044
483
    
    if (match(Src0, m_Intrinsic<Intrinsic::fabs>(m_Value(LHS))) &&
2045
483
        match(Src1, m_Intrinsic<Intrinsic::fabs>(m_Value(RHS))) && LHS == RHS) {
2046
3
      II->setArgOperand(0, LHS);
2047
3
      II->setArgOperand(1, RHS);
2048
3
      return II;
2049
3
    }
2050
480
2051
480
    // fma x, 1, z -> fadd x, z
2052
480
    
    if (match(Src1, m_FPOne())) {
2053
5
      Instruction *RI = BinaryOperator::CreateFAdd(Src0, II->getArgOperand(2));
2054
5
      RI->copyFastMathFlags(II);
2055
5
      return RI;
2056
5
    }
2057
475
2058
475
    break;
2059
475
  }
2060
31.5k
  case Intrinsic::fabs: {
2061
31.5k
    Value *Cond;
2062
31.5k
    Constant *LHS, *RHS;
2063
31.5k
    if (match(II->getArgOperand(0),
2064
31.5k
              m_Select(m_Value(Cond), m_Constant(LHS), m_Constant(RHS)))) {
2065
4
      CallInst *Call0 = Builder.CreateCall(II->getCalledFunction(), {LHS});
2066
4
      CallInst *Call1 = Builder.CreateCall(II->getCalledFunction(), {RHS});
2067
4
      return SelectInst::Create(Cond, Call0, Call1);
2068
4
    }
2069
31.5k
2070
31.5k
    
    LLVM_FALLTHROUGH;
2071
31.5k
  }
2072
36.1k
  case Intrinsic::ceil:
2073
36.1k
  case Intrinsic::floor:
2074
36.1k
  case Intrinsic::round:
2075
36.1k
  case Intrinsic::nearbyint:
2076
36.1k
  case Intrinsic::rint:
2077
36.1k
  case Intrinsic::trunc: {
2078
36.1k
    Value *ExtSrc;
2079
36.1k
    if (match(II->getArgOperand(0), m_FPExt(m_Value(ExtSrc))) &&
2080
36.1k
        
        II->getArgOperand(0)->hasOneUse()) {
2081
179
      // fabs (fpext x) -> fpext (fabs x)
2082
179
      Value *F = Intrinsic::getDeclaration(II->getModule(), II->getIntrinsicID(),
2083
179
                                           { ExtSrc->getType() });
2084
179
      CallInst *NewFabs = Builder.CreateCall(F, ExtSrc);
2085
179
      NewFabs->copyFastMathFlags(II);
2086
179
      NewFabs->takeName(II);
2087
179
      return new FPExtInst(NewFabs, II->getType());
2088
179
    }
2089
36.0k
2090
36.0k
    break;
2091
36.0k
  }
2092
126
  case Intrinsic::cos:
2093
126
  case Intrinsic::amdgcn_cos: {
2094
126
    Value *SrcSrc;
2095
126
    Value *Src = II->getArgOperand(0);
2096
126
    if (match(Src, m_FNeg(m_Value(SrcSrc))) ||
2097
126
        
        match(Src, m_Intrinsic<Intrinsic::fabs>(m_Value(SrcSrc)))) {
2098
8
      // cos(-x) -> cos(x)
2099
8
      // cos(fabs(x)) -> cos(x)
2100
8
      II->setArgOperand(0, SrcSrc);
2101
8
      return II;
2102
8
    }
2103
118
2104
118
    break;
2105
118
  }
2106
4
  case Intrinsic::ppc_altivec_lvx:
2107
4
  case Intrinsic::ppc_altivec_lvxl:
2108
4
    // Turn PPC lvx -> load if the pointer is known aligned.
2109
4
    if (getOrEnforceKnownAlignment(II->getArgOperand(0), 16, DL, II, &AC,
2110
4
                                   &DT) >= 16) {
2111
2
      Value *Ptr = Builder.CreateBitCast(II->getArgOperand(0),
2112
2
                                         PointerType::getUnqual(II->getType()));
2113
2
      return new LoadInst(Ptr);
2114
2
    }
2115
2
    break;
2116
2
  case Intrinsic::ppc_vsx_lxvw4x:
2117
2
  case Intrinsic::ppc_vsx_lxvd2x: {
2118
2
    // Turn PPC VSX loads into normal loads.
2119
2
    Value *Ptr = Builder.CreateBitCast(II->getArgOperand(0),
2120
2
                                       PointerType::getUnqual(II->getType()));
2121
2
    return new LoadInst(Ptr, Twine(""), false, 1);
2122
2
  }
2123
4
  case Intrinsic::ppc_altivec_stvx:
2124
4
  case Intrinsic::ppc_altivec_stvxl:
2125
4
    // Turn stvx -> store if the pointer is known aligned.
2126
4
    if (getOrEnforceKnownAlignment(II->getArgOperand(1), 16, DL, II, &AC,
2127
4
                                   &DT) >= 16) {
2128
2
      Type *OpPtrTy =
2129
2
        PointerType::getUnqual(II->getArgOperand(0)->getType());
2130
2
      Value *Ptr = Builder.CreateBitCast(II->getArgOperand(1), OpPtrTy);
2131
2
      return new StoreInst(II->getArgOperand(0), Ptr);
2132
2
    }
2133
2
    break;
2134
2
  case Intrinsic::ppc_vsx_stxvw4x:
2135
2
  case Intrinsic::ppc_vsx_stxvd2x: {
2136
2
    // Turn PPC VSX stores into normal stores.
2137
2
    Type *OpPtrTy = PointerType::getUnqual(II->getArgOperand(0)->getType());
2138
2
    Value *Ptr = Builder.CreateBitCast(II->getArgOperand(1), OpPtrTy);
2139
2
    return new StoreInst(II->getArgOperand(0), Ptr, false, 1);
2140
2
  }
2141
3
  case Intrinsic::ppc_qpx_qvlfs:
2142
3
    // Turn PPC QPX qvlfs -> load if the pointer is known aligned.
2143
3
    if (getOrEnforceKnownAlignment(II->getArgOperand(0), 16, DL, II, &AC,
2144
3
                                   &DT) >= 16) {
2145
1
      Type *VTy = VectorType::get(Builder.getFloatTy(),
2146
1
                                  II->getType()->getVectorNumElements());
2147
1
      Value *Ptr = Builder.CreateBitCast(II->getArgOperand(0),
2148
1
                                         PointerType::getUnqual(VTy));
2149
1
      Value *Load = Builder.CreateLoad(Ptr);
2150
1
      return new FPExtInst(Load, II->getType());
2151
1
    }
2152
2
    break;
2153
3
  case Intrinsic::ppc_qpx_qvlfd:
2154
3
    // Turn PPC QPX qvlfd -> load if the pointer is known aligned.
2155
3
    if (getOrEnforceKnownAlignment(II->getArgOperand(0), 32, DL, II, &AC,
2156
3
                                   &DT) >= 32) {
2157
1
      Value *Ptr = Builder.CreateBitCast(II->getArgOperand(0),
2158
1
                                         PointerType::getUnqual(II->getType()));
2159
1
      return new LoadInst(Ptr);
2160
1
    }
2161
2
    break;
2162
2
  case Intrinsic::ppc_qpx_qvstfs:
2163
2
    // Turn PPC QPX qvstfs -> store if the pointer is known aligned.
2164
2
    if (getOrEnforceKnownAlignment(II->getArgOperand(1), 16, DL, II, &AC,
2165
2
                                   &DT) >= 16) {
2166
1
      Type *VTy = VectorType::get(Builder.getFloatTy(),
2167
1
          II->getArgOperand(0)->getType()->getVectorNumElements());
2168
1
      Value *TOp = Builder.CreateFPTrunc(II->getArgOperand(0), VTy);
2169
1
      Type *OpPtrTy = PointerType::getUnqual(VTy);
2170
1
      Value *Ptr = Builder.CreateBitCast(II->getArgOperand(1), OpPtrTy);
2171
1
      return new StoreInst(TOp, Ptr);
2172
1
    }
2173
1
    break;
2174
3
  case Intrinsic::ppc_qpx_qvstfd:
2175
3
    // Turn PPC QPX qvstfd -> store if the pointer is known aligned.
2176
3
    if (getOrEnforceKnownAlignment(II->getArgOperand(1), 32, DL, II, &AC,
2177
3
                                   &DT) >= 32) {
2178
1
      Type *OpPtrTy =
2179
1
        PointerType::getUnqual(II->getArgOperand(0)->getType());
2180
1
      Value *Ptr = Builder.CreateBitCast(II->getArgOperand(1), OpPtrTy);
2181
1
      return new StoreInst(II->getArgOperand(0), Ptr);
2182
1
    }
2183
2
    break;
2184
2
2185
24
  case Intrinsic::x86_bmi_bextr_32:
2186
24
  case Intrinsic::x86_bmi_bextr_64:
2187
24
  case Intrinsic::x86_tbm_bextri_u32:
2188
24
  case Intrinsic::x86_tbm_bextri_u64:
2189
24
    // If the RHS is a constant we can try some simplifications.
2190
24
    if (auto *C = dyn_cast<ConstantInt>(II->getArgOperand(1))) {
2191
24
      uint64_t Shift = C->getZExtValue();
2192
24
      uint64_t Length = (Shift >> 8) & 0xff;
2193
24
      Shift &= 0xff;
2194
24
      unsigned BitWidth = II->getType()->getIntegerBitWidth();
2195
24
      // If the length is 0 or the shift is out of range, replace with zero.
2196
24
      if (Length == 0 || Shift >= BitWidth)
2197
8
        return replaceInstUsesWith(CI, ConstantInt::get(II->getType(), 0));
2198
16
      // If the LHS is also a constant, we can completely constant fold this.
2199
16
      
      if (auto *InC = dyn_cast<ConstantInt>(II->getArgOperand(0))) {
2200
12
        uint64_t Result = InC->getZExtValue() >> Shift;
2201
12
        if (Length > BitWidth)
2202
4
          Length = BitWidth;
2203
12
        Result &= maskTrailingOnes<uint64_t>(Length);
2204
12
        return replaceInstUsesWith(CI, ConstantInt::get(II->getType(), Result));
2205
12
      }
2206
4
      // TODO should we turn this into 'and' if shift is 0? Or 'shl' if we
2207
4
      // are only masking bits that a shift already cleared?
2208
4
    }
2209
4
    break;
2210
4
2211
8
  case Intrinsic::x86_bmi_bzhi_32:
2212
8
  case Intrinsic::x86_bmi_bzhi_64:
2213
8
    // If the RHS is a constant we can try some simplifications.
2214
8
    if (auto *C = dyn_cast<ConstantInt>(II->getArgOperand(1))) {
2215
8
      uint64_t Index = C->getZExtValue() & 0xff;
2216
8
      unsigned BitWidth = II->getType()->getIntegerBitWidth();
2217
8
      if (Index >= BitWidth)
2218
2
        return replaceInstUsesWith(CI, II->getArgOperand(0));
2219
6
      
      if (Index == 0)
2220
2
        return replaceInstUsesWith(CI, ConstantInt::get(II->getType(), 0));
2221
4
      // If the LHS is also a constant, we can completely constant fold this.
2222
4
      
      if (auto *InC = dyn_cast<ConstantInt>(II->getArgOperand(0))) {
2223
2
        uint64_t Result = InC->getZExtValue();
2224
2
        Result &= maskTrailingOnes<uint64_t>(Index);
2225
2
        return replaceInstUsesWith(CI, ConstantInt::get(II->getType(), Result));
2226
2
      }
2227
2
      // TODO should we convert this to an AND if the RHS is constant?
2228
2
    }
2229
2
    break;
2230
2
2231
9
  case Intrinsic::x86_vcvtph2ps_128:
2232
9
  case Intrinsic::x86_vcvtph2ps_256: {
2233
9
    auto Arg = II->getArgOperand(0);
2234
9
    auto ArgType = cast<VectorType>(Arg->getType());
2235
9
    auto RetType = cast<VectorType>(II->getType());
2236
9
    unsigned ArgWidth = ArgType->getNumElements();
2237
9
    unsigned RetWidth = RetType->getNumElements();
2238
9
    assert(RetWidth <= ArgWidth && "Unexpected input/return vector widths");
2239
9
    assert(ArgType->isIntOrIntVectorTy() &&
2240
9
           ArgType->getScalarSizeInBits() == 16 &&
2241
9
           "CVTPH2PS input type should be 16-bit integer vector");
2242
9
    assert(RetType->getScalarType()->isFloatTy() &&
2243
9
           "CVTPH2PS output type should be 32-bit float vector");
2244
9
2245
9
    // Constant folding: Convert to generic half to single conversion.
2246
9
    if (isa<ConstantAggregateZero>(Arg))
2247
2
      return replaceInstUsesWith(*II, ConstantAggregateZero::get(RetType));
2248
7
2249
7
    
    if (isa<ConstantDataVector>(Arg)) {
2250
2
      auto VectorHalfAsShorts = Arg;
2251
2
      if (RetWidth < ArgWidth) {
2252
1
        SmallVector<uint32_t, 8> SubVecMask;
2253
5
        for (unsigned i = 0; i != RetWidth; ++i)
2254
4
          SubVecMask.push_back((int)i);
2255
1
        VectorHalfAsShorts = Builder.CreateShuffleVector(
2256
1
            Arg, UndefValue::get(ArgType), SubVecMask);
2257
1
      }
2258
2
2259
2
      auto VectorHalfType =
2260
2
          VectorType::get(Type::getHalfTy(II->getContext()), RetWidth);
2261
2
      auto VectorHalfs =
2262
2
          Builder.CreateBitCast(VectorHalfAsShorts, VectorHalfType);
2263
2
      auto VectorFloats = Builder.CreateFPExt(VectorHalfs, RetType);
2264
2
      return replaceInstUsesWith(*II, VectorFloats);
2265
2
    }
2266
5
2267
5
    // We only use the lowest lanes of the argument.
2268
5
    
    if (Value *V = SimplifyDemandedVectorEltsLow(Arg, ArgWidth, RetWidth)) {
2269
1
      II->setArgOperand(0, V);
2270
1
      return II;
2271
1
    }
2272
4
    break;
2273
4
  }
2274
4
2275
75
  case Intrinsic::x86_sse_cvtss2si:
2276
75
  case Intrinsic::x86_sse_cvtss2si64:
2277
75
  case Intrinsic::x86_sse_cvttss2si:
2278
75
  case Intrinsic::x86_sse_cvttss2si64:
2279
75
  case Intrinsic::x86_sse2_cvtsd2si:
2280
75
  case Intrinsic::x86_sse2_cvtsd2si64:
2281
75
  case Intrinsic::x86_sse2_cvttsd2si:
2282
75
  case Intrinsic::x86_sse2_cvttsd2si64:
2283
75
  case Intrinsic::x86_avx512_vcvtss2si32:
2284
75
  case Intrinsic::x86_avx512_vcvtss2si64:
2285
75
  case Intrinsic::x86_avx512_vcvtss2usi32:
2286
75
  case Intrinsic::x86_avx512_vcvtss2usi64:
2287
75
  case Intrinsic::x86_avx512_vcvtsd2si32:
2288
75
  case Intrinsic::x86_avx512_vcvtsd2si64:
2289
75
  case Intrinsic::x86_avx512_vcvtsd2usi32:
2290
75
  case Intrinsic::x86_avx512_vcvtsd2usi64:
2291
75
  case Intrinsic::x86_avx512_cvttss2si:
2292
75
  case Intrinsic::x86_avx512_cvttss2si64:
2293
75
  case Intrinsic::x86_avx512_cvttss2usi:
2294
75
  case Intrinsic::x86_avx512_cvttss2usi64:
2295
75
  case Intrinsic::x86_avx512_cvttsd2si:
2296
75
  case Intrinsic::x86_avx512_cvttsd2si64:
2297
75
  case Intrinsic::x86_avx512_cvttsd2usi:
2298
75
  case Intrinsic::x86_avx512_cvttsd2usi64: {
2299
75
    // These intrinsics only demand the 0th element of their input vectors. If
2300
75
    // we can simplify the input based on that, do so now.
2301
75
    Value *Arg = II->getArgOperand(0);
2302
75
    unsigned VWidth = Arg->getType()->getVectorNumElements();
2303
75
    if (Value *V = SimplifyDemandedVectorEltsLow(Arg, VWidth, 1)) {
2304
25
      II->setArgOperand(0, V);
2305
25
      return II;
2306
25
    }
2307
50
    break;
2308
50
  }
2309
50
2310
47
  case Intrinsic::x86_mmx_pmovmskb:
2311
47
  case Intrinsic::x86_sse_movmsk_ps:
2312
47
  case Intrinsic::x86_sse2_movmsk_pd:
2313
47
  case Intrinsic::x86_sse2_pmovmskb_128:
2314
47
  case Intrinsic::x86_avx_movmsk_pd_256:
2315
47
  case Intrinsic::x86_avx_movmsk_ps_256:
2316
47
  case Intrinsic::x86_avx2_pmovmskb: {
2317
47
    if (Value *V = simplifyX86movmsk(*II))
2318
19
      return replaceInstUsesWith(*II, V);
2319
28
    break;
2320
28
  }
2321
28
2322
85
  case Intrinsic::x86_sse_comieq_ss:
2323
85
  case Intrinsic::x86_sse_comige_ss:
2324
85
  case Intrinsic::x86_sse_comigt_ss:
2325
85
  case Intrinsic::x86_sse_comile_ss:
2326
85
  case Intrinsic::x86_sse_comilt_ss:
2327
85
  case Intrinsic::x86_sse_comineq_ss:
2328
85
  case Intrinsic::x86_sse_ucomieq_ss:
2329
85
  case Intrinsic::x86_sse_ucomige_ss:
2330
85
  case Intrinsic::x86_sse_ucomigt_ss:
2331
85
  case Intrinsic::x86_sse_ucomile_ss:
2332
85
  case Intrinsic::x86_sse_ucomilt_ss:
2333
85
  case Intrinsic::x86_sse_ucomineq_ss:
2334
85
  case Intrinsic::x86_sse2_comieq_sd:
2335
85
  case Intrinsic::x86_sse2_comige_sd:
2336
85
  case Intrinsic::x86_sse2_comigt_sd:
2337
85
  case Intrinsic::x86_sse2_comile_sd:
2338
85
  case Intrinsic::x86_sse2_comilt_sd:
2339
85
  case Intrinsic::x86_sse2_comineq_sd:
2340
85
  case Intrinsic::x86_sse2_ucomieq_sd:
2341
85
  case Intrinsic::x86_sse2_ucomige_sd:
2342
85
  case Intrinsic::x86_sse2_ucomigt_sd:
2343
85
  case Intrinsic::x86_sse2_ucomile_sd:
2344
85
  case Intrinsic::x86_sse2_ucomilt_sd:
2345
85
  case Intrinsic::x86_sse2_ucomineq_sd:
2346
85
  case Intrinsic::x86_avx512_vcomi_ss:
2347
85
  case Intrinsic::x86_avx512_vcomi_sd:
2348
85
  case Intrinsic::x86_avx512_mask_cmp_ss:
2349
85
  case Intrinsic::x86_avx512_mask_cmp_sd: {
2350
85
    // These intrinsics only demand the 0th element of their input vectors. If
2351
85
    // we can simplify the input based on that, do so now.
2352
85
    bool MadeChange = false;
2353
85
    Value *Arg0 = II->getArgOperand(0);
2354
85
    Value *Arg1 = II->getArgOperand(1);
2355
85
    unsigned VWidth = Arg0->getType()->getVectorNumElements();
2356
85
    if (Value *V = SimplifyDemandedVectorEltsLow(Arg0, VWidth, 1)) {
2357
28
      II->setArgOperand(0, V);
2358
28
      MadeChange = true;
2359
28
    }
2360
85
    if (Value *V = SimplifyDemandedVectorEltsLow(Arg1, VWidth, 1)) {
2361
28
      II->setArgOperand(1, V);
2362
28
      MadeChange = true;
2363
28
    }
2364
85
    if (MadeChange)
2365
28
      return II;
2366
57
    break;
2367
57
  }
2368
37
  case Intrinsic::x86_avx512_mask_cmp_pd_128:
2369
37
  case Intrinsic::x86_avx512_mask_cmp_pd_256:
2370
37
  case Intrinsic::x86_avx512_mask_cmp_pd_512:
2371
37
  case Intrinsic::x86_avx512_mask_cmp_ps_128:
2372
37
  case Intrinsic::x86_avx512_mask_cmp_ps_256:
2373
37
  case Intrinsic::x86_avx512_mask_cmp_ps_512: {
2374
37
    // Folding cmp(sub(a,b),0) -> cmp(a,b) and cmp(0,sub(a,b)) -> cmp(b,a)
2375
37
    Value *Arg0 = II->getArgOperand(0);
2376
37
    Value *Arg1 = II->getArgOperand(1);
2377
37
    bool Arg0IsZero = match(Arg0, m_Zero());
2378
37
    if (Arg0IsZero)
2379
6
      std::swap(Arg0, Arg1);
2380
37
    Value *A, *B;
2381
37
    // This fold requires only the NINF(not +/- inf) since inf minus
2382
37
    // inf is nan.
2383
37
    // NSZ(No Signed Zeros) is not needed because zeros of any sign are
2384
37
    // equal for both compares.
2385
37
    // NNAN is not needed because nans compare the same for both compares.
2386
37
    // The compare intrinsic uses the above assumptions and therefore
2387
37
    // doesn't require additional flags.
2388
37
    if ((match(Arg0, m_OneUse(m_FSub(m_Value(A), m_Value(B)))) &&
2389
13
         match(Arg1, m_Zero()) &&
2390
37
         
         cast<Instruction>(Arg0)->getFastMathFlags().noInfs())) {
2391
12
      if (Arg0IsZero)
2392
6
        std::swap(A, B);
2393
12
      II->setArgOperand(0, A);
2394
12
      II->setArgOperand(1, B);
2395
12
      return II;
2396
12
    }
2397
25
    break;
2398
25
  }
2399
25
2400
32
  case Intrinsic::x86_avx512_mask_add_ps_512:
2401
32
  case Intrinsic::x86_avx512_mask_div_ps_512:
2402
32
  case Intrinsic::x86_avx512_mask_mul_ps_512:
2403
32
  case Intrinsic::x86_avx512_mask_sub_ps_512:
2404
32
  case Intrinsic::x86_avx512_mask_add_pd_512:
2405
32
  case Intrinsic::x86_avx512_mask_div_pd_512:
2406
32
  case Intrinsic::x86_avx512_mask_mul_pd_512:
2407
32
  case Intrinsic::x86_avx512_mask_sub_pd_512:
2408
32
    // If the rounding mode is CUR_DIRECTION(4) we can turn these into regular
2409
32
    // IR operations.
2410
32
    if (auto *R = dyn_cast<ConstantInt>(II->getArgOperand(4))) {
2411
32
      if (R->getValue() == 4) {
2412
16
        Value *Arg0 = II->getArgOperand(0);
2413
16
        Value *Arg1 = II->getArgOperand(1);
2414
16
2415
16
        Value *V;
2416
16
        switch (II->getIntrinsicID()) {
2417
0
        
        default: llvm_unreachable("Case stmts out of sync!");
2418
4
        case Intrinsic::x86_avx512_mask_add_ps_512:
2419
4
        case Intrinsic::x86_avx512_mask_add_pd_512:
2420
4
          V = Builder.CreateFAdd(Arg0, Arg1);
2421
4
          break;
2422
4
        case Intrinsic::x86_avx512_mask_sub_ps_512:
2423
4
        case Intrinsic::x86_avx512_mask_sub_pd_512:
2424
4
          V = Builder.CreateFSub(Arg0, Arg1);
2425
4
          break;
2426
4
        case Intrinsic::x86_avx512_mask_mul_ps_512:
2427
4
        case Intrinsic::x86_avx512_mask_mul_pd_512:
2428
4
          V = Builder.CreateFMul(Arg0, Arg1);
2429
4
          break;
2430
4
        case Intrinsic::x86_avx512_mask_div_ps_512:
2431
4
        case Intrinsic::x86_avx512_mask_div_pd_512:
2432
4
          V = Builder.CreateFDiv(Arg0, Arg1);
2433
4
          break;
2434
16
        }
2435
16
2436
16
        // Create a select for the masking.
2437
16
        V = emitX86MaskSelect(II->getArgOperand(3), V, II->getArgOperand(2),
2438
16
                              Builder);
2439
16
        return replaceInstUsesWith(*II, V);
2440
16
      }
2441
32
    }
2442
16
    break;
2443
16
2444
88
  case Intrinsic::x86_avx512_mask_add_ss_round:
2445
88
  case Intrinsic::x86_avx512_mask_div_ss_round:
2446
88
  case Intrinsic::x86_avx512_mask_mul_ss_round:
2447
88
  case Intrinsic::x86_avx512_mask_sub_ss_round:
2448
88
  case Intrinsic::x86_avx512_mask_add_sd_round:
2449
88
  case Intrinsic::x86_avx512_mask_div_sd_round:
2450
88
  case Intrinsic::x86_avx512_mask_mul_sd_round:
2451
88
  case Intrinsic::x86_avx512_mask_sub_sd_round:
2452
88
    // If the rounding mode is CUR_DIRECTION(4) we can turn these into regular
2453
88
    // IR operations.
2454
88
    if (auto *R = dyn_cast<ConstantInt>(II->getArgOperand(4))) {
2455
88
      if (R->getValue() == 4) {
2456
16
        // Extract the element as scalars.
2457
16
        Value *Arg0 = II->getArgOperand(0);
2458
16
        Value *Arg1 = II->getArgOperand(1);
2459
16
        Value *LHS = Builder.CreateExtractElement(Arg0, (uint64_t)0);
2460
16
        Value *RHS = Builder.CreateExtractElement(Arg1, (uint64_t)0);
2461
16
2462
16
        Value *V;
2463
16
        switch (II->getIntrinsicID()) {
2464
0
        
        default: llvm_unreachable("Case stmts out of sync!");
2465
4
        case Intrinsic::x86_avx512_mask_add_ss_round:
2466
4
        case Intrinsic::x86_avx512_mask_add_sd_round:
2467
4
          V = Builder.CreateFAdd(LHS, RHS);
2468
4
          break;
2469
4
        case Intrinsic::x86_avx512_mask_sub_ss_round:
2470
4
        case Intrinsic::x86_avx512_mask_sub_sd_round:
2471
4
          V = Builder.CreateFSub(LHS, RHS);
2472
4
          break;
2473
4
        case Intrinsic::x86_avx512_mask_mul_ss_round:
2474
4
        case Intrinsic::x86_avx512_mask_mul_sd_round:
2475
4
          V = Builder.CreateFMul(LHS, RHS);
2476
4
          break;
2477
4
        case Intrinsic::x86_avx512_mask_div_ss_round:
2478
4
        case Intrinsic::x86_avx512_mask_div_sd_round:
2479
4
          V = Builder.CreateFDiv(LHS, RHS);
2480
4
          break;
2481
16
        }
2482
16
2483
16
        // Handle the masking aspect of the intrinsic.
2484
16
        Value *Mask = II->getArgOperand(3);
2485
16
        auto *C = dyn_cast<ConstantInt>(Mask);
2486
16
        // We don't need a select if we know the mask bit is a 1.
2487
16
        if (!C || !C->getValue()[0]) {
2488
8
          // Cast the mask to an i1 vector and then extract the lowest element.
2489
8
          auto *MaskTy = VectorType::get(Builder.getInt1Ty(),
2490
8
                             cast<IntegerType>(Mask->getType())->getBitWidth());
2491
8
          Mask = Builder.CreateBitCast(Mask, MaskTy);
2492
8
          Mask = Builder.CreateExtractElement(Mask, (uint64_t)0);
2493
8
          // Extract the lowest element from the passthru operand.
2494
8
          Value *Passthru = Builder.CreateExtractElement(II->getArgOperand(2),
2495
8
                                                          (uint64_t)0);
2496
8
          V = Builder.CreateSelect(Mask, V, Passthru);
2497
8
        }
2498
16
2499
16
        // Insert the result back into the original argument 0.
2500
16
        V = Builder.CreateInsertElement(Arg0, V, (uint64_t)0);
2501
16
2502
16
        return replaceInstUsesWith(*II, V);
2503
16
      }
2504
88
    }
2505
72
    LLVM_FALLTHROUGH;
2506
72
2507
72
  // X86 scalar intrinsics simplified with SimplifyDemandedVectorElts.
2508
442
  case Intrinsic::x86_avx512_mask_max_ss_round:
2509
442
  case Intrinsic::x86_avx512_mask_min_ss_round:
2510
442
  case Intrinsic::x86_avx512_mask_max_sd_round:
2511
442
  case Intrinsic::x86_avx512_mask_min_sd_round:
2512
442
  case Intrinsic::x86_avx512_mask_vfmadd_ss:
2513
442
  case Intrinsic::x86_avx512_mask_vfmadd_sd:
2514
442
  case Intrinsic::x86_avx512_maskz_vfmadd_ss:
2515
442
  case Intrinsic::x86_avx512_maskz_vfmadd_sd:
2516
442
  case Intrinsic::x86_avx512_mask3_vfmadd_ss:
2517
442
  case Intrinsic::x86_avx512_mask3_vfmadd_sd:
2518
442
  case Intrinsic::x86_avx512_mask3_vfmsub_ss:
2519
442
  case Intrinsic::x86_avx512_mask3_vfmsub_sd:
2520
442
  case Intrinsic::x86_avx512_mask3_vfnmsub_ss:
2521
442
  case Intrinsic::x86_avx512_mask3_vfnmsub_sd:
2522
442
  case Intrinsic::x86_fma_vfmadd_ss:
2523
442
  case Intrinsic::x86_fma_vfmsub_ss:
2524
442
  case Intrinsic::x86_fma_vfnmadd_ss:
2525
442
  case Intrinsic::x86_fma_vfnmsub_ss:
2526
442
  case Intrinsic::x86_fma_vfmadd_sd:
2527
442
  case Intrinsic::x86_fma_vfmsub_sd:
2528
442
  case Intrinsic::x86_fma_vfnmadd_sd:
2529
442
  case Intrinsic::x86_fma_vfnmsub_sd:
2530
442
  case Intrinsic::x86_sse_cmp_ss:
2531
442
  case Intrinsic::x86_sse_min_ss:
2532
442
  case Intrinsic::x86_sse_max_ss:
2533
442
  case Intrinsic::x86_sse2_cmp_sd:
2534
442
  case Intrinsic::x86_sse2_min_sd:
2535
442
  case Intrinsic::x86_sse2_max_sd:
2536
442
  case Intrinsic::x86_sse41_round_ss:
2537
442
  case Intrinsic::x86_sse41_round_sd:
2538
442
  case Intrinsic::x86_xop_vfrcz_ss:
2539
442
  case Intrinsic::x86_xop_vfrcz_sd: {
2540
442
   unsigned VWidth = II->getType()->getVectorNumElements();
2541
442
   APInt UndefElts(VWidth, 0);
2542
442
   APInt AllOnesEltMask(APInt::getAllOnesValue(VWidth));
2543
442
   if (Value *V = SimplifyDemandedVectorElts(II, AllOnesEltMask, UndefElts)) {
2544
83
     if (V != II)
2545
0
       return replaceInstUsesWith(*II, V);
2546
83
     return II;
2547
83
   }
2548
359
   break;
2549
359
  }
2550
359
2551
359
  // Constant fold ashr( <A x Bi>, Ci ).
2552
359
  // Constant fold lshr( <A x Bi>, Ci ).
2553
359
  // Constant fold shl( <A x Bi>, Ci ).
2554
129
  case Intrinsic::x86_sse2_psrai_d:
2555
129
  case Intrinsic::x86_sse2_psrai_w:
2556
129
  case Intrinsic::x86_avx2_psrai_d:
2557
129
  case Intrinsic::x86_avx2_psrai_w:
2558
129
  case Intrinsic::x86_avx512_psrai_q_128:
2559
129
  case Intrinsic::x86_avx512_psrai_q_256:
2560
129
  case Intrinsic::x86_avx512_psrai_d_512:
2561
129
  case Intrinsic::x86_avx512_psrai_q_512:
2562
129
  case Intrinsic::x86_avx512_psrai_w_512:
2563
129
  case Intrinsic::x86_sse2_psrli_d:
2564
129
  case Intrinsic::x86_sse2_psrli_q:
2565
129
  case Intrinsic::x86_sse2_psrli_w:
2566
129
  case Intrinsic::x86_avx2_psrli_d:
2567
129
  case Intrinsic::x86_avx2_psrli_q:
2568
129
  case Intrinsic::x86_avx2_psrli_w:
2569
129
  case Intrinsic::x86_avx512_psrli_d_512:
2570
129
  case Intrinsic::x86_avx512_psrli_q_512:
2571
129
  case Intrinsic::x86_avx512_psrli_w_512:
2572
129
  case Intrinsic::x86_sse2_pslli_d:
2573
129
  case Intrinsic::x86_sse2_pslli_q:
2574
129
  case Intrinsic::x86_sse2_pslli_w:
2575
129
  case Intrinsic::x86_avx2_pslli_d:
2576
129
  case Intrinsic::x86_avx2_pslli_q:
2577
129
  case Intrinsic::x86_avx2_pslli_w:
2578
129
  case Intrinsic::x86_avx512_pslli_d_512:
2579
129
  case Intrinsic::x86_avx512_pslli_q_512:
2580
129
  case Intrinsic::x86_avx512_pslli_w_512:
2581
129
    if (Value *V = simplifyX86immShift(*II, Builder))
2582
129
      return replaceInstUsesWith(*II, V);
2583
0
    break;
2584
0
2585
266
  case Intrinsic::x86_sse2_psra_d:
2586
266
  case Intrinsic::x86_sse2_psra_w:
2587
266
  case Intrinsic::x86_avx2_psra_d:
2588
266
  case Intrinsic::x86_avx2_psra_w:
2589
266
  case Intrinsic::x86_avx512_psra_q_128:
2590
266
  case Intrinsic::x86_avx512_psra_q_256:
2591
266
  case Intrinsic::x86_avx512_psra_d_512:
2592
266
  case Intrinsic::x86_avx512_psra_q_512:
2593
266
  case Intrinsic::x86_avx512_psra_w_512:
2594
266
  case Intrinsic::x86_sse2_psrl_d:
2595
266
  case Intrinsic::x86_sse2_psrl_q:
2596
266
  case Intrinsic::x86_sse2_psrl_w:
2597
266
  case Intrinsic::x86_avx2_psrl_d:
2598
266
  case Intrinsic::x86_avx2_psrl_q:
2599
266
  case Intrinsic::x86_avx2_psrl_w:
2600
266
  case Intrinsic::x86_avx512_psrl_d_512:
2601
266
  case Intrinsic::x86_avx512_psrl_q_512:
2602
266
  case Intrinsic::x86_avx512_psrl_w_512:
2603
266
  case Intrinsic::x86_sse2_psll_d:
2604
266
  case Intrinsic::x86_sse2_psll_q:
2605
266
  case Intrinsic::x86_sse2_psll_w:
2606
266
  case Intrinsic::x86_avx2_psll_d:
2607
266
  case Intrinsic::x86_avx2_psll_q:
2608
266
  case Intrinsic::x86_avx2_psll_w:
2609
266
  case Intrinsic::x86_avx512_psll_d_512:
2610
266
  case Intrinsic::x86_avx512_psll_q_512:
2611
266
  case Intrinsic::x86_avx512_psll_w_512: {
2612
266
    if (Value *V = simplifyX86immShift(*II, Builder))
2613
134
      return replaceInstUsesWith(*II, V);
2614
132
2615
132
    // SSE2/AVX2 uses only the first 64-bits of the 128-bit vector
2616
132
    // operand to compute the shift amount.
2617
132
    Value *Arg1 = II->getArgOperand(1);
2618
132
    assert(Arg1->getType()->getPrimitiveSizeInBits() == 128 &&
2619
132
           "Unexpected packed shift size");
2620
132
    unsigned VWidth = Arg1->getType()->getVectorNumElements();
2621
132
2622
132
    if (Value *V = SimplifyDemandedVectorEltsLow(Arg1, VWidth, VWidth / 2)) {
2623
33
      II->setArgOperand(1, V);
2624
33
      return II;
2625
33
    }
2626
99
    break;
2627
99
  }
2628
99
2629
126
  case Intrinsic::x86_avx2_psllv_d:
2630
126
  case Intrinsic::x86_avx2_psllv_d_256:
2631
126
  case Intrinsic::x86_avx2_psllv_q:
2632
126
  case Intrinsic::x86_avx2_psllv_q_256:
2633
126
  case Intrinsic::x86_avx512_psllv_d_512:
2634
126
  case Intrinsic::x86_avx512_psllv_q_512:
2635
126
  case Intrinsic::x86_avx512_psllv_w_128:
2636
126
  case Intrinsic::x86_avx512_psllv_w_256:
2637
126
  case Intrinsic::x86_avx512_psllv_w_512:
2638
126
  case Intrinsic::x86_avx2_psrav_d:
2639
126
  case Intrinsic::x86_avx2_psrav_d_256:
2640
126
  case Intrinsic::x86_avx512_psrav_q_128:
2641
126
  case Intrinsic::x86_avx512_psrav_q_256:
2642
126
  case Intrinsic::x86_avx512_psrav_d_512:
2643
126
  case Intrinsic::x86_avx512_psrav_q_512:
2644
126
  case Intrinsic::x86_avx512_psrav_w_128:
2645
126
  case Intrinsic::x86_avx512_psrav_w_256:
2646
126
  case Intrinsic::x86_avx512_psrav_w_512:
2647
126
  case Intrinsic::x86_avx2_psrlv_d:
2648
126
  case Intrinsic::x86_avx2_psrlv_d_256:
2649
126
  case Intrinsic::x86_avx2_psrlv_q:
2650
126
  case Intrinsic::x86_avx2_psrlv_q_256:
2651
126
  case Intrinsic::x86_avx512_psrlv_d_512:
2652
126
  case Intrinsic::x86_avx512_psrlv_q_512:
2653
126
  case Intrinsic::x86_avx512_psrlv_w_128:
2654
126
  case Intrinsic::x86_avx512_psrlv_w_256:
2655
126
  case Intrinsic::x86_avx512_psrlv_w_512:
2656
126
    if (Value *V = simplifyX86varShift(*II, Builder))
2657
108
      return replaceInstUsesWith(*II, V);
2658
18
    break;
2659
18
2660
42
  case Intrinsic::x86_sse2_pmulu_dq:
2661
42
  case Intrinsic::x86_sse41_pmuldq:
2662
42
  case Intrinsic::x86_avx2_pmul_dq:
2663
42
  case Intrinsic::x86_avx2_pmulu_dq:
2664
42
  case Intrinsic::x86_avx512_pmul_dq_512:
2665
42
  case Intrinsic::x86_avx512_pmulu_dq_512: {
2666
42
    if (Value *V = simplifyX86muldq(*II, Builder))
2667
18
      return replaceInstUsesWith(*II, V);
2668
24
2669
24
    unsigned VWidth = II->getType()->getVectorNumElements();
2670
24
    APInt UndefElts(VWidth, 0);
2671
24
    APInt DemandedElts = APInt::getAllOnesValue(VWidth);
2672
24
    if (Value *V = SimplifyDemandedVectorElts(II, DemandedElts, UndefElts)) {
2673
6
      if (V != II)
2674
0
        return replaceInstUsesWith(*II, V);
2675
6
      return II;
2676
6
    }
2677
18
    break;
2678
18
  }
2679
18
2680
29
  case Intrinsic::x86_sse2_packssdw_128:
2681
29
  case Intrinsic::x86_sse2_packsswb_128:
2682
29
  case Intrinsic::x86_avx2_packssdw:
2683
29
  case Intrinsic::x86_avx2_packsswb:
2684
29
  case Intrinsic::x86_avx512_packssdw_512:
2685
29
  case Intrinsic::x86_avx512_packsswb_512:
2686
29
    if (Value *V = simplifyX86pack(*II, true))
2687
15
      return replaceInstUsesWith(*II, V);
2688
14
    break;
2689
14
2690
22
  case Intrinsic::x86_sse2_packuswb_128:
2691
22
  case Intrinsic::x86_sse41_packusdw:
2692
22
  case Intrinsic::x86_avx2_packusdw:
2693
22
  case Intrinsic::x86_avx2_packuswb:
2694
22
  case Intrinsic::x86_avx512_packusdw_512:
2695
22
  case Intrinsic::x86_avx512_packuswb_512:
2696
22
    if (Value *V = simplifyX86pack(*II, false))
2697
15
      return replaceInstUsesWith(*II, V);
2698
7
    break;
2699
7
2700
0
  case Intrinsic::x86_pclmulqdq: {
2701
0
    if (auto *C = dyn_cast<ConstantInt>(II->getArgOperand(2))) {
2702
0
      unsigned Imm = C->getZExtValue();
2703
0
2704
0
      bool MadeChange = false;
2705
0
      Value *Arg0 = II->getArgOperand(0);
2706
0
      Value *Arg1 = II->getArgOperand(1);
2707
0
      unsigned VWidth = Arg0->getType()->getVectorNumElements();
2708
0
      APInt DemandedElts(VWidth, 0);
2709
0
2710
0
      APInt UndefElts1(VWidth, 0);
2711
0
      DemandedElts = (Imm & 0x01) ? 2 : 1;
2712
0
      if (Value *V = SimplifyDemandedVectorElts(Arg0, DemandedElts,
2713
0
                                                UndefElts1)) {
2714
0
        II->setArgOperand(0, V);
2715
0
        MadeChange = true;
2716
0
      }
2717
0
2718
0
      APInt UndefElts2(VWidth, 0);
2719
0
      DemandedElts = (Imm & 0x10) ? 2 : 1;
2720
0
      if (Value *V = SimplifyDemandedVectorElts(Arg1, DemandedElts,
2721
0
                                                UndefElts2)) {
2722
0
        II->setArgOperand(1, V);
2723
0
        MadeChange = true;
2724
0
      }
2725
0
2726
0
      // If both input elements are undef, the result is undef.
2727
0
      if (UndefElts1[(Imm & 0x01) ? 1 : 0] ||
2728
0
          UndefElts2[(Imm & 0x10) ? 1 : 0])
2729
0
        return replaceInstUsesWith(*II,
2730
0
                                   ConstantAggregateZero::get(II->getType()));
2731
0
2732
0
      if (MadeChange)
2733
0
        return II;
2734
0
    }
2735
0
    break;
2736
0
  }
2737
0
2738
15
  case Intrinsic::x86_sse41_insertps:
2739
15
    if (Value *V = simplifyX86insertps(*II, Builder))
2740
13
      return replaceInstUsesWith(*II, V);
2741
2
    break;
2742
2
2743
20
  case Intrinsic::x86_sse4a_extrq: {
2744
20
    Value *Op0 = II->getArgOperand(0);
2745
20
    Value *Op1 = II->getArgOperand(1);
2746
20
    unsigned VWidth0 = Op0->getType()->getVectorNumElements();
2747
20
    unsigned VWidth1 = Op1->getType()->getVectorNumElements();
2748
20
    assert(Op0->getType()->getPrimitiveSizeInBits() == 128 &&
2749
20
           Op1->getType()->getPrimitiveSizeInBits() == 128 && VWidth0 == 2 &&
2750
20
           VWidth1 == 16 && "Unexpected operand sizes");
2751
20
2752
20
    // See if we're dealing with constant values.
2753
20
    Constant *C1 = dyn_cast<Constant>(Op1);
2754
20
    ConstantInt *CILength =
2755
5
        C1 ? dyn_cast_or_null<ConstantInt>(C1->getAggregateElement((unsigned)0))
2756
15
           : nullptr;
2757
20
    ConstantInt *CIIndex =
2758
5
        C1 ? dyn_cast_or_null<ConstantInt>(C1->getAggregateElement((unsigned)1))
2759
15
           : nullptr;
2760
20
2761
20
    // Attempt to simplify to a constant, shuffle vector or EXTRQI call.
2762
20
    if (Value *V = simplifyX86extrq(*II, Op0, CILength, CIIndex, Builder))
2763
6
      return replaceInstUsesWith(*II, V);
2764
14
2765
14
    // EXTRQ only uses the lowest 64-bits of the first 128-bit vector
2766
14
    // operands and the lowest 16-bits of the second.
2767
14
    bool MadeChange = false;
2768
14
    if (Value *V = SimplifyDemandedVectorEltsLow(Op0, VWidth0, 1)) {
2769
2
      II->setArgOperand(0, V);
2770
2
      MadeChange = true;
2771
2
    }
2772
14
    if (Value *V = SimplifyDemandedVectorEltsLow(Op1, VWidth1, 2)) {
2773
2
      II->setArgOperand(1, V);
2774
2
      MadeChange = true;
2775
2
    }
2776
14
    if (MadeChange)
2777
3
      return II;
2778
11
    break;
2779
11
  }
2780
11
2781
15
  case Intrinsic::x86_sse4a_extrqi: {
2782
15
    // EXTRQI: Extract Length bits starting from Index. Zero pad the remaining
2783
15
    // bits of the lower 64-bits. The upper 64-bits are undefined.
2784
15
    Value *Op0 = II->getArgOperand(0);
2785
15
    unsigned VWidth = Op0->getType()->getVectorNumElements();
2786
15
    assert(Op0->getType()->getPrimitiveSizeInBits() == 128 && VWidth == 2 &&
2787
15
           "Unexpected operand size");
2788
15
2789
15
    // See if we're dealing with constant values.
2790
15
    ConstantInt *CILength = dyn_cast<ConstantInt>(II->getArgOperand(1));
2791
15
    ConstantInt *CIIndex = dyn_cast<ConstantInt>(II->getArgOperand(2));
2792
15
2793
15
    // Attempt to simplify to a constant or shuffle vector.
2794
15
    if (Value *V = simplifyX86extrq(*II, Op0, CILength, CIIndex, Builder))
2795
7
      return replaceInstUsesWith(*II, V);
2796
8
2797
8
    // EXTRQI only uses the lowest 64-bits of the first 128-bit vector
2798
8
    // operand.
2799
8
    if (Value *V = SimplifyDemandedVectorEltsLow(Op0, VWidth, 1)) {
2800
1
      II->setArgOperand(0, V);
2801
1
      return II;
2802
1
    }
2803
7
    break;
2804
7
  }
2805
7
2806
10
  case Intrinsic::x86_sse4a_insertq: {
2807
10
    Value *Op0 = II->getArgOperand(0);
2808
10
    Value *Op1 = II->getArgOperand(1);
2809
10
    unsigned VWidth = Op0->getType()->getVectorNumElements();
2810
10
    assert(Op0->getType()->getPrimitiveSizeInBits() == 128 &&
2811
10
           Op1->getType()->getPrimitiveSizeInBits() == 128 && VWidth == 2 &&
2812
10
           Op1->getType()->getVectorNumElements() == 2 &&
2813
10
           "Unexpected operand size");
2814
10
2815
10
    // See if we're dealing with constant values.
2816
10
    Constant *C1 = dyn_cast<Constant>(Op1);
2817
10
    ConstantInt *CI11 =
2818
4
        C1 ? dyn_cast_or_null<ConstantInt>(C1->getAggregateElement((unsigned)1))
2819
6
           : nullptr;
2820
10
2821
10
    // Attempt to simplify to a constant, shuffle vector or INSERTQI call.
2822
10
    if (CI11) {
2823
4
      const APInt &V11 = CI11->getValue();
2824
4
      APInt Len = V11.zextOrTrunc(6);
2825
4
      APInt Idx = V11.lshr(8).zextOrTrunc(6);
2826
4
      if (Value *V = simplifyX86insertq(*II, Op0, Op1, Len, Idx, Builder))
2827
4
        return replaceInstUsesWith(*II, V);
2828
6
    }
2829
6
2830
6
    // INSERTQ only uses the lowest 64-bits of the first 128-bit vector
2831
6
    // operand.
2832
6
    if (Value *V = SimplifyDemandedVectorEltsLow(Op0, VWidth, 1)) {
2833
1
      II->setArgOperand(0, V);
2834
1
      return II;
2835
1
    }
2836
5
    break;
2837
5
  }
2838
5
2839
30
  case Intrinsic::x86_sse4a_insertqi: {
2840
30
    // INSERTQI: Extract lowest Length bits from lower half of second source and
2841
30
    // insert over first source starting at Index bit. The upper 64-bits are
2842
30
    // undefined.
2843
30
    Value *Op0 = II->getArgOperand(0);
2844
30
    Value *Op1 = II->getArgOperand(1);
2845
30
    unsigned VWidth0 = Op0->getType()->getVectorNumElements();
2846
30
    unsigned VWidth1 = Op1->getType()->getVectorNumElements();
2847
30
    assert(Op0->getType()->getPrimitiveSizeInBits() == 128 &&
2848
30
           Op1->getType()->getPrimitiveSizeInBits() == 128 && VWidth0 == 2 &&
2849
30
           VWidth1 == 2 && "Unexpected operand sizes");
2850
30
2851
30
    // See if we're dealing with constant values.
2852
30
    ConstantInt *CILength = dyn_cast<ConstantInt>(II->getArgOperand(2));
2853
30
    ConstantInt *CIIndex = dyn_cast<ConstantInt>(II->getArgOperand(3));
2854
30
2855
30
    // Attempt to simplify to a constant or shuffle vector.
2856
30
    if (CILength && CIIndex) {
2857
30
      APInt Len = CILength->getValue().zextOrTrunc(6);
2858
30
      APInt Idx = CIIndex->getValue().zextOrTrunc(6);
2859
30
      if (Value *V = simplifyX86insertq(*II, Op0, Op1, Len, Idx, Builder))
2860
8
        return replaceInstUsesWith(*II, V);
2861
22
    }
2862
22
2863
22
    // INSERTQI only uses the lowest 64-bits of the first two 128-bit vector
2864
22
    // operands.
2865
22
    bool MadeChange = false;
2866
22
    if (Value *V = SimplifyDemandedVectorEltsLow(Op0, VWidth0, 1)) {
2867
2
      II->setArgOperand(0, V);
2868
2
      MadeChange = true;
2869
2
    }
2870
22
    if (Value *V = SimplifyDemandedVectorEltsLow(Op1, VWidth1, 1)) {
2871
5
      II->setArgOperand(1, V);
2872
5
      MadeChange = true;
2873
5
    }
2874
22
    if (MadeChange)
2875
6
      return II;
2876
16
    break;
2877
16
  }
2878
16
2879
18
  case Intrinsic::x86_sse41_pblendvb:
2880
18
  case Intrinsic::x86_sse41_blendvps:
2881
18
  case Intrinsic::x86_sse41_blendvpd:
2882
18
  case Intrinsic::x86_avx_blendv_ps_256:
2883
18
  case Intrinsic::x86_avx_blendv_pd_256:
2884
18
  case Intrinsic::x86_avx2_pblendvb: {
2885
18
    // Convert blendv* to vector selects if the mask is constant.
2886
18
    // This optimization is convoluted because the intrinsic is defined as
2887
18
    // getting a vector of floats or doubles for the ps and pd versions.
2888
18
    // FIXME: That should be changed.
2889
18
2890
18
    Value *Op0 = II->getArgOperand(0);
2891
18
    Value *Op1 = II->getArgOperand(1);
2892
18
    Value *Mask = II->getArgOperand(2);
2893
18
2894
18
    // fold (blend A, A, Mask) -> A
2895
18
    if (Op0 == Op1)
2896
6
      return replaceInstUsesWith(CI, Op0);
2897
12
2898
12
    // Zero Mask - select 1st argument.
2899
12
    if (isa<ConstantAggregateZero>(Mask))
2900
6
      return replaceInstUsesWith(CI, Op0);
2901
6
2902
6
    // Constant Mask - select 1st/2nd argument lane based on top bit of mask.
2903
6
    if (auto *ConstantMask = dyn_cast<ConstantDataVector>(Mask)) {
2904
6
      Constant *NewSelector = getNegativeIsTrueBoolVec(ConstantMask);
2905
6
      return SelectInst::Create(NewSelector, Op1, Op0, "blendv");
2906
6
    }
2907
0
    break;
2908
0
  }
2909
0
2910
58
  case Intrinsic::x86_ssse3_pshuf_b_128:
2911
58
  case Intrinsic::x86_avx2_pshuf_b:
2912
58
  case Intrinsic::x86_avx512_pshuf_b_512:
2913
58
    if (Value *V = simplifyX86pshufb(*II, Builder))
2914
51
      return replaceInstUsesWith(*II, V);
2915
7
    break;
2916
7
2917
39
  case Intrinsic::x86_avx_vpermilvar_ps:
2918
39
  case Intrinsic::x86_avx_vpermilvar_ps_256:
2919
39
  case Intrinsic::x86_avx512_vpermilvar_ps_512:
2920
39
  case Intrinsic::x86_avx_vpermilvar_pd:
2921
39
  case Intrinsic::x86_avx_vpermilvar_pd_256:
2922
39
  case Intrinsic::x86_avx512_vpermilvar_pd_512:
2923
39
    if (Value *V = simplifyX86vpermilvar(*II, Builder))
2924
28
      return replaceInstUsesWith(*II, V);
2925
11
    break;
2926
11
2927
12
  case Intrinsic::x86_avx2_permd:
2928
12
  case Intrinsic::x86_avx2_permps:
2929
12
    if (Value *V = simplifyX86vpermv(*II, Builder))
2930
9
      return replaceInstUsesWith(*II, V);
2931
3
    break;
2932
3
2933
112
  case Intrinsic::x86_avx512_mask_permvar_df_256:
2934
112
  case Intrinsic::x86_avx512_mask_permvar_df_512:
2935
112
  case Intrinsic::x86_avx512_mask_permvar_di_256:
2936
112
  case Intrinsic::x86_avx512_mask_permvar_di_512:
2937
112
  case Intrinsic::x86_avx512_mask_permvar_hi_128:
2938
112
  case Intrinsic::x86_avx512_mask_permvar_hi_256:
2939
112
  case Intrinsic::x86_avx512_mask_permvar_hi_512:
2940
112
  case Intrinsic::x86_avx512_mask_permvar_qi_128:
2941
112
  case Intrinsic::x86_avx512_mask_permvar_qi_256:
2942
112
  case Intrinsic::x86_avx512_mask_permvar_qi_512:
2943
112
  case Intrinsic::x86_avx512_mask_permvar_sf_256:
2944
112
  case Intrinsic::x86_avx512_mask_permvar_sf_512:
2945
112
  case Intrinsic::x86_avx512_mask_permvar_si_256:
2946
112
  case Intrinsic::x86_avx512_mask_permvar_si_512:
2947
112
    if (Value *V = simplifyX86vpermv(*II, Builder)) {
2948
112
      // We simplified the permuting, now create a select for the masking.
2949
112
      V = emitX86MaskSelect(II->getArgOperand(3), V, II->getArgOperand(2),
2950
112
                            Builder);
2951
112
      return replaceInstUsesWith(*II, V);
2952
112
    }
2953
0
    break;
2954
0
2955
12
  case Intrinsic::x86_avx_maskload_ps:
2956
12
  case Intrinsic::x86_avx_maskload_pd:
2957
12
  case Intrinsic::x86_avx_maskload_ps_256:
2958
12
  case Intrinsic::x86_avx_maskload_pd_256:
2959
12
  case Intrinsic::x86_avx2_maskload_d:
2960
12
  case Intrinsic::x86_avx2_maskload_q:
2961
12
  case Intrinsic::x86_avx2_maskload_d_256:
2962
12
  case Intrinsic::x86_avx2_maskload_q_256:
2963
12
    if (Instruction *I = simplifyX86MaskedLoad(*II, *this))
2964
11
      return I;
2965
1
    break;
2966
1
2967
13
  case Intrinsic::x86_sse2_maskmov_dqu:
2968
13
  case Intrinsic::x86_avx_maskstore_ps:
2969
13
  case Intrinsic::x86_avx_maskstore_pd:
2970
13
  case Intrinsic::x86_avx_maskstore_ps_256:
2971
13
  case Intrinsic::x86_avx_maskstore_pd_256:
2972
13
  case Intrinsic::x86_avx2_maskstore_d:
2973
13
  case Intrinsic::x86_avx2_maskstore_q:
2974
13
  case Intrinsic::x86_avx2_maskstore_d_256:
2975
13
  case Intrinsic::x86_avx2_maskstore_q_256:
2976
13
    if (simplifyX86MaskedStore(*II, *this))
2977
12
      return nullptr;
2978
1
    break;
2979
1
2980
8
  case Intrinsic::x86_xop_vpcomb:
2981
8
  case Intrinsic::x86_xop_vpcomd:
2982
8
  case Intrinsic::x86_xop_vpcomq:
2983
8
  case Intrinsic::x86_xop_vpcomw:
2984
8
    if (Value *V = simplifyX86vpcom(*II, Builder, true))
2985
8
      return replaceInstUsesWith(*II, V);
2986
0
    break;
2987
0
2988
8
  case Intrinsic::x86_xop_vpcomub:
2989
8
  case Intrinsic::x86_xop_vpcomud:
2990
8
  case Intrinsic::x86_xop_vpcomuq:
2991
8
  case Intrinsic::x86_xop_vpcomuw:
2992
8
    if (Value *V = simplifyX86vpcom(*II, Builder, false))
2993
8
      return replaceInstUsesWith(*II, V);
2994
0
    break;
2995
0
2996
2
  case Intrinsic::ppc_altivec_vperm:
2997
2
    // Turn vperm(V1,V2,mask) -> shuffle(V1,V2,mask) if mask is a constant.
2998
2
    // Note that ppc_altivec_vperm has a big-endian bias, so when creating
2999
2
    // a vectorshuffle for little endian, we must undo the transformation
3000
2
    // performed on vec_perm in altivec.h.  That is, we must complement
3001
2
    // the permutation mask with respect to 31 and reverse the order of
3002
2
    // V1 and V2.
3003
2
    if (Constant *Mask = dyn_cast<Constant>(II->getArgOperand(2))) {
3004
2
      assert(Mask->getType()->getVectorNumElements() == 16 &&
3005
2
             "Bad type for intrinsic!");
3006
2
3007
2
      // Check that all of the elements are integer constants or undefs.
3008
2
      bool AllEltsOk = true;
3009
34
      for (unsigned i = 0; i != 16; ++i) {
3010
32
        Constant *Elt = Mask->getAggregateElement(i);
3011
32
        if (!Elt || !(isa<ConstantInt>(Elt) || isa<UndefValue>(Elt))) {
3012
0
          AllEltsOk = false;
3013
0
          break;
3014
0
        }
3015
32
      }
3016
2
3017
2
      if (AllEltsOk) {
3018
2
        // Cast the input vectors to byte vectors.
3019
2
        Value *Op0 = Builder.CreateBitCast(II->getArgOperand(0),
3020
2
                                           Mask->getType());
3021
2
        Value *Op1 = Builder.CreateBitCast(II->getArgOperand(1),
3022
2
                                           Mask->getType());
3023
2
        Value *Result = UndefValue::get(Op0->getType());
3024
2
3025
2
        // Only extract each element once.
3026
2
        Value *ExtractedElts[32];
3027
2
        memset(ExtractedElts, 0, sizeof(ExtractedElts));
3028
2
3029
34
        for (unsigned i = 0; i != 16; ++i) {
3030
32
          if (isa<UndefValue>(Mask->getAggregateElement(i)))
3031
0
            continue;
3032
32
          unsigned Idx =
3033
32
            cast<ConstantInt>(Mask->getAggregateElement(i))->getZExtValue();
3034
32
          Idx &= 31;  // Match the hardware behavior.
3035
32
          if (DL.isLittleEndian())
3036
32
            Idx = 31 - Idx;
3037
32
3038
32
          if (!ExtractedElts[Idx]) {
3039
17
            Value *Op0ToUse = (DL.isLittleEndian()) ? Op1 : Op0;
3040
17
            Value *Op1ToUse = (DL.isLittleEndian()) ? Op0 : Op1;
3041
17
            ExtractedElts[Idx] =
3042
17
              Builder.CreateExtractElement(Idx < 16 ? Op0ToUse : Op1ToUse,
3043
17
                                           Builder.getInt32(Idx&15));
3044
17
          }
3045
32
3046
32
          // Insert this value into the result vector.
3047
32
          Result = Builder.CreateInsertElement(Result, ExtractedElts[Idx],
3048
32
                                               Builder.getInt32(i));
3049
32
        }
3050
2
        return CastInst::Create(Instruction::BitCast, Result, CI.getType());
3051
2
      }
3052
0
    }
3053
0
    break;
3054
0
3055
6
  case Intrinsic::arm_neon_vld1:
3056
6
  case Intrinsic::arm_neon_vld2:
3057
6
  case Intrinsic::arm_neon_vld3:
3058
6
  case Intrinsic::arm_neon_vld4:
3059
6
  case Intrinsic::arm_neon_vld2lane:
3060
6
  case Intrinsic::arm_neon_vld3lane:
3061
6
  case Intrinsic::arm_neon_vld4lane:
3062
6
  case Intrinsic::arm_neon_vst1:
3063
6
  case Intrinsic::arm_neon_vst2:
3064
6
  case Intrinsic::arm_neon_vst3:
3065
6
  case Intrinsic::arm_neon_vst4:
3066
6
  case Intrinsic::arm_neon_vst2lane:
3067
6
  case Intrinsic::arm_neon_vst3lane:
3068
6
  case Intrinsic::arm_neon_vst4lane: {
3069
6
    unsigned MemAlign =
3070
6
        getKnownAlignment(II->getArgOperand(0), DL, II, &AC, &DT);
3071
6
    unsigned AlignArg = II->getNumArgOperands() - 1;
3072
6
    ConstantInt *IntrAlign = dyn_cast<ConstantInt>(II->getArgOperand(AlignArg));
3073
6
    if (IntrAlign && IntrAlign->getZExtValue() < MemAlign) {
3074
2
      II->setArgOperand(AlignArg,
3075
2
                        ConstantInt::get(Type::getInt32Ty(II->getContext()),
3076
2
                                         MemAlign, false));
3077
2
      return II;
3078
2
    }
3079
4
    break;
3080
4
  }
3081
4
3082
733
  case Intrinsic::arm_neon_vmulls:
3083
733
  case Intrinsic::arm_neon_vmullu:
3084
733
  case Intrinsic::aarch64_neon_smull:
3085
733
  case Intrinsic::aarch64_neon_umull: {
3086
733
    Value *Arg0 = II->getArgOperand(0);
3087
733
    Value *Arg1 = II->getArgOperand(1);
3088
733
3089
733
    // Handle mul by zero first:
3090
733
    if (isa<ConstantAggregateZero>(Arg0) || isa<ConstantAggregateZero>(Arg1)) {
3091
17
      return replaceInstUsesWith(CI, ConstantAggregateZero::get(II->getType()));
3092
17
    }
3093
716
3094
716
    // Check for constant LHS & RHS - in this case we just simplify.
3095
716
    bool Zext = (II->getIntrinsicID() == Intrinsic::arm_neon_vmullu ||
3096
715
                 II->getIntrinsicID() == Intrinsic::aarch64_neon_umull);
3097
716
    VectorType *NewVT = cast<VectorType>(II->getType());
3098
716
    if (Constant *CV0 = dyn_cast<Constant>(Arg0)) {
3099
207
      if (Constant *CV1 = dyn_cast<Constant>(Arg1)) {
3100
204
        CV0 = ConstantExpr::getIntegerCast(CV0, NewVT, /*isSigned=*/!Zext);
3101
204
        CV1 = ConstantExpr::getIntegerCast(CV1, NewVT, /*isSigned=*/!Zext);
3102
204
3103
204
        return replaceInstUsesWith(CI, ConstantExpr::getMul(CV0, CV1));
3104
204
      }
3105
3
3106
3
      // Couldn't simplify - canonicalize constant to the RHS.
3107
3
      std::swap(Arg0, Arg1);
3108
3
    }
3109
716
3110
716
    // Handle mul by one:
3111
512
    if (Constant *CV1 = dyn_cast<Constant>(Arg1))
3112
108
      if (ConstantInt *Splat =
3113
108
              dyn_cast_or_null<ConstantInt>(CV1->getSplatValue()))
3114
108
        if (Splat->isOne())
3115
1
          return CastInst::CreateIntegerCast(Arg0, II->getType(),
3116
1
                                             /*isSigned=*/!Zext);
3117
511
3118
511
    break;
3119
511
  }
3120
47
  case Intrinsic::amdgcn_rcp: {
3121
47
    Value *Src = II->getArgOperand(0);
3122
47
3123
47
    // TODO: Move to ConstantFolding/InstSimplify?
3124
47
    if (isa<UndefValue>(Src))
3125
1
      return replaceInstUsesWith(CI, Src);
3126
46
3127
46
    if (const ConstantFP *C = dyn_cast<ConstantFP>(Src)) {
3128
6
      const APFloat &ArgVal = C->getValueAPF();
3129
6
      APFloat Val(ArgVal.getSemantics(), 1.0);
3130
6
      APFloat::opStatus Status = Val.divide(ArgVal,
3131
6
                                            APFloat::rmNearestTiesToEven);
3132
6
      // Only do this if it was exact and therefore not dependent on the
3133
6
      // rounding mode.
3134
6
      if (Status == APFloat::opOK)
3135
4
        return replaceInstUsesWith(CI, ConstantFP::get(II->getContext(), Val));
3136
42
    }
3137
42
3138
42
    break;
3139
42
  }
3140
41
  case Intrinsic::amdgcn_rsq: {
3141
41
    Value *Src = II->getArgOperand(0);
3142
41
3143
41
    // TODO: Move to ConstantFolding/InstSimplify?
3144
41
    if (isa<UndefValue>(Src))
3145
1
      return replaceInstUsesWith(CI, Src);
3146
40
    break;
3147
40
  }
3148
122
  case Intrinsic::amdgcn_frexp_mant:
3149
122
  case Intrinsic::amdgcn_frexp_exp: {
3150
122
    Value *Src = II->getArgOperand(0);
3151
122
    if (const ConstantFP *C = dyn_cast<ConstantFP>(Src)) {
3152
38
      int Exp;
3153
38
      APFloat Significand = frexp(C->getValueAPF(), Exp,
3154
38
                                  APFloat::rmNearestTiesToEven);
3155
38
3156
38
      if (II->getIntrinsicID() == Intrinsic::amdgcn_frexp_mant) {
3157
18
        return replaceInstUsesWith(CI, ConstantFP::get(II->getContext(),
3158
18
                                                       Significand));
3159
18
      }
3160
20
3161
20
      // Match instruction special case behavior.
3162
20
      if (Exp == APFloat::IEK_NaN || Exp == APFloat::IEK_Inf)
3163
6
        Exp = 0;
3164
38
3165
38
      return replaceInstUsesWith(CI, ConstantInt::get(II->getType(), Exp));
3166
38
    }
3167
84
3168
84
    if (isa<UndefValue>(Src))
3169
4
      return replaceInstUsesWith(CI, UndefValue::get(II->getType()));
3170
80
3171
80
    break;
3172
80
  }
3173
75
  case Intrinsic::amdgcn_class: {
3174
75
    enum  {
3175
75
      S_NAN = 1 << 0,        // Signaling NaN
3176
75
      Q_NAN = 1 << 1,        // Quiet NaN
3177
75
      N_INFINITY = 1 << 2,   // Negative infinity
3178
75
      N_NORMAL = 1 << 3,     // Negative normal
3179
75
      N_SUBNORMAL = 1 << 4,  // Negative subnormal
3180
75
      N_ZERO = 1 << 5,       // Negative zero
3181
75
      P_ZERO = 1 << 6,       // Positive zero
3182
75
      P_SUBNORMAL = 1 << 7,  // Positive subnormal
3183
75
      P_NORMAL = 1 << 8,     // Positive normal
3184
75
      P_INFINITY = 1 << 9    // Positive infinity
3185
75
    };
3186
75
3187
75
    const uint32_t FullMask = S_NAN | Q_NAN | N_INFINITY | N_NORMAL |
3188
75
      N_SUBNORMAL | N_ZERO | P_ZERO | P_SUBNORMAL | P_NORMAL | P_INFINITY;
3189
75
3190
75
    Value *Src0 = II->getArgOperand(0);
3191
75
    Value *Src1 = II->getArgOperand(1);
3192
75
    const ConstantInt *CMask = dyn_cast<ConstantInt>(Src1);
3193
75
    if (!CMask) {
3194
43
      if (isa<UndefValue>(Src0))
3195
1
        return replaceInstUsesWith(*II, UndefValue::get(II->getType()));
3196
42
3197
42
      if (isa<UndefValue>(Src1))
3198
1
        return replaceInstUsesWith(*II, ConstantInt::get(II->getType(), false));
3199
41
      break;
3200
41
    }
3201
32
3202
32
    uint32_t Mask = CMask->getZExtValue();
3203
32
3204
32
    // If all tests are made, it doesn't matter what the value is.
3205
32
    if ((Mask & FullMask) == FullMask)
3206
2
      return replaceInstUsesWith(*II, ConstantInt::get(II->getType(), true));
3207
30
3208
30
    if ((Mask & FullMask) == 0)
3209
2
      return replaceInstUsesWith(*II, ConstantInt::get(II->getType(), false));
3210
28
3211
28
    if (Mask == (S_NAN | Q_NAN)) {
3212
1
      // Equivalent of isnan. Replace with standard fcmp.
3213
1
      Value *FCmp = Builder.CreateFCmpUNO(Src0, Src0);
3214
1
      FCmp->takeName(II);
3215
1
      return replaceInstUsesWith(*II, FCmp);
3216
1
    }
3217
27
3218
27
    const ConstantFP *CVal = dyn_cast<ConstantFP>(Src0);
3219
27
    if (!CVal) {
3220
4
      if (isa<UndefValue>(Src0))
3221
1
        return replaceInstUsesWith(*II, UndefValue::get(II->getType()));
3222
3
3223
3
      // Clamp mask to used bits
3224
3
      if ((Mask & FullMask) != Mask) {
3225
1
        CallInst *NewCall = Builder.CreateCall(II->getCalledFunction(),
3226
1
          { Src0, ConstantInt::get(Src1->getType(), Mask & FullMask) }
3227
1
        );
3228
1
3229
1
        NewCall->takeName(II);
3230
1
        return replaceInstUsesWith(*II, NewCall);
3231
1
      }
3232
2
3233
2
      break;
3234
2
    }
3235
23
3236
23
    const APFloat &Val = CVal->getValueAPF();
3237
23
3238
23
    bool Result =
3239
23
      ((Mask & S_NAN) && Val.isNaN() && Val.isSignaling()) ||
3240
22
      ((Mask & Q_NAN) && Val.isNaN() && !Val.isSignaling()) ||
3241
21
      ((Mask & N_INFINITY) && Val.isInfinity() && Val.isNegative()) ||
3242
20
      ((Mask & N_NORMAL) && Val.isNormal() && Val.isNegative()) ||
3243
19
      ((Mask & N_SUBNORMAL) && Val.isDenormal() && Val.isNegative()) ||
3244
18
      ((Mask & N_ZERO) && Val.isZero() && Val.isNegative()) ||
3245
17
      ((Mask & P_ZERO) && Val.isZero() && !Val.isNegative()) ||
3246
16
      ((Mask & P_SUBNORMAL) && Val.isDenormal() && !Val.isNegative()) ||
3247
15
      ((Mask & P_NORMAL) && Val.isNormal() && !Val.isNegative()) ||
3248
14
      ((Mask & P_INFINITY) && Val.isInfinity() && !Val.isNegative());
3249
23
3250
23
    return replaceInstUsesWith(*II, ConstantInt::get(II->getType(), Result));
3251
23
  }
3252
9
  case Intrinsic::amdgcn_cvt_pkrtz: {
3253
9
    Value *Src0 = II->getArgOperand(0);
3254
9
    Value *Src1 = II->getArgOperand(1);
3255
9
    if (const ConstantFP *C0 = dyn_cast<ConstantFP>(Src0)) {
3256
4
      if (const ConstantFP *C1 = dyn_cast<ConstantFP>(Src1)) {
3257
3
        const fltSemantics &HalfSem
3258
3
          = II->getType()->getScalarType()->getFltSemantics();
3259
3
        bool LosesInfo;
3260
3
        APFloat Val0 = C0->getValueAPF();
3261
3
        APFloat Val1 = C1->getValueAPF();
3262
3
        Val0.convert(HalfSem, APFloat::rmTowardZero, &LosesInfo);
3263
3
        Val1.convert(HalfSem, APFloat::rmTowardZero, &LosesInfo);
3264
3
3265
3
        Constant *Folded = ConstantVector::get({
3266
3
            ConstantFP::get(II->getContext(), Val0),
3267
3
            ConstantFP::get(II->getContext(), Val1) });
3268
3
        return replaceInstUsesWith(*II, Folded);
3269
3
      }
3270
6
    }
3271
6
3272
6
    if (isa<UndefValue>(Src0) && isa<UndefValue>(Src1))
3273
1
      return replaceInstUsesWith(*II, UndefValue::get(II->getType()));
3274
5
3275
5
    break;
3276
5
  }
3277
37
  case Intrinsic::amdgcn_ubfe:
3278
37
  case Intrinsic::amdgcn_sbfe: {
3279
37
    // Decompose simple cases into standard shifts.
3280
37
    Value *Src = II->getArgOperand(0);
3281
37
    if (isa<UndefValue>(Src))
3282
1
      return replaceInstUsesWith(*II, Src);
3283
36
3284
36
    unsigned Width;
3285
36
    Type *Ty = II->getType();
3286
36
    unsigned IntSize = Ty->getIntegerBitWidth();
3287
36
3288
36
    ConstantInt *CWidth = dyn_cast<ConstantInt>(II->getArgOperand(2));
3289
36
    if (CWidth) {
3290
19
      Width = CWidth->getZExtValue();
3291
19
      if ((Width & (IntSize - 1)) == 0)
3292
4
        return replaceInstUsesWith(*II, ConstantInt::getNullValue(Ty));
3293
15
3294
15
      if (Width >= IntSize) {
3295
2
        // Hardware ignores high bits, so remove those.
3296
2
        II->setArgOperand(2, ConstantInt::get(CWidth->getType(),
3297
2
                                              Width & (IntSize - 1)));
3298
2
        return II;
3299
2
      }
3300
30
    }
3301
30
3302
30
    unsigned Offset;
3303
30
    ConstantInt *COffset = dyn_cast<ConstantInt>(II->getArgOperand(1));
3304
30
    if (COffset) {
3305
22
      Offset = COffset->getZExtValue();
3306
22
      if (Offset >= IntSize) {
3307
5
        II->setArgOperand(1, ConstantInt::get(COffset->getType(),
3308
5
                                              Offset & (IntSize - 1)));
3309
5
        return II;
3310
5
      }
3311
25
    }
3312
25
3313
25
    bool Signed = II->getIntrinsicID() == Intrinsic::amdgcn_sbfe;
3314
25
3315
25
    // TODO: Also emit sub if only width is constant.
3316
25
    if (!CWidth && COffset && Offset == 0) {
3317
5
      Constant *KSize = ConstantInt::get(COffset->getType(), IntSize);
3318
5
      Value *ShiftVal = Builder.CreateSub(KSize, II->getArgOperand(2));
3319
5
      ShiftVal = Builder.CreateZExt(ShiftVal, II->getType());
3320
5
3321
5
      Value *Shl = Builder.CreateShl(Src, ShiftVal);
3322
1
      Value *RightShift = Signed ? Builder.CreateAShr(Shl, ShiftVal)
3323
4
                                 : Builder.CreateLShr(Shl, ShiftVal);
3324
5
      RightShift->takeName(II);
3325
5
      return replaceInstUsesWith(*II, RightShift);
3326
5
    }
3327
20
3328
20
    if (!CWidth || !COffset)
3329
12
      break;
3330
8
3331
8
    // TODO: This allows folding to undef when the hardware has specific
3332
8
    // behavior?
3333
8
    if (Offset + Width < IntSize) {
3334
6
      Value *Shl = Builder.CreateShl(Src, IntSize - Offset - Width);
3335
1
      Value *RightShift = Signed ? Builder.CreateAShr(Shl, IntSize - Width)
3336
5
                                 : Builder.CreateLShr(Shl, IntSize - Width);
3337
6
      RightShift->takeName(II);
3338
6
      return replaceInstUsesWith(*II, RightShift);
3339
6
    }
3340
2
3341
2
    Value *RightShift = Signed ? Builder.CreateAShr(Src, Offset)
3342
1
                               : Builder.CreateLShr(Src, Offset);
3343
2
3344
2
    RightShift->takeName(II);
3345
2
    return replaceInstUsesWith(*II, RightShift);
3346
2
  }
3347
71
  case Intrinsic::amdgcn_exp:
3348
71
  case Intrinsic::amdgcn_exp_compr: {
3349
71
    ConstantInt *En = dyn_cast<ConstantInt>(II->getArgOperand(1));
3350
71
    if (!En) // Illegal.
3351
2
      break;
3352
69
3353
69
    unsigned EnBits = En->getZExtValue();
3354
69
    if (EnBits == 0xf)
3355
5
      break; // All inputs enabled.
3356
64
3357
64
    bool IsCompr = II->getIntrinsicID() == Intrinsic::amdgcn_exp_compr;
3358
64
    bool Changed = false;
3359
264
    for (int I = 0; I < (IsCompr ? 2 : 4); ++I) {
3360
200
      if ((!IsCompr && (EnBits & (1 << I)) == 0) ||
3361
200
          (IsCompr && ((EnBits & (0x3 << (2 * I))) == 0))) {
3362
135
        Value *Src = II->getArgOperand(I + 2);
3363
135
        if (!isa<UndefValue>(Src)) {
3364
45
          II->setArgOperand(I + 2, UndefValue::get(Src->getType()));
3365
45
          Changed = true;
3366
45
        }
3367
135
      }
3368
200
    }
3369
64
3370
64
    if (Changed)
3371
21
      return II;
3372
43
3373
43
    break;
3374
43
3375
43
  }
3376
65
  case Intrinsic::amdgcn_fmed3: {
3377
65
    // Note this does not preserve proper sNaN behavior if IEEE-mode is enabled
3378
65
    // for the shader.
3379
65
3380
65
    Value *Src0 = II->getArgOperand(0);
3381
65
    Value *Src1 = II->getArgOperand(1);
3382
65
    Value *Src2 = II->getArgOperand(2);
3383
65
3384
65
    bool Swap = false;
3385
65
    // Canonicalize constants to RHS operands.
3386
65
    //
3387
65
    // fmed3(c0, x, c1) -> fmed3(x, c0, c1)
3388
65
    if (isa<Constant>(Src0) && !isa<Constant>(Src1)) {
3389
7
      std::swap(Src0, Src1);
3390
7
      Swap = true;
3391
7
    }
3392
65
3393
65
    if (isa<Constant>(Src1) && !isa<Constant>(Src2)) {
3394
10
      std::swap(Src1, Src2);
3395
10
      Swap = true;
3396
10
    }
3397
65
3398
65
    if (isa<Constant>(Src0) && !isa<Constant>(Src1)) {
3399
2
      std::swap(Src0, Src1);
3400
2
      Swap = true;
3401
2
    }
3402
65
3403
65
    if (Swap) {
3404
12
      II->setArgOperand(0, Src0);
3405
12
      II->setArgOperand(1, Src1);
3406
12
      II->setArgOperand(2, Src2);
3407
12
      return II;
3408
12
    }
3409
53
3410
53
    if (match(Src2, m_NaN()) || isa<UndefValue>(Src2)) {
3411
12
      CallInst *NewCall = Builder.CreateMinNum(Src0, Src1);
3412
12
      NewCall->copyFastMathFlags(II);
3413
12
      NewCall->takeName(II);
3414
12
      return replaceInstUsesWith(*II, NewCall);
3415
12
    }
3416
41
3417
41
    if (const ConstantFP *C0 = dyn_cast<ConstantFP>(Src0)) {
3418
6
      if (const ConstantFP *C1 = dyn_cast<ConstantFP>(Src1)) {
3419
6
        if (const ConstantFP *C2 = dyn_cast<ConstantFP>(Src2)) {
3420
6
          APFloat Result = fmed3AMDGCN(C0->getValueAPF(), C1->getValueAPF(),
3421
6
                                       C2->getValueAPF());
3422
6
          return replaceInstUsesWith(*II,
3423
6
            ConstantFP::get(Builder.getContext(), Result));
3424
6
        }
3425
35
      }
3426
6
    }
3427
35
3428
35
    break;
3429
35
  }
3430
182
  case Intrinsic::amdgcn_icmp:
3431
182
  case Intrinsic::amdgcn_fcmp: {
3432
182
    const ConstantInt *CC = dyn_cast<ConstantInt>(II->getArgOperand(2));
3433
182
    if (!CC)
3434
2
      break;
3435
180
3436
180
    // Guard against invalid arguments.
3437
180
    int64_t CCVal = CC->getZExtValue();
3438
180
    bool IsInteger = II->getIntrinsicID() == Intrinsic::amdgcn_icmp;
3439
180
    if ((IsInteger && (CCVal < CmpInst::FIRST_ICMP_PREDICATE ||
3440
129
                       CCVal > CmpInst::LAST_ICMP_PREDICATE)) ||
3441
178
        (!IsInteger && (CCVal < CmpInst::FIRST_FCMP_PREDICATE ||
3442
51
                        CCVal > CmpInst::LAST_FCMP_PREDICATE)))
3443
4
      break;
3444
176
3445
176
    Value *Src0 = II->getArgOperand(0);
3446
176
    Value *Src1 = II->getArgOperand(1);
3447
176
3448
176
    if (auto *CSrc0 = dyn_cast<Constant>(Src0)) {
3449
7
      if (auto *CSrc1 = dyn_cast<Constant>(Src1)) {
3450
4
        Constant *CCmp = ConstantExpr::getCompare(CCVal, CSrc0, CSrc1);
3451
4
        if (CCmp->isNullValue()) {
3452
2
          return replaceInstUsesWith(
3453
2
              *II, ConstantExpr::getSExt(CCmp, II->getType()));
3454
2
        }
3455
2
3456
2
        // The result of V_ICMP/V_FCMP assembly instructions (which this
3457
2
        // intrinsic exposes) is one bit per thread, masked with the EXEC
3458
2
        // register (which contains the bitmask of live threads). So a
3459
2
        // comparison that always returns true is the same as a read of the
3460
2
        // EXEC register.
3461
2
        Value *NewF = Intrinsic::getDeclaration(
3462
2
            II->getModule(), Intrinsic::read_register, II->getType());
3463
2
        Metadata *MDArgs[] = {MDString::get(II->getContext(), "exec")};
3464
2
        MDNode *MD = MDNode::get(II->getContext(), MDArgs);
3465
2
        Value *Args[] = {MetadataAsValue::get(II->getContext(), MD)};
3466
2
        CallInst *NewCall = Builder.CreateCall(NewF, Args);
3467
2
        NewCall->addAttribute(AttributeList::FunctionIndex,
3468
2
                              Attribute::Convergent);
3469
2
        NewCall->takeName(II);
3470
2
        return replaceInstUsesWith(*II, NewCall);
3471
2
      }
3472
3
3473
3
      // Canonicalize constants to RHS.
3474
3
      CmpInst::Predicate SwapPred
3475
3
        = CmpInst::getSwappedPredicate(static_cast<CmpInst::Predicate>(CCVal));
3476
3
      II->setArgOperand(0, Src1);
3477
3
      II->setArgOperand(1, Src0);
3478
3
      II->setArgOperand(2, ConstantInt::get(CC->getType(),
3479
3
                                            static_cast<int>(SwapPred)));
3480
3
      return II;
3481
3
    }
3482
169
3483
169
    if (CCVal != CmpInst::ICMP_EQ && CCVal != CmpInst::ICMP_NE)
3484
92
      break;
3485
77
3486
77
    // Canonicalize compare eq with true value to compare != 0
3487
77
    // llvm.amdgcn.icmp(zext (i1 x), 1, eq)
3488
77
    //   -> llvm.amdgcn.icmp(zext (i1 x), 0, ne)
3489
77
    // llvm.amdgcn.icmp(sext (i1 x), -1, eq)
3490
77
    //   -> llvm.amdgcn.icmp(sext (i1 x), 0, ne)
3491
77
    Value *ExtSrc;
3492
77
    if (CCVal == CmpInst::ICMP_EQ &&
3493
54
        ((match(Src1, m_One()) && match(Src0, m_ZExt(m_Value(ExtSrc)))) ||
3494
54
         (match(Src1, m_AllOnes()) && match(Src0, m_SExt(m_Value(ExtSrc))))) &&
3495
77
        ExtSrc->getType()->isIntegerTy(1)) {
3496
6
      II->setArgOperand(1, ConstantInt::getNullValue(Src1->getType()));
3497
6
      II->setArgOperand(2, ConstantInt::get(CC->getType(), CmpInst::ICMP_NE));
3498
6
      return II;
3499
6
    }
3500
71
3501
71
    CmpInst::Predicate SrcPred;
3502
71
    Value *SrcLHS;
3503
71
    Value *SrcRHS;
3504
71
3505
71
    // Fold compare eq/ne with 0 from a compare result as the predicate to the
3506
71
    // intrinsic. The typical use is a wave vote function in the library, which
3507
71
    // will be fed from a user code condition compared with 0. Fold in the
3508
71
    // redundant compare.
3509
71
3510
71
    // llvm.amdgcn.icmp([sz]ext ([if]cmp pred a, b), 0, ne)
3511
71
    //   -> llvm.amdgcn.[if]cmp(a, b, pred)
3512
71
    //
3513
71
    // llvm.amdgcn.icmp([sz]ext ([if]cmp pred a, b), 0, eq)
3514
71
    //   -> llvm.amdgcn.[if]cmp(a, b, inv pred)
3515
71
    if (match(Src1, m_Zero()) &&
3516
24
        match(Src0,
3517
71
              m_ZExtOrSExt(m_Cmp(SrcPred, m_Value(SrcLHS), m_Value(SrcRHS))))) {
3518
18
      if (CCVal == CmpInst::ICMP_EQ)
3519
5
        SrcPred = CmpInst::getInversePredicate(SrcPred);
3520
18
3521
18
      Intrinsic::ID NewIID = CmpInst::isFPPredicate(SrcPred) ?
3522
18
        Intrinsic::amdgcn_fcmp : Intrinsic::amdgcn_icmp;
3523
18
3524
18
      Value *NewF = Intrinsic::getDeclaration(II->getModule(), NewIID,
3525
18
                                              SrcLHS->getType());
3526
18
      Value *Args[] = { SrcLHS, SrcRHS,
3527
18
                        ConstantInt::get(CC->getType(), SrcPred) };
3528
18
      CallInst *NewCall = Builder.CreateCall(NewF, Args);
3529
18
      NewCall->takeName(II);
3530
18
      return replaceInstUsesWith(*II, NewCall);
3531
18
    }
3532
53
3533
53
    break;
3534
53
  }
3535
129
  case Intrinsic::stackrestore: {
3536
129
    // If the save is right next to the restore, remove the restore.  This can
3537
129
    // happen when variable allocas are DCE'd.
3538
129
    if (IntrinsicInst *SS = dyn_cast<IntrinsicInst>(II->getArgOperand(0))) {
3539
126
      if (SS->getIntrinsicID() == Intrinsic::stacksave) {
3540
126
        if (&*++SS->getIterator() == II)
3541
3
          return eraseInstFromFunction(CI);
3542
126
      }
3543
126
    }
3544
126
3545
126
    // Scan down this block to see if there is another stack restore in the
3546
126
    // same block without an intervening call/alloca.
3547
126
    BasicBlock::iterator BI(II);
3548
126
    TerminatorInst *TI = II->getParent()->getTerminator();
3549
126
    bool CannotRemove = false;
3550
134
    for (++BI; &*BI != TI; ++BI) {
3551
87
      if (isa<AllocaInst>(BI)) {
3552
0
        CannotRemove = true;
3553
0
        break;
3554
0
      }
3555
87
      if (CallInst *BCI = dyn_cast<CallInst>(BI)) {
3556
79
        if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(BCI)) {
3557
79
          // If there is a stackrestore below this one, remove this one.
3558
79
          if (II->getIntrinsicID() == Intrinsic::stackrestore)
3559
3
            return eraseInstFromFunction(CI);
3560
76
3561
76
          // Bail if we cross over an intrinsic with side effects, such as
3562
76
          // llvm.stacksave, llvm.read_register, or llvm.setjmp.
3563
76
          if (II->mayHaveSideEffects()) {
3564
76
            CannotRemove = true;
3565
76
            break;
3566
76
          }
3567
0
        } else {
3568
0
          // If we found a non-intrinsic call, we can't remove the stack
3569
0
          // restore.
3570
0
          CannotRemove = true;
3571
0
          break;
3572
0
        }
3573
79
      }
3574
87
    }
3575
126
3576
126
    // If the stack restore is in a return, resume, or unwind block and if there
3577
126
    // are no allocas or calls between the restore and the return, nuke the
3578
126
    // restore.
3579
123
    if (!CannotRemove && (isa<ReturnInst>(TI) || isa<ResumeInst>(TI)))
3580
17
      return eraseInstFromFunction(CI);
3581
106
    break;
3582
106
  }
3583
856k
  case Intrinsic::lifetime_start:
3584
856k
    // Asan needs to poison memory to detect invalid access which is possible
3585
856k
    // even for empty lifetime range.
3586
856k
    if (II->getFunction()->hasFnAttribute(Attribute::SanitizeAddress))
3587
19
      break;
3588
856k
3589
856k
    if (removeTriviallyEmptyRange(*II, Intrinsic::lifetime_start,
3590
856k
                                  Intrinsic::lifetime_end, *this))
3591
6
      return nullptr;
3592
856k
    break;
3593
390
  case Intrinsic::assume: {
3594
390
    Value *IIOperand = II->getArgOperand(0);
3595
390
    // Remove an assume if it is immediately followed by an identical assume.
3596
390
    if (match(II->getNextNode(),
3597
390
              m_Intrinsic<Intrinsic::assume>(m_Specific(IIOperand))))
3598
0
      return eraseInstFromFunction(CI);
3599
390
3600
390
    // Canonicalize assume(a && b) -> assume(a); assume(b);
3601
390
    // Note: New assumption intrinsics created here are registered by
3602
390
    // the InstCombineIRInserter object.
3603
390
    Value *AssumeIntrinsic = II->getCalledValue(), *A, *B;
3604
390
    if (match(IIOperand, m_And(m_Value(A), m_Value(B)))) {
3605
2
      Builder.CreateCall(AssumeIntrinsic, A, II->getName());
3606
2
      Builder.CreateCall(AssumeIntrinsic, B, II->getName());
3607
2
      return eraseInstFromFunction(*II);
3608
2
    }
3609
388
    // assume(!(a || b)) -> assume(!a); assume(!b);
3610
388
    if (match(IIOperand, m_Not(m_Or(m_Value(A), m_Value(B))))) {
3611
1
      Builder.CreateCall(AssumeIntrinsic, Builder.CreateNot(A), II->getName());
3612
1
      Builder.CreateCall(AssumeIntrinsic, Builder.CreateNot(B), II->getName());
3613
1
      return eraseInstFromFunction(*II);
3614
1
    }
3615
387
3616
387
    // assume( (load addr) != null ) -> add 'nonnull' metadata to load
3617
387
    // (if assume is valid at the load)
3618
387
    CmpInst::Predicate Pred;
3619
387
    Instruction *LHS;
3620
387
    if (match(IIOperand, m_ICmp(Pred, m_Instruction(LHS), m_Zero())) &&
3621
387
        Pred == ICmpInst::ICMP_NE && LHS->getOpcode() == Instruction::Load &&
3622
5
        LHS->getType()->isPointerTy() &&
3623
387
        isValidAssumeForContext(II, LHS, &DT)) {
3624
1
      MDNode *MD = MDNode::get(II->getContext(), None);
3625
1
      LHS->setMetadata(LLVMContext::MD_nonnull, MD);
3626
1
      return eraseInstFromFunction(*II);
3627
1
3628
1
      // TODO: apply nonnull return attributes to calls and invokes
3629
1
      // TODO: apply range metadata for range check patterns?
3630
1
    }
3631
386
3632
386
    // If there is a dominating assume with the same condition as this one,
3633
386
    // then this one is redundant, and should be removed.
3634
386
    KnownBits Known(1);
3635
386
    computeKnownBits(IIOperand, Known, 0, II);
3636
386
    if (Known.isAllOnes())
3637
3
      return eraseInstFromFunction(*II);
3638
383
3639
383
    // Update the cache of affected values for this assumption (we might be
3640
383
    // here because we just simplified the condition).
3641
383
    AC.updateAffectedValues(II);
3642
383
    break;
3643
383
  }
3644
27
  case Intrinsic::experimental_gc_relocate: {
3645
27
    // Translate facts known about a pointer before relocating into
3646
27
    // facts about the relocate value, while being careful to
3647
27
    // preserve relocation semantics.
3648
27
    Value *DerivedPtr = cast<GCRelocateInst>(II)->getDerivedPtr();
3649
27
3650
27
    // Remove the relocation if unused, note that this check is required
3651
27
    // to prevent the cases below from looping forever.
3652
27
    if (II->use_empty())
3653
0
      return eraseInstFromFunction(*II);
3654
27
3655
27
    // Undef is undef, even after relocation.
3656
27
    // TODO: provide a hook for this in GCStrategy.  This is clearly legal for
3657
27
    // most practical collectors, but there was discussion in the review thread
3658
27
    // about whether it was legal for all possible collectors.
3659
27
    if (isa<UndefValue>(DerivedPtr))
3660
27
      // Use undef of gc_relocate's type to replace it.
3661
1
      return replaceInstUsesWith(*II, UndefValue::get(II->getType()));
3662
26
3663
26
    if (auto *PT = dyn_cast<PointerType>(II->getType())) {
3664
24
      // The relocation of null will be null for most any collector.
3665
24
      // TODO: provide a hook for this in GCStrategy.  There might be some
3666
24
      // weird collector this property does not hold for.
3667
24
      if (isa<ConstantPointerNull>(DerivedPtr))
3668
24
        // Use null-pointer of gc_relocate's type to replace it.
3669
1
        return replaceInstUsesWith(*II, ConstantPointerNull::get(PT));
3670
23
3671
23
      // isKnownNonNull -> nonnull attribute
3672
23
      if (isKnownNonZero(DerivedPtr, DL, 0, &AC, II, &DT))
3673
3
        II->addAttribute(AttributeList::ReturnIndex, Attribute::NonNull);
3674
24
    }
3675
26
3676
26
    // TODO: bitcast(relocate(p)) -> relocate(bitcast(p))
3677
26
    // Canonicalize on the type from the uses to the defs
3678
26
3679
26
    // TODO: relocate((gep p, C, C2, ...)) -> gep(relocate(p), C, C2, ...)
3680
25
    break;
3681
26
  }
3682
26
3683
16
  case Intrinsic::experimental_guard: {
3684
16
    // Is this guard followed by another guard?
3685
16
    Instruction *NextInst = II->getNextNode();
3686
16
    Value *NextCond = nullptr;
3687
16
    if (match(NextInst,
3688
16
              m_Intrinsic<Intrinsic::experimental_guard>(m_Value(NextCond)))) {
3689
11
      Value *CurrCond = II->getArgOperand(0);
3690
11
3691
11
      // Remove a guard that it is immediately preceded by an identical guard.
3692
11
      if (CurrCond == NextCond)
3693
9
        return eraseInstFromFunction(*NextInst);
3694
2
3695
2
      // Otherwise canonicalize guard(a); guard(b) -> guard(a & b).
3696
2
      II->setArgOperand(0, Builder.CreateAnd(CurrCond, NextCond));
3697
2
      return eraseInstFromFunction(*NextInst);
3698
2
    }
3699
5
    break;
3700
5
  }
3701
2.85M
  }
3702
2.85M
  return visitCallSite(II);
3703
2.85M
}
3704
3705
// Fence instruction simplification
3706
34.6k
Instruction *InstCombiner::visitFenceInst(FenceInst &FI) {
3707
34.6k
  // Remove identical consecutive fences.
3708
34.6k
  if (auto *NFI = dyn_cast<FenceInst>(FI.getNextNode()))
3709
10
    if (FI.isIdenticalTo(NFI))
3710
4
      return eraseInstFromFunction(FI);
3711
34.6k
  return nullptr;
3712
34.6k
}
3713
3714
// InvokeInst simplification
3715
//
3716
306k
Instruction *InstCombiner::visitInvokeInst(InvokeInst &II) {
3717
306k
  return visitCallSite(&II);
3718
306k
}
3719
3720
/// If this cast does not affect the value passed through the varargs area, we
3721
/// can eliminate the use of the cast.
3722
static bool isSafeToEliminateVarargsCast(const CallSite CS,
3723
                                         const DataLayout &DL,
3724
                                         const CastInst *const CI,
3725
108k
                                         const int ix) {
3726
108k
  if (!CI->isLosslessCast())
3727
108k
    return false;
3728
212
3729
212
  // If this is a GC intrinsic, avoid munging types.  We need types for
3730
212
  // statepoint reconstruction in SelectionDAG.
3731
212
  // TODO: This is probably something which should be expanded to all
3732
212
  // intrinsics since the entire point of intrinsics is that
3733
212
  // they are understandable by the optimizer.
3734
212
  if (isStatepoint(CS) || isGCRelocate(CS) || isGCResult(CS))
3735
0
    return false;
3736
212
3737
212
  // The size of ByVal or InAlloca arguments is derived from the type, so we
3738
212
  // can't change to a type with a different size.  If the size were
3739
212
  // passed explicitly we could avoid this check.
3740
212
  if (!CS.isByValOrInAllocaArgument(ix))
3741
211
    return true;
3742
1
3743
1
  Type* SrcTy =
3744
1
            cast<PointerType>(CI->getOperand(0)->getType())->getElementType();
3745
1
  Type* DstTy = cast<PointerType>(CI->getType())->getElementType();
3746
1
  if (!SrcTy->isSized() || !DstTy->isSized())
3747
0
    return false;
3748
1
  if (DL.getTypeAllocSize(SrcTy) != DL.getTypeAllocSize(DstTy))
3749
1
    return false;
3750
0
  return true;
3751
0
}
3752
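The checks in isSafeToEliminateVarargsCast reduce to three questions: is the cast lossless, is the call GC-related (statepoint, gc.relocate, gc.result), and, for byval or inalloca arguments, do the two pointee types have the same allocated size. A compact sketch of that decision with the inputs passed in as plain values (the names below are illustrative, not the LLVM interfaces used above):

#include <cstdint>

// Decide whether a pointer cast on a varargs argument can be dropped,
// following the same order of checks as isSafeToEliminateVarargsCast above.
bool canDropVarargsCast(bool IsLosslessCast, bool IsGCCall,
                        bool IsByValOrInAlloca,
                        uint64_t SrcAllocSize, uint64_t DstAllocSize) {
  if (!IsLosslessCast)
    return false;                          // the cast changes the value representation
  if (IsGCCall)
    return false;                          // keep types intact for statepoint lowering
  if (!IsByValOrInAlloca)
    return true;                           // size is not derived from the type, safe
  return SrcAllocSize == DstAllocSize;     // byval/inalloca: sizes must match
}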
3753
27.3M
Instruction *InstCombiner::tryOptimizeCall(CallInst *CI) {
3754
27.3M
  if (!CI->getCalledFunction()) return nullptr;
3755
26.2M
3756
26.2M
  auto InstCombineRAUW = [this](Instruction *From, Value *With) {
3757
25
    replaceInstUsesWith(*From, With);
3758
25
  };
3759
26.2M
  LibCallSimplifier Simplifier(DL, &TLI, ORE, InstCombineRAUW);
3760
26.2M
  if (Value *With = Simplifier.optimizeCall(CI)) {
3761
39.5k
    ++NumSimplified;
3762
39.5k
    return CI->use_empty() ? CI : replaceInstUsesWith(*CI, With);
3763
39.5k
  }
3764
26.2M
3765
26.2M
  return nullptr;
3766
26.2M
}
3767
3768
10
static IntrinsicInst *findInitTrampolineFromAlloca(Value *TrampMem) {
3769
10
  // Strip off at most one level of pointer casts, looking for an alloca.  This
3770
10
  // is good enough in practice and simpler than handling any number of casts.
3771
10
  Value *Underlying = TrampMem->stripPointerCasts();
3772
10
  if (Underlying != TrampMem &&
3773
5
      (!Underlying->hasOneUse() || Underlying->user_back() != TrampMem))
3774
0
    return nullptr;
3775
10
  if (!isa<AllocaInst>(Underlying))
3776
5
    return nullptr;
3777
5
3778
5
  IntrinsicInst *InitTrampoline = nullptr;
3779
12
  for (User *U : TrampMem->users()) {
3780
12
    IntrinsicInst *II = dyn_cast<IntrinsicInst>(U);
3781
12
    if (!II)
3782
0
      return nullptr;
3783
12
    if (II->getIntrinsicID() == Intrinsic::init_trampoline) {
3784
5
      if (InitTrampoline)
3785
5
        // More than one init_trampoline writes to this value.  Give up.
3786
0
        return nullptr;
3787
5
      InitTrampoline = II;
3788
5
      continue;
3789
5
    }
3790
7
    if (II->getIntrinsicID() == Intrinsic::adjust_trampoline)
3791
7
      // Allow any number of calls to adjust.trampoline.
3792
7
      continue;
3793
0
    return nullptr;
3794
0
  }
3795
5
3796
5
  // No call to init.trampoline found.
3797
5
  if (!InitTrampoline)
3798
0
    return nullptr;
3799
5
3800
5
  // Check that the alloca is being used in the expected way.
3801
5
  if (InitTrampoline->getOperand(0) != TrampMem)
3802
0
    return nullptr;
3803
5
3804
5
  return InitTrampoline;
3805
5
}
3806
3807
static IntrinsicInst *findInitTrampolineFromBB(IntrinsicInst *AdjustTramp,
3808
5
                                               Value *TrampMem) {
3809
5
  // Visit all the previous instructions in the basic block, and try to find a
3810
5
  // init.trampoline which has a direct path to the adjust.trampoline.
3811
5
  for (BasicBlock::iterator I = AdjustTramp->getIterator(),
3812
5
                            E = AdjustTramp->getParent()->begin();
3813
5
       I != E;) {
3814
4
    Instruction *Inst = &*--I;
3815
4
    if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I))
3816
2
      if (II->getIntrinsicID() == Intrinsic::init_trampoline &&
3817
2
          II->getOperand(0) == TrampMem)
3818
2
        return II;
3819
2
    if (Inst->mayWriteToMemory())
3820
2
      return nullptr;
3821
4
  }
3822
1
  return nullptr;
3823
5
}
3824
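findInitTrampolineFromBB walks backwards from the adjust.trampoline call and gives up as soon as any instruction might write memory before a matching init.trampoline is found. The standalone sketch below models that walk, summarizing each earlier instruction by two flags (InstSummary and findInitBeforeAdjust are illustrative names):

#include <vector>

// Summary of one instruction seen while walking backwards from
// adjust.trampoline: does it initialize the trampoline memory we care
// about, and could it write memory at all?
struct InstSummary {
  bool InitsOurTrampoline;
  bool MayWriteToMemory;
};

// Walk the summaries starting from the instruction just before
// adjust.trampoline towards the start of the block (index 0 is the closest
// instruction).  Return the index of a matching init.trampoline, or -1 if a
// potential memory write is seen first, mirroring findInitTrampolineFromBB.
int findInitBeforeAdjust(const std::vector<InstSummary> &Preceding) {
  for (int I = 0, E = (int)Preceding.size(); I != E; ++I) {
    if (Preceding[I].InitsOurTrampoline)
      return I;
    if (Preceding[I].MayWriteToMemory)
      return -1;
  }
  return -1;
}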
3825
// Given a call to llvm.adjust.trampoline, find and return the corresponding
3826
// call to llvm.init.trampoline if the call to the trampoline can be optimized
3827
// to a direct call to a function.  Otherwise return NULL.
3828
//
3829
27.5M
static IntrinsicInst *findInitTrampoline(Value *Callee) {
3830
27.5M
  Callee = Callee->stripPointerCasts();
3831
27.5M
  IntrinsicInst *AdjustTramp = dyn_cast<IntrinsicInst>(Callee);
3832
27.5M
  if (!AdjustTramp ||
3833
171
      AdjustTramp->getIntrinsicID() != Intrinsic::adjust_trampoline)
3834
27.5M
    return nullptr;
3835
10
3836
10
  Value *TrampMem = AdjustTramp->getOperand(0);
3837
10
3838
10
  if (IntrinsicInst *IT = findInitTrampolineFromAlloca(TrampMem))
3839
5
    return IT;
3840
5
  if (IntrinsicInst *IT = findInitTrampolineFromBB(AdjustTramp, TrampMem))
3841
2
    return IT;
3842
3
  return nullptr;
3843
3
}
3844
3845
/// Improvements for call and invoke instructions.
3846
27.7M
Instruction *InstCombiner::visitCallSite(CallSite CS) {
3847
27.7M
  if (isAllocLikeFn(CS.getInstruction(), &TLI))
3848
155k
    return visitAllocSite(*CS.getInstruction());
3849
27.5M
3850
27.5M
  bool Changed = false;
3851
27.5M
3852
27.5M
  // Mark any parameters that are known to be non-null with the nonnull
3853
27.5M
  // attribute.  This is helpful for inlining calls to functions with null
3854
27.5M
  // checks on their arguments.
3855
27.5M
  SmallVector<unsigned, 4> ArgNos;
3856
27.5M
  unsigned ArgNo = 0;
3857
27.5M
3858
65.9M
  for (Value *V : CS.args()) {
3859
65.9M
    if (V->getType()->isPointerTy() &&
3860
47.9M
        !CS.paramHasAttr(ArgNo, Attribute::NonNull) &&
3861
34.1M
        isKnownNonZero(V, DL, 0, &AC, CS.getInstruction(), &DT))
3862
884k
      ArgNos.push_back(ArgNo);
3863
65.9M
    ArgNo++;
3864
65.9M
  }
3865
27.5M
3866
27.5M
  assert(ArgNo == CS.arg_size() && "sanity check");
3867
27.5M
3868
27.5M
  if (!ArgNos.empty()) {
3869
801k
    AttributeList AS = CS.getAttributes();
3870
801k
    LLVMContext &Ctx = CS.getInstruction()->getContext();
3871
801k
    AS = AS.addParamAttribute(Ctx, ArgNos,
3872
801k
                              Attribute::get(Ctx, Attribute::NonNull));
3873
801k
    CS.setAttributes(AS);
3874
801k
    Changed = true;
3875
801k
  }
3876
27.5M
3877
27.5M
  // If the callee is a pointer to a function, attempt to move any casts to the
3878
27.5M
  // arguments of the call/invoke.
3879
27.5M
  Value *Callee = CS.getCalledValue();
3880
27.5M
  if (!isa<Function>(Callee) && transformConstExprCastCall(CS))
3881
223
    return nullptr;
3882
27.5M
3883
27.5M
  if (Function *CalleeF = dyn_cast<Function>(Callee)) {
3884
26.4M
    // Remove the convergent attr on calls when the callee is not convergent.
3885
26.4M
    if (CS.isConvergent() && !CalleeF->isConvergent() &&
3886
26.4M
        !CalleeF->isIntrinsic()) {
3887
1
      DEBUG(dbgs() << "Removing convergent attr from instr "
3888
1
                   << CS.getInstruction() << "\n");
3889
1
      CS.setNotConvergent();
3890
1
      return CS.getInstruction();
3891
1
    }
3892
26.4M
3893
26.4M
    // If the call and callee calling conventions don't match, this call must
3894
26.4M
    // be unreachable, as the call is undefined.
3895
26.4M
    if (CalleeF->getCallingConv() != CS.getCallingConv() &&
3896
26.4M
        // Only do this for calls to a function with a body.  A prototype may
3897
26.4M
        // not actually end up matching the implementation's calling conv for a
3898
26.4M
        // variety of reasons (e.g. it may be written in assembly).
3899
26.4M
        !CalleeF->isDeclaration()) {
3900
0
      Instruction *OldCall = CS.getInstruction();
3901
0
      new StoreInst(ConstantInt::getTrue(Callee->getContext()),
3902
0
                UndefValue::get(Type::getInt1PtrTy(Callee->getContext())),
3903
0
                                  OldCall);
3904
0
      // If OldCall does not return void then replaceAllUsesWith undef.
3905
0
      // This allows ValueHandlers and custom metadata to adjust itself.
3906
0
      if (!OldCall->getType()->isVoidTy())
3907
0
        replaceInstUsesWith(*OldCall, UndefValue::get(OldCall->getType()));
3908
0
      if (isa<CallInst>(OldCall))
3909
0
        return eraseInstFromFunction(*OldCall);
3910
0
3911
0
      // We cannot remove an invoke, because it would change the CFG, just
3912
0
      // change the callee to a null pointer.
3913
0
      cast<InvokeInst>(OldCall)->setCalledFunction(
3914
0
                                    Constant::getNullValue(CalleeF->getType()));
3915
0
      return nullptr;
3916
0
    }
3917
26.4M
  }
3918
27.5M
3919
27.5M
  if (isa<ConstantPointerNull>(Callee) || isa<UndefValue>(Callee)) {
3920
4
    // If CS does not return void then replaceAllUsesWith undef.
3921
4
    // This allows ValueHandlers and custom metadata to adjust itself.
3922
4
    if (!CS.getInstruction()->getType()->isVoidTy())
3923
4
      replaceInstUsesWith(*CS.getInstruction(),
3924
4
                          UndefValue::get(CS.getInstruction()->getType()));
3925
4
3926
4
    if (isa<InvokeInst>(CS.getInstruction())) {
3927
4
      // Can't remove an invoke because we cannot change the CFG.
3928
4
      return nullptr;
3929
4
    }
3930
0
3931
0
    // This instruction is not reachable, just remove it.  We insert a store to
3932
0
    // undef so that we know that this code is not reachable, despite the fact
3933
0
    // that we can't modify the CFG here.
3934
0
    new StoreInst(ConstantInt::getTrue(Callee->getContext()),
3935
0
                  UndefValue::get(Type::getInt1PtrTy(Callee->getContext())),
3936
0
                  CS.getInstruction());
3937
0
3938
0
    return eraseInstFromFunction(*CS.getInstruction());
3939
0
  }
3940
27.5M
3941
27.5M
  if (IntrinsicInst *II = findInitTrampoline(Callee))
3942
7
    return transformCallThroughTrampoline(CS, II);
3943
27.5M
3944
27.5M
  PointerType *PTy = cast<PointerType>(Callee->getType());
3945
27.5M
  FunctionType *FTy = cast<FunctionType>(PTy->getElementType());
3946
27.5M
  if (FTy->isVarArg()) {
3947
664k
    int ix = FTy->getNumParams();
3948
664k
    // See if we can optimize any arguments passed through the varargs area of
3949
664k
    // the call.
3950
664k
    for (CallSite::arg_iterator I = CS.arg_begin() + FTy->getNumParams(),
3951
1.70M
           E = CS.arg_end(); I != E; ++I, ++ix) {
3952
1.04M
      CastInst *CI = dyn_cast<CastInst>(*I);
3953
1.04M
      if (CI && isSafeToEliminateVarargsCast(CS, DL, CI, ix)) {
3954
211
        *I = CI->getOperand(0);
3955
211
        Changed = true;
3956
211
      }
3957
1.04M
    }
3958
664k
  }
3959
27.5M
3960
27.5M
  if (isa<InlineAsm>(Callee) && !CS.doesNotThrow()) {
3961
22
    // Inline asm calls cannot throw - mark them 'nounwind'.
3962
22
    CS.setDoesNotThrow();
3963
22
    Changed = true;
3964
22
  }
3965
27.5M
3966
27.5M
  // Try to optimize the call if possible, we require DataLayout for most of
3967
27.5M
  // this.  None of these calls are seen as possibly dead so go ahead and
3968
27.5M
  // delete the instruction now.
3969
27.5M
  if (CallInst *CI = dyn_cast<CallInst>(CS.getInstruction())) {
3970
27.3M
    Instruction *I = tryOptimizeCall(CI);
3971
27.3M
    // If we changed something return the result, etc. Otherwise let
3972
27.3M
    // the fallthrough check.
3973
27.3M
    if (I) return eraseInstFromFunction(*I);
3974
27.5M
  }
3975
27.5M
3976
27.5M
  return Changed ? CS.getInstruction() : nullptr;
3977
27.7M
}
3978
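The argument scan at the top of visitCallSite collects the indices of pointer arguments that are provably non-null and then adds the nonnull attribute to all of them in a single attribute-list update. A toy version of the collection step, with the per-argument facts passed in directly (ArgFact and collectProvablyNonNullArgs are illustrative names):

#include <vector>

// One call argument, reduced to the three facts the scan above looks at.
struct ArgFact {
  bool IsPointer;
  bool AlreadyNonNull;   // argument already carries the nonnull attribute
  bool KnownNonZero;     // what isKnownNonZero would conclude for it
};

// Return the indices of arguments that should newly receive 'nonnull',
// in argument order, mirroring the ArgNos loop in visitCallSite above.
std::vector<unsigned>
collectProvablyNonNullArgs(const std::vector<ArgFact> &Args) {
  std::vector<unsigned> ArgNos;
  for (unsigned ArgNo = 0, E = (unsigned)Args.size(); ArgNo != E; ++ArgNo) {
    const ArgFact &A = Args[ArgNo];
    if (A.IsPointer && !A.AlreadyNonNull && A.KnownNonZero)
      ArgNos.push_back(ArgNo);
  }
  return ArgNos;
}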
3979
/// If the callee is a constexpr cast of a function, attempt to move the cast to
3980
/// the arguments of the call/invoke.
3981
1.07M
bool InstCombiner::transformConstExprCastCall(CallSite CS) {
3982
1.07M
  auto *Callee = dyn_cast<Function>(CS.getCalledValue()->stripPointerCasts());
3983
1.07M
  if (!Callee)
3984
981k
    return false;
3985
95.1k
3986
95.1k
  // The prototype of a thunk is a lie. Don't directly call such a function.
3987
95.1k
  if (Callee->hasFnAttribute("thunk"))
3988
1
    return false;
3989
95.1k
3990
95.1k
  Instruction *Caller = CS.getInstruction();
3991
95.1k
  const AttributeList &CallerPAL = CS.getAttributes();
3992
95.1k
3993
95.1k
  // Okay, this is a cast from a function to a different type.  Unless doing so
3994
95.1k
  // would cause a type conversion of one of our arguments, change this call to
3995
95.1k
  // be a direct call with arguments casted to the appropriate types.
3996
95.1k
  //
3997
95.1k
  FunctionType *FT = Callee->getFunctionType();
3998
95.1k
  Type *OldRetTy = Caller->getType();
3999
95.1k
  Type *NewRetTy = FT->getReturnType();
4000
95.1k
4001
95.1k
  // Check to see if we are changing the return type...
4002
95.1k
  if (OldRetTy != NewRetTy) {
4003
3.80k
4004
3.80k
    if (NewRetTy->isStructTy())
4005
0
      return false; // TODO: Handle multiple return values.
4006
3.80k
4007
3.80k
    if (!CastInst::isBitOrNoopPointerCastable(NewRetTy, OldRetTy, DL)) {
4008
3.17k
      if (Callee->isDeclaration())
4009
3.15k
        return false;   // Cannot transform this return value.
4010
15
4011
15
      if (!Caller->use_empty() &&
4012
15
          // void -> non-void is handled specially
4013
12
          !NewRetTy->isVoidTy())
4014
11
        return false;   // Cannot transform this return value.
4015
639
    }
4016
639
4017
639
    if (!CallerPAL.isEmpty() && !Caller->use_empty()) {
4018
364
      AttrBuilder RAttrs(CallerPAL, AttributeList::ReturnIndex);
4019
364
      if (RAttrs.overlaps(AttributeFuncs::typeIncompatible(NewRetTy)))
4020
0
        return false;   // Attribute not compatible with transformed value.
4021
639
    }
4022
639
4023
639
    // If the callsite is an invoke instruction, and the return value is used by
4024
639
    // a PHI node in a successor, we cannot change the return type of the call
4025
639
    // because there is no place to put the cast instruction (without breaking
4026
639
    // the critical edge).  Bail out in this case.
4027
639
    if (!Caller->use_empty())
4028
636
      if (InvokeInst *II = dyn_cast<InvokeInst>(Caller))
4029
49
        for (User *U : II->users())
4030
89
          if (PHINode *PN = dyn_cast<PHINode>(U))
4031
40
            if (PN->getParent() == II->getNormalDest() ||
4032
40
                PN->getParent() == II->getUnwindDest())
4033
0
              return false;
4034
91.9k
  }
4035
91.9k
4036
91.9k
  unsigned NumActualArgs = CS.arg_size();
4037
91.9k
  unsigned NumCommonArgs = std::min(FT->getNumParams(), NumActualArgs);
4038
91.9k
4039
91.9k
  // Prevent us turning:
4040
91.9k
  // declare void @takes_i32_inalloca(i32* inalloca)
4041
91.9k
  //  call void bitcast (void (i32*)* @takes_i32_inalloca to void (i32)*)(i32 0)
4042
91.9k
  //
4043
91.9k
  // into:
4044
91.9k
  //  call void @takes_i32_inalloca(i32* null)
4045
91.9k
  //
4046
91.9k
  //  Similarly, avoid folding away bitcasts of byval calls.
4047
91.9k
  if (Callee->getAttributes().hasAttrSomewhere(Attribute::InAlloca) ||
4048
91.9k
      Callee->getAttributes().hasAttrSomewhere(Attribute::ByVal))
4049
2
    return false;
4050
91.9k
4051
91.9k
  CallSite::arg_iterator AI = CS.arg_begin();
4052
98.8k
  for (unsigned i = 0, e = NumCommonArgs; i != e; ++i, ++AI) {
4053
7.20k
    Type *ParamTy = FT->getParamType(i);
4054
7.20k
    Type *ActTy = (*AI)->getType();
4055
7.20k
4056
7.20k
    if (!CastInst::isBitOrNoopPointerCastable(ActTy, ParamTy, DL))
4057
325
      return false;   // Cannot transform this parameter value.
4058
6.88k
4059
6.88k
    if (AttrBuilder(CallerPAL.getParamAttributes(i))
4060
6.88k
            .overlaps(AttributeFuncs::typeIncompatible(ParamTy)))
4061
3
      return false;   // Attribute not compatible with transformed value.
4062
6.88k
4063
6.88k
    if (CS.isInAllocaArgument(i))
4064
0
      return false;   // Cannot transform to and from inalloca.
4065
6.88k
4066
6.88k
    // If the parameter is passed as a byval argument, then we have to have a
4067
6.88k
    // sized type and the sized type has to have the same size as the old type.
4068
6.88k
    if (ParamTy != ActTy && CallerPAL.hasParamAttribute(i, Attribute::ByVal)) {
4069
2
      PointerType *ParamPTy = dyn_cast<PointerType>(ParamTy);
4070
2
      if (!ParamPTy || !ParamPTy->getElementType()->isSized())
4071
2
        return false;
4072
0
4073
0
      Type *CurElTy = ActTy->getPointerElementType();
4074
0
      if (DL.getTypeAllocSize(CurElTy) !=
4075
0
          DL.getTypeAllocSize(ParamPTy->getElementType()))
4076
0
        return false;
4077
2
    }
4078
7.20k
  }
4079
91.9k
4080
91.6k
  if (Callee->isDeclaration()) {
4081
91.4k
    // Do not delete arguments unless we have a function body.
4082
91.4k
    if (FT->getNumParams() < NumActualArgs && !FT->isVarArg())
4083
8
      return false;
4084
91.4k
4085
91.4k
    // If the callee is just a declaration, don't change the varargsness of the
4086
91.4k
    // call.  We don't want to introduce a varargs call where one doesn't
4087
91.4k
    // already exist.
4088
91.4k
    PointerType *APTy = cast<PointerType>(CS.getCalledValue()->getType());
4089
91.4k
    if (FT->isVarArg()!=cast<FunctionType>(APTy->getElementType())->isVarArg())
4090
91.2k
      return false;
4091
107
4092
107
    // If both the callee and the cast type are varargs, we still have to make
4093
107
    // sure the number of fixed parameters are the same or we have the same
4094
107
    // ABI issues as if we introduce a varargs call.
4095
107
    if (FT->isVarArg() &&
4096
89
        cast<FunctionType>(APTy->getElementType())->isVarArg() &&
4097
89
        FT->getNumParams() !=
4098
89
        cast<FunctionType>(APTy->getElementType())->getNumParams())
4099
89
      return false;
4100
226
  }
4101
226
4102
226
  if (FT->getNumParams() < NumActualArgs && FT->isVarArg() &&
4103
226
      !CallerPAL.isEmpty()) {
4104
5
    // In this case we have more arguments than the new function type, but we
4105
5
    // won't be dropping them.  Check that these extra arguments have attributes
4106
5
    // that are compatible with being a vararg call argument.
4107
5
    unsigned SRetIdx;
4108
5
    if (CallerPAL.hasAttrSomewhere(Attribute::StructRet, &SRetIdx) &&
4109
4
        SRetIdx > FT->getNumParams())
4110
4
      return false;
4111
222
  }
4112
222
4113
222
  // Okay, we decided that this is a safe thing to do: go ahead and start
4114
222
  // inserting cast instructions as necessary.
4115
222
  SmallVector<Value *, 8> Args;
4116
222
  SmallVector<AttributeSet, 8> ArgAttrs;
4117
222
  Args.reserve(NumActualArgs);
4118
222
  ArgAttrs.reserve(NumActualArgs);
4119
222
4120
222
  // Get any return attributes.
4121
222
  AttrBuilder RAttrs(CallerPAL, AttributeList::ReturnIndex);
4122
222
4123
222
  // If the return value is not being used, the type may not be compatible
4124
222
  // with the existing attributes.  Wipe out any problematic attributes.
4125
222
  RAttrs.remove(AttributeFuncs::typeIncompatible(NewRetTy));
4126
222
4127
222
  AI = CS.arg_begin();
4128
483
  for (unsigned i = 0; 
i != NumCommonArgs483
;
++i, ++AI261
) {
4129
261
    Type *ParamTy = FT->getParamType(i);
4130
261
4131
261
    Value *NewArg = *AI;
4132
261
    if ((*AI)->getType() != ParamTy)
4133
223
      NewArg = Builder.CreateBitOrPointerCast(*AI, ParamTy);
4134
261
    Args.push_back(NewArg);
4135
261
4136
261
    // Add any parameter attributes.
4137
261
    ArgAttrs.push_back(CallerPAL.getParamAttributes(i));
4138
261
  }
4139
222
4140
222
  // If the function takes more arguments than the call was taking, add them
4141
222
  // now.
4142
231
  for (unsigned i = NumCommonArgs; i != FT->getNumParams(); ++i) {
4143
9
    Args.push_back(Constant::getNullValue(FT->getParamType(i)));
4144
9
    ArgAttrs.push_back(AttributeSet());
4145
9
  }
4146
222
4147
222
  // If we are removing arguments to the function, emit an obnoxious warning.
4148
222
  if (FT->getNumParams() < NumActualArgs) {
4149
8
    // TODO: if (!FT->isVarArg()) this call may be unreachable. PR14722
4150
8
    if (FT->isVarArg()) {
4151
4
      // Add all of the arguments in their promoted form to the arg list.
4152
8
      for (unsigned i = FT->getNumParams(); i != NumActualArgs; ++i, ++AI) {
4153
4
        Type *PTy = getPromotedType((*AI)->getType());
4154
4
        Value *NewArg = *AI;
4155
4
        if (PTy != (*AI)->getType()) {
4156
1
          // Must promote to pass through va_arg area!
4157
1
          Instruction::CastOps opcode =
4158
1
            CastInst::getCastOpcode(*AI, false, PTy, false);
4159
1
          NewArg = Builder.CreateCast(opcode, *AI, PTy);
4160
1
        }
4161
4
        Args.push_back(NewArg);
4162
4
4163
4
        // Add any parameter attributes.
4164
4
        ArgAttrs.push_back(CallerPAL.getParamAttributes(i));
4165
4
      }
4166
4
    }
4167
8
  }
4168
222
4169
222
  AttributeSet FnAttrs = CallerPAL.getFnAttributes();
4170
222
4171
222
  if (NewRetTy->isVoidTy())
4172
86
    Caller->setName("");   // Void type should not have a name.
4173
222
4174
222
  assert((ArgAttrs.size() == FT->getNumParams() || FT->isVarArg()) &&
4175
222
         "missing argument attributes");
4176
222
  LLVMContext &Ctx = Callee->getContext();
4177
222
  AttributeList NewCallerPAL = AttributeList::get(
4178
222
      Ctx, FnAttrs, AttributeSet::get(Ctx, RAttrs), ArgAttrs);
4179
222
4180
222
  SmallVector<OperandBundleDef, 1> OpBundles;
4181
222
  CS.getOperandBundlesAsDefs(OpBundles);
4182
222
4183
222
  CallSite NewCS;
4184
222
  if (InvokeInst *II = dyn_cast<InvokeInst>(Caller)) {
4185
43
    NewCS = Builder.CreateInvoke(Callee, II->getNormalDest(),
4186
43
                                 II->getUnwindDest(), Args, OpBundles);
4187
222
  } else {
4188
179
    NewCS = Builder.CreateCall(Callee, Args, OpBundles);
4189
179
    cast<CallInst>(NewCS.getInstruction())
4190
179
        ->setTailCallKind(cast<CallInst>(Caller)->getTailCallKind());
4191
179
  }
4192
222
  NewCS->takeName(Caller);
4193
222
  NewCS.setCallingConv(CS.getCallingConv());
4194
222
  NewCS.setAttributes(NewCallerPAL);
4195
222
4196
222
  // Preserve the weight metadata for the new call instruction. The metadata
4197
222
  // is used by SamplePGO to check callsite's hotness.
4198
222
  uint64_t W;
4199
222
  if (Caller->extractProfTotalWeight(W))
4200
2
    NewCS->setProfWeight(W);
4201
222
4202
222
  // Insert a cast of the return type as necessary.
4203
222
  Instruction *NC = NewCS.getInstruction();
4204
222
  Value *NV = NC;
4205
222
  if (OldRetTy != NV->getType() && !Caller->use_empty()) {
4206
26
    if (!NV->getType()->isVoidTy()) {
4207
26
      NV = NC = CastInst::CreateBitOrPointerCast(NC, OldRetTy);
4208
26
      NC->setDebugLoc(Caller->getDebugLoc());
4209
26
4210
26
      // If this is an invoke instruction, we should insert it after the first
4211
26
      // non-phi instruction in the normal successor block.
4212
26
      if (InvokeInst *II = dyn_cast<InvokeInst>(Caller)) {
4213
1
        BasicBlock::iterator I = II->getNormalDest()->getFirstInsertionPt();
4214
1
        InsertNewInstBefore(NC, *I);
4215
26
      } else {
4216
25
        // Otherwise, it's a call, just insert cast right after the call.
4217
25
        InsertNewInstBefore(NC, *Caller);
4218
25
      }
4219
26
      Worklist.AddUsersToWorkList(*Caller);
4220
0
    } else {
4221
0
      NV = UndefValue::get(Caller->getType());
4222
0
    }
4223
26
  }
4224
222
4225
222
  if (!Caller->use_empty())
4226
78
    replaceInstUsesWith(*Caller, NV);
4227
144
  else if (Caller->hasValueHandle()) {
4228
92
    if (OldRetTy == NV->getType())
4229
92
      ValueHandleBase::ValueIsRAUWd(Caller, NV);
4230
92
    else
4231
92
      // We cannot call ValueIsRAUWd with a different type, and the
4232
92
      // actual tracked value will disappear.
4233
0
      ValueHandleBase::ValueIsDeleted(Caller);
4234
144
  }
4235
1.07M
4236
1.07M
  eraseInstFromFunction(*Caller);
4237
1.07M
  return true;
4238
1.07M
}
4239
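transformConstExprCastCall is essentially a legality check followed by rebuilding the call: every value whose type changes must be convertible by a bit or noop cast, with extra restrictions for byval/inalloca, sret and varargs. The sketch below models only the per-type legality part and assumes, purely for illustration, that "castable" means "same store size in bits"; the real test is CastInst::isBitOrNoopPointerCastable, and directCallLooksLegal is an invented name.

#include <algorithm>
#include <cstdint>
#include <vector>

// Toy castability test: in this sketch a value is bit- or noop-castable
// exactly when the two types have the same store size in bits.
static bool toyCastable(uint64_t FromBits, uint64_t ToBits) {
  return FromBits == ToBits;
}

// Decide whether a call through a mismatched function pointer could be
// rewritten as a direct call: the return type and every argument shared
// with the callee's prototype must be convertible.  The real transform
// layers further byval/inalloca, sret and varargs restrictions on top.
bool directCallLooksLegal(uint64_t CallRetBits, uint64_t CalleeRetBits,
                          const std::vector<uint64_t> &ArgBits,
                          const std::vector<uint64_t> &ParamBits) {
  if (!toyCastable(CalleeRetBits, CallRetBits))
    return false;
  size_t Common = std::min(ArgBits.size(), ParamBits.size());
  for (size_t I = 0; I != Common; ++I)
    if (!toyCastable(ArgBits[I], ParamBits[I]))
      return false;
  return true;
}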
4240
/// Turn a call to a function created by init_trampoline / adjust_trampoline
4241
/// intrinsic pair into a direct call to the underlying function.
4242
Instruction *
4243
InstCombiner::transformCallThroughTrampoline(CallSite CS,
4244
7
                                             IntrinsicInst *Tramp) {
4245
7
  Value *Callee = CS.getCalledValue();
4246
7
  PointerType *PTy = cast<PointerType>(Callee->getType());
4247
7
  FunctionType *FTy = cast<FunctionType>(PTy->getElementType());
4248
7
  AttributeList Attrs = CS.getAttributes();
4249
7
4250
7
  // If the call already has the 'nest' attribute somewhere then give up -
4251
7
  // otherwise 'nest' would occur twice after splicing in the chain.
4252
7
  if (Attrs.hasAttrSomewhere(Attribute::Nest))
4253
0
    return nullptr;
4254
7
4255
7
  assert(Tramp &&
4256
7
         "transformCallThroughTrampoline called with incorrect CallSite.");
4257
7
4258
7
  Function *NestF =cast<Function>(Tramp->getArgOperand(1)->stripPointerCasts());
4259
7
  FunctionType *NestFTy = cast<FunctionType>(NestF->getValueType());
4260
7
4261
7
  AttributeList NestAttrs = NestF->getAttributes();
4262
7
  if (!NestAttrs.isEmpty()) {
4263
7
    unsigned NestArgNo = 0;
4264
7
    Type *NestTy = nullptr;
4265
7
    AttributeSet NestAttr;
4266
7
4267
7
    // Look for a parameter marked with the 'nest' attribute.
4268
7
    for (FunctionType::param_iterator I = NestFTy->param_begin(),
4269
7
                                      E = NestFTy->param_end();
4270
7
         I != E; ++NestArgNo, ++I) {
4271
7
      AttributeSet AS = NestAttrs.getParamAttributes(NestArgNo);
4272
7
      if (AS.hasAttribute(Attribute::Nest)) {
4273
7
        // Record the parameter type and any other attributes.
4274
7
        NestTy = *I;
4275
7
        NestAttr = AS;
4276
7
        break;
4277
7
      }
4278
7
    }
4279
7
4280
7
    if (NestTy) {
4281
7
      Instruction *Caller = CS.getInstruction();
4282
7
      std::vector<Value*> NewArgs;
4283
7
      std::vector<AttributeSet> NewArgAttrs;
4284
7
      NewArgs.reserve(CS.arg_size() + 1);
4285
7
      NewArgAttrs.reserve(CS.arg_size());
4286
7
4287
7
      // Insert the nest argument into the call argument list, which may
4288
7
      // mean appending it.  Likewise for attributes.
4289
7
4290
7
      {
4291
7
        unsigned ArgNo = 0;
4292
7
        CallSite::arg_iterator I = CS.arg_begin(), E = CS.arg_end();
4293
14
        do {
4294
14
          if (ArgNo == NestArgNo) {
4295
7
            // Add the chain argument and attributes.
4296
7
            Value *NestVal = Tramp->getArgOperand(2);
4297
7
            if (NestVal->getType() != NestTy)
4298
1
              NestVal = Builder.CreateBitCast(NestVal, NestTy, "nest");
4299
7
            NewArgs.push_back(NestVal);
4300
7
            NewArgAttrs.push_back(NestAttr);
4301
7
          }
4302
14
4303
14
          if (I == E)
4304
7
            break;
4305
7
4306
7
          // Add the original argument and attributes.
4307
7
          NewArgs.push_back(*I);
4308
7
          NewArgAttrs.push_back(Attrs.getParamAttributes(ArgNo));
4309
7
4310
7
          ++ArgNo;
4311
7
          ++I;
4312
7
        } while (true);
4313
7
      }
4314
7
4315
7
      // The trampoline may have been bitcast to a bogus type (FTy).
4316
7
      // Handle this by synthesizing a new function type, equal to FTy
4317
7
      // with the chain parameter inserted.
4318
7
4319
7
      std::vector<Type*> NewTypes;
4320
7
      NewTypes.reserve(FTy->getNumParams()+1);
4321
7
4322
7
      // Insert the chain's type into the list of parameter types, which may
4323
7
      // mean appending it.
4324
7
      {
4325
7
        unsigned ArgNo = 0;
4326
7
        FunctionType::param_iterator I = FTy->param_begin(),
4327
7
          E = FTy->param_end();
4328
7
4329
13
        do {
4330
13
          if (ArgNo == NestArgNo)
4331
13
            // Add the chain's type.
4332
7
            NewTypes.push_back(NestTy);
4333
13
4334
13
          if (I == E)
4335
7
            break;
4336
6
4337
6
          // Add the original type.
4338
6
          NewTypes.push_back(*I);
4339
6
4340
6
          ++ArgNo;
4341
6
          ++I;
4342
7
        } while (true);
4343
7
      }
4344
7
4345
7
      // Replace the trampoline call with a direct call.  Let the generic
4346
7
      // code sort out any function type mismatches.
4347
7
      FunctionType *NewFTy = FunctionType::get(FTy->getReturnType(), NewTypes,
4348
7
                                                FTy->isVarArg());
4349
7
      Constant *NewCallee =
4350
7
        NestF->getType() == PointerType::getUnqual(NewFTy) ?
4351
7
        NestF : ConstantExpr::getBitCast(NestF,
4352
0
                                         PointerType::getUnqual(NewFTy));
4353
7
      AttributeList NewPAL =
4354
7
          AttributeList::get(FTy->getContext(), Attrs.getFnAttributes(),
4355
7
                             Attrs.getRetAttributes(), NewArgAttrs);
4356
7
4357
7
      SmallVector<OperandBundleDef, 1> OpBundles;
4358
7
      CS.getOperandBundlesAsDefs(OpBundles);
4359
7
4360
7
      Instruction *NewCaller;
4361
7
      if (InvokeInst *II = dyn_cast<InvokeInst>(Caller)) {
4362
0
        NewCaller = InvokeInst::Create(NewCallee,
4363
0
                                       II->getNormalDest(), II->getUnwindDest(),
4364
0
                                       NewArgs, OpBundles);
4365
0
        cast<InvokeInst>(NewCaller)->setCallingConv(II->getCallingConv());
4366
0
        cast<InvokeInst>(NewCaller)->setAttributes(NewPAL);
4367
7
      } else {
4368
7
        NewCaller = CallInst::Create(NewCallee, NewArgs, OpBundles);
4369
7
        cast<CallInst>(NewCaller)->setTailCallKind(
4370
7
            cast<CallInst>(Caller)->getTailCallKind());
4371
7
        cast<CallInst>(NewCaller)->setCallingConv(
4372
7
            cast<CallInst>(Caller)->getCallingConv());
4373
7
        cast<CallInst>(NewCaller)->setAttributes(NewPAL);
4374
7
      }
4375
7
4376
7
      return NewCaller;
4377
7
    }
4378
0
  }
4379
0
4380
0
  // Replace the trampoline call with a direct call.  Since there is no 'nest'
4381
0
  // parameter, there is no need to adjust the argument list.  Let the generic
4382
0
  // code sort out any function type mismatches.
4383
0
  Constant *NewCallee =
4384
0
    NestF->getType() == PTy ? NestF :
4385
0
                              ConstantExpr::getBitCast(NestF, PTy);
4386
7
  CS.setCalledFunction(NewCallee);
4387
7
  return CS.getInstruction();
4388
7
}
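The argument-splicing loop in transformCallThroughTrampoline inserts the chain (nest) value at the position of the callee's nest parameter, appending it when that position is past the end of the original argument list; the parameter-type list is rebuilt the same way. A standalone sketch of just that splice, on plain strings instead of IR values (spliceNestArgument is an illustrative name):

#include <string>
#include <vector>

// Insert the nest (static chain) argument at position NestArgNo, mirroring
// the do/while splice in transformCallThroughTrampoline above.  If NestArgNo
// equals the original argument count, the chain is simply appended.
std::vector<std::string> spliceNestArgument(const std::vector<std::string> &Args,
                                            const std::string &NestVal,
                                            size_t NestArgNo) {
  std::vector<std::string> NewArgs;
  NewArgs.reserve(Args.size() + 1);
  for (size_t I = 0; I <= Args.size(); ++I) {
    if (I == NestArgNo)
      NewArgs.push_back(NestVal);      // the chain goes exactly here
    if (I == Args.size())
      break;                           // original arguments exhausted
    NewArgs.push_back(Args[I]);        // keep the original argument
  }
  return NewArgs;
}

With Args = {"x", "y"} and NestArgNo = 0 the result is {"chain", "x", "y"}; with NestArgNo = 2 the chain is appended, matching the two cases the loop above has to handle.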