Coverage Report

Created: 2019-07-24 05:18

/Users/buildslave/jenkins/workspace/clang-stage2-coverage-R/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp

 Line |  Count | Source
    1|       |//===-- NVPTXISelLowering.cpp - NVPTX DAG Lowering Implementation ---------===//
    2|       |//
    3|       |// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
    4|       |// See https://llvm.org/LICENSE.txt for license information.
    5|       |// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
    6|       |//
    7|       |//===----------------------------------------------------------------------===//
    8|       |//
    9|       |// This file defines the interfaces that NVPTX uses to lower LLVM code into a
   10|       |// selection DAG.
   11|       |//
   12|       |//===----------------------------------------------------------------------===//
   13|       |
   14|       |#include "NVPTXISelLowering.h"
   15|       |#include "MCTargetDesc/NVPTXBaseInfo.h"
   16|       |#include "NVPTX.h"
   17|       |#include "NVPTXSubtarget.h"
   18|       |#include "NVPTXTargetMachine.h"
   19|       |#include "NVPTXTargetObjectFile.h"
   20|       |#include "NVPTXUtilities.h"
   21|       |#include "llvm/ADT/APInt.h"
   22|       |#include "llvm/ADT/SmallVector.h"
   23|       |#include "llvm/ADT/StringRef.h"
   24|       |#include "llvm/CodeGen/Analysis.h"
   25|       |#include "llvm/CodeGen/MachineFunction.h"
   26|       |#include "llvm/CodeGen/MachineMemOperand.h"
   27|       |#include "llvm/CodeGen/SelectionDAG.h"
   28|       |#include "llvm/CodeGen/SelectionDAGNodes.h"
   29|       |#include "llvm/CodeGen/TargetCallingConv.h"
   30|       |#include "llvm/CodeGen/TargetLowering.h"
   31|       |#include "llvm/CodeGen/ValueTypes.h"
   32|       |#include "llvm/IR/Argument.h"
   33|       |#include "llvm/IR/Attributes.h"
   34|       |#include "llvm/IR/CallSite.h"
   35|       |#include "llvm/IR/Constants.h"
   36|       |#include "llvm/IR/DataLayout.h"
   37|       |#include "llvm/IR/DerivedTypes.h"
   38|       |#include "llvm/IR/Function.h"
   39|       |#include "llvm/IR/GlobalValue.h"
   40|       |#include "llvm/IR/Instruction.h"
   41|       |#include "llvm/IR/Instructions.h"
   42|       |#include "llvm/IR/Module.h"
   43|       |#include "llvm/IR/Type.h"
   44|       |#include "llvm/IR/Value.h"
   45|       |#include "llvm/Support/Casting.h"
   46|       |#include "llvm/Support/CodeGen.h"
   47|       |#include "llvm/Support/CommandLine.h"
   48|       |#include "llvm/Support/ErrorHandling.h"
   49|       |#include "llvm/Support/MachineValueType.h"
   50|       |#include "llvm/Support/MathExtras.h"
   51|       |#include "llvm/Support/raw_ostream.h"
   52|       |#include "llvm/Target/TargetMachine.h"
   53|       |#include "llvm/Target/TargetOptions.h"
   54|       |#include <algorithm>
   55|       |#include <cassert>
   56|       |#include <cstdint>
   57|       |#include <iterator>
   58|       |#include <sstream>
   59|       |#include <string>
   60|       |#include <utility>
   61|       |#include <vector>
   62|       |
   63|       |#define DEBUG_TYPE "nvptx-lower"
   64|       |
   65|       |using namespace llvm;
   66|       |
   67|       |static unsigned int uniqueCallSite = 0;
   68|       |
   69|       |static cl::opt<bool> sched4reg(
   70|       |    "nvptx-sched4reg",
   71|       |    cl::desc("NVPTX Specific: schedule for register pressure"), cl::init(false));
   72|       |
   73|       |static cl::opt<unsigned>
   74|       |FMAContractLevelOpt("nvptx-fma-level", cl::ZeroOrMore, cl::Hidden,
   75|       |                    cl::desc("NVPTX Specific: FMA contraction (0: don't do it,"
   76|       |                             " 1: do it, 2: do it aggressively)"),
   77|       |                    cl::init(2));
   78|       |
   79|       |static cl::opt<int> UsePrecDivF32(
   80|       |    "nvptx-prec-divf32", cl::ZeroOrMore, cl::Hidden,
   81|       |    cl::desc("NVPTX Specific: 0 use div.approx, 1 use div.full, 2 use"
   82|       |             " IEEE Compliant F32 div.rnd if available."),
   83|       |    cl::init(2));
   84|       |
   85|       |static cl::opt<bool> UsePrecSqrtF32(
   86|       |    "nvptx-prec-sqrtf32", cl::Hidden,
   87|       |    cl::desc("NVPTX Specific: 0 use sqrt.approx, 1 use sqrt.rn."),
   88|       |    cl::init(true));
   89|       |
   90|       |static cl::opt<bool> FtzEnabled(
   91|       |    "nvptx-f32ftz", cl::ZeroOrMore, cl::Hidden,
   92|       |    cl::desc("NVPTX Specific: Flush f32 subnormals to sign-preserving zero."),
   93|       |    cl::init(false));
   94|       |
   95|     45|int NVPTXTargetLowering::getDivF32Level() const {
   96|     45|  if (UsePrecDivF32.getNumOccurrences() > 0) {
   97|      1|    // If nvptx-prec-divf32=N is used on the command-line, always honor it
   98|      1|    return UsePrecDivF32;
   99|     44|  } else {
  100|     44|    // Otherwise, use div.approx if fast math is enabled
  101|     44|    if (getTargetMachine().Options.UnsafeFPMath)
  102|      4|      return 0;
  103|     40|    else
  104|     40|      return 2;
  105|     44|  }
  106|     45|}
  107|       |
  108|     15|bool NVPTXTargetLowering::usePrecSqrtF32() const {
  109|     15|  if (UsePrecSqrtF32.getNumOccurrences() > 0) {
  110|      8|    // If nvptx-prec-sqrtf32 is used on the command-line, always honor it
  111|      8|    return UsePrecSqrtF32;
  112|      8|  } else {
  113|      7|    // Otherwise, use sqrt.approx if fast math is enabled
  114|      7|    return !getTargetMachine().Options.UnsafeFPMath;
  115|      7|  }
  116|     15|}
  117|       |
  118|  1.20k|bool NVPTXTargetLowering::useF32FTZ(const MachineFunction &MF) const {
  119|  1.20k|  // TODO: Get rid of this flag; there can be only one way to do this.
  120|  1.20k|  if (FtzEnabled.getNumOccurrences() > 0) {
  121|     58|    // If nvptx-f32ftz is used on the command-line, always honor it
  122|     58|    return FtzEnabled;
  123|  1.14k|  } else {
  124|  1.14k|    const Function &F = MF.getFunction();
  125|  1.14k|    // Otherwise, check for an nvptx-f32ftz attribute on the function
  126|  1.14k|    if (F.hasFnAttribute("nvptx-f32ftz"))
  127|     49|      return F.getFnAttribute("nvptx-f32ftz").getValueAsString() == "true";
  128|  1.09k|    else
  129|  1.09k|      return false;
  130|  1.14k|  }
  131|  1.20k|}
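
The three accessors above share one policy: an explicit command-line setting always wins, detected via cl::opt::getNumOccurrences() (a real llvm/Support/CommandLine.h API), and only in its absence do they fall back to UnsafeFPMath or a function attribute. A minimal standalone sketch of that pattern; the option name "example-knob" and the fallback() helper are hypothetical stand-ins:

    #include "llvm/Support/CommandLine.h"

    static llvm::cl::opt<bool> Knob("example-knob", llvm::cl::init(false));

    // Stand-in for the UnsafeFPMath / function-attribute fallbacks above.
    static bool fallback() { return true; }

    bool resolveKnob() {
      // An explicit occurrence on the command line is honored even when the
      // value passed happens to equal the default; absence defers to policy.
      if (Knob.getNumOccurrences() > 0)
        return Knob;
      return fallback();
    }
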
  132|       |
  133|  50.5k|static bool IsPTXVectorType(MVT VT) {
  134|  50.5k|  switch (VT.SimpleTy) {
  135|  50.5k|  default:
  136|  43.6k|    return false;
  137|  50.5k|  case MVT::v2i1:
  138|  6.82k|  case MVT::v4i1:
  139|  6.82k|  case MVT::v2i8:
  140|  6.82k|  case MVT::v4i8:
  141|  6.82k|  case MVT::v2i16:
  142|  6.82k|  case MVT::v4i16:
  143|  6.82k|  case MVT::v2i32:
  144|  6.82k|  case MVT::v4i32:
  145|  6.82k|  case MVT::v2i64:
  146|  6.82k|  case MVT::v2f16:
  147|  6.82k|  case MVT::v4f16:
  148|  6.82k|  case MVT::v8f16: // <4 x f16x2>
  149|  6.82k|  case MVT::v2f32:
  150|  6.82k|  case MVT::v4f32:
  151|  6.82k|  case MVT::v2f64:
  152|  6.82k|    return true;
  153|  50.5k|  }
  154|  50.5k|}
  155|       |
  156|       |/// ComputePTXValueVTs - For the given Type \p Ty, returns the set of primitive
  157|       |/// EVTs that compose it.  Unlike ComputeValueVTs, this will break apart vectors
  158|       |/// into their primitive components.
  159|       |/// NOTE: This is a band-aid for code that expects ComputeValueVTs to return the
  160|       |/// same number of types as the Ins/Outs arrays in LowerFormalArguments,
  161|       |/// LowerCall, and LowerReturn.
  162|       |static void ComputePTXValueVTs(const TargetLowering &TLI, const DataLayout &DL,
  163|       |                               Type *Ty, SmallVectorImpl<EVT> &ValueVTs,
  164|       |                               SmallVectorImpl<uint64_t> *Offsets = nullptr,
  165|  4.78k|                               uint64_t StartingOffset = 0) {
  166|  4.78k|  SmallVector<EVT, 16> TempVTs;
  167|  4.78k|  SmallVector<uint64_t, 16> TempOffsets;
  168|  4.78k|
  169|  4.78k|  // Special case for i128 - decompose to (i64, i64)
  170|  4.78k|  if (Ty->isIntegerTy(128)) {
  171|     35|    ValueVTs.push_back(EVT(MVT::i64));
  172|     35|    ValueVTs.push_back(EVT(MVT::i64));
  173|     35|
  174|     35|    if (Offsets) {
  175|     33|      Offsets->push_back(StartingOffset + 0);
  176|     33|      Offsets->push_back(StartingOffset + 8);
  177|     33|    }
  178|     35|
  179|     35|    return;
  180|     35|  }
  181|  4.75k|
  182|  4.75k|  // Given a struct type, recursively traverse the elements with custom ComputePTXValueVTs.
  183|  4.75k|  if (StructType *STy = dyn_cast<StructType>(Ty)) {
  184|     59|    auto const *SL = DL.getStructLayout(STy);
  185|     59|    auto ElementNum = 0;
  186|    157|    for(auto *EI : STy->elements()) {
  187|    157|      ComputePTXValueVTs(TLI, DL, EI, ValueVTs, Offsets,
  188|    157|                         StartingOffset + SL->getElementOffset(ElementNum));
  189|    157|      ++ElementNum;
  190|    157|    }
  191|     59|    return;
  192|     59|  }
  193|  4.69k|
  194|  4.69k|  ComputeValueVTs(TLI, DL, Ty, TempVTs, &TempOffsets, StartingOffset);
  195|  9.06k|  for (unsigned i = 0, e = TempVTs.size(); i != e; ++i) {
  196|  4.36k|    EVT VT = TempVTs[i];
  197|  4.36k|    uint64_t Off = TempOffsets[i];
  198|  4.36k|    // Split vectors into individual elements, except for v2f16, which
  199|  4.36k|    // we will pass as a single scalar.
  200|  4.36k|    if (VT.isVector()) {
  201|    719|      unsigned NumElts = VT.getVectorNumElements();
  202|    719|      EVT EltVT = VT.getVectorElementType();
  203|    719|      // Vectors with an even number of f16 elements will be passed to
  204|    719|      // us as an array of v2f16 elements. We must match this so we
  205|    719|      // stay in sync with Ins/Outs.
  206|    719|      if (EltVT == MVT::f16 && NumElts % 2 == 0) {
  207|    463|        EltVT = MVT::v2f16;
  208|    463|        NumElts /= 2;
  209|    463|      }
  210|  2.00k|      for (unsigned j = 0; j != NumElts; ++j) {
  211|  1.28k|        ValueVTs.push_back(EltVT);
  212|  1.28k|        if (Offsets)
  213|  1.28k|          Offsets->push_back(Off + j * EltVT.getStoreSize());
  214|  1.28k|      }
  215|  3.64k|    } else {
  216|  3.64k|      ValueVTs.push_back(VT);
  217|  3.64k|      if (Offsets)
  218|  3.63k|        Offsets->push_back(Off);
  219|  3.64k|    }
  220|  4.36k|  }
  221|  4.69k|}
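
The flattening above is easiest to see on concrete types. A toy, self-contained sketch (plain structs rather than the EVT/DataLayout machinery; not LLVM API) reproducing the two special cases, i128 -> (i64 @ 0, i64 @ 8) and an even-length f16 vector collapsing into v2f16 pieces:

    #include <cstdint>
    #include <cstdio>
    #include <vector>

    struct Piece { const char *VT; uint64_t Off; };

    // Flatten "<4 x half>" starting at byte offset Base: the element type is
    // rewritten to v2f16 and the element count halved, mirroring the rule in
    // ComputePTXValueVTs so the pieces stay in sync with Ins/Outs.
    static void flattenV4F16(uint64_t Base, std::vector<Piece> &Out) {
      unsigned NumElts = 4 / 2;   // NumElts /= 2
      uint64_t StoreSize = 4;     // a v2f16 stores as 4 bytes
      for (unsigned J = 0; J != NumElts; ++J)
        Out.push_back({"v2f16", Base + J * StoreSize});
    }

    int main() {
      std::vector<Piece> Pieces;
      // i128 at offset 0 decomposes to two i64 pieces at offsets 0 and 8.
      Pieces.push_back({"i64", 0});
      Pieces.push_back({"i64", 8});
      // <4 x half> placed immediately after it.
      flattenV4F16(16, Pieces);
      for (const Piece &P : Pieces)
        std::printf("%s @ %llu\n", P.VT, (unsigned long long)P.Off);
      // Prints: i64 @ 0, i64 @ 8, v2f16 @ 16, v2f16 @ 20.
    }
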
  222|       |
  223|       |// Check whether we can merge loads/stores of some of the pieces of a
  224|       |// flattened function parameter or return value into a single vector
  225|       |// load/store.
  226|       |//
  227|       |// The flattened parameter is represented as a list of EVTs and
  228|       |// offsets, and the whole structure is aligned to ParamAlignment. This
  229|       |// function determines whether we can load/store pieces of the
  230|       |// parameter starting at index Idx using a single vectorized op of
  231|       |// size AccessSize. If so, it returns the number of param pieces
  232|       |// covered by the vector op. Otherwise, it returns 1.
  233|       |static unsigned CanMergeParamLoadStoresStartingAt(
  234|       |    unsigned Idx, uint32_t AccessSize, const SmallVectorImpl<EVT> &ValueVTs,
  235|  17.0k|    const SmallVectorImpl<uint64_t> &Offsets, unsigned ParamAlignment) {
  236|  17.0k|  assert(isPowerOf2_32(AccessSize) && "must be a power of 2!");
  237|  17.0k|
  238|  17.0k|  // Can't vectorize if param alignment is not sufficient.
  239|  17.0k|  if (AccessSize > ParamAlignment)
  240|  8.82k|    return 1;
  241|  8.21k|  // Can't vectorize if offset is not aligned.
  242|  8.21k|  if (Offsets[Idx] & (AccessSize - 1))
  243|     68|    return 1;
  244|  8.14k|
  245|  8.14k|  EVT EltVT = ValueVTs[Idx];
  246|  8.14k|  unsigned EltSize = EltVT.getStoreSize();
  247|  8.14k|
  248|  8.14k|  // Element is too large to vectorize.
  249|  8.14k|  if (EltSize >= AccessSize)
  250|  7.62k|    return 1;
  251|    518|
  252|    518|  unsigned NumElts = AccessSize / EltSize;
  253|    518|  // Can't vectorize if AccessSize is not a multiple of EltSize.
  254|    518|  if (AccessSize != EltSize * NumElts)
  255|      0|    return 1;
  256|    518|
  257|    518|  // We don't have enough elements to vectorize.
  258|    518|  if (Idx + NumElts > ValueVTs.size())
  259|    143|    return 1;
  260|    375|
  261|    375|  // PTX ISA can only deal with 2- and 4-element vector ops.
  262|    375|  if (NumElts != 4 && NumElts != 2)
  263|      9|    return 1;
  264|    366|
  265|    932|  for (unsigned j = Idx + 1; j < Idx + NumElts; ++j) {
  266|    594|    // Types do not match.
  267|    594|    if (ValueVTs[j] != EltVT)
  268|     20|      return 1;
  269|    574|
  270|    574|    // Elements are not contiguous.
  271|    574|    if (Offsets[j] - Offsets[j - 1] != EltSize)
  272|      8|      return 1;
  273|    574|  }
  274|    366|  // OK. We can vectorize ValueVTs[Idx..Idx+NumElts)
  275|    366|  return NumElts;
  276|    366|}
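
For intuition: four contiguous f32 pieces at offsets 0, 4, 8, 12 in a parameter aligned to 16 pass every check for AccessSize = 16 and merge into a single 4-element op, while dropping the alignment to 8 fails the very first test. A simplified standalone rendering of the same checks (byte sizes stand in for EVTs; this mirrors, but is not, the function above):

    #include <cstdint>
    #include <vector>

    unsigned canMergeAt(unsigned Idx, uint32_t AccessSize,
                        const std::vector<unsigned> &EltSizes,
                        const std::vector<uint64_t> &Offsets,
                        unsigned ParamAlignment) {
      if (AccessSize > ParamAlignment) return 1;      // under-aligned param
      if (Offsets[Idx] & (AccessSize - 1)) return 1;  // misaligned start
      unsigned EltSize = EltSizes[Idx];
      if (EltSize >= AccessSize) return 1;            // element too large
      unsigned NumElts = AccessSize / EltSize;
      if (AccessSize != EltSize * NumElts) return 1;  // not a whole multiple
      if (Idx + NumElts > EltSizes.size()) return 1;  // runs off the end
      if (NumElts != 4 && NumElts != 2) return 1;     // PTX: v2/v4 only
      for (unsigned J = Idx + 1; J < Idx + NumElts; ++J)
        if (EltSizes[J] != EltSize ||
            Offsets[J] - Offsets[J - 1] != EltSize)
          return 1;                                   // mixed or gapped
      return NumElts;
    }

    // canMergeAt(0, 16, {4,4,4,4}, {0,4,8,12}, 16) == 4; with alignment 8
    // the first check fails and the answer degrades to 1.
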
  277|       |
  278|       |// Flags for tracking per-element vectorization state of loads/stores
  279|       |// of a flattened function parameter or return value.
  280|       |enum ParamVectorizationFlags {
  281|       |  PVF_INNER = 0x0, // Middle elements of a vector.
  282|       |  PVF_FIRST = 0x1, // First element of the vector.
  283|       |  PVF_LAST = 0x2,  // Last element of the vector.
  284|       |  // Scalar is effectively a 1-element vector.
  285|       |  PVF_SCALAR = PVF_FIRST | PVF_LAST
  286|       |};
  287|       |
  288|       |// Computes whether and how we can vectorize the loads/stores of a
  289|       |// flattened function parameter or return value.
  290|       |//
  291|       |// The flattened parameter is represented as the list of ValueVTs and
  292|       |// Offsets, and is aligned to ParamAlignment bytes. We return a vector
  293|       |// of the same size as ValueVTs indicating how each piece should be
  294|       |// loaded/stored (i.e. as a scalar, or as part of a vector
  295|       |// load/store).
  296|       |static SmallVector<ParamVectorizationFlags, 16>
  297|       |VectorizePTXValueVTs(const SmallVectorImpl<EVT> &ValueVTs,
  298|       |                     const SmallVectorImpl<uint64_t> &Offsets,
  299|  4.62k|                     unsigned ParamAlignment) {
  300|  4.62k|  // Set vector size to match ValueVTs and mark all elements as
  301|  4.62k|  // scalars by default.
  302|  4.62k|  SmallVector<ParamVectorizationFlags, 16> VectorInfo;
  303|  4.62k|  VectorInfo.assign(ValueVTs.size(), PVF_SCALAR);
  304|  4.62k|
  305|  4.62k|  // Check what we can vectorize using 128/64/32-bit accesses.
  306|  9.05k|  for (int I = 0, E = ValueVTs.size(); I != E; ++I) {
  307|  4.43k|    // Skip elements we've already processed.
  308|  4.43k|    assert(VectorInfo[I] == PVF_SCALAR && "Unexpected vector info state.");
  309|  17.0k|    for (unsigned AccessSize : {16, 8, 4, 2}) {
  310|  17.0k|      unsigned NumElts = CanMergeParamLoadStoresStartingAt(
  311|  17.0k|          I, AccessSize, ValueVTs, Offsets, ParamAlignment);
  312|  17.0k|      // Mark vectorized elements.
  313|  17.0k|      switch (NumElts) {
  314|  17.0k|      default:
  315|      0|        llvm_unreachable("Unexpected return value");
  316|  17.0k|      case 1:
  317|  16.6k|        // Can't vectorize using this size, try next smaller size.
  318|  16.6k|        continue;
  319|  17.0k|      case 2:
  320|    228|        assert(I + 1 < E && "Not enough elements.");
  321|    228|        VectorInfo[I] = PVF_FIRST;
  322|    228|        VectorInfo[I + 1] = PVF_LAST;
  323|    228|        I += 1;
  324|    228|        break;
  325|  17.0k|      case 4:
  326|    110|        assert(I + 3 < E && "Not enough elements.");
  327|    110|        VectorInfo[I] = PVF_FIRST;
  328|    110|        VectorInfo[I + 1] = PVF_INNER;
  329|    110|        VectorInfo[I + 2] = PVF_INNER;
  330|    110|        VectorInfo[I + 3] = PVF_LAST;
  331|    110|        I += 3;
  332|    110|        break;
  333|    338|      }
  334|    338|      // Break out of the inner loop because we've already succeeded
  335|    338|      // using largest possible AccessSize.
  336|    338|      break;
  337|    338|    }
  338|  4.43k|  }
  339|  4.62k|  return VectorInfo;
  340|  4.62k|}
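
Putting the two routines together: at each index the scan greedily tries 16-, 8-, 4-, then 2-byte accesses, marks the covered pieces, and skips past them. A sketch in the same toy representation as the one after CanMergeParamLoadStoresStartingAt (it reuses that sketch's canMergeAt; the PVF_* values mirror the enum above):

    #include <cstdint>
    #include <vector>

    enum Flag { PVF_INNER = 0x0, PVF_FIRST = 0x1, PVF_LAST = 0x2,
                PVF_SCALAR = PVF_FIRST | PVF_LAST };

    // Defined in the sketch after CanMergeParamLoadStoresStartingAt.
    unsigned canMergeAt(unsigned Idx, uint32_t AccessSize,
                        const std::vector<unsigned> &EltSizes,
                        const std::vector<uint64_t> &Offsets,
                        unsigned ParamAlignment);

    std::vector<Flag> vectorize(const std::vector<unsigned> &EltSizes,
                                const std::vector<uint64_t> &Offsets,
                                unsigned ParamAlignment) {
      std::vector<Flag> Info(EltSizes.size(), PVF_SCALAR);
      for (unsigned I = 0, E = EltSizes.size(); I != E; ++I)
        for (unsigned AccessSize : {16u, 8u, 4u, 2u}) {
          unsigned N = canMergeAt(I, AccessSize, EltSizes, Offsets,
                                  ParamAlignment);
          if (N == 1)
            continue;             // try the next smaller access size
          Info[I] = PVF_FIRST;    // mark the merged run
          for (unsigned J = 1; J + 1 < N; ++J)
            Info[I + J] = PVF_INNER;
          Info[I + N - 1] = PVF_LAST;
          I += N - 1;             // outer loop skips the covered pieces
          break;                  // largest workable access already found
        }
      return Info;
    }

    // {4,4,4,4} at offsets {0,4,8,12} with alignment 16 yields
    // {PVF_FIRST, PVF_INNER, PVF_INNER, PVF_LAST}: one v4 load/store.
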
  341|       |
  342|       |// NVPTXTargetLowering Constructor.
  343|       |NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM,
  344|       |                                         const NVPTXSubtarget &STI)
  345|    455|    : TargetLowering(TM), nvTM(&TM), STI(STI) {
  346|    455|  // Always lower memset, memcpy, and memmove intrinsics to load/store
  347|    455|  // instructions, rather than generating calls to memset, memcpy, or
  348|    455|  // memmove.
  349|    455|  MaxStoresPerMemset = (unsigned) 0xFFFFFFFF;
  350|    455|  MaxStoresPerMemcpy = (unsigned) 0xFFFFFFFF;
  351|    455|  MaxStoresPerMemmove = (unsigned) 0xFFFFFFFF;
  352|    455|
  353|    455|  setBooleanContents(ZeroOrNegativeOneBooleanContent);
  354|    455|  setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);
  355|    455|
  356|    455|  // Jump is Expensive. Don't create extra control flow for 'and', 'or'
  357|    455|  // condition branches.
  358|    455|  setJumpIsExpensive(true);
  359|    455|
  360|    455|  // Wide divides are _very_ slow. Try to reduce the width of the divide if
  361|    455|  // possible.
  362|    455|  addBypassSlowDiv(64, 32);
  363|    455|
  364|    455|  // By default, use the Source scheduling
  365|    455|  if (sched4reg)
  366|      0|    setSchedulingPreference(Sched::RegPressure);
  367|    455|  else
  368|    455|    setSchedulingPreference(Sched::Source);
  369|    455|
  370|    455|  auto setFP16OperationAction = [&](unsigned Op, MVT VT, LegalizeAction Action,
  371|  4.55k|                                    LegalizeAction NoF16Action) {
  372|  4.55k|    setOperationAction(Op, VT, STI.allowFP16Math() ? Action : NoF16Action);
  373|  4.55k|  };
  374|    455|
  375|    455|  addRegisterClass(MVT::i1, &NVPTX::Int1RegsRegClass);
  376|    455|  addRegisterClass(MVT::i16, &NVPTX::Int16RegsRegClass);
  377|    455|  addRegisterClass(MVT::i32, &NVPTX::Int32RegsRegClass);
  378|    455|  addRegisterClass(MVT::i64, &NVPTX::Int64RegsRegClass);
  379|    455|  addRegisterClass(MVT::f32, &NVPTX::Float32RegsRegClass);
  380|    455|  addRegisterClass(MVT::f64, &NVPTX::Float64RegsRegClass);
  381|    455|  addRegisterClass(MVT::f16, &NVPTX::Float16RegsRegClass);
  382|    455|  addRegisterClass(MVT::v2f16, &NVPTX::Float16x2RegsRegClass);
  383|    455|
  384|    455|  // Conversion to/from FP16/FP16x2 is always legal.
  385|    455|  setOperationAction(ISD::SINT_TO_FP, MVT::f16, Legal);
  386|    455|  setOperationAction(ISD::FP_TO_SINT, MVT::f16, Legal);
  387|    455|  setOperationAction(ISD::BUILD_VECTOR, MVT::v2f16, Custom);
  388|    455|  setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f16, Custom);
  389|    455|  setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2f16, Expand);
  390|    455|  setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2f16, Expand);
  391|    455|
  392|    455|  setFP16OperationAction(ISD::SETCC, MVT::f16, Legal, Promote);
  393|    455|  setFP16OperationAction(ISD::SETCC, MVT::v2f16, Legal, Expand);
  394|    455|
  395|    455|  // Operations not directly supported by NVPTX.
  396|    455|  for (MVT VT : {MVT::f16, MVT::v2f16, MVT::f32, MVT::f64, MVT::i1, MVT::i8,
  397|  4.09k|                 MVT::i16, MVT::i32, MVT::i64}) {
  398|  4.09k|    setOperationAction(ISD::SELECT_CC, VT, Expand);
  399|  4.09k|    setOperationAction(ISD::BR_CC, VT, Expand);
  400|  4.09k|  }
  401|    455|
  402|    455|  // Some SIGN_EXTEND_INREG can be done using cvt instruction.
  403|    455|  // For others we will expand to a SHL/SRA pair.
  404|    455|  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i64, Legal);
  405|    455|  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Legal);
  406|    455|  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16, Legal);
  407|    455|  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8 , Legal);
  408|    455|  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand);
  409|    455|
  410|    455|  setOperationAction(ISD::SHL_PARTS, MVT::i32  , Custom);
  411|    455|  setOperationAction(ISD::SRA_PARTS, MVT::i32  , Custom);
  412|    455|  setOperationAction(ISD::SRL_PARTS, MVT::i32  , Custom);
  413|    455|  setOperationAction(ISD::SHL_PARTS, MVT::i64  , Custom);
  414|    455|  setOperationAction(ISD::SRA_PARTS, MVT::i64  , Custom);
  415|    455|  setOperationAction(ISD::SRL_PARTS, MVT::i64  , Custom);
  416|    455|
  417|    455|  setOperationAction(ISD::BITREVERSE, MVT::i32, Legal);
  418|    455|  setOperationAction(ISD::BITREVERSE, MVT::i64, Legal);
  419|    455|
  420|    455|  // TODO: we may consider expanding ROTL/ROTR on older GPUs.  Currently on GPUs
  421|    455|  // that don't have h/w rotation we lower them to multi-instruction assembly.
  422|    455|  // See ROT*_sw in NVPTXIntrInfo.td
  423|    455|  setOperationAction(ISD::ROTL, MVT::i64, Legal);
  424|    455|  setOperationAction(ISD::ROTR, MVT::i64, Legal);
  425|    455|  setOperationAction(ISD::ROTL, MVT::i32, Legal);
  426|    455|  setOperationAction(ISD::ROTR, MVT::i32, Legal);
  427|    455|
  428|    455|  setOperationAction(ISD::ROTL, MVT::i16, Expand);
  429|    455|  setOperationAction(ISD::ROTR, MVT::i16, Expand);
  430|    455|  setOperationAction(ISD::ROTL, MVT::i8, Expand);
  431|    455|  setOperationAction(ISD::ROTR, MVT::i8, Expand);
  432|    455|  setOperationAction(ISD::BSWAP, MVT::i16, Expand);
  433|    455|  setOperationAction(ISD::BSWAP, MVT::i32, Expand);
  434|    455|  setOperationAction(ISD::BSWAP, MVT::i64, Expand);
  435|    455|
  436|    455|  // Indirect branch is not supported.
  437|    455|  // This also disables Jump Table creation.
  438|    455|  setOperationAction(ISD::BR_JT, MVT::Other, Expand);
  439|    455|  setOperationAction(ISD::BRIND, MVT::Other, Expand);
  440|    455|
  441|    455|  setOperationAction(ISD::GlobalAddress, MVT::i32, Custom);
  442|    455|  setOperationAction(ISD::GlobalAddress, MVT::i64, Custom);
  443|    455|
  444|    455|  // We want to legalize constant-related memmove and memcpy
  445|    455|  // intrinsics.
  446|    455|  setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom);
  447|    455|
  448|    455|  // Turn FP extload into load/fpextend
  449|    455|  setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::f16, Expand);
  450|    455|  setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f16, Expand);
  451|    455|  setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f32, Expand);
  452|    455|  setLoadExtAction(ISD::EXTLOAD, MVT::v2f32, MVT::v2f16, Expand);
  453|    455|  setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f16, Expand);
  454|    455|  setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f32, Expand);
  455|    455|  setLoadExtAction(ISD::EXTLOAD, MVT::v4f32, MVT::v4f16, Expand);
  456|    455|  setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f16, Expand);
  457|    455|  setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f32, Expand);
  458|    455|  // Turn FP truncstore into trunc + store.
  459|    455|  // FIXME: vector types should also be expanded
  460|    455|  setTruncStoreAction(MVT::f32, MVT::f16, Expand);
  461|    455|  setTruncStoreAction(MVT::f64, MVT::f16, Expand);
  462|    455|  setTruncStoreAction(MVT::f64, MVT::f32, Expand);
  463|    455|
  464|    455|  // PTX does not support load / store predicate registers
  465|    455|  setOperationAction(ISD::LOAD, MVT::i1, Custom);
  466|    455|  setOperationAction(ISD::STORE, MVT::i1, Custom);
  467|    455|
  468|  2.73k|  for (MVT VT : MVT::integer_valuetypes()) {
  469|  2.73k|    setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote);
  470|  2.73k|    setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i1, Promote);
  471|  2.73k|    setTruncStoreAction(VT, MVT::i1, Expand);
  472|  2.73k|  }
  473|    455|
  474|    455|  // This is legal in NVPTX
  475|    455|  setOperationAction(ISD::ConstantFP, MVT::f64, Legal);
  476|    455|  setOperationAction(ISD::ConstantFP, MVT::f32, Legal);
  477|    455|  setOperationAction(ISD::ConstantFP, MVT::f16, Legal);
  478|    455|
  479|    455|  // TRAP can be lowered to PTX trap
  480|    455|  setOperationAction(ISD::TRAP, MVT::Other, Legal);
  481|    455|
  482|    455|  // Register custom handling for vector loads/stores
  483|  50.5k|  for (MVT VT : MVT::vector_valuetypes()) {
  484|  50.5k|    if (IsPTXVectorType(VT)) {
  485|  6.82k|      setOperationAction(ISD::LOAD, VT, Custom);
  486|  6.82k|      setOperationAction(ISD::STORE, VT, Custom);
  487|  6.82k|      setOperationAction(ISD::INTRINSIC_W_CHAIN, VT, Custom);
  488|  6.82k|    }
  489|  50.5k|  }
  490|    455|
  491|    455|  // Custom handling for i8 intrinsics
  492|    455|  setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i8, Custom);
  493|    455|
  494|  1.36k|  for (const auto& Ty : {MVT::i16, MVT::i32, MVT::i64}) {
  495|  1.36k|    setOperationAction(ISD::ABS,  Ty, Legal);
  496|  1.36k|    setOperationAction(ISD::SMIN, Ty, Legal);
  497|  1.36k|    setOperationAction(ISD::SMAX, Ty, Legal);
  498|  1.36k|    setOperationAction(ISD::UMIN, Ty, Legal);
  499|  1.36k|    setOperationAction(ISD::UMAX, Ty, Legal);
  500|  1.36k|
  501|  1.36k|    setOperationAction(ISD::CTPOP, Ty, Legal);
  502|  1.36k|    setOperationAction(ISD::CTLZ, Ty, Legal);
  503|  1.36k|  }
  504|    455|
  505|    455|  setOperationAction(ISD::CTTZ, MVT::i16, Expand);
  506|    455|  setOperationAction(ISD::CTTZ, MVT::i32, Expand);
  507|    455|  setOperationAction(ISD::CTTZ, MVT::i64, Expand);
  508|    455|
  509|    455|  // PTX does not directly support SELP of i1, so promote to i32 first
  510|    455|  setOperationAction(ISD::SELECT, MVT::i1, Custom);
  511|    455|
  512|    455|  // PTX cannot multiply two i64s in a single instruction.
  513|    455|  setOperationAction(ISD::SMUL_LOHI, MVT::i64, Expand);
  514|    455|  setOperationAction(ISD::UMUL_LOHI, MVT::i64, Expand);
  515|    455|
  516|    455|  // We have some custom DAG combine patterns for these nodes
  517|    455|  setTargetDAGCombine(ISD::ADD);
  518|    455|  setTargetDAGCombine(ISD::AND);
  519|    455|  setTargetDAGCombine(ISD::FADD);
  520|    455|  setTargetDAGCombine(ISD::MUL);
  521|    455|  setTargetDAGCombine(ISD::SHL);
  522|    455|  setTargetDAGCombine(ISD::SREM);
  523|    455|  setTargetDAGCombine(ISD::UREM);
  524|    455|
  525|    455|  // setcc for f16x2 needs special handling to prevent legalizer's
  526|    455|  // attempt to scalarize it due to v2i1 not being legal.
  527|    455|  if (STI.allowFP16Math())
  528|     32|    setTargetDAGCombine(ISD::SETCC);
  529|    455|
  530|    455|  // Promote fp16 arithmetic if fp16 hardware isn't available or the
  531|    455|  // user passed --nvptx-no-fp16-math. The flag is useful because,
  532|    455|  // although sm_53+ GPUs have some sort of FP16 support in
  533|    455|  // hardware, only sm_53 and sm_60 have full implementation. Others
  534|    455|  // only have a token amount of hardware and are likely to run faster
  535|    455|  // by using fp32 units instead.
  536|  1.82k|  for (const auto &Op : {ISD::FADD, ISD::FMUL, ISD::FSUB, ISD::FMA}) {
  537|  1.82k|    setFP16OperationAction(Op, MVT::f16, Legal, Promote);
  538|  1.82k|    setFP16OperationAction(Op, MVT::v2f16, Legal, Expand);
  539|  1.82k|  }
  540|    455|
  541|    455|  // There's no neg.f16 instruction. Expand to (0-x).
  542|    455|  setOperationAction(ISD::FNEG, MVT::f16, Expand);
  543|    455|  setOperationAction(ISD::FNEG, MVT::v2f16, Expand);
  544|    455|
  545|    455|  // (would be) Library functions.
  546|    455|
  547|    455|  // These map to conversion instructions for scalar FP types.
  548|    455|  for (const auto &Op : {ISD::FCEIL, ISD::FFLOOR, ISD::FNEARBYINT, ISD::FRINT,
  549|  2.27k|                         ISD::FTRUNC}) {
  550|  2.27k|    setOperationAction(Op, MVT::f16, Legal);
  551|  2.27k|    setOperationAction(Op, MVT::f32, Legal);
  552|  2.27k|    setOperationAction(Op, MVT::f64, Legal);
  553|  2.27k|    setOperationAction(Op, MVT::v2f16, Expand);
  554|  2.27k|  }
  555|    455|
  556|    455|  setOperationAction(ISD::FROUND, MVT::f16, Promote);
  557|    455|  setOperationAction(ISD::FROUND, MVT::v2f16, Expand);
  558|    455|  setOperationAction(ISD::FROUND, MVT::f32, Custom);
  559|    455|  setOperationAction(ISD::FROUND, MVT::f64, Custom);
  560|    455|
  561|    455|
  562|    455|  // 'Expand' implements FCOPYSIGN without calling an external library.
  563|    455|  setOperationAction(ISD::FCOPYSIGN, MVT::f16, Expand);
  564|    455|  setOperationAction(ISD::FCOPYSIGN, MVT::v2f16, Expand);
  565|    455|  setOperationAction(ISD::FCOPYSIGN, MVT::f32, Expand);
  566|    455|  setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
  567|    455|
  568|    455|  // These map to corresponding instructions for f32/f64. f16 must be
  569|    455|  // promoted to f32. v2f16 is expanded to f16, which is then promoted
  570|    455|  // to f32.
  571|    455|  for (const auto &Op : {ISD::FDIV, ISD::FREM, ISD::FSQRT, ISD::FSIN, ISD::FCOS,
  572|  3.64k|                         ISD::FABS, ISD::FMINNUM, ISD::FMAXNUM}) {
  573|  3.64k|    setOperationAction(Op, MVT::f16, Promote);
  574|  3.64k|    setOperationAction(Op, MVT::f32, Legal);
  575|  3.64k|    setOperationAction(Op, MVT::f64, Legal);
  576|  3.64k|    setOperationAction(Op, MVT::v2f16, Expand);
  577|  3.64k|  }
  578|    455|  setOperationAction(ISD::FMINNUM, MVT::f16, Promote);
  579|    455|  setOperationAction(ISD::FMAXNUM, MVT::f16, Promote);
  580|    455|  setOperationAction(ISD::FMINIMUM, MVT::f16, Promote);
  581|    455|  setOperationAction(ISD::FMAXIMUM, MVT::f16, Promote);
  582|    455|
  583|    455|  // No FEXP2, FLOG2.  The PTX ex2 and log2 functions are always approximate.
  584|    455|  // No FPOW or FREM in PTX.
  585|    455|
  586|    455|  // Now deduce the information based on the above-mentioned
  587|    455|  // actions
  588|    455|  computeRegisterProperties(STI.getRegisterInfo());
  589|    455|}
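
On a subtarget where allowFP16Math() is false, the setFP16OperationAction lambda above falls back to the NoF16Action, so f16 arithmetic is Promoted (computed in f32) and v2f16 is Expanded. Conceptually, for an f16 FADD, promotion amounts to the following rewrite (a sketch of the legalizer's behavior, not code from this file):

    // %r = fadd half %a, %b   becomes, roughly:
    //   %af = fpext half %a to float
    //   %bf = fpext half %b to float
    //   %rf = fadd float %af, %bf
    //   %r  = fptrunc float %rf to half
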
  590|       |
  591|      0|const char *NVPTXTargetLowering::getTargetNodeName(unsigned Opcode) const {
  592|      0|  switch ((NVPTXISD::NodeType)Opcode) {
  593|      0|  case NVPTXISD::FIRST_NUMBER:
  594|      0|    break;
  595|      0|  case NVPTXISD::CALL:
  596|      0|    return "NVPTXISD::CALL";
  597|      0|  case NVPTXISD::RET_FLAG:
  598|      0|    return "NVPTXISD::RET_FLAG";
  599|      0|  case NVPTXISD::LOAD_PARAM:
  600|      0|    return "NVPTXISD::LOAD_PARAM";
  601|      0|  case NVPTXISD::Wrapper:
  602|      0|    return "NVPTXISD::Wrapper";
  603|      0|  case NVPTXISD::DeclareParam:
  604|      0|    return "NVPTXISD::DeclareParam";
  605|      0|  case NVPTXISD::DeclareScalarParam:
  606|      0|    return "NVPTXISD::DeclareScalarParam";
  607|      0|  case NVPTXISD::DeclareRet:
  608|      0|    return "NVPTXISD::DeclareRet";
  609|      0|  case NVPTXISD::DeclareScalarRet:
  610|      0|    return "NVPTXISD::DeclareScalarRet";
  611|      0|  case NVPTXISD::DeclareRetParam:
  612|      0|    return "NVPTXISD::DeclareRetParam";
  613|      0|  case NVPTXISD::PrintCall:
  614|      0|    return "NVPTXISD::PrintCall";
  615|      0|  case NVPTXISD::PrintConvergentCall:
  616|      0|    return "NVPTXISD::PrintConvergentCall";
  617|      0|  case NVPTXISD::PrintCallUni:
  618|      0|    return "NVPTXISD::PrintCallUni";
  619|      0|  case NVPTXISD::PrintConvergentCallUni:
  620|      0|    return "NVPTXISD::PrintConvergentCallUni";
  621|      0|  case NVPTXISD::LoadParam:
  622|      0|    return "NVPTXISD::LoadParam";
  623|      0|  case NVPTXISD::LoadParamV2:
  624|      0|    return "NVPTXISD::LoadParamV2";
  625|      0|  case NVPTXISD::LoadParamV4:
  626|      0|    return "NVPTXISD::LoadParamV4";
  627|      0|  case NVPTXISD::StoreParam:
  628|      0|    return "NVPTXISD::StoreParam";
  629|      0|  case NVPTXISD::StoreParamV2:
  630|      0|    return "NVPTXISD::StoreParamV2";
  631|      0|  case NVPTXISD::StoreParamV4:
  632|      0|    return "NVPTXISD::StoreParamV4";
  633|      0|  case NVPTXISD::StoreParamS32:
  634|      0|    return "NVPTXISD::StoreParamS32";
  635|      0|  case NVPTXISD::StoreParamU32:
  636|      0|    return "NVPTXISD::StoreParamU32";
  637|      0|  case NVPTXISD::CallArgBegin:
  638|      0|    return "NVPTXISD::CallArgBegin";
  639|      0|  case NVPTXISD::CallArg:
  640|      0|    return "NVPTXISD::CallArg";
  641|      0|  case NVPTXISD::LastCallArg:
  642|      0|    return "NVPTXISD::LastCallArg";
  643|      0|  case NVPTXISD::CallArgEnd:
  644|      0|    return "NVPTXISD::CallArgEnd";
  645|      0|  case NVPTXISD::CallVoid:
  646|      0|    return "NVPTXISD::CallVoid";
  647|      0|  case NVPTXISD::CallVal:
  648|      0|    return "NVPTXISD::CallVal";
  649|      0|  case NVPTXISD::CallSymbol:
  650|      0|    return "NVPTXISD::CallSymbol";
  651|      0|  case NVPTXISD::Prototype:
  652|      0|    return "NVPTXISD::Prototype";
  653|      0|  case NVPTXISD::MoveParam:
  654|      0|    return "NVPTXISD::MoveParam";
  655|      0|  case NVPTXISD::StoreRetval:
  656|      0|    return "NVPTXISD::StoreRetval";
  657|      0|  case NVPTXISD::StoreRetvalV2:
  658|      0|    return "NVPTXISD::StoreRetvalV2";
  659|      0|  case NVPTXISD::StoreRetvalV4:
  660|      0|    return "NVPTXISD::StoreRetvalV4";
  661|      0|  case NVPTXISD::PseudoUseParam:
  662|      0|    return "NVPTXISD::PseudoUseParam";
  663|      0|  case NVPTXISD::RETURN:
  664|      0|    return "NVPTXISD::RETURN";
  665|      0|  case NVPTXISD::CallSeqBegin:
  666|      0|    return "NVPTXISD::CallSeqBegin";
  667|      0|  case NVPTXISD::CallSeqEnd:
  668|      0|    return "NVPTXISD::CallSeqEnd";
  669|      0|  case NVPTXISD::CallPrototype:
  670|      0|    return "NVPTXISD::CallPrototype";
  671|      0|  case NVPTXISD::ProxyReg:
  672|      0|    return "NVPTXISD::ProxyReg";
  673|      0|  case NVPTXISD::LoadV2:
  674|      0|    return "NVPTXISD::LoadV2";
  675|      0|  case NVPTXISD::LoadV4:
  676|      0|    return "NVPTXISD::LoadV4";
  677|      0|  case NVPTXISD::LDGV2:
  678|      0|    return "NVPTXISD::LDGV2";
  679|      0|  case NVPTXISD::LDGV4:
  680|      0|    return "NVPTXISD::LDGV4";
  681|      0|  case NVPTXISD::LDUV2:
  682|      0|    return "NVPTXISD::LDUV2";
  683|      0|  case NVPTXISD::LDUV4:
  684|      0|    return "NVPTXISD::LDUV4";
  685|      0|  case NVPTXISD::StoreV2:
  686|      0|    return "NVPTXISD::StoreV2";
  687|      0|  case NVPTXISD::StoreV4:
  688|      0|    return "NVPTXISD::StoreV4";
  689|      0|  case NVPTXISD::FUN_SHFL_CLAMP:
  690|      0|    return "NVPTXISD::FUN_SHFL_CLAMP";
  691|      0|  case NVPTXISD::FUN_SHFR_CLAMP:
  692|      0|    return "NVPTXISD::FUN_SHFR_CLAMP";
  693|      0|  case NVPTXISD::IMAD:
  694|      0|    return "NVPTXISD::IMAD";
  695|      0|  case NVPTXISD::SETP_F16X2:
  696|      0|    return "NVPTXISD::SETP_F16X2";
  697|      0|  case NVPTXISD::Dummy:
  698|      0|    return "NVPTXISD::Dummy";
  699|      0|  case NVPTXISD::MUL_WIDE_SIGNED:
  700|      0|    return "NVPTXISD::MUL_WIDE_SIGNED";
  701|      0|  case NVPTXISD::MUL_WIDE_UNSIGNED:
  702|      0|    return "NVPTXISD::MUL_WIDE_UNSIGNED";
  703|      0|  case NVPTXISD::Tex1DFloatS32:        return "NVPTXISD::Tex1DFloatS32";
  704|      0|  case NVPTXISD::Tex1DFloatFloat:      return "NVPTXISD::Tex1DFloatFloat";
  705|      0|  case NVPTXISD::Tex1DFloatFloatLevel:
  706|      0|    return "NVPTXISD::Tex1DFloatFloatLevel";
  707|      0|  case NVPTXISD::Tex1DFloatFloatGrad:
  708|      0|    return "NVPTXISD::Tex1DFloatFloatGrad";
  709|      0|  case NVPTXISD::Tex1DS32S32:          return "NVPTXISD::Tex1DS32S32";
  710|      0|  case NVPTXISD::Tex1DS32Float:        return "NVPTXISD::Tex1DS32Float";
  711|      0|  case NVPTXISD::Tex1DS32FloatLevel:
  712|      0|    return "NVPTXISD::Tex1DS32FloatLevel";
  713|      0|  case NVPTXISD::Tex1DS32FloatGrad:
  714|      0|    return "NVPTXISD::Tex1DS32FloatGrad";
  715|      0|  case NVPTXISD::Tex1DU32S32:          return "NVPTXISD::Tex1DU32S32";
  716|      0|  case NVPTXISD::Tex1DU32Float:        return "NVPTXISD::Tex1DU32Float";
  717|      0|  case NVPTXISD::Tex1DU32FloatLevel:
  718|      0|    return "NVPTXISD::Tex1DU32FloatLevel";
  719|      0|  case NVPTXISD::Tex1DU32FloatGrad:
  720|      0|    return "NVPTXISD::Tex1DU32FloatGrad";
  721|      0|  case NVPTXISD::Tex1DArrayFloatS32:   return "NVPTXISD::Tex1DArrayFloatS32";
  722|      0|  case NVPTXISD::Tex1DArrayFloatFloat: return "NVPTXISD::Tex1DArrayFloatFloat";
  723|      0|  case NVPTXISD::Tex1DArrayFloatFloatLevel:
  724|      0|    return "NVPTXISD::Tex1DArrayFloatFloatLevel";
  725|      0|  case NVPTXISD::Tex1DArrayFloatFloatGrad:
  726|      0|    return "NVPTXISD::Tex1DArrayFloatFloatGrad";
  727|      0|  case NVPTXISD::Tex1DArrayS32S32:     return "NVPTXISD::Tex1DArrayS32S32";
  728|      0|  case NVPTXISD::Tex1DArrayS32Float:   return "NVPTXISD::Tex1DArrayS32Float";
  729|      0|  case NVPTXISD::Tex1DArrayS32FloatLevel:
  730|      0|    return "NVPTXISD::Tex1DArrayS32FloatLevel";
  731|      0|  case NVPTXISD::Tex1DArrayS32FloatGrad:
  732|      0|    return "NVPTXISD::Tex1DArrayS32FloatGrad";
  733|      0|  case NVPTXISD::Tex1DArrayU32S32:     return "NVPTXISD::Tex1DArrayU32S32";
  734|      0|  case NVPTXISD::Tex1DArrayU32Float:   return "NVPTXISD::Tex1DArrayU32Float";
  735|      0|  case NVPTXISD::Tex1DArrayU32FloatLevel:
  736|      0|    return "NVPTXISD::Tex1DArrayU32FloatLevel";
  737|      0|  case NVPTXISD::Tex1DArrayU32FloatGrad:
  738|      0|    return "NVPTXISD::Tex1DArrayU32FloatGrad";
  739|      0|  case NVPTXISD::Tex2DFloatS32:        return "NVPTXISD::Tex2DFloatS32";
  740|      0|  case NVPTXISD::Tex2DFloatFloat:      return "NVPTXISD::Tex2DFloatFloat";
  741|      0|  case NVPTXISD::Tex2DFloatFloatLevel:
  742|      0|    return "NVPTXISD::Tex2DFloatFloatLevel";
  743|      0|  case NVPTXISD::Tex2DFloatFloatGrad:
  744|      0|    return "NVPTXISD::Tex2DFloatFloatGrad";
  745|      0|  case NVPTXISD::Tex2DS32S32:          return "NVPTXISD::Tex2DS32S32";
  746|      0|  case NVPTXISD::Tex2DS32Float:        return "NVPTXISD::Tex2DS32Float";
  747|      0|  case NVPTXISD::Tex2DS32FloatLevel:
  748|      0|    return "NVPTXISD::Tex2DS32FloatLevel";
  749|      0|  case NVPTXISD::Tex2DS32FloatGrad:
  750|      0|    return "NVPTXISD::Tex2DS32FloatGrad";
  751|      0|  case NVPTXISD::Tex2DU32S32:          return "NVPTXISD::Tex2DU32S32";
  752|      0|  case NVPTXISD::Tex2DU32Float:        return "NVPTXISD::Tex2DU32Float";
  753|      0|  case NVPTXISD::Tex2DU32FloatLevel:
  754|      0|    return "NVPTXISD::Tex2DU32FloatLevel";
  755|      0|  case NVPTXISD::Tex2DU32FloatGrad:
  756|      0|    return "NVPTXISD::Tex2DU32FloatGrad";
  757|      0|  case NVPTXISD::Tex2DArrayFloatS32:   return "NVPTXISD::Tex2DArrayFloatS32";
  758|      0|  case NVPTXISD::Tex2DArrayFloatFloat: return "NVPTXISD::Tex2DArrayFloatFloat";
  759|      0|  case NVPTXISD::Tex2DArrayFloatFloatLevel:
  760|      0|    return "NVPTXISD::Tex2DArrayFloatFloatLevel";
  761|      0|  case NVPTXISD::Tex2DArrayFloatFloatGrad:
  762|      0|    return "NVPTXISD::Tex2DArrayFloatFloatGrad";
  763|      0|  case NVPTXISD::Tex2DArrayS32S32:     return "NVPTXISD::Tex2DArrayS32S32";
  764|      0|  case NVPTXISD::Tex2DArrayS32Float:   return "NVPTXISD::Tex2DArrayS32Float";
  765|      0|  case NVPTXISD::Tex2DArrayS32FloatLevel:
  766|      0|    return "NVPTXISD::Tex2DArrayS32FloatLevel";
  767|      0|  case NVPTXISD::Tex2DArrayS32FloatGrad:
  768|      0|    return "NVPTXISD::Tex2DArrayS32FloatGrad";
  769|      0|  case NVPTXISD::Tex2DArrayU32S32:     return "NVPTXISD::Tex2DArrayU32S32";
  770|      0|  case NVPTXISD::Tex2DArrayU32Float:   return "NVPTXISD::Tex2DArrayU32Float";
  771|      0|  case NVPTXISD::Tex2DArrayU32FloatLevel:
  772|      0|    return "NVPTXISD::Tex2DArrayU32FloatLevel";
  773|      0|  case NVPTXISD::Tex2DArrayU32FloatGrad:
  774|      0|    return "NVPTXISD::Tex2DArrayU32FloatGrad";
  775|      0|  case NVPTXISD::Tex3DFloatS32:        return "NVPTXISD::Tex3DFloatS32";
  776|      0|  case NVPTXISD::Tex3DFloatFloat:      return "NVPTXISD::Tex3DFloatFloat";
  777|      0|  case NVPTXISD::Tex3DFloatFloatLevel:
  778|      0|    return "NVPTXISD::Tex3DFloatFloatLevel";
  779|      0|  case NVPTXISD::Tex3DFloatFloatGrad:
  780|      0|    return "NVPTXISD::Tex3DFloatFloatGrad";
  781|      0|  case NVPTXISD::Tex3DS32S32:          return "NVPTXISD::Tex3DS32S32";
  782|      0|  case NVPTXISD::Tex3DS32Float:        return "NVPTXISD::Tex3DS32Float";
  783|      0|  case NVPTXISD::Tex3DS32FloatLevel:
  784|      0|    return "NVPTXISD::Tex3DS32FloatLevel";
  785|      0|  case NVPTXISD::Tex3DS32FloatGrad:
  786|      0|    return "NVPTXISD::Tex3DS32FloatGrad";
  787|      0|  case NVPTXISD::Tex3DU32S32:          return "NVPTXISD::Tex3DU32S32";
  788|      0|  case NVPTXISD::Tex3DU32Float:        return "NVPTXISD::Tex3DU32Float";
  789|      0|  case NVPTXISD::Tex3DU32FloatLevel:
  790|      0|    return "NVPTXISD::Tex3DU32FloatLevel";
  791|      0|  case NVPTXISD::Tex3DU32FloatGrad:
  792|      0|    return "NVPTXISD::Tex3DU32FloatGrad";
  793|      0|  case NVPTXISD::TexCubeFloatFloat:      return "NVPTXISD::TexCubeFloatFloat";
  794|      0|  case NVPTXISD::TexCubeFloatFloatLevel:
  795|      0|    return "NVPTXISD::TexCubeFloatFloatLevel";
  796|      0|  case NVPTXISD::TexCubeS32Float:        return "NVPTXISD::TexCubeS32Float";
  797|      0|  case NVPTXISD::TexCubeS32FloatLevel:
  798|      0|    return "NVPTXISD::TexCubeS32FloatLevel";
  799|      0|  case NVPTXISD::TexCubeU32Float:        return "NVPTXISD::TexCubeU32Float";
  800|      0|  case NVPTXISD::TexCubeU32FloatLevel:
  801|      0|    return "NVPTXISD::TexCubeU32FloatLevel";
  802|      0|  case NVPTXISD::TexCubeArrayFloatFloat:
  803|      0|    return "NVPTXISD::TexCubeArrayFloatFloat";
  804|      0|  case NVPTXISD::TexCubeArrayFloatFloatLevel:
  805|      0|    return "NVPTXISD::TexCubeArrayFloatFloatLevel";
  806|      0|  case NVPTXISD::TexCubeArrayS32Float:
  807|      0|    return "NVPTXISD::TexCubeArrayS32Float";
  808|      0|  case NVPTXISD::TexCubeArrayS32FloatLevel:
  809|      0|    return "NVPTXISD::TexCubeArrayS32FloatLevel";
  810|      0|  case NVPTXISD::TexCubeArrayU32Float:
  811|      0|    return "NVPTXISD::TexCubeArrayU32Float";
  812|      0|  case NVPTXISD::TexCubeArrayU32FloatLevel:
  813|      0|    return "NVPTXISD::TexCubeArrayU32FloatLevel";
  814|      0|  case NVPTXISD::Tld4R2DFloatFloat:
  815|      0|    return "NVPTXISD::Tld4R2DFloatFloat";
  816|      0|  case NVPTXISD::Tld4G2DFloatFloat:
  817|      0|    return "NVPTXISD::Tld4G2DFloatFloat";
  818|      0|  case NVPTXISD::Tld4B2DFloatFloat:
  819|      0|    return "NVPTXISD::Tld4B2DFloatFloat";
  820|      0|  case NVPTXISD::Tld4A2DFloatFloat:
  821|      0|    return "NVPTXISD::Tld4A2DFloatFloat";
  822|      0|  case NVPTXISD::Tld4R2DS64Float:
  823|      0|    return "NVPTXISD::Tld4R2DS64Float";
  824|      0|  case NVPTXISD::Tld4G2DS64Float:
  825|      0|    return "NVPTXISD::Tld4G2DS64Float";
  826|      0|  case NVPTXISD::Tld4B2DS64Float:
  827|      0|    return "NVPTXISD::Tld4B2DS64Float";
  828|      0|  case NVPTXISD::Tld4A2DS64Float:
  829|      0|    return "NVPTXISD::Tld4A2DS64Float";
  830|      0|  case NVPTXISD::Tld4R2DU64Float:
  831|      0|    return "NVPTXISD::Tld4R2DU64Float";
  832|      0|  case NVPTXISD::Tld4G2DU64Float:
  833|      0|    return "NVPTXISD::Tld4G2DU64Float";
  834|      0|  case NVPTXISD::Tld4B2DU64Float:
  835|      0|    return "NVPTXISD::Tld4B2DU64Float";
  836|      0|  case NVPTXISD::Tld4A2DU64Float:
  837|      0|    return "NVPTXISD::Tld4A2DU64Float";
  838|      0|
  839|      0|  case NVPTXISD::TexUnified1DFloatS32:
  840|      0|    return "NVPTXISD::TexUnified1DFloatS32";
  841|      0|  case NVPTXISD::TexUnified1DFloatFloat:
  842|      0|    return "NVPTXISD::TexUnified1DFloatFloat";
  843|      0|  case NVPTXISD::TexUnified1DFloatFloatLevel:
  844|      0|    return "NVPTXISD::TexUnified1DFloatFloatLevel";
  845|      0|  case NVPTXISD::TexUnified1DFloatFloatGrad:
  846|      0|    return "NVPTXISD::TexUnified1DFloatFloatGrad";
  847|      0|  case NVPTXISD::TexUnified1DS32S32:
  848|      0|    return "NVPTXISD::TexUnified1DS32S32";
  849|      0|  case NVPTXISD::TexUnified1DS32Float:
  850|      0|    return "NVPTXISD::TexUnified1DS32Float";
  851|      0|  case NVPTXISD::TexUnified1DS32FloatLevel:
  852|      0|    return "NVPTXISD::TexUnified1DS32FloatLevel";
  853|      0|  case NVPTXISD::TexUnified1DS32FloatGrad:
  854|      0|    return "NVPTXISD::TexUnified1DS32FloatGrad";
  855|      0|  case NVPTXISD::TexUnified1DU32S32:
  856|      0|    return "NVPTXISD::TexUnified1DU32S32";
  857|      0|  case NVPTXISD::TexUnified1DU32Float:
  858|      0|    return "NVPTXISD::TexUnified1DU32Float";
  859|      0|  case NVPTXISD::TexUnified1DU32FloatLevel:
  860|      0|    return "NVPTXISD::TexUnified1DU32FloatLevel";
  861|      0|  case NVPTXISD::TexUnified1DU32FloatGrad:
  862|      0|    return "NVPTXISD::TexUnified1DU32FloatGrad";
  863|      0|  case NVPTXISD::TexUnified1DArrayFloatS32:
  864|      0|    return "NVPTXISD::TexUnified1DArrayFloatS32";
  865|      0|  case NVPTXISD::TexUnified1DArrayFloatFloat:
  866|      0|    return "NVPTXISD::TexUnified1DArrayFloatFloat";
  867|      0|  case NVPTXISD::TexUnified1DArrayFloatFloatLevel:
  868|      0|    return "NVPTXISD::TexUnified1DArrayFloatFloatLevel";
  869|      0|  case NVPTXISD::TexUnified1DArrayFloatFloatGrad:
  870|      0|    return "NVPTXISD::TexUnified1DArrayFloatFloatGrad";
  871|      0|  case NVPTXISD::TexUnified1DArrayS32S32:
  872|      0|    return "NVPTXISD::TexUnified1DArrayS32S32";
  873|      0|  case NVPTXISD::TexUnified1DArrayS32Float:
  874|      0|    return "NVPTXISD::TexUnified1DArrayS32Float";
  875|      0|  case NVPTXISD::TexUnified1DArrayS32FloatLevel:
  876|      0|    return "NVPTXISD::TexUnified1DArrayS32FloatLevel";
  877|      0|  case NVPTXISD::TexUnified1DArrayS32FloatGrad:
  878|      0|    return "NVPTXISD::TexUnified1DArrayS32FloatGrad";
  879|      0|  case NVPTXISD::TexUnified1DArrayU32S32:
  880|      0|    return "NVPTXISD::TexUnified1DArrayU32S32";
  881|      0|  case NVPTXISD::TexUnified1DArrayU32Float:
  882|      0|    return "NVPTXISD::TexUnified1DArrayU32Float";
  883|      0|  case NVPTXISD::TexUnified1DArrayU32FloatLevel:
  884|      0|    return "NVPTXISD::TexUnified1DArrayU32FloatLevel";
  885|      0|  case NVPTXISD::TexUnified1DArrayU32FloatGrad:
  886|      0|    return "NVPTXISD::TexUnified1DArrayU32FloatGrad";
  887|      0|  case NVPTXISD::TexUnified2DFloatS32:
  888|      0|    return "NVPTXISD::TexUnified2DFloatS32";
  889|      0|  case NVPTXISD::TexUnified2DFloatFloat:
  890|      0|    return "NVPTXISD::TexUnified2DFloatFloat";
  891|      0|  case NVPTXISD::TexUnified2DFloatFloatLevel:
  892|      0|    return "NVPTXISD::TexUnified2DFloatFloatLevel";
  893|      0|  case NVPTXISD::TexUnified2DFloatFloatGrad:
  894|      0|    return "NVPTXISD::TexUnified2DFloatFloatGrad";
  895|      0|  case NVPTXISD::TexUnified2DS32S32:
  896|      0|    return "NVPTXISD::TexUnified2DS32S32";
  897|      0|  case NVPTXISD::TexUnified2DS32Float:
  898|      0|    return "NVPTXISD::TexUnified2DS32Float";
  899|      0|  case NVPTXISD::TexUnified2DS32FloatLevel:
  900|      0|    return "NVPTXISD::TexUnified2DS32FloatLevel";
  901|      0|  case NVPTXISD::TexUnified2DS32FloatGrad:
  902|      0|    return "NVPTXISD::TexUnified2DS32FloatGrad";
  903|      0|  case NVPTXISD::TexUnified2DU32S32:
  904|      0|    return "NVPTXISD::TexUnified2DU32S32";
  905|      0|  case NVPTXISD::TexUnified2DU32Float:
  906|      0|    return "NVPTXISD::TexUnified2DU32Float";
  907|      0|  case NVPTXISD::TexUnified2DU32FloatLevel:
  908|      0|    return "NVPTXISD::TexUnified2DU32FloatLevel";
  909|      0|  case NVPTXISD::TexUnified2DU32FloatGrad:
  910|      0|    return "NVPTXISD::TexUnified2DU32FloatGrad";
  911|      0|  case NVPTXISD::TexUnified2DArrayFloatS32:
  912|      0|    return "NVPTXISD::TexUnified2DArrayFloatS32";
  913|      0|  case NVPTXISD::TexUnified2DArrayFloatFloat:
  914|      0|    return "NVPTXISD::TexUnified2DArrayFloatFloat";
  915|      0|  case NVPTXISD::TexUnified2DArrayFloatFloatLevel:
  916|      0|    return "NVPTXISD::TexUnified2DArrayFloatFloatLevel";
  917|      0|  case NVPTXISD::TexUnified2DArrayFloatFloatGrad:
  918|      0|    return "NVPTXISD::TexUnified2DArrayFloatFloatGrad";
  919|      0|  case NVPTXISD::TexUnified2DArrayS32S32:
  920|      0|    return "NVPTXISD::TexUnified2DArrayS32S32";
  921|      0|  case NVPTXISD::TexUnified2DArrayS32Float:
  922|      0|    return "NVPTXISD::TexUnified2DArrayS32Float";
  923|      0|  case NVPTXISD::TexUnified2DArrayS32FloatLevel:
  924|      0|    return "NVPTXISD::TexUnified2DArrayS32FloatLevel";
  925|      0|  case NVPTXISD::TexUnified2DArrayS32FloatGrad:
  926|      0|    return "NVPTXISD::TexUnified2DArrayS32FloatGrad";
  927|      0|  case NVPTXISD::TexUnified2DArrayU32S32:
  928|      0|    return "NVPTXISD::TexUnified2DArrayU32S32";
  929|      0|  case NVPTXISD::TexUnified2DArrayU32Float:
  930|      0|    return "NVPTXISD::TexUnified2DArrayU32Float";
  931|      0|  case NVPTXISD::TexUnified2DArrayU32FloatLevel:
  932|      0|    return "NVPTXISD::TexUnified2DArrayU32FloatLevel";
  933|      0|  case NVPTXISD::TexUnified2DArrayU32FloatGrad:
  934|      0|    return "NVPTXISD::TexUnified2DArrayU32FloatGrad";
  935|      0|  case NVPTXISD::TexUnified3DFloatS32:
  936|      0|    return "NVPTXISD::TexUnified3DFloatS32";
  937|      0|  case NVPTXISD::TexUnified3DFloatFloat:
  938|      0|    return "NVPTXISD::TexUnified3DFloatFloat";
  939|      0|  case NVPTXISD::TexUnified3DFloatFloatLevel:
  940|      0|    return "NVPTXISD::TexUnified3DFloatFloatLevel";
  941|      0|  case NVPTXISD::TexUnified3DFloatFloatGrad:
  942|      0|    return "NVPTXISD::TexUnified3DFloatFloatGrad";
  943|      0|  case NVPTXISD::TexUnified3DS32S32:
  944|      0|    return "NVPTXISD::TexUnified3DS32S32";
  945|      0|  case NVPTXISD::TexUnified3DS32Float:
  946|      0|    return "NVPTXISD::TexUnified3DS32Float";
  947|      0|  case NVPTXISD::TexUnified3DS32FloatLevel:
  948|      0|    return "NVPTXISD::TexUnified3DS32FloatLevel";
  949|      0|  case NVPTXISD::TexUnified3DS32FloatGrad:
  950|      0|    return "NVPTXISD::TexUnified3DS32FloatGrad";
  951|      0|  case NVPTXISD::TexUnified3DU32S32:
  952|      0|    return "NVPTXISD::TexUnified3DU32S32";
  953|      0|  case NVPTXISD::TexUnified3DU32Float:
  954|      0|    return "NVPTXISD::TexUnified3DU32Float";
  955|      0|  case NVPTXISD::TexUnified3DU32FloatLevel:
  956|      0|    return "NVPTXISD::TexUnified3DU32FloatLevel";
  957|      0|  case NVPTXISD::TexUnified3DU32FloatGrad:
  958|      0|    return "NVPTXISD::TexUnified3DU32FloatGrad";
  959|      0|  case NVPTXISD::TexUnifiedCubeFloatFloat:
  960|      0|    return "NVPTXISD::TexUnifiedCubeFloatFloat";
  961|      0|  case NVPTXISD::TexUnifiedCubeFloatFloatLevel:
  962|      0|    return "NVPTXISD::TexUnifiedCubeFloatFloatLevel";
  963|      0|  case NVPTXISD::TexUnifiedCubeS32Float:
  964|      0|    return "NVPTXISD::TexUnifiedCubeS32Float";
  965|      0|  case NVPTXISD::TexUnifiedCubeS32FloatLevel:
  966|      0|    return "NVPTXISD::TexUnifiedCubeS32FloatLevel";
  967|      0|  case NVPTXISD::TexUnifiedCubeU32Float:
  968|      0|    return "NVPTXISD::TexUnifiedCubeU32Float";
  969|      0|  case NVPTXISD::TexUnifiedCubeU32FloatLevel:
  970|      0|    return "NVPTXISD::TexUnifiedCubeU32FloatLevel";
  971|      0|  case NVPTXISD::TexUnifiedCubeArrayFloatFloat:
  972|      0|    return "NVPTXISD::TexUnifiedCubeArrayFloatFloat";
  973|      0|  case NVPTXISD::TexUnifiedCubeArrayFloatFloatLevel:
  974|      0|    return "NVPTXISD::TexUnifiedCubeArrayFloatFloatLevel";
  975|      0|  case NVPTXISD::TexUnifiedCubeArrayS32Float:
  976|      0|    return "NVPTXISD::TexUnifiedCubeArrayS32Float";
  977|      0|  case NVPTXISD::TexUnifiedCubeArrayS32FloatLevel:
  978|      0|    return "NVPTXISD::TexUnifiedCubeArrayS32FloatLevel";
  979|      0|  case NVPTXISD::TexUnifiedCubeArrayU32Float:
  980|      0|    return "NVPTXISD::TexUnifiedCubeArrayU32Float";
  981|      0|  case NVPTXISD::TexUnifiedCubeArrayU32FloatLevel:
  982|      0|    return "NVPTXISD::TexUnifiedCubeArrayU32FloatLevel";
  983|      0|  case NVPTXISD::Tld4UnifiedR2DFloatFloat:
  984|      0|    return "NVPTXISD::Tld4UnifiedR2DFloatFloat";
  985|      0|  case NVPTXISD::Tld4UnifiedG2DFloatFloat:
  986|      0|    return "NVPTXISD::Tld4UnifiedG2DFloatFloat";
  987|      0|  case NVPTXISD::Tld4UnifiedB2DFloatFloat:
  988|      0|    return "NVPTXISD::Tld4UnifiedB2DFloatFloat";
  989|      0|  case NVPTXISD::Tld4UnifiedA2DFloatFloat:
  990|      0|    return "NVPTXISD::Tld4UnifiedA2DFloatFloat";
  991|      0|  case NVPTXISD::Tld4UnifiedR2DS64Float:
  992|      0|    return "NVPTXISD::Tld4UnifiedR2DS64Float";
  993|      0|  case NVPTXISD::Tld4UnifiedG2DS64Float:
  994|      0|    return "NVPTXISD::Tld4UnifiedG2DS64Float";
  995|      0|  case NVPTXISD::Tld4UnifiedB2DS64Float:
  996|      0|    return "NVPTXISD::Tld4UnifiedB2DS64Float";
  997|      0|  case NVPTXISD::Tld4UnifiedA2DS64Float:
  998|      0|    return "NVPTXISD::Tld4UnifiedA2DS64Float";
  999|      0|  case NVPTXISD::Tld4UnifiedR2DU64Float:
 1000|      0|    return "NVPTXISD::Tld4UnifiedR2DU64Float";
 1001|      0|  case NVPTXISD::Tld4UnifiedG2DU64Float:
 1002|      0|    return "NVPTXISD::Tld4UnifiedG2DU64Float";
 1003|      0|  case NVPTXISD::Tld4UnifiedB2DU64Float:
 1004|      0|    return "NVPTXISD::Tld4UnifiedB2DU64Float";
 1005|      0|  case NVPTXISD::Tld4UnifiedA2DU64Float:
 1006|      0|    return "NVPTXISD::Tld4UnifiedA2DU64Float";
 1007|      0|
 1008|      0|  case NVPTXISD::Suld1DI8Clamp:          return "NVPTXISD::Suld1DI8Clamp";
 1009|      0|  case NVPTXISD::Suld1DI16Clamp:         return "NVPTXISD::Suld1DI16Clamp";
 1010|      0|  case NVPTXISD::Suld1DI32Clamp:         return "NVPTXISD::Suld1DI32Clamp";
 1011|      0|  case NVPTXISD::Suld1DI64Clamp:         return "NVPTXISD::Suld1DI64Clamp";
 1012|      0|  case NVPTXISD::Suld1DV2I8Clamp:        return "NVPTXISD::Suld1DV2I8Clamp";
 1013|      0|  case NVPTXISD::Suld1DV2I16Clamp:       return "NVPTXISD::Suld1DV2I16Clamp";
 1014|      0|  case NVPTXISD::Suld1DV2I32Clamp:       return "NVPTXISD::Suld1DV2I32Clamp";
 1015|      0|  case NVPTXISD::Suld1DV2I64Clamp:       return "NVPTXISD::Suld1DV2I64Clamp";
 1016|      0|  case NVPTXISD::Suld1DV4I8Clamp:        return "NVPTXISD::Suld1DV4I8Clamp";
 1017|      0|  case NVPTXISD::Suld1DV4I16Clamp:       return "NVPTXISD::Suld1DV4I16Clamp";
 1018|      0|  case NVPTXISD::Suld1DV4I32Clamp:       return "NVPTXISD::Suld1DV4I32Clamp";
 1019|      0|
 1020|      0|  case NVPTXISD::Suld1DArrayI8Clamp:   return "NVPTXISD::Suld1DArrayI8Clamp";
 1021|      0|  case NVPTXISD::Suld1DArrayI16Clamp:  return "NVPTXISD::Suld1DArrayI16Clamp";
 1022|      0|  case NVPTXISD::Suld1DArrayI32Clamp:  return "NVPTXISD::Suld1DArrayI32Clamp";
 1023|      0|  case NVPTXISD::Suld1DArrayI64Clamp:  return "NVPTXISD::Suld1DArrayI64Clamp";
 1024|      0|  case NVPTXISD::Suld1DArrayV2I8Clamp: return "NVPTXISD::Suld1DArrayV2I8Clamp";
 1025|      0|  case NVPTXISD::Suld1DArrayV2I16Clamp:return "NVPTXISD::Suld1DArrayV2I16Clamp";
 1026|      0|  case NVPTXISD::Suld1DArrayV2I32Clamp:return "NVPTXISD::Suld1DArrayV2I32Clamp";
 1027|      0|  case NVPTXISD::Suld1DArrayV2I64Clamp:return "NVPTXISD::Suld1DArrayV2I64Clamp";
 1028|      0|  case NVPTXISD::Suld1DArrayV4I8Clamp: return "NVPTXISD::Suld1DArrayV4I8Clamp";
 1029|      0|  case NVPTXISD::Suld1DArrayV4I16Clamp:return "NVPTXISD::Suld1DArrayV4I16Clamp";
 1030|      0|  case NVPTXISD::Suld1DArrayV4I32Clamp:return "NVPTXISD::Suld1DArrayV4I32Clamp";
 1031|      0|
 1032|      0|  case NVPTXISD::Suld2DI8Clamp:          return "NVPTXISD::Suld2DI8Clamp";
 1033|      0|  case NVPTXISD::Suld2DI16Clamp:         return "NVPTXISD::Suld2DI16Clamp";
 1034|      0|  case NVPTXISD::Suld2DI32Clamp:         return "NVPTXISD::Suld2DI32Clamp";
 1035|      0|  case NVPTXISD::Suld2DI64Clamp:         return "NVPTXISD::Suld2DI64Clamp";
 1036|      0|  case NVPTXISD::Suld2DV2I8Clamp:        return "NVPTXISD::Suld2DV2I8Clamp";
 1037|      0|  case NVPTXISD::Suld2DV2I16Clamp:       return "NVPTXISD::Suld2DV2I16Clamp";
 1038|      0|  case NVPTXISD::Suld2DV2I32Clamp:       return "NVPTXISD::Suld2DV2I32Clamp";
 1039|      0|  case NVPTXISD::Suld2DV2I64Clamp:       return "NVPTXISD::Suld2DV2I64Clamp";
 1040|      0|  case NVPTXISD::Suld2DV4I8Clamp:        return "NVPTXISD::Suld2DV4I8Clamp";
 1041|      0|  case NVPTXISD::Suld2DV4I16Clamp:       return "NVPTXISD::Suld2DV4I16Clamp";
 1042|      0|  case NVPTXISD::Suld2DV4I32Clamp:       return "NVPTXISD::Suld2DV4I32Clamp";
 1043|      0|
 1044|      0|  case NVPTXISD::Suld2DArrayI8Clamp:   return "NVPTXISD::Suld2DArrayI8Clamp";
 1045|      0|  case NVPTXISD::Suld2DArrayI16Clamp:  return "NVPTXISD::Suld2DArrayI16Clamp";
 1046|      0|  case NVPTXISD::Suld2DArrayI32Clamp:  return "NVPTXISD::Suld2DArrayI32Clamp";
 1047|      0|  case NVPTXISD::Suld2DArrayI64Clamp:  return "NVPTXISD::Suld2DArrayI64Clamp";
 1048|      0|  case NVPTXISD::Suld2DArrayV2I8Clamp: return "NVPTXISD::Suld2DArrayV2I8Clamp";
 1049|      0|  case NVPTXISD::Suld2DArrayV2I16Clamp:return "NVPTXISD::Suld2DArrayV2I16Clamp";
 1050|      0|  case NVPTXISD::Suld2DArrayV2I32Clamp:return "NVPTXISD::Suld2DArrayV2I32Clamp";
 1051|      0|  case NVPTXISD::Suld2DArrayV2I64Clamp:return "NVPTXISD::Suld2DArrayV2I64Clamp";
 1052|      0|  case NVPTXISD::Suld2DArrayV4I8Clamp: return "NVPTXISD::Suld2DArrayV4I8Clamp";
 1053|      0|  case NVPTXISD::Suld2DArrayV4I16Clamp:return "NVPTXISD::Suld2DArrayV4I16Clamp";
 1054|      0|  case NVPTXISD::Suld2DArrayV4I32Clamp:return "NVPTXISD::Suld2DArrayV4I32Clamp";
 1055|      0|
 1056|      0|  case NVPTXISD::Suld3DI8Clamp:          return "NVPTXISD::Suld3DI8Clamp";
 1057|      0|  case NVPTXISD::Suld3DI16Clamp:         return "NVPTXISD::Suld3DI16Clamp";
 1058|      0|  case NVPTXISD::Suld3DI32Clamp:         return "NVPTXISD::Suld3DI32Clamp";
 1059|      0|  case NVPTXISD::Suld3DI64Clamp:         return "NVPTXISD::Suld3DI64Clamp";
 1060|      0|  case NVPTXISD::Suld3DV2I8Clamp:        return "NVPTXISD::Suld3DV2I8Clamp";
 1061|      0|  case NVPTXISD::Suld3DV2I16Clamp:       return "NVPTXISD::Suld3DV2I16Clamp";
 1062|      0|  case NVPTXISD::Suld3DV2I32Clamp:       return "NVPTXISD::Suld3DV2I32Clamp";
 1063|      0|  case NVPTXISD::Suld3DV2I64Clamp:       return "NVPTXISD::Suld3DV2I64Clamp";
 1064|      0|  case NVPTXISD::Suld3DV4I8Clamp:        return "NVPTXISD::Suld3DV4I8Clamp";
 1065|      0|  case NVPTXISD::Suld3DV4I16Clamp:       return "NVPTXISD::Suld3DV4I16Clamp";
 1066|      0|  case NVPTXISD::Suld3DV4I32Clamp:       return "NVPTXISD::Suld3DV4I32Clamp";
 1067|      0|
 1068|      0|  case NVPTXISD::Suld1DI8Trap:          return "NVPTXISD::Suld1DI8Trap";
 1069|      0|  case NVPTXISD::Suld1DI16Trap:         return "NVPTXISD::Suld1DI16Trap";
 1070|      0|  case NVPTXISD::Suld1DI32Trap:         return "NVPTXISD::Suld1DI32Trap";
 1071|      0|  case NVPTXISD::Suld1DI64Trap:         return "NVPTXISD::Suld1DI64Trap";
 1072|      0|  case NVPTXISD::Suld1DV2I8Trap:        return "NVPTXISD::Suld1DV2I8Trap";
 1073|      0|  case NVPTXISD::Suld1DV2I16Trap:       return "NVPTXISD::Suld1DV2I16Trap";
 1074|      0|  case NVPTXISD::Suld1DV2I32Trap:       return "NVPTXISD::Suld1DV2I32Trap";
 1075|      0|  case NVPTXISD::Suld1DV2I64Trap:       return "NVPTXISD::Suld1DV2I64Trap";
 1076|      0|  case NVPTXISD::Suld1DV4I8Trap:        return "NVPTXISD::Suld1DV4I8Trap";
 1077|      0|  case NVPTXISD::Suld1DV4I16Trap:       return "NVPTXISD::Suld1DV4I16Trap";
 1078|      0|  case NVPTXISD::Suld1DV4I32Trap:       return "NVPTXISD::Suld1DV4I32Trap";
 1079|      0|
 1080|      0|  case NVPTXISD::Suld1DArrayI8Trap:     return "NVPTXISD::Suld1DArrayI8Trap";
 1081|      0|  case NVPTXISD::Suld1DArrayI16Trap:    return "NVPTXISD::Suld1DArrayI16Trap";
 1082|      0|  case NVPTXISD::Suld1DArrayI32Trap:    return "NVPTXISD::Suld1DArrayI32Trap";
 1083|      0|  case NVPTXISD::Suld1DArrayI64Trap:    return "NVPTXISD::Suld1DArrayI64Trap";
 1084|      0|  case NVPTXISD::Suld1DArrayV2I8Trap:   return "NVPTXISD::Suld1DArrayV2I8Trap";
 1085|      0|  case NVPTXISD::Suld1DArrayV2I16Trap:  return "NVPTXISD::Suld1DArrayV2I16Trap";
 1086|      0|  case NVPTXISD::Suld1DArrayV2I32Trap:  return "NVPTXISD::Suld1DArrayV2I32Trap";
 1087|      0|  case NVPTXISD::Suld1DArrayV2I64Trap:  return "NVPTXISD::Suld1DArrayV2I64Trap";
 1088|      0|  case NVPTXISD::Suld1DArrayV4I8Trap:   return "NVPTXISD::Suld1DArrayV4I8Trap";
 1089|      0|  case NVPTXISD::Suld1DArrayV4I16Trap:  return "NVPTXISD::Suld1DArrayV4I16Trap";
 1090|      0|  case NVPTXISD::Suld1DArrayV4I32Trap:  return "NVPTXISD::Suld1DArrayV4I32Trap";
 1091|      0|
 1092|      0|  case NVPTXISD::Suld2DI8Trap:          return "NVPTXISD::Suld2DI8Trap";
 1093|      0|  case NVPTXISD::Suld2DI16Trap:         return "NVPTXISD::Suld2DI16Trap";
 1094|      0|  case NVPTXISD::Suld2DI32Trap:         return "NVPTXISD::Suld2DI32Trap";
 1095|      0|  case NVPTXISD::Suld2DI64Trap:         return "NVPTXISD::Suld2DI64Trap";
 1096|      0|  case NVPTXISD::Suld2DV2I8Trap:        return "NVPTXISD::Suld2DV2I8Trap";
 1097|      0|  case NVPTXISD::Suld2DV2I16Trap:       return "NVPTXISD::Suld2DV2I16Trap";
 1098|      0|  case NVPTXISD::Suld2DV2I32Trap:       return "NVPTXISD::Suld2DV2I32Trap";
 1099|      0|  case NVPTXISD::Suld2DV2I64Trap:       return "NVPTXISD::Suld2DV2I64Trap";
 1100|      0|  case NVPTXISD::Suld2DV4I8Trap:        return "NVPTXISD::Suld2DV4I8Trap";
 1101|      0|  case NVPTXISD::Suld2DV4I16Trap:       return "NVPTXISD::Suld2DV4I16Trap";
 1102|      0|  case NVPTXISD::Suld2DV4I32Trap:       return "NVPTXISD::Suld2DV4I32Trap";
 1103|      0|
 1104|      0|  case NVPTXISD::Suld2DArrayI8Trap:     return "NVPTXISD::Suld2DArrayI8Trap";
 1105|      0|  case NVPTXISD::Suld2DArrayI16Trap:    return "NVPTXISD::Suld2DArrayI16Trap";
 1106|      0|  case NVPTXISD::Suld2DArrayI32Trap:    return "NVPTXISD::Suld2DArrayI32Trap";
 1107|      0|  case NVPTXISD::Suld2DArrayI64Trap:    return "NVPTXISD::Suld2DArrayI64Trap";
 1108|      0|  case NVPTXISD::Suld2DArrayV2I8Trap:   return "NVPTXISD::Suld2DArrayV2I8Trap";
 1109|      0|  case NVPTXISD::Suld2DArrayV2I16Trap:  return "NVPTXISD::Suld2DArrayV2I16Trap";
 1110|      0|  case NVPTXISD::Suld2DArrayV2I32Trap:  return "NVPTXISD::Suld2DArrayV2I32Trap";
 1111|      0|  case NVPTXISD::Suld2DArrayV2I64Trap:  return "NVPTXISD::Suld2DArrayV2I64Trap";
 1112|      0|  case NVPTXISD::Suld2DArrayV4I8Trap:   return "NVPTXISD::Suld2DArrayV4I8Trap";
 1113|      0|  case NVPTXISD::Suld2DArrayV4I16Trap:  return "NVPTXISD::Suld2DArrayV4I16Trap";
 1114|      0|  case NVPTXISD::Suld2DArrayV4I32Trap:  return "NVPTXISD::Suld2DArrayV4I32Trap";
 1115|      0|
 1116|      0|  case NVPTXISD::Suld3DI8Trap:          return "NVPTXISD::Suld3DI8Trap";
 1117|      0|  case NVPTXISD::Suld3DI16Trap:         return "NVPTXISD::Suld3DI16Trap";
 1118|      0|  case NVPTXISD::Suld3DI32Trap:         return "NVPTXISD::Suld3DI32Trap";
 1119|      0|  case NVPTXISD::Suld3DI64Trap:         return "NVPTXISD::Suld3DI64Trap";
 1120|      0|  case NVPTXISD::Suld3DV2I8Trap:        return "NVPTXISD::Suld3DV2I8Trap";
 1121|      0|  case NVPTXISD::Suld3DV2I16Trap:       return "NVPTXISD::Suld3DV2I16Trap";
 1122|      0|  case NVPTXISD::Suld3DV2I32Trap:       return "NVPTXISD::Suld3DV2I32Trap";
 1123|      0|  case NVPTXISD::Suld3DV2I64Trap:       return "NVPTXISD::Suld3DV2I64Trap";
 1124|      0|  case NVPTXISD::Suld3DV4I8Trap:        return "NVPTXISD::Suld3DV4I8Trap";
 1125|      0|  case NVPTXISD::Suld3DV4I16Trap:       return "NVPTXISD::Suld3DV4I16Trap";
 1126|      0|  case NVPTXISD::Suld3DV4I32Trap:       return "NVPTXISD::Suld3DV4I32Trap";
 1127|      0|
 1128|      0|  case NVPTXISD::Suld1DI8Zero:          return "NVPTXISD::Suld1DI8Zero";
 1129|      0|  case NVPTXISD::Suld1DI16Zero:         return "NVPTXISD::Suld1DI16Zero";
 1130|      0|  case NVPTXISD::Suld1DI32Zero:         return "NVPTXISD::Suld1DI32Zero";
 1131|      0|  case NVPTXISD::Suld1DI64Zero:         return "NVPTXISD::Suld1DI64Zero";
 1132|      0|  case NVPTXISD::Suld1DV2I8Zero:        return "NVPTXISD::Suld1DV2I8Zero";
 1133|      0|  case NVPTXISD::Suld1DV2I16Zero:       return "NVPTXISD::Suld1DV2I16Zero";
 1134|      0|  case NVPTXISD::Suld1DV2I32Zero:       return "NVPTXISD::Suld1DV2I32Zero";
 1135|      0|  case NVPTXISD::Suld1DV2I64Zero:       return "NVPTXISD::Suld1DV2I64Zero";
 1136|      0|  case NVPTXISD::Suld1DV4I8Zero:        return "NVPTXISD::Suld1DV4I8Zero";
 1137|      0|  case NVPTXISD::Suld1DV4I16Zero:       return "NVPTXISD::Suld1DV4I16Zero";
 1138|      0|  case NVPTXISD::Suld1DV4I32Zero:       return "NVPTXISD::Suld1DV4I32Zero";
 1139|      0|
 1140|      0|  case NVPTXISD::Suld1DArrayI8Zero:     return "NVPTXISD::Suld1DArrayI8Zero";
 1141|      0|  case NVPTXISD::Suld1DArrayI16Zero:    return "NVPTXISD::Suld1DArrayI16Zero";
 1142|      0|  case NVPTXISD::Suld1DArrayI32Zero:    return "NVPTXISD::Suld1DArrayI32Zero";
 1143|      0|  case NVPTXISD::Suld1DArrayI64Zero:    return "NVPTXISD::Suld1DArrayI64Zero";
 1144|      0|  case NVPTXISD::Suld1DArrayV2I8Zero:   return "NVPTXISD::Suld1DArrayV2I8Zero";
 1145|      0|  case NVPTXISD::Suld1DArrayV2I16Zero:  return "NVPTXISD::Suld1DArrayV2I16Zero";
 1146|      0|  case NVPTXISD::Suld1DArrayV2I32Zero:  return "NVPTXISD::Suld1DArrayV2I32Zero";
 1147|      0|  case NVPTXISD::Suld1DArrayV2I64Zero:  return "NVPTXISD::Suld1DArrayV2I64Zero";
 1148|      0|  case NVPTXISD::Suld1DArrayV4I8Zero:   return "NVPTXISD::Suld1DArrayV4I8Zero";
 1149|      0|  case NVPTXISD::Suld1DArrayV4I16Zero:  return "NVPTXISD::Suld1DArrayV4I16Zero";
 1150|      0|  case NVPTXISD::Suld1DArrayV4I32Zero:  return "NVPTXISD::Suld1DArrayV4I32Zero";
 1151|      0|
 1152|      0|  case NVPTXISD::Suld2DI8Zero:          return "NVPTXISD::Suld2DI8Zero";
 1153|      0|  case NVPTXISD::Suld2DI16Zero:         return "NVPTXISD::Suld2DI16Zero";
 1154|      0|  case NVPTXISD::Suld2DI32Zero:         return "NVPTXISD::Suld2DI32Zero";
 1155|      0|  case NVPTXISD::Suld2DI64Zero:         return "NVPTXISD::Suld2DI64Zero";
 1156|      0|  case NVPTXISD::Suld2DV2I8Zero:        return "NVPTXISD::Suld2DV2I8Zero";
 1157|      0|  case NVPTXISD::Suld2DV2I16Zero:       return "NVPTXISD::Suld2DV2I16Zero";
 1158|      0|  case NVPTXISD::Suld2DV2I32Zero:       return "NVPTXISD::Suld2DV2I32Zero";
 1159|      0|  case NVPTXISD::Suld2DV2I64Zero:       return "NVPTXISD::Suld2DV2I64Zero";
 1160|      0|  case NVPTXISD::Suld2DV4I8Zero:        return "NVPTXISD::Suld2DV4I8Zero";
 1161|      0|  case NVPTXISD::Suld2DV4I16Zero:       return "NVPTXISD::Suld2DV4I16Zero";
 1162|      0|  case NVPTXISD::Suld2DV4I32Zero:       return "NVPTXISD::Suld2DV4I32Zero";
 1163|      0|
 1164|      0|  case NVPTXISD::Suld2DArrayI8Zero:     return "NVPTXISD::Suld2DArrayI8Zero";
 1165|      0|  case NVPTXISD::Suld2DArrayI16Zero:    return "NVPTXISD::Suld2DArrayI16Zero";
 1166|      0|  case NVPTXISD::Suld2DArrayI32Zero:    return "NVPTXISD::Suld2DArrayI32Zero";
 1167|      0|  case NVPTXISD::Suld2DArrayI64Zero:    return "NVPTXISD::Suld2DArrayI64Zero";
 1168|      0|  case NVPTXISD::Suld2DArrayV2I8Zero:   return "NVPTXISD::Suld2DArrayV2I8Zero";
 1169|      0|  case NVPTXISD::Suld2DArrayV2I16Zero:  return "NVPTXISD::Suld2DArrayV2I16Zero";
 1170|      0|  case NVPTXISD::Suld2DArrayV2I32Zero:  return "NVPTXISD::Suld2DArrayV2I32Zero";
 1171|      0|  case NVPTXISD::Suld2DArrayV2I64Zero:  return "NVPTXISD::Suld2DArrayV2I64Zero";
 1172|      0|  case NVPTXISD::Suld2DArrayV4I8Zero:   return "NVPTXISD::Suld2DArrayV4I8Zero";
 1173|      0|  case NVPTXISD::Suld2DArrayV4I16Zero:  return "NVPTXISD::Suld2DArrayV4I16Zero";
 1174|      0|  case NVPTXISD::Suld2DArrayV4I32Zero:  return "NVPTXISD::Suld2DArrayV4I32Zero";
 1175|      0|
 1176|      0|  case NVPTXISD::Suld3DI8Zero:          return "NVPTXISD::Suld3DI8Zero";
 1177|      0|  case NVPTXISD::Suld3DI16Zero:         return "NVPTXISD::Suld3DI16Zero";
1178
0
  case NVPTXISD::Suld3DI32Zero:         return "NVPTXISD::Suld3DI32Zero";
1179
0
  case NVPTXISD::Suld3DI64Zero:         return "NVPTXISD::Suld3DI64Zero";
1180
0
  case NVPTXISD::Suld3DV2I8Zero:        return "NVPTXISD::Suld3DV2I8Zero";
1181
0
  case NVPTXISD::Suld3DV2I16Zero:       return "NVPTXISD::Suld3DV2I16Zero";
1182
0
  case NVPTXISD::Suld3DV2I32Zero:       return "NVPTXISD::Suld3DV2I32Zero";
1183
0
  case NVPTXISD::Suld3DV2I64Zero:       return "NVPTXISD::Suld3DV2I64Zero";
1184
0
  case NVPTXISD::Suld3DV4I8Zero:        return "NVPTXISD::Suld3DV4I8Zero";
1185
0
  case NVPTXISD::Suld3DV4I16Zero:       return "NVPTXISD::Suld3DV4I16Zero";
1186
0
  case NVPTXISD::Suld3DV4I32Zero:       return "NVPTXISD::Suld3DV4I32Zero";
1187
0
  }
1188
0
  return nullptr;
1189
0
}

TargetLoweringBase::LegalizeTypeAction
NVPTXTargetLowering::getPreferredVectorAction(MVT VT) const {
  if (VT.getVectorNumElements() != 1 && VT.getScalarType() == MVT::i1)
    return TypeSplitVector;
  if (VT == MVT::v2f16)
    return TypeLegal;
  return TargetLoweringBase::getPreferredVectorAction(VT);
}

SDValue NVPTXTargetLowering::getSqrtEstimate(SDValue Operand, SelectionDAG &DAG,
                                             int Enabled, int &ExtraSteps,
                                             bool &UseOneConst,
                                             bool Reciprocal) const {
  if (!(Enabled == ReciprocalEstimate::Enabled ||
        (Enabled == ReciprocalEstimate::Unspecified && !usePrecSqrtF32())))
    return SDValue();

  if (ExtraSteps == ReciprocalEstimate::Unspecified)
    ExtraSteps = 0;

  SDLoc DL(Operand);
  EVT VT = Operand.getValueType();
  bool Ftz = useF32FTZ(DAG.getMachineFunction());

  auto MakeIntrinsicCall = [&](Intrinsic::ID IID) {
    return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
                       DAG.getConstant(IID, DL, MVT::i32), Operand);
  };

  // The sqrt and rsqrt refinement processes assume we always start out with an
  // approximation of the rsqrt.  Therefore, if we're going to do any refinement
  // (i.e. ExtraSteps > 0), we must return an rsqrt.  But if we're *not* doing
  // any refinement, we must return a regular sqrt.
  if (Reciprocal || ExtraSteps > 0) {
    if (VT == MVT::f32)
      return MakeIntrinsicCall(Ftz ? Intrinsic::nvvm_rsqrt_approx_ftz_f
                                   : Intrinsic::nvvm_rsqrt_approx_f);
    else if (VT == MVT::f64)
      return MakeIntrinsicCall(Intrinsic::nvvm_rsqrt_approx_d);
    else
      return SDValue();
  } else {
    if (VT == MVT::f32)
      return MakeIntrinsicCall(Ftz ? Intrinsic::nvvm_sqrt_approx_ftz_f
                                   : Intrinsic::nvvm_sqrt_approx_f);
    else {
      // There's no sqrt.approx.f64 instruction, so we emit
      // reciprocal(rsqrt(x)).  This is faster than
      // select(x == 0, 0, x * rsqrt(x)).  (In fact, it's faster than plain
      // x * rsqrt(x).)
      return DAG.getNode(
          ISD::INTRINSIC_WO_CHAIN, DL, VT,
          DAG.getConstant(Intrinsic::nvvm_rcp_approx_ftz_d, DL, MVT::i32),
          MakeIntrinsicCall(Intrinsic::nvvm_rsqrt_approx_d));
    }
  }
}
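
// --- Illustration (not part of the source above) ---------------------------
// The refinement contract in getSqrtEstimate is easier to see in scalar form.
// A minimal standalone sketch, assuming the classic Newton-Raphson step
// R' = R * (1.5 - 0.5 * X * R * R): refinement composes naturally on an rsqrt
// value, which is why the hook must hand back an rsqrt whenever extra steps
// were requested.
#include <cmath>
#include <cstdio>

static float refineRsqrt(float X, float R) {
  // One Newton-Raphson iteration toward 1/sqrt(X).
  return R * (1.5f - 0.5f * X * R * R);
}

int main() {
  float R = refineRsqrt(2.0f, 0.7f); // 0.7 is a crude seed for 1/sqrt(2)
  std::printf("refined: %f  exact: %f\n", R, 1.0 / std::sqrt(2.0));
  return 0;
}
// ----------------------------------------------------------------------------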

SDValue
NVPTXTargetLowering::LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const {
  SDLoc dl(Op);
  const GlobalAddressSDNode *GAN = cast<GlobalAddressSDNode>(Op);
  auto PtrVT = getPointerTy(DAG.getDataLayout(), GAN->getAddressSpace());
  Op = DAG.getTargetGlobalAddress(GAN->getGlobal(), dl, PtrVT);
  return DAG.getNode(NVPTXISD::Wrapper, dl, PtrVT, Op);
}

std::string NVPTXTargetLowering::getPrototype(
    const DataLayout &DL, Type *retTy, const ArgListTy &Args,
    const SmallVectorImpl<ISD::OutputArg> &Outs, unsigned retAlignment,
    ImmutableCallSite CS) const {
  auto PtrVT = getPointerTy(DL);

  bool isABI = (STI.getSmVersion() >= 20);
  assert(isABI && "Non-ABI compilation is not supported");
  if (!isABI)
    return "";

  std::stringstream O;
  O << "prototype_" << uniqueCallSite << " : .callprototype ";

  if (retTy->getTypeID() == Type::VoidTyID) {
    O << "()";
  } else {
    O << "(";
    if (retTy->isFloatingPointTy() ||
        (retTy->isIntegerTy() && !retTy->isIntegerTy(128))) {
      unsigned size = 0;
      if (auto *ITy = dyn_cast<IntegerType>(retTy)) {
        size = ITy->getBitWidth();
      } else {
        assert(retTy->isFloatingPointTy() &&
               "Floating point type expected here");
        size = retTy->getPrimitiveSizeInBits();
      }
      // PTX ABI requires all scalar return values to be at least 32
      // bits in size.  fp16 normally uses .b16 as its storage type in
      // PTX, so its size must be adjusted here, too.
      if (size < 32)
        size = 32;

      O << ".param .b" << size << " _";
    } else if (isa<PointerType>(retTy)) {
      O << ".param .b" << PtrVT.getSizeInBits() << " _";
    } else if (retTy->isAggregateType() || retTy->isVectorTy() ||
               retTy->isIntegerTy(128)) {
      auto &DL = CS.getCalledFunction()->getParent()->getDataLayout();
      O << ".param .align " << retAlignment << " .b8 _["
        << DL.getTypeAllocSize(retTy) << "]";
    } else {
      llvm_unreachable("Unknown return type");
    }
    O << ") ";
  }
  O << "_ (";

  bool first = true;

  unsigned OIdx = 0;
  for (unsigned i = 0, e = Args.size(); i != e; ++i, ++OIdx) {
    Type *Ty = Args[i].Ty;
    if (!first) {
      O << ", ";
    }
    first = false;

    if (!Outs[OIdx].Flags.isByVal()) {
      if (Ty->isAggregateType() || Ty->isVectorTy() || Ty->isIntegerTy(128)) {
        unsigned align = 0;
        const CallInst *CallI = cast<CallInst>(CS.getInstruction());
        // +1 because index 0 is reserved for return type alignment
        if (!getAlign(*CallI, i + 1, align))
          align = DL.getABITypeAlignment(Ty);
        unsigned sz = DL.getTypeAllocSize(Ty);
        O << ".param .align " << align << " .b8 ";
        O << "_";
        O << "[" << sz << "]";
        // update the index for Outs
        SmallVector<EVT, 16> vtparts;
        ComputeValueVTs(*this, DL, Ty, vtparts);
        if (unsigned len = vtparts.size())
          OIdx += len - 1;
        continue;
      }
      // i8 types in IR will be i16 types in SDAG
      assert((getValueType(DL, Ty) == Outs[OIdx].VT ||
              (getValueType(DL, Ty) == MVT::i8 && Outs[OIdx].VT == MVT::i16)) &&
             "type mismatch between callee prototype and arguments");
      // scalar type
      unsigned sz = 0;
      if (isa<IntegerType>(Ty)) {
        sz = cast<IntegerType>(Ty)->getBitWidth();
        if (sz < 32)
          sz = 32;
      } else if (isa<PointerType>(Ty)) {
        sz = PtrVT.getSizeInBits();
      } else if (Ty->isHalfTy())
        // PTX ABI requires all scalar parameters to be at least 32
        // bits in size.  fp16 normally uses .b16 as its storage type
        // in PTX, so its size must be adjusted here, too.
        sz = 32;
      else
        sz = Ty->getPrimitiveSizeInBits();
      O << ".param .b" << sz << " ";
      O << "_";
      continue;
    }
    auto *PTy = dyn_cast<PointerType>(Ty);
    assert(PTy && "Param with byval attribute should be a pointer type");
    Type *ETy = PTy->getElementType();

    unsigned align = Outs[OIdx].Flags.getByValAlign();
    unsigned sz = DL.getTypeAllocSize(ETy);
    O << ".param .align " << align << " .b8 ";
    O << "_";
    O << "[" << sz << "]";
  }
  O << ");";
  return O.str();
}
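
// --- Illustration (not part of the source above) ---------------------------
// A compilable sketch of the kind of string getPrototype assembles, for a
// hypothetical call returning a 32-bit scalar and taking one 32-bit scalar
// parameter. The stream inserts mirror the ones above; the exact text of a
// real prototype depends on the types, sizes, and alignments involved.
#include <iostream>
#include <sstream>

int main() {
  unsigned UniqueCallSite = 0; // stand-in for the module-level counter
  std::stringstream O;
  O << "prototype_" << UniqueCallSite << " : .callprototype ";
  O << "(.param .b32 _) "; // 32-bit scalar return value
  O << "_ (";
  O << ".param .b32 _";    // one 32-bit scalar parameter
  O << ");";
  // Prints: prototype_0 : .callprototype (.param .b32 _) _ (.param .b32 _);
  std::cout << O.str() << '\n';
  return 0;
}
// ----------------------------------------------------------------------------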

unsigned NVPTXTargetLowering::getArgumentAlignment(SDValue Callee,
                                                   ImmutableCallSite CS,
                                                   Type *Ty, unsigned Idx,
                                                   const DataLayout &DL) const {
  if (!CS) {
    // CallSite is zero, fallback to ABI type alignment
    return DL.getABITypeAlignment(Ty);
  }

  unsigned Align = 0;
  const Value *DirectCallee = CS.getCalledFunction();

  if (!DirectCallee) {
    // We don't have a direct function symbol, but that may be because of
    // constant cast instructions in the call.
    const Instruction *CalleeI = CS.getInstruction();
    assert(CalleeI && "Call target is not a function or derived value?");

    // With bitcast'd call targets, the instruction will be the call
    if (isa<CallInst>(CalleeI)) {
      // Check if we have call alignment metadata
      if (getAlign(*cast<CallInst>(CalleeI), Idx, Align))
        return Align;

      const Value *CalleeV = cast<CallInst>(CalleeI)->getCalledValue();
      // Ignore any bitcast instructions
      while (isa<ConstantExpr>(CalleeV)) {
        const ConstantExpr *CE = cast<ConstantExpr>(CalleeV);
        if (!CE->isCast())
          break;
        // Look through the bitcast
        CalleeV = cast<ConstantExpr>(CalleeV)->getOperand(0);
      }

      // We have now looked past all of the bitcasts.  Do we finally have a
      // Function?
      if (isa<Function>(CalleeV))
        DirectCallee = CalleeV;
    }
  }

  // Check for function alignment information if we found that the
  // ultimate target is a Function
  if (DirectCallee)
    if (getAlign(*cast<Function>(DirectCallee), Idx, Align))
      return Align;

  // Call is indirect or alignment information is not available, fall back to
  // the ABI type alignment
  return DL.getABITypeAlignment(Ty);
}

SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
                                       SmallVectorImpl<SDValue> &InVals) const {
  SelectionDAG &DAG = CLI.DAG;
  SDLoc dl = CLI.DL;
  SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs;
  SmallVectorImpl<SDValue> &OutVals = CLI.OutVals;
  SmallVectorImpl<ISD::InputArg> &Ins = CLI.Ins;
  SDValue Chain = CLI.Chain;
  SDValue Callee = CLI.Callee;
  bool &isTailCall = CLI.IsTailCall;
  ArgListTy &Args = CLI.getArgs();
  Type *RetTy = CLI.RetTy;
  ImmutableCallSite CS = CLI.CS;
  const DataLayout &DL = DAG.getDataLayout();

  bool isABI = (STI.getSmVersion() >= 20);
  assert(isABI && "Non-ABI compilation is not supported");
  if (!isABI)
    return Chain;

  SDValue tempChain = Chain;
  Chain = DAG.getCALLSEQ_START(Chain, uniqueCallSite, 0, dl);
  SDValue InFlag = Chain.getValue(1);

  unsigned paramCount = 0;
  // Args.size() and Outs.size() need not match.
  // Outs.size() will be larger
  //   * if there is an aggregate argument with multiple fields (each field
  //     showing up separately in Outs)
  //   * if there is a vector argument with more than typical vector-length
  //     elements (generally if more than 4) where each vector element is
  //     individually present in Outs.
  // So a different index should be used for indexing into Outs/OutVals.
  // See similar issue in LowerFormalArguments.
  unsigned OIdx = 0;
  // Declare the .params or .reg need to pass values
  // to the function
  for (unsigned i = 0, e = Args.size(); i != e; ++i, ++OIdx) {
    EVT VT = Outs[OIdx].VT;
    Type *Ty = Args[i].Ty;

    if (!Outs[OIdx].Flags.isByVal()) {
      SmallVector<EVT, 16> VTs;
      SmallVector<uint64_t, 16> Offsets;
      ComputePTXValueVTs(*this, DL, Ty, VTs, &Offsets);
      unsigned ArgAlign =
          getArgumentAlignment(Callee, CS, Ty, paramCount + 1, DL);
      unsigned AllocSize = DL.getTypeAllocSize(Ty);
      SDVTList DeclareParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);
      bool NeedAlign; // Does argument declaration specify alignment?
      if (Ty->isAggregateType() || Ty->isVectorTy() || Ty->isIntegerTy(128)) {
        // declare .param .align <align> .b8 .param<n>[<size>];
        SDValue DeclareParamOps[] = {
            Chain, DAG.getConstant(ArgAlign, dl, MVT::i32),
            DAG.getConstant(paramCount, dl, MVT::i32),
            DAG.getConstant(AllocSize, dl, MVT::i32), InFlag};
        Chain = DAG.getNode(NVPTXISD::DeclareParam, dl, DeclareParamVTs,
                            DeclareParamOps);
        NeedAlign = true;
      } else {
        // declare .param .b<size> .param<n>;
        if ((VT.isInteger() || VT.isFloatingPoint()) && AllocSize < 4) {
          // PTX ABI requires integral types to be at least 32 bits in
          // size. FP16 is loaded/stored using i16, so it's handled
          // here as well.
          AllocSize = 4;
        }
        SDValue DeclareScalarParamOps[] = {
            Chain, DAG.getConstant(paramCount, dl, MVT::i32),
            DAG.getConstant(AllocSize * 8, dl, MVT::i32),
            DAG.getConstant(0, dl, MVT::i32), InFlag};
        Chain = DAG.getNode(NVPTXISD::DeclareScalarParam, dl, DeclareParamVTs,
                            DeclareScalarParamOps);
        NeedAlign = false;
      }
      InFlag = Chain.getValue(1);

      // PTX Interoperability Guide 3.3(A): [Integer] Values shorter
      // than 32-bits are sign extended or zero extended, depending on
      // whether they are signed or unsigned types. This case applies
      // only to scalar parameters and not to aggregate values.
      bool ExtendIntegerParam =
          Ty->isIntegerTy() && DL.getTypeAllocSizeInBits(Ty) < 32;

      auto VectorInfo = VectorizePTXValueVTs(VTs, Offsets, ArgAlign);
      SmallVector<SDValue, 6> StoreOperands;
      for (unsigned j = 0, je = VTs.size(); j != je; ++j) {
        // New store.
        if (VectorInfo[j] & PVF_FIRST) {
          assert(StoreOperands.empty() && "Unfinished preceding store.");
          StoreOperands.push_back(Chain);
          StoreOperands.push_back(DAG.getConstant(paramCount, dl, MVT::i32));
          StoreOperands.push_back(DAG.getConstant(Offsets[j], dl, MVT::i32));
        }

        EVT EltVT = VTs[j];
        SDValue StVal = OutVals[OIdx];
        if (ExtendIntegerParam) {
          assert(VTs.size() == 1 && "Scalar can't have multiple parts.");
          // zext/sext to i32
          StVal = DAG.getNode(Outs[OIdx].Flags.isSExt() ? ISD::SIGN_EXTEND
                                                        : ISD::ZERO_EXTEND,
                              dl, MVT::i32, StVal);
        } else if (EltVT.getSizeInBits() < 16) {
          // Use 16-bit registers for small stores as it's the
          // smallest general purpose register size supported by NVPTX.
          StVal = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i16, StVal);
        }

        // Record the value to store.
        StoreOperands.push_back(StVal);

        if (VectorInfo[j] & PVF_LAST) {
          unsigned NumElts = StoreOperands.size() - 3;
          NVPTXISD::NodeType Op;
          switch (NumElts) {
          case 1:
            Op = NVPTXISD::StoreParam;
            break;
          case 2:
            Op = NVPTXISD::StoreParamV2;
            break;
          case 4:
            Op = NVPTXISD::StoreParamV4;
            break;
          default:
            llvm_unreachable("Invalid vector info.");
          }

          StoreOperands.push_back(InFlag);

          // Adjust type of the store op if we've extended the scalar
          // return value.
          EVT TheStoreType = ExtendIntegerParam ? MVT::i32 : VTs[j];
          unsigned EltAlign =
              NeedAlign ? GreatestCommonDivisor64(ArgAlign, Offsets[j]) : 0;

          Chain = DAG.getMemIntrinsicNode(
              Op, dl, DAG.getVTList(MVT::Other, MVT::Glue), StoreOperands,
              TheStoreType, MachinePointerInfo(), EltAlign,
              MachineMemOperand::MOStore);
          InFlag = Chain.getValue(1);

          // Cleanup.
          StoreOperands.clear();
        }
        ++OIdx;
      }
      assert(StoreOperands.empty() && "Unfinished parameter store.");
      if (VTs.size() > 0)
        --OIdx;
      ++paramCount;
      continue;
    }

    // ByVal arguments
    SmallVector<EVT, 16> VTs;
    SmallVector<uint64_t, 16> Offsets;
    auto *PTy = dyn_cast<PointerType>(Args[i].Ty);
    assert(PTy && "Type of a byval parameter should be pointer");
    ComputePTXValueVTs(*this, DL, PTy->getElementType(), VTs, &Offsets, 0);

    // declare .param .align <align> .b8 .param<n>[<size>];
    unsigned sz = Outs[OIdx].Flags.getByValSize();
    SDVTList DeclareParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);
    unsigned ArgAlign = Outs[OIdx].Flags.getByValAlign();
    // The ByValAlign in the Outs[OIdx].Flags is always set at this point,
    // so we don't need to worry about natural alignment or not.
    // See TargetLowering::LowerCallTo().

    // Enforce minimum alignment of 4 to work around ptxas miscompile
    // for sm_50+. See corresponding alignment adjustment in
    // emitFunctionParamList() for details.
    if (ArgAlign < 4)
      ArgAlign = 4;
    SDValue DeclareParamOps[] = {Chain, DAG.getConstant(ArgAlign, dl, MVT::i32),
                                 DAG.getConstant(paramCount, dl, MVT::i32),
                                 DAG.getConstant(sz, dl, MVT::i32), InFlag};
    Chain = DAG.getNode(NVPTXISD::DeclareParam, dl, DeclareParamVTs,
                        DeclareParamOps);
    InFlag = Chain.getValue(1);
    for (unsigned j = 0, je = VTs.size(); j != je; ++j) {
      EVT elemtype = VTs[j];
      int curOffset = Offsets[j];
      unsigned PartAlign = GreatestCommonDivisor64(ArgAlign, curOffset);
      auto PtrVT = getPointerTy(DL);
      SDValue srcAddr = DAG.getNode(ISD::ADD, dl, PtrVT, OutVals[OIdx],
                                    DAG.getConstant(curOffset, dl, PtrVT));
      SDValue theVal = DAG.getLoad(elemtype, dl, tempChain, srcAddr,
                                   MachinePointerInfo(), PartAlign);
      if (elemtype.getSizeInBits() < 16) {
        theVal = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i16, theVal);
      }
      SDVTList CopyParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);
      SDValue CopyParamOps[] = { Chain,
                                 DAG.getConstant(paramCount, dl, MVT::i32),
                                 DAG.getConstant(curOffset, dl, MVT::i32),
                                 theVal, InFlag };
      Chain = DAG.getMemIntrinsicNode(NVPTXISD::StoreParam, dl, CopyParamVTs,
                                      CopyParamOps, elemtype,
                                      MachinePointerInfo(), /* Align */ 0,
                                      MachineMemOperand::MOStore);

      InFlag = Chain.getValue(1);
    }
    ++paramCount;
  }

  GlobalAddressSDNode *Func = dyn_cast<GlobalAddressSDNode>(Callee.getNode());
  unsigned retAlignment = 0;

  // Handle Result
  if (Ins.size() > 0) {
    SmallVector<EVT, 16> resvtparts;
    ComputeValueVTs(*this, DL, RetTy, resvtparts);

    // Declare
    //  .param .align 16 .b8 retval0[<size-in-bytes>], or
    //  .param .b<size-in-bits> retval0
    unsigned resultsz = DL.getTypeAllocSizeInBits(RetTy);
    // Emit ".param .b<size-in-bits> retval0" instead of byte arrays only for
    // these three types to match the logic in
    // NVPTXAsmPrinter::printReturnValStr and NVPTXTargetLowering::getPrototype.
    // Plus, this behavior is consistent with nvcc's.
    if (RetTy->isFloatingPointTy() || RetTy->isPointerTy() ||
        (RetTy->isIntegerTy() && !RetTy->isIntegerTy(128))) {
      // Scalar needs to be at least 32bit wide
      if (resultsz < 32)
        resultsz = 32;
      SDVTList DeclareRetVTs = DAG.getVTList(MVT::Other, MVT::Glue);
      SDValue DeclareRetOps[] = { Chain, DAG.getConstant(1, dl, MVT::i32),
                                  DAG.getConstant(resultsz, dl, MVT::i32),
                                  DAG.getConstant(0, dl, MVT::i32), InFlag };
      Chain = DAG.getNode(NVPTXISD::DeclareRet, dl, DeclareRetVTs,
                          DeclareRetOps);
      InFlag = Chain.getValue(1);
    } else {
      retAlignment = getArgumentAlignment(Callee, CS, RetTy, 0, DL);
      SDVTList DeclareRetVTs = DAG.getVTList(MVT::Other, MVT::Glue);
      SDValue DeclareRetOps[] = { Chain,
                                  DAG.getConstant(retAlignment, dl, MVT::i32),
                                  DAG.getConstant(resultsz / 8, dl, MVT::i32),
                                  DAG.getConstant(0, dl, MVT::i32), InFlag };
      Chain = DAG.getNode(NVPTXISD::DeclareRetParam, dl, DeclareRetVTs,
                          DeclareRetOps);
      InFlag = Chain.getValue(1);
    }
  }

  // Both indirect calls and libcalls have nullptr Func. In order to distinguish
  // between them we must rely on the call site value which is valid for
  // indirect calls but is always null for libcalls.
  bool isIndirectCall = !Func && CS;

  if (isa<ExternalSymbolSDNode>(Callee)) {
    Function* CalleeFunc = nullptr;

    // Try to find the callee in the current module.
    Callee = DAG.getSymbolFunctionGlobalAddress(Callee, &CalleeFunc);
    assert(CalleeFunc != nullptr && "Libcall callee must be set.");

    // Set the "libcall callee" attribute to indicate that the function
    // must always have a declaration.
    CalleeFunc->addFnAttr("nvptx-libcall-callee", "true");
  }

  if (isIndirectCall) {
    // This is indirect function call case : PTX requires a prototype of the
    // form
    // proto_0 : .callprototype(.param .b32 _) _ (.param .b32 _);
    // to be emitted, and the label has to be used as the last arg of call
    // instruction.
    // The prototype is embedded in a string and put as the operand for a
    // CallPrototype SDNode which will print out to the value of the string.
    SDVTList ProtoVTs = DAG.getVTList(MVT::Other, MVT::Glue);
    std::string Proto = getPrototype(DL, RetTy, Args, Outs, retAlignment, CS);
    const char *ProtoStr =
      nvTM->getManagedStrPool()->getManagedString(Proto.c_str())->c_str();
    SDValue ProtoOps[] = {
      Chain, DAG.getTargetExternalSymbol(ProtoStr, MVT::i32), InFlag,
    };
    Chain = DAG.getNode(NVPTXISD::CallPrototype, dl, ProtoVTs, ProtoOps);
    InFlag = Chain.getValue(1);
  }
  // Op to just print "call"
  SDVTList PrintCallVTs = DAG.getVTList(MVT::Other, MVT::Glue);
  SDValue PrintCallOps[] = {
    Chain, DAG.getConstant((Ins.size() == 0) ? 0 : 1, dl, MVT::i32), InFlag
  };
  // We model convergent calls as separate opcodes.
  unsigned Opcode = isIndirectCall ? NVPTXISD::PrintCall : NVPTXISD::PrintCallUni;
  if (CLI.IsConvergent)
    Opcode = Opcode == NVPTXISD::PrintCallUni ? NVPTXISD::PrintConvergentCallUni
                                              : NVPTXISD::PrintConvergentCall;
  Chain = DAG.getNode(Opcode, dl, PrintCallVTs, PrintCallOps);
  InFlag = Chain.getValue(1);

  // Ops to print out the function name
  SDVTList CallVoidVTs = DAG.getVTList(MVT::Other, MVT::Glue);
  SDValue CallVoidOps[] = { Chain, Callee, InFlag };
  Chain = DAG.getNode(NVPTXISD::CallVoid, dl, CallVoidVTs, CallVoidOps);
  InFlag = Chain.getValue(1);

  // Ops to print out the param list
  SDVTList CallArgBeginVTs = DAG.getVTList(MVT::Other, MVT::Glue);
  SDValue CallArgBeginOps[] = { Chain, InFlag };
  Chain = DAG.getNode(NVPTXISD::CallArgBegin, dl, CallArgBeginVTs,
                      CallArgBeginOps);
  InFlag = Chain.getValue(1);

  for (unsigned i = 0, e = paramCount; i != e; ++i) {
    unsigned opcode;
    if (i == (e - 1))
      opcode = NVPTXISD::LastCallArg;
    else
      opcode = NVPTXISD::CallArg;
    SDVTList CallArgVTs = DAG.getVTList(MVT::Other, MVT::Glue);
    SDValue CallArgOps[] = { Chain, DAG.getConstant(1, dl, MVT::i32),
                             DAG.getConstant(i, dl, MVT::i32), InFlag };
    Chain = DAG.getNode(opcode, dl, CallArgVTs, CallArgOps);
    InFlag = Chain.getValue(1);
  }
  SDVTList CallArgEndVTs = DAG.getVTList(MVT::Other, MVT::Glue);
  SDValue CallArgEndOps[] = { Chain,
                              DAG.getConstant(isIndirectCall ? 0 : 1, dl, MVT::i32),
                              InFlag };
  Chain = DAG.getNode(NVPTXISD::CallArgEnd, dl, CallArgEndVTs, CallArgEndOps);
  InFlag = Chain.getValue(1);

  if (isIndirectCall) {
    SDVTList PrototypeVTs = DAG.getVTList(MVT::Other, MVT::Glue);
    SDValue PrototypeOps[] = { Chain,
                               DAG.getConstant(uniqueCallSite, dl, MVT::i32),
                               InFlag };
    Chain = DAG.getNode(NVPTXISD::Prototype, dl, PrototypeVTs, PrototypeOps);
    InFlag = Chain.getValue(1);
  }

  SmallVector<SDValue, 16> ProxyRegOps;
  SmallVector<Optional<MVT>, 16> ProxyRegTruncates;

  // Generate loads from param memory/moves from registers for result
  if (Ins.size() > 0) {
    SmallVector<EVT, 16> VTs;
    SmallVector<uint64_t, 16> Offsets;
    ComputePTXValueVTs(*this, DL, RetTy, VTs, &Offsets, 0);
    assert(VTs.size() == Ins.size() && "Bad value decomposition");

    unsigned RetAlign = getArgumentAlignment(Callee, CS, RetTy, 0, DL);
    auto VectorInfo = VectorizePTXValueVTs(VTs, Offsets, RetAlign);

    SmallVector<EVT, 6> LoadVTs;
    int VecIdx = -1; // Index of the first element of the vector.

    // PTX Interoperability Guide 3.3(A): [Integer] Values shorter than
    // 32-bits are sign extended or zero extended, depending on whether
    // they are signed or unsigned types.
    bool ExtendIntegerRetVal =
        RetTy->isIntegerTy() && DL.getTypeAllocSizeInBits(RetTy) < 32;

    for (unsigned i = 0, e = VTs.size(); i != e; ++i) {
      bool needTruncate = false;
      EVT TheLoadType = VTs[i];
      EVT EltType = Ins[i].VT;
      unsigned EltAlign = GreatestCommonDivisor64(RetAlign, Offsets[i]);
      if (ExtendIntegerRetVal) {
        TheLoadType = MVT::i32;
        EltType = MVT::i32;
        needTruncate = true;
      } else if (TheLoadType.getSizeInBits() < 16) {
        if (VTs[i].isInteger())
          needTruncate = true;
        EltType = MVT::i16;
      }

      // Record index of the very first element of the vector.
      if (VectorInfo[i] & PVF_FIRST) {
        assert(VecIdx == -1 && LoadVTs.empty() && "Orphaned operand list.");
        VecIdx = i;
      }

      LoadVTs.push_back(EltType);

      if (VectorInfo[i] & PVF_LAST) {
        unsigned NumElts = LoadVTs.size();
        LoadVTs.push_back(MVT::Other);
        LoadVTs.push_back(MVT::Glue);
        NVPTXISD::NodeType Op;
        switch (NumElts) {
        case 1:
          Op = NVPTXISD::LoadParam;
          break;
        case 2:
          Op = NVPTXISD::LoadParamV2;
          break;
        case 4:
          Op = NVPTXISD::LoadParamV4;
          break;
        default:
          llvm_unreachable("Invalid vector info.");
        }

        SDValue LoadOperands[] = {
            Chain, DAG.getConstant(1, dl, MVT::i32),
            DAG.getConstant(Offsets[VecIdx], dl, MVT::i32), InFlag};
        SDValue RetVal = DAG.getMemIntrinsicNode(
            Op, dl, DAG.getVTList(LoadVTs), LoadOperands, TheLoadType,
            MachinePointerInfo(), EltAlign,
            MachineMemOperand::MOLoad);

        for (unsigned j = 0; j < NumElts; ++j) {
          ProxyRegOps.push_back(RetVal.getValue(j));

          if (needTruncate)
            ProxyRegTruncates.push_back(Optional<MVT>(Ins[VecIdx + j].VT));
          else
            ProxyRegTruncates.push_back(Optional<MVT>());
        }

        Chain = RetVal.getValue(NumElts);
        InFlag = RetVal.getValue(NumElts + 1);

        // Cleanup
        VecIdx = -1;
        LoadVTs.clear();
      }
    }
  }

  Chain = DAG.getCALLSEQ_END(Chain,
                             DAG.getIntPtrConstant(uniqueCallSite, dl, true),
                             DAG.getIntPtrConstant(uniqueCallSite + 1, dl,
                                                   true),
                             InFlag, dl);
  InFlag = Chain.getValue(1);
  uniqueCallSite++;

  // Append ProxyReg instructions to the chain to make sure that `callseq_end`
  // will not get lost. Otherwise, during libcalls expansion, the nodes can become
  // dangling.
  for (unsigned i = 0; i < ProxyRegOps.size(); ++i) {
    SDValue Ret = DAG.getNode(
      NVPTXISD::ProxyReg, dl,
      DAG.getVTList(ProxyRegOps[i].getSimpleValueType(), MVT::Other, MVT::Glue),
      { Chain, ProxyRegOps[i], InFlag }
    );

    Chain = Ret.getValue(1);
    InFlag = Ret.getValue(2);

    if (ProxyRegTruncates[i].hasValue()) {
      Ret = DAG.getNode(ISD::TRUNCATE, dl, ProxyRegTruncates[i].getValue(), Ret);
    }

    InVals.push_back(Ret);
  }

  // set isTailCall to false for now, until we figure out how to express
  // tail call optimization in PTX
  isTailCall = false;
  return Chain;
}
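
// --- Illustration (not part of the source above) ---------------------------
// The PVF_FIRST/PVF_LAST bookkeeping driving both the parameter-store and
// result-load loops above, demonstrated standalone. The flag values here are
// hypothetical stand-ins; the idea is that FIRST opens an operand group and
// LAST flushes it as one 1-, 2-, or 4-element StoreParam/LoadParam.
#include <cstdio>
#include <vector>

enum { PVF_FIRST = 1, PVF_LAST = 2 };

static void flushGroups(const std::vector<unsigned> &Flags) {
  std::vector<unsigned> Pending;
  for (size_t J = 0; J != Flags.size(); ++J) {
    if (Flags[J] & PVF_FIRST)
      Pending.clear();         // start a fresh operand list
    Pending.push_back(unsigned(J));
    if (Flags[J] & PVF_LAST) { // emit one grouped access and reset
      std::printf("access of %u element(s) at index %u\n",
                  unsigned(Pending.size()), Pending.front());
      Pending.clear();
    }
  }
}

int main() {
  // Two standalone scalars followed by one two-element group.
  flushGroups({PVF_FIRST | PVF_LAST, PVF_FIRST | PVF_LAST, PVF_FIRST,
               PVF_LAST});
  return 0;
}
// ----------------------------------------------------------------------------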

// By default CONCAT_VECTORS is lowered by ExpandVectorBuildThroughStack()
// (see LegalizeDAG.cpp). This is slow and uses local memory.
// We use extract/insert/build vector just as LegalizeOp() did in llvm 2.5.
SDValue
NVPTXTargetLowering::LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const {
  SDNode *Node = Op.getNode();
  SDLoc dl(Node);
  SmallVector<SDValue, 8> Ops;
  unsigned NumOperands = Node->getNumOperands();
  for (unsigned i = 0; i < NumOperands; ++i) {
    SDValue SubOp = Node->getOperand(i);
    EVT VVT = SubOp.getNode()->getValueType(0);
    EVT EltVT = VVT.getVectorElementType();
    unsigned NumSubElem = VVT.getVectorNumElements();
    for (unsigned j = 0; j < NumSubElem; ++j) {
      Ops.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, SubOp,
                                DAG.getIntPtrConstant(j, dl)));
    }
  }
  return DAG.getBuildVector(Node->getValueType(0), dl, Ops);
}
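
// --- Illustration (not part of the source above) ---------------------------
// A scalar-typed sketch of the same strategy: every element of each
// sub-vector is extracted and collected into one flat result, so nothing
// round-trips through local memory.
#include <cstdio>
#include <vector>

static std::vector<int>
concatVectors(const std::vector<std::vector<int>> &Subs) {
  std::vector<int> Out;
  for (const auto &Sub : Subs)
    for (int Elt : Sub)   // one "extract" per element
      Out.push_back(Elt); // accumulated into one "build"
  return Out;
}

int main() {
  for (int E : concatVectors({{1, 2}, {3, 4}}))
    std::printf("%d ", E); // prints: 1 2 3 4
  std::printf("\n");
  return 0;
}
// ----------------------------------------------------------------------------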

// We can initialize a constant f16x2 with a single .b32 move.  Normally it
// would get lowered as two constant loads and a vector-packing move.
//        mov.b16         %h1, 0x4000;
//        mov.b16         %h2, 0x3C00;
//        mov.b32         %hh2, {%h2, %h1};
// Instead we want just a constant move:
//        mov.b32         %hh2, 0x40003C00
//
// This results in better SASS code with CUDA 7.x. Ptxas in CUDA 8.0
// generates good SASS in both cases.
SDValue NVPTXTargetLowering::LowerBUILD_VECTOR(SDValue Op,
                                               SelectionDAG &DAG) const {
  if (!(Op->getValueType(0) == MVT::v2f16 &&
        isa<ConstantFPSDNode>(Op->getOperand(0)) &&
        isa<ConstantFPSDNode>(Op->getOperand(1))))
    return Op;

  APInt E0 =
      cast<ConstantFPSDNode>(Op->getOperand(0))->getValueAPF().bitcastToAPInt();
  APInt E1 =
      cast<ConstantFPSDNode>(Op->getOperand(1))->getValueAPF().bitcastToAPInt();
  SDValue Const =
      DAG.getConstant(E1.zext(32).shl(16) | E0.zext(32), SDLoc(Op), MVT::i32);
  return DAG.getNode(ISD::BITCAST, SDLoc(Op), MVT::v2f16, Const);
}
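
// --- Illustration (not part of the source above) ---------------------------
// The packing performed above, on plain integers: two f16 bit patterns fuse
// into one .b32 immediate, low element in the low half. 0x3C00 and 0x4000 are
// 1.0 and 2.0 in IEEE half precision, reproducing the 0x40003C00 immediate
// from the comment.
#include <cstdint>
#include <cstdio>

static uint32_t packF16x2(uint16_t Lo, uint16_t Hi) {
  return (uint32_t(Hi) << 16) | uint32_t(Lo); // E1.zext(32).shl(16) | E0
}

int main() {
  std::printf("0x%08X\n", packF16x2(0x3C00, 0x4000)); // prints 0x40003C00
  return 0;
}
// ----------------------------------------------------------------------------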

SDValue NVPTXTargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
                                                     SelectionDAG &DAG) const {
  SDValue Index = Op->getOperand(1);
  // Constant index will be matched by tablegen.
  if (isa<ConstantSDNode>(Index.getNode()))
    return Op;

  // Extract individual elements and select one of them.
  SDValue Vector = Op->getOperand(0);
  EVT VectorVT = Vector.getValueType();
  assert(VectorVT == MVT::v2f16 && "Unexpected vector type.");
  EVT EltVT = VectorVT.getVectorElementType();

  SDLoc dl(Op.getNode());
  SDValue E0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Vector,
                           DAG.getIntPtrConstant(0, dl));
  SDValue E1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Vector,
                           DAG.getIntPtrConstant(1, dl));
  return DAG.getSelectCC(dl, Index, DAG.getIntPtrConstant(0, dl), E0, E1,
                         ISD::CondCode::SETEQ);
}

/// LowerShiftRightParts - Lower SRL_PARTS, SRA_PARTS, which
/// 1) returns two i32 values and takes a 2 x i32 value to shift plus a shift
///    amount, or
/// 2) returns two i64 values and takes a 2 x i64 value to shift plus a shift
///    amount.
SDValue NVPTXTargetLowering::LowerShiftRightParts(SDValue Op,
                                                  SelectionDAG &DAG) const {
  assert(Op.getNumOperands() == 3 && "Not a double-shift!");
  assert(Op.getOpcode() == ISD::SRA_PARTS || Op.getOpcode() == ISD::SRL_PARTS);

  EVT VT = Op.getValueType();
  unsigned VTBits = VT.getSizeInBits();
  SDLoc dl(Op);
  SDValue ShOpLo = Op.getOperand(0);
  SDValue ShOpHi = Op.getOperand(1);
  SDValue ShAmt  = Op.getOperand(2);
  unsigned Opc = (Op.getOpcode() == ISD::SRA_PARTS) ? ISD::SRA : ISD::SRL;

  if (VTBits == 32 && STI.getSmVersion() >= 35) {
    // For 32bit and sm35, we can use the funnel shift 'shf' instruction.
    // {dHi, dLo} = {aHi, aLo} >> Amt
    //   dHi = aHi >> Amt
    //   dLo = shf.r.clamp aLo, aHi, Amt

    SDValue Hi = DAG.getNode(Opc, dl, VT, ShOpHi, ShAmt);
    SDValue Lo = DAG.getNode(NVPTXISD::FUN_SHFR_CLAMP, dl, VT, ShOpLo, ShOpHi,
                             ShAmt);

    SDValue Ops[2] = { Lo, Hi };
    return DAG.getMergeValues(Ops, dl);
  }
  else {
    // {dHi, dLo} = {aHi, aLo} >> Amt
    // - if (Amt>=size) then
    //      dLo = aHi >> (Amt-size)
    //      dHi = aHi >> Amt (this is either all 0 or all 1)
    //   else
    //      dLo = (aLo >>logic Amt) | (aHi << (size-Amt))
    //      dHi = aHi >> Amt

    SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32,
                                   DAG.getConstant(VTBits, dl, MVT::i32),
                                   ShAmt);
    SDValue Tmp1 = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, ShAmt);
    SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32, ShAmt,
                                     DAG.getConstant(VTBits, dl, MVT::i32));
    SDValue Tmp2 = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, RevShAmt);
    SDValue FalseVal = DAG.getNode(ISD::OR, dl, VT, Tmp1, Tmp2);
    SDValue TrueVal = DAG.getNode(Opc, dl, VT, ShOpHi, ExtraShAmt);

    SDValue Cmp = DAG.getSetCC(dl, MVT::i1, ShAmt,
                               DAG.getConstant(VTBits, dl, MVT::i32),
                               ISD::SETGE);
    SDValue Hi = DAG.getNode(Opc, dl, VT, ShOpHi, ShAmt);
    SDValue Lo = DAG.getNode(ISD::SELECT, dl, VT, Cmp, TrueVal, FalseVal);

    SDValue Ops[2] = { Lo, Hi };
    return DAG.getMergeValues(Ops, dl);
  }
}
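
// --- Illustration (not part of the source above) ---------------------------
// The non-funnel-shift path above, written out for a logical right shift of a
// 64-bit value held as {Hi, Lo} 32-bit halves. A sketch of the dLo/dHi
// formulas in the comment; Amt is kept in [1, 63] because shifting a 32-bit
// value by 32 is tolerated by the DAG nodes but undefined in C++.
#include <cstdint>
#include <cstdio>

static void shiftRightParts(uint32_t Hi, uint32_t Lo, unsigned Amt,
                            uint32_t &OutHi, uint32_t &OutLo) {
  if (Amt >= 32) {
    OutLo = Hi >> (Amt - 32); // low word comes entirely from the high word
    OutHi = 0;                // SRL: high word becomes all zeros
  } else {
    OutLo = (Lo >> Amt) | (Hi << (32 - Amt));
    OutHi = Hi >> Amt;
  }
}

int main() {
  uint32_t Hi, Lo;
  shiftRightParts(0x12345678u, 0x9ABCDEF0u, 8, Hi, Lo);
  std::printf("%08X:%08X\n", Hi, Lo); // prints 00123456:789ABCDE
  return 0;
}
// ----------------------------------------------------------------------------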

/// LowerShiftLeftParts - Lower SHL_PARTS, which
/// 1) returns two i32 values and takes a 2 x i32 value to shift plus a shift
///    amount, or
/// 2) returns two i64 values and takes a 2 x i64 value to shift plus a shift
///    amount.
SDValue NVPTXTargetLowering::LowerShiftLeftParts(SDValue Op,
                                                 SelectionDAG &DAG) const {
  assert(Op.getNumOperands() == 3 && "Not a double-shift!");
  assert(Op.getOpcode() == ISD::SHL_PARTS);

  EVT VT = Op.getValueType();
  unsigned VTBits = VT.getSizeInBits();
  SDLoc dl(Op);
  SDValue ShOpLo = Op.getOperand(0);
  SDValue ShOpHi = Op.getOperand(1);
  SDValue ShAmt  = Op.getOperand(2);

  if (VTBits == 32 && STI.getSmVersion() >= 35) {
    // For 32bit and sm35, we can use the funnel shift 'shf' instruction.
    // {dHi, dLo} = {aHi, aLo} << Amt
    //   dHi = shf.l.clamp aLo, aHi, Amt
    //   dLo = aLo << Amt

    SDValue Hi = DAG.getNode(NVPTXISD::FUN_SHFL_CLAMP, dl, VT, ShOpLo, ShOpHi,
                             ShAmt);
    SDValue Lo = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ShAmt);

    SDValue Ops[2] = { Lo, Hi };
    return DAG.getMergeValues(Ops, dl);
  }
  else {
    // {dHi, dLo} = {aHi, aLo} << Amt
    // - if (Amt>=size) then
    //      dLo = aLo << Amt (all 0)
    //      dHi = aLo << (Amt-size)
    //   else
    //      dLo = aLo << Amt
    //      dHi = (aHi << Amt) | (aLo >> (size-Amt))

    SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32,
                                   DAG.getConstant(VTBits, dl, MVT::i32),
                                   ShAmt);
    SDValue Tmp1 = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, ShAmt);
    SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32, ShAmt,
                                     DAG.getConstant(VTBits, dl, MVT::i32));
    SDValue Tmp2 = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, RevShAmt);
    SDValue FalseVal = DAG.getNode(ISD::OR, dl, VT, Tmp1, Tmp2);
    SDValue TrueVal = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ExtraShAmt);

    SDValue Cmp = DAG.getSetCC(dl, MVT::i1, ShAmt,
                               DAG.getConstant(VTBits, dl, MVT::i32),
                               ISD::SETGE);
    SDValue Lo = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ShAmt);
    SDValue Hi = DAG.getNode(ISD::SELECT, dl, VT, Cmp, TrueVal, FalseVal);

    SDValue Ops[2] = { Lo, Hi };
    return DAG.getMergeValues(Ops, dl);
  }
}

SDValue NVPTXTargetLowering::LowerFROUND(SDValue Op, SelectionDAG &DAG) const {
  EVT VT = Op.getValueType();

  if (VT == MVT::f32)
    return LowerFROUND32(Op, DAG);

  if (VT == MVT::f64)
    return LowerFROUND64(Op, DAG);

  llvm_unreachable("unhandled type");
}

// This is the rounding method used in CUDA libdevice in C like code:
// float roundf(float A)
// {
//   float RoundedA = (float) (int) ( A > 0 ? (A + 0.5f) : (A - 0.5f));
//   RoundedA = abs(A) > 0x1.0p23 ? A : RoundedA;
//   return abs(A) < 0.5 ? (float)(int)A : RoundedA;
// }
SDValue NVPTXTargetLowering::LowerFROUND32(SDValue Op,
                                           SelectionDAG &DAG) const {
  SDLoc SL(Op);
  SDValue A = Op.getOperand(0);
  EVT VT = Op.getValueType();

  SDValue AbsA = DAG.getNode(ISD::FABS, SL, VT, A);

  // RoundedA = (float) (int) ( A > 0 ? (A + 0.5f) : (A - 0.5f))
  SDValue Bitcast  = DAG.getNode(ISD::BITCAST, SL, MVT::i32, A);
  const int SignBitMask = 0x80000000;
  SDValue Sign = DAG.getNode(ISD::AND, SL, MVT::i32, Bitcast,
                             DAG.getConstant(SignBitMask, SL, MVT::i32));
  const int PointFiveInBits = 0x3F000000;
  SDValue PointFiveWithSignRaw =
      DAG.getNode(ISD::OR, SL, MVT::i32, Sign,
                  DAG.getConstant(PointFiveInBits, SL, MVT::i32));
  SDValue PointFiveWithSign =
      DAG.getNode(ISD::BITCAST, SL, VT, PointFiveWithSignRaw);
  SDValue AdjustedA = DAG.getNode(ISD::FADD, SL, VT, A, PointFiveWithSign);
  SDValue RoundedA = DAG.getNode(ISD::FTRUNC, SL, VT, AdjustedA);

  // RoundedA = abs(A) > 0x1.0p23 ? A : RoundedA;
  EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
  SDValue IsLarge =
      DAG.getSetCC(SL, SetCCVT, AbsA, DAG.getConstantFP(pow(2.0, 23.0), SL, VT),
                   ISD::SETOGT);
  RoundedA = DAG.getNode(ISD::SELECT, SL, VT, IsLarge, A, RoundedA);

  // return abs(A) < 0.5 ? (float)(int)A : RoundedA;
  SDValue IsSmall = DAG.getSetCC(SL, SetCCVT, AbsA,
                                 DAG.getConstantFP(0.5, SL, VT), ISD::SETOLT);
  SDValue RoundedAForSmallA = DAG.getNode(ISD::FTRUNC, SL, VT, A);
  return DAG.getNode(ISD::SELECT, SL, VT, IsSmall, RoundedAForSmallA, RoundedA);
}
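
// --- Illustration (not part of the source above) ---------------------------
// A runnable form of the libdevice-style rounding in the comment above (a
// sketch of the lowered semantics, not the generated PTX). 2^23 is the
// threshold past which every finite f32 is already integral.
#include <cmath>
#include <cstdio>

static float roundLikeLowering(float A) {
  const float TwoPow23 = 8388608.0f; // 0x1.0p23
  float RoundedA = std::trunc(A > 0 ? A + 0.5f : A - 0.5f);
  RoundedA = std::fabs(A) > TwoPow23 ? A : RoundedA;
  return std::fabs(A) < 0.5f ? std::trunc(A) : RoundedA;
}

int main() {
  std::printf("%f %f %f\n", roundLikeLowering(2.5f), roundLikeLowering(-2.5f),
              roundLikeLowering(0.3f)); // 3.000000 -3.000000 0.000000
  return 0;
}
// ----------------------------------------------------------------------------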

// The implementation of round(double) is similar to that of round(float) in
// that they both separate the value range into three regions and use a method
// specific to the region to round the values. However, round(double) first
// calculates the round of the absolute value and then adds the sign back while
// round(float) directly rounds the value with sign.
SDValue NVPTXTargetLowering::LowerFROUND64(SDValue Op,
                                           SelectionDAG &DAG) const {
  SDLoc SL(Op);
  SDValue A = Op.getOperand(0);
  EVT VT = Op.getValueType();

  SDValue AbsA = DAG.getNode(ISD::FABS, SL, VT, A);

  // double RoundedA = (double) (int) (abs(A) + 0.5f);
  SDValue AdjustedA = DAG.getNode(ISD::FADD, SL, VT, AbsA,
                                  DAG.getConstantFP(0.5, SL, VT));
  SDValue RoundedA = DAG.getNode(ISD::FTRUNC, SL, VT, AdjustedA);

  // RoundedA = abs(A) < 0.5 ? (double)0 : RoundedA;
  EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
  SDValue IsSmall = DAG.getSetCC(SL, SetCCVT, AbsA,
                                 DAG.getConstantFP(0.5, SL, VT), ISD::SETOLT);
  RoundedA = DAG.getNode(ISD::SELECT, SL, VT, IsSmall,
                         DAG.getConstantFP(0, SL, VT),
                         RoundedA);

  // Add sign to rounded_A
  RoundedA = DAG.getNode(ISD::FCOPYSIGN, SL, VT, RoundedA, A);
  DAG.getNode(ISD::FTRUNC, SL, VT, A);

  // RoundedA = abs(A) > 0x1.0p52 ? A : RoundedA;
  SDValue IsLarge =
      DAG.getSetCC(SL, SetCCVT, AbsA, DAG.getConstantFP(pow(2.0, 52.0), SL, VT),
                   ISD::SETOGT);
  return DAG.getNode(ISD::SELECT, SL, VT, IsLarge, A, RoundedA);
}


SDValue
NVPTXTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
  switch (Op.getOpcode()) {
  case ISD::RETURNADDR:
    return SDValue();
  case ISD::FRAMEADDR:
    return SDValue();
  case ISD::GlobalAddress:
    return LowerGlobalAddress(Op, DAG);
  case ISD::INTRINSIC_W_CHAIN:
    return Op;
  case ISD::BUILD_VECTOR:
    return LowerBUILD_VECTOR(Op, DAG);
  case ISD::EXTRACT_SUBVECTOR:
    return Op;
  case ISD::EXTRACT_VECTOR_ELT:
    return LowerEXTRACT_VECTOR_ELT(Op, DAG);
  case ISD::CONCAT_VECTORS:
    return LowerCONCAT_VECTORS(Op, DAG);
  case ISD::STORE:
    return LowerSTORE(Op, DAG);
  case ISD::LOAD:
    return LowerLOAD(Op, DAG);
  case ISD::SHL_PARTS:
    return LowerShiftLeftParts(Op, DAG);
  case ISD::SRA_PARTS:
  case ISD::SRL_PARTS:
    return LowerShiftRightParts(Op, DAG);
  case ISD::SELECT:
    return LowerSelect(Op, DAG);
  case ISD::FROUND:
    return LowerFROUND(Op, DAG);
  default:
    llvm_unreachable("Custom lowering not defined for operation");
  }
}

SDValue NVPTXTargetLowering::LowerSelect(SDValue Op, SelectionDAG &DAG) const {
  SDValue Op0 = Op->getOperand(0);
  SDValue Op1 = Op->getOperand(1);
  SDValue Op2 = Op->getOperand(2);
  SDLoc DL(Op.getNode());

  assert(Op.getValueType() == MVT::i1 && "Custom lowering enabled only for i1");

  Op1 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op1);
  Op2 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op2);
  SDValue Select = DAG.getNode(ISD::SELECT, DL, MVT::i32, Op0, Op1, Op2);
  SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, Select);

  return Trunc;
}
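
// --- Illustration (not part of the source above) ---------------------------
// Scalar analogue of the i1 select lowering (a sketch): any-extend both arms
// to 32 bits, select in i32, then truncate the result back to one bit.
#include <cstdio>

static bool selectI1(bool Cond, bool A, bool B) {
  unsigned WideA = A;                   // any-extend to i32
  unsigned WideB = B;
  unsigned Wide = Cond ? WideA : WideB; // i32 select
  return (Wide & 1u) != 0;              // truncate back to i1
}

int main() {
  std::printf("%d %d\n", selectI1(true, true, false),
              selectI1(false, true, false)); // prints: 1 0
  return 0;
}
// ----------------------------------------------------------------------------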

SDValue NVPTXTargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
  if (Op.getValueType() == MVT::i1)
    return LowerLOADi1(Op, DAG);

  // v2f16 is legal, so we can't rely on legalizer to handle unaligned
  // loads and have to handle it here.
  if (Op.getValueType() == MVT::v2f16) {
    LoadSDNode *Load = cast<LoadSDNode>(Op);
    EVT MemVT = Load->getMemoryVT();
    if (!allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), MemVT,
                            *Load->getMemOperand())) {
      SDValue Ops[2];
      std::tie(Ops[0], Ops[1]) = expandUnalignedLoad(Load, DAG);
      return DAG.getMergeValues(Ops, SDLoc(Op));
    }
  }

  return SDValue();
}

// v = ld i1* addr
//   =>
// v1 = ld i8* addr (-> i16)
// v = trunc i16 to i1
SDValue NVPTXTargetLowering::LowerLOADi1(SDValue Op, SelectionDAG &DAG) const {
  SDNode *Node = Op.getNode();
  LoadSDNode *LD = cast<LoadSDNode>(Node);
  SDLoc dl(Node);
  assert(LD->getExtensionType() == ISD::NON_EXTLOAD);
  assert(Node->getValueType(0) == MVT::i1 &&
         "Custom lowering for i1 load only");
  SDValue newLD = DAG.getLoad(MVT::i16, dl, LD->getChain(), LD->getBasePtr(),
                              LD->getPointerInfo(), LD->getAlignment(),
                              LD->getMemOperand()->getFlags());
  SDValue result = DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, newLD);
  // The legalizer (the caller) is expecting two values from the legalized
  // load, so we build a MergeValues node for it. See ExpandUnalignedLoad()
  // in LegalizeDAG.cpp which also uses MergeValues.
  SDValue Ops[] = { result, LD->getChain() };
  return DAG.getMergeValues(Ops, dl);
}

SDValue NVPTXTargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
  StoreSDNode *Store = cast<StoreSDNode>(Op);
  EVT VT = Store->getMemoryVT();

  if (VT == MVT::i1)
    return LowerSTOREi1(Op, DAG);

  // v2f16 is legal, so we can't rely on legalizer to handle unaligned
  // stores and have to handle it here.
  if (VT == MVT::v2f16 &&
      !allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), VT,
                          *Store->getMemOperand()))
    return expandUnalignedStore(Store, DAG);

  if (VT.isVector())
    return LowerSTOREVector(Op, DAG);

  return SDValue();
}
2285
2286
SDValue
2287
67
NVPTXTargetLowering::LowerSTOREVector(SDValue Op, SelectionDAG &DAG) const {
2288
67
  SDNode *N = Op.getNode();
2289
67
  SDValue Val = N->getOperand(1);
2290
67
  SDLoc DL(N);
2291
67
  EVT ValVT = Val.getValueType();
2292
67
2293
67
  if (ValVT.isVector()) {
2294
67
    // We only handle "native" vector sizes for now, e.g. <4 x double> is not
2295
67
    // legal.  We can (and should) split that into 2 stores of <2 x double> here
2296
67
    // but I'm leaving that as a TODO for now.
2297
67
    if (!ValVT.isSimple())
2298
0
      return SDValue();
2299
67
    switch (ValVT.getSimpleVT().SimpleTy) {
2300
67
    default:
2301
0
      return SDValue();
2302
67
    case MVT::v2i8:
2303
67
    case MVT::v2i16:
2304
67
    case MVT::v2i32:
2305
67
    case MVT::v2i64:
2306
67
    case MVT::v2f16:
2307
67
    case MVT::v2f32:
2308
67
    case MVT::v2f64:
2309
67
    case MVT::v4i8:
2310
67
    case MVT::v4i16:
2311
67
    case MVT::v4i32:
2312
67
    case MVT::v4f16:
2313
67
    case MVT::v4f32:
2314
67
    case MVT::v8f16: // <4 x f16x2>
2315
67
      // This is a "native" vector type
2316
67
      break;
2317
67
    }
2318
67
2319
67
    MemSDNode *MemSD = cast<MemSDNode>(N);
2320
67
    const DataLayout &TD = DAG.getDataLayout();
2321
67
2322
67
    unsigned Align = MemSD->getAlignment();
2323
67
    unsigned PrefAlign =
2324
67
        TD.getPrefTypeAlignment(ValVT.getTypeForEVT(*DAG.getContext()));
2325
67
    if (Align < PrefAlign) {
2326
8
      // This store is not sufficiently aligned, so bail out and let this vector
2327
8
      // store be scalarized.  Note that we may still be able to emit smaller
2328
8
      // vector stores.  For example, if we are storing a <4 x float> with an
2329
8
      // alignment of 8, this check will fail but the legalizer will try again
2330
8
      // with 2 x <2 x float>, which will succeed with an alignment of 8.
2331
8
      return SDValue();
2332
8
    }
2333
59
2334
59
    unsigned Opcode = 0;
2335
59
    EVT EltVT = ValVT.getVectorElementType();
2336
59
    unsigned NumElts = ValVT.getVectorNumElements();
2337
59
2338
59
    // Since StoreV2 is a target node, we cannot rely on DAG type legalization.
2339
59
    // Therefore, we must ensure the type is legal.  For i1 and i8, we set the
2340
59
    // stored type to i16 and propagate the "real" type as the memory type.
2341
59
    bool NeedExt = false;
2342
59
    if (EltVT.getSizeInBits() < 16)
2343
4
      NeedExt = true;
2344
59
2345
59
    bool StoreF16x2 = false;
2346
59
    switch (NumElts) {
2347
59
    default:
2348
0
      return SDValue();
2349
59
    case 2:
2350
35
      Opcode = NVPTXISD::StoreV2;
2351
35
      break;
2352
59
    case 4:
2353
21
      Opcode = NVPTXISD::StoreV4;
2354
21
      break;
2355
59
    case 8:
2356
3
      // v8f16 is a special case. PTX doesn't have an st.v8.f16
2357
3
      // instruction. Instead, we split the vector into v2f16 chunks and
2358
3
      // store them with st.v4.b32.
2359
3
      assert(EltVT == MVT::f16 && "Wrong type for the vector.");
2360
3
      Opcode = NVPTXISD::StoreV4;
2361
3
      StoreF16x2 = true;
2362
3
      break;
2363
59
    }
2364
59
2365
59
    SmallVector<SDValue, 8> Ops;
2366
59
2367
59
    // First is the chain
2368
59
    Ops.push_back(N->getOperand(0));
2369
59
2370
59
    if (StoreF16x2) {
2371
3
      // Combine f16,f16 -> v2f16
2372
3
      NumElts /= 2;
2373
15
      for (unsigned i = 0; i < NumElts; ++i) {
2374
12
        SDValue E0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f16, Val,
2375
12
                                 DAG.getIntPtrConstant(i * 2, DL));
2376
12
        SDValue E1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f16, Val,
2377
12
                                 DAG.getIntPtrConstant(i * 2 + 1, DL));
2378
12
        SDValue V2 = DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v2f16, E0, E1);
2379
12
        Ops.push_back(V2);
2380
12
      }
2381
56
    } else {
2382
56
      // Then the split values
2383
210
      for (unsigned i = 0; i < NumElts; ++i) {
2384
154
        SDValue ExtVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, Val,
2385
154
                                     DAG.getIntPtrConstant(i, DL));
2386
154
        if (NeedExt)
2387
12
          ExtVal = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i16, ExtVal);
2388
154
        Ops.push_back(ExtVal);
2389
154
      }
2390
56
    }
2391
59
2392
59
    // Then any remaining arguments
2393
59
    Ops.append(N->op_begin() + 2, N->op_end());
2394
59
2395
59
    SDValue NewSt =
2396
59
        DAG.getMemIntrinsicNode(Opcode, DL, DAG.getVTList(MVT::Other), Ops,
2397
59
                                MemSD->getMemoryVT(), MemSD->getMemOperand());
2398
59
2399
59
    // return DCI.CombineTo(N, NewSt, true);
2400
59
    return NewSt;
2401
59
  }
2402
0
2403
0
  return SDValue();
2404
0
}
2405
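// A standalone sketch (not from this file) of the v8f16 splitting above:
// eight f16 elements are paired into four 32-bit chunks so a single
// st.v4.b32 can store them. Using uint16_t as a stand-in for f16 is an
// illustrative assumption.
#include <cstdint>
static void packV8F16(const uint16_t Elts[8], uint32_t Chunks[4]) {
  for (int i = 0; i < 4; ++i)
    // Combine f16,f16 -> one 32-bit lane, mirroring the BUILD_VECTOR above.
    Chunks[i] = static_cast<uint32_t>(Elts[2 * i]) |
                (static_cast<uint32_t>(Elts[2 * i + 1]) << 16);
}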
2406
// st i1 v, addr
2407
//    =>
2408
// v1 = zxt v to i16
2409
// st.u8 i16, addr
2410
2
SDValue NVPTXTargetLowering::LowerSTOREi1(SDValue Op, SelectionDAG &DAG) const {
2411
2
  SDNode *Node = Op.getNode();
2412
2
  SDLoc dl(Node);
2413
2
  StoreSDNode *ST = cast<StoreSDNode>(Node);
2414
2
  SDValue Tmp1 = ST->getChain();
2415
2
  SDValue Tmp2 = ST->getBasePtr();
2416
2
  SDValue Tmp3 = ST->getValue();
2417
2
  assert(Tmp3.getValueType() == MVT::i1 && "Custom lowering for i1 store only");
2418
2
  Tmp3 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, Tmp3);
2419
2
  SDValue Result =
2420
2
      DAG.getTruncStore(Tmp1, dl, Tmp3, Tmp2, ST->getPointerInfo(), MVT::i8,
2421
2
                        ST->getAlignment(), ST->getMemOperand()->getFlags());
2422
2
  return Result;
2423
2
}
2424
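// A standalone sketch (not from this file) of the i1-store lowering above:
// zero-extend the predicate into a 16-bit register, then emit a truncating
// byte store. The helper name is an illustrative assumption.
#include <cstdint>
static void storeI1(uint8_t *Addr, bool V) {
  uint16_t Widened = static_cast<uint16_t>(V); // zxt v to i16
  *Addr = static_cast<uint8_t>(Widened);       // st.u8 (truncating store)
}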
2425
SDValue
2426
2.64k
NVPTXTargetLowering::getParamSymbol(SelectionDAG &DAG, int idx, EVT v) const {
2427
2.64k
  std::string ParamSym;
2428
2.64k
  raw_string_ostream ParamStr(ParamSym);
2429
2.64k
2430
2.64k
  ParamStr << DAG.getMachineFunction().getName() << "_param_" << idx;
2431
2.64k
  ParamStr.flush();
2432
2.64k
2433
2.64k
  std::string *SavedStr =
2434
2.64k
    nvTM->getManagedStrPool()->getManagedString(ParamSym.c_str());
2435
2.64k
  return DAG.getTargetExternalSymbol(SavedStr->c_str(), v);
2436
2.64k
}
2437
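// For reference, getParamSymbol above produces symbols of the form
// "<function>_param_<index>". A standalone sketch (not from this file),
// with a made-up kernel name:
#include <cstdio>
int main() {
  char Sym[64];
  std::snprintf(Sym, sizeof(Sym), "%s_param_%d", "my_kernel", 0);
  std::puts(Sym); // prints: my_kernel_param_0
  return 0;
}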
2438
// Check to see if the kernel argument is image*_t or sampler_t
2439
2440
2.69k
static bool isImageOrSamplerVal(const Value *arg, const Module *context) {
2441
2.69k
  static const char *const specialTypes[] = { "struct._image2d_t",
2442
2.69k
                                              "struct._image3d_t",
2443
2.69k
                                              "struct._sampler_t" };
2444
2.69k
2445
2.69k
  Type *Ty = arg->getType();
2446
2.69k
  auto *PTy = dyn_cast<PointerType>(Ty);
2447
2.69k
2448
2.69k
  if (!PTy)
2449
2.05k
    return false;
2450
638
2451
638
  if (!context)
2452
0
    return false;
2453
638
2454
638
  auto *STy = dyn_cast<StructType>(PTy->getElementType());
2455
638
  if (!STy || STy->isLiteral())
2456
631
    return false;
2457
7
2458
7
  return std::find(std::begin(specialTypes), std::end(specialTypes),
2459
7
                   STy->getName()) != std::end(specialTypes);
2460
7
}
2461
2462
SDValue NVPTXTargetLowering::LowerFormalArguments(
2463
    SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
2464
    const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
2465
1.69k
    SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
2466
1.69k
  MachineFunction &MF = DAG.getMachineFunction();
2467
1.69k
  const DataLayout &DL = DAG.getDataLayout();
2468
1.69k
  auto PtrVT = getPointerTy(DAG.getDataLayout());
2469
1.69k
2470
1.69k
  const Function *F = &MF.getFunction();
2471
1.69k
  const AttributeList &PAL = F->getAttributes();
2472
1.69k
  const TargetLowering *TLI = STI.getTargetLowering();
2473
1.69k
2474
1.69k
  SDValue Root = DAG.getRoot();
2475
1.69k
  std::vector<SDValue> OutChains;
2476
1.69k
2477
1.69k
  bool isABI = (STI.getSmVersion() >= 20);
2478
1.69k
  assert(isABI && "Non-ABI compilation is not supported");
2479
1.69k
  if (!isABI)
2480
0
    return Chain;
2481
1.69k
2482
1.69k
  std::vector<Type *> argTypes;
2483
1.69k
  std::vector<const Argument *> theArgs;
2484
2.69k
  for (const Argument &I : F->args()) {
2485
2.69k
    theArgs.push_back(&I);
2486
2.69k
    argTypes.push_back(I.getType());
2487
2.69k
  }
2488
1.69k
  // argTypes.size() (or theArgs.size()) and Ins.size() need not match.
2489
1.69k
  // Ins.size() will be larger
2490
1.69k
  //   * if there is an aggregate argument with multiple fields (each field
2491
1.69k
  //     showing up separately in Ins)
2492
1.69k
  //   * if there is a vector argument with more than typical vector-length
2493
1.69k
  //     elements (generally if more than 4) where each vector element is
2494
1.69k
  //     individually present in Ins.
2495
1.69k
  // So a different index should be used for indexing into Ins.
2496
1.69k
  // See similar issue in LowerCall.
2497
1.69k
  unsigned InsIdx = 0;
2498
1.69k
2499
1.69k
  int idx = 0;
2500
4.38k
  for (unsigned i = 0, e = theArgs.size(); i != e; ++i, ++idx, ++InsIdx) {
2501
2.69k
    Type *Ty = argTypes[i];
2502
2.69k
2503
2.69k
    // If the kernel argument is image*_t or sampler_t, convert it to
2504
2.69k
    // an i32 constant holding the parameter position. This can later be
2505
2.69k
    // matched in the AsmPrinter to output the correct mangled name.
2506
2.69k
    if (isImageOrSamplerVal(
2507
2.69k
            theArgs[i],
2508
2.69k
            (theArgs[i]->getParent() ? theArgs[i]->getParent()->getParent()
2509
2.69k
                                     : nullptr))) {
2510
0
      assert(isKernelFunction(*F) &&
2511
0
             "Only kernels can have image/sampler params");
2512
0
      InVals.push_back(DAG.getConstant(i + 1, dl, MVT::i32));
2513
0
      continue;
2514
0
    }
2515
2.69k
2516
2.69k
    if (theArgs[i]->use_empty()) {
2517
51
      // argument is dead
2518
51
      if (Ty->isAggregateType() || Ty->isIntegerTy(128)) {
2519
6
        SmallVector<EVT, 16> vtparts;
2520
6
2521
6
        ComputePTXValueVTs(*this, DAG.getDataLayout(), Ty, vtparts);
2522
6
        assert(vtparts.size() > 0 && "empty aggregate type not expected");
2523
24
        for (unsigned parti = 0, parte = vtparts.size(); parti != parte;
2524
18
             ++parti) {
2525
18
          InVals.push_back(DAG.getNode(ISD::UNDEF, dl, Ins[InsIdx].VT));
2526
18
          ++InsIdx;
2527
18
        }
2528
6
        if (vtparts.size() > 0)
2529
6
          --InsIdx;
2530
6
        continue;
2531
6
      }
2532
45
      if (Ty->isVectorTy()) {
2533
0
        EVT ObjectVT = getValueType(DL, Ty);
2534
0
        unsigned NumRegs = TLI->getNumRegisters(F->getContext(), ObjectVT);
2535
0
        for (unsigned parti = 0; parti < NumRegs; ++parti) {
2536
0
          InVals.push_back(DAG.getNode(ISD::UNDEF, dl, Ins[InsIdx].VT));
2537
0
          ++InsIdx;
2538
0
        }
2539
0
        if (NumRegs > 0)
2540
0
          --InsIdx;
2541
0
        continue;
2542
0
      }
2543
45
      InVals.push_back(DAG.getNode(ISD::UNDEF, dl, Ins[InsIdx].VT));
2544
45
      continue;
2545
45
    }
2546
2.64k
2547
2.64k
    // In the following cases, assign a node order of "idx+1"
2548
2.64k
    // to newly created nodes. The SDNodes for params have to
2549
2.64k
    // appear in the same order as their order of appearance
2550
2.64k
    // in the original function. "idx+1" holds that order.
2551
2.64k
    if (!PAL.hasParamAttribute(i, Attribute::ByVal)) {
2552
2.63k
      bool aggregateIsPacked = false;
2553
2.63k
      if (StructType *STy = dyn_cast<StructType>(Ty))
2554
13
        aggregateIsPacked = STy->isPacked();
2555
2.63k
2556
2.63k
      SmallVector<EVT, 16> VTs;
2557
2.63k
      SmallVector<uint64_t, 16> Offsets;
2558
2.63k
      ComputePTXValueVTs(*this, DL, Ty, VTs, &Offsets, 0);
2559
2.63k
      assert(VTs.size() > 0 && "Unexpected empty type.");
2560
2.63k
      auto VectorInfo =
2561
2.63k
          VectorizePTXValueVTs(VTs, Offsets, DL.getABITypeAlignment(Ty));
2562
2.63k
2563
2.63k
      SDValue Arg = getParamSymbol(DAG, idx, PtrVT);
2564
2.63k
      int VecIdx = -1; // Index of the first element of the current vector.
2565
5.51k
      for (unsigned parti = 0, parte = VTs.size(); parti != parte; ++parti) {
2566
2.87k
        if (VectorInfo[parti] & PVF_FIRST) {
2567
2.68k
          assert(VecIdx == -1 && "Orphaned vector.");
2568
2.68k
          VecIdx = parti;
2569
2.68k
        }
2570
2.87k
2571
2.87k
        // That's the last element of this store op.
2572
2.87k
        if (VectorInfo[parti] & PVF_LAST) {
2573
2.68k
          unsigned NumElts = parti - VecIdx + 1;
2574
2.68k
          EVT EltVT = VTs[parti];
2575
2.68k
          // i1 is loaded/stored as i8.
2576
2.68k
          EVT LoadVT = EltVT;
2577
2.68k
          if (EltVT == MVT::i1)
2578
41
            LoadVT = MVT::i8;
2579
2.64k
          else if (EltVT == MVT::v2f16)
2580
294
            // getLoad needs a vector type, but it can't handle
2581
294
            // vectors which contain v2f16 elements. So we must load
2582
294
            // using i32 here and then bitcast back.
2583
294
            LoadVT = MVT::i32;
2584
2.68k
2585
2.68k
          EVT VecVT = EVT::getVectorVT(F->getContext(), LoadVT, NumElts);
2586
2.68k
          SDValue VecAddr =
2587
2.68k
              DAG.getNode(ISD::ADD, dl, PtrVT, Arg,
2588
2.68k
                          DAG.getConstant(Offsets[VecIdx], dl, PtrVT));
2589
2.68k
          Value *srcValue = Constant::getNullValue(PointerType::get(
2590
2.68k
              EltVT.getTypeForEVT(F->getContext()), ADDRESS_SPACE_PARAM));
2591
2.68k
          SDValue P =
2592
2.68k
              DAG.getLoad(VecVT, dl, Root, VecAddr,
2593
2.68k
                          MachinePointerInfo(srcValue), aggregateIsPacked,
2594
2.68k
                          MachineMemOperand::MODereferenceable |
2595
2.68k
                              MachineMemOperand::MOInvariant);
2596
2.68k
          if (P.getNode())
2597
2.68k
            P.getNode()->setIROrder(idx + 1);
2598
5.56k
          for (unsigned j = 0; j < NumElts; ++j) {
2599
2.87k
            SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, LoadVT, P,
2600
2.87k
                                      DAG.getIntPtrConstant(j, dl));
2601
2.87k
            // We've loaded i1 as an i8 and now must truncate it back to i1
2602
2.87k
            if (EltVT == MVT::i1)
2603
48
              Elt = DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, Elt);
2604
2.83k
            // v2f16 was loaded as an i32. Now we must bitcast it back.
2605
2.83k
            else if (EltVT == MVT::v2f16)
2606
298
              Elt = DAG.getNode(ISD::BITCAST, dl, MVT::v2f16, Elt);
2607
2.87k
            // Extend the element if necessary (e.g. an i8 is loaded
2608
2.87k
            // into an i16 register)
2609
2.87k
            if (Ins[InsIdx].VT.isInteger() &&
2610
2.87k
                Ins[InsIdx].VT.getSizeInBits() > LoadVT.getSizeInBits()) {
2611
95
              unsigned Extend = Ins[InsIdx].Flags.isSExt() ? ISD::SIGN_EXTEND
2612
95
                                                           : ISD::ZERO_EXTEND;
2613
95
              Elt = DAG.getNode(Extend, dl, Ins[InsIdx].VT, Elt);
2614
95
            }
2615
2.87k
            InVals.push_back(Elt);
2616
2.87k
          }
2617
2.68k
2618
2.68k
          // Reset vector tracking state.
2619
2.68k
          VecIdx = -1;
2620
2.68k
        }
2621
2.87k
        ++InsIdx;
2622
2.87k
      }
2623
2.63k
      if (VTs.size() > 0)
2624
2.63k
        --InsIdx;
2625
2.63k
      continue;
2626
2.63k
    }
2627
4
2628
4
    // Param has ByVal attribute
2629
4
    // Return MoveParam(param symbol).
2630
4
    // Ideally, the param symbol can be returned directly,
2631
4
    // but when the SDNode builder decides to use it in a CopyToReg(),
2632
4
    // the machine instruction fails because a TargetExternalSymbol
2633
4
    // (not lowered) is target dependent, and CopyToReg assumes
2634
4
    // the source is lowered.
2635
4
    EVT ObjectVT = getValueType(DL, Ty);
2636
4
    assert(ObjectVT == Ins[InsIdx].VT &&
2637
4
           "Ins type did not match function type");
2638
4
    SDValue Arg = getParamSymbol(DAG, idx, PtrVT);
2639
4
    SDValue p = DAG.getNode(NVPTXISD::MoveParam, dl, ObjectVT, Arg);
2640
4
    if (p.getNode())
2641
4
      p.getNode()->setIROrder(idx + 1);
2642
4
    InVals.push_back(p);
2643
4
  }
2644
1.69k
2645
1.69k
  // Clang will check explicit VarArg and issue an error if any. However, Clang
2646
1.69k
  // will let code with
2647
1.69k
  // implicit var arg like f() pass. See bug 617733.
2648
1.69k
  // We treat this case as if the arg list is empty.
2649
1.69k
  // if (F.isVarArg()) {
2650
1.69k
  // assert(0 && "VarArg not supported yet!");
2651
1.69k
  //}
2652
1.69k
2653
1.69k
  if (!OutChains.empty())
2654
0
    DAG.setRoot(DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains));
2655
1.69k
2656
1.69k
  return Chain;
2657
1.69k
}
2658
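// In LowerFormalArguments above, v2f16 parts are loaded as i32 and bitcast
// back. A standalone sketch (not from this file) of the equivalent bit-level
// operation, with memcpy standing in for BITCAST and a made-up Half2 type:
#include <cstdint>
#include <cstring>
struct Half2 { uint16_t Lo, Hi; }; // illustrative stand-in for v2f16
static Half2 loadV2F16ViaI32(const void *P) {
  uint32_t Bits;
  std::memcpy(&Bits, P, sizeof(Bits)); // the i32 load
  Half2 H;
  std::memcpy(&H, &Bits, sizeof(H));   // the bitcast back to 2 x f16
  return H;
}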
2659
SDValue
2660
NVPTXTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
2661
                                 bool isVarArg,
2662
                                 const SmallVectorImpl<ISD::OutputArg> &Outs,
2663
                                 const SmallVectorImpl<SDValue> &OutVals,
2664
1.69k
                                 const SDLoc &dl, SelectionDAG &DAG) const {
2665
1.69k
  MachineFunction &MF = DAG.getMachineFunction();
2666
1.69k
  Type *RetTy = MF.getFunction().getReturnType();
2667
1.69k
2668
1.69k
  bool isABI = (STI.getSmVersion() >= 20);
2669
1.69k
  assert(isABI && "Non-ABI compilation is not supported");
2670
1.69k
  if (!isABI)
2671
0
    return Chain;
2672
1.69k
2673
1.69k
  const DataLayout DL = DAG.getDataLayout();
2674
1.69k
  SmallVector<EVT, 16> VTs;
2675
1.69k
  SmallVector<uint64_t, 16> Offsets;
2676
1.69k
  ComputePTXValueVTs(*this, DL, RetTy, VTs, &Offsets);
2677
1.69k
  assert(VTs.size() == OutVals.size() && "Bad return value decomposition");
2678
1.69k
2679
1.69k
  auto VectorInfo = VectorizePTXValueVTs(
2680
1.69k
      VTs, Offsets, RetTy->isSized() ? DL.getABITypeAlignment(RetTy) : 1);
2681
1.69k
2682
1.69k
  // PTX Interoperability Guide 3.3(A): [Integer] Values shorter than
2683
1.69k
  // 32-bits are sign extended or zero extended, depending on whether
2684
1.69k
  // they are signed or unsigned types.
2685
1.69k
  bool ExtendIntegerRetVal =
2686
1.69k
      RetTy->isIntegerTy() && DL.getTypeAllocSizeInBits(RetTy) < 32;
2687
1.69k
2688
1.69k
  SmallVector<SDValue, 6> StoreOperands;
2689
3.28k
  for (unsigned i = 0, e = VTs.size(); i != e; ++i) {
2690
1.58k
    // New load/store. Record chain and offset operands.
2691
1.58k
    if (VectorInfo[i] & PVF_FIRST) {
2692
1.37k
      assert(StoreOperands.empty() && "Orphaned operand list.");
2693
1.37k
      StoreOperands.push_back(Chain);
2694
1.37k
      StoreOperands.push_back(DAG.getConstant(Offsets[i], dl, MVT::i32));
2695
1.37k
    }
2696
1.58k
2697
1.58k
    SDValue RetVal = OutVals[i];
2698
1.58k
    if (ExtendIntegerRetVal) {
2699
211
      RetVal = DAG.getNode(Outs[i].Flags.isSExt() ? ISD::SIGN_EXTEND
2700
211
                                                  : ISD::ZERO_EXTEND,
2701
211
                           dl, MVT::i32, RetVal);
2702
1.37k
    } else if (RetVal.getValueSizeInBits() < 16) {
2703
99
      // Use 16-bit registers for small load-stores as it's the
2704
99
      // smallest general purpose register size supported by NVPTX.
2705
99
      RetVal = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i16, RetVal);
2706
99
    }
2707
1.58k
2708
1.58k
    // Record the value to return.
2709
1.58k
    StoreOperands.push_back(RetVal);
2710
1.58k
2711
1.58k
    // That's the last element of this store op.
2712
1.58k
    if (VectorInfo[i] & PVF_LAST) {
2713
1.37k
      NVPTXISD::NodeType Op;
2714
1.37k
      unsigned NumElts = StoreOperands.size() - 2;
2715
1.37k
      switch (NumElts) {
2716
1.37k
      case 1:
2717
1.23k
        Op = NVPTXISD::StoreRetval;
2718
1.23k
        break;
2719
1.37k
      case 2:
2720
100
        Op = NVPTXISD::StoreRetvalV2;
2721
100
        break;
2722
1.37k
      case 4:
2723
37
        Op = NVPTXISD::StoreRetvalV4;
2724
37
        break;
2725
1.37k
      default:
2726
0
        llvm_unreachable("Invalid vector info.");
2727
1.37k
      }
2728
1.37k
2729
1.37k
      // Adjust type of load/store op if we've extended the scalar
2730
1.37k
      // return value.
2731
1.37k
      EVT TheStoreType = ExtendIntegerRetVal ? MVT::i32 : VTs[i];
2732
1.37k
      Chain = DAG.getMemIntrinsicNode(Op, dl, DAG.getVTList(MVT::Other),
2733
1.37k
                                      StoreOperands, TheStoreType,
2734
1.37k
                                      MachinePointerInfo(), /* Align */ 1,
2735
1.37k
                                      MachineMemOperand::MOStore);
2736
1.37k
      // Cleanup vector state.
2737
1.37k
      StoreOperands.clear();
2738
1.37k
    }
2739
1.58k
  }
2740
1.69k
2741
1.69k
  return DAG.getNode(NVPTXISD::RET_FLAG, dl, MVT::Other, Chain);
2742
1.69k
}
2743
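// A standalone sketch (not from this file) of the PVF_FIRST/PVF_LAST grouping
// protocol used in LowerReturn above: FIRST opens a store group and LAST
// flushes it as a StoreRetval/StoreRetvalV2/StoreRetvalV4 op. The flag values
// and the sample VectorInfo are illustrative assumptions.
#include <cstdio>
#include <vector>
enum { PVF_FIRST = 1, PVF_LAST = 2 };
int main() {
  // Three return-value parts: one two-element group, then one scalar.
  std::vector<unsigned> VectorInfo = {PVF_FIRST, PVF_LAST,
                                      PVF_FIRST | PVF_LAST};
  unsigned GroupSize = 0;
  for (unsigned Flags : VectorInfo) {
    if (Flags & PVF_FIRST)
      GroupSize = 0;        // open a new store op
    ++GroupSize;            // record one value to return
    if (Flags & PVF_LAST)   // last element: flush the store op
      std::printf("StoreRetval%s (%u elements)\n",
                  GroupSize == 1 ? "" : GroupSize == 2 ? "V2" : "V4",
                  GroupSize);
  }
  return 0;
}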
2744
void NVPTXTargetLowering::LowerAsmOperandForConstraint(
2745
    SDValue Op, std::string &Constraint, std::vector<SDValue> &Ops,
2746
2
    SelectionDAG &DAG) const {
2747
2
  if (Constraint.length() > 1)
2748
0
    return;
2749
2
  else
2750
2
    TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
2751
2
}
2752
2753
5
static unsigned getOpcForTextureInstr(unsigned Intrinsic) {
2754
5
  switch (Intrinsic) {
2755
5
  default:
2756
0
    return 0;
2757
5
2758
5
  case Intrinsic::nvvm_tex_1d_v4f32_s32:
2759
1
    return NVPTXISD::Tex1DFloatS32;
2760
5
  case Intrinsic::nvvm_tex_1d_v4f32_f32:
2761
0
    return NVPTXISD::Tex1DFloatFloat;
2762
5
  case Intrinsic::nvvm_tex_1d_level_v4f32_f32:
2763
0
    return NVPTXISD::Tex1DFloatFloatLevel;
2764
5
  case Intrinsic::nvvm_tex_1d_grad_v4f32_f32:
2765
0
    return NVPTXISD::Tex1DFloatFloatGrad;
2766
5
  case Intrinsic::nvvm_tex_1d_v4s32_s32:
2767
0
    return NVPTXISD::Tex1DS32S32;
2768
5
  case Intrinsic::nvvm_tex_1d_v4s32_f32:
2769
0
    return NVPTXISD::Tex1DS32Float;
2770
5
  case Intrinsic::nvvm_tex_1d_level_v4s32_f32:
2771
0
    return NVPTXISD::Tex1DS32FloatLevel;
2772
5
  case Intrinsic::nvvm_tex_1d_grad_v4s32_f32:
2773
0
    return NVPTXISD::Tex1DS32FloatGrad;
2774
5
  case Intrinsic::nvvm_tex_1d_v4u32_s32:
2775
0
    return NVPTXISD::Tex1DU32S32;
2776
5
  case Intrinsic::nvvm_tex_1d_v4u32_f32:
2777
0
    return NVPTXISD::Tex1DU32Float;
2778
5
  case Intrinsic::nvvm_tex_1d_level_v4u32_f32:
2779
0
    return NVPTXISD::Tex1DU32FloatLevel;
2780
5
  case Intrinsic::nvvm_tex_1d_grad_v4u32_f32:
2781
0
    return NVPTXISD::Tex1DU32FloatGrad;
2782
5
2783
5
  case Intrinsic::nvvm_tex_1d_array_v4f32_s32:
2784
0
    return NVPTXISD::Tex1DArrayFloatS32;
2785
5
  case Intrinsic::nvvm_tex_1d_array_v4f32_f32:
2786
0
    return NVPTXISD::Tex1DArrayFloatFloat;
2787
5
  case Intrinsic::nvvm_tex_1d_array_level_v4f32_f32:
2788
0
    return NVPTXISD::Tex1DArrayFloatFloatLevel;
2789
5
  case Intrinsic::nvvm_tex_1d_array_grad_v4f32_f32:
2790
0
    return NVPTXISD::Tex1DArrayFloatFloatGrad;
2791
5
  case Intrinsic::nvvm_tex_1d_array_v4s32_s32:
2792
0
    return NVPTXISD::Tex1DArrayS32S32;
2793
5
  case Intrinsic::nvvm_tex_1d_array_v4s32_f32:
2794
0
    return NVPTXISD::Tex1DArrayS32Float;
2795
5
  case Intrinsic::nvvm_tex_1d_array_level_v4s32_f32:
2796
0
    return NVPTXISD::Tex1DArrayS32FloatLevel;
2797
5
  case Intrinsic::nvvm_tex_1d_array_grad_v4s32_f32:
2798
0
    return NVPTXISD::Tex1DArrayS32FloatGrad;
2799
5
  case Intrinsic::nvvm_tex_1d_array_v4u32_s32:
2800
0
    return NVPTXISD::Tex1DArrayU32S32;
2801
5
  case Intrinsic::nvvm_tex_1d_array_v4u32_f32:
2802
0
    return NVPTXISD::Tex1DArrayU32Float;
2803
5
  case Intrinsic::nvvm_tex_1d_array_level_v4u32_f32:
2804
0
    return NVPTXISD::Tex1DArrayU32FloatLevel;
2805
5
  case Intrinsic::nvvm_tex_1d_array_grad_v4u32_f32:
2806
0
    return NVPTXISD::Tex1DArrayU32FloatGrad;
2807
5
2808
5
  case Intrinsic::nvvm_tex_2d_v4f32_s32:
2809
0
    return NVPTXISD::Tex2DFloatS32;
2810
5
  case Intrinsic::nvvm_tex_2d_v4f32_f32:
2811
0
    return NVPTXISD::Tex2DFloatFloat;
2812
5
  case Intrinsic::nvvm_tex_2d_level_v4f32_f32:
2813
0
    return NVPTXISD::Tex2DFloatFloatLevel;
2814
5
  case Intrinsic::nvvm_tex_2d_grad_v4f32_f32:
2815
0
    return NVPTXISD::Tex2DFloatFloatGrad;
2816
5
  case Intrinsic::nvvm_tex_2d_v4s32_s32:
2817
0
    return NVPTXISD::Tex2DS32S32;
2818
5
  case Intrinsic::nvvm_tex_2d_v4s32_f32:
2819
0
    return NVPTXISD::Tex2DS32Float;
2820
5
  case Intrinsic::nvvm_tex_2d_level_v4s32_f32:
2821
0
    return NVPTXISD::Tex2DS32FloatLevel;
2822
5
  case Intrinsic::nvvm_tex_2d_grad_v4s32_f32:
2823
0
    return NVPTXISD::Tex2DS32FloatGrad;
2824
5
  case Intrinsic::nvvm_tex_2d_v4u32_s32:
2825
0
    return NVPTXISD::Tex2DU32S32;
2826
5
  case Intrinsic::nvvm_tex_2d_v4u32_f32:
2827
0
    return NVPTXISD::Tex2DU32Float;
2828
5
  case Intrinsic::nvvm_tex_2d_level_v4u32_f32:
2829
0
    return NVPTXISD::Tex2DU32FloatLevel;
2830
5
  case Intrinsic::nvvm_tex_2d_grad_v4u32_f32:
2831
0
    return NVPTXISD::Tex2DU32FloatGrad;
2832
5
2833
5
  case Intrinsic::nvvm_tex_2d_array_v4f32_s32:
2834
0
    return NVPTXISD::Tex2DArrayFloatS32;
2835
5
  case Intrinsic::nvvm_tex_2d_array_v4f32_f32:
2836
0
    return NVPTXISD::Tex2DArrayFloatFloat;
2837
5
  case Intrinsic::nvvm_tex_2d_array_level_v4f32_f32:
2838
0
    return NVPTXISD::Tex2DArrayFloatFloatLevel;
2839
5
  case Intrinsic::nvvm_tex_2d_array_grad_v4f32_f32:
2840
0
    return NVPTXISD::Tex2DArrayFloatFloatGrad;
2841
5
  case Intrinsic::nvvm_tex_2d_array_v4s32_s32:
2842
0
    return NVPTXISD::Tex2DArrayS32S32;
2843
5
  case Intrinsic::nvvm_tex_2d_array_v4s32_f32:
2844
0
    return NVPTXISD::Tex2DArrayS32Float;
2845
5
  case Intrinsic::nvvm_tex_2d_array_level_v4s32_f32:
2846
0
    return NVPTXISD::Tex2DArrayS32FloatLevel;
2847
5
  case Intrinsic::nvvm_tex_2d_array_grad_v4s32_f32:
2848
0
    return NVPTXISD::Tex2DArrayS32FloatGrad;
2849
5
  case Intrinsic::nvvm_tex_2d_array_v4u32_s32:
2850
0
    return NVPTXISD::Tex2DArrayU32S32;
2851
5
  case Intrinsic::nvvm_tex_2d_array_v4u32_f32:
2852
0
    return NVPTXISD::Tex2DArrayU32Float;
2853
5
  case Intrinsic::nvvm_tex_2d_array_level_v4u32_f32:
2854
0
    return NVPTXISD::Tex2DArrayU32FloatLevel;
2855
5
  case Intrinsic::nvvm_tex_2d_array_grad_v4u32_f32:
2856
0
    return NVPTXISD::Tex2DArrayU32FloatGrad;
2857
5
2858
5
  case Intrinsic::nvvm_tex_3d_v4f32_s32:
2859
0
    return NVPTXISD::Tex3DFloatS32;
2860
5
  case Intrinsic::nvvm_tex_3d_v4f32_f32:
2861
0
    return NVPTXISD::Tex3DFloatFloat;
2862
5
  case Intrinsic::nvvm_tex_3d_level_v4f32_f32:
2863
0
    return NVPTXISD::Tex3DFloatFloatLevel;
2864
5
  case Intrinsic::nvvm_tex_3d_grad_v4f32_f32:
2865
0
    return NVPTXISD::Tex3DFloatFloatGrad;
2866
5
  case Intrinsic::nvvm_tex_3d_v4s32_s32:
2867
0
    return NVPTXISD::Tex3DS32S32;
2868
5
  case Intrinsic::nvvm_tex_3d_v4s32_f32:
2869
0
    return NVPTXISD::Tex3DS32Float;
2870
5
  case Intrinsic::nvvm_tex_3d_level_v4s32_f32:
2871
0
    return NVPTXISD::Tex3DS32FloatLevel;
2872
5
  case Intrinsic::nvvm_tex_3d_grad_v4s32_f32:
2873
0
    return NVPTXISD::Tex3DS32FloatGrad;
2874
5
  case Intrinsic::nvvm_tex_3d_v4u32_s32:
2875
0
    return NVPTXISD::Tex3DU32S32;
2876
5
  case Intrinsic::nvvm_tex_3d_v4u32_f32:
2877
0
    return NVPTXISD::Tex3DU32Float;
2878
5
  case Intrinsic::nvvm_tex_3d_level_v4u32_f32:
2879
0
    return NVPTXISD::Tex3DU32FloatLevel;
2880
5
  case Intrinsic::nvvm_tex_3d_grad_v4u32_f32:
2881
0
    return NVPTXISD::Tex3DU32FloatGrad;
2882
5
2883
5
  case Intrinsic::nvvm_tex_cube_v4f32_f32:
2884
0
    return NVPTXISD::TexCubeFloatFloat;
2885
5
  case Intrinsic::nvvm_tex_cube_level_v4f32_f32:
2886
0
    return NVPTXISD::TexCubeFloatFloatLevel;
2887
5
  case Intrinsic::nvvm_tex_cube_v4s32_f32:
2888
0
    return NVPTXISD::TexCubeS32Float;
2889
5
  case Intrinsic::nvvm_tex_cube_level_v4s32_f32:
2890
0
    return NVPTXISD::TexCubeS32FloatLevel;
2891
5
  case Intrinsic::nvvm_tex_cube_v4u32_f32:
2892
0
    return NVPTXISD::TexCubeU32Float;
2893
5
  case Intrinsic::nvvm_tex_cube_level_v4u32_f32:
2894
0
    return NVPTXISD::TexCubeU32FloatLevel;
2895
5
2896
5
  case Intrinsic::nvvm_tex_cube_array_v4f32_f32:
2897
0
    return NVPTXISD::TexCubeArrayFloatFloat;
2898
5
  case Intrinsic::nvvm_tex_cube_array_level_v4f32_f32:
2899
0
    return NVPTXISD::TexCubeArrayFloatFloatLevel;
2900
5
  case Intrinsic::nvvm_tex_cube_array_v4s32_f32:
2901
0
    return NVPTXISD::TexCubeArrayS32Float;
2902
5
  case Intrinsic::nvvm_tex_cube_array_level_v4s32_f32:
2903
0
    return NVPTXISD::TexCubeArrayS32FloatLevel;
2904
5
  case Intrinsic::nvvm_tex_cube_array_v4u32_f32:
2905
0
    return NVPTXISD::TexCubeArrayU32Float;
2906
5
  case Intrinsic::nvvm_tex_cube_array_level_v4u32_f32:
2907
0
    return NVPTXISD::TexCubeArrayU32FloatLevel;
2908
5
2909
5
  case Intrinsic::nvvm_tld4_r_2d_v4f32_f32:
2910
0
    return NVPTXISD::Tld4R2DFloatFloat;
2911
5
  case Intrinsic::nvvm_tld4_g_2d_v4f32_f32:
2912
0
    return NVPTXISD::Tld4G2DFloatFloat;
2913
5
  case Intrinsic::nvvm_tld4_b_2d_v4f32_f32:
2914
0
    return NVPTXISD::Tld4B2DFloatFloat;
2915
5
  case Intrinsic::nvvm_tld4_a_2d_v4f32_f32:
2916
0
    return NVPTXISD::Tld4A2DFloatFloat;
2917
5
  case Intrinsic::nvvm_tld4_r_2d_v4s32_f32:
2918
0
    return NVPTXISD::Tld4R2DS64Float;
2919
5
  case Intrinsic::nvvm_tld4_g_2d_v4s32_f32:
2920
0
    return NVPTXISD::Tld4G2DS64Float;
2921
5
  case Intrinsic::nvvm_tld4_b_2d_v4s32_f32:
2922
0
    return NVPTXISD::Tld4B2DS64Float;
2923
5
  case Intrinsic::nvvm_tld4_a_2d_v4s32_f32:
2924
0
    return NVPTXISD::Tld4A2DS64Float;
2925
5
  case Intrinsic::nvvm_tld4_r_2d_v4u32_f32:
2926
0
    return NVPTXISD::Tld4R2DU64Float;
2927
5
  case Intrinsic::nvvm_tld4_g_2d_v4u32_f32:
2928
0
    return NVPTXISD::Tld4G2DU64Float;
2929
5
  case Intrinsic::nvvm_tld4_b_2d_v4u32_f32:
2930
0
    return NVPTXISD::Tld4B2DU64Float;
2931
5
  case Intrinsic::nvvm_tld4_a_2d_v4u32_f32:
2932
0
    return NVPTXISD::Tld4A2DU64Float;
2933
5
2934
5
  case Intrinsic::nvvm_tex_unified_1d_v4f32_s32:
2935
4
    return NVPTXISD::TexUnified1DFloatS32;
2936
5
  case Intrinsic::nvvm_tex_unified_1d_v4f32_f32:
2937
0
    return NVPTXISD::TexUnified1DFloatFloat;
2938
5
  case Intrinsic::nvvm_tex_unified_1d_level_v4f32_f32:
2939
0
    return NVPTXISD::TexUnified1DFloatFloatLevel;
2940
5
  case Intrinsic::nvvm_tex_unified_1d_grad_v4f32_f32:
2941
0
    return NVPTXISD::TexUnified1DFloatFloatGrad;
2942
5
  case Intrinsic::nvvm_tex_unified_1d_v4s32_s32:
2943
0
    return NVPTXISD::TexUnified1DS32S32;
2944
5
  case Intrinsic::nvvm_tex_unified_1d_v4s32_f32:
2945
0
    return NVPTXISD::TexUnified1DS32Float;
2946
5
  case Intrinsic::nvvm_tex_unified_1d_level_v4s32_f32:
2947
0
    return NVPTXISD::TexUnified1DS32FloatLevel;
2948
5
  case Intrinsic::nvvm_tex_unified_1d_grad_v4s32_f32:
2949
0
    return NVPTXISD::TexUnified1DS32FloatGrad;
2950
5
  case Intrinsic::nvvm_tex_unified_1d_v4u32_s32:
2951
0
    return NVPTXISD::TexUnified1DU32S32;
2952
5
  case Intrinsic::nvvm_tex_unified_1d_v4u32_f32:
2953
0
    return NVPTXISD::TexUnified1DU32Float;
2954
5
  case Intrinsic::nvvm_tex_unified_1d_level_v4u32_f32:
2955
0
    return NVPTXISD::TexUnified1DU32FloatLevel;
2956
5
  case Intrinsic::nvvm_tex_unified_1d_grad_v4u32_f32:
2957
0
    return NVPTXISD::TexUnified1DU32FloatGrad;
2958
5
2959
5
  case Intrinsic::nvvm_tex_unified_1d_array_v4f32_s32:
2960
0
    return NVPTXISD::TexUnified1DArrayFloatS32;
2961
5
  case Intrinsic::nvvm_tex_unified_1d_array_v4f32_f32:
2962
0
    return NVPTXISD::TexUnified1DArrayFloatFloat;
2963
5
  case Intrinsic::nvvm_tex_unified_1d_array_level_v4f32_f32:
2964
0
    return NVPTXISD::TexUnified1DArrayFloatFloatLevel;
2965
5
  case Intrinsic::nvvm_tex_unified_1d_array_grad_v4f32_f32:
2966
0
    return NVPTXISD::TexUnified1DArrayFloatFloatGrad;
2967
5
  case Intrinsic::nvvm_tex_unified_1d_array_v4s32_s32:
2968
0
    return NVPTXISD::TexUnified1DArrayS32S32;
2969
5
  case Intrinsic::nvvm_tex_unified_1d_array_v4s32_f32:
2970
0
    return NVPTXISD::TexUnified1DArrayS32Float;
2971
5
  case Intrinsic::nvvm_tex_unified_1d_array_level_v4s32_f32:
2972
0
    return NVPTXISD::TexUnified1DArrayS32FloatLevel;
2973
5
  case Intrinsic::nvvm_tex_unified_1d_array_grad_v4s32_f32:
2974
0
    return NVPTXISD::TexUnified1DArrayS32FloatGrad;
2975
5
  case Intrinsic::nvvm_tex_unified_1d_array_v4u32_s32:
2976
0
    return NVPTXISD::TexUnified1DArrayU32S32;
2977
5
  case Intrinsic::nvvm_tex_unified_1d_array_v4u32_f32:
2978
0
    return NVPTXISD::TexUnified1DArrayU32Float;
2979
5
  case Intrinsic::nvvm_tex_unified_1d_array_level_v4u32_f32:
2980
0
    return NVPTXISD::TexUnified1DArrayU32FloatLevel;
2981
5
  case Intrinsic::nvvm_tex_unified_1d_array_grad_v4u32_f32:
2982
0
    return NVPTXISD::TexUnified1DArrayU32FloatGrad;
2983
5
2984
5
  case Intrinsic::nvvm_tex_unified_2d_v4f32_s32:
2985
0
    return NVPTXISD::TexUnified2DFloatS32;
2986
5
  case Intrinsic::nvvm_tex_unified_2d_v4f32_f32:
2987
0
    return NVPTXISD::TexUnified2DFloatFloat;
2988
5
  case Intrinsic::nvvm_tex_unified_2d_level_v4f32_f32:
2989
0
    return NVPTXISD::TexUnified2DFloatFloatLevel;
2990
5
  case Intrinsic::nvvm_tex_unified_2d_grad_v4f32_f32:
2991
0
    return NVPTXISD::TexUnified2DFloatFloatGrad;
2992
5
  case Intrinsic::nvvm_tex_unified_2d_v4s32_s32:
2993
0
    return NVPTXISD::TexUnified2DS32S32;
2994
5
  case Intrinsic::nvvm_tex_unified_2d_v4s32_f32:
2995
0
    return NVPTXISD::TexUnified2DS32Float;
2996
5
  case Intrinsic::nvvm_tex_unified_2d_level_v4s32_f32:
2997
0
    return NVPTXISD::TexUnified2DS32FloatLevel;
2998
5
  case Intrinsic::nvvm_tex_unified_2d_grad_v4s32_f32:
2999
0
    return NVPTXISD::TexUnified2DS32FloatGrad;
3000
5
  case Intrinsic::nvvm_tex_unified_2d_v4u32_s32:
3001
0
    return NVPTXISD::TexUnified2DU32S32;
3002
5
  case Intrinsic::nvvm_tex_unified_2d_v4u32_f32:
3003
0
    return NVPTXISD::TexUnified2DU32Float;
3004
5
  case Intrinsic::nvvm_tex_unified_2d_level_v4u32_f32:
3005
0
    return NVPTXISD::TexUnified2DU32FloatLevel;
3006
5
  case Intrinsic::nvvm_tex_unified_2d_grad_v4u32_f32:
3007
0
    return NVPTXISD::TexUnified2DU32FloatGrad;
3008
5
3009
5
  case Intrinsic::nvvm_tex_unified_2d_array_v4f32_s32:
3010
0
    return NVPTXISD::TexUnified2DArrayFloatS32;
3011
5
  case Intrinsic::nvvm_tex_unified_2d_array_v4f32_f32:
3012
0
    return NVPTXISD::TexUnified2DArrayFloatFloat;
3013
5
  case Intrinsic::nvvm_tex_unified_2d_array_level_v4f32_f32:
3014
0
    return NVPTXISD::TexUnified2DArrayFloatFloatLevel;
3015
5
  case Intrinsic::nvvm_tex_unified_2d_array_grad_v4f32_f32:
3016
0
    return NVPTXISD::TexUnified2DArrayFloatFloatGrad;
3017
5
  case Intrinsic::nvvm_tex_unified_2d_array_v4s32_s32:
3018
0
    return NVPTXISD::TexUnified2DArrayS32S32;
3019
5
  case Intrinsic::nvvm_tex_unified_2d_array_v4s32_f32:
3020
0
    return NVPTXISD::TexUnified2DArrayS32Float;
3021
5
  case Intrinsic::nvvm_tex_unified_2d_array_level_v4s32_f32:
3022
0
    return NVPTXISD::TexUnified2DArrayS32FloatLevel;
3023
5
  case Intrinsic::nvvm_tex_unified_2d_array_grad_v4s32_f32:
3024
0
    return NVPTXISD::TexUnified2DArrayS32FloatGrad;
3025
5
  case Intrinsic::nvvm_tex_unified_2d_array_v4u32_s32:
3026
0
    return NVPTXISD::TexUnified2DArrayU32S32;
3027
5
  case Intrinsic::nvvm_tex_unified_2d_array_v4u32_f32:
3028
0
    return NVPTXISD::TexUnified2DArrayU32Float;
3029
5
  case Intrinsic::nvvm_tex_unified_2d_array_level_v4u32_f32:
3030
0
    return NVPTXISD::TexUnified2DArrayU32FloatLevel;
3031
5
  case Intrinsic::nvvm_tex_unified_2d_array_grad_v4u32_f32:
3032
0
    return NVPTXISD::TexUnified2DArrayU32FloatGrad;
3033
5
3034
5
  case Intrinsic::nvvm_tex_unified_3d_v4f32_s32:
3035
0
    return NVPTXISD::TexUnified3DFloatS32;
3036
5
  case Intrinsic::nvvm_tex_unified_3d_v4f32_f32:
3037
0
    return NVPTXISD::TexUnified3DFloatFloat;
3038
5
  case Intrinsic::nvvm_tex_unified_3d_level_v4f32_f32:
3039
0
    return NVPTXISD::TexUnified3DFloatFloatLevel;
3040
5
  case Intrinsic::nvvm_tex_unified_3d_grad_v4f32_f32:
3041
0
    return NVPTXISD::TexUnified3DFloatFloatGrad;
3042
5
  case Intrinsic::nvvm_tex_unified_3d_v4s32_s32:
3043
0
    return NVPTXISD::TexUnified3DS32S32;
3044
5
  case Intrinsic::nvvm_tex_unified_3d_v4s32_f32:
3045
0
    return NVPTXISD::TexUnified3DS32Float;
3046
5
  case Intrinsic::nvvm_tex_unified_3d_level_v4s32_f32:
3047
0
    return NVPTXISD::TexUnified3DS32FloatLevel;
3048
5
  case Intrinsic::nvvm_tex_unified_3d_grad_v4s32_f32:
3049
0
    return NVPTXISD::TexUnified3DS32FloatGrad;
3050
5
  case Intrinsic::nvvm_tex_unified_3d_v4u32_s32:
3051
0
    return NVPTXISD::TexUnified3DU32S32;
3052
5
  case Intrinsic::nvvm_tex_unified_3d_v4u32_f32:
3053
0
    return NVPTXISD::TexUnified3DU32Float;
3054
5
  case Intrinsic::nvvm_tex_unified_3d_level_v4u32_f32:
3055
0
    return NVPTXISD::TexUnified3DU32FloatLevel;
3056
5
  case Intrinsic::nvvm_tex_unified_3d_grad_v4u32_f32:
3057
0
    return NVPTXISD::TexUnified3DU32FloatGrad;
3058
5
3059
5
  case Intrinsic::nvvm_tex_unified_cube_v4f32_f32:
3060
0
    return NVPTXISD::TexUnifiedCubeFloatFloat;
3061
5
  case Intrinsic::nvvm_tex_unified_cube_level_v4f32_f32:
3062
0
    return NVPTXISD::TexUnifiedCubeFloatFloatLevel;
3063
5
  case Intrinsic::nvvm_tex_unified_cube_v4s32_f32:
3064
0
    return NVPTXISD::TexUnifiedCubeS32Float;
3065
5
  case Intrinsic::nvvm_tex_unified_cube_level_v4s32_f32:
3066
0
    return NVPTXISD::TexUnifiedCubeS32FloatLevel;
3067
5
  case Intrinsic::nvvm_tex_unified_cube_v4u32_f32:
3068
0
    return NVPTXISD::TexUnifiedCubeU32Float;
3069
5
  case Intrinsic::nvvm_tex_unified_cube_level_v4u32_f32:
3070
0
    return NVPTXISD::TexUnifiedCubeU32FloatLevel;
3071
5
3072
5
  case Intrinsic::nvvm_tex_unified_cube_array_v4f32_f32:
3073
0
    return NVPTXISD::TexUnifiedCubeArrayFloatFloat;
3074
5
  case Intrinsic::nvvm_tex_unified_cube_array_level_v4f32_f32:
3075
0
    return NVPTXISD::TexUnifiedCubeArrayFloatFloatLevel;
3076
5
  case Intrinsic::nvvm_tex_unified_cube_array_v4s32_f32:
3077
0
    return NVPTXISD::TexUnifiedCubeArrayS32Float;
3078
5
  case Intrinsic::nvvm_tex_unified_cube_array_level_v4s32_f32:
3079
0
    return NVPTXISD::TexUnifiedCubeArrayS32FloatLevel;
3080
5
  case Intrinsic::nvvm_tex_unified_cube_array_v4u32_f32:
3081
0
    return NVPTXISD::TexUnifiedCubeArrayU32Float;
3082
5
  case Intrinsic::nvvm_tex_unified_cube_array_level_v4u32_f32:
3083
0
    return NVPTXISD::TexUnifiedCubeArrayU32FloatLevel;
3084
5
3085
5
  case Intrinsic::nvvm_tld4_unified_r_2d_v4f32_f32:
3086
0
    return NVPTXISD::Tld4UnifiedR2DFloatFloat;
3087
5
  case Intrinsic::nvvm_tld4_unified_g_2d_v4f32_f32:
3088
0
    return NVPTXISD::Tld4UnifiedG2DFloatFloat;
3089
5
  case Intrinsic::nvvm_tld4_unified_b_2d_v4f32_f32:
3090
0
    return NVPTXISD::Tld4UnifiedB2DFloatFloat;
3091
5
  case Intrinsic::nvvm_tld4_unified_a_2d_v4f32_f32:
3092
0
    return NVPTXISD::Tld4UnifiedA2DFloatFloat;
3093
5
  case Intrinsic::nvvm_tld4_unified_r_2d_v4s32_f32:
3094
0
    return NVPTXISD::Tld4UnifiedR2DS64Float;
3095
5
  case Intrinsic::nvvm_tld4_unified_g_2d_v4s32_f32:
3096
0
    return NVPTXISD::Tld4UnifiedG2DS64Float;
3097
5
  case Intrinsic::nvvm_tld4_unified_b_2d_v4s32_f32:
3098
0
    return NVPTXISD::Tld4UnifiedB2DS64Float;
3099
5
  case Intrinsic::nvvm_tld4_unified_a_2d_v4s32_f32:
3100
0
    return NVPTXISD::Tld4UnifiedA2DS64Float;
3101
5
  case Intrinsic::nvvm_tld4_unified_r_2d_v4u32_f32:
3102
0
    return NVPTXISD::Tld4UnifiedR2DU64Float;
3103
5
  case Intrinsic::nvvm_tld4_unified_g_2d_v4u32_f32:
3104
0
    return NVPTXISD::Tld4UnifiedG2DU64Float;
3105
5
  case Intrinsic::nvvm_tld4_unified_b_2d_v4u32_f32:
3106
0
    return NVPTXISD::Tld4UnifiedB2DU64Float;
3107
5
  case Intrinsic::nvvm_tld4_unified_a_2d_v4u32_f32:
3108
0
    return NVPTXISD::Tld4UnifiedA2DU64Float;
3109
5
  }
3110
5
}
3111
3112
5
static unsigned getOpcForSurfaceInstr(unsigned Intrinsic) {
3113
5
  switch (Intrinsic) {
3114
5
  default:
3115
0
    return 0;
3116
5
  case Intrinsic::nvvm_suld_1d_i8_clamp:
3117
0
    return NVPTXISD::Suld1DI8Clamp;
3118
5
  case Intrinsic::nvvm_suld_1d_i16_clamp:
3119
0
    return NVPTXISD::Suld1DI16Clamp;
3120
5
  case Intrinsic::nvvm_suld_1d_i32_clamp:
3121
0
    return NVPTXISD::Suld1DI32Clamp;
3122
5
  case Intrinsic::nvvm_suld_1d_i64_clamp:
3123
0
    return NVPTXISD::Suld1DI64Clamp;
3124
5
  case Intrinsic::nvvm_suld_1d_v2i8_clamp:
3125
0
    return NVPTXISD::Suld1DV2I8Clamp;
3126
5
  case Intrinsic::nvvm_suld_1d_v2i16_clamp:
3127
0
    return NVPTXISD::Suld1DV2I16Clamp;
3128
5
  case Intrinsic::nvvm_suld_1d_v2i32_clamp:
3129
0
    return NVPTXISD::Suld1DV2I32Clamp;
3130
5
  case Intrinsic::nvvm_suld_1d_v2i64_clamp:
3131
0
    return NVPTXISD::Suld1DV2I64Clamp;
3132
5
  case Intrinsic::nvvm_suld_1d_v4i8_clamp:
3133
0
    return NVPTXISD::Suld1DV4I8Clamp;
3134
5
  case Intrinsic::nvvm_suld_1d_v4i16_clamp:
3135
0
    return NVPTXISD::Suld1DV4I16Clamp;
3136
5
  case Intrinsic::nvvm_suld_1d_v4i32_clamp:
3137
0
    return NVPTXISD::Suld1DV4I32Clamp;
3138
5
  case Intrinsic::nvvm_suld_1d_array_i8_clamp:
3139
0
    return NVPTXISD::Suld1DArrayI8Clamp;
3140
5
  case Intrinsic::nvvm_suld_1d_array_i16_clamp:
3141
0
    return NVPTXISD::Suld1DArrayI16Clamp;
3142
5
  case Intrinsic::nvvm_suld_1d_array_i32_clamp:
3143
0
    return NVPTXISD::Suld1DArrayI32Clamp;
3144
5
  case Intrinsic::nvvm_suld_1d_array_i64_clamp:
3145
0
    return NVPTXISD::Suld1DArrayI64Clamp;
3146
5
  case Intrinsic::nvvm_suld_1d_array_v2i8_clamp:
3147
0
    return NVPTXISD::Suld1DArrayV2I8Clamp;
3148
5
  case Intrinsic::nvvm_suld_1d_array_v2i16_clamp:
3149
0
    return NVPTXISD::Suld1DArrayV2I16Clamp;
3150
5
  case Intrinsic::nvvm_suld_1d_array_v2i32_clamp:
3151
0
    return NVPTXISD::Suld1DArrayV2I32Clamp;
3152
5
  case Intrinsic::nvvm_suld_1d_array_v2i64_clamp:
3153
0
    return NVPTXISD::Suld1DArrayV2I64Clamp;
3154
5
  case Intrinsic::nvvm_suld_1d_array_v4i8_clamp:
3155
0
    return NVPTXISD::Suld1DArrayV4I8Clamp;
3156
5
  case Intrinsic::nvvm_suld_1d_array_v4i16_clamp:
3157
0
    return NVPTXISD::Suld1DArrayV4I16Clamp;
3158
5
  case Intrinsic::nvvm_suld_1d_array_v4i32_clamp:
3159
0
    return NVPTXISD::Suld1DArrayV4I32Clamp;
3160
5
  case Intrinsic::nvvm_suld_2d_i8_clamp:
3161
0
    return NVPTXISD::Suld2DI8Clamp;
3162
5
  case Intrinsic::nvvm_suld_2d_i16_clamp:
3163
0
    return NVPTXISD::Suld2DI16Clamp;
3164
5
  case Intrinsic::nvvm_suld_2d_i32_clamp:
3165
0
    return NVPTXISD::Suld2DI32Clamp;
3166
5
  case Intrinsic::nvvm_suld_2d_i64_clamp:
3167
0
    return NVPTXISD::Suld2DI64Clamp;
3168
5
  case Intrinsic::nvvm_suld_2d_v2i8_clamp:
3169
0
    return NVPTXISD::Suld2DV2I8Clamp;
3170
5
  case Intrinsic::nvvm_suld_2d_v2i16_clamp:
3171
0
    return NVPTXISD::Suld2DV2I16Clamp;
3172
5
  case Intrinsic::nvvm_suld_2d_v2i32_clamp:
3173
0
    return NVPTXISD::Suld2DV2I32Clamp;
3174
5
  case Intrinsic::nvvm_suld_2d_v2i64_clamp:
3175
0
    return NVPTXISD::Suld2DV2I64Clamp;
3176
5
  case Intrinsic::nvvm_suld_2d_v4i8_clamp:
3177
0
    return NVPTXISD::Suld2DV4I8Clamp;
3178
5
  case Intrinsic::nvvm_suld_2d_v4i16_clamp:
3179
0
    return NVPTXISD::Suld2DV4I16Clamp;
3180
5
  case Intrinsic::nvvm_suld_2d_v4i32_clamp:
3181
0
    return NVPTXISD::Suld2DV4I32Clamp;
3182
5
  case Intrinsic::nvvm_suld_2d_array_i8_clamp:
3183
0
    return NVPTXISD::Suld2DArrayI8Clamp;
3184
5
  case Intrinsic::nvvm_suld_2d_array_i16_clamp:
3185
0
    return NVPTXISD::Suld2DArrayI16Clamp;
3186
5
  case Intrinsic::nvvm_suld_2d_array_i32_clamp:
3187
0
    return NVPTXISD::Suld2DArrayI32Clamp;
3188
5
  case Intrinsic::nvvm_suld_2d_array_i64_clamp:
3189
0
    return NVPTXISD::Suld2DArrayI64Clamp;
3190
5
  case Intrinsic::nvvm_suld_2d_array_v2i8_clamp:
3191
0
    return NVPTXISD::Suld2DArrayV2I8Clamp;
3192
5
  case Intrinsic::nvvm_suld_2d_array_v2i16_clamp:
3193
0
    return NVPTXISD::Suld2DArrayV2I16Clamp;
3194
5
  case Intrinsic::nvvm_suld_2d_array_v2i32_clamp:
3195
0
    return NVPTXISD::Suld2DArrayV2I32Clamp;
3196
5
  case Intrinsic::nvvm_suld_2d_array_v2i64_clamp:
3197
0
    return NVPTXISD::Suld2DArrayV2I64Clamp;
3198
5
  case Intrinsic::nvvm_suld_2d_array_v4i8_clamp:
3199
0
    return NVPTXISD::Suld2DArrayV4I8Clamp;
3200
5
  case Intrinsic::nvvm_suld_2d_array_v4i16_clamp:
3201
0
    return NVPTXISD::Suld2DArrayV4I16Clamp;
3202
5
  case Intrinsic::nvvm_suld_2d_array_v4i32_clamp:
3203
0
    return NVPTXISD::Suld2DArrayV4I32Clamp;
3204
5
  case Intrinsic::nvvm_suld_3d_i8_clamp:
3205
0
    return NVPTXISD::Suld3DI8Clamp;
3206
5
  case Intrinsic::nvvm_suld_3d_i16_clamp:
3207
0
    return NVPTXISD::Suld3DI16Clamp;
3208
5
  case Intrinsic::nvvm_suld_3d_i32_clamp:
3209
0
    return NVPTXISD::Suld3DI32Clamp;
3210
5
  case Intrinsic::nvvm_suld_3d_i64_clamp:
3211
0
    return NVPTXISD::Suld3DI64Clamp;
3212
5
  case Intrinsic::nvvm_suld_3d_v2i8_clamp:
3213
0
    return NVPTXISD::Suld3DV2I8Clamp;
3214
5
  case Intrinsic::nvvm_suld_3d_v2i16_clamp:
3215
0
    return NVPTXISD::Suld3DV2I16Clamp;
3216
5
  case Intrinsic::nvvm_suld_3d_v2i32_clamp:
3217
0
    return NVPTXISD::Suld3DV2I32Clamp;
3218
5
  case Intrinsic::nvvm_suld_3d_v2i64_clamp:
3219
0
    return NVPTXISD::Suld3DV2I64Clamp;
3220
5
  case Intrinsic::nvvm_suld_3d_v4i8_clamp:
3221
0
    return NVPTXISD::Suld3DV4I8Clamp;
3222
5
  case Intrinsic::nvvm_suld_3d_v4i16_clamp:
3223
0
    return NVPTXISD::Suld3DV4I16Clamp;
3224
5
  case Intrinsic::nvvm_suld_3d_v4i32_clamp:
3225
0
    return NVPTXISD::Suld3DV4I32Clamp;
3226
5
  case Intrinsic::nvvm_suld_1d_i8_trap:
3227
0
    return NVPTXISD::Suld1DI8Trap;
3228
5
  case Intrinsic::nvvm_suld_1d_i16_trap:
3229
0
    return NVPTXISD::Suld1DI16Trap;
3230
5
  case Intrinsic::nvvm_suld_1d_i32_trap:
3231
5
    return NVPTXISD::Suld1DI32Trap;
3232
5
  case Intrinsic::nvvm_suld_1d_i64_trap:
3233
0
    return NVPTXISD::Suld1DI64Trap;
3234
5
  case Intrinsic::nvvm_suld_1d_v2i8_trap:
3235
0
    return NVPTXISD::Suld1DV2I8Trap;
3236
5
  case Intrinsic::nvvm_suld_1d_v2i16_trap:
3237
0
    return NVPTXISD::Suld1DV2I16Trap;
3238
5
  case Intrinsic::nvvm_suld_1d_v2i32_trap:
3239
0
    return NVPTXISD::Suld1DV2I32Trap;
3240
5
  case Intrinsic::nvvm_suld_1d_v2i64_trap:
3241
0
    return NVPTXISD::Suld1DV2I64Trap;
3242
5
  case Intrinsic::nvvm_suld_1d_v4i8_trap:
3243
0
    return NVPTXISD::Suld1DV4I8Trap;
3244
5
  case Intrinsic::nvvm_suld_1d_v4i16_trap:
3245
0
    return NVPTXISD::Suld1DV4I16Trap;
3246
5
  case Intrinsic::nvvm_suld_1d_v4i32_trap:
3247
0
    return NVPTXISD::Suld1DV4I32Trap;
3248
5
  case Intrinsic::nvvm_suld_1d_array_i8_trap:
3249
0
    return NVPTXISD::Suld1DArrayI8Trap;
3250
5
  case Intrinsic::nvvm_suld_1d_array_i16_trap:
3251
0
    return NVPTXISD::Suld1DArrayI16Trap;
3252
5
  case Intrinsic::nvvm_suld_1d_array_i32_trap:
3253
0
    return NVPTXISD::Suld1DArrayI32Trap;
3254
5
  case Intrinsic::nvvm_suld_1d_array_i64_trap:
3255
0
    return NVPTXISD::Suld1DArrayI64Trap;
3256
5
  case Intrinsic::nvvm_suld_1d_array_v2i8_trap:
3257
0
    return NVPTXISD::Suld1DArrayV2I8Trap;
3258
5
  case Intrinsic::nvvm_suld_1d_array_v2i16_trap:
3259
0
    return NVPTXISD::Suld1DArrayV2I16Trap;
3260
5
  case Intrinsic::nvvm_suld_1d_array_v2i32_trap:
3261
0
    return NVPTXISD::Suld1DArrayV2I32Trap;
3262
5
  case Intrinsic::nvvm_suld_1d_array_v2i64_trap:
3263
0
    return NVPTXISD::Suld1DArrayV2I64Trap;
3264
5
  case Intrinsic::nvvm_suld_1d_array_v4i8_trap:
3265
0
    return NVPTXISD::Suld1DArrayV4I8Trap;
3266
5
  case Intrinsic::nvvm_suld_1d_array_v4i16_trap:
3267
0
    return NVPTXISD::Suld1DArrayV4I16Trap;
3268
5
  case Intrinsic::nvvm_suld_1d_array_v4i32_trap:
3269
0
    return NVPTXISD::Suld1DArrayV4I32Trap;
3270
5
  case Intrinsic::nvvm_suld_2d_i8_trap:
3271
0
    return NVPTXISD::Suld2DI8Trap;
3272
5
  case Intrinsic::nvvm_suld_2d_i16_trap:
3273
0
    return NVPTXISD::Suld2DI16Trap;
3274
5
  case Intrinsic::nvvm_suld_2d_i32_trap:
3275
0
    return NVPTXISD::Suld2DI32Trap;
3276
5
  case Intrinsic::nvvm_suld_2d_i64_trap:
3277
0
    return NVPTXISD::Suld2DI64Trap;
3278
5
  case Intrinsic::nvvm_suld_2d_v2i8_trap:
3279
0
    return NVPTXISD::Suld2DV2I8Trap;
3280
5
  case Intrinsic::nvvm_suld_2d_v2i16_trap:
3281
0
    return NVPTXISD::Suld2DV2I16Trap;
3282
5
  case Intrinsic::nvvm_suld_2d_v2i32_trap:
3283
0
    return NVPTXISD::Suld2DV2I32Trap;
3284
5
  case Intrinsic::nvvm_suld_2d_v2i64_trap:
3285
0
    return NVPTXISD::Suld2DV2I64Trap;
3286
5
  case Intrinsic::nvvm_suld_2d_v4i8_trap:
3287
0
    return NVPTXISD::Suld2DV4I8Trap;
3288
5
  case Intrinsic::nvvm_suld_2d_v4i16_trap:
3289
0
    return NVPTXISD::Suld2DV4I16Trap;
3290
5
  case Intrinsic::nvvm_suld_2d_v4i32_trap:
3291
0
    return NVPTXISD::Suld2DV4I32Trap;
3292
5
  case Intrinsic::nvvm_suld_2d_array_i8_trap:
3293
0
    return NVPTXISD::Suld2DArrayI8Trap;
3294
5
  case Intrinsic::nvvm_suld_2d_array_i16_trap:
3295
0
    return NVPTXISD::Suld2DArrayI16Trap;
3296
5
  case Intrinsic::nvvm_suld_2d_array_i32_trap:
3297
0
    return NVPTXISD::Suld2DArrayI32Trap;
3298
5
  case Intrinsic::nvvm_suld_2d_array_i64_trap:
3299
0
    return NVPTXISD::Suld2DArrayI64Trap;
3300
5
  case Intrinsic::nvvm_suld_2d_array_v2i8_trap:
3301
0
    return NVPTXISD::Suld2DArrayV2I8Trap;
3302
5
  case Intrinsic::nvvm_suld_2d_array_v2i16_trap:
3303
0
    return NVPTXISD::Suld2DArrayV2I16Trap;
3304
5
  case Intrinsic::nvvm_suld_2d_array_v2i32_trap:
3305
0
    return NVPTXISD::Suld2DArrayV2I32Trap;
3306
5
  case Intrinsic::nvvm_suld_2d_array_v2i64_trap:
3307
0
    return NVPTXISD::Suld2DArrayV2I64Trap;
3308
5
  case Intrinsic::nvvm_suld_2d_array_v4i8_trap:
3309
0
    return NVPTXISD::Suld2DArrayV4I8Trap;
3310
5
  case Intrinsic::nvvm_suld_2d_array_v4i16_trap:
3311
0
    return NVPTXISD::Suld2DArrayV4I16Trap;
3312
5
  case Intrinsic::nvvm_suld_2d_array_v4i32_trap:
3313
0
    return NVPTXISD::Suld2DArrayV4I32Trap;
3314
5
  case Intrinsic::nvvm_suld_3d_i8_trap:
3315
0
    return NVPTXISD::Suld3DI8Trap;
3316
5
  case Intrinsic::nvvm_suld_3d_i16_trap:
3317
0
    return NVPTXISD::Suld3DI16Trap;
3318
5
  case Intrinsic::nvvm_suld_3d_i32_trap:
3319
0
    return NVPTXISD::Suld3DI32Trap;
3320
5
  case Intrinsic::nvvm_suld_3d_i64_trap:
3321
0
    return NVPTXISD::Suld3DI64Trap;
3322
5
  case Intrinsic::nvvm_suld_3d_v2i8_trap:
3323
0
    return NVPTXISD::Suld3DV2I8Trap;
3324
5
  case Intrinsic::nvvm_suld_3d_v2i16_trap:
3325
0
    return NVPTXISD::Suld3DV2I16Trap;
3326
5
  case Intrinsic::nvvm_suld_3d_v2i32_trap:
3327
0
    return NVPTXISD::Suld3DV2I32Trap;
3328
5
  case Intrinsic::nvvm_suld_3d_v2i64_trap:
3329
0
    return NVPTXISD::Suld3DV2I64Trap;
3330
5
  case Intrinsic::nvvm_suld_3d_v4i8_trap:
3331
0
    return NVPTXISD::Suld3DV4I8Trap;
3332
5
  case Intrinsic::nvvm_suld_3d_v4i16_trap:
3333
0
    return NVPTXISD::Suld3DV4I16Trap;
3334
5
  case Intrinsic::nvvm_suld_3d_v4i32_trap:
3335
0
    return NVPTXISD::Suld3DV4I32Trap;
3336
5
  case Intrinsic::nvvm_suld_1d_i8_zero:
3337
0
    return NVPTXISD::Suld1DI8Zero;
3338
5
  case Intrinsic::nvvm_suld_1d_i16_zero:
3339
0
    return NVPTXISD::Suld1DI16Zero;
3340
5
  case Intrinsic::nvvm_suld_1d_i32_zero:
3341
0
    return NVPTXISD::Suld1DI32Zero;
3342
5
  case Intrinsic::nvvm_suld_1d_i64_zero:
3343
0
    return NVPTXISD::Suld1DI64Zero;
3344
5
  case Intrinsic::nvvm_suld_1d_v2i8_zero:
3345
0
    return NVPTXISD::Suld1DV2I8Zero;
3346
5
  case Intrinsic::nvvm_suld_1d_v2i16_zero:
3347
0
    return NVPTXISD::Suld1DV2I16Zero;
3348
5
  case Intrinsic::nvvm_suld_1d_v2i32_zero:
3349
0
    return NVPTXISD::Suld1DV2I32Zero;
3350
5
  case Intrinsic::nvvm_suld_1d_v2i64_zero:
3351
0
    return NVPTXISD::Suld1DV2I64Zero;
3352
5
  case Intrinsic::nvvm_suld_1d_v4i8_zero:
3353
0
    return NVPTXISD::Suld1DV4I8Zero;
3354
5
  case Intrinsic::nvvm_suld_1d_v4i16_zero:
3355
0
    return NVPTXISD::Suld1DV4I16Zero;
3356
5
  case Intrinsic::nvvm_suld_1d_v4i32_zero:
3357
0
    return NVPTXISD::Suld1DV4I32Zero;
3358
5
  case Intrinsic::nvvm_suld_1d_array_i8_zero:
3359
0
    return NVPTXISD::Suld1DArrayI8Zero;
3360
5
  case Intrinsic::nvvm_suld_1d_array_i16_zero:
3361
0
    return NVPTXISD::Suld1DArrayI16Zero;
3362
5
  case Intrinsic::nvvm_suld_1d_array_i32_zero:
3363
0
    return NVPTXISD::Suld1DArrayI32Zero;
3364
5
  case Intrinsic::nvvm_suld_1d_array_i64_zero:
3365
0
    return NVPTXISD::Suld1DArrayI64Zero;
3366
5
  case Intrinsic::nvvm_suld_1d_array_v2i8_zero:
3367
0
    return NVPTXISD::Suld1DArrayV2I8Zero;
3368
5
  case Intrinsic::nvvm_suld_1d_array_v2i16_zero:
3369
0
    return NVPTXISD::Suld1DArrayV2I16Zero;
3370
5
  case Intrinsic::nvvm_suld_1d_array_v2i32_zero:
3371
0
    return NVPTXISD::Suld1DArrayV2I32Zero;
3372
5
  case Intrinsic::nvvm_suld_1d_array_v2i64_zero:
3373
0
    return NVPTXISD::Suld1DArrayV2I64Zero;
3374
5
  case Intrinsic::nvvm_suld_1d_array_v4i8_zero:
3375
0
    return NVPTXISD::Suld1DArrayV4I8Zero;
3376
5
  case Intrinsic::nvvm_suld_1d_array_v4i16_zero:
3377
0
    return NVPTXISD::Suld1DArrayV4I16Zero;
3378
5
  case Intrinsic::nvvm_suld_1d_array_v4i32_zero:
3379
0
    return NVPTXISD::Suld1DArrayV4I32Zero;
3380
5
  case Intrinsic::nvvm_suld_2d_i8_zero:
3381
0
    return NVPTXISD::Suld2DI8Zero;
3382
5
  case Intrinsic::nvvm_suld_2d_i16_zero:
3383
0
    return NVPTXISD::Suld2DI16Zero;
3384
5
  case Intrinsic::nvvm_suld_2d_i32_zero:
3385
0
    return NVPTXISD::Suld2DI32Zero;
3386
5
  case Intrinsic::nvvm_suld_2d_i64_zero:
3387
0
    return NVPTXISD::Suld2DI64Zero;
3388
5
  case Intrinsic::nvvm_suld_2d_v2i8_zero:
3389
0
    return NVPTXISD::Suld2DV2I8Zero;
3390
5
  case Intrinsic::nvvm_suld_2d_v2i16_zero:
3391
0
    return NVPTXISD::Suld2DV2I16Zero;
3392
5
  case Intrinsic::nvvm_suld_2d_v2i32_zero:
3393
0
    return NVPTXISD::Suld2DV2I32Zero;
3394
5
  case Intrinsic::nvvm_suld_2d_v2i64_zero:
3395
0
    return NVPTXISD::Suld2DV2I64Zero;
3396
5
  case Intrinsic::nvvm_suld_2d_v4i8_zero:
3397
0
    return NVPTXISD::Suld2DV4I8Zero;
3398
5
  case Intrinsic::nvvm_suld_2d_v4i16_zero:
3399
0
    return NVPTXISD::Suld2DV4I16Zero;
3400
5
  case Intrinsic::nvvm_suld_2d_v4i32_zero:
3401
0
    return NVPTXISD::Suld2DV4I32Zero;
3402
5
  case Intrinsic::nvvm_suld_2d_array_i8_zero:
3403
0
    return NVPTXISD::Suld2DArrayI8Zero;
3404
5
  case Intrinsic::nvvm_suld_2d_array_i16_zero:
3405
0
    return NVPTXISD::Suld2DArrayI16Zero;
3406
5
  case Intrinsic::nvvm_suld_2d_array_i32_zero:
3407
0
    return NVPTXISD::Suld2DArrayI32Zero;
3408
5
  case Intrinsic::nvvm_suld_2d_array_i64_zero:
3409
0
    return NVPTXISD::Suld2DArrayI64Zero;
3410
5
  case Intrinsic::nvvm_suld_2d_array_v2i8_zero:
3411
0
    return NVPTXISD::Suld2DArrayV2I8Zero;
3412
5
  case Intrinsic::nvvm_suld_2d_array_v2i16_zero:
3413
0
    return NVPTXISD::Suld2DArrayV2I16Zero;
3414
5
  case Intrinsic::nvvm_suld_2d_array_v2i32_zero:
3415
0
    return NVPTXISD::Suld2DArrayV2I32Zero;
3416
5
  case Intrinsic::nvvm_suld_2d_array_v2i64_zero:
3417
0
    return NVPTXISD::Suld2DArrayV2I64Zero;
3418
5
  case Intrinsic::nvvm_suld_2d_array_v4i8_zero:
3419
0
    return NVPTXISD::Suld2DArrayV4I8Zero;
3420
5
  case Intrinsic::nvvm_suld_2d_array_v4i16_zero:
3421
0
    return NVPTXISD::Suld2DArrayV4I16Zero;
3422
5
  case Intrinsic::nvvm_suld_2d_array_v4i32_zero:
3423
0
    return NVPTXISD::Suld2DArrayV4I32Zero;
3424
5
  case Intrinsic::nvvm_suld_3d_i8_zero:
3425
0
    return NVPTXISD::Suld3DI8Zero;
3426
5
  case Intrinsic::nvvm_suld_3d_i16_zero:
3427
0
    return NVPTXISD::Suld3DI16Zero;
3428
5
  case Intrinsic::nvvm_suld_3d_i32_zero:
3429
0
    return NVPTXISD::Suld3DI32Zero;
3430
5
  case Intrinsic::nvvm_suld_3d_i64_zero:
3431
0
    return NVPTXISD::Suld3DI64Zero;
3432
5
  case Intrinsic::nvvm_suld_3d_v2i8_zero:
3433
0
    return NVPTXISD::Suld3DV2I8Zero;
3434
5
  case Intrinsic::nvvm_suld_3d_v2i16_zero:
3435
0
    return NVPTXISD::Suld3DV2I16Zero;
3436
5
  case Intrinsic::nvvm_suld_3d_v2i32_zero:
3437
0
    return NVPTXISD::Suld3DV2I32Zero;
3438
5
  case Intrinsic::nvvm_suld_3d_v2i64_zero:
3439
0
    return NVPTXISD::Suld3DV2I64Zero;
3440
5
  case Intrinsic::nvvm_suld_3d_v4i8_zero:
3441
0
    return NVPTXISD::Suld3DV4I8Zero;
3442
5
  case Intrinsic::nvvm_suld_3d_v4i16_zero:
3443
0
    return NVPTXISD::Suld3DV4I16Zero;
3444
5
  case Intrinsic::nvvm_suld_3d_v4i32_zero:
3445
0
    return NVPTXISD::Suld3DV4I32Zero;
3446
5
  }
3447
5
}
3448
3449
// llvm.ptx.memcpy.const and llvm.ptx.memmove.const need to be modeled as
3450
// TgtMemIntrinsic because we need the information that is only available in
3451
// the "Value" type of the destination pointer. In particular, the address
3452
// space information.
3453
3454
bool NVPTXTargetLowering::getTgtMemIntrinsic(
3455
    IntrinsicInfo &Info, const CallInst &I,
3456
391
    MachineFunction &MF, unsigned Intrinsic) const {
3457
391
  switch (Intrinsic) {
3458
391
  default:
3459
262
    return false;
3460
391
  case Intrinsic::nvvm_match_all_sync_i32p:
3461
8
  case Intrinsic::nvvm_match_all_sync_i64p:
3462
8
    Info.opc = ISD::INTRINSIC_W_CHAIN;
3463
8
    // memVT is bogus. These intrinsics have IntrInaccessibleMemOnly attribute
3464
8
    // in order to model data exchange with other threads, but perform no real
3465
8
    // memory accesses.
3466
8
    Info.memVT = MVT::i1;
3467
8
3468
8
    // Our result depends on both our and other thread's arguments.
3469
8
    Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOStore;
3470
8
    return true;
3471
8
  case Intrinsic::nvvm_wmma_m16n16k16_load_a_f16_col:
3472
0
  case Intrinsic::nvvm_wmma_m16n16k16_load_a_f16_row:
3473
0
  case Intrinsic::nvvm_wmma_m16n16k16_load_a_f16_col_stride:
3474
0
  case Intrinsic::nvvm_wmma_m16n16k16_load_a_f16_row_stride:
3475
0
  case Intrinsic::nvvm_wmma_m16n16k16_load_b_f16_col:
3476
0
  case Intrinsic::nvvm_wmma_m16n16k16_load_b_f16_row:
3477
0
  case Intrinsic::nvvm_wmma_m16n16k16_load_b_f16_col_stride:
3478
0
  case Intrinsic::nvvm_wmma_m16n16k16_load_b_f16_row_stride:
3479
0
  case Intrinsic::nvvm_wmma_m32n8k16_load_a_f16_col:
3480
0
  case Intrinsic::nvvm_wmma_m32n8k16_load_a_f16_row:
3481
0
  case Intrinsic::nvvm_wmma_m32n8k16_load_a_f16_col_stride:
3482
0
  case Intrinsic::nvvm_wmma_m32n8k16_load_a_f16_row_stride:
3483
0
  case Intrinsic::nvvm_wmma_m32n8k16_load_b_f16_col:
3484
0
  case Intrinsic::nvvm_wmma_m32n8k16_load_b_f16_row:
3485
0
  case Intrinsic::nvvm_wmma_m32n8k16_load_b_f16_col_stride:
3486
0
  case Intrinsic::nvvm_wmma_m32n8k16_load_b_f16_row_stride:
3487
0
  case Intrinsic::nvvm_wmma_m8n32k16_load_a_f16_col:
3488
0
  case Intrinsic::nvvm_wmma_m8n32k16_load_a_f16_row:
3489
0
  case Intrinsic::nvvm_wmma_m8n32k16_load_a_f16_col_stride:
3490
0
  case Intrinsic::nvvm_wmma_m8n32k16_load_a_f16_row_stride:
3491
0
  case Intrinsic::nvvm_wmma_m8n32k16_load_b_f16_col:
3492
0
  case Intrinsic::nvvm_wmma_m8n32k16_load_b_f16_row:
3493
0
  case Intrinsic::nvvm_wmma_m8n32k16_load_b_f16_col_stride:
3494
0
  case Intrinsic::nvvm_wmma_m8n32k16_load_b_f16_row_stride: {
3495
0
    Info.opc = ISD::INTRINSIC_W_CHAIN;
3496
0
    Info.memVT = MVT::v8f16;
3497
0
    Info.ptrVal = I.getArgOperand(0);
3498
0
    Info.offset = 0;
3499
0
    Info.flags = MachineMemOperand::MOLoad;
3500
0
    Info.align = 16;
3501
0
    return true;
3502
0
  }
3503
0
  case Intrinsic::nvvm_wmma_m16n16k16_load_a_s8_col:
3504
0
  case Intrinsic::nvvm_wmma_m16n16k16_load_a_s8_col_stride:
3505
0
  case Intrinsic::nvvm_wmma_m16n16k16_load_a_u8_col_stride:
3506
0
  case Intrinsic::nvvm_wmma_m16n16k16_load_a_u8_col:
3507
0
  case Intrinsic::nvvm_wmma_m16n16k16_load_a_s8_row:
3508
0
  case Intrinsic::nvvm_wmma_m16n16k16_load_a_s8_row_stride:
3509
0
  case Intrinsic::nvvm_wmma_m16n16k16_load_a_u8_row_stride:
3510
0
  case Intrinsic::nvvm_wmma_m16n16k16_load_a_u8_row:
3511
0
  case Intrinsic::nvvm_wmma_m16n16k16_load_b_s8_col:
3512
0
  case Intrinsic::nvvm_wmma_m16n16k16_load_b_s8_col_stride:
3513
0
  case Intrinsic::nvvm_wmma_m16n16k16_load_b_u8_col_stride:
3514
0
  case Intrinsic::nvvm_wmma_m16n16k16_load_b_u8_col:
3515
0
  case Intrinsic::nvvm_wmma_m16n16k16_load_b_s8_row:
3516
0
  case Intrinsic::nvvm_wmma_m16n16k16_load_b_s8_row_stride:
3517
0
  case Intrinsic::nvvm_wmma_m16n16k16_load_b_u8_row_stride:
3518
0
  case Intrinsic::nvvm_wmma_m16n16k16_load_b_u8_row: {
3519
0
    Info.opc = ISD::INTRINSIC_W_CHAIN;
3520
0
    Info.memVT = MVT::v2i32;
3521
0
    Info.ptrVal = I.getArgOperand(0);
3522
0
    Info.offset = 0;
3523
0
    Info.flags = MachineMemOperand::MOLoad;
3524
0
    Info.align = 8;
3525
0
    return true;
3526
0
  }
3527
0
3528
0
  case Intrinsic::nvvm_wmma_m32n8k16_load_a_s8_col:
3529
0
  case Intrinsic::nvvm_wmma_m32n8k16_load_a_s8_col_stride:
3530
0
  case Intrinsic::nvvm_wmma_m32n8k16_load_a_u8_col_stride:
3531
0
  case Intrinsic::nvvm_wmma_m32n8k16_load_a_u8_col:
3532
0
  case Intrinsic::nvvm_wmma_m32n8k16_load_a_s8_row:
3533
0
  case Intrinsic::nvvm_wmma_m32n8k16_load_a_s8_row_stride:
3534
0
  case Intrinsic::nvvm_wmma_m32n8k16_load_a_u8_row_stride:
3535
0
  case Intrinsic::nvvm_wmma_m32n8k16_load_a_u8_row:
3536
0
3537
0
  case Intrinsic::nvvm_wmma_m8n32k16_load_b_s8_col:
3538
0
  case Intrinsic::nvvm_wmma_m8n32k16_load_b_s8_col_stride:
3539
0
  case Intrinsic::nvvm_wmma_m8n32k16_load_b_u8_col_stride:
3540
0
  case Intrinsic::nvvm_wmma_m8n32k16_load_b_u8_col:
3541
0
  case Intrinsic::nvvm_wmma_m8n32k16_load_b_s8_row:
3542
0
  case Intrinsic::nvvm_wmma_m8n32k16_load_b_s8_row_stride:
3543
0
  case Intrinsic::nvvm_wmma_m8n32k16_load_b_u8_row_stride:
3544
0
  case Intrinsic::nvvm_wmma_m8n32k16_load_b_u8_row: {
3545
0
    Info.opc = ISD::INTRINSIC_W_CHAIN;
3546
0
    Info.memVT = MVT::v4i32;
3547
0
    Info.ptrVal = I.getArgOperand(0);
3548
0
    Info.offset = 0;
3549
0
    Info.flags = MachineMemOperand::MOLoad;
3550
0
    Info.align = 16;
3551
0
    return true;
3552
0
  }
3553
0
3554
0
  case Intrinsic::nvvm_wmma_m32n8k16_load_b_s8_col:
3555
0
  case Intrinsic::nvvm_wmma_m32n8k16_load_b_s8_col_stride:
3556
0
  case Intrinsic::nvvm_wmma_m32n8k16_load_b_u8_col_stride:
3557
0
  case Intrinsic::nvvm_wmma_m32n8k16_load_b_u8_col:
3558
0
  case Intrinsic::nvvm_wmma_m32n8k16_load_b_s8_row:
3559
0
  case Intrinsic::nvvm_wmma_m32n8k16_load_b_s8_row_stride:
3560
0
  case Intrinsic::nvvm_wmma_m32n8k16_load_b_u8_row_stride:
3561
0
  case Intrinsic::nvvm_wmma_m32n8k16_load_b_u8_row:
3562
0
3563
0
  case Intrinsic::nvvm_wmma_m8n32k16_load_a_s8_col:
3564
0
  case Intrinsic::nvvm_wmma_m8n32k16_load_a_s8_col_stride:
3565
0
  case Intrinsic::nvvm_wmma_m8n32k16_load_a_u8_col_stride:
3566
0
  case Intrinsic::nvvm_wmma_m8n32k16_load_a_u8_col:
3567
0
  case Intrinsic::nvvm_wmma_m8n32k16_load_a_s8_row:
3568
0
  case Intrinsic::nvvm_wmma_m8n32k16_load_a_s8_row_stride:
3569
0
  case Intrinsic::nvvm_wmma_m8n32k16_load_a_u8_row_stride:
3570
0
  case Intrinsic::nvvm_wmma_m8n32k16_load_a_u8_row:
3571
0
  case Intrinsic::nvvm_wmma_m8n8k128_load_a_b1_row:
3572
0
  case Intrinsic::nvvm_wmma_m8n8k128_load_a_b1_row_stride:
3573
0
  case Intrinsic::nvvm_wmma_m8n8k128_load_b_b1_col:
3574
0
  case Intrinsic::nvvm_wmma_m8n8k128_load_b_b1_col_stride:
3575
0
  case Intrinsic::nvvm_wmma_m8n8k32_load_a_s4_row:
3576
0
  case Intrinsic::nvvm_wmma_m8n8k32_load_a_s4_row_stride:
3577
0
  case Intrinsic::nvvm_wmma_m8n8k32_load_a_u4_row_stride:
3578
0
  case Intrinsic::nvvm_wmma_m8n8k32_load_a_u4_row:
3579
0
  case Intrinsic::nvvm_wmma_m8n8k32_load_b_s4_col:
3580
0
  case Intrinsic::nvvm_wmma_m8n8k32_load_b_s4_col_stride:
3581
0
  case Intrinsic::nvvm_wmma_m8n8k32_load_b_u4_col_stride:
3582
0
  case Intrinsic::nvvm_wmma_m8n8k32_load_b_u4_col: {
3583
0
    Info.opc = ISD::INTRINSIC_W_CHAIN;
3584
0
    Info.memVT = MVT::i32;
3585
0
    Info.ptrVal = I.getArgOperand(0);
3586
0
    Info.offset = 0;
3587
0
    Info.flags = MachineMemOperand::MOLoad;
3588
0
    Info.align = 4;
3589
0
    return true;
3590
0
  }
3591
0
3592
0
  case Intrinsic::nvvm_wmma_m16n16k16_load_c_f16_col:
3593
0
  case Intrinsic::nvvm_wmma_m16n16k16_load_c_f16_row:
3594
0
  case Intrinsic::nvvm_wmma_m16n16k16_load_c_f16_col_stride:
3595
0
  case Intrinsic::nvvm_wmma_m16n16k16_load_c_f16_row_stride:
3596
0
  case Intrinsic::nvvm_wmma_m32n8k16_load_c_f16_col:
3597
0
  case Intrinsic::nvvm_wmma_m32n8k16_load_c_f16_row:
3598
0
  case Intrinsic::nvvm_wmma_m32n8k16_load_c_f16_col_stride:
3599
0
  case Intrinsic::nvvm_wmma_m32n8k16_load_c_f16_row_stride:
3600
0
  case Intrinsic::nvvm_wmma_m8n32k16_load_c_f16_col:
3601
0
  case Intrinsic::nvvm_wmma_m8n32k16_load_c_f16_row:
3602
0
  case Intrinsic::nvvm_wmma_m8n32k16_load_c_f16_col_stride:
3603
0
  case Intrinsic::nvvm_wmma_m8n32k16_load_c_f16_row_stride: {
3604
0
    Info.opc = ISD::INTRINSIC_W_CHAIN;
3605
0
    Info.memVT = MVT::v4f16;
3606
0
    Info.ptrVal = I.getArgOperand(0);
3607
0
    Info.offset = 0;
3608
0
    Info.flags = MachineMemOperand::MOLoad;
3609
0
    Info.align = 16;
3610
0
    return true;
3611
0
  }
3612
0
3613
0
  case Intrinsic::nvvm_wmma_m16n16k16_load_c_f32_col:
3614
0
  case Intrinsic::nvvm_wmma_m16n16k16_load_c_f32_row:
3615
0
  case Intrinsic::nvvm_wmma_m16n16k16_load_c_f32_col_stride:
3616
0
  case Intrinsic::nvvm_wmma_m16n16k16_load_c_f32_row_stride:
3617
0
  case Intrinsic::nvvm_wmma_m32n8k16_load_c_f32_col:
3618
0
  case Intrinsic::nvvm_wmma_m32n8k16_load_c_f32_row:
3619
0
  case Intrinsic::nvvm_wmma_m32n8k16_load_c_f32_col_stride:
3620
0
  case Intrinsic::nvvm_wmma_m32n8k16_load_c_f32_row_stride:
3621
0
  case Intrinsic::nvvm_wmma_m8n32k16_load_c_f32_col:
3622
0
  case Intrinsic::nvvm_wmma_m8n32k16_load_c_f32_row:
3623
0
  case Intrinsic::nvvm_wmma_m8n32k16_load_c_f32_col_stride:
3624
0
  case Intrinsic::nvvm_wmma_m8n32k16_load_c_f32_row_stride: {
3625
0
    Info.opc = ISD::INTRINSIC_W_CHAIN;
3626
0
    Info.memVT = MVT::v8f32;
3627
0
    Info.ptrVal = I.getArgOperand(0);
3628
0
    Info.offset = 0;
3629
0
    Info.flags = MachineMemOperand::MOLoad;
3630
0
    Info.align = 16;
3631
0
    return true;
3632
0
  }
3633
0
3634
0
  case Intrinsic::nvvm_wmma_m16n16k16_load_c_s32_col:
3635
0
  case Intrinsic::nvvm_wmma_m16n16k16_load_c_s32_col_stride:
3636
0
  case Intrinsic::nvvm_wmma_m16n16k16_load_c_s32_row:
3637
0
  case Intrinsic::nvvm_wmma_m16n16k16_load_c_s32_row_stride:
3638
0
  case Intrinsic::nvvm_wmma_m32n8k16_load_c_s32_col:
3639
0
  case Intrinsic::nvvm_wmma_m32n8k16_load_c_s32_col_stride:
3640
0
  case Intrinsic::nvvm_wmma_m32n8k16_load_c_s32_row:
3641
0
  case Intrinsic::nvvm_wmma_m32n8k16_load_c_s32_row_stride:
3642
0
  case Intrinsic::nvvm_wmma_m8n32k16_load_c_s32_col:
3643
0
  case Intrinsic::nvvm_wmma_m8n32k16_load_c_s32_col_stride:
3644
0
  case Intrinsic::nvvm_wmma_m8n32k16_load_c_s32_row:
3645
0
  case Intrinsic::nvvm_wmma_m8n32k16_load_c_s32_row_stride: {
3646
0
    Info.opc = ISD::INTRINSIC_W_CHAIN;
3647
0
    Info.memVT = MVT::v8i32;
3648
0
    Info.ptrVal = I.getArgOperand(0);
3649
0
    Info.offset = 0;
3650
0
    Info.flags = MachineMemOperand::MOLoad;
3651
0
    Info.align = 16;
3652
0
    return true;
3653
0
  }
3654
0
3655
0
  case Intrinsic::nvvm_wmma_m8n8k128_load_c_s32_col:
3656
0
  case Intrinsic::nvvm_wmma_m8n8k128_load_c_s32_col_stride:
3657
0
  case Intrinsic::nvvm_wmma_m8n8k128_load_c_s32_row:
3658
0
  case Intrinsic::nvvm_wmma_m8n8k128_load_c_s32_row_stride:
3659
0
  case Intrinsic::nvvm_wmma_m8n8k32_load_c_s32_col:
3660
0
  case Intrinsic::nvvm_wmma_m8n8k32_load_c_s32_col_stride:
3661
0
  case Intrinsic::nvvm_wmma_m8n8k32_load_c_s32_row:
3662
0
  case Intrinsic::nvvm_wmma_m8n8k32_load_c_s32_row_stride: {
3663
0
    Info.opc = ISD::INTRINSIC_W_CHAIN;
3664
0
    Info.memVT = MVT::v2i32;
3665
0
    Info.ptrVal = I.getArgOperand(0);
3666
0
    Info.offset = 0;
3667
0
    Info.flags = MachineMemOperand::MOLoad;
3668
0
    Info.align = 8;
3669
0
    return true;
3670
0
  }
3671
0
3672
0
  case Intrinsic::nvvm_wmma_m16n16k16_store_d_f16_col:
3673
0
  case Intrinsic::nvvm_wmma_m16n16k16_store_d_f16_row:
3674
0
  case Intrinsic::nvvm_wmma_m16n16k16_store_d_f16_col_stride:
3675
0
  case Intrinsic::nvvm_wmma_m16n16k16_store_d_f16_row_stride:
3676
0
  case Intrinsic::nvvm_wmma_m32n8k16_store_d_f16_col:
3677
0
  case Intrinsic::nvvm_wmma_m32n8k16_store_d_f16_row:
3678
0
  case Intrinsic::nvvm_wmma_m32n8k16_store_d_f16_col_stride:
3679
0
  case Intrinsic::nvvm_wmma_m32n8k16_store_d_f16_row_stride:
3680
0
  case Intrinsic::nvvm_wmma_m8n32k16_store_d_f16_col:
3681
0
  case Intrinsic::nvvm_wmma_m8n32k16_store_d_f16_row:
3682
0
  case Intrinsic::nvvm_wmma_m8n32k16_store_d_f16_col_stride:
3683
0
  case Intrinsic::nvvm_wmma_m8n32k16_store_d_f16_row_stride: {
3684
0
    Info.opc = ISD::INTRINSIC_VOID;
3685
0
    Info.memVT = MVT::v4f16;
3686
0
    Info.ptrVal = I.getArgOperand(0);
3687
0
    Info.offset = 0;
3688
0
    Info.flags = MachineMemOperand::MOStore;
3689
0
    Info.align = 16;
3690
0
    return true;
3691
0
  }
3692
0
3693
0
  case Intrinsic::nvvm_wmma_m16n16k16_store_d_f32_col:
3694
0
  case Intrinsic::nvvm_wmma_m16n16k16_store_d_f32_row:
3695
0
  case Intrinsic::nvvm_wmma_m16n16k16_store_d_f32_col_stride:
3696
0
  case Intrinsic::nvvm_wmma_m16n16k16_store_d_f32_row_stride:
3697
0
  case Intrinsic::nvvm_wmma_m32n8k16_store_d_f32_col:
3698
0
  case Intrinsic::nvvm_wmma_m32n8k16_store_d_f32_row:
3699
0
  case Intrinsic::nvvm_wmma_m32n8k16_store_d_f32_col_stride:
3700
0
  case Intrinsic::nvvm_wmma_m32n8k16_store_d_f32_row_stride:
3701
0
  case Intrinsic::nvvm_wmma_m8n32k16_store_d_f32_col:
3702
0
  case Intrinsic::nvvm_wmma_m8n32k16_store_d_f32_row:
3703
0
  case Intrinsic::nvvm_wmma_m8n32k16_store_d_f32_col_stride:
3704
0
  case Intrinsic::nvvm_wmma_m8n32k16_store_d_f32_row_stride: {
3705
0
    Info.opc = ISD::INTRINSIC_VOID;
3706
0
    Info.memVT = MVT::v8f32;
3707
0
    Info.ptrVal = I.getArgOperand(0);
3708
0
    Info.offset = 0;
3709
0
    Info.flags = MachineMemOperand::MOStore;
3710
0
    Info.align = 16;
3711
0
    return true;
3712
0
  }
3713
0
3714
0
  case Intrinsic::nvvm_wmma_m16n16k16_store_d_s32_col:
3715
0
  case Intrinsic::nvvm_wmma_m16n16k16_store_d_s32_col_stride:
3716
0
  case Intrinsic::nvvm_wmma_m16n16k16_store_d_s32_row:
3717
0
  case Intrinsic::nvvm_wmma_m16n16k16_store_d_s32_row_stride:
3718
0
  case Intrinsic::nvvm_wmma_m32n8k16_store_d_s32_col:
3719
0
  case Intrinsic::nvvm_wmma_m32n8k16_store_d_s32_col_stride:
3720
0
  case Intrinsic::nvvm_wmma_m32n8k16_store_d_s32_row:
3721
0
  case Intrinsic::nvvm_wmma_m32n8k16_store_d_s32_row_stride:
3722
0
  case Intrinsic::nvvm_wmma_m8n32k16_store_d_s32_col:
3723
0
  case Intrinsic::nvvm_wmma_m8n32k16_store_d_s32_col_stride:
3724
0
  case Intrinsic::nvvm_wmma_m8n32k16_store_d_s32_row:
3725
0
  case Intrinsic::nvvm_wmma_m8n32k16_store_d_s32_row_stride: {
3726
0
    Info.opc = ISD::INTRINSIC_VOID;
3727
0
    Info.memVT = MVT::v8i32;
3728
0
    Info.ptrVal = I.getArgOperand(0);
3729
0
    Info.offset = 0;
3730
0
    Info.flags = MachineMemOperand::MOStore;
3731
0
    Info.align = 16;
3732
0
    return true;
3733
0
  }
3734
0
3735
0
  case Intrinsic::nvvm_wmma_m8n8k128_store_d_s32_col:
3736
0
  case Intrinsic::nvvm_wmma_m8n8k128_store_d_s32_col_stride:
3737
0
  case Intrinsic::nvvm_wmma_m8n8k128_store_d_s32_row:
3738
0
  case Intrinsic::nvvm_wmma_m8n8k128_store_d_s32_row_stride:
3739
0
  case Intrinsic::nvvm_wmma_m8n8k32_store_d_s32_col:
3740
0
  case Intrinsic::nvvm_wmma_m8n8k32_store_d_s32_col_stride:
3741
0
  case Intrinsic::nvvm_wmma_m8n8k32_store_d_s32_row:
3742
0
  case Intrinsic::nvvm_wmma_m8n8k32_store_d_s32_row_stride: {
3743
0
    Info.opc = ISD::INTRINSIC_VOID;
3744
0
    Info.memVT = MVT::v2i32;
3745
0
    Info.ptrVal = I.getArgOperand(0);
3746
0
    Info.offset = 0;
3747
0
    Info.flags = MachineMemOperand::MOStore;
3748
0
    Info.align = 8;
3749
0
    return true;
3750
0
  }
3751
0
3752
104
  case Intrinsic::nvvm_atomic_load_inc_32:
3753
104
  case Intrinsic::nvvm_atomic_load_dec_32:
3754
104
3755
104
  case Intrinsic::nvvm_atomic_add_gen_f_cta:
3756
104
  case Intrinsic::nvvm_atomic_add_gen_f_sys:
3757
104
  case Intrinsic::nvvm_atomic_add_gen_i_cta:
3758
104
  case Intrinsic::nvvm_atomic_add_gen_i_sys:
3759
104
  case Intrinsic::nvvm_atomic_and_gen_i_cta:
3760
104
  case Intrinsic::nvvm_atomic_and_gen_i_sys:
3761
104
  case Intrinsic::nvvm_atomic_cas_gen_i_cta:
3762
104
  case Intrinsic::nvvm_atomic_cas_gen_i_sys:
3763
104
  case Intrinsic::nvvm_atomic_dec_gen_i_cta:
3764
104
  case Intrinsic::nvvm_atomic_dec_gen_i_sys:
3765
104
  case Intrinsic::nvvm_atomic_inc_gen_i_cta:
3766
104
  case Intrinsic::nvvm_atomic_inc_gen_i_sys:
3767
104
  case Intrinsic::nvvm_atomic_max_gen_i_cta:
3768
104
  case Intrinsic::nvvm_atomic_max_gen_i_sys:
3769
104
  case Intrinsic::nvvm_atomic_min_gen_i_cta:
3770
104
  case Intrinsic::nvvm_atomic_min_gen_i_sys:
3771
104
  case Intrinsic::nvvm_atomic_or_gen_i_cta:
3772
104
  case Intrinsic::nvvm_atomic_or_gen_i_sys:
3773
104
  case Intrinsic::nvvm_atomic_exch_gen_i_cta:
3774
104
  case Intrinsic::nvvm_atomic_exch_gen_i_sys:
3775
104
  case Intrinsic::nvvm_atomic_xor_gen_i_cta:
3776
104
  case Intrinsic::nvvm_atomic_xor_gen_i_sys: {
3777
104
    auto &DL = I.getModule()->getDataLayout();
3778
104
    Info.opc = ISD::INTRINSIC_W_CHAIN;
3779
104
    Info.memVT = getValueType(DL, I.getType());
3780
104
    Info.ptrVal = I.getArgOperand(0);
3781
104
    Info.offset = 0;
3782
104
    Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOStore;
3783
104
    Info.align = 0;
3784
104
    return true;
3785
104
  }
3786
104
3787
104
  case Intrinsic::nvvm_ldu_global_i:
3788
5
  case Intrinsic::nvvm_ldu_global_f:
3789
5
  case Intrinsic::nvvm_ldu_global_p: {
3790
5
    auto &DL = I.getModule()->getDataLayout();
3791
5
    Info.opc = ISD::INTRINSIC_W_CHAIN;
3792
5
    if (Intrinsic == Intrinsic::nvvm_ldu_global_i)
3793
5
      Info.memVT = getValueType(DL, I.getType());
3794
0
    else if(Intrinsic == Intrinsic::nvvm_ldu_global_p)
3795
0
      Info.memVT = getPointerTy(DL);
3796
0
    else
3797
0
      Info.memVT = getValueType(DL, I.getType());
3798
5
    Info.ptrVal = I.getArgOperand(0);
3799
5
    Info.offset = 0;
3800
5
    Info.flags = MachineMemOperand::MOLoad;
3801
5
    Info.align = cast<ConstantInt>(I.getArgOperand(1))->getZExtValue();
3802
5
3803
5
    return true;
3804
5
  }
3805
5
  case Intrinsic::nvvm_ldg_global_i:
3806
2
  case Intrinsic::nvvm_ldg_global_f:
3807
2
  case Intrinsic::nvvm_ldg_global_p: {
3808
2
    auto &DL = I.getModule()->getDataLayout();
3809
2
3810
2
    Info.opc = ISD::INTRINSIC_W_CHAIN;
3811
2
    if (Intrinsic == Intrinsic::nvvm_ldg_global_i)
3812
2
      Info.memVT = getValueType(DL, I.getType());
3813
0
    else if(Intrinsic == Intrinsic::nvvm_ldg_global_p)
3814
0
      Info.memVT = getPointerTy(DL);
3815
0
    else
3816
0
      Info.memVT = getValueType(DL, I.getType());
3817
2
    Info.ptrVal = I.getArgOperand(0);
3818
2
    Info.offset = 0;
3819
2
    Info.flags = MachineMemOperand::MOLoad;
3820
2
    Info.align = cast<ConstantInt>(I.getArgOperand(1))->getZExtValue();
3821
2
3822
2
    return true;
3823
2
  }
3824
2
3825
5
  case Intrinsic::nvvm_tex_1d_v4f32_s32:
3826
5
  case Intrinsic::nvvm_tex_1d_v4f32_f32:
3827
5
  case Intrinsic::nvvm_tex_1d_level_v4f32_f32:
3828
5
  case Intrinsic::nvvm_tex_1d_grad_v4f32_f32:
3829
5
  case Intrinsic::nvvm_tex_1d_array_v4f32_s32:
3830
5
  case Intrinsic::nvvm_tex_1d_array_v4f32_f32:
3831
5
  case Intrinsic::nvvm_tex_1d_array_level_v4f32_f32:
3832
5
  case Intrinsic::nvvm_tex_1d_array_grad_v4f32_f32:
3833
5
  case Intrinsic::nvvm_tex_2d_v4f32_s32:
3834
5
  case Intrinsic::nvvm_tex_2d_v4f32_f32:
3835
5
  case Intrinsic::nvvm_tex_2d_level_v4f32_f32:
3836
5
  case Intrinsic::nvvm_tex_2d_grad_v4f32_f32:
3837
5
  case Intrinsic::nvvm_tex_2d_array_v4f32_s32:
3838
5
  case Intrinsic::nvvm_tex_2d_array_v4f32_f32:
3839
5
  case Intrinsic::nvvm_tex_2d_array_level_v4f32_f32:
3840
5
  case Intrinsic::nvvm_tex_2d_array_grad_v4f32_f32:
3841
5
  case Intrinsic::nvvm_tex_3d_v4f32_s32:
3842
5
  case Intrinsic::nvvm_tex_3d_v4f32_f32:
3843
5
  case Intrinsic::nvvm_tex_3d_level_v4f32_f32:
3844
5
  case Intrinsic::nvvm_tex_3d_grad_v4f32_f32:
3845
5
  case Intrinsic::nvvm_tex_cube_v4f32_f32:
3846
5
  case Intrinsic::nvvm_tex_cube_level_v4f32_f32:
3847
5
  case Intrinsic::nvvm_tex_cube_array_v4f32_f32:
3848
5
  case Intrinsic::nvvm_tex_cube_array_level_v4f32_f32:
3849
5
  case Intrinsic::nvvm_tld4_r_2d_v4f32_f32:
3850
5
  case Intrinsic::nvvm_tld4_g_2d_v4f32_f32:
3851
5
  case Intrinsic::nvvm_tld4_b_2d_v4f32_f32:
3852
5
  case Intrinsic::nvvm_tld4_a_2d_v4f32_f32:
3853
5
  case Intrinsic::nvvm_tex_unified_1d_v4f32_s32:
3854
5
  case Intrinsic::nvvm_tex_unified_1d_v4f32_f32:
3855
5
  case Intrinsic::nvvm_tex_unified_1d_level_v4f32_f32:
3856
5
  case Intrinsic::nvvm_tex_unified_1d_grad_v4f32_f32:
3857
5
  case Intrinsic::nvvm_tex_unified_1d_array_v4f32_s32:
3858
5
  case Intrinsic::nvvm_tex_unified_1d_array_v4f32_f32:
3859
5
  case Intrinsic::nvvm_tex_unified_1d_array_level_v4f32_f32:
3860
5
  case Intrinsic::nvvm_tex_unified_1d_array_grad_v4f32_f32:
3861
5
  case Intrinsic::nvvm_tex_unified_2d_v4f32_s32:
3862
5
  case Intrinsic::nvvm_tex_unified_2d_v4f32_f32:
3863
5
  case Intrinsic::nvvm_tex_unified_2d_level_v4f32_f32:
3864
5
  case Intrinsic::nvvm_tex_unified_2d_grad_v4f32_f32:
3865
5
  case Intrinsic::nvvm_tex_unified_2d_array_v4f32_s32:
3866
5
  case Intrinsic::nvvm_tex_unified_2d_array_v4f32_f32:
3867
5
  case Intrinsic::nvvm_tex_unified_2d_array_level_v4f32_f32:
3868
5
  case Intrinsic::nvvm_tex_unified_2d_array_grad_v4f32_f32:
3869
5
  case Intrinsic::nvvm_tex_unified_3d_v4f32_s32:
3870
5
  case Intrinsic::nvvm_tex_unified_3d_v4f32_f32:
3871
5
  case Intrinsic::nvvm_tex_unified_3d_level_v4f32_f32:
3872
5
  case Intrinsic::nvvm_tex_unified_3d_grad_v4f32_f32:
3873
5
  case Intrinsic::nvvm_tex_unified_cube_v4f32_f32:
3874
5
  case Intrinsic::nvvm_tex_unified_cube_level_v4f32_f32:
3875
5
  case Intrinsic::nvvm_tex_unified_cube_array_v4f32_f32:
3876
5
  case Intrinsic::nvvm_tex_unified_cube_array_level_v4f32_f32:
3877
5
  case Intrinsic::nvvm_tld4_unified_r_2d_v4f32_f32:
3878
5
  case Intrinsic::nvvm_tld4_unified_g_2d_v4f32_f32:
3879
5
  case Intrinsic::nvvm_tld4_unified_b_2d_v4f32_f32:
3880
5
  case Intrinsic::nvvm_tld4_unified_a_2d_v4f32_f32:
3881
5
    Info.opc = getOpcForTextureInstr(Intrinsic);
3882
5
    Info.memVT = MVT::v4f32;
3883
5
    Info.ptrVal = nullptr;
3884
5
    Info.offset = 0;
3885
5
    Info.flags = MachineMemOperand::MOLoad;
3886
5
    Info.align = 16;
3887
5
    return true;
3888
5
3889
5
  case Intrinsic::nvvm_tex_1d_v4s32_s32:
3890
0
  case Intrinsic::nvvm_tex_1d_v4s32_f32:
3891
0
  case Intrinsic::nvvm_tex_1d_level_v4s32_f32:
3892
0
  case Intrinsic::nvvm_tex_1d_grad_v4s32_f32:
3893
0
  case Intrinsic::nvvm_tex_1d_array_v4s32_s32:
3894
0
  case Intrinsic::nvvm_tex_1d_array_v4s32_f32:
3895
0
  case Intrinsic::nvvm_tex_1d_array_level_v4s32_f32:
3896
0
  case Intrinsic::nvvm_tex_1d_array_grad_v4s32_f32:
3897
0
  case Intrinsic::nvvm_tex_2d_v4s32_s32:
3898
0
  case Intrinsic::nvvm_tex_2d_v4s32_f32:
3899
0
  case Intrinsic::nvvm_tex_2d_level_v4s32_f32:
3900
0
  case Intrinsic::nvvm_tex_2d_grad_v4s32_f32:
3901
0
  case Intrinsic::nvvm_tex_2d_array_v4s32_s32:
3902
0
  case Intrinsic::nvvm_tex_2d_array_v4s32_f32:
3903
0
  case Intrinsic::nvvm_tex_2d_array_level_v4s32_f32:
3904
0
  case Intrinsic::nvvm_tex_2d_array_grad_v4s32_f32:
3905
0
  case Intrinsic::nvvm_tex_3d_v4s32_s32:
3906
0
  case Intrinsic::nvvm_tex_3d_v4s32_f32:
3907
0
  case Intrinsic::nvvm_tex_3d_level_v4s32_f32:
3908
0
  case Intrinsic::nvvm_tex_3d_grad_v4s32_f32:
3909
0
  case Intrinsic::nvvm_tex_cube_v4s32_f32:
3910
0
  case Intrinsic::nvvm_tex_cube_level_v4s32_f32:
3911
0
  case Intrinsic::nvvm_tex_cube_array_v4s32_f32:
3912
0
  case Intrinsic::nvvm_tex_cube_array_level_v4s32_f32:
3913
0
  case Intrinsic::nvvm_tex_cube_v4u32_f32:
3914
0
  case Intrinsic::nvvm_tex_cube_level_v4u32_f32:
3915
0
  case Intrinsic::nvvm_tex_cube_array_v4u32_f32:
3916
0
  case Intrinsic::nvvm_tex_cube_array_level_v4u32_f32:
3917
0
  case Intrinsic::nvvm_tex_1d_v4u32_s32:
3918
0
  case Intrinsic::nvvm_tex_1d_v4u32_f32:
3919
0
  case Intrinsic::nvvm_tex_1d_level_v4u32_f32:
3920
0
  case Intrinsic::nvvm_tex_1d_grad_v4u32_f32:
3921
0
  case Intrinsic::nvvm_tex_1d_array_v4u32_s32:
3922
0
  case Intrinsic::nvvm_tex_1d_array_v4u32_f32:
3923
0
  case Intrinsic::nvvm_tex_1d_array_level_v4u32_f32:
3924
0
  case Intrinsic::nvvm_tex_1d_array_grad_v4u32_f32:
3925
0
  case Intrinsic::nvvm_tex_2d_v4u32_s32:
3926
0
  case Intrinsic::nvvm_tex_2d_v4u32_f32:
3927
0
  case Intrinsic::nvvm_tex_2d_level_v4u32_f32:
3928
0
  case Intrinsic::nvvm_tex_2d_grad_v4u32_f32:
3929
0
  case Intrinsic::nvvm_tex_2d_array_v4u32_s32:
3930
0
  case Intrinsic::nvvm_tex_2d_array_v4u32_f32:
3931
0
  case Intrinsic::nvvm_tex_2d_array_level_v4u32_f32:
3932
0
  case Intrinsic::nvvm_tex_2d_array_grad_v4u32_f32:
3933
0
  case Intrinsic::nvvm_tex_3d_v4u32_s32:
3934
0
  case Intrinsic::nvvm_tex_3d_v4u32_f32:
3935
0
  case Intrinsic::nvvm_tex_3d_level_v4u32_f32:
3936
0
  case Intrinsic::nvvm_tex_3d_grad_v4u32_f32:
3937
0
  case Intrinsic::nvvm_tld4_r_2d_v4s32_f32:
3938
0
  case Intrinsic::nvvm_tld4_g_2d_v4s32_f32:
3939
0
  case Intrinsic::nvvm_tld4_b_2d_v4s32_f32:
3940
0
  case Intrinsic::nvvm_tld4_a_2d_v4s32_f32:
3941
0
  case Intrinsic::nvvm_tld4_r_2d_v4u32_f32:
3942
0
  case Intrinsic::nvvm_tld4_g_2d_v4u32_f32:
3943
0
  case Intrinsic::nvvm_tld4_b_2d_v4u32_f32:
3944
0
  case Intrinsic::nvvm_tld4_a_2d_v4u32_f32:
3945
0
  case Intrinsic::nvvm_tex_unified_1d_v4s32_s32:
3946
0
  case Intrinsic::nvvm_tex_unified_1d_v4s32_f32:
3947
0
  case Intrinsic::nvvm_tex_unified_1d_level_v4s32_f32:
3948
0
  case Intrinsic::nvvm_tex_unified_1d_grad_v4s32_f32:
3949
0
  case Intrinsic::nvvm_tex_unified_1d_array_v4s32_s32:
3950
0
  case Intrinsic::nvvm_tex_unified_1d_array_v4s32_f32:
3951
0
  case Intrinsic::nvvm_tex_unified_1d_array_level_v4s32_f32:
3952
0
  case Intrinsic::nvvm_tex_unified_1d_array_grad_v4s32_f32:
3953
0
  case Intrinsic::nvvm_tex_unified_2d_v4s32_s32:
3954
0
  case Intrinsic::nvvm_tex_unified_2d_v4s32_f32:
3955
0
  case Intrinsic::nvvm_tex_unified_2d_level_v4s32_f32:
3956
0
  case Intrinsic::nvvm_tex_unified_2d_grad_v4s32_f32:
3957
0
  case Intrinsic::nvvm_tex_unified_2d_array_v4s32_s32:
3958
0
  case Intrinsic::nvvm_tex_unified_2d_array_v4s32_f32:
3959
0
  case Intrinsic::nvvm_tex_unified_2d_array_level_v4s32_f32:
3960
0
  case Intrinsic::nvvm_tex_unified_2d_array_grad_v4s32_f32:
3961
0
  case Intrinsic::nvvm_tex_unified_3d_v4s32_s32:
3962
0
  case Intrinsic::nvvm_tex_unified_3d_v4s32_f32:
3963
0
  case Intrinsic::nvvm_tex_unified_3d_level_v4s32_f32:
3964
0
  case Intrinsic::nvvm_tex_unified_3d_grad_v4s32_f32:
3965
0
  case Intrinsic::nvvm_tex_unified_1d_v4u32_s32:
3966
0
  case Intrinsic::nvvm_tex_unified_1d_v4u32_f32:
3967
0
  case Intrinsic::nvvm_tex_unified_1d_level_v4u32_f32:
3968
0
  case Intrinsic::nvvm_tex_unified_1d_grad_v4u32_f32:
3969
0
  case Intrinsic::nvvm_tex_unified_1d_array_v4u32_s32:
3970
0
  case Intrinsic::nvvm_tex_unified_1d_array_v4u32_f32:
3971
0
  case Intrinsic::nvvm_tex_unified_1d_array_level_v4u32_f32:
3972
0
  case Intrinsic::nvvm_tex_unified_1d_array_grad_v4u32_f32:
3973
0
  case Intrinsic::nvvm_tex_unified_2d_v4u32_s32:
3974
0
  case Intrinsic::nvvm_tex_unified_2d_v4u32_f32:
3975
0
  case Intrinsic::nvvm_tex_unified_2d_level_v4u32_f32:
3976
0
  case Intrinsic::nvvm_tex_unified_2d_grad_v4u32_f32:
3977
0
  case Intrinsic::nvvm_tex_unified_2d_array_v4u32_s32:
3978
0
  case Intrinsic::nvvm_tex_unified_2d_array_v4u32_f32:
3979
0
  case Intrinsic::nvvm_tex_unified_2d_array_level_v4u32_f32:
3980
0
  case Intrinsic::nvvm_tex_unified_2d_array_grad_v4u32_f32:
3981
0
  case Intrinsic::nvvm_tex_unified_3d_v4u32_s32:
3982
0
  case Intrinsic::nvvm_tex_unified_3d_v4u32_f32:
3983
0
  case Intrinsic::nvvm_tex_unified_3d_level_v4u32_f32:
3984
0
  case Intrinsic::nvvm_tex_unified_3d_grad_v4u32_f32:
3985
0
  case Intrinsic::nvvm_tex_unified_cube_v4s32_f32:
3986
0
  case Intrinsic::nvvm_tex_unified_cube_level_v4s32_f32:
3987
0
  case Intrinsic::nvvm_tex_unified_cube_array_v4s32_f32:
3988
0
  case Intrinsic::nvvm_tex_unified_cube_array_level_v4s32_f32:
3989
0
  case Intrinsic::nvvm_tex_unified_cube_v4u32_f32:
3990
0
  case Intrinsic::nvvm_tex_unified_cube_level_v4u32_f32:
3991
0
  case Intrinsic::nvvm_tex_unified_cube_array_v4u32_f32:
3992
0
  case Intrinsic::nvvm_tex_unified_cube_array_level_v4u32_f32:
3993
0
  case Intrinsic::nvvm_tld4_unified_r_2d_v4s32_f32:
3994
0
  case Intrinsic::nvvm_tld4_unified_g_2d_v4s32_f32:
3995
0
  case Intrinsic::nvvm_tld4_unified_b_2d_v4s32_f32:
3996
0
  case Intrinsic::nvvm_tld4_unified_a_2d_v4s32_f32:
3997
0
  case Intrinsic::nvvm_tld4_unified_r_2d_v4u32_f32:
3998
0
  case Intrinsic::nvvm_tld4_unified_g_2d_v4u32_f32:
3999
0
  case Intrinsic::nvvm_tld4_unified_b_2d_v4u32_f32:
4000
0
  case Intrinsic::nvvm_tld4_unified_a_2d_v4u32_f32:
4001
0
    Info.opc = getOpcForTextureInstr(Intrinsic);
4002
0
    Info.memVT = MVT::v4i32;
4003
0
    Info.ptrVal = nullptr;
4004
0
    Info.offset = 0;
4005
0
    Info.flags = MachineMemOperand::MOLoad;
4006
0
    Info.align = 16;
4007
0
    return true;
4008
0
4009
0
  case Intrinsic::nvvm_suld_1d_i8_clamp:
4010
0
  case Intrinsic::nvvm_suld_1d_v2i8_clamp:
4011
0
  case Intrinsic::nvvm_suld_1d_v4i8_clamp:
4012
0
  case Intrinsic::nvvm_suld_1d_array_i8_clamp:
4013
0
  case Intrinsic::nvvm_suld_1d_array_v2i8_clamp:
4014
0
  case Intrinsic::nvvm_suld_1d_array_v4i8_clamp:
4015
0
  case Intrinsic::nvvm_suld_2d_i8_clamp:
4016
0
  case Intrinsic::nvvm_suld_2d_v2i8_clamp:
4017
0
  case Intrinsic::nvvm_suld_2d_v4i8_clamp:
4018
0
  case Intrinsic::nvvm_suld_2d_array_i8_clamp:
4019
0
  case Intrinsic::nvvm_suld_2d_array_v2i8_clamp:
4020
0
  case Intrinsic::nvvm_suld_2d_array_v4i8_clamp:
4021
0
  case Intrinsic::nvvm_suld_3d_i8_clamp:
4022
0
  case Intrinsic::nvvm_suld_3d_v2i8_clamp:
4023
0
  case Intrinsic::nvvm_suld_3d_v4i8_clamp:
4024
0
  case Intrinsic::nvvm_suld_1d_i8_trap:
4025
0
  case Intrinsic::nvvm_suld_1d_v2i8_trap:
4026
0
  case Intrinsic::nvvm_suld_1d_v4i8_trap:
4027
0
  case Intrinsic::nvvm_suld_1d_array_i8_trap:
4028
0
  case Intrinsic::nvvm_suld_1d_array_v2i8_trap:
4029
0
  case Intrinsic::nvvm_suld_1d_array_v4i8_trap:
4030
0
  case Intrinsic::nvvm_suld_2d_i8_trap:
4031
0
  case Intrinsic::nvvm_suld_2d_v2i8_trap:
4032
0
  case Intrinsic::nvvm_suld_2d_v4i8_trap:
4033
0
  case Intrinsic::nvvm_suld_2d_array_i8_trap:
4034
0
  case Intrinsic::nvvm_suld_2d_array_v2i8_trap:
4035
0
  case Intrinsic::nvvm_suld_2d_array_v4i8_trap:
4036
0
  case Intrinsic::nvvm_suld_3d_i8_trap:
4037
0
  case Intrinsic::nvvm_suld_3d_v2i8_trap:
4038
0
  case Intrinsic::nvvm_suld_3d_v4i8_trap:
4039
0
  case Intrinsic::nvvm_suld_1d_i8_zero:
4040
0
  case Intrinsic::nvvm_suld_1d_v2i8_zero:
4041
0
  case Intrinsic::nvvm_suld_1d_v4i8_zero:
4042
0
  case Intrinsic::nvvm_suld_1d_array_i8_zero:
4043
0
  case Intrinsic::nvvm_suld_1d_array_v2i8_zero:
4044
0
  case Intrinsic::nvvm_suld_1d_array_v4i8_zero:
4045
0
  case Intrinsic::nvvm_suld_2d_i8_zero:
4046
0
  case Intrinsic::nvvm_suld_2d_v2i8_zero:
4047
0
  case Intrinsic::nvvm_suld_2d_v4i8_zero:
4048
0
  case Intrinsic::nvvm_suld_2d_array_i8_zero:
4049
0
  case Intrinsic::nvvm_suld_2d_array_v2i8_zero:
4050
0
  case Intrinsic::nvvm_suld_2d_array_v4i8_zero:
4051
0
  case Intrinsic::nvvm_suld_3d_i8_zero:
4052
0
  case Intrinsic::nvvm_suld_3d_v2i8_zero:
4053
0
  case Intrinsic::nvvm_suld_3d_v4i8_zero:
4054
0
    Info.opc = getOpcForSurfaceInstr(Intrinsic);
4055
0
    Info.memVT = MVT::i8;
4056
0
    Info.ptrVal = nullptr;
4057
0
    Info.offset = 0;
4058
0
    Info.flags = MachineMemOperand::MOLoad;
4059
0
    Info.align = 16;
4060
0
    return true;
4061
0
4062
0
  case Intrinsic::nvvm_suld_1d_i16_clamp:
4063
0
  case Intrinsic::nvvm_suld_1d_v2i16_clamp:
4064
0
  case Intrinsic::nvvm_suld_1d_v4i16_clamp:
4065
0
  case Intrinsic::nvvm_suld_1d_array_i16_clamp:
4066
0
  case Intrinsic::nvvm_suld_1d_array_v2i16_clamp:
4067
0
  case Intrinsic::nvvm_suld_1d_array_v4i16_clamp:
4068
0
  case Intrinsic::nvvm_suld_2d_i16_clamp:
4069
0
  case Intrinsic::nvvm_suld_2d_v2i16_clamp:
4070
0
  case Intrinsic::nvvm_suld_2d_v4i16_clamp:
4071
0
  case Intrinsic::nvvm_suld_2d_array_i16_clamp:
4072
0
  case Intrinsic::nvvm_suld_2d_array_v2i16_clamp:
4073
0
  case Intrinsic::nvvm_suld_2d_array_v4i16_clamp:
4074
0
  case Intrinsic::nvvm_suld_3d_i16_clamp:
4075
0
  case Intrinsic::nvvm_suld_3d_v2i16_clamp:
4076
0
  case Intrinsic::nvvm_suld_3d_v4i16_clamp:
4077
0
  case Intrinsic::nvvm_suld_1d_i16_trap:
4078
0
  case Intrinsic::nvvm_suld_1d_v2i16_trap:
4079
0
  case Intrinsic::nvvm_suld_1d_v4i16_trap:
4080
0
  case Intrinsic::nvvm_suld_1d_array_i16_trap:
4081
0
  case Intrinsic::nvvm_suld_1d_array_v2i16_trap:
4082
0
  case Intrinsic::nvvm_suld_1d_array_v4i16_trap:
4083
0
  case Intrinsic::nvvm_suld_2d_i16_trap:
4084
0
  case Intrinsic::nvvm_suld_2d_v2i16_trap:
4085
0
  case Intrinsic::nvvm_suld_2d_v4i16_trap:
4086
0
  case Intrinsic::nvvm_suld_2d_array_i16_trap:
4087
0
  case Intrinsic::nvvm_suld_2d_array_v2i16_trap:
4088
0
  case Intrinsic::nvvm_suld_2d_array_v4i16_trap:
4089
0
  case Intrinsic::nvvm_suld_3d_i16_trap:
4090
0
  case Intrinsic::nvvm_suld_3d_v2i16_trap:
4091
0
  case Intrinsic::nvvm_suld_3d_v4i16_trap:
4092
0
  case Intrinsic::nvvm_suld_1d_i16_zero:
4093
0
  case Intrinsic::nvvm_suld_1d_v2i16_zero:
4094
0
  case Intrinsic::nvvm_suld_1d_v4i16_zero:
4095
0
  case Intrinsic::nvvm_suld_1d_array_i16_zero:
4096
0
  case Intrinsic::nvvm_suld_1d_array_v2i16_zero:
4097
0
  case Intrinsic::nvvm_suld_1d_array_v4i16_zero:
4098
0
  case Intrinsic::nvvm_suld_2d_i16_zero:
4099
0
  case Intrinsic::nvvm_suld_2d_v2i16_zero:
4100
0
  case Intrinsic::nvvm_suld_2d_v4i16_zero:
4101
0
  case Intrinsic::nvvm_suld_2d_array_i16_zero:
4102
0
  case Intrinsic::nvvm_suld_2d_array_v2i16_zero:
4103
0
  case Intrinsic::nvvm_suld_2d_array_v4i16_zero:
4104
0
  case Intrinsic::nvvm_suld_3d_i16_zero:
4105
0
  case Intrinsic::nvvm_suld_3d_v2i16_zero:
4106
0
  case Intrinsic::nvvm_suld_3d_v4i16_zero:
4107
0
    Info.opc = getOpcForSurfaceInstr(Intrinsic);
4108
0
    Info.memVT = MVT::i16;
4109
0
    Info.ptrVal = nullptr;
4110
0
    Info.offset = 0;
4111
0
    Info.flags = MachineMemOperand::MOLoad;
4112
0
    Info.align = 16;
4113
0
    return true;
4114
0
4115
5
  case Intrinsic::nvvm_suld_1d_i32_clamp:
4116
5
  case Intrinsic::nvvm_suld_1d_v2i32_clamp:
4117
5
  case Intrinsic::nvvm_suld_1d_v4i32_clamp:
4118
5
  case Intrinsic::nvvm_suld_1d_array_i32_clamp:
4119
5
  case Intrinsic::nvvm_suld_1d_array_v2i32_clamp:
4120
5
  case Intrinsic::nvvm_suld_1d_array_v4i32_clamp:
4121
5
  case Intrinsic::nvvm_suld_2d_i32_clamp:
4122
5
  case Intrinsic::nvvm_suld_2d_v2i32_clamp:
4123
5
  case Intrinsic::nvvm_suld_2d_v4i32_clamp:
4124
5
  case Intrinsic::nvvm_suld_2d_array_i32_clamp:
4125
5
  case Intrinsic::nvvm_suld_2d_array_v2i32_clamp:
4126
5
  case Intrinsic::nvvm_suld_2d_array_v4i32_clamp:
4127
5
  case Intrinsic::nvvm_suld_3d_i32_clamp:
4128
5
  case Intrinsic::nvvm_suld_3d_v2i32_clamp:
4129
5
  case Intrinsic::nvvm_suld_3d_v4i32_clamp:
4130
5
  case Intrinsic::nvvm_suld_1d_i32_trap:
4131
5
  case Intrinsic::nvvm_suld_1d_v2i32_trap:
4132
5
  case Intrinsic::nvvm_suld_1d_v4i32_trap:
4133
5
  case Intrinsic::nvvm_suld_1d_array_i32_trap:
4134
5
  case Intrinsic::nvvm_suld_1d_array_v2i32_trap:
4135
5
  case Intrinsic::nvvm_suld_1d_array_v4i32_trap:
4136
5
  case Intrinsic::nvvm_suld_2d_i32_trap:
4137
5
  case Intrinsic::nvvm_suld_2d_v2i32_trap:
4138
5
  case Intrinsic::nvvm_suld_2d_v4i32_trap:
4139
5
  case Intrinsic::nvvm_suld_2d_array_i32_trap:
4140
5
  case Intrinsic::nvvm_suld_2d_array_v2i32_trap:
4141
5
  case Intrinsic::nvvm_suld_2d_array_v4i32_trap:
4142
5
  case Intrinsic::nvvm_suld_3d_i32_trap:
4143
5
  case Intrinsic::nvvm_suld_3d_v2i32_trap:
4144
5
  case Intrinsic::nvvm_suld_3d_v4i32_trap:
4145
5
  case Intrinsic::nvvm_suld_1d_i32_zero:
4146
5
  case Intrinsic::nvvm_suld_1d_v2i32_zero:
4147
5
  case Intrinsic::nvvm_suld_1d_v4i32_zero:
4148
5
  case Intrinsic::nvvm_suld_1d_array_i32_zero:
4149
5
  case Intrinsic::nvvm_suld_1d_array_v2i32_zero:
4150
5
  case Intrinsic::nvvm_suld_1d_array_v4i32_zero:
4151
5
  case Intrinsic::nvvm_suld_2d_i32_zero:
4152
5
  case Intrinsic::nvvm_suld_2d_v2i32_zero:
4153
5
  case Intrinsic::nvvm_suld_2d_v4i32_zero:
4154
5
  case Intrinsic::nvvm_suld_2d_array_i32_zero:
4155
5
  case Intrinsic::nvvm_suld_2d_array_v2i32_zero:
4156
5
  case Intrinsic::nvvm_suld_2d_array_v4i32_zero:
4157
5
  case Intrinsic::nvvm_suld_3d_i32_zero:
4158
5
  case Intrinsic::nvvm_suld_3d_v2i32_zero:
4159
5
  case Intrinsic::nvvm_suld_3d_v4i32_zero:
4160
5
    Info.opc = getOpcForSurfaceInstr(Intrinsic);
4161
5
    Info.memVT = MVT::i32;
4162
5
    Info.ptrVal = nullptr;
4163
5
    Info.offset = 0;
4164
5
    Info.flags = MachineMemOperand::MOLoad;
4165
5
    Info.align = 16;
4166
5
    return true;
4167
5
4168
5
  case Intrinsic::nvvm_suld_1d_i64_clamp:
4169
0
  case Intrinsic::nvvm_suld_1d_v2i64_clamp:
4170
0
  case Intrinsic::nvvm_suld_1d_array_i64_clamp:
4171
0
  case Intrinsic::nvvm_suld_1d_array_v2i64_clamp:
4172
0
  case Intrinsic::nvvm_suld_2d_i64_clamp:
4173
0
  case Intrinsic::nvvm_suld_2d_v2i64_clamp:
4174
0
  case Intrinsic::nvvm_suld_2d_array_i64_clamp:
4175
0
  case Intrinsic::nvvm_suld_2d_array_v2i64_clamp:
4176
0
  case Intrinsic::nvvm_suld_3d_i64_clamp:
4177
0
  case Intrinsic::nvvm_suld_3d_v2i64_clamp:
4178
0
  case Intrinsic::nvvm_suld_1d_i64_trap:
4179
0
  case Intrinsic::nvvm_suld_1d_v2i64_trap:
4180
0
  case Intrinsic::nvvm_suld_1d_array_i64_trap:
4181
0
  case Intrinsic::nvvm_suld_1d_array_v2i64_trap:
4182
0
  case Intrinsic::nvvm_suld_2d_i64_trap:
4183
0
  case Intrinsic::nvvm_suld_2d_v2i64_trap:
4184
0
  case Intrinsic::nvvm_suld_2d_array_i64_trap:
4185
0
  case Intrinsic::nvvm_suld_2d_array_v2i64_trap:
4186
0
  case Intrinsic::nvvm_suld_3d_i64_trap:
4187
0
  case Intrinsic::nvvm_suld_3d_v2i64_trap:
4188
0
  case Intrinsic::nvvm_suld_1d_i64_zero:
4189
0
  case Intrinsic::nvvm_suld_1d_v2i64_zero:
4190
0
  case Intrinsic::nvvm_suld_1d_array_i64_zero:
4191
0
  case Intrinsic::nvvm_suld_1d_array_v2i64_zero:
4192
0
  case Intrinsic::nvvm_suld_2d_i64_zero:
4193
0
  case Intrinsic::nvvm_suld_2d_v2i64_zero:
4194
0
  case Intrinsic::nvvm_suld_2d_array_i64_zero:
4195
0
  case Intrinsic::nvvm_suld_2d_array_v2i64_zero:
4196
0
  case Intrinsic::nvvm_suld_3d_i64_zero:
4197
0
  case Intrinsic::nvvm_suld_3d_v2i64_zero:
4198
0
    Info.opc = getOpcForSurfaceInstr(Intrinsic);
4199
0
    Info.memVT = MVT::i64;
4200
0
    Info.ptrVal = nullptr;
4201
0
    Info.offset = 0;
4202
0
    Info.flags = MachineMemOperand::MOLoad;
4203
0
    Info.align = 16;
4204
0
    return true;
4205
0
  }
4206
0
  return false;
4207
0
}
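For context, a hedged CUDA sketch of a source pattern that reaches the nvvm_ldg_global_* cases handled above; the kernel and the assumption that __ldg lowers to those intrinsics are illustrative, not part of this file.

// Hedged sketch: assumes the usual __ldg builtin lowering to
// llvm.nvvm.ldg.global.* with an explicit alignment operand, which is
// what getTgtMemIntrinsic reads above (Info.align from arg operand 1).
__global__ void scale(const float *__restrict__ in, float *out, float k) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  out[i] = __ldg(&in[i]) * k;  // read-only global load, modeled as MOLoad
}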
4208
4209
/// isLegalAddressingMode - Return true if the addressing mode represented
4210
/// by AM is legal for this target, for a load/store of the specified type.
4211
/// Used to guide target-specific optimizations, like loop strength reduction
4212
/// (LoopStrengthReduce.cpp) and memory optimization for address mode
4213
/// (CodeGenPrepare.cpp)
4214
bool NVPTXTargetLowering::isLegalAddressingMode(const DataLayout &DL,
4215
                                                const AddrMode &AM, Type *Ty,
4216
1.89k
                                                unsigned AS, Instruction *I) const {
4217
1.89k
  // AddrMode - This represents an addressing mode of:
4218
1.89k
  //    BaseGV + BaseOffs + BaseReg + Scale*ScaleReg
4219
1.89k
  //
4220
1.89k
  // The legal address modes are
4221
1.89k
  // - [avar]
4222
1.89k
  // - [areg]
4223
1.89k
  // - [areg+immoff]
4224
1.89k
  // - [immAddr]
4225
1.89k
4226
1.89k
  if (AM.BaseGV) {
4227
70
    return !AM.BaseOffs && !AM.HasBaseReg && !AM.Scale;
4228
70
  }
4229
1.82k
4230
1.82k
  switch (AM.Scale) {
4231
1.82k
  case 0: // "r", "r+i" or "i" is allowed
4232
1.17k
    break;
4233
1.82k
  case 1:
4234
386
    if (AM.HasBaseReg) // "r+r+i" or "r+r" is not allowed.
4235
386
      return false;
4236
0
    // Otherwise we have r+i.
4237
0
    break;
4238
262
  default:
4239
262
    // No scale > 1 is allowed
4240
262
    return false;
4241
1.17k
  }
4242
1.17k
  return true;
4243
1.17k
}
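A hedged CUDA sketch of how these rules surface at the source level; the PTX notes in the comments are expectations for illustration, not drawn from this file.

// Hedged sketch: a constant offset folds into the legal [areg+immoff]
// form accepted above, while indexed access needs explicit address
// arithmetic because the r+r and scaled forms are rejected.
__device__ float sum2(const float *p, int i) {
  float a = p[4];   // expected: ld.f32 [reg+16]
  float b = p[i];   // expected: add to form the address, then ld.f32 [reg]
  return a + b;
}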
4244
4245
//===----------------------------------------------------------------------===//
4246
//                         NVPTX Inline Assembly Support
4247
//===----------------------------------------------------------------------===//
4248
4249
/// getConstraintType - Given a constraint letter, return the type of
4250
/// constraint it is for this target.
4251
NVPTXTargetLowering::ConstraintType
4252
30
NVPTXTargetLowering::getConstraintType(StringRef Constraint) const {
4253
30
  if (Constraint.size() == 1) {
4254
30
    switch (Constraint[0]) {
4255
30
    default:
4256
6
      break;
4257
30
    case 'b':
4258
24
    case 'r':
4259
24
    case 'h':
4260
24
    case 'c':
4261
24
    case 'l':
4262
24
    case 'f':
4263
24
    case 'd':
4264
24
    case '0':
4265
24
    case 'N':
4266
24
      return C_RegisterClass;
4267
6
    }
4268
6
  }
4269
6
  return TargetLowering::getConstraintType(Constraint);
4270
6
}
4271
4272
std::pair<unsigned, const TargetRegisterClass *>
4273
NVPTXTargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
4274
                                                  StringRef Constraint,
4275
8
                                                  MVT VT) const {
4276
8
  if (Constraint.size() == 1) {
4277
8
    switch (Constraint[0]) {
4278
8
    case 'b':
4279
1
      return std::make_pair(0U, &NVPTX::Int1RegsRegClass);
4280
8
    case 'c':
4281
0
      return std::make_pair(0U, &NVPTX::Int16RegsRegClass);
4282
8
    case 'h':
4283
0
      return std::make_pair(0U, &NVPTX::Int16RegsRegClass);
4284
8
    case 'r':
4285
3
      return std::make_pair(0U, &NVPTX::Int32RegsRegClass);
4286
8
    case 'l':
4287
0
    case 'N':
4288
0
      return std::make_pair(0U, &NVPTX::Int64RegsRegClass);
4289
2
    case 'f':
4290
2
      return std::make_pair(0U, &NVPTX::Float32RegsRegClass);
4291
0
    case 'd':
4292
0
      return std::make_pair(0U, &NVPTX::Float64RegsRegClass);
4293
2
    }
4294
2
  }
4295
2
  return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
4296
2
}
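For context, a hedged CUDA inline-PTX sketch of how the constraint letters above are used; the example is illustrative and not taken from this file.

// Hedged sketch: the 'r' constraint selects Int32Regs per the mapping
// above; %%clock is a PTX special register, used here only as a source.
__device__ unsigned int read_clock() {
  unsigned int c;
  asm volatile("mov.u32 %0, %%clock;" : "=r"(c));
  return c;
}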
4297
4298
//===----------------------------------------------------------------------===//
4299
//                         NVPTX DAG Combining
4300
//===----------------------------------------------------------------------===//
4301
4302
bool NVPTXTargetLowering::allowFMA(MachineFunction &MF,
4303
713
                                   CodeGenOpt::Level OptLevel) const {
4304
713
  // Always honor command-line argument
4305
713
  if (FMAContractLevelOpt.getNumOccurrences() > 0)
4306
36
    return FMAContractLevelOpt > 0;
4307
677
4308
677
  // Do not contract if we're not optimizing the code.
4309
677
  if (OptLevel == 0)
4310
275
    return false;
4311
402
4312
402
  // Honor TargetOptions flags that explicitly say fusion is okay.
4313
402
  if (MF.getTarget().Options.AllowFPOpFusion == FPOpFusion::Fast)
4314
31
    return true;
4315
371
4316
371
  return allowUnsafeFPMath(MF);
4317
371
}
4318
4319
395
bool NVPTXTargetLowering::allowUnsafeFPMath(MachineFunction &MF) const {
4320
395
  // Honor TargetOptions flags that explicitly say unsafe math is okay.
4321
395
  if (MF.getTarget().Options.UnsafeFPMath)
4322
58
    return true;
4323
337
4324
337
  // Allow unsafe math if unsafe-fp-math attribute explicitly says so.
4325
337
  const Function &F = MF.getFunction();
4326
337
  if (F.hasFnAttribute("unsafe-fp-math")) {
4327
2
    Attribute Attr = F.getFnAttribute("unsafe-fp-math");
4328
2
    StringRef Val = Attr.getValueAsString();
4329
2
    if (Val == "true")
4330
0
      return true;
4331
337
  }
4332
337
4333
337
  return false;
4334
337
}
4335
4336
/// PerformADDCombineWithOperands - Try DAG combinations for an ADD with
4337
/// operands N0 and N1.  This is a helper for PerformADDCombine that is
4338
/// called with the default operands, and if that fails, with commuted
4339
/// operands.
4340
static SDValue PerformADDCombineWithOperands(SDNode *N, SDValue N0, SDValue N1,
4341
                                           TargetLowering::DAGCombinerInfo &DCI,
4342
                                             const NVPTXSubtarget &Subtarget,
4343
2.95k
                                             CodeGenOpt::Level OptLevel) {
4344
2.95k
  SelectionDAG  &DAG = DCI.DAG;
4345
2.95k
  // Skip the non-scalar (vector) case.
4346
2.95k
  EVT VT = N0.getValueType();
4347
2.95k
  if (VT.isVector())
4348
120
    return SDValue();
4349
2.83k
4350
2.83k
  // fold (add (mul a, b), c) -> (mad a, b, c)
4351
2.83k
  //
4352
2.83k
  if (N0.getOpcode() == ISD::MUL) {
4353
9
    assert (VT.isInteger());
4354
9
    // For integer:
4355
9
    // Since integer multiply-add costs the same as integer multiply
4356
9
    // but is more costly than integer add, do the fusion only when
4357
9
    // the mul is only used in the add.
4358
9
    if (OptLevel==CodeGenOpt::None || VT != MVT::i32 ||
4359
9
        !N0.getNode()->hasOneUse())
4360
5
      return SDValue();
4361
4
4362
4
    // Do the folding
4363
4
    return DAG.getNode(NVPTXISD::IMAD, SDLoc(N), VT,
4364
4
                       N0.getOperand(0), N0.getOperand(1), N1);
4365
4
  }
4366
2.82k
  else if (N0.getOpcode() == ISD::FMUL) {
4367
40
    if (VT == MVT::f32 || VT == MVT::f64) {
4368
40
      const auto *TLI = static_cast<const NVPTXTargetLowering *>(
4369
40
          &DAG.getTargetLoweringInfo());
4370
40
      if (!TLI->allowFMA(DAG.getMachineFunction(), OptLevel))
4371
36
        return SDValue();
4372
4
4373
4
      // For floating point:
4374
4
      // Do the fusion only when the mul has fewer than 5 uses and all
4375
4
      // of them are adds.
4376
4
      // The heuristic is that if a use is not an add, then that use
4377
4
      // cannot be fused into an fma, so the mul is still needed anyway.
4378
4
      // If there are more than 4 uses, even if they are all adds, fusing
4379
4
      // them will increase register pressure.
4380
4
      //
4381
4
      int numUses = 0;
4382
4
      int nonAddCount = 0;
4383
4
      for (SDNode::use_iterator UI = N0.getNode()->use_begin(),
4384
4
           UE = N0.getNode()->use_end();
4385
8
           UI != UE; ++UI) {
4386
4
        numUses++;
4387
4
        SDNode *User = *UI;
4388
4
        if (User->getOpcode() != ISD::FADD)
4389
0
          ++nonAddCount;
4390
4
      }
4391
4
      if (numUses >= 5)
4392
0
        return SDValue();
4393
4
      if (nonAddCount) {
4394
0
        int orderNo = N->getIROrder();
4395
0
        int orderNo2 = N0.getNode()->getIROrder();
4396
0
        // Simple heuristic here for gauging potential register
4397
0
        // pressure: the difference in IR order is used to measure the
4398
0
        // distance between def and use; the longer the distance, the
4399
0
        // more likely it is to cause register pressure.
4400
0
        if (orderNo - orderNo2 < 500)
4401
0
          return SDValue();
4402
0
4403
0
        // Now, check if at least one of the FMUL's operands is live beyond the node N,
4404
0
        // which guarantees that the FMA will not increase register pressure at node N.
4405
0
        bool opIsLive = false;
4406
0
        const SDNode *left = N0.getOperand(0).getNode();
4407
0
        const SDNode *right = N0.getOperand(1).getNode();
4408
0
4409
0
        if (isa<ConstantSDNode>(left) || isa<ConstantSDNode>(right))
4410
0
          opIsLive = true;
4411
0
4412
0
        if (!opIsLive)
4413
0
          for (SDNode::use_iterator UI = left->use_begin(), UE = left->use_end(); UI != UE; ++UI) {
4414
0
            SDNode *User = *UI;
4415
0
            int orderNo3 = User->getIROrder();
4416
0
            if (orderNo3 > orderNo) {
4417
0
              opIsLive = true;
4418
0
              break;
4419
0
            }
4420
0
          }
4421
0
4422
0
        if (!opIsLive)
4423
0
          for (SDNode::use_iterator UI = right->use_begin(), UE = right->use_end(); UI != UE; ++UI) {
4424
0
            SDNode *User = *UI;
4425
0
            int orderNo3 = User->getIROrder();
4426
0
            if (orderNo3 > orderNo) {
4427
0
              opIsLive = true;
4428
0
              break;
4429
0
            }
4430
0
          }
4431
0
4432
0
        if (!opIsLive)
4433
0
          return SDValue();
4434
4
      }
4435
4
4436
4
      return DAG.getNode(ISD::FMA, SDLoc(N), VT,
4437
4
                         N0.getOperand(0), N0.getOperand(1), N1);
4438
4
    }
4439
40
  }
4440
2.78k
4441
2.78k
  return SDValue();
4442
2.78k
}
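A hedged CUDA sketch of the source patterns this combine targets; the lowering notes are expectations under the conditions checked above, not guarantees.

// Hedged sketch: for i32 the mul must have the add as its only use to
// become NVPTXISD::IMAD; for f32/f64 the rewrite to ISD::FMA is gated
// on allowFMA(), i.e. the fma-level flag, fp-contract, or unsafe math.
__device__ int   imad(int a, int b, int c)       { return a * b + c; }
__device__ float ffma(float a, float b, float c) { return a * b + c; }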
4443
4444
/// PerformADDCombine - Target-specific dag combine xforms for ISD::ADD.
4445
///
4446
static SDValue PerformADDCombine(SDNode *N,
4447
                                 TargetLowering::DAGCombinerInfo &DCI,
4448
                                 const NVPTXSubtarget &Subtarget,
4449
1.48k
                                 CodeGenOpt::Level OptLevel) {
4450
1.48k
  SDValue N0 = N->getOperand(0);
4451
1.48k
  SDValue N1 = N->getOperand(1);
4452
1.48k
4453
1.48k
  // First try with the default operand order.
4454
1.48k
  if (SDValue Result =
4455
7
          PerformADDCombineWithOperands(N, N0, N1, DCI, Subtarget, OptLevel))
4456
7
    return Result;
4457
1.47k
4458
1.47k
  // If that didn't work, try again with the operands commuted.
4459
1.47k
  return PerformADDCombineWithOperands(N, N1, N0, DCI, Subtarget, OptLevel);
4460
1.47k
}
4461
4462
static SDValue PerformANDCombine(SDNode *N,
4463
262
                                 TargetLowering::DAGCombinerInfo &DCI) {
4464
262
  // The type legalizer turns a vector load of i8 values into a zextload to i16
4465
262
  // registers, optionally ANY_EXTENDs it (if target type is integer),
4466
262
  // and ANDs off the high 8 bits. Since we turn this load into a
4467
262
  // target-specific DAG node, the DAG combiner fails to eliminate these AND
4468
262
  // nodes. Do that here.
4469
262
  SDValue Val = N->getOperand(0);
4470
262
  SDValue Mask = N->getOperand(1);
4471
262
4472
262
  if (isa<ConstantSDNode>(Val)) {
4473
0
    std::swap(Val, Mask);
4474
0
  }
4475
262
4476
262
  SDValue AExt;
4477
262
  // Generally, we will see zextload -> IMOV16rr -> ANY_EXTEND -> and
4478
262
  if (Val.getOpcode() == ISD::ANY_EXTEND) {
4479
31
    AExt = Val;
4480
31
    Val = Val->getOperand(0);
4481
31
  }
4482
262
4483
262
  if (Val->isMachineOpcode() && Val->getMachineOpcode() == NVPTX::IMOV16rr) {
4484
0
    Val = Val->getOperand(0);
4485
0
  }
4486
262
4487
262
  if (Val->getOpcode() == NVPTXISD::LoadV2 ||
4488
262
      Val->getOpcode() == NVPTXISD::LoadV4) {
4489
2
    ConstantSDNode *MaskCnst = dyn_cast<ConstantSDNode>(Mask);
4490
2
    if (!MaskCnst) {
4491
0
      // Not an AND with a constant
4492
0
      return SDValue();
4493
0
    }
4494
2
4495
2
    uint64_t MaskVal = MaskCnst->getZExtValue();
4496
2
    if (MaskVal != 0xff) {
4497
0
      // Not an AND that chops off top 8 bits
4498
0
      return SDValue();
4499
0
    }
4500
2
4501
2
    MemSDNode *Mem = dyn_cast<MemSDNode>(Val);
4502
2
    if (!Mem) {
4503
0
      // Not a MemSDNode?!?
4504
0
      return SDValue();
4505
0
    }
4506
2
4507
2
    EVT MemVT = Mem->getMemoryVT();
4508
2
    if (MemVT != MVT::v2i8 && MemVT != MVT::v4i8) {
4509
0
      // We only handle the i8 case
4510
0
      return SDValue();
4511
0
    }
4512
2
4513
2
    unsigned ExtType =
4514
2
      cast<ConstantSDNode>(Val->getOperand(Val->getNumOperands()-1))->
4515
2
        getZExtValue();
4516
2
    if (ExtType == ISD::SEXTLOAD) {
4517
0
      // If for some reason the load is a sextload, the and is needed to zero
4518
0
      // out the high 8 bits
4519
0
      return SDValue();
4520
0
    }
4521
2
4522
2
    bool AddTo = false;
4523
2
    if (AExt.getNode() != nullptr) {
4524
2
      // Re-insert the ext as a zext.
4525
2
      Val = DCI.DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N),
4526
2
                            AExt.getValueType(), Val);
4527
2
      AddTo = true;
4528
2
    }
4529
2
4530
2
    // If we get here, the AND is unnecessary.  Just replace it with the load
4531
2
    DCI.CombineTo(N, Val, AddTo);
4532
2
  }
4533
262
4534
262
  return SDValue();
4535
262
}
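A hedged CUDA sketch of the kind of sub-word vector load whose redundant mask this combine removes; whether a front end emits exactly this DAG shape is an assumption.

// Hedged sketch: a v2i8 load is legalized into a zero-extending load of
// 16-bit registers; the 'and x, 0xff' inserted after the any_extend is
// the node PerformANDCombine folds away.
__device__ int widen(const uchar2 *p) {
  uchar2 v = *p;      // expected to become NVPTXISD::LoadV2 of v2i8
  return v.x + v.y;
}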
4536
4537
static SDValue PerformREMCombine(SDNode *N,
4538
                                 TargetLowering::DAGCombinerInfo &DCI,
4539
94
                                 CodeGenOpt::Level OptLevel) {
4540
94
  assert(N->getOpcode() == ISD::SREM || N->getOpcode() == ISD::UREM);
4541
94
4542
94
  // Don't do anything at less than -O2.
4543
94
  if (OptLevel < CodeGenOpt::Default)
4544
21
    return SDValue();
4545
73
4546
73
  SelectionDAG &DAG = DCI.DAG;
4547
73
  SDLoc DL(N);
4548
73
  EVT VT = N->getValueType(0);
4549
73
  bool IsSigned = N->getOpcode() == ISD::SREM;
4550
73
  unsigned DivOpc = IsSigned ? ISD::SDIV : ISD::UDIV;
4551
73
4552
73
  const SDValue &Num = N->getOperand(0);
4553
73
  const SDValue &Den = N->getOperand(1);
4554
73
4555
87
  for (const SDNode *U : Num->uses()) {
4556
87
    if (U->getOpcode() == DivOpc && U->getOperand(0) == Num &&
4557
87
        U->getOperand(1) == Den) {
4558
2
      // Num % Den -> Num - (Num / Den) * Den
4559
2
      return DAG.getNode(ISD::SUB, DL, VT, Num,
4560
2
                         DAG.getNode(ISD::MUL, DL, VT,
4561
2
                                     DAG.getNode(DivOpc, DL, VT, Num, Den),
4562
2
                                     Den));
4563
2
    }
4564
87
  }
4565
73
  return SDValue();
4566
73
}
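A hedged CUDA illustration of the rewrite above; the function itself is hypothetical.

// Hedged sketch: with both quotient and remainder live, the SREM/UREM is
// rewritten (at -O2 and above, per the OptLevel check) so the single
// hardware division is reused: a % b ==> a - (a / b) * b.
__device__ void divmod(int a, int b, int *q, int *r) {
  *q = a / b;
  *r = a % b;  // shares the division with the line above after the combine
}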
4567
4568
enum OperandSignedness {
4569
  Signed = 0,
4570
  Unsigned,
4571
  Unknown
4572
};
4573
4574
/// IsMulWideOperandDemotable - Checks if the provided DAG node is an operand
4575
/// that can be demoted to \p OptSize bits without loss of information. The
4576
/// signedness of the operand, if determinable, is placed in \p S.
4577
static bool IsMulWideOperandDemotable(SDValue Op,
4578
                                      unsigned OptSize,
4579
153
                                      OperandSignedness &S) {
4580
153
  S = Unknown;
4581
153
4582
153
  if (Op.getOpcode() == ISD::SIGN_EXTEND ||
4583
153
      Op.getOpcode() == ISD::SIGN_EXTEND_INREG) {
4584
21
    EVT OrigVT = Op.getOperand(0).getValueType();
4585
21
    if (OrigVT.getSizeInBits() <= OptSize) {
4586
21
      S = Signed;
4587
21
      return true;
4588
21
    }
4589
132
  } else if (Op.getOpcode() == ISD::ZERO_EXTEND) {
4590
13
    EVT OrigVT = Op.getOperand(0).getValueType();
4591
13
    if (OrigVT.getSizeInBits() <= OptSize) {
4592
13
      S = Unsigned;
4593
13
      return true;
4594
13
    }
4595
119
  }
4596
119
4597
119
  return false;
4598
119
}
4599
4600
/// AreMulWideOperandsDemotable - Checks if the given LHS and RHS operands can
4601
/// be demoted to \p OptSize bits without loss of information. If the operands
4602
/// contain a constant, it should appear as the RHS operand. The signedness of
4603
/// the operands is placed in \p IsSigned.
4604
static bool AreMulWideOperandsDemotable(SDValue LHS, SDValue RHS,
4605
                                        unsigned OptSize,
4606
144
                                        bool &IsSigned) {
4607
144
  OperandSignedness LHSSign;
4608
144
4609
144
  // The LHS operand must be a demotable op
4610
144
  if (!IsMulWideOperandDemotable(LHS, OptSize, LHSSign))
4611
119
    return false;
4612
25
4613
25
  // We should have been able to determine the signedness from the LHS
4614
25
  if (LHSSign == Unknown)
4615
0
    return false;
4616
25
4617
25
  IsSigned = (LHSSign == Signed);
4618
25
4619
25
  // The RHS can be a demotable op or a constant
4620
25
  if (ConstantSDNode *CI = dyn_cast<ConstantSDNode>(RHS)) {
4621
16
    const APInt &Val = CI->getAPIntValue();
4622
16
    if (LHSSign == Unsigned) {
4623
5
      return Val.isIntN(OptSize);
4624
11
    } else {
4625
11
      return Val.isSignedIntN(OptSize);
4626
11
    }
4627
9
  } else {
4628
9
    OperandSignedness RHSSign;
4629
9
    if (!IsMulWideOperandDemotable(RHS, OptSize, RHSSign))
4630
0
      return false;
4631
9
4632
9
    return LHSSign == RHSSign;
4633
9
  }
4634
25
}
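A hedged CUDA sketch of the canonical pattern that passes these demotability checks; the mul.wide note is an expectation, not taken from this file.

// Hedged sketch: both operands are sign-extended from i32, so the i64
// multiply can be demoted per the checks above (expected: mul.wide.s32).
__device__ long long mul_wide(int x, int y) {
  return (long long)x * (long long)y;
}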
4635
4636
/// TryMULWIDECombine - Attempt to replace a multiply of M bits with a multiply
4637
/// of M/2 bits that produces an M-bit result (i.e. mul.wide). This transform
4638
/// works on both multiply DAG nodes and SHL DAG nodes with a constant shift
4639
/// amount.
4640
static SDValue TryMULWIDECombine(SDNode *N,
4641
190
                                 TargetLowering::DAGCombinerInfo &DCI) {
4642
190
  EVT MulType = N->getValueType(0);
4643
190
  if (MulType != MVT::i32 && MulType != MVT::i64) {
4644
27
    return SDValue();
4645
27
  }
4646
163
4647
163
  SDLoc DL(N);
4648
163
  unsigned OptSize = MulType.getSizeInBits() >> 1;
4649
163
  SDValue LHS = N->getOperand(0);
4650
163
  SDValue RHS = N->getOperand(1);
4651
163
4652
163
  // Canonicalize the multiply so the constant (if any) is on the right
4653
163
  if (N->getOpcode() == ISD::MUL) {
4654
69
    if (isa<ConstantSDNode>(LHS)) {
4655
0
      std::swap(LHS, RHS);
4656
0
    }
4657
69
  }
4658
163
4659
163
  // If we have a SHL, determine the actual multiply amount
4660
163
  if (N->getOpcode() == ISD::SHL) {
4661
94
    ConstantSDNode *ShlRHS = dyn_cast<ConstantSDNode>(RHS);
4662
94
    if (!ShlRHS) {
4663
19
      return SDValue();
4664
19
    }
4665
75
4666
75
    APInt ShiftAmt = ShlRHS->getAPIntValue();
4667
75
    unsigned BitWidth = MulType.getSizeInBits();
4668
75
    if (ShiftAmt.sge(0) && ShiftAmt.slt(BitWidth)) {
4669
75
      APInt MulVal = APInt(BitWidth, 1) << ShiftAmt;
4670
75
      RHS = DCI.DAG.getConstant(MulVal, DL, MulType);
4671
75
    } else {
4672
0
      return SDValue();
4673
0
    }
4674
144
  }
4675
144
4676
144
  bool Signed;
4677
144
  // Verify that our operands are demotable
4678
144
  if (!AreMulWideOperandsDemotable(LHS, RHS, OptSize, Signed)) {
4679
119
    return SDValue();
4680
119
  }
4681
25
4682
25
  EVT DemotedVT;
4683
25
  if (MulType == MVT::i32) {
4684
4
    DemotedVT = MVT::i16;
4685
21
  } else {
4686
21
    DemotedVT = MVT::i32;
4687
21
  }
4688
25