Coverage Report

Created: 2017-10-03 07:32

Source file: /Users/buildslave/jenkins/sharedspace/clang-stage2-coverage-R@2/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
//===-- SIISelLowering.cpp - SI DAG Lowering Implementation ---------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
/// \file
/// \brief Custom DAG lowering for SI
//
//===----------------------------------------------------------------------===//

#ifdef _MSC_VER
// Provide M_PI.
#define _USE_MATH_DEFINES
#endif

#include "SIISelLowering.h"
#include "AMDGPU.h"
#include "AMDGPUIntrinsicInfo.h"
#include "AMDGPUSubtarget.h"
#include "AMDGPUTargetMachine.h"
#include "SIDefines.h"
#include "SIInstrInfo.h"
#include "SIMachineFunctionInfo.h"
#include "SIRegisterInfo.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/APFloat.h"
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/BitVector.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/StringSwitch.h"
#include "llvm/ADT/Twine.h"
#include "llvm/CodeGen/Analysis.h"
#include "llvm/CodeGen/CallingConvLower.h"
#include "llvm/CodeGen/DAGCombine.h"
#include "llvm/CodeGen/ISDOpcodes.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineMemOperand.h"
#include "llvm/CodeGen/MachineModuleInfo.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/MachineValueType.h"
#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/CodeGen/SelectionDAGNodes.h"
#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/GlobalValue.h"
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Type.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CodeGen.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/KnownBits.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Target/TargetCallingConv.h"
#include "llvm/Target/TargetOptions.h"
#include "llvm/Target/TargetRegisterInfo.h"
#include <cassert>
#include <cmath>
#include <cstdint>
#include <iterator>
#include <tuple>
#include <utility>
#include <vector>

using namespace llvm;

#define DEBUG_TYPE "si-lower"

STATISTIC(NumTailCalls, "Number of tail calls");

static cl::opt<bool> EnableVGPRIndexMode(
  "amdgpu-vgpr-index-mode",
  cl::desc("Use GPR indexing mode instead of movrel for vector indexing"),
  cl::init(false));

static unsigned findFirstFreeSGPR(CCState &CCInfo) {
  unsigned NumSGPRs = AMDGPU::SGPR_32RegClass.getNumRegs();
  for (unsigned Reg = 0; Reg < NumSGPRs; ++Reg) {
    if (!CCInfo.isAllocated(AMDGPU::SGPR0 + Reg)) {
      return AMDGPU::SGPR0 + Reg;
    }
  }
  llvm_unreachable("Cannot allocate sgpr");
}

SITargetLowering::SITargetLowering(const TargetMachine &TM,
                                   const SISubtarget &STI)
    : AMDGPUTargetLowering(TM, STI) {
  addRegisterClass(MVT::i1, &AMDGPU::VReg_1RegClass);
  addRegisterClass(MVT::i64, &AMDGPU::SReg_64RegClass);

  addRegisterClass(MVT::i32, &AMDGPU::SReg_32_XM0RegClass);
  addRegisterClass(MVT::f32, &AMDGPU::VGPR_32RegClass);

  addRegisterClass(MVT::f64, &AMDGPU::VReg_64RegClass);
  addRegisterClass(MVT::v2i32, &AMDGPU::SReg_64RegClass);
  addRegisterClass(MVT::v2f32, &AMDGPU::VReg_64RegClass);

  addRegisterClass(MVT::v2i64, &AMDGPU::SReg_128RegClass);
  addRegisterClass(MVT::v2f64, &AMDGPU::SReg_128RegClass);

  addRegisterClass(MVT::v4i32, &AMDGPU::SReg_128RegClass);
  addRegisterClass(MVT::v4f32, &AMDGPU::VReg_128RegClass);

  addRegisterClass(MVT::v8i32, &AMDGPU::SReg_256RegClass);
  addRegisterClass(MVT::v8f32, &AMDGPU::VReg_256RegClass);

  addRegisterClass(MVT::v16i32, &AMDGPU::SReg_512RegClass);
  addRegisterClass(MVT::v16f32, &AMDGPU::VReg_512RegClass);

  if (Subtarget->has16BitInsts()) {
    addRegisterClass(MVT::i16, &AMDGPU::SReg_32_XM0RegClass);
    addRegisterClass(MVT::f16, &AMDGPU::SReg_32_XM0RegClass);
  }

  if (Subtarget->hasVOP3PInsts()) {
    addRegisterClass(MVT::v2i16, &AMDGPU::SReg_32_XM0RegClass);
    addRegisterClass(MVT::v2f16, &AMDGPU::SReg_32_XM0RegClass);
  }

  computeRegisterProperties(STI.getRegisterInfo());

  // We need to custom lower vector stores from local memory
  setOperationAction(ISD::LOAD, MVT::v2i32, Custom);
  setOperationAction(ISD::LOAD, MVT::v4i32, Custom);
  setOperationAction(ISD::LOAD, MVT::v8i32, Custom);
  setOperationAction(ISD::LOAD, MVT::v16i32, Custom);
  setOperationAction(ISD::LOAD, MVT::i1, Custom);

  setOperationAction(ISD::STORE, MVT::v2i32, Custom);
  setOperationAction(ISD::STORE, MVT::v4i32, Custom);
  setOperationAction(ISD::STORE, MVT::v8i32, Custom);
  setOperationAction(ISD::STORE, MVT::v16i32, Custom);
  setOperationAction(ISD::STORE, MVT::i1, Custom);

  setTruncStoreAction(MVT::v2i32, MVT::v2i16, Expand);
  setTruncStoreAction(MVT::v4i32, MVT::v4i16, Expand);
  setTruncStoreAction(MVT::v8i32, MVT::v8i16, Expand);
  setTruncStoreAction(MVT::v16i32, MVT::v16i16, Expand);
  setTruncStoreAction(MVT::v32i32, MVT::v32i16, Expand);
  setTruncStoreAction(MVT::v2i32, MVT::v2i8, Expand);
  setTruncStoreAction(MVT::v4i32, MVT::v4i8, Expand);
  setTruncStoreAction(MVT::v8i32, MVT::v8i8, Expand);
  setTruncStoreAction(MVT::v16i32, MVT::v16i8, Expand);
  setTruncStoreAction(MVT::v32i32, MVT::v32i8, Expand);

  setOperationAction(ISD::GlobalAddress, MVT::i32, Custom);
  setOperationAction(ISD::GlobalAddress, MVT::i64, Custom);
  setOperationAction(ISD::ConstantPool, MVT::v2i64, Expand);

  setOperationAction(ISD::SELECT, MVT::i1, Promote);
  setOperationAction(ISD::SELECT, MVT::i64, Custom);
  setOperationAction(ISD::SELECT, MVT::f64, Promote);
  AddPromotedToType(ISD::SELECT, MVT::f64, MVT::i64);

  setOperationAction(ISD::SELECT_CC, MVT::f32, Expand);
  setOperationAction(ISD::SELECT_CC, MVT::i32, Expand);
  setOperationAction(ISD::SELECT_CC, MVT::i64, Expand);
  setOperationAction(ISD::SELECT_CC, MVT::f64, Expand);
  setOperationAction(ISD::SELECT_CC, MVT::i1, Expand);

  setOperationAction(ISD::SETCC, MVT::i1, Promote);
  setOperationAction(ISD::SETCC, MVT::v2i1, Expand);
  setOperationAction(ISD::SETCC, MVT::v4i1, Expand);
  AddPromotedToType(ISD::SETCC, MVT::i1, MVT::i32);

  setOperationAction(ISD::TRUNCATE, MVT::v2i32, Expand);
  setOperationAction(ISD::FP_ROUND, MVT::v2f32, Expand);

  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i1, Custom);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i1, Custom);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i8, Custom);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i8, Custom);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i16, Custom);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i16, Custom);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::Other, Custom);

  setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
  setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::f32, Custom);
  setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::v4f32, Custom);
  setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::v2f16, Custom);

  setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom);

  setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
  setOperationAction(ISD::INTRINSIC_VOID, MVT::v2i16, Custom);
  setOperationAction(ISD::INTRINSIC_VOID, MVT::v2f16, Custom);

  setOperationAction(ISD::BRCOND, MVT::Other, Custom);
  setOperationAction(ISD::BR_CC, MVT::i1, Expand);
  setOperationAction(ISD::BR_CC, MVT::i32, Expand);
  setOperationAction(ISD::BR_CC, MVT::i64, Expand);
  setOperationAction(ISD::BR_CC, MVT::f32, Expand);
  setOperationAction(ISD::BR_CC, MVT::f64, Expand);

  setOperationAction(ISD::UADDO, MVT::i32, Legal);
  setOperationAction(ISD::USUBO, MVT::i32, Legal);

  setOperationAction(ISD::ADDCARRY, MVT::i32, Legal);
  setOperationAction(ISD::SUBCARRY, MVT::i32, Legal);

  // We only support LOAD/STORE and vector manipulation ops for vectors
  // with > 4 elements.
  for (MVT VT : {MVT::v8i32, MVT::v8f32, MVT::v16i32, MVT::v16f32,
        MVT::v2i64, MVT::v2f64}) {
    for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) {
      switch (Op) {
      case ISD::LOAD:
      case ISD::STORE:
      case ISD::BUILD_VECTOR:
      case ISD::BITCAST:
      case ISD::EXTRACT_VECTOR_ELT:
      case ISD::INSERT_VECTOR_ELT:
      case ISD::INSERT_SUBVECTOR:
      case ISD::EXTRACT_SUBVECTOR:
      case ISD::SCALAR_TO_VECTOR:
        break;
      case ISD::CONCAT_VECTORS:
        setOperationAction(Op, VT, Custom);
        break;
      default:
        setOperationAction(Op, VT, Expand);
        break;
      }
    }
  }

  // TODO: For dynamic 64-bit vector inserts/extracts, should emit a pseudo that
  // is expanded to avoid having two separate loops in case the index is a VGPR.

  // Most operations are naturally 32-bit vector operations. We only support
  // load and store of i64 vectors, so promote v2i64 vector operations to v4i32.
  for (MVT Vec64 : { MVT::v2i64, MVT::v2f64 }) {
    setOperationAction(ISD::BUILD_VECTOR, Vec64, Promote);
    AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v4i32);

    setOperationAction(ISD::EXTRACT_VECTOR_ELT, Vec64, Promote);
    AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v4i32);

    setOperationAction(ISD::INSERT_VECTOR_ELT, Vec64, Promote);
    AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v4i32);

    setOperationAction(ISD::SCALAR_TO_VECTOR, Vec64, Promote);
    AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v4i32);
  }

  setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v8i32, Expand);
  setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v8f32, Expand);
  setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v16i32, Expand);
  setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v16f32, Expand);

  // Avoid stack access for these.
  // TODO: Generalize to more vector types.
  setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2i16, Custom);
  setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2f16, Custom);
  setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i16, Custom);
  setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f16, Custom);

  // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling,
  // and output demarshalling
  setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i32, Custom);
  setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i64, Custom);

  // We can't return success/failure, only the old value,
  // let LLVM add the comparison
  setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, MVT::i32, Expand);
  setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, MVT::i64, Expand);

  if (getSubtarget()->hasFlatAddressSpace()) {
    setOperationAction(ISD::ADDRSPACECAST, MVT::i32, Custom);
    setOperationAction(ISD::ADDRSPACECAST, MVT::i64, Custom);
  }

  setOperationAction(ISD::BSWAP, MVT::i32, Legal);
  setOperationAction(ISD::BITREVERSE, MVT::i32, Legal);

  // On SI this is s_memtime and s_memrealtime on VI.
  setOperationAction(ISD::READCYCLECOUNTER, MVT::i64, Legal);
  setOperationAction(ISD::TRAP, MVT::Other, Custom);
  setOperationAction(ISD::DEBUGTRAP, MVT::Other, Custom);

  setOperationAction(ISD::FMINNUM, MVT::f64, Legal);
  setOperationAction(ISD::FMAXNUM, MVT::f64, Legal);

  if (Subtarget->getGeneration() >= SISubtarget::SEA_ISLANDS) {
    setOperationAction(ISD::FTRUNC, MVT::f64, Legal);
    setOperationAction(ISD::FCEIL, MVT::f64, Legal);
    setOperationAction(ISD::FRINT, MVT::f64, Legal);
  }

  setOperationAction(ISD::FFLOOR, MVT::f64, Legal);

  setOperationAction(ISD::FSIN, MVT::f32, Custom);
  setOperationAction(ISD::FCOS, MVT::f32, Custom);
  setOperationAction(ISD::FDIV, MVT::f32, Custom);
  setOperationAction(ISD::FDIV, MVT::f64, Custom);

  if (Subtarget->has16BitInsts()) {
    setOperationAction(ISD::Constant, MVT::i16, Legal);

    setOperationAction(ISD::SMIN, MVT::i16, Legal);
    setOperationAction(ISD::SMAX, MVT::i16, Legal);

    setOperationAction(ISD::UMIN, MVT::i16, Legal);
    setOperationAction(ISD::UMAX, MVT::i16, Legal);

    setOperationAction(ISD::SIGN_EXTEND, MVT::i16, Promote);
    AddPromotedToType(ISD::SIGN_EXTEND, MVT::i16, MVT::i32);

    setOperationAction(ISD::ROTR, MVT::i16, Promote);
    setOperationAction(ISD::ROTL, MVT::i16, Promote);

    setOperationAction(ISD::SDIV, MVT::i16, Promote);
    setOperationAction(ISD::UDIV, MVT::i16, Promote);
    setOperationAction(ISD::SREM, MVT::i16, Promote);
    setOperationAction(ISD::UREM, MVT::i16, Promote);

    setOperationAction(ISD::BSWAP, MVT::i16, Promote);
    setOperationAction(ISD::BITREVERSE, MVT::i16, Promote);

    setOperationAction(ISD::CTTZ, MVT::i16, Promote);
    setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i16, Promote);
    setOperationAction(ISD::CTLZ, MVT::i16, Promote);
    setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i16, Promote);

    setOperationAction(ISD::SELECT_CC, MVT::i16, Expand);

    setOperationAction(ISD::BR_CC, MVT::i16, Expand);

    setOperationAction(ISD::LOAD, MVT::i16, Custom);

    setTruncStoreAction(MVT::i64, MVT::i16, Expand);

    setOperationAction(ISD::FP16_TO_FP, MVT::i16, Promote);
    AddPromotedToType(ISD::FP16_TO_FP, MVT::i16, MVT::i32);
    setOperationAction(ISD::FP_TO_FP16, MVT::i16, Promote);
    AddPromotedToType(ISD::FP_TO_FP16, MVT::i16, MVT::i32);

    setOperationAction(ISD::FP_TO_SINT, MVT::i16, Promote);
    setOperationAction(ISD::FP_TO_UINT, MVT::i16, Promote);
    setOperationAction(ISD::SINT_TO_FP, MVT::i16, Promote);
    setOperationAction(ISD::UINT_TO_FP, MVT::i16, Promote);

    // F16 - Constant Actions.
    setOperationAction(ISD::ConstantFP, MVT::f16, Legal);

    // F16 - Load/Store Actions.
    setOperationAction(ISD::LOAD, MVT::f16, Promote);
    AddPromotedToType(ISD::LOAD, MVT::f16, MVT::i16);
    setOperationAction(ISD::STORE, MVT::f16, Promote);
    AddPromotedToType(ISD::STORE, MVT::f16, MVT::i16);

    // F16 - VOP1 Actions.
    setOperationAction(ISD::FP_ROUND, MVT::f16, Custom);
    setOperationAction(ISD::FCOS, MVT::f16, Promote);
    setOperationAction(ISD::FSIN, MVT::f16, Promote);
    setOperationAction(ISD::FP_TO_SINT, MVT::f16, Promote);
    setOperationAction(ISD::FP_TO_UINT, MVT::f16, Promote);
    setOperationAction(ISD::SINT_TO_FP, MVT::f16, Promote);
    setOperationAction(ISD::UINT_TO_FP, MVT::f16, Promote);
    setOperationAction(ISD::FROUND, MVT::f16, Custom);

    // F16 - VOP2 Actions.
    setOperationAction(ISD::BR_CC, MVT::f16, Expand);
    setOperationAction(ISD::SELECT_CC, MVT::f16, Expand);
    setOperationAction(ISD::FMAXNUM, MVT::f16, Legal);
    setOperationAction(ISD::FMINNUM, MVT::f16, Legal);
    setOperationAction(ISD::FDIV, MVT::f16, Custom);

    // F16 - VOP3 Actions.
    setOperationAction(ISD::FMA, MVT::f16, Legal);
    if (!Subtarget->hasFP16Denormals())
      setOperationAction(ISD::FMAD, MVT::f16, Legal);
  }

  if (Subtarget->hasVOP3PInsts()) {
    for (MVT VT : {MVT::v2i16, MVT::v2f16}) {
      for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) {
        switch (Op) {
        case ISD::LOAD:
        case ISD::STORE:
        case ISD::BUILD_VECTOR:
        case ISD::BITCAST:
        case ISD::EXTRACT_VECTOR_ELT:
        case ISD::INSERT_VECTOR_ELT:
        case ISD::INSERT_SUBVECTOR:
        case ISD::EXTRACT_SUBVECTOR:
        case ISD::SCALAR_TO_VECTOR:
          break;
        case ISD::CONCAT_VECTORS:
          setOperationAction(Op, VT, Custom);
          break;
        default:
          setOperationAction(Op, VT, Expand);
          break;
        }
      }
    }

    // XXX - Do these do anything? Vector constants turn into build_vector.
    setOperationAction(ISD::Constant, MVT::v2i16, Legal);
    setOperationAction(ISD::ConstantFP, MVT::v2f16, Legal);

    setOperationAction(ISD::STORE, MVT::v2i16, Promote);
    AddPromotedToType(ISD::STORE, MVT::v2i16, MVT::i32);
    setOperationAction(ISD::STORE, MVT::v2f16, Promote);
    AddPromotedToType(ISD::STORE, MVT::v2f16, MVT::i32);

    setOperationAction(ISD::LOAD, MVT::v2i16, Promote);
    AddPromotedToType(ISD::LOAD, MVT::v2i16, MVT::i32);
    setOperationAction(ISD::LOAD, MVT::v2f16, Promote);
    AddPromotedToType(ISD::LOAD, MVT::v2f16, MVT::i32);

    setOperationAction(ISD::AND, MVT::v2i16, Promote);
    AddPromotedToType(ISD::AND, MVT::v2i16, MVT::i32);
    setOperationAction(ISD::OR, MVT::v2i16, Promote);
    AddPromotedToType(ISD::OR, MVT::v2i16, MVT::i32);
    setOperationAction(ISD::XOR, MVT::v2i16, Promote);
    AddPromotedToType(ISD::XOR, MVT::v2i16, MVT::i32);
    setOperationAction(ISD::SELECT, MVT::v2i16, Promote);
    AddPromotedToType(ISD::SELECT, MVT::v2i16, MVT::i32);
    setOperationAction(ISD::SELECT, MVT::v2f16, Promote);
    AddPromotedToType(ISD::SELECT, MVT::v2f16, MVT::i32);

    setOperationAction(ISD::ADD, MVT::v2i16, Legal);
    setOperationAction(ISD::SUB, MVT::v2i16, Legal);
    setOperationAction(ISD::MUL, MVT::v2i16, Legal);
    setOperationAction(ISD::SHL, MVT::v2i16, Legal);
    setOperationAction(ISD::SRL, MVT::v2i16, Legal);
    setOperationAction(ISD::SRA, MVT::v2i16, Legal);
    setOperationAction(ISD::SMIN, MVT::v2i16, Legal);
    setOperationAction(ISD::UMIN, MVT::v2i16, Legal);
    setOperationAction(ISD::SMAX, MVT::v2i16, Legal);
    setOperationAction(ISD::UMAX, MVT::v2i16, Legal);

    setOperationAction(ISD::FADD, MVT::v2f16, Legal);
    setOperationAction(ISD::FNEG, MVT::v2f16, Legal);
    setOperationAction(ISD::FMUL, MVT::v2f16, Legal);
    setOperationAction(ISD::FMA, MVT::v2f16, Legal);
    setOperationAction(ISD::FMINNUM, MVT::v2f16, Legal);
    setOperationAction(ISD::FMAXNUM, MVT::v2f16, Legal);

    // This isn't really legal, but this avoids the legalizer unrolling it (and
    // allows matching fneg (fabs x) patterns)
    setOperationAction(ISD::FABS, MVT::v2f16, Legal);

    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i16, Custom);
    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f16, Custom);

    setOperationAction(ISD::ZERO_EXTEND, MVT::v2i32, Expand);
    setOperationAction(ISD::SIGN_EXTEND, MVT::v2i32, Expand);
    setOperationAction(ISD::FP_EXTEND, MVT::v2f32, Expand);
  } else {
    setOperationAction(ISD::SELECT, MVT::v2i16, Custom);
    setOperationAction(ISD::SELECT, MVT::v2f16, Custom);
  }

  for (MVT VT : { MVT::v4i16, MVT::v4f16, MVT::v2i8, MVT::v4i8, MVT::v8i8 }) {
    setOperationAction(ISD::SELECT, VT, Custom);
  }

  setTargetDAGCombine(ISD::ADD);
  setTargetDAGCombine(ISD::ADDCARRY);
  setTargetDAGCombine(ISD::SUB);
  setTargetDAGCombine(ISD::SUBCARRY);
  setTargetDAGCombine(ISD::FADD);
  setTargetDAGCombine(ISD::FSUB);
  setTargetDAGCombine(ISD::FMINNUM);
  setTargetDAGCombine(ISD::FMAXNUM);
  setTargetDAGCombine(ISD::SMIN);
  setTargetDAGCombine(ISD::SMAX);
  setTargetDAGCombine(ISD::UMIN);
  setTargetDAGCombine(ISD::UMAX);
  setTargetDAGCombine(ISD::SETCC);
  setTargetDAGCombine(ISD::AND);
  setTargetDAGCombine(ISD::OR);
  setTargetDAGCombine(ISD::XOR);
  setTargetDAGCombine(ISD::SINT_TO_FP);
  setTargetDAGCombine(ISD::UINT_TO_FP);
  setTargetDAGCombine(ISD::FCANONICALIZE);
  setTargetDAGCombine(ISD::SCALAR_TO_VECTOR);
  setTargetDAGCombine(ISD::ZERO_EXTEND);
  setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
  setTargetDAGCombine(ISD::BUILD_VECTOR);

  // All memory operations. Some folding on the pointer operand is done to help
  // matching the constant offsets in the addressing modes.
  setTargetDAGCombine(ISD::LOAD);
  setTargetDAGCombine(ISD::STORE);
  setTargetDAGCombine(ISD::ATOMIC_LOAD);
  setTargetDAGCombine(ISD::ATOMIC_STORE);
  setTargetDAGCombine(ISD::ATOMIC_CMP_SWAP);
  setTargetDAGCombine(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS);
  setTargetDAGCombine(ISD::ATOMIC_SWAP);
  setTargetDAGCombine(ISD::ATOMIC_LOAD_ADD);
  setTargetDAGCombine(ISD::ATOMIC_LOAD_SUB);
  setTargetDAGCombine(ISD::ATOMIC_LOAD_AND);
  setTargetDAGCombine(ISD::ATOMIC_LOAD_OR);
  setTargetDAGCombine(ISD::ATOMIC_LOAD_XOR);
  setTargetDAGCombine(ISD::ATOMIC_LOAD_NAND);
  setTargetDAGCombine(ISD::ATOMIC_LOAD_MIN);
  setTargetDAGCombine(ISD::ATOMIC_LOAD_MAX);
  setTargetDAGCombine(ISD::ATOMIC_LOAD_UMIN);
  setTargetDAGCombine(ISD::ATOMIC_LOAD_UMAX);

  setSchedulingPreference(Sched::RegPressure);
}

const SISubtarget *SITargetLowering::getSubtarget() const {
  return static_cast<const SISubtarget *>(Subtarget);
}

//===----------------------------------------------------------------------===//
// TargetLowering queries
//===----------------------------------------------------------------------===//

bool SITargetLowering::isShuffleMaskLegal(ArrayRef<int>, EVT) const {
  // SI has some legal vector types, but no legal vector operations. Say no
  // shuffles are legal in order to prefer scalarizing some vector operations.
  return false;
}

bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
                                          const CallInst &CI,
                                          unsigned IntrID) const {
  switch (IntrID) {
  case Intrinsic::amdgcn_atomic_inc:
  case Intrinsic::amdgcn_atomic_dec: {
    Info.opc = ISD::INTRINSIC_W_CHAIN;
    Info.memVT = MVT::getVT(CI.getType());
    Info.ptrVal = CI.getOperand(0);
    Info.align = 0;

    const ConstantInt *Vol = dyn_cast<ConstantInt>(CI.getOperand(4));
    Info.vol = !Vol || !Vol->isZero();
    Info.readMem = true;
    Info.writeMem = true;
    return true;
  }
  default:
    return false;
  }
}

bool SITargetLowering::getAddrModeArguments(IntrinsicInst *II,
                                            SmallVectorImpl<Value*> &Ops,
                                            Type *&AccessTy) const {
  switch (II->getIntrinsicID()) {
  case Intrinsic::amdgcn_atomic_inc:
  case Intrinsic::amdgcn_atomic_dec: {
    Value *Ptr = II->getArgOperand(0);
    AccessTy = II->getType();
    Ops.push_back(Ptr);
    return true;
  }
  default:
    return false;
  }
}

bool SITargetLowering::isLegalFlatAddressingMode(const AddrMode &AM) const {
  if (!Subtarget->hasFlatInstOffsets()) {
    // Flat instructions do not have offsets, and only have the register
    // address.
    return AM.BaseOffs == 0 && AM.Scale == 0;
  }

  // GFX9 added a 13-bit signed offset. When using regular flat instructions,
  // the sign bit is ignored and is treated as a 12-bit unsigned offset.

  // Just r + i
  return isUInt<12>(AM.BaseOffs) && AM.Scale == 0;
}

bool SITargetLowering::isLegalGlobalAddressingMode(const AddrMode &AM) const {
  if (Subtarget->hasFlatGlobalInsts())
    return isInt<13>(AM.BaseOffs) && AM.Scale == 0;

  if (!Subtarget->hasAddr64() || Subtarget->useFlatForGlobal()) {
      // Assume the we will use FLAT for all global memory accesses
      // on VI.
      // FIXME: This assumption is currently wrong.  On VI we still use
      // MUBUF instructions for the r + i addressing mode.  As currently
      // implemented, the MUBUF instructions only work on buffer < 4GB.
      // It may be possible to support > 4GB buffers with MUBUF instructions,
      // by setting the stride value in the resource descriptor which would
      // increase the size limit to (stride * 4GB).  However, this is risky,
      // because it has never been validated.
    return isLegalFlatAddressingMode(AM);
  }

  return isLegalMUBUFAddressingMode(AM);
}

bool SITargetLowering::isLegalMUBUFAddressingMode(const AddrMode &AM) const {
  // MUBUF / MTBUF instructions have a 12-bit unsigned byte offset, and
  // additionally can do r + r + i with addr64. 32-bit has more addressing
  // mode options. Depending on the resource constant, it can also do
  // (i64 r0) + (i32 r1) * (i14 i).
  //
  // Private arrays end up using a scratch buffer most of the time, so also
  // assume those use MUBUF instructions. Scratch loads / stores are currently
  // implemented as mubuf instructions with offen bit set, so slightly
  // different than the normal addr64.
  if (!isUInt<12>(AM.BaseOffs))
    return false;

  // FIXME: Since we can split immediate into soffset and immediate offset,
  // would it make sense to allow any immediate?

  switch (AM.Scale) {
  case 0: // r + i or just i, depending on HasBaseReg.
    return true;
  case 1:
    return true; // We have r + r or r + i.
  case 2:
    if (AM.HasBaseReg) {
      // Reject 2 * r + r.
      return false;
    }

    // Allow 2 * r as r + r
    // Or  2 * r + i is allowed as r + r + i.
    return true;
  default: // Don't allow n * r
    return false;
  }
}

bool SITargetLowering::isLegalAddressingMode(const DataLayout &DL,
                                             const AddrMode &AM, Type *Ty,
                                             unsigned AS, Instruction *I) const {
  // No global is ever allowed as a base.
  if (AM.BaseGV)
    return false;

  if (AS == AMDGPUASI.GLOBAL_ADDRESS)
    return isLegalGlobalAddressingMode(AM);

  if (AS == AMDGPUASI.CONSTANT_ADDRESS) {
    // If the offset isn't a multiple of 4, it probably isn't going to be
    // correctly aligned.
    // FIXME: Can we get the real alignment here?
    if (AM.BaseOffs % 4 != 0)
      return isLegalMUBUFAddressingMode(AM);

    // There are no SMRD extloads, so if we have to do a small type access we
    // will use a MUBUF load.
    // FIXME?: We also need to do this if unaligned, but we don't know the
    // alignment here.
    if (DL.getTypeStoreSize(Ty) < 4)
      return isLegalGlobalAddressingMode(AM);

    if (Subtarget->getGeneration() == SISubtarget::SOUTHERN_ISLANDS) {
      // SMRD instructions have an 8-bit, dword offset on SI.
      if (!isUInt<8>(AM.BaseOffs / 4))
        return false;
    } else if (Subtarget->getGeneration() == SISubtarget::SEA_ISLANDS) {
      // On CI+, this can also be a 32-bit literal constant offset. If it fits
      // in 8-bits, it can use a smaller encoding.
      if (!isUInt<32>(AM.BaseOffs / 4))
        return false;
    } else if (Subtarget->getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) {
      // On VI, these use the SMEM format and the offset is 20-bit in bytes.
      if (!isUInt<20>(AM.BaseOffs))
        return false;
    } else
      llvm_unreachable("unhandled generation");

    if (AM.Scale == 0) // r + i or just i, depending on HasBaseReg.
      return true;

    if (AM.Scale == 1 && AM.HasBaseReg)
      return true;

    return false;

  } else if (AS == AMDGPUASI.PRIVATE_ADDRESS) {
    return isLegalMUBUFAddressingMode(AM);
  } else if (AS == AMDGPUASI.LOCAL_ADDRESS ||
             AS == AMDGPUASI.REGION_ADDRESS) {
    // Basic, single offset DS instructions allow a 16-bit unsigned immediate
    // field.
    // XXX - If doing a 4-byte aligned 8-byte type access, we effectively have
    // an 8-bit dword offset but we don't know the alignment here.
    if (!isUInt<16>(AM.BaseOffs))
      return false;

    if (AM.Scale == 0) // r + i or just i, depending on HasBaseReg.
      return true;

    if (AM.Scale == 1 && AM.HasBaseReg)
      return true;

    return false;
  } else if (AS == AMDGPUASI.FLAT_ADDRESS ||
             AS == AMDGPUASI.UNKNOWN_ADDRESS_SPACE) {
    // For an unknown address space, this usually means that this is for some
    // reason being used for pure arithmetic, and not based on some addressing
    // computation. We don't have instructions that compute pointers with any
    // addressing modes, so treat them as having no offset like flat
    // instructions.
    return isLegalFlatAddressingMode(AM);
  } else {
    llvm_unreachable("unhandled address space");
  }
}

bool SITargetLowering::canMergeStoresTo(unsigned AS, EVT MemVT,
                                        const SelectionDAG &DAG) const {
  if (AS == AMDGPUASI.GLOBAL_ADDRESS || AS == AMDGPUASI.FLAT_ADDRESS) {
    return (MemVT.getSizeInBits() <= 4 * 32);
  } else if (AS == AMDGPUASI.PRIVATE_ADDRESS) {
    unsigned MaxPrivateBits = 8 * getSubtarget()->getMaxPrivateElementSize();
    return (MemVT.getSizeInBits() <= MaxPrivateBits);
  } else if (AS == AMDGPUASI.LOCAL_ADDRESS) {
    return (MemVT.getSizeInBits() <= 2 * 32);
  }
  return true;
}

bool SITargetLowering::allowsMisalignedMemoryAccesses(EVT VT,
                                                      unsigned AddrSpace,
                                                      unsigned Align,
                                                      bool *IsFast) const {
  if (IsFast)
    *IsFast = false;

  // TODO: I think v3i32 should allow unaligned accesses on CI with DS_READ_B96,
  // which isn't a simple VT.
  // Until MVT is extended to handle this, simply check for the size and
  // rely on the condition below: allow accesses if the size is a multiple of 4.
  if (VT == MVT::Other || (VT != MVT::Other && VT.getSizeInBits() > 1024 &&
                           VT.getStoreSize() > 16)) {
    return false;
  }

  if (AddrSpace == AMDGPUASI.LOCAL_ADDRESS ||
      AddrSpace == AMDGPUASI.REGION_ADDRESS) {
    // ds_read/write_b64 require 8-byte alignment, but we can do a 4 byte
    // aligned, 8 byte access in a single operation using ds_read2/write2_b32
    // with adjacent offsets.
    bool AlignedBy4 = (Align % 4 == 0);
    if (IsFast)
      *IsFast = AlignedBy4;

    return AlignedBy4;
  }

  // FIXME: We have to be conservative here and assume that flat operations
  // will access scratch.  If we had access to the IR function, then we
  // could determine if any private memory was used in the function.
  if (!Subtarget->hasUnalignedScratchAccess() &&
      (AddrSpace == AMDGPUASI.PRIVATE_ADDRESS ||
       AddrSpace == AMDGPUASI.FLAT_ADDRESS)) {
    return false;
  }

  if (Subtarget->hasUnalignedBufferAccess()) {
    // If we have an uniform constant load, it still requires using a slow
    // buffer instruction if unaligned.
    if (IsFast) {
      *IsFast = (AddrSpace == AMDGPUASI.CONSTANT_ADDRESS) ?
        (Align % 4 == 0) : true;
    }

    return true;
  }

  // Smaller than dword value must be aligned.
  if (VT.bitsLT(MVT::i32))
    return false;

  // 8.1.6 - For Dword or larger reads or writes, the two LSBs of the
  // byte-address are ignored, thus forcing Dword alignment.
  // This applies to private, global, and constant memory.
  if (IsFast)
    *IsFast = true;

  return VT.bitsGT(MVT::i32) && Align % 4 == 0;
}

EVT SITargetLowering::getOptimalMemOpType(uint64_t Size, unsigned DstAlign,
                                          unsigned SrcAlign, bool IsMemset,
                                          bool ZeroMemset,
                                          bool MemcpyStrSrc,
                                          MachineFunction &MF) const {
  // FIXME: Should account for address space here.

  // The default fallback uses the private pointer size as a guess for a type to
  // use. Make sure we switch these to 64-bit accesses.

  if (Size >= 16 && DstAlign >= 4) // XXX: Should only do for global
    return MVT::v4i32;

  if (Size >= 8 && DstAlign >= 4)
    return MVT::v2i32;

  // Use the default.
  return MVT::Other;
}

static bool isFlatGlobalAddrSpace(unsigned AS, AMDGPUAS AMDGPUASI) {
  return AS == AMDGPUASI.GLOBAL_ADDRESS ||
         AS == AMDGPUASI.FLAT_ADDRESS ||
         AS == AMDGPUASI.CONSTANT_ADDRESS;
}

bool SITargetLowering::isNoopAddrSpaceCast(unsigned SrcAS,
                                           unsigned DestAS) const {
  return isFlatGlobalAddrSpace(SrcAS, AMDGPUASI) &&
         isFlatGlobalAddrSpace(DestAS, AMDGPUASI);
}

bool SITargetLowering::isMemOpHasNoClobberedMemOperand(const SDNode *N) const {
  const MemSDNode *MemNode = cast<MemSDNode>(N);
  const Value *Ptr = MemNode->getMemOperand()->getValue();
  const Instruction *I = dyn_cast<Instruction>(Ptr);
  return I && I->getMetadata("amdgpu.noclobber");
}

bool SITargetLowering::isCheapAddrSpaceCast(unsigned SrcAS,
                                            unsigned DestAS) const {
  // Flat -> private/local is a simple truncate.
  // Flat -> global is no-op
  if (SrcAS == AMDGPUASI.FLAT_ADDRESS)
    return true;

  return isNoopAddrSpaceCast(SrcAS, DestAS);
}

bool SITargetLowering::isMemOpUniform(const SDNode *N) const {
  const MemSDNode *MemNode = cast<MemSDNode>(N);

  return AMDGPU::isUniformMMO(MemNode->getMemOperand());
}

858
859
TargetLoweringBase::LegalizeTypeAction
860
152k
SITargetLowering::getPreferredVectorAction(EVT VT) const {
861
152k
  if (
VT.getVectorNumElements() != 1 && 152k
VT.getScalarType().bitsLE(MVT::i16)124k
)
862
79.5k
    return TypeSplitVector;
863
72.5k
864
72.5k
  return TargetLoweringBase::getPreferredVectorAction(VT);
865
72.5k
}
866
867
bool SITargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm,
868
32
                                                         Type *Ty) const {
869
32
  // FIXME: Could be smarter if called for vector constants.
870
32
  return true;
871
32
}
872
873
234k
bool SITargetLowering::isTypeDesirableForOp(unsigned Op, EVT VT) const {
874
234k
  if (
Subtarget->has16BitInsts() && 234k
VT == MVT::i16113k
) {
875
11.9k
    switch (Op) {
876
4.04k
    case ISD::LOAD:
877
4.04k
    case ISD::STORE:
878
4.04k
879
4.04k
    // These operations are done with 32-bit instructions anyway.
880
4.04k
    case ISD::AND:
881
4.04k
    case ISD::OR:
882
4.04k
    case ISD::XOR:
883
4.04k
    case ISD::SELECT:
884
4.04k
      // TODO: Extensions?
885
4.04k
      return true;
886
7.87k
    default:
887
7.87k
      return false;
888
222k
    }
889
222k
  }
890
222k
891
222k
  // SimplifySetCC uses this function to determine whether or not it should
892
222k
  // create setcc with i1 operands.  We don't have instructions for i1 setcc.
893
222k
  
if (222k
VT == MVT::i1 && 222k
Op == ISD::SETCC421
)
894
23
    return false;
895
222k
896
222k
  return TargetLowering::isTypeDesirableForOp(Op, VT);
897
222k
}
898
899
SDValue SITargetLowering::lowerKernArgParameterPtr(SelectionDAG &DAG,
900
                                                   const SDLoc &SL,
901
                                                   SDValue Chain,
902
32.2k
                                                   uint64_t Offset) const {
903
32.2k
  const DataLayout &DL = DAG.getDataLayout();
904
32.2k
  MachineFunction &MF = DAG.getMachineFunction();
905
32.2k
  const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
906
32.2k
907
32.2k
  const ArgDescriptor *InputPtrReg;
908
32.2k
  const TargetRegisterClass *RC;
909
32.2k
910
32.2k
  std::tie(InputPtrReg, RC)
911
32.2k
    = Info->getPreloadedValue(AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
912
32.2k
913
32.2k
  MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo();
914
32.2k
  MVT PtrVT = getPointerTy(DL, AMDGPUASI.CONSTANT_ADDRESS);
915
32.2k
  SDValue BasePtr = DAG.getCopyFromReg(Chain, SL,
916
32.2k
    MRI.getLiveInVirtReg(InputPtrReg->getRegister()), PtrVT);
917
32.2k
918
32.2k
  return DAG.getNode(ISD::ADD, SL, PtrVT, BasePtr,
919
32.2k
                     DAG.getConstant(Offset, SL, PtrVT));
920
32.2k
}
921
922
SDValue SITargetLowering::getImplicitArgPtr(SelectionDAG &DAG,
923
28
                                            const SDLoc &SL) const {
924
28
  auto MFI = DAG.getMachineFunction().getInfo<SIMachineFunctionInfo>();
925
28
  uint64_t Offset = getImplicitParameterOffset(MFI, FIRST_IMPLICIT);
926
28
  return lowerKernArgParameterPtr(DAG, SL, DAG.getEntryNode(), Offset);
927
28
}
928
929
SDValue SITargetLowering::convertArgType(SelectionDAG &DAG, EVT VT, EVT MemVT,
930
                                         const SDLoc &SL, SDValue Val,
931
                                         bool Signed,
932
32.2k
                                         const ISD::InputArg *Arg) const {
933
32.2k
  if (
Arg && 32.2k
(Arg->Flags.isSExt() || 32.1k
Arg->Flags.isZExt()32.1k
) &&
934
32.2k
      
VT.bitsLT(MemVT)83
) {
935
46
    unsigned Opc = Arg->Flags.isZExt() ? 
ISD::AssertZext39
:
ISD::AssertSext7
;
936
46
    Val = DAG.getNode(Opc, SL, MemVT, Val, DAG.getValueType(VT));
937
46
  }
938
32.2k
939
32.2k
  if (MemVT.isFloatingPoint())
940
2.42k
    Val = getFPExtOrFPTrunc(DAG, Val, SL, VT);
941
29.8k
  else 
if (29.8k
Signed29.8k
)
942
16
    Val = DAG.getSExtOrTrunc(Val, SL, VT);
943
29.8k
  else
944
29.7k
    Val = DAG.getZExtOrTrunc(Val, SL, VT);
945
32.2k
946
32.2k
  return Val;
947
32.2k
}
948
949
SDValue SITargetLowering::lowerKernargMemParameter(
950
  SelectionDAG &DAG, EVT VT, EVT MemVT,
951
  const SDLoc &SL, SDValue Chain,
952
  uint64_t Offset, bool Signed,
953
32.2k
  const ISD::InputArg *Arg) const {
954
32.2k
  const DataLayout &DL = DAG.getDataLayout();
955
32.2k
  Type *Ty = MemVT.getTypeForEVT(*DAG.getContext());
956
32.2k
  PointerType *PtrTy = PointerType::get(Ty, AMDGPUASI.CONSTANT_ADDRESS);
957
32.2k
  MachinePointerInfo PtrInfo(UndefValue::get(PtrTy));
958
32.2k
959
32.2k
  unsigned Align = DL.getABITypeAlignment(Ty);
960
32.2k
961
32.2k
  SDValue Ptr = lowerKernArgParameterPtr(DAG, SL, Chain, Offset);
962
32.2k
  SDValue Load = DAG.getLoad(MemVT, SL, Chain, Ptr, PtrInfo, Align,
963
32.2k
                             MachineMemOperand::MONonTemporal |
964
32.2k
                             MachineMemOperand::MODereferenceable |
965
32.2k
                             MachineMemOperand::MOInvariant);
966
32.2k
967
32.2k
  SDValue Val = convertArgType(DAG, VT, MemVT, SL, Load, Signed, Arg);
968
32.2k
  return DAG.getMergeValues({ Val, Load.getValue(1) }, SL);
969
32.2k
}
970
971
SDValue SITargetLowering::lowerStackParameter(SelectionDAG &DAG, CCValAssign &VA,
972
                                              const SDLoc &SL, SDValue Chain,
973
184
                                              const ISD::InputArg &Arg) const {
974
184
  MachineFunction &MF = DAG.getMachineFunction();
975
184
  MachineFrameInfo &MFI = MF.getFrameInfo();
976
184
977
184
  if (
Arg.Flags.isByVal()184
) {
978
33
    unsigned Size = Arg.Flags.getByValSize();
979
33
    int FrameIdx = MFI.CreateFixedObject(Size, VA.getLocMemOffset(), false);
980
33
    return DAG.getFrameIndex(FrameIdx, MVT::i32);
981
33
  }
982
151
983
151
  unsigned ArgOffset = VA.getLocMemOffset();
984
151
  unsigned ArgSize = VA.getValVT().getStoreSize();
985
151
986
151
  int FI = MFI.CreateFixedObject(ArgSize, ArgOffset, true);
987
151
988
151
  // Create load nodes to retrieve arguments from the stack.
989
151
  SDValue FIN = DAG.getFrameIndex(FI, MVT::i32);
990
151
  SDValue ArgValue;
991
151
992
151
  // For NON_EXTLOAD, generic code in getLoad assert(ValVT == MemVT)
993
151
  ISD::LoadExtType ExtType = ISD::NON_EXTLOAD;
994
151
  MVT MemVT = VA.getValVT();
995
151
996
151
  switch (VA.getLocInfo()) {
997
148
  default:
998
148
    break;
999
0
  case CCValAssign::BCvt:
1000
0
    MemVT = VA.getLocVT();
1001
0
    break;
1002
0
  case CCValAssign::SExt:
1003
0
    ExtType = ISD::SEXTLOAD;
1004
0
    break;
1005
0
  case CCValAssign::ZExt:
1006
0
    ExtType = ISD::ZEXTLOAD;
1007
0
    break;
1008
3
  case CCValAssign::AExt:
1009
3
    ExtType = ISD::EXTLOAD;
1010
3
    break;
1011
151
  }
1012
151
1013
151
  ArgValue = DAG.getExtLoad(
1014
151
    ExtType, SL, VA.getLocVT(), Chain, FIN,
1015
151
    MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI),
1016
151
    MemVT);
1017
151
  return ArgValue;
1018
151
}
1019
1020
SDValue SITargetLowering::getPreloadedValue(SelectionDAG &DAG,
1021
  const SIMachineFunctionInfo &MFI,
1022
  EVT VT,
1023
169
  AMDGPUFunctionArgInfo::PreloadedValue PVID) const {
1024
169
  const ArgDescriptor *Reg;
1025
169
  const TargetRegisterClass *RC;
1026
169
1027
169
  std::tie(Reg, RC) = MFI.getPreloadedValue(PVID);
1028
169
  return CreateLiveInRegister(DAG, RC, Reg->getRegister(), VT);
1029
169
}
1030
1031
static void processShaderInputArgs(SmallVectorImpl<ISD::InputArg> &Splits,
1032
                                   CallingConv::ID CallConv,
1033
                                   ArrayRef<ISD::InputArg> Ins,
1034
                                   BitVector &Skipped,
1035
                                   FunctionType *FType,
1036
547
                                   SIMachineFunctionInfo *Info) {
1037
2.98k
  for (unsigned I = 0, E = Ins.size(), PSInputNum = 0; 
I != E2.98k
;
++I2.43k
) {
1038
2.43k
    const ISD::InputArg &Arg = Ins[I];
1039
2.43k
1040
2.43k
    // First check if it's a PS input addr.
1041
2.43k
    if (
CallConv == CallingConv::AMDGPU_PS && 2.43k
!Arg.Flags.isInReg()2.05k
&&
1042
2.43k
        
!Arg.Flags.isByVal()1.55k
&&
PSInputNum <= 151.44k
) {
1043
1.44k
1044
1.44k
      if (
!Arg.Used && 1.44k
!Info->isPSInputAllocated(PSInputNum)895
) {
1045
852
        // We can safely skip PS inputs.
1046
852
        Skipped.set(I);
1047
852
        ++PSInputNum;
1048
852
        continue;
1049
852
      }
1050
594
1051
594
      Info->markPSInputAllocated(PSInputNum);
1052
594
      if (Arg.Used)
1053
551
        Info->markPSInputEnabled(PSInputNum);
1054
1.44k
1055
1.44k
      ++PSInputNum;
1056
1.44k
    }
1057
2.43k
1058
2.43k
    // Second split vertices into their elements.
1059
1.58k
    
if (1.58k
Arg.VT.isVector()1.58k
) {
1060
447
      ISD::InputArg NewArg = Arg;
1061
447
      NewArg.Flags.setSplit();
1062
447
      NewArg.VT = Arg.VT.getVectorElementType();
1063
447
1064
447
      // We REALLY want the ORIGINAL number of vertex elements here, e.g. a
1065
447
      // three or five element vertex only needs three or five registers,
1066
447
      // NOT four or eight.
1067
447
      Type *ParamType = FType->getParamType(Arg.getOrigArgIndex());
1068
447
      unsigned NumElements = ParamType->getVectorNumElements();
1069
447
1070
2.31k
      for (unsigned J = 0; 
J != NumElements2.31k
;
++J1.87k
) {
1071
1.87k
        Splits.push_back(NewArg);
1072
1.87k
        NewArg.PartOffset += NewArg.VT.getStoreSize();
1073
1.87k
      }
1074
1.58k
    } else {
1075
1.13k
      Splits.push_back(Arg);
1076
1.13k
    }
1077
2.43k
  }
1078
547
}
1079
1080
// Allocate special inputs passed in VGPRs.
1081
static void allocateSpecialEntryInputVGPRs(CCState &CCInfo,
1082
                                           MachineFunction &MF,
1083
                                           const SIRegisterInfo &TRI,
1084
14.1k
                                           SIMachineFunctionInfo &Info) {
1085
14.1k
  if (
Info.hasWorkItemIDX()14.1k
) {
1086
13.6k
    unsigned Reg = AMDGPU::VGPR0;
1087
13.6k
    MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass);
1088
13.6k
1089
13.6k
    CCInfo.AllocateReg(Reg);
1090
13.6k
    Info.setWorkItemIDX(ArgDescriptor::createRegister(Reg));
1091
13.6k
  }
1092
14.1k
1093
14.1k
  if (
Info.hasWorkItemIDY()14.1k
) {
1094
105
    unsigned Reg = AMDGPU::VGPR1;
1095
105
    MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass);
1096
105
1097
105
    CCInfo.AllocateReg(Reg);
1098
105
    Info.setWorkItemIDY(ArgDescriptor::createRegister(Reg));
1099
105
  }
1100
14.1k
1101
14.1k
  if (
Info.hasWorkItemIDZ()14.1k
) {
1102
58
    unsigned Reg = AMDGPU::VGPR2;
1103
58
    MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass);
1104
58
1105
58
    CCInfo.AllocateReg(Reg);
1106
58
    Info.setWorkItemIDZ(ArgDescriptor::createRegister(Reg));
1107
58
  }
1108
14.1k
}
1109
1110
// Try to allocate a VGPR at the end of the argument list, or if no argument
1111
// VGPRs are left allocating a stack slot.
1112
29
static ArgDescriptor allocateVGPR32Input(CCState &CCInfo) {
1113
29
  ArrayRef<MCPhysReg> ArgVGPRs
1114
29
    = makeArrayRef(AMDGPU::VGPR_32RegClass.begin(), 32);
1115
29
  unsigned RegIdx = CCInfo.getFirstUnallocated(ArgVGPRs);
1116
29
  if (
RegIdx == ArgVGPRs.size()29
) {
1117
8
    // Spill to stack required.
1118
8
    int64_t Offset = CCInfo.AllocateStack(4, 4);
1119
8
1120
8
    return ArgDescriptor::createStack(Offset);
1121
8
  }
1122
21
1123
21
  unsigned Reg = ArgVGPRs[RegIdx];
1124
21
  Reg = CCInfo.AllocateReg(Reg);
1125
21
  assert(Reg != AMDGPU::NoRegister);
1126
21
1127
21
  MachineFunction &MF = CCInfo.getMachineFunction();
1128
21
  MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass);
1129
21
  return ArgDescriptor::createRegister(Reg);
1130
21
}
1131
1132
static ArgDescriptor allocateSGPR32InputImpl(CCState &CCInfo,
1133
                                             const TargetRegisterClass *RC,
1134
115
                                             unsigned NumArgRegs) {
1135
115
  ArrayRef<MCPhysReg> ArgSGPRs = makeArrayRef(RC->begin(), 32);
1136
115
  unsigned RegIdx = CCInfo.getFirstUnallocated(ArgSGPRs);
1137
115
  if (RegIdx == ArgSGPRs.size())
1138
0
    report_fatal_error("ran out of SGPRs for arguments");
1139
115
1140
115
  unsigned Reg = ArgSGPRs[RegIdx];
1141
115
  Reg = CCInfo.AllocateReg(Reg);
1142
115
  assert(Reg != AMDGPU::NoRegister);
1143
115
1144
115
  MachineFunction &MF = CCInfo.getMachineFunction();
1145
115
  MF.addLiveIn(Reg, RC);
1146
115
  return ArgDescriptor::createRegister(Reg);
1147
115
}
1148
1149
62
static ArgDescriptor allocateSGPR32Input(CCState &CCInfo) {
1150
62
  return allocateSGPR32InputImpl(CCInfo, &AMDGPU::SGPR_32RegClass, 32);
1151
62
}
1152
1153
53
static ArgDescriptor allocateSGPR64Input(CCState &CCInfo) {
1154
53
  return allocateSGPR32InputImpl(CCInfo, &AMDGPU::SGPR_64RegClass, 16);
1155
53
}
1156
1157
static void allocateSpecialInputVGPRs(CCState &CCInfo,
1158
                                      MachineFunction &MF,
1159
                                      const SIRegisterInfo &TRI,
1160
864
                                      SIMachineFunctionInfo &Info) {
1161
864
  if (Info.hasWorkItemIDX())
1162
13
    Info.setWorkItemIDX(allocateVGPR32Input(CCInfo));
1163
864
1164
864
  if (Info.hasWorkItemIDY())
1165
8
    Info.setWorkItemIDY(allocateVGPR32Input(CCInfo));
1166
864
1167
864
  if (Info.hasWorkItemIDZ())
1168
8
    Info.setWorkItemIDZ(allocateVGPR32Input(CCInfo));
1169
864
}
1170
1171
static void allocateSpecialInputSGPRs(CCState &CCInfo,
1172
                                      MachineFunction &MF,
1173
                                      const SIRegisterInfo &TRI,
1174
864
                                      SIMachineFunctionInfo &Info) {
1175
864
  auto &ArgInfo = Info.getArgInfo();
1176
864
1177
864
  // TODO: Unify handling with private memory pointers.
1178
864
1179
864
  if (Info.hasDispatchPtr())
1180
10
    ArgInfo.DispatchPtr = allocateSGPR64Input(CCInfo);
1181
864
1182
864
  if (Info.hasQueuePtr())
1183
11
    ArgInfo.QueuePtr = allocateSGPR64Input(CCInfo);
1184
864
1185
864
  if (Info.hasKernargSegmentPtr())
1186
13
    ArgInfo.KernargSegmentPtr = allocateSGPR64Input(CCInfo);
1187
864
1188
864
  if (Info.hasDispatchID())
1189
10
    ArgInfo.DispatchID = allocateSGPR64Input(CCInfo);
1190
864
1191
864
  // flat_scratch_init is not applicable for non-kernel functions.
1192
864
1193
864
  if (Info.hasWorkGroupIDX())
1194
22
    ArgInfo.WorkGroupIDX = allocateSGPR32Input(CCInfo);
1195
864
1196
864
  if (Info.hasWorkGroupIDY())
1197
20
    ArgInfo.WorkGroupIDY = allocateSGPR32Input(CCInfo);
1198
864
1199
864
  if (Info.hasWorkGroupIDZ())
1200
20
    ArgInfo.WorkGroupIDZ = allocateSGPR32Input(CCInfo);
1201
864
1202
864
  if (Info.hasImplicitArgPtr())
1203
9
    ArgInfo.ImplicitArgPtr = allocateSGPR64Input(CCInfo);
1204
864
}
1205
1206
// Allocate special inputs passed in user SGPRs.
1207
static void allocateHSAUserSGPRs(CCState &CCInfo,
1208
                                 MachineFunction &MF,
1209
                                 const SIRegisterInfo &TRI,
1210
14.1k
                                 SIMachineFunctionInfo &Info) {
1211
14.1k
  if (
Info.hasImplicitBufferPtr()14.1k
) {
1212
2
    unsigned ImplicitBufferPtrReg = Info.addImplicitBufferPtr(TRI);
1213
2
    MF.addLiveIn(ImplicitBufferPtrReg, &AMDGPU::SGPR_64RegClass);
1214
2
    CCInfo.AllocateReg(ImplicitBufferPtrReg);
1215
2
  }
1216
14.1k
1217
14.1k
  // FIXME: How should these inputs interact with inreg / custom SGPR inputs?
1218
14.1k
  if (
Info.hasPrivateSegmentBuffer()14.1k
) {
1219
1.74k
    unsigned PrivateSegmentBufferReg = Info.addPrivateSegmentBuffer(TRI);
1220
1.74k
    MF.addLiveIn(PrivateSegmentBufferReg, &AMDGPU::SGPR_128RegClass);
1221
1.74k
    CCInfo.AllocateReg(PrivateSegmentBufferReg);
1222
1.74k
  }
1223
14.1k
1224
14.1k
  if (
Info.hasDispatchPtr()14.1k
) {
1225
25
    unsigned DispatchPtrReg = Info.addDispatchPtr(TRI);
1226
25
    MF.addLiveIn(DispatchPtrReg, &AMDGPU::SGPR_64RegClass);
1227
25
    CCInfo.AllocateReg(DispatchPtrReg);
1228
25
  }
1229
14.1k
1230
14.1k
  if (
Info.hasQueuePtr()14.1k
) {
1231
57
    unsigned QueuePtrReg = Info.addQueuePtr(TRI);
1232
57
    MF.addLiveIn(QueuePtrReg, &AMDGPU::SGPR_64RegClass);
1233
57
    CCInfo.AllocateReg(QueuePtrReg);
1234
57
  }
1235
14.1k
1236
14.1k
  if (
Info.hasKernargSegmentPtr()14.1k
) {
1237
12.7k
    unsigned InputPtrReg = Info.addKernargSegmentPtr(TRI);
1238
12.7k
    MF.addLiveIn(InputPtrReg, &AMDGPU::SGPR_64RegClass);
1239
12.7k
    CCInfo.AllocateReg(InputPtrReg);
1240
12.7k
  }
1241
14.1k
1242
14.1k
  if (
Info.hasDispatchID()14.1k
) {
1243
5
    unsigned DispatchIDReg = Info.addDispatchID(TRI);
1244
5
    MF.addLiveIn(DispatchIDReg, &AMDGPU::SGPR_64RegClass);
1245
5
    CCInfo.AllocateReg(DispatchIDReg);
1246
5
  }
1247
14.1k
1248
14.1k
  if (Info.hasFlatScratchInit()) {
1249
332
    unsigned FlatScratchInitReg = Info.addFlatScratchInit(TRI);
1250
332
    MF.addLiveIn(FlatScratchInitReg, &AMDGPU::SGPR_64RegClass);
1251
332
    CCInfo.AllocateReg(FlatScratchInitReg);
1252
332
  }
1253
14.1k
1254
14.1k
  // TODO: Add GridWorkGroupCount user SGPRs when used. For now with HSA we read
1255
14.1k
  // these from the dispatch pointer.
1256
14.1k
}
1257
1258
// Allocate special input registers that are initialized per-wave.
1259
static void allocateSystemSGPRs(CCState &CCInfo,
1260
                                MachineFunction &MF,
1261
                                SIMachineFunctionInfo &Info,
1262
                                CallingConv::ID CallConv,
1263
14.1k
                                bool IsShader) {
1264
14.1k
  if (Info.hasWorkGroupIDX()) {
1265
13.6k
    unsigned Reg = Info.addWorkGroupIDX();
1266
13.6k
    MF.addLiveIn(Reg, &AMDGPU::SReg_32_XM0RegClass);
1267
13.6k
    CCInfo.AllocateReg(Reg);
1268
13.6k
  }
1269
14.1k
1270
14.1k
  if (Info.hasWorkGroupIDY()) {
1271
24
    unsigned Reg = Info.addWorkGroupIDY();
1272
24
    MF.addLiveIn(Reg, &AMDGPU::SReg_32_XM0RegClass);
1273
24
    CCInfo.AllocateReg(Reg);
1274
24
  }
1275
14.1k
1276
14.1k
  if (Info.hasWorkGroupIDZ()) {
1277
24
    unsigned Reg = Info.addWorkGroupIDZ();
1278
24
    MF.addLiveIn(Reg, &AMDGPU::SReg_32_XM0RegClass);
1279
24
    CCInfo.AllocateReg(Reg);
1280
24
  }
1281
14.1k
1282
14.1k
  if (Info.hasWorkGroupInfo()) {
1283
0
    unsigned Reg = Info.addWorkGroupInfo();
1284
0
    MF.addLiveIn(Reg, &AMDGPU::SReg_32_XM0RegClass);
1285
0
    CCInfo.AllocateReg(Reg);
1286
0
  }
1287
14.1k
1288
14.1k
  if (Info.hasPrivateSegmentWaveByteOffset()) {
1289
13.6k
    // Scratch wave offset passed in system SGPR.
1290
13.6k
    unsigned PrivateSegmentWaveByteOffsetReg;
1291
13.6k
1292
13.6k
    if (IsShader) {
1293
48
      PrivateSegmentWaveByteOffsetReg =
1294
48
        Info.getPrivateSegmentWaveByteOffsetSystemSGPR();
1295
48
1296
48
      // This is true if the scratch wave byte offset doesn't have a fixed
1297
48
      // location.
1298
48
      if (PrivateSegmentWaveByteOffsetReg == AMDGPU::NoRegister) {
1299
44
        PrivateSegmentWaveByteOffsetReg = findFirstFreeSGPR(CCInfo);
1300
44
        Info.setPrivateSegmentWaveByteOffset(PrivateSegmentWaveByteOffsetReg);
1301
44
      }
1302
48
    } else
1303
13.6k
      PrivateSegmentWaveByteOffsetReg = Info.addPrivateSegmentWaveByteOffset();
1304
13.6k
1305
13.6k
    MF.addLiveIn(PrivateSegmentWaveByteOffsetReg, &AMDGPU::SGPR_32RegClass);
1306
13.6k
    CCInfo.AllocateReg(PrivateSegmentWaveByteOffsetReg);
1307
13.6k
  }
1308
14.1k
}
1309
1310
static void reservePrivateMemoryRegs(const TargetMachine &TM,
1311
                                     MachineFunction &MF,
1312
                                     const SIRegisterInfo &TRI,
1313
14.1k
                                     SIMachineFunctionInfo &Info) {
1314
14.1k
  // Now that we've figured out where the scratch register inputs are, see if
1315
14.1k
  // we should reserve the arguments and use them directly.
1316
14.1k
  MachineFrameInfo &MFI = MF.getFrameInfo();
1317
14.1k
  bool HasStackObjects = MFI.hasStackObjects();
1318
14.1k
1319
14.1k
  // Record that we know we have non-spill stack objects so we don't need to
1320
14.1k
  // check all stack objects later.
1321
14.1k
  if (HasStackObjects)
1322
351
    Info.setHasNonSpillStackObjects(true);
1323
14.1k
1324
14.1k
  // Everything live out of a block is spilled with fast regalloc, so it's
1325
14.1k
  // almost certain that spilling will be required.
1326
14.1k
  if (TM.getOptLevel() == CodeGenOpt::None)
1327
177
    HasStackObjects = true;
1328
14.1k
1329
14.1k
  // For now assume stack access is needed in any callee functions, so we need
1330
14.1k
  // the scratch registers to pass in.
1331
13.6k
  bool RequiresStackAccess = HasStackObjects || MFI.hasCalls();
1332
14.1k
1333
14.1k
  const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
1334
14.1k
  if (ST.isAmdCodeObjectV2(MF)) {
1335
1.74k
    if (RequiresStackAccess) {
1336
427
      // If we have stack objects, we unquestionably need the private buffer
1337
427
      // resource. For the Code Object V2 ABI, this will be the first 4 user
1338
427
      // SGPR inputs. We can reserve those and use them directly.
1339
427
1340
427
      unsigned PrivateSegmentBufferReg = Info.getPreloadedReg(
1341
427
        AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_BUFFER);
1342
427
      Info.setScratchRSrcReg(PrivateSegmentBufferReg);
1343
427
1344
427
      if (MFI.hasCalls()) {
1345
230
        // If we have calls, we need to keep the frame register in a register
1346
230
        // that won't be clobbered by a call, so ensure it is copied somewhere.
1347
230
1348
230
        // This is not a problem for the scratch wave offset, because the same
1349
230
        // registers are reserved in all functions.
1350
230
1351
230
        // FIXME: Nothing is really ensuring this is a call preserved register,
1352
230
        // it's just selected from the end so it happens to be.
1353
230
        unsigned ReservedOffsetReg
1354
230
          = TRI.reservedPrivateSegmentWaveByteOffsetReg(MF);
1355
230
        Info.setScratchWaveOffsetReg(ReservedOffsetReg);
1356
427
      } else {
1357
197
        unsigned PrivateSegmentWaveByteOffsetReg = Info.getPreloadedReg(
1358
197
          AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET);
1359
197
        Info.setScratchWaveOffsetReg(PrivateSegmentWaveByteOffsetReg);
1360
197
      }
1361
1.74k
    } else {
1362
1.31k
      unsigned ReservedBufferReg
1363
1.31k
        = TRI.reservedPrivateSegmentBufferReg(MF);
1364
1.31k
      unsigned ReservedOffsetReg
1365
1.31k
        = TRI.reservedPrivateSegmentWaveByteOffsetReg(MF);
1366
1.31k
1367
1.31k
      // We tentatively reserve the last registers (skipping the last two
1368
1.31k
      // which may contain VCC). After register allocation, we'll replace
1369
1.31k
      // these with the ones immediately after those which were really
1370
1.31k
      // allocated. In the prologue copies will be inserted from the argument
1371
1.31k
      // to these reserved registers.
1372
1.31k
      Info.setScratchRSrcReg(ReservedBufferReg);
1373
1.31k
      Info.setScratchWaveOffsetReg(ReservedOffsetReg);
1374
1.31k
    }
1375
14.1k
  } else {
1376
12.4k
    unsigned ReservedBufferReg = TRI.reservedPrivateSegmentBufferReg(MF);
1377
12.4k
1378
12.4k
    // Without HSA, relocations are used for the scratch pointer and the
1379
12.4k
    // buffer resource setup is always inserted in the prologue. Scratch wave
1380
12.4k
    // offset is still in an input SGPR.
1381
12.4k
    Info.setScratchRSrcReg(ReservedBufferReg);
1382
12.4k
1383
12.4k
    if (HasStackObjects && !MFI.hasCalls()) {
1384
277
      unsigned ScratchWaveOffsetReg = Info.getPreloadedReg(
1385
277
        AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET);
1386
277
      Info.setScratchWaveOffsetReg(ScratchWaveOffsetReg);
1387
12.4k
    } else {
1388
12.1k
      unsigned ReservedOffsetReg
1389
12.1k
        = TRI.reservedPrivateSegmentWaveByteOffsetReg(MF);
1390
12.1k
      Info.setScratchWaveOffsetReg(ReservedOffsetReg);
1391
12.1k
    }
1392
12.4k
  }
1393
14.1k
}
1394
1395
14.8k
bool SITargetLowering::supportSplitCSR(MachineFunction *MF) const {
1396
14.8k
  const SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();
1397
14.8k
  return !Info->isEntryFunction();
1398
14.8k
}
1399
1400
864
void SITargetLowering::initializeSplitCSR(MachineBasicBlock *Entry) const {
1401
864
1402
864
}
1403
1404
void SITargetLowering::insertCopiesSplitCSR(
1405
  MachineBasicBlock *Entry,
1406
864
  const SmallVectorImpl<MachineBasicBlock *> &Exits) const {
1407
864
  const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo();
1408
864
1409
864
  const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent());
1410
864
  if (!IStart)
1411
864
    return;
1412
0
1413
0
  const TargetInstrInfo *TII = Subtarget->getInstrInfo();
1414
0
  MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo();
1415
0
  MachineBasicBlock::iterator MBBI = Entry->begin();
1416
0
  for (const MCPhysReg *I = IStart; *I; ++I) {
1417
0
    const TargetRegisterClass *RC = nullptr;
1418
0
    if (AMDGPU::SReg_64RegClass.contains(*I))
1419
0
      RC = &AMDGPU::SGPR_64RegClass;
1420
0
    else if (AMDGPU::SReg_32RegClass.contains(*I))
1421
0
      RC = &AMDGPU::SGPR_32RegClass;
1422
0
    else
1423
0
      llvm_unreachable("Unexpected register class in CSRsViaCopy!");
1424
0
1425
0
    unsigned NewVR = MRI->createVirtualRegister(RC);
1426
0
    // Create copy from CSR to a virtual register.
1427
0
    Entry->addLiveIn(*I);
1428
0
    BuildMI(*Entry, MBBI, DebugLoc(), TII->get(TargetOpcode::COPY), NewVR)
1429
0
      .addReg(*I);
1430
0
1431
0
    // Insert the copy-back instructions right before the terminator.
1432
0
    for (auto *Exit : Exits)
1433
0
      BuildMI(*Exit, Exit->getFirstTerminator(), DebugLoc(),
1434
0
              TII->get(TargetOpcode::COPY), *I)
1435
0
        .addReg(NewVR);
1436
0
  }
1437
864
}
1438
1439
SDValue SITargetLowering::LowerFormalArguments(
1440
    SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
1441
    const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
1442
15.0k
    SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
1443
15.0k
  const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo();
1444
15.0k
1445
15.0k
  MachineFunction &MF = DAG.getMachineFunction();
1446
15.0k
  FunctionType *FType = MF.getFunction()->getFunctionType();
1447
15.0k
  SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
1448
15.0k
  const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
1449
15.0k
1450
15.0k
  if (Subtarget->isAmdHsaOS() && AMDGPU::isShader(CallConv)) {
1451
3
    const Function *Fn = MF.getFunction();
1452
3
    DiagnosticInfoUnsupported NoGraphicsHSA(
1453
3
        *Fn, "unsupported non-compute shaders with HSA", DL.getDebugLoc());
1454
3
    DAG.getContext()->diagnose(NoGraphicsHSA);
1455
3
    return DAG.getEntryNode();
1456
3
  }
1457
15.0k
1458
15.0k
  // Create stack objects that are used for emitting debugger prologue if
1459
15.0k
  // "amdgpu-debugger-emit-prologue" attribute was specified.
1460
15.0k
  if (ST.debuggerEmitPrologue())
1461
4
    createDebuggerPrologueStackObjects(MF);
1462
15.0k
1463
15.0k
  SmallVector<ISD::InputArg, 16> Splits;
1464
15.0k
  SmallVector<CCValAssign, 16> ArgLocs;
1465
15.0k
  BitVector Skipped(Ins.size());
1466
15.0k
  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
1467
15.0k
                 *DAG.getContext());
1468
15.0k
1469
15.0k
  bool IsShader = AMDGPU::isShader(CallConv);
1470
15.0k
  bool IsKernel = AMDGPU::isKernel(CallConv);
1471
15.0k
  bool IsEntryFunc = AMDGPU::isEntryFunctionCC(CallConv);
1472
15.0k
1473
15.0k
  if (!IsEntryFunc) {
1474
864
    // 4 bytes are reserved at offset 0 for the emergency stack slot. Skip over
1475
864
    // this when allocating argument fixed offsets.
1476
864
    CCInfo.AllocateStack(4, 4);
1477
864
  }
1478
15.0k
1479
15.0k
  if (IsShader) {
1480
547
    processShaderInputArgs(Splits, CallConv, Ins, Skipped, FType, Info);
1481
547
1482
547
    // At least one interpolation mode must be enabled or else the GPU will
1483
547
    // hang.
1484
547
    //
1485
547
    // Check PSInputAddr instead of PSInputEnable. The idea is that if the user
1486
547
    // set PSInputAddr, the user wants to enable some bits after the compilation
1487
547
    // based on run-time states. Since we can't know what the final PSInputEna
1488
547
    // will look like, we shouldn't do anything here and the user should take
1489
547
    // responsibility for the correct programming.
1490
547
    //
1491
547
    // Otherwise, the following restrictions apply:
1492
547
    // - At least one of PERSP_* (0xF) or LINEAR_* (0x70) must be enabled.
1493
547
    // - If POS_W_FLOAT (11) is enabled, at least one of PERSP_* must be
1494
547
    //   enabled too.
1495
547
    if (CallConv == CallingConv::AMDGPU_PS &&
1496
439
        ((Info->getPSInputAddr() & 0x7F) == 0 ||
1497
317
         ((Info->getPSInputAddr() & 0xF) == 0 &&
1498
547
          Info->isPSInputAllocated(11))) {
1499
124
      CCInfo.AllocateReg(AMDGPU::VGPR0);
1500
124
      CCInfo.AllocateReg(AMDGPU::VGPR1);
1501
124
      Info->markPSInputAllocated(0);
1502
124
      Info->markPSInputEnabled(0);
1503
124
    }
1504
547
1505
547
    assert(!Info->hasDispatchPtr() &&
1506
547
           !Info->hasKernargSegmentPtr() && !Info->hasFlatScratchInit() &&
1507
547
           !Info->hasWorkGroupIDX() && !Info->hasWorkGroupIDY() &&
1508
547
           !Info->hasWorkGroupIDZ() && !Info->hasWorkGroupInfo() &&
1509
547
           !Info->hasWorkItemIDX() && !Info->hasWorkItemIDY() &&
1510
547
           !Info->hasWorkItemIDZ());
1511
15.0k
  } else if (IsKernel) {
1512
13.6k
    assert(Info->hasWorkGroupIDX() && Info->hasWorkItemIDX());
1513
14.4k
  } else {
1514
864
    Splits.append(Ins.begin(), Ins.end());
1515
864
  }
1516
15.0k
1517
15.0k
  if (IsEntryFunc) {
1518
14.1k
    allocateSpecialEntryInputVGPRs(CCInfo, MF, *TRI, *Info);
1519
14.1k
    allocateHSAUserSGPRs(CCInfo, MF, *TRI, *Info);
1520
14.1k
  }
1521
15.0k
1522
15.0k
  if (IsKernel) {
1523
13.6k
    analyzeFormalArgumentsCompute(CCInfo, Ins);
1524
15.0k
  } else {
1525
1.41k
    CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, isVarArg);
1526
1.41k
    CCInfo.AnalyzeFormalArguments(Splits, AssignFn);
1527
1.41k
  }
1528
15.0k
1529
15.0k
  SmallVector<SDValue, 16> Chains;
1530
15.0k
1531
52.1k
  for (unsigned i = 0, e = Ins.size(), ArgIdx = 0; i != e; ++i) {
1532
37.0k
    const ISD::InputArg &Arg = Ins[i];
1533
37.0k
    if (Skipped[i]) {
1534
852
      InVals.push_back(DAG.getUNDEF(Arg.VT));
1535
852
      continue;
1536
852
    }
1537
36.2k
1538
36.2k
    CCValAssign &VA = ArgLocs[ArgIdx++];
1539
36.2k
    MVT VT = VA.getLocVT();
1540
36.2k
1541
36.2k
    if (IsEntryFunc && VA.isMemLoc()) {
1542
32.1k
      VT = Ins[i].VT;
1543
32.1k
      EVT MemVT = VA.getLocVT();
1544
32.1k
1545
32.1k
      const uint64_t Offset = Subtarget->getExplicitKernelArgOffset(MF) +
1546
32.1k
        VA.getLocMemOffset();
1547
32.1k
      Info->setABIArgOffset(Offset + MemVT.getStoreSize());
1548
32.1k
1549
32.1k
      // The first 36 bytes of the input buffer contains information about
1550
32.1k
      // thread group and global sizes.
1551
32.1k
      SDValue Arg = lowerKernargMemParameter(
1552
32.1k
        DAG, VT, MemVT, DL, Chain, Offset, Ins[i].Flags.isSExt(), &Ins[i]);
1553
32.1k
      Chains.push_back(Arg.getValue(1));
1554
32.1k
1555
32.1k
      auto *ParamTy =
1556
32.1k
        dyn_cast<PointerType>(FType->getParamType(Ins[i].getOrigArgIndex()));
1557
32.1k
      if (Subtarget->getGeneration() == SISubtarget::SOUTHERN_ISLANDS &&
1558
32.1k
          ParamTy && ParamTy->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) {
1559
572
        // On SI local pointers are just offsets into LDS, so they are always
1560
572
        // less than 16-bits.  On CI and newer they could potentially be
1561
572
        // real pointers, so we can't guarantee their size.
1562
572
        Arg = DAG.getNode(ISD::AssertZext, DL, Arg.getValueType(), Arg,
1563
572
                          DAG.getValueType(MVT::i16));
1564
572
      }
1565
32.1k
1566
32.1k
      InVals.push_back(Arg);
1567
32.1k
      continue;
1568
4.08k
    } else if (!IsEntryFunc && VA.isMemLoc()) {
1569
184
      SDValue Val = lowerStackParameter(DAG, VA, DL, Chain, Arg);
1570
184
      InVals.push_back(Val);
1571
184
      if (!Arg.Flags.isByVal())
1572
151
        Chains.push_back(Val.getValue(1));
1573
4.08k
      continue;
1574
4.08k
    }
1575
3.90k
1576
36.2k
    assert(VA.isRegLoc() && "Parameter must be in a register!");
1577
3.90k
1578
3.90k
    unsigned Reg = VA.getLocReg();
1579
3.90k
    const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg, VT);
1580
3.90k
    EVT ValVT = VA.getValVT();
1581
3.90k
1582
3.90k
    Reg = MF.addLiveIn(Reg, RC);
1583
3.90k
    SDValue Val = DAG.getCopyFromReg(Chain, DL, Reg, VT);
1584
3.90k
1585
3.90k
    // If this is an 8 or 16-bit value, it is really passed promoted
1586
3.90k
    // to 32 bits. Insert an assert[sz]ext to capture this, then
1587
3.90k
    // truncate to the right size.
1588
3.90k
    switch (VA.getLocInfo()) {
1589
3.88k
    case CCValAssign::Full:
1590
3.88k
      break;
1591
0
    case CCValAssign::BCvt:
1592
0
      Val = DAG.getNode(ISD::BITCAST, DL, ValVT, Val);
1593
0
      break;
1594
7
    case CCValAssign::SExt:
1595
7
      Val = DAG.getNode(ISD::AssertSext, DL, VT, Val,
1596
7
                        DAG.getValueType(ValVT));
1597
7
      Val = DAG.getNode(ISD::TRUNCATE, DL, ValVT, Val);
1598
7
      break;
1599
7
    case CCValAssign::ZExt:
1600
7
      Val = DAG.getNode(ISD::AssertZext, DL, VT, Val,
1601
7
                        DAG.getValueType(ValVT));
1602
7
      Val = DAG.getNode(ISD::TRUNCATE, DL, ValVT, Val);
1603
7
      break;
1604
6
    case CCValAssign::AExt:
1605
6
      Val = DAG.getNode(ISD::TRUNCATE, DL, ValVT, Val);
1606
6
      break;
1607
0
    default:
1608
0
      llvm_unreachable("Unknown loc info!");
1609
3.90k
    }
1610
3.90k
1611
3.90k
    if (IsShader && Arg.VT.isVector()) {
1612
447
      // Build a vector from the registers
1613
447
      Type *ParamType = FType->getParamType(Arg.getOrigArgIndex());
1614
447
      unsigned NumElements = ParamType->getVectorNumElements();
1615
447
1616
447
      SmallVector<SDValue, 4> Regs;
1617
447
      Regs.push_back(Val);
1618
1.87k
      for (unsigned j = 1; j != NumElements; ++j) {
1619
1.42k
        Reg = ArgLocs[ArgIdx++].getLocReg();
1620
1.42k
        Reg = MF.addLiveIn(Reg, RC);
1621
1.42k
1622
1.42k
        SDValue Copy = DAG.getCopyFromReg(Chain, DL, Reg, VT);
1623
1.42k
        Regs.push_back(Copy);
1624
1.42k
      }
1625
447
1626
447
      // Fill up the missing vector elements
1627
447
      NumElements = Arg.VT.getVectorNumElements() - NumElements;
1628
447
      Regs.append(NumElements, DAG.getUNDEF(VT));
1629
447
1630
447
      InVals.push_back(DAG.getBuildVector(Arg.VT, DL, Regs));
1631
447
      continue;
1632
447
    }
1633
3.45k
1634
3.45k
    InVals.push_back(Val);
1635
3.45k
  }
1636
15.0k
1637
15.0k
  if (!IsEntryFunc) {
1638
864
    // Special inputs come after user arguments.
1639
864
    allocateSpecialInputVGPRs(CCInfo, MF, *TRI, *Info);
1640
864
  }
1641
15.0k
1642
15.0k
  // Start adding system SGPRs.
1643
15.0k
  if (IsEntryFunc) {
1644
14.1k
    allocateSystemSGPRs(CCInfo, MF, *Info, CallConv, IsShader);
1645
15.0k
  } else {
1646
864
    CCInfo.AllocateReg(Info->getScratchRSrcReg());
1647
864
    CCInfo.AllocateReg(Info->getScratchWaveOffsetReg());
1648
864
    CCInfo.AllocateReg(Info->getFrameOffsetReg());
1649
864
    allocateSpecialInputSGPRs(CCInfo, MF, *TRI, *Info);
1650
864
  }
1651
15.0k
1652
15.0k
  auto &ArgUsageInfo =
1653
15.0k
    DAG.getPass()->getAnalysis<AMDGPUArgumentUsageInfo>();
1654
15.0k
  ArgUsageInfo.setFuncArgInfo(*MF.getFunction(), Info->getArgInfo());
1655
15.0k
1656
15.0k
  unsigned StackArgSize = CCInfo.getNextStackOffset();
1657
15.0k
  Info->setBytesInStackArgArea(StackArgSize);
1658
15.0k
1659
2.22k
  return Chains.empty() ? Chain :
1660
12.8k
    DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
1661
15.0k
}
1662
1663
// TODO: If return values can't fit in registers, we should return as many as
1664
// possible in registers before passing on stack.
1665
bool SITargetLowering::CanLowerReturn(
1666
  CallingConv::ID CallConv,
1667
  MachineFunction &MF, bool IsVarArg,
1668
  const SmallVectorImpl<ISD::OutputArg> &Outs,
1669
15.5k
  LLVMContext &Context) const {
1670
15.5k
  // Replacing returns with sret/stack usage doesn't make sense for shaders.
1671
15.5k
  // FIXME: Also sort of a workaround for custom vector splitting in LowerReturn
1672
15.5k
  // for shaders. Vector types should be explicitly handled by CC.
1673
15.5k
  if (AMDGPU::isEntryFunctionCC(CallConv))
1674
14.1k
    return true;
1675
1.32k
1676
1.32k
  SmallVector<CCValAssign, 16> RVLocs;
1677
1.32k
  CCState CCInfo(CallConv, IsVarArg, MF, RVLocs, Context);
1678
1.32k
  return CCInfo.CheckReturn(Outs, CCAssignFnForReturn(CallConv, IsVarArg));
1679
1.32k
}
1680
1681
SDValue
1682
SITargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
1683
                              bool isVarArg,
1684
                              const SmallVectorImpl<ISD::OutputArg> &Outs,
1685
                              const SmallVectorImpl<SDValue> &OutVals,
1686
14.9k
                              const SDLoc &DL, SelectionDAG &DAG) const {
1687
14.9k
  MachineFunction &MF = DAG.getMachineFunction();
1688
14.9k
  SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
1689
14.9k
1690
14.9k
  if (
AMDGPU::isKernel(CallConv)14.9k
) {
1691
13.6k
    return AMDGPUTargetLowering::LowerReturn(Chain, CallConv, isVarArg, Outs,
1692
13.6k
                                             OutVals, DL, DAG);
1693
13.6k
  }
1694
1.37k
1695
1.37k
  bool IsShader = AMDGPU::isShader(CallConv);
1696
1.37k
1697
1.37k
  Info->setIfReturnsVoid(Outs.size() == 0);
1698
814
  bool IsWaveEnd = Info->returnsVoid() && IsShader;
1699
1.37k
1700
1.37k
  SmallVector<ISD::OutputArg, 48> Splits;
1701
1.37k
  SmallVector<SDValue, 48> SplitVals;
1702
1.37k
1703
1.37k
  // Split vectors into their elements.
1704
2.43k
  for (unsigned i = 0, e = Outs.size(); i != e; ++i) {
1705
1.05k
    const ISD::OutputArg &Out = Outs[i];
1706
1.05k
1707
1.05k
    if (IsShader && Out.VT.isVector()) {
1708
114
      MVT VT = Out.VT.getVectorElementType();
1709
114
      ISD::OutputArg NewOut = Out;
1710
114
      NewOut.Flags.setSplit();
1711
114
      NewOut.VT = VT;
1712
114
1713
114
      // We want the original number of vector elements here, e.g.
1714
114
      // three or five, not four or eight.
1715
114
      unsigned NumElements = Out.ArgVT.getVectorNumElements();
1716
114
1717
546
      for (unsigned j = 0; j != NumElements; ++j) {
1718
432
        SDValue Elem = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, OutVals[i],
1719
432
                                   DAG.getConstant(j, DL, MVT::i32));
1720
432
        SplitVals.push_back(Elem);
1721
432
        Splits.push_back(NewOut);
1722
432
        NewOut.PartOffset += NewOut.VT.getStoreSize();
1723
432
      }
1724
1.05k
    } else {
1725
943
      SplitVals.push_back(OutVals[i]);
1726
943
      Splits.push_back(Out);
1727
943
    }
1728
1.05k
  }
1729
1.37k
1730
1.37k
  // CCValAssign - represent the assignment of the return value to a location.
1731
1.37k
  SmallVector<CCValAssign, 48> RVLocs;
1732
1.37k
1733
1.37k
  // CCState - Info about the registers and stack slots.
1734
1.37k
  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
1735
1.37k
                 *DAG.getContext());
1736
1.37k
1737
1.37k
  // Analyze outgoing return values.
1738
1.37k
  CCInfo.AnalyzeReturn(Splits, CCAssignFnForReturn(CallConv, isVarArg));
1739
1.37k
1740
1.37k
  SDValue Flag;
1741
1.37k
  SmallVector<SDValue, 48> RetOps;
1742
1.37k
  RetOps.push_back(Chain); // Operand #0 = Chain (updated below)
1743
1.37k
1744
1.37k
  // Add return address for callable functions.
1745
1.37k
  if (!Info->isEntryFunction()) {
1746
828
    const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo();
1747
828
    SDValue ReturnAddrReg = CreateLiveInRegister(
1748
828
      DAG, &AMDGPU::SReg_64RegClass, TRI->getReturnAddressReg(MF), MVT::i64);
1749
828
1750
828
    // FIXME: Should be able to use a vreg here, but need a way to prevent it
1751
828
    // from being allocated to a CSR.
1752
828
1753
828
    SDValue PhysReturnAddrReg = DAG.getRegister(TRI->getReturnAddressReg(MF),
1754
828
                                                MVT::i64);
1755
828
1756
828
    Chain = DAG.getCopyToReg(Chain, DL, PhysReturnAddrReg, ReturnAddrReg, Flag);
1757
828
    Flag = Chain.getValue(1);
1758
828
1759
828
    RetOps.push_back(PhysReturnAddrReg);
1760
828
  }
1761
1.37k
1762
1.37k
  // Copy the result values into the output registers.
1763
1.37k
  for (unsigned i = 0, realRVLocIdx = 0;
1764
2.75k
       i != RVLocs.size();
1765
1.37k
       ++i, ++realRVLocIdx) {
1766
1.37k
    CCValAssign &VA = RVLocs[i];
1767
1.37k
    assert(VA.isRegLoc() && "Can only return in registers!");
1768
1.37k
    // TODO: Partially return in registers if return values don't fit.
1769
1.37k
1770
1.37k
    SDValue Arg = SplitVals[realRVLocIdx];
1771
1.37k
1772
1.37k
    // Copied from other backends.
1773
1.37k
    switch (VA.getLocInfo()) {
1774
1.37k
    case CCValAssign::Full:
1775
1.37k
      break;
1776
0
    case CCValAssign::BCvt:
1777
0
      Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg);
1778
0
      break;
1779
0
    case CCValAssign::SExt:
1780
0
      Arg = DAG.getNode(ISD::SIGN_EXTEND, DL, VA.getLocVT(), Arg);
1781
0
      break;
1782
0
    case CCValAssign::ZExt:
1783
0
      Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg);
1784
0
      break;
1785
3
    case CCValAssign::AExt:
1786
3
      Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg);
1787
3
      break;
1788
0
    default:
1789
0
      llvm_unreachable("Unknown loc info!");
1790
1.37k
    }
1791
1.37k
1792
1.37k
    Chain = DAG.getCopyToReg(Chain, DL, VA.getLocReg(), Arg, Flag);
1793
1.37k
    Flag = Chain.getValue(1);
1794
1.37k
    RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
1795
1.37k
  }
1796
1.37k
1797
1.37k
  // FIXME: Does sret work properly?
1798
1.37k
  if (!Info->isEntryFunction()) {
1799
828
    const SIRegisterInfo *TRI
1800
828
      = static_cast<const SISubtarget *>(Subtarget)->getRegisterInfo();
1801
828
    const MCPhysReg *I =
1802
828
      TRI->getCalleeSavedRegsViaCopy(&DAG.getMachineFunction());
1803
828
    if (I) {
1804
0
      for (; *I; ++I) {
1805
0
        if (AMDGPU::SReg_64RegClass.contains(*I))
1806
0
          RetOps.push_back(DAG.getRegister(*I, MVT::i64));
1807
0
        else if (AMDGPU::SReg_32RegClass.contains(*I))
1808
0
          RetOps.push_back(DAG.getRegister(*I, MVT::i32));
1809
0
        else
1810
0
          llvm_unreachable("Unexpected register class in CSRsViaCopy!");
1811
0
      }
1812
0
    }
1813
828
  }
1814
1.37k
1815
1.37k
  // Update chain and glue.
1816
1.37k
  RetOps[0] = Chain;
1817
1.37k
  if (Flag.getNode())
1818
1.09k
    RetOps.push_back(Flag);
1819
1.37k
1820
1.37k
  unsigned Opc = AMDGPUISD::ENDPGM;
1821
1.37k
  if (!IsWaveEnd)
1822
1.09k
    Opc = IsShader ? AMDGPUISD::RETURN_TO_EPILOG : AMDGPUISD::RET_FLAG;
1823
1.37k
  return DAG.getNode(Opc, DL, MVT::Other, RetOps);
1824
14.9k
}
1825
1826
SDValue SITargetLowering::LowerCallResult(
1827
    SDValue Chain, SDValue InFlag, CallingConv::ID CallConv, bool IsVarArg,
1828
    const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
1829
    SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals, bool IsThisReturn,
1830
425
    SDValue ThisVal) const {
1831
425
  CCAssignFn *RetCC = CCAssignFnForReturn(CallConv, IsVarArg);
1832
425
1833
425
  // Assign locations to each value returned by this call.
1834
425
  SmallVector<CCValAssign, 16> RVLocs;
1835
425
  CCState CCInfo(CallConv, IsVarArg, DAG.getMachineFunction(), RVLocs,
1836
425
                 *DAG.getContext());
1837
425
  CCInfo.AnalyzeCallResult(Ins, RetCC);
1838
425
1839
425
  // Copy all of the result registers out of their specified physreg.
1840
536
  for (unsigned i = 0; i != RVLocs.size(); ++i) {
1841
111
    CCValAssign VA = RVLocs[i];
1842
111
    SDValue Val;
1843
111
1844
111
    if (VA.isRegLoc()) {
1845
111
      Val = DAG.getCopyFromReg(Chain, DL, VA.getLocReg(), VA.getLocVT(), InFlag);
1846
111
      Chain = Val.getValue(1);
1847
111
      InFlag = Val.getValue(2);
1848
111
    } else if (VA.isMemLoc()) {
1849
0
      report_fatal_error("TODO: return values in memory");
1850
0
    } else
1851
0
      llvm_unreachable("unknown argument location type");
1852
111
1853
111
    switch (VA.getLocInfo()) {
1854
94
    case CCValAssign::Full:
1855
94
      break;
1856
0
    case CCValAssign::BCvt:
1857
0
      Val = DAG.getNode(ISD::BITCAST, DL, VA.getValVT(), Val);
1858
0
      break;
1859
7
    case CCValAssign::ZExt:
1860
7
      Val = DAG.getNode(ISD::AssertZext, DL, VA.getLocVT(), Val,
1861
7
                        DAG.getValueType(VA.getValVT()));
1862
7
      Val = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Val);
1863
7
      break;
1864
7
    case CCValAssign::SExt:
1865
7
      Val = DAG.getNode(ISD::AssertSext, DL, VA.getLocVT(), Val,
1866
7
                        DAG.getValueType(VA.getValVT()));
1867
7
      Val = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Val);
1868
7
      break;
1869
3
    case CCValAssign::AExt:
1870
3
      Val = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Val);
1871
3
      break;
1872
0
    default:
1873
0
      llvm_unreachable("Unknown loc info!");
1874
111
    }
1875
111
1876
111
    InVals.push_back(Val);
1877
111
  }
1878
425
1879
425
  return Chain;
1880
425
}
1881
1882
// Add code to pass special inputs required depending on used features separate
1883
// from the explicit user arguments present in the IR.
1884
void SITargetLowering::passSpecialInputs(
1885
    CallLoweringInfo &CLI,
1886
    const SIMachineFunctionInfo &Info,
1887
    SmallVectorImpl<std::pair<unsigned, SDValue>> &RegsToPass,
1888
    SmallVectorImpl<SDValue> &MemOpChains,
1889
    SDValue Chain,
1890
457
    SDValue StackPtr) const {
1891
457
  // If we don't have a call site, this was a call inserted by
1892
457
  // legalization. These can never use special inputs.
1893
457
  if (!CLI.CS)
1894
0
    return;
1895
457
1896
457
  const Function *CalleeFunc = CLI.CS.getCalledFunction();
1897
457
  assert(CalleeFunc);
1898
457
1899
457
  SelectionDAG &DAG = CLI.DAG;
1900
457
  const SDLoc &DL = CLI.DL;
1901
457
1902
457
  const SISubtarget *ST = getSubtarget();
1903
457
  const SIRegisterInfo *TRI = ST->getRegisterInfo();
1904
457
1905
457
  auto &ArgUsageInfo =
1906
457
    DAG.getPass()->getAnalysis<AMDGPUArgumentUsageInfo>();
1907
457
  const AMDGPUFunctionArgInfo &CalleeArgInfo
1908
457
    = ArgUsageInfo.lookupFuncArgInfo(*CalleeFunc);
1909
457
1910
457
  const AMDGPUFunctionArgInfo &CallerArgInfo = Info.getArgInfo();
1911
457
1912
457
  // TODO: Unify with private memory register handling. This is complicated by
1913
457
  // the fact that at least in kernels, the input argument is not necessarily
1914
457
  // in the same location as the input.
1915
457
  AMDGPUFunctionArgInfo::PreloadedValue InputRegs[] = {
1916
457
    AMDGPUFunctionArgInfo::DISPATCH_PTR,
1917
457
    AMDGPUFunctionArgInfo::QUEUE_PTR,
1918
457
    AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR,
1919
457
    AMDGPUFunctionArgInfo::DISPATCH_ID,
1920
457
    AMDGPUFunctionArgInfo::WORKGROUP_ID_X,
1921
457
    AMDGPUFunctionArgInfo::WORKGROUP_ID_Y,
1922
457
    AMDGPUFunctionArgInfo::WORKGROUP_ID_Z,
1923
457
    AMDGPUFunctionArgInfo::WORKITEM_ID_X,
1924
457
    AMDGPUFunctionArgInfo::WORKITEM_ID_Y,
1925
457
    AMDGPUFunctionArgInfo::WORKITEM_ID_Z,
1926
457
    AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR
1927
457
  };
1928
457
1929
5.02k
  for (auto InputID : InputRegs) {
1930
5.02k
    const ArgDescriptor *OutgoingArg;
1931
5.02k
    const TargetRegisterClass *ArgRC;
1932
5.02k
1933
5.02k
    std::tie(OutgoingArg, ArgRC) = CalleeArgInfo.getPreloadedValue(InputID);
1934
5.02k
    if (!OutgoingArg)
1935
4.90k
      continue;
1936
120
1937
120
    const ArgDescriptor *IncomingArg;
1938
120
    const TargetRegisterClass *IncomingArgRC;
1939
120
    std::tie(IncomingArg, IncomingArgRC)
1940
120
      = CallerArgInfo.getPreloadedValue(InputID);
1941
120
    assert(IncomingArgRC == ArgRC);
1942
120
1943
120
    // All special arguments are ints for now.
1944
120
    EVT ArgVT = TRI->getSpillSize(*ArgRC) == 8 ? MVT::i64 : MVT::i32;
1945
120
    SDValue InputReg;
1946
120
1947
120
    if (IncomingArg) {
1948
111
      InputReg = loadInputValue(DAG, ArgRC, ArgVT, DL, *IncomingArg);
1949
120
    } else {
1950
9
      // The implicit arg ptr is special because it doesn't have a corresponding
1951
9
      // input for kernels, and is computed from the kernarg segment pointer.
1952
9
      assert(InputID == AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
1953
9
      InputReg = getImplicitArgPtr(DAG, DL);
1954
9
    }
1955
120
1956
120
    if (OutgoingArg->isRegister()) {
1957
110
      RegsToPass.emplace_back(OutgoingArg->getRegister(), InputReg);
1958
120
    } else {
1959
10
      SDValue ArgStore = storeStackInputValue(DAG, DL, Chain, StackPtr,
1960
10
                                              InputReg,
1961
10
                                              OutgoingArg->getStackOffset());
1962
10
      MemOpChains.push_back(ArgStore);
1963
10
    }
1964
5.02k
  }
1965
457
}
1966
1967
39
static bool canGuaranteeTCO(CallingConv::ID CC) {
1968
39
  return CC == CallingConv::Fast;
1969
39
}
1970
1971
/// Return true if we might ever do TCO for calls with this calling convention.
1972
41
static bool mayTailCallThisCC(CallingConv::ID CC) {
1973
41
  switch (CC) {
1974
2
  case CallingConv::C:
1975
2
    return true;
1976
39
  default:
1977
39
    return canGuaranteeTCO(CC);
1978
0
  }
1979
0
}
1980
1981
bool SITargetLowering::isEligibleForTailCallOptimization(
1982
    SDValue Callee, CallingConv::ID CalleeCC, bool IsVarArg,
1983
    const SmallVectorImpl<ISD::OutputArg> &Outs,
1984
    const SmallVectorImpl<SDValue> &OutVals,
1985
41
    const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG &DAG) const {
1986
41
  if (!mayTailCallThisCC(CalleeCC))
1987
0
    return false;
1988
41
1989
41
  MachineFunction &MF = DAG.getMachineFunction();
1990
41
  const Function *CallerF = MF.getFunction();
1991
41
  CallingConv::ID CallerCC = CallerF->getCallingConv();
1992
41
  const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo();
1993
41
  const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);
1994
41
1995
41
  // Kernels aren't callable, and don't have a live in return address so it
1996
41
  // doesn't make sense to do a tail call with entry functions.
1997
41
  if (!CallerPreserved)
1998
3
    return false;
1999
38
2000
38
  bool CCMatch = CallerCC == CalleeCC;
2001
38
2002
38
  if (DAG.getTarget().Options.GuaranteedTailCallOpt) {
2003
0
    if (canGuaranteeTCO(CalleeCC) && CCMatch)
2004
0
      return true;
2005
0
    return false;
2006
0
  }
2007
38
2008
38
  // TODO: Can we handle var args?
2009
38
  if (IsVarArg)
2010
0
    return false;
2011
38
2012
38
  for (const Argument &Arg : CallerF->args()) {
2013
99
    if (Arg.hasByValAttr())
2014
3
      return false;
2015
35
  }
2016
35
2017
35
  LLVMContext &Ctx = *DAG.getContext();
2018
35
2019
35
  // Check that the call results are passed in the same way.
2020
35
  if (!CCState::resultsCompatible(CalleeCC, CallerCC, MF, Ctx, Ins,
2021
35
                                  CCAssignFnForCall(CalleeCC, IsVarArg),
2022
35
                                  CCAssignFnForCall(CallerCC, IsVarArg)))
2023
0
    return false;
2024
35
2025
35
  // The callee has to preserve all registers the caller needs to preserve.
2026
35
  if (!CCMatch) {
2027
0
    const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC);
2028
0
    if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved))
2029
0
      return false;
2030
35
  }
2031
35
2032
35
  // Nothing more to check if the callee is taking no arguments.
2033
35
  if (Outs.empty())
2034
2
    return true;
2035
33
2036
33
  SmallVector<CCValAssign, 16> ArgLocs;
2037
33
  CCState CCInfo(CalleeCC, IsVarArg, MF, ArgLocs, Ctx);
2038
33
2039
33
  CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForCall(CalleeCC, IsVarArg));
2040
33
2041
33
  const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
2042
33
  // If the stack arguments for this call do not fit into our own save area then
2043
33
  // the call cannot be made tail.
2044
33
  // TODO: Is this really necessary?
2045
33
  if (CCInfo.getNextStackOffset() > FuncInfo->getBytesInStackArgArea())
2046
3
    return false;
2047
30
2048
30
  const MachineRegisterInfo &MRI = MF.getRegInfo();
2049
30
  return parametersInCSRMatch(MRI, CallerPreserved, ArgLocs, OutVals);
2050
30
}
2051
2052
13
bool SITargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const {
2053
13
  if (!CI->isTailCall())
2054
9
    return false;
2055
4
2056
4
  const Function *ParentFn = CI->getParent()->getParent();
2057
4
  if (AMDGPU::isEntryFunctionCC(ParentFn->getCallingConv()))
2058
3
    return false;
2059
1
2060
1
  auto Attr = ParentFn->getFnAttribute("disable-tail-calls");
2061
1
  return (Attr.getValueAsString() != "true");
2062
1
}
2063
2064
// The wave scratch offset register is used as the global base pointer.
2065
SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI,
2066
463
                                    SmallVectorImpl<SDValue> &InVals) const {
2067
463
  SelectionDAG &DAG = CLI.DAG;
2068
463
  const SDLoc &DL = CLI.DL;
2069
463
  SmallVector<ISD::OutputArg, 32> &Outs = CLI.Outs;
2070
463
  SmallVector<SDValue, 32> &OutVals = CLI.OutVals;
2071
463
  SmallVector<ISD::InputArg, 32> &Ins = CLI.Ins;
2072
463
  SDValue Chain = CLI.Chain;
2073
463
  SDValue Callee = CLI.Callee;
2074
463
  bool &IsTailCall = CLI.IsTailCall;
2075
463
  CallingConv::ID CallConv = CLI.CallConv;
2076
463
  bool IsVarArg = CLI.IsVarArg;
2077
463
  bool IsSibCall = false;
2078
463
  bool IsThisReturn = false;
2079
463
  MachineFunction &MF = DAG.getMachineFunction();
2080
463
2081
463
  if (IsVarArg) {
2082
1
    return lowerUnhandledCall(CLI, InVals,
2083
1
                              "unsupported call to variadic function ");
2084
1
  }
2085
462
2086
462
  if (!CLI.CS.getCalledFunction()) {
2087
4
    return lowerUnhandledCall(CLI, InVals,
2088
4
                              "unsupported indirect call to function ");
2089
4
  }
2090
458
2091
458
  if (IsTailCall && MF.getTarget().Options.GuaranteedTailCallOpt) {
2092
1
    return lowerUnhandledCall(CLI, InVals,
2093
1
                              "unsupported required tail call to function ");
2094
1
  }
2095
457
2096
457
  // The first 4 bytes are reserved for the callee's emergency stack slot.
2097
457
  const unsigned CalleeUsableStackOffset = 4;
2098
457
2099
457
  if (IsTailCall) {
2100
41
    IsTailCall = isEligibleForTailCallOptimization(
2101
41
      Callee, CallConv, IsVarArg, Outs, OutVals, Ins, DAG);
2102
41
    if (!IsTailCall && CLI.CS && CLI.CS.isMustTailCall()) {
2103
0
      report_fatal_error("failed to perform tail call elimination on a call "
2104
0
                         "site marked musttail");
2105
0
    }
2106
41
2107
41
    bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt;
2108
41
2109
41
    // A sibling call is one where we're under the usual C ABI and not planning
2110
41
    // to change that but can still do a tail call:
2111
41
    if (!TailCallOpt && IsTailCall)
2112
32
      IsSibCall = true;
2113
41
2114
41
    if (IsTailCall)
2115
32
      ++NumTailCalls;
2116
41
  }
2117
457
2118
457
  if (GlobalAddressSDNode *GA = dyn_cast<GlobalAddressSDNode>(Callee)) {
2119
457
    // FIXME: Remove this hack for function pointer types.
2120
457
    const GlobalValue *GV = GA->getGlobal();
2121
457
    assert(Callee.getValueType() == MVT::i32);
2122
457
    Callee = DAG.getGlobalAddress(GV, DL, MVT::i64, GA->getOffset(),
2123
457
                                  false, GA->getTargetFlags());
2124
457
  }
2125
457
2126
457
  const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
2127
457
2128
457
  // Analyze operands of the call, assigning locations to each operand.
2129
457
  SmallVector<CCValAssign, 16> ArgLocs;
2130
457
  CCState CCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext());
2131
457
  CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, IsVarArg);
2132
457
  CCInfo.AnalyzeCallOperands(Outs, AssignFn);
2133
457
2134
457
  // Get a count of how many bytes are to be pushed on the stack.
2135
457
  unsigned NumBytes = CCInfo.getNextStackOffset();
2136
457
2137
457
  if (IsSibCall) {
2138
32
    // Since we're not changing the ABI to make this a tail call, the memory
2139
32
    // operands are already available in the caller's incoming argument space.
2140
32
    NumBytes = 0;
2141
32
  }
2142
457
2143
457
  // FPDiff is the byte offset of the call's argument area from the callee's.
2144
457
  // Stores to callee stack arguments will be placed in FixedStackSlots offset
2145
457
  // by this amount for a tail call. In a sibling call it must be 0 because the
2146
457
  // caller will deallocate the entire stack and the callee still expects its
2147
457
  // arguments to begin at SP+0. Completely unused for non-tail calls.
2148
457
  int32_t FPDiff = 0;
2149
457
  MachineFrameInfo &MFI = MF.getFrameInfo();
2150
457
  SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
2151
457
2152
457
  SDValue CallerSavedFP;
2153
457
2154
457
  // Adjust the stack pointer for the new arguments...
2155
457
  // These operations are automatically eliminated by the prolog/epilog pass
2156
457
  if (!IsSibCall) {
2157
425
    Chain = DAG.getCALLSEQ_START(Chain, 0, 0, DL);
2158
425
2159
425
    unsigned OffsetReg = Info->getScratchWaveOffsetReg();
2160
425
2161
425
    // In the HSA case, this should be an identity copy.
2162
425
    SDValue ScratchRSrcReg
2163
425
      = DAG.getCopyFromReg(Chain, DL, Info->getScratchRSrcReg(), MVT::v4i32);
2164
425
    RegsToPass.emplace_back(AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3, ScratchRSrcReg);
2165
425
2166
425
    // TODO: Don't hardcode these registers and get from the callee function.
2167
425
    SDValue ScratchWaveOffsetReg
2168
425
      = DAG.getCopyFromReg(Chain, DL, OffsetReg, MVT::i32);
2169
425
    RegsToPass.emplace_back(AMDGPU::SGPR4, ScratchWaveOffsetReg);
2170
425
2171
425
    if (!Info->isEntryFunction()) {
2172
89
      // Avoid clobbering this function's FP value. In the current convention
2173
89
      // callee will overwrite this, so do save/restore around the call site.
2174
89
      CallerSavedFP = DAG.getCopyFromReg(Chain, DL,
2175
89
                                         Info->getFrameOffsetReg(), MVT::i32);
2176
89
    }
2177
425
  }
2178
457
2179
457
  // Stack pointer relative accesses are done by changing the offset SGPR. This
2180
457
  // is just the VGPR offset component.
2181
457
  SDValue StackPtr = DAG.getConstant(CalleeUsableStackOffset, DL, MVT::i32);
2182
457
2183
457
  SmallVector<SDValue, 8> MemOpChains;
2184
457
  MVT PtrVT = MVT::i32;
2185
457
2186
457
  // Walk the register/memloc assignments, inserting copies/loads.
2187
1.50k
  for (unsigned i = 0, realArgIdx = 0, e = ArgLocs.size(); i != e;
2188
1.04k
       ++i, ++realArgIdx) {
2189
1.04k
    CCValAssign &VA = ArgLocs[i];
2190
1.04k
    SDValue Arg = OutVals[realArgIdx];
2191
1.04k
2192
1.04k
    // Promote the value if needed.
2193
1.04k
    switch (VA.getLocInfo()) {
2194
1.02k
    case CCValAssign::Full:
2195
1.02k
      break;
2196
0
    case CCValAssign::BCvt:
2197
0
      Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg);
2198
0
      break;
2199
10
    case CCValAssign::ZExt:
2200
10
      Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg);
2201
10
      break;
2202
10
    case CCValAssign::SExt:
2203
10
      Arg = DAG.getNode(ISD::SIGN_EXTEND, DL, VA.getLocVT(), Arg);
2204
10
      break;
2205
4
    case CCValAssign::AExt:
2206
4
      Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg);
2207
4
      break;
2208
0
    case CCValAssign::FPExt:
2209
0
      Arg = DAG.getNode(ISD::FP_EXTEND, DL, VA.getLocVT(), Arg);
2210
0
      break;
2211
0
    default:
2212
0
      llvm_unreachable("Unknown loc info!");
2213
1.04k
    }
2214
1.04k
2215
1.04k
    if (VA.isRegLoc()) {
2216
986
      RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
2217
1.04k
    } else {
2218
62
      assert(VA.isMemLoc());
2219
62
2220
62
      SDValue DstAddr;
2221
62
      MachinePointerInfo DstInfo;
2222
62
2223
62
      unsigned LocMemOffset = VA.getLocMemOffset();
2224
62
      int32_t Offset = LocMemOffset;
2225
62
      SDValue PtrOff = DAG.getConstant(Offset, DL, MVT::i32);
2226
62
      PtrOff = DAG.getNode(ISD::ADD, DL, PtrVT, StackPtr, PtrOff);
2227
62
2228
62
      if (IsTailCall) {
2229
27
        ISD::ArgFlagsTy Flags = Outs[realArgIdx].Flags;
2230
27
        unsigned OpSize = Flags.isByVal() ?
2231
27
          Flags.getByValSize() : VA.getValVT().getStoreSize();
2232
27
2233
27
        Offset = Offset + FPDiff;
2234
27
        int FI = MFI.CreateFixedObject(OpSize, Offset, true);
2235
27
2236
27
        DstAddr = DAG.getFrameIndex(FI, PtrVT);
2237
27
        DstAddr = DAG.getNode(ISD::ADD, DL, MVT::i32, DstAddr, StackPtr);
2238
27
        DstInfo = MachinePointerInfo::getFixedStack(MF, FI);
2239
27
2240
27
        // Make sure any stack arguments overlapping with where we're storing
2241
27
        // are loaded before this eventual operation. Otherwise they'll be
2242
27
        // clobbered.
2243
27
2244
27
        // FIXME: Why is this really necessary? This seems to just result in a
2245
27
        // lot of code to copy the stack and write them back to the same
2246
27
        // locations, which are supposed to be immutable?
2247
27
        Chain = addTokenForArgument(Chain, DAG, MFI, FI);
2248
62
      } else {
2249
35
        DstAddr = PtrOff;
2250
35
        DstInfo = MachinePointerInfo::getStack(MF, LocMemOffset);
2251
35
      }
2252
62
2253
62
      if (Outs[i].Flags.isByVal()) {
2254
28
        SDValue SizeNode =
2255
28
            DAG.getConstant(Outs[i].Flags.getByValSize(), DL, MVT::i32);
2256
28
        SDValue Cpy = DAG.getMemcpy(
2257
28
            Chain, DL, DstAddr, Arg, SizeNode, Outs[i].Flags.getByValAlign(),
2258
28
            /*isVol = */ false, /*AlwaysInline = */ true,
2259
28
            /*isTailCall = */ false,
2260
28
            DstInfo, MachinePointerInfo());
2261
28
2262
28
        MemOpChains.push_back(Cpy);
2263
62
      } else {
2264
34
        SDValue Store = DAG.getStore(Chain, DL, Arg, DstAddr, DstInfo);
2265
34
        MemOpChains.push_back(Store);
2266
34
      }
2267
62
    }
2268
1.04k
  }
2269
457
2270
457
  // Copy special input registers after user input arguments.
2271
457
  passSpecialInputs(CLI, *Info, RegsToPass, MemOpChains, Chain, StackPtr);
2272
457
2273
457
  if (!MemOpChains.empty())
2274
46
    Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOpChains);
2275
457
2276
457
  // Build a sequence of copy-to-reg nodes chained together with token chain
2277
457
  // and flag operands which copy the outgoing args into the appropriate regs.
2278
457
  SDValue InFlag;
2279
1.94k
  for (auto &RegToPass : RegsToPass) {
2280
1.94k
    Chain = DAG.getCopyToReg(Chain, DL, RegToPass.first,
2281
1.94k
                             RegToPass.second, InFlag);
2282
1.94k
    InFlag = Chain.getValue(1);
2283
1.94k
  }
2284
457
2285
457
2286
457
  SDValue PhysReturnAddrReg;
2287
457
  if (IsTailCall) {
2288
32
    // Since the return is being combined with the call, we need to pass on the
2289
32
    // return address.
2290
32
2291
32
    const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo();
2292
32
    SDValue ReturnAddrReg = CreateLiveInRegister(
2293
32
      DAG, &AMDGPU::SReg_64RegClass, TRI->getReturnAddressReg(MF), MVT::i64);
2294
32
2295
32
    PhysReturnAddrReg = DAG.getRegister(TRI->getReturnAddressReg(MF),
2296
32
                                        MVT::i64);
2297
32
    Chain = DAG.getCopyToReg(Chain, DL, PhysReturnAddrReg, ReturnAddrReg, InFlag);
2298
32
    InFlag = Chain.getValue(1);
2299
32
  }
2300
457
2301
457
  // We don't usually want to end the call-sequence here because we would tidy
2302
457
  // the frame up *after* the call, however in the ABI-changing tail-call case
2303
457
  // we've carefully laid out the parameters so that when sp is reset they'll be
2304
457
  // in the correct location.
2305
457
  if (IsTailCall && !IsSibCall) {
2306
0
    Chain = DAG.getCALLSEQ_END(Chain,
2307
0
                               DAG.getTargetConstant(NumBytes, DL, MVT::i32),
2308
0
                               DAG.getTargetConstant(0, DL, MVT::i32),
2309
0
                               InFlag, DL);
2310
0
    InFlag = Chain.getValue(1);
2311
0
  }
2312
457
2313
457
  std::vector<SDValue> Ops;
2314
457
  Ops.push_back(Chain);
2315
457
  Ops.push_back(Callee);
2316
457
2317
457
  if (IsTailCall) {
2318
32
    // Each tail call may have to adjust the stack by a different amount, so
2319
32
    // this information must travel along with the operation for eventual
2320
32
    // consumption by emitEpilogue.
2321
32
    Ops.push_back(DAG.getTargetConstant(FPDiff, DL, MVT::i32));
2322
32
2323
32
    Ops.push_back(PhysReturnAddrReg);
2324
32
  }
2325
457
2326
457
  // Add argument registers to the end of the list so that they are known live
2327
457
  // into the call.
2328
1.94k
  for (auto &RegToPass : RegsToPass) {
2329
1.94k
    Ops.push_back(DAG.getRegister(RegToPass.first,
2330
1.94k
                                  RegToPass.second.getValueType()));
2331
1.94k
  }
2332
457
2333
457
  // Add a register mask operand representing the call-preserved registers.
2334
457
2335
457
  const AMDGPURegisterInfo *TRI = Subtarget->getRegisterInfo();
2336
457
  const uint32_t *Mask = TRI->getCallPreservedMask(MF, CallConv);
2337
457
  assert(Mask && "Missing call preserved mask for calling convention");
2338
457
  Ops.push_back(DAG.getRegisterMask(Mask));
2339
457
2340
457
  if (InFlag.getNode())
2341
457
    Ops.push_back(InFlag);
2342
457
2343
457
  SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
2344
457
2345
457
  // If we're doing a tail call, use a TC_RETURN here rather than an
2346
457
  // actual call instruction.
2347
457
  if (IsTailCall) {
2348
32
    MFI.setHasTailCall();
2349
32
    return DAG.getNode(AMDGPUISD::TC_RETURN, DL, NodeTys, Ops);
2350
32
  }
2351
425
2352
425
  // Returns a chain and a flag for retval copy to use.
2353
425
  SDValue Call = DAG.getNode(AMDGPUISD::CALL, DL, NodeTys, Ops);
2354
425
  Chain = Call.getValue(0);
2355
425
  InFlag = Call.getValue(1);
2356
425
2357
425
  if (CallerSavedFP) {
2358
89
    SDValue FPReg = DAG.getRegister(Info->getFrameOffsetReg(), MVT::i32);
2359
89
    Chain = DAG.getCopyToReg(Chain, DL, FPReg, CallerSavedFP, InFlag);
2360
89
    InFlag = Chain.getValue(1);
2361
89
  }
2362
425
2363
425
  uint64_t CalleePopBytes = NumBytes;
2364
425
  Chain = DAG.getCALLSEQ_END(Chain, DAG.getTargetConstant(0, DL, MVT::i32),
2365
425
                             DAG.getTargetConstant(CalleePopBytes, DL, MVT::i32),
2366
425
                             InFlag, DL);
2367
425
  if (!Ins.empty())
2368
101
    InFlag = Chain.getValue(1);
2369
425
2370
425
  // Handle result values, copying them out of physregs into vregs that we
2371
425
  // return.
2372
425
  return LowerCallResult(Chain, InFlag, CallConv, IsVarArg, Ins, DL, DAG,
2373
425
                         InVals, IsThisReturn,
2374
425
                         IsThisReturn ? OutVals[0] : SDValue());
2375
463
}
2376
2377
unsigned SITargetLowering::getRegisterByName(const char* RegName, EVT VT,
2378
27
                                             SelectionDAG &DAG) const {
2379
27
  unsigned Reg = StringSwitch<unsigned>(RegName)
2380
27
    .Case("m0", AMDGPU::M0)
2381
27
    .Case("exec", AMDGPU::EXEC)
2382
27
    .Case("exec_lo", AMDGPU::EXEC_LO)
2383
27
    .Case("exec_hi", AMDGPU::EXEC_HI)
2384
27
    .Case("flat_scratch", AMDGPU::FLAT_SCR)
2385
27
    .Case("flat_scratch_lo", AMDGPU::FLAT_SCR_LO)
2386
27
    .Case("flat_scratch_hi", AMDGPU::FLAT_SCR_HI)
2387
27
    .Default(AMDGPU::NoRegister);
2388
27
2389
27
  if (Reg == AMDGPU::NoRegister) {
2390
0
    report_fatal_error(Twine("invalid register name \""
2391
0
                             + StringRef(RegName)  + "\"."));
2392
0
2393
0
  }
2394
27
2395
27
  if (Subtarget->getGeneration() == SISubtarget::SOUTHERN_ISLANDS &&
2396
27
      Subtarget->getRegisterInfo()->regsOverlap(Reg, AMDGPU::FLAT_SCR)) {
2397
1
    report_fatal_error(Twine("invalid register \""
2398
1
                             + StringRef(RegName)  + "\" for subtarget."));
2399
1
  }
2400
26
2401
26
  switch (Reg) {
2402
17
  case AMDGPU::M0:
2403
17
  case AMDGPU::EXEC_LO:
2404
17
  case AMDGPU::EXEC_HI:
2405
17
  case AMDGPU::FLAT_SCR_LO:
2406
17
  case AMDGPU::FLAT_SCR_HI:
2407
17
    if (VT.getSizeInBits() == 32)
2408
16
      return Reg;
2409
1
    break;
2410
9
  case AMDGPU::EXEC:
2411
9
  case AMDGPU::FLAT_SCR:
2412
9
    if (VT.getSizeInBits() == 64)
2413
8
      return Reg;
2414
1
    break;
2415
0
  default:
2416
0
    llvm_unreachable("missing register type checking");
2417
2
  }
2418
2
2419
2
  report_fatal_error(Twine("invalid type for register \""
2420
2
                           + StringRef(RegName) + "\"."));
2421
2
}
2422
2423
// If kill is not the last instruction, split the block so kill is always a
2424
// proper terminator.
2425
MachineBasicBlock *SITargetLowering::splitKillBlock(MachineInstr &MI,
2426
33
                                                    MachineBasicBlock *BB) const {
2427
33
  const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
2428
33
2429
33
  MachineBasicBlock::iterator SplitPoint(&MI);
2430
33
  ++SplitPoint;
2431
33
2432
33
  if (SplitPoint == BB->end()) {
2433
4
    // Don't bother with a new block.
2434
4
    MI.setDesc(TII->get(AMDGPU::SI_KILL_TERMINATOR));
2435
4
    return BB;
2436
4
  }
2437
29
2438
29
  MachineFunction *MF = BB->getParent();
2439
29
  MachineBasicBlock *SplitBB
2440
29
    = MF->CreateMachineBasicBlock(BB->getBasicBlock());
2441
29
2442
29
  MF->insert(++MachineFunction::iterator(BB), SplitBB);
2443
29
  SplitBB->splice(SplitBB->begin(), BB, SplitPoint, BB->end());
2444
29
2445
29
  SplitBB->transferSuccessorsAndUpdatePHIs(BB);
2446
29
  BB->addSuccessor(SplitBB);
2447
29
2448
29
  MI.setDesc(TII->get(AMDGPU::SI_KILL_TERMINATOR));
2449
29
  return SplitBB;
2450
29
}
2451
2452
// Do a v_movrels_b32 or v_movreld_b32 for each unique value of \p IdxReg in the
2453
// wavefront. If the value is uniform and just happens to be in a VGPR, this
2454
// will only do one iteration. In the worst case, this will loop 64 times.
2455
//
2456
// TODO: Just use v_readlane_b32 if we know the VGPR has a uniform value.
2457
static MachineBasicBlock::iterator emitLoadM0FromVGPRLoop(
2458
  const SIInstrInfo *TII,
2459
  MachineRegisterInfo &MRI,
2460
  MachineBasicBlock &OrigBB,
2461
  MachineBasicBlock &LoopBB,
2462
  const DebugLoc &DL,
2463
  const MachineOperand &IdxReg,
2464
  unsigned InitReg,
2465
  unsigned ResultReg,
2466
  unsigned PhiReg,
2467
  unsigned InitSaveExecReg,
2468
  int Offset,
2469
33
  bool UseGPRIdxMode) {
2470
33
  MachineBasicBlock::iterator I = LoopBB.begin();
2471
33
2472
33
  unsigned PhiExec = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
2473
33
  unsigned NewExec = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
2474
33
  unsigned CurrentIdxReg = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
2475
33
  unsigned CondReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
2476
33
2477
33
  BuildMI(LoopBB, I, DL, TII->get(TargetOpcode::PHI), PhiReg)
2478
33
    .addReg(InitReg)
2479
33
    .addMBB(&OrigBB)
2480
33
    .addReg(ResultReg)
2481
33
    .addMBB(&LoopBB);
2482
33
2483
33
  BuildMI(LoopBB, I, DL, TII->get(TargetOpcode::PHI), PhiExec)
2484
33
    .addReg(InitSaveExecReg)
2485
33
    .addMBB(&OrigBB)
2486
33
    .addReg(NewExec)
2487
33
    .addMBB(&LoopBB);
2488
33
2489
33
  // Read the next variant <- also loop target.
2490
33
  BuildMI(LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), CurrentIdxReg)
2491
33
    .addReg(IdxReg.getReg(), getUndefRegState(IdxReg.isUndef()));
2492
33
2493
33
  // Compare the just read M0 value to all possible Idx values.
2494
33
  BuildMI(LoopBB, I, DL, TII->get(AMDGPU::V_CMP_EQ_U32_e64), CondReg)
2495
33
    .addReg(CurrentIdxReg)
2496
33
    .addReg(IdxReg.getReg(), 0, IdxReg.getSubReg());
2497
33
2498
33
  if (
UseGPRIdxMode33
) {
2499
16
    unsigned IdxReg;
2500
16
    if (
Offset == 016
) {
2501
10
      IdxReg = CurrentIdxReg;
2502
16
    } else {
2503
6
      IdxReg = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
2504
6
      BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_ADD_I32), IdxReg)
2505
6
        .addReg(CurrentIdxReg, RegState::Kill)
2506
6
        .addImm(Offset);
2507
6
    }
2508
16
2509
16
    MachineInstr *SetIdx =
2510
16
      BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_SET_GPR_IDX_IDX))
2511
16
      .addReg(IdxReg, RegState::Kill);
2512
16
    SetIdx->getOperand(2).setIsUndef();
2513
33
  } else {
2514
17
    // Move index from VCC into M0
2515
17
    if (
Offset == 017
) {
2516
11
      BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)
2517
11
        .addReg(CurrentIdxReg, RegState::Kill);
2518
17
    } else {
2519
6
      BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_ADD_I32), AMDGPU::M0)
2520
6
        .addReg(CurrentIdxReg, RegState::Kill)
2521
6
        .addImm(Offset);
2522
6
    }
2523
17
  }
2524
33
2525
33
  // Update EXEC, save the original EXEC value to VCC.
2526
33
  BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_AND_SAVEEXEC_B64), NewExec)
2527
33
    .addReg(CondReg, RegState::Kill);
2528
33
2529
33
  MRI.setSimpleHint(NewExec, CondReg);
2530
33
2531
33
  // Update EXEC, switch all done bits to 0 and all todo bits to 1.
2532
33
  MachineInstr *InsertPt =
2533
33
    BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_XOR_B64), AMDGPU::EXEC)
2534
33
    .addReg(AMDGPU::EXEC)
2535
33
    .addReg(NewExec);
2536
33
2537
33
  // XXX - s_xor_b64 sets scc to 1 if the result is nonzero, so can we use
2538
33
  // s_cbranch_scc0?
2539
33
2540
33
  // Loop back to V_READFIRSTLANE_B32 if there are still variants to cover.
2541
33
  BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ))
2542
33
    .addMBB(&LoopBB);
2543
33
2544
33
  return InsertPt->getIterator();
2545
33
}
2546
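The loop built by emitLoadM0FromVGPRLoop above can be summarized by a small standalone C++ model of one 64-lane wave. This is illustrative only and is not part of the reported source: the function name, the fixed 64-element arrays, and the use of __builtin_ctzll are assumptions of the sketch; the real expansion emits machine instructions via BuildMI rather than running this logic.

#include <array>
#include <cstdint>

// Scalar model of the waterfall loop: handle each unique index value once,
// then retire the lanes that used it, until no active lanes remain.
static void waterfallModel(const std::array<uint32_t, 64> &Idx,
                           const std::array<uint32_t, 64> &Vec,
                           std::array<uint32_t, 64> &Result) {
  uint64_t Exec = ~0ull;                          // all lanes active
  while (Exec != 0) {                             // s_cbranch_execnz
    unsigned FirstLane = __builtin_ctzll(Exec);   // v_readfirstlane_b32
    uint32_t CurIdx = Idx[FirstLane];
    uint64_t Cond = 0;                            // v_cmp_eq_u32
    for (unsigned L = 0; L != 64; ++L)
      if (((Exec >> L) & 1) && Idx[L] == CurIdx)
        Cond |= 1ull << L;
    for (unsigned L = 0; L != 64; ++L)            // indexed access with m0 = CurIdx
      if ((Cond >> L) & 1)
        Result[L] = Vec[CurIdx];
    Exec &= ~Cond;                                // s_xor_b64 exec, exec, saved
  }
}

If the index happens to be wave-uniform, Cond equals Exec on the first pass and the loop exits after one iteration; otherwise it runs at most 64 times, matching the worst case described in the comment above.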
2547
// This has slightly sub-optimal regalloc when the source vector is killed by
2548
// the read. The register allocator does not understand that the kill is
2549
// per-workitem, so the vector is kept alive for the whole loop and we end up not re-using a
2550
// subregister from it, using 1 more VGPR than necessary. This extra VGPR was saved when
2551
// this was expanded after register allocation.
2552
static MachineBasicBlock::iterator loadM0FromVGPR(const SIInstrInfo *TII,
2553
                                                  MachineBasicBlock &MBB,
2554
                                                  MachineInstr &MI,
2555
                                                  unsigned InitResultReg,
2556
                                                  unsigned PhiReg,
2557
                                                  int Offset,
2558
33
                                                  bool UseGPRIdxMode) {
2559
33
  MachineFunction *MF = MBB.getParent();
2560
33
  MachineRegisterInfo &MRI = MF->getRegInfo();
2561
33
  const DebugLoc &DL = MI.getDebugLoc();
2562
33
  MachineBasicBlock::iterator I(&MI);
2563
33
2564
33
  unsigned DstReg = MI.getOperand(0).getReg();
2565
33
  unsigned SaveExec = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
2566
33
  unsigned TmpExec = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
2567
33
2568
33
  BuildMI(MBB, I, DL, TII->get(TargetOpcode::IMPLICIT_DEF), TmpExec);
2569
33
2570
33
  // Save the EXEC mask
2571
33
  BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B64), SaveExec)
2572
33
    .addReg(AMDGPU::EXEC);
2573
33
2574
33
  // To insert the loop we need to split the block. Move everything after this
2575
33
  // point to a new block, and insert a new empty block between the two.
2576
33
  MachineBasicBlock *LoopBB = MF->CreateMachineBasicBlock();
2577
33
  MachineBasicBlock *RemainderBB = MF->CreateMachineBasicBlock();
2578
33
  MachineFunction::iterator MBBI(MBB);
2579
33
  ++MBBI;
2580
33
2581
33
  MF->insert(MBBI, LoopBB);
2582
33
  MF->insert(MBBI, RemainderBB);
2583
33
2584
33
  LoopBB->addSuccessor(LoopBB);
2585
33
  LoopBB->addSuccessor(RemainderBB);
2586
33
2587
33
  // Move the rest of the block into a new block.
2588
33
  RemainderBB->transferSuccessorsAndUpdatePHIs(&MBB);
2589
33
  RemainderBB->splice(RemainderBB->begin(), &MBB, I, MBB.end());
2590
33
2591
33
  MBB.addSuccessor(LoopBB);
2592
33
2593
33
  const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
2594
33
2595
33
  auto InsPt = emitLoadM0FromVGPRLoop(TII, MRI, MBB, *LoopBB, DL, *Idx,
2596
33
                                      InitResultReg, DstReg, PhiReg, TmpExec,
2597
33
                                      Offset, UseGPRIdxMode);
2598
33
2599
33
  MachineBasicBlock::iterator First = RemainderBB->begin();
2600
33
  BuildMI(*RemainderBB, First, DL, TII->get(AMDGPU::S_MOV_B64), AMDGPU::EXEC)
2601
33
    .addReg(SaveExec);
2602
33
2603
33
  return InsPt;
2604
33
}
2605
2606
// Returns subreg index, offset
2607
static std::pair<unsigned, int>
2608
computeIndirectRegAndOffset(const SIRegisterInfo &TRI,
2609
                            const TargetRegisterClass *SuperRC,
2610
                            unsigned VecReg,
2611
172
                            int Offset) {
2612
172
  int NumElts = TRI.getRegSizeInBits(*SuperRC) / 32;
2613
172
2614
172
  // Skip out of bounds offsets, or else we would end up using an undefined
2615
172
  // register.
2616
172
  if (
Offset >= NumElts || 172
Offset < 0168
)
2617
40
    return std::make_pair(AMDGPU::sub0, Offset);
2618
132
2619
132
  return std::make_pair(AMDGPU::sub0 + Offset, 0);
2620
132
}
2621
2622
// Return true if the index is an SGPR and was set.
2623
static bool setM0ToIndexFromSGPR(const SIInstrInfo *TII,
2624
                                 MachineRegisterInfo &MRI,
2625
                                 MachineInstr &MI,
2626
                                 int Offset,
2627
                                 bool UseGPRIdxMode,
2628
172
                                 bool IsIndirectSrc) {
2629
172
  MachineBasicBlock *MBB = MI.getParent();
2630
172
  const DebugLoc &DL = MI.getDebugLoc();
2631
172
  MachineBasicBlock::iterator I(&MI);
2632
172
2633
172
  const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
2634
172
  const TargetRegisterClass *IdxRC = MRI.getRegClass(Idx->getReg());
2635
172
2636
172
  assert(Idx->getReg() != AMDGPU::NoRegister);
2637
172
2638
172
  if (!TII->getRegisterInfo().isSGPRClass(IdxRC))
2639
33
    return false;
2640
139
2641
139
  
if (139
UseGPRIdxMode139
) {
2642
42
    unsigned IdxMode = IsIndirectSrc ?
2643
42
      
VGPRIndexMode::SRC0_ENABLE22
:
VGPRIndexMode::DST_ENABLE20
;
2644
42
    if (
Offset == 042
) {
2645
28
      MachineInstr *SetOn =
2646
28
          BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_SET_GPR_IDX_ON))
2647
28
              .add(*Idx)
2648
28
              .addImm(IdxMode);
2649
28
2650
28
      SetOn->getOperand(3).setIsUndef();
2651
42
    } else {
2652
14
      unsigned Tmp = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
2653
14
      BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_ADD_I32), Tmp)
2654
14
          .add(*Idx)
2655
14
          .addImm(Offset);
2656
14
      MachineInstr *SetOn =
2657
14
        BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_SET_GPR_IDX_ON))
2658
14
        .addReg(Tmp, RegState::Kill)
2659
14
        .addImm(IdxMode);
2660
14
2661
14
      SetOn->getOperand(3).setIsUndef();
2662
14
    }
2663
42
2664
42
    return true;
2665
42
  }
2666
97
2667
97
  
if (97
Offset == 097
) {
2668
83
    BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)
2669
83
      .add(*Idx);
2670
97
  } else {
2671
14
    BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_ADD_I32), AMDGPU::M0)
2672
14
      .add(*Idx)
2673
14
      .addImm(Offset);
2674
14
  }
2675
172
2676
172
  return true;
2677
172
}
2678
2679
// Control flow needs to be inserted if indexing with a VGPR.
2680
static MachineBasicBlock *emitIndirectSrc(MachineInstr &MI,
2681
                                          MachineBasicBlock &MBB,
2682
78
                                          const SISubtarget &ST) {
2683
78
  const SIInstrInfo *TII = ST.getInstrInfo();
2684
78
  const SIRegisterInfo &TRI = TII->getRegisterInfo();
2685
78
  MachineFunction *MF = MBB.getParent();
2686
78
  MachineRegisterInfo &MRI = MF->getRegInfo();
2687
78
2688
78
  unsigned Dst = MI.getOperand(0).getReg();
2689
78
  unsigned SrcReg = TII->getNamedOperand(MI, AMDGPU::OpName::src)->getReg();
2690
78
  int Offset = TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm();
2691
78
2692
78
  const TargetRegisterClass *VecRC = MRI.getRegClass(SrcReg);
2693
78
2694
78
  unsigned SubReg;
2695
78
  std::tie(SubReg, Offset)
2696
78
    = computeIndirectRegAndOffset(TRI, VecRC, SrcReg, Offset);
2697
78
2698
78
  bool UseGPRIdxMode = ST.useVGPRIndexMode(EnableVGPRIndexMode);
2699
78
2700
78
  if (
setM0ToIndexFromSGPR(TII, MRI, MI, Offset, UseGPRIdxMode, true)78
) {
2701
65
    MachineBasicBlock::iterator I(&MI);
2702
65
    const DebugLoc &DL = MI.getDebugLoc();
2703
65
2704
65
    if (
UseGPRIdxMode65
) {
2705
22
      // TODO: Look at the uses to avoid the copy. This may require rescheduling
2706
22
      // to avoid interfering with other uses, so probably requires a new
2707
22
      // optimization pass.
2708
22
      BuildMI(MBB, I, DL, TII->get(AMDGPU::V_MOV_B32_e32), Dst)
2709
22
        .addReg(SrcReg, RegState::Undef, SubReg)
2710
22
        .addReg(SrcReg, RegState::Implicit)
2711
22
        .addReg(AMDGPU::M0, RegState::Implicit);
2712
22
      BuildMI(MBB, I, DL, TII->get(AMDGPU::S_SET_GPR_IDX_OFF));
2713
65
    } else {
2714
43
      BuildMI(MBB, I, DL, TII->get(AMDGPU::V_MOVRELS_B32_e32), Dst)
2715
43
        .addReg(SrcReg, RegState::Undef, SubReg)
2716
43
        .addReg(SrcReg, RegState::Implicit);
2717
43
    }
2718
65
2719
65
    MI.eraseFromParent();
2720
65
2721
65
    return &MBB;
2722
65
  }
2723
13
2724
13
  const DebugLoc &DL = MI.getDebugLoc();
2725
13
  MachineBasicBlock::iterator I(&MI);
2726
13
2727
13
  unsigned PhiReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
2728
13
  unsigned InitReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
2729
13
2730
13
  BuildMI(MBB, I, DL, TII->get(TargetOpcode::IMPLICIT_DEF), InitReg);
2731
13
2732
13
  if (
UseGPRIdxMode13
) {
2733
6
    MachineInstr *SetOn = BuildMI(MBB, I, DL, TII->get(AMDGPU::S_SET_GPR_IDX_ON))
2734
6
      .addImm(0) // Reset inside loop.
2735
6
      .addImm(VGPRIndexMode::SRC0_ENABLE);
2736
6
    SetOn->getOperand(3).setIsUndef();
2737
6
2738
6
    // Disable again after the loop.
2739
6
    BuildMI(MBB, std::next(I), DL, TII->get(AMDGPU::S_SET_GPR_IDX_OFF));
2740
6
  }
2741
13
2742
13
  auto InsPt = loadM0FromVGPR(TII, MBB, MI, InitReg, PhiReg, Offset, UseGPRIdxMode);
2743
13
  MachineBasicBlock *LoopBB = InsPt->getParent();
2744
13
2745
13
  if (
UseGPRIdxMode13
) {
2746
6
    BuildMI(*LoopBB, InsPt, DL, TII->get(AMDGPU::V_MOV_B32_e32), Dst)
2747
6
      .addReg(SrcReg, RegState::Undef, SubReg)
2748
6
      .addReg(SrcReg, RegState::Implicit)
2749
6
      .addReg(AMDGPU::M0, RegState::Implicit);
2750
13
  } else {
2751
7
    BuildMI(*LoopBB, InsPt, DL, TII->get(AMDGPU::V_MOVRELS_B32_e32), Dst)
2752
7
      .addReg(SrcReg, RegState::Undef, SubReg)
2753
7
      .addReg(SrcReg, RegState::Implicit);
2754
7
  }
2755
78
2756
78
  MI.eraseFromParent();
2757
78
2758
78
  return LoopBB;
2759
78
}
2760
2761
static unsigned getMOVRELDPseudo(const SIRegisterInfo &TRI,
2762
64
                                 const TargetRegisterClass *VecRC) {
2763
64
  switch (TRI.getRegSizeInBits(*VecRC)) {
2764
0
  case 32: // 4 bytes
2765
0
    return AMDGPU::V_MOVRELD_B32_V1;
2766
6
  case 64: // 8 bytes
2767
6
    return AMDGPU::V_MOVRELD_B32_V2;
2768
44
  case 128: // 16 bytes
2769
44
    return AMDGPU::V_MOVRELD_B32_V4;
2770
10
  case 256: // 32 bytes
2771
10
    return AMDGPU::V_MOVRELD_B32_V8;
2772
4
  case 512: // 64 bytes
2773
4
    return AMDGPU::V_MOVRELD_B32_V16;
2774
0
  default:
2775
0
    llvm_unreachable("unsupported size for MOVRELD pseudos");
2776
0
  }
2777
0
}
2778
2779
static MachineBasicBlock *emitIndirectDst(MachineInstr &MI,
2780
                                          MachineBasicBlock &MBB,
2781
94
                                          const SISubtarget &ST) {
2782
94
  const SIInstrInfo *TII = ST.getInstrInfo();
2783
94
  const SIRegisterInfo &TRI = TII->getRegisterInfo();
2784
94
  MachineFunction *MF = MBB.getParent();
2785
94
  MachineRegisterInfo &MRI = MF->getRegInfo();
2786
94
2787
94
  unsigned Dst = MI.getOperand(0).getReg();
2788
94
  const MachineOperand *SrcVec = TII->getNamedOperand(MI, AMDGPU::OpName::src);
2789
94
  const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
2790
94
  const MachineOperand *Val = TII->getNamedOperand(MI, AMDGPU::OpName::val);
2791
94
  int Offset = TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm();
2792
94
  const TargetRegisterClass *VecRC = MRI.getRegClass(SrcVec->getReg());
2793
94
2794
94
  // This can be an immediate, but will be folded later.
2795
94
  assert(Val->getReg());
2796
94
2797
94
  unsigned SubReg;
2798
94
  std::tie(SubReg, Offset) = computeIndirectRegAndOffset(TRI, VecRC,
2799
94
                                                         SrcVec->getReg(),
2800
94
                                                         Offset);
2801
94
  bool UseGPRIdxMode = ST.useVGPRIndexMode(EnableVGPRIndexMode);
2802
94
2803
94
  if (
Idx->getReg() == AMDGPU::NoRegister94
) {
2804
0
    MachineBasicBlock::iterator I(&MI);
2805
0
    const DebugLoc &DL = MI.getDebugLoc();
2806
0
2807
0
    assert(Offset == 0);
2808
0
2809
0
    BuildMI(MBB, I, DL, TII->get(TargetOpcode::INSERT_SUBREG), Dst)
2810
0
        .add(*SrcVec)
2811
0
        .add(*Val)
2812
0
        .addImm(SubReg);
2813
0
2814
0
    MI.eraseFromParent();
2815
0
    return &MBB;
2816
0
  }
2817
94
2818
94
  
if (94
setM0ToIndexFromSGPR(TII, MRI, MI, Offset, UseGPRIdxMode, false)94
) {
2819
74
    MachineBasicBlock::iterator I(&MI);
2820
74
    const DebugLoc &DL = MI.getDebugLoc();
2821
74
2822
74
    if (
UseGPRIdxMode74
) {
2823
20
      BuildMI(MBB, I, DL, TII->get(AMDGPU::V_MOV_B32_indirect))
2824
20
          .addReg(SrcVec->getReg(), RegState::Undef, SubReg) // vdst
2825
20
          .add(*Val)
2826
20
          .addReg(Dst, RegState::ImplicitDefine)
2827
20
          .addReg(SrcVec->getReg(), RegState::Implicit)
2828
20
          .addReg(AMDGPU::M0, RegState::Implicit);
2829
20
2830
20
      BuildMI(MBB, I, DL, TII->get(AMDGPU::S_SET_GPR_IDX_OFF));
2831
74
    } else {
2832
54
      const MCInstrDesc &MovRelDesc = TII->get(getMOVRELDPseudo(TRI, VecRC));
2833
54
2834
54
      BuildMI(MBB, I, DL, MovRelDesc)
2835
54
          .addReg(Dst, RegState::Define)
2836
54
          .addReg(SrcVec->getReg())
2837
54
          .add(*Val)
2838
54
          .addImm(SubReg - AMDGPU::sub0);
2839
54
    }
2840
74
2841
74
    MI.eraseFromParent();
2842
74
    return &MBB;
2843
74
  }
2844
20
2845
20
  
if (20
Val->isReg()20
)
2846
20
    MRI.clearKillFlags(Val->getReg());
2847
20
2848
20
  const DebugLoc &DL = MI.getDebugLoc();
2849
20
2850
20
  if (
UseGPRIdxMode20
) {
2851
10
    MachineBasicBlock::iterator I(&MI);
2852
10
2853
10
    MachineInstr *SetOn = BuildMI(MBB, I, DL, TII->get(AMDGPU::S_SET_GPR_IDX_ON))
2854
10
      .addImm(0) // Reset inside loop.
2855
10
      .addImm(VGPRIndexMode::DST_ENABLE);
2856
10
    SetOn->getOperand(3).setIsUndef();
2857
10
2858
10
    // Disable again after the loop.
2859
10
    BuildMI(MBB, std::next(I), DL, TII->get(AMDGPU::S_SET_GPR_IDX_OFF));
2860
10
  }
2861
20
2862
20
  unsigned PhiReg = MRI.createVirtualRegister(VecRC);
2863
20
2864
20
  auto InsPt = loadM0FromVGPR(TII, MBB, MI, SrcVec->getReg(), PhiReg,
2865
20
                              Offset, UseGPRIdxMode);
2866
20
  MachineBasicBlock *LoopBB = InsPt->getParent();
2867
20
2868
20
  if (
UseGPRIdxMode20
) {
2869
10
    BuildMI(*LoopBB, InsPt, DL, TII->get(AMDGPU::V_MOV_B32_indirect))
2870
10
        .addReg(PhiReg, RegState::Undef, SubReg) // vdst
2871
10
        .add(*Val)                               // src0
2872
10
        .addReg(Dst, RegState::ImplicitDefine)
2873
10
        .addReg(PhiReg, RegState::Implicit)
2874
10
        .addReg(AMDGPU::M0, RegState::Implicit);
2875
20
  } else {
2876
10
    const MCInstrDesc &MovRelDesc = TII->get(getMOVRELDPseudo(TRI, VecRC));
2877
10
2878
10
    BuildMI(*LoopBB, InsPt, DL, MovRelDesc)
2879
10
        .addReg(Dst, RegState::Define)
2880
10
        .addReg(PhiReg)
2881
10
        .add(*Val)
2882
10
        .addImm(SubReg - AMDGPU::sub0);
2883
10
  }
2884
94
2885
94
  MI.eraseFromParent();
2886
94
2887
94
  return LoopBB;
2888
94
}
2889
2890
MachineBasicBlock *SITargetLowering::EmitInstrWithCustomInserter(
2891
9.57k
  MachineInstr &MI, MachineBasicBlock *BB) const {
2892
9.57k
2893
9.57k
  const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
2894
9.57k
  MachineFunction *MF = BB->getParent();
2895
9.57k
  SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
2896
9.57k
2897
9.57k
  if (
TII->isMIMG(MI)9.57k
) {
2898
314
      if (!MI.memoperands_empty())
2899
0
        return BB;
2900
314
    // Add a memoperand for mimg instructions so that they aren't assumed to
2901
314
    // be ordered memory instructions.
2902
314
2903
314
    MachinePointerInfo PtrInfo(MFI->getImagePSV());
2904
314
    MachineMemOperand::Flags Flags = MachineMemOperand::MODereferenceable;
2905
314
    if (MI.mayStore())
2906
56
      Flags |= MachineMemOperand::MOStore;
2907
314
2908
314
    if (MI.mayLoad())
2909
314
      Flags |= MachineMemOperand::MOLoad;
2910
314
2911
314
    auto MMO = MF->getMachineMemOperand(PtrInfo, Flags, 0, 0);
2912
314
    MI.addMemOperand(*MF, MMO);
2913
314
    return BB;
2914
314
  }
2915
9.25k
2916
9.25k
  switch (MI.getOpcode()) {
2917
7.55k
  case AMDGPU::SI_INIT_M0:
2918
7.55k
    BuildMI(*BB, MI.getIterator(), MI.getDebugLoc(),
2919
7.55k
            TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)
2920
7.55k
        .add(MI.getOperand(0));
2921
7.55k
    MI.eraseFromParent();
2922
7.55k
    return BB;
2923
9.25k
2924
2
  case AMDGPU::SI_INIT_EXEC:
2925
2
    // This should be before all vector instructions.
2926
2
    BuildMI(*BB, &*BB->begin(), MI.getDebugLoc(), TII->get(AMDGPU::S_MOV_B64),
2927
2
            AMDGPU::EXEC)
2928
2
        .addImm(MI.getOperand(0).getImm());
2929
2
    MI.eraseFromParent();
2930
2
    return BB;
2931
9.25k
2932
4
  case AMDGPU::SI_INIT_EXEC_FROM_INPUT: {
2933
4
    // Extract the thread count from an SGPR input and set EXEC accordingly.
2934
4
    // Since BFM can't shift by 64, handle that case with CMP + CMOV.
2935
4
    //
2936
4
    // S_BFE_U32 count, input, {shift, 7}
2937
4
    // S_BFM_B64 exec, count, 0
2938
4
    // S_CMP_EQ_U32 count, 64
2939
4
    // S_CMOV_B64 exec, -1
2940
4
    MachineInstr *FirstMI = &*BB->begin();
2941
4
    MachineRegisterInfo &MRI = MF->getRegInfo();
2942
4
    unsigned InputReg = MI.getOperand(0).getReg();
2943
4
    unsigned CountReg = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
2944
4
    bool Found = false;
2945
4
2946
4
    // Move the COPY of the input reg to the beginning, so that we can use it.
2947
10
    for (auto I = BB->begin(); 
I != &MI10
;
I++6
) {
2948
10
      if (I->getOpcode() != TargetOpcode::COPY ||
2949
10
          I->getOperand(0).getReg() != InputReg)
2950
6
        continue;
2951
4
2952
4
      
if (4
I == FirstMI4
) {
2953
0
        FirstMI = &*++BB->begin();
2954
4
      } else {
2955
4
        I->removeFromParent();
2956
4
        BB->insert(FirstMI, &*I);
2957
4
      }
2958
10
      Found = true;
2959
10
      break;
2960
10
    }
2961
4
    assert(Found);
2962
4
    (void)Found;
2963
4
2964
4
    // This should be before all vector instructions.
2965
4
    BuildMI(*BB, FirstMI, DebugLoc(), TII->get(AMDGPU::S_BFE_U32), CountReg)
2966
4
        .addReg(InputReg)
2967
4
        .addImm((MI.getOperand(1).getImm() & 0x7f) | 0x70000);
2968
4
    BuildMI(*BB, FirstMI, DebugLoc(), TII->get(AMDGPU::S_BFM_B64),
2969
4
            AMDGPU::EXEC)
2970
4
        .addReg(CountReg)
2971
4
        .addImm(0);
2972
4
    BuildMI(*BB, FirstMI, DebugLoc(), TII->get(AMDGPU::S_CMP_EQ_U32))
2973
4
        .addReg(CountReg, RegState::Kill)
2974
4
        .addImm(64);
2975
4
    BuildMI(*BB, FirstMI, DebugLoc(), TII->get(AMDGPU::S_CMOV_B64),
2976
4
            AMDGPU::EXEC)
2977
4
        .addImm(-1);
2978
4
    MI.eraseFromParent();
2979
4
    return BB;
2980
9.25k
  }
2981
9.25k
2982
61
  case AMDGPU::GET_GROUPSTATICSIZE: {
2983
61
    DebugLoc DL = MI.getDebugLoc();
2984
61
    BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_MOV_B32))
2985
61
        .add(MI.getOperand(0))
2986
61
        .addImm(MFI->getLDSSize());
2987
61
    MI.eraseFromParent();
2988
61
    return BB;
2989
9.25k
  }
2990
78
  case AMDGPU::SI_INDIRECT_SRC_V1:
2991
78
  case AMDGPU::SI_INDIRECT_SRC_V2:
2992
78
  case AMDGPU::SI_INDIRECT_SRC_V4:
2993
78
  case AMDGPU::SI_INDIRECT_SRC_V8:
2994
78
  case AMDGPU::SI_INDIRECT_SRC_V16:
2995
78
    return emitIndirectSrc(MI, *BB, *getSubtarget());
2996
94
  case AMDGPU::SI_INDIRECT_DST_V1:
2997
94
  case AMDGPU::SI_INDIRECT_DST_V2:
2998
94
  case AMDGPU::SI_INDIRECT_DST_V4:
2999
94
  case AMDGPU::SI_INDIRECT_DST_V8:
3000
94
  case AMDGPU::SI_INDIRECT_DST_V16:
3001
94
    return emitIndirectDst(MI, *BB, *getSubtarget());
3002
33
  case AMDGPU::SI_KILL:
3003
33
    return splitKillBlock(MI, BB);
3004
49
  case AMDGPU::V_CNDMASK_B64_PSEUDO: {
3005
49
    MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
3006
49
3007
49
    unsigned Dst = MI.getOperand(0).getReg();
3008
49
    unsigned Src0 = MI.getOperand(1).getReg();
3009
49
    unsigned Src1 = MI.getOperand(2).getReg();
3010
49
    const DebugLoc &DL = MI.getDebugLoc();
3011
49
    unsigned SrcCond = MI.getOperand(3).getReg();
3012
49
3013
49
    unsigned DstLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
3014
49
    unsigned DstHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
3015
49
3016
49
    BuildMI(*BB, MI, DL, TII->get(AMDGPU::V_CNDMASK_B32_e64), DstLo)
3017
49
      .addReg(Src0, 0, AMDGPU::sub0)
3018
49
      .addReg(Src1, 0, AMDGPU::sub0)
3019
49
      .addReg(SrcCond);
3020
49
    BuildMI(*BB, MI, DL, TII->get(AMDGPU::V_CNDMASK_B32_e64), DstHi)
3021
49
      .addReg(Src0, 0, AMDGPU::sub1)
3022
49
      .addReg(Src1, 0, AMDGPU::sub1)
3023
49
      .addReg(SrcCond);
3024
49
3025
49
    BuildMI(*BB, MI, DL, TII->get(AMDGPU::REG_SEQUENCE), Dst)
3026
49
      .addReg(DstLo)
3027
49
      .addImm(AMDGPU::sub0)
3028
49
      .addReg(DstHi)
3029
49
      .addImm(AMDGPU::sub1);
3030
49
    MI.eraseFromParent();
3031
49
    return BB;
3032
94
  }
3033
76
  case AMDGPU::SI_BR_UNDEF: {
3034
76
    const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
3035
76
    const DebugLoc &DL = MI.getDebugLoc();
3036
76
    MachineInstr *Br = BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_CBRANCH_SCC1))
3037
76
                           .add(MI.getOperand(0));
3038
76
    Br->getOperand(1).setIsUndef(true); // read undef SCC
3039
76
    MI.eraseFromParent();
3040
76
    return BB;
3041
94
  }
3042
850
  case AMDGPU::ADJCALLSTACKUP:
3043
850
  case AMDGPU::ADJCALLSTACKDOWN: {
3044
850
    const SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();
3045
850
    MachineInstrBuilder MIB(*MF, &MI);
3046
850
    MIB.addReg(Info->getStackPtrOffsetReg(), RegState::ImplicitDefine)
3047
850
        .addReg(Info->getStackPtrOffsetReg(), RegState::Implicit);
3048
850
    return BB;
3049
850
  }
3050
457
  case AMDGPU::SI_CALL_ISEL:
3051
457
  case AMDGPU::SI_TCRETURN_ISEL: {
3052
457
    const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
3053
457
    const DebugLoc &DL = MI.getDebugLoc();
3054
457
    unsigned ReturnAddrReg = TII->getRegisterInfo().getReturnAddressReg(*MF);
3055
457
3056
457
    MachineRegisterInfo &MRI = MF->getRegInfo();
3057
457
    unsigned GlobalAddrReg = MI.getOperand(0).getReg();
3058
457
    MachineInstr *PCRel = MRI.getVRegDef(GlobalAddrReg);
3059
457
    assert(PCRel->getOpcode() == AMDGPU::SI_PC_ADD_REL_OFFSET);
3060
457
3061
457
    const GlobalValue *G = PCRel->getOperand(1).getGlobal();
3062
457
3063
457
    MachineInstrBuilder MIB;
3064
457
    if (
MI.getOpcode() == AMDGPU::SI_CALL_ISEL457
) {
3065
425
      MIB = BuildMI(*BB, MI, DL, TII->get(AMDGPU::SI_CALL), ReturnAddrReg)
3066
425
        .add(MI.getOperand(0))
3067
425
        .addGlobalAddress(G);
3068
457
    } else {
3069
32
      MIB = BuildMI(*BB, MI, DL, TII->get(AMDGPU::SI_TCRETURN))
3070
32
        .add(MI.getOperand(0))
3071
32
        .addGlobalAddress(G);
3072
32
3073
32
      // There is an additional imm operand for tcreturn, but it should be in the
3074
32
      // right place already.
3075
32
    }
3076
457
3077
3.03k
    for (unsigned I = 1, E = MI.getNumOperands(); 
I != E3.03k
;
++I2.57k
)
3078
2.57k
      MIB.add(MI.getOperand(I));
3079
457
3080
457
    MIB.setMemRefs(MI.memoperands_begin(), MI.memoperands_end());
3081
457
    MI.eraseFromParent();
3082
457
    return BB;
3083
457
  }
3084
0
  default:
3085
0
    return AMDGPUTargetLowering::EmitInstrWithCustomInserter(MI, BB);
3086
0
  }
3087
0
}
3088
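Two of the custom expansions above lend themselves to small scalar models; both sketches below are illustrative only (the helper names and the modelling of S_BFM_B64 outside its representable range are assumptions, not reported source). First, the SI_INIT_EXEC_FROM_INPUT case: the S_BFE_U32 / S_BFM_B64 / S_CMP_EQ_U32 / S_CMOV_B64 sequence enables the low "count" lanes, with count == 64 patched separately because the bitfield mask cannot cover all 64 lanes.

#include <cstdint>

static uint64_t initExecFromInput(uint32_t Input, uint32_t Shift) {
  uint32_t Count = (Input >> Shift) & 0x7f;   // S_BFE_U32 count, input, {shift, 7}
  // S_BFM_B64 exec, count, 0: low Count bits set; it cannot produce the
  // all-64-lanes mask, modeled here as yielding an empty mask instead.
  uint64_t Exec = Count < 64 ? (1ull << Count) - 1 : 0;
  if (Count == 64)                            // S_CMP_EQ_U32 count, 64
    Exec = ~0ull;                             // S_CMOV_B64 exec, -1
  return Exec;
}

Second, the V_CNDMASK_B64_PSEUDO case splits a 64-bit per-lane select into two V_CNDMASK_B32_e64 selects and reassembles the halves with REG_SEQUENCE; per lane this amounts to:

static uint64_t cndmaskB64(uint64_t Src0, uint64_t Src1, bool LaneCond) {
  uint32_t Lo = LaneCond ? (uint32_t)Src1 : (uint32_t)Src0;                  // sub0
  uint32_t Hi = LaneCond ? (uint32_t)(Src1 >> 32) : (uint32_t)(Src0 >> 32);  // sub1
  return ((uint64_t)Hi << 32) | Lo;                                          // REG_SEQUENCE
}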
3089
3.21k
bool SITargetLowering::enableAggressiveFMAFusion(EVT VT) const {
3090
3.21k
  // This currently forces unfolding various combinations of fsub into fma with
3091
3.21k
  // free fneg'd operands. As long as we have fast FMA (controlled by
3092
3.21k
  // isFMAFasterThanFMulAndFAdd), we should perform these.
3093
3.21k
3094
3.21k
  // When fma is quarter rate, for f64 where add / sub are at best half rate,
3095
3.21k
  // most of these combines appear to be cycle neutral but save on instruction
3096
3.21k
  // count / code size.
3097
3.21k
  return true;
3098
3.21k
}
3099
3100
EVT SITargetLowering::getSetCCResultType(const DataLayout &DL, LLVMContext &Ctx,
3101
18.2k
                                         EVT VT) const {
3102
18.2k
  if (
!VT.isVector()18.2k
) {
3103
18.1k
    return MVT::i1;
3104
18.1k
  }
3105
64
  return EVT::getVectorVT(Ctx, MVT::i1, VT.getVectorNumElements());
3106
64
}
3107
3108
86.8k
MVT SITargetLowering::getScalarShiftAmountTy(const DataLayout &, EVT VT) const {
3109
86.8k
  // TODO: Should i16 be used always if legal? For now it would force VALU
3110
86.8k
  // shifts.
3111
86.8k
  return (VT == MVT::i16) ? 
MVT::i163.97k
:
MVT::i3282.8k
;
3112
86.8k
}
3113
3114
// Answering this is somewhat tricky and depends on the specific device, since devices
3115
// have different rates for fma or all f64 operations.
3116
//
3117
// v_fma_f64 and v_mul_f64 always take the same number of cycles as each other
3118
// regardless of which device (although the number of cycles differs between
3119
// devices), so it is always profitable for f64.
3120
//
3121
// v_fma_f32 takes 4 or 16 cycles depending on the device, so it is profitable
3122
// only on full rate devices. Normally, we should prefer selecting v_mad_f32
3123
// which we can always do even without fused FP ops since it returns the same
3124
// result as the separate operations and since it is always full
3125
// rate. Therefore, we lie and report that it is not faster for f32. v_mad_f32
3126
// however does not support denormals, so we do report fma as faster if we have
3127
// a fast fma device and require denormals.
3128
//
3129
9.35k
bool SITargetLowering::isFMAFasterThanFMulAndFAdd(EVT VT) const {
3130
9.35k
  VT = VT.getScalarType();
3131
9.35k
3132
9.35k
  switch (VT.getSimpleVT().SimpleTy) {
3133
6.88k
  case MVT::f32:
3134
6.88k
    // This is as fast on some subtargets. However, we always have full rate f32
3135
6.88k
    // mad available which returns the same result as the separate operations
3136
6.88k
    // which we should prefer over fma. We can't use this if we want to support
3137
6.88k
    // denormals, so only report this in these cases.
3138
427
    return Subtarget->hasFP32Denormals() && Subtarget->hasFastFMAF32();
3139
870
  case MVT::f64:
3140
870
    return true;
3141
1.60k
  case MVT::f16:
3142
1.32k
    return Subtarget->has16BitInsts() && Subtarget->hasFP16Denormals();
3143
0
  default:
3144
0
    break;
3145
0
  }
3146
0
3147
0
  return false;
3148
0
}
3149
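A caller-side sketch of how a hook like the one above is typically consulted when deciding whether to contract fmul+fadd into fma. This is an illustration under assumptions, not a transcription of the in-tree combine: the helper name and the exact set of legality checks are hypothetical.

#include "llvm/Target/TargetLowering.h"

using namespace llvm;

// Fuse a*b+c only when contraction is allowed, the target reports FMA as
// faster than the separate operations for this type, and FMA is selectable.
static bool shouldFormFMA(const TargetLoweringBase &TLI, EVT VT,
                          bool ContractAllowed) {
  return ContractAllowed &&
         TLI.isFMAFasterThanFMulAndFAdd(VT) &&
         TLI.isOperationLegalOrCustom(ISD::FMA, VT);
}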
3150
//===----------------------------------------------------------------------===//
3151
// Custom DAG Lowering Operations
3152
//===----------------------------------------------------------------------===//
3153
3154
169k
SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
3155
169k
  switch (Op.getOpcode()) {
3156
16.8k
  default: return AMDGPUTargetLowering::LowerOperation(Op, DAG);
3157
1.50k
  case ISD::BRCOND: return LowerBRCOND(Op, DAG);
3158
75.2k
  case ISD::LOAD: {
3159
75.2k
    SDValue Result = LowerLOAD(Op, DAG);
3160
75.2k
    assert((!Result.getNode() ||
3161
75.2k
            Result.getNode()->getNumValues() == 2) &&
3162
75.2k
           "Load should return a value and a chain");
3163
75.2k
    return Result;
3164
169k
  }
3165
169k
3166
51
  case ISD::FSIN:
3167
51
  case ISD::FCOS:
3168
51
    return LowerTrig(Op, DAG);
3169
1.70k
  case ISD::SELECT: return LowerSELECT(Op, DAG);
3170
264
  case ISD::FDIV: return LowerFDIV(Op, DAG);
3171
251
  case ISD::ATOMIC_CMP_SWAP: return LowerATOMIC_CMP_SWAP(Op, DAG);
3172
62.8k
  case ISD::STORE: return LowerSTORE(Op, DAG);
3173
766
  case ISD::GlobalAddress: {
3174
766
    MachineFunction &MF = DAG.getMachineFunction();
3175
766
    SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
3176
766
    return LowerGlobalAddress(MFI, Op, DAG);
3177
51
  }
3178
5.60k
  case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG);
3179
1.23k
  case ISD::INTRINSIC_W_CHAIN: return LowerINTRINSIC_W_CHAIN(Op, DAG);
3180
1.91k
  case ISD::INTRINSIC_VOID: return LowerINTRINSIC_VOID(Op, DAG);
3181
45
  case ISD::ADDRSPACECAST: return lowerADDRSPACECAST(Op, DAG);
3182
32
  case ISD::INSERT_VECTOR_ELT:
3183
32
    return lowerINSERT_VECTOR_ELT(Op, DAG);
3184
586
  case ISD::EXTRACT_VECTOR_ELT:
3185
586
    return lowerEXTRACT_VECTOR_ELT(Op, DAG);
3186
477
  case ISD::FP_ROUND:
3187
477
    return lowerFP_ROUND(Op, DAG);
3188
51
3189
36
  case ISD::TRAP:
3190
36
  case ISD::DEBUGTRAP:
3191
36
    return lowerTRAP(Op, DAG);
3192
0
  }
3193
0
  return SDValue();
3194
0
}
3195
3196
void SITargetLowering::ReplaceNodeResults(SDNode *N,
3197
                                          SmallVectorImpl<SDValue> &Results,
3198
167
                                          SelectionDAG &DAG) const {
3199
167
  switch (N->getOpcode()) {
3200
62
  case ISD::INSERT_VECTOR_ELT: {
3201
62
    if (SDValue Res = lowerINSERT_VECTOR_ELT(SDValue(N, 0), DAG))
3202
10
      Results.push_back(Res);
3203
62
    return;
3204
167
  }
3205
0
  case ISD::EXTRACT_VECTOR_ELT: {
3206
0
    if (SDValue Res = lowerEXTRACT_VECTOR_ELT(SDValue(N, 0), DAG))
3207
0
      Results.push_back(Res);
3208
0
    return;
3209
167
  }
3210
54
  case ISD::INTRINSIC_WO_CHAIN: {
3211
54
    unsigned IID = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue();
3212
54
    if (
IID == Intrinsic::amdgcn_cvt_pkrtz54
) {
3213
54
      SDValue Src0 = N->getOperand(1);
3214
54
      SDValue Src1 = N->getOperand(2);
3215
54
      SDLoc SL(N);
3216
54
      SDValue Cvt = DAG.getNode(AMDGPUISD::CVT_PKRTZ_F16_F32, SL, MVT::i32,
3217
54
                                Src0, Src1);
3218
54
      Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2f16, Cvt));
3219
54
      return;
3220
54
    }
3221
0
    break;
3222
0
  }
3223
40
  case ISD::SELECT: {
3224
40
    SDLoc SL(N);
3225
40
    EVT VT = N->getValueType(0);
3226
40
    EVT NewVT = getEquivalentMemType(*DAG.getContext(), VT);
3227
40
    SDValue LHS = DAG.getNode(ISD::BITCAST, SL, NewVT, N->getOperand(1));
3228
40
    SDValue RHS = DAG.getNode(ISD::BITCAST, SL, NewVT, N->getOperand(2));
3229
40
3230
40
    EVT SelectVT = NewVT;
3231
40
    if (
NewVT.bitsLT(MVT::i32)40
) {
3232
2
      LHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, LHS);
3233
2
      RHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, RHS);
3234
2
      SelectVT = MVT::i32;
3235
2
    }
3236
40
3237
40
    SDValue NewSelect = DAG.getNode(ISD::SELECT, SL, SelectVT,
3238
40
                                    N->getOperand(0), LHS, RHS);
3239
40
3240
40
    if (NewVT != SelectVT)
3241
2
      NewSelect = DAG.getNode(ISD::TRUNCATE, SL, NewVT, NewSelect);
3242
40
    Results.push_back(DAG.getNode(ISD::BITCAST, SL, VT, NewSelect));
3243
40
    return;
3244
0
  }
3245
11
  default:
3246
11
    break;
3247
11
  }
3248
11
}
3249
3250
/// \brief Helper function for LowerBRCOND
3251
645
static SDNode *findUser(SDValue Value, unsigned Opcode) {
3252
645
3253
645
  SDNode *Parent = Value.getNode();
3254
645
  for (SDNode::use_iterator I = Parent->use_begin(), E = Parent->use_end();
3255
1.38k
       
I != E1.38k
;
++I739
) {
3256
1.38k
3257
1.38k
    if (I.getUse().get() != Value)
3258
739
      continue;
3259
642
3260
642
    
if (642
I->getOpcode() == Opcode642
)
3261
642
      return *I;
3262
1.38k
  }
3263
3
  return nullptr;
3264
645
}
3265
3266
1.50k
unsigned SITargetLowering::isCFIntrinsic(const SDNode *Intr) const {
3267
1.50k
  if (
Intr->getOpcode() == ISD::INTRINSIC_W_CHAIN1.50k
) {
3268
414
    switch (cast<ConstantSDNode>(Intr->getOperand(1))->getZExtValue()) {
3269
323
    case Intrinsic::amdgcn_if:
3270
323
      return AMDGPUISD::IF;
3271
45
    case Intrinsic::amdgcn_else:
3272
45
      return AMDGPUISD::ELSE;
3273
44
    case Intrinsic::amdgcn_loop:
3274
44
      return AMDGPUISD::LOOP;
3275
0
    case Intrinsic::amdgcn_end_cf:
3276
0
      llvm_unreachable("should not occur");
3277
2
    default:
3278
2
      return 0;
3279
1.08k
    }
3280
1.08k
  }
3281
1.08k
3282
1.08k
  // break, if_break, else_break are all only used as inputs to loop, not
3283
1.08k
  // directly as branch conditions.
3284
1.08k
  return 0;
3285
1.08k
}
3286
3287
void SITargetLowering::createDebuggerPrologueStackObjects(
3288
4
    MachineFunction &MF) const {
3289
4
  // Create stack objects that are used for emitting debugger prologue.
3290
4
  //
3291
4
  // Debugger prologue writes work group IDs and work item IDs to scratch memory
3292
4
  // at a fixed location in the following format:
3293
4
  //   offset 0:  work group ID x
3294
4
  //   offset 4:  work group ID y
3295
4
  //   offset 8:  work group ID z
3296
4
  //   offset 16: work item ID x
3297
4
  //   offset 20: work item ID y
3298
4
  //   offset 24: work item ID z
3299
4
  SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
3300
4
  int ObjectIdx = 0;
3301
4
3302
4
  // For each dimension:
3303
16
  for (unsigned i = 0; 
i < 316
;
++i12
) {
3304
12
    // Create fixed stack object for work group ID.
3305
12
    ObjectIdx = MF.getFrameInfo().CreateFixedObject(4, i * 4, true);
3306
12
    Info->setDebuggerWorkGroupIDStackObjectIndex(i, ObjectIdx);
3307
12
    // Create fixed stack object for work item ID.
3308
12
    ObjectIdx = MF.getFrameInfo().CreateFixedObject(4, i * 4 + 16, true);
3309
12
    Info->setDebuggerWorkItemIDStackObjectIndex(i, ObjectIdx);
3310
12
  }
3311
4
}
3312
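The fixed scratch layout documented in the comment above can be pictured as a plain struct. This is illustrative only; the struct and field names, and the padding word at offset 12, are assumptions of the sketch rather than declarations from the reported source.

#include <cstddef>
#include <cstdint>

struct DebuggerPrologueScratch {
  uint32_t WorkGroupIDX;  // offset 0
  uint32_t WorkGroupIDY;  // offset 4
  uint32_t WorkGroupIDZ;  // offset 8
  uint32_t Unused;        // offset 12
  uint32_t WorkItemIDX;   // offset 16
  uint32_t WorkItemIDY;   // offset 20
  uint32_t WorkItemIDZ;   // offset 24
};
static_assert(offsetof(DebuggerPrologueScratch, WorkItemIDX) == 16,
              "work item IDs start at offset 16, as the fixed objects above assume");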
3313
1.05k
bool SITargetLowering::shouldEmitFixup(const GlobalValue *GV) const {
3314
1.05k
  const Triple &TT = getTargetMachine().getTargetTriple();
3315
1.05k
  return GV->getType()->getAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS &&
3316
79
         AMDGPU::shouldEmitConstantsToTextSection(TT);
3317
1.05k
}
3318
3319
546
bool SITargetLowering::shouldEmitGOTReloc(const GlobalValue *GV) const {
3320
546
  return (GV->getType()->getAddressSpace() == AMDGPUASI.GLOBAL_ADDRESS ||
3321
496
              GV->getType()->getAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS) &&
3322
104
         !shouldEmitFixup(GV) &&
3323
61
         !getTargetMachine().shouldAssumeDSOLocal(*GV->getParent(), GV);
3324
546
}
3325
3326
467
bool SITargetLowering::shouldEmitPCReloc(const GlobalValue *GV) const {
3327
467
  return !shouldEmitFixup(GV) && !shouldEmitGOTReloc(GV);
3328
467
}
3329
3330
/// This transforms the control flow intrinsics to get the branch destination as
3331
/// last parameter, also switches branch target with BR if the need arises
3332
SDValue SITargetLowering::LowerBRCOND(SDValue BRCOND,
3333
1.50k
                                      SelectionDAG &DAG) const {
3334
1.50k
  SDLoc DL(BRCOND);
3335
1.50k
3336
1.50k
  SDNode *Intr = BRCOND.getOperand(1).getNode();
3337
1.50k
  SDValue Target = BRCOND.getOperand(2);
3338
1.50k
  SDNode *BR = nullptr;
3339
1.50k
  SDNode *SetCC = nullptr;
3340
1.50k
3341
1.50k
  if (
Intr->getOpcode() == ISD::SETCC1.50k
) {
3342
1.22k
    // As long as we negate the condition everything is fine
3343
1.22k
    SetCC = Intr;
3344
1.22k
    Intr = SetCC->getOperand(0).getNode();
3345
1.22k
3346
1.50k
  } else {
3347
277
    // Get the target from BR if we don't negate the condition
3348
277
    BR = findUser(BRCOND, ISD::BR);
3349
277
    Target = BR->getOperand(1);
3350
277
  }
3351
1.50k
3352
1.50k
  // FIXME: This changes the types of the intrinsics instead of introducing new
3353
1.50k
  // nodes with the correct types.
3354
1.50k
  // e.g. llvm.amdgcn.loop
3355
1.50k
3356
1.50k
  // eg: i1,ch = llvm.amdgcn.loop t0, TargetConstant:i32<6271>, t3
3357
1.50k
  // =>     t9: ch = llvm.amdgcn.loop t0, TargetConstant:i32<6271>, t3, BasicBlock:ch<bb1 0x7fee5286d088>
3358
1.50k
3359
1.50k
  unsigned CFNode = isCFIntrinsic(Intr);
3360
1.50k
  if (
CFNode == 01.50k
) {
3361
1.08k
    // This is a uniform branch so we don't need to legalize.
3362
1.08k
    return BRCOND;
3363
1.08k
  }
3364
412
3365
412
  bool HaveChain = Intr->getOpcode() == ISD::INTRINSIC_VOID ||
3366
412
                   Intr->getOpcode() == ISD::INTRINSIC_W_CHAIN;
3367
412
3368
412
  assert(!SetCC ||
3369
412
        (SetCC->getConstantOperandVal(1) == 1 &&
3370
412
         cast<CondCodeSDNode>(SetCC->getOperand(2).getNode())->get() ==
3371
412
                                                             ISD::SETNE));
3372
412
3373
412
  // operands of the new intrinsic call
3374
412
  SmallVector<SDValue, 4> Ops;
3375
412
  if (HaveChain)
3376
412
    Ops.push_back(BRCOND.getOperand(0));
3377
412
3378
412
  Ops.append(Intr->op_begin() + (HaveChain ?  
2412
:
10
), Intr->op_end());
3379
412
  Ops.push_back(Target);
3380
412
3381
412
  ArrayRef<EVT> Res(Intr->value_begin() + 1, Intr->value_end());
3382
412
3383
412
  // build the new intrinsic call
3384
412
  SDNode *Result = DAG.getNode(CFNode, DL, DAG.getVTList(Res), Ops).getNode();
3385
412
3386
412
  if (
!HaveChain412
) {
3387
0
    SDValue Ops[] =  {
3388
0
      SDValue(Result, 0),
3389
0
      BRCOND.getOperand(0)
3390
0
    };
3391
0
3392
0
    Result = DAG.getMergeValues(Ops, DL).getNode();
3393
0
  }
3394
412
3395
412
  if (
BR412
) {
3396
81
    // Give the branch instruction our target
3397
81
    SDValue Ops[] = {
3398
81
      BR->getOperand(0),
3399
81
      BRCOND.getOperand(2)
3400
81
    };
3401
81
    SDValue NewBR = DAG.getNode(ISD::BR, DL, BR->getVTList(), Ops);
3402
81
    DAG.ReplaceAllUsesWith(BR, NewBR.getNode());
3403
81
    BR = NewBR.getNode();
3404
81
  }
3405
412
3406
412
  SDValue Chain = SDValue(Result, Result->getNumValues() - 1);
3407
412
3408
412
  // Copy the intrinsic results to registers
3409
780
  for (unsigned i = 1, e = Intr->getNumValues() - 1; 
i != e780
;
++i368
) {
3410
368
    SDNode *CopyToReg = findUser(SDValue(Intr, i), ISD::CopyToReg);
3411
368
    if (!CopyToReg)
3412
3
      continue;
3413
365
3414
365
    Chain = DAG.getCopyToReg(
3415
365
      Chain, DL,
3416
365
      CopyToReg->getOperand(1),
3417
365
      SDValue(Result, i - 1),
3418
365
      SDValue());
3419
365
3420
365
    DAG.ReplaceAllUsesWith(SDValue(CopyToReg, 0), CopyToReg->getOperand(0));
3421
365
  }
3422
1.50k
3423
1.50k
  // Remove the old intrinsic from the chain
3424
1.50k
  DAG.ReplaceAllUsesOfValueWith(
3425
1.50k
    SDValue(Intr, Intr->getNumValues() - 1),
3426
1.50k
    Intr->getOperand(0));
3427
1.50k
3428
1.50k
  return Chain;
3429
1.50k
}
3430
3431
SDValue SITargetLowering::getFPExtOrFPTrunc(SelectionDAG &DAG,
3432
                                            SDValue Op,
3433
                                            const SDLoc &DL,
3434
2.42k
                                            EVT VT) const {
3435
2.42k
  return Op.getValueType().bitsLE(VT) ?
3436
2.41k
      DAG.getNode(ISD::FP_EXTEND, DL, VT, Op) :
3437
12
      DAG.getNode(ISD::FTRUNC, DL, VT, Op);
3438
2.42k
}
3439
3440
477
SDValue SITargetLowering::lowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const {
3441
477
  assert(Op.getValueType() == MVT::f16 &&
3442
477
         "Do not know how to custom lower FP_ROUND for non-f16 type");
3443
477
3444
477
  SDValue Src = Op.getOperand(0);
3445
477
  EVT SrcVT = Src.getValueType();
3446
477
  if (SrcVT != MVT::f64)
3447
467
    return Op;
3448
10
3449
10
  SDLoc DL(Op);
3450
10
3451
10
  SDValue FpToFp16 = DAG.getNode(ISD::FP_TO_FP16, DL, MVT::i32, Src);
3452
10
  SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, FpToFp16);
3453
10
  return DAG.getNode(ISD::BITCAST, DL, MVT::f16, Trunc);
3454
10
}
3455
3456
36
SDValue SITargetLowering::lowerTRAP(SDValue Op, SelectionDAG &DAG) const {
3457
36
  SDLoc SL(Op);
3458
36
  MachineFunction &MF = DAG.getMachineFunction();
3459
36
  SDValue Chain = Op.getOperand(0);
3460
36
3461
36
  unsigned TrapID = Op.getOpcode() == ISD::DEBUGTRAP ?
3462
36
    
SISubtarget::TrapIDLLVMDebugTrap9
:
SISubtarget::TrapIDLLVMTrap27
;
3463
36
3464
36
  if (Subtarget->getTrapHandlerAbi() == SISubtarget::TrapHandlerAbiHsa &&
3465
36
      
Subtarget->isTrapHandlerEnabled()16
) {
3466
8
    SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
3467
8
    unsigned UserSGPR = Info->getQueuePtrUserSGPR();
3468
8
    assert(UserSGPR != AMDGPU::NoRegister);
3469
8
3470
8
    SDValue QueuePtr = CreateLiveInRegister(
3471
8
      DAG, &AMDGPU::SReg_64RegClass, UserSGPR, MVT::i64);
3472
8
3473
8
    SDValue SGPR01 = DAG.getRegister(AMDGPU::SGPR0_SGPR1, MVT::i64);
3474
8
3475
8
    SDValue ToReg = DAG.getCopyToReg(Chain, SL, SGPR01,
3476
8
                                     QueuePtr, SDValue());
3477
8
3478
8
    SDValue Ops[] = {
3479
8
      ToReg,
3480
8
      DAG.getTargetConstant(TrapID, SL, MVT::i16),
3481
8
      SGPR01,
3482
8
      ToReg.getValue(1)
3483
8
    };
3484
8
3485
8
    return DAG.getNode(AMDGPUISD::TRAP, SL, MVT::Other, Ops);
3486
8
  }
3487
28
3488
28
  switch (TrapID) {
3489
21
  case SISubtarget::TrapIDLLVMTrap:
3490
21
    return DAG.getNode(AMDGPUISD::ENDPGM, SL, MVT::Other, Chain);
3491
7
  case SISubtarget::TrapIDLLVMDebugTrap: {
3492
7
    DiagnosticInfoUnsupported NoTrap(*MF.getFunction(),
3493
7
                                     "debugtrap handler not supported",
3494
7
                                     Op.getDebugLoc(),
3495
7
                                     DS_Warning);
3496
7
    LLVMContext &Ctx = MF.getFunction()->getContext();
3497
7
    Ctx.diagnose(NoTrap);
3498
7
    return Chain;
3499
28
  }
3500
0
  default:
3501
0
    llvm_unreachable("unsupported trap handler type!");
3502
0
  }
3503
0
3504
0
  return Chain;
3505
0
}
3506
3507
SDValue SITargetLowering::getSegmentAperture(unsigned AS, const SDLoc &DL,
3508
32
                                             SelectionDAG &DAG) const {
3509
32
  // FIXME: Use inline constants (src_{shared, private}_base) instead.
3510
32
  if (
Subtarget->hasApertureRegs()32
) {
3511
12
    unsigned Offset = AS == AMDGPUASI.LOCAL_ADDRESS ?
3512
5
        AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE :
3513
7
        AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE;
3514
12
    unsigned WidthM1 = AS == AMDGPUASI.LOCAL_ADDRESS ?
3515
5
        AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE :
3516
7
        AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE;
3517
12
    unsigned Encoding =
3518
12
        AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ |
3519
12
        Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ |
3520
12
        WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_;
3521
12
3522
12
    SDValue EncodingImm = DAG.getTargetConstant(Encoding, DL, MVT::i16);
3523
12
    SDValue ApertureReg = SDValue(
3524
12
        DAG.getMachineNode(AMDGPU::S_GETREG_B32, DL, MVT::i32, EncodingImm), 0);
3525
12
    SDValue ShiftAmount = DAG.getTargetConstant(WidthM1 + 1, DL, MVT::i32);
3526
12
    return DAG.getNode(ISD::SHL, DL, MVT::i32, ApertureReg, ShiftAmount);
3527
12
  }
3528
20
3529
20
  MachineFunction &MF = DAG.getMachineFunction();
3530
20
  SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
3531
20
  unsigned UserSGPR = Info->getQueuePtrUserSGPR();
3532
20
  assert(UserSGPR != AMDGPU::NoRegister);
3533
20
3534
20
  SDValue QueuePtr = CreateLiveInRegister(
3535
20
    DAG, &AMDGPU::SReg_64RegClass, UserSGPR, MVT::i64);
3536
20
3537
20
  // Offset into amd_queue_t for group_segment_aperture_base_hi /
3538
20
  // private_segment_aperture_base_hi.
3539
20
  uint32_t StructOffset = (AS == AMDGPUASI.LOCAL_ADDRESS) ? 
0x405
:
0x4415
;
3540
32
3541
32
  SDValue Ptr = DAG.getNode(ISD::ADD, DL, MVT::i64, QueuePtr,
3542
32
                            DAG.getConstant(StructOffset, DL, MVT::i64));
3543
32
3544
32
  // TODO: Use custom target PseudoSourceValue.
3545
32
  // TODO: We should use the value from the IR intrinsic call, but it might not
3546
32
  // be available and how do we get it?
3547
32
  Value *V = UndefValue::get(PointerType::get(Type::getInt8Ty(*DAG.getContext()),
3548
32
                                              AMDGPUASI.CONSTANT_ADDRESS));
3549
32
3550
32
  MachinePointerInfo PtrInfo(V, StructOffset);
3551
32
  return DAG.getLoad(MVT::i32, DL, QueuePtr.getValue(1), Ptr, PtrInfo,
3552
32
                     MinAlign(64, StructOffset),
3553
32
                     MachineMemOperand::MODereferenceable |
3554
32
                         MachineMemOperand::MOInvariant);
3555
32
}
3556
3557
SDValue SITargetLowering::lowerADDRSPACECAST(SDValue Op,
3558
45
                                             SelectionDAG &DAG) const {
3559
45
  SDLoc SL(Op);
3560
45
  const AddrSpaceCastSDNode *ASC = cast<AddrSpaceCastSDNode>(Op);
3561
45
3562
45
  SDValue Src = ASC->getOperand(0);
3563
45
  SDValue FlatNullPtr = DAG.getConstant(0, SL, MVT::i64);
3564
45
3565
45
  const AMDGPUTargetMachine &TM =
3566
45
    static_cast<const AMDGPUTargetMachine &>(getTargetMachine());
3567
45
3568
45
  // flat -> local/private
3569
45
  if (
ASC->getSrcAddressSpace() == AMDGPUASI.FLAT_ADDRESS45
) {
3570
12
    unsigned DestAS = ASC->getDestAddressSpace();
3571
12
3572
12
    if (DestAS == AMDGPUASI.LOCAL_ADDRESS ||
3573
12
        
DestAS == AMDGPUASI.PRIVATE_ADDRESS5
) {
3574
12
      unsigned NullVal = TM.getNullPointerValue(DestAS);
3575
12
      SDValue SegmentNullPtr = DAG.getConstant(NullVal, SL, MVT::i32);
3576
12
      SDValue NonNull = DAG.getSetCC(SL, MVT::i1, Src, FlatNullPtr, ISD::SETNE);
3577
12
      SDValue Ptr = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, Src);
3578
12
3579
12
      return DAG.getNode(ISD::SELECT, SL, MVT::i32,
3580
12
                         NonNull, Ptr, SegmentNullPtr);
3581
12
    }
3582
33
  }
3583
33
3584
33
  // local/private -> flat
3585
33
  
if (33
ASC->getDestAddressSpace() == AMDGPUASI.FLAT_ADDRESS33
) {
3586
32
    unsigned SrcAS = ASC->getSrcAddressSpace();
3587
32
3588
32
    if (SrcAS == AMDGPUASI.LOCAL_ADDRESS ||
3589
32
        
SrcAS == AMDGPUASI.PRIVATE_ADDRESS22
) {
3590
32
      unsigned NullVal = TM.getNullPointerValue(SrcAS);
3591
32
      SDValue SegmentNullPtr = DAG.getConstant(NullVal, SL, MVT::i32);
3592
32
3593
32
      SDValue NonNull
3594
32
        = DAG.getSetCC(SL, MVT::i1, Src, SegmentNullPtr, ISD::SETNE);
3595
32
3596
32
      SDValue Aperture = getSegmentAperture(ASC->getSrcAddressSpace(), SL, DAG);
3597
32
      SDValue CvtPtr
3598
32
        = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, Src, Aperture);
3599
32
3600
32
      return DAG.getNode(ISD::SELECT, SL, MVT::i64, NonNull,
3601
32
                         DAG.getNode(ISD::BITCAST, SL, MVT::i64, CvtPtr),
3602
32
                         FlatNullPtr);
3603
32
    }
3604
1
  }
3605
1
3606
1
  // global <-> flat are no-ops and never emitted.
3607
1
3608
1
  const MachineFunction &MF = DAG.getMachineFunction();
3609
1
  DiagnosticInfoUnsupported InvalidAddrSpaceCast(
3610
1
    *MF.getFunction(), "invalid addrspacecast", SL.getDebugLoc());
3611
1
  DAG.getContext()->diagnose(InvalidAddrSpaceCast);
3612
1
3613
1
  return DAG.getUNDEF(ASC->getValueType(0));
3614
1
}
3615
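The two cast directions handled above reduce to simple scalar arithmetic. The sketch below is illustrative only: SegmentNull stands for the value of TM.getNullPointerValue() for the segment address space, ApertureHi for the 32-bit value produced by getSegmentAperture, and the function names are assumptions.

#include <cstdint>

// flat -> local/private: a flat null pointer maps to the segment's null
// value; otherwise the low 32 bits of the flat address are the segment offset.
static uint32_t flatToSegment(uint64_t Flat, uint32_t SegmentNull) {
  return Flat != 0 ? (uint32_t)Flat : SegmentNull;
}

// local/private -> flat: the segment null maps to the flat null pointer;
// otherwise the aperture base supplies the high half of the flat address,
// matching the (Src, Aperture) BUILD_VECTOR bitcast to i64 above.
static uint64_t segmentToFlat(uint32_t Seg, uint32_t SegmentNull,
                              uint32_t ApertureHi) {
  return Seg != SegmentNull ? (((uint64_t)ApertureHi << 32) | Seg) : 0;
}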
3616
SDValue SITargetLowering::lowerINSERT_VECTOR_ELT(SDValue Op,
3617
94
                                                 SelectionDAG &DAG) const {
3618
94
  SDValue Idx = Op.getOperand(2);
3619
94
  if (isa<ConstantSDNode>(Idx))
3620
80
    return SDValue();
3621
14
3622
14
  // Avoid stack access for dynamic indexing.
3623
14
  SDLoc SL(Op);
3624
14
  SDValue Vec = Op.getOperand(0);
3625
14
  SDValue Val = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Op.getOperand(1));
3626
14
3627
14
  // v_bfi_b32 (v_bfm_b32 16, (shl idx, 16)), val, vec
3628
14
  SDValue ExtVal = DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i32, Val);
3629
14
3630
14
  // Convert vector index to bit-index.
3631
14
  SDValue ScaledIdx = DAG.getNode(ISD::SHL, SL, MVT::i32, Idx,
3632
14
                                  DAG.getConstant(16, SL, MVT::i32));
3633
14
3634
14
  SDValue BCVec = DAG.getNode(ISD::BITCAST, SL, MVT::i32, Vec);
3635
14
3636
14
  SDValue BFM = DAG.getNode(ISD::SHL, SL, MVT::i32,
3637
14
                            DAG.getConstant(0xffff, SL, MVT::i32),
3638
14
                            ScaledIdx);
3639
14
3640
14
  SDValue LHS = DAG.getNode(ISD::AND, SL, MVT::i32, BFM, ExtVal);
3641
14
  SDValue RHS = DAG.getNode(ISD::AND, SL, MVT::i32,
3642
14
                            DAG.getNOT(SL, BFM, MVT::i32), BCVec);
3643
14
3644
14
  SDValue BFI = DAG.getNode(ISD::OR, SL, MVT::i32, LHS, RHS);
3645
14
  return DAG.getNode(ISD::BITCAST, SL, Op.getValueType(), BFI);
3646
14
}
3647
3648
SDValue SITargetLowering::lowerEXTRACT_VECTOR_ELT(SDValue Op,
3649
586
                                                  SelectionDAG &DAG) const {
3650
586
  SDLoc SL(Op);
3651
586
3652
586
  EVT ResultVT = Op.getValueType();
3653
586
  SDValue Vec = Op.getOperand(0);
3654
586
  SDValue Idx = Op.getOperand(1);
3655
586
3656
586
  DAGCombinerInfo DCI(DAG, AfterLegalizeVectorOps, true, nullptr);
3657
586
3658
586
  // Make sure we do any optimizations that will make it easier to fold
3659
586
  // source modifiers before obscuring it with bit operations.
3660
586
3661
586
  // XXX - Why doesn't this get called when vector_shuffle is expanded?
3662
586
  if (SDValue Combined = performExtractVectorEltCombine(Op.getNode(), DCI))
3663
7
    return Combined;
3664
579
3665
579
  
if (const ConstantSDNode *579
CIdx579
= dyn_cast<ConstantSDNode>(Idx)) {
3666
566
    SDValue Result = DAG.getNode(ISD::BITCAST, SL, MVT::i32, Vec);
3667
566
3668
566
    if (
CIdx->getZExtValue() == 1566
) {
3669
276
      Result = DAG.getNode(ISD::SRL, SL, MVT::i32, Result,
3670
276
                           DAG.getConstant(16, SL, MVT::i32));
3671
566
    } else {
3672
290
      assert(CIdx->getZExtValue() == 0);
3673
290
    }
3674
566
3675
566
    if (ResultVT.bitsLT(MVT::i32))
3676
495
      Result = DAG.getNode(ISD::TRUNCATE, SL, MVT::i16, Result);
3677
566
    return DAG.getNode(ISD::BITCAST, SL, ResultVT, Result);
3678
566
  }
3679
13
3680
13
  SDValue Sixteen = DAG.getConstant(16, SL, MVT::i32);
3681
13
3682
13
  // Convert vector index to bit-index.
3683
13
  SDValue ScaledIdx = DAG.getNode(ISD::SHL, SL, MVT::i32, Idx, Sixteen);
3684
13
3685
13
  SDValue BC = DAG.getNode(ISD::BITCAST, SL, MVT::i32, Vec);
3686
13
  SDValue Elt = DAG.getNode(ISD::SRL, SL, MVT::i32, BC, ScaledIdx);
3687
13
3688
13
  SDValue Result = Elt;
3689
13
  if (ResultVT.bitsLT(MVT::i32))
3690
7
    Result = DAG.getNode(ISD::TRUNCATE, SL, MVT::i16, Result);
3691
586
3692
586
  return DAG.getNode(ISD::BITCAST, SL, ResultVT, Result);
3693
586
}
3694
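Both dynamic-index paths above operate on the 32-bit bitcast of the two-element 16-bit vector instead of going through a stack slot. The sketch below is illustrative only: it models the intended bit arithmetic for an index of 0 or 1 (element index scaled to a bit index, per the "Convert vector index to bit-index" comments), not the exact DAG nodes built above.

#include <cstdint>

// Dynamic insert: form a mask over the selected half and merge the new value
// with AND/OR (the v_bfm/v_bfi idiom named in the insert path's comment).
static uint32_t insertHalf(uint32_t VecBits, uint16_t Val, uint32_t Idx) {
  uint32_t BitIdx = Idx * 16;                   // vector index -> bit index
  uint32_t Mask = 0xffffu << BitIdx;            // covers the selected half
  return (VecBits & ~Mask) | ((uint32_t)Val << BitIdx);
}

// Dynamic extract: shift the selected half down and truncate to 16 bits.
static uint16_t extractHalf(uint32_t VecBits, uint32_t Idx) {
  return (uint16_t)(VecBits >> (Idx * 16));
}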
3695
bool
3696
1.52k
SITargetLowering::isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const {
3697
1.52k
  // We can fold offsets for anything that doesn't require a GOT relocation.
3698
1.52k
  return (GA->getAddressSpace() == AMDGPUASI.GLOBAL_ADDRESS ||
3699
1.49k
              GA->getAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS) &&
3700
79
         !shouldEmitGOTReloc(GA->getGlobal());
3701
1.52k
}
3702
3703
static SDValue
3704
buildPCRelGlobalAddress(SelectionDAG &DAG, const GlobalValue *GV,
3705
                        const SDLoc &DL, unsigned Offset, EVT PtrVT,
3706
486
                        unsigned GAFlags = SIInstrInfo::MO_NONE) {
3707
486
  // In order to support pc-relative addressing, the PC_ADD_REL_OFFSET SDNode is
3708
486
  // lowered to the following code sequence:
3709
486
  //
3710
486
  // For constant address space:
3711
486
  //   s_getpc_b64 s[0:1]
3712
486
  //   s_add_u32 s0, s0, $symbol
3713
486
  //   s_addc_u32 s1, s1, 0
3714
486
  //
3715
486
  //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
3716
486
  //   a fixup or relocation is emitted to replace $symbol with a literal
3717
486
  //   constant, which is a pc-relative offset from the encoding of the $symbol
3718
486
  //   operand to the global variable.
3719
486
  //
3720
486
  // For global address space:
3721
486
  //   s_getpc_b64 s[0:1]
3722
486
  //   s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo
3723
486
  //   s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi
3724
486
  //
3725
486
  //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
3726
486
  //   fixups or relocations are emitted to replace $symbol@*@lo and
3727
486
  //   $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant,
3728
486
  //   which is a 64-bit pc-relative offset from the encoding of the $symbol
3729
486
  //   operand to the global variable.
3730
486
  //
3731
486
  // What we want here is an offset from the value returned by s_getpc
3732
486
  // (which is the address of the s_add_u32 instruction) to the global
3733
486
  // variable, but since the encoding of $symbol starts 4 bytes after the start
3734
486
  // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too
3735
486
  // small. This requires us to add 4 to the global variable offset in order to
3736
486
  // compute the correct address.
3737
486
  SDValue PtrLo = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, Offset + 4,
3738
486
                                             GAFlags);
3739
486
  SDValue PtrHi = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, Offset + 4,
3740
486
                                             GAFlags == SIInstrInfo::MO_NONE ?
3741
486
                                             
GAFlags19
:
GAFlags + 1467
);
3742
486
  return DAG.getNode(AMDGPUISD::PC_ADD_REL_OFFSET, DL, PtrVT, PtrLo, PtrHi);
3743
486
}
3744
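The "+ 4" above falls out of straightforward address arithmetic. The sketch below is illustrative only and assumes a conventional rel32 relocation of the form S + A - P, resolved against the address of the $symbol operand itself; the function and parameter names are hypothetical.

#include <cstdint>

// GetPC      : value returned by s_getpc_b64 (address of the s_add_u32).
// SymbolVA   : address of the global variable.
// DesiredOff : offset within the global requested by the DAG node.
static uint64_t resolvedAddress(uint64_t GetPC, uint64_t SymbolVA,
                                int64_t DesiredOff) {
  uint64_t FixupSite = GetPC + 4;           // $symbol is encoded 4 bytes into s_add_u32
  int64_t Addend = DesiredOff + 4;          // what the builder above emits
  int64_t Rel32 = (int64_t)SymbolVA + Addend - (int64_t)FixupSite;  // S + A - P
  // s_add_u32/s_addc_u32 add the fixed-up literal to GetPC; the extra 4 in the
  // addend cancels the 4-byte skew, yielding SymbolVA + DesiredOff.
  return GetPC + (uint64_t)Rel32;
}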
3745
SDValue SITargetLowering::LowerGlobalAddress(AMDGPUMachineFunction *MFI,
3746
                                             SDValue Op,
3747
766
                                             SelectionDAG &DAG) const {
3748
766
  GlobalAddressSDNode *GSD = cast<GlobalAddressSDNode>(Op);
3749
766
  const GlobalValue *GV = GSD->getGlobal();
3750
766
3751
766
  if (GSD->getAddressSpace() != AMDGPUASI.CONSTANT_ADDRESS &&
3752
744
      GSD->getAddressSpace() != AMDGPUASI.GLOBAL_ADDRESS &&
3753
766
      // FIXME: It isn't correct to rely on the type of the pointer. This should
3754
766
      // be removed when address space 0 is 64-bit.
3755
722
      !GV->getType()->getElementType()->isFunctionTy())
3756
280
    return AMDGPUTargetLowering::LowerGlobalAddress(MFI, Op, DAG);
3757
486
3758
486
  SDLoc DL(GSD);
3759
486
  EVT PtrVT = Op.getValueType();
3760
486
3761
486
  if (shouldEmitFixup(GV))
3762
19
    return buildPCRelGlobalAddress(DAG, GV, DL, GSD->getOffset(), PtrVT);
3763
467
  else 
if (467
shouldEmitPCReloc(GV)467
)
3764
446
    return buildPCRelGlobalAddress(DAG, GV, DL, GSD->getOffset(), PtrVT,
3765
446
                                   SIInstrInfo::MO_REL32);
3766
21
3767
21
  SDValue GOTAddr = buildPCRelGlobalAddress(DAG, GV, DL, 0, PtrVT,
3768
21
                                            SIInstrInfo::MO_GOTPCREL32);
3769
21
3770
21
  Type *Ty = PtrVT.getTypeForEVT(*DAG.getContext());
3771
21
  PointerType *PtrTy = PointerType::get(Ty, AMDGPUASI.CONSTANT_ADDRESS);
3772
21
  const DataLayout &DataLayout = DAG.getDataLayout();
3773
21
  unsigned Align = DataLayout.getABITypeAlignment(PtrTy);
3774
21
  // FIXME: Use a PseudoSourceValue once those can be assigned an address space.
3775
21
  MachinePointerInfo PtrInfo(UndefValue::get(PtrTy));
3776
21
3777
21
  return DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), GOTAddr, PtrInfo, Align,
3778
21
                     MachineMemOperand::MODereferenceable |
3779
21
                         MachineMemOperand::MOInvariant);
3780
21
}
3781
3782
SDValue SITargetLowering::copyToM0(SelectionDAG &DAG, SDValue Chain,
3783
7.56k
                                   const SDLoc &DL, SDValue V) const {
3784
7.56k
  // We can't use S_MOV_B32 directly, because there is no way to specify m0 as
3785
7.56k
  // the destination register.
3786
7.56k
  //
3787
7.56k
  // We can't use CopyToReg, because MachineCSE won't combine COPY instructions,
3788
7.56k
  // so we will end up with redundant moves to m0.
3789
7.56k
  //
3790
7.56k
  // We use a pseudo to ensure we emit s_mov_b32 with m0 as the direct result.
3791
7.56k
3792
7.56k
  // A Null SDValue creates a glue result.
3793
7.56k
  SDNode *M0 = DAG.getMachineNode(AMDGPU::SI_INIT_M0, DL, MVT::Other, MVT::Glue,
3794
7.56k
                                  V, Chain);
3795
7.56k
  return SDValue(M0, 0);
3796
7.56k
}
3797
3798
SDValue SITargetLowering::lowerImplicitZextParam(SelectionDAG &DAG,
3799
                                                 SDValue Op,
3800
                                                 MVT VT,
3801
85
                                                 unsigned Offset) const {
3802
85
  SDLoc SL(Op);
3803
85
  SDValue Param = lowerKernargMemParameter(DAG, MVT::i32, MVT::i32, SL,
3804
85
                                           DAG.getEntryNode(), Offset, false);
3805
85
  // The local size values will have the hi 16-bits as zero.
3806
85
  return DAG.getNode(ISD::AssertZext, SL, MVT::i32, Param,
3807
85
                     DAG.getValueType(VT));
3808
85
}
3809
3810
static SDValue emitNonHSAIntrinsicError(SelectionDAG &DAG, const SDLoc &DL,
3811
2
                                        EVT VT) {
3812
2
  DiagnosticInfoUnsupported BadIntrin(*DAG.getMachineFunction().getFunction(),
3813
2
                                      "non-hsa intrinsic with hsa target",
3814
2
                                      DL.getDebugLoc());
3815
2
  DAG.getContext()->diagnose(BadIntrin);
3816
2
  return DAG.getUNDEF(VT);
3817
2
}
3818
3819
static SDValue emitRemovedIntrinsicError(SelectionDAG &DAG, const SDLoc &DL,
3820
5
                                         EVT VT) {
3821
5
  DiagnosticInfoUnsupported BadIntrin(*DAG.getMachineFunction().getFunction(),
3822
5
                                      "intrinsic not supported on subtarget",
3823
5
                                      DL.getDebugLoc());
3824
5
  DAG.getContext()->diagnose(BadIntrin);
3825
5
  return DAG.getUNDEF(VT);
3826
5
}
3827
3828
SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
3829
5.60k
                                                  SelectionDAG &DAG) const {
3830
5.60k
  MachineFunction &MF = DAG.getMachineFunction();
3831
5.60k
  auto MFI = MF.getInfo<SIMachineFunctionInfo>();
3832
5.60k
3833
5.60k
  EVT VT = Op.getValueType();
3834
5.60k
  SDLoc DL(Op);
3835
5.60k
  unsigned IntrinsicID = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
3836
5.60k
3837
5.60k
  // TODO: Should this propagate fast-math-flags?
3838
5.60k
3839
5.60k
  switch (IntrinsicID) {
3840
4
  case Intrinsic::amdgcn_implicit_buffer_ptr: {
3841
4
    if (getSubtarget()->isAmdCodeObjectV2(MF))
3842
2
      return emitNonHSAIntrinsicError(DAG, DL, VT);
3843
2
    return getPreloadedValue(DAG, *MFI, VT,
3844
2
                             AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
3845
2
  }
3846
38
  case Intrinsic::amdgcn_dispatch_ptr:
3847
38
  case Intrinsic::amdgcn_queue_ptr: {
3848
38
    if (!Subtarget->isAmdCodeObjectV2(MF)) {
3849
2
      DiagnosticInfoUnsupported BadIntrin(
3850
2
          *MF.getFunction(), "unsupported hsa intrinsic without hsa target",
3851
2
          DL.getDebugLoc());
3852
2
      DAG.getContext()->diagnose(BadIntrin);
3853
2
      return DAG.getUNDEF(VT);
3854
2
    }
3855
36
3856
36
    auto RegID = IntrinsicID == Intrinsic::amdgcn_dispatch_ptr ?
3857
36
      AMDGPUFunctionArgInfo::DISPATCH_PTR : AMDGPUFunctionArgInfo::QUEUE_PTR;
3858
36
    return getPreloadedValue(DAG, *MFI, VT, RegID);
3859
36
  }
3860
25
  case Intrinsic::amdgcn_implicitarg_ptr: {
3861
25
    if (MFI->isEntryFunction())
3862
19
      return getImplicitArgPtr(DAG, DL);
3863
6
    return getPreloadedValue(DAG, *MFI, VT,
3864
6
                             AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
3865
6
  }
3866
27
  case Intrinsic::amdgcn_kernarg_segment_ptr: {
3867
27
    return getPreloadedValue(DAG, *MFI, VT,
3868
27
                             AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
3869
6
  }
3870
9
  case Intrinsic::amdgcn_dispatch_id: {
3871
9
    return getPreloadedValue(DAG, *MFI, VT, AMDGPUFunctionArgInfo::DISPATCH_ID);
3872
6
  }
3873
20
  case Intrinsic::amdgcn_rcp:
3874
20
    return DAG.getNode(AMDGPUISD::RCP, DL, VT, Op.getOperand(1));
3875
31
  case Intrinsic::amdgcn_rsq:
3876
31
    return DAG.getNode(AMDGPUISD::RSQ, DL, VT, Op.getOperand(1));
3877
5
  case Intrinsic::amdgcn_rsq_legacy:
3878
5
    if (Subtarget->getGeneration() >= SISubtarget::VOLCANIC_ISLANDS)
3879
1
      return emitRemovedIntrinsicError(DAG, DL, VT);
3880
4
3881
4
    return DAG.getNode(AMDGPUISD::RSQ_LEGACY, DL, VT, Op.getOperand(1));
3882
11
  case Intrinsic::amdgcn_rcp_legacy:
3883
11
    if (Subtarget->getGeneration() >= SISubtarget::VOLCANIC_ISLANDS)
3884
4
      return emitRemovedIntrinsicError(DAG, DL, VT);
3885
7
    return DAG.getNode(AMDGPUISD::RCP_LEGACY, DL, VT, Op.getOperand(1));
3886
6
  case Intrinsic::amdgcn_rsq_clamp: {
3887
6
    if (Subtarget->getGeneration() < SISubtarget::VOLCANIC_ISLANDS)
3888
3
      return DAG.getNode(AMDGPUISD::RSQ_CLAMP, DL, VT, Op.getOperand(1));
3889
3
3890
3
    Type *Type = VT.getTypeForEVT(*DAG.getContext());
3891
3
    APFloat Max = APFloat::getLargest(Type->getFltSemantics());
3892
3
    APFloat Min = APFloat::getLargest(Type->getFltSemantics(), true);
3893
3
3894
3
    SDValue Rsq = DAG.getNode(AMDGPUISD::RSQ, DL, VT, Op.getOperand(1));
3895
3
    SDValue Tmp = DAG.getNode(ISD::FMINNUM, DL, VT, Rsq,
3896
3
                              DAG.getConstantFP(Max, DL, VT));
3897
3
    return DAG.getNode(ISD::FMAXNUM, DL, VT, Tmp,
3898
3
                       DAG.getConstantFP(Min, DL, VT));
3899
3
  }
3900
2
  case Intrinsic::r600_read_ngroups_x:
3901
2
    if (Subtarget->isAmdHsaOS())
3902
0
      return emitNonHSAIntrinsicError(DAG, DL, VT);
3903
2
3904
2
    return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
3905
2
                                    SI::KernelInputOffsets::NGROUPS_X, false);
3906
2
  case Intrinsic::r600_read_ngroups_y:
3907
2
    if (Subtarget->isAmdHsaOS())
3908
0
      return emitNonHSAIntrinsicError(DAG, DL, VT);
3909
2
3910
2
    return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
3911
2
                                    SI::KernelInputOffsets::NGROUPS_Y, false);
3912
2
  case Intrinsic::r600_read_ngroups_z:
3913
2
    if (Subtarget->isAmdHsaOS())
3914
0
      return emitNonHSAIntrinsicError(DAG, DL, VT);
3915
2
3916
2
    return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
3917
2
                                    SI::KernelInputOffsets::NGROUPS_Z, false);
3918
2
  case Intrinsic::r600_read_global_size_x:
3919
2
    if (Subtarget->isAmdHsaOS())
3920
0
      return emitNonHSAIntrinsicError(DAG, DL, VT);
3921
2
3922
2
    return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
3923
2
                                    SI::KernelInputOffsets::GLOBAL_SIZE_X, false);
3924
2
  case Intrinsic::r600_read_global_size_y:
3925
2
    if (Subtarget->isAmdHsaOS())
3926
0
      return emitNonHSAIntrinsicError(DAG, DL, VT);
3927
2
3928
2
    return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
3929
2
                                    SI::KernelInputOffsets::GLOBAL_SIZE_Y, false);
3930
2
  case Intrinsic::r600_read_global_size_z:
3931
2
    if (Subtarget->isAmdHsaOS())
3932
0
      return emitNonHSAIntrinsicError(DAG, DL, VT);
3933
2
3934
2
    return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
3935
2
                                    SI::KernelInputOffsets::GLOBAL_SIZE_Z, false);
3936
13
  case Intrinsic::r600_read_local_size_x:
3937
13
    if (Subtarget->isAmdHsaOS())
3938
0
      return emitNonHSAIntrinsicError(DAG, DL, VT);
3939
13
3940
13
    return lowerImplicitZextParam(DAG, Op, MVT::i16,
3941
13
                                  SI::KernelInputOffsets::LOCAL_SIZE_X);
3942
36
  case Intrinsic::r600_read_local_size_y:
3943
36
    if (Subtarget->isAmdHsaOS())
3944
0
      return emitNonHSAIntrinsicError(DAG, DL, VT);
3945
36
3946
36
    return lowerImplicitZextParam(DAG, Op, MVT::i16,
3947
36
                                  SI::KernelInputOffsets::LOCAL_SIZE_Y);
3948
36
  case Intrinsic::r600_read_local_size_z:
3949
36
    if (Subtarget->isAmdHsaOS())
3950
0
      return emitNonHSAIntrinsicError(DAG, DL, VT);
3951
36
3952
36
    return lowerImplicitZextParam(DAG, Op, MVT::i16,
3953
36
                                  SI::KernelInputOffsets::LOCAL_SIZE_Z);
3954
41
  case Intrinsic::amdgcn_workgroup_id_x:
3955
41
  case Intrinsic::r600_read_tgid_x:
3956
41
    return getPreloadedValue(DAG, *MFI, VT,
3957
41
                             AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
3958
24
  case Intrinsic::amdgcn_workgroup_id_y:
3959
24
  case Intrinsic::r600_read_tgid_y:
3960
24
    return getPreloadedValue(DAG, *MFI, VT,
3961
24
                             AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
3962
24
  case Intrinsic::amdgcn_workgroup_id_z:
3963
24
  case Intrinsic::r600_read_tgid_z:
3964
24
    return getPreloadedValue(DAG, *MFI, VT,
3965
24
                             AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
3966
2.44k
  case Intrinsic::amdgcn_workitem_id_x: {
3967
2.68k
  case Intrinsic::r600_read_tidig_x:
3968
2.68k
    return loadInputValue(DAG, &AMDGPU::VGPR_32RegClass, MVT::i32,
3969
2.68k
                          SDLoc(DAG.getEntryNode()),
3970
2.68k
                          MFI->getArgInfo().WorkItemIDX);
3971
2.44k
  }
3972
94
  case Intrinsic::amdgcn_workitem_id_y:
3973
94
  case Intrinsic::r600_read_tidig_y:
3974
94
    return loadInputValue(DAG, &AMDGPU::VGPR_32RegClass, MVT::i32,
3975
94
                          SDLoc(DAG.getEntryNode()),
3976
94
                          MFI->getArgInfo().WorkItemIDY);
3977
57
  case Intrinsic::amdgcn_workitem_id_z:
3978
57
  case Intrinsic::r600_read_tidig_z:
3979
57
    return loadInputValue(DAG, &AMDGPU::VGPR_32RegClass, MVT::i32,
3980
57
                          SDLoc(DAG.getEntryNode()),
3981
57
                          MFI->getArgInfo().WorkItemIDZ);
3982
489
  case AMDGPUIntrinsic::SI_load_const: {
3983
489
    SDValue Ops[] = {
3984
489
      Op.getOperand(1),
3985
489
      Op.getOperand(2)
3986
489
    };
3987
489
3988
489
    MachineMemOperand *MMO = MF.getMachineMemOperand(
3989
489
        MachinePointerInfo(),
3990
489
        MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
3991
489
            MachineMemOperand::MOInvariant,
3992
489
        VT.getStoreSize(), 4);
3993
489
    return DAG.getMemIntrinsicNode(AMDGPUISD::LOAD_CONSTANT, DL,
3994
489
                                   Op->getVTList(), Ops, VT, MMO);
3995
57
  }
3996
4
  case Intrinsic::amdgcn_fdiv_fast:
3997
4
    return lowerFDIV_FAST(Op, DAG);
3998
75
  case Intrinsic::amdgcn_interp_mov: {
3999
75
    SDValue M0 = copyToM0(DAG, DAG.getEntryNode(), DL, Op.getOperand(4));
4000
75
    SDValue Glue = M0.getValue(1);
4001
75
    return DAG.getNode(AMDGPUISD::INTERP_MOV, DL, MVT::f32, Op.getOperand(1),
4002
75
                       Op.getOperand(2), Op.getOperand(3), Glue);
4003
57
  }
4004
204
  case Intrinsic::amdgcn_interp_p1: {
4005
204
    SDValue M0 = copyToM0(DAG, DAG.getEntryNode(), DL, Op.getOperand(4));
4006
204
    SDValue Glue = M0.getValue(1);
4007
204
    return DAG.getNode(AMDGPUISD::INTERP_P1, DL, MVT::f32, Op.getOperand(1),
4008
204
                       Op.getOperand(2), Op.getOperand(3), Glue);
4009
57
  }
4010
188
  case Intrinsic::amdgcn_interp_p2: {
4011
188
    SDValue M0 = copyToM0(DAG, DAG.getEntryNode(), DL, Op.getOperand(5));
4012
188
    SDValue Glue = SDValue(M0.getNode(), 1);
4013
188
    return DAG.getNode(AMDGPUISD::INTERP_P2, DL, MVT::f32, Op.getOperand(1),
4014
188
                       Op.getOperand(2), Op.getOperand(3), Op.getOperand(4),
4015
188
                       Glue);
4016
57
  }
4017
5
  case Intrinsic::amdgcn_sin:
4018
5
    return DAG.getNode(AMDGPUISD::SIN_HW, DL, VT, Op.getOperand(1));
4019
57
4020
3
  case Intrinsic::amdgcn_cos:
4021
3
    return DAG.getNode(AMDGPUISD::COS_HW, DL, VT, Op.getOperand(1));
4022
57
4023
3
  case Intrinsic::amdgcn_log_clamp: {
4024
3
    if (Subtarget->getGeneration() < SISubtarget::VOLCANIC_ISLANDS)
4025
2
      return SDValue();
4026
1
4027
1
    DiagnosticInfoUnsupported BadIntrin(
4028
1
      *MF.getFunction(), "intrinsic not supported on subtarget",
4029
1
      DL.getDebugLoc());
4030
1
      DAG.getContext()->diagnose(BadIntrin);
4031
1
      return DAG.getUNDEF(VT);
4032
1
  }
4033
9
  case Intrinsic::amdgcn_ldexp:
4034
9
    return DAG.getNode(AMDGPUISD::LDEXP, DL, VT,
4035
9
                       Op.getOperand(1), Op.getOperand(2));
4036
1
4037
7
  case Intrinsic::amdgcn_fract:
4038
7
    return DAG.getNode(AMDGPUISD::FRACT, DL, VT, Op.getOperand(1));
4039
1
4040
55
  case Intrinsic::amdgcn_class:
4041
55
    return DAG.getNode(AMDGPUISD::FP_CLASS, DL, VT,
4042
55
                       Op.getOperand(1), Op.getOperand(2));
4043
10
  case Intrinsic::amdgcn_div_fmas:
4044
10
    return DAG.getNode(AMDGPUISD::DIV_FMAS, DL, VT,
4045
10
                       Op.getOperand(1), Op.getOperand(2), Op.getOperand(3),
4046
10
                       Op.getOperand(4));
4047
1
4048
13
  case Intrinsic::amdgcn_div_fixup:
4049
13
    return DAG.getNode(AMDGPUISD::DIV_FIXUP, DL, VT,
4050
13
                       Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
4051
1
4052
4
  case Intrinsic::amdgcn_trig_preop:
4053
4
    return DAG.getNode(AMDGPUISD::TRIG_PREOP, DL, VT,
4054
4
                       Op.getOperand(1), Op.getOperand(2));
4055
27
  case Intrinsic::amdgcn_div_scale: {
4056
27
    // 3rd parameter required to be a constant.
4057
27
    const ConstantSDNode *Param = dyn_cast<ConstantSDNode>(Op.getOperand(3));
4058
27
    if (!Param)
4059
3
      return DAG.getMergeValues({ DAG.getUNDEF(VT), DAG.getUNDEF(MVT::i1) }, DL);
4060
24
4061
24
    // Translate to the operands expected by the machine instruction. The
4062
24
    // first parameter must be the same as the first instruction.
4063
24
    SDValue Numerator = Op.getOperand(1);
4064
24
    SDValue Denominator = Op.getOperand(2);
4065
24
4066
24
    // Note this order is opposite of the machine instruction's operations,
4067
24
    // which is s0.f = Quotient, s1.f = Denominator, s2.f = Numerator. The
4068
24
    // intrinsic has the numerator as the first operand to match a normal
4069
24
    // division operation.
4070
24
4071
24
    SDValue Src0 = Param->isAllOnesValue() ? Numerator : Denominator;
4072
24
4073
24
    return DAG.getNode(AMDGPUISD::DIV_SCALE, DL, Op->getVTList(), Src0,
4074
24
                       Denominator, Numerator);
4075
24
  }
4076
48
  case Intrinsic::amdgcn_icmp: {
4077
48
    const auto *CD = dyn_cast<ConstantSDNode>(Op.getOperand(3));
4078
48
    if (!CD)
4079
6
      return DAG.getUNDEF(VT);
4080
42
4081
42
    int CondCode = CD->getSExtValue();
4082
42
    if (CondCode < ICmpInst::Predicate::FIRST_ICMP_PREDICATE ||
4083
40
        CondCode > ICmpInst::Predicate::LAST_ICMP_PREDICATE)
4084
2
      return DAG.getUNDEF(VT);
4085
40
4086
40
    ICmpInst::Predicate IcInput = static_cast<ICmpInst::Predicate>(CondCode);
4087
40
    ISD::CondCode CCOpcode = getICmpCondCode(IcInput);
4088
40
    return DAG.getNode(AMDGPUISD::SETCC, DL, VT, Op.getOperand(1),
4089
40
                       Op.getOperand(2), DAG.getCondCode(CCOpcode));
4090
40
  }
4091
56
  case Intrinsic::amdgcn_fcmp: {
4092
56
    const auto *CD = dyn_cast<ConstantSDNode>(Op.getOperand(3));
4093
56
    if (!CD)
4094
2
      return DAG.getUNDEF(VT);
4095
54
4096
54
    int CondCode = CD->getSExtValue();
4097
54
    if (CondCode < FCmpInst::Predicate::FIRST_FCMP_PREDICATE ||
4098
52
        CondCode > FCmpInst::Predicate::LAST_FCMP_PREDICATE)
4099
2
      return DAG.getUNDEF(VT);
4100
52
4101
52
    FCmpInst::Predicate IcInput = static_cast<FCmpInst::Predicate>(CondCode);
4102
52
    ISD::CondCode CCOpcode = getFCmpCondCode(IcInput);
4103
52
    return DAG.getNode(AMDGPUISD::SETCC, DL, VT, Op.getOperand(1),
4104
52
                       Op.getOperand(2), DAG.getCondCode(CCOpcode));
4105
52
  }
4106
69
  case Intrinsic::amdgcn_fmed3:
4107
69
    return DAG.getNode(AMDGPUISD::FMED3, DL, VT,
4108
69
                       Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
4109
31
  case Intrinsic::amdgcn_fmul_legacy:
4110
31
    return DAG.getNode(AMDGPUISD::FMUL_LEGACY, DL, VT,
4111
31
                       Op.getOperand(1), Op.getOperand(2));
4112
4
  case Intrinsic::amdgcn_sffbh:
4113
4
    return DAG.getNode(AMDGPUISD::FFBH_I32, DL, VT, Op.getOperand(1));
4114
102
  case Intrinsic::amdgcn_sbfe:
4115
102
    return DAG.getNode(AMDGPUISD::BFE_I32, DL, VT,
4116
102
                       Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
4117
94
  case Intrinsic::amdgcn_ubfe:
4118
94
    return DAG.getNode(AMDGPUISD::BFE_U32, DL, VT,
4119
94
                       Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
4120
10
  case Intrinsic::amdgcn_cvt_pkrtz: {
4121
10
    // FIXME: Stop adding cast if v2f16 legal.
4122
10
    EVT VT = Op.getValueType();
4123
10
    SDValue Node = DAG.getNode(AMDGPUISD::CVT_PKRTZ_F16_F32, DL, MVT::i32,
4124
10
                               Op.getOperand(1), Op.getOperand(2));
4125
10
    return DAG.getNode(ISD::BITCAST, DL, VT, Node);
4126
52
  }
4127
8
  case Intrinsic::amdgcn_wqm: {
4128
8
    SDValue Src = Op.getOperand(1);
4129
8
    return SDValue(DAG.getMachineNode(AMDGPU::WQM, DL, Src.getValueType(), Src),
4130
8
                   0);
4131
52
  }
4132
14
  case Intrinsic::amdgcn_wwm: {
4133
14
    SDValue Src = Op.getOperand(1);
4134
14
    return SDValue(DAG.getMachineNode(AMDGPU::WWM, DL, Src.getValueType(), Src),
4135
14
                   0);
4136
52
  }
4137
875
  default:
4138
875
    return Op;
4139
0
  }
4140
0
}
4141
4142
SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
4143
1.23k
                                                 SelectionDAG &DAG) const {
4144
1.23k
  unsigned IntrID = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
4145
1.23k
  SDLoc DL(Op);
4146
1.23k
  MachineFunction &MF = DAG.getMachineFunction();
4147
1.23k
4148
1.23k
  switch (IntrID) {
4149
150
  case Intrinsic::amdgcn_atomic_inc:
4150
150
  case Intrinsic::amdgcn_atomic_dec: {
4151
150
    MemSDNode *M = cast<MemSDNode>(Op);
4152
150
    unsigned Opc = (IntrID == Intrinsic::amdgcn_atomic_inc) ?
4153
150
      AMDGPUISD::ATOMIC_INC : AMDGPUISD::ATOMIC_DEC;
4154
150
    SDValue Ops[] = {
4155
150
      M->getOperand(0), // Chain
4156
150
      M->getOperand(2), // Ptr
4157
150
      M->getOperand(3)  // Value
4158
150
    };
4159
150
4160
150
    return DAG.getMemIntrinsicNode(Opc, SDLoc(Op), M->getVTList(), Ops,
4161
150
                                   M->getMemoryVT(), M->getMemOperand());
4162
150
  }
4163
125
  case Intrinsic::amdgcn_buffer_load:
4164
125
  case Intrinsic::amdgcn_buffer_load_format: {
4165
125
    SDValue Ops[] = {
4166
125
      Op.getOperand(0), // Chain
4167
125
      Op.getOperand(2), // rsrc
4168
125
      Op.getOperand(3), // vindex
4169
125
      Op.getOperand(4), // offset
4170
125
      Op.getOperand(5), // glc
4171
125
      Op.getOperand(6)  // slc
4172
125
    };
4173
125
    SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
4174
125
4175
125
    unsigned Opc = (IntrID == Intrinsic::amdgcn_buffer_load) ?
4176
125
        AMDGPUISD::BUFFER_LOAD : AMDGPUISD::BUFFER_LOAD_FORMAT;
4177
125
    EVT VT = Op.getValueType();
4178
125
    EVT IntVT = VT.changeTypeToInteger();
4179
125
4180
125
    MachineMemOperand *MMO = MF.getMachineMemOperand(
4181
125
      MachinePointerInfo(MFI->getBufferPSV()),
4182
125
      MachineMemOperand::MOLoad,
4183
125
      VT.getStoreSize(), VT.getStoreSize());
4184
125
4185
125
    return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops, IntVT, MMO);
4186
125
  }
4187
28
  case Intrinsic::amdgcn_tbuffer_load: {
4188
28
    SDValue Ops[] = {
4189
28
      Op.getOperand(0),  // Chain
4190
28
      Op.getOperand(2),  // rsrc
4191
28
      Op.getOperand(3),  // vindex
4192
28
      Op.getOperand(4),  // voffset
4193
28
      Op.getOperand(5),  // soffset
4194
28
      Op.getOperand(6),  // offset
4195
28
      Op.getOperand(7),  // dfmt
4196
28
      Op.getOperand(8),  // nfmt
4197
28
      Op.getOperand(9),  // glc
4198
28
      Op.getOperand(10)   // slc
4199
28
    };
4200
28
4201
28
    EVT VT = Op.getOperand(2).getValueType();
4202
28
4203
28
    MachineMemOperand *MMO = MF.getMachineMemOperand(
4204
28
      MachinePointerInfo(),
4205
28
      MachineMemOperand::MOLoad,
4206
28
      VT.getStoreSize(), VT.getStoreSize());
4207
28
    return DAG.getMemIntrinsicNode(AMDGPUISD::TBUFFER_LOAD_FORMAT, DL,
4208
28
                                   Op->getVTList(), Ops, VT, MMO);
4209
125
  }
4210
125
  // Basic sample.
4211
528
  case Intrinsic::amdgcn_image_sample:
4212
528
  case Intrinsic::amdgcn_image_sample_cl:
4213
528
  case Intrinsic::amdgcn_image_sample_d:
4214
528
  case Intrinsic::amdgcn_image_sample_d_cl:
4215
528
  case Intrinsic::amdgcn_image_sample_l:
4216
528
  case Intrinsic::amdgcn_image_sample_b:
4217
528
  case Intrinsic::amdgcn_image_sample_b_cl:
4218
528
  case Intrinsic::amdgcn_image_sample_lz:
4219
528
  case Intrinsic::amdgcn_image_sample_cd:
4220
528
  case Intrinsic::amdgcn_image_sample_cd_cl:
4221
528
4222
528
  // Sample with comparison.
4223
528
  case Intrinsic::amdgcn_image_sample_c:
4224
528
  case Intrinsic::amdgcn_image_sample_c_cl:
4225
528
  case Intrinsic::amdgcn_image_sample_c_d:
4226
528
  case Intrinsic::amdgcn_image_sample_c_d_cl:
4227
528
  case Intrinsic::amdgcn_image_sample_c_l:
4228
528
  case Intrinsic::amdgcn_image_sample_c_b:
4229
528
  case Intrinsic::amdgcn_image_sample_c_b_cl:
4230
528
  case Intrinsic::amdgcn_image_sample_c_lz:
4231
528
  case Intrinsic::amdgcn_image_sample_c_cd:
4232
528
  case Intrinsic::amdgcn_image_sample_c_cd_cl:
4233
528
4234
528
  // Sample with offsets.
4235
528
  case Intrinsic::amdgcn_image_sample_o:
4236
528
  case Intrinsic::amdgcn_image_sample_cl_o:
4237
528
  case Intrinsic::amdgcn_image_sample_d_o:
4238
528
  case Intrinsic::amdgcn_image_sample_d_cl_o:
4239
528
  case Intrinsic::amdgcn_image_sample_l_o:
4240
528
  case Intrinsic::amdgcn_image_sample_b_o:
4241
528
  case Intrinsic::amdgcn_image_sample_b_cl_o:
4242
528
  case Intrinsic::amdgcn_image_sample_lz_o:
4243
528
  case Intrinsic::amdgcn_image_sample_cd_o:
4244
528
  case Intrinsic::amdgcn_image_sample_cd_cl_o:
4245
528
4246
528
  // Sample with comparison and offsets.
4247
528
  case Intrinsic::amdgcn_image_sample_c_o:
4248
528
  case Intrinsic::amdgcn_image_sample_c_cl_o:
4249
528
  case Intrinsic::amdgcn_image_sample_c_d_o:
4250
528
  case Intrinsic::amdgcn_image_sample_c_d_cl_o:
4251
528
  case Intrinsic::amdgcn_image_sample_c_l_o:
4252
528
  case Intrinsic::amdgcn_image_sample_c_b_o:
4253
528
  case Intrinsic::amdgcn_image_sample_c_b_cl_o:
4254
528
  case Intrinsic::amdgcn_image_sample_c_lz_o:
4255
528
  case Intrinsic::amdgcn_image_sample_c_cd_o:
4256
528
  case Intrinsic::amdgcn_image_sample_c_cd_cl_o:
4257
528
4258
528
  case Intrinsic::amdgcn_image_getlod: {
4259
528
    // Replace dmask with everything disabled with undef.
4260
528
    const ConstantSDNode *DMask = dyn_cast<ConstantSDNode>(Op.getOperand(5));
4261
528
    if (!DMask || DMask->isNullValue()) {
4262
64
      SDValue Undef = DAG.getUNDEF(Op.getValueType());
4263
64
      return DAG.getMergeValues({ Undef, Op.getOperand(0) }, SDLoc(Op));
4264
64
    }
4265
464
4266
464
    return SDValue();
4267
464
  }
4268
408
  default:
4269
408
    return SDValue();
4270
0
  }
4271
0
}
4272
4273
SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
4274
1.91k
                                              SelectionDAG &DAG) const {
4275
1.91k
  SDLoc DL(Op);
4276
1.91k
  SDValue Chain = Op.getOperand(0);
4277
1.91k
  unsigned IntrinsicID = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
4278
1.91k
  MachineFunction &MF = DAG.getMachineFunction();
4279
1.91k
4280
1.91k
  switch (IntrinsicID) {
4281
313
  case Intrinsic::amdgcn_exp: {
4282
313
    const ConstantSDNode *Tgt = cast<ConstantSDNode>(Op.getOperand(2));
4283
313
    const ConstantSDNode *En = cast<ConstantSDNode>(Op.getOperand(3));
4284
313
    const ConstantSDNode *Done = cast<ConstantSDNode>(Op.getOperand(8));
4285
313
    const ConstantSDNode *VM = cast<ConstantSDNode>(Op.getOperand(9));
4286
313
4287
313
    const SDValue Ops[] = {
4288
313
      Chain,
4289
313
      DAG.getTargetConstant(Tgt->getZExtValue(), DL, MVT::i8), // tgt
4290
313
      DAG.getTargetConstant(En->getZExtValue(), DL, MVT::i8),  // en
4291
313
      Op.getOperand(4), // src0
4292
313
      Op.getOperand(5), // src1
4293
313
      Op.getOperand(6), // src2
4294
313
      Op.getOperand(7), // src3
4295
313
      DAG.getTargetConstant(0, DL, MVT::i1), // compr
4296
313
      DAG.getTargetConstant(VM->getZExtValue(), DL, MVT::i1)
4297
313
    };
4298
313
4299
313
    unsigned Opc = Done->isNullValue() ?
4300
313
      AMDGPUISD::EXPORT : AMDGPUISD::EXPORT_DONE;
4301
313
    return DAG.getNode(Opc, DL, Op->getVTList(), Ops);
4302
1.91k
  }
4303
93
  case Intrinsic::amdgcn_exp_compr: {
4304
93
    const ConstantSDNode *Tgt = cast<ConstantSDNode>(Op.getOperand(2));
4305
93
    const ConstantSDNode *En = cast<ConstantSDNode>(Op.getOperand(3));
4306
93
    SDValue Src0 = Op.getOperand(4);
4307
93
    SDValue Src1 = Op.getOperand(5);
4308
93
    const ConstantSDNode *Done = cast<ConstantSDNode>(Op.getOperand(6));
4309
93
    const ConstantSDNode *VM = cast<ConstantSDNode>(Op.getOperand(7));
4310
93
4311
93
    SDValue Undef = DAG.getUNDEF(MVT::f32);
4312
93
    const SDValue Ops[] = {
4313
93
      Chain,
4314
93
      DAG.getTargetConstant(Tgt->getZExtValue(), DL, MVT::i8), // tgt
4315
93
      DAG.getTargetConstant(En->getZExtValue(), DL, MVT::i8),  // en
4316
93
      DAG.getNode(ISD::BITCAST, DL, MVT::f32, Src0),
4317
93
      DAG.getNode(ISD::BITCAST, DL, MVT::f32, Src1),
4318
93
      Undef, // src2
4319
93
      Undef, // src3
4320
93
      DAG.getTargetConstant(1, DL, MVT::i1), // compr
4321
93
      DAG.getTargetConstant(VM->getZExtValue(), DL, MVT::i1)
4322
93
    };
4323
93
4324
93
    unsigned Opc = Done->isNullValue() ?
4325
93
      AMDGPUISD::EXPORT : AMDGPUISD::EXPORT_DONE;
4326
93
    return DAG.getNode(Opc, DL, Op->getVTList(), Ops);
4327
1.91k
  }
4328
24
  case Intrinsic::amdgcn_s_sendmsg:
4329
24
  case Intrinsic::amdgcn_s_sendmsghalt: {
4330
24
    unsigned NodeOp = (IntrinsicID == Intrinsic::amdgcn_s_sendmsg) ?
4331
24
      AMDGPUISD::SENDMSG : AMDGPUISD::SENDMSGHALT;
4332
24
    Chain = copyToM0(DAG, Chain, DL, Op.getOperand(3));
4333
24
    SDValue Glue = Chain.getValue(1);
4334
24
    return DAG.getNode(NodeOp, DL, MVT::Other, Chain,
4335
24
                       Op.getOperand(2), Glue);
4336
24
  }
4337
2
  case Intrinsic::amdgcn_init_exec: {
4338
2
    return DAG.getNode(AMDGPUISD::INIT_EXEC, DL, MVT::Other, Chain,
4339
2
                       Op.getOperand(2));
4340
24
  }
4341
4
  case Intrinsic::amdgcn_init_exec_from_input: {
4342
4
    return DAG.getNode(AMDGPUISD::INIT_EXEC_FROM_INPUT, DL, MVT::Other, Chain,
4343
4
                       Op.getOperand(2), Op.getOperand(3));
4344
24
  }
4345
30
  case AMDGPUIntrinsic::AMDGPU_kill: {
4346
30
    SDValue Src = Op.getOperand(2);
4347
30
    if (const ConstantFPSDNode *K = dyn_cast<ConstantFPSDNode>(Src)) {
4348
11
      if (!K->isNegative())
4349
4
        return Chain;
4350
7
4351
7
      SDValue NegOne = DAG.getTargetConstant(FloatToBits(-1.0f), DL, MVT::i32);
4352
7
      return DAG.getNode(AMDGPUISD::KILL, DL, MVT::Other, Chain, NegOne);
4353
7
    }
4354
19
4355
19
    SDValue Cast = DAG.getNode(ISD::BITCAST, DL, MVT::i32, Src);
4356
19
    return DAG.getNode(AMDGPUISD::KILL, DL, MVT::Other, Chain, Cast);
4357
19
  }
4358
135
  case Intrinsic::amdgcn_s_barrier: {
4359
135
    if (getTargetMachine().getOptLevel() > CodeGenOpt::None) {
4360
127
      const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
4361
127
      unsigned WGSize = ST.getFlatWorkGroupSizes(*MF.getFunction()).second;
4362
127
      if (WGSize <= ST.getWavefrontSize())
4363
5
        return SDValue(DAG.getMachineNode(AMDGPU::WAVE_BARRIER, DL, MVT::Other,
4364
5
                                          Op.getOperand(0)), 0);
4365
130
    }
4366
130
    return SDValue();
4367
0
  };
4368
14
  case AMDGPUIntrinsic::SI_tbuffer_store: {
4369
14
4370
14
    // Extract vindex and voffset from vaddr as appropriate
4371
14
    const ConstantSDNode *OffEn = cast<ConstantSDNode>(Op.getOperand(10));
4372
14
    const ConstantSDNode *IdxEn = cast<ConstantSDNode>(Op.getOperand(11));
4373
14
    SDValue VAddr = Op.getOperand(5);
4374
14
4375
14
    SDValue Zero = DAG.getTargetConstant(0, DL, MVT::i32);
4376
14
4377
14
    assert(!(OffEn->isOne() && IdxEn->isOne()) &&
4378
14
           "Legacy intrinsic doesn't support both offset and index - use new version");
4379
14
4380
14
    SDValue VIndex = IdxEn->isOne() ? VAddr : Zero;
4381
14
    SDValue VOffset = OffEn->isOne() ? VAddr : Zero;
4382
14
4383
14
    // Deal with the vec-3 case
4384
14
    const ConstantSDNode *NumChannels = cast<ConstantSDNode>(Op.getOperand(4));
4385
14
    auto Opcode = NumChannels->getZExtValue() == 3 ?
4386
14
      AMDGPUISD::TBUFFER_STORE_FORMAT_X3 : AMDGPUISD::TBUFFER_STORE_FORMAT;
4387
14
4388
14
    SDValue Ops[] = {
4389
14
     Chain,
4390
14
     Op.getOperand(3),  // vdata
4391
14
     Op.getOperand(2),  // rsrc
4392
14
     VIndex,
4393
14
     VOffset,
4394
14
     Op.getOperand(6),  // soffset
4395
14
     Op.getOperand(7),  // inst_offset
4396
14
     Op.getOperand(8),  // dfmt
4397
14
     Op.getOperand(9),  // nfmt
4398
14
     Op.getOperand(12), // glc
4399
14
     Op.getOperand(13), // slc
4400
14
    };
4401
14
4402
14
    assert((cast<ConstantSDNode>(Op.getOperand(14)))->getZExtValue() == 0 &&
4403
14
           "Value of tfe other than zero is unsupported");
4404
14
4405
14
    EVT VT = Op.getOperand(3).getValueType();
4406
14
    MachineMemOperand *MMO = MF.getMachineMemOperand(
4407
14
      MachinePointerInfo(),
4408
14
      MachineMemOperand::MOStore,
4409
14
      VT.getStoreSize(), 4);
4410
14
    return DAG.getMemIntrinsicNode(Opcode, DL,
4411
14
                                   Op->getVTList(), Ops, VT, MMO);
4412
0
  }
4413
0
4414
32
  case Intrinsic::amdgcn_tbuffer_store: {
4415
32
    SDValue Ops[] = {
4416
32
      Chain,
4417
32
      Op.getOperand(2),  // vdata
4418
32
      Op.getOperand(3),  // rsrc
4419
32
      Op.getOperand(4),  // vindex
4420
32
      Op.getOperand(5),  // voffset
4421
32
      Op.getOperand(6),  // soffset
4422
32
      Op.getOperand(7),  // offset
4423
32
      Op.getOperand(8),  // dfmt
4424
32
      Op.getOperand(9),  // nfmt
4425
32
      Op.getOperand(10), // glc
4426
32
      Op.getOperand(11)  // slc
4427
32
    };
4428
32
    EVT VT = Op.getOperand(3).getValueType();
4429
32
    MachineMemOperand *MMO = MF.getMachineMemOperand(
4430
32
      MachinePointerInfo(),
4431
32
      MachineMemOperand::MOStore,
4432
32
      VT.getStoreSize(), 4);
4433
32
    return DAG.getMemIntrinsicNode(AMDGPUISD::TBUFFER_STORE_FORMAT, DL,
4434
32
                                   Op->getVTList(), Ops, VT, MMO);
4435
0
  }
4436
0
4437
1.26k
  default:
4438
1.26k
    return Op;
4439
0
  }
4440
0
}
4441
4442
75.2k
SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
4443
75.2k
  SDLoc DL(Op);
4444
75.2k
  LoadSDNode *Load = cast<LoadSDNode>(Op);
4445
75.2k
  ISD::LoadExtType ExtType = Load->getExtensionType();
4446
75.2k
  EVT MemVT = Load->getMemoryVT();
4447
75.2k
4448
75.2k
  if (ExtType == ISD::NON_EXTLOAD && MemVT.getSizeInBits() < 32) {
4449
3.18k
    if (MemVT == MVT::i16 && isTypeLegal(MVT::i16))
4450
2.90k
      return SDValue();
4451
286
4452
286
    // FIXME: Copied from PPC
4453
286
    // First, load into 32 bits, then truncate to 1 bit.
4454
286
4455
286
    SDValue Chain = Load->getChain();
4456
286
    SDValue BasePtr = Load->getBasePtr();
4457
286
    MachineMemOperand *MMO = Load->getMemOperand();
4458
286
4459
286
    EVT RealMemVT = (MemVT == MVT::i1) ? MVT::i8 : MVT::i16;
4460
3.18k
4461
3.18k
    SDValue NewLD = DAG.getExtLoad(ISD::EXTLOAD, DL, MVT::i32, Chain,
4462
3.18k
                                   BasePtr, RealMemVT, MMO);
4463
3.18k
4464
3.18k
    SDValue Ops[] = {
4465
3.18k
      DAG.getNode(ISD::TRUNCATE, DL, MemVT, NewLD),
4466
3.18k
      NewLD.getValue(1)
4467
3.18k
    };
4468
3.18k
4469
3.18k
    return DAG.getMergeValues(Ops, DL);
4470
3.18k
  }
4471
72.0k
4472
72.0k
  
  if (!MemVT.isVector())
4473
0
    return SDValue();
4474
72.0k
4475
72.0k
  assert(Op.getValueType().getVectorElementType() == MVT::i32 &&
4476
72.0k
         "Custom lowering for non-i32 vectors hasn't been implemented.");
4477
72.0k
4478
72.0k
  unsigned AS = Load->getAddressSpace();
4479
72.0k
  if (!allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), MemVT,
4480
72.0k
                          AS, Load->getAlignment())) {
4481
0
    SDValue Ops[2];
4482
0
    std::tie(Ops[0], Ops[1]) = expandUnalignedLoad(Load, DAG);
4483
0
    return DAG.getMergeValues(Ops, DL);
4484
0
  }
4485
72.0k
4486
72.0k
  MachineFunction &MF = DAG.getMachineFunction();
4487
72.0k
  SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
4488
72.0k
  // If there is a possibility that flat instructions access scratch memory
4489
72.0k
  // then we need to use the same legalization rules we use for private.
4490
72.0k
  if (AS == AMDGPUASI.FLAT_ADDRESS)
4491
24
    AS = MFI->hasFlatScratchInit() ?
4492
24
         AMDGPUASI.PRIVATE_ADDRESS : AMDGPUASI.GLOBAL_ADDRESS;
4493
72.0k
4494
72.0k
  unsigned NumElements = MemVT.getVectorNumElements();
4495
72.0k
  if (AS == AMDGPUASI.CONSTANT_ADDRESS) {
4496
52.6k
    if (isMemOpUniform(Load))
4497
52.3k
      return SDValue();
4498
19.6k
    // Non-uniform loads will be selected to MUBUF instructions, so they
4499
19.6k
    // have the same legalization requirements as global and private
4500
19.6k
    // loads.
4501
19.6k
    //
4502
19.6k
  }
4503
19.6k
  
  if (AS == AMDGPUASI.CONSTANT_ADDRESS || AS == AMDGPUASI.GLOBAL_ADDRESS) {
4504
12.6k
    if (Subtarget->getScalarizeGlobalBehavior() && isMemOpUniform(Load) &&
4505
12.6k
        !Load->isVolatile() && isMemOpHasNoClobberedMemOperand(Load))
4506
589
      return SDValue();
4507
19.1k
    // Non-uniform loads will be selected to MUBUF instructions, so they
4508
19.1k
    // have the same legalization requirements as global and private
4509
19.1k
    // loads.
4510
19.1k
    //
4511
19.1k
  }
4512
19.1k
  
  if (AS == AMDGPUASI.CONSTANT_ADDRESS || AS == AMDGPUASI.GLOBAL_ADDRESS ||
4513
19.1k
      AS == AMDGPUASI.FLAT_ADDRESS) {
4514
12.0k
    if (NumElements > 4)
4515
1.19k
      return SplitVectorLoad(Op, DAG);
4516
10.8k
    // v4 loads are supported for private and global memory.
4517
10.8k
    return SDValue();
4518
10.8k
  }
4519
7.02k
  
  if (AS == AMDGPUASI.PRIVATE_ADDRESS) {
4520
347
    // Depending on the setting of the private_element_size field in the
4521
347
    // resource descriptor, we can only make private accesses up to a certain
4522
347
    // size.
4523
347
    switch (Subtarget->getMaxPrivateElementSize()) {
4524
174
    case 4:
4525
174
      return scalarizeVectorLoad(Load, DAG);
4526
53
    case 8:
4527
53
      if (NumElements > 2)
4528
5
        return SplitVectorLoad(Op, DAG);
4529
48
      return SDValue();
4530
120
    case 16:
4531
120
      // Same as global/flat
4532
120
      if (NumElements > 4)
4533
1
        return SplitVectorLoad(Op, DAG);
4534
119
      return SDValue();
4535
0
    default:
4536
0
      llvm_unreachable("unsupported private_element_size");
4537
7.02k
    }
4538
6.67k
  } else if (AS == AMDGPUASI.LOCAL_ADDRESS) {
4539
6.67k
    if (NumElements > 2)
4540
640
      return SplitVectorLoad(Op, DAG);
4541
6.03k
4542
6.03k
    
    if (NumElements == 2)
4543
6.03k
      return SDValue();
4544
0
4545
0
    // If properly aligned, splitting might let us use ds_read_b64.
4546
0
    return SplitVectorLoad(Op, DAG);
4547
0
  }
4548
0
  return SDValue();
4549
0
}
4550
4551
1.70k
SDValue SITargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
4552
1.70k
  if (Op.getValueType() != MVT::i64)
4553
0
    return SDValue();
4554
1.70k
4555
1.70k
  SDLoc DL(Op);
4556
1.70k
  SDValue Cond = Op.getOperand(0);
4557
1.70k
4558
1.70k
  SDValue Zero = DAG.getConstant(0, DL, MVT::i32);
4559
1.70k
  SDValue One = DAG.getConstant(1, DL, MVT::i32);
4560
1.70k
4561
1.70k
  SDValue LHS = DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, Op.getOperand(1));
4562
1.70k
  SDValue RHS = DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, Op.getOperand(2));
4563
1.70k
4564
1.70k
  SDValue Lo0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, LHS, Zero);
4565
1.70k
  SDValue Lo1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, RHS, Zero);
4566
1.70k
4567
1.70k
  SDValue Lo = DAG.getSelect(DL, MVT::i32, Cond, Lo0, Lo1);
4568
1.70k
4569
1.70k
  SDValue Hi0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, LHS, One);
4570
1.70k
  SDValue Hi1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, RHS, One);
4571
1.70k
4572
1.70k
  SDValue Hi = DAG.getSelect(DL, MVT::i32, Cond, Hi0, Hi1);
4573
1.70k
4574
1.70k
  SDValue Res = DAG.getBuildVector(MVT::v2i32, DL, {Lo, Hi});
4575
1.70k
  return DAG.getNode(ISD::BITCAST, DL, MVT::i64, Res);
4576
1.70k
}
4577
4578
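A scalar C++ rendering of the decomposition above, offered only as an illustration (select_i64 is a made-up name, not from the source):

#include <cstdint>

// i64 select lowered as two i32 selects plus a recombine, mirroring the
// bitcast / extract_vector_elt / select / build_vector sequence above.
uint64_t select_i64(bool Cond, uint64_t TVal, uint64_t FVal) {
  uint32_t Lo = Cond ? static_cast<uint32_t>(TVal) : static_cast<uint32_t>(FVal);
  uint32_t Hi = Cond ? static_cast<uint32_t>(TVal >> 32)
                     : static_cast<uint32_t>(FVal >> 32);
  return (static_cast<uint64_t>(Hi) << 32) | Lo;
}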
// Catch division cases where we can use shortcuts with rcp and rsq
4579
// instructions.
4580
SDValue SITargetLowering::lowerFastUnsafeFDIV(SDValue Op,
4581
203
                                              SelectionDAG &DAG) const {
4582
203
  SDLoc SL(Op);
4583
203
  SDValue LHS = Op.getOperand(0);
4584
203
  SDValue RHS = Op.getOperand(1);
4585
203
  EVT VT = Op.getValueType();
4586
203
  const SDNodeFlags Flags = Op->getFlags();
4587
203
  bool Unsafe = DAG.getTarget().Options.UnsafeFPMath ||
4588
203
                Flags.hasUnsafeAlgebra() || Flags.hasAllowReciprocal();
4589
203
4590
203
  if (!Unsafe && VT == MVT::f32 && Subtarget->hasFP32Denormals())
4591
7
    return SDValue();
4592
196
4593
196
  
  if (const ConstantFPSDNode *CLHS = dyn_cast<ConstantFPSDNode>(LHS)) {
4594
69
    if (Unsafe || VT == MVT::f32 || VT == MVT::f16) {
4595
69
      if (CLHS->isExactlyValue(1.0)) {
4596
53
        // v_rcp_f32 and v_rsq_f32 do not support denormals, and according to
4597
53
        // the CI documentation, have a worst-case error of 1 ulp.
4598
53
        // OpenCL requires <= 2.5 ulp for 1.0 / x, so it should always be OK to
4599
53
        // use it as long as we aren't trying to use denormals.
4600
53
        //
4601
53
        // v_rcp_f16 and v_rsq_f16 DO support denormals.
4602
53
4603
53
        // 1.0 / sqrt(x) -> rsq(x)
4604
53
4605
53
        // XXX - Is UnsafeFPMath sufficient to do this for f64? The maximum ULP
4606
53
        // error seems really high at 2^29 ULP.
4607
53
        if (RHS.getOpcode() == ISD::FSQRT)
4608
6
          return DAG.getNode(AMDGPUISD::RSQ, SL, VT, RHS.getOperand(0));
4609
47
4610
47
        // 1.0 / x -> rcp(x)
4611
47
        return DAG.getNode(AMDGPUISD::RCP, SL, VT, RHS);
4612
47
      }
4613
16
4614
16
      // Same as for 1.0, but expand the sign out of the constant.
4615
16
      
      if (CLHS->isExactlyValue(-1.0)) {
4616
16
        // -1.0 / x -> rcp (fneg x)
4617
16
        SDValue FNegRHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
4618
16
        return DAG.getNode(AMDGPUISD::RCP, SL, VT, FNegRHS);
4619
16
      }
4620
127
    }
4621
69
  }
4622
127
4623
127
  
  if (Unsafe) {
4624
80
    // Turn into multiply by the reciprocal.
4625
80
    // x / y -> x * (1.0 / y)
4626
80
    SDValue Recip = DAG.getNode(AMDGPUISD::RCP, SL, VT, RHS);
4627
80
    return DAG.getNode(ISD::FMUL, SL, VT, LHS, Recip, Flags);
4628
80
  }
4629
47
4630
47
  return SDValue();
4631
47
}
4632
4633
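Summarized in scalar form, the shortcuts the function above recognizes are roughly the following (rcp and rsq stand for the hardware approximations; a sketch, not a definitive statement of the lowering):

//    1.0 / sqrt(x)  ->  rsq(x)
//   -1.0 / x        ->  rcp(fneg(x))      // sign folded into the operand
//    1.0 / x        ->  rcp(x)
//    a / b          ->  a * rcp(b)        // only when unsafe-math flags allow it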
static SDValue getFPBinOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL,
4634
52
                          EVT VT, SDValue A, SDValue B, SDValue GlueChain) {
4635
52
  if (GlueChain->getNumValues() <= 1) {
4636
7
    return DAG.getNode(Opcode, SL, VT, A, B);
4637
7
  }
4638
45
4639
52
  assert(GlueChain->getNumValues() == 3);
4640
45
4641
45
  SDVTList VTList = DAG.getVTList(VT, MVT::Other, MVT::Glue);
4642
45
  switch (Opcode) {
4643
0
  
  default: llvm_unreachable("no chain equivalent for opcode");
4644
45
  case ISD::FMUL:
4645
45
    Opcode = AMDGPUISD::FMUL_W_CHAIN;
4646
45
    break;
4647
45
  }
4648
45
4649
45
  return DAG.getNode(Opcode, SL, VTList, GlueChain.getValue(1), A, B,
4650
45
                     GlueChain.getValue(2));
4651
45
}
4652
4653
static SDValue getFPTernOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL,
4654
                           EVT VT, SDValue A, SDValue B, SDValue C,
4655
260
                           SDValue GlueChain) {
4656
260
  if (GlueChain->getNumValues() <= 1) {
4657
35
    return DAG.getNode(Opcode, SL, VT, A, B, C);
4658
35
  }
4659
225
4660
260
  assert(GlueChain->getNumValues() == 3);
4661
225
4662
225
  SDVTList VTList = DAG.getVTList(VT, MVT::Other, MVT::Glue);
4663
225
  switch (Opcode) {
4664
0
  
  default: llvm_unreachable("no chain equivalent for opcode");
4665
225
  case ISD::FMA:
4666
225
    Opcode = AMDGPUISD::FMA_W_CHAIN;
4667
225
    break;
4668
225
  }
4669
225
4670
225
  return DAG.getNode(Opcode, SL, VTList, GlueChain.getValue(1), A, B, C,
4671
225
                     GlueChain.getValue(2));
4672
225
}
4673
4674
24
SDValue SITargetLowering::LowerFDIV16(SDValue Op, SelectionDAG &DAG) const {
4675
24
  if (SDValue FastLowered = lowerFastUnsafeFDIV(Op, DAG))
4676
22
    return FastLowered;
4677
2
4678
2
  SDLoc SL(Op);
4679
2
  SDValue Src0 = Op.getOperand(0);
4680
2
  SDValue Src1 = Op.getOperand(1);
4681
2
4682
2
  SDValue CvtSrc0 = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src0);
4683
2
  SDValue CvtSrc1 = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src1);
4684
2
4685
2
  SDValue RcpSrc1 = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, CvtSrc1);
4686
2
  SDValue Quot = DAG.getNode(ISD::FMUL, SL, MVT::f32, CvtSrc0, RcpSrc1);
4687
2
4688
2
  SDValue FPRoundFlag = DAG.getTargetConstant(0, SL, MVT::i32);
4689
2
  SDValue BestQuot = DAG.getNode(ISD::FP_ROUND, SL, MVT::f16, Quot, FPRoundFlag);
4690
2
4691
2
  return DAG.getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f16, BestQuot, Src1, Src0);
4692
2
}
4693
4694
// Faster 2.5 ULP division that does not support denormals.
4695
4
SDValue SITargetLowering::lowerFDIV_FAST(SDValue Op, SelectionDAG &DAG) const {
4696
4
  SDLoc SL(Op);
4697
4
  SDValue LHS = Op.getOperand(1);
4698
4
  SDValue RHS = Op.getOperand(2);
4699
4
4700
4
  SDValue r1 = DAG.getNode(ISD::FABS, SL, MVT::f32, RHS);
4701
4
4702
4
  const APFloat K0Val(BitsToFloat(0x6f800000));
4703
4
  const SDValue K0 = DAG.getConstantFP(K0Val, SL, MVT::f32);
4704
4
4705
4
  const APFloat K1Val(BitsToFloat(0x2f800000));
4706
4
  const SDValue K1 = DAG.getConstantFP(K1Val, SL, MVT::f32);
4707
4
4708
4
  const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f32);
4709
4
4710
4
  EVT SetCCVT =
4711
4
    getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f32);
4712
4
4713
4
  SDValue r2 = DAG.getSetCC(SL, SetCCVT, r1, K0, ISD::SETOGT);
4714
4
4715
4
  SDValue r3 = DAG.getNode(ISD::SELECT, SL, MVT::f32, r2, K1, One);
4716
4
4717
4
  // TODO: Should this propagate fast-math-flags?
4718
4
  r1 = DAG.getNode(ISD::FMUL, SL, MVT::f32, RHS, r3);
4719
4
4720
4
  // rcp does not support denormals.
4721
4
  SDValue r0 = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, r1);
4722
4
4723
4
  SDValue Mul = DAG.getNode(ISD::FMUL, SL, MVT::f32, LHS, r0);
4724
4
4725
4
  return DAG.getNode(ISD::FMUL, SL, MVT::f32, r3, Mul);
4726
4
}
4727
4728
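The two magic constants above decode to 2^96 (0x6f800000) and 2^-32 (0x2f800000). A standalone sketch of the scaling they implement, assuming a caller-supplied Rcp approximation (fdivFast and Rcp are illustrative names, not from the source):

#include <cmath>

// Approximate a / b to ~2.5 ulp without relying on denormal support in rcp:
// when |b| is very large, pre-scale it by 2^-32 so rcp(b * s) stays in a
// safe range, then multiply the quotient by the same factor afterwards.
float fdivFast(float A, float B, float (*Rcp)(float)) {
  const float K0 = 0x1.0p96f;   // 0x6f800000
  const float K1 = 0x1.0p-32f;  // 0x2f800000
  float S = (std::fabs(B) > K0) ? K1 : 1.0f;
  return S * (A * Rcp(B * S));
}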
172
SDValue SITargetLowering::LowerFDIV32(SDValue Op, SelectionDAG &DAG) const {
4729
172
  if (SDValue FastLowered = lowerFastUnsafeFDIV(Op, DAG))
4730
120
    return FastLowered;
4731
52
4732
52
  SDLoc SL(Op);
4733
52
  SDValue LHS = Op.getOperand(0);
4734
52
  SDValue RHS = Op.getOperand(1);
4735
52
4736
52
  const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f32);
4737
52
4738
52
  SDVTList ScaleVT = DAG.getVTList(MVT::f32, MVT::i1);
4739
52
4740
52
  SDValue DenominatorScaled = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT,
4741
52
                                          RHS, RHS, LHS);
4742
52
  SDValue NumeratorScaled = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT,
4743
52
                                        LHS, RHS, LHS);
4744
52
4745
52
  // Denominator is scaled to not be denormal, so using rcp is ok.
4746
52
  SDValue ApproxRcp = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32,
4747
52
                                  DenominatorScaled);
4748
52
  SDValue NegDivScale0 = DAG.getNode(ISD::FNEG, SL, MVT::f32,
4749
52
                                     DenominatorScaled);
4750
52
4751
52
  const unsigned Denorm32Reg = AMDGPU::Hwreg::ID_MODE |
4752
52
                               (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) |
4753
52
                               (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_);
4754
52
4755
52
  const SDValue BitField = DAG.getTargetConstant(Denorm32Reg, SL, MVT::i16);
4756
52
4757
52
  if (!Subtarget->hasFP32Denormals()) {
4758
45
    SDVTList BindParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);
4759
45
    const SDValue EnableDenormValue = DAG.getConstant(FP_DENORM_FLUSH_NONE,
4760
45
                                                      SL, MVT::i32);
4761
45
    SDValue EnableDenorm = DAG.getNode(AMDGPUISD::SETREG, SL, BindParamVTs,
4762
45
                                       DAG.getEntryNode(),
4763
45
                                       EnableDenormValue, BitField);
4764
45
    SDValue Ops[3] = {
4765
45
      NegDivScale0,
4766
45
      EnableDenorm.getValue(0),
4767
45
      EnableDenorm.getValue(1)
4768
45
    };
4769
45
4770
45
    NegDivScale0 = DAG.getMergeValues(Ops, SL);
4771
45
  }
4772
52
4773
52
  SDValue Fma0 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, NegDivScale0,
4774
52
                             ApproxRcp, One, NegDivScale0);
4775
52
4776
52
  SDValue Fma1 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, Fma0, ApproxRcp,
4777
52
                             ApproxRcp, Fma0);
4778
52
4779
52
  SDValue Mul = getFPBinOp(DAG, ISD::FMUL, SL, MVT::f32, NumeratorScaled,
4780
52
                           Fma1, Fma1);
4781
52
4782
52
  SDValue Fma2 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, NegDivScale0, Mul,
4783
52
                             NumeratorScaled, Mul);
4784
52
4785
52
  SDValue Fma3 = getFPTernOp(DAG, ISD::FMA,SL, MVT::f32, Fma2, Fma1, Mul, Fma2);
4786
52
4787
52
  SDValue Fma4 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, NegDivScale0, Fma3,
4788
52
                             NumeratorScaled, Fma3);
4789
52
4790
52
  if (!Subtarget->hasFP32Denormals()) {
4791
45
    const SDValue DisableDenormValue =
4792
45
        DAG.getConstant(FP_DENORM_FLUSH_IN_FLUSH_OUT, SL, MVT::i32);
4793
45
    SDValue DisableDenorm = DAG.getNode(AMDGPUISD::SETREG, SL, MVT::Other,
4794
45
                                        Fma4.getValue(1),
4795
45
                                        DisableDenormValue,
4796
45
                                        BitField,
4797
45
                                        Fma4.getValue(2));
4798
45
4799
45
    SDValue OutputChain = DAG.getNode(ISD::TokenFactor, SL, MVT::Other,
4800
45
                                      DisableDenorm, DAG.getRoot());
4801
45
    DAG.setRoot(OutputChain);
4802
45
  }
4803
172
4804
172
  SDValue Scale = NumeratorScaled.getValue(1);
4805
172
  SDValue Fmas = DAG.getNode(AMDGPUISD::DIV_FMAS, SL, MVT::f32,
4806
172
                             Fma4, Fma1, Fma3, Scale);
4807
172
4808
172
  return DAG.getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f32, Fmas, RHS, LHS);
4809
172
}
4810
4811
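Written with scalar names, the FMA chain above is one reciprocal refinement followed by one quotient refinement; a sketch of the math only (n, d, r stand for the scaled numerator, scaled denominator, and initial reciprocal estimate):

//   e0 = fma(-d, r, 1.0);   // Fma0: error of the reciprocal estimate
//   r1 = fma(e0, r, r);     // Fma1: refined reciprocal
//   q  = n * r1;            // Mul : initial quotient
//   e1 = fma(-d, q, n);     // Fma2: remainder of that quotient
//   q1 = fma(e1, r1, q);    // Fma3: refined quotient
//   e2 = fma(-d, q1, n);    // Fma4: final remainder, handed to div_fmas
//                           //       together with r1 and q1, then div_fixup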
68
SDValue SITargetLowering::LowerFDIV64(SDValue Op, SelectionDAG &DAG) const {
4812
68
  if (DAG.getTarget().Options.UnsafeFPMath)
4813
7
    return lowerFastUnsafeFDIV(Op, DAG);
4814
61
4815
61
  SDLoc SL(Op);
4816
61
  SDValue X = Op.getOperand(0);
4817
61
  SDValue Y = Op.getOperand(1);
4818
61
4819
61
  const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f64);
4820
61
4821
61
  SDVTList ScaleVT = DAG.getVTList(MVT::f64, MVT::i1);
4822
61
4823
61
  SDValue DivScale0 = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, Y, Y, X);
4824
61
4825
61
  SDValue NegDivScale0 = DAG.getNode(ISD::FNEG, SL, MVT::f64, DivScale0);
4826
61
4827
61
  SDValue Rcp = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f64, DivScale0);
4828
61
4829
61
  SDValue Fma0 = DAG.getNode(ISD::FMA, SL, MVT::f64, NegDivScale0, Rcp, One);
4830
61
4831
61
  SDValue Fma1 = DAG.getNode(ISD::FMA, SL, MVT::f64, Rcp, Fma0, Rcp);
4832
61
4833
61
  SDValue Fma2 = DAG.getNode(ISD::FMA, SL, MVT::f64, NegDivScale0, Fma1, One);
4834
61
4835
61
  SDValue DivScale1 = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, X, Y, X);
4836
61
4837
61
  SDValue Fma3 = DAG.getNode(ISD::FMA, SL, MVT::f64, Fma1, Fma2, Fma1);
4838
61
  SDValue Mul = DAG.getNode(ISD::FMUL, SL, MVT::f64, DivScale1, Fma3);
4839
61
4840
61
  SDValue Fma4 = DAG.getNode(ISD::FMA, SL, MVT::f64,
4841
61
                             NegDivScale0, Mul, DivScale1);
4842
61
4843
61
  SDValue Scale;
4844
61
4845
61
  if (Subtarget->getGeneration() == SISubtarget::SOUTHERN_ISLANDS) {
4846
23
    // Workaround a hardware bug on SI where the condition output from div_scale
4847
23
    // is not usable.
4848
23
4849
23
    const SDValue Hi = DAG.getConstant(1, SL, MVT::i32);
4850
23
4851
23
    // Figure out which scale to use for div_fmas.
4852
23
    SDValue NumBC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, X);
4853
23
    SDValue DenBC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Y);
4854
23
    SDValue Scale0BC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, DivScale0);
4855
23
    SDValue Scale1BC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, DivScale1);
4856
23
4857
23
    SDValue NumHi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, NumBC, Hi);
4858
23
    SDValue DenHi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, DenBC, Hi);
4859
23
4860
23
    SDValue Scale0Hi
4861
23
      = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Scale0BC, Hi);
4862
23
    SDValue Scale1Hi
4863
23
      = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Scale1BC, Hi);
4864
23
4865
23
    SDValue CmpDen = DAG.getSetCC(SL, MVT::i1, DenHi, Scale0Hi, ISD::SETEQ);
4866
23
    SDValue CmpNum = DAG.getSetCC(SL, MVT::i1, NumHi, Scale1Hi, ISD::SETEQ);
4867
23
    Scale = DAG.getNode(ISD::XOR, SL, MVT::i1, CmpNum, CmpDen);
4868
61
  } else {
4869
38
    Scale = DivScale1.getValue(1);
4870
38
  }
4871
68
4872
68
  SDValue Fmas = DAG.getNode(AMDGPUISD::DIV_FMAS, SL, MVT::f64,
4873
68
                             Fma4, Fma3, Mul, Scale);
4874
68
4875
68
  return DAG.getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f64, Fmas, Y, X);
4876
68
}
4877
4878
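Restating the SI workaround above in scalar terms (hi32 denotes the upper dword of the f64 bit pattern; a sketch, not a claim about the hardware flag semantics):

//   bool CmpDen = (hi32(Y) == hi32(DivScale0));   // CmpDen above
//   bool CmpNum = (hi32(X) == hi32(DivScale1));   // CmpNum above
//   bool Scale  = CmpNum ^ CmpDen;                // stands in for the div_scale
//                                                 // condition that SI cannot use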
264
SDValue SITargetLowering::LowerFDIV(SDValue Op, SelectionDAG &DAG) const {
4879
264
  EVT VT = Op.getValueType();
4880
264
4881
264
  if (VT == MVT::f32)
4882
172
    return LowerFDIV32(Op, DAG);
4883
92
4884
92
  
  if (VT == MVT::f64)
4885
68
    return LowerFDIV64(Op, DAG);
4886
24
4887
24
  
  if (VT == MVT::f16)
4888
24
    return LowerFDIV16(Op, DAG);
4889
0
4890
0
  
  llvm_unreachable("Unexpected type for fdiv");
4891
0
}
4892
4893
62.8k
SDValue SITargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
4894
62.8k
  SDLoc DL(Op);
4895
62.8k
  StoreSDNode *Store = cast<StoreSDNode>(Op);
4896
62.8k
  EVT VT = Store->getMemoryVT();
4897
62.8k
4898
62.8k
  if (VT == MVT::i1) {
4899
207
    return DAG.getTruncStore(Store->getChain(), DL,
4900
207
       DAG.getSExtOrTrunc(Store->getValue(), DL, MVT::i32),
4901
207
       Store->getBasePtr(), MVT::i1, Store->getMemOperand());
4902
207
  }
4903
62.6k
4904
62.8k
  assert(VT.isVector() &&
4905
62.6k
         Store->getValue().getValueType().getScalarType() == MVT::i32);
4906
62.6k
4907
62.6k
  unsigned AS = Store->getAddressSpace();
4908
62.6k
  if (!allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), VT,
4909
62.6k
                          AS, Store->getAlignment())) {
4910
24
    return expandUnalignedStore(Store, DAG);
4911
24
  }
4912
62.5k
4913
62.5k
  MachineFunction &MF = DAG.getMachineFunction();
4914
62.5k
  SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
4915
62.5k
  // If there is a possibility that flat instructions access scratch memory
4916
62.5k
  // then we need to use the same legalization rules we use for private.
4917
62.5k
  if (AS == AMDGPUASI.FLAT_ADDRESS)
4918
244
    AS = MFI->hasFlatScratchInit() ?
4919
244
         AMDGPUASI.PRIVATE_ADDRESS : AMDGPUASI.GLOBAL_ADDRESS;
4920
62.5k
4921
62.5k
  unsigned NumElements = VT.getVectorNumElements();
4922
62.5k
  if (AS == AMDGPUASI.GLOBAL_ADDRESS ||
4923
62.5k
      
AS == AMDGPUASI.FLAT_ADDRESS23.4k
) {
4924
39.1k
    if (NumElements > 4)
4925
3.80k
      return SplitVectorStore(Op, DAG);
4926
35.3k
    return SDValue();
4927
23.4k
  } else if (AS == AMDGPUASI.PRIVATE_ADDRESS) {
4928
559
    switch (Subtarget->getMaxPrivateElementSize()) {
4929
279
    case 4:
4930
279
      return scalarizeVectorStore(Store, DAG);
4931
126
    case 8:
4932
126
      if (NumElements > 2)
4933
10
        return SplitVectorStore(Op, DAG);
4934
116
      return SDValue();
4935
154
    case 16:
4936
154
      if (NumElements > 4)
4937
2
        return SplitVectorStore(Op, DAG);
4938
152
      return SDValue();
4939
0
    default:
4940
0
      llvm_unreachable("unsupported private_element_size");
4941
23.4k
    }
4942
22.8k
  } else if (AS == AMDGPUASI.LOCAL_ADDRESS) {
4943
22.8k
    if (NumElements > 2)
4944
2.10k
      return SplitVectorStore(Op, DAG);
4945
20.7k
4946
20.7k
    
    if (NumElements == 2)
4947
20.7k
      return Op;
4948
0
4949
0
    // If properly aligned, splitting might let us use ds_write_b64.
4950
0
    return SplitVectorStore(Op, DAG);
4951
0
  } else {
4952
0
    llvm_unreachable("unhandled address space");
4953
23.4k
  }
4954
0
}
4955
4956
51
SDValue SITargetLowering::LowerTrig(SDValue Op, SelectionDAG &DAG) const {
4957
51
  SDLoc DL(Op);
4958
51
  EVT VT = Op.getValueType();
4959
51
  SDValue Arg = Op.getOperand(0);
4960
51
  // TODO: Should this propagate fast-math-flags?
4961
51
  SDValue FractPart = DAG.getNode(AMDGPUISD::FRACT, DL, VT,
4962
51
                                  DAG.getNode(ISD::FMUL, DL, VT, Arg,
4963
51
                                              DAG.getConstantFP(0.5/M_PI, DL,
4964
51
                                                                VT)));
4965
51
4966
51
  switch (Op.getOpcode()) {
4967
24
  case ISD::FCOS:
4968
24
    return DAG.getNode(AMDGPUISD::COS_HW, SDLoc(Op), VT, FractPart);
4969
27
  case ISD::FSIN:
4970
27
    return DAG.getNode(AMDGPUISD::SIN_HW, SDLoc(Op), VT, FractPart);
4971
0
  default:
4972
0
    llvm_unreachable("Wrong trig opcode");
4973
0
  }
4974
0
}
4975
4976
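The 0.5/M_PI multiply plus FRACT above converts radians into the normalized argument the SIN_HW/COS_HW nodes take. A hedged scalar sketch, assuming the hardware evaluates one period over [0, 1) (lowerSinScalar and SinHW are illustrative names, not from the source):

#include <cmath>

// sin(x) lowered as sin_hw(fract(x * 0.5/PI)), under the assumption that
// sin_hw(t) computes sin(2*PI*t) for t in [0, 1).
float lowerSinScalar(float X, float (*SinHW)(float)) {
  float T = X * static_cast<float>(0.5 / M_PI);
  return SinHW(T - std::floor(T));  // FRACT(T)
}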
251
SDValue SITargetLowering::LowerATOMIC_CMP_SWAP(SDValue Op, SelectionDAG &DAG) const {
4977
251
  AtomicSDNode *AtomicNode = cast<AtomicSDNode>(Op);
4978
251
  assert(AtomicNode->isCompareAndSwap());
4979
251
  unsigned AS = AtomicNode->getAddressSpace();
4980
251
4981
251
  // No custom lowering required for local address space
4982
251
  if (!isFlatGlobalAddrSpace(AS, AMDGPUASI))
4983
56
    return Op;
4984
195
4985
195
  // Non-local address space requires custom lowering for atomic compare
4986
195
  // and swap; cmp and swap should be in a v2i32 or v2i64 in case of _X2
4987
195
  SDLoc DL(Op);
4988
195
  SDValue ChainIn = Op.getOperand(0);
4989
195
  SDValue Addr = Op.getOperand(1);
4990
195
  SDValue Old = Op.getOperand(2);
4991
195
  SDValue New = Op.getOperand(3);
4992
195
  EVT VT = Op.getValueType();
4993
195
  MVT SimpleVT = VT.getSimpleVT();
4994
195
  MVT VecType = MVT::getVectorVT(SimpleVT, 2);
4995
195
4996
195
  SDValue NewOld = DAG.getBuildVector(VecType, DL, {New, Old});
4997
195
  SDValue Ops[] = { ChainIn, Addr, NewOld };
4998
195
4999
195
  return DAG.getMemIntrinsicNode(AMDGPUISD::ATOMIC_CMP_SWAP, DL, Op->getVTList(),
5000
195
                                 Ops, VT, AtomicNode->getMemOperand());
5001
195
}
5002
5003
//===----------------------------------------------------------------------===//
5004
// Custom DAG optimizations
5005
//===----------------------------------------------------------------------===//
5006
5007
SDValue SITargetLowering::performUCharToFloatCombine(SDNode *N,
5008
1.00k
                                                     DAGCombinerInfo &DCI) const {
5009
1.00k
  EVT VT = N->getValueType(0);
5010
1.00k
  EVT ScalarVT = VT.getScalarType();
5011
1.00k
  if (ScalarVT != MVT::f32)
5012
234
    return SDValue();
5013
768
5014
768
  SelectionDAG &DAG = DCI.DAG;
5015
768
  SDLoc DL(N);
5016
768
5017
768
  SDValue Src = N->getOperand(0);
5018
768
  EVT SrcVT = Src.getValueType();
5019
768
5020
768
  // TODO: We could try to match extracting the higher bytes, which would be
5021
768
  // easier if i8 vectors weren't promoted to i32 vectors, particularly after
5022
768
  // types are legalized. v4i8 -> v4f32 is probably the only case to worry
5023
768
  // about in practice.
5024
768
  if (DCI.isAfterLegalizeVectorOps() && SrcVT == MVT::i32) {
5025
290
    if (DAG.MaskedValueIsZero(Src, APInt::getHighBitsSet(32, 24))) {
5026
106
      SDValue Cvt = DAG.getNode(AMDGPUISD::CVT_F32_UBYTE0, DL, VT, Src);
5027
106
      DCI.AddToWorklist(Cvt.getNode());
5028
106
      return Cvt;
5029
106
    }
5030
662
  }
5031
662
5032
662
  return SDValue();
5033
662
}
5034
5035
/// \brief Return true if the given offset Size in bytes can be folded into
5036
/// the immediate offsets of a memory instruction for the given address space.
5037
static bool canFoldOffset(unsigned OffsetSize, unsigned AS,
5038
0
                          const SISubtarget &STI) {
5039
0
  auto AMDGPUASI = STI.getAMDGPUAS();
5040
0
  if (AS == AMDGPUASI.GLOBAL_ADDRESS) {
5041
0
    // MUBUF instructions have a 12-bit offset in bytes.
5042
0
    return isUInt<12>(OffsetSize);
5043
0
  }
5044
0
  if (AS == AMDGPUASI.CONSTANT_ADDRESS) {
5045
0
    // SMRD instructions have an 8-bit offset in dwords on SI and
5046
0
    // a 20-bit offset in bytes on VI.
5047
0
    if (STI.getGeneration() >= SISubtarget::VOLCANIC_ISLANDS)
5048
0
      return isUInt<20>(OffsetSize);
5049
0
    else
5050
0
      return (OffsetSize % 4 == 0) && isUInt<8>(OffsetSize / 4);
5051
0
  }
5052
0
  if (AS == AMDGPUASI.LOCAL_ADDRESS ||
5053
0
      AS == AMDGPUASI.REGION_ADDRESS) {
5054
0
    // The single offset versions have a 16-bit offset in bytes.
5055
0
    return isUInt<16>(OffsetSize);
5056
0
  }
5057
0
  // Indirect register addressing does not use any offsets.
5058
0
  return false;
5059
0
}
5060
5061
// (shl (add x, c1), c2) -> add (shl x, c2), (shl c1, c2)
5062
5063
// This is a variant of
5064
// (mul (add x, c1), c2) -> add (mul x, c2), (mul c1, c2),
5065
//
5066
// The normal DAG combiner will do this, but only if the add has one use since
5067
// that would increase the number of instructions.
5068
//
5069
// This prevents us from seeing a constant offset that can be folded into a
5070
// memory instruction's addressing mode. If we know the resulting add offset of
5071
// a pointer can be folded into an addressing offset, we can replace the pointer
5072
// operand with the add of new constant offset. This eliminates one of the uses,
5073
// and may allow the remaining use to also be simplified.
5074
//
5075
SDValue SITargetLowering::performSHLPtrCombine(SDNode *N,
5076
                                               unsigned AddrSpace,
5077
111
                                               DAGCombinerInfo &DCI) const {
5078
111
  SDValue N0 = N->getOperand(0);
5079
111
  SDValue N1 = N->getOperand(1);
5080
111
5081
111
  if (N0.getOpcode() != ISD::ADD)
5082
69
    return SDValue();
5083
42
5084
42
  const ConstantSDNode *CN1 = dyn_cast<ConstantSDNode>(N1);
5085
42
  if (!CN1)
5086
0
    return SDValue();
5087
42
5088
42
  const ConstantSDNode *CAdd = dyn_cast<ConstantSDNode>(N0.getOperand(1));
5089
42
  if (!CAdd)
5090
42
    return SDValue();
5091
0
5092
0
  // If the resulting offset is too large, we can't fold it into the addressing
5093
0
  // mode offset.
5094
0
  APInt Offset = CAdd->getAPIntValue() << CN1->getAPIntValue();
5095
0
  if (!canFoldOffset(Offset.getZExtValue(), AddrSpace, *getSubtarget()))
5096
0
    return SDValue();
5097
0
5098
0
  SelectionDAG &DAG = DCI.DAG;
5099
0
  SDLoc SL(N);
5100
0
  EVT VT = N->getValueType(0);
5101
0
5102
0
  SDValue ShlX = DAG.getNode(ISD::SHL, SL, VT, N0.getOperand(0), N1);
5103
0
  SDValue COffset = DAG.getConstant(Offset, SL, MVT::i32);
5104
0
5105
0
  return DAG.getNode(ISD::ADD, SL, VT, ShlX, COffset);
5106
0
}
5107
5108
SDValue SITargetLowering::performMemSDNodeCombine(MemSDNode *N,
5109
321k
                                                  DAGCombinerInfo &DCI) const {
5110
321k
  SDValue Ptr = N->getBasePtr();
5111
321k
  SelectionDAG &DAG = DCI.DAG;
5112
321k
  SDLoc SL(N);
5113
321k
5114
321k
  // TODO: We could also do this for multiplies.
5115
321k
  unsigned AS = N->getAddressSpace();
5116
321k
  if (Ptr.getOpcode() == ISD::SHL && AS != AMDGPUASI.PRIVATE_ADDRESS) {
5117
111
    SDValue NewPtr = performSHLPtrCombine(Ptr.getNode(), AS, DCI);
5118
111
    if (NewPtr) {
5119
0
      SmallVector<SDValue, 8> NewOps(N->op_begin(), N->op_end());
5120
0
5121
0
      NewOps[N->getOpcode() == ISD::STORE ? 2 : 1] = NewPtr;
5122
0
      return SDValue(DAG.UpdateNodeOperands(N, NewOps), 0);
5123
0
    }
5124
321k
  }
5125
321k
5126
321k
  return SDValue();
5127
321k
}
5128
5129
2.82k
static bool bitOpWithConstantIsReducible(unsigned Opc, uint32_t Val) {
5130
1.96k
  return (Opc == ISD::AND && (Val == 0 || Val == 0xffffffff)) ||
5131
1.81k
         (Opc == ISD::OR && (Val == 0xffffffff || Val == 0)) ||
5132
1.76k
         (Opc == ISD::XOR && Val == 0);
5133
2.82k
}
5134
5135
// Break up 64-bit bit operation of a constant into two 32-bit and/or/xor. This
5136
// will typically happen anyway for a VALU 64-bit and. This exposes other 32-bit
5137
// integer combine opportunities since most 64-bit operations are decomposed
5138
// this way.  TODO: We won't want this for SALU especially if it is an inline
5139
// immediate.
5140
SDValue SITargetLowering::splitBinaryBitConstantOp(
5141
  DAGCombinerInfo &DCI,
5142
  const SDLoc &SL,
5143
  unsigned Opc, SDValue LHS,
5144
1.57k
  const ConstantSDNode *CRHS) const {
5145
1.57k
  uint64_t Val = CRHS->getZExtValue();
5146
1.57k
  uint32_t ValLo = Lo_32(Val);
5147
1.57k
  uint32_t ValHi = Hi_32(Val);
5148
1.57k
  const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
5149
1.57k
5150
1.57k
    if ((bitOpWithConstantIsReducible(Opc, ValLo) ||
5151
1.25k
         bitOpWithConstantIsReducible(Opc, ValHi)) ||
5152
1.57k
        (CRHS->hasOneUse() && !TII->isInlineConstant(CRHS->getAPIntValue()))) {
5153
1.13k
    // If we need to materialize a 64-bit immediate, it will be split up later
5154
1.13k
    // anyway. Avoid creating the harder to understand 64-bit immediate
5155
1.13k
    // materialization.
5156
1.13k
    return splitBinaryBitConstantOpImpl(DCI, SL, Opc, LHS, ValLo, ValHi);
5157
1.13k
  }
5158
442
5159
442
  return SDValue();
5160
442
}
5161
5162
// Returns true if argument is a boolean value which is not serialized into
5163
// memory or argument and does not require v_cndmask_b32 to be deserialized.
5164
85
static bool isBoolSGPR(SDValue V) {
5165
85
  if (V.getValueType() != MVT::i1)
5166
49
    return false;
5167
36
  switch (V.getOpcode()) {
5168
13
  default: break;
5169
23
  case ISD::SETCC:
5170
23
  case ISD::AND:
5171
23
  case ISD::OR:
5172
23
  case ISD::XOR:
5173
23
  case AMDGPUISD::FP_CLASS:
5174
23
    return true;
5175
13
  }
5176
13
  return false;
5177
13
}
5178
5179
SDValue SITargetLowering::performAndCombine(SDNode *N,
5180
28.0k
                                            DAGCombinerInfo &DCI) const {
5181
28.0k
  if (DCI.isBeforeLegalize())
5182
711
    return SDValue();
5183
27.3k
5184
27.3k
  SelectionDAG &DAG = DCI.DAG;
5185
27.3k
  EVT VT = N->getValueType(0);
5186
27.3k
  SDValue LHS = N->getOperand(0);
5187
27.3k
  SDValue RHS = N->getOperand(1);
5188
27.3k
5189
27.3k
5190
27.3k
  const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(RHS);
5191
27.3k
  if (VT == MVT::i64 && CRHS) {
5192
1.13k
    if (SDValue Split
5193
1.13k
        = splitBinaryBitConstantOp(DCI, SDLoc(N), ISD::AND, LHS, CRHS))
5194
1.05k
      return Split;
5195
26.2k
  }
5196
26.2k
5197
26.2k
  if (CRHS && VT == MVT::i32) {
5198
22.2k
    // and (srl x, c), mask => shl (bfe x, nb + c, mask >> nb), nb
5199
22.2k
    // nb = number of trailing zeroes in mask
5200
22.2k
    // It can be optimized out using SDWA for GFX8+ in the SDWA peephole pass,
5201
22.2k
    // given that we are selecting 8 or 16 bit fields starting at byte boundary.
5202
22.2k
    uint64_t Mask = CRHS->getZExtValue();
5203
22.2k
    unsigned Bits = countPopulation(Mask);
5204
22.2k
    if (getSubtarget()->hasSDWA() && LHS->getOpcode() == ISD::SRL &&
5205
22.2k
        (Bits == 8 || Bits == 16) && isShiftedMask_64(Mask) && !(Mask & 1)) {
5206
71
      if (auto *CShift = dyn_cast<ConstantSDNode>(LHS->getOperand(1))) {
5207
71
        unsigned Shift = CShift->getZExtValue();
5208
71
        unsigned NB = CRHS->getAPIntValue().countTrailingZeros();
5209
71
        unsigned Offset = NB + Shift;
5210
71
        if ((Offset & (Bits - 1)) == 0) { // Starts at a byte or word boundary.
5211
71
          SDLoc SL(N);
5212
71
          SDValue BFE = DAG.getNode(AMDGPUISD::BFE_U32, SL, MVT::i32,
5213
71
                                    LHS->getOperand(0),
5214
71
                                    DAG.getConstant(Offset, SL, MVT::i32),
5215
71
                                    DAG.getConstant(Bits, SL, MVT::i32));
5216
71
          EVT NarrowVT = EVT::getIntegerVT(*DAG.getContext(), Bits);
5217
71
          SDValue Ext = DAG.getNode(ISD::AssertZext, SL, VT, BFE,
5218
71
                                    DAG.getValueType(NarrowVT));
5219
71
          SDValue Shl = DAG.getNode(ISD::SHL, SDLoc(LHS), VT, Ext,
5220
71
                                    DAG.getConstant(NB, SDLoc(CRHS), MVT::i32));
5221
71
          return Shl;
5222
71
        }
5223
26.2k
      }
5224
71
    }
5225
22.2k
  }
5226
26.2k
5227
26.2k
  // (and (fcmp ord x, x), (fcmp une (fabs x), inf)) ->
5228
26.2k
  // fp_class x, ~(s_nan | q_nan | n_infinity | p_infinity)
5229
26.2k
  if (LHS.getOpcode() == ISD::SETCC && RHS.getOpcode() == ISD::SETCC) {
5230
89
    ISD::CondCode LCC = cast<CondCodeSDNode>(LHS.getOperand(2))->get();
5231
89
    ISD::CondCode RCC = cast<CondCodeSDNode>(RHS.getOperand(2))->get();
5232
89
5233
89
    SDValue X = LHS.getOperand(0);
5234
89
    SDValue Y = RHS.getOperand(0);
5235
89
    if (Y.getOpcode() != ISD::FABS || Y.getOperand(0) != X)
5236
81
      return SDValue();
5237
8
5238
8
    if (LCC == ISD::SETO) {
5239
6
      if (X != LHS.getOperand(1))
5240
0
        return SDValue();
5241
6
5242
6
      if (RCC == ISD::SETUNE) {
5243
4
        const ConstantFPSDNode *C1 = dyn_cast<ConstantFPSDNode>(RHS.getOperand(1));
5244
4
        if (!C1 || !C1->isInfinity() || C1->isNegative())
5245
2
          return SDValue();
5246
2
5247
2
        const uint32_t Mask = SIInstrFlags::N_NORMAL |
5248
2
                              SIInstrFlags::N_SUBNORMAL |
5249
2
                              SIInstrFlags::N_ZERO |
5250
2
                              SIInstrFlags::P_ZERO |
5251
2
                              SIInstrFlags::P_SUBNORMAL |
5252
2
                              SIInstrFlags::P_NORMAL;
5253
2
5254
2
        static_assert(((~(SIInstrFlags::S_NAN |
5255
2
                          SIInstrFlags::Q_NAN |
5256
2
                          SIInstrFlags::N_INFINITY |
5257
2
                          SIInstrFlags::P_INFINITY)) & 0x3ff) == Mask,
5258
2
                      "mask not equal");
5259
2
5260
2
        SDLoc DL(N);
5261
2
        return DAG.getNode(AMDGPUISD::FP_CLASS, DL, MVT::i1,
5262
2
                           X, DAG.getConstant(Mask, DL, MVT::i32));
5263
2
      }
5264
6
    }
5265
89
  }
5266
26.1k
5267
26.1k
  if (VT == MVT::i32 &&
5268
26.1k
      (RHS.getOpcode() == ISD::SIGN_EXTEND || LHS.getOpcode() == ISD::SIGN_EXTEND)) {
5269
11
    // and x, (sext cc from i1) => select cc, x, 0
5270
11
    if (RHS.getOpcode() != ISD::SIGN_EXTEND)
5271
10
      std::swap(LHS, RHS);
5272
11
    if (isBoolSGPR(RHS.getOperand(0)))
5273
11
      return DAG.getSelect(SDLoc(N), MVT::i32, RHS.getOperand(0),
5274
11
                           LHS, DAG.getConstant(0, SDLoc(N), MVT::i32));
5275
26.1k
  }
5276
26.1k
5277
26.1k
  return SDValue();
5278
26.1k
}
5279
5280
SDValue SITargetLowering::performOrCombine(SDNode *N,
5281
16.8k
                                           DAGCombinerInfo &DCI) const {
5282
16.8k
  SelectionDAG &DAG = DCI.DAG;
5283
16.8k
  SDValue LHS = N->getOperand(0);
5284
16.8k
  SDValue RHS = N->getOperand(1);
5285
16.8k
5286
16.8k
  EVT VT = N->getValueType(0);
5287
16.8k
  if (VT == MVT::i1) {
5288
69
    // or (fp_class x, c1), (fp_class x, c2) -> fp_class x, (c1 | c2)
5289
69
    if (LHS.getOpcode() == AMDGPUISD::FP_CLASS &&
5290
69
        RHS.getOpcode() == AMDGPUISD::FP_CLASS) {
5291
14
      SDValue Src = LHS.getOperand(0);
5292
14
      if (Src != RHS.getOperand(0))
5293
1
        return SDValue();
5294
13
5295
13
      const ConstantSDNode *CLHS = dyn_cast<ConstantSDNode>(LHS.getOperand(1));
5296
13
      const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(RHS.getOperand(1));
5297
13
      if (!CLHS || !CRHS)
5298
0
        return SDValue();
5299
13
5300
13
      // Only 10 bits are used.
5301
13
      static const uint32_t MaxMask = 0x3ff;
5302
13
5303
13
      uint32_t NewMask = (CLHS->getZExtValue() | CRHS->getZExtValue()) & MaxMask;
5304
13
      SDLoc DL(N);
5305
13
      return DAG.getNode(AMDGPUISD::FP_CLASS, DL, MVT::i1,
5306
13
                         Src, DAG.getConstant(NewMask, DL, MVT::i32));
5307
13
    }
5308
55
5309
55
    return SDValue();
5310
55
  }
5311
16.8k
5312
16.8k
  if (VT != MVT::i64)
5313
14.2k
    return SDValue();
5314
2.60k
5315
2.60k
  // TODO: This could be a generic combine with a predicate for extracting the
5316
2.60k
  // high half of an integer being free.
5317
2.60k
5318
2.60k
  // (or i64:x, (zero_extend i32:y)) ->
5319
2.60k
  //   i64 (bitcast (v2i32 build_vector (or i32:y, lo_32(x)), hi_32(x)))
5320
2.60k
  if (LHS.getOpcode() == ISD::ZERO_EXTEND &&
5321
517
      RHS.getOpcode() != ISD::ZERO_EXTEND)
5322
517
    std::swap(LHS, RHS);
5323
2.60k
5324
2.60k
  if (RHS.getOpcode() == ISD::ZERO_EXTEND) {
5325
1.70k
    SDValue ExtSrc = RHS.getOperand(0);
5326
1.70k
    EVT SrcVT = ExtSrc.getValueType();
5327
1.70k
    if (SrcVT == MVT::i32) {
5328
1.70k
      SDLoc SL(N);
5329
1.70k
      SDValue LowLHS, HiBits;
5330
1.70k
      std::tie(LowLHS, HiBits) = split64BitValue(LHS, DAG);
5331
1.70k
      SDValue LowOr = DAG.getNode(ISD::OR, SL, MVT::i32, LowLHS, ExtSrc);
5332
1.70k
5333
1.70k
      DCI.AddToWorklist(LowOr.getNode());
5334
1.70k
      DCI.AddToWorklist(HiBits.getNode());
5335
1.70k
5336
1.70k
      SDValue Vec = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32,
5337
1.70k
                                LowOr, HiBits);
5338
1.70k
      return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Vec);
5339
1.70k
    }
5340
906
  }
5341
906
5342
906
  const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(N->getOperand(1));
5343
906
  if (CRHS) {
5344
161
    if (SDValue Split
5345
161
          = splitBinaryBitConstantOp(DCI, SDLoc(N), ISD::OR, LHS, CRHS))
5346
51
      return Split;
5347
855
  }
5348
855
5349
855
  return SDValue();
5350
855
}
5351
5352
SDValue SITargetLowering::performXorCombine(SDNode *N,
5353
1.24k
                                            DAGCombinerInfo &DCI) const {
5354
1.24k
  EVT VT = N->getValueType(0);
5355
1.24k
  if (VT != MVT::i64)
5356
721
    return SDValue();
5357
520
5358
520
  SDValue LHS = N->getOperand(0);
5359
520
  SDValue RHS = N->getOperand(1);
5360
520
5361
520
  const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(RHS);
5362
520
  if (CRHS) {
5363
278
    if (SDValue Split
5364
278
          = splitBinaryBitConstantOp(DCI, SDLoc(N), ISD::XOR, LHS, CRHS))
5365
26
      return Split;
5366
494
  }
5367
494
5368
494
  return SDValue();
5369
494
}
5370
5371
// Instructions that will be lowered with a final instruction that zeros the
5372
// high result bits.
5373
// XXX - probably only need to list legal operations.
5374
201
static bool fp16SrcZerosHighBits(unsigned Opc) {
5375
201
  switch (Opc) {
5376
155
  case ISD::FADD:
5377
155
  case ISD::FSUB:
5378
155
  case ISD::FMUL:
5379
155
  case ISD::FDIV:
5380
155
  case ISD::FREM:
5381
155
  case ISD::FMA:
5382
155
  case ISD::FMAD:
5383
155
  case ISD::FCANONICALIZE:
5384
155
  case ISD::FP_ROUND:
5385
155
  case ISD::UINT_TO_FP:
5386
155
  case ISD::SINT_TO_FP:
5387
155
  case ISD::FABS:
5388
155
    // Fabs is lowered to a bit operation, but it's an and which will clear the
5389
155
    // high bits anyway.
5390
155
  case ISD::FSQRT:
5391
155
  case ISD::FSIN:
5392
155
  case ISD::FCOS:
5393
155
  case ISD::FPOWI:
5394
155
  case ISD::FPOW:
5395
155
  case ISD::FLOG:
5396
155
  case ISD::FLOG2:
5397
155
  case ISD::FLOG10:
5398
155
  case ISD::FEXP:
5399
155
  case ISD::FEXP2:
5400
155
  case ISD::FCEIL:
5401
155
  case ISD::FTRUNC:
5402
155
  case ISD::FRINT:
5403
155
  case ISD::FNEARBYINT:
5404
155
  case ISD::FROUND:
5405
155
  case ISD::FFLOOR:
5406
155
  case ISD::FMINNUM:
5407
155
  case ISD::FMAXNUM:
5408
155
  case AMDGPUISD::FRACT:
5409
155
  case AMDGPUISD::CLAMP:
5410
155
  case AMDGPUISD::COS_HW:
5411
155
  case AMDGPUISD::SIN_HW:
5412
155
  case AMDGPUISD::FMIN3:
5413
155
  case AMDGPUISD::FMAX3:
5414
155
  case AMDGPUISD::FMED3:
5415
155
  case AMDGPUISD::FMAD_FTZ:
5416
155
  case AMDGPUISD::RCP:
5417
155
  case AMDGPUISD::RSQ:
5418
155
  case AMDGPUISD::LDEXP:
5419
155
    return true;
5420
46
  default:
5421
46
    // fcopysign, select and others may be lowered to 32-bit bit operations
5422
46
    // which don't zero the high bits.
5423
46
    return false;
5424
0
  }
5425
0
}
5426
5427
SDValue SITargetLowering::performZeroExtendCombine(SDNode *N,
5428
12.7k
                                                   DAGCombinerInfo &DCI) const {
5429
12.7k
  if (!Subtarget->has16BitInsts() ||
5430
8.47k
      DCI.getDAGCombineLevel() < AfterLegalizeDAG)
5431
9.94k
    return SDValue();
5432
2.81k
5433
2.81k
  EVT VT = N->getValueType(0);
5434
2.81k
  if (VT != MVT::i32)
5435
1.46k
    return SDValue();
5436
1.34k
5437
1.34k
  SDValue Src = N->getOperand(0);
5438
1.34k
  if (Src.getValueType() != MVT::i16)
5439
168
    return SDValue();
5440
1.17k
5441
1.17k
  // (i32 zext (i16 (bitcast f16:$src))) -> fp16_zext $src
5442
1.17k
  // FIXME: It is not universally true that the high bits are zeroed on gfx9.
5443
1.17k
  if (Src.getOpcode() == ISD::BITCAST) {
5444
201
    SDValue BCSrc = Src.getOperand(0);
5445
201
    if (BCSrc.getValueType() == MVT::f16 &&
5446
201
        fp16SrcZerosHighBits(BCSrc.getOpcode()))
5447
155
      return DCI.DAG.getNode(AMDGPUISD::FP16_ZEXT, SDLoc(N), VT, BCSrc);
5448
1.02k
  }
5449
1.02k
5450
1.02k
  return SDValue();
5451
1.02k
}
5452
5453
SDValue SITargetLowering::performClassCombine(SDNode *N,
5454
74
                                              DAGCombinerInfo &DCI) const {
5455
74
  SelectionDAG &DAG = DCI.DAG;
5456
74
  SDValue Mask = N->getOperand(1);
5457
74
5458
74
  // fp_class x, 0 -> false
5459
74
  if (const ConstantSDNode *CMask = dyn_cast<ConstantSDNode>(Mask)) {
5460
51
    if (CMask->isNullValue())
5461
2
      return DAG.getConstant(0, SDLoc(N), MVT::i1);
5462
72
  }
5463
72
5464
72
  if (N->getOperand(0).isUndef())
5465
2
    return DAG.getUNDEF(MVT::i1);
5466
70
5467
70
  return SDValue();
5468
70
}
5469
5470
61
static bool isKnownNeverSNan(SelectionDAG &DAG, SDValue Op) {
5471
61
  if (!DAG.getTargetLoweringInfo().hasFloatingPointExceptions())
5472
32
    return true;
5473
29
5474
29
  return DAG.isKnownNeverNaN(Op);
5475
29
}
5476
5477
static bool isCanonicalized(SelectionDAG &DAG, SDValue Op,
5478
413
                            const SISubtarget *ST, unsigned MaxDepth=5) {
5479
413
  // If source is a result of another standard FP operation it is already in
5480
413
  // canonical form.
5481
413
5482
413
  switch (Op.getOpcode()) {
5483
187
  default:
5484
187
    break;
5485
413
5486
413
  // These will flush denorms if required.
5487
58
  case ISD::FADD:
5488
58
  case ISD::FSUB:
5489
58
  case ISD::FMUL:
5490
58
  case ISD::FSQRT:
5491
58
  case ISD::FCEIL:
5492
58
  case ISD::FFLOOR:
5493
58
  case ISD::FMA:
5494
58
  case ISD::FMAD:
5495
58
5496
58
  case ISD::FCANONICALIZE:
5497
58
    return true;
5498
58
5499
20
  case ISD::FP_ROUND:
5500
20
    return Op.getValueType().getScalarType() != MVT::f16 ||
5501
16
           ST->hasFP16Denormals();
5502
58
5503
8
  case ISD::FP_EXTEND:
5504
8
    return Op.getOperand(0).getValueType().getScalarType() != MVT::f16 ||
5505
4
           ST->hasFP16Denormals();
5506
58
5507
0
  case ISD::FP16_TO_FP:
5508
0
  case ISD::FP_TO_FP16:
5509
0
    return ST->hasFP16Denormals();
5510
0
5511
0
  // It can/will be lowered or combined as a bit operation.
5512
0
  // Need to check their input recursively to handle.
5513
68
  case ISD::FNEG:
5514
68
  case ISD::FABS:
5515
68
    return (MaxDepth > 0) &&
5516
68
           isCanonicalized(DAG, Op.getOperand(0), ST, MaxDepth - 1);
5517
68
5518
16
  case ISD::FSIN:
5519
16
  case ISD::FCOS:
5520
16
  case ISD::FSINCOS:
5521
16
    return Op.getValueType().getScalarType() != MVT::f16;
5522
16
5523
16
  // In pre-GFX9 targets V_MIN_F32 and others do not flush denorms.
5524
16
  // For such targets need to check their input recursively.
5525
44
  case ISD::FMINNUM:
5526
44
  case ISD::FMAXNUM:
5527
44
  case ISD::FMINNAN:
5528
44
  case ISD::FMAXNAN:
5529
44
5530
44
    if (ST->supportsMinMaxDenormModes() &&
5531
22
        DAG.isKnownNeverNaN(Op.getOperand(0)) &&
5532
0
        DAG.isKnownNeverNaN(Op.getOperand(1)))
5533
0
      return true;
5534
44
5535
44
    return (MaxDepth > 0) &&
5536
44
           isCanonicalized(DAG, Op.getOperand(0), ST, MaxDepth - 1) &&
5537
12
           isCanonicalized(DAG, Op.getOperand(1), ST, MaxDepth - 1);
5538
44
5539
12
  case ISD::ConstantFP: {
5540
12
    auto F = cast<ConstantFPSDNode>(Op)->getValueAPF();
5541
12
    return !F.isDenormal() && !(F.isNaN() && F.isSignaling());
5542
187
  }
5543
187
  }
5544
187
  return false;
5545
187
}
5546
5547
// Constant fold canonicalize.
5548
SDValue SITargetLowering::performFCanonicalizeCombine(
5549
  SDNode *N,
5550
401
  DAGCombinerInfo &DCI) const {
5551
401
  SelectionDAG &DAG = DCI.DAG;
5552
401
  ConstantFPSDNode *CFP = isConstOrConstSplatFP(N->getOperand(0));
5553
401
5554
401
  if (!CFP) {
5555
301
    SDValue N0 = N->getOperand(0);
5556
301
    EVT VT = N0.getValueType().getScalarType();
5557
301
    auto ST = getSubtarget();
5558
301
5559
301
    if (((VT == MVT::f32 && ST->hasFP32Denormals()) ||
5560
262
         (VT == MVT::f64 && ST->hasFP64Denormals()) ||
5561
236
         (VT == MVT::f16 && ST->hasFP16Denormals())) &&
5562
160
        DAG.isKnownNeverNaN(N0))
5563
10
      return N0;
5564
291
5565
291
    bool IsIEEEMode = Subtarget->enableIEEEBit(DAG.getMachineFunction());
5566
291
5567
291
    if ((IsIEEEMode || isKnownNeverSNan(DAG, N0)) &&
5568
289
        isCanonicalized(DAG, N0, ST))
5569
94
      return N0;
5570
197
5571
197
    return SDValue();
5572
197
  }
5573
100
5574
100
  const APFloat &C = CFP->getValueAPF();
5575
100
5576
100
  // Flush denormals to 0 if not enabled.
5577
100
  if (C.isDenormal()) {
5578
24
    EVT VT = N->getValueType(0);
5579
24
    EVT SVT = VT.getScalarType();
5580
24
    if (SVT == MVT::f32 && !Subtarget->hasFP32Denormals())
5581
2
      return DAG.getConstantFP(0.0, SDLoc(N), VT);
5582
22
5583
22
    if (SVT == MVT::f64 && !Subtarget->hasFP64Denormals())
5584
2
      return DAG.getConstantFP(0.0, SDLoc(N), VT);
5585
20
5586
20
    if (SVT == MVT::f16 && !Subtarget->hasFP16Denormals())
5587
0
      return DAG.getConstantFP(0.0, SDLoc(N), VT);
5588
96
  }
5589
96
5590
96
  if (C.isNaN()) {
5591
42
    EVT VT = N->getValueType(0);
5592
42
    APFloat CanonicalQNaN = APFloat::getQNaN(C.getSemantics());
5593
42
    if (C.isSignaling()) {
5594
22
      // Quiet a signaling NaN.
5595
22
      return DAG.getConstantFP(CanonicalQNaN, SDLoc(N), VT);
5596
22
    }
5597
20
5598
20
    // Make sure it is the canonical NaN bitpattern.
5599
20
    //
5600
20
    // TODO: Can we use -1 as the canonical NaN value since it's an inline
5601
20
    // immediate?
5602
20
    if (C.bitcastToAPInt() != CanonicalQNaN.bitcastToAPInt())
5603
14
      return DAG.getConstantFP(CanonicalQNaN, SDLoc(N), VT);
5604
60
  }
5605
60
5606
60
  return N->getOperand(0);
5607
60
}
5608
5609
51
static unsigned minMaxOpcToMin3Max3Opc(unsigned Opc) {
5610
51
  switch (Opc) {
5611
16
  case ISD::FMAXNUM:
5612
16
    return AMDGPUISD::FMAX3;
5613
5
  case ISD::SMAX:
5614
5
    return AMDGPUISD::SMAX3;
5615
5
  case ISD::UMAX:
5616
5
    return AMDGPUISD::UMAX3;
5617
12
  case ISD::FMINNUM:
5618
12
    return AMDGPUISD::FMIN3;
5619
8
  case ISD::SMIN:
5620
8
    return AMDGPUISD::SMIN3;
5621
5
  case ISD::UMIN:
5622
5
    return AMDGPUISD::UMIN3;
5623
0
  default:
5624
0
    llvm_unreachable("Not a min/max opcode");
5625
0
  }
5626
0
}
5627
5628
SDValue SITargetLowering::performIntMed3ImmCombine(
5629
  SelectionDAG &DAG, const SDLoc &SL,
5630
152
  SDValue Op0, SDValue Op1, bool Signed) const {
5631
152
  ConstantSDNode *K1 = dyn_cast<ConstantSDNode>(Op1);
5632
152
  if (!K1)
5633
92
    return SDValue();
5634
60
5635
60
  ConstantSDNode *K0 = dyn_cast<ConstantSDNode>(Op0.getOperand(1));
5636
60
  if (!K0)
5637
3
    return SDValue();
5638
57
5639
57
  if (Signed) {
5640
48
    if (K0->getAPIntValue().sge(K1->getAPIntValue()))
5641
3
      return SDValue();
5642
9
  } else {
5643
9
    if (K0->getAPIntValue().uge(K1->getAPIntValue()))
5644
3
      return SDValue();
5645
51
  }
5646
51
5647
51
  EVT VT = K0->getValueType(0);
5648
51
  unsigned Med3Opc = Signed ? AMDGPUISD::SMED3 : AMDGPUISD::UMED3;
5649
51
  if (VT == MVT::i32 || (VT == MVT::i16 && Subtarget->hasMed3_16())) {
5650
49
    return DAG.getNode(Med3Opc, SL, VT,
5651
49
                       Op0.getOperand(0), SDValue(K0, 0), SDValue(K1, 0));
5652
49
  }
5653
2
5654
2
  // If there isn't a 16-bit med3 operation, convert to 32-bit.
5655
2
  MVT NVT = MVT::i32;
5656
2
  unsigned ExtOp = Signed ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
5657
152
5658
152
  SDValue Tmp1 = DAG.getNode(ExtOp, SL, NVT, Op0->getOperand(0));
5659
152
  SDValue Tmp2 = DAG.getNode(ExtOp, SL, NVT, Op0->getOperand(1));
5660
152
  SDValue Tmp3 = DAG.getNode(ExtOp, SL, NVT, Op1);
5661
152
5662
152
  SDValue Med3 = DAG.getNode(Med3Opc, SL, NVT, Tmp1, Tmp2, Tmp3);
5663
152
  return DAG.getNode(ISD::TRUNCATE, SL, VT, Med3);
5664
152
}
5665
5666
769
static ConstantFPSDNode *getSplatConstantFP(SDValue Op) {
5667
769
  if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Op))
5668
592
    return C;
5669
177
5670
177
  if (BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(Op)) {
5671
43
    if (ConstantFPSDNode *C = BV->getConstantFPSplatNode())
5672
41
      return C;
5673
136
  }
5674
136
5675
136
  return nullptr;
5676
136
}
5677
5678
SDValue SITargetLowering::performFPMed3ImmCombine(SelectionDAG &DAG,
5679
                                                  const SDLoc &SL,
5680
                                                  SDValue Op0,
5681
451
                                                  SDValue Op1) const {
5682
451
  ConstantFPSDNode *K1 = getSplatConstantFP(Op1);
5683
451
  if (!K1)
5684
133
    return SDValue();
5685
318
5686
318
  ConstantFPSDNode *K0 = getSplatConstantFP(Op0.getOperand(1));
5687
318
  if (!K0)
5688
3
    return SDValue();
5689
315
5690
315
  // Ordered >= (although NaN inputs should have folded away by now).
5691
315
  APFloat::cmpResult Cmp = K0->getValueAPF().compare(K1->getValueAPF());
5692
315
  if (Cmp == APFloat::cmpGreaterThan)
5693
8
    return SDValue();
5694
307
5695
307
  // TODO: Check IEEE bit enabled?
5696
307
  EVT VT = Op0.getValueType();
5697
307
  if (Subtarget->enableDX10Clamp()) {
5698
298
    // If dx10_clamp is enabled, NaNs clamp to 0.0. This is the same as the
5699
298
    // hardware fmed3 behavior converting to a min.
5700
298
    // FIXME: Should this be allowing -0.0?
5701
298
    if (K1->isExactlyValue(1.0) && K0->isExactlyValue(0.0))
5702
245
      return DAG.getNode(AMDGPUISD::CLAMP, SL, VT, Op0.getOperand(0));
5703
62
  }
5704
62
5705
62
  // med3 for f16 is only available on gfx9+, and not available for v2f16.
5706
62
  if (VT == MVT::f32 || (VT == MVT::f16 && Subtarget->hasMed3_16())) {
5707
53
    // This isn't safe with signaling NaNs because in IEEE mode, min/max on a
5708
53
    // signaling NaN gives a quiet NaN. The quiet NaN input to the min would
5709
53
    // then give the other result, which is different from med3 with a NaN
5710
53
    // input.
5711
53
    SDValue Var = Op0.getOperand(0);
5712
53
    if (!isKnownNeverSNan(DAG, Var))
5713
15
      return SDValue();
5714
38
5715
38
    return DAG.getNode(AMDGPUISD::FMED3, SL, K0->getValueType(0),
5716
38
                       Var, SDValue(K0, 0), SDValue(K1, 0));
5717
38
  }
5718
9
5719
9
  return SDValue();
5720
9
}
5721
5722
SDValue SITargetLowering::performMinMaxCombine(SDNode *N,
5723
2.90k
                                               DAGCombinerInfo &DCI) const {
5724
2.90k
  SelectionDAG &DAG = DCI.DAG;
5725
2.90k
5726
2.90k
  EVT VT = N->getValueType(0);
5727
2.90k
  unsigned Opc = N->getOpcode();
5728
2.90k
  SDValue Op0 = N->getOperand(0);
5729
2.90k
  SDValue Op1 = N->getOperand(1);
5730
2.90k
5731
2.90k
  // Only do this if the inner op has one use since this will just increases
5732
2.90k
  // register pressure for no benefit.
5733
2.90k
5734
2.90k
5735
2.90k
  if (Opc != AMDGPUISD::FMIN_LEGACY && Opc != AMDGPUISD::FMAX_LEGACY &&
5736
2.85k
      VT != MVT::f64 &&
5737
2.90k
      ((VT != MVT::f16 && VT != MVT::i16) || Subtarget->hasMin3Max3_16())) {
5738
2.53k
    // max(max(a, b), c) -> max3(a, b, c)
5739
2.53k
    // min(min(a, b), c) -> min3(a, b, c)
5740
2.53k
    if (Op0.getOpcode() == Opc && Op0.hasOneUse()) {
5741
41
      SDLoc DL(N);
5742
41
      return DAG.getNode(minMaxOpcToMin3Max3Opc(Opc),
5743
41
                         DL,
5744
41
                         N->getValueType(0),
5745
41
                         Op0.getOperand(0),
5746
41
                         Op0.getOperand(1),
5747
41
                         Op1);
5748
41
    }
5749
2.49k
5750
2.49k
    // Try commuted.
5751
2.49k
    // max(a, max(b, c)) -> max3(a, b, c)
5752
2.49k
    // min(a, min(b, c)) -> min3(a, b, c)
5753
2.49k
    if (Op1.getOpcode() == Opc && Op1.hasOneUse()) {
5754
10
      SDLoc DL(N);
5755
10
      return DAG.getNode(minMaxOpcToMin3Max3Opc(Opc),
5756
10
                         DL,
5757
10
                         N->getValueType(0),
5758
10
                         Op0,
5759
10
                         Op1.getOperand(0),
5760
10
                         Op1.getOperand(1));
5761
10
    }
5762
2.85k
  }
5763
2.85k
5764
2.85k
  // min(max(x, K0), K1), K0 < K1 -> med3(x, K0, K1)
5765
2.85k
  if (Opc == ISD::SMIN && Op0.getOpcode() == ISD::SMAX && Op0.hasOneUse()) {
5766
90
    if (SDValue Med3 = performIntMed3ImmCombine(DAG, SDLoc(N), Op0, Op1, true))
5767
45
      return Med3;
5768
2.80k
  }
5769
2.80k
5770
2.80k
  if (Opc == ISD::UMIN && Op0.getOpcode() == ISD::UMAX && Op0.hasOneUse()) {
5771
62
    if (SDValue Med3 = performIntMed3ImmCombine(DAG, SDLoc(N), Op0, Op1, false))
5772
6
      return Med3;
5773
2.80k
  }
5774
2.80k
5775
2.80k
  // fminnum(fmaxnum(x, K0), K1), K0 < K1 && !is_snan(x) -> fmed3(x, K0, K1)
5776
2.80k
  if (((Opc == ISD::FMINNUM && Op0.getOpcode() == ISD::FMAXNUM) ||
5777
2.33k
       (Opc == AMDGPUISD::FMIN_LEGACY &&
5778
2.33k
        Op0.getOpcode() == AMDGPUISD::FMAX_LEGACY)) &&
5779
466
      (VT == MVT::f32 || VT == MVT::f64 ||
5780
92
       (VT == MVT::f16 && Subtarget->has16BitInsts()) ||
5781
466
       (VT == MVT::v2f16 && Subtarget->hasVOP3PInsts())) &&
5782
2.80k
      Op0.hasOneUse()) {
5783
451
    if (SDValue Res = performFPMed3ImmCombine(DAG, SDLoc(N), Op0, Op1))
5784
283
      return Res;
5785
2.51k
  }
5786
2.51k
5787
2.51k
  return SDValue();
5788
2.51k
}
5789
5790
160
static bool isClampZeroToOne(SDValue A, SDValue B) {
5791
160
  if (ConstantFPSDNode *CA = dyn_cast<ConstantFPSDNode>(A)) {
5792
98
    if (ConstantFPSDNode *CB = dyn_cast<ConstantFPSDNode>(B)) {
5793
86
      // FIXME: Should this be allowing -0.0?
5794
37
      return (CA->isExactlyValue(0.0) && CB->isExactlyValue(1.0)) ||
5795
50
             (CA->isExactlyValue(1.0) && CB->isExactlyValue(0.0));
5796
86
    }
5797
74
  }
5798
74
5799
74
  return false;
5800
74
}
5801
5802
// FIXME: Should only worry about snans for version with chain.
5803
SDValue SITargetLowering::performFMed3Combine(SDNode *N,
5804
107
                                              DAGCombinerInfo &DCI) const {
5805
107
  EVT VT = N->getValueType(0);
5806
107
  // v_med3_f32 and v_max_f32 behave identically wrt denorms, exceptions and
5807
107
  // NaNs. With a NaN input, the order of the operands may change the result.
5808
107
5809
107
  SelectionDAG &DAG = DCI.DAG;
5810
107
  SDLoc SL(N);
5811
107
5812
107
  SDValue Src0 = N->getOperand(0);
5813
107
  SDValue Src1 = N->getOperand(1);
5814
107
  SDValue Src2 = N->getOperand(2);
5815
107
5816
107
  if (isClampZeroToOne(Src0, Src1)) {
5817
36
    // const_a, const_b, x -> clamp is safe in all cases including signaling
5818
36
    // nans.
5819
36
    // FIXME: Should this be allowing -0.0?
5820
36
    return DAG.getNode(AMDGPUISD::CLAMP, SL, VT, Src2);
5821
36
  }
5822
71
5823
71
  // FIXME: dx10_clamp behavior assumed in instcombine. Should we really bother
5824
71
  // handling no dx10-clamp?
5825
71
  if (Subtarget->enableDX10Clamp()) {
5826
53
    // If NaNs is clamped to 0, we are free to reorder the inputs.
5827
53
5828
53
    if (isa<ConstantFPSDNode>(Src0) && !isa<ConstantFPSDNode>(Src1))
5829
6
      std::swap(Src0, Src1);
5830
53
5831
53
    if (isa<ConstantFPSDNode>(Src1) && !isa<ConstantFPSDNode>(Src2))
5832
3
      std::swap(Src1, Src2);
5833
53
5834
53
    if (isa<ConstantFPSDNode>(Src0) && !isa<ConstantFPSDNode>(Src1))
5835
3
      std::swap(Src0, Src1);
5836
53
5837
53
    if (isClampZeroToOne(Src1, Src2))
5838
12
      return DAG.getNode(AMDGPUISD::CLAMP, SL, VT, Src0);
5839
59
  }
5840
59
5841
59
  return SDValue();
5842
59
}
5843
5844
SDValue SITargetLowering::performCvtPkRTZCombine(SDNode *N,
5845
133
                                                 DAGCombinerInfo &DCI) const {
5846
133
  SDValue Src0 = N->getOperand(0);
5847
133
  SDValue Src1 = N->getOperand(1);
5848
133
  if (Src0.isUndef() && Src1.isUndef())
5849
3
    return DCI.DAG.getUNDEF(N->getValueType(0));
5850
130
  return SDValue();
5851
130
}
5852
5853
SDValue SITargetLowering::performExtractVectorEltCombine(
5854
104k
  SDNode *N, DAGCombinerInfo &DCI) const {
5855
104k
  SDValue Vec = N->getOperand(0);
5856
104k
5857
104k
  SelectionDAG &DAG = DCI.DAG;
5858
104k
  if (Vec.getOpcode() == ISD::FNEG && allUsesHaveSourceMods(N)) {
5859
14
    SDLoc SL(N);
5860
14
    EVT EltVT = N->getValueType(0);
5861
14
    SDValue Idx = N->getOperand(1);
5862
14
    SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT,
5863
14
                              Vec.getOperand(0), Idx);
5864
14
    return DAG.getNode(ISD::FNEG, SL, EltVT, Elt);
5865
14
  }
5866
104k
5867
104k
  return SDValue();
5868
104k
}
5869
5870
static bool convertBuildVectorCastElt(SelectionDAG &DAG,
5871
508
                                      SDValue &Lo, SDValue &Hi) {
5872
508
  if (Hi.getOpcode() == ISD::BITCAST &&
5873
6
      Hi.getOperand(0).getValueType() == MVT::f16 &&
5874
508
      (isa<ConstantSDNode>(Lo) || Lo.isUndef())) {
5875
2
    Lo = DAG.getNode(ISD::BITCAST, SDLoc(Lo), MVT::f16, Lo);
5876
2
    Hi = Hi.getOperand(0);
5877
2
    return true;
5878
2
  }
5879
506
5880
506
  return false;
5881
506
}
5882
5883
SDValue SITargetLowering::performBuildVectorCombine(
5884
85.7k
  SDNode *N, DAGCombinerInfo &DCI) const {
5885
85.7k
  SDLoc SL(N);
5886
85.7k
5887
85.7k
  if (!isTypeLegal(MVT::v2i16))
5888
82.5k
    return SDValue();
5889
3.13k
  SelectionDAG &DAG = DCI.DAG;
5890
3.13k
  EVT VT = N->getValueType(0);
5891
3.13k
5892
3.13k
  if (VT == MVT::v2i16) {
5893
255
    SDValue Lo = N->getOperand(0);
5894
255
    SDValue Hi = N->getOperand(1);
5895
255
5896
255
    // v2i16 build_vector (const|undef), (bitcast f16:$x)
5897
255
    // -> bitcast (v2f16 build_vector const|undef, $x)
5898
255
    if (convertBuildVectorCastElt(DAG, Lo, Hi)) {
5899
2
      SDValue NewVec = DAG.getBuildVector(MVT::v2f16, SL, { Lo, Hi  });
5900
2
      return DAG.getNode(ISD::BITCAST, SL, VT, NewVec);
5901
2
    }
5902
253
5903
253
    if (convertBuildVectorCastElt(DAG, Hi, Lo)) {
5904
0
      SDValue NewVec = DAG.getBuildVector(MVT::v2f16, SL, { Hi, Lo  });
5905
0
      return DAG.getNode(ISD::BITCAST, SL, VT, NewVec);
5906
0
    }
5907
3.13k
  }
5908
3.13k
5909
3.13k
  return SDValue();
5910
3.13k
}
5911
5912
unsigned SITargetLowering::getFusedOpcode(const SelectionDAG &DAG,
5913
                                          const SDNode *N0,
5914
198
                                          const SDNode *N1) const {
5915
198
  EVT VT = N0->getValueType(0);
5916
198
5917
198
  // Only do this if we are not trying to support denormals. v_mad_f32 does not
5918
198
  // support denormals ever.
5919
198
  if ((VT == MVT::f32 && !Subtarget->hasFP32Denormals()) ||
5920
134
      (VT == MVT::f16 && !Subtarget->hasFP16Denormals()))
5921
108
    return ISD::FMAD;
5922
90
5923
90
  const TargetOptions &Options = DAG.getTarget().Options;
5924
90
  if ((Options.AllowFPOpFusion == FPOpFusion::Fast || Options.UnsafeFPMath ||
5925
45
       (N0->getFlags().hasUnsafeAlgebra() &&
5926
45
        N1->getFlags().hasUnsafeAlgebra())) &&
5927
90
      isFMAFasterThanFMulAndFAdd(VT)) {
5928
32
    return ISD::FMA;
5929
32
  }
5930
58
5931
58
  return 0;
5932
58
}
5933
5934
SDValue SITargetLowering::performAddCombine(SDNode *N,
5935
118k
                                            DAGCombinerInfo &DCI) const {
5936
118k
  SelectionDAG &DAG = DCI.DAG;
5937
118k
  EVT VT = N->getValueType(0);
5938
118k
5939
118k
  if (VT != MVT::i32)
5940
103k
    return SDValue();
5941
15.3k
5942
15.3k
  SDLoc SL(N);
5943
15.3k
  SDValue LHS = N->getOperand(0);
5944
15.3k
  SDValue RHS = N->getOperand(1);
5945
15.3k
5946
15.3k
  // add x, zext (setcc) => addcarry x, 0, setcc
5947
15.3k
  // add x, sext (setcc) => subcarry x, 0, setcc
5948
15.3k
  unsigned Opc = LHS.getOpcode();
5949
15.3k
  if (Opc == ISD::ZERO_EXTEND || Opc == ISD::SIGN_EXTEND ||
5950
15.3k
      Opc == ISD::ANY_EXTEND || Opc == ISD::ADDCARRY)
5951
39
    std::swap(RHS, LHS);
5952
15.3k
5953
15.3k
  Opc = RHS.getOpcode();
5954
15.3k
  switch (Opc) {
5955
15.2k
  default: break;
5956
58
  case ISD::ZERO_EXTEND:
5957
58
  case ISD::SIGN_EXTEND:
5958
58
  case ISD::ANY_EXTEND: {
5959
58
    auto Cond = RHS.getOperand(0);
5960
58
    if (!isBoolSGPR(Cond))
5961
49
      break;
5962
9
    SDVTList VTList = DAG.getVTList(MVT::i32, MVT::i1);
5963
9
    SDValue Args[] = { LHS, DAG.getConstant(0, SL, MVT::i32), Cond };
5964
9
    Opc = (Opc == ISD::SIGN_EXTEND) ? ISD::SUBCARRY : ISD::ADDCARRY;
5965
9
    return DAG.getNode(Opc, SL, VTList, Args);
5966
9
  }
5967
0
  case ISD::ADDCARRY: {
5968
0
    // add x, (addcarry y, 0, cc) => addcarry x, y, cc
5969
0
    auto C = dyn_cast<ConstantSDNode>(RHS.getOperand(1));
5970
0
    if (!C || C->getZExtValue() != 0) break;
5971
0
    SDValue Args[] = { LHS, RHS.getOperand(0), RHS.getOperand(2) };
5972
0
    return DAG.getNode(ISD::ADDCARRY, SDLoc(N), RHS->getVTList(), Args);
5973
0
  }
5974
15.3k
  }
5975
15.3k
  return SDValue();
5976
15.3k
}
5977
5978
SDValue SITargetLowering::performSubCombine(SDNode *N,
5979
4.56k
                                            DAGCombinerInfo &DCI) const {
5980
4.56k
  SelectionDAG &DAG = DCI.DAG;
5981
4.56k
  EVT VT = N->getValueType(0);
5982
4.56k
5983
4.56k
  if (VT != MVT::i32)
5984
2.74k
    return SDValue();
5985
1.82k
5986
1.82k
  SDLoc SL(N);
5987
1.82k
  SDValue LHS = N->getOperand(0);
5988
1.82k
  SDValue RHS = N->getOperand(1);
5989
1.82k
5990
1.82k
  unsigned Opc = LHS.getOpcode();
5991
1.82k
  if (Opc != ISD::SUBCARRY)
5992
1.82k
    std::swap(RHS, LHS);
5993
1.82k
5994
1.82k
  if (LHS.getOpcode() == ISD::SUBCARRY) {
5995
1
    // sub (subcarry x, 0, cc), y => subcarry x, y, cc
5996
1
    auto C = dyn_cast<ConstantSDNode>(LHS.getOperand(1));
5997
1
    if (!C || C->getZExtValue() != 0)
5998
0
      return SDValue();
5999
1
    SDValue Args[] = { LHS.getOperand(0), RHS, LHS.getOperand(2) };
6000
1
    return DAG.getNode(ISD::SUBCARRY, SDLoc(N), LHS->getVTList(), Args);
6001
1
  }
6002
1.82k
  return SDValue();
6003
1.82k
}
6004
6005
SDValue SITargetLowering::performAddCarrySubCarryCombine(SDNode *N,
6006
30
  DAGCombinerInfo &DCI) const {
6007
30
6008
30
  if (N->getValueType(0) != MVT::i32)
6009
0
    return SDValue();
6010
30
6011
30
  auto C = dyn_cast<ConstantSDNode>(N->getOperand(1));
6012
30
  if (!C || C->getZExtValue() != 0)
6013
16
    return SDValue();
6014
14
6015
14
  SelectionDAG &DAG = DCI.DAG;
6016
14
  SDValue LHS = N->getOperand(0);
6017
14
6018
14
  // addcarry (add x, y), 0, cc => addcarry x, y, cc
6019
14
  // subcarry (sub x, y), 0, cc => subcarry x, y, cc
6020
14
  unsigned LHSOpc = LHS.getOpcode();
6021
14
  unsigned Opc = N->getOpcode();
6022
14
  if ((LHSOpc == ISD::ADD && Opc == ISD::ADDCARRY) ||
6023
14
      (LHSOpc == ISD::SUB && Opc == ISD::SUBCARRY)) {
6024
1
    SDValue Args[] = { LHS.getOperand(0), LHS.getOperand(1), N->getOperand(2) };
6025
1
    return DAG.getNode(Opc, SDLoc(N), N->getVTList(), Args);
6026
1
  }
6027
13
  return SDValue();
6028
13
}
6029
6030
SDValue SITargetLowering::performFAddCombine(SDNode *N,
6031
5.56k
                                             DAGCombinerInfo &DCI) const {
6032
5.56k
  if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
6033
3.87k
    return SDValue();
6034
1.69k
6035
1.69k
  SelectionDAG &DAG = DCI.DAG;
6036
1.69k
  EVT VT = N->getValueType(0);
6037
1.69k
6038
1.69k
  SDLoc SL(N);
6039
1.69k
  SDValue LHS = N->getOperand(0);
6040
1.69k
  SDValue RHS = N->getOperand(1);
6041
1.69k
6042
1.69k
  // These should really be instruction patterns, but writing patterns with
6043
1.69k
  // source modifiers is a pain.
6044
1.69k
6045
1.69k
  // fadd (fadd (a, a), b) -> mad 2.0, a, b
6046
1.69k
  if (LHS.getOpcode() == ISD::FADD) {
6047
267
    SDValue A = LHS.getOperand(0);
6048
267
    if (A == LHS.getOperand(1)) {
6049
96
      unsigned FusedOp = getFusedOpcode(DAG, N, LHS.getNode());
6050
96
      if (FusedOp != 0) {
6051
64
        const SDValue Two = DAG.getConstantFP(2.0, SL, VT);
6052
64
        return DAG.getNode(FusedOp, SL, VT, A, Two, RHS);
6053
64
      }
6054
1.63k
    }
6055
267
  }
6056
1.63k
6057
1.63k
  // fadd (b, fadd (a, a)) -> mad 2.0, a, b
6058
1.63k
  if (RHS.getOpcode() == ISD::FADD) {
6059
81
    SDValue A = RHS.getOperand(0);
6060
81
    if (A == RHS.getOperand(1)) {
6061
32
      unsigned FusedOp = getFusedOpcode(DAG, N, RHS.getNode());
6062
32
      if (FusedOp != 0) {
6063
22
        const SDValue Two = DAG.getConstantFP(2.0, SL, VT);
6064
22
        return DAG.getNode(FusedOp, SL, VT, A, Two, LHS);
6065
22
      }
6066
1.61k
    }
6067
81
  }
6068
1.61k
6069
1.61k
  return SDValue();
6070
1.61k
}
6071
6072
SDValue SITargetLowering::performFSubCombine(SDNode *N,
6073
1.71k
                                             DAGCombinerInfo &DCI) const {
6074
1.71k
  if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
6075
1.18k
    return SDValue();
6076
532
6077
532
  SelectionDAG &DAG = DCI.DAG;
6078
532
  SDLoc SL(N);
6079
532
  EVT VT = N->getValueType(0);
6080
532
  assert(!VT.isVector());
6081
532
6082
532
  // Try to get the fneg to fold into the source modifier. This undoes generic
6083
532
  // DAG combines and folds them into the mad.
6084
532
  //
6085
532
  // Only do this if we are not trying to support denormals. v_mad_f32 does
6086
532
  // not support denormals ever.
6087
532
  SDValue LHS = N->getOperand(0);
6088
532
  SDValue RHS = N->getOperand(1);
6089
532
  if (LHS.getOpcode() == ISD::FADD) {
6090
45
    // (fsub (fadd a, a), c) -> mad 2.0, a, (fneg c)
6091
45
    SDValue A = LHS.getOperand(0);
6092
45
    if (A == LHS.getOperand(1)) {
6093
26
      unsigned FusedOp = getFusedOpcode(DAG, N, LHS.getNode());
6094
26
      if (FusedOp != 0) {
6095
19
        const SDValue Two = DAG.getConstantFP(2.0, SL, VT);
6096
19
        SDValue NegRHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
6097
19
6098
19
        return DAG.getNode(FusedOp, SL, VT, A, Two, NegRHS);
6099
19
      }
6100
513
    }
6101
45
  }
6102
513
6103
513
  if (RHS.getOpcode() == ISD::FADD) {
6104
50
    // (fsub c, (fadd a, a)) -> mad -2.0, a, c
6105
50
6106
50
    SDValue A = RHS.getOperand(0);
6107
50
    if (A == RHS.getOperand(1)) {
6108
44
      unsigned FusedOp = getFusedOpcode(DAG, N, RHS.getNode());
6109
44
      if (FusedOp != 0) {
6110
35
        const SDValue NegTwo = DAG.getConstantFP(-2.0, SL, VT);
6111
35
        return DAG.getNode(FusedOp, SL, VT, A, NegTwo, LHS);
6112
35
      }
6113
478
    }
6114
50
  }
6115
478
6116
478
  return SDValue();
6117
478
}
6118
6119
SDValue SITargetLowering::performSetCCCombine(SDNode *N,
6120
10.6k
                                              DAGCombinerInfo &DCI) const {
6121
10.6k
  SelectionDAG &DAG = DCI.DAG;
6122
10.6k
  SDLoc SL(N);
6123
10.6k
6124
10.6k
  SDValue LHS = N->getOperand(0);
6125
10.6k
  SDValue RHS = N->getOperand(1);
6126
10.6k
  EVT VT = LHS.getValueType();
6127
10.6k
  ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
6128
10.6k
6129
10.6k
  auto CRHS = dyn_cast<ConstantSDNode>(RHS);
6130
10.6k
  if (!CRHS) {
6131
5.95k
    CRHS = dyn_cast<ConstantSDNode>(LHS);
6132
5.95k
    if (CRHS) {
6133
0
      std::swap(LHS, RHS);
6134
0
      CC = getSetCCSwappedOperands(CC);
6135
0
    }
6136
5.95k
  }
6137
10.6k
6138
10.6k
  if (CRHS && VT == MVT::i32 && LHS.getOpcode() == ISD::SIGN_EXTEND &&
6139
10.6k
      isBoolSGPR(LHS.getOperand(0))) {
6140
3
    // setcc (sext from i1 cc), -1, ne|sgt|ult) => not cc => xor cc, -1
6141
3
    // setcc (sext from i1 cc), -1, eq|sle|uge) => cc
6142
3
    // setcc (sext from i1 cc),  0, eq|sge|ule) => not cc => xor cc, -1
6143
3
    // setcc (sext from i1 cc),  0, ne|ugt|slt) => cc
6144
3
    if ((CRHS->isAllOnesValue() &&
6145
3
         (CC == ISD::SETNE || CC == ISD::SETGT || CC == ISD::SETULT)) ||
6146
0
        (CRHS->isNullValue() &&
6147
0
         (CC == ISD::SETEQ || CC == ISD::SETGE || CC == ISD::SETULE)))
6148
3
      return DAG.getNode(ISD::XOR, SL, MVT::i1, LHS.getOperand(0),
6149
3
                         DAG.getConstant(-1, SL, MVT::i1));
6150
0
    if ((CRHS->isAllOnesValue() &&
6151
0
         (CC == ISD::SETEQ || CC == ISD::SETLE || CC == ISD::SETUGE)) ||
6152
0
        (CRHS->isNullValue() &&
6153
0
         (CC == ISD::SETNE || CC == ISD::SETUGT || CC == ISD::SETLT)))
6154
0
      return LHS.getOperand(0);
6155
10.6k
  }
6156
10.6k
6157
10.6k
  if (VT != MVT::f32 && VT != MVT::f64 && (Subtarget->has16BitInsts() &&
6158
3.95k
                                           VT != MVT::f16))
6159
3.72k
    return SDValue();
6160
6.91k
6161
6.91k
  // Match isinf pattern
6162
6.91k
  // (fcmp oeq (fabs x), inf) -> (fp_class x, (p_infinity | n_infinity))
6163
6.91k
  if (CC == ISD::SETOEQ && LHS.getOpcode() == ISD::FABS) {
6164
6
    const ConstantFPSDNode *CRHS = dyn_cast<ConstantFPSDNode>(RHS);
6165
6
    if (!CRHS)
6166
0
      return SDValue();
6167
6
6168
6
    const APFloat &APF = CRHS->getValueAPF();
6169
6
    if (APF.isInfinity() && !APF.isNegative()) {
6170
2
      unsigned Mask = SIInstrFlags::P_INFINITY | SIInstrFlags::N_INFINITY;
6171
2
      return DAG.getNode(AMDGPUISD::FP_CLASS, SL, MVT::i1, LHS.getOperand(0),
6172
2
                         DAG.getConstant(Mask, SL, MVT::i32));
6173
2
    }
6174
6.91k
  }
6175
6.91k
6176
6.91k
  return SDValue();
6177
6.91k
}
6178
6179
SDValue SITargetLowering::performCvtF32UByteNCombine(SDNode *N,
6180
360
                                                     DAGCombinerInfo &DCI) const {
6181
360
  SelectionDAG &DAG = DCI.DAG;
6182
360
  SDLoc SL(N);
6183
360
  unsigned Offset = N->getOpcode() - AMDGPUISD::CVT_F32_UBYTE0;
6184
360
6185
360
  SDValue Src = N->getOperand(0);
6186
360
  SDValue Srl = N->getOperand(0);
6187
360
  if (Srl.getOpcode() == ISD::ZERO_EXTEND)
6188
52
    Srl = Srl.getOperand(0);
6189
360
6190
360
  // TODO: Handle (or x, (srl y, 8)) pattern when known bits are zero.
6191
360
  if (Srl.getOpcode() == ISD::SRL) {
6192
59
    // cvt_f32_ubyte0 (srl x, 16) -> cvt_f32_ubyte2 x
6193
59
    // cvt_f32_ubyte1 (srl x, 16) -> cvt_f32_ubyte3 x
6194
59
    // cvt_f32_ubyte0 (srl x, 8) -> cvt_f32_ubyte1 x
6195
59
6196
59
    if (const ConstantSDNode *C =
6197
59
        dyn_cast<ConstantSDNode>(Srl.getOperand(1))) {
6198
59
      Srl = DAG.getZExtOrTrunc(Srl.getOperand(0), SDLoc(Srl.getOperand(0)),
6199
59
                               EVT(MVT::i32));
6200
59
6201
59
      unsigned SrcOffset = C->getZExtValue() + 8 * Offset;
6202
59
      if (SrcOffset < 32 && SrcOffset % 8 == 0) {
6203
59
        return DAG.getNode(AMDGPUISD::CVT_F32_UBYTE0 + SrcOffset / 8, SL,
6204
59
                           MVT::f32, Srl);
6205
59
      }
6206
301
    }
6207
59
  }
6208
301
6209
301
  APInt Demanded = APInt::getBitsSet(32, 8 * Offset, 8 * Offset + 8);
6210
301
6211
301
  KnownBits Known;
6212
301
  TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
6213
301
                                        !DCI.isBeforeLegalizeOps());
6214
301
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
6215
301
  if (TLI.ShrinkDemandedConstant(Src, Demanded, TLO) ||
6216
301
      TLI.SimplifyDemandedBits(Src, Demanded, Known, TLO)) {
6217
95
    DCI.CommitTargetLoweringOpt(TLO);
6218
95
  }
6219
360
6220
360
  return SDValue();
6221
360
}
6222
6223
SDValue SITargetLowering::PerformDAGCombine(SDNode *N,
6224
1.10M
                                            DAGCombinerInfo &DCI) const {
6225
1.10M
  switch (N->getOpcode()) {
6226
241k
  default:
6227
241k
    return AMDGPUTargetLowering::PerformDAGCombine(N, DCI);
6228
118k
  case ISD::ADD:
6229
118k
    return performAddCombine(N, DCI);
6230
4.56k
  case ISD::SUB:
6231
4.56k
    return performSubCombine(N, DCI);
6232
30
  case ISD::ADDCARRY:
6233
30
  case ISD::SUBCARRY:
6234
30
    return performAddCarrySubCarryCombine(N, DCI);
6235
5.56k
  case ISD::FADD:
6236
5.56k
    return performFAddCombine(N, DCI);
6237
1.71k
  case ISD::FSUB:
6238
1.71k
    return performFSubCombine(N, DCI);
6239
10.6k
  case ISD::SETCC:
6240
10.6k
    return performSetCCCombine(N, DCI);
6241
8.29k
  case ISD::FMAXNUM:
6242
8.29k
  case ISD::FMINNUM:
6243
8.29k
  case ISD::SMAX:
6244
8.29k
  case ISD::SMIN:
6245
8.29k
  case ISD::UMAX:
6246
8.29k
  case ISD::UMIN:
6247
8.29k
  case AMDGPUISD::FMIN_LEGACY:
6248
8.29k
  case AMDGPUISD::FMAX_LEGACY: {
6249
8.29k
    if (DCI.getDAGCombineLevel() >= AfterLegalizeDAG &&
6250
2.90k
        getTargetMachine().getOptLevel() > CodeGenOpt::None)
6251
2.90k
      return performMinMaxCombine(N, DCI);
6252
5.38k
    break;
6253
5.38k
  }
6254
466k
  case ISD::LOAD:
6255
466k
  case ISD::STORE:
6256
466k
  case ISD::ATOMIC_LOAD:
6257
466k
  case ISD::ATOMIC_STORE:
6258
466k
  case ISD::ATOMIC_CMP_SWAP:
6259
466k
  case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS:
6260
466k
  case ISD::ATOMIC_SWAP:
6261
466k
  case ISD::ATOMIC_LOAD_ADD:
6262
466k
  case ISD::ATOMIC_LOAD_SUB:
6263
466k
  case ISD::ATOMIC_LOAD_AND:
6264
466k
  case ISD::ATOMIC_LOAD_OR:
6265
466k
  case ISD::ATOMIC_LOAD_XOR:
6266
466k
  case ISD::ATOMIC_LOAD_NAND:
6267
466k
  case ISD::ATOMIC_LOAD_MIN:
6268
466k
  case ISD::ATOMIC_LOAD_MAX:
6269
466k
  case ISD::ATOMIC_LOAD_UMIN:
6270
466k
  case ISD::ATOMIC_LOAD_UMAX:
6271
466k
  case AMDGPUISD::ATOMIC_INC:
6272
466k
  case AMDGPUISD::ATOMIC_DEC: // TODO: Target mem intrinsics.
6273
466k
    if (DCI.isBeforeLegalize())
6274
144k
      break;
6275
321k
    return performMemSDNodeCombine(cast<MemSDNode>(N), DCI);
6276
28.0k
  case ISD::AND:
6277
28.0k
    return performAndCombine(N, DCI);
6278
16.8k
  case ISD::OR:
6279
16.8k
    return performOrCombine(N, DCI);
6280
1.24k
  case ISD::XOR:
6281
1.24k
    return performXorCombine(N, DCI);
6282
12.7k
  case ISD::ZERO_EXTEND:
6283
12.7k
    return performZeroExtendCombine(N, DCI);
6284
74
  case AMDGPUISD::FP_CLASS:
6285
74
    return performClassCombine(N, DCI);
6286
401
  case ISD::FCANONICALIZE:
6287
401
    return performFCanonicalizeCombine(N, DCI);
6288
549
  case AMDGPUISD::FRACT:
6289
549
  case AMDGPUISD::RCP:
6290
549
  case AMDGPUISD::RSQ:
6291
549
  case AMDGPUISD::RCP_LEGACY:
6292
549
  case AMDGPUISD::RSQ_LEGACY:
6293
549
  case AMDGPUISD::RSQ_CLAMP:
6294
549
  case AMDGPUISD::LDEXP: {
6295
549
    SDValue Src = N->getOperand(0);
6296
549
    if (Src.isUndef())
6297
11
      return Src;
6298
538
    break;
6299
538
  }
6300
1.00k
  case ISD::SINT_TO_FP:
6301
1.00k
  case ISD::UINT_TO_FP:
6302
1.00k
    return performUCharToFloatCombine(N, DCI);
6303
360
  case AMDGPUISD::CVT_F32_UBYTE0:
6304
360
  case AMDGPUISD::CVT_F32_UBYTE1:
6305
360
  case AMDGPUISD::CVT_F32_UBYTE2:
6306
360
  case AMDGPUISD::CVT_F32_UBYTE3:
6307
360
    return performCvtF32UByteNCombine(N, DCI);
6308
107
  case AMDGPUISD::FMED3:
6309
107
    return performFMed3Combine(N, DCI);
6310
133
  case AMDGPUISD::CVT_PKRTZ_F16_F32:
6311
133
    return performCvtPkRTZCombine(N, DCI);
6312
89
  case ISD::SCALAR_TO_VECTOR: {
6313
89
    SelectionDAG &DAG = DCI.DAG;
6314
89
    EVT VT = N->getValueType(0);
6315
89
6316
89
    // v2i16 (scalar_to_vector i16:x) -> v2i16 (bitcast (any_extend i16:x))
6317
89
    if (VT == MVT::v2i16 || VT == MVT::v2f16) {
6318
28
      SDLoc SL(N);
6319
28
      SDValue Src = N->getOperand(0);
6320
28
      EVT EltVT = Src.getValueType();
6321
28
      if (EltVT == MVT::f16)
6322
13
        Src = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Src);
6323
28
6324
28
      SDValue Ext = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, Src);
6325
28
      return DAG.getNode(ISD::BITCAST, SL, VT, Ext);
6326
28
    }
6327
61
6328
61
    break;
6329
61
  }
6330
104k
  case ISD::EXTRACT_VECTOR_ELT:
6331
104k
    return performExtractVectorEltCombine(N, DCI);
6332
85.7k
  case ISD::BUILD_VECTOR:
6333
85.7k
    return performBuildVectorCombine(N, DCI);
6334
150k
  }
6335
150k
  return AMDGPUTargetLowering::PerformDAGCombine(N, DCI);
6336
150k
}
6337
6338
/// \brief Helper function for adjustWritemask
6339
342
static unsigned SubIdx2Lane(unsigned Idx) {
6340
342
  switch (Idx) {
6341
0
  default: return 0;
6342
111
  case AMDGPU::sub0: return 0;
6343
86
  case AMDGPU::sub1: return 1;
6344
78
  case AMDGPU::sub2: return 2;
6345
67
  case AMDGPU::sub3: return 3;
6346
0
  }
6347
0
}
6348
6349
/// \brief Adjust the writemask of MIMG instructions
6350
void SITargetLowering::adjustWritemask(MachineSDNode *&Node,
6351
258
                                       SelectionDAG &DAG) const {
6352
258
  SDNode *Users[4] = { };
6353
258
  unsigned Lane = 0;
6354
258
  unsigned DmaskIdx = (Node->getNumOperands() - Node->getNumValues() == 9) ? 2 : 3;
6355
258
  unsigned OldDmask = Node->getConstantOperandVal(DmaskIdx);
6356
258
  unsigned NewDmask = 0;
6357
258
6358
258
  // Try to figure out the used register components
6359
258
  for (SDNode::use_iterator I = Node->use_begin(), E = Node->use_end();
6360
680
       I != E; ++I) {
6361
558
6362
558
    // Don't look at users of the chain.
6363
558
    if (I.getUse().getResNo() != 0)
6364
82
      continue;
6365
476
6366
476
    // Abort if we can't understand the usage
6367
476
    if (!I->isMachineOpcode() ||
6368
460
        I->getMachineOpcode() != TargetOpcode::EXTRACT_SUBREG)
6369
134
      return;
6370
342
6371
342
    // Lane means which subreg of %VGPRa_VGPRb_VGPRc_VGPRd is used.
6372
342
    // Note that subregs are packed, i.e. Lane==0 is the first bit set
6373
342
    // in OldDmask, so it can be any of X,Y,Z,W; Lane==1 is the second bit
6374
342
    // set, etc.
6375
342
    Lane = SubIdx2Lane(I->getConstantOperandVal(1));
6376
342
6377
342
    // Set which texture component corresponds to the lane.
6378
342
    unsigned Comp;
6379
1.12k
    for (unsigned i = 0, Dmask = OldDmask; i <= Lane; i++) {
6380
785
      assert(Dmask);
6381
785
      Comp = countTrailingZeros(Dmask);
6382
785
      Dmask &= ~(1 << Comp);
6383
785
    }
6384
342
6385
342
    // Abort if we have more than one user per component
6386
342
    if (Users[Lane])
6387
2
      return;
6388
340
6389
340
    Users[Lane] = *I;
6390
340
    NewDmask |= 1 << Comp;
6391
340
  }
6392
258
6393
258
  // Abort if there's no change
6394
122
  if (NewDmask == OldDmask)
6395
50
    return;
6396
72
6397
72
  // Adjust the writemask in the node
6398
72
  std::vector<SDValue> Ops;
6399
72
  Ops.insert(Ops.end(), Node->op_begin(), Node->op_begin() + DmaskIdx);
6400
72
  Ops.push_back(DAG.getTargetConstant(NewDmask, SDLoc(Node), MVT::i32));
6401
72
  Ops.insert(Ops.end(), Node->op_begin() + DmaskIdx + 1, Node->op_end());
6402
72
  Node = (MachineSDNode*)DAG.UpdateNodeOperands(Node, Ops);
6403
72
6404
72
  // If we only got one lane, replace it with a copy
6405
72
  // (if NewDmask has only one bit set...)
6406
72
  if (NewDmask && (NewDmask & (NewDmask-1)) == 0) {
6407
38
    SDValue RC = DAG.getTargetConstant(AMDGPU::VGPR_32RegClassID, SDLoc(),
6408
38
                                       MVT::i32);
6409
38
    SDNode *Copy = DAG.getMachineNode(TargetOpcode::COPY_TO_REGCLASS,
6410
38
                                      SDLoc(), Users[Lane]->getValueType(0),
6411
38
                                      SDValue(Node, 0), RC);
6412
38
    DAG.ReplaceAllUsesWith(Users[Lane], Copy);
6413
38
    return;
6414
38
  }
6415
34
6416
34
  // Update the users of the node with the new indices
6417
170
  for (unsigned i = 0, Idx = AMDGPU::sub0; i < 4; ++i) {
6418
136
    SDNode *User = Users[i];
6419
136
    if (!User)
6420
48
      continue;
6421
88
6422
88
    SDValue Op = DAG.getTargetConstant(Idx, SDLoc(User), MVT::i32);
6423
88
    DAG.UpdateNodeOperands(User, User->getOperand(0), Op);
6424
88
6425
88
    switch (Idx) {
6426
0
    default: break;
6427
34
    case AMDGPU::sub0: Idx = AMDGPU::sub1; break;
6428
34
    case AMDGPU::sub1: Idx = AMDGPU::sub2; break;
6429
20
    case AMDGPU::sub2: Idx = AMDGPU::sub3; break;
6430
136
    }
6431
136
  }
6432
258
}
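
A hedged standalone sketch of the dmask bookkeeping adjustWritemask performs: lane i of the MIMG result corresponds to the i-th set bit of the old dmask, unused lanes are dropped from the new dmask, and a single-bit result can degrade to a plain copy. The names compactDmask and UsedLane are illustrative only, and __builtin_ctz (a GCC/Clang builtin) stands in for countTrailingZeros.

#include <cassert>

// Given the old dmask and which result lanes are actually used, compute the
// new dmask. Lane i corresponds to the i-th set bit of OldDmask (packed).
static unsigned compactDmask(unsigned OldDmask, const bool UsedLane[4]) {
  unsigned NewDmask = 0;
  unsigned Dmask = OldDmask;
  for (unsigned Lane = 0; Lane < 4 && Dmask; ++Lane) {
    unsigned Comp = __builtin_ctz(Dmask); // component backing this lane
    Dmask &= Dmask - 1;                   // clear the lowest set bit
    if (UsedLane[Lane])
      NewDmask |= 1u << Comp;
  }
  return NewDmask;
}

int main() {
  // Old dmask 0b1011 selects components X, Y, W; only lanes 0 and 2 are read,
  // so only X and W survive in the new dmask.
  const bool Used[4] = {true, false, true, false};
  unsigned NewDmask = compactDmask(0b1011, Used);
  assert(NewDmask == 0b1001);
  // A one-bit dmask means the result is a single component and can become a copy.
  bool SingleComponent = NewDmask && (NewDmask & (NewDmask - 1)) == 0;
  assert(!SingleComponent);
  return 0;
}

The (NewDmask & (NewDmask - 1)) == 0 test in the sketch is the same power-of-two check the function uses to decide whether a COPY_TO_REGCLASS suffices.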
6433
6434
331k
static bool isFrameIndexOp(SDValue Op) {
6435
331k
  if (Op.getOpcode() == ISD::AssertZext)
6436
46
    Op = Op.getOperand(0);
6437
331k
6438
331k
  return isa<FrameIndexSDNode>(Op);
6439
331k
}
6440
6441
/// \brief Legalize target independent instructions (e.g. INSERT_SUBREG)
6442
/// with frame index operands.
6443
/// LLVM assumes that inputs to these instructions are registers.
6444
SDNode *SITargetLowering::legalizeTargetIndependentNode(SDNode *Node,
6445
54.7k
                                                        SelectionDAG &DAG) const {
6446
54.7k
  if (Node->getOpcode() == ISD::CopyToReg) {
6447
9.79k
    RegisterSDNode *DestReg = cast<RegisterSDNode>(Node->getOperand(1));
6448
9.79k
    SDValue SrcVal = Node->getOperand(2);
6449
9.79k
6450
9.79k
    // Insert a copy to a VReg_1 virtual register so LowerI1Copies doesn't have
6451
9.79k
    // to try understanding copies to physical registers.
6452
9.79k
    if (SrcVal.getValueType() == MVT::i1 &&
6453
9.79k
        TargetRegisterInfo::isPhysicalRegister(DestReg->getReg())) {
6454
8
      SDLoc SL(Node);
6455
8
      MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo();
6456
8
      SDValue VReg = DAG.getRegister(
6457
8
        MRI.createVirtualRegister(&AMDGPU::VReg_1RegClass), MVT::i1);
6458
8
6459
8
      SDNode *Glued = Node->getGluedNode();
6460
8
      SDValue ToVReg
6461
8
        = DAG.getCopyToReg(Node->getOperand(0), SL, VReg, SrcVal,
6462
8
                         SDValue(Glued, Glued ? Glued->getNumValues() - 1 : 0));
6463
8
      SDValue ToResultReg
6464
8
        = DAG.getCopyToReg(ToVReg, SL, SDValue(DestReg, 0),
6465
8
                           VReg, ToVReg.getValue(1));
6466
8
      DAG.ReplaceAllUsesWith(Node, ToResultReg.getNode());
6467
8
      DAG.RemoveDeadNode(Node);
6468
8
      return ToResultReg.getNode();
6469
8
    }
6470
54.7k
  }
6471
54.7k
6472
54.7k
  SmallVector<SDValue, 8> Ops;
6473
386k
  for (unsigned i = 0; i < Node->getNumOperands(); ++i) {
6474
331k
    if (!isFrameIndexOp(Node->getOperand(i))) {
6475
331k
      Ops.push_back(Node->getOperand(i));
6476
331k
      continue;
6477
331k
    }
6478
23
6479
23
    SDLoc DL(Node);
6480
23
    Ops.push_back(SDValue(DAG.getMachineNode(AMDGPU::S_MOV_B32, DL,
6481
23
                                     Node->getOperand(i).getValueType(),
6482
23
                                     Node->getOperand(i)), 0));
6483
23
  }
6484
54.7k
6485
54.7k
  DAG.UpdateNodeOperands(Node, Ops);
6486
54.7k
  return Node;
6487
54.7k
}
6488
6489
/// \brief Fold the instructions after selecting them.
6490
SDNode *SITargetLowering::PostISelFolding(MachineSDNode *Node,
6491
357k
                                          SelectionDAG &DAG) const {
6492
357k
  const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
6493
357k
  unsigned Opcode = Node->getMachineOpcode();
6494
357k
6495
357k
  if (TII->isMIMG(Opcode) && !TII->get(Opcode).mayStore() &&
6496
334
      !TII->isGather4(Opcode))
6497
258
    adjustWritemask(Node, DAG);
6498
357k
6499
357k
  if (Opcode == AMDGPU::INSERT_SUBREG ||
6500
357k
      Opcode == AMDGPU::REG_SEQUENCE) {
6501
44.9k
    legalizeTargetIndependentNode(Node, DAG);
6502
44.9k
    return Node;
6503
44.9k
  }
6504
312k
6505
312k
  switch (Opcode) {
6506
255
  case AMDGPU::V_DIV_SCALE_F32:
6507
255
  case AMDGPU::V_DIV_SCALE_F64: {
6508
255
    // Satisfy the operand register constraint when one of the inputs is
6509
255
    // undefined. Ordinarily each undef value will have its own implicit_def of
6510
255
    // a vreg, so force these to use a single register.
6511
255
    SDValue Src0 = Node->getOperand(0);
6512
255
    SDValue Src1 = Node->getOperand(1);
6513
255
    SDValue Src2 = Node->getOperand(2);
6514
255
6515
255
    if ((Src0.isMachineOpcode() &&
6516
252
         Src0.getMachineOpcode() != AMDGPU::IMPLICIT_DEF) &&
6517
249
        (Src0 == Src1 || Src0 == Src2))
6518
249
      break;
6519
6
6520
6
    MVT VT = Src0.getValueType().getSimpleVT();
6521
6
    const TargetRegisterClass *RC = getRegClassFor(VT);
6522
6
6523
6
    MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo();
6524
6
    SDValue UndefReg = DAG.getRegister(MRI.createVirtualRegister(RC), VT);
6525
6
6526
6
    SDValue ImpDef = DAG.getCopyToReg(DAG.getEntryNode(), SDLoc(Node),
6527
6
                                      UndefReg, Src0, SDValue());
6528
6
6529
6
    // src0 must be the same register as src1 or src2, even if the value is
6530
6
    // undefined, so make sure we don't violate this constraint.
6531
6
    if (Src0.isMachineOpcode() &&
6532
6
        Src0.getMachineOpcode() == AMDGPU::IMPLICIT_DEF) {
6533
3
      if (Src1.isMachineOpcode() &&
6534
3
          Src1.getMachineOpcode() != AMDGPU::IMPLICIT_DEF)
6535
0
        Src0 = Src1;
6536
3
      else if (Src2.isMachineOpcode() &&
6537
3
               Src2.getMachineOpcode() != AMDGPU::IMPLICIT_DEF)
6538
2
        Src0 = Src2;
6539
1
      else {
6540
1
        assert(Src1.getMachineOpcode() == AMDGPU::IMPLICIT_DEF);
6541
1
        Src0 = UndefReg;
6542
1
        Src1 = UndefReg;
6543
1
      }
6544
3
    } else
6545
3
      break;
6546
3
6547
3
    SmallVector<SDValue, 4> Ops = { Src0, Src1, Src2 };
6548
3
    for (unsigned I = 3, N = Node->getNumOperands(); I != N; ++I)
6549
0
      Ops.push_back(Node->getOperand(I));
6550
3
6551
3
    Ops.push_back(ImpDef.getValue(1));
6552
3
    return DAG.getMachineNode(Opcode, SDLoc(Node), Node->getVTList(), Ops);
6553
3
  }
6554
312k
  default:
6555
312k
    break;
6556
312k
  }
6557
312k
6558
312k
  return Node;
6559
312k
}
6560
6561
/// \brief Assign the register class depending on the number of
6562
/// bits set in the writemask
6563
void SITargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI,
6564
30.2k
                                                     SDNode *Node) const {
6565
30.2k
  const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
6566
30.2k
6567
30.2k
  MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
6568
30.2k
6569
30.2k
  if (TII->isVOP3(MI.getOpcode())) {
6570
28.3k
    // Make sure constant bus requirements are respected.
6571
28.3k
    TII->legalizeOperandsVOP3(MRI, MI);
6572
28.3k
    return;
6573
28.3k
  }
6574
1.92k
6575
1.92k
  if (TII->isMIMG(MI)) {
6576
258
    unsigned VReg = MI.getOperand(0).getReg();
6577
258
    const TargetRegisterClass *RC = MRI.getRegClass(VReg);
6578
258
    // TODO: Need mapping tables to handle other cases (register classes).
6579
258
    if (RC != &AMDGPU::VReg_128RegClass)
6580
10
      return;
6581
248
6582
248
    unsigned DmaskIdx = MI.getNumOperands() == 12 ? 3 : 4;
6583
248
    unsigned Writemask = MI.getOperand(DmaskIdx).getImm();
6584
248
    unsigned BitsSet = 0;
6585
1.24k
    for (unsigned i = 0; i < 4; ++i)
6586
992
      BitsSet += Writemask & (1 << i) ? 1 : 0;
6587
248
    switch (BitsSet) {
6588
176
    default: return;
6589
38
    case 1:  RC = &AMDGPU::VGPR_32RegClass; break;
6590
14
    case 2:  RC = &AMDGPU::VReg_64RegClass; break;
6591
20
    case 3:  RC = &AMDGPU::VReg_96RegClass; break;
6592
72
    }
6593
72
6594
72
    unsigned NewOpcode = TII->getMaskedMIMGOp(MI.getOpcode(), BitsSet);
6595
72
    MI.setDesc(TII->get(NewOpcode));
6596
72
    MRI.setRegClass(VReg, RC);
6597
72
    return;
6598
72
  }
6599
1.66k
6600
1.66k
  // Replace unused atomics with the no return version.
6601
1.66k
  int NoRetAtomicOp = AMDGPU::getAtomicNoRetOp(MI.getOpcode());
6602
1.66k
  if (NoRetAtomicOp != -1) {
6603
1.64k
    if (!Node->hasAnyUseOfValue(0)) {
6604
856
      MI.setDesc(TII->get(NoRetAtomicOp));
6605
856
      MI.RemoveOperand(0);
6606
856
      return;
6607
856
    }
6608
784
6609
784
    // For mubuf_atomic_cmpswap, we need to have tablegen use an extract_subreg
6610
784
    // instruction, because the return type of these instructions is a vec2 of
6611
784
    // the memory type, so it can be tied to the input operand.
6612
784
    // This means these instructions always have a use, so we need to add a
6613
784
    // special case to check if the atomic has only one extract_subreg use,
6614
784
    // which itself has no uses.
6615
784
    if ((Node->hasNUsesOfValue(1, 0) &&
6616
782
         Node->use_begin()->isMachineOpcode() &&
6617
758
         Node->use_begin()->getMachineOpcode() == AMDGPU::EXTRACT_SUBREG &&
6618
784
         !Node->use_begin()->hasAnyUseOfValue(0))) {
6619
0
      unsigned Def = MI.getOperand(0).getReg();
6620
0
6621
0
      // Change this into a noret atomic.
6622
0
      MI.setDesc(TII->get(NoRetAtomicOp));
6623
0
      MI.RemoveOperand(0);
6624
0
6625
0
      // If we only remove the def operand from the atomic instruction, the
6626
0
      // extract_subreg will be left with a use of a vreg without a def.
6627
0
      // So we need to insert an implicit_def to avoid machine verifier
6628
0
      // errors.
6629
0
      BuildMI(*MI.getParent(), MI, MI.getDebugLoc(),
6630
0
              TII->get(AMDGPU::IMPLICIT_DEF), Def);
6631
0
    }
6632
1.64k
    return;
6633
1.64k
  }
6634
30.2k
}
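
The MIMG branch of AdjustInstrPostInstrSelection boils down to a population count on the dmask followed by a table lookup of the destination register class. Below is a standalone sketch of that mapping with a hypothetical dmaskToRegWidth helper; the widths stand in for VGPR_32/VReg_64/VReg_96/VReg_128, and the default case models the early return that keeps the original 128-bit class.

#include <cassert>

// Map the number of set bits in a 4-bit write mask to the destination
// register width in bits, mirroring the VGPR_32/VReg_64/VReg_96 cases above.
static unsigned dmaskToRegWidth(unsigned Writemask) {
  unsigned BitsSet = 0;
  for (unsigned i = 0; i < 4; ++i)
    BitsSet += (Writemask >> i) & 1;
  switch (BitsSet) {
  case 1: return 32;
  case 2: return 64;
  case 3: return 96;
  default: return 128; // 0 or 4 components: keep the full 128-bit class.
  }
}

int main() {
  assert(dmaskToRegWidth(0b0001) == 32);
  assert(dmaskToRegWidth(0b0101) == 64);
  assert(dmaskToRegWidth(0b0111) == 96);
  assert(dmaskToRegWidth(0b1111) == 128);
  return 0;
}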
6635
6636
static SDValue buildSMovImm32(SelectionDAG &DAG, const SDLoc &DL,
6637
44.0k
                              uint64_t Val) {
6638
44.0k
  SDValue K = DAG.getTargetConstant(Val, DL, MVT::i32);
6639
44.0k
  return SDValue(DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, K), 0);
6640
44.0k
}
6641
6642
MachineSDNode *SITargetLowering::wrapAddr64Rsrc(SelectionDAG &DAG,
6643
                                                const SDLoc &DL,
6644
3.96k
                                                SDValue Ptr) const {
6645
3.96k
  const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
6646
3.96k
6647
3.96k
  // Build the half of the subregister with the constants before building the
6648
3.96k
  // full 128-bit register. If we are building multiple resource descriptors,
6649
3.96k
  // this will allow CSEing of the 2-component register.
6650
3.96k
  const SDValue Ops0[] = {
6651
3.96k
    DAG.getTargetConstant(AMDGPU::SGPR_64RegClassID, DL, MVT::i32),
6652
3.96k
    buildSMovImm32(DAG, DL, 0),
6653
3.96k
    DAG.getTargetConstant(AMDGPU::sub0, DL, MVT::i32),
6654
3.96k
    buildSMovImm32(DAG, DL, TII->getDefaultRsrcDataFormat() >> 32),
6655
3.96k
    DAG.getTargetConstant(AMDGPU::sub1, DL, MVT::i32)
6656
3.96k
  };
6657
3.96k
6658
3.96k
  SDValue SubRegHi = SDValue(DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL,
6659
3.96k
                                                MVT::v2i32, Ops0), 0);
6660
3.96k
6661
3.96k
  // Combine the constants and the pointer.
6662
3.96k
  const SDValue Ops1[] = {
6663
3.96k
    DAG.getTargetConstant(AMDGPU::SReg_128RegClassID, DL, MVT::i32),
6664
3.96k
    Ptr,
6665
3.96k
    DAG.getTargetConstant(AMDGPU::sub0_sub1, DL, MVT::i32),
6666
3.96k
    SubRegHi,
6667
3.96k
    DAG.getTargetConstant(AMDGPU::sub2_sub3, DL, MVT::i32)
6668
3.96k
  };
6669
3.96k
6670
3.96k
  return DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, MVT::v4i32, Ops1);
6671
3.96k
}
6672
6673
/// \brief Return a resource descriptor with the 'Add TID' bit enabled
6674
///        The TID (Thread ID) is multiplied by the stride value (bits [61:48]
6675
///        of the resource descriptor) to create an offset, which is added to
6676
///        the resource pointer.
6677
MachineSDNode *SITargetLowering::buildRSRC(SelectionDAG &DAG, const SDLoc &DL,
6678
                                           SDValue Ptr, uint32_t RsrcDword1,
6679
18.0k
                                           uint64_t RsrcDword2And3) const {
6680
18.0k
  SDValue PtrLo = DAG.getTargetExtractSubreg(AMDGPU::sub0, DL, MVT::i32, Ptr);
6681
18.0k
  SDValue PtrHi = DAG.getTargetExtractSubreg(AMDGPU::sub1, DL, MVT::i32, Ptr);
6682
18.0k
  if (RsrcDword1) {
6683
0
    PtrHi = SDValue(DAG.getMachineNode(AMDGPU::S_OR_B32, DL, MVT::i32, PtrHi,
6684
0
                                     DAG.getConstant(RsrcDword1, DL, MVT::i32)),
6685
0
                    0);
6686
0
  }
6687
18.0k
6688
18.0k
  SDValue DataLo = buildSMovImm32(DAG, DL,
6689
18.0k
                                  RsrcDword2And3 & UINT64_C(0xFFFFFFFF));
6690
18.0k
  SDValue DataHi = buildSMovImm32(DAG, DL, RsrcDword2And3 >> 32);
6691
18.0k
6692
18.0k
  const SDValue Ops[] = {
6693
18.0k
    DAG.getTargetConstant(AMDGPU::SReg_128RegClassID, DL, MVT::i32),
6694
18.0k
    PtrLo,
6695
18.0k
    DAG.getTargetConstant(AMDGPU::sub0, DL, MVT::i32),
6696
18.0k
    PtrHi,
6697
18.0k
    DAG.getTargetConstant(AMDGPU::sub1, DL, MVT::i32),
6698
18.0k
    DataLo,
6699
18.0k
    DAG.getTargetConstant(AMDGPU::sub2, DL, MVT::i32),
6700
18.0k
    DataHi,
6701
18.0k
    DAG.getTargetConstant(AMDGPU::sub3, DL, MVT::i32)
6702
18.0k
  };
6703
18.0k
6704
18.0k
  return DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, MVT::v4i32, Ops);
6705
18.0k
}
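
buildRSRC stitches a 128-bit resource descriptor together from four 32-bit dwords: the pointer's sub0/sub1 halves plus two constant data dwords, with the stride used for 'Add TID' addressing sitting in bits [61:48] of the descriptor (bits [29:16] of the second dword) per the comment above. The sketch below packs such a descriptor in plain C++; the 14-bit stride position follows that comment, while the 48-bit base-address split and every other field are simplifications made purely for illustration.

#include <cassert>
#include <cstdint>

// Pack a base pointer and a 64-bit "data" constant into four 32-bit dwords,
// and show where a 14-bit stride would sit (bits [61:48] of the 128-bit value,
// i.e. bits [29:16] of dword1). Field layout beyond the stride is simplified.
struct Rsrc128 {
  uint32_t Dword[4];
};

static Rsrc128 buildDescriptor(uint64_t Ptr, uint64_t Data, uint16_t Stride) {
  Rsrc128 R;
  R.Dword[0] = uint32_t(Ptr);                     // pointer, low 32 bits
  R.Dword[1] = uint32_t(Ptr >> 32) & 0xFFFF;      // pointer high bits (simplified)
  R.Dword[1] |= uint32_t(Stride & 0x3FFF) << 16;  // stride in bits [61:48]
  R.Dword[2] = uint32_t(Data);                    // constant data, low dword
  R.Dword[3] = uint32_t(Data >> 32);              // constant data, high dword
  return R;
}

int main() {
  Rsrc128 R = buildDescriptor(0x0000000012345678ULL, 0xAABBCCDD00020000ULL, 16);
  assert(R.Dword[0] == 0x12345678u);
  assert(((R.Dword[1] >> 16) & 0x3FFF) == 16u); // stride recovered from [61:48]
  assert(R.Dword[3] == 0xAABBCCDDu);
  return 0;
}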
6706
6707
//===----------------------------------------------------------------------===//
6708
//                         SI Inline Assembly Support
6709
//===----------------------------------------------------------------------===//
6710
6711
std::pair<unsigned, const TargetRegisterClass *>
6712
SITargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
6713
                                               StringRef Constraint,
6714
1.82k
                                               MVT VT) const {
6715
1.82k
  if (!isTypeLegal(VT))
6716
977
    return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
6717
849
6718
849
  if (Constraint.size() == 1) {
6719
464
    switch (Constraint[0]) {
6720
275
    case 's':
6721
275
    case 'r':
6722
275
      switch (VT.getSizeInBits()) {
6723
0
      default:
6724
0
        return std::make_pair(0U, nullptr);
6725
142
      case 32:
6726
142
      case 16:
6727
142
        return std::make_pair(0U, &AMDGPU::SReg_32_XM0RegClass);
6728
63
      case 64:
6729
63
        return std::make_pair(0U, &AMDGPU::SGPR_64RegClass);
6730
10
      case 128:
6731
10
        return std::make_pair(0U, &AMDGPU::SReg_128RegClass);
6732
44
      case 256:
6733
44
        return std::make_pair(0U, &AMDGPU::SReg_256RegClass);
6734
16
      case 512:
6735
16
        return std::make_pair(0U, &AMDGPU::SReg_512RegClass);
6736
0
      }
6737
0
6738
189
    case 'v':
6739
189
      switch (VT.getSizeInBits()) {
6740
0
      default:
6741
0
        return std::make_pair(0U, nullptr);
6742
126
      case 32:
6743
126
      case 16:
6744
126
        return std::make_pair(0U, &AMDGPU::VGPR_32RegClass);
6745
36
      case 64:
6746
36
        return std::make_pair(0U, &AMDGPU::VReg_64RegClass);
6747
0
      case 96:
6748
0
        return std::make_pair(0U, &AMDGPU::VReg_96RegClass);
6749
27
      case 128:
6750
27
        return std::make_pair(0U, &AMDGPU::VReg_128RegClass);
6751
0
      case 256:
6752
0
        return std::make_pair(0U, &AMDGPU::VReg_256RegClass);
6753
0
      case 512:
6754
0
        return std::make_pair(0U, &AMDGPU::VReg_512RegClass);
6755
385
      }
6756
464
    }
6757
464
  }
6758
385
6759
385
  if (Constraint.size() > 1) {
6760
385
    const TargetRegisterClass *RC = nullptr;
6761
385
    if (Constraint[1] == 'v') {
6762
106
      RC = &AMDGPU::VGPR_32RegClass;
6763
385
    } else if (Constraint[1] == 's') {
6764
246
      RC = &AMDGPU::SGPR_32RegClass;
6765
246
    }
6766
385
6767
385
    if (RC) {
6768
352
      uint32_t Idx;
6769
352
      bool Failed = Constraint.substr(2).getAsInteger(10, Idx);
6770
352
      if (!Failed && Idx < RC->getNumRegs())
6771
0
        return std::make_pair(RC->getRegister(Idx), RC);
6772
385
    }
6773
385
  }
6774
385
  return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
6775
385
}
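
For constraints longer than one character the code above dispatches on Constraint[1] ('v' or 's') and parses the trailing digits, which is how a specific-register constraint such as "{v12}" would be resolved to a register index. The sketch below shows that parse with plain std::string instead of StringRef; the exact "{v<N>}" / "{s<N>}" spelling is an assumption of the illustration.

#include <cassert>
#include <cctype>
#include <string>

// Parse an inline-asm register constraint of the assumed form "{v<N>}" or
// "{s<N>}" into a register-file letter and an index. Returns false if the
// string does not have that shape.
static bool parseRegConstraint(const std::string &Constraint,
                               char &RegFile, unsigned &Idx) {
  if (Constraint.size() < 4 || Constraint.front() != '{' ||
      Constraint.back() != '}')
    return false;
  RegFile = Constraint[1];
  if (RegFile != 'v' && RegFile != 's')
    return false;
  Idx = 0;
  for (size_t i = 2; i + 1 < Constraint.size(); ++i) {
    if (!std::isdigit(static_cast<unsigned char>(Constraint[i])))
      return false;
    Idx = Idx * 10 + unsigned(Constraint[i] - '0');
  }
  return true;
}

int main() {
  char File;
  unsigned Idx;
  assert(parseRegConstraint("{v12}", File, Idx) && File == 'v' && Idx == 12);
  assert(parseRegConstraint("{s0}", File, Idx) && File == 's' && Idx == 0);
  assert(!parseRegConstraint("{x3}", File, Idx));
  return 0;
}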
6776
6777
SITargetLowering::ConstraintType
6778
5.90k
SITargetLowering::getConstraintType(StringRef Constraint) const {
6779
5.90k
  if (Constraint.size() == 1) {
6780
2.07k
    switch (Constraint[0]) {
6781
144
    default: break;
6782
1.92k
    case 's':
6783
1.92k
    case 'v':
6784
1.92k
      return C_RegisterClass;
6785
3.98k
    }
6786
3.98k
  }
6787
3.98k
  return TargetLowering::getConstraintType(Constraint);
6788
3.98k
}
6789
6790
// Figure out which registers should be reserved for stack access. Only after
6791
// the function is legalized do we know all of the non-spill stack objects or if
6792
// calls are present.
6793
15.0k
void SITargetLowering::finalizeLowering(MachineFunction &MF) const {
6794
15.0k
  MachineRegisterInfo &MRI = MF.getRegInfo();
6795
15.0k
  SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
6796
15.0k
  const MachineFrameInfo &MFI = MF.getFrameInfo();
6797
15.0k
  const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
6798
15.0k
  const SIRegisterInfo *TRI = ST.getRegisterInfo();
6799
15.0k
6800
15.0k
  if (Info->isEntryFunction()) {
6801
14.1k
    // Callable functions have fixed registers used for stack access.
6802
14.1k
    reservePrivateMemoryRegs(getTargetMachine(), MF, *TRI, *Info);
6803
14.1k
  }
6804
15.0k
6805
15.0k
  // We have to assume the SP is needed in case there are calls in the function
6806
15.0k
  // during lowering. Calls are only detected after the function is
6807
15.0k
  // lowered. We're about to reserve registers, so don't bother using it if we
6808
15.0k
  // aren't really going to use it.
6809
15.0k
  bool NeedSP = !Info->isEntryFunction() ||
6810
14.1k
    MFI.hasVarSizedObjects() ||
6811
14.1k
    MFI.hasCalls();
6812
15.0k
6813
15.0k
  if (NeedSP) {
6814
1.19k
    unsigned ReservedStackPtrOffsetReg = TRI->reservedStackPtrOffsetReg(MF);
6815
1.19k
    Info->setStackPtrOffsetReg(ReservedStackPtrOffsetReg);
6816
1.19k
6817
1.19k
    assert(Info->getStackPtrOffsetReg() != Info->getFrameOffsetReg());
6818
1.19k
    assert(!TRI->isSubRegister(Info->getScratchRSrcReg(),
6819
1.19k
                               Info->getStackPtrOffsetReg()));
6820
1.19k
    MRI.replaceRegWith(AMDGPU::SP_REG, Info->getStackPtrOffsetReg());
6821
1.19k
  }
6822
15.0k
6823
15.0k
  MRI.replaceRegWith(AMDGPU::PRIVATE_RSRC_REG, Info->getScratchRSrcReg());
6824
15.0k
  MRI.replaceRegWith(AMDGPU::FP_REG, Info->getFrameOffsetReg());
6825
15.0k
  MRI.replaceRegWith(AMDGPU::SCRATCH_WAVE_OFFSET_REG,
6826
15.0k
                     Info->getScratchWaveOffsetReg());
6827
15.0k
6828
15.0k
  TargetLoweringBase::finalizeLowering(MF);
6829
15.0k
}