Coverage Report

Created: 2019-07-24 05:18

/Users/buildslave/jenkins/workspace/clang-stage2-coverage-R/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
Line | Count | Source
1
//===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
2
//
3
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4
// See https://llvm.org/LICENSE.txt for license information.
5
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6
//
7
//===----------------------------------------------------------------------===//
8
/// \file
9
/// This file implements the targeting of the MachineLegalizer class for
10
/// AMDGPU.
11
/// \todo This should be generated by TableGen.
12
//===----------------------------------------------------------------------===//
13
14
#include "AMDGPU.h"
15
#include "AMDGPULegalizerInfo.h"
16
#include "AMDGPUTargetMachine.h"
17
#include "SIMachineFunctionInfo.h"
18
#include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
19
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
20
#include "llvm/CodeGen/TargetOpcodes.h"
21
#include "llvm/CodeGen/ValueTypes.h"
22
#include "llvm/IR/DerivedTypes.h"
23
#include "llvm/IR/Type.h"
24
#include "llvm/Support/Debug.h"
25
26
#define DEBUG_TYPE "amdgpu-legalinfo"
27
28
using namespace llvm;
29
using namespace LegalizeActions;
30
using namespace LegalizeMutations;
31
using namespace LegalityPredicates;
32
33
34
static LegalityPredicate isMultiple32(unsigned TypeIdx,
                                      unsigned MaxSize = 512) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getScalarType();
    return Ty.getSizeInBits() <= MaxSize && EltTy.getSizeInBits() % 32 == 0;
  };
}

static LegalityPredicate isSmallOddVector(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    return Ty.isVector() &&
           Ty.getNumElements() % 2 != 0 &&
           Ty.getElementType().getSizeInBits() < 32;
  };
}

static LegalizeMutation oneMoreElement(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getElementType();
    return std::make_pair(TypeIdx, LLT::vector(Ty.getNumElements() + 1, EltTy));
  };
}

static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getElementType();
    unsigned Size = Ty.getSizeInBits();
    unsigned Pieces = (Size + 63) / 64;
    unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;
    return std::make_pair(TypeIdx, LLT::scalarOrVector(NewNumElts, EltTy));
  };
}

static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    return QueryTy.isVector() && QueryTy.getSizeInBits() > Size;
  };
}

static LegalityPredicate numElementsNotEven(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0;
  };
}

// Any combination of 32 or 64-bit elements up to 512 bits, and multiples of
// v2s16.
static LegalityPredicate isRegisterType(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    if (Ty.isVector()) {
      const int EltSize = Ty.getElementType().getSizeInBits();
      return EltSize == 32 || EltSize == 64 ||
            (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
             EltSize == 128 || EltSize == 256;
    }

    return Ty.getSizeInBits() % 32 == 0 && Ty.getSizeInBits() <= 512;
  };
}
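// A worked example of the two mutations above (illustrative only): for a
// v5s16 value, fewerEltsToSize64Vector computes Size = 80,
// Pieces = (80 + 63) / 64 = 2 and NewNumElts = (5 + 1) / 2 = 3, so the type
// index is rewritten to v3s16; isSmallOddVector then matches v3s16 and
// oneMoreElement pads it out to v4s16. A hypothetical rule composing them with
// the builder API (not one of the rules actually registered below) would be:
//
//   getActionDefinitionsBuilder(G_AND)
//     .fewerElementsIf(vectorWiderThan(0, 64), fewerEltsToSize64Vector(0))
//     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0));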
100
101
AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
102
                                         const GCNTargetMachine &TM)
103
3.64k
  :  ST(ST_) {
104
3.64k
  using namespace TargetOpcode;
105
3.64k
106
25.4k
  auto GetAddrSpacePtr = [&TM](unsigned AS) {
107
25.4k
    return LLT::pointer(AS, TM.getPointerSizeInBits(AS));
108
25.4k
  };
109
3.64k
110
3.64k
  const LLT S1 = LLT::scalar(1);
111
3.64k
  const LLT S8 = LLT::scalar(8);
112
3.64k
  const LLT S16 = LLT::scalar(16);
113
3.64k
  const LLT S32 = LLT::scalar(32);
114
3.64k
  const LLT S64 = LLT::scalar(64);
115
3.64k
  const LLT S128 = LLT::scalar(128);
116
3.64k
  const LLT S256 = LLT::scalar(256);
117
3.64k
  const LLT S512 = LLT::scalar(512);
118
3.64k
119
3.64k
  const LLT V2S16 = LLT::vector(2, 16);
120
3.64k
  const LLT V4S16 = LLT::vector(4, 16);
121
3.64k
122
3.64k
  const LLT V2S32 = LLT::vector(2, 32);
123
3.64k
  const LLT V3S32 = LLT::vector(3, 32);
124
3.64k
  const LLT V4S32 = LLT::vector(4, 32);
125
3.64k
  const LLT V5S32 = LLT::vector(5, 32);
126
3.64k
  const LLT V6S32 = LLT::vector(6, 32);
127
3.64k
  const LLT V7S32 = LLT::vector(7, 32);
128
3.64k
  const LLT V8S32 = LLT::vector(8, 32);
129
3.64k
  const LLT V9S32 = LLT::vector(9, 32);
130
3.64k
  const LLT V10S32 = LLT::vector(10, 32);
131
3.64k
  const LLT V11S32 = LLT::vector(11, 32);
132
3.64k
  const LLT V12S32 = LLT::vector(12, 32);
133
3.64k
  const LLT V13S32 = LLT::vector(13, 32);
134
3.64k
  const LLT V14S32 = LLT::vector(14, 32);
135
3.64k
  const LLT V15S32 = LLT::vector(15, 32);
136
3.64k
  const LLT V16S32 = LLT::vector(16, 32);
137
3.64k
138
3.64k
  const LLT V2S64 = LLT::vector(2, 64);
139
3.64k
  const LLT V3S64 = LLT::vector(3, 64);
140
3.64k
  const LLT V4S64 = LLT::vector(4, 64);
141
3.64k
  const LLT V5S64 = LLT::vector(5, 64);
142
3.64k
  const LLT V6S64 = LLT::vector(6, 64);
143
3.64k
  const LLT V7S64 = LLT::vector(7, 64);
144
3.64k
  const LLT V8S64 = LLT::vector(8, 64);
145
3.64k
146
3.64k
  std::initializer_list<LLT> AllS32Vectors =
147
3.64k
    {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32,
148
3.64k
     V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32};
149
3.64k
  std::initializer_list<LLT> AllS64Vectors =
150
3.64k
    {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64};
151
3.64k
152
3.64k
  const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS);
153
3.64k
  const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS);
154
3.64k
  const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT);
155
3.64k
  const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS);
156
3.64k
  const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS);
157
3.64k
  const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS);
158
3.64k
  const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS);
159
3.64k
160
3.64k
  const LLT CodePtr = FlatPtr;
161
3.64k
162
3.64k
  const std::initializer_list<LLT> AddrSpaces64 = {
163
3.64k
    GlobalPtr, ConstantPtr, FlatPtr
164
3.64k
  };
165
3.64k
166
3.64k
  const std::initializer_list<LLT> AddrSpaces32 = {
167
3.64k
    LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr
168
3.64k
  };
169
3.64k
170
3.64k
  const std::initializer_list<LLT> FPTypesBase = {
171
3.64k
    S32, S64
172
3.64k
  };
173
3.64k
174
3.64k
  const std::initializer_list<LLT> FPTypes16 = {
175
3.64k
    S32, S64, S16
176
3.64k
  };
177
3.64k
178
3.64k
  const std::initializer_list<LLT> FPTypesPK16 = {
179
3.64k
    S32, S64, S16, V2S16
180
3.64k
  };
181
3.64k
182
3.64k
  setAction({G_BRCOND, S1}, Legal);
183
3.64k
184
3.64k
  // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more
185
3.64k
  // elements for v3s16
186
3.64k
  getActionDefinitionsBuilder(G_PHI)
187
3.64k
    .legalFor({S32, S64, V2S16, V4S16, S1, S128, S256})
188
3.64k
    .legalFor(AllS32Vectors)
189
3.64k
    .legalFor(AllS64Vectors)
190
3.64k
    .legalFor(AddrSpaces64)
191
3.64k
    .legalFor(AddrSpaces32)
192
3.64k
    .clampScalar(0, S32, S256)
193
3.64k
    .widenScalarToNextPow2(0, 32)
194
3.64k
    .clampMaxNumElements(0, S32, 16)
195
3.64k
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
196
3.64k
    .legalIf(isPointer(0));
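    // Illustrative effect of the clamping rules above (sketch, not
    // exhaustive): an s16 G_PHI is not in the legal list, so
    // clampScalar(0, S32, S256) widens it to s32; an s48 G_PHI is widened to
    // the next power of two, s64; and a v3s16 G_PHI is padded to the legal
    // v4s16 by the isSmallOddVector / oneMoreElement pair.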
197
3.64k
198
3.64k
  if (ST.has16BitInsts()) {
199
2.01k
    getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
200
2.01k
      .legalFor({S32, S16})
201
2.01k
      .clampScalar(0, S16, S32)
202
2.01k
      .scalarize(0);
203
2.01k
  } else {
204
1.62k
    getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
205
1.62k
      .legalFor({S32})
206
1.62k
      .clampScalar(0, S32, S32)
207
1.62k
      .scalarize(0);
208
1.62k
  }
209
3.64k
210
3.64k
  getActionDefinitionsBuilder({G_UMULH, G_SMULH})
211
3.64k
    .legalFor({S32})
212
3.64k
    .clampScalar(0, S32, S32)
213
3.64k
    .scalarize(0);
214
3.64k
215
3.64k
  // Report legal for any types we can handle anywhere. For the cases only legal
216
3.64k
  // on the SALU, RegBankSelect will be able to re-legalize.
217
3.64k
  getActionDefinitionsBuilder({G_AND, G_OR, G_XOR})
218
3.64k
    .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16})
219
3.64k
    .clampScalar(0, S32, S64)
220
3.64k
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
221
3.64k
    .fewerElementsIf(vectorWiderThan(0, 32), fewerEltsToSize64Vector(0))
222
3.64k
    .widenScalarToNextPow2(0)
223
3.64k
    .scalarize(0);
224
3.64k
225
3.64k
  getActionDefinitionsBuilder({G_UADDO, G_SADDO, G_USUBO, G_SSUBO,
226
3.64k
                               G_UADDE, G_SADDE, G_USUBE, G_SSUBE})
227
3.64k
    .legalFor({{S32, S1}})
228
3.64k
    .clampScalar(0, S32, S32);
229
3.64k
230
3.64k
  getActionDefinitionsBuilder(G_BITCAST)
231
3.64k
    .legalForCartesianProduct({S32, V2S16})
232
3.64k
    .legalForCartesianProduct({S64, V2S32, V4S16})
233
3.64k
    .legalForCartesianProduct({V2S64, V4S32})
234
3.64k
    // Don't worry about the size constraint.
235
3.64k
    .legalIf(all(isPointer(0), isPointer(1)));
236
3.64k
237
3.64k
  if (ST.has16BitInsts()) {
238
2.01k
    getActionDefinitionsBuilder(G_FCONSTANT)
239
2.01k
      .legalFor({S32, S64, S16})
240
2.01k
      .clampScalar(0, S16, S64);
241
2.01k
  } else {
242
1.62k
    getActionDefinitionsBuilder(G_FCONSTANT)
243
1.62k
      .legalFor({S32, S64})
244
1.62k
      .clampScalar(0, S32, S64);
245
1.62k
  }
246
3.64k
247
3.64k
  getActionDefinitionsBuilder(G_IMPLICIT_DEF)
248
3.64k
    .legalFor({S1, S32, S64, V2S32, V4S32, V2S16, V4S16, GlobalPtr,
249
3.64k
               ConstantPtr, LocalPtr, FlatPtr, PrivatePtr})
250
3.64k
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
251
3.64k
    .clampScalarOrElt(0, S32, S512)
252
3.64k
    .legalIf(isMultiple32(0))
253
3.64k
    .widenScalarToNextPow2(0, 32)
254
3.64k
    .clampMaxNumElements(0, S32, 16);
255
3.64k
256
3.64k
257
3.64k
  // FIXME: i1 operands to intrinsics should always be legal, but other i1
258
3.64k
  // values may not be legal.  We need to figure out how to distinguish
259
3.64k
  // between these two scenarios.
260
3.64k
  getActionDefinitionsBuilder(G_CONSTANT)
261
3.64k
    .legalFor({S1, S32, S64, GlobalPtr,
262
3.64k
               LocalPtr, ConstantPtr, PrivatePtr, FlatPtr })
263
3.64k
    .clampScalar(0, S32, S64)
264
3.64k
    .widenScalarToNextPow2(0)
265
3.64k
    .legalIf(isPointer(0));
266
3.64k
267
3.64k
  setAction({G_FRAME_INDEX, PrivatePtr}, Legal);
268
3.64k
269
3.64k
  auto &FPOpActions = getActionDefinitionsBuilder(
270
3.64k
    { G_FADD, G_FMUL, G_FNEG, G_FABS, G_FMA, G_FCANONICALIZE})
271
3.64k
    .legalFor({S32, S64});
272
3.64k
273
3.64k
  if (ST.has16BitInsts()) {
274
2.01k
    if (ST.hasVOP3PInsts())
275
775
      FPOpActions.legalFor({S16, V2S16});
276
1.24k
    else
277
1.24k
      FPOpActions.legalFor({S16});
278
2.01k
  }
279
3.64k
280
3.64k
  auto &MinNumMaxNum = getActionDefinitionsBuilder({
281
3.64k
      G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE});
282
3.64k
283
3.64k
  if (ST.hasVOP3PInsts()) {
284
775
    MinNumMaxNum.customFor(FPTypesPK16)
285
775
      .clampMaxNumElements(0, S16, 2)
286
775
      .clampScalar(0, S16, S64)
287
775
      .scalarize(0);
288
2.86k
  } else if (ST.has16BitInsts()) {
289
1.24k
    MinNumMaxNum.customFor(FPTypes16)
290
1.24k
      .clampScalar(0, S16, S64)
291
1.24k
      .scalarize(0);
292
1.62k
  } else {
293
1.62k
    MinNumMaxNum.customFor(FPTypesBase)
294
1.62k
      .clampScalar(0, S32, S64)
295
1.62k
      .scalarize(0);
296
1.62k
  }
297
3.64k
298
3.64k
  // TODO: Implement
299
3.64k
  getActionDefinitionsBuilder({G_FMINIMUM, G_FMAXIMUM}).lower();
300
3.64k
301
3.64k
  if (ST.hasVOP3PInsts())
302
775
    FPOpActions.clampMaxNumElements(0, S16, 2);
303
3.64k
  FPOpActions
304
3.64k
    .scalarize(0)
305
3.64k
    .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
306
3.64k
307
3.64k
  if (ST.has16BitInsts()) {
308
2.01k
    getActionDefinitionsBuilder(G_FSQRT)
309
2.01k
      .legalFor({S32, S64, S16})
310
2.01k
      .scalarize(0)
311
2.01k
      .clampScalar(0, S16, S64);
312
2.01k
  } else {
313
1.62k
    getActionDefinitionsBuilder(G_FSQRT)
314
1.62k
      .legalFor({S32, S64})
315
1.62k
      .scalarize(0)
316
1.62k
      .clampScalar(0, S32, S64);
317
1.62k
  }
318
3.64k
319
3.64k
  getActionDefinitionsBuilder(G_FPTRUNC)
320
3.64k
    .legalFor({{S32, S64}, {S16, S32}})
321
3.64k
    .scalarize(0);
322
3.64k
323
3.64k
  getActionDefinitionsBuilder(G_FPEXT)
324
3.64k
    .legalFor({{S64, S32}, {S32, S16}})
325
3.64k
    .lowerFor({{S64, S16}}) // FIXME: Implement
326
3.64k
    .scalarize(0);
327
3.64k
328
3.64k
  // TODO: Verify V_BFI_B32 is generated from expanded bit ops.
329
3.64k
  getActionDefinitionsBuilder(G_FCOPYSIGN).lower();
330
3.64k
331
3.64k
  getActionDefinitionsBuilder(G_FSUB)
332
3.64k
      // Use actual fsub instruction
333
3.64k
      .legalFor({S32})
334
3.64k
      // Must use fadd + fneg
335
3.64k
      .lowerFor({S64, S16, V2S16})
336
3.64k
      .scalarize(0)
337
3.64k
      .clampScalar(0, S32, S64);
338
3.64k
339
3.64k
  getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT})
340
3.64k
    .legalFor({{S64, S32}, {S32, S16}, {S64, S16},
341
3.64k
               {S32, S1}, {S64, S1}, {S16, S1},
342
3.64k
               // FIXME: Hack
343
3.64k
               {S64, LLT::scalar(33)},
344
3.64k
               {S32, S8}, {S128, S32}, {S128, S64}, {S32, LLT::scalar(24)}})
345
3.64k
    .scalarize(0);
346
3.64k
347
3.64k
  getActionDefinitionsBuilder({G_SITOFP, G_UITOFP})
348
3.64k
    .legalFor({{S32, S32}, {S64, S32}})
349
3.64k
    .lowerFor({{S32, S64}})
350
3.64k
    .customFor({{S64, S64}})
351
3.64k
    .scalarize(0);
352
3.64k
353
3.64k
  getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI})
354
3.64k
    .legalFor({{S32, S32}, {S32, S64}})
355
3.64k
    .scalarize(0);
356
3.64k
357
3.64k
  getActionDefinitionsBuilder(G_INTRINSIC_ROUND)
358
3.64k
    .legalFor({S32, S64})
359
3.64k
    .scalarize(0);
360
3.64k
361
3.64k
  if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
362
2.57k
    getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
363
2.57k
      .legalFor({S32, S64})
364
2.57k
      .clampScalar(0, S32, S64)
365
2.57k
      .scalarize(0);
366
2.57k
  } else {
367
1.06k
    getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
368
1.06k
      .legalFor({S32})
369
1.06k
      .customFor({S64})
370
1.06k
      .clampScalar(0, S32, S64)
371
1.06k
      .scalarize(0);
372
1.06k
  }
373
3.64k
374
3.64k
  getActionDefinitionsBuilder(G_GEP)
375
3.64k
    .legalForCartesianProduct(AddrSpaces64, {S64})
376
3.64k
    .legalForCartesianProduct(AddrSpaces32, {S32})
377
3.64k
    .scalarize(0);
378
3.64k
379
3.64k
  setAction({G_BLOCK_ADDR, CodePtr}, Legal);
380
3.64k
381
3.64k
  auto &CmpBuilder =
382
3.64k
    getActionDefinitionsBuilder(G_ICMP)
383
3.64k
    .legalForCartesianProduct(
384
3.64k
      {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
385
3.64k
    .legalFor({{S1, S32}, {S1, S64}});
386
3.64k
  if (ST.has16BitInsts()) {
387
2.01k
    CmpBuilder.legalFor({{S1, S16}});
388
2.01k
  }
389
3.64k
390
3.64k
  CmpBuilder
391
3.64k
    .widenScalarToNextPow2(1)
392
3.64k
    .clampScalar(1, S32, S64)
393
3.64k
    .scalarize(0)
394
3.64k
    .legalIf(all(typeIs(0, S1), isPointer(1)));
395
3.64k
396
3.64k
  getActionDefinitionsBuilder(G_FCMP)
397
3.64k
    .legalForCartesianProduct({S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase)
398
3.64k
    .widenScalarToNextPow2(1)
399
3.64k
    .clampScalar(1, S32, S64)
400
3.64k
    .scalarize(0);
401
3.64k
402
3.64k
  // FIXME: fexp, flog2, flog10 needs to be custom lowered.
403
3.64k
  getActionDefinitionsBuilder({G_FPOW, G_FEXP, G_FEXP2,
404
3.64k
                               G_FLOG, G_FLOG2, G_FLOG10})
405
3.64k
    .legalFor({S32})
406
3.64k
    .scalarize(0);
407
3.64k
408
3.64k
  // The 64-bit versions produce 32-bit results, but only on the SALU.
409
3.64k
  getActionDefinitionsBuilder({G_CTLZ, G_CTLZ_ZERO_UNDEF,
410
3.64k
                               G_CTTZ, G_CTTZ_ZERO_UNDEF,
411
3.64k
                               G_CTPOP})
412
3.64k
    .legalFor({{S32, S32}, {S32, S64}})
413
3.64k
    .clampScalar(0, S32, S32)
414
3.64k
    .clampScalar(1, S32, S64)
415
3.64k
    .scalarize(0)
416
3.64k
    .widenScalarToNextPow2(0, 32)
417
3.64k
    .widenScalarToNextPow2(1, 32);
418
3.64k
419
3.64k
  // TODO: Expand for > s32
420
3.64k
  getActionDefinitionsBuilder(G_BSWAP)
421
3.64k
    .legalFor({S32})
422
3.64k
    .clampScalar(0, S32, S32)
423
3.64k
    .scalarize(0);
424
3.64k
425
3.64k
  if (ST.has16BitInsts()) {
426
2.01k
    if (ST.hasVOP3PInsts()) {
427
775
      getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
428
775
        .legalFor({S32, S16, V2S16})
429
775
        .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
430
775
        .clampMaxNumElements(0, S16, 2)
431
775
        .clampScalar(0, S16, S32)
432
775
        .widenScalarToNextPow2(0)
433
775
        .scalarize(0);
434
1.24k
    } else {
435
1.24k
      getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
436
1.24k
        .legalFor({S32, S16})
437
1.24k
        .widenScalarToNextPow2(0)
438
1.24k
        .clampScalar(0, S16, S32)
439
1.24k
        .scalarize(0);
440
1.24k
    }
441
2.01k
  } else {
442
1.62k
    getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
443
1.62k
      .legalFor({S32})
444
1.62k
      .clampScalar(0, S32, S32)
445
1.62k
      .widenScalarToNextPow2(0)
446
1.62k
      .scalarize(0);
447
1.62k
  }
448
3.64k
449
7.28k
  auto smallerThan = [](unsigned TypeIdx0, unsigned TypeIdx1) {
450
7.28k
    return [=](const LegalityQuery &Query) {
451
8
      return Query.Types[TypeIdx0].getSizeInBits() <
452
8
             Query.Types[TypeIdx1].getSizeInBits();
453
8
    };
454
7.28k
  };
455
3.64k
456
7.28k
  auto greaterThan = [](unsigned TypeIdx0, unsigned TypeIdx1) {
457
7.28k
    return [=](const LegalityQuery &Query) {
458
2
      return Query.Types[TypeIdx0].getSizeInBits() >
459
2
             Query.Types[TypeIdx1].getSizeInBits();
460
2
    };
461
7.28k
  };
462
3.64k
463
3.64k
  getActionDefinitionsBuilder(G_INTTOPTR)
464
3.64k
    // List the common cases
465
3.64k
    .legalForCartesianProduct(AddrSpaces64, {S64})
466
3.64k
    .legalForCartesianProduct(AddrSpaces32, {S32})
467
3.64k
    .scalarize(0)
468
3.64k
    // Accept any address space as long as the size matches
469
3.64k
    .legalIf(sameSize(0, 1))
470
3.64k
    .widenScalarIf(smallerThan(1, 0),
471
3.64k
      [](const LegalityQuery &Query) {
472
3
        return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
473
3
      })
474
3.64k
    .narrowScalarIf(greaterThan(1, 0),
475
3.64k
      [](const LegalityQuery &Query) {
476
1
        return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
477
1
      });
478
3.64k
479
3.64k
  getActionDefinitionsBuilder(G_PTRTOINT)
480
3.64k
    // List the common cases
481
3.64k
    .legalForCartesianProduct(AddrSpaces64, {S64})
482
3.64k
    .legalForCartesianProduct(AddrSpaces32, {S32})
483
3.64k
    .scalarize(0)
484
3.64k
    // Accept any address space as long as the size matches
485
3.64k
    .legalIf(sameSize(0, 1))
486
3.64k
    .widenScalarIf(smallerThan(0, 1),
487
3.64k
      [](const LegalityQuery &Query) {
488
3
        return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
489
3
      })
490
3.64k
    .narrowScalarIf(
491
3.64k
      greaterThan(0, 1),
492
3.64k
      [](const LegalityQuery &Query) {
493
1
        return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
494
1
      });
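  // Sketch of how the two mutations above behave (illustrative): for
  // G_INTTOPTR, type index 0 is the pointer result and index 1 the integer
  // source, so an s32 source feeding a 64-bit flat pointer is widened to s64,
  // while an s128 source feeding a 32-bit private pointer is narrowed to s32.
  // G_PTRTOINT applies the same size-matching in the opposite direction on
  // its integer result.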
495
3.64k
496
3.64k
  if (ST.hasFlatAddressSpace()) {
497
2.56k
    getActionDefinitionsBuilder(G_ADDRSPACE_CAST)
498
2.56k
      .scalarize(0)
499
2.56k
      .custom();
500
2.56k
  }
501
3.64k
502
3.64k
  // TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we
503
3.64k
  // handle some operations by just promoting the register during
504
3.64k
  // selection. There are also d16 loads on GFX9+ which preserve the high bits.
505
3.64k
  getActionDefinitionsBuilder({G_LOAD, G_STORE})
506
3.64k
    .narrowScalarIf([](const LegalityQuery &Query) {
507
414
        unsigned Size = Query.Types[0].getSizeInBits();
508
414
        unsigned MemSize = Query.MMODescrs[0].SizeInBits;
509
414
        return (Size > 32 && MemSize < Size);
      },
      [](const LegalityQuery &Query) {
        return std::make_pair(0, LLT::scalar(32));
      })
    .fewerElementsIf([=](const LegalityQuery &Query) {
        unsigned MemSize = Query.MMODescrs[0].SizeInBits;
        return (MemSize == 96) &&
               Query.Types[0].isVector() &&
               !ST.hasDwordx3LoadStores();
      },
      [=](const LegalityQuery &Query) {
        return std::make_pair(0, V2S32);
      })
    .legalIf([=](const LegalityQuery &Query) {
        const LLT &Ty0 = Query.Types[0];

        unsigned Size = Ty0.getSizeInBits();
        unsigned MemSize = Query.MMODescrs[0].SizeInBits;
        if (Size < 32 || (Size > 32 && MemSize < Size))
          return false;

        if (Ty0.isVector() && Size != MemSize)
          return false;

        // TODO: Decompose private loads into 4-byte components.
        // TODO: Illegal flat loads on SI
        switch (MemSize) {
        case 8:
        case 16:
          return Size == 32;
        case 32:
        case 64:
        case 128:
          return true;

        case 96:
          return ST.hasDwordx3LoadStores();

        case 256:
        case 512:
          // TODO: Possibly support loads of i256 and i512. This will require
          // adding i256 and i512 types to MVT in order to be able to use
          // TableGen.
          // TODO: Add support for other vector types, this will require
          //       defining more value mappings for the new types.
          return Ty0.isVector() && (Ty0.getScalarType().getSizeInBits() == 32 ||
                                    Ty0.getScalarType().getSizeInBits() == 64);

        default:
          return false;
        }
      })
    .clampScalar(0, S32, S64);
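  // A few concrete cases for the G_LOAD/G_STORE rules above (illustrative):
  // a v4s32 global load with a 128-bit memory access hits "case 128" and is
  // legal; an s64 load whose memory access is only 32 bits wide is narrowed
  // to s32 by the narrowScalarIf rule; and a v3s32 (96-bit) load is legal
  // only when ST.hasDwordx3LoadStores(), otherwise the fewerElementsIf rule
  // reduces the result type to v2s32.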
563
3.64k
564
3.64k
565
3.64k
  // FIXME: Handle alignment requirements.
566
3.64k
  auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD})
567
3.64k
    .legalForTypesWithMemDesc({
568
3.64k
        {S32, GlobalPtr, 8, 8},
569
3.64k
        {S32, GlobalPtr, 16, 8},
570
3.64k
        {S32, LocalPtr, 8, 8},
571
3.64k
        {S32, LocalPtr, 16, 8},
572
3.64k
        {S32, PrivatePtr, 8, 8},
573
3.64k
        {S32, PrivatePtr, 16, 8}});
574
3.64k
  if (ST.hasFlatAddressSpace()) {
575
2.56k
    ExtLoads.legalForTypesWithMemDesc({{S32, FlatPtr, 8, 8},
576
2.56k
                                       {S32, FlatPtr, 16, 8}});
577
2.56k
  }
578
3.64k
579
3.64k
  ExtLoads.clampScalar(0, S32, S32)
580
3.64k
          .widenScalarToNextPow2(0)
581
3.64k
          .unsupportedIfMemSizeNotPow2()
582
3.64k
          .lower();
583
3.64k
584
3.64k
  auto &Atomics = getActionDefinitionsBuilder(
585
3.64k
    {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB,
586
3.64k
     G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
587
3.64k
     G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX,
588
3.64k
     G_ATOMICRMW_UMIN, G_ATOMIC_CMPXCHG})
589
3.64k
    .legalFor({{S32, GlobalPtr}, {S32, LocalPtr},
590
3.64k
               {S64, GlobalPtr}, {S64, LocalPtr}});
591
3.64k
  if (ST.hasFlatAddressSpace()) {
592
2.56k
    Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}});
593
2.56k
  }
594
3.64k
595
3.64k
  // TODO: Pointer types, any 32-bit or 64-bit vector
596
3.64k
  getActionDefinitionsBuilder(G_SELECT)
597
3.64k
    .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16,
598
3.64k
          GlobalPtr, LocalPtr, FlatPtr, PrivatePtr,
599
3.64k
          LLT::vector(2, LocalPtr), LLT::vector(2, PrivatePtr)}, {S1})
600
3.64k
    .clampScalar(0, S16, S64)
601
3.64k
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
602
3.64k
    .fewerElementsIf(numElementsNotEven(0), scalarize(0))
603
3.64k
    .scalarize(1)
604
3.64k
    .clampMaxNumElements(0, S32, 2)
605
3.64k
    .clampMaxNumElements(0, LocalPtr, 2)
606
3.64k
    .clampMaxNumElements(0, PrivatePtr, 2)
607
3.64k
    .scalarize(0)
608
3.64k
    .widenScalarToNextPow2(0)
609
3.64k
    .legalIf(all(isPointer(0), typeIs(1, S1)));
610
3.64k
611
3.64k
  // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can
612
3.64k
  // be more flexible with the shift amount type.
613
3.64k
  auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR})
614
3.64k
    .legalFor({{S32, S32}, {S64, S32}});
615
3.64k
  if (ST.has16BitInsts()) {
616
2.01k
    if (ST.hasVOP3PInsts()) {
617
775
      Shifts.legalFor({{S16, S32}, {S16, S16}, {V2S16, V2S16}})
618
775
            .clampMaxNumElements(0, S16, 2);
619
775
    } else
620
1.24k
      Shifts.legalFor({{S16, S32}, {S16, S16}});
621
2.01k
622
2.01k
    Shifts.clampScalar(1, S16, S32);
623
2.01k
    Shifts.clampScalar(0, S16, S64);
624
2.01k
    Shifts.widenScalarToNextPow2(0, 16);
625
2.01k
  } else {
626
1.62k
    // Make sure we legalize the shift amount type first, as the general
627
1.62k
    // expansion for the shifted type will produce much worse code if it hasn't
628
1.62k
    // been truncated already.
629
1.62k
    Shifts.clampScalar(1, S32, S32);
630
1.62k
    Shifts.clampScalar(0, S32, S64);
631
1.62k
    Shifts.widenScalarToNextPow2(0, 32);
632
1.62k
  }
633
3.64k
  Shifts.scalarize(0);
634
3.64k
635
7.28k
  for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) {
636
7.28k
    unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0;
    unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1;
    unsigned IdxTypeIdx = 2;

    getActionDefinitionsBuilder(Op)
      .customIf([=](const LegalityQuery &Query) {
          const LLT EltTy = Query.Types[EltTypeIdx];
          const LLT VecTy = Query.Types[VecTypeIdx];
          const LLT IdxTy = Query.Types[IdxTypeIdx];
          return (EltTy.getSizeInBits() == 16 ||
                  EltTy.getSizeInBits() % 32 == 0) &&
                 VecTy.getSizeInBits() % 32 == 0 &&
                 VecTy.getSizeInBits() <= 512 &&
                 IdxTy.getSizeInBits() == 32;
        })
      .clampScalar(EltTypeIdx, S32, S64)
      .clampScalar(VecTypeIdx, S32, S64)
      .clampScalar(IdxTypeIdx, S32, S32);
  }
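  // Example for the customIf predicate above (illustrative): extracting an
  // s32 element from a v4s32 with an s32 index satisfies all four conditions
  // (element size a multiple of 32, vector size 128 <= 512, 32-bit index), so
  // the operation is marked Custom and later expanded by
  // legalizeExtractVectorElt / legalizeInsertVectorElt below.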
655
3.64k
656
3.64k
  getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
657
3.64k
    .unsupportedIf([=](const LegalityQuery &Query) {
658
0
        const LLT &EltTy = Query.Types[1].getElementType();
659
0
        return Query.Types[0] != EltTy;
660
0
      });
661
3.64k
662
7.28k
  for (unsigned Op : {G_EXTRACT, G_INSERT}) {
663
7.28k
    unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0;
    unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1;

    // FIXME: Doesn't handle extract of illegal sizes.
    getActionDefinitionsBuilder(Op)
      .legalIf([=](const LegalityQuery &Query) {
          const LLT BigTy = Query.Types[BigTyIdx];
          const LLT LitTy = Query.Types[LitTyIdx];
          return (BigTy.getSizeInBits() % 32 == 0) &&
                 (LitTy.getSizeInBits() % 16 == 0);
        })
      .widenScalarIf(
        [=](const LegalityQuery &Query) {
          const LLT BigTy = Query.Types[BigTyIdx];
          return (BigTy.getScalarSizeInBits() < 16);
        },
        LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16))
      .widenScalarIf(
        [=](const LegalityQuery &Query) {
          const LLT LitTy = Query.Types[LitTyIdx];
          return (LitTy.getScalarSizeInBits() < 16);
        },
        LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16))
      .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
      .widenScalarToNextPow2(BigTyIdx, 32);

  }
690
3.64k
691
3.64k
  getActionDefinitionsBuilder(G_BUILD_VECTOR)
692
3.64k
      .legalForCartesianProduct(AllS32Vectors, {S32})
693
3.64k
      .legalForCartesianProduct(AllS64Vectors, {S64})
694
3.64k
      .clampNumElements(0, V16S32, V16S32)
695
3.64k
      .clampNumElements(0, V2S64, V8S64)
696
3.64k
      .minScalarSameAs(1, 0)
697
3.64k
      .legalIf(isRegisterType(0))
698
3.64k
      .minScalarOrElt(0, S32);
699
3.64k
700
3.64k
  getActionDefinitionsBuilder(G_CONCAT_VECTORS)
701
3.64k
    .legalIf(isRegisterType(0));
702
3.64k
703
3.64k
  // Merge/Unmerge
704
7.28k
  for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
    unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
    unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;

    auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
      const LLT &Ty = Query.Types[TypeIdx];
      if (Ty.isVector()) {
        const LLT &EltTy = Ty.getElementType();
        if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 64)
          return true;
        if (!isPowerOf2_32(EltTy.getSizeInBits()))
          return true;
      }
      return false;
    };

    getActionDefinitionsBuilder(Op)
      .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16)
      // Clamp the little scalar to s8-s256 and make it a power of 2. It's not
      // worth considering the multiples of 64 since 2*192 and 2*384 are not
      // valid.
      .clampScalar(LitTyIdx, S16, S256)
      .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32)

      // Break up vectors with weird elements into scalars
      .fewerElementsIf(
        [=](const LegalityQuery &Query) { return notValidElt(Query, 0); },
        scalarize(0))
      .fewerElementsIf(
        [=](const LegalityQuery &Query) { return notValidElt(Query, 1); },
        scalarize(1))
      .clampScalar(BigTyIdx, S32, S512)
      .widenScalarIf(
        [=](const LegalityQuery &Query) {
          const LLT &Ty = Query.Types[BigTyIdx];
          return !isPowerOf2_32(Ty.getSizeInBits()) &&
                 Ty.getSizeInBits() % 16 != 0;
        },
        [=](const LegalityQuery &Query) {
          // Pick the next power of 2, or a multiple of 64 over 128.
          // Whichever is smaller.
          const LLT &Ty = Query.Types[BigTyIdx];
          unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1);
          if (NewSizeInBits >= 256) {
            unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1);
            if (RoundedTo < NewSizeInBits)
              NewSizeInBits = RoundedTo;
          }
          return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits));
        })
      .legalIf([=](const LegalityQuery &Query) {
          const LLT &BigTy = Query.Types[BigTyIdx];
          const LLT &LitTy = Query.Types[LitTyIdx];

          if (BigTy.isVector() && BigTy.getSizeInBits() < 32)
            return false;
          if (LitTy.isVector() && LitTy.getSizeInBits() < 32)
            return false;

          return BigTy.getSizeInBits() % 16 == 0 &&
                 LitTy.getSizeInBits() % 16 == 0 &&
                 BigTy.getSizeInBits() <= 512;
        })
      // Any vectors left are the wrong size. Scalarize them.
      .scalarize(0)
      .scalarize(1);
  }
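  // Worked example for the widenScalarIf mutation above (illustrative): an
  // s72 big type is neither a power of two nor a multiple of 16, so it is
  // widened to 1 << Log2_32_Ceil(73) = s128; an s264 big type would first
  // compute 512, but since that is >= 256 the multiple-of-64 rounding
  // alignTo<64>(265) = 320 is smaller and wins, giving s320.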
771
3.64k
772
3.64k
  computeTables();
773
3.64k
  verify(*ST.getInstrInfo());
774
3.64k
}
775
776
bool AMDGPULegalizerInfo::legalizeCustom(MachineInstr &MI,
777
                                         MachineRegisterInfo &MRI,
778
                                         MachineIRBuilder &MIRBuilder,
779
292
                                         GISelChangeObserver &Observer) const {
780
292
  switch (MI.getOpcode()) {
781
292
  case TargetOpcode::G_ADDRSPACE_CAST:
782
36
    return legalizeAddrSpaceCast(MI, MRI, MIRBuilder);
783
292
  case TargetOpcode::G_FRINT:
784
3
    return legalizeFrint(MI, MRI, MIRBuilder);
785
292
  case TargetOpcode::G_FCEIL:
786
3
    return legalizeFceil(MI, MRI, MIRBuilder);
787
292
  case TargetOpcode::G_INTRINSIC_TRUNC:
788
6
    return legalizeIntrinsicTrunc(MI, MRI, MIRBuilder);
789
292
  case TargetOpcode::G_SITOFP:
790
1
    return legalizeITOFP(MI, MRI, MIRBuilder, true);
791
292
  case TargetOpcode::G_UITOFP:
792
1
    return legalizeITOFP(MI, MRI, MIRBuilder, false);
793
292
  case TargetOpcode::G_FMINNUM:
794
206
  case TargetOpcode::G_FMAXNUM:
795
206
  case TargetOpcode::G_FMINNUM_IEEE:
796
206
  case TargetOpcode::G_FMAXNUM_IEEE:
797
206
    return legalizeMinNumMaxNum(MI, MRI, MIRBuilder);
798
206
  case TargetOpcode::G_EXTRACT_VECTOR_ELT:
799
31
    return legalizeExtractVectorElt(MI, MRI, MIRBuilder);
800
206
  case TargetOpcode::G_INSERT_VECTOR_ELT:
801
5
    return legalizeInsertVectorElt(MI, MRI, MIRBuilder);
802
206
  default:
803
0
    return false;
804
0
  }
805
0
806
0
  llvm_unreachable("expected switch to return");
807
0
}
808
809
Register AMDGPULegalizerInfo::getSegmentAperture(
810
  unsigned AS,
811
  MachineRegisterInfo &MRI,
812
8
  MachineIRBuilder &MIRBuilder) const {
813
8
  MachineFunction &MF = MIRBuilder.getMF();
814
8
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
815
8
  const LLT S32 = LLT::scalar(32);
816
8
817
8
  if (ST.hasApertureRegs()) {
818
4
    // FIXME: Use inline constants (src_{shared, private}_base) instead of
819
4
    // getreg.
820
4
    unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ?
821
0
        AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE :
822
4
        AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE;
823
4
    unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ?
824
0
        AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE :
825
4
        AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE;
826
4
    unsigned Encoding =
827
4
        AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ |
828
4
        Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ |
829
4
        WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_;
830
4
831
4
    Register ApertureReg = MRI.createGenericVirtualRegister(S32);
832
4
    Register GetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
833
4
834
4
    MIRBuilder.buildInstr(AMDGPU::S_GETREG_B32)
835
4
      .addDef(GetReg)
836
4
      .addImm(Encoding);
837
4
    MRI.setType(GetReg, S32);
838
4
839
4
    auto ShiftAmt = MIRBuilder.buildConstant(S32, WidthM1 + 1);
840
4
    MIRBuilder.buildInstr(TargetOpcode::G_SHL)
841
4
      .addDef(ApertureReg)
842
4
      .addUse(GetReg)
843
4
      .addUse(ShiftAmt.getReg(0));
844
4
845
4
    return ApertureReg;
846
4
  }
847
4
848
4
  Register QueuePtr = MRI.createGenericVirtualRegister(
849
4
    LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
850
4
851
4
  // FIXME: Placeholder until we can track the input registers.
852
4
  MIRBuilder.buildConstant(QueuePtr, 0xdeadbeef);
853
4
854
4
  // Offset into amd_queue_t for group_segment_aperture_base_hi /
855
4
  // private_segment_aperture_base_hi.
856
4
  uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;
857
4
858
4
  // FIXME: Don't use undef
859
4
  Value *V = UndefValue::get(PointerType::get(
860
4
                               Type::getInt8Ty(MF.getFunction().getContext()),
861
4
                               AMDGPUAS::CONSTANT_ADDRESS));
862
4
863
4
  MachinePointerInfo PtrInfo(V, StructOffset);
864
4
  MachineMemOperand *MMO = MF.getMachineMemOperand(
865
4
    PtrInfo,
866
4
    MachineMemOperand::MOLoad |
867
4
    MachineMemOperand::MODereferenceable |
868
4
    MachineMemOperand::MOInvariant,
869
4
    4,
870
4
    MinAlign(64, StructOffset));
871
4
872
4
  Register LoadResult = MRI.createGenericVirtualRegister(S32);
873
4
  Register LoadAddr;
874
4
875
4
  MIRBuilder.materializeGEP(LoadAddr, QueuePtr, LLT::scalar(64), StructOffset);
876
4
  MIRBuilder.buildLoad(LoadResult, LoadAddr, *MMO);
877
4
  return LoadResult;
878
4
}
879
880
bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
881
  MachineInstr &MI, MachineRegisterInfo &MRI,
882
36
  MachineIRBuilder &MIRBuilder) const {
883
36
  MachineFunction &MF = MIRBuilder.getMF();
884
36
885
36
  MIRBuilder.setInstr(MI);
886
36
887
36
  Register Dst = MI.getOperand(0).getReg();
888
36
  Register Src = MI.getOperand(1).getReg();
889
36
890
36
  LLT DstTy = MRI.getType(Dst);
891
36
  LLT SrcTy = MRI.getType(Src);
892
36
  unsigned DestAS = DstTy.getAddressSpace();
893
36
  unsigned SrcAS = SrcTy.getAddressSpace();
894
36
895
36
  // TODO: Avoid reloading from the queue ptr for each cast, or at least each
896
36
  // vector element.
897
36
  assert(!DstTy.isVector());
898
36
899
36
  const AMDGPUTargetMachine &TM
900
36
    = static_cast<const AMDGPUTargetMachine &>(MF.getTarget());
901
36
902
36
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
903
36
  if (ST.getTargetLowering()->isNoopAddrSpaceCast(SrcAS, DestAS)) {
904
20
    MI.setDesc(MIRBuilder.getTII().get(TargetOpcode::G_BITCAST));
905
20
    return true;
906
20
  }
907
16
908
16
  if (SrcAS == AMDGPUAS::FLAT_ADDRESS) {
909
8
    assert(DestAS == AMDGPUAS::LOCAL_ADDRESS ||
910
8
           DestAS == AMDGPUAS::PRIVATE_ADDRESS);
911
8
    unsigned NullVal = TM.getNullPointerValue(DestAS);
912
8
913
8
    auto SegmentNull = MIRBuilder.buildConstant(DstTy, NullVal);
914
8
    auto FlatNull = MIRBuilder.buildConstant(SrcTy, 0);
915
8
916
8
    Register PtrLo32 = MRI.createGenericVirtualRegister(DstTy);
917
8
918
8
    // Extract low 32-bits of the pointer.
919
8
    MIRBuilder.buildExtract(PtrLo32, Src, 0);
920
8
921
8
    Register CmpRes = MRI.createGenericVirtualRegister(LLT::scalar(1));
922
8
    MIRBuilder.buildICmp(CmpInst::ICMP_NE, CmpRes, Src, FlatNull.getReg(0));
923
8
    MIRBuilder.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0));
924
8
925
8
    MI.eraseFromParent();
926
8
    return true;
927
8
  }
928
8
929
8
  assert(SrcAS == AMDGPUAS::LOCAL_ADDRESS ||
930
8
         SrcAS == AMDGPUAS::PRIVATE_ADDRESS);
931
8
932
8
  auto SegmentNull =
933
8
      MIRBuilder.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS));
934
8
  auto FlatNull =
935
8
      MIRBuilder.buildConstant(DstTy, TM.getNullPointerValue(DestAS));
936
8
937
8
  Register ApertureReg = getSegmentAperture(DestAS, MRI, MIRBuilder);
938
8
939
8
  Register CmpRes = MRI.createGenericVirtualRegister(LLT::scalar(1));
940
8
  MIRBuilder.buildICmp(CmpInst::ICMP_NE, CmpRes, Src, SegmentNull.getReg(0));
941
8
942
8
  Register BuildPtr = MRI.createGenericVirtualRegister(DstTy);
943
8
944
8
  // Coerce the type of the low half of the result so we can use merge_values.
945
8
  Register SrcAsInt = MRI.createGenericVirtualRegister(LLT::scalar(32));
946
8
  MIRBuilder.buildInstr(TargetOpcode::G_PTRTOINT)
947
8
    .addDef(SrcAsInt)
948
8
    .addUse(Src);
949
8
950
8
  // TODO: Should we allow mismatched types but matching sizes in merges to
951
8
  // avoid the ptrtoint?
952
8
  MIRBuilder.buildMerge(BuildPtr, {SrcAsInt, ApertureReg});
953
8
  MIRBuilder.buildSelect(Dst, CmpRes, BuildPtr, FlatNull.getReg(0));
954
8
955
8
  MI.eraseFromParent();
956
8
  return true;
957
8
}
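// Shape of the non-trivial cast expansion above (illustrative): converting a
// local/private pointer %src to the flat address space produces roughly
//
//   %flat = select (%src != segment_null),
//                  merge_values(ptrtoint(%src), aperture_hi),
//                  flat_null
//
// where aperture_hi is the upper 32 bits of that segment's window in the flat
// address space, obtained from getSegmentAperture either via s_getreg on
// targets with aperture registers or via a load from the queue pointer.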
958
959
bool AMDGPULegalizerInfo::legalizeFrint(
960
  MachineInstr &MI, MachineRegisterInfo &MRI,
961
3
  MachineIRBuilder &MIRBuilder) const {
962
3
  MIRBuilder.setInstr(MI);
963
3
964
3
  Register Src = MI.getOperand(1).getReg();
965
3
  LLT Ty = MRI.getType(Src);
966
3
  assert(Ty.isScalar() && Ty.getSizeInBits() == 64);
967
3
968
3
  APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
969
3
  APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");
970
3
971
3
  auto C1 = MIRBuilder.buildFConstant(Ty, C1Val);
972
3
  auto CopySign = MIRBuilder.buildFCopysign(Ty, C1, Src);
973
3
974
3
  // TODO: Should this propagate fast-math-flags?
975
3
  auto Tmp1 = MIRBuilder.buildFAdd(Ty, Src, CopySign);
976
3
  auto Tmp2 = MIRBuilder.buildFSub(Ty, Tmp1, CopySign);
977
3
978
3
  auto C2 = MIRBuilder.buildFConstant(Ty, C2Val);
979
3
  auto Fabs = MIRBuilder.buildFAbs(Ty, Src);
980
3
981
3
  auto Cond = MIRBuilder.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2);
982
3
  MIRBuilder.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2);
983
3
  return true;
984
3
}
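// Why the constants above work (illustrative): for |src| < 2^52, adding
// copysign(0x1.0p+52, src) pushes the value into the range where a double has
// no fraction bits, so the rounding performed by the fadd leaves the nearest
// integer, and subtracting the same constant recovers it.
// 0x1.fffffffffffffp+51 is the largest double below 2^52; anything larger in
// magnitude is already an integer, so the final select returns src unchanged.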
985
986
bool AMDGPULegalizerInfo::legalizeFceil(
987
  MachineInstr &MI, MachineRegisterInfo &MRI,
988
3
  MachineIRBuilder &B) const {
989
3
  B.setInstr(MI);
990
3
991
3
  const LLT S1 = LLT::scalar(1);
992
3
  const LLT S64 = LLT::scalar(64);
993
3
994
3
  Register Src = MI.getOperand(1).getReg();
995
3
  assert(MRI.getType(Src) == S64);
996
3
997
3
  // result = trunc(src)
998
3
  // if (src > 0.0 && src != result)
999
3
  //   result += 1.0
1000
3
1001
3
  auto Trunc = B.buildInstr(TargetOpcode::G_INTRINSIC_TRUNC, {S64}, {Src});
1002
3
1003
3
  const auto Zero = B.buildFConstant(S64, 0.0);
1004
3
  const auto One = B.buildFConstant(S64, 1.0);
1005
3
  auto Lt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero);
1006
3
  auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc);
1007
3
  auto And = B.buildAnd(S1, Lt0, NeTrunc);
1008
3
  auto Add = B.buildSelect(S64, And, One, Zero);
1009
3
1010
3
  // TODO: Should this propagate fast-math-flags?
1011
3
  B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add);
1012
3
  return true;
1013
3
}
1014
1015
static MachineInstrBuilder extractF64Exponent(unsigned Hi,
1016
6
                                              MachineIRBuilder &B) {
1017
6
  const unsigned FractBits = 52;
1018
6
  const unsigned ExpBits = 11;
1019
6
  LLT S32 = LLT::scalar(32);
1020
6
1021
6
  auto Const0 = B.buildConstant(S32, FractBits - 32);
1022
6
  auto Const1 = B.buildConstant(S32, ExpBits);
1023
6
1024
6
  auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}, false)
1025
6
    .addUse(Const0.getReg(0))
1026
6
    .addUse(Const1.getReg(0));
1027
6
1028
6
  return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023));
1029
6
}
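// Bit layout assumed above (illustrative): an IEEE f64 is
// [sign:1][exponent:11][fraction:52], so the 11 exponent bits start at bit 20
// of the high 32-bit word (52 - 32). The ubfe extracts that biased field and
// the final subtraction of 1023 removes the bias; e.g. for 8.0 = 2^3 the
// stored field is 1026 and the helper returns 3.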
1030
1031
bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc(
1032
  MachineInstr &MI, MachineRegisterInfo &MRI,
1033
6
  MachineIRBuilder &B) const {
1034
6
  B.setInstr(MI);
1035
6
1036
6
  const LLT S1 = LLT::scalar(1);
1037
6
  const LLT S32 = LLT::scalar(32);
1038
6
  const LLT S64 = LLT::scalar(64);
1039
6
1040
6
  Register Src = MI.getOperand(1).getReg();
1041
6
  assert(MRI.getType(Src) == S64);
1042
6
1043
6
  // TODO: Should this use extract since the low half is unused?
1044
6
  auto Unmerge = B.buildUnmerge({S32, S32}, Src);
1045
6
  Register Hi = Unmerge.getReg(1);
1046
6
1047
6
  // Extract the upper half, since this is where we will find the sign and
1048
6
  // exponent.
1049
6
  auto Exp = extractF64Exponent(Hi, B);
1050
6
1051
6
  const unsigned FractBits = 52;
1052
6
1053
6
  // Extract the sign bit.
1054
6
  const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31);
1055
6
  auto SignBit = B.buildAnd(S32, Hi, SignBitMask);
1056
6
1057
6
  const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1);
1058
6
1059
6
  const auto Zero32 = B.buildConstant(S32, 0);
1060
6
1061
6
  // Extend back to 64-bits.
1062
6
  auto SignBit64 = B.buildMerge(S64, {Zero32.getReg(0), SignBit.getReg(0)});
1063
6
1064
6
  auto Shr = B.buildAShr(S64, FractMask, Exp);
1065
6
  auto Not = B.buildNot(S64, Shr);
1066
6
  auto Tmp0 = B.buildAnd(S64, Src, Not);
1067
6
  auto FiftyOne = B.buildConstant(S32, FractBits - 1);
1068
6
1069
6
  auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32);
1070
6
  auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne);
1071
6
1072
6
  auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0);
1073
6
  B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1);
1074
6
  return true;
1075
6
}
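// Summary of the expansion above (illustrative): with Exp the unbiased
// exponent, truncation clears the low (52 - Exp) fraction bits via
// Src & ~(FractMask >> Exp). The two compares handle the boundary cases:
// Exp < 0 means |Src| < 1, so the result is just the sign bit (+/-0.0), and
// Exp > 51 means Src already has no fractional part, so it is returned as is.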
1076
1077
bool AMDGPULegalizerInfo::legalizeITOFP(
1078
  MachineInstr &MI, MachineRegisterInfo &MRI,
1079
2
  MachineIRBuilder &B, bool Signed) const {
1080
2
  B.setInstr(MI);
1081
2
1082
2
  Register Dst = MI.getOperand(0).getReg();
1083
2
  Register Src = MI.getOperand(1).getReg();
1084
2
1085
2
  const LLT S64 = LLT::scalar(64);
1086
2
  const LLT S32 = LLT::scalar(32);
1087
2
1088
2
  assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64);
1089
2
1090
2
  auto Unmerge = B.buildUnmerge({S32, S32}, Src);
1091
2
1092
2
  auto CvtHi = Signed ?
    B.buildSITOFP(S64, Unmerge.getReg(1)) :
    B.buildUITOFP(S64, Unmerge.getReg(1));
1095
2
1096
2
  auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0));
1097
2
1098
2
  auto ThirtyTwo = B.buildConstant(S32, 32);
1099
2
  auto LdExp = B.buildIntrinsic(Intrinsic::amdgcn_ldexp, {S64}, false)
1100
2
    .addUse(CvtHi.getReg(0))
1101
2
    .addUse(ThirtyTwo.getReg(0));
1102
2
1103
2
  // TODO: Should this propagate fast-math-flags?
1104
2
  B.buildFAdd(Dst, LdExp, CvtLo);
1105
2
  MI.eraseFromParent();
1106
2
  return true;
1107
2
}
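// Arithmetic behind the expansion above (illustrative): a 64-bit integer
// x = Hi * 2^32 + Lo, with Hi carrying the sign in the signed case, so the
// result is computed as ldexp(convert(Hi), 32) + uitofp(Lo), where convert is
// sitofp for G_SITOFP and uitofp for G_UITOFP.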
1108
1109
bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(
1110
  MachineInstr &MI, MachineRegisterInfo &MRI,
1111
206
  MachineIRBuilder &B) const {
1112
206
  MachineFunction &MF = B.getMF();
1113
206
  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1114
206
1115
206
  const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE ||
                        MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE;
1117
206
1118
206
  // With ieee_mode disabled, the instructions have the correct behavior
1119
206
  // already for G_FMINNUM/G_FMAXNUM
1120
206
  if (!MFI->getMode().IEEE)
1121
6
    return !IsIEEEOp;
1122
200
1123
200
  if (IsIEEEOp)
1124
100
    return true;
1125
100
1126
100
  MachineIRBuilder HelperBuilder(MI);
1127
100
  GISelObserverWrapper DummyObserver;
1128
100
  LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
1129
100
  HelperBuilder.setMBB(*MI.getParent());
1130
100
  return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized;
1131
100
}
1132
1133
bool AMDGPULegalizerInfo::legalizeExtractVectorElt(
1134
  MachineInstr &MI, MachineRegisterInfo &MRI,
1135
31
  MachineIRBuilder &B) const {
1136
31
  // TODO: Should move some of this into LegalizerHelper.
1137
31
1138
31
  // TODO: Promote dynamic indexing of s16 to s32
1139
31
  // TODO: Dynamic s64 indexing is only legal for SGPR.
1140
31
  Optional<int64_t> IdxVal = getConstantVRegVal(MI.getOperand(2).getReg(), MRI);
1141
31
  if (!IdxVal) // Dynamic case will be selected to register indexing.
1142
11
    return true;
1143
20
1144
20
  Register Dst = MI.getOperand(0).getReg();
1145
20
  Register Vec = MI.getOperand(1).getReg();
1146
20
1147
20
  LLT VecTy = MRI.getType(Vec);
1148
20
  LLT EltTy = VecTy.getElementType();
1149
20
  assert(EltTy == MRI.getType(Dst));
1150
20
1151
20
  B.setInstr(MI);
1152
20
1153
20
  if (IdxVal.getValue() < VecTy.getNumElements())
1154
18
    B.buildExtract(Dst, Vec, IdxVal.getValue() * EltTy.getSizeInBits());
1155
2
  else
1156
2
    B.buildUndef(Dst);
1157
20
1158
20
  MI.eraseFromParent();
1159
20
  return true;
1160
20
}
1161
1162
bool AMDGPULegalizerInfo::legalizeInsertVectorElt(
1163
  MachineInstr &MI, MachineRegisterInfo &MRI,
1164
5
  MachineIRBuilder &B) const {
1165
5
  // TODO: Should move some of this into LegalizerHelper.
1166
5
1167
5
  // TODO: Promote dynamic indexing of s16 to s32
1168
5
  // TODO: Dynamic s64 indexing is only legal for SGPR.
1169
5
  Optional<int64_t> IdxVal = getConstantVRegVal(MI.getOperand(3).getReg(), MRI);
1170
5
  if (!IdxVal) // Dynamic case will be selected to register indexing.
1171
2
    return true;
1172
3
1173
3
  Register Dst = MI.getOperand(0).getReg();
1174
3
  Register Vec = MI.getOperand(1).getReg();
1175
3
  Register Ins = MI.getOperand(2).getReg();
1176
3
1177
3
  LLT VecTy = MRI.getType(Vec);
1178
3
  LLT EltTy = VecTy.getElementType();
1179
3
  assert(EltTy == MRI.getType(Ins));
1180
3
1181
3
  B.setInstr(MI);
1182
3
1183
3
  if (IdxVal.getValue() < VecTy.getNumElements())
1184
2
    B.buildInsert(Dst, Vec, Ins, IdxVal.getValue() * EltTy.getSizeInBits());
1185
1
  else
1186
1
    B.buildUndef(Dst);
1187
3
1188
3
  MI.eraseFromParent();
1189
3
  return true;
1190
3
}
1191
1192
// Return the use branch instruction, otherwise null if the usage is invalid.
1193
static MachineInstr *verifyCFIntrinsic(MachineInstr &MI,
1194
8
                                       MachineRegisterInfo &MRI) {
1195
8
  Register CondDef = MI.getOperand(0).getReg();
1196
8
  if (!MRI.hasOneNonDBGUse(CondDef))
1197
1
    return nullptr;
1198
7
1199
7
  MachineInstr &UseMI = *MRI.use_instr_nodbg_begin(CondDef);
1200
7
  return UseMI.getParent() == MI.getParent() &&
    UseMI.getOpcode() == AMDGPU::G_BRCOND ? &UseMI : nullptr;
1202
7
}
1203
1204
Register AMDGPULegalizerInfo::getLiveInRegister(MachineRegisterInfo &MRI,
1205
152
                                                Register Reg, LLT Ty) const {
1206
152
  Register LiveIn = MRI.getLiveInVirtReg(Reg);
1207
152
  if (LiveIn)
1208
152
    return LiveIn;
1209
0
1210
0
  Register NewReg = MRI.createGenericVirtualRegister(Ty);
1211
0
  MRI.addLiveIn(Reg, NewReg);
1212
0
  return NewReg;
1213
0
}
1214
1215
bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B,
1216
152
                                         const ArgDescriptor *Arg) const {
1217
152
  if (!Arg->isRegister())
1218
0
    return false; // TODO: Handle these
1219
152
1220
152
  assert(Arg->getRegister() != 0);
1221
152
  assert(Arg->getRegister().isPhysical());
1222
152
1223
152
  MachineRegisterInfo &MRI = *B.getMRI();
1224
152
1225
152
  LLT Ty = MRI.getType(DstReg);
1226
152
  Register LiveIn = getLiveInRegister(MRI, Arg->getRegister(), Ty);
1227
152
1228
152
  if (Arg->isMasked()) {
1229
0
    // TODO: Should we try to emit this once in the entry block?
1230
0
    const LLT S32 = LLT::scalar(32);
1231
0
    const unsigned Mask = Arg->getMask();
1232
0
    const unsigned Shift = countTrailingZeros<unsigned>(Mask);
1233
0
1234
0
    auto ShiftAmt = B.buildConstant(S32, Shift);
1235
0
    auto LShr = B.buildLShr(S32, LiveIn, ShiftAmt);
1236
0
    B.buildAnd(DstReg, LShr, B.buildConstant(S32, Mask >> Shift));
1237
0
  } else
1238
152
    B.buildCopy(DstReg, LiveIn);
1239
152
1240
152
  // Insert the argument copy if it doesn't already exist.
1241
152
  // FIXME: It seems EmitLiveInCopies isn't called anywhere?
1242
152
  if (!MRI.getVRegDef(LiveIn)) {
1243
52
    MachineBasicBlock &EntryMBB = B.getMF().front();
1244
52
    EntryMBB.addLiveIn(Arg->getRegister());
1245
52
    B.setInsertPt(EntryMBB, EntryMBB.begin());
1246
52
    B.buildCopy(LiveIn, Arg->getRegister());
1247
52
  }
1248
152
1249
152
  return true;
1250
152
}
1251
1252
bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin(
1253
  MachineInstr &MI,
1254
  MachineRegisterInfo &MRI,
1255
  MachineIRBuilder &B,
1256
137
  AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
1257
137
  B.setInstr(MI);
1258
137
1259
137
  const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
1260
137
1261
137
  const ArgDescriptor *Arg;
1262
137
  const TargetRegisterClass *RC;
1263
137
  std::tie(Arg, RC) = MFI->getPreloadedValue(ArgType);
1264
137
  if (!Arg) {
1265
0
    LLVM_DEBUG(dbgs() << "Required arg register missing\n");
1266
0
    return false;
1267
0
  }
1268
137
1269
137
  if (loadInputValue(MI.getOperand(0).getReg(), B, Arg)) {
1270
137
    MI.eraseFromParent();
1271
137
    return true;
1272
137
  }
1273
0
1274
0
  return false;
1275
0
}
1276
1277
bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI,
1278
                                                 MachineRegisterInfo &MRI,
1279
15
                                                 MachineIRBuilder &B) const {
1280
15
  const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
1281
15
  if (!MFI->isEntryFunction()) {
1282
0
    return legalizePreloadedArgIntrin(MI, MRI, B,
1283
0
                                      AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
1284
0
  }
1285
15
1286
15
  B.setInstr(MI);
1287
15
1288
15
  uint64_t Offset =
1289
15
    ST.getTargetLowering()->getImplicitParameterOffset(
1290
15
      B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT);
1291
15
  Register DstReg = MI.getOperand(0).getReg();
1292
15
  LLT DstTy = MRI.getType(DstReg);
1293
15
  LLT IdxTy = LLT::scalar(DstTy.getSizeInBits());
1294
15
1295
15
  const ArgDescriptor *Arg;
1296
15
  const TargetRegisterClass *RC;
1297
15
  std::tie(Arg, RC)
1298
15
    = MFI->getPreloadedValue(AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
1299
15
  if (!Arg)
1300
0
    return false;
1301
15
1302
15
  Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy);
1303
15
  if (!loadInputValue(KernargPtrReg, B, Arg))
1304
0
    return false;
1305
15
1306
15
  B.buildGEP(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0));
1307
15
  MI.eraseFromParent();
1308
15
  return true;
1309
15
}
1310
1311
bool AMDGPULegalizerInfo::legalizeIntrinsic(MachineInstr &MI,
1312
                                            MachineRegisterInfo &MRI,
1313
172
                                            MachineIRBuilder &B) const {
1314
172
  // Replace the use G_BRCOND with the exec manipulate and branch pseudos.
1315
172
  switch (MI.getOperand(MI.getNumExplicitDefs()).getIntrinsicID()) {
1316
172
  case Intrinsic::amdgcn_if: {
1317
6
    if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI)) {
1318
2
      const SIRegisterInfo *TRI
1319
2
        = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
1320
2
1321
2
      B.setInstr(*BrCond);
1322
2
      Register Def = MI.getOperand(1).getReg();
1323
2
      Register Use = MI.getOperand(3).getReg();
1324
2
      B.buildInstr(AMDGPU::SI_IF)
1325
2
        .addDef(Def)
1326
2
        .addUse(Use)
1327
2
        .addMBB(BrCond->getOperand(1).getMBB());
1328
2
1329
2
      MRI.setRegClass(Def, TRI->getWaveMaskRegClass());
1330
2
      MRI.setRegClass(Use, TRI->getWaveMaskRegClass());
1331
2
      MI.eraseFromParent();
1332
2
      BrCond->eraseFromParent();
1333
2
      return true;
1334
2
    }
1335
4
1336
4
    return false;
1337
4
  }
1338
4
  case Intrinsic::amdgcn_loop: {
1339
2
    if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI)) {
1340
2
      const SIRegisterInfo *TRI
1341
2
        = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
1342
2
1343
2
      B.setInstr(*BrCond);
1344
2
      Register Reg = MI.getOperand(2).getReg();
1345
2
      B.buildInstr(AMDGPU::SI_LOOP)
1346
2
        .addUse(Reg)
1347
2
        .addMBB(BrCond->getOperand(1).getMBB());
1348
2
      MI.eraseFromParent();
1349
2
      BrCond->eraseFromParent();
1350
2
      MRI.setRegClass(Reg, TRI->getWaveMaskRegClass());
1351
2
      return true;
1352
2
    }
1353
0
1354
0
    return false;
1355
0
  }
1356
85
  case Intrinsic::amdgcn_kernarg_segment_ptr:
1357
85
    return legalizePreloadedArgIntrin(
1358
85
      MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
1359
15
  case Intrinsic::amdgcn_implicitarg_ptr:
1360
15
    return legalizeImplicitArgPtr(MI, MRI, B);
1361
18
  case Intrinsic::amdgcn_workitem_id_x:
1362
18
    return legalizePreloadedArgIntrin(MI, MRI, B,
1363
18
                                      AMDGPUFunctionArgInfo::WORKITEM_ID_X);
1364
6
  case Intrinsic::amdgcn_workitem_id_y:
1365
6
    return legalizePreloadedArgIntrin(MI, MRI, B,
1366
6
                                      AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
1367
6
  case Intrinsic::amdgcn_workitem_id_z:
1368
6
    return legalizePreloadedArgIntrin(MI, MRI, B,
1369
6
                                      AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
1370
6
  case Intrinsic::amdgcn_workgroup_id_x:
1371
6
    return legalizePreloadedArgIntrin(MI, MRI, B,
1372
6
                                      AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
1373
6
  case Intrinsic::amdgcn_workgroup_id_y:
1374
6
    return legalizePreloadedArgIntrin(MI, MRI, B,
1375
6
                                      AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
1376
6
  case Intrinsic::amdgcn_workgroup_id_z:
1377
6
    return legalizePreloadedArgIntrin(MI, MRI, B,
1378
6
                                      AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
1379
1
  case Intrinsic::amdgcn_dispatch_ptr:
1380
1
    return legalizePreloadedArgIntrin(MI, MRI, B,
1381
1
                                      AMDGPUFunctionArgInfo::DISPATCH_PTR);
1382
1
  case Intrinsic::amdgcn_queue_ptr:
1383
1
    return legalizePreloadedArgIntrin(MI, MRI, B,
1384
1
                                      AMDGPUFunctionArgInfo::QUEUE_PTR);
1385
1
  case Intrinsic::amdgcn_implicit_buffer_ptr:
1386
1
    return legalizePreloadedArgIntrin(
1387
1
      MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
1388
1
  case Intrinsic::amdgcn_dispatch_id:
1389
1
    return legalizePreloadedArgIntrin(MI, MRI, B,
1390
1
                                      AMDGPUFunctionArgInfo::DISPATCH_ID);
1391
12
  default:
1392
12
    return true;
1393
0
  }
1394
0
1395
0
  return true;
1396
0
}