/Users/buildslave/jenkins/sharedspace/clang-stage2-coverage-R@2/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
Line | Count | Source |
1 | | //===- SIInstrInfo.cpp - SI Instruction Information ----------------------===// |
2 | | // |
3 | | // The LLVM Compiler Infrastructure |
4 | | // |
5 | | // This file is distributed under the University of Illinois Open Source |
6 | | // License. See LICENSE.TXT for details. |
7 | | // |
8 | | //===----------------------------------------------------------------------===// |
9 | | // |
10 | | /// \file |
11 | | /// \brief SI Implementation of TargetInstrInfo. |
12 | | // |
13 | | //===----------------------------------------------------------------------===// |
14 | | |
15 | | #include "SIInstrInfo.h" |
16 | | #include "AMDGPU.h" |
17 | | #include "AMDGPUSubtarget.h" |
18 | | #include "GCNHazardRecognizer.h" |
19 | | #include "SIDefines.h" |
20 | | #include "SIMachineFunctionInfo.h" |
21 | | #include "SIRegisterInfo.h" |
22 | | #include "Utils/AMDGPUBaseInfo.h" |
23 | | #include "llvm/ADT/APInt.h" |
24 | | #include "llvm/ADT/ArrayRef.h" |
25 | | #include "llvm/ADT/SmallVector.h" |
26 | | #include "llvm/ADT/StringRef.h" |
27 | | #include "llvm/ADT/iterator_range.h" |
28 | | #include "llvm/Analysis/AliasAnalysis.h" |
29 | | #include "llvm/Analysis/MemoryLocation.h" |
30 | | #include "llvm/Analysis/ValueTracking.h" |
31 | | #include "llvm/CodeGen/MachineBasicBlock.h" |
32 | | #include "llvm/CodeGen/MachineFrameInfo.h" |
33 | | #include "llvm/CodeGen/MachineFunction.h" |
34 | | #include "llvm/CodeGen/MachineInstr.h" |
35 | | #include "llvm/CodeGen/MachineInstrBuilder.h" |
36 | | #include "llvm/CodeGen/MachineInstrBundle.h" |
37 | | #include "llvm/CodeGen/MachineMemOperand.h" |
38 | | #include "llvm/CodeGen/MachineOperand.h" |
39 | | #include "llvm/CodeGen/MachineRegisterInfo.h" |
40 | | #include "llvm/CodeGen/MachineValueType.h" |
41 | | #include "llvm/CodeGen/RegisterScavenging.h" |
42 | | #include "llvm/CodeGen/ScheduleDAG.h" |
43 | | #include "llvm/CodeGen/SelectionDAGNodes.h" |
44 | | #include "llvm/IR/DebugLoc.h" |
45 | | #include "llvm/IR/DiagnosticInfo.h" |
46 | | #include "llvm/IR/Function.h" |
47 | | #include "llvm/IR/InlineAsm.h" |
48 | | #include "llvm/IR/LLVMContext.h" |
49 | | #include "llvm/MC/MCInstrDesc.h" |
50 | | #include "llvm/Support/Casting.h" |
51 | | #include "llvm/Support/CommandLine.h" |
52 | | #include "llvm/Support/Compiler.h" |
53 | | #include "llvm/Support/ErrorHandling.h" |
54 | | #include "llvm/Support/MathExtras.h" |
55 | | #include "llvm/Target/TargetMachine.h" |
56 | | #include "llvm/Target/TargetOpcodes.h" |
57 | | #include "llvm/Target/TargetRegisterInfo.h" |
58 | | #include <cassert> |
59 | | #include <cstdint> |
60 | | #include <iterator> |
61 | | #include <utility> |
62 | | |
63 | | using namespace llvm; |
64 | | |
65 | | // Must be at least 4 to be able to branch over minimum unconditional branch |
66 | | // code. This is only for making it possible to write reasonably small tests for |
67 | | // long branches. |
68 | | static cl::opt<unsigned> |
69 | | BranchOffsetBits("amdgpu-s-branch-bits", cl::ReallyHidden, cl::init(16), |
70 | | cl::desc("Restrict range of branch instructions (DEBUG)")); |
71 | | |
72 | | SIInstrInfo::SIInstrInfo(const SISubtarget &ST) |
73 | 1.81k | : AMDGPUInstrInfo(ST), RI(ST), ST(ST) {} |
74 | | |
75 | | //===----------------------------------------------------------------------===// |
76 | | // TargetInstrInfo callbacks |
77 | | //===----------------------------------------------------------------------===// |
78 | | |
79 | 458k | static unsigned getNumOperandsNoGlue(SDNode *Node) { |
80 | 458k | unsigned N = Node->getNumOperands(); |
81 | 511k | while (N && Node->getOperand(N - 1).getValueType() == MVT::Glue)
82 | 52.3k | --N; |
83 | 458k | return N; |
84 | 458k | } |
85 | | |
86 | 458k | static SDValue findChainOperand(SDNode *Load) { |
87 | 458k | SDValue LastOp = Load->getOperand(getNumOperandsNoGlue(Load) - 1); |
88 | 458k | assert(LastOp.getValueType() == MVT::Other && "Chain missing from load node"); |
89 | 458k | return LastOp; |
90 | 458k | } |
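// A standalone model of the two helpers above (a sketch, not part of
// SIInstrInfo.cpp): glue operands sit at the tail of an SDNode's operand
// list, and the chain, when present, is the last non-glue operand. The mock
// OpKind enum stands in for the real MVT value types.
#include <cassert>
#include <vector>

enum class OpKind { Value, Chain, Glue };

unsigned numOperandsNoGlue(const std::vector<OpKind> &Ops) {
  unsigned N = static_cast<unsigned>(Ops.size());
  while (N && Ops[N - 1] == OpKind::Glue)
    --N;
  return N;
}

OpKind chainOperand(const std::vector<OpKind> &Ops) {
  OpKind Last = Ops[numOperandsNoGlue(Ops) - 1];
  assert(Last == OpKind::Chain && "Chain missing from load node");
  return Last;
}
// e.g. {Value, Value, Chain, Glue} -> numOperandsNoGlue == 3, chain at [2].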
91 | | |
92 | | /// \brief Returns true if both nodes have the same value for the given |
93 | | /// operand \p Op, or if both nodes do not have this operand. |
94 | 612k | static bool nodesHaveSameOperandValue(SDNode *N0, SDNode* N1, unsigned OpName) { |
95 | 612k | unsigned Opc0 = N0->getMachineOpcode(); |
96 | 612k | unsigned Opc1 = N1->getMachineOpcode(); |
97 | 612k | |
98 | 612k | int Op0Idx = AMDGPU::getNamedOperandIdx(Opc0, OpName); |
99 | 612k | int Op1Idx = AMDGPU::getNamedOperandIdx(Opc1, OpName); |
100 | 612k | |
101 | 612k | if (Op0Idx == -1 && Op1Idx == -1)
102 | 13.9k | return true; |
103 | 598k | |
104 | 598k | |
105 | 598k | if ((Op0Idx == -1 && Op1Idx != -1) ||
106 | 598k | (Op1Idx == -1 && Op0Idx != -1))
107 | 52 | return false; |
108 | 598k | |
109 | 598k | // getNamedOperandIdx returns the index for the MachineInstr's operands, |
110 | 598k | // which includes the result as the first operand. We are indexing into the |
111 | 598k | // MachineSDNode's operands, so we need to skip the result operand to get |
112 | 598k | // the real index. |
113 | 598k | --Op0Idx; |
114 | 598k | --Op1Idx; |
115 | 598k | |
116 | 598k | return N0->getOperand(Op0Idx) == N1->getOperand(Op1Idx); |
117 | 598k | } |
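// The decrement above in miniature (illustrative sketch; the layout shown is
// hypothetical): getNamedOperandIdx indexes the MachineInstr operand list,
// whose slot 0 is the result, while a MachineSDNode's operand list holds only
// the sources, so the index shifts down by one.
//
//   MachineInstr operands:  [vdst, vaddr, srsrc, soffset]  -> vaddr at 1
//   MachineSDNode operands: [vaddr, srsrc, soffset, chain] -> vaddr at 0
inline int miOperandIdxToNodeIdx(int MIIdx) { return MIIdx - 1; }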
118 | | |
119 | | bool SIInstrInfo::isReallyTriviallyReMaterializable(const MachineInstr &MI, |
120 | 26.3k | AliasAnalysis *AA) const { |
121 | 26.3k | // TODO: The generic check fails for VALU instructions that should be |
122 | 26.3k | // rematerializable due to implicit reads of exec. We really want all of the |
123 | 26.3k | // generic logic for this except for this. |
124 | 26.3k | switch (MI.getOpcode()) { |
125 | 6.39k | case AMDGPU::V_MOV_B32_e32: |
126 | 6.39k | case AMDGPU::V_MOV_B32_e64: |
127 | 6.39k | case AMDGPU::V_MOV_B64_PSEUDO: |
128 | 6.39k | return true; |
129 | 19.9k | default: |
130 | 19.9k | return false; |
131 | 0 | } |
132 | 0 | } |
133 | | |
134 | | bool SIInstrInfo::areLoadsFromSameBasePtr(SDNode *Load0, SDNode *Load1, |
135 | | int64_t &Offset0, |
136 | 410k | int64_t &Offset1) const { |
137 | 410k | if (!Load0->isMachineOpcode() || !Load1->isMachineOpcode())
138 | 46.4k | return false; |
139 | 364k | |
140 | 364k | unsigned Opc0 = Load0->getMachineOpcode(); |
141 | 364k | unsigned Opc1 = Load1->getMachineOpcode(); |
142 | 364k | |
143 | 364k | // Make sure both are actually loads. |
144 | 364k | if (!get(Opc0).mayLoad() || !get(Opc1).mayLoad())
145 | 40.2k | return false; |
146 | 324k | |
147 | 324k | if (isDS(Opc0) && isDS(Opc1)) {
148 | 0 |
149 | 0 | // FIXME: Handle this case:
150 | 0 | if (getNumOperandsNoGlue(Load0) != getNumOperandsNoGlue(Load1))
151 | 0 | return false;
152 | 0 |
153 | 0 | // Check base reg.
154 | 0 | if (Load0->getOperand(1) != Load1->getOperand(1))
155 | 0 | return false;
156 | 0 |
157 | 0 | // Check chain.
158 | 0 | if (findChainOperand(Load0) != findChainOperand(Load1))
159 | 0 | return false;
160 | 0 |
161 | 0 | // Skip read2 / write2 variants for simplicity.
162 | 0 | // TODO: We should report true if the used offsets are adjacent (excluding
163 | 0 | // st64 versions).
164 | 0 | if (AMDGPU::getNamedOperandIdx(Opc0, AMDGPU::OpName::data1) != -1 ||
165 | 0 | AMDGPU::getNamedOperandIdx(Opc1, AMDGPU::OpName::data1) != -1)
166 | 0 | return false;
167 | 0 |
168 | 0 | Offset0 = cast<ConstantSDNode>(Load0->getOperand(2))->getZExtValue();
169 | 0 | Offset1 = cast<ConstantSDNode>(Load1->getOperand(2))->getZExtValue();
170 | 0 | return true;
171 | 0 | }
172 | 324k | |
173 | 324k | if (isSMRD(Opc0) && isSMRD(Opc1)) {
174 | 35.9k | // Skip time and cache invalidation instructions. |
175 | 35.9k | if (AMDGPU::getNamedOperandIdx(Opc0, AMDGPU::OpName::sbase) == -1 || |
176 | 35.9k | AMDGPU::getNamedOperandIdx(Opc1, AMDGPU::OpName::sbase) == -1) |
177 | 4 | return false; |
178 | 35.9k | |
179 | 35.9k | assert(getNumOperandsNoGlue(Load0) == getNumOperandsNoGlue(Load1)); |
180 | 35.9k | |
181 | 35.9k | // Check base reg. |
182 | 35.9k | if (Load0->getOperand(0) != Load1->getOperand(0)) |
183 | 3.76k | return false; |
184 | 32.2k | |
185 | 32.2k | const ConstantSDNode *Load0Offset = |
186 | 32.2k | dyn_cast<ConstantSDNode>(Load0->getOperand(1)); |
187 | 32.2k | const ConstantSDNode *Load1Offset = |
188 | 32.2k | dyn_cast<ConstantSDNode>(Load1->getOperand(1)); |
189 | 32.2k | |
190 | 32.2k | if (!Load0Offset || !Load1Offset)
191 | 6 | return false; |
192 | 32.2k | |
193 | 32.2k | // Check chain. |
194 | 32.2k | if (findChainOperand(Load0) != findChainOperand(Load1))
195 | 0 | return false; |
196 | 32.2k | |
197 | 32.2k | Offset0 = Load0Offset->getZExtValue(); |
198 | 32.2k | Offset1 = Load1Offset->getZExtValue(); |
199 | 32.2k | return true; |
200 | 32.2k | } |
201 | 288k | |
202 | 288k | // MUBUF and MTBUF can access the same addresses. |
203 | 288k | if ((isMUBUF(Opc0) || isMTBUF(Opc0)) && (isMUBUF(Opc1) || isMTBUF(Opc1))) {
204 | 219k | |
205 | 219k | // MUBUF and MTBUF have vaddr at different indices. |
206 | 219k | if (!nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::soffset) || |
207 | 197k | findChainOperand(Load0) != findChainOperand(Load1) || |
208 | 197k | !nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::vaddr) || |
209 | 195k | !nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::srsrc)) |
210 | 25.3k | return false; |
211 | 194k | |
212 | 194k | int OffIdx0 = AMDGPU::getNamedOperandIdx(Opc0, AMDGPU::OpName::offset); |
213 | 194k | int OffIdx1 = AMDGPU::getNamedOperandIdx(Opc1, AMDGPU::OpName::offset); |
214 | 194k | |
215 | 194k | if (OffIdx0 == -1 || OffIdx1 == -1)
216 | 0 | return false; |
217 | 194k | |
218 | 194k | // getNamedOperandIdx returns the index for MachineInstrs. Since they |
219 | 194k | // include the output in the operand list, but SDNodes don't, we need to
220 | 194k | // subtract one from the index.
221 | 194k | --OffIdx0; |
222 | 194k | --OffIdx1; |
223 | 194k | |
224 | 194k | SDValue Off0 = Load0->getOperand(OffIdx0); |
225 | 194k | SDValue Off1 = Load1->getOperand(OffIdx1); |
226 | 194k | |
227 | 194k | // The offset might be a FrameIndexSDNode. |
228 | 194k | if (!isa<ConstantSDNode>(Off0) || !isa<ConstantSDNode>(Off1))
229 | 0 | return false; |
230 | 194k | |
231 | 194k | Offset0 = cast<ConstantSDNode>(Off0)->getZExtValue(); |
232 | 194k | Offset1 = cast<ConstantSDNode>(Off1)->getZExtValue(); |
233 | 194k | return true; |
234 | 194k | } |
235 | 68.4k | |
236 | 68.4k | return false; |
237 | 68.4k | } |
238 | | |
239 | 31.9k | static bool isStride64(unsigned Opc) { |
240 | 31.9k | switch (Opc) { |
241 | 5 | case AMDGPU::DS_READ2ST64_B32: |
242 | 5 | case AMDGPU::DS_READ2ST64_B64: |
243 | 5 | case AMDGPU::DS_WRITE2ST64_B32: |
244 | 5 | case AMDGPU::DS_WRITE2ST64_B64: |
245 | 5 | return true; |
246 | 31.9k | default: |
247 | 31.9k | return false; |
248 | 0 | } |
249 | 0 | } |
250 | | |
251 | | bool SIInstrInfo::getMemOpBaseRegImmOfs(MachineInstr &LdSt, unsigned &BaseReg, |
252 | | int64_t &Offset, |
253 | 972k | const TargetRegisterInfo *TRI) const { |
254 | 972k | unsigned Opc = LdSt.getOpcode(); |
255 | 972k | |
256 | 972k | if (isDS(LdSt)) {
257 | 68.3k | const MachineOperand *OffsetImm = |
258 | 68.3k | getNamedOperand(LdSt, AMDGPU::OpName::offset); |
259 | 68.3k | if (OffsetImm) {
260 | 28.0k | // Normal, single offset LDS instruction. |
261 | 28.0k | const MachineOperand *AddrReg = |
262 | 28.0k | getNamedOperand(LdSt, AMDGPU::OpName::addr); |
263 | 28.0k | |
264 | 28.0k | BaseReg = AddrReg->getReg(); |
265 | 28.0k | Offset = OffsetImm->getImm(); |
266 | 28.0k | return true; |
267 | 28.0k | } |
268 | 40.2k | |
269 | 40.2k | // The 2 offset instructions use offset0 and offset1 instead. We can treat |
270 | 40.2k | // these as a load with a single offset if the 2 offsets are consecutive. We |
271 | 40.2k | // will use this for some partially aligned loads. |
272 | 40.2k | const MachineOperand *Offset0Imm = |
273 | 40.2k | getNamedOperand(LdSt, AMDGPU::OpName::offset0); |
274 | 40.2k | const MachineOperand *Offset1Imm = |
275 | 40.2k | getNamedOperand(LdSt, AMDGPU::OpName::offset1); |
276 | 40.2k | |
277 | 40.2k | uint8_t Offset0 = Offset0Imm->getImm(); |
278 | 40.2k | uint8_t Offset1 = Offset1Imm->getImm(); |
279 | 40.2k | |
280 | 40.2k | if (Offset1 > Offset0 && Offset1 - Offset0 == 1) {
281 | 31.9k | // Each of these offsets is in element sized units, so we need to convert |
282 | 31.9k | // to bytes of the individual reads. |
283 | 31.9k | |
284 | 31.9k | unsigned EltSize; |
285 | 31.9k | if (LdSt.mayLoad()) |
286 | 7.93k | EltSize = TRI->getRegSizeInBits(*getOpRegClass(LdSt, 0)) / 16; |
287 | 24.0k | else { |
288 | 24.0k | assert(LdSt.mayStore()); |
289 | 24.0k | int Data0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data0); |
290 | 24.0k | EltSize = TRI->getRegSizeInBits(*getOpRegClass(LdSt, Data0Idx)) / 8; |
291 | 24.0k | } |
292 | 31.9k | |
293 | 31.9k | if (isStride64(Opc)) |
294 | 5 | EltSize *= 64; |
295 | 31.9k | |
296 | 31.9k | const MachineOperand *AddrReg = |
297 | 31.9k | getNamedOperand(LdSt, AMDGPU::OpName::addr); |
298 | 31.9k | BaseReg = AddrReg->getReg(); |
299 | 31.9k | Offset = EltSize * Offset0; |
300 | 31.9k | return true; |
301 | 31.9k | } |
302 | 8.30k | |
303 | 8.30k | return false; |
304 | 8.30k | } |
305 | 904k | |
306 | 904k | if (904k isMUBUF(LdSt) || 904k isMTBUF(LdSt)95.3k ) { |
307 | 808k | const MachineOperand *SOffset = getNamedOperand(LdSt, AMDGPU::OpName::soffset); |
308 | 808k | if (SOffset && 808k SOffset->isReg()808k ) |
309 | 715k | return false; |
310 | 93.2k | |
311 | 93.2k | const MachineOperand *AddrReg = |
312 | 93.2k | getNamedOperand(LdSt, AMDGPU::OpName::vaddr); |
313 | 93.2k | if (!AddrReg) |
314 | 88.3k | return false; |
315 | 4.94k | |
316 | 4.94k | const MachineOperand *OffsetImm = |
317 | 4.94k | getNamedOperand(LdSt, AMDGPU::OpName::offset); |
318 | 4.94k | BaseReg = AddrReg->getReg(); |
319 | 4.94k | Offset = OffsetImm->getImm(); |
320 | 4.94k | |
321 | 4.94k | if (SOffset) // soffset can be an inline immediate. |
322 | 4.94k | Offset += SOffset->getImm(); |
323 | 808k | |
324 | 808k | return true; |
325 | 808k | } |
326 | 95.1k | |
327 | 95.1k | if (95.1k isSMRD(LdSt)95.1k ) { |
328 | 29.7k | const MachineOperand *OffsetImm = |
329 | 29.7k | getNamedOperand(LdSt, AMDGPU::OpName::offset); |
330 | 29.7k | if (!OffsetImm) |
331 | 59 | return false; |
332 | 29.7k | |
333 | 29.7k | const MachineOperand *SBaseReg = |
334 | 29.7k | getNamedOperand(LdSt, AMDGPU::OpName::sbase); |
335 | 29.7k | BaseReg = SBaseReg->getReg(); |
336 | 29.7k | Offset = OffsetImm->getImm(); |
337 | 29.7k | return true; |
338 | 29.7k | } |
339 | 65.4k | |
340 | 65.4k | if (65.4k isFLAT(LdSt)65.4k ) { |
341 | 63.9k | const MachineOperand *VAddr = getNamedOperand(LdSt, AMDGPU::OpName::vaddr); |
342 | 63.9k | if (VAddr63.9k ) { |
343 | 63.9k | // Can't analyze 2 offsets. |
344 | 63.9k | if (getNamedOperand(LdSt, AMDGPU::OpName::saddr)) |
345 | 0 | return false; |
346 | 63.9k | |
347 | 63.9k | BaseReg = VAddr->getReg(); |
348 | 63.9k | } else { |
349 | 0 | // scratch instructions have either vaddr or saddr. |
350 | 0 | BaseReg = getNamedOperand(LdSt, AMDGPU::OpName::saddr)->getReg(); |
351 | 0 | } |
352 | 63.9k | |
353 | 63.9k | Offset = getNamedOperand(LdSt, AMDGPU::OpName::offset)->getImm(); |
354 | 63.9k | return true; |
355 | 1.51k | } |
356 | 1.51k | |
357 | 1.51k | return false; |
358 | 1.51k | } |
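// A plain-C++ sketch of the DS read2/write2 case above (names are mine):
// offset0/offset1 are in element-sized units, and only consecutive offsets
// are treated as one access at a single byte offset.
#include <cassert>
#include <cstdint>

unsigned mergedByteOffset(uint8_t Offset0, uint8_t Offset1,
                          unsigned EltSizeBytes, bool IsStride64) {
  assert(Offset1 > Offset0 && Offset1 - Offset0 == 1 &&
         "only consecutive offsets merge");
  if (IsStride64)
    EltSizeBytes *= 64; // st64 variants stride in 64-element units
  return EltSizeBytes * Offset0;
}
// e.g. a DS_READ2_B32 with offset0 = 4, offset1 = 5 and EltSizeBytes = 4
// reports base + 16.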
359 | | |
360 | | static bool memOpsHaveSameBasePtr(const MachineInstr &MI1, unsigned BaseReg1, |
361 | 25.5k | const MachineInstr &MI2, unsigned BaseReg2) { |
362 | 25.5k | if (BaseReg1 == BaseReg2) |
363 | 17.6k | return true; |
364 | 7.88k | |
365 | 7.88k | if (!MI1.hasOneMemOperand() || !MI2.hasOneMemOperand())
366 | 146 | return false; |
367 | 7.74k | |
368 | 7.74k | auto MO1 = *MI1.memoperands_begin(); |
369 | 7.74k | auto MO2 = *MI2.memoperands_begin(); |
370 | 7.74k | if (MO1->getAddrSpace() != MO2->getAddrSpace()) |
371 | 4.87k | return false; |
372 | 2.86k | |
373 | 2.86k | auto Base1 = MO1->getValue(); |
374 | 2.86k | auto Base2 = MO2->getValue(); |
375 | 2.86k | if (!Base1 || !Base2)
376 | 54 | return false; |
377 | 2.81k | const MachineFunction &MF = *MI1.getParent()->getParent(); |
378 | 2.81k | const DataLayout &DL = MF.getFunction()->getParent()->getDataLayout(); |
379 | 2.81k | Base1 = GetUnderlyingObject(Base1, DL); |
380 | 2.81k | Base2 = GetUnderlyingObject(Base2, DL);
381 | 2.81k | |
382 | 2.81k | if (isa<UndefValue>(Base1) || isa<UndefValue>(Base2))
383 | 750 | return false; |
384 | 2.06k | |
385 | 2.06k | return Base1 == Base2; |
386 | 2.06k | } |
387 | | |
388 | | bool SIInstrInfo::shouldClusterMemOps(MachineInstr &FirstLdSt, |
389 | | unsigned BaseReg1, |
390 | | MachineInstr &SecondLdSt, |
391 | | unsigned BaseReg2, |
392 | 25.5k | unsigned NumLoads) const { |
393 | 25.5k | if (!memOpsHaveSameBasePtr(FirstLdSt, BaseReg1, SecondLdSt, BaseReg2)) |
394 | 5.82k | return false; |
395 | 19.7k | |
396 | 19.7k | const MachineOperand *FirstDst = nullptr; |
397 | 19.7k | const MachineOperand *SecondDst = nullptr; |
398 | 19.7k | |
399 | 19.7k | if ((isMUBUF(FirstLdSt) && isMUBUF(SecondLdSt)) ||
400 | 19.5k | (isMTBUF(FirstLdSt) && isMTBUF(SecondLdSt)) ||
401 | 19.7k | (isFLAT(FirstLdSt) && isFLAT(SecondLdSt))) {
402 | 1.85k | const unsigned MaxGlobalLoadCluster = 6; |
403 | 1.85k | if (NumLoads > MaxGlobalLoadCluster) |
404 | 0 | return false; |
405 | 1.85k | |
406 | 1.85k | FirstDst = getNamedOperand(FirstLdSt, AMDGPU::OpName::vdata); |
407 | 1.85k | if (!FirstDst) |
408 | 617 | FirstDst = getNamedOperand(FirstLdSt, AMDGPU::OpName::vdst); |
409 | 1.85k | SecondDst = getNamedOperand(SecondLdSt, AMDGPU::OpName::vdata); |
410 | 1.85k | if (!SecondDst) |
411 | 617 | SecondDst = getNamedOperand(SecondLdSt, AMDGPU::OpName::vdst); |
412 | 19.7k | } else if (isSMRD(FirstLdSt) && isSMRD(SecondLdSt)) {
413 | 16.0k | FirstDst = getNamedOperand(FirstLdSt, AMDGPU::OpName::sdst); |
414 | 16.0k | SecondDst = getNamedOperand(SecondLdSt, AMDGPU::OpName::sdst); |
415 | 17.8k | } else if (isDS(FirstLdSt) && isDS(SecondLdSt)) {
416 | 1.77k | FirstDst = getNamedOperand(FirstLdSt, AMDGPU::OpName::vdst); |
417 | 1.77k | SecondDst = getNamedOperand(SecondLdSt, AMDGPU::OpName::vdst); |
418 | 1.77k | } |
419 | 19.7k | |
420 | 19.7k | if (!FirstDst || !SecondDst)
421 | 462 | return false; |
422 | 19.2k | |
423 | 19.2k | // Try to limit clustering based on the total number of bytes loaded |
424 | 19.2k | // rather than the number of instructions. This is done to help reduce |
425 | 19.2k | // register pressure. The method used is somewhat inexact, though, |
426 | 19.2k | // because it assumes that all loads in the cluster will load the |
427 | 19.2k | // same number of bytes as FirstLdSt. |
428 | 19.2k | |
429 | 19.2k | // The unit of this value is bytes. |
430 | 19.2k | // FIXME: This needs finer tuning. |
431 | 19.2k | unsigned LoadClusterThreshold = 16; |
432 | 19.2k | |
433 | 19.2k | const MachineRegisterInfo &MRI = |
434 | 19.2k | FirstLdSt.getParent()->getParent()->getRegInfo(); |
435 | 19.2k | const TargetRegisterClass *DstRC = MRI.getRegClass(FirstDst->getReg()); |
436 | 19.2k | |
437 | 19.2k | return (NumLoads * (RI.getRegSizeInBits(*DstRC) / 8)) <= LoadClusterThreshold; |
438 | 19.2k | } |
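// The cutoff above reduced to its arithmetic (plain C++ sketch): keep
// clustering while the estimated total, NumLoads times the byte width of the
// first load's destination, stays within the 16-byte threshold.
bool underLoadClusterThreshold(unsigned NumLoads, unsigned DstSizeInBits) {
  const unsigned LoadClusterThreshold = 16; // bytes; same tuning caveat
  return NumLoads * (DstSizeInBits / 8) <= LoadClusterThreshold;
}
// Four 32-bit loads (4 * 4 == 16) still cluster; two 128-bit loads (2 * 16
// == 32) do not.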
439 | | |
440 | | static void reportIllegalCopy(const SIInstrInfo *TII, MachineBasicBlock &MBB, |
441 | | MachineBasicBlock::iterator MI, |
442 | | const DebugLoc &DL, unsigned DestReg, |
443 | 10 | unsigned SrcReg, bool KillSrc) { |
444 | 10 | MachineFunction *MF = MBB.getParent(); |
445 | 10 | DiagnosticInfoUnsupported IllegalCopy(*MF->getFunction(), |
446 | 10 | "illegal SGPR to VGPR copy", |
447 | 10 | DL, DS_Error); |
448 | 10 | LLVMContext &C = MF->getFunction()->getContext(); |
449 | 10 | C.diagnose(IllegalCopy); |
450 | 10 | |
451 | 10 | BuildMI(MBB, MI, DL, TII->get(AMDGPU::SI_ILLEGAL_COPY), DestReg) |
452 | 10 | .addReg(SrcReg, getKillRegState(KillSrc)); |
453 | 10 | } |
454 | | |
455 | | void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB, |
456 | | MachineBasicBlock::iterator MI, |
457 | | const DebugLoc &DL, unsigned DestReg, |
458 | 39.9k | unsigned SrcReg, bool KillSrc) const { |
459 | 39.9k | const TargetRegisterClass *RC = RI.getPhysRegClass(DestReg); |
460 | 39.9k | |
461 | 39.9k | if (RC == &AMDGPU::VGPR_32RegClass) {
462 | 23.8k | assert(AMDGPU::VGPR_32RegClass.contains(SrcReg) || |
463 | 23.8k | AMDGPU::SReg_32RegClass.contains(SrcReg)); |
464 | 23.8k | BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DestReg) |
465 | 23.8k | .addReg(SrcReg, getKillRegState(KillSrc)); |
466 | 23.8k | return; |
467 | 23.8k | } |
468 | 16.1k | |
469 | 16.1k | if (RC == &AMDGPU::SReg_32_XM0RegClass ||
470 | 16.1k | RC == &AMDGPU::SReg_32RegClass) {
471 | 10.4k | if (SrcReg == AMDGPU::SCC) {
472 | 0 | BuildMI(MBB, MI, DL, get(AMDGPU::S_CSELECT_B32), DestReg) |
473 | 0 | .addImm(-1) |
474 | 0 | .addImm(0); |
475 | 0 | return; |
476 | 0 | } |
477 | 10.4k | |
478 | 10.4k | if (!AMDGPU::SReg_32RegClass.contains(SrcReg)) {
479 | 2 | reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc); |
480 | 2 | return; |
481 | 2 | } |
482 | 10.4k | |
483 | 10.4k | BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DestReg) |
484 | 10.4k | .addReg(SrcReg, getKillRegState(KillSrc)); |
485 | 10.4k | return; |
486 | 10.4k | } |
487 | 5.73k | |
488 | 5.73k | if (RC == &AMDGPU::SReg_64RegClass) {
489 | 1.46k | if (DestReg == AMDGPU::VCC) {
490 | 23 | if (AMDGPU::SReg_64RegClass.contains(SrcReg)) {
491 | 15 | BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B64), AMDGPU::VCC) |
492 | 15 | .addReg(SrcReg, getKillRegState(KillSrc)); |
493 | 23 | } else { |
494 | 8 | // FIXME: Hack until VReg_1 removed. |
495 | 8 | assert(AMDGPU::VGPR_32RegClass.contains(SrcReg)); |
496 | 8 | BuildMI(MBB, MI, DL, get(AMDGPU::V_CMP_NE_U32_e32)) |
497 | 8 | .addImm(0) |
498 | 8 | .addReg(SrcReg, getKillRegState(KillSrc)); |
499 | 8 | } |
500 | 23 | |
501 | 23 | return; |
502 | 23 | } |
503 | 1.44k | |
504 | 1.44k | if (!AMDGPU::SReg_64RegClass.contains(SrcReg)) {
505 | 2 | reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc); |
506 | 2 | return; |
507 | 2 | } |
508 | 1.43k | |
509 | 1.43k | BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B64), DestReg) |
510 | 1.43k | .addReg(SrcReg, getKillRegState(KillSrc)); |
511 | 1.43k | return; |
512 | 1.43k | } |
513 | 4.27k | |
514 | 4.27k | if (DestReg == AMDGPU::SCC) {
515 | 0 | assert(AMDGPU::SReg_32RegClass.contains(SrcReg)); |
516 | 0 | BuildMI(MBB, MI, DL, get(AMDGPU::S_CMP_LG_U32)) |
517 | 0 | .addReg(SrcReg, getKillRegState(KillSrc)) |
518 | 0 | .addImm(0); |
519 | 0 | return; |
520 | 0 | } |
521 | 4.27k | |
522 | 4.27k | unsigned EltSize = 4; |
523 | 4.27k | unsigned Opcode = AMDGPU::V_MOV_B32_e32; |
524 | 4.27k | if (RI.isSGPRClass(RC)) {
525 | 142 | if (RI.getRegSizeInBits(*RC) > 32) {
526 | 142 | Opcode = AMDGPU::S_MOV_B64; |
527 | 142 | EltSize = 8; |
528 | 142 | } else { |
529 | 0 | Opcode = AMDGPU::S_MOV_B32; |
530 | 0 | EltSize = 4; |
531 | 0 | } |
532 | 142 | |
533 | 142 | if (!RI.isSGPRClass(RI.getPhysRegClass(SrcReg))) {
534 | 6 | reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc); |
535 | 6 | return; |
536 | 6 | } |
537 | 4.26k | } |
538 | 4.26k | |
539 | 4.26k | ArrayRef<int16_t> SubIndices = RI.getRegSplitParts(RC, EltSize); |
540 | 4.26k | bool Forward = RI.getHWRegIndex(DestReg) <= RI.getHWRegIndex(SrcReg); |
541 | 4.26k | |
542 | 13.1k | for (unsigned Idx = 0; Idx < SubIndices.size(); ++Idx) {
543 | 8.90k | unsigned SubIdx; |
544 | 8.90k | if (Forward) |
545 | 6.88k | SubIdx = SubIndices[Idx]; |
546 | 8.90k | else |
547 | 2.02k | SubIdx = SubIndices[SubIndices.size() - Idx - 1]; |
548 | 8.90k | |
549 | 8.90k | MachineInstrBuilder Builder = BuildMI(MBB, MI, DL, |
550 | 8.90k | get(Opcode), RI.getSubReg(DestReg, SubIdx)); |
551 | 8.90k | |
552 | 8.90k | Builder.addReg(RI.getSubReg(SrcReg, SubIdx)); |
553 | 8.90k | |
554 | 8.90k | if (Idx == 0) |
555 | 4.26k | Builder.addReg(DestReg, RegState::Define | RegState::Implicit); |
556 | 8.90k | |
557 | 7.28k | bool UseKill = KillSrc && Idx == SubIndices.size() - 1; |
558 | 8.90k | Builder.addReg(SrcReg, getKillRegState(UseKill) | RegState::Implicit); |
559 | 8.90k | } |
560 | 39.9k | } |
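// The sub-register walk above, modeled standalone (plain ints stand in for
// sub-register indices): when the destination's hardware index is <= the
// source's, sub-registers are copied low-to-high, otherwise high-to-low, so
// an overlapping source tuple is read before it is clobbered.
#include <cstddef>
#include <vector>

std::vector<int> subRegCopyOrder(const std::vector<int> &SubIndices,
                                 bool Forward) {
  std::vector<int> Order;
  for (std::size_t Idx = 0; Idx < SubIndices.size(); ++Idx)
    Order.push_back(Forward ? SubIndices[Idx]
                            : SubIndices[SubIndices.size() - Idx - 1]);
  return Order; // e.g. {0,1,2,3} copied backward -> {3,2,1,0}
}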
561 | | |
562 | 242k | int SIInstrInfo::commuteOpcode(unsigned Opcode) const { |
563 | 242k | int NewOpc; |
564 | 242k | |
565 | 242k | // Try to map original to commuted opcode |
566 | 242k | NewOpc = AMDGPU::getCommuteRev(Opcode); |
567 | 242k | if (NewOpc != -1) |
568 | 242k | // Check if the commuted (REV) opcode exists on the target. |
569 | 22.9k | return pseudoToMCOpcode(NewOpc) != -1 ? NewOpc : -1;
570 | 219k | |
571 | 219k | // Try to map commuted to original opcode |
572 | 219k | NewOpc = AMDGPU::getCommuteOrig(Opcode); |
573 | 219k | if (NewOpc != -1) |
574 | 219k | // Check if the original (non-REV) opcode exists on the target. |
575 | 39.3k | return pseudoToMCOpcode(NewOpc) != -1 ? NewOpc : -1;
576 | 180k | |
577 | 180k | return Opcode; |
578 | 180k | } |
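// The shape of the lookup above with plain maps (hypothetical table contents,
// not the TableGen-generated AMDGPU tables, and without the pseudoToMCOpcode
// existence check the real function performs): try opcode -> REV form, then
// REV form -> original, and fall through unchanged otherwise.
#include <unordered_map>

int commuteViaTables(int Opc, const std::unordered_map<int, int> &ToRev,
                     const std::unordered_map<int, int> &ToOrig) {
  auto It = ToRev.find(Opc);
  if (It != ToRev.end())
    return It->second; // e.g. a SUB mapping to its SUBREV form
  It = ToOrig.find(Opc);
  if (It != ToOrig.end())
    return It->second;
  return Opc; // operands can be swapped without changing the opcode
}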
579 | | |
580 | | void SIInstrInfo::materializeImmediate(MachineBasicBlock &MBB, |
581 | | MachineBasicBlock::iterator MI, |
582 | | const DebugLoc &DL, unsigned DestReg, |
583 | 0 | int64_t Value) const { |
584 | 0 | MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); |
585 | 0 | const TargetRegisterClass *RegClass = MRI.getRegClass(DestReg); |
586 | 0 | if (RegClass == &AMDGPU::SReg_32RegClass || |
587 | 0 | RegClass == &AMDGPU::SGPR_32RegClass || |
588 | 0 | RegClass == &AMDGPU::SReg_32_XM0RegClass || |
589 | 0 | RegClass == &AMDGPU::SReg_32_XM0_XEXECRegClass) {
590 | 0 | BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DestReg) |
591 | 0 | .addImm(Value); |
592 | 0 | return; |
593 | 0 | } |
594 | 0 |
595 | 0 | if (RegClass == &AMDGPU::SReg_64RegClass ||
596 | 0 | RegClass == &AMDGPU::SGPR_64RegClass ||
597 | 0 | RegClass == &AMDGPU::SReg_64_XEXECRegClass) {
598 | 0 | BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B64), DestReg) |
599 | 0 | .addImm(Value); |
600 | 0 | return; |
601 | 0 | } |
602 | 0 |
603 | 0 | if (RegClass == &AMDGPU::VGPR_32RegClass) {
604 | 0 | BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DestReg)
605 | 0 | .addImm(Value);
606 | 0 | return;
607 | 0 | }
608 | 0 | if (RegClass == &AMDGPU::VReg_64RegClass) {
609 | 0 | BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B64_PSEUDO), DestReg) |
610 | 0 | .addImm(Value); |
611 | 0 | return; |
612 | 0 | } |
613 | 0 |
614 | 0 | unsigned EltSize = 4;
615 | 0 | unsigned Opcode = AMDGPU::V_MOV_B32_e32;
616 | 0 | if (RI.isSGPRClass(RegClass)) {
617 | 0 | if (RI.getRegSizeInBits(*RegClass) > 32) {
618 | 0 | Opcode = AMDGPU::S_MOV_B64; |
619 | 0 | EltSize = 8; |
620 | 0 | } else { |
621 | 0 | Opcode = AMDGPU::S_MOV_B32; |
622 | 0 | EltSize = 4; |
623 | 0 | } |
624 | 0 | } |
625 | 0 |
626 | 0 | ArrayRef<int16_t> SubIndices = RI.getRegSplitParts(RegClass, EltSize);
627 | 0 | for (unsigned Idx = 0; Idx < SubIndices.size(); ++Idx) {
628 | 0 | int64_t IdxValue = Idx == 0 ? Value : 0;
629 | 0 |
630 | 0 | MachineInstrBuilder Builder = BuildMI(MBB, MI, DL, |
631 | 0 | get(Opcode), RI.getSubReg(DestReg, Idx)); |
632 | 0 | Builder.addImm(IdxValue); |
633 | 0 | } |
634 | 0 | } |
635 | | |
636 | | const TargetRegisterClass * |
637 | 0 | SIInstrInfo::getPreferredSelectRegClass(unsigned Size) const { |
638 | 0 | return &AMDGPU::VGPR_32RegClass; |
639 | 0 | } |
640 | | |
641 | | void SIInstrInfo::insertVectorSelect(MachineBasicBlock &MBB, |
642 | | MachineBasicBlock::iterator I, |
643 | | const DebugLoc &DL, unsigned DstReg, |
644 | | ArrayRef<MachineOperand> Cond, |
645 | | unsigned TrueReg, |
646 | 0 | unsigned FalseReg) const { |
647 | 0 | MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); |
648 | 0 | assert(MRI.getRegClass(DstReg) == &AMDGPU::VGPR_32RegClass && |
649 | 0 | "Not a VGPR32 reg"); |
650 | 0 |
651 | 0 | if (Cond.size() == 1) {
652 | 0 | BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg) |
653 | 0 | .addReg(FalseReg) |
654 | 0 | .addReg(TrueReg) |
655 | 0 | .add(Cond[0]); |
656 | 0 | } else if (Cond.size() == 2) {
657 | 0 | assert(Cond[0].isImm() && "Cond[0] is not an immediate"); |
658 | 0 | switch (Cond[0].getImm()) { |
659 | 0 | case SIInstrInfo::SCC_TRUE: { |
660 | 0 | unsigned SReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass); |
661 | 0 | BuildMI(MBB, I, DL, get(AMDGPU::S_CSELECT_B64), SReg) |
662 | 0 | .addImm(-1) |
663 | 0 | .addImm(0); |
664 | 0 | BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg) |
665 | 0 | .addReg(FalseReg) |
666 | 0 | .addReg(TrueReg) |
667 | 0 | .addReg(SReg); |
668 | 0 | break; |
669 | 0 | } |
670 | 0 | case SIInstrInfo::SCC_FALSE: { |
671 | 0 | unsigned SReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass); |
672 | 0 | BuildMI(MBB, I, DL, get(AMDGPU::S_CSELECT_B64), SReg) |
673 | 0 | .addImm(0) |
674 | 0 | .addImm(-1); |
675 | 0 | BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg) |
676 | 0 | .addReg(FalseReg) |
677 | 0 | .addReg(TrueReg) |
678 | 0 | .addReg(SReg); |
679 | 0 | break; |
680 | 0 | } |
681 | 0 | case SIInstrInfo::VCCNZ: { |
682 | 0 | MachineOperand RegOp = Cond[1]; |
683 | 0 | RegOp.setImplicit(false); |
684 | 0 | BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg) |
685 | 0 | .addReg(FalseReg) |
686 | 0 | .addReg(TrueReg) |
687 | 0 | .add(RegOp); |
688 | 0 | break; |
689 | 0 | } |
690 | 0 | case SIInstrInfo::VCCZ: { |
691 | 0 | MachineOperand RegOp = Cond[1]; |
692 | 0 | RegOp.setImplicit(false); |
693 | 0 | BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg) |
694 | 0 | .addReg(TrueReg) |
695 | 0 | .addReg(FalseReg) |
696 | 0 | .add(RegOp); |
697 | 0 | break; |
698 | 0 | } |
699 | 0 | case SIInstrInfo::EXECNZ: { |
700 | 0 | unsigned SReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass); |
701 | 0 | unsigned SReg2 = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass); |
702 | 0 | BuildMI(MBB, I, DL, get(AMDGPU::S_OR_SAVEEXEC_B64), SReg2) |
703 | 0 | .addImm(0); |
704 | 0 | BuildMI(MBB, I, DL, get(AMDGPU::S_CSELECT_B64), SReg) |
705 | 0 | .addImm(-1) |
706 | 0 | .addImm(0); |
707 | 0 | BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg) |
708 | 0 | .addReg(FalseReg) |
709 | 0 | .addReg(TrueReg) |
710 | 0 | .addReg(SReg); |
711 | 0 | break; |
712 | 0 | } |
713 | 0 | case SIInstrInfo::EXECZ: { |
714 | 0 | unsigned SReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass); |
715 | 0 | unsigned SReg2 = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass); |
716 | 0 | BuildMI(MBB, I, DL, get(AMDGPU::S_OR_SAVEEXEC_B64), SReg2) |
717 | 0 | .addImm(0); |
718 | 0 | BuildMI(MBB, I, DL, get(AMDGPU::S_CSELECT_B64), SReg) |
719 | 0 | .addImm(0) |
720 | 0 | .addImm(-1); |
721 | 0 | BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg) |
722 | 0 | .addReg(FalseReg) |
723 | 0 | .addReg(TrueReg) |
724 | 0 | .addReg(SReg); |
725 | 0 | llvm_unreachable("Unhandled branch predicate EXECZ"); |
726 | 0 | break; |
727 | 0 | } |
728 | 0 | default: |
729 | 0 | llvm_unreachable("invalid branch predicate"); |
730 | 0 | } |
731 | 0 | } else { |
732 | 0 | llvm_unreachable("Can only handle Cond size 1 or 2"); |
733 | 0 | } |
734 | 0 | } |
735 | | |
736 | | unsigned SIInstrInfo::insertEQ(MachineBasicBlock *MBB, |
737 | | MachineBasicBlock::iterator I, |
738 | | const DebugLoc &DL, |
739 | 0 | unsigned SrcReg, int Value) const { |
740 | 0 | MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo(); |
741 | 0 | unsigned Reg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass); |
742 | 0 | BuildMI(*MBB, I, DL, get(AMDGPU::V_CMP_EQ_I32_e64), Reg) |
743 | 0 | .addImm(Value) |
744 | 0 | .addReg(SrcReg); |
745 | 0 |
746 | 0 | return Reg; |
747 | 0 | } |
748 | | |
749 | | unsigned SIInstrInfo::insertNE(MachineBasicBlock *MBB, |
750 | | MachineBasicBlock::iterator I, |
751 | | const DebugLoc &DL, |
752 | 0 | unsigned SrcReg, int Value) const { |
753 | 0 | MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo(); |
754 | 0 | unsigned Reg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass); |
755 | 0 | BuildMI(*MBB, I, DL, get(AMDGPU::V_CMP_NE_I32_e64), Reg) |
756 | 0 | .addImm(Value) |
757 | 0 | .addReg(SrcReg); |
758 | 0 |
|
759 | 0 | return Reg; |
760 | 0 | } |
761 | | |
762 | 8.58k | unsigned SIInstrInfo::getMovOpcode(const TargetRegisterClass *DstRC) const { |
763 | 8.58k | |
764 | 8.58k | if (RI.getRegSizeInBits(*DstRC) == 32) {
765 | 8.29k | return RI.isSGPRClass(DstRC) ? AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32;
766 | 294 | } else if (RI.getRegSizeInBits(*DstRC) == 64 && RI.isSGPRClass(DstRC)) {
767 | 7 | return AMDGPU::S_MOV_B64;
768 | 287 | } else if (RI.getRegSizeInBits(*DstRC) == 64 && !RI.isSGPRClass(DstRC)) {
769 | 287 | return AMDGPU::V_MOV_B64_PSEUDO; |
770 | 287 | } |
771 | 0 | return AMDGPU::COPY; |
772 | 0 | } |
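// The selection above as a standalone decision table (a sketch; the enum
// stands in for the real opcode constants):
enum class MovOpc { S_MOV_B32, V_MOV_B32_e32, S_MOV_B64, V_MOV_B64_PSEUDO, COPY };

MovOpc pickMovOpcode(unsigned SizeInBits, bool IsSGPR) {
  if (SizeInBits == 32)
    return IsSGPR ? MovOpc::S_MOV_B32 : MovOpc::V_MOV_B32_e32;
  if (SizeInBits == 64)
    return IsSGPR ? MovOpc::S_MOV_B64 : MovOpc::V_MOV_B64_PSEUDO;
  return MovOpc::COPY; // anything wider falls back to a generic COPY
}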
773 | | |
774 | 600 | static unsigned getSGPRSpillSaveOpcode(unsigned Size) { |
775 | 600 | switch (Size) { |
776 | 458 | case 4: |
777 | 458 | return AMDGPU::SI_SPILL_S32_SAVE; |
778 | 85 | case 8: |
779 | 85 | return AMDGPU::SI_SPILL_S64_SAVE; |
780 | 22 | case 16: |
781 | 22 | return AMDGPU::SI_SPILL_S128_SAVE; |
782 | 27 | case 32: |
783 | 27 | return AMDGPU::SI_SPILL_S256_SAVE; |
784 | 8 | case 64: |
785 | 8 | return AMDGPU::SI_SPILL_S512_SAVE; |
786 | 0 | default: |
787 | 0 | llvm_unreachable("unknown register size"); |
788 | 0 | } |
789 | 0 | } |
790 | | |
791 | 1.08k | static unsigned getVGPRSpillSaveOpcode(unsigned Size) { |
792 | 1.08k | switch (Size) { |
793 | 419 | case 4: |
794 | 419 | return AMDGPU::SI_SPILL_V32_SAVE; |
795 | 13 | case 8: |
796 | 13 | return AMDGPU::SI_SPILL_V64_SAVE; |
797 | 0 | case 12: |
798 | 0 | return AMDGPU::SI_SPILL_V96_SAVE; |
799 | 657 | case 16: |
800 | 657 | return AMDGPU::SI_SPILL_V128_SAVE; |
801 | 0 | case 32: |
802 | 0 | return AMDGPU::SI_SPILL_V256_SAVE; |
803 | 0 | case 64: |
804 | 0 | return AMDGPU::SI_SPILL_V512_SAVE; |
805 | 0 | default: |
806 | 0 | llvm_unreachable("unknown register size"); |
807 | 0 | } |
808 | 0 | } |
809 | | |
810 | | void SIInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB, |
811 | | MachineBasicBlock::iterator MI, |
812 | | unsigned SrcReg, bool isKill, |
813 | | int FrameIndex, |
814 | | const TargetRegisterClass *RC, |
815 | 1.68k | const TargetRegisterInfo *TRI) const { |
816 | 1.68k | MachineFunction *MF = MBB.getParent(); |
817 | 1.68k | SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>(); |
818 | 1.68k | MachineFrameInfo &FrameInfo = MF->getFrameInfo(); |
819 | 1.68k | DebugLoc DL = MBB.findDebugLoc(MI); |
820 | 1.68k | |
821 | 1.68k | assert(SrcReg != MFI->getStackPtrOffsetReg() && |
822 | 1.68k | SrcReg != MFI->getFrameOffsetReg() && |
823 | 1.68k | SrcReg != MFI->getScratchWaveOffsetReg()); |
824 | 1.68k | |
825 | 1.68k | unsigned Size = FrameInfo.getObjectSize(FrameIndex); |
826 | 1.68k | unsigned Align = FrameInfo.getObjectAlignment(FrameIndex); |
827 | 1.68k | MachinePointerInfo PtrInfo |
828 | 1.68k | = MachinePointerInfo::getFixedStack(*MF, FrameIndex); |
829 | 1.68k | MachineMemOperand *MMO |
830 | 1.68k | = MF->getMachineMemOperand(PtrInfo, MachineMemOperand::MOStore, |
831 | 1.68k | Size, Align); |
832 | 1.68k | unsigned SpillSize = TRI->getSpillSize(*RC); |
833 | 1.68k | |
834 | 1.68k | if (RI.isSGPRClass(RC)) {
835 | 600 | MFI->setHasSpilledSGPRs(); |
836 | 600 | |
837 | 600 | // We are only allowed to create one new instruction when spilling |
838 | 600 | // registers, so we need to use pseudo instruction for spilling SGPRs. |
839 | 600 | const MCInstrDesc &OpDesc = get(getSGPRSpillSaveOpcode(SpillSize)); |
840 | 600 | |
841 | 600 | // The SGPR spill/restore instructions only work on numbered SGPRs, so we need
842 | 600 | // to make sure we are using the correct register class. |
843 | 600 | if (TargetRegisterInfo::isVirtualRegister(SrcReg) && SpillSize == 4) {
844 | 125 | MachineRegisterInfo &MRI = MF->getRegInfo(); |
845 | 125 | MRI.constrainRegClass(SrcReg, &AMDGPU::SReg_32_XM0RegClass); |
846 | 125 | } |
847 | 600 | |
848 | 600 | MachineInstrBuilder Spill = BuildMI(MBB, MI, DL, OpDesc) |
849 | 600 | .addReg(SrcReg, getKillRegState(isKill)) // data |
850 | 600 | .addFrameIndex(FrameIndex) // addr |
851 | 600 | .addMemOperand(MMO) |
852 | 600 | .addReg(MFI->getScratchRSrcReg(), RegState::Implicit) |
853 | 600 | .addReg(MFI->getFrameOffsetReg(), RegState::Implicit); |
854 | 600 | // Add the scratch resource registers as implicit uses because we may end up |
855 | 600 | // needing them, and need to ensure that the reserved registers are |
856 | 600 | // correctly handled. |
857 | 600 | |
858 | 600 | FrameInfo.setStackID(FrameIndex, 1); |
859 | 600 | if (ST.hasScalarStores()) {
860 | 300 | // m0 is used for offset to scalar stores if used to spill. |
861 | 300 | Spill.addReg(AMDGPU::M0, RegState::ImplicitDefine | RegState::Dead); |
862 | 300 | } |
863 | 600 | |
864 | 600 | return; |
865 | 600 | } |
866 | 1.08k | |
867 | 1.08k | if (1.08k !ST.isVGPRSpillingEnabled(*MF->getFunction())1.08k ) { |
868 | 0 | LLVMContext &Ctx = MF->getFunction()->getContext(); |
869 | 0 | Ctx.emitError("SIInstrInfo::storeRegToStackSlot - Do not know how to" |
870 | 0 | " spill register"); |
871 | 0 | BuildMI(MBB, MI, DL, get(AMDGPU::KILL)) |
872 | 0 | .addReg(SrcReg); |
873 | 0 |
874 | 0 | return; |
875 | 0 | } |
876 | 1.08k | |
877 | 1.08k | assert(RI.hasVGPRs(RC) && "Only VGPR spilling expected"); |
878 | 1.08k | |
879 | 1.08k | unsigned Opcode = getVGPRSpillSaveOpcode(SpillSize); |
880 | 1.08k | MFI->setHasSpilledVGPRs(); |
881 | 1.08k | BuildMI(MBB, MI, DL, get(Opcode)) |
882 | 1.08k | .addReg(SrcReg, getKillRegState(isKill)) // data |
883 | 1.08k | .addFrameIndex(FrameIndex) // addr |
884 | 1.08k | .addReg(MFI->getScratchRSrcReg()) // scratch_rsrc |
885 | 1.08k | .addReg(MFI->getFrameOffsetReg()) // scratch_offset |
886 | 1.08k | .addImm(0) // offset |
887 | 1.08k | .addMemOperand(MMO); |
888 | 1.08k | } |
889 | | |
890 | 587 | static unsigned getSGPRSpillRestoreOpcode(unsigned Size) { |
891 | 587 | switch (Size) { |
892 | 452 | case 4: |
893 | 452 | return AMDGPU::SI_SPILL_S32_RESTORE; |
894 | 79 | case 8: |
895 | 79 | return AMDGPU::SI_SPILL_S64_RESTORE; |
896 | 21 | case 16: |
897 | 21 | return AMDGPU::SI_SPILL_S128_RESTORE; |
898 | 27 | case 32: |
899 | 27 | return AMDGPU::SI_SPILL_S256_RESTORE; |
900 | 8 | case 64: |
901 | 8 | return AMDGPU::SI_SPILL_S512_RESTORE; |
902 | 0 | default: |
903 | 0 | llvm_unreachable("unknown register size"); |
904 | 0 | } |
905 | 0 | } |
906 | | |
907 | 1.00k | static unsigned getVGPRSpillRestoreOpcode(unsigned Size) { |
908 | 1.00k | switch (Size) { |
909 | 328 | case 4: |
910 | 328 | return AMDGPU::SI_SPILL_V32_RESTORE; |
911 | 13 | case 8: |
912 | 13 | return AMDGPU::SI_SPILL_V64_RESTORE; |
913 | 0 | case 12: |
914 | 0 | return AMDGPU::SI_SPILL_V96_RESTORE; |
915 | 661 | case 16: |
916 | 661 | return AMDGPU::SI_SPILL_V128_RESTORE; |
917 | 0 | case 32: |
918 | 0 | return AMDGPU::SI_SPILL_V256_RESTORE; |
919 | 0 | case 64: |
920 | 0 | return AMDGPU::SI_SPILL_V512_RESTORE; |
921 | 0 | default: |
922 | 0 | llvm_unreachable("unknown register size"); |
923 | 0 | } |
924 | 0 | } |
925 | | |
926 | | void SIInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB, |
927 | | MachineBasicBlock::iterator MI, |
928 | | unsigned DestReg, int FrameIndex, |
929 | | const TargetRegisterClass *RC, |
930 | 1.58k | const TargetRegisterInfo *TRI) const { |
931 | 1.58k | MachineFunction *MF = MBB.getParent(); |
932 | 1.58k | const SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>(); |
933 | 1.58k | MachineFrameInfo &FrameInfo = MF->getFrameInfo(); |
934 | 1.58k | DebugLoc DL = MBB.findDebugLoc(MI); |
935 | 1.58k | unsigned Align = FrameInfo.getObjectAlignment(FrameIndex); |
936 | 1.58k | unsigned Size = FrameInfo.getObjectSize(FrameIndex); |
937 | 1.58k | unsigned SpillSize = TRI->getSpillSize(*RC); |
938 | 1.58k | |
939 | 1.58k | MachinePointerInfo PtrInfo |
940 | 1.58k | = MachinePointerInfo::getFixedStack(*MF, FrameIndex); |
941 | 1.58k | |
942 | 1.58k | MachineMemOperand *MMO = MF->getMachineMemOperand( |
943 | 1.58k | PtrInfo, MachineMemOperand::MOLoad, Size, Align); |
944 | 1.58k | |
945 | 1.58k | if (RI.isSGPRClass(RC)) {
946 | 587 | // FIXME: Maybe this should not include a memoperand because it will be |
947 | 587 | // lowered to non-memory instructions. |
948 | 587 | const MCInstrDesc &OpDesc = get(getSGPRSpillRestoreOpcode(SpillSize)); |
949 | 587 | if (TargetRegisterInfo::isVirtualRegister(DestReg) && SpillSize == 4) {
950 | 126 | MachineRegisterInfo &MRI = MF->getRegInfo(); |
951 | 126 | MRI.constrainRegClass(DestReg, &AMDGPU::SReg_32_XM0RegClass); |
952 | 126 | } |
953 | 587 | |
954 | 587 | FrameInfo.setStackID(FrameIndex, 1); |
955 | 587 | MachineInstrBuilder Spill = BuildMI(MBB, MI, DL, OpDesc, DestReg) |
956 | 587 | .addFrameIndex(FrameIndex) // addr |
957 | 587 | .addMemOperand(MMO) |
958 | 587 | .addReg(MFI->getScratchRSrcReg(), RegState::Implicit) |
959 | 587 | .addReg(MFI->getFrameOffsetReg(), RegState::Implicit); |
960 | 587 | |
961 | 587 | if (ST.hasScalarStores()) {
962 | 295 | // m0 is used for offset to scalar stores if used to spill. |
963 | 295 | Spill.addReg(AMDGPU::M0, RegState::ImplicitDefine | RegState::Dead); |
964 | 295 | } |
965 | 587 | |
966 | 587 | return; |
967 | 587 | } |
968 | 1.00k | |
969 | 1.00k | if (!ST.isVGPRSpillingEnabled(*MF->getFunction())) {
970 | 0 | LLVMContext &Ctx = MF->getFunction()->getContext(); |
971 | 0 | Ctx.emitError("SIInstrInfo::loadRegFromStackSlot - Do not know how to" |
972 | 0 | " restore register"); |
973 | 0 | BuildMI(MBB, MI, DL, get(AMDGPU::IMPLICIT_DEF), DestReg); |
974 | 0 |
975 | 0 | return; |
976 | 0 | } |
977 | 1.00k | |
978 | 1.00k | assert(RI.hasVGPRs(RC) && "Only VGPR spilling expected"); |
979 | 1.00k | |
980 | 1.00k | unsigned Opcode = getVGPRSpillRestoreOpcode(SpillSize); |
981 | 1.00k | BuildMI(MBB, MI, DL, get(Opcode), DestReg) |
982 | 1.00k | .addFrameIndex(FrameIndex) // vaddr |
983 | 1.00k | .addReg(MFI->getScratchRSrcReg()) // scratch_rsrc |
984 | 1.00k | .addReg(MFI->getFrameOffsetReg()) // scratch_offset |
985 | 1.00k | .addImm(0) // offset |
986 | 1.00k | .addMemOperand(MMO); |
987 | 1.00k | } |
988 | | |
989 | | /// \param Offset Offset in bytes of the FrameIndex being spilled
990 | | unsigned SIInstrInfo::calculateLDSSpillAddress( |
991 | | MachineBasicBlock &MBB, MachineInstr &MI, RegScavenger *RS, unsigned TmpReg, |
992 | 0 | unsigned FrameOffset, unsigned Size) const { |
993 | 0 | MachineFunction *MF = MBB.getParent(); |
994 | 0 | SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>(); |
995 | 0 | const SISubtarget &ST = MF->getSubtarget<SISubtarget>(); |
996 | 0 | DebugLoc DL = MBB.findDebugLoc(MI); |
997 | 0 | unsigned WorkGroupSize = MFI->getMaxFlatWorkGroupSize(); |
998 | 0 | unsigned WavefrontSize = ST.getWavefrontSize(); |
999 | 0 |
1000 | 0 | unsigned TIDReg = MFI->getTIDReg();
1001 | 0 | if (!MFI->hasCalculatedTID()) {
1002 | 0 | MachineBasicBlock &Entry = MBB.getParent()->front(); |
1003 | 0 | MachineBasicBlock::iterator Insert = Entry.front(); |
1004 | 0 | DebugLoc DL = Insert->getDebugLoc(); |
1005 | 0 |
1006 | 0 | TIDReg = RI.findUnusedRegister(MF->getRegInfo(), &AMDGPU::VGPR_32RegClass, |
1007 | 0 | *MF); |
1008 | 0 | if (TIDReg == AMDGPU::NoRegister) |
1009 | 0 | return TIDReg; |
1010 | 0 |
1011 | 0 | if (!AMDGPU::isShader(MF->getFunction()->getCallingConv()) &&
1012 | 0 | WorkGroupSize > WavefrontSize) {
1013 | 0 | unsigned TIDIGXReg |
1014 | 0 | = MFI->getPreloadedReg(AMDGPUFunctionArgInfo::WORKGROUP_ID_X); |
1015 | 0 | unsigned TIDIGYReg |
1016 | 0 | = MFI->getPreloadedReg(AMDGPUFunctionArgInfo::WORKGROUP_ID_Y); |
1017 | 0 | unsigned TIDIGZReg |
1018 | 0 | = MFI->getPreloadedReg(AMDGPUFunctionArgInfo::WORKGROUP_ID_Z); |
1019 | 0 | unsigned InputPtrReg = |
1020 | 0 | MFI->getPreloadedReg(AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR); |
1021 | 0 | for (unsigned Reg : {TIDIGXReg, TIDIGYReg, TIDIGZReg}) { |
1022 | 0 | if (!Entry.isLiveIn(Reg)) |
1023 | 0 | Entry.addLiveIn(Reg); |
1024 | 0 | } |
1025 | 0 |
1026 | 0 | RS->enterBasicBlock(Entry); |
1027 | 0 | // FIXME: Can we scavenge an SReg_64 and access the subregs? |
1028 | 0 | unsigned STmp0 = RS->scavengeRegister(&AMDGPU::SGPR_32RegClass, 0); |
1029 | 0 | unsigned STmp1 = RS->scavengeRegister(&AMDGPU::SGPR_32RegClass, 0); |
1030 | 0 | BuildMI(Entry, Insert, DL, get(AMDGPU::S_LOAD_DWORD_IMM), STmp0) |
1031 | 0 | .addReg(InputPtrReg) |
1032 | 0 | .addImm(SI::KernelInputOffsets::NGROUPS_Z); |
1033 | 0 | BuildMI(Entry, Insert, DL, get(AMDGPU::S_LOAD_DWORD_IMM), STmp1) |
1034 | 0 | .addReg(InputPtrReg) |
1035 | 0 | .addImm(SI::KernelInputOffsets::NGROUPS_Y); |
1036 | 0 |
1037 | 0 | // NGROUPS.X * NGROUPS.Y |
1038 | 0 | BuildMI(Entry, Insert, DL, get(AMDGPU::S_MUL_I32), STmp1) |
1039 | 0 | .addReg(STmp1) |
1040 | 0 | .addReg(STmp0); |
1041 | 0 | // (NGROUPS.X * NGROUPS.Y) * TIDIG.X |
1042 | 0 | BuildMI(Entry, Insert, DL, get(AMDGPU::V_MUL_U32_U24_e32), TIDReg) |
1043 | 0 | .addReg(STmp1) |
1044 | 0 | .addReg(TIDIGXReg); |
1045 | 0 | // NGROUPS.Z * TIDIG.Y + (NGROUPS.X * NGROUPS.Y * TIDIG.X)
1046 | 0 | BuildMI(Entry, Insert, DL, get(AMDGPU::V_MAD_U32_U24), TIDReg) |
1047 | 0 | .addReg(STmp0) |
1048 | 0 | .addReg(TIDIGYReg) |
1049 | 0 | .addReg(TIDReg); |
1050 | 0 | // (NGROUPS.Z * TIDIG.Y + (NGROUPS.X * NGROUPS.Y * TIDIG.X)) + TIDIG.Z
1051 | 0 | BuildMI(Entry, Insert, DL, get(AMDGPU::V_ADD_I32_e32), TIDReg) |
1052 | 0 | .addReg(TIDReg) |
1053 | 0 | .addReg(TIDIGZReg); |
1054 | 0 | } else { |
1055 | 0 | // Get the wave id |
1056 | 0 | BuildMI(Entry, Insert, DL, get(AMDGPU::V_MBCNT_LO_U32_B32_e64), |
1057 | 0 | TIDReg) |
1058 | 0 | .addImm(-1) |
1059 | 0 | .addImm(0); |
1060 | 0 |
1061 | 0 | BuildMI(Entry, Insert, DL, get(AMDGPU::V_MBCNT_HI_U32_B32_e64), |
1062 | 0 | TIDReg) |
1063 | 0 | .addImm(-1) |
1064 | 0 | .addReg(TIDReg); |
1065 | 0 | } |
1066 | 0 |
1067 | 0 | BuildMI(Entry, Insert, DL, get(AMDGPU::V_LSHLREV_B32_e32), |
1068 | 0 | TIDReg) |
1069 | 0 | .addImm(2) |
1070 | 0 | .addReg(TIDReg); |
1071 | 0 | MFI->setTIDReg(TIDReg); |
1072 | 0 | } |
1073 | 0 |
1074 | 0 | // Add FrameIndex to LDS offset |
1075 | 0 | unsigned LDSOffset = MFI->getLDSSize() + (FrameOffset * WorkGroupSize); |
1076 | 0 | BuildMI(MBB, MI, DL, get(AMDGPU::V_ADD_I32_e32), TmpReg) |
1077 | 0 | .addImm(LDSOffset) |
1078 | 0 | .addReg(TIDReg); |
1079 | 0 |
1080 | 0 | return TmpReg; |
1081 | 0 | } |
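// The address computation above, isolated (plain C++; parameter names are
// mine): spill slots live past the module's static LDS allocation, each
// workgroup's region is FrameOffset * WorkGroupSize bytes in, and the
// per-lane TID value (already shifted left by 2 above) is added at runtime
// by the final V_ADD_I32.
unsigned ldsSpillBase(unsigned StaticLDSSize, unsigned FrameOffset,
                      unsigned WorkGroupSize) {
  return StaticLDSSize + FrameOffset * WorkGroupSize;
}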
1082 | | |
1083 | | void SIInstrInfo::insertWaitStates(MachineBasicBlock &MBB, |
1084 | | MachineBasicBlock::iterator MI, |
1085 | 1.54k | int Count) const { |
1086 | 1.54k | DebugLoc DL = MBB.findDebugLoc(MI); |
1087 | 3.09k | while (Count > 03.09k ) { |
1088 | 1.54k | int Arg; |
1089 | 1.54k | if (Count >= 8) |
1090 | 0 | Arg = 7; |
1091 | 1.54k | else |
1092 | 1.54k | Arg = Count - 1; |
1093 | 1.54k | Count -= 8; |
1094 | 1.54k | BuildMI(MBB, MI, DL, get(AMDGPU::S_NOP)) |
1095 | 1.54k | .addImm(Arg); |
1096 | 1.54k | } |
1097 | 1.54k | } |
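// A standalone model of the loop above: an S_NOP with immediate N waits N+1
// states (imm 0..7 covers 1..8 wait states), so larger counts become a run
// of S_NOPs. The sketch returns the immediates that would be emitted.
#include <algorithm>
#include <vector>

std::vector<int> sNopImmediates(int Count) {
  std::vector<int> Imms;
  while (Count > 0) {
    Imms.push_back(std::min(Count, 8) - 1); // same as Count >= 8 ? 7 : Count-1
    Count -= 8;
  }
  return Imms; // e.g. Count == 10 -> {7, 1}
}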
1098 | | |
1099 | | void SIInstrInfo::insertNoop(MachineBasicBlock &MBB, |
1100 | 1.54k | MachineBasicBlock::iterator MI) const { |
1101 | 1.54k | insertWaitStates(MBB, MI, 1); |
1102 | 1.54k | } |
1103 | | |
1104 | 0 | void SIInstrInfo::insertReturn(MachineBasicBlock &MBB) const { |
1105 | 0 | auto MF = MBB.getParent(); |
1106 | 0 | SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>(); |
1107 | 0 |
1108 | 0 | assert(Info->isEntryFunction()); |
1109 | 0 |
1110 | 0 | if (MBB.succ_empty()) {
1111 | 0 | bool HasNoTerminator = MBB.getFirstTerminator() == MBB.end(); |
1112 | 0 | if (HasNoTerminator) |
1113 | 0 | BuildMI(MBB, MBB.end(), DebugLoc(), |
1114 | 0 | get(Info->returnsVoid() ? AMDGPU::S_ENDPGM : AMDGPU::SI_RETURN_TO_EPILOG));
1115 | 0 | } |
1116 | 0 | } |
1117 | | |
1118 | 444k | unsigned SIInstrInfo::getNumWaitStates(const MachineInstr &MI) const { |
1119 | 444k | switch (MI.getOpcode()) { |
1120 | 443k | default: return 1; // FIXME: Do wait states equal cycles? |
1121 | 444k | |
1122 | 1.27k | case AMDGPU::S_NOP: |
1123 | 1.27k | return MI.getOperand(0).getImm() + 1; |
1124 | 0 | } |
1125 | 0 | } |
1126 | | |
1127 | 243k | bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const { |
1128 | 243k | MachineBasicBlock &MBB = *MI.getParent(); |
1129 | 243k | DebugLoc DL = MBB.findDebugLoc(MI); |
1130 | 243k | switch (MI.getOpcode()) { |
1131 | 242k | default: return AMDGPUInstrInfo::expandPostRAPseudo(MI); |
1132 | 0 | case AMDGPU::S_MOV_B64_term: |
1133 | 0 | // This is only a terminator to get the correct spill code placement during |
1134 | 0 | // register allocation. |
1135 | 0 | MI.setDesc(get(AMDGPU::S_MOV_B64)); |
1136 | 0 | break; |
1137 | 243k | |
1138 | 0 | case AMDGPU::S_XOR_B64_term: |
1139 | 0 | // This is only a terminator to get the correct spill code placement during |
1140 | 0 | // register allocation. |
1141 | 0 | MI.setDesc(get(AMDGPU::S_XOR_B64)); |
1142 | 0 | break; |
1143 | 243k | |
1144 | 0 | case AMDGPU::S_ANDN2_B64_term: |
1145 | 0 | // This is only a terminator to get the correct spill code placement during |
1146 | 0 | // register allocation. |
1147 | 0 | MI.setDesc(get(AMDGPU::S_ANDN2_B64)); |
1148 | 0 | break; |
1149 | 243k | |
1150 | 210 | case AMDGPU::V_MOV_B64_PSEUDO: { |
1151 | 210 | unsigned Dst = MI.getOperand(0).getReg(); |
1152 | 210 | unsigned DstLo = RI.getSubReg(Dst, AMDGPU::sub0); |
1153 | 210 | unsigned DstHi = RI.getSubReg(Dst, AMDGPU::sub1); |
1154 | 210 | |
1155 | 210 | const MachineOperand &SrcOp = MI.getOperand(1); |
1156 | 210 | // FIXME: Will this work for 64-bit floating point immediates? |
1157 | 210 | assert(!SrcOp.isFPImm()); |
1158 | 210 | if (SrcOp.isImm()) {
1159 | 210 | APInt Imm(64, SrcOp.getImm()); |
1160 | 210 | BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstLo) |
1161 | 210 | .addImm(Imm.getLoBits(32).getZExtValue()) |
1162 | 210 | .addReg(Dst, RegState::Implicit | RegState::Define); |
1163 | 210 | BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstHi) |
1164 | 210 | .addImm(Imm.getHiBits(32).getZExtValue()) |
1165 | 210 | .addReg(Dst, RegState::Implicit | RegState::Define); |
1166 | 210 | } else { |
1167 | 0 | assert(SrcOp.isReg()); |
1168 | 0 | BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstLo) |
1169 | 0 | .addReg(RI.getSubReg(SrcOp.getReg(), AMDGPU::sub0)) |
1170 | 0 | .addReg(Dst, RegState::Implicit | RegState::Define); |
1171 | 0 | BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstHi) |
1172 | 0 | .addReg(RI.getSubReg(SrcOp.getReg(), AMDGPU::sub1)) |
1173 | 0 | .addReg(Dst, RegState::Implicit | RegState::Define); |
1174 | 0 | } |
1175 | 210 | MI.eraseFromParent(); |
1176 | 210 | break; |
1177 | 243k | } |
1178 | 6 | case AMDGPU::V_SET_INACTIVE_B32: { |
1179 | 6 | BuildMI(MBB, MI, DL, get(AMDGPU::S_NOT_B64), AMDGPU::EXEC) |
1180 | 6 | .addReg(AMDGPU::EXEC); |
1181 | 6 | BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), MI.getOperand(0).getReg()) |
1182 | 6 | .add(MI.getOperand(2)); |
1183 | 6 | BuildMI(MBB, MI, DL, get(AMDGPU::S_NOT_B64), AMDGPU::EXEC) |
1184 | 6 | .addReg(AMDGPU::EXEC); |
1185 | 6 | MI.eraseFromParent(); |
1186 | 6 | break; |
1187 | 243k | } |
1188 | 2 | case AMDGPU::V_SET_INACTIVE_B64: { |
1189 | 2 | BuildMI(MBB, MI, DL, get(AMDGPU::S_NOT_B64), AMDGPU::EXEC) |
1190 | 2 | .addReg(AMDGPU::EXEC); |
1191 | 2 | MachineInstr *Copy = BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B64_PSEUDO), |
1192 | 2 | MI.getOperand(0).getReg()) |
1193 | 2 | .add(MI.getOperand(2)); |
1194 | 2 | expandPostRAPseudo(*Copy); |
1195 | 2 | BuildMI(MBB, MI, DL, get(AMDGPU::S_NOT_B64), AMDGPU::EXEC) |
1196 | 2 | .addReg(AMDGPU::EXEC); |
1197 | 2 | MI.eraseFromParent(); |
1198 | 2 | break; |
1199 | 243k | } |
1200 | 64 | case AMDGPU::V_MOVRELD_B32_V1: |
1201 | 64 | case AMDGPU::V_MOVRELD_B32_V2: |
1202 | 64 | case AMDGPU::V_MOVRELD_B32_V4: |
1203 | 64 | case AMDGPU::V_MOVRELD_B32_V8: |
1204 | 64 | case AMDGPU::V_MOVRELD_B32_V16: { |
1205 | 64 | const MCInstrDesc &MovRelDesc = get(AMDGPU::V_MOVRELD_B32_e32); |
1206 | 64 | unsigned VecReg = MI.getOperand(0).getReg(); |
1207 | 64 | bool IsUndef = MI.getOperand(1).isUndef(); |
1208 | 64 | unsigned SubReg = AMDGPU::sub0 + MI.getOperand(3).getImm(); |
1209 | 64 | assert(VecReg == MI.getOperand(1).getReg()); |
1210 | 64 | |
1211 | 64 | MachineInstr *MovRel = |
1212 | 64 | BuildMI(MBB, MI, DL, MovRelDesc) |
1213 | 64 | .addReg(RI.getSubReg(VecReg, SubReg), RegState::Undef) |
1214 | 64 | .add(MI.getOperand(2)) |
1215 | 64 | .addReg(VecReg, RegState::ImplicitDefine) |
1216 | 64 | .addReg(VecReg, |
1217 | 64 | RegState::Implicit | (IsUndef ? RegState::Undef : 0));
1218 | 64 | |
1219 | 64 | const int ImpDefIdx = |
1220 | 64 | MovRelDesc.getNumOperands() + MovRelDesc.getNumImplicitUses(); |
1221 | 64 | const int ImpUseIdx = ImpDefIdx + 1; |
1222 | 64 | MovRel->tieOperands(ImpDefIdx, ImpUseIdx); |
1223 | 64 | |
1224 | 64 | MI.eraseFromParent(); |
1225 | 64 | break; |
1226 | 64 | } |
1227 | 484 | case AMDGPU::SI_PC_ADD_REL_OFFSET: { |
1228 | 484 | MachineFunction &MF = *MBB.getParent(); |
1229 | 484 | unsigned Reg = MI.getOperand(0).getReg(); |
1230 | 484 | unsigned RegLo = RI.getSubReg(Reg, AMDGPU::sub0); |
1231 | 484 | unsigned RegHi = RI.getSubReg(Reg, AMDGPU::sub1); |
1232 | 484 | |
1233 | 484 | // Create a bundle so these instructions won't be re-ordered by the |
1234 | 484 | // post-RA scheduler. |
1235 | 484 | MIBundleBuilder Bundler(MBB, MI); |
1236 | 484 | Bundler.append(BuildMI(MF, DL, get(AMDGPU::S_GETPC_B64), Reg)); |
1237 | 484 | |
1238 | 484 | // Add 32-bit offset from this instruction to the start of the |
1239 | 484 | // constant data. |
1240 | 484 | Bundler.append(BuildMI(MF, DL, get(AMDGPU::S_ADD_U32), RegLo) |
1241 | 484 | .addReg(RegLo) |
1242 | 484 | .add(MI.getOperand(1))); |
1243 | 484 | |
1244 | 484 | MachineInstrBuilder MIB = BuildMI(MF, DL, get(AMDGPU::S_ADDC_U32), RegHi) |
1245 | 484 | .addReg(RegHi); |
1246 | 484 | if (MI.getOperand(2).getTargetFlags() == SIInstrInfo::MO_NONE) |
1247 | 19 | MIB.addImm(0); |
1248 | 484 | else |
1249 | 465 | MIB.add(MI.getOperand(2)); |
1250 | 484 | |
1251 | 484 | Bundler.append(MIB); |
1252 | 484 | finalizeBundle(MBB, Bundler.begin()); |
1253 | 484 | |
1254 | 484 | MI.eraseFromParent(); |
1255 | 484 | break; |
1256 | 64 | } |
1257 | 16 | case AMDGPU::EXIT_WWM: { |
1258 | 16 | // This only gets its own opcode so that SIFixWWMLiveness can tell when WWM |
1259 | 16 | // is exited. |
1260 | 16 | MI.setDesc(get(AMDGPU::S_MOV_B64)); |
1261 | 16 | break; |
1262 | 782 | } |
1263 | 782 | } |
1264 | 782 | return true; |
1265 | 782 | } |
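// The V_MOV_B64_PSEUDO immediate case above, as plain arithmetic (sketch):
// the 64-bit immediate splits into two 32-bit halves, each materialized by
// its own V_MOV_B32 into the sub0 and sub1 sub-registers.
#include <cstdint>
#include <utility>

std::pair<uint32_t, uint32_t> splitMovImm64(int64_t Imm) {
  uint64_t Bits = static_cast<uint64_t>(Imm);
  uint32_t Lo = static_cast<uint32_t>(Bits);       // V_MOV_B32 into DstLo
  uint32_t Hi = static_cast<uint32_t>(Bits >> 32); // V_MOV_B32 into DstHi
  return {Lo, Hi};
}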
1266 | | |
1267 | | bool SIInstrInfo::swapSourceModifiers(MachineInstr &MI, |
1268 | | MachineOperand &Src0, |
1269 | | unsigned Src0OpName, |
1270 | | MachineOperand &Src1, |
1271 | 191k | unsigned Src1OpName) const { |
1272 | 191k | MachineOperand *Src0Mods = getNamedOperand(MI, Src0OpName); |
1273 | 191k | if (!Src0Mods) |
1274 | 153k | return false; |
1275 | 37.4k | |
1276 | 37.4k | MachineOperand *Src1Mods = getNamedOperand(MI, Src1OpName); |
1277 | 37.4k | assert(Src1Mods && |
1278 | 37.4k | "All commutable instructions have both src0 and src1 modifiers"); |
1279 | 37.4k | |
1280 | 37.4k | int Src0ModsVal = Src0Mods->getImm(); |
1281 | 37.4k | int Src1ModsVal = Src1Mods->getImm(); |
1282 | 37.4k | |
1283 | 37.4k | Src1Mods->setImm(Src0ModsVal); |
1284 | 37.4k | Src0Mods->setImm(Src1ModsVal); |
1285 | 37.4k | return true; |
1286 | 37.4k | } |
1287 | | |
1288 | | static MachineInstr *swapRegAndNonRegOperand(MachineInstr &MI, |
1289 | | MachineOperand &RegOp, |
1290 | 25.0k | MachineOperand &NonRegOp) { |
1291 | 25.0k | unsigned Reg = RegOp.getReg(); |
1292 | 25.0k | unsigned SubReg = RegOp.getSubReg(); |
1293 | 25.0k | bool IsKill = RegOp.isKill(); |
1294 | 25.0k | bool IsDead = RegOp.isDead(); |
1295 | 25.0k | bool IsUndef = RegOp.isUndef(); |
1296 | 25.0k | bool IsDebug = RegOp.isDebug(); |
1297 | 25.0k | |
1298 | 25.0k | if (NonRegOp.isImm()) |
1299 | 25.0k | RegOp.ChangeToImmediate(NonRegOp.getImm()); |
1300 | 0 | else if (NonRegOp.isFI())
1301 | 0 | RegOp.ChangeToFrameIndex(NonRegOp.getIndex()); |
1302 | 0 | else |
1303 | 0 | return nullptr; |
1304 | 25.0k | |
1305 | 25.0k | NonRegOp.ChangeToRegister(Reg, false, false, IsKill, IsDead, IsUndef, IsDebug); |
1306 | 25.0k | NonRegOp.setSubReg(SubReg); |
1307 | 25.0k | |
1308 | 25.0k | return &MI; |
1309 | 25.0k | } |
1310 | | |
1311 | | MachineInstr *SIInstrInfo::commuteInstructionImpl(MachineInstr &MI, bool NewMI, |
1312 | | unsigned Src0Idx, |
1313 | 234k | unsigned Src1Idx) const { |
1314 | 234k | assert(!NewMI && "this should never be used"); |
1315 | 234k | |
1316 | 234k | unsigned Opc = MI.getOpcode(); |
1317 | 234k | int CommutedOpcode = commuteOpcode(Opc); |
1318 | 234k | if (CommutedOpcode == -1) |
1319 | 12.5k | return nullptr; |
1320 | 221k | |
1321 | 234k | assert(AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0) == |
1322 | 221k | static_cast<int>(Src0Idx) && |
1323 | 221k | AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1) == |
1324 | 221k | static_cast<int>(Src1Idx) && |
1325 | 221k | "inconsistency with findCommutedOpIndices"); |
1326 | 221k | |
1327 | 221k | MachineOperand &Src0 = MI.getOperand(Src0Idx); |
1328 | 221k | MachineOperand &Src1 = MI.getOperand(Src1Idx); |
1329 | 221k | |
1330 | 221k | MachineInstr *CommutedMI = nullptr; |
1331 | 221k | if (Src0.isReg() && Src1.isReg()) {
1332 | 184k | if (isOperandLegal(MI, Src1Idx, &Src0)) {
1333 | 166k | // Be sure to copy the source modifiers to the right place.
1334 | 166k | CommutedMI
1335 | 166k | = TargetInstrInfo::commuteInstructionImpl(MI, NewMI, Src0Idx, Src1Idx);
1336 | 166k | }
1337 | 184k | 
1338 | 221k | } else if (Src0.isReg() && !Src1.isReg()) {
1339 | 14.1k | // src0 should always be able to support any operand type, so no need to
1340 | 14.1k | // check operand legality.
1341 | 14.1k | CommutedMI = swapRegAndNonRegOperand(MI, Src0, Src1);
1342 | 36.9k | } else if (!Src0.isReg() && Src1.isReg()) {
1343 | 22.8k | if (isOperandLegal(MI, Src1Idx, &Src0))
1344 | 10.9k | CommutedMI = swapRegAndNonRegOperand(MI, Src1, Src0);
1345 | 22.8k | } else {
1346 | 1 | // FIXME: Found two non registers to commute. This does happen.
1347 | 1 | return nullptr;
1348 | 1 | }
1349 | 221k | 
1350 | 221k | if (CommutedMI) {
1351 | 191k | swapSourceModifiers(MI, Src0, AMDGPU::OpName::src0_modifiers, |
1352 | 191k | Src1, AMDGPU::OpName::src1_modifiers); |
1353 | 191k | |
1354 | 191k | CommutedMI->setDesc(get(CommutedOpcode)); |
1355 | 191k | } |
1356 | 234k | |
1357 | 234k | return CommutedMI; |
1358 | 234k | } |
1359 | | |
1360 | | // This needs to be implemented because the source modifiers may be inserted |
1361 | | // between the true commutable operands, and the base |
1362 | | // TargetInstrInfo::commuteInstruction uses it. |
1363 | | bool SIInstrInfo::findCommutedOpIndices(MachineInstr &MI, unsigned &SrcOpIdx0, |
1364 | 263k | unsigned &SrcOpIdx1) const { |
1365 | 263k | if (!MI.isCommutable()) |
1366 | 33.5k | return false; |
1367 | 230k | |
1368 | 230k | unsigned Opc = MI.getOpcode(); |
1369 | 230k | int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0); |
1370 | 230k | if (Src0Idx == -1) |
1371 | 0 | return false; |
1372 | 230k | |
1373 | 230k | int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1); |
1374 | 230k | if (Src1Idx == -1) |
1375 | 0 | return false; |
1376 | 230k | |
1377 | 230k | return fixCommutedOpIndices(SrcOpIdx0, SrcOpIdx1, Src0Idx, Src1Idx); |
1378 | 230k | } |
1379 | | |
1380 | | bool SIInstrInfo::isBranchOffsetInRange(unsigned BranchOp, |
1381 | 925 | int64_t BrOffset) const { |
1382 | 925 | // BranchRelaxation should never have to check s_setpc_b64 because its dest |
1383 | 925 | // block is unanalyzable. |
1384 | 925 | assert(BranchOp != AMDGPU::S_SETPC_B64); |
1385 | 925 | |
1386 | 925 | // Convert to dwords. |
1387 | 925 | BrOffset /= 4; |
1388 | 925 | |
1389 | 925 | // The branch instructions do PC += signext(SIMM16 * 4) + 4, so the offset is |
1390 | 925 | // from the next instruction. |
1391 | 925 | BrOffset -= 1; |
1392 | 925 | |
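| | // Worked example with the default 16-bit field: a forward offset of
| | // 131072 bytes is 32768 dwords and 32767 after the adjustment, which
| | // still fits in a signed 16-bit immediate; -131072 bytes adjusts to
| | // -32769, which does not.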
1393 | 925 | return isIntN(BranchOffsetBits, BrOffset); |
1394 | 925 | } |
1395 | | |
1396 | | MachineBasicBlock *SIInstrInfo::getBranchDestBlock( |
1397 | 954 | const MachineInstr &MI) const { |
1398 | 954 | if (MI.getOpcode() == AMDGPU::S_SETPC_B64) {
1399 | 0 | // This would be a difficult analysis to perform, but can always be legal so |
1400 | 0 | // there's no need to analyze it. |
1401 | 0 | return nullptr; |
1402 | 0 | } |
1403 | 954 | |
1404 | 954 | return MI.getOperand(0).getMBB(); |
1405 | 954 | } |
1406 | | |
1407 | | unsigned SIInstrInfo::insertIndirectBranch(MachineBasicBlock &MBB, |
1408 | | MachineBasicBlock &DestBB, |
1409 | | const DebugLoc &DL, |
1410 | | int64_t BrOffset, |
1411 | 29 | RegScavenger *RS) const { |
1412 | 29 | assert(RS && "RegScavenger required for long branching"); |
1413 | 29 | assert(MBB.empty() && |
1414 | 29 | "new block should be inserted for expanding unconditional branch"); |
1415 | 29 | assert(MBB.pred_size() == 1); |
1416 | 29 | |
1417 | 29 | MachineFunction *MF = MBB.getParent(); |
1418 | 29 | MachineRegisterInfo &MRI = MF->getRegInfo(); |
1419 | 29 | |
1420 | 29 | // FIXME: Virtual register workaround for RegScavenger not working with empty |
1421 | 29 | // blocks. |
1422 | 29 | unsigned PCReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass); |
1423 | 29 | |
1424 | 29 | auto I = MBB.end(); |
1425 | 29 | |
1426 | 29 | // We need to compute the offset relative to the instruction immediately after |
1427 | 29 | // s_getpc_b64. Insert pc arithmetic code before last terminator. |
1428 | 29 | MachineInstr *GetPC = BuildMI(MBB, I, DL, get(AMDGPU::S_GETPC_B64), PCReg); |
1429 | 29 | |
1430 | 29 | // TODO: Handle > 32-bit block address. |
1431 | 29 | if (BrOffset >= 0) {
1432 | 21 | BuildMI(MBB, I, DL, get(AMDGPU::S_ADD_U32)) |
1433 | 21 | .addReg(PCReg, RegState::Define, AMDGPU::sub0) |
1434 | 21 | .addReg(PCReg, 0, AMDGPU::sub0) |
1435 | 21 | .addMBB(&DestBB, AMDGPU::TF_LONG_BRANCH_FORWARD); |
1436 | 21 | BuildMI(MBB, I, DL, get(AMDGPU::S_ADDC_U32)) |
1437 | 21 | .addReg(PCReg, RegState::Define, AMDGPU::sub1) |
1438 | 21 | .addReg(PCReg, 0, AMDGPU::sub1) |
1439 | 21 | .addImm(0); |
1440 | 29 | } else { |
1441 | 8 | // Backwards branch. |
1442 | 8 | BuildMI(MBB, I, DL, get(AMDGPU::S_SUB_U32)) |
1443 | 8 | .addReg(PCReg, RegState::Define, AMDGPU::sub0) |
1444 | 8 | .addReg(PCReg, 0, AMDGPU::sub0) |
1445 | 8 | .addMBB(&DestBB, AMDGPU::TF_LONG_BRANCH_BACKWARD); |
1446 | 8 | BuildMI(MBB, I, DL, get(AMDGPU::S_SUBB_U32)) |
1447 | 8 | .addReg(PCReg, RegState::Define, AMDGPU::sub1) |
1448 | 8 | .addReg(PCReg, 0, AMDGPU::sub1) |
1449 | 8 | .addImm(0); |
1450 | 8 | } |
1451 | 29 | |
1452 | 29 | // Insert the indirect branch after the other terminator. |
1453 | 29 | BuildMI(&MBB, DL, get(AMDGPU::S_SETPC_B64)) |
1454 | 29 | .addReg(PCReg); |
1455 | 29 | |
1456 | 29 | // FIXME: If spilling is necessary, this will fail because this scavenger has |
1457 | 29 | // no emergency stack slots. It is non-trivial to spill in this situation, |
1458 | 29 | // because the restore code needs to be specially placed after the |
1459 | 29 | // jump. BranchRelaxation then needs to be made aware of the newly inserted |
1460 | 29 | // block. |
1461 | 29 | // |
1462 | 29 | // If a spill is needed for the pc register pair, we need to insert a spill |
1463 | 29 | // restore block right before the destination block, and insert a short branch |
1464 | 29 | // into the old destination block's fallthrough predecessor. |
1465 | 29 | // e.g.: |
1466 | 29 | // |
1467 | 29 | // s_cbranch_scc0 skip_long_branch: |
1468 | 29 | // |
1469 | 29 | // long_branch_bb: |
1470 | 29 | // spill s[8:9] |
1471 | 29 | // s_getpc_b64 s[8:9] |
1472 | 29 | // s_add_u32 s8, s8, restore_bb |
1473 | 29 | // s_addc_u32 s9, s9, 0 |
1474 | 29 | // s_setpc_b64 s[8:9] |
1475 | 29 | // |
1476 | 29 | // skip_long_branch: |
1477 | 29 | // foo; |
1478 | 29 | // |
1479 | 29 | // ..... |
1480 | 29 | // |
1481 | 29 | // dest_bb_fallthrough_predecessor: |
1482 | 29 | // bar; |
1483 | 29 | // s_branch dest_bb |
1484 | 29 | // |
1485 | 29 | // restore_bb: |
1486 | 29 | // restore s[8:9] |
1487 | 29 | // fallthrough dest_bb |
1488 | 29 | //
1489 | 29 | // dest_bb: |
1490 | 29 | // buzz; |
1491 | 29 | |
1492 | 29 | RS->enterBasicBlockEnd(MBB); |
1493 | 29 | unsigned Scav = RS->scavengeRegister(&AMDGPU::SReg_64RegClass, |
1494 | 29 | MachineBasicBlock::iterator(GetPC), 0); |
1495 | 29 | MRI.replaceRegWith(PCReg, Scav); |
1496 | 29 | MRI.clearVirtRegs(); |
1497 | 29 | RS->setRegUsed(Scav); |
1498 | 29 | |
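| | // The returned size (presumably s_getpc_b64 + the add/addc pair +
| | // s_setpc_b64, plus 4 bytes of slack; this breakdown is an assumption,
| | // not stated in this file) lets BranchRelaxation account for the
| | // expansion.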
1499 | 29 | return 4 + 8 + 4 + 4; |
1500 | 29 | } |
1501 | | |
1502 | 1.48k | unsigned SIInstrInfo::getBranchOpcode(SIInstrInfo::BranchPredicate Cond) { |
1503 | 1.48k | switch (Cond) { |
1504 | 440 | case SIInstrInfo::SCC_TRUE: |
1505 | 440 | return AMDGPU::S_CBRANCH_SCC1; |
1506 | 412 | case SIInstrInfo::SCC_FALSE: |
1507 | 412 | return AMDGPU::S_CBRANCH_SCC0; |
1508 | 237 | case SIInstrInfo::VCCNZ: |
1509 | 237 | return AMDGPU::S_CBRANCH_VCCNZ; |
1510 | 243 | case SIInstrInfo::VCCZ: |
1511 | 243 | return AMDGPU::S_CBRANCH_VCCZ; |
1512 | 89 | case SIInstrInfo::EXECNZ: |
1513 | 89 | return AMDGPU::S_CBRANCH_EXECNZ; |
1514 | 59 | case SIInstrInfo::EXECZ: |
1515 | 59 | return AMDGPU::S_CBRANCH_EXECZ; |
1516 | 0 | default: |
1517 | 0 | llvm_unreachable("invalid branch predicate"); |
1518 | 0 | } |
1519 | 0 | } |
1520 | | |
1521 | 662k | SIInstrInfo::BranchPredicate SIInstrInfo::getBranchPredicate(unsigned Opcode) { |
1522 | 662k | switch (Opcode) { |
1523 | 1.85k | case AMDGPU::S_CBRANCH_SCC0: |
1524 | 1.85k | return SCC_FALSE; |
1525 | 15.2k | case AMDGPU::S_CBRANCH_SCC1: |
1526 | 15.2k | return SCC_TRUE; |
1527 | 7.13k | case AMDGPU::S_CBRANCH_VCCNZ: |
1528 | 7.13k | return VCCNZ; |
1529 | 1.19k | case AMDGPU::S_CBRANCH_VCCZ: |
1530 | 1.19k | return VCCZ; |
1531 | 3.32k | case AMDGPU::S_CBRANCH_EXECNZ: |
1532 | 3.32k | return EXECNZ; |
1533 | 354 | case AMDGPU::S_CBRANCH_EXECZ: |
1534 | 354 | return EXECZ; |
1535 | 633k | default: |
1536 | 633k | return INVALID_BR; |
1537 | 0 | } |
1538 | 0 | } |
1539 | | |
1540 | | bool SIInstrInfo::analyzeBranchImpl(MachineBasicBlock &MBB, |
1541 | | MachineBasicBlock::iterator I, |
1542 | | MachineBasicBlock *&TBB, |
1543 | | MachineBasicBlock *&FBB, |
1544 | | SmallVectorImpl<MachineOperand> &Cond, |
1545 | 687k | bool AllowModify) const { |
1546 | 687k | if (I->getOpcode() == AMDGPU::S_BRANCH) {
1547 | 24.2k | // Unconditional Branch |
1548 | 24.2k | TBB = I->getOperand(0).getMBB(); |
1549 | 24.2k | return false; |
1550 | 24.2k | } |
1551 | 662k | |
1552 | 662k | MachineBasicBlock *CondBB = nullptr; |
1553 | 662k | |
1554 | 662k | if (I->getOpcode() == AMDGPU::SI_NON_UNIFORM_BRCOND_PSEUDO) {
1555 | 0 | CondBB = I->getOperand(1).getMBB(); |
1556 | 0 | Cond.push_back(I->getOperand(0)); |
1557 | 662k | } else { |
1558 | 662k | BranchPredicate Pred = getBranchPredicate(I->getOpcode()); |
1559 | 662k | if (Pred == INVALID_BR) |
1560 | 633k | return true; |
1561 | 29.0k | |
1562 | 29.0k | CondBB = I->getOperand(0).getMBB(); |
1563 | 29.0k | Cond.push_back(MachineOperand::CreateImm(Pred)); |
1564 | 29.0k | Cond.push_back(I->getOperand(1)); // Save the branch register. |
1565 | 29.0k | } |
1566 | 29.0k | ++I; |
1567 | 29.0k | |
1568 | 29.0k | if (I == MBB.end()) {
1569 | 15.2k | // Conditional branch followed by fall-through. |
1570 | 15.2k | TBB = CondBB; |
1571 | 15.2k | return false; |
1572 | 15.2k | } |
1573 | 13.7k | |
1574 | 13.7k | if (I->getOpcode() == AMDGPU::S_BRANCH) {
1575 | 13.7k | TBB = CondBB; |
1576 | 13.7k | FBB = I->getOperand(0).getMBB(); |
1577 | 13.7k | return false; |
1578 | 13.7k | } |
1579 | 0 | 
1580 | 0 | return true; |
1581 | 0 | } |
1582 | | |
1583 | | bool SIInstrInfo::analyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB, |
1584 | | MachineBasicBlock *&FBB, |
1585 | | SmallVectorImpl<MachineOperand> &Cond, |
1586 | 724k | bool AllowModify) const { |
1587 | 724k | MachineBasicBlock::iterator I = MBB.getFirstTerminator(); |
1588 | 724k | if (I == MBB.end()) |
1589 | 36.9k | return false; |
1590 | 687k | |
1591 | 687k | if (I->getOpcode() != AMDGPU::SI_MASK_BRANCH)
1592 | 676k | return analyzeBranchImpl(MBB, I, TBB, FBB, Cond, AllowModify); |
1593 | 11.4k | |
1594 | 11.4k | ++I; |
1595 | 11.4k | |
1596 | 11.4k | // TODO: Should be able to treat as fallthrough? |
1597 | 11.4k | if (I == MBB.end()) |
1598 | 785 | return true; |
1599 | 10.6k | |
1600 | 10.6k | if (analyzeBranchImpl(MBB, I, TBB, FBB, Cond, AllowModify))
1601 | 0 | return true; |
1602 | 10.6k | |
1603 | 10.6k | MachineBasicBlock *MaskBrDest = I->getOperand(0).getMBB(); |
1604 | 10.6k | |
1605 | 10.6k | // Specifically handle the case where the conditional branch is to the same |
1606 | 10.6k | // destination as the mask branch. e.g. |
1607 | 10.6k | // |
1608 | 10.6k | // si_mask_branch BB8 |
1609 | 10.6k | // s_cbranch_execz BB8 |
1610 | 10.6k | // s_cbranch BB9 |
1611 | 10.6k | // |
1612 | 10.6k | // This is required to understand divergent loops which may need the branches |
1613 | 10.6k | // to be relaxed. |
1614 | 10.6k | if (TBB != MaskBrDest || Cond.empty())
1615 | 10.3k | return true; |
1616 | 266 | |
1617 | 266 | auto Pred = Cond[0].getImm(); |
1618 | 2 | return (Pred != EXECZ && Pred != EXECNZ); |
1619 | 724k | } |
1620 | | |
1621 | | unsigned SIInstrInfo::removeBranch(MachineBasicBlock &MBB, |
1622 | 2.37k | int *BytesRemoved) const { |
1623 | 2.37k | MachineBasicBlock::iterator I = MBB.getFirstTerminator(); |
1624 | 2.37k | |
1625 | 2.37k | unsigned Count = 0; |
1626 | 2.37k | unsigned RemovedSize = 0; |
1627 | 5.31k | while (I != MBB.end()5.31k ) { |
1627 | 5.31k | while (I != MBB.end()) {
1628 | 2.94k | MachineBasicBlock::iterator Next = std::next(I);
1629 | 2.94k | if (I->getOpcode() == AMDGPU::SI_MASK_BRANCH) {
1631 | 2 | continue; |
1632 | 2 | } |
1633 | 2.93k | |
1634 | 2.93k | RemovedSize += getInstSizeInBytes(*I); |
1635 | 2.93k | I->eraseFromParent(); |
1636 | 2.93k | ++Count; |
1637 | 2.93k | I = Next; |
1638 | 2.93k | } |
1639 | 2.37k | |
1640 | 2.37k | if (BytesRemoved) |
1641 | 23 | *BytesRemoved = RemovedSize; |
1642 | 2.37k | |
1643 | 2.37k | return Count; |
1644 | 2.37k | } |
1645 | | |
1646 | | // Copy the flags onto the implicit condition register operand. |
1647 | | static void preserveCondRegFlags(MachineOperand &CondReg, |
1648 | 1.41k | const MachineOperand &OrigCond) { |
1649 | 1.41k | CondReg.setIsUndef(OrigCond.isUndef()); |
1650 | 1.41k | CondReg.setIsKill(OrigCond.isKill()); |
1651 | 1.41k | } |
1652 | | |
1653 | | unsigned SIInstrInfo::insertBranch(MachineBasicBlock &MBB, |
1654 | | MachineBasicBlock *TBB, |
1655 | | MachineBasicBlock *FBB, |
1656 | | ArrayRef<MachineOperand> Cond, |
1657 | | const DebugLoc &DL, |
1658 | 2.13k | int *BytesAdded) const { |
1659 | 2.13k | if (!FBB && Cond.empty()) {
1660 | 658 | BuildMI(&MBB, DL, get(AMDGPU::S_BRANCH)) |
1661 | 658 | .addMBB(TBB); |
1662 | 658 | if (BytesAdded) |
1663 | 0 | *BytesAdded = 4; |
1664 | 658 | return 1; |
1665 | 658 | } |
1666 | 1.48k | |
1667 | 1.48k | if (Cond.size() == 1 && Cond[0].isReg()) {
1668 | 0 | BuildMI(&MBB, DL, get(AMDGPU::SI_NON_UNIFORM_BRCOND_PSEUDO)) |
1669 | 0 | .add(Cond[0]) |
1670 | 0 | .addMBB(TBB); |
1671 | 0 | return 1; |
1672 | 0 | } |
1673 | 1.48k | |
1674 | 1.48k | assert(TBB && Cond[0].isImm()); |
1675 | 1.48k | |
1676 | 1.48k | unsigned Opcode |
1677 | 1.48k | = getBranchOpcode(static_cast<BranchPredicate>(Cond[0].getImm())); |
1678 | 1.48k | |
1679 | 1.48k | if (!FBB) {
1680 | 1.38k | Cond[1].isUndef(); |
1681 | 1.38k | MachineInstr *CondBr = |
1682 | 1.38k | BuildMI(&MBB, DL, get(Opcode)) |
1683 | 1.38k | .addMBB(TBB); |
1684 | 1.38k | |
1685 | 1.38k | // Copy the flags onto the implicit condition register operand. |
1686 | 1.38k | preserveCondRegFlags(CondBr->getOperand(1), Cond[1]); |
1687 | 1.38k | |
1688 | 1.38k | if (BytesAdded) |
1689 | 0 | *BytesAdded = 4; |
1690 | 1.38k | return 1; |
1691 | 1.38k | } |
1692 | 92 | |
1693 | 1.48k | assert(TBB && FBB); |
1694 | 92 | |
1695 | 92 | MachineInstr *CondBr = |
1696 | 92 | BuildMI(&MBB, DL, get(Opcode)) |
1697 | 92 | .addMBB(TBB); |
1698 | 92 | BuildMI(&MBB, DL, get(AMDGPU::S_BRANCH)) |
1699 | 92 | .addMBB(FBB); |
1700 | 92 | |
1701 | 92 | MachineOperand &CondReg = CondBr->getOperand(1); |
1702 | 92 | CondReg.setIsUndef(Cond[1].isUndef()); |
1703 | 92 | CondReg.setIsKill(Cond[1].isKill()); |
1704 | 92 | |
1705 | 92 | if (BytesAdded) |
1706 | 23 | *BytesAdded = 8; |
1707 | 2.13k | |
1708 | 2.13k | return 2; |
1709 | 2.13k | } |
1710 | | |
1711 | | bool SIInstrInfo::reverseBranchCondition( |
1712 | 1.23k | SmallVectorImpl<MachineOperand> &Cond) const { |
1713 | 1.23k | if (Cond.size() != 2) {
1714 | 0 | return true; |
1715 | 0 | } |
1716 | 1.23k | |
1717 | 1.23k | if (Cond[0].isImm()) {
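| | // The BranchPredicate values are defined as sign pairs (e.g. SCC_TRUE
| | // vs. SCC_FALSE), so negating the immediate yields the inverse
| | // predicate; insertSelect relies on the same trick with -Pred.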
1718 | 1.23k | Cond[0].setImm(-Cond[0].getImm()); |
1719 | 1.23k | return false; |
1720 | 1.23k | } |
1721 | 0 | 
1722 | 0 | return true; |
1723 | 0 | } |
1724 | | |
1725 | | bool SIInstrInfo::canInsertSelect(const MachineBasicBlock &MBB, |
1726 | | ArrayRef<MachineOperand> Cond, |
1727 | | unsigned TrueReg, unsigned FalseReg, |
1728 | | int &CondCycles, |
1729 | 22 | int &TrueCycles, int &FalseCycles) const { |
1730 | 22 | switch (Cond[0].getImm()) { |
1731 | 15 | case VCCNZ: |
1732 | 15 | case VCCZ: { |
1733 | 15 | const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); |
1734 | 15 | const TargetRegisterClass *RC = MRI.getRegClass(TrueReg); |
1735 | 15 | assert(MRI.getRegClass(FalseReg) == RC); |
1736 | 15 | |
1737 | 15 | int NumInsts = AMDGPU::getRegBitWidth(RC->getID()) / 32; |
1738 | 15 | CondCycles = TrueCycles = FalseCycles = NumInsts; // ??? |
1739 | 15 | |
1740 | 15 | // Limit to equal cost for branch vs. N v_cndmask_b32s. |
1741 | 14 | return !RI.isSGPRClass(RC) && NumInsts <= 6; |
1742 | 15 | } |
1743 | 7 | case SCC_TRUE: |
1744 | 7 | case SCC_FALSE: { |
1745 | 7 | // FIXME: We could insert for VGPRs if we could replace the original compare |
1746 | 7 | // with a vector one. |
1747 | 7 | const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); |
1748 | 7 | const TargetRegisterClass *RC = MRI.getRegClass(TrueReg); |
1749 | 7 | assert(MRI.getRegClass(FalseReg) == RC); |
1750 | 7 | |
1751 | 7 | int NumInsts = AMDGPU::getRegBitWidth(RC->getID()) / 32; |
1752 | 7 | |
1753 | 7 | // Multiples of 8 can do s_cselect_b64 |
1754 | 7 | if (NumInsts % 2 == 0) |
1755 | 3 | NumInsts /= 2; |
1756 | 7 | |
1757 | 7 | CondCycles = TrueCycles = FalseCycles = NumInsts; // ??? |
1758 | 7 | return RI.isSGPRClass(RC); |
1759 | 7 | } |
1760 | 0 | default: |
1761 | 0 | return false; |
1762 | 0 | } |
1763 | 0 | } |
1764 | | |
1765 | | void SIInstrInfo::insertSelect(MachineBasicBlock &MBB, |
1766 | | MachineBasicBlock::iterator I, const DebugLoc &DL, |
1767 | | unsigned DstReg, ArrayRef<MachineOperand> Cond, |
1768 | 16 | unsigned TrueReg, unsigned FalseReg) const { |
1769 | 16 | BranchPredicate Pred = static_cast<BranchPredicate>(Cond[0].getImm()); |
1770 | 16 | if (Pred == VCCZ || Pred == SCC_FALSE) {
1771 | 0 | Pred = static_cast<BranchPredicate>(-Pred); |
1772 | 0 | std::swap(TrueReg, FalseReg); |
1773 | 0 | } |
1774 | 16 | |
1775 | 16 | MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); |
1776 | 16 | const TargetRegisterClass *DstRC = MRI.getRegClass(DstReg); |
1777 | 16 | unsigned DstSize = RI.getRegSizeInBits(*DstRC); |
1778 | 16 | |
1779 | 16 | if (DstSize == 32) {
1780 | 9 | unsigned SelOp = Pred == SCC_TRUE ?
1781 | 9 | AMDGPU::S_CSELECT_B32 : AMDGPU::V_CNDMASK_B32_e32;
1782 | 9 | |
1783 | 9 | // Instruction's operands are backwards from what is expected. |
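| | // (v_cndmask_b32 selects src1 when the condition bit is set, so the
| | // "false" value goes in src0 and the "true" value in src1.)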
1784 | 9 | MachineInstr *Select = |
1785 | 9 | BuildMI(MBB, I, DL, get(SelOp), DstReg) |
1786 | 9 | .addReg(FalseReg) |
1787 | 9 | .addReg(TrueReg); |
1788 | 9 | |
1789 | 9 | preserveCondRegFlags(Select->getOperand(3), Cond[1]); |
1790 | 9 | return; |
1791 | 9 | } |
1792 | 7 | |
1793 | 7 | if (DstSize == 64 && Pred == SCC_TRUE) {
1794 | 1 | MachineInstr *Select = |
1795 | 1 | BuildMI(MBB, I, DL, get(AMDGPU::S_CSELECT_B64), DstReg) |
1796 | 1 | .addReg(FalseReg) |
1797 | 1 | .addReg(TrueReg); |
1798 | 1 | |
1799 | 1 | preserveCondRegFlags(Select->getOperand(3), Cond[1]); |
1800 | 1 | return; |
1801 | 1 | } |
1802 | 6 | |
1803 | 6 | static const int16_t Sub0_15[] = { |
1804 | 6 | AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3, |
1805 | 6 | AMDGPU::sub4, AMDGPU::sub5, AMDGPU::sub6, AMDGPU::sub7, |
1806 | 6 | AMDGPU::sub8, AMDGPU::sub9, AMDGPU::sub10, AMDGPU::sub11, |
1807 | 6 | AMDGPU::sub12, AMDGPU::sub13, AMDGPU::sub14, AMDGPU::sub15, |
1808 | 6 | }; |
1809 | 6 | |
1810 | 6 | static const int16_t Sub0_15_64[] = { |
1811 | 6 | AMDGPU::sub0_sub1, AMDGPU::sub2_sub3, |
1812 | 6 | AMDGPU::sub4_sub5, AMDGPU::sub6_sub7, |
1813 | 6 | AMDGPU::sub8_sub9, AMDGPU::sub10_sub11, |
1814 | 6 | AMDGPU::sub12_sub13, AMDGPU::sub14_sub15, |
1815 | 6 | }; |
1816 | 6 | |
1817 | 6 | unsigned SelOp = AMDGPU::V_CNDMASK_B32_e32; |
1818 | 6 | const TargetRegisterClass *EltRC = &AMDGPU::VGPR_32RegClass; |
1819 | 6 | const int16_t *SubIndices = Sub0_15; |
1820 | 6 | int NElts = DstSize / 32; |
1821 | 6 | |
1822 | 6 | // 64-bit select is only available for SALU.
1823 | 6 | if (Pred == SCC_TRUE) {
1824 | 2 | SelOp = AMDGPU::S_CSELECT_B64; |
1825 | 2 | EltRC = &AMDGPU::SGPR_64RegClass; |
1826 | 2 | SubIndices = Sub0_15_64; |
1827 | 2 | |
1828 | 2 | assert(NElts % 2 == 0); |
1829 | 2 | NElts /= 2; |
1830 | 2 | } |
1831 | 6 | |
1832 | 6 | MachineInstrBuilder MIB = BuildMI( |
1833 | 6 | MBB, I, DL, get(AMDGPU::REG_SEQUENCE), DstReg); |
1834 | 6 | |
1835 | 6 | I = MIB->getIterator(); |
1836 | 6 | |
1837 | 6 | SmallVector<unsigned, 8> Regs; |
1838 | 22 | for (int Idx = 0; Idx != NElts; ++Idx) {
1839 | 16 | unsigned DstElt = MRI.createVirtualRegister(EltRC); |
1840 | 16 | Regs.push_back(DstElt); |
1841 | 16 | |
1842 | 16 | unsigned SubIdx = SubIndices[Idx]; |
1843 | 16 | |
1844 | 16 | MachineInstr *Select = |
1845 | 16 | BuildMI(MBB, I, DL, get(SelOp), DstElt) |
1846 | 16 | .addReg(FalseReg, 0, SubIdx) |
1847 | 16 | .addReg(TrueReg, 0, SubIdx); |
1848 | 16 | preserveCondRegFlags(Select->getOperand(3), Cond[1]); |
1849 | 16 | |
1850 | 16 | MIB.addReg(DstElt) |
1851 | 16 | .addImm(SubIdx); |
1852 | 16 | } |
1853 | 16 | } |
1854 | | |
1855 | 760k | bool SIInstrInfo::isFoldableCopy(const MachineInstr &MI) const { |
1856 | 760k | switch (MI.getOpcode()) { |
1857 | 22.1k | case AMDGPU::V_MOV_B32_e32: |
1858 | 22.1k | case AMDGPU::V_MOV_B32_e64: |
1859 | 22.1k | case AMDGPU::V_MOV_B64_PSEUDO: { |
1860 | 22.1k | // If there are additional implicit register operands, this may be used for |
1861 | 22.1k | // register indexing so the source register operand isn't simply copied. |
1862 | 22.1k | unsigned NumOps = MI.getDesc().getNumOperands() + |
1863 | 22.1k | MI.getDesc().getNumImplicitUses(); |
1864 | 22.1k | |
1865 | 22.1k | return MI.getNumOperands() == NumOps; |
1866 | 22.1k | } |
1867 | 287k | case AMDGPU::S_MOV_B32: |
1868 | 287k | case AMDGPU::S_MOV_B64: |
1869 | 287k | case AMDGPU::COPY: |
1870 | 287k | return true; |
1871 | 451k | default: |
1872 | 451k | return false; |
1873 | 0 | } |
1874 | 0 | } |
1875 | | |
1876 | | unsigned SIInstrInfo::getAddressSpaceForPseudoSourceKind( |
1877 | 93.9k | PseudoSourceValue::PSVKind Kind) const { |
1878 | 93.9k | switch(Kind) { |
1879 | 17.2k | case PseudoSourceValue::Stack: |
1880 | 17.2k | case PseudoSourceValue::FixedStack: |
1881 | 17.2k | return AMDGPUASI.PRIVATE_ADDRESS; |
1882 | 76.6k | case PseudoSourceValue::ConstantPool: |
1883 | 76.6k | case PseudoSourceValue::GOT: |
1884 | 76.6k | case PseudoSourceValue::JumpTable: |
1885 | 76.6k | case PseudoSourceValue::GlobalValueCallEntry: |
1886 | 76.6k | case PseudoSourceValue::ExternalSymbolCallEntry: |
1887 | 76.6k | case PseudoSourceValue::TargetCustom: |
1888 | 76.6k | return AMDGPUASI.CONSTANT_ADDRESS; |
1889 | 0 | } |
1890 | 0 | return AMDGPUASI.FLAT_ADDRESS; |
1891 | 0 | } |
1892 | | |
1893 | 16 | static void removeModOperands(MachineInstr &MI) { |
1894 | 16 | unsigned Opc = MI.getOpcode(); |
1895 | 16 | int Src0ModIdx = AMDGPU::getNamedOperandIdx(Opc, |
1896 | 16 | AMDGPU::OpName::src0_modifiers); |
1897 | 16 | int Src1ModIdx = AMDGPU::getNamedOperandIdx(Opc, |
1898 | 16 | AMDGPU::OpName::src1_modifiers); |
1899 | 16 | int Src2ModIdx = AMDGPU::getNamedOperandIdx(Opc, |
1900 | 16 | AMDGPU::OpName::src2_modifiers); |
1901 | 16 | |
1902 | 16 | MI.RemoveOperand(Src2ModIdx); |
1903 | 16 | MI.RemoveOperand(Src1ModIdx); |
1904 | 16 | MI.RemoveOperand(Src0ModIdx); |
1905 | 16 | } |
1906 | | |
1907 | | bool SIInstrInfo::FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI, |
1908 | 47.1k | unsigned Reg, MachineRegisterInfo *MRI) const { |
1909 | 47.1k | if (!MRI->hasOneNonDBGUse(Reg)) |
1910 | 28.7k | return false; |
1911 | 18.4k | |
1912 | 18.4k | unsigned Opc = UseMI.getOpcode(); |
1913 | 18.4k | if (Opc == AMDGPU::COPY) {
1914 | 3.78k | bool isVGPRCopy = RI.isVGPR(*MRI, UseMI.getOperand(0).getReg()); |
1915 | 3.78k | switch (DefMI.getOpcode()) { |
1916 | 0 | default: |
1917 | 0 | return false; |
1918 | 265 | case AMDGPU::S_MOV_B64: |
1919 | 265 | // TODO: We could fold 64-bit immediates, but this gets complicated
1920 | 265 | // when there are sub-registers. |
1921 | 265 | return false; |
1922 | 3.78k | |
1923 | 3.51k | case AMDGPU::V_MOV_B32_e32: |
1924 | 3.51k | case AMDGPU::S_MOV_B32: |
1925 | 3.51k | break; |
1926 | 3.51k | } |
1927 | 3.51k | unsigned NewOpc = isVGPRCopy ? AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32;
1928 | 3.51k | const MachineOperand *ImmOp = getNamedOperand(DefMI, AMDGPU::OpName::src0); |
1929 | 3.51k | assert(ImmOp); |
1930 | 3.51k | // FIXME: We could handle FrameIndex values here. |
1931 | 3.51k | if (!ImmOp->isImm()) {
1932 | 17 | return false; |
1933 | 17 | } |
1934 | 3.50k | UseMI.setDesc(get(NewOpc)); |
1935 | 3.50k | UseMI.getOperand(1).ChangeToImmediate(ImmOp->getImm()); |
1936 | 3.50k | UseMI.addImplicitDefUseOperands(*UseMI.getParent()->getParent()); |
1937 | 3.50k | return true; |
1938 | 3.50k | } |
1939 | 14.6k | |
1940 | 14.6k | if (Opc == AMDGPU::V_MAD_F32 || Opc == AMDGPU::V_MAC_F32_e64 ||
1941 | 14.6k | Opc == AMDGPU::V_MAD_F16 || Opc == AMDGPU::V_MAC_F16_e64) {
1942 | 164 | // Don't fold if we are using source or output modifiers. The new VOP2 |
1943 | 164 | // instructions don't have them. |
1944 | 164 | if (hasAnyModifiersSet(UseMI)) |
1945 | 32 | return false; |
1946 | 132 | |
1947 | 132 | const MachineOperand &ImmOp = DefMI.getOperand(1); |
1948 | 132 | |
1949 | 132 | // If this is a free constant, there's no reason to do this. |
1950 | 132 | // TODO: We could fold this here instead of letting SIFoldOperands do it |
1951 | 132 | // later. |
1952 | 132 | MachineOperand *Src0 = getNamedOperand(UseMI, AMDGPU::OpName::src0); |
1953 | 132 | |
1954 | 132 | // Any src operand can be used for the legality check. |
1955 | 132 | if (isInlineConstant(UseMI, *Src0, ImmOp)) |
1956 | 87 | return false; |
1957 | 45 | |
1958 | 45 | bool IsF32 = Opc == AMDGPU::V_MAD_F32 || Opc == AMDGPU::V_MAC_F32_e64;
1959 | 45 | MachineOperand *Src1 = getNamedOperand(UseMI, AMDGPU::OpName::src1); |
1960 | 45 | MachineOperand *Src2 = getNamedOperand(UseMI, AMDGPU::OpName::src2); |
1961 | 45 | |
1962 | 45 | // Multiplied part is the constant: Use v_madmk_{f16, f32}. |
1963 | 45 | // We should only expect these to be on src0 due to canonicalizations. |
1964 | 45 | if (Src0->isReg() && Src0->getReg() == Reg) {
1965 | 0 | if (!Src1->isReg() || RI.isSGPRClass(MRI->getRegClass(Src1->getReg())))
1966 | 0 | return false;
1967 | 0 | 
1968 | 0 | if (!Src2->isReg() || RI.isSGPRClass(MRI->getRegClass(Src2->getReg())))
1969 | 0 | return false;
1970 | 0 | 
1971 | 0 | // We need to swap operands 0 and 1 since madmk constant is at operand 1.
1972 | 0 | 
1973 | 0 | const int64_t Imm = DefMI.getOperand(1).getImm();
1974 | 0 | 
1975 | 0 | // FIXME: This would be a lot easier if we could return a new instruction
1976 | 0 | // instead of having to modify in place.
1977 | 0 | 
1978 | 0 | // Remove these first since they are at the end.
1979 | 0 | UseMI.RemoveOperand(
1980 | 0 | AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::omod));
1981 | 0 | UseMI.RemoveOperand(
1982 | 0 | AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::clamp));
1983 | 0 | 
1984 | 0 | unsigned Src1Reg = Src1->getReg();
1985 | 0 | unsigned Src1SubReg = Src1->getSubReg();
1986 | 0 | Src0->setReg(Src1Reg);
1987 | 0 | Src0->setSubReg(Src1SubReg);
1988 | 0 | Src0->setIsKill(Src1->isKill());
1989 | 0 | 
1990 | 0 | if (Opc == AMDGPU::V_MAC_F32_e64 ||
1991 | 0 | Opc == AMDGPU::V_MAC_F16_e64)
1992 | 0 | UseMI.untieRegOperand(
1993 | 0 | AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2));
1994 | 0 | 
1995 | 0 | Src1->ChangeToImmediate(Imm);
1996 | 0 | 
1997 | 0 | removeModOperands(UseMI);
1998 | 0 | UseMI.setDesc(get(IsF32 ? AMDGPU::V_MADMK_F32 : AMDGPU::V_MADMK_F16));
1999 | 0 | 
2000 | 0 | bool DeleteDef = MRI->hasOneNonDBGUse(Reg);
2001 | 0 | if (DeleteDef)
2002 | 0 | DefMI.eraseFromParent();
2003 | 0 | 
2004 | 0 | return true;
2005 | 0 | }
2006 | 45 | |
2007 | 45 | // Added part is the constant: Use v_madak_{f16, f32}. |
2008 | 45 | if (Src2->isReg() && Src2->getReg() == Reg) {
2009 | 19 | // Not allowed to use constant bus for another operand. |
2010 | 19 | // We can however allow an inline immediate as src0. |
2011 | 19 | if (!Src0->isImm() && |
2012 | 19 | (Src0->isReg() && RI.isSGPRClass(MRI->getRegClass(Src0->getReg()))))
2013 | 2 | return false; |
2014 | 17 | |
2015 | 17 | if (!Src1->isReg() || RI.isSGPRClass(MRI->getRegClass(Src1->getReg())))
2016 | 1 | return false; |
2017 | 16 | |
2018 | 16 | const int64_t Imm = DefMI.getOperand(1).getImm(); |
2019 | 16 | |
2020 | 16 | // FIXME: This would be a lot easier if we could return a new instruction |
2021 | 16 | // instead of having to modify in place. |
2022 | 16 | |
2023 | 16 | // Remove these first since they are at the end. |
2024 | 16 | UseMI.RemoveOperand( |
2025 | 16 | AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::omod)); |
2026 | 16 | UseMI.RemoveOperand( |
2027 | 16 | AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::clamp)); |
2028 | 16 | |
2029 | 16 | if (Opc == AMDGPU::V_MAC_F32_e64 || |
2030 | 1 | Opc == AMDGPU::V_MAC_F16_e64) |
2031 | 16 | UseMI.untieRegOperand( |
2032 | 16 | AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2)); |
2033 | 16 | |
2034 | 16 | // ChangingToImmediate adds Src2 back to the instruction. |
2035 | 16 | Src2->ChangeToImmediate(Imm); |
2036 | 16 | |
2037 | 16 | // These come before src2. |
2038 | 16 | removeModOperands(UseMI); |
2039 | 16 | UseMI.setDesc(get(IsF32 ? AMDGPU::V_MADAK_F32 : AMDGPU::V_MADAK_F16));
2040 | 16 | |
2041 | 16 | bool DeleteDef = MRI->hasOneNonDBGUse(Reg); |
2042 | 16 | if (DeleteDef) |
2043 | 0 | DefMI.eraseFromParent(); |
2044 | 19 | |
2045 | 19 | return true; |
2046 | 19 | } |
2047 | 164 | } |
2048 | 14.4k | |
2049 | 14.4k | return false; |
2050 | 14.4k | } |
2051 | | |
2052 | | static bool offsetsDoNotOverlap(int WidthA, int OffsetA, |
2053 | 11.4k | int WidthB, int OffsetB) { |
2054 | 11.4k | int LowOffset = OffsetA < OffsetB ? OffsetA : OffsetB;
2055 | 11.4k | int HighOffset = OffsetA < OffsetB ? OffsetB : OffsetA;
2056 | 11.4k | int LowWidth = (LowOffset == OffsetA) ? WidthA : WidthB;
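| | // e.g. WidthA=4, OffsetA=0, WidthB=4, OffsetB=4 gives 0 + 4 <= 4, so
| | // the accesses are disjoint; with OffsetB=2 the check fails and they
| | // are conservatively treated as overlapping.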
2057 | 11.4k | return LowOffset + LowWidth <= HighOffset; |
2058 | 11.4k | } |
2059 | | |
2060 | | bool SIInstrInfo::checkInstOffsetsDoNotOverlap(MachineInstr &MIa, |
2061 | 835k | MachineInstr &MIb) const { |
2062 | 835k | unsigned BaseReg0, BaseReg1; |
2063 | 835k | int64_t Offset0, Offset1; |
2064 | 835k | |
2065 | 835k | if (getMemOpBaseRegImmOfs(MIa, BaseReg0, Offset0, &RI) && |
2066 | 835k | getMemOpBaseRegImmOfs(MIb, BaseReg1, Offset1, &RI)) {
2067 | 53.1k | |
2068 | 53.1k | if (!MIa.hasOneMemOperand() || !MIb.hasOneMemOperand()) {
2069 | 14.8k | // FIXME: Handle ds_read2 / ds_write2. |
2070 | 14.8k | return false; |
2071 | 14.8k | } |
2072 | 38.3k | unsigned Width0 = (*MIa.memoperands_begin())->getSize(); |
2073 | 38.3k | unsigned Width1 = (*MIb.memoperands_begin())->getSize(); |
2074 | 38.3k | if (BaseReg0 == BaseReg1 && |
2075 | 38.3k | offsetsDoNotOverlap(Width0, Offset0, Width1, Offset1)) {
2076 | 8.10k | return true; |
2077 | 8.10k | } |
2078 | 812k | } |
2079 | 812k | |
2080 | 812k | return false; |
2081 | 812k | } |
2082 | | |
2083 | | bool SIInstrInfo::areMemAccessesTriviallyDisjoint(MachineInstr &MIa, |
2084 | | MachineInstr &MIb, |
2085 | 910k | AliasAnalysis *AA) const { |
2086 | 910k | assert((MIa.mayLoad() || MIa.mayStore()) && |
2087 | 910k | "MIa must load from or modify a memory location"); |
2088 | 910k | assert((MIb.mayLoad() || MIb.mayStore()) && |
2089 | 910k | "MIb must load from or modify a memory location"); |
2090 | 910k | |
2091 | 910k | if (MIa.hasUnmodeledSideEffects() || MIb.hasUnmodeledSideEffects())
2092 | 0 | return false;
2093 | 910k | 
2094 | 910k | // XXX - Can we relax this between address spaces?
2095 | 910k | if (MIa.hasOrderedMemoryRef() || MIb.hasOrderedMemoryRef())
2096 | 8 | return false;
2097 | 910k | 
2098 | 910k | if (AA && MIa.hasOneMemOperand() && MIb.hasOneMemOperand()) {
2099 | 445 | const MachineMemOperand *MMOa = *MIa.memoperands_begin();
2100 | 445 | const MachineMemOperand *MMOb = *MIb.memoperands_begin();
2101 | 445 | if (MMOa->getValue() && MMOb->getValue()) {
2102 | 439 | MemoryLocation LocA(MMOa->getValue(), MMOa->getSize(), MMOa->getAAInfo()); |
2103 | 439 | MemoryLocation LocB(MMOb->getValue(), MMOb->getSize(), MMOb->getAAInfo()); |
2104 | 439 | if (!AA->alias(LocA, LocB)) |
2105 | 186 | return true; |
2106 | 910k | } |
2107 | 445 | } |
2108 | 910k | |
2109 | 910k | // TODO: Should we check the address space from the MachineMemOperand? That |
2110 | 910k | // would allow us to distinguish objects we know don't alias based on the |
2111 | 910k | // underlying address space, even if it was lowered to a different one, |
2112 | 910k | // e.g. private accesses lowered to use MUBUF instructions on a scratch |
2113 | 910k | // buffer. |
2114 | 910k | if (isDS(MIa)) {
2115 | 80.2k | if (isDS(MIb))
2116 | 33.7k | return checkInstOffsetsDoNotOverlap(MIa, MIb);
2117 | 46.4k | 
2118 | 46.4k | return !isFLAT(MIb) || isSegmentSpecificFLAT(MIb);
2119 | 80.2k | }
2120 | 829k | 
2121 | 829k | if (isMUBUF(MIa) || isMTBUF(MIa)) {
2122 | 788k | if (isMUBUF(MIb) || isMTBUF(MIb))
2123 | 775k | return checkInstOffsetsDoNotOverlap(MIa, MIb);
2124 | 13.2k | 
2125 | 13.2k | return !isFLAT(MIb) && !isSMRD(MIb);
2126 | 788k | }
2127 | 41.3k | 
2128 | 41.3k | if (isSMRD(MIa)) {
2129 | 1.33k | if (isSMRD(MIb))
2130 | 0 | return checkInstOffsetsDoNotOverlap(MIa, MIb);
2131 | 1.33k | 
2132 | 1.33k | return !isFLAT(MIb) && !isMUBUF(MIa) && !isMTBUF(MIa);
2133 | 1.33k | }
2134 | 39.9k | 
2135 | 39.9k | if (isFLAT(MIa)) {
2136 | 39.8k | if (isFLAT(MIb)) |
2137 | 26.5k | return checkInstOffsetsDoNotOverlap(MIa, MIb); |
2138 | 13.2k | |
2139 | 13.2k | return false; |
2140 | 13.2k | } |
2141 | 191 | |
2142 | 191 | return false; |
2143 | 191 | } |
2144 | | |
2145 | 445 | static int64_t getFoldableImm(const MachineOperand* MO) { |
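| | // Note: "no foldable immediate" is signalled by returning 0 (both
| | // `false` and AMDGPU::NoRegister are 0 here), which is why callers
| | // test the result with `if (auto Imm = getFoldableImm(...))`; an
| | // immediate that happens to be 0 is therefore never folded this way.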
2146 | 445 | if (!MO->isReg()) |
2147 | 0 | return false; |
2148 | 445 | const MachineFunction *MF = MO->getParent()->getParent()->getParent(); |
2149 | 445 | const MachineRegisterInfo &MRI = MF->getRegInfo(); |
2150 | 445 | auto Def = MRI.getUniqueVRegDef(MO->getReg()); |
2151 | 445 | if (Def && Def->getOpcode() == AMDGPU::V_MOV_B32_e32 &&
2152 | 21 | Def->getOperand(1).isImm()) |
2153 | 21 | return Def->getOperand(1).getImm(); |
2154 | 424 | return AMDGPU::NoRegister; |
2155 | 424 | } |
2156 | | |
2157 | | MachineInstr *SIInstrInfo::convertToThreeAddress(MachineFunction::iterator &MBB, |
2158 | | MachineInstr &MI, |
2159 | 173 | LiveVariables *LV) const { |
2160 | 173 | bool IsF16 = false; |
2161 | 173 | |
2162 | 173 | switch (MI.getOpcode()) { |
2163 | 1 | default: |
2164 | 1 | return nullptr; |
2165 | 0 | case AMDGPU::V_MAC_F16_e64: |
2166 | 0 | IsF16 = true; |
2167 | 0 | LLVM_FALLTHROUGH; |
2168 | 6 | case AMDGPU::V_MAC_F32_e64: |
2169 | 6 | break; |
2170 | 6 | case AMDGPU::V_MAC_F16_e32: |
2171 | 6 | IsF16 = true; |
2172 | 6 | LLVM_FALLTHROUGH; |
2173 | 166 | case AMDGPU::V_MAC_F32_e32: { |
2174 | 166 | int Src0Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), |
2175 | 166 | AMDGPU::OpName::src0); |
2176 | 166 | const MachineOperand *Src0 = &MI.getOperand(Src0Idx); |
2177 | 166 | if (!Src0->isReg() && 166 !Src0->isImm()4 ) |
2178 | 1 | return nullptr; |
2179 | 165 | |
2180 | 165 | if (165 Src0->isImm() && 165 !isInlineConstant(MI, Src0Idx, *Src0)3 ) |
2181 | 2 | return nullptr; |
2182 | 163 | |
2183 | 163 | break; |
2184 | 163 | } |
2185 | 169 | } |
2186 | 169 | |
2187 | 169 | const MachineOperand *Dst = getNamedOperand(MI, AMDGPU::OpName::vdst); |
2188 | 169 | const MachineOperand *Src0 = getNamedOperand(MI, AMDGPU::OpName::src0); |
2189 | 169 | const MachineOperand *Src0Mods = |
2190 | 169 | getNamedOperand(MI, AMDGPU::OpName::src0_modifiers); |
2191 | 169 | const MachineOperand *Src1 = getNamedOperand(MI, AMDGPU::OpName::src1); |
2192 | 169 | const MachineOperand *Src1Mods = |
2193 | 169 | getNamedOperand(MI, AMDGPU::OpName::src1_modifiers); |
2194 | 169 | const MachineOperand *Src2 = getNamedOperand(MI, AMDGPU::OpName::src2); |
2195 | 169 | const MachineOperand *Clamp = getNamedOperand(MI, AMDGPU::OpName::clamp); |
2196 | 169 | const MachineOperand *Omod = getNamedOperand(MI, AMDGPU::OpName::omod); |
2197 | 169 | |
2198 | 169 | if (!Src0Mods && !Src1Mods && !Clamp && !Omod &&
2199 | 169 | // If we have an SGPR input, we will violate the constant bus restriction.
2200 | 169 | (!Src0->isReg() || !RI.isSGPRReg(MBB->getParent()->getRegInfo(), Src0->getReg()))) {
2201 | 158 | if (auto Imm = getFoldableImm(Src2)) {
2202 | 10 | return BuildMI(*MBB, MI, MI.getDebugLoc(),
2203 | 10 | get(IsF16 ? AMDGPU::V_MADAK_F16 : AMDGPU::V_MADAK_F32))
2204 | 10 | .add(*Dst)
2205 | 10 | .add(*Src0)
2206 | 10 | .add(*Src1)
2207 | 10 | .addImm(Imm);
2208 | 10 | }
2209 | 148 | if (auto Imm = getFoldableImm(Src1)) {
2210 | 9 | return BuildMI(*MBB, MI, MI.getDebugLoc(),
2211 | 9 | get(IsF16 ? AMDGPU::V_MADMK_F16 : AMDGPU::V_MADMK_F32))
2212 | 9 | .add(*Dst)
2213 | 9 | .add(*Src0)
2214 | 9 | .addImm(Imm)
2215 | 9 | .add(*Src2);
2216 | 9 | }
2217 | 139 | if (auto Imm = getFoldableImm(Src0)) {
2218 | 2 | if (isOperandLegal(MI, AMDGPU::getNamedOperandIdx(AMDGPU::V_MADMK_F32,
2219 | 2 | AMDGPU::OpName::src0), Src1))
2220 | 2 | return BuildMI(*MBB, MI, MI.getDebugLoc(),
2221 | 2 | get(IsF16 ? AMDGPU::V_MADMK_F16 : AMDGPU::V_MADMK_F32))
2222 | 2 | .add(*Dst) |
2223 | 2 | .add(*Src1) |
2224 | 2 | .addImm(Imm) |
2225 | 2 | .add(*Src2); |
2226 | 148 | } |
2227 | 158 | } |
2228 | 148 | |
2229 | 148 | return BuildMI(*MBB, MI, MI.getDebugLoc(), |
2230 | 148 | get(IsF16 ? AMDGPU::V_MAD_F16 : AMDGPU::V_MAD_F32))
2231 | 148 | .add(*Dst)
2232 | 148 | .addImm(Src0Mods ? Src0Mods->getImm() : 0)
2233 | 148 | .add(*Src0)
2234 | 148 | .addImm(Src1Mods ? Src1Mods->getImm() : 0)
2235 | 148 | .add(*Src1)
2236 | 148 | .addImm(0) // Src mods
2237 | 148 | .add(*Src2)
2238 | 148 | .addImm(Clamp ? Clamp->getImm() : 0)
2239 | 148 | .addImm(Omod ? Omod->getImm() : 0);
2240 | 173 | } |
2241 | | |
2242 | | // It's not generally safe to move VALU instructions across these since it will |
2243 | | // start using the register as a base index rather than directly. |
2244 | | // XXX - Why isn't hasSideEffects sufficient for these? |
2245 | 427k | static bool changesVGPRIndexingMode(const MachineInstr &MI) { |
2246 | 427k | switch (MI.getOpcode()) { |
2247 | 232 | case AMDGPU::S_SET_GPR_IDX_ON: |
2248 | 232 | case AMDGPU::S_SET_GPR_IDX_MODE: |
2249 | 232 | case AMDGPU::S_SET_GPR_IDX_OFF: |
2250 | 232 | return true; |
2251 | 426k | default: |
2252 | 426k | return false; |
2253 | 0 | } |
2254 | 0 | } |
2255 | | |
2256 | | bool SIInstrInfo::isSchedulingBoundary(const MachineInstr &MI, |
2257 | | const MachineBasicBlock *MBB, |
2258 | 458k | const MachineFunction &MF) const { |
2259 | 458k | // XXX - Do we want the SP check in the base implementation? |
2260 | 458k | |
2261 | 458k | // Target-independent instructions do not have an implicit-use of EXEC, even |
2262 | 458k | // when they operate on VGPRs. Treating EXEC modifications as scheduling |
2263 | 458k | // boundaries prevents incorrect movements of such instructions. |
2264 | 458k | return TargetInstrInfo::isSchedulingBoundary(MI, MBB, MF) || |
2265 | 429k | MI.modifiesRegister(AMDGPU::EXEC, &RI) || |
2266 | 427k | MI.getOpcode() == AMDGPU::S_SETREG_IMM32_B32 || |
2267 | 427k | MI.getOpcode() == AMDGPU::S_SETREG_B32 || |
2268 | 427k | changesVGPRIndexingMode(MI); |
2269 | 458k | } |
2270 | | |
2271 | 5.18k | bool SIInstrInfo::isInlineConstant(const APInt &Imm) const { |
2272 | 5.18k | switch (Imm.getBitWidth()) { |
2273 | 0 | case 32: |
2274 | 0 | return AMDGPU::isInlinableLiteral32(Imm.getSExtValue(), |
2275 | 0 | ST.hasInv2PiInlineImm()); |
2276 | 5.09k | case 64: |
2277 | 5.09k | return AMDGPU::isInlinableLiteral64(Imm.getSExtValue(), |
2278 | 5.09k | ST.hasInv2PiInlineImm()); |
2279 | 86 | case 16: |
2280 | 86 | return ST.has16BitInsts() && |
2281 | 86 | AMDGPU::isInlinableLiteral16(Imm.getSExtValue(), |
2282 | 86 | ST.hasInv2PiInlineImm()); |
2283 | 0 | default: |
2284 | 0 | llvm_unreachable("invalid bitwidth"); |
2285 | 0 | } |
2286 | 0 | } |
2287 | | |
2288 | | bool SIInstrInfo::isInlineConstant(const MachineOperand &MO, |
2289 | 3.76M | uint8_t OperandType) const { |
2290 | 3.76M | if (!MO.isImm() || |
2291 | 3.76M | OperandType < AMDGPU::OPERAND_SRC_FIRST || |
2292 | 3.71M | OperandType > AMDGPU::OPERAND_SRC_LAST) |
2293 | 50.6k | return false; |
2294 | 3.71M | |
2295 | 3.71M | // MachineOperand provides no way to tell the true operand size, since it only |
2296 | 3.71M | // records a 64-bit value. We need to know the size to determine if a 32-bit |
2297 | 3.71M | // floating point immediate bit pattern is legal for an integer immediate. It |
2298 | 3.71M | // would be for any 32-bit integer operand, but would not be for a 64-bit one. |
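| | // For example, the bit pattern 0x3F800000 encodes 1.0f and is an
| | // inline constant for any 32-bit operand, but as a 64-bit immediate
| | // the same value matches no inline encoding and needs a literal.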
2299 | 3.71M | |
2300 | 3.71M | int64_t Imm = MO.getImm(); |
2301 | 3.71M | switch (OperandType) { |
2302 | 3.56M | case AMDGPU::OPERAND_REG_IMM_INT32: |
2303 | 3.56M | case AMDGPU::OPERAND_REG_IMM_FP32: |
2304 | 3.56M | case AMDGPU::OPERAND_REG_INLINE_C_INT32: |
2305 | 3.56M | case AMDGPU::OPERAND_REG_INLINE_C_FP32: { |
2306 | 3.56M | int32_t Trunc = static_cast<int32_t>(Imm); |
2307 | 3.56M | return Trunc == Imm && |
2308 | 3.56M | AMDGPU::isInlinableLiteral32(Trunc, ST.hasInv2PiInlineImm()); |
2309 | 3.56M | } |
2310 | 38.2k | case AMDGPU::OPERAND_REG_IMM_INT64: |
2311 | 38.2k | case AMDGPU::OPERAND_REG_IMM_FP64: |
2312 | 38.2k | case AMDGPU::OPERAND_REG_INLINE_C_INT64: |
2313 | 38.2k | case AMDGPU::OPERAND_REG_INLINE_C_FP64: |
2314 | 38.2k | return AMDGPU::isInlinableLiteral64(MO.getImm(), |
2315 | 38.2k | ST.hasInv2PiInlineImm()); |
2316 | 114k | case AMDGPU::OPERAND_REG_IMM_INT16: |
2317 | 114k | case AMDGPU::OPERAND_REG_IMM_FP16: |
2318 | 114k | case AMDGPU::OPERAND_REG_INLINE_C_INT16: |
2319 | 114k | case AMDGPU::OPERAND_REG_INLINE_C_FP16: { |
2320 | 114k | if (isInt<16>(Imm) || isUInt<16>(Imm)) {
2321 | 114k | // A few special case instructions have 16-bit operands on subtargets |
2322 | 114k | // where 16-bit instructions are not legal. |
2323 | 114k | // TODO: Do the 32-bit immediates work? We shouldn't really need to handle |
2324 | 114k | // constants in these cases |
2325 | 114k | int16_t Trunc = static_cast<int16_t>(Imm); |
2326 | 114k | return ST.has16BitInsts() && |
2327 | 114k | AMDGPU::isInlinableLiteral16(Trunc, ST.hasInv2PiInlineImm()); |
2328 | 114k | } |
2329 | 71 | |
2330 | 71 | return false; |
2331 | 71 | } |
2332 | 3.24k | case AMDGPU::OPERAND_REG_INLINE_C_V2INT16: |
2333 | 3.24k | case AMDGPU::OPERAND_REG_INLINE_C_V2FP16: { |
2334 | 3.24k | uint32_t Trunc = static_cast<uint32_t>(Imm); |
2335 | 3.24k | return AMDGPU::isInlinableLiteralV216(Trunc, ST.hasInv2PiInlineImm()); |
2336 | 3.24k | } |
2337 | 0 | default: |
2338 | 0 | llvm_unreachable("invalid bitwidth"); |
2339 | 0 | } |
2340 | 0 | } |
2341 | | |
2342 | | bool SIInstrInfo::isLiteralConstantLike(const MachineOperand &MO, |
2343 | 418k | const MCOperandInfo &OpInfo) const { |
2344 | 418k | switch (MO.getType()) { |
2345 | 292k | case MachineOperand::MO_Register: |
2346 | 292k | return false; |
2347 | 124k | case MachineOperand::MO_Immediate: |
2348 | 124k | return !isInlineConstant(MO, OpInfo); |
2349 | 1.94k | case MachineOperand::MO_FrameIndex: |
2350 | 1.94k | case MachineOperand::MO_MachineBasicBlock: |
2351 | 1.94k | case MachineOperand::MO_ExternalSymbol: |
2352 | 1.94k | case MachineOperand::MO_GlobalAddress: |
2353 | 1.94k | case MachineOperand::MO_MCSymbol: |
2354 | 1.94k | return true; |
2355 | 0 | default: |
2356 | 0 | llvm_unreachable("unexpected operand type"); |
2357 | 0 | } |
2358 | 0 | } |
2359 | | |
2360 | | static bool compareMachineOp(const MachineOperand &Op0, |
2361 | 15.3k | const MachineOperand &Op1) { |
2362 | 15.3k | if (Op0.getType() != Op1.getType()) |
2363 | 0 | return false; |
2364 | 15.3k | |
2365 | 15.3k | switch (Op0.getType()) { |
2366 | 15.3k | case MachineOperand::MO_Register: |
2367 | 15.3k | return Op0.getReg() == Op1.getReg(); |
2368 | 0 | case MachineOperand::MO_Immediate: |
2369 | 0 | return Op0.getImm() == Op1.getImm(); |
2370 | 0 | default: |
2371 | 0 | llvm_unreachable("Didn't expect to be comparing these operand types"); |
2372 | 0 | } |
2373 | 0 | } |
2374 | | |
2375 | | bool SIInstrInfo::isImmOperandLegal(const MachineInstr &MI, unsigned OpNo, |
2376 | 69.1k | const MachineOperand &MO) const { |
2377 | 69.1k | const MCOperandInfo &OpInfo = get(MI.getOpcode()).OpInfo[OpNo]; |
2378 | 69.1k | |
2379 | 69.1k | assert(MO.isImm() || MO.isTargetIndex() || MO.isFI()); |
2380 | 69.1k | |
2381 | 69.1k | if (OpInfo.OperandType == MCOI::OPERAND_IMMEDIATE) |
2382 | 0 | return true; |
2383 | 69.1k | |
2384 | 69.1k | if (OpInfo.RegClass < 0)
2385 | 0 | return false;
2386 | 69.1k | 
2387 | 69.1k | if (MO.isImm() && isInlineConstant(MO, OpInfo))
2388 | 45.6k | return RI.opCanUseInlineConstant(OpInfo.OperandType); |
2389 | 23.5k | |
2390 | 23.5k | return RI.opCanUseLiteralConstant(OpInfo.OperandType); |
2391 | 23.5k | } |
2392 | | |
2393 | 625k | bool SIInstrInfo::hasVALU32BitEncoding(unsigned Opcode) const { |
2394 | 625k | int Op32 = AMDGPU::getVOPe32(Opcode); |
2395 | 625k | if (Op32 == -1) |
2396 | 538k | return false; |
2397 | 87.6k | |
2398 | 87.6k | return pseudoToMCOpcode(Op32) != -1; |
2399 | 87.6k | } |
2400 | | |
2401 | 0 | bool SIInstrInfo::hasModifiers(unsigned Opcode) const { |
2402 | 0 | // The src0_modifier operand is present on all instructions |
2403 | 0 | // that have modifiers. |
2404 | 0 | 
2405 | 0 | return AMDGPU::getNamedOperandIdx(Opcode, |
2406 | 0 | AMDGPU::OpName::src0_modifiers) != -1; |
2407 | 0 | } |
2408 | | |
2409 | | bool SIInstrInfo::hasModifiersSet(const MachineInstr &MI, |
2410 | 168k | unsigned OpName) const { |
2411 | 168k | const MachineOperand *Mods = getNamedOperand(MI, OpName); |
2412 | 41.6k | return Mods && Mods->getImm(); |
2413 | 168k | } |
2414 | | |
2415 | 164 | bool SIInstrInfo::hasAnyModifiersSet(const MachineInstr &MI) const { |
2416 | 164 | return hasModifiersSet(MI, AMDGPU::OpName::src0_modifiers) || |
2417 | 156 | hasModifiersSet(MI, AMDGPU::OpName::src1_modifiers) || |
2418 | 155 | hasModifiersSet(MI, AMDGPU::OpName::src2_modifiers) || |
2419 | 136 | hasModifiersSet(MI, AMDGPU::OpName::clamp) || |
2420 | 134 | hasModifiersSet(MI, AMDGPU::OpName::omod); |
2421 | 164 | } |
2422 | | |
2423 | | bool SIInstrInfo::usesConstantBus(const MachineRegisterInfo &MRI, |
2424 | | const MachineOperand &MO, |
2425 | 6.61M | const MCOperandInfo &OpInfo) const { |
2426 | 6.61M | // Literal constants use the constant bus. |
2427 | 6.61M | //if (isLiteralConstantLike(MO, OpInfo)) |
2428 | 6.61M | // return true; |
2429 | 6.61M | if (MO.isImm()) |
2430 | 1.71M | return !isInlineConstant(MO, OpInfo); |
2431 | 4.90M | |
2432 | 4.90M | if (!MO.isReg())
2433 | 9.15k | return true; // Misc other operands like FrameIndex
2434 | 4.89M | 
2435 | 4.89M | if (!MO.isUse())
2436 | 116k | return false;
2437 | 4.77M | 
2438 | 4.77M | if (TargetRegisterInfo::isVirtualRegister(MO.getReg()))
2439 | 2.42M | return RI.isSGPRClass(MRI.getRegClass(MO.getReg()));
2440 | 2.35M | 
2441 | 2.35M | // FLAT_SCR is just an SGPR pair.
2442 | 2.35M | if (!MO.isImplicit() && (MO.getReg() == AMDGPU::FLAT_SCR))
2443 | 0 | return true;
2444 | 2.35M | 
2445 | 2.35M | // EXEC register uses the constant bus.
2446 | 2.35M | if (!MO.isImplicit() && MO.getReg() == AMDGPU::EXEC)
2447 | 26 | return true;
2448 | 2.35M | 
2449 | 2.35M | // SGPRs use the constant bus
2450 | 2.35M | return (MO.getReg() == AMDGPU::VCC || MO.getReg() == AMDGPU::M0 ||
2451 | 2.27M | (!MO.isImplicit() && |
2452 | 2.21M | (AMDGPU::SGPR_32RegClass.contains(MO.getReg()) || |
2453 | 2.27M | AMDGPU::SGPR_64RegClass.contains(MO.getReg())))); |
2454 | 6.61M | } |
2455 | | |
2456 | 3.28M | static unsigned findImplicitSGPRRead(const MachineInstr &MI) { |
2457 | 4.12M | for (const MachineOperand &MO : MI.implicit_operands()) { |
2458 | 4.12M | // We only care about reads. |
2459 | 4.12M | if (MO.isDef()) |
2460 | 479k | continue; |
2461 | 3.64M | |
2462 | 3.64M | switch (MO.getReg()) { |
2463 | 189k | case AMDGPU::VCC: |
2464 | 189k | case AMDGPU::M0: |
2465 | 189k | case AMDGPU::FLAT_SCR: |
2466 | 189k | return MO.getReg(); |
2467 | 189k | |
2468 | 3.45M | default: |
2469 | 3.45M | break; |
2470 | 3.09M | } |
2471 | 3.09M | } |
2472 | 3.09M | |
2473 | 3.09M | return AMDGPU::NoRegister; |
2474 | 3.09M | } |
2475 | | |
2476 | 9.03M | static bool shouldReadExec(const MachineInstr &MI) { |
2477 | 9.03M | if (SIInstrInfo::isVALU(MI)) {
2478 | 3.23M | switch (MI.getOpcode()) { |
2479 | 23.7k | case AMDGPU::V_READLANE_B32: |
2480 | 23.7k | case AMDGPU::V_READLANE_B32_si: |
2481 | 23.7k | case AMDGPU::V_READLANE_B32_vi: |
2482 | 23.7k | case AMDGPU::V_WRITELANE_B32: |
2483 | 23.7k | case AMDGPU::V_WRITELANE_B32_si: |
2484 | 23.7k | case AMDGPU::V_WRITELANE_B32_vi: |
2485 | 23.7k | return false; |
2486 | 3.20M | } |
2487 | 3.20M | |
2488 | 3.20M | return true; |
2489 | 3.20M | } |
2490 | 5.79M | |
2491 | 5.79M | if (SIInstrInfo::isGenericOpcode(MI.getOpcode()) ||
2492 | 5.79M | SIInstrInfo::isSALU(MI) || |
2493 | 2.98M | SIInstrInfo::isSMRD(MI)) |
2494 | 4.00M | return false; |
2495 | 1.79M | |
2496 | 1.79M | return true; |
2497 | 1.79M | } |
2498 | | |
2499 | | static bool isSubRegOf(const SIRegisterInfo &TRI, |
2500 | | const MachineOperand &SuperVec, |
2501 | 2.76k | const MachineOperand &SubReg) { |
2502 | 2.76k | if (TargetRegisterInfo::isPhysicalRegister(SubReg.getReg())) |
2503 | 1.73k | return TRI.isSubRegister(SuperVec.getReg(), SubReg.getReg()); |
2504 | 1.02k | |
2505 | 1.02k | return SubReg.getSubReg() != AMDGPU::NoSubRegister && |
2506 | 1.02k | SubReg.getReg() == SuperVec.getReg(); |
2507 | 2.76k | } |
2508 | | |
2509 | | bool SIInstrInfo::verifyInstruction(const MachineInstr &MI, |
2510 | 12.5M | StringRef &ErrInfo) const { |
2511 | 12.5M | uint16_t Opcode = MI.getOpcode(); |
2512 | 12.5M | if (SIInstrInfo::isGenericOpcode(MI.getOpcode())) |
2513 | 3.52M | return true; |
2514 | 9.03M | |
2515 | 9.03M | const MachineFunction *MF = MI.getParent()->getParent(); |
2516 | 9.03M | const MachineRegisterInfo &MRI = MF->getRegInfo(); |
2517 | 9.03M | |
2518 | 9.03M | int Src0Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0); |
2519 | 9.03M | int Src1Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src1); |
2520 | 9.03M | int Src2Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src2); |
2521 | 9.03M | |
2522 | 9.03M | // Make sure the number of operands is correct. |
2523 | 9.03M | const MCInstrDesc &Desc = get(Opcode); |
2524 | 9.03M | if (!Desc.isVariadic() && |
2525 | 9.03M | Desc.getNumOperands() != MI.getNumExplicitOperands()) {
2526 | 0 | ErrInfo = "Instruction has wrong number of operands."; |
2527 | 0 | return false; |
2528 | 0 | } |
2529 | 9.03M | |
2530 | 9.03M | if (MI.isInlineAsm()) {
2531 | 0 | // Verify register classes for inlineasm constraints. |
2532 | 0 | for (unsigned I = InlineAsm::MIOp_FirstOperand, E = MI.getNumOperands(); |
2533 | 0 | I != E; ++I) {
2534 | 0 | const TargetRegisterClass *RC = MI.getRegClassConstraint(I, this, &RI); |
2535 | 0 | if (!RC) |
2536 | 0 | continue; |
2537 | 0 |
|
2538 | 0 | const MachineOperand &Op = MI.getOperand(I); |
2539 | 0 | if (!Op.isReg()) |
2540 | 0 | continue; |
2541 | 0 |
|
2542 | 0 | unsigned Reg = Op.getReg(); |
2543 | 0 | if (!TargetRegisterInfo::isVirtualRegister(Reg) && !RC->contains(Reg)) {
2544 | 0 | ErrInfo = "inlineasm operand has incorrect register class."; |
2545 | 0 | return false; |
2546 | 0 | } |
2547 | 0 | } |
2548 | 0 |
|
2549 | 0 | return true; |
2550 | 9.03M | } |
2551 | 9.03M | |
2552 | 9.03M | // Make sure the register classes are correct. |
2553 | 41.0M | for (int i = 0, e = Desc.getNumOperands(); i != e; ++i) {
2554 | 31.9M | if (MI.getOperand(i).isFPImm()) {
2555 | 0 | ErrInfo = "FPImm Machine Operands are not supported. ISel should bitcast " |
2556 | 0 | "all fp values to integers."; |
2557 | 0 | return false; |
2558 | 0 | } |
2559 | 31.9M | |
2560 | 31.9M | int RegClass = Desc.OpInfo[i].RegClass; |
2561 | 31.9M | |
2562 | 31.9M | switch (Desc.OpInfo[i].OperandType) { |
2563 | 12.7M | case MCOI::OPERAND_REGISTER: |
2564 | 12.7M | if (MI.getOperand(i).isImm()12.7M ) { |
2565 | 0 | ErrInfo = "Illegal immediate value for operand."; |
2566 | 0 | return false; |
2567 | 0 | } |
2568 | 12.7M | break; |
2569 | 3.94M | case AMDGPU::OPERAND_REG_IMM_INT32: |
2570 | 3.94M | case AMDGPU::OPERAND_REG_IMM_FP32: |
2571 | 3.94M | break; |
2572 | 4.21M | case AMDGPU::OPERAND_REG_INLINE_C_INT32: |
2573 | 4.21M | case AMDGPU::OPERAND_REG_INLINE_C_FP32: |
2574 | 4.21M | case AMDGPU::OPERAND_REG_INLINE_C_INT64: |
2575 | 4.21M | case AMDGPU::OPERAND_REG_INLINE_C_FP64: |
2576 | 4.21M | case AMDGPU::OPERAND_REG_INLINE_C_INT16: |
2577 | 4.21M | case AMDGPU::OPERAND_REG_INLINE_C_FP16: { |
2578 | 4.21M | const MachineOperand &MO = MI.getOperand(i); |
2579 | 4.21M | if (!MO.isReg() && (!MO.isImm() || !isInlineConstant(MI, i))) {
2580 | 0 | ErrInfo = "Illegal immediate value for operand."; |
2581 | 0 | return false; |
2582 | 0 | } |
2583 | 4.21M | break; |
2584 | 4.21M | } |
2585 | 9.65M | case MCOI::OPERAND_IMMEDIATE: |
2586 | 9.65M | case AMDGPU::OPERAND_KIMM32: |
2587 | 9.65M | // Check if this operand is an immediate. |
2588 | 9.65M | // FrameIndex operands will be replaced by immediates, so they are |
2589 | 9.65M | // allowed. |
2590 | 9.65M | if (!MI.getOperand(i).isImm() && !MI.getOperand(i).isFI()) {
2591 | 0 | ErrInfo = "Expected immediate, but got non-immediate"; |
2592 | 0 | return false; |
2593 | 0 | } |
2594 | 9.65M | LLVM_FALLTHROUGH;
2595 | 11.0M | default: |
2596 | 11.0M | continue; |
2597 | 20.9M | } |
2598 | 20.9M | |
2599 | 20.9M | if (!MI.getOperand(i).isReg())
2600 | 4.05M | continue; |
2601 | 16.8M | |
2602 | 16.8M | if (RegClass != -1) {
2603 | 16.8M | unsigned Reg = MI.getOperand(i).getReg(); |
2604 | 16.8M | if (Reg == AMDGPU::NoRegister || |
2605 | 16.8M | TargetRegisterInfo::isVirtualRegister(Reg)) |
2606 | 7.92M | continue; |
2607 | 8.94M | |
2608 | 8.94M | const TargetRegisterClass *RC = RI.getRegClass(RegClass); |
2609 | 8.94M | if (!RC->contains(Reg)) {
2610 | 0 | ErrInfo = "Operand has incorrect register class."; |
2611 | 0 | return false; |
2612 | 0 | } |
2613 | 16.8M | } |
2614 | 31.9M | } |
2615 | 9.03M | |
2616 | 9.03M | // Verify SDWA |
2617 | 9.03M | if (isSDWA(MI)) {
2618 | 31.8k | if (!ST.hasSDWA()) {
2619 | 0 | ErrInfo = "SDWA is not supported on this target"; |
2620 | 0 | return false; |
2621 | 0 | } |
2622 | 31.8k | |
2623 | 31.8k | int DstIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vdst); |
2624 | 31.8k | |
2625 | 31.8k | const int OpIndices[] = { DstIdx, Src0Idx, Src1Idx, Src2Idx }; |
2626 | 31.8k | |
2627 | 127k | for (int OpIdx: OpIndices) { |
2628 | 127k | if (OpIdx == -1) |
2629 | 35.0k | continue; |
2630 | 92.3k | const MachineOperand &MO = MI.getOperand(OpIdx); |
2631 | 92.3k | |
2632 | 92.3k | if (!ST.hasSDWAScalar()92.3k ) { |
2633 | 87.2k | // Only VGPRs on VI |
2634 | 87.2k | if (!MO.isReg() || 87.2k !RI.hasVGPRs(RI.getRegClassForReg(MRI, MO.getReg()))87.2k ) { |
2635 | 0 | ErrInfo = "Only VGPRs allowed as operands in SDWA instructions on VI"; |
2636 | 0 | return false; |
2637 | 0 | } |
2638 | 5.14k | } else { |
2639 | 5.14k | // No immediates on GFX9 |
2640 | 5.14k | if (!MO.isReg()5.14k ) { |
2641 | 0 | ErrInfo = "Only reg allowed as operands in SDWA instructions on GFX9"; |
2642 | 0 | return false; |
2643 | 0 | } |
2644 | 31.8k | } |
2645 | 127k | } |
2646 | 31.8k | |
2647 | 31.8k | if (31.8k !ST.hasSDWAOmod()31.8k ) { |
2648 | 30.0k | // No omod allowed on VI |
2649 | 30.0k | const MachineOperand *OMod = getNamedOperand(MI, AMDGPU::OpName::omod); |
2650 | 30.0k | if (OMod != nullptr && |
2651 | 30.0k | (!OMod->isImm() || 6.19k OMod->getImm() != 06.19k )) { |
2652 | 0 | ErrInfo = "OMod not allowed in SDWA instructions on VI"; |
2653 | 0 | return false; |
2654 | 0 | } |
2655 | 31.8k | } |
2656 | 31.8k | |
2657 | 31.8k | uint16_t BasicOpcode = AMDGPU::getBasicFromSDWAOp(Opcode); |
2658 | 31.8k | if (isVOPC(BasicOpcode)31.8k ) { |
2659 | 58 | if (!ST.hasSDWASdst() && 58 DstIdx != -113 ) { |
2660 | 0 | // Only vcc allowed as dst on VI for VOPC |
2661 | 0 | const MachineOperand &Dst = MI.getOperand(DstIdx); |
2662 | 0 | if (!Dst.isReg() || 0 Dst.getReg() != AMDGPU::VCC0 ) { |
2663 | 0 | ErrInfo = "Only VCC allowed as dst in SDWA instructions on VI"; |
2664 | 0 | return false; |
2665 | 0 | } |
2666 | 58 | } else if (58 !ST.hasSDWAOutModsVOPC()58 ) { |
2667 | 45 | // No clamp allowed on GFX9 for VOPC |
2668 | 45 | const MachineOperand *Clamp = getNamedOperand(MI, AMDGPU::OpName::clamp); |
2669 | 45 | if (Clamp && 45 (!Clamp->isImm() || 45 Clamp->getImm() != 045 )) { |
2670 | 0 | ErrInfo = "Clamp not allowed in VOPC SDWA instructions on GFX9"; |
2671 | 0 | return false; |
2672 | 0 | } |
2673 | 45 | |
2674 | 45 | // No omod allowed on GFX9 for VOPC |
2675 | 45 | const MachineOperand *OMod = getNamedOperand(MI, AMDGPU::OpName::omod); |
2676 | 45 | if (OMod && 45 (!OMod->isImm() || 0 OMod->getImm() != 00 )) { |
2677 | 0 | ErrInfo = "OMod not allowed in VOPC SDWA instructions on GFX9"; |
2678 | 0 | return false; |
2679 | 0 | } |
2680 | 9.03M | } |
2681 | 58 | } |
2682 | 31.8k | } |
2683 | 9.03M | |
2684 | 9.03M | // Verify VOP* |
2685 | 9.03M | if (9.03M isVOP1(MI) || 9.03M isVOP2(MI)8.19M || isVOP3(MI)7.05M || isVOPC(MI)5.86M || isSDWA(MI)5.84M ) { |
2686 | 3.21M | // Only look at the true operands. Only a real operand can use the constant |
2687 | 3.21M | // bus, and we don't want to check pseudo-operands like the source modifier |
2688 | 3.21M | // flags. |
2689 | 3.21M | const int OpIndices[] = { Src0Idx, Src1Idx, Src2Idx }; |
2690 | 3.21M | |
2691 | 3.21M | unsigned ConstantBusCount = 0; |
2692 | 3.21M | |
2693 | 3.21M | if (AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::imm) != -1) |
2694 | 979 | ++ConstantBusCount; |
2695 | 3.21M | |
2696 | 3.21M | unsigned SGPRUsed = findImplicitSGPRRead(MI); |
2697 | 3.21M | if (SGPRUsed != AMDGPU::NoRegister) |
2698 | 184k | ++ConstantBusCount; |
2699 | 3.21M | |
2700 | 8.75M | for (int OpIdx : OpIndices) { |
2701 | 8.75M | if (OpIdx == -1) |
2702 | 2.64M | break; |
2703 | 6.10M | const MachineOperand &MO = MI.getOperand(OpIdx); |
2704 | 6.10M | if (usesConstantBus(MRI, MO, MI.getDesc().OpInfo[OpIdx])6.10M ) { |
2705 | 1.25M | if (MO.isReg()1.25M ) { |
2706 | 1.11M | if (MO.getReg() != SGPRUsed) |
2707 | 1.10M | ++ConstantBusCount; |
2708 | 1.11M | SGPRUsed = MO.getReg(); |
2709 | 1.25M | } else { |
2710 | 137k | ++ConstantBusCount; |
2711 | 137k | } |
2712 | 1.25M | } |
2713 | 8.75M | } |
2714 | 3.21M | if (ConstantBusCount > 13.21M ) { |
2715 | 0 | ErrInfo = "VOP* instruction uses the constant bus more than once"; |
2716 | 0 | return false; |
2717 | 0 | } |
2718 | 9.03M | } |
2719 | 9.03M | |
2720 | 9.03M | // Verify misc. restrictions on specific instructions. |
2721 | 9.03M | if (9.03M Desc.getOpcode() == AMDGPU::V_DIV_SCALE_F32 || |
2722 | 9.03M | Desc.getOpcode() == AMDGPU::V_DIV_SCALE_F649.02M ) { |
2723 | 10.7k | const MachineOperand &Src0 = MI.getOperand(Src0Idx); |
2724 | 10.7k | const MachineOperand &Src1 = MI.getOperand(Src1Idx); |
2725 | 10.7k | const MachineOperand &Src2 = MI.getOperand(Src2Idx); |
2726 | 10.7k | if (Src0.isReg() && 10.7k Src1.isReg()10.4k && Src2.isReg()10.4k ) { |
2727 | 10.2k | if (!compareMachineOp(Src0, Src1) && |
2728 | 10.2k | !compareMachineOp(Src0, Src2)5.04k ) { |
2729 | 0 | ErrInfo = "v_div_scale_{f32|f64} require src0 = src1 or src2"; |
2730 | 0 | return false; |
2731 | 0 | } |
2732 | 9.03M | } |
2733 | 10.7k | } |
2734 | 9.03M | |
2735 | 9.03M | if (9.03M isSOPK(MI)9.03M ) { |
2736 | 10.2k | int64_t Imm = getNamedOperand(MI, AMDGPU::OpName::simm16)->getImm(); |
2737 | 10.2k | if (sopkIsZext(MI)10.2k ) { |
2738 | 672 | if (!isUInt<16>(Imm)672 ) { |
2739 | 0 | ErrInfo = "invalid immediate for SOPK instruction"; |
2740 | 0 | return false; |
2741 | 0 | } |
2742 | 9.59k | } else { |
2743 | 9.59k | if (!isInt<16>(Imm)9.59k ) { |
2744 | 0 | ErrInfo = "invalid immediate for SOPK instruction"; |
2745 | 0 | return false; |
2746 | 0 | } |
2747 | 9.03M | } |
2748 | 10.2k | } |
2749 | 9.03M | |
2750 | 9.03M | if (9.03M Desc.getOpcode() == AMDGPU::V_MOVRELS_B32_e32 || |
2751 | 9.02M | Desc.getOpcode() == AMDGPU::V_MOVRELS_B32_e64 || |
2752 | 9.02M | Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e32 || |
2753 | 9.03M | Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e649.02M ) { |
2754 | 2.76k | const bool IsDst = Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e32 || |
2755 | 2.06k | Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e64; |
2756 | 2.76k | |
2757 | 2.76k | const unsigned StaticNumOps = Desc.getNumOperands() + |
2758 | 2.76k | Desc.getNumImplicitUses(); |
2759 | 2.76k | const unsigned NumImplicitOps = IsDst ? 2705 : 12.06k ; |
2760 | 2.76k | |
2761 | 2.76k | // Allow additional implicit operands. This allows a fixup done by the post |
2762 | 2.76k | // RA scheduler where the main implicit operand is killed and implicit-defs |
2763 | 2.76k | // are added for sub-registers that remain live after this instruction. |
2764 | 2.76k | if (MI.getNumOperands() < StaticNumOps + NumImplicitOps2.76k ) { |
2765 | 0 | ErrInfo = "missing implicit register operands"; |
2766 | 0 | return false; |
2767 | 0 | } |
2768 | 2.76k | |
2769 | 2.76k | const MachineOperand *Dst = getNamedOperand(MI, AMDGPU::OpName::vdst); |
2770 | 2.76k | if (IsDst2.76k ) { |
2771 | 705 | if (!Dst->isUse()705 ) { |
2772 | 0 | ErrInfo = "v_movreld_b32 vdst should be a use operand"; |
2773 | 0 | return false; |
2774 | 0 | } |
2775 | 705 | |
2776 | 705 | unsigned UseOpIdx; |
2777 | 705 | if (!MI.isRegTiedToUseOperand(StaticNumOps, &UseOpIdx) || |
2778 | 705 | UseOpIdx != StaticNumOps + 1705 ) { |
2779 | 0 | ErrInfo = "movrel implicit operands should be tied"; |
2780 | 0 | return false; |
2781 | 0 | } |
2782 | 2.76k | } |
2783 | 2.76k | |
2784 | 2.76k | const MachineOperand &Src0 = MI.getOperand(Src0Idx); |
2785 | 2.76k | const MachineOperand &ImpUse |
2786 | 2.76k | = MI.getOperand(StaticNumOps + NumImplicitOps - 1); |
2787 | 2.76k | if (!ImpUse.isReg() || 2.76k !ImpUse.isUse()2.76k || |
2788 | 2.76k | !isSubRegOf(RI, ImpUse, IsDst ? 2.76k *Dst705 : Src02.06k )) { |
2789 | 0 | ErrInfo = "src0 should be subreg of implicit vector use"; |
2790 | 0 | return false; |
2791 | 0 | } |
2792 | 9.03M | } |
2793 | 9.03M | |
2794 | 9.03M | // Make sure we aren't losing exec uses in the td files. This mostly requires |
2795 | 9.03M | // being careful when using let Uses to try to add other use registers. |
2796 | 9.03M | if (9.03M shouldReadExec(MI)9.03M ) { |
2797 | 5.00M | if (!MI.hasRegisterImplicitUseOperand(AMDGPU::EXEC)5.00M ) { |
2798 | 0 | ErrInfo = "VALU instruction does not implicitly read exec mask"; |
2799 | 0 | return false; |
2800 | 0 | } |
2801 | 9.03M | } |
2802 | 9.03M | |
2803 | 9.03M | if (9.03M isSMRD(MI)9.03M ) { |
2804 | 1.19M | if (MI.mayStore()1.19M ) { |
2805 | 834 | // The register offset form of scalar stores may only use m0 as the |
2806 | 834 | // soffset register. |
2807 | 834 | const MachineOperand *Soff = getNamedOperand(MI, AMDGPU::OpName::soff); |
2808 | 834 | if (Soff && 834 Soff->getReg() != AMDGPU::M0162 ) { |
2809 | 0 | ErrInfo = "scalar stores must use m0 as offset register"; |
2810 | 0 | return false; |
2811 | 0 | } |
2812 | 9.03M | } |
2813 | 1.19M | } |
2814 | 9.03M | |
2815 | 9.03M | if (9.03M isFLAT(MI) && 9.03M !MF->getSubtarget<SISubtarget>().hasFlatInstOffsets()352k ) { |
2816 | 277k | const MachineOperand *Offset = getNamedOperand(MI, AMDGPU::OpName::offset); |
2817 | 277k | if (Offset->getImm() != 0277k ) { |
2818 | 0 | ErrInfo = "subtarget does not support offsets in flat instructions"; |
2819 | 0 | return false; |
2820 | 0 | } |
2821 | 9.03M | } |
2822 | 9.03M | |
2823 | 9.03M | return true; |
2824 | 9.03M | } |
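
Among its other checks, verifyInstruction above enforces the hardware rule that a VOP1/VOP2/VOP3/VOPC/SDWA instruction may read the constant bus at most once. Below is a minimal standalone model of that counting rule; the Operand struct and its flags are illustrative stand-ins rather than the LLVM API, and the implicit-SGPR and imm pseudo-operand pre-counts are skipped for brevity.

#include <cassert>
#include <vector>

// Illustrative stand-in for a source operand; not the LLVM MachineOperand.
struct Operand {
  bool IsReg;  // register operand?
  bool IsSGPR; // scalar register, i.e. read through the constant bus
  int Reg;     // register id, meaningful only when IsReg is true
};

// Counts constant-bus reads the way the verifier does: every non-inline
// literal counts, and SGPRs count once per distinct register (re-reading
// the same SGPR is free).
static bool fitsConstantBus(const std::vector<Operand> &Srcs) {
  unsigned ConstantBusCount = 0;
  int SGPRUsed = -1;
  for (const Operand &Op : Srcs) {
    if (Op.IsReg) {
      if (Op.IsSGPR && Op.Reg != SGPRUsed)
        ++ConstantBusCount;
      if (Op.IsSGPR)
        SGPRUsed = Op.Reg;
    } else {
      ++ConstantBusCount; // literal constant
    }
  }
  return ConstantBusCount <= 1;
}

int main() {
  // v_add_f32 v0, s0, s0 -> one SGPR read twice: legal.
  assert(fitsConstantBus({{true, true, 0}, {true, true, 0}}));
  // v_add_f32 v0, s0, s1 -> two distinct SGPRs: illegal.
  assert(!fitsConstantBus({{true, true, 0}, {true, true, 1}}));
  // v_add_f32 v0, 1000, v1 -> one literal plus a VGPR: legal.
  assert(fitsConstantBus({{false, false, 0}, {true, false, 2}}));
}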
2825 | | |
2826 | 107k | unsigned SIInstrInfo::getVALUOp(const MachineInstr &MI) { |
2827 | 107k | switch (MI.getOpcode()) { |
2828 | 1.87k | default: return AMDGPU::INSTRUCTION_LIST_END; |
2829 | 21.3k | case AMDGPU::REG_SEQUENCE: return AMDGPU::REG_SEQUENCE; |
2830 | 47.5k | case AMDGPU::COPY: return AMDGPU::COPY; |
2831 | 329 | case AMDGPU::PHI: return AMDGPU::PHI; |
2832 | 22 | case AMDGPU::INSERT_SUBREG: return AMDGPU::INSERT_SUBREG; |
2833 | 4 | case AMDGPU::WQM: return AMDGPU::WQM; |
2834 | 4 | case AMDGPU::WWM: return AMDGPU::WWM; |
2835 | 23 | case AMDGPU::S_MOV_B32: |
2836 | 23 | return MI.getOperand(1).isReg() ? |
2837 | 23 | AMDGPU::COPY23 : AMDGPU::V_MOV_B32_e320 ; |
2838 | 5.33k | case AMDGPU::S_ADD_I32: |
2839 | 5.33k | case AMDGPU::S_ADD_U32: return AMDGPU::V_ADD_I32_e32; |
2840 | 3.54k | case AMDGPU::S_ADDC_U32: return AMDGPU::V_ADDC_U32_e32; |
2841 | 2.17k | case AMDGPU::S_SUB_I32: |
2842 | 2.17k | case AMDGPU::S_SUB_U32: return AMDGPU::V_SUB_I32_e32; |
2843 | 1.24k | case AMDGPU::S_SUBB_U32: return AMDGPU::V_SUBB_U32_e32; |
2844 | 466 | case AMDGPU::S_MUL_I32: return AMDGPU::V_MUL_LO_I32; |
2845 | 3.26k | case AMDGPU::S_AND_B32: return AMDGPU::V_AND_B32_e64; |
2846 | 3.61k | case AMDGPU::S_OR_B32: return AMDGPU::V_OR_B32_e64; |
2847 | 213 | case AMDGPU::S_XOR_B32: return AMDGPU::V_XOR_B32_e64; |
2848 | 81 | case AMDGPU::S_MIN_I32: return AMDGPU::V_MIN_I32_e64; |
2849 | 50 | case AMDGPU::S_MIN_U32: return AMDGPU::V_MIN_U32_e64; |
2850 | 67 | case AMDGPU::S_MAX_I32: return AMDGPU::V_MAX_I32_e64; |
2851 | 16 | case AMDGPU::S_MAX_U32: return AMDGPU::V_MAX_U32_e64; |
2852 | 1.77k | case AMDGPU::S_ASHR_I32: return AMDGPU::V_ASHR_I32_e32; |
2853 | 213 | case AMDGPU::S_ASHR_I64: return AMDGPU::V_ASHR_I64; |
2854 | 4.44k | case AMDGPU::S_LSHL_B32: return AMDGPU::V_LSHL_B32_e32; |
2855 | 1.65k | case AMDGPU::S_LSHL_B64: return AMDGPU::V_LSHL_B64; |
2856 | 2.72k | case AMDGPU::S_LSHR_B32: return AMDGPU::V_LSHR_B32_e32; |
2857 | 138 | case AMDGPU::S_LSHR_B64: return AMDGPU::V_LSHR_B64; |
2858 | 357 | case AMDGPU::S_SEXT_I32_I8: return AMDGPU::V_BFE_I32; |
2859 | 720 | case AMDGPU::S_SEXT_I32_I16: return AMDGPU::V_BFE_I32; |
2860 | 2.00k | case AMDGPU::S_BFE_U32: return AMDGPU::V_BFE_U32; |
2861 | 1.46k | case AMDGPU::S_BFE_I32: return AMDGPU::V_BFE_I32; |
2862 | 0 | case AMDGPU::S_BFM_B32: return AMDGPU::V_BFM_B32_e64; |
2863 | 12 | case AMDGPU::S_BREV_B32: return AMDGPU::V_BFREV_B32_e32; |
2864 | 4 | case AMDGPU::S_NOT_B32: return AMDGPU::V_NOT_B32_e32; |
2865 | 16 | case AMDGPU::S_NOT_B64: return AMDGPU::V_NOT_B32_e32; |
2866 | 0 | case AMDGPU::S_CMP_EQ_I32: return AMDGPU::V_CMP_EQ_I32_e32; |
2867 | 0 | case AMDGPU::S_CMP_LG_I32: return AMDGPU::V_CMP_NE_I32_e32; |
2868 | 10 | case AMDGPU::S_CMP_GT_I32: return AMDGPU::V_CMP_GT_I32_e32; |
2869 | 4 | case AMDGPU::S_CMP_GE_I32: return AMDGPU::V_CMP_GE_I32_e32; |
2870 | 5 | case AMDGPU::S_CMP_LT_I32: return AMDGPU::V_CMP_LT_I32_e32; |
2871 | 0 | case AMDGPU::S_CMP_LE_I32: return AMDGPU::V_CMP_LE_I32_e32; |
2872 | 22 | case AMDGPU::S_CMP_EQ_U32: return AMDGPU::V_CMP_EQ_U32_e32; |
2873 | 29 | case AMDGPU::S_CMP_LG_U32: return AMDGPU::V_CMP_NE_U32_e32; |
2874 | 0 | case AMDGPU::S_CMP_GT_U32: return AMDGPU::V_CMP_GT_U32_e32; |
2875 | 3 | case AMDGPU::S_CMP_GE_U32: return AMDGPU::V_CMP_GE_U32_e32; |
2876 | 2 | case AMDGPU::S_CMP_LT_U32: return AMDGPU::V_CMP_LT_U32_e32; |
2877 | 0 | case AMDGPU::S_CMP_LE_U32: return AMDGPU::V_CMP_LE_U32_e32; |
2878 | 1 | case AMDGPU::S_CMP_EQ_U64: return AMDGPU::V_CMP_EQ_U64_e32; |
2879 | 1 | case AMDGPU::S_CMP_LG_U64: return AMDGPU::V_CMP_NE_U64_e32; |
2880 | 64 | case AMDGPU::S_BCNT1_I32_B32: return AMDGPU::V_BCNT_U32_B32_e64; |
2881 | 14 | case AMDGPU::S_FF1_I32_B32: return AMDGPU::V_FFBL_B32_e32; |
2882 | 158 | case AMDGPU::S_FLBIT_I32_B32: return AMDGPU::V_FFBH_U32_e32; |
2883 | 2 | case AMDGPU::S_FLBIT_I32: return AMDGPU::V_FFBH_I32_e64; |
2884 | 0 | case AMDGPU::S_CBRANCH_SCC0: return AMDGPU::S_CBRANCH_VCCZ; |
2885 | 77 | case AMDGPU::S_CBRANCH_SCC1: return AMDGPU::S_CBRANCH_VCCNZ; |
2886 | 0 | } |
2887 | 0 | } |
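
getVALUOp is a pure opcode table, with AMDGPU::INSTRUCTION_LIST_END serving as the "no VALU equivalent" sentinel. A hedged sketch of the typical consumption pattern, mirroring moveToVALU further down; it assumes the usual LLVM includes and an SIInstrInfo pointer in scope, and is not a compilable unit on its own.

// Retarget a scalar instruction to its vector form, or legalize in place.
void retargetToVALU(const SIInstrInfo *TII, MachineInstr &MI) {
  unsigned NewOpc = SIInstrInfo::getVALUOp(MI); // static table lookup
  if (NewOpc == AMDGPU::INSTRUCTION_LIST_END) {
    // No vector equivalent; keep the SALU opcode and only fix up operands.
    TII->legalizeOperands(MI);
    return;
  }
  MI.setDesc(TII->get(NewOpc)); // retarget to the VALU encoding
}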
2888 | | |
2889 | 0 | bool SIInstrInfo::isSALUOpSupportedOnVALU(const MachineInstr &MI) const { |
2890 | 0 | return getVALUOp(MI) != AMDGPU::INSTRUCTION_LIST_END; |
2891 | 0 | } |
2892 | | |
2893 | | const TargetRegisterClass *SIInstrInfo::getOpRegClass(const MachineInstr &MI, |
2894 | 1.73M | unsigned OpNo) const { |
2895 | 1.73M | const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo(); |
2896 | 1.73M | const MCInstrDesc &Desc = get(MI.getOpcode()); |
2897 | 1.73M | if (MI.isVariadic() || 1.73M OpNo >= Desc.getNumOperands()1.61M || |
2898 | 1.73M | Desc.OpInfo[OpNo].RegClass == -11.33M ) { |
2899 | 763k | unsigned Reg = MI.getOperand(OpNo).getReg(); |
2900 | 763k | |
2901 | 763k | if (TargetRegisterInfo::isVirtualRegister(Reg)) |
2902 | 452k | return MRI.getRegClass(Reg); |
2903 | 311k | return RI.getPhysRegClass(Reg); |
2904 | 311k | } |
2905 | 969k | |
2906 | 969k | unsigned RCID = Desc.OpInfo[OpNo].RegClass; |
2907 | 969k | return RI.getRegClass(RCID); |
2908 | 969k | } |
2909 | | |
2910 | 137k | bool SIInstrInfo::canReadVGPR(const MachineInstr &MI, unsigned OpNo) const { |
2911 | 137k | switch (MI.getOpcode()) { |
2912 | 88.5k | case AMDGPU::COPY: |
2913 | 88.5k | case AMDGPU::REG_SEQUENCE: |
2914 | 88.5k | case AMDGPU::PHI: |
2915 | 88.5k | case AMDGPU::INSERT_SUBREG: |
2916 | 88.5k | return RI.hasVGPRs(getOpRegClass(MI, 0)); |
2917 | 48.7k | default: |
2918 | 48.7k | return RI.hasVGPRs(getOpRegClass(MI, OpNo)); |
2919 | 0 | } |
2920 | 0 | } |
2921 | | |
2922 | 20.1k | void SIInstrInfo::legalizeOpWithMove(MachineInstr &MI, unsigned OpIdx) const { |
2923 | 20.1k | MachineBasicBlock::iterator I = MI; |
2924 | 20.1k | MachineBasicBlock *MBB = MI.getParent(); |
2925 | 20.1k | MachineOperand &MO = MI.getOperand(OpIdx); |
2926 | 20.1k | MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo(); |
2927 | 20.1k | unsigned RCID = get(MI.getOpcode()).OpInfo[OpIdx].RegClass; |
2928 | 20.1k | const TargetRegisterClass *RC = RI.getRegClass(RCID); |
2929 | 20.1k | unsigned Opcode = AMDGPU::V_MOV_B32_e32; |
2930 | 20.1k | if (MO.isReg()) |
2931 | 20.1k | Opcode = AMDGPU::COPY; |
2932 | 0 | else if (0 RI.isSGPRClass(RC)0 ) |
2933 | 0 | Opcode = AMDGPU::S_MOV_B32; |
2934 | 20.1k | |
2935 | 20.1k | const TargetRegisterClass *VRC = RI.getEquivalentVGPRClass(RC); |
2936 | 20.1k | if (RI.getCommonSubClass(&AMDGPU::VReg_64RegClass, VRC)) |
2937 | 2.18k | VRC = &AMDGPU::VReg_64RegClass; |
2938 | 20.1k | else |
2939 | 17.9k | VRC = &AMDGPU::VGPR_32RegClass; |
2940 | 20.1k | |
2941 | 20.1k | unsigned Reg = MRI.createVirtualRegister(VRC); |
2942 | 20.1k | DebugLoc DL = MBB->findDebugLoc(I); |
2943 | 20.1k | BuildMI(*MI.getParent(), I, DL, get(Opcode), Reg).add(MO); |
2944 | 20.1k | MO.ChangeToRegister(Reg, false); |
2945 | 20.1k | } |
2946 | | |
2947 | | unsigned SIInstrInfo::buildExtractSubReg(MachineBasicBlock::iterator MI, |
2948 | | MachineRegisterInfo &MRI, |
2949 | | MachineOperand &SuperReg, |
2950 | | const TargetRegisterClass *SuperRC, |
2951 | | unsigned SubIdx, |
2952 | | const TargetRegisterClass *SubRC) |
2953 | 1.96k | const { |
2954 | 1.96k | MachineBasicBlock *MBB = MI->getParent(); |
2955 | 1.96k | DebugLoc DL = MI->getDebugLoc(); |
2956 | 1.96k | unsigned SubReg = MRI.createVirtualRegister(SubRC); |
2957 | 1.96k | |
2958 | 1.96k | if (SuperReg.getSubReg() == AMDGPU::NoSubRegister1.96k ) { |
2959 | 1.96k | BuildMI(*MBB, MI, DL, get(TargetOpcode::COPY), SubReg) |
2960 | 1.96k | .addReg(SuperReg.getReg(), 0, SubIdx); |
2961 | 1.96k | return SubReg; |
2962 | 1.96k | } |
2963 | 0 |
|
2964 | 0 | // Just in case the super register is itself a sub-register, copy it to a new |
2965 | 0 | // value so we don't need to worry about merging its subreg index with the |
2966 | 0 | // SubIdx passed to this function. The register coalescer should be able to |
2967 | 0 | // eliminate this extra copy. |
2968 | 0 | unsigned NewSuperReg = MRI.createVirtualRegister(SuperRC); |
2969 | 0 |
|
2970 | 0 | BuildMI(*MBB, MI, DL, get(TargetOpcode::COPY), NewSuperReg) |
2971 | 0 | .addReg(SuperReg.getReg(), 0, SuperReg.getSubReg()); |
2972 | 0 |
|
2973 | 0 | BuildMI(*MBB, MI, DL, get(TargetOpcode::COPY), SubReg) |
2974 | 0 | .addReg(NewSuperReg, 0, SubIdx); |
2975 | 0 |
|
2976 | 0 | return SubReg; |
2977 | 0 | } |
2978 | | |
2979 | | MachineOperand SIInstrInfo::buildExtractSubRegOrImm( |
2980 | | MachineBasicBlock::iterator MII, |
2981 | | MachineRegisterInfo &MRI, |
2982 | | MachineOperand &Op, |
2983 | | const TargetRegisterClass *SuperRC, |
2984 | | unsigned SubIdx, |
2985 | 1.92k | const TargetRegisterClass *SubRC) const { |
2986 | 1.92k | if (Op.isImm()1.92k ) { |
2987 | 0 | if (SubIdx == AMDGPU::sub0) |
2988 | 0 | return MachineOperand::CreateImm(static_cast<int32_t>(Op.getImm())); |
2989 | 0 | if (0 SubIdx == AMDGPU::sub10 ) |
2990 | 0 | return MachineOperand::CreateImm(static_cast<int32_t>(Op.getImm() >> 32)); |
2991 | 0 |
|
2992 | 0 | llvm_unreachable0 ("Unhandled register index for immediate"); |
2993 | 0 | } |
2994 | 1.92k | |
2995 | 1.92k | unsigned SubReg = buildExtractSubReg(MII, MRI, Op, SuperRC, |
2996 | 1.92k | SubIdx, SubRC); |
2997 | 1.92k | return MachineOperand::CreateReg(SubReg, false); |
2998 | 1.92k | } |
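
A worked standalone example of the immediate path above: a 64-bit immediate splits into two 32-bit halves, with sub0 holding the low word and sub1 the high word.

#include <cassert>
#include <cstdint>

int main() {
  uint64_t Imm = 0x1122334455667788ULL;
  int32_t Sub0 = static_cast<int32_t>(Imm);       // AMDGPU::sub0
  int32_t Sub1 = static_cast<int32_t>(Imm >> 32); // AMDGPU::sub1
  assert(static_cast<uint32_t>(Sub0) == 0x55667788u);
  assert(static_cast<uint32_t>(Sub1) == 0x11223344u);
}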
2999 | | |
3000 | | // Change the order of operands from (0, 1, 2) to (0, 2, 1) |
3001 | 4.98k | void SIInstrInfo::swapOperands(MachineInstr &Inst) const { |
3002 | 4.98k | assert(Inst.getNumExplicitOperands() == 3); |
3003 | 4.98k | MachineOperand Op1 = Inst.getOperand(1); |
3004 | 4.98k | Inst.RemoveOperand(1); |
3005 | 4.98k | Inst.addOperand(Op1); |
3006 | 4.98k | } |
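
A standalone model of swapOperands: removing operand 1 and re-appending it turns (dst, src0, src1) into (dst, src1, src0), the operand order the *REV opcodes (V_LSHLREV_B32 and friends) expect. A plain std::vector stands in for the operand list.

#include <cassert>
#include <string>
#include <vector>

int main() {
  std::vector<std::string> Ops = {"dst", "src0", "src1"};
  std::string Op1 = Ops[1];
  Ops.erase(Ops.begin() + 1); // Inst.RemoveOperand(1)
  Ops.push_back(Op1);         // Inst.addOperand(Op1)
  assert((Ops == std::vector<std::string>{"dst", "src1", "src0"}));
}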
3007 | | |
3008 | | bool SIInstrInfo::isLegalRegOperand(const MachineRegisterInfo &MRI, |
3009 | | const MCOperandInfo &OpInfo, |
3010 | 322k | const MachineOperand &MO) const { |
3011 | 322k | if (!MO.isReg()) |
3012 | 464 | return false; |
3013 | 321k | |
3014 | 321k | unsigned Reg = MO.getReg(); |
3015 | 321k | const TargetRegisterClass *RC = |
3016 | 321k | TargetRegisterInfo::isVirtualRegister(Reg) ? |
3017 | 317k | MRI.getRegClass(Reg) : |
3018 | 4.37k | RI.getPhysRegClass(Reg); |
3019 | 322k | |
3020 | 322k | const SIRegisterInfo *TRI = |
3021 | 322k | static_cast<const SIRegisterInfo*>(MRI.getTargetRegisterInfo()); |
3022 | 322k | RC = TRI->getSubRegClass(RC, MO.getSubReg()); |
3023 | 322k | |
3024 | 322k | // In order to be legal, the common sub-class must be equal to the |
3025 | 322k | // class of the current operand. For example: |
3026 | 322k | // |
3027 | 322k | // v_mov_b32 s0 ; Operand defined as vsrc_b32 |
3028 | 322k | // ; RI.getCommonSubClass(s0,vsrc_b32) = sgpr ; LEGAL |
3029 | 322k | // |
3030 | 322k | // s_sendmsg 0, s0 ; Operand defined as m0reg |
3031 | 322k | // ; RI.getCommonSubClass(s0,m0reg) = m0reg ; NOT LEGAL |
3032 | 322k | |
3033 | 322k | return RI.getCommonSubClass(RC, RI.getRegClass(OpInfo.RegClass)) == RC; |
3034 | 322k | } |
3035 | | |
3036 | | bool SIInstrInfo::isLegalVSrcOperand(const MachineRegisterInfo &MRI, |
3037 | | const MCOperandInfo &OpInfo, |
3038 | 0 | const MachineOperand &MO) const { |
3039 | 0 | if (MO.isReg()) |
3040 | 0 | return isLegalRegOperand(MRI, OpInfo, MO); |
3041 | 0 |
|
3042 | 0 | // Handle non-register types that are treated like immediates. |
3043 | 0 | assert(MO.isImm() || MO.isTargetIndex() || MO.isFI()); |
3044 | 0 | return true; |
3045 | 0 | } |
3046 | | |
3047 | | bool SIInstrInfo::isOperandLegal(const MachineInstr &MI, unsigned OpIdx, |
3048 | 393k | const MachineOperand *MO) const { |
3049 | 393k | const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo(); |
3050 | 393k | const MCInstrDesc &InstDesc = MI.getDesc(); |
3051 | 393k | const MCOperandInfo &OpInfo = InstDesc.OpInfo[OpIdx]; |
3052 | 393k | const TargetRegisterClass *DefinedRC = |
3053 | 393k | OpInfo.RegClass != -1 ? RI.getRegClass(OpInfo.RegClass)393k : nullptr0 ; |
3054 | 393k | if (!MO) |
3055 | 0 | MO = &MI.getOperand(OpIdx); |
3056 | 393k | |
3057 | 393k | if (isVALU(MI) && 393k usesConstantBus(MRI, *MO, OpInfo)274k ) { |
3058 | 88.5k | |
3059 | 88.5k | RegSubRegPair SGPRUsed; |
3060 | 88.5k | if (MO->isReg()) |
3061 | 69.9k | SGPRUsed = RegSubRegPair(MO->getReg(), MO->getSubReg()); |
3062 | 88.5k | |
3063 | 459k | for (unsigned i = 0, e = MI.getNumOperands(); i != e459k ; ++i371k ) { |
3064 | 399k | if (i == OpIdx) |
3065 | 81.7k | continue; |
3066 | 317k | const MachineOperand &Op = MI.getOperand(i); |
3067 | 317k | if (Op.isReg()317k ) { |
3068 | 271k | if ((Op.getReg() != SGPRUsed.Reg || 271k Op.getSubReg() != SGPRUsed.SubReg37.6k ) && |
3069 | 271k | usesConstantBus(MRI, Op, InstDesc.OpInfo[i])234k ) { |
3070 | 28.3k | return false; |
3071 | 28.3k | } |
3072 | 46.0k | } else if (46.0k InstDesc.OpInfo[i].OperandType == AMDGPU::OPERAND_KIMM3246.0k ) { |
3073 | 23 | return false; |
3074 | 23 | } |
3075 | 399k | } |
3076 | 88.5k | } |
3077 | 393k | |
3078 | 365k | if (365k MO->isReg()365k ) { |
3079 | 296k | assert(DefinedRC); |
3080 | 296k | return isLegalRegOperand(MRI, OpInfo, *MO); |
3081 | 296k | } |
3082 | 68.8k | |
3083 | 68.8k | // Handle non-register types that are treated like immediates. |
3084 | 365k | assert(MO->isImm() || MO->isTargetIndex() || MO->isFI()); |
3085 | 68.8k | |
3086 | 68.8k | if (!DefinedRC68.8k ) { |
3087 | 0 | // This operand expects an immediate. |
3088 | 0 | return true; |
3089 | 0 | } |
3090 | 68.8k | |
3091 | 68.8k | return isImmOperandLegal(MI, OpIdx, *MO); |
3092 | 68.8k | } |
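
A hedged sketch of a typical isOperandLegal caller, e.g. an operand-folding pass probing whether an immediate may replace operand OpNo before rewriting it. tryFoldImm is an illustrative helper, not part of this file, and the snippet assumes the usual LLVM includes.

// Returns true and rewrites the operand only if the fold would be legal.
bool tryFoldImm(const SIInstrInfo *TII, MachineInstr &UseMI, unsigned OpNo,
                int64_t Imm) {
  MachineOperand FoldImm = MachineOperand::CreateImm(Imm);
  if (!TII->isOperandLegal(UseMI, OpNo, &FoldImm))
    return false;
  UseMI.getOperand(OpNo).ChangeToImmediate(Imm);
  return true;
}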
3093 | | |
3094 | | void SIInstrInfo::legalizeOperandsVOP2(MachineRegisterInfo &MRI, |
3095 | 17.2k | MachineInstr &MI) const { |
3096 | 17.2k | unsigned Opc = MI.getOpcode(); |
3097 | 17.2k | const MCInstrDesc &InstrDesc = get(Opc); |
3098 | 17.2k | |
3099 | 17.2k | int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1); |
3100 | 17.2k | MachineOperand &Src1 = MI.getOperand(Src1Idx); |
3102 | 17.2k | // If there is an implicit SGPR use, such as the VCC use of |
3103 | 17.2k | // v_addc_u32/v_subb_u32, it already takes the one permitted constant bus use. |
3104 | 17.2k | // |
3105 | 17.2k | // Note we do not need to worry about literal constants here. They are |
3106 | 17.2k | // disabled in the operand types of these instructions because they would |
3107 | 17.2k | // always violate the one constant bus use rule. |
3107 | 17.2k | // violate the one constant bus use rule. |
3108 | 17.2k | bool HasImplicitSGPR = findImplicitSGPRRead(MI) != AMDGPU::NoRegister; |
3109 | 17.2k | if (HasImplicitSGPR17.2k ) { |
3110 | 4.78k | int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0); |
3111 | 4.78k | MachineOperand &Src0 = MI.getOperand(Src0Idx); |
3112 | 4.78k | |
3113 | 4.78k | if (Src0.isReg() && 4.78k RI.isSGPRReg(MRI, Src0.getReg())4.78k ) |
3114 | 2.68k | legalizeOpWithMove(MI, Src0Idx); |
3115 | 4.78k | } |
3116 | 17.2k | |
3117 | 17.2k | // VOP2 instructions support all operand types in src0, so we don't need to |
3118 | 17.2k | // check its legality. If src1 is already legal, we don't need to do anything. |
3119 | 17.2k | if (isLegalRegOperand(MRI, InstrDesc.OpInfo[Src1Idx], Src1)) |
3120 | 6.39k | return; |
3121 | 10.8k | |
3122 | 10.8k | // Special case: V_READLANE_B32 accepts only immediate or SGPR operands for |
3123 | 10.8k | // lane select. Fix up using V_READFIRSTLANE, since we assume that the lane |
3124 | 10.8k | // select is uniform. |
3125 | 10.8k | if (10.8k Opc == AMDGPU::V_READLANE_B32 && 10.8k Src1.isReg()1 && |
3126 | 10.8k | RI.isVGPR(MRI, Src1.getReg())1 ) { |
3127 | 1 | unsigned Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); |
3128 | 1 | const DebugLoc &DL = MI.getDebugLoc(); |
3129 | 1 | BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg) |
3130 | 1 | .add(Src1); |
3131 | 1 | Src1.ChangeToRegister(Reg, false); |
3132 | 1 | return; |
3133 | 1 | } |
3134 | 10.8k | |
3135 | 10.8k | // We do not use commuteInstruction here because it is too aggressive and will |
3136 | 10.8k | // commute if it is possible. We only want to commute here if it improves |
3137 | 10.8k | // legality. This can be called a fairly large number of times so don't waste |
3138 | 10.8k | // compile time pointlessly swapping and checking legality again. |
3139 | 10.8k | if (10.8k HasImplicitSGPR || 10.8k !MI.isCommutable()8.72k ) { |
3140 | 2.10k | legalizeOpWithMove(MI, Src1Idx); |
3141 | 2.10k | return; |
3142 | 2.10k | } |
3143 | 8.72k | |
3144 | 8.72k | int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0); |
3145 | 8.72k | MachineOperand &Src0 = MI.getOperand(Src0Idx); |
3146 | 8.72k | |
3147 | 8.72k | // If src0 can be used as src1, commuting will make the operands legal. |
3148 | 8.72k | // Otherwise we have to give up and insert a move. |
3149 | 8.72k | // |
3150 | 8.72k | // TODO: Other immediate-like operand kinds could be commuted if there was a |
3151 | 8.72k | // MachineOperand::ChangeTo* for them. |
3152 | 8.72k | if ((!Src1.isImm() && 8.72k !Src1.isReg()8.25k ) || |
3153 | 8.72k | !isLegalRegOperand(MRI, InstrDesc.OpInfo[Src1Idx], Src0)8.72k ) { |
3154 | 0 | legalizeOpWithMove(MI, Src1Idx); |
3155 | 0 | return; |
3156 | 0 | } |
3157 | 8.72k | |
3158 | 8.72k | int CommutedOpc = commuteOpcode(MI); |
3159 | 8.72k | if (CommutedOpc == -18.72k ) { |
3160 | 0 | legalizeOpWithMove(MI, Src1Idx); |
3161 | 0 | return; |
3162 | 0 | } |
3163 | 8.72k | |
3164 | 8.72k | MI.setDesc(get(CommutedOpc)); |
3165 | 8.72k | |
3166 | 8.72k | unsigned Src0Reg = Src0.getReg(); |
3167 | 8.72k | unsigned Src0SubReg = Src0.getSubReg(); |
3168 | 8.72k | bool Src0Kill = Src0.isKill(); |
3169 | 8.72k | |
3170 | 8.72k | if (Src1.isImm()) |
3171 | 464 | Src0.ChangeToImmediate(Src1.getImm()); |
3172 | 8.25k | else if (8.25k Src1.isReg()8.25k ) { |
3173 | 8.25k | Src0.ChangeToRegister(Src1.getReg(), false, false, Src1.isKill()); |
3174 | 8.25k | Src0.setSubReg(Src1.getSubReg()); |
3175 | 8.25k | } else |
3176 | 0 | llvm_unreachable("Should only have register or immediate operands"); |
3177 | 8.72k | |
3178 | 8.72k | Src1.ChangeToRegister(Src0Reg, false, false, Src0Kill); |
3179 | 8.72k | Src1.setSubReg(Src0SubReg); |
3180 | 8.72k | } |
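
An illustrative before/after for the commute path above, shown as comments only; operand order is (vdst, src0, src1), register names are placeholders, and the implicit vcc def is omitted.

// before: V_SUB_I32_e32    %vgpr0, %vgpr1, %sgpr2  ; SGPR is illegal in src1
// after : V_SUBREV_I32_e32 %vgpr0, %sgpr2, %vgpr1  ; commuted opcode; the
//                                                  ; SGPR now sits in src0,
//                                                  ; which accepts it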
3181 | | |
3182 | | // Legalize VOP3 operands. Because all operand types are supported for any |
3183 | | // operand, and since literal constants are not allowed and should never be |
3184 | | // seen, we only need to worry about inserting copies if we use multiple SGPR |
3185 | | // operands. |
3186 | | void SIInstrInfo::legalizeOperandsVOP3(MachineRegisterInfo &MRI, |
3187 | 47.7k | MachineInstr &MI) const { |
3188 | 47.7k | unsigned Opc = MI.getOpcode(); |
3189 | 47.7k | |
3190 | 47.7k | int VOP3Idx[3] = { |
3191 | 47.7k | AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0), |
3192 | 47.7k | AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1), |
3193 | 47.7k | AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2) |
3194 | 47.7k | }; |
3195 | 47.7k | |
3196 | 47.7k | // Find the one SGPR operand we are allowed to use. |
3197 | 47.7k | unsigned SGPRReg = findUsedSGPR(MI, VOP3Idx); |
3198 | 47.7k | |
3199 | 154k | for (unsigned i = 0; i < 3154k ; ++i106k ) { |
3200 | 139k | int Idx = VOP3Idx[i]; |
3201 | 139k | if (Idx == -1) |
3202 | 32.8k | break; |
3203 | 106k | MachineOperand &MO = MI.getOperand(Idx); |
3204 | 106k | |
3205 | 106k | // We should never see a VOP3 instruction with an illegal immediate operand. |
3206 | 106k | if (!MO.isReg()) |
3207 | 10.8k | continue; |
3208 | 95.4k | |
3209 | 95.4k | if (95.4k !RI.isSGPRClass(MRI.getRegClass(MO.getReg()))95.4k ) |
3210 | 47.4k | continue; // VGPRs are legal |
3211 | 47.9k | |
3212 | 47.9k | if (47.9k SGPRReg == AMDGPU::NoRegister || 47.9k SGPRReg == MO.getReg()22.5k ) { |
3213 | 32.6k | SGPRReg = MO.getReg(); |
3214 | 32.6k | // We can use one SGPR in each VOP3 instruction. |
3215 | 32.6k | continue; |
3216 | 32.6k | } |
3217 | 15.3k | |
3218 | 15.3k | // If we make it this far, then the operand is not legal and we must |
3219 | 15.3k | // legalize it. |
3220 | 15.3k | legalizeOpWithMove(MI, Idx); |
3221 | 15.3k | } |
3222 | 47.7k | } |
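
A standalone model of the rule enforced above: across src0..src2, all SGPR operands must be the same register, and any second distinct SGPR must be moved to a VGPR. The model starts with no SGPR chosen, whereas the real code seeds SGPRReg from findUsedSGPR.

#include <cassert>
#include <vector>

int main() {
  std::vector<int> SrcSGPRs = {4, 4, 7}; // SGPR ids; -1 would mean "VGPR"
  int SGPRReg = -1;
  std::vector<bool> NeedsMove(SrcSGPRs.size(), false);
  for (size_t I = 0; I < SrcSGPRs.size(); ++I) {
    int Reg = SrcSGPRs[I];
    if (Reg == -1)
      continue;                    // VGPRs are always legal
    if (SGPRReg == -1 || SGPRReg == Reg) {
      SGPRReg = Reg;               // first (or repeated) SGPR is allowed
      continue;
    }
    NeedsMove[I] = true;           // would call legalizeOpWithMove(MI, Idx)
  }
  assert(!NeedsMove[0] && !NeedsMove[1] && NeedsMove[2]);
}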
3223 | | |
3224 | | unsigned SIInstrInfo::readlaneVGPRToSGPR(unsigned SrcReg, MachineInstr &UseMI, |
3225 | 21 | MachineRegisterInfo &MRI) const { |
3226 | 21 | const TargetRegisterClass *VRC = MRI.getRegClass(SrcReg); |
3227 | 21 | const TargetRegisterClass *SRC = RI.getEquivalentSGPRClass(VRC); |
3228 | 21 | unsigned DstReg = MRI.createVirtualRegister(SRC); |
3229 | 21 | unsigned SubRegs = RI.getRegSizeInBits(*VRC) / 32; |
3230 | 21 | |
3231 | 21 | SmallVector<unsigned, 8> SRegs; |
3232 | 87 | for (unsigned i = 0; i < SubRegs87 ; ++i66 ) { |
3233 | 66 | unsigned SGPR = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass); |
3234 | 66 | BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(), |
3235 | 66 | get(AMDGPU::V_READFIRSTLANE_B32), SGPR) |
3236 | 66 | .addReg(SrcReg, 0, RI.getSubRegFromChannel(i)); |
3237 | 66 | SRegs.push_back(SGPR); |
3238 | 66 | } |
3239 | 21 | |
3240 | 21 | MachineInstrBuilder MIB = |
3241 | 21 | BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(), |
3242 | 21 | get(AMDGPU::REG_SEQUENCE), DstReg); |
3243 | 87 | for (unsigned i = 0; i < SubRegs87 ; ++i66 ) { |
3244 | 66 | MIB.addReg(SRegs[i]); |
3245 | 66 | MIB.addImm(RI.getSubRegFromChannel(i)); |
3246 | 66 | } |
3247 | 21 | return DstReg; |
3248 | 21 | } |
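
An illustrative expansion for a 64-bit VGPR pointer (SubRegs = 64 / 32 = 2), shown as comments; register names are placeholders.

// %sgpr_a = V_READFIRSTLANE_B32 %vgpr_ptr:sub0
// %sgpr_b = V_READFIRSTLANE_B32 %vgpr_ptr:sub1
// %sreg64 = REG_SEQUENCE %sgpr_a, sub0, %sgpr_b, sub1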
3249 | | |
3250 | | void SIInstrInfo::legalizeOperandsSMRD(MachineRegisterInfo &MRI, |
3251 | 13 | MachineInstr &MI) const { |
3252 | 13 | |
3253 | 13 | // If the pointer is stored in VGPRs, then we need to move it to |
3254 | 13 | // SGPRs using v_readfirstlane. This is safe because we only select |
3255 | 13 | // loads with uniform pointers to SMRD instructions, so we know the |
3256 | 13 | // pointer value is uniform. |
3257 | 13 | MachineOperand *SBase = getNamedOperand(MI, AMDGPU::OpName::sbase); |
3258 | 13 | if (SBase && 13 !RI.isSGPRClass(MRI.getRegClass(SBase->getReg()))13 ) { |
3259 | 13 | unsigned SGPR = readlaneVGPRToSGPR(SBase->getReg(), MI, MRI); |
3260 | 13 | SBase->setReg(SGPR); |
3261 | 13 | } |
3262 | 13 | } |
3263 | | |
3264 | | void SIInstrInfo::legalizeGenericOperand(MachineBasicBlock &InsertMBB, |
3265 | | MachineBasicBlock::iterator I, |
3266 | | const TargetRegisterClass *DstRC, |
3267 | | MachineOperand &Op, |
3268 | | MachineRegisterInfo &MRI, |
3269 | 27.4k | const DebugLoc &DL) const { |
3270 | 27.4k | unsigned OpReg = Op.getReg(); |
3271 | 27.4k | unsigned OpSubReg = Op.getSubReg(); |
3272 | 27.4k | |
3273 | 27.4k | const TargetRegisterClass *OpRC = RI.getSubClassWithSubReg( |
3274 | 27.4k | RI.getRegClassForReg(MRI, OpReg), OpSubReg); |
3275 | 27.4k | |
3276 | 27.4k | // Check if operand is already the correct register class. |
3277 | 27.4k | if (DstRC == OpRC) |
3278 | 253 | return; |
3279 | 27.1k | |
3280 | 27.1k | unsigned DstReg = MRI.createVirtualRegister(DstRC); |
3281 | 27.1k | MachineInstr *Copy = |
3282 | 27.1k | BuildMI(InsertMBB, I, DL, get(AMDGPU::COPY), DstReg).add(Op); |
3283 | 27.1k | |
3284 | 27.1k | Op.setReg(DstReg); |
3285 | 27.1k | Op.setSubReg(0); |
3286 | 27.1k | |
3287 | 27.1k | MachineInstr *Def = MRI.getVRegDef(OpReg); |
3288 | 27.1k | if (!Def) |
3289 | 0 | return; |
3290 | 27.1k | |
3291 | 27.1k | // Try to eliminate the copy if it is copying an immediate value. |
3292 | 27.1k | if (27.1k Def->isMoveImmediate()27.1k ) |
3293 | 5.47k | FoldImmediate(*Copy, *Def, OpReg, &MRI); |
3294 | 27.4k | } |
3295 | | |
3296 | 90.6k | void SIInstrInfo::legalizeOperands(MachineInstr &MI) const { |
3297 | 90.6k | MachineFunction &MF = *MI.getParent()->getParent(); |
3298 | 90.6k | MachineRegisterInfo &MRI = MF.getRegInfo(); |
3299 | 90.6k | |
3300 | 90.6k | // Legalize VOP2 |
3301 | 90.6k | if (isVOP2(MI) || 90.6k isVOPC(MI)73.4k ) { |
3302 | 17.2k | legalizeOperandsVOP2(MRI, MI); |
3303 | 17.2k | return; |
3304 | 17.2k | } |
3305 | 73.4k | |
3306 | 73.4k | // Legalize VOP3 |
3307 | 73.4k | if (73.4k isVOP3(MI)73.4k ) { |
3308 | 19.4k | legalizeOperandsVOP3(MRI, MI); |
3309 | 19.4k | return; |
3310 | 19.4k | } |
3311 | 54.0k | |
3312 | 54.0k | // Legalize SMRD |
3313 | 54.0k | if (54.0k isSMRD(MI)54.0k ) { |
3314 | 13 | legalizeOperandsSMRD(MRI, MI); |
3315 | 13 | return; |
3316 | 13 | } |
3317 | 53.9k | |
3318 | 53.9k | // Legalize REG_SEQUENCE and PHI |
3320 | 53.9k | // The register class of the operands must be the same type as the register |
3320 | 53.9k | // class of the output. |
3321 | 53.9k | if (53.9k MI.getOpcode() == AMDGPU::PHI53.9k ) { |
3322 | 329 | const TargetRegisterClass *RC = nullptr, *SRC = nullptr, *VRC = nullptr; |
3323 | 993 | for (unsigned i = 1, e = MI.getNumOperands(); i != e993 ; i += 2664 ) { |
3324 | 664 | if (!MI.getOperand(i).isReg() || |
3325 | 664 | !TargetRegisterInfo::isVirtualRegister(MI.getOperand(i).getReg())) |
3326 | 0 | continue; |
3327 | 664 | const TargetRegisterClass *OpRC = |
3328 | 664 | MRI.getRegClass(MI.getOperand(i).getReg()); |
3329 | 664 | if (RI.hasVGPRs(OpRC)664 ) { |
3330 | 233 | VRC = OpRC; |
3331 | 664 | } else { |
3332 | 431 | SRC = OpRC; |
3333 | 431 | } |
3334 | 664 | } |
3335 | 329 | |
3336 | 329 | // If any of the operands are VGPR registers, then they must all be, |
3337 | 329 | // otherwise we will create illegal VGPR->SGPR copies when legalizing |
3338 | 329 | // them. |
3339 | 329 | if (VRC || 329 !RI.isSGPRClass(getOpRegClass(MI, 0))108 ) { |
3340 | 329 | if (!VRC329 ) { |
3341 | 108 | assert(SRC); |
3342 | 108 | VRC = RI.getEquivalentVGPRClass(SRC); |
3343 | 108 | } |
3344 | 329 | RC = VRC; |
3345 | 0 | } else { |
3346 | 0 | RC = SRC; |
3347 | 0 | } |
3348 | 329 | |
3349 | 329 | // Update all the operands so they have the same type. |
3350 | 993 | for (unsigned I = 1, E = MI.getNumOperands(); I != E993 ; I += 2664 ) { |
3351 | 664 | MachineOperand &Op = MI.getOperand(I); |
3352 | 664 | if (!Op.isReg() || 664 !TargetRegisterInfo::isVirtualRegister(Op.getReg())664 ) |
3353 | 0 | continue; |
3354 | 664 | |
3355 | 664 | // MI is a PHI instruction. |
3356 | 664 | MachineBasicBlock *InsertBB = MI.getOperand(I + 1).getMBB(); |
3357 | 664 | MachineBasicBlock::iterator Insert = InsertBB->getFirstTerminator(); |
3358 | 664 | |
3359 | 664 | // Avoid creating no-op copies with the same src and dst reg class. These |
3360 | 664 | // confuse some of the machine passes. |
3361 | 664 | legalizeGenericOperand(*InsertBB, Insert, RC, Op, MRI, MI.getDebugLoc()); |
3362 | 664 | } |
3363 | 329 | } |
3364 | 53.9k | |
3365 | 53.9k | // REG_SEQUENCE doesn't really require operand legalization, but if one has a |
3366 | 53.9k | // VGPR dest type and SGPR sources, insert copies so all operands are |
3367 | 53.9k | // VGPRs. This seems to help operand folding / the register coalescer. |
3368 | 53.9k | if (MI.getOpcode() == AMDGPU::REG_SEQUENCE53.9k ) { |
3369 | 21.3k | MachineBasicBlock *MBB = MI.getParent(); |
3370 | 21.3k | const TargetRegisterClass *DstRC = getOpRegClass(MI, 0); |
3371 | 21.3k | if (RI.hasVGPRs(DstRC)21.3k ) { |
3372 | 21.3k | // Update all the operands so they are VGPR register classes. These may |
3373 | 21.3k | // not be the same register class because REG_SEQUENCE supports mixing |
3374 | 21.3k | // subregister index types e.g. sub0_sub1 + sub2 + sub3 |
3375 | 73.8k | for (unsigned I = 1, E = MI.getNumOperands(); I != E73.8k ; I += 252.4k ) { |
3376 | 52.4k | MachineOperand &Op = MI.getOperand(I); |
3377 | 52.4k | if (!Op.isReg() || 52.4k !TargetRegisterInfo::isVirtualRegister(Op.getReg())52.4k ) |
3378 | 0 | continue; |
3379 | 52.4k | |
3380 | 52.4k | const TargetRegisterClass *OpRC = MRI.getRegClass(Op.getReg()); |
3381 | 52.4k | const TargetRegisterClass *VRC = RI.getEquivalentVGPRClass(OpRC); |
3382 | 52.4k | if (VRC == OpRC) |
3383 | 25.7k | continue; |
3384 | 26.7k | |
3385 | 26.7k | legalizeGenericOperand(*MBB, MI, VRC, Op, MRI, MI.getDebugLoc()); |
3386 | 26.7k | Op.setIsKill(); |
3387 | 26.7k | } |
3388 | 21.3k | } |
3389 | 21.3k | |
3390 | 21.3k | return; |
3391 | 21.3k | } |
3392 | 32.6k | |
3393 | 32.6k | // Legalize INSERT_SUBREG |
3394 | 32.6k | // src0 must have the same register class as dst |
3395 | 32.6k | if (32.6k MI.getOpcode() == AMDGPU::INSERT_SUBREG32.6k ) { |
3396 | 22 | unsigned Dst = MI.getOperand(0).getReg(); |
3397 | 22 | unsigned Src0 = MI.getOperand(1).getReg(); |
3398 | 22 | const TargetRegisterClass *DstRC = MRI.getRegClass(Dst); |
3399 | 22 | const TargetRegisterClass *Src0RC = MRI.getRegClass(Src0); |
3400 | 22 | if (DstRC != Src0RC22 ) { |
3401 | 2 | MachineBasicBlock *MBB = MI.getParent(); |
3402 | 2 | MachineOperand &Op = MI.getOperand(1); |
3403 | 2 | legalizeGenericOperand(*MBB, MI, DstRC, Op, MRI, MI.getDebugLoc()); |
3404 | 2 | } |
3405 | 22 | return; |
3406 | 22 | } |
3407 | 32.6k | |
3408 | 32.6k | // Legalize MIMG and MUBUF/MTBUF for shaders. |
3409 | 32.6k | // |
3410 | 32.6k | // Shaders only generate MUBUF/MTBUF instructions via intrinsics or via |
3411 | 32.6k | // scratch memory access. In both cases, the legalization never involves |
3412 | 32.6k | // conversion to the addr64 form. |
3413 | 32.6k | if (32.6k isMIMG(MI) || |
3414 | 32.6k | (AMDGPU::isShader(MF.getFunction()->getCallingConv()) && |
3415 | 32.6k | (isMUBUF(MI) || 208 isMTBUF(MI)204 ))) { |
3416 | 8 | MachineOperand *SRsrc = getNamedOperand(MI, AMDGPU::OpName::srsrc); |
3417 | 8 | if (SRsrc && 8 !RI.isSGPRClass(MRI.getRegClass(SRsrc->getReg()))8 ) { |
3418 | 6 | unsigned SGPR = readlaneVGPRToSGPR(SRsrc->getReg(), MI, MRI); |
3419 | 6 | SRsrc->setReg(SGPR); |
3420 | 6 | } |
3421 | 8 | |
3422 | 8 | MachineOperand *SSamp = getNamedOperand(MI, AMDGPU::OpName::ssamp); |
3423 | 8 | if (SSamp && 8 !RI.isSGPRClass(MRI.getRegClass(SSamp->getReg()))4 ) { |
3424 | 2 | unsigned SGPR = readlaneVGPRToSGPR(SSamp->getReg(), MI, MRI); |
3425 | 2 | SSamp->setReg(SGPR); |
3426 | 2 | } |
3427 | 8 | return; |
3428 | 8 | } |
3429 | 32.6k | |
3430 | 32.6k | // Legalize MUBUF* instructions by converting to addr64 form. |
3431 | 32.6k | // FIXME: If we start using the non-addr64 instructions for compute, we |
3432 | 32.6k | // may need to legalize them as above. This especially applies to the |
3433 | 32.6k | // buffer_load_format_* variants and variants with idxen (or bothen). |
3434 | 32.6k | int SRsrcIdx = |
3435 | 32.6k | AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::srsrc); |
3436 | 32.6k | if (SRsrcIdx != -132.6k ) { |
3437 | 39 | // We have an MUBUF instruction |
3438 | 39 | MachineOperand *SRsrc = &MI.getOperand(SRsrcIdx); |
3439 | 39 | unsigned SRsrcRC = get(MI.getOpcode()).OpInfo[SRsrcIdx].RegClass; |
3440 | 39 | if (RI.getCommonSubClass(MRI.getRegClass(SRsrc->getReg()), |
3441 | 39 | RI.getRegClass(SRsrcRC))) { |
3442 | 0 | // The operands are legal. |
3443 | 0 | // FIXME: We may need to legalize operands besides srsrc. |
3444 | 0 | return; |
3445 | 0 | } |
3446 | 39 | |
3447 | 39 | MachineBasicBlock &MBB = *MI.getParent(); |
3448 | 39 | |
3449 | 39 | // Extract the ptr from the resource descriptor. |
3450 | 39 | unsigned SRsrcPtr = buildExtractSubReg(MI, MRI, *SRsrc, |
3451 | 39 | &AMDGPU::VReg_128RegClass, AMDGPU::sub0_sub1, &AMDGPU::VReg_64RegClass); |
3452 | 39 | |
3453 | 39 | // Create an empty resource descriptor |
3454 | 39 | unsigned Zero64 = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass); |
3455 | 39 | unsigned SRsrcFormatLo = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass); |
3456 | 39 | unsigned SRsrcFormatHi = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass); |
3457 | 39 | unsigned NewSRsrc = MRI.createVirtualRegister(&AMDGPU::SReg_128RegClass); |
3458 | 39 | uint64_t RsrcDataFormat = getDefaultRsrcDataFormat(); |
3459 | 39 | |
3460 | 39 | // Zero64 = 0 |
3461 | 39 | BuildMI(MBB, MI, MI.getDebugLoc(), get(AMDGPU::S_MOV_B64), Zero64) |
3462 | 39 | .addImm(0); |
3463 | 39 | |
3464 | 39 | // SRsrcFormatLo = RSRC_DATA_FORMAT{31-0} |
3465 | 39 | BuildMI(MBB, MI, MI.getDebugLoc(), get(AMDGPU::S_MOV_B32), SRsrcFormatLo) |
3466 | 39 | .addImm(RsrcDataFormat & 0xFFFFFFFF); |
3467 | 39 | |
3468 | 39 | // SRsrcFormatHi = RSRC_DATA_FORMAT{63-32} |
3469 | 39 | BuildMI(MBB, MI, MI.getDebugLoc(), get(AMDGPU::S_MOV_B32), SRsrcFormatHi) |
3470 | 39 | .addImm(RsrcDataFormat >> 32); |
3471 | 39 | |
3472 | 39 | // NewSRsrc = {Zero64, SRsrcFormat} |
3473 | 39 | BuildMI(MBB, MI, MI.getDebugLoc(), get(AMDGPU::REG_SEQUENCE), NewSRsrc) |
3474 | 39 | .addReg(Zero64) |
3475 | 39 | .addImm(AMDGPU::sub0_sub1) |
3476 | 39 | .addReg(SRsrcFormatLo) |
3477 | 39 | .addImm(AMDGPU::sub2) |
3478 | 39 | .addReg(SRsrcFormatHi) |
3479 | 39 | .addImm(AMDGPU::sub3); |
3480 | 39 | |
3481 | 39 | MachineOperand *VAddr = getNamedOperand(MI, AMDGPU::OpName::vaddr); |
3482 | 39 | unsigned NewVAddr = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass); |
3483 | 39 | if (VAddr39 ) { |
3484 | 8 | // This is already an ADDR64 instruction so we need to add the pointer |
3485 | 8 | // extracted from the resource descriptor to the current value of VAddr. |
3486 | 8 | unsigned NewVAddrLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); |
3487 | 8 | unsigned NewVAddrHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); |
3488 | 8 | |
3489 | 8 | // NewVaddrLo = SRsrcPtr:sub0 + VAddr:sub0 |
3490 | 8 | DebugLoc DL = MI.getDebugLoc(); |
3491 | 8 | BuildMI(MBB, MI, DL, get(AMDGPU::V_ADD_I32_e32), NewVAddrLo) |
3492 | 8 | .addReg(SRsrcPtr, 0, AMDGPU::sub0) |
3493 | 8 | .addReg(VAddr->getReg(), 0, AMDGPU::sub0); |
3494 | 8 | |
3495 | 8 | // NewVaddrHi = SRsrcPtr:sub1 + VAddr:sub1 |
3496 | 8 | BuildMI(MBB, MI, DL, get(AMDGPU::V_ADDC_U32_e32), NewVAddrHi) |
3497 | 8 | .addReg(SRsrcPtr, 0, AMDGPU::sub1) |
3498 | 8 | .addReg(VAddr->getReg(), 0, AMDGPU::sub1); |
3499 | 8 | |
3500 | 8 | // NewVaddr = {NewVaddrHi, NewVaddrLo} |
3501 | 8 | BuildMI(MBB, MI, MI.getDebugLoc(), get(AMDGPU::REG_SEQUENCE), NewVAddr) |
3502 | 8 | .addReg(NewVAddrLo) |
3503 | 8 | .addImm(AMDGPU::sub0) |
3504 | 8 | .addReg(NewVAddrHi) |
3505 | 8 | .addImm(AMDGPU::sub1); |
3506 | 39 | } else { |
3507 | 31 | // This instruction is the _OFFSET variant, so we need to convert it to |
3508 | 31 | // ADDR64. |
3509 | 31 | assert(MBB.getParent()->getSubtarget<SISubtarget>().getGeneration() |
3510 | 31 | < SISubtarget::VOLCANIC_ISLANDS && |
3511 | 31 | "FIXME: Need to emit flat atomics here"); |
3512 | 31 | |
3513 | 31 | MachineOperand *VData = getNamedOperand(MI, AMDGPU::OpName::vdata); |
3514 | 31 | MachineOperand *Offset = getNamedOperand(MI, AMDGPU::OpName::offset); |
3515 | 31 | MachineOperand *SOffset = getNamedOperand(MI, AMDGPU::OpName::soffset); |
3516 | 31 | unsigned Addr64Opcode = AMDGPU::getAddr64Inst(MI.getOpcode()); |
3517 | 31 | |
3518 | 31 | // Atomics with return have an additional tied operand and are |
3519 | 31 | // missing some of the special bits. |
3520 | 31 | MachineOperand *VDataIn = getNamedOperand(MI, AMDGPU::OpName::vdata_in); |
3521 | 31 | MachineInstr *Addr64; |
3522 | 31 | |
3523 | 31 | if (!VDataIn31 ) { |
3524 | 30 | // Regular buffer load / store. |
3525 | 30 | MachineInstrBuilder MIB = |
3526 | 30 | BuildMI(MBB, MI, MI.getDebugLoc(), get(Addr64Opcode)) |
3527 | 30 | .add(*VData) |
3528 | 30 | .addReg(AMDGPU::NoRegister) // Dummy value for vaddr. |
3529 | 30 | // This will be replaced later |
3530 | 30 | // with the new value of vaddr. |
3531 | 30 | .add(*SRsrc) |
3532 | 30 | .add(*SOffset) |
3533 | 30 | .add(*Offset); |
3534 | 30 | |
3535 | 30 | // Atomics do not have this operand. |
3536 | 30 | if (const MachineOperand *GLC = |
3537 | 29 | getNamedOperand(MI, AMDGPU::OpName::glc)) { |
3538 | 29 | MIB.addImm(GLC->getImm()); |
3539 | 29 | } |
3540 | 30 | |
3541 | 30 | MIB.addImm(getNamedImmOperand(MI, AMDGPU::OpName::slc)); |
3542 | 30 | |
3543 | 30 | if (const MachineOperand *TFE = |
3544 | 29 | getNamedOperand(MI, AMDGPU::OpName::tfe)) { |
3545 | 29 | MIB.addImm(TFE->getImm()); |
3546 | 29 | } |
3547 | 30 | |
3548 | 30 | MIB.setMemRefs(MI.memoperands_begin(), MI.memoperands_end()); |
3549 | 30 | Addr64 = MIB; |
3550 | 31 | } else { |
3551 | 1 | // Atomics with return. |
3552 | 1 | Addr64 = BuildMI(MBB, MI, MI.getDebugLoc(), get(Addr64Opcode)) |
3553 | 1 | .add(*VData) |
3554 | 1 | .add(*VDataIn) |
3555 | 1 | .addReg(AMDGPU::NoRegister) // Dummy value for vaddr. |
3556 | 1 | // This will be replaced later |
3557 | 1 | // with the new value of vaddr. |
3558 | 1 | .add(*SRsrc) |
3559 | 1 | .add(*SOffset) |
3560 | 1 | .add(*Offset) |
3561 | 1 | .addImm(getNamedImmOperand(MI, AMDGPU::OpName::slc)) |
3562 | 1 | .setMemRefs(MI.memoperands_begin(), MI.memoperands_end()); |
3563 | 1 | } |
3564 | 31 | |
3565 | 31 | MI.removeFromParent(); |
3566 | 31 | |
3567 | 31 | // NewVaddr = {NewVaddrHi, NewVaddrLo} |
3568 | 31 | BuildMI(MBB, Addr64, Addr64->getDebugLoc(), get(AMDGPU::REG_SEQUENCE), |
3569 | 31 | NewVAddr) |
3570 | 31 | .addReg(SRsrcPtr, 0, AMDGPU::sub0) |
3571 | 31 | .addImm(AMDGPU::sub0) |
3572 | 31 | .addReg(SRsrcPtr, 0, AMDGPU::sub1) |
3573 | 31 | .addImm(AMDGPU::sub1); |
3574 | 31 | |
3575 | 31 | VAddr = getNamedOperand(*Addr64, AMDGPU::OpName::vaddr); |
3576 | 31 | SRsrc = getNamedOperand(*Addr64, AMDGPU::OpName::srsrc); |
3577 | 31 | } |
3578 | 39 | |
3579 | 39 | // Update the instruction to use NewVaddr |
3580 | 39 | VAddr->setReg(NewVAddr); |
3581 | 39 | // Update the instruction to use NewSRsrc |
3582 | 39 | SRsrc->setReg(NewSRsrc); |
3583 | 39 | } |
3584 | 90.6k | } |
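
The addr64 conversion above rebuilds the 128-bit resource descriptor: the pointer words move into vaddr and are replaced with zero, while words 2-3 keep the default data format. A standalone sketch of that word layout; the 0xf008... constant is a placeholder only, the real value comes from getDefaultRsrcDataFormat().

#include <cassert>
#include <cstdint>

int main() {
  uint64_t Zero64 = 0;                                  // S_MOV_B64 Zero64, 0
  uint64_t RsrcDataFormat = 0xf00800000000ULL;          // placeholder value
  uint32_t Words[4] = {
    static_cast<uint32_t>(Zero64),                      // sub0
    static_cast<uint32_t>(Zero64 >> 32),                // sub1
    static_cast<uint32_t>(RsrcDataFormat & 0xFFFFFFFF), // sub2: SRsrcFormatLo
    static_cast<uint32_t>(RsrcDataFormat >> 32),        // sub3: SRsrcFormatHi
  };
  assert(Words[0] == 0 && Words[1] == 0 && Words[2] == 0 &&
         Words[3] == 0xf008u);
}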
3585 | | |
3586 | 31.6k | void SIInstrInfo::moveToVALU(MachineInstr &TopInst) const { |
3587 | 31.6k | SetVectorType Worklist; |
3588 | 31.6k | Worklist.insert(&TopInst); |
3589 | 31.6k | |
3590 | 138k | while (!Worklist.empty()138k ) { |
3591 | 107k | MachineInstr &Inst = *Worklist.pop_back_val(); |
3592 | 107k | MachineBasicBlock *MBB = Inst.getParent(); |
3593 | 107k | MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo(); |
3594 | 107k | |
3595 | 107k | unsigned Opcode = Inst.getOpcode(); |
3596 | 107k | unsigned NewOpcode = getVALUOp(Inst); |
3597 | 107k | |
3598 | 107k | // Handle some special cases |
3599 | 107k | switch (Opcode) { |
3600 | 94.2k | default: |
3601 | 94.2k | break; |
3602 | 111 | case AMDGPU::S_AND_B64: |
3603 | 111 | splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::V_AND_B32_e64); |
3604 | 111 | Inst.eraseFromParent(); |
3605 | 111 | continue; |
3606 | 107k | |
3607 | 216 | case AMDGPU::S_OR_B64: |
3608 | 216 | splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::V_OR_B32_e64); |
3609 | 216 | Inst.eraseFromParent(); |
3610 | 216 | continue; |
3611 | 107k | |
3612 | 130 | case AMDGPU::S_XOR_B64: |
3613 | 130 | splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::V_XOR_B32_e64); |
3614 | 130 | Inst.eraseFromParent(); |
3615 | 130 | continue; |
3616 | 107k | |
3617 | 16 | case AMDGPU::S_NOT_B64: |
3618 | 16 | splitScalar64BitUnaryOp(Worklist, Inst, AMDGPU::V_NOT_B32_e32); |
3619 | 16 | Inst.eraseFromParent(); |
3620 | 16 | continue; |
3621 | 107k | |
3622 | 26 | case AMDGPU::S_BCNT1_I32_B64: |
3623 | 26 | splitScalar64BitBCNT(Worklist, Inst); |
3624 | 26 | Inst.eraseFromParent(); |
3625 | 26 | continue; |
3626 | 107k | |
3627 | 1.24k | case AMDGPU::S_BFE_I64: |
3628 | 1.24k | splitScalar64BitBFE(Worklist, Inst); |
3629 | 1.24k | Inst.eraseFromParent(); |
3630 | 1.24k | continue; |
3631 | 107k | |
3632 | 4.44k | case AMDGPU::S_LSHL_B32: |
3633 | 4.44k | if (ST.getGeneration() >= SISubtarget::VOLCANIC_ISLANDS4.44k ) { |
3634 | 2.10k | NewOpcode = AMDGPU::V_LSHLREV_B32_e64; |
3635 | 2.10k | swapOperands(Inst); |
3636 | 2.10k | } |
3637 | 4.44k | break; |
3638 | 1.77k | case AMDGPU::S_ASHR_I32: |
3639 | 1.77k | if (ST.getGeneration() >= SISubtarget::VOLCANIC_ISLANDS1.77k ) { |
3640 | 741 | NewOpcode = AMDGPU::V_ASHRREV_I32_e64; |
3641 | 741 | swapOperands(Inst); |
3642 | 741 | } |
3643 | 1.77k | break; |
3644 | 2.72k | case AMDGPU::S_LSHR_B32: |
3645 | 2.72k | if (ST.getGeneration() >= SISubtarget::VOLCANIC_ISLANDS2.72k ) { |
3646 | 1.24k | NewOpcode = AMDGPU::V_LSHRREV_B32_e64; |
3647 | 1.24k | swapOperands(Inst); |
3648 | 1.24k | } |
3649 | 2.72k | break; |
3650 | 1.65k | case AMDGPU::S_LSHL_B64: |
3651 | 1.65k | if (ST.getGeneration() >= SISubtarget::VOLCANIC_ISLANDS1.65k ) { |
3652 | 719 | NewOpcode = AMDGPU::V_LSHLREV_B64; |
3653 | 719 | swapOperands(Inst); |
3654 | 719 | } |
3655 | 1.65k | break; |
3656 | 213 | case AMDGPU::S_ASHR_I64: |
3657 | 213 | if (ST.getGeneration() >= SISubtarget::VOLCANIC_ISLANDS213 ) { |
3658 | 95 | NewOpcode = AMDGPU::V_ASHRREV_I64; |
3659 | 95 | swapOperands(Inst); |
3660 | 95 | } |
3661 | 213 | break; |
3662 | 138 | case AMDGPU::S_LSHR_B64: |
3663 | 138 | if (ST.getGeneration() >= SISubtarget::VOLCANIC_ISLANDS138 ) { |
3664 | 69 | NewOpcode = AMDGPU::V_LSHRREV_B64; |
3665 | 69 | swapOperands(Inst); |
3666 | 69 | } |
3667 | 138 | break; |
3668 | 107k | |
3669 | 16 | case AMDGPU::S_ABS_I32: |
3670 | 16 | lowerScalarAbs(Worklist, Inst); |
3671 | 16 | Inst.eraseFromParent(); |
3672 | 16 | continue; |
3673 | 107k | |
3674 | 77 | case AMDGPU::S_CBRANCH_SCC0: |
3675 | 77 | case AMDGPU::S_CBRANCH_SCC1: |
3676 | 77 | // Clear unused bits of vcc |
3677 | 77 | BuildMI(*MBB, Inst, Inst.getDebugLoc(), get(AMDGPU::S_AND_B64), |
3678 | 77 | AMDGPU::VCC) |
3679 | 77 | .addReg(AMDGPU::EXEC) |
3680 | 77 | .addReg(AMDGPU::VCC); |
3681 | 77 | break; |
3682 | 77 | |
3683 | 0 | case AMDGPU::S_BFE_U64: |
3684 | 0 | case AMDGPU::S_BFM_B64: |
3685 | 0 | llvm_unreachable("Moving this op to VALU not implemented"); |
3686 | 0 |
|
3687 | 52 | case AMDGPU::S_PACK_LL_B32_B16: |
3688 | 52 | case AMDGPU::S_PACK_LH_B32_B16: |
3689 | 52 | case AMDGPU::S_PACK_HH_B32_B16: |
3690 | 52 | movePackToVALU(Worklist, MRI, Inst); |
3691 | 52 | Inst.eraseFromParent(); |
3692 | 52 | continue; |
3693 | 52 | |
3694 | 12 | case AMDGPU::S_XNOR_B32: |
3695 | 12 | lowerScalarXnor(Worklist, Inst); |
3696 | 12 | Inst.eraseFromParent(); |
3697 | 12 | continue; |
3698 | 52 | |
3699 | 4 | case AMDGPU::S_XNOR_B64: |
3700 | 4 | splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_XNOR_B32); |
3701 | 4 | Inst.eraseFromParent(); |
3702 | 4 | continue; |
3703 | 105k | } |
3704 | 105k | |
3705 | 105k | if (105k NewOpcode == AMDGPU::INSTRUCTION_LIST_END105k ) { |
3706 | 61 | // We cannot move this instruction to the VALU, so we should try to |
3707 | 61 | // legalize its operands instead. |
3708 | 61 | legalizeOperands(Inst); |
3709 | 61 | continue; |
3710 | 61 | } |
3711 | 105k | |
3712 | 105k | // Use the new VALU Opcode. |
3713 | 105k | const MCInstrDesc &NewDesc = get(NewOpcode); |
3714 | 105k | Inst.setDesc(NewDesc); |
3715 | 105k | |
3716 | 105k | // Remove any references to SCC. Vector instructions can't read from it, and |
3717 | 105k | // we're just about to add the implicit use / defs of VCC, and we don't want |
3718 | 105k | // both. |
3719 | 368k | for (unsigned i = Inst.getNumOperands() - 1; i > 0368k ; --i263k ) { |
3720 | 263k | MachineOperand &Op = Inst.getOperand(i); |
3721 | 263k | if (Op.isReg() && 263k Op.getReg() == AMDGPU::SCC206k ) { |
3722 | 39.0k | Inst.RemoveOperand(i); |
3723 | 39.0k | addSCCDefUsersToVALUWorklist(Inst, Worklist); |
3724 | 39.0k | } |
3725 | 263k | } |
3726 | 105k | |
3727 | 105k | if (Opcode == AMDGPU::S_SEXT_I32_I8 || 105k Opcode == AMDGPU::S_SEXT_I32_I16104k ) { |
3728 | 1.07k | // We are converting these to a BFE, so we need to add the missing |
3729 | 1.07k | // operands for the size and offset. |
3730 | 1.07k | unsigned Size = (Opcode == AMDGPU::S_SEXT_I32_I8) ? 8357 : 16720 ; |
3731 | 1.07k | Inst.addOperand(MachineOperand::CreateImm(0)); |
3732 | 1.07k | Inst.addOperand(MachineOperand::CreateImm(Size)); |
3733 | 1.07k | |
3734 | 105k | } else if (104k Opcode == AMDGPU::S_BCNT1_I32_B32104k ) { |
3735 | 64 | // The VALU version adds the second operand to the result, so insert an |
3736 | 64 | // extra 0 operand. |
3737 | 64 | Inst.addOperand(MachineOperand::CreateImm(0)); |
3738 | 64 | } |
3739 | 105k | |
3740 | 105k | Inst.addImplicitDefUseOperands(*Inst.getParent()->getParent()); |
3741 | 105k | |
3742 | 105k | if (Opcode == AMDGPU::S_BFE_I32 || 105k Opcode == AMDGPU::S_BFE_U32103k ) { |
3743 | 3.47k | const MachineOperand &OffsetWidthOp = Inst.getOperand(2); |
3744 | 3.47k | // If we need to move this to VGPRs, we need to unpack the second operand |
3745 | 3.47k | // back into the 2 separate ones for bit offset and width. |
3746 | 3.47k | assert(OffsetWidthOp.isImm() && |
3747 | 3.47k | "Scalar BFE is only implemented for constant width and offset"); |
3748 | 3.47k | uint32_t Imm = OffsetWidthOp.getImm(); |
3749 | 3.47k | |
3750 | 3.47k | uint32_t Offset = Imm & 0x3f; // Extract bits [5:0]. |
3751 | 3.47k | uint32_t BitWidth = (Imm & 0x7f0000) >> 16; // Extract bits [22:16]. |
3752 | 3.47k | Inst.RemoveOperand(2); // Remove old immediate. |
3753 | 3.47k | Inst.addOperand(MachineOperand::CreateImm(Offset)); |
3754 | 3.47k | Inst.addOperand(MachineOperand::CreateImm(BitWidth)); |
3755 | 3.47k | } |
3756 | 105k | |
3757 | 105k | bool HasDst = Inst.getOperand(0).isReg() && Inst.getOperand(0).isDef(); |
3758 | 105k | unsigned NewDstReg = AMDGPU::NoRegister; |
3759 | 105k | if (HasDst105k ) { |
3760 | 105k | unsigned DstReg = Inst.getOperand(0).getReg(); |
3761 | 105k | if (TargetRegisterInfo::isPhysicalRegister(DstReg)) |
3762 | 46 | continue; |
3763 | 105k | |
3764 | 105k | // Update the destination register class. |
3765 | 105k | const TargetRegisterClass *NewDstRC = getDestEquivalentVGPRClass(Inst); |
3766 | 105k | if (!NewDstRC) |
3767 | 0 | continue; |
3768 | 105k | |
3769 | 105k | if (105k Inst.isCopy() && |
3770 | 47.4k | TargetRegisterInfo::isVirtualRegister(Inst.getOperand(1).getReg()) && |
3771 | 105k | NewDstRC == RI.getRegClassForReg(MRI, Inst.getOperand(1).getReg())47.3k ) { |
3772 | 15.5k | // Instead of creating a copy where src and dst are the same register |
3773 | 15.5k | // class, we just replace all uses of dst with src. These kinds of |
3774 | 15.5k | // copies interfere with the heuristics MachineSink uses to decide |
3775 | 15.5k | // whether or not to split a critical edge, since the pass assumes |
3776 | 15.5k | // that copies will end up as machine instructions and not be |
3777 | 15.5k | // eliminated. |
3778 | 15.5k | addUsersToMoveToVALUWorklist(DstReg, MRI, Worklist); |
3779 | 15.5k | MRI.replaceRegWith(DstReg, Inst.getOperand(1).getReg()); |
3780 | 15.5k | MRI.clearKillFlags(Inst.getOperand(1).getReg()); |
3781 | 15.5k | Inst.getOperand(0).setReg(DstReg); |
3782 | 15.5k | continue; |
3783 | 15.5k | } |
3784 | 89.4k | |
3785 | 89.4k | NewDstReg = MRI.createVirtualRegister(NewDstRC); |
3786 | 89.4k | MRI.replaceRegWith(DstReg, NewDstReg); |
3787 | 89.4k | } |
3788 | 105k | |
3789 | 105k | // Legalize the operands |
3790 | 89.6k | legalizeOperands(Inst); |
3791 | 89.6k | |
3792 | 89.6k | if (HasDst) |
3793 | 89.4k | addUsersToMoveToVALUWorklist(NewDstReg, MRI, Worklist); |
3794 | 107k | } |
3795 | 31.6k | } |
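
A worked standalone example of the S_BFE immediate unpacking inside moveToVALU above: the scalar form packs the bit offset in bits [5:0] and the width in bits [22:16], which the VALU form takes as two separate operands.

#include <cassert>
#include <cstdint>

int main() {
  uint32_t Imm = 0x00100008;                  // width 16 at offset 8
  uint32_t Offset   = Imm & 0x3f;             // extract bits [5:0]
  uint32_t BitWidth = (Imm & 0x7f0000) >> 16; // extract bits [22:16]
  assert(Offset == 8 && BitWidth == 16);
}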
3796 | | |
3797 | | void SIInstrInfo::lowerScalarAbs(SetVectorType &Worklist, |
3798 | 16 | MachineInstr &Inst) const { |
3799 | 16 | MachineBasicBlock &MBB = *Inst.getParent(); |
3800 | 16 | MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); |
3801 | 16 | MachineBasicBlock::iterator MII = Inst; |
3802 | 16 | DebugLoc DL = Inst.getDebugLoc(); |
3803 | 16 | |
3804 | 16 | MachineOperand &Dest = Inst.getOperand(0); |
3805 | 16 | MachineOperand &Src = Inst.getOperand(1); |
3806 | 16 | unsigned TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); |
3807 | 16 | unsigned ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); |
3808 | 16 | |
3809 | 16 | BuildMI(MBB, MII, DL, get(AMDGPU::V_SUB_I32_e32), TmpReg) |
3810 | 16 | .addImm(0) |
3811 | 16 | .addReg(Src.getReg()); |
3812 | 16 | |
3813 | 16 | BuildMI(MBB, MII, DL, get(AMDGPU::V_MAX_I32_e64), ResultReg) |
3814 | 16 | .addReg(Src.getReg()) |
3815 | 16 | .addReg(TmpReg); |
3816 | 16 | |
3817 | 16 | MRI.replaceRegWith(Dest.getReg(), ResultReg); |
3818 | 16 | addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist); |
3819 | 16 | } |
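lowerScalarAbs expands a scalar absolute value into two VALU instructions: negate with a wrapping subtract from zero, then take the signed maximum of the original and the negation. A scalar model of that arithmetic (assuming the usual two's-complement wrap of V_SUB_I32):

#include <algorithm>
#include <cassert>
#include <cstdint>

static int32_t scalarAbs(int32_t X) {
  // V_SUB_I32_e32 tmp, 0, x -- wrapping 32-bit subtract.
  int32_t Neg = static_cast<int32_t>(0u - static_cast<uint32_t>(X));
  // V_MAX_I32_e64 result, x, tmp.
  return std::max(X, Neg);
}

int main() {
  assert(scalarAbs(-5) == 5);
  assert(scalarAbs(7) == 7);
}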
3820 | | |
3821 | | void SIInstrInfo::lowerScalarXnor(SetVectorType &Worklist, |
3822 | 12 | MachineInstr &Inst) const { |
3823 | 12 | MachineBasicBlock &MBB = *Inst.getParent(); |
3824 | 12 | MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); |
3825 | 12 | MachineBasicBlock::iterator MII = Inst; |
3826 | 12 | const DebugLoc &DL = Inst.getDebugLoc(); |
3827 | 12 | |
3828 | 12 | MachineOperand &Dest = Inst.getOperand(0); |
3829 | 12 | MachineOperand &Src0 = Inst.getOperand(1); |
3830 | 12 | MachineOperand &Src1 = Inst.getOperand(2); |
3831 | 12 | |
3832 | 12 | legalizeGenericOperand(MBB, MII, &AMDGPU::VGPR_32RegClass, Src0, MRI, DL); |
3833 | 12 | legalizeGenericOperand(MBB, MII, &AMDGPU::VGPR_32RegClass, Src1, MRI, DL); |
3834 | 12 | |
3835 | 12 | unsigned Xor = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); |
3836 | 12 | BuildMI(MBB, MII, DL, get(AMDGPU::V_XOR_B32_e64), Xor) |
3837 | 12 | .add(Src0) |
3838 | 12 | .add(Src1); |
3839 | 12 | |
3840 | 12 | unsigned Not = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); |
3841 | 12 | BuildMI(MBB, MII, DL, get(AMDGPU::V_NOT_B32_e64), Not) |
3842 | 12 | .addReg(Xor); |
3843 | 12 | |
3844 | 12 | MRI.replaceRegWith(Dest.getReg(), Not); |
3845 | 12 | addUsersToMoveToVALUWorklist(Not, MRI, Worklist); |
3846 | 12 | } |
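lowerScalarXnor builds the result from V_XOR followed by V_NOT, relying on the identity xnor(a, b) = ~(a ^ b). A one-function scalar model:

#include <cassert>
#include <cstdint>

// Scalar model of the V_XOR_B32 + V_NOT_B32 expansion of S_XNOR_B32.
static uint32_t scalarXnor(uint32_t A, uint32_t B) {
  return ~(A ^ B);
}

int main() {
  assert(scalarXnor(0xffffffffu, 0xffffffffu) == 0xffffffffu);
  assert(scalarXnor(0x0u, 0xffffffffu) == 0x0u);
}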
3847 | | |
3848 | | void SIInstrInfo::splitScalar64BitUnaryOp( |
3849 | | SetVectorType &Worklist, MachineInstr &Inst, |
3850 | 16 | unsigned Opcode) const { |
3851 | 16 | MachineBasicBlock &MBB = *Inst.getParent(); |
3852 | 16 | MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); |
3853 | 16 | |
3854 | 16 | MachineOperand &Dest = Inst.getOperand(0); |
3855 | 16 | MachineOperand &Src0 = Inst.getOperand(1); |
3856 | 16 | DebugLoc DL = Inst.getDebugLoc(); |
3857 | 16 | |
3858 | 16 | MachineBasicBlock::iterator MII = Inst; |
3859 | 16 | |
3860 | 16 | const MCInstrDesc &InstDesc = get(Opcode); |
3861 | 16 | const TargetRegisterClass *Src0RC = Src0.isReg() ? |
3862 | 16 | MRI.getRegClass(Src0.getReg()) : |
3863 | 0 | &AMDGPU::SGPR_32RegClass; |
3864 | 16 | |
3865 | 16 | const TargetRegisterClass *Src0SubRC = RI.getSubRegClass(Src0RC, AMDGPU::sub0); |
3866 | 16 | |
3867 | 16 | MachineOperand SrcReg0Sub0 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC, |
3868 | 16 | AMDGPU::sub0, Src0SubRC); |
3869 | 16 | |
3870 | 16 | const TargetRegisterClass *DestRC = MRI.getRegClass(Dest.getReg()); |
3871 | 16 | const TargetRegisterClass *NewDestRC = RI.getEquivalentVGPRClass(DestRC); |
3872 | 16 | const TargetRegisterClass *NewDestSubRC = RI.getSubRegClass(NewDestRC, AMDGPU::sub0); |
3873 | 16 | |
3874 | 16 | unsigned DestSub0 = MRI.createVirtualRegister(NewDestSubRC); |
3875 | 16 | BuildMI(MBB, MII, DL, InstDesc, DestSub0).add(SrcReg0Sub0); |
3876 | 16 | |
3877 | 16 | MachineOperand SrcReg0Sub1 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC, |
3878 | 16 | AMDGPU::sub1, Src0SubRC); |
3879 | 16 | |
3880 | 16 | unsigned DestSub1 = MRI.createVirtualRegister(NewDestSubRC); |
3881 | 16 | BuildMI(MBB, MII, DL, InstDesc, DestSub1).add(SrcReg0Sub1); |
3882 | 16 | |
3883 | 16 | unsigned FullDestReg = MRI.createVirtualRegister(NewDestRC); |
3884 | 16 | BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg) |
3885 | 16 | .addReg(DestSub0) |
3886 | 16 | .addImm(AMDGPU::sub0) |
3887 | 16 | .addReg(DestSub1) |
3888 | 16 | .addImm(AMDGPU::sub1); |
3889 | 16 | |
3890 | 16 | MRI.replaceRegWith(Dest.getReg(), FullDestReg); |
3891 | 16 | |
3892 | 16 | // We don't need to legalizeOperands here because for a single operand, src0 |
3893 | 16 | // will support any kind of input. |
3894 | 16 | |
3895 | 16 | // Move all users of this moved value. |
3896 | 16 | addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist); |
3897 | 16 | } |
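The splitting recipe: extract the sub0 and sub1 halves of the 64-bit source, apply the 32-bit opcode to each, and glue the two results back together with REG_SEQUENCE. A scalar model of the data flow, using 32-bit NOT as the per-half operation (S_NOT_B64 is among the opcodes routed through this path):

#include <cassert>
#include <cstdint>

static uint64_t splitUnary64(uint64_t Src, uint32_t (*Op32)(uint32_t)) {
  uint32_t Lo = Op32(static_cast<uint32_t>(Src));       // sub0 half
  uint32_t Hi = Op32(static_cast<uint32_t>(Src >> 32)); // sub1 half
  return (static_cast<uint64_t>(Hi) << 32) | Lo;        // REG_SEQUENCE
}

int main() {
  uint32_t (*Not32)(uint32_t) = [](uint32_t V) {
    return static_cast<uint32_t>(~V);
  };
  assert(splitUnary64(0x00000000ffffffffull, Not32) == 0xffffffff00000000ull);
}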
3898 | | |
3899 | | void SIInstrInfo::splitScalar64BitBinaryOp( |
3900 | | SetVectorType &Worklist, MachineInstr &Inst, |
3901 | 461 | unsigned Opcode) const { |
3902 | 461 | MachineBasicBlock &MBB = *Inst.getParent(); |
3903 | 461 | MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); |
3904 | 461 | |
3905 | 461 | MachineOperand &Dest = Inst.getOperand(0); |
3906 | 461 | MachineOperand &Src0 = Inst.getOperand(1); |
3907 | 461 | MachineOperand &Src1 = Inst.getOperand(2); |
3908 | 461 | DebugLoc DL = Inst.getDebugLoc(); |
3909 | 461 | |
3910 | 461 | MachineBasicBlock::iterator MII = Inst; |
3911 | 461 | |
3912 | 461 | const MCInstrDesc &InstDesc = get(Opcode); |
3913 | 461 | const TargetRegisterClass *Src0RC = Src0.isReg() ? |
3914 | 461 | MRI.getRegClass(Src0.getReg()) : |
3915 | 0 | &AMDGPU::SGPR_32RegClass; |
3916 | 461 | |
3917 | 461 | const TargetRegisterClass *Src0SubRC = RI.getSubRegClass(Src0RC, AMDGPU::sub0); |
3918 | 461 | const TargetRegisterClass *Src1RC = Src1.isReg() ? |
3919 | 461 | MRI.getRegClass(Src1.getReg()) : |
3920 | 0 | &AMDGPU::SGPR_32RegClass; |
3921 | 461 | |
3922 | 461 | const TargetRegisterClass *Src1SubRC = RI.getSubRegClass(Src1RC, AMDGPU::sub0); |
3923 | 461 | |
3924 | 461 | MachineOperand SrcReg0Sub0 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC, |
3925 | 461 | AMDGPU::sub0, Src0SubRC); |
3926 | 461 | MachineOperand SrcReg1Sub0 = buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC, |
3927 | 461 | AMDGPU::sub0, Src1SubRC); |
3928 | 461 | |
3929 | 461 | const TargetRegisterClass *DestRC = MRI.getRegClass(Dest.getReg()); |
3930 | 461 | const TargetRegisterClass *NewDestRC = RI.getEquivalentVGPRClass(DestRC); |
3931 | 461 | const TargetRegisterClass *NewDestSubRC = RI.getSubRegClass(NewDestRC, AMDGPU::sub0); |
3932 | 461 | |
3933 | 461 | unsigned DestSub0 = MRI.createVirtualRegister(NewDestSubRC); |
3934 | 461 | MachineInstr &LoHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub0) |
3935 | 461 | .add(SrcReg0Sub0) |
3936 | 461 | .add(SrcReg1Sub0); |
3937 | 461 | |
3938 | 461 | MachineOperand SrcReg0Sub1 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC, |
3939 | 461 | AMDGPU::sub1, Src0SubRC); |
3940 | 461 | MachineOperand SrcReg1Sub1 = buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC, |
3941 | 461 | AMDGPU::sub1, Src1SubRC); |
3942 | 461 | |
3943 | 461 | unsigned DestSub1 = MRI.createVirtualRegister(NewDestSubRC); |
3944 | 461 | MachineInstr &HiHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub1) |
3945 | 461 | .add(SrcReg0Sub1) |
3946 | 461 | .add(SrcReg1Sub1); |
3947 | 461 | |
3948 | 461 | unsigned FullDestReg = MRI.createVirtualRegister(NewDestRC); |
3949 | 461 | BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg) |
3950 | 461 | .addReg(DestSub0) |
3951 | 461 | .addImm(AMDGPU::sub0) |
3952 | 461 | .addReg(DestSub1) |
3953 | 461 | .addImm(AMDGPU::sub1); |
3954 | 461 | |
3955 | 461 | MRI.replaceRegWith(Dest.getReg(), FullDestReg); |
3956 | 461 | |
3957 | 461 | // Try to legalize the operands in case we need to swap the order to keep it |
3958 | 461 | // valid. |
3959 | 461 | legalizeOperands(LoHalf); |
3960 | 461 | legalizeOperands(HiHalf); |
3961 | 461 | |
3962 | 461 | // Move all users of this moved value.
3963 | 461 | addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist); |
3964 | 461 | } |
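The binary variant follows the same per-half recipe with two sources, then runs legalizeOperands on each half because the chosen 32-bit VALU opcode may need operands swapped or copied to respect the constant bus limit. A scalar model with 64-bit AND as the split operation (S_AND_B64 is among the opcodes routed here):

#include <cassert>
#include <cstdint>

static uint64_t splitBinary64(uint64_t A, uint64_t B,
                              uint32_t (*Op32)(uint32_t, uint32_t)) {
  uint32_t Lo = Op32(static_cast<uint32_t>(A), static_cast<uint32_t>(B));
  uint32_t Hi = Op32(static_cast<uint32_t>(A >> 32),
                     static_cast<uint32_t>(B >> 32));
  return (static_cast<uint64_t>(Hi) << 32) | Lo;
}

int main() {
  uint32_t (*And32)(uint32_t, uint32_t) = [](uint32_t X, uint32_t Y) {
    return X & Y;
  };
  assert(splitBinary64(0xff00ff00ff00ff00ull, 0x0ff00ff00ff00ff0ull, And32) ==
         0x0f000f000f000f00ull);
}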
3965 | | |
3966 | | void SIInstrInfo::splitScalar64BitBCNT( |
3967 | 26 | SetVectorType &Worklist, MachineInstr &Inst) const { |
3968 | 26 | MachineBasicBlock &MBB = *Inst.getParent(); |
3969 | 26 | MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); |
3970 | 26 | |
3971 | 26 | MachineBasicBlock::iterator MII = Inst; |
3972 | 26 | DebugLoc DL = Inst.getDebugLoc(); |
3973 | 26 | |
3974 | 26 | MachineOperand &Dest = Inst.getOperand(0); |
3975 | 26 | MachineOperand &Src = Inst.getOperand(1); |
3976 | 26 | |
3977 | 26 | const MCInstrDesc &InstDesc = get(AMDGPU::V_BCNT_U32_B32_e64); |
3978 | 26 | const TargetRegisterClass *SrcRC = Src.isReg() ? |
3979 | 26 | MRI.getRegClass(Src.getReg()) : |
3980 | 0 | &AMDGPU::SGPR_32RegClass; |
3981 | 26 | |
3982 | 26 | unsigned MidReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); |
3983 | 26 | unsigned ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); |
3984 | 26 | |
3985 | 26 | const TargetRegisterClass *SrcSubRC = RI.getSubRegClass(SrcRC, AMDGPU::sub0); |
3986 | 26 | |
3987 | 26 | MachineOperand SrcRegSub0 = buildExtractSubRegOrImm(MII, MRI, Src, SrcRC, |
3988 | 26 | AMDGPU::sub0, SrcSubRC); |
3989 | 26 | MachineOperand SrcRegSub1 = buildExtractSubRegOrImm(MII, MRI, Src, SrcRC, |
3990 | 26 | AMDGPU::sub1, SrcSubRC); |
3991 | 26 | |
3992 | 26 | BuildMI(MBB, MII, DL, InstDesc, MidReg).add(SrcRegSub0).addImm(0); |
3993 | 26 | |
3994 | 26 | BuildMI(MBB, MII, DL, InstDesc, ResultReg).add(SrcRegSub1).addReg(MidReg); |
3995 | 26 | |
3996 | 26 | MRI.replaceRegWith(Dest.getReg(), ResultReg); |
3997 | 26 | |
3998 | 26 | // We don't need to legalize operands here. src0 for either instruction can be
3999 | 26 | // an SGPR, and the second input is unused or determined here. |
4000 | 26 | addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist); |
4001 | 26 | } |
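V_BCNT_U32_B32 adds its second operand to the population count of the first, so the 64-bit count chains two of them: count the low half into MidReg, then count the high half with MidReg as the accumulator. A scalar model:

#include <bitset>
#include <cassert>
#include <cstdint>

// Scalar model of V_BCNT_U32_B32: popcount(src) + accumulator.
static uint32_t bcnt(uint32_t Src, uint32_t Accum) {
  return static_cast<uint32_t>(std::bitset<32>(Src).count()) + Accum;
}

int main() {
  uint64_t Val = 0xf0f0f0f00f0f0f0full;
  uint32_t Mid = bcnt(static_cast<uint32_t>(Val), 0);            // low half
  uint32_t Result = bcnt(static_cast<uint32_t>(Val >> 32), Mid); // high half
  assert(Result == 32);
}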
4002 | | |
4003 | | void SIInstrInfo::splitScalar64BitBFE(SetVectorType &Worklist, |
4004 | 1.24k | MachineInstr &Inst) const { |
4005 | 1.24k | MachineBasicBlock &MBB = *Inst.getParent(); |
4006 | 1.24k | MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); |
4007 | 1.24k | MachineBasicBlock::iterator MII = Inst; |
4008 | 1.24k | DebugLoc DL = Inst.getDebugLoc(); |
4009 | 1.24k | |
4010 | 1.24k | MachineOperand &Dest = Inst.getOperand(0); |
4011 | 1.24k | uint32_t Imm = Inst.getOperand(2).getImm(); |
4012 | 1.24k | uint32_t Offset = Imm & 0x3f; // Extract bits [5:0]. |
4013 | 1.24k | uint32_t BitWidth = (Imm & 0x7f0000) >> 16; // Extract bits [22:16]. |
4014 | 1.24k | |
4015 | 1.24k | (void) Offset; |
4016 | 1.24k | |
4017 | 1.24k | // Only sext_inreg cases handled. |
4018 | 1.24k | assert(Inst.getOpcode() == AMDGPU::S_BFE_I64 && BitWidth <= 32 && |
4019 | 1.24k | Offset == 0 && "Not implemented"); |
4020 | 1.24k | |
4021 | 1.24k | if (BitWidth < 32) {
4022 | 1.24k | unsigned MidRegLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); |
4023 | 1.24k | unsigned MidRegHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); |
4024 | 1.24k | unsigned ResultReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass); |
4025 | 1.24k | |
4026 | 1.24k | BuildMI(MBB, MII, DL, get(AMDGPU::V_BFE_I32), MidRegLo) |
4027 | 1.24k | .addReg(Inst.getOperand(1).getReg(), 0, AMDGPU::sub0) |
4028 | 1.24k | .addImm(0) |
4029 | 1.24k | .addImm(BitWidth); |
4030 | 1.24k | |
4031 | 1.24k | BuildMI(MBB, MII, DL, get(AMDGPU::V_ASHRREV_I32_e32), MidRegHi) |
4032 | 1.24k | .addImm(31) |
4033 | 1.24k | .addReg(MidRegLo); |
4034 | 1.24k | |
4035 | 1.24k | BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), ResultReg) |
4036 | 1.24k | .addReg(MidRegLo) |
4037 | 1.24k | .addImm(AMDGPU::sub0) |
4038 | 1.24k | .addReg(MidRegHi) |
4039 | 1.24k | .addImm(AMDGPU::sub1); |
4040 | 1.24k | |
4041 | 1.24k | MRI.replaceRegWith(Dest.getReg(), ResultReg); |
4042 | 1.24k | addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist); |
4043 | 1.24k | return; |
4044 | 1.24k | } |
4045 | 6 | |
4046 | 6 | MachineOperand &Src = Inst.getOperand(1); |
4047 | 6 | unsigned TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); |
4048 | 6 | unsigned ResultReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass); |
4049 | 6 | |
4050 | 6 | BuildMI(MBB, MII, DL, get(AMDGPU::V_ASHRREV_I32_e64), TmpReg) |
4051 | 6 | .addImm(31) |
4052 | 6 | .addReg(Src.getReg(), 0, AMDGPU::sub0); |
4053 | 6 | |
4054 | 6 | BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), ResultReg) |
4055 | 6 | .addReg(Src.getReg(), 0, AMDGPU::sub0) |
4056 | 6 | .addImm(AMDGPU::sub0) |
4057 | 6 | .addReg(TmpReg) |
4058 | 6 | .addImm(AMDGPU::sub1); |
4059 | 6 | |
4060 | 6 | MRI.replaceRegWith(Dest.getReg(), ResultReg); |
4061 | 6 | addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist); |
4062 | 6 | } |
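With offset 0, the S_BFE_I64 cases handled here are 64-bit sign extensions: V_BFE_I32 sign-extends the low field to 32 bits, and an arithmetic shift right by 31 replicates the sign into the high half. A scalar model of the width < 32 path (assuming arithmetic right shift on signed int, which mainstream compilers provide):

#include <cassert>
#include <cstdint>

static uint64_t signExtend64(uint32_t SrcLo, uint32_t Width) {
  uint32_t Shift = 32 - Width;
  // V_BFE_I32 lo, src, 0, width -- modeled as shift-left then arithmetic
  // shift-right.
  int32_t Lo = static_cast<int32_t>(SrcLo << Shift) >> Shift;
  // V_ASHRREV_I32 hi, 31, lo -- broadcast the sign bit.
  int32_t Hi = Lo >> 31;
  return (static_cast<uint64_t>(static_cast<uint32_t>(Hi)) << 32) |
         static_cast<uint32_t>(Lo);
}

int main() {
  assert(signExtend64(0x80u, 8) == 0xffffffffffffff80ull);
  assert(signExtend64(0x7fu, 8) == 0x000000000000007full);
}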
4063 | | |
4064 | | void SIInstrInfo::addUsersToMoveToVALUWorklist( |
4065 | | unsigned DstReg, |
4066 | | MachineRegisterInfo &MRI, |
4067 | 106k | SetVectorType &Worklist) const { |
4068 | 106k | for (MachineRegisterInfo::use_iterator I = MRI.use_begin(DstReg), |
4069 | 244k | E = MRI.use_end(); I != E;) {
4070 | 137k | MachineInstr &UseMI = *I->getParent(); |
4071 | 137k | if (!canReadVGPR(UseMI, I.getOperandNo())) {
4072 | 75.3k | Worklist.insert(&UseMI); |
4073 | 75.3k | |
4074 | 75.4k | do { |
4075 | 75.4k | ++I; |
4076 | 75.4k | } while (I != E && I->getParent() == &UseMI);
4077 | 137k | } else { |
4078 | 62.0k | ++I; |
4079 | 62.0k | } |
4080 | 137k | } |
4081 | 106k | } |
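The inner do/while is the subtle part: an instruction may read DstReg through several operands, so once it is queued the iterator must be advanced past every remaining use entry belonging to that same instruction before the scan continues. A container-level sketch of the skip pattern, with a std::multimap standing in for the machine use list:

#include <cassert>
#include <map>
#include <set>
#include <string>

int main() {
  // Register 1 is read twice by "A" (two operands) and once by "B".
  std::multimap<int, std::string> Uses = {{1, "A"}, {1, "A"}, {1, "B"}};
  std::set<std::string> Worklist;
  for (auto I = Uses.begin(), E = Uses.end(); I != E;) {
    const std::string &UseMI = I->second;
    Worklist.insert(UseMI);
    // Skip the remaining entries for the same "instruction".
    do {
      ++I;
    } while (I != E && I->second == UseMI);
  }
  assert(Worklist.size() == 2); // "A" is queued once despite two uses.
}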
4082 | | |
4083 | | void SIInstrInfo::movePackToVALU(SetVectorType &Worklist, |
4084 | | MachineRegisterInfo &MRI, |
4085 | 52 | MachineInstr &Inst) const { |
4086 | 52 | unsigned ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); |
4087 | 52 | MachineBasicBlock *MBB = Inst.getParent(); |
4088 | 52 | MachineOperand &Src0 = Inst.getOperand(1); |
4089 | 52 | MachineOperand &Src1 = Inst.getOperand(2); |
4090 | 52 | const DebugLoc &DL = Inst.getDebugLoc(); |
4091 | 52 | |
4092 | 52 | switch (Inst.getOpcode()) { |
4093 | 49 | case AMDGPU::S_PACK_LL_B32_B16: { |
4094 | 49 | unsigned ImmReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); |
4095 | 49 | unsigned TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); |
4096 | 49 | |
4097 | 49 | // FIXME: Can do a lot better if we know the high bits of src0 or src1 are |
4098 | 49 | // 0. |
4099 | 49 | BuildMI(*MBB, Inst, DL, get(AMDGPU::V_MOV_B32_e32), ImmReg) |
4100 | 49 | .addImm(0xffff); |
4101 | 49 | |
4102 | 49 | BuildMI(*MBB, Inst, DL, get(AMDGPU::V_AND_B32_e64), TmpReg) |
4103 | 49 | .addReg(ImmReg, RegState::Kill) |
4104 | 49 | .add(Src0); |
4105 | 49 | |
4106 | 49 | BuildMI(*MBB, Inst, DL, get(AMDGPU::V_LSHL_OR_B32), ResultReg) |
4107 | 49 | .add(Src1) |
4108 | 49 | .addImm(16) |
4109 | 49 | .addReg(TmpReg, RegState::Kill); |
4110 | 49 | break; |
4111 | 52 | } |
4112 | 2 | case AMDGPU::S_PACK_LH_B32_B16: { |
4113 | 2 | unsigned ImmReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); |
4114 | 2 | BuildMI(*MBB, Inst, DL, get(AMDGPU::V_MOV_B32_e32), ImmReg) |
4115 | 2 | .addImm(0xffff); |
4116 | 2 | BuildMI(*MBB, Inst, DL, get(AMDGPU::V_BFI_B32), ResultReg) |
4117 | 2 | .addReg(ImmReg, RegState::Kill) |
4118 | 2 | .add(Src0) |
4119 | 2 | .add(Src1); |
4120 | 2 | break; |
4121 | 52 | } |
4122 | 1 | case AMDGPU::S_PACK_HH_B32_B16: { |
4123 | 1 | unsigned ImmReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); |
4124 | 1 | unsigned TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); |
4125 | 1 | BuildMI(*MBB, Inst, DL, get(AMDGPU::V_LSHRREV_B32_e64), TmpReg) |
4126 | 1 | .addImm(16) |
4127 | 1 | .add(Src0); |
4128 | 1 | BuildMI(*MBB, Inst, DL, get(AMDGPU::V_MOV_B32_e32), ImmReg) |
4129 | 1 | .addImm(0xffff0000); |
4130 | 1 | BuildMI(*MBB, Inst, DL, get(AMDGPU::V_AND_OR_B32), ResultReg) |
4131 | 1 | .add(Src1) |
4132 | 1 | .addReg(ImmReg, RegState::Kill) |
4133 | 1 | .addReg(TmpReg, RegState::Kill); |
4134 | 1 | break; |
4135 | 52 | } |
4136 | 0 | default: |
4137 | 0 | llvm_unreachable("unhandled s_pack_* instruction"); |
4138 | 52 | } |
4139 | 52 | |
4140 | 52 | MachineOperand &Dest = Inst.getOperand(0); |
4141 | 52 | MRI.replaceRegWith(Dest.getReg(), ResultReg); |
4142 | 52 | addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist); |
4143 | 52 | } |
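Each s_pack variant chooses one 16-bit half from each source: LL takes both low halves, LH the low of src0 and the high of src1, HH both highs. Scalar models of the three expansions above:

#include <cassert>
#include <cstdint>

static uint32_t packLL(uint32_t S0, uint32_t S1) {
  return (S0 & 0xffff) | (S1 << 16);        // V_AND + V_LSHL_OR
}
static uint32_t packLH(uint32_t S0, uint32_t S1) {
  return (S0 & 0xffff) | (S1 & 0xffff0000); // V_BFI with mask 0xffff
}
static uint32_t packHH(uint32_t S0, uint32_t S1) {
  return (S0 >> 16) | (S1 & 0xffff0000);    // V_LSHRREV + V_AND_OR
}

int main() {
  assert(packLL(0xaaaa1111u, 0xbbbb2222u) == 0x22221111u);
  assert(packLH(0xaaaa1111u, 0xbbbb2222u) == 0xbbbb1111u);
  assert(packHH(0xaaaa1111u, 0xbbbb2222u) == 0xbbbbaaaau);
}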
4144 | | |
4145 | | void SIInstrInfo::addSCCDefUsersToVALUWorklist( |
4146 | 39.0k | MachineInstr &SCCDefInst, SetVectorType &Worklist) const { |
4147 | 39.0k | // This assumes that all the users of SCC are in the same block |
4148 | 39.0k | // as the SCC def. |
4149 | 39.0k | for (MachineInstr &MI : |
4150 | 39.0k | make_range(MachineBasicBlock::iterator(SCCDefInst), |
4151 | 1.77M | SCCDefInst.getParent()->end())) { |
4152 | 1.77M | // Exit if we find another SCC def. |
4153 | 1.77M | if (MI.findRegisterDefOperandIdx(AMDGPU::SCC) != -1) |
4154 | 29.9k | return; |
4155 | 1.74M | |
4156 | 1.74M | if (1.74M MI.findRegisterUseOperandIdx(AMDGPU::SCC) != -11.74M ) |
4157 | 77 | Worklist.insert(&MI); |
4158 | 1.77M | } |
4159 | 39.0k | } |
4160 | | |
4161 | | const TargetRegisterClass *SIInstrInfo::getDestEquivalentVGPRClass( |
4162 | 105k | const MachineInstr &Inst) const { |
4163 | 105k | const TargetRegisterClass *NewDstRC = getOpRegClass(Inst, 0); |
4164 | 105k | |
4165 | 105k | switch (Inst.getOpcode()) { |
4166 | 105k | // For target instructions, getOpRegClass just returns the virtual register |
4167 | 105k | // class associated with the operand, so we need to find an equivalent VGPR |
4168 | 105k | // register class in order to move the instruction to the VALU. |
4169 | 69.1k | case AMDGPU::COPY: |
4170 | 69.1k | case AMDGPU::PHI: |
4171 | 69.1k | case AMDGPU::REG_SEQUENCE: |
4172 | 69.1k | case AMDGPU::INSERT_SUBREG: |
4173 | 69.1k | case AMDGPU::WQM: |
4174 | 69.1k | case AMDGPU::WWM: |
4175 | 69.1k | if (RI.hasVGPRs(NewDstRC)) |
4176 | 0 | return nullptr; |
4177 | 69.1k | |
4178 | 69.1k | NewDstRC = RI.getEquivalentVGPRClass(NewDstRC); |
4179 | 69.1k | if (!NewDstRC) |
4180 | 0 | return nullptr; |
4181 | 69.1k | return NewDstRC; |
4182 | 35.8k | default: |
4183 | 35.8k | return NewDstRC; |
4184 | 0 | } |
4185 | 0 | } |
4186 | | |
4187 | | // Find the one SGPR operand we are allowed to use. |
4188 | | unsigned SIInstrInfo::findUsedSGPR(const MachineInstr &MI, |
4189 | 47.7k | int OpIndices[3]) const { |
4190 | 47.7k | const MCInstrDesc &Desc = MI.getDesc(); |
4191 | 47.7k | |
4192 | 47.7k | // Find the one SGPR operand we are allowed to use. |
4193 | 47.7k | // |
4194 | 47.7k | // First we need to consider the instruction's operand requirements before |
4195 | 47.7k | // legalizing. Some operands are required to be SGPRs, such as implicit uses |
4196 | 47.7k | // of VCC, but we are still bound by the constant bus requirement to only use |
4197 | 47.7k | // one. |
4198 | 47.7k | // |
4199 | 47.7k | // If the operand's class is an SGPR, we can never move it. |
4200 | 47.7k | |
4201 | 47.7k | unsigned SGPRReg = findImplicitSGPRRead(MI); |
4202 | 47.7k | if (SGPRReg != AMDGPU::NoRegister) |
4203 | 123 | return SGPRReg; |
4204 | 47.6k | |
4205 | 47.6k | unsigned UsedSGPRs[3] = { AMDGPU::NoRegister }; |
4206 | 47.6k | const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo(); |
4207 | 47.6k | |
4208 | 147k | for (unsigned i = 0; i < 3; ++i) {
4209 | 138k | int Idx = OpIndices[i]; |
4210 | 138k | if (Idx == -1) |
4211 | 32.8k | break; |
4212 | 105k | |
4213 | 105k | const MachineOperand &MO = MI.getOperand(Idx); |
4214 | 105k | if (!MO.isReg()) |
4215 | 10.8k | continue; |
4216 | 95.0k | |
4217 | 95.0k | // Is this operand statically required to be an SGPR based on the operand |
4218 | 95.0k | // constraints? |
4219 | 95.0k | const TargetRegisterClass *OpRC = RI.getRegClass(Desc.OpInfo[Idx].RegClass); |
4220 | 95.0k | bool IsRequiredSGPR = RI.isSGPRClass(OpRC); |
4221 | 95.0k | if (IsRequiredSGPR) |
4222 | 6.45k | return MO.getReg(); |
4223 | 88.6k | |
4224 | 88.6k | // If this could be a VGPR or an SGPR, check the dynamic register class.
4225 | 88.6k | unsigned Reg = MO.getReg(); |
4226 | 88.6k | const TargetRegisterClass *RegRC = MRI.getRegClass(Reg); |
4227 | 88.6k | if (RI.isSGPRClass(RegRC)) |
4228 | 41.4k | UsedSGPRs[i] = Reg; |
4229 | 138k | } |
4230 | 47.6k | |
4231 | 47.6k | // We don't have a required SGPR operand, so we have a bit more freedom in |
4232 | 47.6k | // selecting operands to move. |
4233 | 47.6k | |
4234 | 47.6k | // Try to select the most used SGPR. If an SGPR is equal to one of the |
4235 | 47.6k | // others, we choose that. |
4236 | 47.6k | // |
4237 | 47.6k | // e.g. |
4238 | 47.6k | // V_FMA_F32 v0, s0, s0, s0 -> No moves |
4239 | 47.6k | // V_FMA_F32 v0, s0, s1, s0 -> Move s1 |
4240 | 47.6k | |
4241 | 47.6k | // TODO: If some of the operands are 64-bit SGPRs and some 32, we should |
4242 | 47.6k | // prefer those. |
4243 | 47.6k | |
4244 | 41.1k | if (UsedSGPRs[0] != AMDGPU::NoRegister) {
4245 | 17.1k | if (UsedSGPRs[0] == UsedSGPRs[1] || UsedSGPRs[0] == UsedSGPRs[2])
4246 | 351 | SGPRReg = UsedSGPRs[0]; |
4247 | 17.1k | } |
4248 | 41.1k | |
4249 | 41.1k | if (SGPRReg == AMDGPU::NoRegister && UsedSGPRs[1] != AMDGPU::NoRegister) {
4250 | 13.7k | if (UsedSGPRs[1] == UsedSGPRs[2]) |
4251 | 8 | SGPRReg = UsedSGPRs[1]; |
4252 | 13.7k | } |
4253 | 41.1k | |
4254 | 41.1k | return SGPRReg; |
4255 | 47.7k | } |
4256 | | |
4257 | | MachineOperand *SIInstrInfo::getNamedOperand(MachineInstr &MI, |
4258 | 2.53M | unsigned OperandName) const { |
4259 | 2.53M | int Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), OperandName); |
4260 | 2.53M | if (Idx == -1) |
4261 | 607k | return nullptr; |
4262 | 1.92M | |
4263 | 1.92M | return &MI.getOperand(Idx); |
4264 | 1.92M | } |
4265 | | |
4266 | 22.5k | uint64_t SIInstrInfo::getDefaultRsrcDataFormat() const { |
4267 | 22.5k | uint64_t RsrcDataFormat = AMDGPU::RSRC_DATA_FORMAT; |
4268 | 22.5k | if (ST.isAmdHsaOS()) {
4269 | 436 | // Set ATC = 1. GFX9 doesn't have this bit. |
4270 | 436 | if (ST.getGeneration() <= SISubtarget::VOLCANIC_ISLANDS) |
4271 | 341 | RsrcDataFormat |= (1ULL << 56); |
4272 | 436 | |
4273 | 436 | // Set MTYPE = 2 (MTYPE_UC = uncached). GFX9 doesn't have this. |
4274 | 436 | // BTW, it disables TC L2 and therefore decreases performance. |
4275 | 436 | if (ST.getGeneration() == SISubtarget::VOLCANIC_ISLANDS) |
4276 | 236 | RsrcDataFormat |= (2ULL << 59); |
4277 | 436 | } |
4278 | 22.5k | |
4279 | 22.5k | return RsrcDataFormat; |
4280 | 22.5k | } |
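The HSA adjustments set single fields in the 64-bit descriptor word: the ATC bit at position 56 and an MTYPE field starting at bit 59. A small sketch of the bit placement (positions taken from the shifts above; pre-GFX9 layout):

#include <cassert>
#include <cstdint>

int main() {
  uint64_t Rsrc = 0;
  Rsrc |= (1ULL << 56); // ATC = 1 (absent on GFX9).
  Rsrc |= (2ULL << 59); // MTYPE = 2, MTYPE_UC (VI only).
  assert(((Rsrc >> 56) & 1) == 1);
  assert(((Rsrc >> 59) & 3) == 2);
}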
4281 | | |
4282 | 463 | uint64_t SIInstrInfo::getScratchRsrcWords23() const { |
4283 | 463 | uint64_t Rsrc23 = getDefaultRsrcDataFormat() | |
4284 | 463 | AMDGPU::RSRC_TID_ENABLE | |
4285 | 463 | 0xffffffff; // Size; |
4286 | 463 | |
4287 | 463 | // GFX9 doesn't have ELEMENT_SIZE. |
4288 | 463 | if (ST.getGeneration() <= SISubtarget::VOLCANIC_ISLANDS) {
4289 | 397 | uint64_t EltSizeValue = Log2_32(ST.getMaxPrivateElementSize()) - 1; |
4290 | 397 | Rsrc23 |= EltSizeValue << AMDGPU::RSRC_ELEMENT_SIZE_SHIFT; |
4291 | 397 | } |
4292 | 463 | |
4293 | 463 | // IndexStride = 64. |
4294 | 463 | Rsrc23 |= UINT64_C(3) << AMDGPU::RSRC_INDEX_STRIDE_SHIFT; |
4295 | 463 | |
4296 | 463 | // If TID_ENABLE is set, DATA_FORMAT specifies stride bits [14:17]. |
4297 | 463 | // Clear them unless we want a huge stride. |
4298 | 463 | if (ST.getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) |
4299 | 232 | Rsrc23 &= ~AMDGPU::RSRC_DATA_FORMAT; |
4300 | 463 | |
4301 | 463 | return Rsrc23; |
4302 | 463 | } |
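Worked example of the ELEMENT_SIZE encoding above: the field stores the log2 of the maximum private element size minus one, so a 4-byte element size encodes as 1. A sketch (log2u is a hypothetical stand-in for Log2_32):

#include <cassert>
#include <cstdint>

static uint64_t log2u(uint64_t V) {
  uint64_t R = 0;
  while (V >>= 1)
    ++R;
  return R;
}

int main() {
  unsigned MaxPrivateElementSize = 4; // bytes, assumed for illustration
  uint64_t EltSizeValue = log2u(MaxPrivateElementSize) - 1;
  assert(EltSizeValue == 1);
}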
4303 | | |
4304 | 60 | bool SIInstrInfo::isLowLatencyInstruction(const MachineInstr &MI) const { |
4305 | 60 | unsigned Opc = MI.getOpcode(); |
4306 | 60 | |
4307 | 60 | return isSMRD(Opc); |
4308 | 60 | } |
4309 | | |
4310 | 14 | bool SIInstrInfo::isHighLatencyInstruction(const MachineInstr &MI) const { |
4311 | 14 | unsigned Opc = MI.getOpcode(); |
4312 | 14 | |
4313 | 14 | return isMUBUF(Opc) || isMTBUF(Opc) || isMIMG(Opc);
4314 | 14 | } |
4315 | | |
4316 | | unsigned SIInstrInfo::isStackAccess(const MachineInstr &MI, |
4317 | 2.78k | int &FrameIndex) const { |
4318 | 2.78k | const MachineOperand *Addr = getNamedOperand(MI, AMDGPU::OpName::vaddr); |
4319 | 2.78k | if (!Addr || !Addr->isFI())
4320 | 723 | return AMDGPU::NoRegister; |
4321 | 2.05k | |
4322 | 2.78k | assert(!MI.memoperands_empty() && |
4323 | 2.05k | (*MI.memoperands_begin())->getAddrSpace() == AMDGPUASI.PRIVATE_ADDRESS); |
4324 | 2.05k | |
4325 | 2.05k | FrameIndex = Addr->getIndex(); |
4326 | 2.05k | return getNamedOperand(MI, AMDGPU::OpName::vdata)->getReg(); |
4327 | 2.05k | } |
4328 | | |
4329 | | unsigned SIInstrInfo::isSGPRStackAccess(const MachineInstr &MI, |
4330 | 50 | int &FrameIndex) const { |
4331 | 50 | const MachineOperand *Addr = getNamedOperand(MI, AMDGPU::OpName::addr); |
4332 | 50 | assert(Addr && Addr->isFI()); |
4333 | 50 | FrameIndex = Addr->getIndex(); |
4334 | 50 | return getNamedOperand(MI, AMDGPU::OpName::data)->getReg(); |
4335 | 50 | } |
4336 | | |
4337 | | unsigned SIInstrInfo::isLoadFromStackSlot(const MachineInstr &MI, |
4338 | 13.1k | int &FrameIndex) const { |
4339 | 13.1k | if (!MI.mayLoad()) |
4340 | 10.9k | return AMDGPU::NoRegister; |
4341 | 2.20k | |
4342 | 2.20k | if (2.20k isMUBUF(MI) || 2.20k isVGPRSpill(MI)810 ) |
4343 | 1.40k | return isStackAccess(MI, FrameIndex); |
4344 | 799 | |
4345 | 799 | if (isSGPRSpill(MI))
4346 | 50 | return isSGPRStackAccess(MI, FrameIndex); |
4347 | 749 | |
4348 | 749 | return AMDGPU::NoRegister; |
4349 | 749 | } |
4350 | | |
4351 | | unsigned SIInstrInfo::isStoreToStackSlot(const MachineInstr &MI, |
4352 | 7.99k | int &FrameIndex) const { |
4353 | 7.99k | if (!MI.mayStore()) |
4354 | 6.18k | return AMDGPU::NoRegister; |
4355 | 1.80k | |
4356 | 1.80k | if (1.80k isMUBUF(MI) || 1.80k isVGPRSpill(MI)428 ) |
4357 | 1.37k | return isStackAccess(MI, FrameIndex); |
4358 | 428 | |
4359 | 428 | if (isSGPRSpill(MI))
4360 | 0 | return isSGPRStackAccess(MI, FrameIndex); |
4361 | 428 | |
4362 | 428 | return AMDGPU::NoRegister; |
4363 | 428 | } |
4364 | | |
4365 | 623k | unsigned SIInstrInfo::getInstSizeInBytes(const MachineInstr &MI) const { |
4366 | 623k | unsigned Opc = MI.getOpcode(); |
4367 | 623k | const MCInstrDesc &Desc = getMCOpcodeFromPseudo(Opc); |
4368 | 623k | unsigned DescSize = Desc.getSize(); |
4369 | 623k | |
4370 | 623k | // If we have a definitive size, we can use it. Otherwise we need to inspect |
4371 | 623k | // the operands to know the size. |
4372 | 623k | // |
4373 | 623k | // FIXME: Instructions that have a base 32-bit encoding report their size as |
4374 | 623k | // 4, even though they are really 8 bytes if they have a literal operand. |
4375 | 623k | if (DescSize != 0 && DescSize != 4)
4376 | 173k | return DescSize; |
4377 | 449k | |
4378 | 449k | // 4-byte instructions may have a 32-bit literal encoded after them. Check |
4379 | 449k | // operands that could ever be literals.
4380 | 449k | if (isVALU(MI) || isSALU(MI)) {
4381 | 411k | if (isFixedSize(MI)) |
4382 | 1.00k | return DescSize; |
4383 | 410k | |
4384 | 410k | int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0); |
4385 | 410k | if (Src0Idx == -1) |
4386 | 113k | return 4; // No operands. |
4387 | 296k | |
4388 | 296k | if (isLiteralConstantLike(MI.getOperand(Src0Idx), Desc.OpInfo[Src0Idx]))
4389 | 29.4k | return 8; |
4390 | 266k | |
4391 | 266k | int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1); |
4392 | 266k | if (Src1Idx == -1) |
4393 | 144k | return 4; |
4394 | 122k | |
4395 | 122k | if (isLiteralConstantLike(MI.getOperand(Src1Idx), Desc.OpInfo[Src1Idx]))
4396 | 9.29k | return 8; |
4397 | 113k | |
4398 | 113k | return 4; |
4399 | 113k | } |
4400 | 38.4k | |
4401 | 38.4k | if (DescSize == 4)
4402 | 30.8k | return 4; |
4403 | 7.57k | |
4404 | 7.57k | switch (Opc) { |
4405 | 4.94k | case TargetOpcode::IMPLICIT_DEF: |
4406 | 4.94k | case TargetOpcode::KILL: |
4407 | 4.94k | case TargetOpcode::DBG_VALUE: |
4408 | 4.94k | case TargetOpcode::BUNDLE: |
4409 | 4.94k | case TargetOpcode::EH_LABEL: |
4410 | 4.94k | return 0; |
4411 | 2.63k | case TargetOpcode::INLINEASM: { |
4412 | 2.63k | const MachineFunction *MF = MI.getParent()->getParent(); |
4413 | 2.63k | const char *AsmStr = MI.getOperand(0).getSymbolName(); |
4414 | 2.63k | return getInlineAsmLength(AsmStr, *MF->getTarget().getMCAsmInfo()); |
4415 | 4.94k | } |
4416 | 0 | default: |
4417 | 0 | llvm_unreachable("unable to find instruction size"); |
4418 | 0 | } |
4419 | 0 | } |
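The variable-size rule: a VALU or SALU instruction with a 4-byte base encoding grows to 8 bytes when src0 or src1 is a literal that cannot be encoded inline. A simplified sketch of the decision, with the operand check reduced to a hypothetical integer-only predicate (the real isLiteralConstantLike also considers floating-point inline constants and operand encoding):

#include <cassert>

// Hypothetical stand-in: integer inline constants cover [-16, 64].
static bool isLiteral(int Imm) {
  return Imm < -16 || Imm > 64;
}

static unsigned instSize(int Src0, int Src1) {
  return (isLiteral(Src0) || isLiteral(Src1)) ? 8 : 4;
}

int main() {
  assert(instSize(1, 2) == 4);    // both inlinable, base encoding only
  assert(instSize(1234, 2) == 8); // literal adds a trailing dword
}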
4420 | | |
4421 | 90 | bool SIInstrInfo::mayAccessFlatAddressSpace(const MachineInstr &MI) const { |
4422 | 90 | if (!isFLAT(MI)) |
4423 | 79 | return false; |
4424 | 11 | |
4425 | 11 | if (MI.memoperands_empty())
4426 | 6 | return true; |
4427 | 5 | |
4428 | 5 | for (const MachineMemOperand *MMO : MI.memoperands()) {
4429 | 5 | if (MMO->getAddrSpace() == AMDGPUASI.FLAT_ADDRESS) |
4430 | 2 | return true; |
4431 | 3 | } |
4432 | 3 | return false; |
4433 | 3 | } |
4434 | | |
4435 | 0 | bool SIInstrInfo::isNonUniformBranchInstr(MachineInstr &Branch) const { |
4436 | 0 | return Branch.getOpcode() == AMDGPU::SI_NON_UNIFORM_BRCOND_PSEUDO; |
4437 | 0 | } |
4438 | | |
4439 | | void SIInstrInfo::convertNonUniformIfRegion(MachineBasicBlock *IfEntry, |
4440 | 0 | MachineBasicBlock *IfEnd) const { |
4441 | 0 | MachineBasicBlock::iterator TI = IfEntry->getFirstTerminator(); |
4442 | 0 | assert(TI != IfEntry->end()); |
4443 | 0 |
4444 | 0 | MachineInstr *Branch = &(*TI); |
4445 | 0 | MachineFunction *MF = IfEntry->getParent(); |
4446 | 0 | MachineRegisterInfo &MRI = IfEntry->getParent()->getRegInfo(); |
4447 | 0 |
|
4448 | 0 | if (Branch->getOpcode() == AMDGPU::SI_NON_UNIFORM_BRCOND_PSEUDO) {
4449 | 0 | unsigned DstReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass); |
4450 | 0 | MachineInstr *SIIF = |
4451 | 0 | BuildMI(*MF, Branch->getDebugLoc(), get(AMDGPU::SI_IF), DstReg) |
4452 | 0 | .add(Branch->getOperand(0)) |
4453 | 0 | .add(Branch->getOperand(1)); |
4454 | 0 | MachineInstr *SIEND = |
4455 | 0 | BuildMI(*MF, Branch->getDebugLoc(), get(AMDGPU::SI_END_CF)) |
4456 | 0 | .addReg(DstReg); |
4457 | 0 |
4458 | 0 | IfEntry->erase(TI); |
4459 | 0 | IfEntry->insert(IfEntry->end(), SIIF); |
4460 | 0 | IfEnd->insert(IfEnd->getFirstNonPHI(), SIEND); |
4461 | 0 | } |
4462 | 0 | } |
4463 | | |
4464 | | void SIInstrInfo::convertNonUniformLoopRegion( |
4465 | 0 | MachineBasicBlock *LoopEntry, MachineBasicBlock *LoopEnd) const { |
4466 | 0 | MachineBasicBlock::iterator TI = LoopEnd->getFirstTerminator(); |
4467 | 0 | // We expect 2 terminators, one conditional and one unconditional. |
4468 | 0 | assert(TI != LoopEnd->end()); |
4469 | 0 |
4470 | 0 | MachineInstr *Branch = &(*TI); |
4471 | 0 | MachineFunction *MF = LoopEnd->getParent(); |
4472 | 0 | MachineRegisterInfo &MRI = LoopEnd->getParent()->getRegInfo(); |
4473 | 0 |
4474 | 0 | if (Branch->getOpcode() == AMDGPU::SI_NON_UNIFORM_BRCOND_PSEUDO) {
4475 | 0 |
4476 | 0 | unsigned DstReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass); |
4477 | 0 | unsigned BackEdgeReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass); |
4478 | 0 | MachineInstrBuilder HeaderPHIBuilder = |
4479 | 0 | BuildMI(*(MF), Branch->getDebugLoc(), get(TargetOpcode::PHI), DstReg); |
4480 | 0 | for (MachineBasicBlock::pred_iterator PI = LoopEntry->pred_begin(), |
4481 | 0 | E = LoopEntry->pred_end(); |
4482 | 0 | PI != E; ++PI) {
4483 | 0 | if (*PI == LoopEnd) {
4484 | 0 | HeaderPHIBuilder.addReg(BackEdgeReg); |
4485 | 0 | } else { |
4486 | 0 | MachineBasicBlock *PMBB = *PI; |
4487 | 0 | unsigned ZeroReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass); |
4488 | 0 | materializeImmediate(*PMBB, PMBB->getFirstTerminator(), DebugLoc(), |
4489 | 0 | ZeroReg, 0); |
4490 | 0 | HeaderPHIBuilder.addReg(ZeroReg); |
4491 | 0 | } |
4492 | 0 | HeaderPHIBuilder.addMBB(*PI); |
4493 | 0 | } |
4494 | 0 | MachineInstr *HeaderPhi = HeaderPHIBuilder; |
4495 | 0 | MachineInstr *SIIFBREAK = BuildMI(*(MF), Branch->getDebugLoc(), |
4496 | 0 | get(AMDGPU::SI_IF_BREAK), BackEdgeReg) |
4497 | 0 | .addReg(DstReg) |
4498 | 0 | .add(Branch->getOperand(0)); |
4499 | 0 | MachineInstr *SILOOP = |
4500 | 0 | BuildMI(*(MF), Branch->getDebugLoc(), get(AMDGPU::SI_LOOP)) |
4501 | 0 | .addReg(BackEdgeReg) |
4502 | 0 | .addMBB(LoopEntry); |
4503 | 0 |
4504 | 0 | LoopEntry->insert(LoopEntry->begin(), HeaderPhi); |
4505 | 0 | LoopEnd->erase(TI); |
4506 | 0 | LoopEnd->insert(LoopEnd->end(), SIIFBREAK); |
4507 | 0 | LoopEnd->insert(LoopEnd->end(), SILOOP); |
4508 | 0 | } |
4509 | 0 | } |
4510 | | |
4511 | | ArrayRef<std::pair<int, const char *>> |
4512 | 5 | SIInstrInfo::getSerializableTargetIndices() const { |
4513 | 5 | static const std::pair<int, const char *> TargetIndices[] = { |
4514 | 5 | {AMDGPU::TI_CONSTDATA_START, "amdgpu-constdata-start"}, |
4515 | 5 | {AMDGPU::TI_SCRATCH_RSRC_DWORD0, "amdgpu-scratch-rsrc-dword0"}, |
4516 | 5 | {AMDGPU::TI_SCRATCH_RSRC_DWORD1, "amdgpu-scratch-rsrc-dword1"}, |
4517 | 5 | {AMDGPU::TI_SCRATCH_RSRC_DWORD2, "amdgpu-scratch-rsrc-dword2"}, |
4518 | 5 | {AMDGPU::TI_SCRATCH_RSRC_DWORD3, "amdgpu-scratch-rsrc-dword3"}}; |
4519 | 5 | return makeArrayRef(TargetIndices); |
4520 | 5 | } |
4521 | | |
4522 | | /// This is used by the post-RA scheduler (SchedulePostRAList.cpp). The |
4523 | | /// post-RA version of misched uses CreateTargetMIHazardRecognizer. |
4524 | | ScheduleHazardRecognizer * |
4525 | | SIInstrInfo::CreateTargetPostRAHazardRecognizer(const InstrItineraryData *II, |
4526 | 11.4k | const ScheduleDAG *DAG) const { |
4527 | 11.4k | return new GCNHazardRecognizer(DAG->MF); |
4528 | 11.4k | } |
4529 | | |
4530 | | /// This is the hazard recognizer used at -O0 by the PostRAHazardRecognizer |
4531 | | /// pass. |
4532 | | ScheduleHazardRecognizer * |
4533 | 15.1k | SIInstrInfo::CreateTargetPostRAHazardRecognizer(const MachineFunction &MF) const { |
4534 | 15.1k | return new GCNHazardRecognizer(MF); |
4535 | 15.1k | } |
4536 | | |
4537 | | std::pair<unsigned, unsigned> |
4538 | 3 | SIInstrInfo::decomposeMachineOperandsTargetFlags(unsigned TF) const { |
4539 | 3 | return std::make_pair(TF & MO_MASK, TF & ~MO_MASK); |
4540 | 3 | } |
4541 | | |
4542 | | ArrayRef<std::pair<unsigned, const char *>> |
4543 | 4 | SIInstrInfo::getSerializableDirectMachineOperandTargetFlags() const { |
4544 | 4 | static const std::pair<unsigned, const char *> TargetFlags[] = { |
4545 | 4 | { MO_GOTPCREL, "amdgpu-gotprel" }, |
4546 | 4 | { MO_GOTPCREL32_LO, "amdgpu-gotprel32-lo" }, |
4547 | 4 | { MO_GOTPCREL32_HI, "amdgpu-gotprel32-hi" }, |
4548 | 4 | { MO_REL32_LO, "amdgpu-rel32-lo" }, |
4549 | 4 | { MO_REL32_HI, "amdgpu-rel32-hi" } |
4550 | 4 | }; |
4551 | 4 | |
4552 | 4 | return makeArrayRef(TargetFlags); |
4553 | 4 | } |
4554 | | |
4555 | 741 | bool SIInstrInfo::isBasicBlockPrologue(const MachineInstr &MI) const { |
4556 | 703 | return !MI.isTerminator() && MI.getOpcode() != AMDGPU::COPY && |
4557 | 526 | MI.modifiesRegister(AMDGPU::EXEC, &RI); |
4558 | 741 | } |
4559 | | |
4560 | | MachineInstrBuilder |
4561 | | SIInstrInfo::getAddNoCarry(MachineBasicBlock &MBB, |
4562 | | MachineBasicBlock::iterator I, |
4563 | | const DebugLoc &DL, |
4564 | 0 | unsigned DestReg) const { |
4565 | 0 | MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); |
4566 | 0 |
|
4567 | 0 | unsigned UnusedCarry = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass); |
4568 | 0 |
4569 | 0 | return BuildMI(MBB, I, DL, get(AMDGPU::V_ADD_I32_e64), DestReg) |
4570 | 0 | .addReg(UnusedCarry, RegState::Define | RegState::Dead); |
4571 | 0 | } |
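getAddNoCarry hands back a partially built V_ADD_I32_e64 whose carry-out is defined into a fresh SGPR pair marked dead; the caller appends the two data sources and simply ignores the carry. A scalar model of the resulting semantics:

#include <cassert>
#include <cstdint>

// V_ADD_I32_e64 produces a wrapping 32-bit sum plus a carry bit; here the
// carry is computed but dead, matching the RegState::Dead def above.
static uint32_t addNoCarry(uint32_t A, uint32_t B) {
  uint64_t Wide = static_cast<uint64_t>(A) + B;
  bool Carry = (Wide >> 32) != 0;
  (void)Carry; // defined, never used
  return static_cast<uint32_t>(Wide);
}

int main() {
  assert(addNoCarry(0xffffffffu, 1u) == 0u);
}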