/Users/buildslave/jenkins/workspace/clang-stage2-coverage-R/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
Line | Count | Source
1 | | //===- AMDGPURegisterBankInfo.cpp -------------------------------*- C++ -*-==// |
2 | | // |
3 | | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
4 | | // See https://llvm.org/LICENSE.txt for license information. |
5 | | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
6 | | // |
7 | | //===----------------------------------------------------------------------===// |
8 | | /// \file |
9 | | /// This file implements the targeting of the RegisterBankInfo class for |
10 | | /// AMDGPU. |
11 | | /// \todo This should be generated by TableGen. |
12 | | //===----------------------------------------------------------------------===// |
13 | | |
14 | | #include "AMDGPURegisterBankInfo.h" |
15 | | #include "AMDGPUInstrInfo.h" |
16 | | #include "AMDGPUSubtarget.h" |
17 | | #include "MCTargetDesc/AMDGPUMCTargetDesc.h" |
18 | | #include "SIMachineFunctionInfo.h" |
19 | | #include "SIRegisterInfo.h" |
20 | | #include "llvm/ADT/SmallSet.h" |
21 | | #include "llvm/CodeGen/GlobalISel/LegalizerHelper.h" |
22 | | #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h" |
23 | | #include "llvm/CodeGen/GlobalISel/RegisterBank.h" |
24 | | #include "llvm/CodeGen/GlobalISel/RegisterBankInfo.h" |
25 | | #include "llvm/CodeGen/TargetRegisterInfo.h" |
26 | | #include "llvm/CodeGen/TargetSubtargetInfo.h" |
27 | | #include "llvm/IR/Constants.h" |
28 | | |
29 | | #define GET_TARGET_REGBANK_IMPL |
30 | | #include "AMDGPUGenRegisterBank.inc" |
31 | | |
32 | | // This file will be TableGen'ed at some point. |
33 | | #include "AMDGPUGenRegisterBankInfo.def" |
34 | | |
35 | | using namespace llvm; |
36 | | |
37 | | namespace { |
38 | | |
39 | | // Observer to apply a register bank to new registers created by LegalizerHelper. |
40 | | class ApplyRegBankMapping final : public GISelChangeObserver { |
41 | | private: |
42 | | MachineRegisterInfo &MRI; |
43 | | const RegisterBank *NewBank; |
44 | | SmallVector<MachineInstr *, 4> NewInsts; |
45 | | |
46 | | public: |
47 | | ApplyRegBankMapping(MachineRegisterInfo &MRI_, const RegisterBank *RB) |
48 | 48 | : MRI(MRI_), NewBank(RB) {} |
49 | | |
50 | 48 | ~ApplyRegBankMapping() { |
51 | 48 | for (MachineInstr *MI : NewInsts) |
52 | 528 | applyBank(*MI); |
53 | 48 | } |
54 | | |
55 | | /// Set any registers that don't have a set register class or bank to SALU. |
56 | 528 | void applyBank(MachineInstr &MI) { |
57 | 1.39k | for (MachineOperand &Op : MI.operands()) { |
58 | 1.39k | if (!Op.isReg()) |
59 | 208 | continue; |
60 | 1.18k | |
61 | 1.18k | Register Reg = Op.getReg(); |
62 | 1.18k | if (MRI.getRegClassOrRegBank(Reg)) |
63 | 976 | continue; |
64 | 208 | |
65 | 208 | const RegisterBank *RB = NewBank; |
66 | 208 | // FIXME: This might not be enough to detect when SCC should be used. |
67 | 208 | if (MRI.getType(Reg) == LLT::scalar(1)) |
68 | 32 | RB = (NewBank == &AMDGPU::SGPRRegBank ? |
69 | 32 | &AMDGPU::SCCRegBank : &AMDGPU::VCCRegBank); |
70 | 208 | |
71 | 208 | MRI.setRegBank(Reg, *RB); |
72 | 208 | } |
73 | 528 | } |
74 | | |
75 | 0 | void erasingInstr(MachineInstr &MI) override {} |
76 | | |
77 | 528 | void createdInstr(MachineInstr &MI) override { |
78 | 528 | // At this point, the instruction was just inserted and has no operands. |
79 | 528 | NewInsts.push_back(&MI); |
80 | 528 | } |
81 | | |
82 | 16 | void changingInstr(MachineInstr &MI) override {} |
83 | 16 | void changedInstr(MachineInstr &MI) override {} |
84 | | }; |
85 | | |
86 | | } |
87 | | AMDGPURegisterBankInfo::AMDGPURegisterBankInfo(const TargetRegisterInfo &TRI) |
88 | | : AMDGPUGenRegisterBankInfo(), |
89 | 3.64k | TRI(static_cast<const SIRegisterInfo*>(&TRI)) { |
90 | 3.64k | |
91 | 3.64k | // HACK: Until this is fully tablegen'd. |
92 | 3.64k | static bool AlreadyInit = false; |
93 | 3.64k | if (AlreadyInit) |
94 | 114 | return; |
95 | 3.52k | |
96 | 3.52k | AlreadyInit = true; |
97 | 3.52k | |
98 | 3.52k | const RegisterBank &RBSGPR = getRegBank(AMDGPU::SGPRRegBankID); |
99 | 3.52k | (void)RBSGPR; |
100 | 3.52k | assert(&RBSGPR == &AMDGPU::SGPRRegBank); |
101 | 3.52k | |
102 | 3.52k | const RegisterBank &RBVGPR = getRegBank(AMDGPU::VGPRRegBankID); |
103 | 3.52k | (void)RBVGPR; |
104 | 3.52k | assert(&RBVGPR == &AMDGPU::VGPRRegBank); |
105 | 3.52k | |
106 | 3.52k | } |
107 | | |
108 | | unsigned AMDGPURegisterBankInfo::copyCost(const RegisterBank &Dst, |
109 | | const RegisterBank &Src, |
110 | 4.82k | unsigned Size) const { |
111 | 4.82k | // TODO: Should there be a UniformVGPRRegBank which can use readfirstlane? |
112 | 4.82k | if (Dst.getID() == AMDGPU::SGPRRegBankID && |
113 | 4.82k | Src.getID() == AMDGPU::VGPRRegBankID) { |
114 | 289 | return std::numeric_limits<unsigned>::max(); |
115 | 289 | } |
116 | 4.53k | |
117 | 4.53k | // Bool values are tricky, because the meaning is based on context. The SCC |
118 | 4.53k | // and VCC banks are for the natural scalar and vector conditions produced by |
119 | 4.53k | // a compare. |
120 | 4.53k | // |
121 | 4.53k | // Legalization doesn't know about the necessary context, so an s1 use may |
122 | 4.53k | // have been a truncate from an arbitrary value, in which case a copy (lowered |
123 | 4.53k | // as a compare with 0) needs to be inserted. |
124 | 4.53k | if (Size == 1 && |
125 | 4.53k | (Dst.getID() == AMDGPU::SCCRegBankID || |
126 | 344 | Dst.getID() == AMDGPU::SGPRRegBankID) && |
127 | 4.53k | (Src.getID() == AMDGPU::SGPRRegBankID || |
128 | 156 | Src.getID() == AMDGPU::VGPRRegBankID || |
129 | 156 | Src.getID() == AMDGPU::VCCRegBankID)) |
130 | 117 | return std::numeric_limits<unsigned>::max(); |
131 | 4.41k | |
132 | 4.41k | if (Dst.getID() == AMDGPU::SCCRegBankID && |
133 | 4.41k | Src.getID() == AMDGPU::VCCRegBankID) |
134 | 0 | return std::numeric_limits<unsigned>::max(); |
135 | 4.41k | |
136 | 4.41k | return RegisterBankInfo::copyCost(Dst, Src, Size); |
137 | 4.41k | } |
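A condensed reading of the branches above; "RBI" stands in for an AMDGPURegisterBankInfo instance, and the values are what the code implies rather than anything measured:

  // A VGPR -> SGPR copy is priced out entirely: the value has to be legalized
  // (e.g. via readfirstlane) instead of copied.
  unsigned C0 = RBI.copyCost(AMDGPU::SGPRRegBank, AMDGPU::VGPRRegBank, 32);
  assert(C0 == std::numeric_limits<unsigned>::max());

  // A 1-bit (bool) value moving from VCC into SCC or an SGPR is also priced
  // out, since it lowers as a compare with 0 rather than a plain copy.
  unsigned C1 = RBI.copyCost(AMDGPU::SCCRegBank, AMDGPU::VCCRegBank, 1);
  assert(C1 == std::numeric_limits<unsigned>::max());

  // Every other bank pair falls through to the generic RegisterBankInfo::copyCost.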
138 | | |
139 | | unsigned AMDGPURegisterBankInfo::getBreakDownCost( |
140 | | const ValueMapping &ValMapping, |
141 | 255 | const RegisterBank *CurBank) const { |
142 | 255 | // Check if this is a breakdown for G_LOAD to move the pointer from SGPR to |
143 | 255 | // VGPR. |
144 | 255 | // FIXME: Is there a better way to do this? |
145 | 255 | if (ValMapping.NumBreakDowns >= 2 || ValMapping.BreakDown[0].Length >= 64) |
146 | 255 | return 10; // This is expensive. |
147 | 0 | |
148 | 0 | assert(ValMapping.NumBreakDowns == 2 && |
149 | 0 | ValMapping.BreakDown[0].Length == 32 && |
150 | 0 | ValMapping.BreakDown[0].StartIdx == 0 && |
151 | 0 | ValMapping.BreakDown[1].Length == 32 && |
152 | 0 | ValMapping.BreakDown[1].StartIdx == 32 && |
153 | 0 | ValMapping.BreakDown[0].RegBank == ValMapping.BreakDown[1].RegBank); |
154 | 0 |
155 | 0 | // 32-bit extract of a 64-bit value is just access of a subregister, so free. |
156 | 0 | // TODO: Cost of 0 hits assert, though it's not clear it's what we really |
157 | 0 | // want. |
158 | 0 |
159 | 0 | // TODO: 32-bit insert to a 64-bit SGPR may incur a non-free copy due to SGPR |
160 | 0 | // alignment restrictions, but this probably isn't important. |
161 | 0 | return 1; |
162 | 0 | } |
163 | | |
164 | | const RegisterBank &AMDGPURegisterBankInfo::getRegBankFromRegClass( |
165 | 8.49k | const TargetRegisterClass &RC) const { |
166 | 8.49k | |
167 | 8.49k | if (TRI->isSGPRClass(&RC)) |
168 | 4.64k | return getRegBank(AMDGPU::SGPRRegBankID); |
169 | 3.84k | |
170 | 3.84k | return getRegBank(AMDGPU::VGPRRegBankID); |
171 | 3.84k | } |
172 | | |
173 | | template <unsigned NumOps> |
174 | | RegisterBankInfo::InstructionMappings |
175 | | AMDGPURegisterBankInfo::addMappingFromTable( |
176 | | const MachineInstr &MI, const MachineRegisterInfo &MRI, |
177 | | const std::array<unsigned, NumOps> RegSrcOpIdx, |
178 | 57 | ArrayRef<OpRegBankEntry<NumOps>> Table) const { |
179 | 57 | |
180 | 57 | InstructionMappings AltMappings; |
181 | 57 | |
182 | 57 | SmallVector<const ValueMapping *, 10> Operands(MI.getNumOperands()); |
183 | 57 | |
184 | 57 | unsigned Sizes[NumOps]; |
185 | 225 | for (unsigned I = 0; I < NumOps; ++I) { |
186 | 168 | Register Reg = MI.getOperand(RegSrcOpIdx[I]).getReg(); |
187 | 168 | Sizes[I] = getSizeInBits(Reg, MRI, *TRI); |
188 | 168 | } |
189 | 57 | |
190 | 110 | for (unsigned I = 0, E = MI.getNumExplicitDefs(); I != E; ++I) { |
191 | 53 | unsigned SizeI = getSizeInBits(MI.getOperand(I).getReg(), MRI, *TRI); |
192 | 53 | Operands[I] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, SizeI); |
193 | 53 | } |
194 | 57 | |
195 | 57 | unsigned MappingID = 0; |
196 | 196 | for (const auto &Entry : Table) { |
197 | 788 | for (unsigned I = 0; I < NumOps; ++I592 ) { |
198 | 592 | int OpIdx = RegSrcOpIdx[I]; |
199 | 592 | Operands[OpIdx] = AMDGPU::getValueMapping(Entry.RegBanks[I], Sizes[I]); |
200 | 592 | } |
201 | 196 | |
202 | 196 | AltMappings.push_back(&getInstructionMapping(MappingID++, Entry.Cost, |
203 | 196 | getOperandsMapping(Operands), |
204 | 196 | Operands.size())); |
205 | 196 | } |
206 | 57 | |
207 | 57 | return AltMappings; |
208 | 57 | } |
Per-instantiation coverage for addMappingFromTable (the duplicated source sub-tables are collapsed to their entry counts):
  addMappingFromTable<3u>: entered 48 times
  addMappingFromTable<4u>: entered 5 times
  addMappingFromTable<1u>: entered 4 times
  Unexecuted instantiation: addMappingFromTable<2u>
209 | | |
210 | | RegisterBankInfo::InstructionMappings |
211 | | AMDGPURegisterBankInfo::getInstrAlternativeMappingsIntrinsic( |
212 | 51 | const MachineInstr &MI, const MachineRegisterInfo &MRI) const { |
213 | 51 | switch (MI.getOperand(MI.getNumExplicitDefs()).getIntrinsicID()) { |
214 | 51 | case Intrinsic::amdgcn_readlane: { |
215 | 4 | static const OpRegBankEntry<3> Table[2] = { |
216 | 4 | // Perfectly legal. |
217 | 4 | { { AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID }, 1 }, |
218 | 4 | |
219 | 4 | // Need a readfirstlane for the index. |
220 | 4 | { { AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 2 } |
221 | 4 | }; |
222 | 4 | |
223 | 4 | const std::array<unsigned, 3> RegSrcOpIdx = { { 0, 2, 3 } }; |
224 | 4 | return addMappingFromTable<3>(MI, MRI, RegSrcOpIdx, makeArrayRef(Table)); |
225 | 51 | } |
226 | 51 | case Intrinsic::amdgcn_writelane: { |
227 | 5 | static const OpRegBankEntry<4> Table[4] = { |
228 | 5 | // Perfectly legal. |
229 | 5 | { { AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID }, 1 }, |
230 | 5 | |
231 | 5 | // Need readfirstlane of first op |
232 | 5 | { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID }, 2 }, |
233 | 5 | |
234 | 5 | // Need readfirstlane of second op |
235 | 5 | { { AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 2 }, |
236 | 5 | |
237 | 5 | // Need readfirstlane of both ops |
238 | 5 | { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 3 } |
239 | 5 | }; |
240 | 5 | |
241 | 5 | // rsrc, voffset, offset |
242 | 5 | const std::array<unsigned, 4> RegSrcOpIdx = { { 0, 2, 3, 4 } }; |
243 | 5 | return addMappingFromTable<4>(MI, MRI, RegSrcOpIdx, makeArrayRef(Table)); |
244 | 51 | } |
245 | 51 | default: |
246 | 42 | return RegisterBankInfo::getInstrAlternativeMappings(MI); |
247 | 51 | } |
248 | 51 | } |
249 | | |
250 | | RegisterBankInfo::InstructionMappings |
251 | | AMDGPURegisterBankInfo::getInstrAlternativeMappingsIntrinsicWSideEffects( |
252 | 48 | const MachineInstr &MI, const MachineRegisterInfo &MRI) const { |
253 | 48 | |
254 | 48 | switch (MI.getOperand(MI.getNumExplicitDefs()).getIntrinsicID()) { |
255 | 48 | case Intrinsic::amdgcn_buffer_load: { |
256 | 8 | static const OpRegBankEntry<3> Table[4] = { |
257 | 8 | // Perfectly legal. |
258 | 8 | { { AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID }, 1 }, |
259 | 8 | { { AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 1 }, |
260 | 8 | |
261 | 8 | // Waterfall loop needed for rsrc. In the worst case this will execute |
262 | 8 | // approximately an extra 10 * wavesize + 2 instructions. |
263 | 8 | { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID }, 1000 }, |
264 | 8 | { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 1000 } |
265 | 8 | }; |
266 | 8 | |
267 | 8 | // rsrc, voffset, offset |
268 | 8 | const std::array<unsigned, 3> RegSrcOpIdx = { { 2, 3, 4 } }; |
269 | 8 | return addMappingFromTable<3>(MI, MRI, RegSrcOpIdx, makeArrayRef(Table)); |
270 | 48 | } |
271 | 48 | case Intrinsic::amdgcn_s_buffer_load: { |
272 | 0 | static const OpRegBankEntry<2> Table[4] = { |
273 | 0 | // Perfectly legal. |
274 | 0 | { { AMDGPU::SGPRRegBankID, AMDGPU::SGPRRegBankID }, 1 }, |
275 | 0 |
276 | 0 | // Only need 1 register in loop |
277 | 0 | { { AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID }, 300 }, |
278 | 0 |
279 | 0 | // Have to waterfall the resource. |
280 | 0 | { { AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID }, 1000 }, |
281 | 0 |
282 | 0 | // Have to waterfall the resource, and the offset. |
283 | 0 | { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 1500 } |
284 | 0 | }; |
285 | 0 |
286 | 0 | // rsrc, offset |
287 | 0 | const std::array<unsigned, 2> RegSrcOpIdx = { { 2, 3 } }; |
288 | 0 | return addMappingFromTable<2>(MI, MRI, RegSrcOpIdx, makeArrayRef(Table)); |
289 | 48 | } |
290 | 48 | case Intrinsic::amdgcn_ds_ordered_add: |
291 | 8 | case Intrinsic::amdgcn_ds_ordered_swap: { |
292 | 8 | // VGPR = M0, VGPR |
293 | 8 | static const OpRegBankEntry<3> Table[2] = { |
294 | 8 | // Perfectly legal. |
295 | 8 | { { AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID }, 1 }, |
296 | 8 | |
297 | 8 | // Need a readfirstlane for m0 |
298 | 8 | { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 2 } |
299 | 8 | }; |
300 | 8 | |
301 | 8 | const std::array<unsigned, 3> RegSrcOpIdx = { { 0, 2, 3 } }; |
302 | 8 | return addMappingFromTable<3>(MI, MRI, RegSrcOpIdx, makeArrayRef(Table)); |
303 | 8 | } |
304 | 8 | case Intrinsic::amdgcn_s_sendmsg: |
305 | 4 | case Intrinsic::amdgcn_s_sendmsghalt: { |
306 | 4 | static const OpRegBankEntry<1> Table[2] = { |
307 | 4 | // Perfectly legal. |
308 | 4 | { { AMDGPU::SGPRRegBankID }, 1 }, |
309 | 4 | |
310 | 4 | // Need readlane |
311 | 4 | { { AMDGPU::VGPRRegBankID }, 3 } |
312 | 4 | }; |
313 | 4 | |
314 | 4 | const std::array<unsigned, 1> RegSrcOpIdx = { { 2 } }; |
315 | 4 | return addMappingFromTable<1>(MI, MRI, RegSrcOpIdx, makeArrayRef(Table)); |
316 | 4 | } |
317 | 28 | default: |
318 | 28 | return RegisterBankInfo::getInstrAlternativeMappings(MI); |
319 | 48 | } |
320 | 48 | } |
321 | | |
322 | 188 | static bool isInstrUniform(const MachineInstr &MI) { |
323 | 188 | if (!MI.hasOneMemOperand()) |
324 | 0 | return false; |
325 | 188 | |
326 | 188 | const MachineMemOperand *MMO = *MI.memoperands_begin(); |
327 | 188 | return AMDGPUInstrInfo::isUniformMMO(MMO); |
328 | 188 | } |
329 | | |
330 | | RegisterBankInfo::InstructionMappings |
331 | | AMDGPURegisterBankInfo::getInstrAlternativeMappings( |
332 | 2.67k | const MachineInstr &MI) const { |
333 | 2.67k | |
334 | 2.67k | const MachineFunction &MF = *MI.getParent()->getParent(); |
335 | 2.67k | const MachineRegisterInfo &MRI = MF.getRegInfo(); |
336 | 2.67k | |
337 | 2.67k | |
338 | 2.67k | InstructionMappings AltMappings; |
339 | 2.67k | switch (MI.getOpcode()) { |
340 | 2.67k | case TargetOpcode::G_AND: |
341 | 117 | case TargetOpcode::G_OR: |
342 | 117 | case TargetOpcode::G_XOR: { |
343 | 117 | unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI); |
344 | 117 | |
345 | 117 | if (Size == 1) { |
346 | 30 | // s_{and|or|xor}_b32 set scc when the result of the 32-bit op is not 0. |
347 | 30 | const InstructionMapping &SCCMapping = getInstructionMapping( |
348 | 30 | 1, 1, getOperandsMapping( |
349 | 30 | {AMDGPU::getValueMapping(AMDGPU::SCCRegBankID, Size), |
350 | 30 | AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size), |
351 | 30 | AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size)}), |
352 | 30 | 3); // Num Operands |
353 | 30 | AltMappings.push_back(&SCCMapping); |
354 | 30 | |
355 | 30 | const InstructionMapping &SGPRMapping = getInstructionMapping( |
356 | 30 | 1, 1, getOperandsMapping( |
357 | 30 | {AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size), |
358 | 30 | AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size), |
359 | 30 | AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size)}), |
360 | 30 | 3); // Num Operands |
361 | 30 | AltMappings.push_back(&SGPRMapping); |
362 | 30 | |
363 | 30 | const InstructionMapping &VCCMapping0 = getInstructionMapping( |
364 | 30 | 2, 10, getOperandsMapping( |
365 | 30 | {AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, Size), |
366 | 30 | AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, Size), |
367 | 30 | AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, Size)}), |
368 | 30 | 3); // Num Operands |
369 | 30 | AltMappings.push_back(&VCCMapping0); |
370 | 30 | return AltMappings; |
371 | 30 | } |
372 | 87 | |
373 | 87 | if (Size != 64) |
374 | 24 | break; |
375 | 63 | |
376 | 63 | const InstructionMapping &SSMapping = getInstructionMapping( |
377 | 63 | 1, 1, getOperandsMapping( |
378 | 63 | {AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size), |
379 | 63 | AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size), |
380 | 63 | AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size)}), |
381 | 63 | 3); // Num Operands |
382 | 63 | AltMappings.push_back(&SSMapping); |
383 | 63 | |
384 | 63 | const InstructionMapping &VVMapping = getInstructionMapping( |
385 | 63 | 2, 2, getOperandsMapping( |
386 | 63 | {AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size), |
387 | 63 | AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size), |
388 | 63 | AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size)}), |
389 | 63 | 3); // Num Operands |
390 | 63 | AltMappings.push_back(&VVMapping); |
391 | 63 | |
392 | 63 | const InstructionMapping &SVMapping = getInstructionMapping( |
393 | 63 | 3, 3, getOperandsMapping( |
394 | 63 | {AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size), |
395 | 63 | AMDGPU::getValueMappingSGPR64Only(AMDGPU::SGPRRegBankID, Size), |
396 | 63 | AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size)}), |
397 | 63 | 3); // Num Operands |
398 | 63 | AltMappings.push_back(&SVMapping); |
399 | 63 | |
400 | 63 | // SGPR in the LHS is slightly preferable, so make VS more expensive than |
401 | 63 | // SV. |
402 | 63 | const InstructionMapping &VSMapping = getInstructionMapping( |
403 | 63 | 3, 4, getOperandsMapping( |
404 | 63 | {AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size), |
405 | 63 | AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size), |
406 | 63 | AMDGPU::getValueMappingSGPR64Only(AMDGPU::SGPRRegBankID, Size)}), |
407 | 63 | 3); // Num Operands |
408 | 63 | AltMappings.push_back(&VSMapping); |
409 | 63 | break; |
410 | 63 | } |
411 | 63 | case TargetOpcode::G_LOAD: { |
412 | 16 | unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI); |
413 | 16 | LLT LoadTy = MRI.getType(MI.getOperand(0).getReg()); |
414 | 16 | // FIXME: Should we be hard coding the size for these mappings? |
415 | 16 | if (isInstrUniform(MI)) { |
416 | 8 | const InstructionMapping &SSMapping = getInstructionMapping( |
417 | 8 | 1, 1, getOperandsMapping( |
418 | 8 | {AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size), |
419 | 8 | AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 64)}), |
420 | 8 | 2); // Num Operands |
421 | 8 | AltMappings.push_back(&SSMapping); |
422 | 8 | } |
423 | 16 | |
424 | 16 | const InstructionMapping &VVMapping = getInstructionMapping( |
425 | 16 | 2, 1, getOperandsMapping( |
426 | 16 | {AMDGPU::getValueMappingLoadSGPROnly(AMDGPU::VGPRRegBankID, LoadTy), |
427 | 16 | AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 64)}), |
428 | 16 | 2); // Num Operands |
429 | 16 | AltMappings.push_back(&VVMapping); |
430 | 16 | |
431 | 16 | // It may be possible to have a vgpr = load sgpr mapping here, because |
432 | 16 | // the mubuf instructions support this kind of load, but probably for only |
433 | 16 | // gfx7 and older. However, the addressing mode matching in the instruction |
434 | 16 | // selector should be able to do a better job of detecting and selecting |
435 | 16 | // these kinds of loads from the vgpr = load vgpr mapping. |
436 | 16 | |
437 | 16 | return AltMappings; |
438 | 63 | |
439 | 63 | } |
440 | 225 | case TargetOpcode::G_ICMP: { |
441 | 225 | unsigned Size = getSizeInBits(MI.getOperand(2).getReg(), MRI, *TRI); |
442 | 225 | const InstructionMapping &SSMapping = getInstructionMapping(1, 1, |
443 | 225 | getOperandsMapping({AMDGPU::getValueMapping(AMDGPU::SCCRegBankID, 1), |
444 | 225 | nullptr, // Predicate operand. |
445 | 225 | AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size), |
446 | 225 | AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size)}), |
447 | 225 | 4); // Num Operands |
448 | 225 | AltMappings.push_back(&SSMapping); |
449 | 225 | |
450 | 225 | const InstructionMapping &SVMapping = getInstructionMapping(2, 1, |
451 | 225 | getOperandsMapping({AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1), |
452 | 225 | nullptr, // Predicate operand. |
453 | 225 | AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size), |
454 | 225 | AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size)}), |
455 | 225 | 4); // Num Operands |
456 | 225 | AltMappings.push_back(&SVMapping); |
457 | 225 | |
458 | 225 | const InstructionMapping &VSMapping = getInstructionMapping(3, 1, |
459 | 225 | getOperandsMapping({AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1), |
460 | 225 | nullptr, // Predicate operand. |
461 | 225 | AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size), |
462 | 225 | AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size)}), |
463 | 225 | 4); // Num Operands |
464 | 225 | AltMappings.push_back(&VSMapping); |
465 | 225 | |
466 | 225 | const InstructionMapping &VVMapping = getInstructionMapping(4, 1, |
467 | 225 | getOperandsMapping({AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1), |
468 | 225 | nullptr, // Predicate operand. |
469 | 225 | AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size), |
470 | 225 | AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size)}), |
471 | 225 | 4); // Num Operands |
472 | 225 | AltMappings.push_back(&VVMapping); |
473 | 225 | |
474 | 225 | return AltMappings; |
475 | 63 | } |
476 | 80 | case TargetOpcode::G_SELECT: { |
477 | 80 | unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI); |
478 | 80 | const InstructionMapping &SSMapping = getInstructionMapping(1, 1, |
479 | 80 | getOperandsMapping({AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size), |
480 | 80 | AMDGPU::getValueMapping(AMDGPU::SCCRegBankID, 1), |
481 | 80 | AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size), |
482 | 80 | AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size)}), |
483 | 80 | 4); // Num Operands |
484 | 80 | AltMappings.push_back(&SSMapping); |
485 | 80 | |
486 | 80 | const InstructionMapping &VVMapping = getInstructionMapping(2, 1, |
487 | 80 | getOperandsMapping({AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size), |
488 | 80 | AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1), |
489 | 80 | AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size), |
490 | 80 | AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size)}), |
491 | 80 | 4); // Num Operands |
492 | 80 | AltMappings.push_back(&VVMapping); |
493 | 80 | |
494 | 80 | return AltMappings; |
495 | 63 | } |
496 | 63 | case TargetOpcode::G_SMIN: |
497 | 28 | case TargetOpcode::G_SMAX: |
498 | 28 | case TargetOpcode::G_UMIN: |
499 | 28 | case TargetOpcode::G_UMAX: { |
500 | 28 | static const OpRegBankEntry<3> Table[4] = { |
501 | 28 | { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 1 }, |
502 | 28 | { { AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID }, 1 }, |
503 | 28 | { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID }, 1 }, |
504 | 28 | |
505 | 28 | // Scalar requires cmp+select, and extends if 16-bit. |
506 | 28 | // FIXME: Should there be separate costs for 32 and 16-bit |
507 | 28 | { { AMDGPU::SGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::SGPRRegBankID }, 3 } |
508 | 28 | }; |
509 | 28 | |
510 | 28 | const std::array<unsigned, 3> RegSrcOpIdx = { { 0, 1, 2 } }; |
511 | 28 | return addMappingFromTable<3>(MI, MRI, RegSrcOpIdx, makeArrayRef(Table)); |
512 | 28 | } |
513 | 28 | case TargetOpcode::G_UADDE: |
514 | 20 | case TargetOpcode::G_USUBE: |
515 | 20 | case TargetOpcode::G_SADDE: |
516 | 20 | case TargetOpcode::G_SSUBE: { |
517 | 20 | unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI); |
518 | 20 | const InstructionMapping &SSMapping = getInstructionMapping(1, 1, |
519 | 20 | getOperandsMapping( |
520 | 20 | {AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size), |
521 | 20 | AMDGPU::getValueMapping(AMDGPU::SCCRegBankID, 1), |
522 | 20 | AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size), |
523 | 20 | AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size), |
524 | 20 | AMDGPU::getValueMapping(AMDGPU::SCCRegBankID, 1)}), |
525 | 20 | 5); // Num Operands |
526 | 20 | AltMappings.push_back(&SSMapping); |
527 | 20 | |
528 | 20 | const InstructionMapping &VVMapping = getInstructionMapping(2, 1, |
529 | 20 | getOperandsMapping({AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size), |
530 | 20 | AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1), |
531 | 20 | AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size), |
532 | 20 | AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size), |
533 | 20 | AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1)}), |
534 | 20 | 5); // Num Operands |
535 | 20 | AltMappings.push_back(&VVMapping); |
536 | 20 | return AltMappings; |
537 | 20 | } |
538 | 55 | case AMDGPU::G_BRCOND: { |
539 | 55 | assert(MRI.getType(MI.getOperand(0).getReg()).getSizeInBits() == 1); |
540 | 55 | |
541 | 55 | const InstructionMapping &SMapping = getInstructionMapping( |
542 | 55 | 1, 1, getOperandsMapping( |
543 | 55 | {AMDGPU::getValueMapping(AMDGPU::SCCRegBankID, 1), nullptr}), |
544 | 55 | 2); // Num Operands |
545 | 55 | AltMappings.push_back(&SMapping); |
546 | 55 | |
547 | 55 | const InstructionMapping &VMapping = getInstructionMapping( |
548 | 55 | 1, 1, getOperandsMapping( |
549 | 55 | {AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1), nullptr }), |
550 | 55 | 2); // Num Operands |
551 | 55 | AltMappings.push_back(&VMapping); |
552 | 55 | return AltMappings; |
553 | 20 | } |
554 | 51 | case AMDGPU::G_INTRINSIC: |
555 | 51 | return getInstrAlternativeMappingsIntrinsic(MI, MRI); |
556 | 48 | case AMDGPU::G_INTRINSIC_W_SIDE_EFFECTS: |
557 | 48 | return getInstrAlternativeMappingsIntrinsicWSideEffects(MI, MRI); |
558 | 2.03k | default: |
559 | 2.03k | break; |
560 | 2.12k | } |
561 | 2.12k | return RegisterBankInfo::getInstrAlternativeMappings(MI); |
562 | 2.12k | } |
563 | | |
564 | | void AMDGPURegisterBankInfo::split64BitValueForMapping( |
565 | | MachineIRBuilder &B, |
566 | | SmallVector<Register, 2> &Regs, |
567 | | LLT HalfTy, |
568 | 204 | Register Reg) const { |
569 | 204 | assert(HalfTy.getSizeInBits() == 32); |
570 | 204 | MachineRegisterInfo *MRI = B.getMRI(); |
571 | 204 | Register LoLHS = MRI->createGenericVirtualRegister(HalfTy); |
572 | 204 | Register HiLHS = MRI->createGenericVirtualRegister(HalfTy); |
573 | 204 | const RegisterBank *Bank = getRegBank(Reg, *MRI, *TRI); |
574 | 204 | MRI->setRegBank(LoLHS, *Bank); |
575 | 204 | MRI->setRegBank(HiLHS, *Bank); |
576 | 204 | |
577 | 204 | Regs.push_back(LoLHS); |
578 | 204 | Regs.push_back(HiLHS); |
579 | 204 | |
580 | 204 | B.buildInstr(AMDGPU::G_UNMERGE_VALUES) |
581 | 204 | .addDef(LoLHS) |
582 | 204 | .addDef(HiLHS) |
583 | 204 | .addUse(Reg); |
584 | 204 | } |
585 | | |
586 | | /// Replace the current type each register in \p Regs has with \p NewTy |
587 | | static void setRegsToType(MachineRegisterInfo &MRI, ArrayRef<Register> Regs, |
588 | 270 | LLT NewTy) { |
589 | 540 | for (Register Reg : Regs) { |
590 | 540 | assert(MRI.getType(Reg).getSizeInBits() == NewTy.getSizeInBits()); |
591 | 540 | MRI.setType(Reg, NewTy); |
592 | 540 | } |
593 | 270 | } |
594 | | |
595 | 190 | static LLT getHalfSizedType(LLT Ty) { |
596 | 190 | if (Ty.isVector()) { |
597 | 80 | assert(Ty.getNumElements() % 2 == 0); |
598 | 80 | return LLT::scalarOrVector(Ty.getNumElements() / 2, Ty.getElementType()); |
599 | 80 | } |
600 | 110 | |
601 | 110 | assert(Ty.getSizeInBits() % 2 == 0); |
602 | 110 | return LLT::scalar(Ty.getSizeInBits() / 2); |
603 | 110 | } |
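A few concrete cases implied by getHalfSizedType, written in MIR type notation (illustrative only):

  // <4 x s32>  ->  <2 x s32>
  // <2 x s64>  ->  s64   (scalarOrVector(1, s64) collapses a 1-element vector to a scalar)
  // s64        ->  s32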
604 | | |
605 | | /// Legalize instruction \p MI where operands in \p OpIndices must be SGPRs. If |
606 | | /// any of the required SGPR operands are VGPRs, perform a waterfall loop to |
607 | | /// execute the instruction for each unique combination of values in all lanes |
608 | | /// in the wave. The block will be split such that rest of the instructions are |
609 | | /// moved to a new block. |
610 | | /// |
611 | | /// Essentially performs this loop: |
612 | | /// |
613 | | /// Save Execution Mask |
614 | | /// For (Lane : Wavefront) { |
615 | | /// Enable Lane, Disable all other lanes |
616 | | /// SGPR = read SGPR value for current lane from VGPR |
617 | | /// VGPRResult[Lane] = use_op SGPR |
618 | | /// } |
619 | | /// Restore Execution Mask |
620 | | /// |
621 | | /// There is additional complexity in comparing the values to identify the |
622 | | /// unique values actually used. |
623 | | void AMDGPURegisterBankInfo::executeInWaterfallLoop( |
624 | | MachineInstr &MI, MachineRegisterInfo &MRI, |
625 | 34 | ArrayRef<unsigned> OpIndices) const { |
626 | 34 | MachineFunction *MF = MI.getParent()->getParent(); |
627 | 34 | const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>(); |
628 | 34 | const SIInstrInfo *TII = ST.getInstrInfo(); |
629 | 34 | MachineBasicBlock::iterator I(MI); |
630 | 34 | |
631 | 34 | MachineBasicBlock &MBB = *MI.getParent(); |
632 | 34 | const DebugLoc &DL = MI.getDebugLoc(); |
633 | 34 | |
634 | 34 | // Use a set to avoid extra readfirstlanes in the case where multiple operands |
635 | 34 | // are the same register. |
636 | 34 | SmallSet<Register, 4> SGPROperandRegs; |
637 | 42 | for (unsigned Op : OpIndices) { |
638 | 42 | assert(MI.getOperand(Op).isUse()); |
639 | 42 | Register Reg = MI.getOperand(Op).getReg(); |
640 | 42 | const RegisterBank *OpBank = getRegBank(Reg, MRI, *TRI); |
641 | 42 | if (OpBank->getID() == AMDGPU::VGPRRegBankID) |
642 | 22 | SGPROperandRegs.insert(Reg); |
643 | 42 | } |
644 | 34 | |
645 | 34 | // No operands need to be replaced, so no need to loop. |
646 | 34 | if (SGPROperandRegs.empty()) |
647 | 14 | return; |
648 | 20 | |
649 | 20 | MachineIRBuilder B(MI); |
650 | 20 | SmallVector<Register, 4> ResultRegs; |
651 | 20 | SmallVector<Register, 4> InitResultRegs; |
652 | 20 | SmallVector<Register, 4> PhiRegs; |
653 | 20 | for (MachineOperand &Def : MI.defs()) { |
654 | 20 | LLT ResTy = MRI.getType(Def.getReg()); |
655 | 20 | const RegisterBank *DefBank = getRegBank(Def.getReg(), MRI, *TRI); |
656 | 20 | ResultRegs.push_back(Def.getReg()); |
657 | 20 | Register InitReg = B.buildUndef(ResTy).getReg(0); |
658 | 20 | Register PhiReg = MRI.createGenericVirtualRegister(ResTy); |
659 | 20 | InitResultRegs.push_back(InitReg); |
660 | 20 | PhiRegs.push_back(PhiReg); |
661 | 20 | MRI.setRegBank(PhiReg, *DefBank); |
662 | 20 | MRI.setRegBank(InitReg, *DefBank); |
663 | 20 | } |
664 | 20 | |
665 | 20 | Register SaveExecReg = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass); |
666 | 20 | Register InitSaveExecReg = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass); |
667 | 20 | |
668 | 20 | // Don't bother using generic instructions/registers for the exec mask. |
669 | 20 | B.buildInstr(TargetOpcode::IMPLICIT_DEF) |
670 | 20 | .addDef(InitSaveExecReg); |
671 | 20 | |
672 | 20 | Register PhiExec = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass); |
673 | 20 | Register NewExec = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass); |
674 | 20 | |
675 | 20 | // To insert the loop we need to split the block. Move everything before this |
676 | 20 | // point to a new block, and insert a new empty block before this instruction. |
677 | 20 | MachineBasicBlock *LoopBB = MF->CreateMachineBasicBlock(); |
678 | 20 | MachineBasicBlock *RemainderBB = MF->CreateMachineBasicBlock(); |
679 | 20 | MachineBasicBlock *RestoreExecBB = MF->CreateMachineBasicBlock(); |
680 | 20 | MachineFunction::iterator MBBI(MBB); |
681 | 20 | ++MBBI; |
682 | 20 | MF->insert(MBBI, LoopBB); |
683 | 20 | MF->insert(MBBI, RestoreExecBB); |
684 | 20 | MF->insert(MBBI, RemainderBB); |
685 | 20 | |
686 | 20 | LoopBB->addSuccessor(RestoreExecBB); |
687 | 20 | LoopBB->addSuccessor(LoopBB); |
688 | 20 | |
689 | 20 | // Move the rest of the block into a new block. |
690 | 20 | RemainderBB->transferSuccessorsAndUpdatePHIs(&MBB); |
691 | 20 | RemainderBB->splice(RemainderBB->begin(), &MBB, I, MBB.end()); |
692 | 20 | |
693 | 20 | MBB.addSuccessor(LoopBB); |
694 | 20 | RestoreExecBB->addSuccessor(RemainderBB); |
695 | 20 | |
696 | 20 | B.setInsertPt(*LoopBB, LoopBB->end()); |
697 | 20 | |
698 | 20 | B.buildInstr(TargetOpcode::PHI) |
699 | 20 | .addDef(PhiExec) |
700 | 20 | .addReg(InitSaveExecReg) |
701 | 20 | .addMBB(&MBB) |
702 | 20 | .addReg(NewExec) |
703 | 20 | .addMBB(LoopBB); |
704 | 20 | |
705 | 20 | for (auto Result : zip(InitResultRegs, ResultRegs, PhiRegs)) { |
706 | 20 | B.buildInstr(TargetOpcode::G_PHI) |
707 | 20 | .addDef(std::get<2>(Result)) |
708 | 20 | .addReg(std::get<0>(Result)) // Initial value / implicit_def |
709 | 20 | .addMBB(&MBB) |
710 | 20 | .addReg(std::get<1>(Result)) // Mid-loop value. |
711 | 20 | .addMBB(LoopBB); |
712 | 20 | } |
713 | 20 | |
714 | 20 | // Move the instruction into the loop. |
715 | 20 | LoopBB->splice(LoopBB->end(), &MBB, I); |
716 | 20 | I = std::prev(LoopBB->end()); |
717 | 20 | |
718 | 20 | B.setInstr(*I); |
719 | 20 | |
720 | 20 | Register CondReg; |
721 | 20 | |
722 | 84 | for (MachineOperand &Op : MI.uses()) { |
723 | 84 | if (!Op.isReg()) |
724 | 36 | continue; |
725 | 48 | |
726 | 48 | assert(!Op.isDef()); |
727 | 48 | if (SGPROperandRegs.count(Op.getReg())) { |
728 | 22 | LLT OpTy = MRI.getType(Op.getReg()); |
729 | 22 | unsigned OpSize = OpTy.getSizeInBits(); |
730 | 22 | |
731 | 22 | // Can only do a readlane of 32-bit pieces. |
732 | 22 | if (OpSize == 32) { |
733 | 10 | // Avoid extra copies in the simple case of one 32-bit register. |
734 | 10 | Register CurrentLaneOpReg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); |
735 | 10 | MRI.setType(CurrentLaneOpReg, OpTy); |
736 | 10 | |
737 | 10 | constrainGenericRegister(Op.getReg(), AMDGPU::VGPR_32RegClass, MRI); |
738 | 10 | // Read the next variant <- also loop target. |
739 | 10 | BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), CurrentLaneOpReg) |
740 | 10 | .addReg(Op.getReg()); |
741 | 10 | |
742 | 10 | Register NewCondReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass); |
743 | 10 | bool First = CondReg == AMDGPU::NoRegister; |
744 | 10 | if (First) |
745 | 8 | CondReg = NewCondReg; |
746 | 10 | |
747 | 10 | // Compare the just read M0 value to all possible Idx values. |
748 | 10 | B.buildInstr(AMDGPU::V_CMP_EQ_U32_e64) |
749 | 10 | .addDef(NewCondReg) |
750 | 10 | .addReg(CurrentLaneOpReg) |
751 | 10 | .addReg(Op.getReg()); |
752 | 10 | Op.setReg(CurrentLaneOpReg); |
753 | 10 | |
754 | 10 | if (!First) { |
755 | 2 | Register AndReg = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass); |
756 | 2 | |
757 | 2 | // If there are multiple operands to consider, and the conditions. |
758 | 2 | B.buildInstr(AMDGPU::S_AND_B64) |
759 | 2 | .addDef(AndReg) |
760 | 2 | .addReg(NewCondReg) |
761 | 2 | .addReg(CondReg); |
762 | 2 | CondReg = AndReg; |
763 | 2 | } |
764 | 12 | } else { |
765 | 12 | LLT S32 = LLT::scalar(32); |
766 | 12 | SmallVector<Register, 8> ReadlanePieces; |
767 | 12 | |
768 | 12 | // The compares can be done as 64-bit, but the extract needs to be done |
769 | 12 | // in 32-bit pieces. |
770 | 12 | |
771 | 12 | bool Is64 = OpSize % 64 == 0; |
772 | 12 | |
773 | 12 | LLT UnmergeTy = OpSize % 64 == 0 ? LLT::scalar(64) : LLT::scalar(32); |
774 | 12 | unsigned CmpOp = OpSize % 64 == 0 ? AMDGPU::V_CMP_EQ_U64_e64 |
775 | 12 | : AMDGPU::V_CMP_EQ_U32_e64; |
776 | 12 | |
777 | 12 | // The compares can be done as 64-bit, but the extract needs to be done |
778 | 12 | // in 32-bit pieces. |
779 | 12 | |
780 | 12 | // Insert the unmerge before the loop. |
781 | 12 | |
782 | 12 | B.setMBB(MBB); |
783 | 12 | auto Unmerge = B.buildUnmerge(UnmergeTy, Op.getReg()); |
784 | 12 | B.setInstr(*I); |
785 | 12 | |
786 | 12 | unsigned NumPieces = Unmerge->getNumOperands() - 1; |
787 | 36 | for (unsigned PieceIdx = 0; PieceIdx != NumPieces; ++PieceIdx) { |
788 | 24 | unsigned UnmergePiece = Unmerge.getReg(PieceIdx); |
789 | 24 | |
790 | 24 | Register CurrentLaneOpReg; |
791 | 24 | if (Is64) { |
792 | 24 | Register CurrentLaneOpRegLo = MRI.createGenericVirtualRegister(S32); |
793 | 24 | Register CurrentLaneOpRegHi = MRI.createGenericVirtualRegister(S32); |
794 | 24 | |
795 | 24 | MRI.setRegClass(UnmergePiece, &AMDGPU::VReg_64RegClass); |
796 | 24 | MRI.setRegClass(CurrentLaneOpRegLo, &AMDGPU::SReg_32_XM0RegClass); |
797 | 24 | MRI.setRegClass(CurrentLaneOpRegHi, &AMDGPU::SReg_32_XM0RegClass); |
798 | 24 | |
799 | 24 | // Read the next variant <- also loop target. |
800 | 24 | BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), |
801 | 24 | CurrentLaneOpRegLo) |
802 | 24 | .addReg(UnmergePiece, 0, AMDGPU::sub0); |
803 | 24 | |
804 | 24 | // Read the next variant <- also loop target. |
805 | 24 | BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), |
806 | 24 | CurrentLaneOpRegHi) |
807 | 24 | .addReg(UnmergePiece, 0, AMDGPU::sub1); |
808 | 24 | |
809 | 24 | CurrentLaneOpReg = |
810 | 24 | B.buildMerge(LLT::scalar(64), |
811 | 24 | {CurrentLaneOpRegLo, CurrentLaneOpRegHi}) |
812 | 24 | .getReg(0); |
813 | 24 | |
814 | 24 | MRI.setRegClass(CurrentLaneOpReg, &AMDGPU::SReg_64_XEXECRegClass); |
815 | 24 | |
816 | 24 | if (OpTy.getScalarSizeInBits() == 64) { |
817 | 0 | // If we need to produce a 64-bit element vector, so use the |
818 | 0 | // merged pieces |
819 | 0 | ReadlanePieces.push_back(CurrentLaneOpReg); |
820 | 24 | } else { |
821 | 24 | // 32-bit element type. |
822 | 24 | ReadlanePieces.push_back(CurrentLaneOpRegLo); |
823 | 24 | ReadlanePieces.push_back(CurrentLaneOpRegHi); |
824 | 24 | } |
825 | 24 | } else { |
826 | 0 | CurrentLaneOpReg = MRI.createGenericVirtualRegister(LLT::scalar(32)); |
827 | 0 | MRI.setRegClass(UnmergePiece, &AMDGPU::VGPR_32RegClass); |
828 | 0 | MRI.setRegClass(CurrentLaneOpReg, &AMDGPU::SReg_32_XM0RegClass); |
829 | 0 |
830 | 0 | // Read the next variant <- also loop target. |
831 | 0 | BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), |
832 | 0 | CurrentLaneOpReg) |
833 | 0 | .addReg(UnmergePiece); |
834 | 0 | ReadlanePieces.push_back(CurrentLaneOpReg); |
835 | 0 | } |
836 | 24 | |
837 | 24 | Register NewCondReg |
838 | 24 | = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass); |
839 | 24 | bool First = CondReg == AMDGPU::NoRegister; |
840 | 24 | if (First) |
841 | 12 | CondReg = NewCondReg; |
842 | 24 | |
843 | 24 | B.buildInstr(CmpOp) |
844 | 24 | .addDef(NewCondReg) |
845 | 24 | .addReg(CurrentLaneOpReg) |
846 | 24 | .addReg(UnmergePiece); |
847 | 24 | |
848 | 24 | if (!First) { |
849 | 12 | Register AndReg |
850 | 12 | = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass); |
851 | 12 | |
852 | 12 | // If there are multiple operands to consider, and the conditions. |
853 | 12 | B.buildInstr(AMDGPU::S_AND_B64) |
854 | 12 | .addDef(AndReg) |
855 | 12 | .addReg(NewCondReg) |
856 | 12 | .addReg(CondReg); |
857 | 12 | CondReg = AndReg; |
858 | 12 | } |
859 | 24 | } |
860 | 12 | |
861 | 12 | // FIXME: Build merge seems to switch to CONCAT_VECTORS but not |
862 | 12 | // BUILD_VECTOR |
863 | 12 | if (OpTy.isVector()) { |
864 | 12 | auto Merge = B.buildBuildVector(OpTy, ReadlanePieces); |
865 | 12 | Op.setReg(Merge.getReg(0)); |
866 | 12 | } else { |
867 | 0 | auto Merge = B.buildMerge(OpTy, ReadlanePieces); |
868 | 0 | Op.setReg(Merge.getReg(0)); |
869 | 0 | } |
870 | 12 | |
871 | 12 | MRI.setRegBank(Op.getReg(), getRegBank(AMDGPU::SGPRRegBankID)); |
872 | 12 | } |
873 | 22 | } |
874 | 48 | } |
875 | 20 | |
876 | 20 | B.setInsertPt(*LoopBB, LoopBB->end()); |
877 | 20 | |
878 | 20 | // Update EXEC, save the original EXEC value to VCC. |
879 | 20 | B.buildInstr(AMDGPU::S_AND_SAVEEXEC_B64) |
880 | 20 | .addDef(NewExec) |
881 | 20 | .addReg(CondReg, RegState::Kill); |
882 | 20 | |
883 | 20 | MRI.setSimpleHint(NewExec, CondReg); |
884 | 20 | |
885 | 20 | // Update EXEC, switch all done bits to 0 and all todo bits to 1. |
886 | 20 | B.buildInstr(AMDGPU::S_XOR_B64_term) |
887 | 20 | .addDef(AMDGPU::EXEC) |
888 | 20 | .addReg(AMDGPU::EXEC) |
889 | 20 | .addReg(NewExec); |
890 | 20 | |
891 | 20 | // XXX - s_xor_b64 sets scc to 1 if the result is nonzero, so can we use |
892 | 20 | // s_cbranch_scc0? |
893 | 20 | |
894 | 20 | // Loop back to V_READFIRSTLANE_B32 if there are still variants to cover. |
895 | 20 | B.buildInstr(AMDGPU::S_CBRANCH_EXECNZ) |
896 | 20 | .addMBB(LoopBB); |
897 | 20 | |
898 | 20 | // Save the EXEC mask before the loop. |
899 | 20 | BuildMI(MBB, MBB.end(), DL, TII->get(AMDGPU::S_MOV_B64_term), SaveExecReg) |
900 | 20 | .addReg(AMDGPU::EXEC); |
901 | 20 | |
902 | 20 | // Restore the EXEC mask after the loop. |
903 | 20 | B.setMBB(*RestoreExecBB); |
904 | 20 | B.buildInstr(AMDGPU::S_MOV_B64_term) |
905 | 20 | .addDef(AMDGPU::EXEC) |
906 | 20 | .addReg(SaveExecReg); |
907 | 20 | } |
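Condensed shape of the control flow the helper above emits for the simple case of a single 32-bit VGPR operand (register names are placeholders; the real output also carries PHIs for the exec mask and any result registers):

  bb.entry:
    ...
    %SaveExec:sreg_64_xexec = S_MOV_B64_term $exec
  bb.loop:
    %Lane:sreg_32_xm0 = V_READFIRSTLANE_B32 %VgprOp
    %Cond:sreg_64 = V_CMP_EQ_U32_e64 %Lane, %VgprOp
    <MI, now reading %Lane as its SGPR operand>
    %NewExec:sreg_64 = S_AND_SAVEEXEC_B64 %Cond
    $exec = S_XOR_B64_term $exec, %NewExec
    S_CBRANCH_EXECNZ %bb.loop
  bb.restore:
    $exec = S_MOV_B64_term %SaveExec
  bb.remainder:
    ...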
908 | | |
909 | | // Legalize an operand that must be an SGPR by inserting a readfirstlane. |
910 | | void AMDGPURegisterBankInfo::constrainOpWithReadfirstlane( |
911 | 52 | MachineInstr &MI, MachineRegisterInfo &MRI, unsigned OpIdx) const { |
912 | 52 | Register Reg = MI.getOperand(OpIdx).getReg(); |
913 | 52 | const RegisterBank *Bank = getRegBank(Reg, MRI, *TRI); |
914 | 52 | if (Bank != &AMDGPU::VGPRRegBank) |
915 | 28 | return; |
916 | 24 | |
917 | 24 | MachineIRBuilder B(MI); |
918 | 24 | Register SGPR = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); |
919 | 24 | B.buildInstr(AMDGPU::V_READFIRSTLANE_B32) |
920 | 24 | .addDef(SGPR) |
921 | 24 | .addReg(Reg); |
922 | 24 | |
923 | 24 | const TargetRegisterClass *Constrained = |
924 | 24 | constrainGenericRegister(Reg, AMDGPU::VGPR_32RegClass, MRI); |
925 | 24 | (void)Constrained; |
926 | 24 | assert(Constrained && "Failed to constrain readfirstlane src reg"); |
927 | 24 | |
928 | 24 | MI.getOperand(OpIdx).setReg(SGPR); |
929 | 24 | } |
930 | | |
931 | | // When regbankselect repairs registers, it will insert a repair instruction |
932 | | // which defines the repaired register. Then it calls applyMapping and expects |
933 | | // that the targets will either delete or rewrite the instruction that |
934 | | // originally wrote to the repaired registers. Because of this, we end up in |
935 | | // a situation where we have 2 instructions defining the same registers. |
936 | | static MachineInstr *getOtherVRegDef(const MachineRegisterInfo &MRI, |
937 | | Register Reg, |
938 | 32 | const MachineInstr &MI) { |
939 | 32 | // Is there some way we can assert that there are exactly 2 def instructions? |
940 | 32 | for (MachineInstr &Other : MRI.def_instructions(Reg)) { |
941 | 32 | if (&Other != &MI) |
942 | 32 | return &Other; |
943 | 32 | } |
944 | 32 | |
945 | 32 | return nullptr; |
946 | 32 | } |
947 | | |
948 | | bool AMDGPURegisterBankInfo::applyMappingWideLoad(MachineInstr &MI, |
949 | | const AMDGPURegisterBankInfo::OperandsMapper &OpdMapper, |
950 | 172 | MachineRegisterInfo &MRI) const { |
951 | 172 | Register DstReg = MI.getOperand(0).getReg(); |
952 | 172 | const LLT LoadTy = MRI.getType(DstReg); |
953 | 172 | unsigned LoadSize = LoadTy.getSizeInBits(); |
954 | 172 | const unsigned MaxNonSmrdLoadSize = 128; |
955 | 172 | // 128-bit loads are supported for all instruction types. |
956 | 172 | if (LoadSize <= MaxNonSmrdLoadSize) |
957 | 140 | return false; |
958 | 32 | |
959 | 32 | SmallVector<unsigned, 16> DefRegs(OpdMapper.getVRegs(0)); |
960 | 32 | SmallVector<unsigned, 1> SrcRegs(OpdMapper.getVRegs(1)); |
961 | 32 | |
962 | 32 | // If the pointer is an SGPR, we have nothing to do. |
963 | 32 | if (SrcRegs.empty()) |
964 | 16 | return false; |
965 | 16 | |
966 | 16 | assert(LoadSize % MaxNonSmrdLoadSize == 0); |
967 | 16 | |
968 | 16 | // We want to get the repair instruction now, because it will help us |
969 | 16 | // determine which instruction the legalizer inserts that will also |
970 | 16 | // write to DstReg. |
971 | 16 | MachineInstr *RepairInst = getOtherVRegDef(MRI, DstReg, MI); |
972 | 16 | |
973 | 16 | // RegBankSelect only emits scalar types, so we need to reset the pointer |
974 | 16 | // operand to a pointer type. |
975 | 16 | Register BasePtrReg = SrcRegs[0]; |
976 | 16 | LLT PtrTy = MRI.getType(MI.getOperand(1).getReg()); |
977 | 16 | MRI.setType(BasePtrReg, PtrTy); |
978 | 16 | |
979 | 16 | MachineIRBuilder B(MI); |
980 | 16 | |
981 | 16 | unsigned SplitElts = |
982 | 16 | MaxNonSmrdLoadSize / LoadTy.getScalarType().getSizeInBits(); |
983 | 16 | const LLT LoadSplitTy = LLT::vector(SplitElts, LoadTy.getScalarType()); |
984 | 16 | ApplyRegBankMapping O(MRI, &AMDGPU::VGPRRegBank); |
985 | 16 | GISelObserverWrapper Observer(&O); |
986 | 16 | B.setChangeObserver(Observer); |
987 | 16 | LegalizerHelper Helper(B.getMF(), Observer, B); |
988 | 16 | if (Helper.fewerElementsVector(MI, 0, LoadSplitTy) != LegalizerHelper::Legalized) |
989 | 0 | return false; |
990 | 16 | |
991 | 16 | // At this point, the legalizer has split the original load into smaller |
992 | 16 | // loads. At the end of lowering, it inserts an instruction (LegalizedInst) |
993 | 16 | // that combines the outputs of the lower loads and writes it to DstReg. |
994 | 16 | // The register bank selector has also added the RepairInst which writes to |
995 | 16 | // DstReg as well. |
996 | 16 | |
997 | 16 | MachineInstr *LegalizedInst = getOtherVRegDef(MRI, DstReg, *RepairInst); |
998 | 16 | |
999 | 16 | // Replace the output of the LegalizedInst with a temporary register, since |
1000 | 16 | // RepairInst already defines DstReg. |
1001 | 16 | Register TmpReg = MRI.createGenericVirtualRegister(MRI.getType(DstReg)); |
1002 | 16 | LegalizedInst->getOperand(0).setReg(TmpReg); |
1003 | 16 | B.setInsertPt(*RepairInst->getParent(), RepairInst); |
1004 | 16 | |
1005 | 160 | for (unsigned DefIdx = 0, e = DefRegs.size(); DefIdx != e; ++DefIdx) { |
1006 | 144 | Register IdxReg = MRI.createGenericVirtualRegister(LLT::scalar(32)); |
1007 | 144 | B.buildConstant(IdxReg, DefIdx); |
1008 | 144 | MRI.setRegBank(IdxReg, getRegBank(AMDGPU::VGPRRegBankID)); |
1009 | 144 | B.buildExtractVectorElement(DefRegs[DefIdx], TmpReg, IdxReg); |
1010 | 144 | } |
1011 | 16 | |
1012 | 16 | MRI.setRegBank(DstReg, getRegBank(AMDGPU::VGPRRegBankID)); |
1013 | 16 | return true; |
1014 | 16 | } |
1015 | | |
1016 | | // For cases where only a single copy is inserted for matching register banks. |
1017 | | // Replace the register in the instruction operand |
1018 | | static void substituteSimpleCopyRegs( |
1019 | 34 | const AMDGPURegisterBankInfo::OperandsMapper &OpdMapper, unsigned OpIdx) { |
1020 | 34 | SmallVector<unsigned, 1> SrcReg(OpdMapper.getVRegs(OpIdx)); |
1021 | 34 | if (!SrcReg.empty()) { |
1022 | 14 | assert(SrcReg.size() == 1); |
1023 | 14 | OpdMapper.getMI().getOperand(OpIdx).setReg(SrcReg[0]); |
1024 | 14 | } |
1025 | 34 | } |
1026 | | |
1027 | | void AMDGPURegisterBankInfo::applyMappingImpl( |
1028 | 2.87k | const OperandsMapper &OpdMapper) const { |
1029 | 2.87k | MachineInstr &MI = OpdMapper.getMI(); |
1030 | 2.87k | unsigned Opc = MI.getOpcode(); |
1031 | 2.87k | MachineRegisterInfo &MRI = OpdMapper.getMRI(); |
1032 | 2.87k | switch (Opc) { |
1033 | 2.87k | case AMDGPU::G_SELECT: { |
1034 | 160 | Register DstReg = MI.getOperand(0).getReg(); |
1035 | 160 | LLT DstTy = MRI.getType(DstReg); |
1036 | 160 | if (DstTy.getSizeInBits() != 64) |
1037 | 96 | break; |
1038 | 64 | |
1039 | 64 | LLT HalfTy = getHalfSizedType(DstTy); |
1040 | 64 | |
1041 | 64 | SmallVector<Register, 2> DefRegs(OpdMapper.getVRegs(0)); |
1042 | 64 | SmallVector<Register, 1> Src0Regs(OpdMapper.getVRegs(1)); |
1043 | 64 | SmallVector<Register, 2> Src1Regs(OpdMapper.getVRegs(2)); |
1044 | 64 | SmallVector<Register, 2> Src2Regs(OpdMapper.getVRegs(3)); |
1045 | 64 | |
1046 | 64 | // All inputs are SGPRs, nothing special to do. |
1047 | 64 | if (DefRegs.empty()) { |
1048 | 8 | assert(Src1Regs.empty() && Src2Regs.empty()); |
1049 | 8 | break; |
1050 | 8 | } |
1051 | 56 | |
1052 | 56 | MachineIRBuilder B(MI); |
1053 | 56 | if (Src0Regs.empty()) |
1054 | 56 | Src0Regs.push_back(MI.getOperand(1).getReg()); |
1055 | 0 | else { |
1056 | 0 | assert(Src0Regs.size() == 1); |
1057 | 0 | } |
1058 | 56 | |
1059 | 56 | if (Src1Regs.empty()) |
1060 | 0 | split64BitValueForMapping(B, Src1Regs, HalfTy, MI.getOperand(2).getReg()); |
1061 | 56 | else { |
1062 | 56 | setRegsToType(MRI, Src1Regs, HalfTy); |
1063 | 56 | } |
1064 | 56 | |
1065 | 56 | if (Src2Regs.empty()) |
1066 | 0 | split64BitValueForMapping(B, Src2Regs, HalfTy, MI.getOperand(3).getReg()); |
1067 | 56 | else |
1068 | 56 | setRegsToType(MRI, Src2Regs, HalfTy); |
1069 | 56 | |
1070 | 56 | setRegsToType(MRI, DefRegs, HalfTy); |
1071 | 56 | |
1072 | 56 | B.buildSelect(DefRegs[0], Src0Regs[0], Src1Regs[0], Src2Regs[0]); |
1073 | 56 | B.buildSelect(DefRegs[1], Src0Regs[0], Src1Regs[1], Src2Regs[1]); |
1074 | 56 | |
1075 | 56 | MRI.setRegBank(DstReg, getRegBank(AMDGPU::VGPRRegBankID)); |
1076 | 56 | MI.eraseFromParent(); |
1077 | 56 | return; |
1078 | 56 | } |
1079 | 234 | case AMDGPU::G_AND: |
1080 | 234 | case AMDGPU::G_OR: |
1081 | 234 | case AMDGPU::G_XOR: { |
1082 | 234 | // 64-bit and is only available on the SALU, so split into 2 32-bit ops if |
1083 | 234 | // there is a VGPR input. |
1084 | 234 | Register DstReg = MI.getOperand(0).getReg(); |
1085 | 234 | LLT DstTy = MRI.getType(DstReg); |
1086 | 234 | if (DstTy.getSizeInBits() != 64) |
1087 | 108 | break; |
1088 | 126 | |
1089 | 126 | LLT HalfTy = getHalfSizedType(DstTy); |
1090 | 126 | SmallVector<Register, 2> DefRegs(OpdMapper.getVRegs(0)); |
1091 | 126 | SmallVector<Register, 2> Src0Regs(OpdMapper.getVRegs(1)); |
1092 | 126 | SmallVector<Register, 2> Src1Regs(OpdMapper.getVRegs(2)); |
1093 | 126 | |
1094 | 126 | // All inputs are SGPRs, nothing special to do. |
1095 | 126 | if (DefRegs.empty()) { |
1096 | 24 | assert(Src0Regs.empty() && Src1Regs.empty()); |
1097 | 24 | break; |
1098 | 24 | } |
1099 | 102 | |
1100 | 102 | assert(DefRegs.size() == 2); |
1101 | 102 | assert(Src0Regs.size() == Src1Regs.size() && |
1102 | 102 | (Src0Regs.empty() || Src0Regs.size() == 2)); |
1103 | 102 | |
1104 | 102 | // Depending on where the source registers came from, the generic code may |
1105 | 102 | // have decided to split the inputs already or not. If not, we still need to |
1106 | 102 | // extract the values. |
1107 | 102 | MachineIRBuilder B(MI); |
1108 | 102 | |
1109 | 102 | if (Src0Regs.empty()) |
1110 | 102 | split64BitValueForMapping(B, Src0Regs, HalfTy, MI.getOperand(1).getReg()); |
1111 | 0 | else |
1112 | 0 | setRegsToType(MRI, Src0Regs, HalfTy); |
1113 | 102 | |
1114 | 102 | if (Src1Regs.empty()) |
1115 | 102 | split64BitValueForMapping(B, Src1Regs, HalfTy, MI.getOperand(2).getReg()); |
1116 | 0 | else |
1117 | 0 | setRegsToType(MRI, Src1Regs, HalfTy); |
1118 | 102 | |
1119 | 102 | setRegsToType(MRI, DefRegs, HalfTy); |
1120 | 102 | |
1121 | 102 | B.buildInstr(Opc) |
1122 | 102 | .addDef(DefRegs[0]) |
1123 | 102 | .addUse(Src0Regs[0]) |
1124 | 102 | .addUse(Src1Regs[0]); |
1125 | 102 | |
1126 | 102 | B.buildInstr(Opc) |
1127 | 102 | .addDef(DefRegs[1]) |
1128 | 102 | .addUse(Src0Regs[1]) |
1129 | 102 | .addUse(Src1Regs[1]); |
1130 | 102 | |
1131 | 102 | MRI.setRegBank(DstReg, getRegBank(AMDGPU::VGPRRegBankID)); |
1132 | 102 | MI.eraseFromParent(); |
1133 | 102 | return; |
1134 | 102 | } |
1135 | 102 | case AMDGPU::G_ADD: |
1136 | 12 | case AMDGPU::G_SUB: |
1137 | 12 | case AMDGPU::G_MUL: { |
1138 | 12 | Register DstReg = MI.getOperand(0).getReg(); |
1139 | 12 | LLT DstTy = MRI.getType(DstReg); |
1140 | 12 | if (DstTy != LLT::scalar(16)) |
1141 | 12 | break; |
1142 | 0 | |
1143 | 0 | const RegisterBank *DstBank = getRegBank(DstReg, MRI, *TRI); |
1144 | 0 | if (DstBank == &AMDGPU::VGPRRegBank) |
1145 | 0 | break; |
1146 | 0 | |
1147 | 0 | // 16-bit operations are VALU only, but can be promoted to 32-bit SALU. |
1148 | 0 | MachineFunction *MF = MI.getParent()->getParent(); |
1149 | 0 | MachineIRBuilder B(MI); |
1150 | 0 | ApplyRegBankMapping ApplySALU(MRI, &AMDGPU::SGPRRegBank); |
1151 | 0 | GISelObserverWrapper Observer(&ApplySALU); |
1152 | 0 | LegalizerHelper Helper(*MF, Observer, B); |
1153 | 0 |
1154 | 0 | if (Helper.widenScalar(MI, 0, LLT::scalar(32)) != |
1155 | 0 | LegalizerHelper::Legalized) |
1156 | 0 | llvm_unreachable("widen scalar should have succeeded"); |
1157 | 0 | return; |
1158 | 0 | } |
1159 | 56 | case AMDGPU::G_SMIN: |
1160 | 56 | case AMDGPU::G_SMAX: |
1161 | 56 | case AMDGPU::G_UMIN: |
1162 | 56 | case AMDGPU::G_UMAX: { |
1163 | 56 | Register DstReg = MI.getOperand(0).getReg(); |
1164 | 56 | const RegisterBank *DstBank = getRegBank(DstReg, MRI, *TRI); |
1165 | 56 | if (DstBank == &AMDGPU::VGPRRegBank) |
1166 | 24 | break; |
1167 | 32 | |
1168 | 32 | MachineFunction *MF = MI.getParent()->getParent(); |
1169 | 32 | MachineIRBuilder B(MI); |
1170 | 32 | ApplyRegBankMapping ApplySALU(MRI, &AMDGPU::SGPRRegBank); |
1171 | 32 | GISelObserverWrapper Observer(&ApplySALU); |
1172 | 32 | LegalizerHelper Helper(*MF, Observer, B); |
1173 | 32 | |
1174 | 32 | // Turn scalar min/max into a compare and select. |
1175 | 32 | LLT Ty = MRI.getType(DstReg); |
1176 | 32 | LLT S32 = LLT::scalar(32); |
1177 | 32 | LLT S16 = LLT::scalar(16); |
1178 | 32 | |
1179 | 32 | if (Ty == S16) { |
1180 | 16 | // Need to widen to s32, and expand as cmp + select. |
1181 | 16 | if (Helper.widenScalar(MI, 0, S32) != LegalizerHelper::Legalized) |
1182 | 16 | llvm_unreachable("widenScalar should have succeeded"); |
1183 | 16 | |
1184 | 16 | // FIXME: This is relying on widenScalar leaving MI in place. |
1185 | 16 | if (Helper.lower(MI, 0, S32) != LegalizerHelper::Legalized) |
1186 | 16 | llvm_unreachable("lower should have succeeded"); |
1187 | 16 | } else { |
1188 | 16 | if (Helper.lower(MI, 0, Ty) != LegalizerHelper::Legalized) |
1189 | 16 | llvm_unreachable("lower should have succeeded"); |
1190 | 16 | } |
1191 | 32 | |
1192 | 32 | return; |
1193 | 32 | } |
1194 | 64 | case AMDGPU::G_SEXT: |
1195 | 64 | case AMDGPU::G_ZEXT: { |
1196 | 64 | Register SrcReg = MI.getOperand(1).getReg(); |
1197 | 64 | LLT SrcTy = MRI.getType(SrcReg); |
1198 | 64 | bool Signed = Opc == AMDGPU::G_SEXT; |
1199 | 64 | |
1200 | 64 | MachineIRBuilder B(MI); |
1201 | 64 | const RegisterBank *SrcBank = getRegBank(SrcReg, MRI, *TRI); |
1202 | 64 | |
1203 | 64 | Register DstReg = MI.getOperand(0).getReg(); |
1204 | 64 | LLT DstTy = MRI.getType(DstReg); |
1205 | 64 | if (DstTy.isScalar() && |
1206 | 64 | SrcBank != &AMDGPU::SGPRRegBank && |
1207 | 64 | SrcBank != &AMDGPU::SCCRegBank && |
1208 | 64 | SrcBank != &AMDGPU::VCCRegBank && |
1209 | 64 | // FIXME: Should handle any type that rounds to s64 when irregular |
1210 | 64 | // breakdowns are supported. |
1211 | 64 | DstTy.getSizeInBits() == 64 && |
1212 | 64 | SrcTy.getSizeInBits() <= 32) { |
1213 | 12 | const LLT S32 = LLT::scalar(32); |
1214 | 12 | SmallVector<Register, 2> DefRegs(OpdMapper.getVRegs(0)); |
1215 | 12 | |
1216 | 12 | // Extend to 32-bit, and then extend the low half. |
1217 | 12 | if (Signed) { |
1218 | 6 | // TODO: Should really be buildSExtOrCopy |
1219 | 6 | B.buildSExtOrTrunc(DefRegs[0], SrcReg); |
1220 | 6 | |
1221 | 6 | // Replicate sign bit from 32-bit extended part. |
1222 | 6 | auto ShiftAmt = B.buildConstant(S32, 31); |
1223 | 6 | MRI.setRegBank(ShiftAmt.getReg(0), *SrcBank); |
1224 | 6 | B.buildAShr(DefRegs[1], DefRegs[0], ShiftAmt); |
1225 | 6 | } else { |
1226 | 6 | B.buildZExtOrTrunc(DefRegs[0], SrcReg); |
1227 | 6 | B.buildConstant(DefRegs[1], 0); |
1228 | 6 | } |
1229 | 12 | |
1230 | 12 | MRI.setRegBank(DstReg, *SrcBank); |
1231 | 12 | MI.eraseFromParent(); |
1232 | 12 | return; |
1233 | 12 | } |
1234 | 52 | |
1235 | 52 | if (SrcTy != LLT::scalar(1)) |
1236 | 8 | return; |
1237 | 44 | |
1238 | 44 | if (SrcBank == &AMDGPU::SCCRegBank || SrcBank == &AMDGPU::VCCRegBank) { |
1239 | 24 | SmallVector<Register, 2> DefRegs(OpdMapper.getVRegs(0)); |
1240 | 24 | |
1241 | 24 | const RegisterBank *DstBank = SrcBank == &AMDGPU::SCCRegBank ? |
1242 | 12 | &AMDGPU::SGPRRegBank : &AMDGPU::VGPRRegBank; |
1243 | 24 | |
1244 | 24 | unsigned DstSize = DstTy.getSizeInBits(); |
1245 | 24 | // 64-bit select is SGPR only |
1246 | 24 | const bool UseSel64 = DstSize > 32 && |
1247 | 24 | SrcBank->getID() == AMDGPU::SCCRegBankID; |
1248 | 24 | |
1249 | 24 | // TODO: Should s16 select be legal? |
1250 | 24 | LLT SelType = UseSel64 ? LLT::scalar(64) : LLT::scalar(32); |
1251 | 24 | auto True = B.buildConstant(SelType, Signed ? -1 : 1); |
1252 | 24 | auto False = B.buildConstant(SelType, 0); |
1253 | 24 | |
1254 | 24 | MRI.setRegBank(True.getReg(0), *DstBank); |
1255 | 24 | MRI.setRegBank(False.getReg(0), *DstBank); |
1256 | 24 | MRI.setRegBank(DstReg, *DstBank); |
1257 | 24 | |
1258 | 24 | if (DstSize > 32 && SrcBank->getID() != AMDGPU::SCCRegBankID) { |
1259 | 4 | B.buildSelect(DefRegs[0], SrcReg, True, False); |
1260 | 4 | B.buildCopy(DefRegs[1], DefRegs[0]); |
1261 | 20 | } else if (DstSize < 32) { |
1262 | 8 | auto Sel = B.buildSelect(SelType, SrcReg, True, False); |
1263 | 8 | MRI.setRegBank(Sel.getReg(0), *DstBank); |
1264 | 8 | B.buildTrunc(DstReg, Sel); |
1265 | 12 | } else { |
1266 | 12 | B.buildSelect(DstReg, SrcReg, True, False); |
1267 | 12 | } |
1268 | 24 | |
1269 | 24 | MI.eraseFromParent(); |
1270 | 24 | return; |
1271 | 24 | } |
1272 | 20 | |
1273 | 20 | // Fixup the case with an s1 src that isn't a condition register. Use shifts |
1274 | 20 | // instead of introducing a compare to avoid an unnecessary condition |
1275 | 20 | // register (and since there are no scalar 16-bit compares). |
1276 | 20 | auto Ext = B.buildAnyExt(DstTy, SrcReg); |
1277 | 20 | auto ShiftAmt = B.buildConstant(LLT::scalar(32), DstTy.getSizeInBits() - 1); |
1278 | 20 | auto Shl = B.buildShl(DstTy, Ext, ShiftAmt); |
1279 | 20 | |
1280 | 20 | if (MI.getOpcode() == AMDGPU::G_SEXT) |
1281 | 10 | B.buildAShr(DstReg, Shl, ShiftAmt); |
1282 | 10 | else |
1283 | 10 | B.buildLShr(DstReg, Shl, ShiftAmt); |
1284 | 20 | |
1285 | 20 | MRI.setRegBank(DstReg, *SrcBank); |
1286 | 20 | MRI.setRegBank(Ext.getReg(0), *SrcBank); |
1287 | 20 | MRI.setRegBank(ShiftAmt.getReg(0), *SrcBank); |
1288 | 20 | MRI.setRegBank(Shl.getReg(0), *SrcBank); |
1289 | 20 | MI.eraseFromParent(); |
1290 | 20 | return; |
1291 | 20 | } |
1292 | 20 | case AMDGPU::G_EXTRACT_VECTOR_ELT: |
1293 | 10 | applyDefaultMapping(OpdMapper); |
1294 | 10 | executeInWaterfallLoop(MI, MRI, { 2 }); |
1295 | 10 | return; |
1296 | 108 | case AMDGPU::G_INTRINSIC: { |
1297 | 108 | switch (MI.getOperand(MI.getNumExplicitDefs()).getIntrinsicID()) { |
1298 | 108 | case Intrinsic::amdgcn_s_buffer_load: { |
1299 | 8 | // FIXME: Move to G_INTRINSIC_W_SIDE_EFFECTS |
1300 | 8 | executeInWaterfallLoop(MI, MRI, { 2, 3 }); |
1301 | 8 | return; |
1302 | 108 | } |
1303 | 108 | case Intrinsic::amdgcn_readlane: { |
1304 | 8 | substituteSimpleCopyRegs(OpdMapper, 2); |
1305 | 8 | |
1306 | 8 | assert(empty(OpdMapper.getVRegs(0))); |
1307 | 8 | assert(empty(OpdMapper.getVRegs(3))); |
1308 | 8 | |
1309 | 8 | // Make sure the index is an SGPR. It doesn't make sense to run this in a |
1310 | 8 | // waterfall loop, so assume it's a uniform value. |
1311 | 8 | constrainOpWithReadfirstlane(MI, MRI, 3); // Index |
1312 | 8 | return; |
1313 | 108 | } |
1314 | 108 | case Intrinsic::amdgcn_writelane: { |
1315 | 10 | assert(empty(OpdMapper.getVRegs(0))); |
1316 | 10 | assert(empty(OpdMapper.getVRegs(2))); |
1317 | 10 | assert(empty(OpdMapper.getVRegs(3))); |
1318 | 10 | |
1319 | 10 | substituteSimpleCopyRegs(OpdMapper, 4); // VGPR input val |
1320 | 10 | constrainOpWithReadfirstlane(MI, MRI, 2); // Source value |
1321 | 10 | constrainOpWithReadfirstlane(MI, MRI, 3); // Index |
1322 | 10 | return; |
1323 | 108 | } |
1324 | 108 | default: |
1325 | 82 | break; |
1326 | 82 | } |
1327 | 82 | break; |
1328 | 82 | } |
1329 | 98 | case AMDGPU::G_INTRINSIC_W_SIDE_EFFECTS: { |
1330 | 98 | switch (MI.getOperand(MI.getNumExplicitDefs()).getIntrinsicID()) { |
1331 | 98 | case Intrinsic::amdgcn_buffer_load: { |
1332 | 16 | executeInWaterfallLoop(MI, MRI, { 2 }); |
1333 | 16 | return; |
1334 | 98 | } |
1335 | 98 | case Intrinsic::amdgcn_ds_ordered_add: |
1336 | 16 | case Intrinsic::amdgcn_ds_ordered_swap: { |
1337 | 16 | // This is only allowed to execute with 1 lane, so readfirstlane is safe. |
1338 | 16 | assert(empty(OpdMapper.getVRegs(0))); |
1339 | 16 | substituteSimpleCopyRegs(OpdMapper, 3); |
1340 | 16 | constrainOpWithReadfirstlane(MI, MRI, 2); // M0 |
1341 | 16 | return; |
1342 | 16 | } |
1343 | 16 | case Intrinsic::amdgcn_s_sendmsg: |
1344 | 8 | case Intrinsic::amdgcn_s_sendmsghalt: { |
1345 | 8 | // FIXME: Should this use a waterfall loop? |
1346 | 8 | constrainOpWithReadfirstlane(MI, MRI, 2); // M0 |
1347 | 8 | return; |
1348 | 8 | } |
1349 | 58 | default: |
1350 | 58 | break; |
1351 | 58 | } |
1352 | 58 | break; |
1353 | 58 | } |
1354 | 172 | case AMDGPU::G_LOAD: { |
1355 | 172 | if (applyMappingWideLoad(MI, OpdMapper, MRI)) |
1356 | 16 | return; |
1357 | 156 | break; |
1358 | 156 | } |
1359 | 1.96k | default: |
1360 | 1.96k | break; |
1361 | 2.53k | } |
1362 | 2.53k | |
1363 | 2.53k | return applyDefaultMapping(OpdMapper); |
1364 | 2.53k | } |
1365 | | |
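The 64-bit G_AND/G_OR/G_XOR repair above relies on bitwise operations acting independently on the low and high 32-bit halves, and the G_SEXT fixup materializes an s1 value with a shift pair instead of a compare. A minimal standalone sketch of both identities in ordinary C++ (not the LLVM MachineIRBuilder API), included only as an illustration of why the rewrites are sound:

#include <cassert>
#include <cstdint>

// A 64-bit bitwise op equals two independent 32-bit ops on the halves,
// which is why the mapping code can emit two half-width instructions.
static uint64_t and64ViaHalves(uint64_t A, uint64_t B) {
  uint32_t Lo = uint32_t(A) & uint32_t(B);
  uint32_t Hi = uint32_t(A >> 32) & uint32_t(B >> 32);
  return (uint64_t(Hi) << 32) | Lo;
}

// Sign-extending from bit 0 without a compare: shift the bit to the top,
// then arithmetic-shift it back down, mirroring the G_SHL/G_ASHR pair
// emitted by the G_SEXT fixup above.
static int32_t sextFromBit0(uint32_t V) {
  return int32_t(V << 31) >> 31;
}

int main() {
  uint64_t A = 0xDEADBEEFCAFEF00DULL, B = 0x0123456789ABCDEFULL;
  assert(and64ViaHalves(A, B) == (A & B));
  assert(sextFromBit0(1) == -1 && sextFromBit0(0) == 0);
  return 0;
}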
1366 | 564 | bool AMDGPURegisterBankInfo::isSALUMapping(const MachineInstr &MI) const { |
1367 | 564 | const MachineFunction &MF = *MI.getParent()->getParent(); |
1368 | 564 | const MachineRegisterInfo &MRI = MF.getRegInfo(); |
1369 | 1.80k | for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) { |
1370 | 1.58k | if (!MI.getOperand(i).isReg()) |
1371 | 4 | continue; |
1372 | 1.58k | Register Reg = MI.getOperand(i).getReg(); |
1373 | 1.58k | if (const RegisterBank *Bank = getRegBank(Reg, MRI, *TRI)) { |
1374 | 942 | if (Bank->getID() == AMDGPU::VGPRRegBankID) |
1375 | 340 | return false; |
1376 | 602 | |
1377 | 602 | assert(Bank->getID() == AMDGPU::SGPRRegBankID || |
1378 | 602 | Bank->getID() == AMDGPU::SCCRegBankID); |
1379 | 602 | } |
1380 | 1.58k | } |
1381 | 564 | return true; |
1382 | 564 | } |
1383 | | |
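isSALUMapping boils down to one rule: if any register operand has already landed in the VGPR bank, the instruction cannot be given a scalar mapping. A hypothetical, self-contained restatement of that rule; the enum is illustrative and not the generated AMDGPU::*RegBankID values:

#include <vector>

enum class Bank { SGPR, SCC, VGPR, VCC, Unassigned };

// One divergent (VGPR) input is enough to rule out an SALU mapping.
// Operands with no bank assigned yet are treated optimistically, like the
// code above.
static bool canUseSALUMapping(const std::vector<Bank> &OperandBanks) {
  for (Bank B : OperandBanks)
    if (B == Bank::VGPR)
      return false;
  return true;
}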
1384 | | const RegisterBankInfo::InstructionMapping & |
1385 | 173 | AMDGPURegisterBankInfo::getDefaultMappingSOP(const MachineInstr &MI) const { |
1386 | 173 | const MachineFunction &MF = *MI.getParent()->getParent(); |
1387 | 173 | const MachineRegisterInfo &MRI = MF.getRegInfo(); |
1388 | 173 | SmallVector<const ValueMapping*, 8> OpdsMapping(MI.getNumOperands()); |
1389 | 173 | |
1390 | 732 | for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) { |
1391 | 559 | unsigned Size = getSizeInBits(MI.getOperand(i).getReg(), MRI, *TRI); |
1392 | 559 | unsigned BankID = Size == 1 ? AMDGPU::SCCRegBankID : AMDGPU::SGPRRegBankID; |
1393 | 559 | OpdsMapping[i] = AMDGPU::getValueMapping(BankID, Size); |
1394 | 559 | } |
1395 | 173 | return getInstructionMapping(1, 1, getOperandsMapping(OpdsMapping), |
1396 | 173 | MI.getNumOperands()); |
1397 | 173 | } |
1398 | | |
1399 | | const RegisterBankInfo::InstructionMapping & |
1400 | 259 | AMDGPURegisterBankInfo::getDefaultMappingVOP(const MachineInstr &MI) const { |
1401 | 259 | const MachineFunction &MF = *MI.getParent()->getParent(); |
1402 | 259 | const MachineRegisterInfo &MRI = MF.getRegInfo(); |
1403 | 259 | SmallVector<const ValueMapping*, 8> OpdsMapping(MI.getNumOperands()); |
1404 | 259 | unsigned OpdIdx = 0; |
1405 | 259 | |
1406 | 259 | unsigned Size0 = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI); |
1407 | 259 | OpdsMapping[OpdIdx++] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size0); |
1408 | 259 | |
1409 | 259 | if (MI.getOperand(OpdIdx).isIntrinsicID()) |
1410 | 24 | OpdsMapping[OpdIdx++] = nullptr; |
1411 | 259 | |
1412 | 259 | Register Reg1 = MI.getOperand(OpdIdx).getReg(); |
1413 | 259 | unsigned Size1 = getSizeInBits(Reg1, MRI, *TRI); |
1414 | 259 | |
1415 | 259 | unsigned DefaultBankID = Size1 == 1 ? |
1416 | 211 | AMDGPU::VCCRegBankID : AMDGPU::VGPRRegBankID; |
1417 | 259 | unsigned Bank1 = getRegBankID(Reg1, MRI, *TRI, DefaultBankID); |
1418 | 259 | |
1419 | 259 | OpdsMapping[OpdIdx++] = AMDGPU::getValueMapping(Bank1, Size1); |
1420 | 259 | |
1421 | 566 | for (unsigned e = MI.getNumOperands(); OpdIdx != e; ++OpdIdx) { |
1422 | 307 | const MachineOperand &MO = MI.getOperand(OpdIdx); |
1423 | 307 | if (!MO.isReg()) |
1424 | 0 | continue; |
1425 | 307 | |
1426 | 307 | unsigned Size = getSizeInBits(MO.getReg(), MRI, *TRI); |
1427 | 307 | unsigned BankID = Size == 1 ? AMDGPU::VCCRegBankID : AMDGPU::VGPRRegBankID; |
1428 | 307 | OpdsMapping[OpdIdx] = AMDGPU::getValueMapping(BankID, Size); |
1429 | 307 | } |
1430 | 259 | |
1431 | 259 | return getInstructionMapping(1, 1, getOperandsMapping(OpdsMapping), |
1432 | 259 | MI.getNumOperands()); |
1433 | 259 | } |
1434 | | |
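The two default mappings above follow a simple width rule: on the scalar path a 1-bit value is modelled in the SCC bank and everything wider in SGPRs, while on the vector path a 1-bit value becomes a VCC lane mask and everything else a VGPR. A sketch of just that decision, with an illustrative enum standing in for the generated bank IDs:

enum class Bank { SGPR, SCC, VGPR, VCC };

// Default bank chosen by getDefaultMappingSOP for an operand of this width.
static Bank defaultScalarBank(unsigned SizeInBits) {
  return SizeInBits == 1 ? Bank::SCC : Bank::SGPR;
}

// Default bank chosen by getDefaultMappingVOP for an operand of this width.
static Bank defaultVectorBank(unsigned SizeInBits) {
  return SizeInBits == 1 ? Bank::VCC : Bank::VGPR;
}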
1435 | | const RegisterBankInfo::InstructionMapping & |
1436 | 114 | AMDGPURegisterBankInfo::getDefaultMappingAllVGPR(const MachineInstr &MI) const { |
1437 | 114 | const MachineFunction &MF = *MI.getParent()->getParent(); |
1438 | 114 | const MachineRegisterInfo &MRI = MF.getRegInfo(); |
1439 | 114 | SmallVector<const ValueMapping*, 8> OpdsMapping(MI.getNumOperands()); |
1440 | 114 | |
1441 | 622 | for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) { |
1442 | 508 | const MachineOperand &Op = MI.getOperand(I); |
1443 | 508 | if (!Op.isReg()) |
1444 | 168 | continue; |
1445 | 340 | |
1446 | 340 | unsigned Size = getSizeInBits(Op.getReg(), MRI, *TRI); |
1447 | 340 | OpdsMapping[I] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size); |
1448 | 340 | } |
1449 | 114 | |
1450 | 114 | return getInstructionMapping(1, 1, getOperandsMapping(OpdsMapping), |
1451 | 114 | MI.getNumOperands()); |
1452 | 114 | } |
1453 | | |
1454 | | const RegisterBankInfo::InstructionMapping & |
1455 | 172 | AMDGPURegisterBankInfo::getInstrMappingForLoad(const MachineInstr &MI) const { |
1456 | 172 | |
1457 | 172 | const MachineFunction &MF = *MI.getParent()->getParent(); |
1458 | 172 | const MachineRegisterInfo &MRI = MF.getRegInfo(); |
1459 | 172 | SmallVector<const ValueMapping*, 8> OpdsMapping(MI.getNumOperands()); |
1460 | 172 | unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI); |
1461 | 172 | LLT LoadTy = MRI.getType(MI.getOperand(0).getReg()); |
1462 | 172 | unsigned PtrSize = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI); |
1463 | 172 | |
1464 | 172 | const ValueMapping *ValMapping; |
1465 | 172 | const ValueMapping *PtrMapping; |
1466 | 172 | |
1467 | 172 | if (isInstrUniform(MI)) { |
1468 | 155 | // We have a uniform instruction so we want to use an SMRD load |
1469 | 155 | ValMapping = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size); |
1470 | 155 | PtrMapping = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, PtrSize); |
1471 | 155 | } else { |
1472 | 17 | ValMapping = AMDGPU::getValueMappingLoadSGPROnly(AMDGPU::VGPRRegBankID, LoadTy); |
1473 | 17 | // FIXME: What would happen if we used SGPRRegBankID here? |
1474 | 17 | PtrMapping = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, PtrSize); |
1475 | 17 | } |
1476 | 172 | |
1477 | 172 | OpdsMapping[0] = ValMapping; |
1478 | 172 | OpdsMapping[1] = PtrMapping; |
1479 | 172 | const RegisterBankInfo::InstructionMapping &Mapping = getInstructionMapping( |
1480 | 172 | 1, 1, getOperandsMapping(OpdsMapping), MI.getNumOperands()); |
1481 | 172 | return Mapping; |
1482 | 172 | |
1483 | 172 | // FIXME: Do we want to add a mapping for FLAT load, or should we just |
1484 | 172 | // handle that during instruction selection? |
1485 | 172 | } |
1486 | | |
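getInstrMappingForLoad picks between two shapes: a uniform load keeps both the pointer and the result in SGPRs so it can become a scalar (SMRD-style) load, while a divergent load keeps everything in VGPRs. A minimal sketch of that decision, assuming uniformity has already been established by isInstrUniform; the names are illustrative rather than the real ValueMapping machinery:

#include <utility>

enum class Bank { SGPR, VGPR };

// Returns {value bank, pointer bank} for a generic load.
static std::pair<Bank, Bank> pickLoadBanks(bool IsUniform) {
  if (IsUniform)
    return {Bank::SGPR, Bank::SGPR}; // scalar memory (SMRD-style) load
  return {Bank::VGPR, Bank::VGPR};   // per-lane load through the VALU
}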
1487 | | unsigned |
1488 | | AMDGPURegisterBankInfo::getRegBankID(Register Reg, |
1489 | | const MachineRegisterInfo &MRI, |
1490 | | const TargetRegisterInfo &TRI, |
1491 | 2.64k | unsigned Default) const { |
1492 | 2.64k | |
1493 | 2.64k | const RegisterBank *Bank = getRegBank(Reg, MRI, TRI); |
1494 | 2.64k | return Bank ? Bank->getID() : Default; |
1495 | 2.64k | } |
1496 | | |
1497 | | /// |
1498 | | /// This function must return a legal mapping, because |
1499 | | /// AMDGPURegisterBankInfo::getInstrAlternativeMappings() is not called |
1500 | | /// in RegBankSelect::Mode::Fast. Any mapping that would cause a |
1501 | | /// VGPR-to-SGPR copy to be generated is illegal. |
1502 | | /// |
1503 | | const RegisterBankInfo::InstructionMapping & |
1504 | 6.45k | AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { |
1505 | 6.45k | const MachineFunction &MF = *MI.getParent()->getParent(); |
1506 | 6.45k | const MachineRegisterInfo &MRI = MF.getRegInfo(); |
1507 | 6.45k | |
1508 | 6.45k | if (MI.isRegSequence()) { |
1509 | 16 | // If any input is a VGPR, the result must be a VGPR. The default handling |
1510 | 16 | // assumes any copy between banks is legal. |
1511 | 16 | unsigned BankID = AMDGPU::SGPRRegBankID; |
1512 | 16 | |
1513 | 28 | for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) { |
1514 | 24 | auto OpBank = getRegBankID(MI.getOperand(I).getReg(), MRI, *TRI); |
1515 | 24 | // It doesn't make sense to use vcc or scc banks here, so just ignore |
1516 | 24 | // them. |
1517 | 24 | if (OpBank != AMDGPU::SGPRRegBankID) { |
1518 | 12 | BankID = AMDGPU::VGPRRegBankID; |
1519 | 12 | break; |
1520 | 12 | } |
1521 | 24 | } |
1522 | 16 | unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI); |
1523 | 16 | |
1524 | 16 | const ValueMapping &ValMap = getValueMapping(0, Size, getRegBank(BankID)); |
1525 | 16 | return getInstructionMapping( |
1526 | 16 | 1, /*Cost*/ 1, |
1527 | 16 | /*OperandsMapping*/ getOperandsMapping({&ValMap}), 1); |
1528 | 16 | } |
1529 | 6.43k | |
1530 | 6.43k | // The default handling is broken and doesn't handle illegal SGPR->VGPR copies |
1531 | 6.43k | // properly. |
1532 | 6.43k | // |
1533 | 6.43k | // TODO: There are additional exec masking dependencies to analyze. |
1534 | 6.43k | if (MI.getOpcode() == TargetOpcode::G_PHI) { |
1535 | 96 | // TODO: Generate proper invalid bank enum. |
1536 | 96 | int ResultBank = -1; |
1537 | 96 | |
1538 | 204 | for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) { |
1539 | 168 | unsigned Reg = MI.getOperand(I).getReg(); |
1540 | 168 | const RegisterBank *Bank = getRegBank(Reg, MRI, *TRI); |
1541 | 168 | |
1542 | 168 | // FIXME: Assuming VGPR for any undetermined inputs. |
1543 | 168 | if (!Bank || Bank->getID() == AMDGPU::VGPRRegBankID) { |
1544 | 52 | ResultBank = AMDGPU::VGPRRegBankID; |
1545 | 52 | break; |
1546 | 52 | } |
1547 | 116 | |
1548 | 116 | unsigned OpBank = Bank->getID(); |
1549 | 116 | // scc, scc -> sgpr |
1550 | 116 | if (OpBank == AMDGPU::SCCRegBankID) { |
1551 | 40 | // There's only one SCC register, so a phi requires copying to SGPR. |
1552 | 40 | OpBank = AMDGPU::SGPRRegBankID; |
1553 | 76 | } else if (OpBank == AMDGPU::VCCRegBankID) { |
1554 | 28 | // vcc, vcc -> vcc |
1555 | 28 | // vcc, sgpr -> vgpr |
1556 | 28 | if (ResultBank != -1 && ResultBank != AMDGPU::VCCRegBankID) { |
1557 | 8 | ResultBank = AMDGPU::VGPRRegBankID; |
1558 | 8 | break; |
1559 | 8 | } |
1560 | 108 | } |
1561 | 108 | |
1562 | 108 | ResultBank = OpBank; |
1563 | 108 | } |
1564 | 96 | |
1565 | 96 | assert(ResultBank != -1); |
1566 | 96 | |
1567 | 96 | unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); |
1568 | 96 | |
1569 | 96 | const ValueMapping &ValMap = |
1570 | 96 | getValueMapping(0, Size, getRegBank(ResultBank)); |
1571 | 96 | return getInstructionMapping( |
1572 | 96 | 1, /*Cost*/ 1, |
1573 | 96 | /*OperandsMapping*/ getOperandsMapping({&ValMap}), 1); |
1574 | 96 | } |
1575 | 6.34k | |
1576 | 6.34k | const RegisterBankInfo::InstructionMapping &Mapping = getInstrMappingImpl(MI); |
1577 | 6.34k | if (Mapping.isValid()) |
1578 | 3.57k | return Mapping; |
1579 | 2.76k | |
1580 | 2.76k | SmallVector<const ValueMapping*, 8> OpdsMapping(MI.getNumOperands()); |
1581 | 2.76k | |
1582 | 2.76k | switch (MI.getOpcode()) { |
1583 | 2.76k | default: |
1584 | 2 | return getInvalidInstructionMapping(); |
1585 | 2.76k | |
1586 | 2.76k | case AMDGPU::G_AND: |
1587 | 234 | case AMDGPU::G_OR: |
1588 | 234 | case AMDGPU::G_XOR: { |
1589 | 234 | unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); |
1590 | 234 | if (Size == 1) { |
1591 | 60 | const RegisterBank *DstBank |
1592 | 60 | = getRegBank(MI.getOperand(0).getReg(), MRI, *TRI); |
1593 | 60 | |
1594 | 60 | unsigned TargetBankID = -1; |
1595 | 60 | unsigned BankLHS = -1; |
1596 | 60 | unsigned BankRHS = -1; |
1597 | 60 | if (DstBank) { |
1598 | 20 | TargetBankID = DstBank->getID(); |
1599 | 20 | if (DstBank == &AMDGPU::VCCRegBank) { |
1600 | 8 | TargetBankID = AMDGPU::VCCRegBankID; |
1601 | 8 | BankLHS = AMDGPU::VCCRegBankID; |
1602 | 8 | BankRHS = AMDGPU::VCCRegBankID; |
1603 | 12 | } else if (DstBank == &AMDGPU::SCCRegBank) { |
1604 | 8 | TargetBankID = AMDGPU::SCCRegBankID; |
1605 | 8 | BankLHS = AMDGPU::SGPRRegBankID; |
1606 | 8 | BankRHS = AMDGPU::SGPRRegBankID; |
1607 | 8 | } else { |
1608 | 4 | BankLHS = getRegBankID(MI.getOperand(1).getReg(), MRI, *TRI, |
1609 | 4 | AMDGPU::SGPRRegBankID); |
1610 | 4 | BankRHS = getRegBankID(MI.getOperand(2).getReg(), MRI, *TRI, |
1611 | 4 | AMDGPU::SGPRRegBankID); |
1612 | 4 | } |
1613 | 40 | } else { |
1614 | 40 | BankLHS = getRegBankID(MI.getOperand(1).getReg(), MRI, *TRI, |
1615 | 40 | AMDGPU::VCCRegBankID); |
1616 | 40 | BankRHS = getRegBankID(MI.getOperand(2).getReg(), MRI, *TRI, |
1617 | 40 | AMDGPU::VCCRegBankID); |
1618 | 40 | |
1619 | 40 | // Both inputs should be true booleans to produce a boolean result. |
1620 | 40 | if (BankLHS == AMDGPU::VGPRRegBankID || BankRHS == AMDGPU::VGPRRegBankID) { |
1621 | 6 | TargetBankID = AMDGPU::VGPRRegBankID; |
1622 | 34 | } else if (BankLHS == AMDGPU::VCCRegBankID || BankRHS == AMDGPU::VCCRegBankID) { |
1623 | 18 | TargetBankID = AMDGPU::VCCRegBankID; |
1624 | 18 | BankLHS = AMDGPU::VCCRegBankID; |
1625 | 18 | BankRHS = AMDGPU::VCCRegBankID; |
1626 | 16 | } else if (BankLHS == AMDGPU::SGPRRegBankID && BankRHS == AMDGPU::SGPRRegBankID) { |
1627 | 6 | TargetBankID = AMDGPU::SGPRRegBankID; |
1628 | 10 | } else if (BankLHS == AMDGPU::SCCRegBankID || BankRHS == AMDGPU::SCCRegBankID) { |
1629 | 10 | // The operation must be done on a 32-bit register, but it will set |
1630 | 10 | // scc. The result type could interchangeably be SCC or SGPR, since |
1631 | 10 | // both values will be produced. |
1632 | 10 | TargetBankID = AMDGPU::SCCRegBankID; |
1633 | 10 | BankLHS = AMDGPU::SGPRRegBankID; |
1634 | 10 | BankRHS = AMDGPU::SGPRRegBankID; |
1635 | 10 | } |
1636 | 40 | } |
1637 | 60 | |
1638 | 60 | OpdsMapping[0] = AMDGPU::getValueMapping(TargetBankID, Size); |
1639 | 60 | OpdsMapping[1] = AMDGPU::getValueMapping(BankLHS, Size); |
1640 | 60 | OpdsMapping[2] = AMDGPU::getValueMapping(BankRHS, Size); |
1641 | 60 | break; |
1642 | 60 | } |
1643 | 174 | |
1644 | 174 | if (Size == 64) { |
1645 | 126 | |
1646 | 126 | if (isSALUMapping(MI)) { |
1647 | 24 | OpdsMapping[0] = getValueMappingSGPR64Only(AMDGPU::SGPRRegBankID, Size); |
1648 | 24 | OpdsMapping[1] = OpdsMapping[2] = OpdsMapping[0]; |
1649 | 102 | } else { |
1650 | 102 | OpdsMapping[0] = getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size); |
1651 | 102 | unsigned Bank1 = getRegBankID(MI.getOperand(1).getReg(), MRI, *TRI/*, DefaultBankID*/); |
1652 | 102 | OpdsMapping[1] = AMDGPU::getValueMapping(Bank1, Size); |
1653 | 102 | |
1654 | 102 | unsigned Bank2 = getRegBankID(MI.getOperand(2).getReg(), MRI, *TRI/*, DefaultBankID*/); |
1655 | 102 | OpdsMapping[2] = AMDGPU::getValueMapping(Bank2, Size); |
1656 | 102 | } |
1657 | 126 | |
1658 | 126 | break; |
1659 | 126 | } |
1660 | 48 | |
1661 | 48 | LLVM_FALLTHROUGH; |
1662 | 48 | } |
1663 | 48 | |
1664 | 320 | case AMDGPU::G_GEP: |
1665 | 320 | case AMDGPU::G_ADD: |
1666 | 320 | case AMDGPU::G_SUB: |
1667 | 320 | case AMDGPU::G_MUL: |
1668 | 320 | case AMDGPU::G_SHL: |
1669 | 320 | case AMDGPU::G_LSHR: |
1670 | 320 | case AMDGPU::G_ASHR: |
1671 | 320 | case AMDGPU::G_UADDO: |
1672 | 320 | case AMDGPU::G_SADDO: |
1673 | 320 | case AMDGPU::G_USUBO: |
1674 | 320 | case AMDGPU::G_SSUBO: |
1675 | 320 | case AMDGPU::G_UADDE: |
1676 | 320 | case AMDGPU::G_SADDE: |
1677 | 320 | case AMDGPU::G_USUBE: |
1678 | 320 | case AMDGPU::G_SSUBE: |
1679 | 320 | case AMDGPU::G_UMULH: |
1680 | 320 | case AMDGPU::G_SMULH: |
1681 | 320 | case AMDGPU::G_SMIN: |
1682 | 320 | case AMDGPU::G_SMAX: |
1683 | 320 | case AMDGPU::G_UMIN: |
1684 | 320 | case AMDGPU::G_UMAX: |
1685 | 320 | if (isSALUMapping(MI)) |
1686 | 173 | return getDefaultMappingSOP(MI); |
1687 | 147 | LLVM_FALLTHROUGH; |
1688 | 147 | |
1689 | 235 | case AMDGPU::G_FADD: |
1690 | 235 | case AMDGPU::G_FSUB: |
1691 | 235 | case AMDGPU::G_FPTOSI: |
1692 | 235 | case AMDGPU::G_FPTOUI: |
1693 | 235 | case AMDGPU::G_FMUL: |
1694 | 235 | case AMDGPU::G_FMA: |
1695 | 235 | case AMDGPU::G_FSQRT: |
1696 | 235 | case AMDGPU::G_SITOFP: |
1697 | 235 | case AMDGPU::G_UITOFP: |
1698 | 235 | case AMDGPU::G_FPTRUNC: |
1699 | 235 | case AMDGPU::G_FPEXT: |
1700 | 235 | case AMDGPU::G_FEXP2: |
1701 | 235 | case AMDGPU::G_FLOG2: |
1702 | 235 | case AMDGPU::G_FMINNUM: |
1703 | 235 | case AMDGPU::G_FMAXNUM: |
1704 | 235 | case AMDGPU::G_FMINNUM_IEEE: |
1705 | 235 | case AMDGPU::G_FMAXNUM_IEEE: |
1706 | 235 | case AMDGPU::G_FCANONICALIZE: |
1707 | 235 | case AMDGPU::G_INTRINSIC_TRUNC: |
1708 | 235 | case AMDGPU::G_INTRINSIC_ROUND: |
1709 | 235 | return getDefaultMappingVOP(MI); |
1710 | 235 | case AMDGPU::G_IMPLICIT_DEF: { |
1711 | 22 | unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); |
1712 | 22 | OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size); |
1713 | 22 | break; |
1714 | 235 | } |
1715 | 298 | case AMDGPU::G_FCONSTANT: |
1716 | 298 | case AMDGPU::G_CONSTANT: |
1717 | 298 | case AMDGPU::G_FRAME_INDEX: |
1718 | 298 | case AMDGPU::G_BLOCK_ADDR: { |
1719 | 298 | unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); |
1720 | 298 | OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size); |
1721 | 298 | break; |
1722 | 298 | } |
1723 | 298 | case AMDGPU::G_INSERT: { |
1724 | 10 | unsigned BankID = isSALUMapping(MI) ? AMDGPU::SGPRRegBankID : |
1725 | 10 | AMDGPU::VGPRRegBankID; |
1726 | 10 | unsigned DstSize = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI); |
1727 | 10 | unsigned SrcSize = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI); |
1728 | 10 | unsigned EltSize = getSizeInBits(MI.getOperand(2).getReg(), MRI, *TRI); |
1729 | 10 | OpdsMapping[0] = AMDGPU::getValueMapping(BankID, DstSize); |
1730 | 10 | OpdsMapping[1] = AMDGPU::getValueMapping(BankID, SrcSize); |
1731 | 10 | OpdsMapping[2] = AMDGPU::getValueMapping(BankID, EltSize); |
1732 | 10 | OpdsMapping[3] = nullptr; |
1733 | 10 | break; |
1734 | 298 | } |
1735 | 298 | case AMDGPU::G_EXTRACT: { |
1736 | 13 | unsigned BankID = getRegBankID(MI.getOperand(1).getReg(), MRI, *TRI); |
1737 | 13 | unsigned DstSize = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI); |
1738 | 13 | unsigned SrcSize = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI); |
1739 | 13 | OpdsMapping[0] = AMDGPU::getValueMapping(BankID, DstSize); |
1740 | 13 | OpdsMapping[1] = AMDGPU::getValueMapping(BankID, SrcSize); |
1741 | 13 | OpdsMapping[2] = nullptr; |
1742 | 13 | break; |
1743 | 298 | } |
1744 | 298 | case AMDGPU::G_MERGE_VALUES: |
1745 | 80 | case AMDGPU::G_BUILD_VECTOR: |
1746 | 80 | case AMDGPU::G_CONCAT_VECTORS: { |
1747 | 80 | unsigned Bank = isSALUMapping(MI) ? |
1748 | 62 | AMDGPU::SGPRRegBankID : AMDGPU::VGPRRegBankID; |
1749 | 80 | unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); |
1750 | 80 | unsigned SrcSize = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits(); |
1751 | 80 | |
1752 | 80 | OpdsMapping[0] = AMDGPU::getValueMapping(Bank, DstSize); |
1753 | 80 | // Op1 and Dst should use the same register bank. |
1754 | 240 | for (unsigned i = 1, e = MI.getNumOperands(); i != e; ++i) |
1755 | 160 | OpdsMapping[i] = AMDGPU::getValueMapping(Bank, SrcSize); |
1756 | 80 | break; |
1757 | 80 | } |
1758 | 80 | case AMDGPU::G_BITCAST: |
1759 | 50 | case AMDGPU::G_INTTOPTR: |
1760 | 50 | case AMDGPU::G_PTRTOINT: |
1761 | 50 | case AMDGPU::G_CTLZ: |
1762 | 50 | case AMDGPU::G_CTLZ_ZERO_UNDEF: |
1763 | 50 | case AMDGPU::G_CTTZ: |
1764 | 50 | case AMDGPU::G_CTTZ_ZERO_UNDEF: |
1765 | 50 | case AMDGPU::G_CTPOP: |
1766 | 50 | case AMDGPU::G_BSWAP: |
1767 | 50 | case AMDGPU::G_FABS: |
1768 | 50 | case AMDGPU::G_FNEG: { |
1769 | 50 | unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); |
1770 | 50 | unsigned BankID = getRegBankID(MI.getOperand(1).getReg(), MRI, *TRI); |
1771 | 50 | OpdsMapping[0] = OpdsMapping[1] = AMDGPU::getValueMapping(BankID, Size); |
1772 | 50 | break; |
1773 | 50 | } |
1774 | 280 | case AMDGPU::G_TRUNC: { |
1775 | 280 | Register Dst = MI.getOperand(0).getReg(); |
1776 | 280 | Register Src = MI.getOperand(1).getReg(); |
1777 | 280 | unsigned Bank = getRegBankID(Src, MRI, *TRI); |
1778 | 280 | unsigned DstSize = getSizeInBits(Dst, MRI, *TRI); |
1779 | 280 | unsigned SrcSize = getSizeInBits(Src, MRI, *TRI); |
1780 | 280 | OpdsMapping[0] = AMDGPU::getValueMapping(Bank, DstSize); |
1781 | 280 | OpdsMapping[1] = AMDGPU::getValueMapping(Bank, SrcSize); |
1782 | 280 | break; |
1783 | 50 | } |
1784 | 101 | case AMDGPU::G_ZEXT: |
1785 | 101 | case AMDGPU::G_SEXT: |
1786 | 101 | case AMDGPU::G_ANYEXT: { |
1787 | 101 | Register Dst = MI.getOperand(0).getReg(); |
1788 | 101 | Register Src = MI.getOperand(1).getReg(); |
1789 | 101 | unsigned DstSize = getSizeInBits(Dst, MRI, *TRI); |
1790 | 101 | unsigned SrcSize = getSizeInBits(Src, MRI, *TRI); |
1791 | 101 | |
1792 | 101 | unsigned DstBank; |
1793 | 101 | const RegisterBank *SrcBank = getRegBank(Src, MRI, *TRI); |
1794 | 101 | assert(SrcBank); |
1795 | 101 | switch (SrcBank->getID()) { |
1796 | 101 | case AMDGPU::SCCRegBankID: |
1797 | 55 | case AMDGPU::SGPRRegBankID: |
1798 | 55 | DstBank = AMDGPU::SGPRRegBankID; |
1799 | 55 | break; |
1800 | 55 | default: |
1801 | 46 | DstBank = AMDGPU::VGPRRegBankID; |
1802 | 46 | break; |
1803 | 101 | } |
1804 | 101 | |
1805 | 101 | // TODO: Should anyext be split into 32-bit part as well? |
1806 | 101 | if (MI.getOpcode() == AMDGPU::G_ANYEXT) { |
1807 | 37 | OpdsMapping[0] = AMDGPU::getValueMapping(DstBank, DstSize); |
1808 | 37 | OpdsMapping[1] = AMDGPU::getValueMapping(SrcBank->getID(), SrcSize); |
1809 | 64 | } else { |
1810 | 64 | // Scalar extend can use 64-bit BFE, but VGPRs require extending to |
1811 | 64 | // 32-bits, and then to 64. |
1812 | 64 | OpdsMapping[0] = AMDGPU::getValueMappingSGPR64Only(DstBank, DstSize); |
1813 | 64 | OpdsMapping[1] = AMDGPU::getValueMappingSGPR64Only(SrcBank->getID(), |
1814 | 64 | SrcSize); |
1815 | 64 | } |
1816 | 101 | break; |
1817 | 101 | } |
1818 | 101 | case AMDGPU::G_FCMP: { |
1819 | 6 | unsigned Size = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits(); |
1820 | 6 | unsigned Op2Bank = getRegBankID(MI.getOperand(2).getReg(), MRI, *TRI); |
1821 | 6 | OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1); |
1822 | 6 | OpdsMapping[1] = nullptr; // Predicate Operand. |
1823 | 6 | OpdsMapping[2] = AMDGPU::getValueMapping(Op2Bank, Size); |
1824 | 6 | OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size); |
1825 | 6 | break; |
1826 | 101 | } |
1827 | 102 | case AMDGPU::G_STORE: { |
1828 | 102 | assert(MI.getOperand(0).isReg()); |
1829 | 102 | unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); |
1830 | 102 | // FIXME: We need to specify a different reg bank once scalar stores |
1831 | 102 | // are supported. |
1832 | 102 | const ValueMapping *ValMapping = |
1833 | 102 | AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size); |
1834 | 102 | // FIXME: Depending on the type of store, the pointer could be in |
1835 | 102 | // the SGPR Reg bank. |
1836 | 102 | // FIXME: Pointer size should be based on the address space. |
1837 | 102 | const ValueMapping *PtrMapping = |
1838 | 102 | AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 64); |
1839 | 102 | |
1840 | 102 | OpdsMapping[0] = ValMapping; |
1841 | 102 | OpdsMapping[1] = PtrMapping; |
1842 | 102 | break; |
1843 | 101 | } |
1844 | 101 | |
1845 | 460 | case AMDGPU::G_ICMP: { |
1846 | 460 | auto Pred = static_cast<CmpInst::Predicate>(MI.getOperand(1).getPredicate()); |
1847 | 460 | unsigned Size = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits(); |
1848 | 460 | unsigned Op2Bank = getRegBankID(MI.getOperand(2).getReg(), MRI, *TRI); |
1849 | 460 | unsigned Op3Bank = getRegBankID(MI.getOperand(3).getReg(), MRI, *TRI); |
1850 | 460 | |
1851 | 460 | bool CanUseSCC = Op2Bank == AMDGPU::SGPRRegBankID && |
1852 | 460 | Op3Bank == AMDGPU::SGPRRegBankID && |
1853 | 460 | (Size == 32 || (Size == 64 && |
1854 | 14 | (Pred == CmpInst::ICMP_EQ || Pred == CmpInst::ICMP_NE) && |
1855 | 14 | MF.getSubtarget<GCNSubtarget>().hasScalarCompareEq64())); |
1856 | 460 | |
1857 | 460 | unsigned Op0Bank = CanUseSCC ? AMDGPU::SCCRegBankID : AMDGPU::VCCRegBankID; |
1858 | 460 | |
1859 | 460 | OpdsMapping[0] = AMDGPU::getValueMapping(Op0Bank, 1); |
1860 | 460 | OpdsMapping[1] = nullptr; // Predicate Operand. |
1861 | 460 | OpdsMapping[2] = AMDGPU::getValueMapping(Op2Bank, Size); |
1862 | 460 | OpdsMapping[3] = AMDGPU::getValueMapping(Op3Bank, Size); |
1863 | 460 | break; |
1864 | 101 | } |
1865 | 101 | case AMDGPU::G_EXTRACT_VECTOR_ELT: { |
1866 | 10 | unsigned OutputBankID = isSALUMapping(MI) ? |
1867 | 8 | AMDGPU::SGPRRegBankID : AMDGPU::VGPRRegBankID; |
1868 | 10 | unsigned SrcSize = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits(); |
1869 | 10 | unsigned IdxSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits(); |
1870 | 10 | unsigned IdxBank = getRegBankID(MI.getOperand(2).getReg(), MRI, *TRI); |
1871 | 10 | |
1872 | 10 | OpdsMapping[0] = AMDGPU::getValueMapping(OutputBankID, SrcSize); |
1873 | 10 | OpdsMapping[1] = AMDGPU::getValueMapping(OutputBankID, SrcSize); |
1874 | 10 | |
1875 | 10 | // The index can be in either bank if the source vector is a VGPR. |
1876 | 10 | OpdsMapping[2] = AMDGPU::getValueMapping(IdxBank, IdxSize); |
1877 | 10 | break; |
1878 | 101 | } |
1879 | 101 | case AMDGPU::G_INSERT_VECTOR_ELT: { |
1880 | 16 | unsigned OutputBankID = isSALUMapping(MI) ? |
1881 | 14 | AMDGPU::SGPRRegBankID : AMDGPU::VGPRRegBankID; |
1882 | 16 | |
1883 | 16 | unsigned VecSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); |
1884 | 16 | unsigned InsertSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits(); |
1885 | 16 | unsigned IdxSize = MRI.getType(MI.getOperand(3).getReg()).getSizeInBits(); |
1886 | 16 | unsigned InsertEltBank = getRegBankID(MI.getOperand(2).getReg(), MRI, *TRI); |
1887 | 16 | unsigned IdxBank = getRegBankID(MI.getOperand(3).getReg(), MRI, *TRI); |
1888 | 16 | |
1889 | 16 | OpdsMapping[0] = AMDGPU::getValueMapping(OutputBankID, VecSize); |
1890 | 16 | OpdsMapping[1] = AMDGPU::getValueMapping(OutputBankID, VecSize); |
1891 | 16 | OpdsMapping[2] = AMDGPU::getValueMapping(InsertEltBank, InsertSize); |
1892 | 16 | |
1893 | 16 | // The index can be in either bank if the source vector is a VGPR. |
1894 | 16 | OpdsMapping[3] = AMDGPU::getValueMapping(IdxBank, IdxSize); |
1895 | 16 | break; |
1896 | 101 | } |
1897 | 101 | case AMDGPU::G_UNMERGE_VALUES: { |
1898 | 2 | unsigned Bank = isSALUMapping(MI) ? AMDGPU::SGPRRegBankID : |
1899 | 2 | AMDGPU::VGPRRegBankID; |
1900 | 2 | |
1901 | 2 | // Op1 and Dst should use the same register bank. |
1902 | 2 | // FIXME: Shouldn't this be the default? Why do we need to handle this? |
1903 | 8 | for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) { |
1904 | 6 | unsigned Size = getSizeInBits(MI.getOperand(i).getReg(), MRI, *TRI); |
1905 | 6 | OpdsMapping[i] = AMDGPU::getValueMapping(Bank, Size); |
1906 | 6 | } |
1907 | 2 | break; |
1908 | 101 | } |
1909 | 108 | case AMDGPU::G_INTRINSIC: { |
1910 | 108 | switch (MI.getOperand(MI.getNumExplicitDefs()).getIntrinsicID()) { |
1911 | 108 | default: |
1912 | 0 | return getInvalidInstructionMapping(); |
1913 | 108 | case Intrinsic::amdgcn_div_fmas: |
1914 | 24 | case Intrinsic::amdgcn_trig_preop: |
1915 | 24 | case Intrinsic::amdgcn_sin: |
1916 | 24 | case Intrinsic::amdgcn_cos: |
1917 | 24 | case Intrinsic::amdgcn_log_clamp: |
1918 | 24 | case Intrinsic::amdgcn_rcp: |
1919 | 24 | case Intrinsic::amdgcn_rcp_legacy: |
1920 | 24 | case Intrinsic::amdgcn_rsq: |
1921 | 24 | case Intrinsic::amdgcn_rsq_legacy: |
1922 | 24 | case Intrinsic::amdgcn_rsq_clamp: |
1923 | 24 | case Intrinsic::amdgcn_ldexp: |
1924 | 24 | case Intrinsic::amdgcn_frexp_mant: |
1925 | 24 | case Intrinsic::amdgcn_frexp_exp: |
1926 | 24 | case Intrinsic::amdgcn_fract: |
1927 | 24 | case Intrinsic::amdgcn_cvt_pkrtz: |
1928 | 24 | case Intrinsic::amdgcn_cvt_pknorm_i16: |
1929 | 24 | case Intrinsic::amdgcn_cvt_pknorm_u16: |
1930 | 24 | case Intrinsic::amdgcn_cvt_pk_i16: |
1931 | 24 | case Intrinsic::amdgcn_cvt_pk_u16: |
1932 | 24 | case Intrinsic::amdgcn_fmed3: |
1933 | 24 | case Intrinsic::amdgcn_cubeid: |
1934 | 24 | case Intrinsic::amdgcn_cubema: |
1935 | 24 | case Intrinsic::amdgcn_cubesc: |
1936 | 24 | case Intrinsic::amdgcn_cubetc: |
1937 | 24 | case Intrinsic::amdgcn_sffbh: |
1938 | 24 | case Intrinsic::amdgcn_fmad_ftz: |
1939 | 24 | case Intrinsic::amdgcn_mbcnt_lo: |
1940 | 24 | case Intrinsic::amdgcn_mbcnt_hi: |
1941 | 24 | case Intrinsic::amdgcn_ubfe: |
1942 | 24 | case Intrinsic::amdgcn_sbfe: |
1943 | 24 | case Intrinsic::amdgcn_lerp: |
1944 | 24 | case Intrinsic::amdgcn_sad_u8: |
1945 | 24 | case Intrinsic::amdgcn_msad_u8: |
1946 | 24 | case Intrinsic::amdgcn_sad_hi_u8: |
1947 | 24 | case Intrinsic::amdgcn_sad_u16: |
1948 | 24 | case Intrinsic::amdgcn_qsad_pk_u16_u8: |
1949 | 24 | case Intrinsic::amdgcn_mqsad_pk_u16_u8: |
1950 | 24 | case Intrinsic::amdgcn_mqsad_u32_u8: |
1951 | 24 | case Intrinsic::amdgcn_cvt_pk_u8_f32: |
1952 | 24 | case Intrinsic::amdgcn_alignbit: |
1953 | 24 | case Intrinsic::amdgcn_alignbyte: |
1954 | 24 | case Intrinsic::amdgcn_fdot2: |
1955 | 24 | case Intrinsic::amdgcn_sdot2: |
1956 | 24 | case Intrinsic::amdgcn_udot2: |
1957 | 24 | case Intrinsic::amdgcn_sdot4: |
1958 | 24 | case Intrinsic::amdgcn_udot4: |
1959 | 24 | case Intrinsic::amdgcn_sdot8: |
1960 | 24 | case Intrinsic::amdgcn_udot8: |
1961 | 24 | case Intrinsic::amdgcn_fdiv_fast: |
1962 | 24 | case Intrinsic::amdgcn_wwm: |
1963 | 24 | case Intrinsic::amdgcn_wqm: |
1964 | 24 | return getDefaultMappingVOP(MI); |
1965 | 24 | case Intrinsic::amdgcn_ds_permute: |
1966 | 8 | case Intrinsic::amdgcn_ds_bpermute: |
1967 | 8 | case Intrinsic::amdgcn_update_dpp: |
1968 | 8 | return getDefaultMappingAllVGPR(MI); |
1969 | 8 | case Intrinsic::amdgcn_kernarg_segment_ptr: |
1970 | 6 | case Intrinsic::amdgcn_s_getpc: |
1971 | 6 | case Intrinsic::amdgcn_groupstaticsize: { |
1972 | 6 | unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); |
1973 | 6 | OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size); |
1974 | 6 | break; |
1975 | 6 | } |
1976 | 6 | case Intrinsic::amdgcn_wqm_vote: { |
1977 | 6 | unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); |
1978 | 6 | OpdsMapping[0] = OpdsMapping[2] |
1979 | 6 | = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, Size); |
1980 | 6 | break; |
1981 | 6 | } |
1982 | 8 | case Intrinsic::amdgcn_s_buffer_load: { |
1983 | 8 | // FIXME: This should be moved to G_INTRINSIC_W_SIDE_EFFECTS |
1984 | 8 | Register RSrc = MI.getOperand(2).getReg(); // SGPR |
1985 | 8 | Register Offset = MI.getOperand(3).getReg(); // SGPR/imm |
1986 | 8 | |
1987 | 8 | unsigned Size0 = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); |
1988 | 8 | unsigned Size2 = MRI.getType(RSrc).getSizeInBits(); |
1989 | 8 | unsigned Size3 = MRI.getType(Offset).getSizeInBits(); |
1990 | 8 | |
1991 | 8 | unsigned RSrcBank = getRegBankID(RSrc, MRI, *TRI); |
1992 | 8 | unsigned OffsetBank = getRegBankID(Offset, MRI, *TRI); |
1993 | 8 | |
1994 | 8 | OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size0); |
1995 | 8 | OpdsMapping[1] = nullptr; // intrinsic id |
1996 | 8 | |
1997 | 8 | // Lie and claim everything is legal, even though some need to be |
1998 | 8 | // SGPRs. applyMapping will have to deal with it as a waterfall loop. |
1999 | 8 | OpdsMapping[2] = AMDGPU::getValueMapping(RSrcBank, Size2); // rsrc |
2000 | 8 | OpdsMapping[3] = AMDGPU::getValueMapping(OffsetBank, Size3); |
2001 | 8 | OpdsMapping[4] = nullptr; |
2002 | 8 | break; |
2003 | 6 | } |
2004 | 8 | case Intrinsic::amdgcn_div_scale: { |
2005 | 8 | unsigned Dst0Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); |
2006 | 8 | unsigned Dst1Size = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits(); |
2007 | 8 | OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Dst0Size); |
2008 | 8 | OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, Dst1Size); |
2009 | 8 | |
2010 | 8 | unsigned SrcSize = MRI.getType(MI.getOperand(3).getReg()).getSizeInBits(); |
2011 | 8 | OpdsMapping[3] = AMDGPU::getValueMapping( |
2012 | 8 | getRegBankID(MI.getOperand(3).getReg(), MRI, *TRI), SrcSize); |
2013 | 8 | OpdsMapping[4] = AMDGPU::getValueMapping( |
2014 | 8 | getRegBankID(MI.getOperand(4).getReg(), MRI, *TRI), SrcSize); |
2015 | 8 | |
2016 | 8 | break; |
2017 | 6 | } |
2018 | 8 | case Intrinsic::amdgcn_class: { |
2019 | 8 | Register Src0Reg = MI.getOperand(2).getReg(); |
2020 | 8 | Register Src1Reg = MI.getOperand(3).getReg(); |
2021 | 8 | unsigned Src0Size = MRI.getType(Src0Reg).getSizeInBits(); |
2022 | 8 | unsigned Src1Size = MRI.getType(Src1Reg).getSizeInBits(); |
2023 | 8 | unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); |
2024 | 8 | OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, DstSize); |
2025 | 8 | OpdsMapping[2] = AMDGPU::getValueMapping(getRegBankID(Src0Reg, MRI, *TRI), |
2026 | 8 | Src0Size); |
2027 | 8 | OpdsMapping[3] = AMDGPU::getValueMapping(getRegBankID(Src1Reg, MRI, *TRI), |
2028 | 8 | Src1Size); |
2029 | 8 | break; |
2030 | 6 | } |
2031 | 16 | case Intrinsic::amdgcn_icmp: |
2032 | 16 | case Intrinsic::amdgcn_fcmp: { |
2033 | 16 | unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); |
2034 | 16 | // This is not VCCRegBank because this is not used in boolean contexts. |
2035 | 16 | OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, DstSize); |
2036 | 16 | unsigned OpSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits(); |
2037 | 16 | unsigned Op1Bank = getRegBankID(MI.getOperand(2).getReg(), MRI, *TRI); |
2038 | 16 | unsigned Op2Bank = getRegBankID(MI.getOperand(3).getReg(), MRI, *TRI); |
2039 | 16 | OpdsMapping[2] = AMDGPU::getValueMapping(Op1Bank, OpSize); |
2040 | 16 | OpdsMapping[3] = AMDGPU::getValueMapping(Op2Bank, OpSize); |
2041 | 16 | break; |
2042 | 16 | } |
2043 | 16 | case Intrinsic::amdgcn_readlane: { |
2044 | 8 | // This must be an SGPR, but accept a VGPR. |
2045 | 8 | unsigned IdxReg = MI.getOperand(3).getReg(); |
2046 | 8 | unsigned IdxSize = MRI.getType(IdxReg).getSizeInBits(); |
2047 | 8 | unsigned IdxBank = getRegBankID(IdxReg, MRI, *TRI, AMDGPU::SGPRRegBankID); |
2048 | 8 | OpdsMapping[3] = AMDGPU::getValueMapping(IdxBank, IdxSize); |
2049 | 8 | LLVM_FALLTHROUGH; |
2050 | 8 | } |
2051 | 12 | case Intrinsic::amdgcn_readfirstlane: { |
2052 | 12 | unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); |
2053 | 12 | unsigned SrcSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits(); |
2054 | 12 | OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, DstSize); |
2055 | 12 | OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, SrcSize); |
2056 | 12 | break; |
2057 | 8 | } |
2058 | 10 | case Intrinsic::amdgcn_writelane: { |
2059 | 10 | unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); |
2060 | 10 | unsigned SrcReg = MI.getOperand(2).getReg(); |
2061 | 10 | unsigned SrcSize = MRI.getType(SrcReg).getSizeInBits(); |
2062 | 10 | unsigned SrcBank = getRegBankID(SrcReg, MRI, *TRI, AMDGPU::SGPRRegBankID); |
2063 | 10 | unsigned IdxReg = MI.getOperand(3).getReg(); |
2064 | 10 | unsigned IdxSize = MRI.getType(IdxReg).getSizeInBits(); |
2065 | 10 | unsigned IdxBank = getRegBankID(IdxReg, MRI, *TRI, AMDGPU::SGPRRegBankID); |
2066 | 10 | OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, DstSize); |
2067 | 10 | |
2068 | 10 | // These 2 must be SGPRs, but accept VGPRs. Readfirstlane will be inserted |
2069 | 10 | // to legalize. |
2070 | 10 | OpdsMapping[2] = AMDGPU::getValueMapping(SrcBank, SrcSize); |
2071 | 10 | OpdsMapping[3] = AMDGPU::getValueMapping(IdxBank, IdxSize); |
2072 | 10 | OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, SrcSize); |
2073 | 10 | break; |
2074 | 8 | } |
2075 | 8 | case Intrinsic::amdgcn_if_break: { |
2076 | 2 | unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI); |
2077 | 2 | OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size); |
2078 | 2 | OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1); |
2079 | 2 | OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size); |
2080 | 2 | break; |
2081 | 76 | } |
2082 | 76 | } |
2083 | 76 | break; |
2084 | 76 | } |
2085 | 98 | case AMDGPU::G_INTRINSIC_W_SIDE_EFFECTS: { |
2086 | 98 | switch (MI.getOperand(MI.getNumExplicitDefs()).getIntrinsicID()) { |
2087 | 98 | default: |
2088 | 0 | return getInvalidInstructionMapping(); |
2089 | 98 | case Intrinsic::amdgcn_s_getreg: |
2090 | 8 | case Intrinsic::amdgcn_s_memtime: |
2091 | 8 | case Intrinsic::amdgcn_s_memrealtime: |
2092 | 8 | case Intrinsic::amdgcn_s_get_waveid_in_workgroup: { |
2093 | 8 | unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); |
2094 | 8 | OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size); |
2095 | 8 | break; |
2096 | 8 | } |
2097 | 40 | case Intrinsic::amdgcn_ds_append: |
2098 | 40 | case Intrinsic::amdgcn_ds_consume: |
2099 | 40 | case Intrinsic::amdgcn_ds_fadd: |
2100 | 40 | case Intrinsic::amdgcn_ds_fmin: |
2101 | 40 | case Intrinsic::amdgcn_ds_fmax: |
2102 | 40 | case Intrinsic::amdgcn_atomic_inc: |
2103 | 40 | case Intrinsic::amdgcn_atomic_dec: |
2104 | 40 | return getDefaultMappingAllVGPR(MI); |
2105 | 40 | case Intrinsic::amdgcn_ds_ordered_add: |
2106 | 16 | case Intrinsic::amdgcn_ds_ordered_swap: { |
2107 | 16 | unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); |
2108 | 16 | OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, DstSize); |
2109 | 16 | unsigned M0Bank = getRegBankID(MI.getOperand(2).getReg(), MRI, *TRI, |
2110 | 16 | AMDGPU::SGPRRegBankID); |
2111 | 16 | OpdsMapping[2] = AMDGPU::getValueMapping(M0Bank, 32); |
2112 | 16 | OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32); |
2113 | 16 | break; |
2114 | 16 | } |
2115 | 16 | case Intrinsic::amdgcn_exp_compr: |
2116 | 4 | OpdsMapping[0] = nullptr; // IntrinsicID |
2117 | 4 | // FIXME: These are immediate values which can't be read from registers. |
2118 | 4 | OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32); |
2119 | 4 | OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32); |
2120 | 4 | // FIXME: Could we support packed types here? |
2121 | 4 | OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32); |
2122 | 4 | OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32); |
2123 | 4 | // FIXME: These are immediate values which can't be read from registers. |
2124 | 4 | OpdsMapping[5] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32); |
2125 | 4 | OpdsMapping[6] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32); |
2126 | 4 | break; |
2127 | 16 | case Intrinsic::amdgcn_exp: |
2128 | 4 | OpdsMapping[0] = nullptr; // IntrinsicID |
2129 | 4 | // FIXME: These are immediate values which can't be read from registers. |
2130 | 4 | OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32); |
2131 | 4 | OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32); |
2132 | 4 | // FIXME: Could we support packed types here? |
2133 | 4 | OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32); |
2134 | 4 | OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32); |
2135 | 4 | OpdsMapping[5] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32); |
2136 | 4 | OpdsMapping[6] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32); |
2137 | 4 | // FIXME: These are immediate values which can't be read from registers. |
2138 | 4 | OpdsMapping[7] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32); |
2139 | 4 | OpdsMapping[8] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32); |
2140 | 4 | break; |
2141 | 16 | case Intrinsic::amdgcn_buffer_load: { |
2142 | 16 | Register RSrc = MI.getOperand(2).getReg(); // SGPR |
2143 | 16 | Register VIndex = MI.getOperand(3).getReg(); // VGPR |
2144 | 16 | Register Offset = MI.getOperand(4).getReg(); // SGPR/VGPR/imm |
2145 | 16 | |
2146 | 16 | unsigned Size0 = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); |
2147 | 16 | unsigned Size2 = MRI.getType(RSrc).getSizeInBits(); |
2148 | 16 | unsigned Size3 = MRI.getType(VIndex).getSizeInBits(); |
2149 | 16 | unsigned Size4 = MRI.getType(Offset).getSizeInBits(); |
2150 | 16 | |
2151 | 16 | unsigned RSrcBank = getRegBankID(RSrc, MRI, *TRI); |
2152 | 16 | unsigned OffsetBank = getRegBankID(Offset, MRI, *TRI); |
2153 | 16 | |
2154 | 16 | OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size0); |
2155 | 16 | OpdsMapping[1] = nullptr; // intrinsic id |
2156 | 16 | |
2157 | 16 | // Lie and claim everything is legal, even though some need to be |
2158 | 16 | // SGPRs. applyMapping will have to deal with it as a waterfall loop. |
2159 | 16 | OpdsMapping[2] = AMDGPU::getValueMapping(RSrcBank, Size2); // rsrc |
2160 | 16 | OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size3); |
2161 | 16 | OpdsMapping[4] = AMDGPU::getValueMapping(OffsetBank, Size4); |
2162 | 16 | OpdsMapping[5] = nullptr; |
2163 | 16 | OpdsMapping[6] = nullptr; |
2164 | 16 | break; |
2165 | 16 | } |
2166 | 16 | case Intrinsic::amdgcn_s_sendmsg: |
2167 | 8 | case Intrinsic::amdgcn_s_sendmsghalt: { |
2168 | 8 | // This must be an SGPR, but accept a VGPR. |
2169 | 8 | unsigned Bank = getRegBankID(MI.getOperand(2).getReg(), MRI, *TRI, |
2170 | 8 | AMDGPU::SGPRRegBankID); |
2171 | 8 | OpdsMapping[2] = AMDGPU::getValueMapping(Bank, 32); |
2172 | 8 | break; |
2173 | 8 | } |
2174 | 8 | case Intrinsic::amdgcn_end_cf: { |
2175 | 2 | unsigned Size = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI); |
2176 | 2 | OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size); |
2177 | 2 | break; |
2178 | 58 | } |
2179 | 58 | } |
2180 | 58 | break; |
2181 | 58 | } |
2182 | 160 | case AMDGPU::G_SELECT: { |
2183 | 160 | unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); |
2184 | 160 | unsigned Op2Bank = getRegBankID(MI.getOperand(2).getReg(), MRI, *TRI, |
2185 | 160 | AMDGPU::SGPRRegBankID); |
2186 | 160 | unsigned Op3Bank = getRegBankID(MI.getOperand(3).getReg(), MRI, *TRI, |
2187 | 160 | AMDGPU::SGPRRegBankID); |
2188 | 160 | bool SGPRSrcs = Op2Bank == AMDGPU::SGPRRegBankID && |
2189 | 160 | Op3Bank == AMDGPU::SGPRRegBankID; |
2190 | 160 | |
2191 | 160 | unsigned CondBankDefault = SGPRSrcs ? |
2192 | 96 | AMDGPU::SCCRegBankID : AMDGPU::VCCRegBankID; |
2193 | 160 | unsigned CondBank = getRegBankID(MI.getOperand(1).getReg(), MRI, *TRI, |
2194 | 160 | CondBankDefault); |
2195 | 160 | if (CondBank == AMDGPU::SGPRRegBankID) |
2196 | 36 | CondBank = SGPRSrcs ? AMDGPU::SCCRegBankID : AMDGPU::VCCRegBankID; |
2197 | 124 | else if (CondBank == AMDGPU::VGPRRegBankID) |
2198 | 40 | CondBank = AMDGPU::VCCRegBankID; |
2199 | 160 | |
2200 | 160 | unsigned Bank = SGPRSrcs && CondBank == AMDGPU::SCCRegBankID ? |
2201 | 124 | AMDGPU::SGPRRegBankID : AMDGPU::VGPRRegBankID; |
2202 | 160 | |
2203 | 160 | assert(CondBank == AMDGPU::VCCRegBankID || CondBank == AMDGPU::SCCRegBankID); |
2204 | 160 | |
2205 | 160 | if (Size == 64) { |
2206 | 64 | OpdsMapping[0] = AMDGPU::getValueMappingSGPR64Only(Bank, Size); |
2207 | 64 | OpdsMapping[1] = AMDGPU::getValueMapping(CondBank, 1); |
2208 | 64 | OpdsMapping[2] = AMDGPU::getValueMappingSGPR64Only(Bank, Size); |
2209 | 64 | OpdsMapping[3] = AMDGPU::getValueMappingSGPR64Only(Bank, Size); |
2210 | 96 | } else { |
2211 | 96 | OpdsMapping[0] = AMDGPU::getValueMapping(Bank, Size); |
2212 | 96 | OpdsMapping[1] = AMDGPU::getValueMapping(CondBank, 1); |
2213 | 96 | OpdsMapping[2] = AMDGPU::getValueMapping(Bank, Size); |
2214 | 96 | OpdsMapping[3] = AMDGPU::getValueMapping(Bank, Size); |
2215 | 96 | } |
2216 | 160 | |
2217 | 160 | break; |
2218 | 58 | } |
2219 | 58 | |
2220 | 172 | case AMDGPU::G_LOAD: |
2221 | 172 | return getInstrMappingForLoad(MI); |
2222 | 58 | |
2223 | 66 | case AMDGPU::G_ATOMICRMW_XCHG: |
2224 | 66 | case AMDGPU::G_ATOMICRMW_ADD: |
2225 | 66 | case AMDGPU::G_ATOMICRMW_SUB: |
2226 | 66 | case AMDGPU::G_ATOMICRMW_AND: |
2227 | 66 | case AMDGPU::G_ATOMICRMW_OR: |
2228 | 66 | case AMDGPU::G_ATOMICRMW_XOR: |
2229 | 66 | case AMDGPU::G_ATOMICRMW_MAX: |
2230 | 66 | case AMDGPU::G_ATOMICRMW_MIN: |
2231 | 66 | case AMDGPU::G_ATOMICRMW_UMAX: |
2232 | 66 | case AMDGPU::G_ATOMICRMW_UMIN: |
2233 | 66 | case AMDGPU::G_ATOMIC_CMPXCHG: { |
2234 | 66 | return getDefaultMappingAllVGPR(MI); |
2235 | 66 | } |
2236 | 118 | case AMDGPU::G_BRCOND: { |
2237 | 118 | unsigned Bank = getRegBankID(MI.getOperand(0).getReg(), MRI, *TRI, |
2238 | 118 | AMDGPU::SGPRRegBankID); |
2239 | 118 | assert(MRI.getType(MI.getOperand(0).getReg()).getSizeInBits() == 1); |
2240 | 118 | if (Bank != AMDGPU::SCCRegBankID) |
2241 | 20 | Bank = AMDGPU::VCCRegBankID; |
2242 | 118 | |
2243 | 118 | OpdsMapping[0] = AMDGPU::getValueMapping(Bank, 1); |
2244 | 118 | break; |
2245 | 2.04k | } |
2246 | 2.04k | } |
2247 | 2.04k | |
2248 | 2.04k | return getInstructionMapping(/*ID*/1, /*Cost*/1, |
2249 | 2.04k | getOperandsMapping(OpdsMapping), |
2250 | 2.04k | MI.getNumOperands()); |
2251 | 2.04k | } |
2252 | | |
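The G_ICMP mapping above only lets a compare write SCC when both operands are scalar and the width fits the scalar compare instructions: 32 bits always, 64 bits only for equality or inequality on subtargets reporting hasScalarCompareEq64(). A standalone restatement of that predicate as a sketch; the enum and boolean flag are stand-ins for CmpInst::Predicate and the real subtarget query:

enum class Pred { EQ, NE, Other };

// Mirrors the CanUseSCC condition in the G_ICMP case of getInstrMapping.
static bool canUseSCCCompare(bool Op2IsSGPR, bool Op3IsSGPR, unsigned Size,
                             Pred P, bool HasScalarCompareEq64) {
  if (!Op2IsSGPR || !Op3IsSGPR)
    return false;               // a divergent operand forces a VCC result
  if (Size == 32)
    return true;                // 32-bit scalar compares always exist
  return Size == 64 && (P == Pred::EQ || P == Pred::NE) &&
         HasScalarCompareEq64;  // 64-bit only for eq/ne on capable subtargets
}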