//===- SILoadStoreOptimizer.cpp -------------------------------------------===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This pass tries to fuse DS instructions with nearby immediate offsets.
// This will fuse operations such as
//  ds_read_b32 v0, v2 offset:16
//  ds_read_b32 v1, v2 offset:32
// ==>
//   ds_read2_b32 v[0:1], v2, offset0:4 offset1:8
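// (The read2/write2 forms encode their offsets in units of the element size,
// so the byte offsets 16 and 32 above become offset0:4 and offset1:8 for
// 4-byte elements.)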
//
// Future improvements:
//
// - This currently relies on the scheduler to place loads and stores next to
//   each other, and then only merges adjacent pairs of instructions. It would
//   be good to be more flexible with interleaved instructions, and possibly run
//   before scheduling. It currently misses stores of constants because loading
//   the constant into the data register is placed between the stores, although
//   this is arguably a scheduling problem.
//
// - Live interval recomputing seems inefficient. This currently only matches
//   one pair, and recomputes live intervals and moves on to the next pair. It
//   would be better to compute a list of all merges that need to occur.
//
// - With a list of instructions to process, we can also merge more. If a
//   cluster of loads have offsets that are too large to fit in the 8-bit
//   offsets, but are close enough together that the reduced offsets would fit
//   after adjusting the base pointer, we can rewrite the base and use the new
//   smaller offsets.
//
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "AMDGPUSubtarget.h"
#include "SIInstrInfo.h"
#include "SIRegisterInfo.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/DenseSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/Pass.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include <algorithm>
#include <cassert>
#include <cstdlib>
#include <iterator>
#include <utility>

using namespace llvm;

#define DEBUG_TYPE "si-load-store-opt"

namespace {

class SILoadStoreOptimizer : public MachineFunctionPass {
  // Bookkeeping for a pair of mergeable DS instructions.
  struct CombineInfo {
    MachineBasicBlock::iterator I;      // First instruction of the pair.
    MachineBasicBlock::iterator Paired; // Matching instruction found later.
    unsigned EltSize;                   // Access size in bytes (4 or 8).
    unsigned Offset0;                   // Offset of I, re-encoded for the merge.
    unsigned Offset1;                   // Offset of Paired, re-encoded likewise.
    unsigned BaseOff;                   // Byte offset folded into the base, if any.
    bool UseST64;                       // Whether to use the stride-64 form.
    SmallVector<MachineInstr*, 8> InstsToMove; // Insts to sink past the merge.
  };

private:
  const SIInstrInfo *TII = nullptr;
  const SIRegisterInfo *TRI = nullptr;
  MachineRegisterInfo *MRI = nullptr;
  AliasAnalysis *AA = nullptr;

  static bool offsetsCanBeCombined(CombineInfo &CI);

  bool findMatchingDSInst(CombineInfo &CI);

  MachineBasicBlock::iterator mergeRead2Pair(CombineInfo &CI);

  MachineBasicBlock::iterator mergeWrite2Pair(CombineInfo &CI);

public:
  static char ID;

  SILoadStoreOptimizer() : MachineFunctionPass(ID) {
    initializeSILoadStoreOptimizerPass(*PassRegistry::getPassRegistry());
  }

  bool optimizeBlock(MachineBasicBlock &MBB);

  bool runOnMachineFunction(MachineFunction &MF) override;

  StringRef getPassName() const override { return "SI Load / Store Optimizer"; }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.setPreservesCFG();
    AU.addRequired<AAResultsWrapperPass>();

    MachineFunctionPass::getAnalysisUsage(AU);
  }
};

} // end anonymous namespace.

INITIALIZE_PASS_BEGIN(SILoadStoreOptimizer, DEBUG_TYPE,
                      "SI Load / Store Optimizer", false, false)
INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
INITIALIZE_PASS_END(SILoadStoreOptimizer, DEBUG_TYPE,
                    "SI Load / Store Optimizer", false, false)

char SILoadStoreOptimizer::ID = 0;

char &llvm::SILoadStoreOptimizerID = SILoadStoreOptimizer::ID;

FunctionPass *llvm::createSILoadStoreOptimizerPass() {
  return new SILoadStoreOptimizer();
}

static void moveInstsAfter(MachineBasicBlock::iterator I,
                           ArrayRef<MachineInstr*> InstsToMove) {
  MachineBasicBlock *MBB = I->getParent();
  ++I;
  for (MachineInstr *MI : InstsToMove) {
    MI->removeFromParent();
    MBB->insert(I, MI);
  }
}

static void addDefsToList(const MachineInstr &MI, DenseSet<unsigned> &Defs) {
  // XXX: Should this be looking for implicit defs?
  for (const MachineOperand &Def : MI.defs())
    Defs.insert(Def.getReg());
}

static bool memAccessesCanBeReordered(MachineBasicBlock::iterator A,
                                      MachineBasicBlock::iterator B,
                                      const SIInstrInfo *TII,
                                      AliasAnalysis * AA) {
  // RAW or WAR - cannot reorder
  // WAW - cannot reorder
  // RAR - safe to reorder
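  // In other words, a pair of loads may always be reordered; as soon as one
  // side writes memory, we need areMemAccessesTriviallyDisjoint (backed by AA)
  // to prove the two accesses cannot overlap.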
  return !(A->mayStore() || B->mayStore()) ||
    TII->areMemAccessesTriviallyDisjoint(*A, *B, AA);
}

// Add MI and its defs to the lists if MI reads one of the defs that are
// already in the list. Returns true in that case.
static bool
addToListsIfDependent(MachineInstr &MI,
                      DenseSet<unsigned> &Defs,
                      SmallVectorImpl<MachineInstr*> &Insts) {
  for (MachineOperand &Use : MI.operands()) {
    // If one of the defs is read, then there is a use of Def between I and the
    // instruction that I will potentially be merged with. We will need to move
    // this instruction after the merged instructions.

    if (Use.isReg() && Use.readsReg() && Defs.count(Use.getReg())) {
      Insts.push_back(&MI);
      addDefsToList(MI, Defs);
      return true;
    }
  }

  return false;
}

static bool
canMoveInstsAcrossMemOp(MachineInstr &MemOp,
                        ArrayRef<MachineInstr*> InstsToMove,
                        const SIInstrInfo *TII,
                        AliasAnalysis *AA) {
  assert(MemOp.mayLoadOrStore());

  for (MachineInstr *InstToMove : InstsToMove) {
    if (!InstToMove->mayLoadOrStore())
      continue;
    if (!memAccessesCanBeReordered(MemOp, *InstToMove, TII, AA))
      return false;
  }
  return true;
}

bool SILoadStoreOptimizer::offsetsCanBeCombined(CombineInfo &CI) {
  // XXX - Would the same offset be OK? Is there any reason this would happen or
  // be useful?
  if (CI.Offset0 == CI.Offset1)
    return false;

  // This won't be valid if the offset isn't aligned.
  if ((CI.Offset0 % CI.EltSize != 0) || (CI.Offset1 % CI.EltSize != 0))
    return false;

  unsigned EltOffset0 = CI.Offset0 / CI.EltSize;
  unsigned EltOffset1 = CI.Offset1 / CI.EltSize;
  CI.UseST64 = false;
  CI.BaseOff = 0;

  // If the offset in elements doesn't fit in 8-bits, we might be able to use
  // the stride 64 versions.
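  // For example, 4-byte elements at byte offsets 16384 and 16640 have element
  // offsets 4096 and 4160, which do not fit in 8 bits; both are multiples of
  // 64, so the ST64 forms can encode them as offset0:64 and offset1:65.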
  if ((EltOffset0 % 64 == 0) && (EltOffset1 % 64) == 0 &&
      isUInt<8>(EltOffset0 / 64) && isUInt<8>(EltOffset1 / 64)) {
    CI.Offset0 = EltOffset0 / 64;
    CI.Offset1 = EltOffset1 / 64;
    CI.UseST64 = true;
    return true;
  }

  // Check if the new offsets fit in the reduced 8-bit range.
  if (isUInt<8>(EltOffset0) && isUInt<8>(EltOffset1)) {
    CI.Offset0 = EltOffset0;
    CI.Offset1 = EltOffset1;
    return true;
  }

  // Try to shift base address to decrease offsets.
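  // BaseOff is the smaller of the two byte offsets; the merge routines later
  // emit a V_ADD_I32 of BaseOff to the base register, so only the difference
  // between the two offsets still has to be encoded. E.g. byte offsets 4096
  // and 4112 on 4-byte elements become offset0:0 and offset1:4 with
  // BaseOff == 4096.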
  unsigned OffsetDiff = std::abs((int)EltOffset1 - (int)EltOffset0);
  CI.BaseOff = std::min(CI.Offset0, CI.Offset1);

  if ((OffsetDiff % 64 == 0) && isUInt<8>(OffsetDiff / 64)) {
    CI.Offset0 = (EltOffset0 - CI.BaseOff / CI.EltSize) / 64;
    CI.Offset1 = (EltOffset1 - CI.BaseOff / CI.EltSize) / 64;
    CI.UseST64 = true;
    return true;
  }

  if (isUInt<8>(OffsetDiff)) {
    CI.Offset0 = EltOffset0 - CI.BaseOff / CI.EltSize;
    CI.Offset1 = EltOffset1 - CI.BaseOff / CI.EltSize;
    return true;
  }

  return false;
}

bool SILoadStoreOptimizer::findMatchingDSInst(CombineInfo &CI) {
  MachineBasicBlock *MBB = CI.I->getParent();
  MachineBasicBlock::iterator E = MBB->end();
  MachineBasicBlock::iterator MBBI = CI.I;

  int AddrIdx = AMDGPU::getNamedOperandIdx(CI.I->getOpcode(),
                                           AMDGPU::OpName::addr);
  const MachineOperand &AddrReg0 = CI.I->getOperand(AddrIdx);

  // We only ever merge operations with the same base address register, so don't
  // bother scanning forward if there are no other uses.
  if (TargetRegisterInfo::isPhysicalRegister(AddrReg0.getReg()) ||
      MRI->hasOneNonDBGUse(AddrReg0.getReg()))
    return false;

  ++MBBI;

  DenseSet<unsigned> DefsToMove;
  addDefsToList(*CI.I, DefsToMove);

  for ( ; MBBI != E; ++MBBI) {
    if (MBBI->getOpcode() != CI.I->getOpcode()) {
      // This is not a matching DS instruction, but we can keep looking as
      // long as one of these conditions is met:
      // 1. It is safe to move I down past MBBI.
      // 2. It is safe to move MBBI down past the instruction that I will
      //    be merged into.

      if (MBBI->hasUnmodeledSideEffects()) {
        // We can't re-order this instruction with respect to other memory
        // operations, so we fail both conditions mentioned above.
        return false;
      }

      if (MBBI->mayLoadOrStore() &&
          !memAccessesCanBeReordered(*CI.I, *MBBI, TII, AA)) {
        // We fail condition #1, but we may still be able to satisfy condition
        // #2. Add this instruction to the move list and then we will check
        // if condition #2 holds once we have selected the matching instruction.
        CI.InstsToMove.push_back(&*MBBI);
        addDefsToList(*MBBI, DefsToMove);
        continue;
      }

      // When we match I with another DS instruction we will be moving I down
      // to the location of the matched instruction, so any uses of I will need
      // to be moved down as well.
      addToListsIfDependent(*MBBI, DefsToMove, CI.InstsToMove);
      continue;
    }

    // Don't merge volatile or otherwise ordered accesses.
    if (MBBI->hasOrderedMemoryRef())
      return false;

    // Handle a case like
    //   DS_WRITE_B32 addr, v, idx0
    //   w = DS_READ_B32 addr, idx0
    //   DS_WRITE_B32 addr, f(w), idx1
    // where the DS_READ_B32 ends up in InstsToMove and therefore prevents
    // merging of the two writes.
    if (addToListsIfDependent(*MBBI, DefsToMove, CI.InstsToMove))
      continue;

    const MachineOperand &AddrReg1 = MBBI->getOperand(AddrIdx);

    // Check same base pointer. Be careful of subregisters, which can occur with
    // vectors of pointers.
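    // (Two address operands can name the same virtual register but different
    // lanes of it, e.g. vreg:sub0 vs. vreg:sub1 for pointers extracted from a
    // vector, so comparing the register alone is not enough.)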
    if (AddrReg0.getReg() == AddrReg1.getReg() &&
        AddrReg0.getSubReg() == AddrReg1.getSubReg()) {
      int OffsetIdx = AMDGPU::getNamedOperandIdx(CI.I->getOpcode(),
                                                 AMDGPU::OpName::offset);
      CI.Offset0 = CI.I->getOperand(OffsetIdx).getImm() & 0xffff;
      CI.Offset1 = MBBI->getOperand(OffsetIdx).getImm() & 0xffff;
      CI.Paired = MBBI;

      // Check both offsets fit in the reduced range.
      // We also need to go through the list of instructions that we plan to
      // move and make sure they are all safe to move down past the merged
      // instruction.
      if (offsetsCanBeCombined(CI))
        if (canMoveInstsAcrossMemOp(*MBBI, CI.InstsToMove, TII, AA))
          return true;
    }

    // We've found a load/store that we couldn't merge for some reason.
    // We could potentially keep looking, but we'd need to make sure that
    // it was safe to move I and also all the instructions in InstsToMove
    // down past this instruction.
    // Check if we can move I across MBBI and if we can move all of I's users.
    if (!memAccessesCanBeReordered(*CI.I, *MBBI, TII, AA) ||
        !canMoveInstsAcrossMemOp(*MBBI, CI.InstsToMove, TII, AA))
      break;
  }
  return false;
}

MachineBasicBlock::iterator SILoadStoreOptimizer::mergeRead2Pair(
  CombineInfo &CI) {
  MachineBasicBlock *MBB = CI.I->getParent();

  // Be careful, since the addresses could be subregisters themselves in weird
  // cases, like vectors of pointers.
  const auto *AddrReg = TII->getNamedOperand(*CI.I, AMDGPU::OpName::addr);

  const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdst);
  const auto *Dest1 = TII->getNamedOperand(*CI.Paired, AMDGPU::OpName::vdst);

  unsigned NewOffset0 = CI.Offset0;
  unsigned NewOffset1 = CI.Offset1;
  unsigned Opc = (CI.EltSize == 4) ? AMDGPU::DS_READ2_B32
                                   : AMDGPU::DS_READ2_B64;

  if (CI.UseST64)
    Opc = (CI.EltSize == 4) ? AMDGPU::DS_READ2ST64_B32
                            : AMDGPU::DS_READ2ST64_B64;

  unsigned SubRegIdx0 = (CI.EltSize == 4) ? AMDGPU::sub0 : AMDGPU::sub0_sub1;
  unsigned SubRegIdx1 = (CI.EltSize == 4) ? AMDGPU::sub1 : AMDGPU::sub2_sub3;

  if (NewOffset0 > NewOffset1) {
    // Canonicalize the merged instruction so the smaller offset comes first.
    std::swap(NewOffset0, NewOffset1);
    std::swap(SubRegIdx0, SubRegIdx1);
  }

  assert((isUInt<8>(NewOffset0) && isUInt<8>(NewOffset1)) &&
         (NewOffset0 != NewOffset1) &&
         "Computed offset doesn't fit");

  const MCInstrDesc &Read2Desc = TII->get(Opc);

  const TargetRegisterClass *SuperRC
    = (CI.EltSize == 4) ? &AMDGPU::VReg_64RegClass : &AMDGPU::VReg_128RegClass;
  unsigned DestReg = MRI->createVirtualRegister(SuperRC);

  DebugLoc DL = CI.I->getDebugLoc();

  unsigned BaseReg = AddrReg->getReg();
  unsigned BaseRegFlags = 0;
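  // If offsetsCanBeCombined decided to re-base the offsets, materialize the
  // adjusted base address here. The add result is only consumed by the merged
  // read2, so it can be marked killed at that use.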
  if (CI.BaseOff) {
    BaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    BaseRegFlags = RegState::Kill;
    BuildMI(*MBB, CI.Paired, DL, TII->get(AMDGPU::V_ADD_I32_e32), BaseReg)
      .addImm(CI.BaseOff)
      .addReg(AddrReg->getReg());
  }

  MachineInstrBuilder Read2 =
    BuildMI(*MBB, CI.Paired, DL, Read2Desc, DestReg)
      .addReg(BaseReg, BaseRegFlags) // addr
      .addImm(NewOffset0)            // offset0
      .addImm(NewOffset1)            // offset1
      .addImm(0)                     // gds
      .setMemRefs(CI.I->mergeMemRefsWith(*CI.Paired));

  (void)Read2; // Only referenced by the DEBUG output below.

  const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);

  // Copy to the old destination registers.
  BuildMI(*MBB, CI.Paired, DL, CopyDesc)
    .add(*Dest0) // Copy to same destination including flags and sub reg.
    .addReg(DestReg, 0, SubRegIdx0);
  MachineInstr *Copy1 = BuildMI(*MBB, CI.Paired, DL, CopyDesc)
                          .add(*Dest1)
                          .addReg(DestReg, RegState::Kill, SubRegIdx1);

  moveInstsAfter(Copy1, CI.InstsToMove);

  MachineBasicBlock::iterator Next = std::next(CI.I);
  CI.I->eraseFromParent();
  CI.Paired->eraseFromParent();

  DEBUG(dbgs() << "Inserted read2: " << *Read2 << '\n');
  return Next;
}

MachineBasicBlock::iterator SILoadStoreOptimizer::mergeWrite2Pair(
  CombineInfo &CI) {
  MachineBasicBlock *MBB = CI.I->getParent();

  // Be sure to use .add() with these operands, and not .addReg(). We want to
  // be sure we preserve the subregister index and any register flags set on
  // them.
  const MachineOperand *Addr = TII->getNamedOperand(*CI.I, AMDGPU::OpName::addr);
  const MachineOperand *Data0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::data0);
  const MachineOperand *Data1
    = TII->getNamedOperand(*CI.Paired, AMDGPU::OpName::data0);

  unsigned NewOffset0 = CI.Offset0;
  unsigned NewOffset1 = CI.Offset1;
  unsigned Opc = (CI.EltSize == 4) ? AMDGPU::DS_WRITE2_B32
                                   : AMDGPU::DS_WRITE2_B64;

  if (CI.UseST64)
    Opc = (CI.EltSize == 4) ? AMDGPU::DS_WRITE2ST64_B32
                            : AMDGPU::DS_WRITE2ST64_B64;

  if (NewOffset0 > NewOffset1) {
    // Canonicalize the merged instruction so the smaller offset comes first.
    std::swap(NewOffset0, NewOffset1);
    std::swap(Data0, Data1);
  }

  assert((isUInt<8>(NewOffset0) && isUInt<8>(NewOffset1)) &&
         (NewOffset0 != NewOffset1) &&
         "Computed offset doesn't fit");

  const MCInstrDesc &Write2Desc = TII->get(Opc);
  DebugLoc DL = CI.I->getDebugLoc();

  unsigned BaseReg = Addr->getReg();
  unsigned BaseRegFlags = 0;
  if (CI.BaseOff) {
    BaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    BaseRegFlags = RegState::Kill;
    BuildMI(*MBB, CI.Paired, DL, TII->get(AMDGPU::V_ADD_I32_e32), BaseReg)
      .addImm(CI.BaseOff)
      .addReg(Addr->getReg());
  }

  MachineInstrBuilder Write2 =
    BuildMI(*MBB, CI.Paired, DL, Write2Desc)
      .addReg(BaseReg, BaseRegFlags) // addr
      .add(*Data0)                   // data0
      .add(*Data1)                   // data1
      .addImm(NewOffset0)            // offset0
      .addImm(NewOffset1)            // offset1
      .addImm(0)                     // gds
      .setMemRefs(CI.I->mergeMemRefsWith(*CI.Paired));

  moveInstsAfter(Write2, CI.InstsToMove);

  MachineBasicBlock::iterator Next = std::next(CI.I);
  CI.I->eraseFromParent();
  CI.Paired->eraseFromParent();

  DEBUG(dbgs() << "Inserted write2 inst: " << *Write2 << '\n');
  return Next;
}

// Scan through looking for adjacent LDS operations with constant offsets from
// the same base register. We rely on the scheduler to do the hard work of
// clustering nearby loads, and assume these are all adjacent.
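// Only plain DS_READ_B32/B64 and DS_WRITE_B32/B64 are merge candidates;
// EltSize records the access size in bytes (4 or 8) and selects between the
// _B32 and _B64 merged forms.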
bool SILoadStoreOptimizer::optimizeBlock(MachineBasicBlock &MBB) {
  bool Modified = false;

  for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); I != E;) {
    MachineInstr &MI = *I;

    // Don't combine if volatile.
    if (MI.hasOrderedMemoryRef()) {
      ++I;
      continue;
    }

    CombineInfo CI;
    CI.I = I;
    unsigned Opc = MI.getOpcode();
    if (Opc == AMDGPU::DS_READ_B32 || Opc == AMDGPU::DS_READ_B64) {
      CI.EltSize = (Opc == AMDGPU::DS_READ_B64) ? 8 : 4;
      if (findMatchingDSInst(CI)) {
        Modified = true;
        I = mergeRead2Pair(CI);
      } else {
        ++I;
      }

      continue;
    } else if (Opc == AMDGPU::DS_WRITE_B32 || Opc == AMDGPU::DS_WRITE_B64) {
      CI.EltSize = (Opc == AMDGPU::DS_WRITE_B64) ? 8 : 4;
      if (findMatchingDSInst(CI)) {
        Modified = true;
        I = mergeWrite2Pair(CI);
      } else {
        ++I;
      }

      continue;
    }

    ++I;
  }

  return Modified;
}

bool SILoadStoreOptimizer::runOnMachineFunction(MachineFunction &MF) {
  if (skipFunction(*MF.getFunction()))
    return false;

  const SISubtarget &STM = MF.getSubtarget<SISubtarget>();
  if (!STM.loadStoreOptEnabled())
    return false;

  TII = STM.getInstrInfo();
  TRI = &TII->getRegisterInfo();

  MRI = &MF.getRegInfo();
  AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();

  assert(MRI->isSSA() && "Must be run on SSA");

  DEBUG(dbgs() << "Running SILoadStoreOptimizer\n");

  bool Modified = false;

  for (MachineBasicBlock &MBB : MF)
    Modified |= optimizeBlock(MBB);

  return Modified;
}