/Users/buildslave/jenkins/workspace/clang-stage2-coverage-R/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp
Line | Count | Source
1 | | //===- AArch64LoadStoreOptimizer.cpp - AArch64 load/store opt. pass -------===// |
2 | | // |
3 | | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
4 | | // See https://llvm.org/LICENSE.txt for license information. |
5 | | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
6 | | // |
7 | | //===----------------------------------------------------------------------===// |
8 | | // |
9 | | // This file contains a pass that performs load / store related peephole |
10 | | // optimizations. This pass should be run after register allocation. |
11 | | // |
12 | | //===----------------------------------------------------------------------===// |
13 | | |
14 | | #include "AArch64InstrInfo.h" |
15 | | #include "AArch64Subtarget.h" |
16 | | #include "MCTargetDesc/AArch64AddressingModes.h" |
17 | | #include "llvm/ADT/BitVector.h" |
18 | | #include "llvm/ADT/SmallVector.h" |
19 | | #include "llvm/ADT/Statistic.h" |
20 | | #include "llvm/ADT/StringRef.h" |
21 | | #include "llvm/ADT/iterator_range.h" |
22 | | #include "llvm/Analysis/AliasAnalysis.h" |
23 | | #include "llvm/CodeGen/MachineBasicBlock.h" |
24 | | #include "llvm/CodeGen/MachineFunction.h" |
25 | | #include "llvm/CodeGen/MachineFunctionPass.h" |
26 | | #include "llvm/CodeGen/MachineInstr.h" |
27 | | #include "llvm/CodeGen/MachineInstrBuilder.h" |
28 | | #include "llvm/CodeGen/MachineOperand.h" |
29 | | #include "llvm/CodeGen/TargetRegisterInfo.h" |
30 | | #include "llvm/IR/DebugLoc.h" |
31 | | #include "llvm/MC/MCRegisterInfo.h" |
32 | | #include "llvm/Pass.h" |
33 | | #include "llvm/Support/CommandLine.h" |
34 | | #include "llvm/Support/Debug.h" |
35 | | #include "llvm/Support/ErrorHandling.h" |
36 | | #include "llvm/Support/raw_ostream.h" |
37 | | #include <cassert> |
38 | | #include <cstdint> |
39 | | #include <iterator> |
40 | | #include <limits> |
41 | | |
42 | | using namespace llvm; |
43 | | |
44 | | #define DEBUG_TYPE "aarch64-ldst-opt" |
45 | | |
46 | | STATISTIC(NumPairCreated, "Number of load/store pair instructions generated"); |
47 | | STATISTIC(NumPostFolded, "Number of post-index updates folded"); |
48 | | STATISTIC(NumPreFolded, "Number of pre-index updates folded"); |
49 | | STATISTIC(NumUnscaledPairCreated, |
50 | | "Number of load/store from unscaled generated"); |
51 | | STATISTIC(NumZeroStoresPromoted, "Number of narrow zero stores promoted"); |
52 | | STATISTIC(NumLoadsFromStoresPromoted, "Number of loads from stores promoted"); |
53 | | |
54 | | // The LdStLimit limits how far we search for load/store pairs. |
55 | | static cl::opt<unsigned> LdStLimit("aarch64-load-store-scan-limit", |
56 | | cl::init(20), cl::Hidden); |
57 | | |
58 | | // The UpdateLimit limits how far we search for update instructions when we form |
59 | | // pre-/post-index instructions. |
60 | | static cl::opt<unsigned> UpdateLimit("aarch64-update-scan-limit", cl::init(100), |
61 | | cl::Hidden); |
62 | | |
63 | 513k | #define AARCH64_LOAD_STORE_OPT_NAME "AArch64 load / store optimization pass" |
64 | | |
65 | | namespace { |
66 | | |
67 | | using LdStPairFlags = struct LdStPairFlags { |
68 | | // If a matching instruction is found, MergeForward is set to true if the |
69 | | // merge is to remove the first instruction and replace the second with |
70 | | // a pair-wise insn, and false if the reverse is true. |
71 | | bool MergeForward = false; |
72 | | |
73 | | // SExtIdx gives the index of the result of the load pair that must be |
74 | | // extended. The value of SExtIdx assumes that the paired load produces the |
75 | | // value in this order: (I, returned iterator), i.e., -1 means no value has |
76 | | // to be extended, 0 means I, and 1 means the returned iterator. |
77 | | int SExtIdx = -1; |
78 | | |
79 | 2.75M | LdStPairFlags() = default; |
80 | | |
81 | 222k | void setMergeForward(bool V = true) { MergeForward = V; } |
82 | 222k | bool getMergeForward() const { return MergeForward; } |
83 | | |
84 | 11.1M | void setSExtIdx(int V) { SExtIdx = V; } |
85 | 214k | int getSExtIdx() const { return SExtIdx; } |
86 | | }; |
87 | | |
88 | | struct AArch64LoadStoreOpt : public MachineFunctionPass { |
89 | | static char ID; |
90 | | |
91 | 15.9k | AArch64LoadStoreOpt() : MachineFunctionPass(ID) { |
92 | 15.9k | initializeAArch64LoadStoreOptPass(*PassRegistry::getPassRegistry()); |
93 | 15.9k | } |
94 | | |
95 | | AliasAnalysis *AA; |
96 | | const AArch64InstrInfo *TII; |
97 | | const TargetRegisterInfo *TRI; |
98 | | const AArch64Subtarget *Subtarget; |
99 | | |
100 | | // Track which register units have been modified and used. |
101 | | LiveRegUnits ModifiedRegUnits, UsedRegUnits; |
102 | | |
103 | 15.8k | void getAnalysisUsage(AnalysisUsage &AU) const override { |
104 | 15.8k | AU.addRequired<AAResultsWrapperPass>(); |
105 | 15.8k | MachineFunctionPass::getAnalysisUsage(AU); |
106 | 15.8k | } |
107 | | |
108 | | // Scan the instructions looking for a load/store that can be combined |
109 | | // with the current instruction into a load/store pair. |
110 | | // Return the matching instruction if one is found, else MBB->end(). |
111 | | MachineBasicBlock::iterator findMatchingInsn(MachineBasicBlock::iterator I, |
112 | | LdStPairFlags &Flags, |
113 | | unsigned Limit, |
114 | | bool FindNarrowMerge); |
115 | | |
116 | | // Scan the instructions looking for a store that writes to the address from |
117 | | // which the current load instruction reads. Return true if one is found. |
118 | | bool findMatchingStore(MachineBasicBlock::iterator I, unsigned Limit, |
119 | | MachineBasicBlock::iterator &StoreI); |
120 | | |
121 | | // Merge the two instructions indicated into a wider narrow store instruction. |
122 | | MachineBasicBlock::iterator |
123 | | mergeNarrowZeroStores(MachineBasicBlock::iterator I, |
124 | | MachineBasicBlock::iterator MergeMI, |
125 | | const LdStPairFlags &Flags); |
126 | | |
127 | | // Merge the two instructions indicated into a single pair-wise instruction. |
128 | | MachineBasicBlock::iterator |
129 | | mergePairedInsns(MachineBasicBlock::iterator I, |
130 | | MachineBasicBlock::iterator Paired, |
131 | | const LdStPairFlags &Flags); |
132 | | |
133 | | // Promote the load that reads directly from the address stored to. |
134 | | MachineBasicBlock::iterator |
135 | | promoteLoadFromStore(MachineBasicBlock::iterator LoadI, |
136 | | MachineBasicBlock::iterator StoreI); |
137 | | |
138 | | // Scan the instruction list to find a base register update that can |
139 | | // be combined with the current instruction (a load or store) using |
140 | | // pre or post indexed addressing with writeback. Scan forwards. |
141 | | MachineBasicBlock::iterator |
142 | | findMatchingUpdateInsnForward(MachineBasicBlock::iterator I, |
143 | | int UnscaledOffset, unsigned Limit); |
144 | | |
145 | | // Scan the instruction list to find a base register update that can |
146 | | // be combined with the current instruction (a load or store) using |
147 | | // pre or post indexed addressing with writeback. Scan backwards. |
148 | | MachineBasicBlock::iterator |
149 | | findMatchingUpdateInsnBackward(MachineBasicBlock::iterator I, unsigned Limit); |
150 | | |
151 | | // Find an instruction that updates the base register of the ld/st |
152 | | // instruction. |
153 | | bool isMatchingUpdateInsn(MachineInstr &MemMI, MachineInstr &MI, |
154 | | unsigned BaseReg, int Offset); |
155 | | |
156 | | // Merge a pre- or post-index base register update into a ld/st instruction. |
157 | | MachineBasicBlock::iterator |
158 | | mergeUpdateInsn(MachineBasicBlock::iterator I, |
159 | | MachineBasicBlock::iterator Update, bool IsPreIdx); |
160 | | |
161 | | // Find and merge zero store instructions. |
162 | | bool tryToMergeZeroStInst(MachineBasicBlock::iterator &MBBI); |
163 | | |
164 | | // Find and pair ldr/str instructions. |
165 | | bool tryToPairLdStInst(MachineBasicBlock::iterator &MBBI); |
166 | | |
167 | | // Find and promote load instructions which read directly from store. |
168 | | bool tryToPromoteLoadFromStore(MachineBasicBlock::iterator &MBBI); |
169 | | |
170 | | // Find and merge a base register update before or after a ld/st instruction.
171 | | bool tryToMergeLdStUpdate(MachineBasicBlock::iterator &MBBI); |
172 | | |
173 | | bool optimizeBlock(MachineBasicBlock &MBB, bool EnableNarrowZeroStOpt); |
174 | | |
175 | | bool runOnMachineFunction(MachineFunction &Fn) override; |
176 | | |
177 | 15.8k | MachineFunctionProperties getRequiredProperties() const override { |
178 | 15.8k | return MachineFunctionProperties().set( |
179 | 15.8k | MachineFunctionProperties::Property::NoVRegs); |
180 | 15.8k | } |
181 | | |
182 | 513k | StringRef getPassName() const override { return AARCH64_LOAD_STORE_OPT_NAME; } |
183 | | }; |
184 | | |
185 | | char AArch64LoadStoreOpt::ID = 0; |
186 | | |
187 | | } // end anonymous namespace |
188 | | |
189 | | INITIALIZE_PASS(AArch64LoadStoreOpt, "aarch64-ldst-opt", |
190 | | AARCH64_LOAD_STORE_OPT_NAME, false, false) |
191 | | |
192 | 35.8M | static bool isNarrowStore(unsigned Opc) { |
193 | 35.8M | switch (Opc) { |
194 | 35.8M | default: |
195 | 35.1M | return false; |
196 | 35.8M | case AArch64::STRBBui: |
197 | 674k | case AArch64::STURBBi: |
198 | 674k | case AArch64::STRHHui: |
199 | 674k | case AArch64::STURHHi: |
200 | 674k | return true; |
201 | 35.8M | } |
202 | 35.8M | } |
203 | | |
204 | | // Scaling factor for unscaled load or store. |
205 | 18.9M | static int getMemScale(MachineInstr &MI) { |
206 | 18.9M | switch (MI.getOpcode()) { |
207 | 18.9M | default: |
208 | 0 | llvm_unreachable("Opcode has unknown scale!"); |
209 | 18.9M | case AArch64::LDRBBui: |
210 | 1.93M | case AArch64::LDURBBi: |
211 | 1.93M | case AArch64::LDRSBWui: |
212 | 1.93M | case AArch64::LDURSBWi: |
213 | 1.93M | case AArch64::STRBBui: |
214 | 1.93M | case AArch64::STURBBi: |
215 | 1.93M | return 1; |
216 | 1.93M | case AArch64::LDRHHui: |
217 | 548k | case AArch64::LDURHHi: |
218 | 548k | case AArch64::LDRSHWui: |
219 | 548k | case AArch64::LDURSHWi: |
220 | 548k | case AArch64::STRHHui: |
221 | 548k | case AArch64::STURHHi: |
222 | 548k | return 2; |
223 | 3.19M | case AArch64::LDRSui: |
224 | 3.19M | case AArch64::LDURSi: |
225 | 3.19M | case AArch64::LDRSWui: |
226 | 3.19M | case AArch64::LDURSWi: |
227 | 3.19M | case AArch64::LDRWui: |
228 | 3.19M | case AArch64::LDURWi: |
229 | 3.19M | case AArch64::STRSui: |
230 | 3.19M | case AArch64::STURSi: |
231 | 3.19M | case AArch64::STRWui: |
232 | 3.19M | case AArch64::STURWi: |
233 | 3.19M | case AArch64::LDPSi: |
234 | 3.19M | case AArch64::LDPSWi: |
235 | 3.19M | case AArch64::LDPWi: |
236 | 3.19M | case AArch64::STPSi: |
237 | 3.19M | case AArch64::STPWi: |
238 | 3.19M | return 4; |
239 | 12.0M | case AArch64::LDRDui: |
240 | 12.0M | case AArch64::LDURDi: |
241 | 12.0M | case AArch64::LDRXui: |
242 | 12.0M | case AArch64::LDURXi: |
243 | 12.0M | case AArch64::STRDui: |
244 | 12.0M | case AArch64::STURDi: |
245 | 12.0M | case AArch64::STRXui: |
246 | 12.0M | case AArch64::STURXi: |
247 | 12.0M | case AArch64::LDPDi: |
248 | 12.0M | case AArch64::LDPXi: |
249 | 12.0M | case AArch64::STPDi: |
250 | 12.0M | case AArch64::STPXi: |
251 | 12.0M | return 8; |
252 | 12.0M | case AArch64::LDRQui: |
253 | 1.16M | case AArch64::LDURQi: |
254 | 1.16M | case AArch64::STRQui: |
255 | 1.16M | case AArch64::STURQi: |
256 | 1.16M | case AArch64::LDPQi: |
257 | 1.16M | case AArch64::STPQi: |
258 | 1.16M | return 16; |
259 | 18.9M | } |
260 | 18.9M | } |
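The scale returned here is what the pass uses to put scaled immediates (LDRXui/STRXui-style, counted in elements) and unscaled immediates (LDUR/STUR-style, counted in bytes) into a common unit. A minimal standalone sketch of that conversion, using made-up values rather than real MachineInstrs:

#include <cassert>

int main() {
  // Hypothetical 8-byte access (e.g. an X-register load), not a real MachineInstr.
  int MemScale = 8;     // what getMemScale() would report for it
  int ScaledImm = 3;    // LDRXui-style immediate: an element index
  int UnscaledImm = 24; // LDURXi-style immediate: a byte offset
  // Multiplying the scaled immediate by the scale yields the comparable byte offset.
  assert(ScaledImm * MemScale == UnscaledImm);
  return 0;
}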
261 | | |
262 | | static unsigned getMatchingNonSExtOpcode(unsigned Opc, |
263 | 17.4M | bool *IsValidLdStrOpc = nullptr) { |
264 | 17.4M | if (IsValidLdStrOpc) |
265 | 17.4M | *IsValidLdStrOpc = true; |
266 | 17.4M | switch (Opc) { |
267 | 17.4M | default: |
268 | 6.53M | if (IsValidLdStrOpc) |
269 | 6.53M | *IsValidLdStrOpc = false; |
270 | 6.53M | return std::numeric_limits<unsigned>::max(); |
271 | 17.4M | case AArch64::STRDui: |
272 | 10.5M | case AArch64::STURDi: |
273 | 10.5M | case AArch64::STRQui: |
274 | 10.5M | case AArch64::STURQi: |
275 | 10.5M | case AArch64::STRBBui: |
276 | 10.5M | case AArch64::STURBBi: |
277 | 10.5M | case AArch64::STRHHui: |
278 | 10.5M | case AArch64::STURHHi: |
279 | 10.5M | case AArch64::STRWui: |
280 | 10.5M | case AArch64::STURWi: |
281 | 10.5M | case AArch64::STRXui: |
282 | 10.5M | case AArch64::STURXi: |
283 | 10.5M | case AArch64::LDRDui: |
284 | 10.5M | case AArch64::LDURDi: |
285 | 10.5M | case AArch64::LDRQui: |
286 | 10.5M | case AArch64::LDURQi: |
287 | 10.5M | case AArch64::LDRWui: |
288 | 10.5M | case AArch64::LDURWi: |
289 | 10.5M | case AArch64::LDRXui: |
290 | 10.5M | case AArch64::LDURXi: |
291 | 10.5M | case AArch64::STRSui: |
292 | 10.5M | case AArch64::STURSi: |
293 | 10.5M | case AArch64::LDRSui: |
294 | 10.5M | case AArch64::LDURSi: |
295 | 10.5M | return Opc; |
296 | 10.5M | case AArch64::LDRSWui: |
297 | 316k | return AArch64::LDRWui; |
298 | 10.5M | case AArch64::LDURSWi: |
299 | 6.62k | return AArch64::LDURWi; |
300 | 17.4M | } |
301 | 17.4M | } |
302 | | |
303 | 7.74k | static unsigned getMatchingWideOpcode(unsigned Opc) { |
304 | 7.74k | switch (Opc) { |
305 | 7.74k | default: |
306 | 0 | llvm_unreachable("Opcode has no wide equivalent!"); |
307 | 7.74k | case AArch64::STRBBui: |
308 | 0 | return AArch64::STRHHui; |
309 | 7.74k | case AArch64::STRHHui: |
310 | 1 | return AArch64::STRWui; |
311 | 7.74k | case AArch64::STURBBi: |
312 | 0 | return AArch64::STURHHi; |
313 | 7.74k | case AArch64::STURHHi: |
314 | 0 | return AArch64::STURWi; |
315 | 7.74k | case AArch64::STURWi: |
316 | 35 | return AArch64::STURXi; |
317 | 7.74k | case AArch64::STRWui: |
318 | 7.70k | return AArch64::STRXui; |
319 | 7.74k | } |
320 | 7.74k | } |
321 | | |
322 | 625k | static unsigned getMatchingPairOpcode(unsigned Opc) { |
323 | 625k | switch (Opc) { |
324 | 625k | default: |
325 | 0 | llvm_unreachable("Opcode has no pairwise equivalent!"); |
326 | 625k | case AArch64::STRSui: |
327 | 6.71k | case AArch64::STURSi: |
328 | 6.71k | return AArch64::STPSi; |
329 | 16.2k | case AArch64::STRDui: |
330 | 16.2k | case AArch64::STURDi: |
331 | 16.2k | return AArch64::STPDi; |
332 | 136k | case AArch64::STRQui: |
333 | 136k | case AArch64::STURQi: |
334 | 136k | return AArch64::STPQi; |
335 | 136k | case AArch64::STRWui: |
336 | 96.5k | case AArch64::STURWi: |
337 | 96.5k | return AArch64::STPWi; |
338 | 110k | case AArch64::STRXui: |
339 | 110k | case AArch64::STURXi: |
340 | 110k | return AArch64::STPXi; |
341 | 110k | case AArch64::LDRSui: |
342 | 9.88k | case AArch64::LDURSi: |
343 | 9.88k | return AArch64::LDPSi; |
344 | 18.3k | case AArch64::LDRDui: |
345 | 18.3k | case AArch64::LDURDi: |
346 | 18.3k | return AArch64::LDPDi; |
347 | 30.4k | case AArch64::LDRQui: |
348 | 30.4k | case AArch64::LDURQi: |
349 | 30.4k | return AArch64::LDPQi; |
350 | 61.8k | case AArch64::LDRWui: |
351 | 61.8k | case AArch64::LDURWi: |
352 | 61.8k | return AArch64::LDPWi; |
353 | 128k | case AArch64::LDRXui: |
354 | 128k | case AArch64::LDURXi: |
355 | 128k | return AArch64::LDPXi; |
356 | 128k | case AArch64::LDRSWui: |
357 | 10.6k | case AArch64::LDURSWi: |
358 | 10.6k | return AArch64::LDPSWi; |
359 | 625k | } |
360 | 625k | } |
361 | | |
362 | | static unsigned isMatchingStore(MachineInstr &LoadInst, |
363 | 586k | MachineInstr &StoreInst) { |
364 | 586k | unsigned LdOpc = LoadInst.getOpcode(); |
365 | 586k | unsigned StOpc = StoreInst.getOpcode(); |
366 | 586k | switch (LdOpc) { |
367 | 586k | default: |
368 | 0 | llvm_unreachable("Unsupported load instruction!"); |
369 | 586k | case AArch64::LDRBBui: |
370 | 38.8k | return StOpc == AArch64::STRBBui || StOpc == AArch64::STRHHui ||
371 | 38.8k | StOpc == AArch64::STRWui || StOpc == AArch64::STRXui;
372 | 586k | case AArch64::LDURBBi:
373 | 700 | return StOpc == AArch64::STURBBi || StOpc == AArch64::STURHHi ||
374 | 700 | StOpc == AArch64::STURWi || StOpc == AArch64::STURXi;
375 | 586k | case AArch64::LDRHHui:
376 | 46.5k | return StOpc == AArch64::STRHHui || StOpc == AArch64::STRWui ||
377 | 46.5k | StOpc == AArch64::STRXui;
378 | 586k | case AArch64::LDURHHi:
379 | 1.86k | return StOpc == AArch64::STURHHi || StOpc == AArch64::STURWi ||
380 | 1.86k | StOpc == AArch64::STURXi;
381 | 586k | case AArch64::LDRWui:
382 | 147k | return StOpc == AArch64::STRWui || StOpc == AArch64::STRXui;
383 | 586k | case AArch64::LDURWi:
384 | 5.37k | return StOpc == AArch64::STURWi || StOpc == AArch64::STURXi;
385 | 586k | case AArch64::LDRXui: |
386 | 341k | return StOpc == AArch64::STRXui; |
387 | 586k | case AArch64::LDURXi: |
388 | 5.23k | return StOpc == AArch64::STURXi; |
389 | 586k | } |
390 | 586k | } |
391 | | |
392 | 218 | static unsigned getPreIndexedOpcode(unsigned Opc) { |
393 | 218 | // FIXME: We don't currently support creating pre-indexed loads/stores when |
394 | 218 | // the load or store is the unscaled version. If we decide to perform such an |
395 | 218 | // optimization in the future the cases for the unscaled loads/stores will |
396 | 218 | // need to be added here. |
397 | 218 | switch (Opc) { |
398 | 218 | default: |
399 | 0 | llvm_unreachable("Opcode has no pre-indexed equivalent!"); |
400 | 218 | case AArch64::STRSui: |
401 | 6 | return AArch64::STRSpre; |
402 | 218 | case AArch64::STRDui: |
403 | 8 | return AArch64::STRDpre; |
404 | 218 | case AArch64::STRQui: |
405 | 9 | return AArch64::STRQpre; |
406 | 218 | case AArch64::STRBBui: |
407 | 2 | return AArch64::STRBBpre; |
408 | 218 | case AArch64::STRHHui: |
409 | 2 | return AArch64::STRHHpre; |
410 | 218 | case AArch64::STRWui: |
411 | 9 | return AArch64::STRWpre; |
412 | 218 | case AArch64::STRXui: |
413 | 37 | return AArch64::STRXpre; |
414 | 218 | case AArch64::LDRSui: |
415 | 7 | return AArch64::LDRSpre; |
416 | 218 | case AArch64::LDRDui: |
417 | 6 | return AArch64::LDRDpre; |
418 | 218 | case AArch64::LDRQui: |
419 | 14 | return AArch64::LDRQpre; |
420 | 218 | case AArch64::LDRBBui: |
421 | 38 | return AArch64::LDRBBpre; |
422 | 218 | case AArch64::LDRHHui: |
423 | 6 | return AArch64::LDRHHpre; |
424 | 218 | case AArch64::LDRWui: |
425 | 7 | return AArch64::LDRWpre; |
426 | 218 | case AArch64::LDRXui: |
427 | 31 | return AArch64::LDRXpre; |
428 | 218 | case AArch64::LDRSWui: |
429 | 0 | return AArch64::LDRSWpre; |
430 | 218 | case AArch64::LDPSi: |
431 | 0 | return AArch64::LDPSpre; |
432 | 218 | case AArch64::LDPSWi: |
433 | 0 | return AArch64::LDPSWpre; |
434 | 218 | case AArch64::LDPDi: |
435 | 0 | return AArch64::LDPDpre; |
436 | 218 | case AArch64::LDPQi: |
437 | 2 | return AArch64::LDPQpre; |
438 | 218 | case AArch64::LDPWi: |
439 | 2 | return AArch64::LDPWpre; |
440 | 218 | case AArch64::LDPXi: |
441 | 0 | return AArch64::LDPXpre; |
442 | 218 | case AArch64::STPSi: |
443 | 1 | return AArch64::STPSpre; |
444 | 218 | case AArch64::STPDi: |
445 | 0 | return AArch64::STPDpre; |
446 | 218 | case AArch64::STPQi: |
447 | 1 | return AArch64::STPQpre; |
448 | 218 | case AArch64::STPWi: |
449 | 2 | return AArch64::STPWpre; |
450 | 218 | case AArch64::STPXi: |
451 | 28 | return AArch64::STPXpre; |
452 | 218 | } |
453 | 218 | } |
454 | | |
455 | 833 | static unsigned getPostIndexedOpcode(unsigned Opc) { |
456 | 833 | switch (Opc) { |
457 | 833 | default: |
458 | 0 | llvm_unreachable("Opcode has no post-indexed wise equivalent!"); |
459 | 833 | case AArch64::STRSui: |
460 | 18 | case AArch64::STURSi: |
461 | 18 | return AArch64::STRSpost; |
462 | 44 | case AArch64::STRDui: |
463 | 44 | case AArch64::STURDi: |
464 | 44 | return AArch64::STRDpost; |
465 | 44 | case AArch64::STRQui: |
466 | 44 | case AArch64::STURQi: |
467 | 44 | return AArch64::STRQpost; |
468 | 44 | case AArch64::STRBBui: |
469 | 10 | return AArch64::STRBBpost; |
470 | 44 | case AArch64::STRHHui: |
471 | 20 | return AArch64::STRHHpost; |
472 | 87 | case AArch64::STRWui: |
473 | 87 | case AArch64::STURWi: |
474 | 87 | return AArch64::STRWpost; |
475 | 87 | case AArch64::STRXui: |
476 | 61 | case AArch64::STURXi: |
477 | 61 | return AArch64::STRXpost; |
478 | 61 | case AArch64::LDRSui: |
479 | 28 | case AArch64::LDURSi: |
480 | 28 | return AArch64::LDRSpost; |
481 | 114 | case AArch64::LDRDui: |
482 | 114 | case AArch64::LDURDi: |
483 | 114 | return AArch64::LDRDpost; |
484 | 114 | case AArch64::LDRQui: |
485 | 87 | case AArch64::LDURQi: |
486 | 87 | return AArch64::LDRQpost; |
487 | 87 | case AArch64::LDRBBui: |
488 | 31 | return AArch64::LDRBBpost; |
489 | 87 | case AArch64::LDRHHui: |
490 | 42 | return AArch64::LDRHHpost; |
491 | 87 | case AArch64::LDRWui: |
492 | 30 | case AArch64::LDURWi: |
493 | 30 | return AArch64::LDRWpost; |
494 | 30 | case AArch64::LDRXui: |
495 | 20 | case AArch64::LDURXi: |
496 | 20 | return AArch64::LDRXpost; |
497 | 20 | case AArch64::LDRSWui: |
498 | 0 | return AArch64::LDRSWpost; |
499 | 20 | case AArch64::LDPSi: |
500 | 2 | return AArch64::LDPSpost; |
501 | 20 | case AArch64::LDPSWi: |
502 | 1 | return AArch64::LDPSWpost; |
503 | 47 | case AArch64::LDPDi: |
504 | 47 | return AArch64::LDPDpost; |
505 | 46 | case AArch64::LDPQi: |
506 | 46 | return AArch64::LDPQpost; |
507 | 20 | case AArch64::LDPWi: |
508 | 4 | return AArch64::LDPWpost; |
509 | 20 | case AArch64::LDPXi: |
510 | 5 | return AArch64::LDPXpost; |
511 | 20 | case AArch64::STPSi: |
512 | 2 | return AArch64::STPSpost; |
513 | 20 | case AArch64::STPDi: |
514 | 4 | return AArch64::STPDpost; |
515 | 51 | case AArch64::STPQi: |
516 | 51 | return AArch64::STPQpost; |
517 | 20 | case AArch64::STPWi: |
518 | 3 | return AArch64::STPWpost; |
519 | 32 | case AArch64::STPXi: |
520 | 32 | return AArch64::STPXpost; |
521 | 833 | } |
522 | 833 | } |
523 | | |
524 | 92.3M | static bool isPairedLdSt(const MachineInstr &MI) { |
525 | 92.3M | switch (MI.getOpcode()) { |
526 | 92.3M | default: |
527 | 66.3M | return false; |
528 | 92.3M | case AArch64::LDPSi: |
529 | 26.0M | case AArch64::LDPSWi: |
530 | 26.0M | case AArch64::LDPDi: |
531 | 26.0M | case AArch64::LDPQi: |
532 | 26.0M | case AArch64::LDPWi: |
533 | 26.0M | case AArch64::LDPXi: |
534 | 26.0M | case AArch64::STPSi: |
535 | 26.0M | case AArch64::STPDi: |
536 | 26.0M | case AArch64::STPQi: |
537 | 26.0M | case AArch64::STPWi: |
538 | 26.0M | case AArch64::STPXi: |
539 | 26.0M | return true; |
540 | 92.3M | } |
541 | 92.3M | } |
542 | | |
543 | | static const MachineOperand &getLdStRegOp(const MachineInstr &MI, |
544 | 15.6M | unsigned PairedRegOp = 0) { |
545 | 15.6M | assert(PairedRegOp < 2 && "Unexpected register operand idx."); |
546 | 15.6M | unsigned Idx = isPairedLdSt(MI) ? PairedRegOp : 0;
547 | 15.6M | return MI.getOperand(Idx); |
548 | 15.6M | } |
549 | | |
550 | 25.4M | static const MachineOperand &getLdStBaseOp(const MachineInstr &MI) { |
551 | 25.4M | unsigned Idx = isPairedLdSt(MI) ? 2 : 1;
552 | 25.4M | return MI.getOperand(Idx); |
553 | 25.4M | } |
554 | | |
555 | 43.1M | static const MachineOperand &getLdStOffsetOp(const MachineInstr &MI) { |
556 | 43.1M | unsigned Idx = isPairedLdSt(MI) ? 3 : 2;
557 | 43.1M | return MI.getOperand(Idx); |
558 | 43.1M | } |
559 | | |
560 | | static bool isLdOffsetInRangeOfSt(MachineInstr &LoadInst, |
561 | | MachineInstr &StoreInst, |
562 | 64.8k | const AArch64InstrInfo *TII) { |
563 | 64.8k | assert(isMatchingStore(LoadInst, StoreInst) && "Expect only matched ld/st."); |
564 | 64.8k | int LoadSize = getMemScale(LoadInst); |
565 | 64.8k | int StoreSize = getMemScale(StoreInst); |
566 | 64.8k | int UnscaledStOffset = TII->isUnscaledLdSt(StoreInst) |
567 | 64.8k | ? getLdStOffsetOp(StoreInst).getImm()
568 | 64.8k | : getLdStOffsetOp(StoreInst).getImm() * StoreSize;
569 | 64.8k | int UnscaledLdOffset = TII->isUnscaledLdSt(LoadInst)
570 | 64.8k | ? getLdStOffsetOp(LoadInst).getImm()
571 | 64.8k | : getLdStOffsetOp(LoadInst).getImm() * LoadSize;
572 | 64.8k | return (UnscaledStOffset <= UnscaledLdOffset) &&
573 | 64.8k | (UnscaledLdOffset + LoadSize <= (UnscaledStOffset + StoreSize));
574 | 64.8k | } |
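After both immediates are normalised to byte offsets as above, the check is plain interval containment: the loaded bytes must lie entirely within the stored bytes. A self-contained sketch of the same check with invented offsets and sizes:

#include <cassert>

// True when the loaded range [LdOff, LdOff + LdSize) sits inside the stored
// range [StOff, StOff + StSize); mirrors the return expression above.
static bool ldWithinSt(int LdOff, int LdSize, int StOff, int StSize) {
  return StOff <= LdOff && LdOff + LdSize <= StOff + StSize;
}

int main() {
  assert(ldWithinSt(4, 4, 0, 8));  // 4-byte load from the upper half of an 8-byte store
  assert(!ldWithinSt(8, 4, 0, 8)); // load starts past the stored bytes
  return 0;
}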
575 | | |
576 | 32.4M | static bool isPromotableZeroStoreInst(MachineInstr &MI) { |
577 | 32.4M | unsigned Opc = MI.getOpcode(); |
578 | 32.4M | return (Opc == AArch64::STRWui || Opc == AArch64::STURWi ||
579 | 32.4M | isNarrowStore(Opc)) &&
580 | 32.4M | getLdStRegOp(MI).getReg() == AArch64::WZR;
581 | 32.4M | } |
582 | | |
583 | 29.6M | static bool isPromotableLoadFromStore(MachineInstr &MI) { |
584 | 29.6M | switch (MI.getOpcode()) { |
585 | 29.6M | default: |
586 | 27.0M | return false; |
587 | 29.6M | // Scaled instructions. |
588 | 29.6M | case AArch64::LDRBBui: |
589 | 2.61M | case AArch64::LDRHHui: |
590 | 2.61M | case AArch64::LDRWui: |
591 | 2.61M | case AArch64::LDRXui: |
592 | 2.61M | // Unscaled instructions. |
593 | 2.61M | case AArch64::LDURBBi: |
594 | 2.61M | case AArch64::LDURHHi: |
595 | 2.61M | case AArch64::LDURWi: |
596 | 2.61M | case AArch64::LDURXi: |
597 | 2.61M | return true; |
598 | 29.6M | } |
599 | 29.6M | } |
600 | | |
601 | 29.4M | static bool isMergeableLdStUpdate(MachineInstr &MI) { |
602 | 29.4M | unsigned Opc = MI.getOpcode(); |
603 | 29.4M | switch (Opc) { |
604 | 29.4M | default: |
605 | 22.8M | return false; |
606 | 29.4M | // Scaled instructions. |
607 | 29.4M | case AArch64::STRSui: |
608 | 6.64M | case AArch64::STRDui: |
609 | 6.64M | case AArch64::STRQui: |
610 | 6.64M | case AArch64::STRXui: |
611 | 6.64M | case AArch64::STRWui: |
612 | 6.64M | case AArch64::STRHHui: |
613 | 6.64M | case AArch64::STRBBui: |
614 | 6.64M | case AArch64::LDRSui: |
615 | 6.64M | case AArch64::LDRDui: |
616 | 6.64M | case AArch64::LDRQui: |
617 | 6.64M | case AArch64::LDRXui: |
618 | 6.64M | case AArch64::LDRWui: |
619 | 6.64M | case AArch64::LDRHHui: |
620 | 6.64M | case AArch64::LDRBBui: |
621 | 6.64M | // Unscaled instructions. |
622 | 6.64M | case AArch64::STURSi: |
623 | 6.64M | case AArch64::STURDi: |
624 | 6.64M | case AArch64::STURQi: |
625 | 6.64M | case AArch64::STURWi: |
626 | 6.64M | case AArch64::STURXi: |
627 | 6.64M | case AArch64::LDURSi: |
628 | 6.64M | case AArch64::LDURDi: |
629 | 6.64M | case AArch64::LDURQi: |
630 | 6.64M | case AArch64::LDURWi: |
631 | 6.64M | case AArch64::LDURXi: |
632 | 6.64M | // Paired instructions. |
633 | 6.64M | case AArch64::LDPSi: |
634 | 6.64M | case AArch64::LDPSWi: |
635 | 6.64M | case AArch64::LDPDi: |
636 | 6.64M | case AArch64::LDPQi: |
637 | 6.64M | case AArch64::LDPWi: |
638 | 6.64M | case AArch64::LDPXi: |
639 | 6.64M | case AArch64::STPSi: |
640 | 6.64M | case AArch64::STPDi: |
641 | 6.64M | case AArch64::STPQi: |
642 | 6.64M | case AArch64::STPWi: |
643 | 6.64M | case AArch64::STPXi: |
644 | 6.64M | // Make sure this is a reg+imm (as opposed to an address reloc). |
645 | 6.64M | if (!getLdStOffsetOp(MI).isImm()) |
646 | 418k | return false; |
647 | 6.22M | |
648 | 6.22M | return true; |
649 | 29.4M | } |
650 | 29.4M | } |
651 | | |
652 | | MachineBasicBlock::iterator |
653 | | AArch64LoadStoreOpt::mergeNarrowZeroStores(MachineBasicBlock::iterator I, |
654 | | MachineBasicBlock::iterator MergeMI, |
655 | 7.74k | const LdStPairFlags &Flags) { |
656 | 7.74k | assert(isPromotableZeroStoreInst(*I) && isPromotableZeroStoreInst(*MergeMI) && |
657 | 7.74k | "Expected promotable zero stores."); |
658 | 7.74k | |
659 | 7.74k | MachineBasicBlock::iterator NextI = I; |
660 | 7.74k | ++NextI; |
661 | 7.74k | // If NextI is the second of the two instructions to be merged, we need |
662 | 7.74k | // to skip one further. Either way we merge will invalidate the iterator, |
663 | 7.74k | // and we don't need to scan the new instruction, as it's a pairwise |
664 | 7.74k | // instruction, which we're not considering for further action anyway. |
665 | 7.74k | if (NextI == MergeMI) |
666 | 7.62k | ++NextI; |
667 | 7.74k | |
668 | 7.74k | unsigned Opc = I->getOpcode(); |
669 | 7.74k | bool IsScaled = !TII->isUnscaledLdSt(Opc); |
670 | 7.74k | int OffsetStride = IsScaled ? 1 : getMemScale(*I);
671 | 7.74k | |
672 | 7.74k | bool MergeForward = Flags.getMergeForward(); |
673 | 7.74k | // Insert our new paired instruction after whichever of the paired |
674 | 7.74k | // instructions MergeForward indicates. |
675 | 7.74k | MachineBasicBlock::iterator InsertionPoint = MergeForward ? MergeMI : I;
676 | 7.74k | // Also based on MergeForward is from where we copy the base register operand
677 | 7.74k | // so we get the flags compatible with the input code.
678 | 7.74k | const MachineOperand &BaseRegOp =
679 | 7.74k | MergeForward ? getLdStBaseOp(*MergeMI) : getLdStBaseOp(*I);
680 | 7.74k | |
681 | 7.74k | // Which register is Rt and which is Rt2 depends on the offset order. |
682 | 7.74k | MachineInstr *RtMI; |
683 | 7.74k | if (getLdStOffsetOp(*I).getImm() == |
684 | 7.74k | getLdStOffsetOp(*MergeMI).getImm() + OffsetStride) |
685 | 984 | RtMI = &*MergeMI; |
686 | 6.75k | else |
687 | 6.75k | RtMI = &*I; |
688 | 7.74k | |
689 | 7.74k | int OffsetImm = getLdStOffsetOp(*RtMI).getImm(); |
690 | 7.74k | // Change the scaled offset from small to large type. |
691 | 7.74k | if (IsScaled) { |
692 | 7.70k | assert(((OffsetImm & 1) == 0) && "Unexpected offset to merge"); |
693 | 7.70k | OffsetImm /= 2; |
694 | 7.70k | } |
695 | 7.74k | |
696 | 7.74k | // Construct the new instruction. |
697 | 7.74k | DebugLoc DL = I->getDebugLoc(); |
698 | 7.74k | MachineBasicBlock *MBB = I->getParent(); |
699 | 7.74k | MachineInstrBuilder MIB; |
700 | 7.74k | MIB = BuildMI(*MBB, InsertionPoint, DL, TII->get(getMatchingWideOpcode(Opc))) |
701 | 7.74k | .addReg(isNarrowStore(Opc) ? AArch64::WZR : AArch64::XZR)
702 | 7.74k | .add(BaseRegOp) |
703 | 7.74k | .addImm(OffsetImm) |
704 | 7.74k | .cloneMergedMemRefs({&*I, &*MergeMI}) |
705 | 7.74k | .setMIFlags(I->mergeFlagsWith(*MergeMI)); |
706 | 7.74k | (void)MIB; |
707 | 7.74k | |
708 | 7.74k | LLVM_DEBUG(dbgs() << "Creating wider store. Replacing instructions:\n "); |
709 | 7.74k | LLVM_DEBUG(I->print(dbgs())); |
710 | 7.74k | LLVM_DEBUG(dbgs() << " "); |
711 | 7.74k | LLVM_DEBUG(MergeMI->print(dbgs())); |
712 | 7.74k | LLVM_DEBUG(dbgs() << " with instruction:\n "); |
713 | 7.74k | LLVM_DEBUG(((MachineInstr *)MIB)->print(dbgs())); |
714 | 7.74k | LLVM_DEBUG(dbgs() << "\n"); |
715 | 7.74k | |
716 | 7.74k | // Erase the old instructions. |
717 | 7.74k | I->eraseFromParent(); |
718 | 7.74k | MergeMI->eraseFromParent(); |
719 | 7.74k | return NextI; |
720 | 7.74k | } |
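When the merged store is the scaled form, the surviving immediate is simply re-expressed in the wider element size, which is why OffsetImm is halved above. A toy illustration, assuming two adjacent 32-bit zero stores at element offsets 2 and 3 (bytes 8 and 12) being replaced by one 64-bit zero store:

#include <cassert>

int main() {
  int NarrowScale = 4, WideScale = 8;
  int LowerOffsetImm = 2;                 // STRWui wzr, [x0, #2] -> byte 8
  int WideOffsetImm = LowerOffsetImm / 2; // STRXui xzr, [x0, #1] -> byte 8
  // Same byte address, expressed in the wider element size.
  assert(LowerOffsetImm * NarrowScale == WideOffsetImm * WideScale);
  return 0;
}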
721 | | |
722 | | MachineBasicBlock::iterator |
723 | | AArch64LoadStoreOpt::mergePairedInsns(MachineBasicBlock::iterator I, |
724 | | MachineBasicBlock::iterator Paired, |
725 | 214k | const LdStPairFlags &Flags) { |
726 | 214k | MachineBasicBlock::iterator NextI = I; |
727 | 214k | ++NextI; |
728 | 214k | // If NextI is the second of the two instructions to be merged, we need |
729 | 214k | // to skip one further. Either way we merge will invalidate the iterator, |
730 | 214k | // and we don't need to scan the new instruction, as it's a pairwise |
731 | 214k | // instruction, which we're not considering for further action anyway. |
732 | 214k | if (NextI == Paired) |
733 | 193k | ++NextI; |
734 | 214k | |
735 | 214k | int SExtIdx = Flags.getSExtIdx(); |
736 | 214k | unsigned Opc = |
737 | 214k | SExtIdx == -1 ? I->getOpcode() : getMatchingNonSExtOpcode(I->getOpcode());
738 | 214k | bool IsUnscaled = TII->isUnscaledLdSt(Opc);
739 | 214k | int OffsetStride = IsUnscaled ? getMemScale(*I) : 1;
740 | 214k |
741 | 214k | bool MergeForward = Flags.getMergeForward();
742 | 214k | // Insert our new paired instruction after whichever of the paired
743 | 214k | // instructions MergeForward indicates.
744 | 214k | MachineBasicBlock::iterator InsertionPoint = MergeForward ? Paired : I;
745 | 214k | // Also based on MergeForward is from where we copy the base register operand
746 | 214k | // so we get the flags compatible with the input code.
747 | 214k | const MachineOperand &BaseRegOp =
748 | 214k | MergeForward ? getLdStBaseOp(*Paired) : getLdStBaseOp(*I);
749 | 214k | |
750 | 214k | int Offset = getLdStOffsetOp(*I).getImm(); |
751 | 214k | int PairedOffset = getLdStOffsetOp(*Paired).getImm(); |
752 | 214k | bool PairedIsUnscaled = TII->isUnscaledLdSt(Paired->getOpcode()); |
753 | 214k | if (IsUnscaled != PairedIsUnscaled) { |
754 | 14.3k | // We're trying to pair instructions that differ in how they are scaled. If |
755 | 14.3k | // I is scaled then scale the offset of Paired accordingly. Otherwise, do |
756 | 14.3k | // the opposite (i.e., make Paired's offset unscaled). |
757 | 14.3k | int MemSize = getMemScale(*Paired); |
758 | 14.3k | if (PairedIsUnscaled) { |
759 | 229 | // If the unscaled offset isn't a multiple of the MemSize, we can't |
760 | 229 | // pair the operations together. |
761 | 229 | assert(!(PairedOffset % getMemScale(*Paired)) && |
762 | 229 | "Offset should be a multiple of the stride!"); |
763 | 229 | PairedOffset /= MemSize; |
764 | 14.1k | } else { |
765 | 14.1k | PairedOffset *= MemSize; |
766 | 14.1k | } |
767 | 14.3k | } |
768 | 214k | |
769 | 214k | // Which register is Rt and which is Rt2 depends on the offset order. |
770 | 214k | MachineInstr *RtMI, *Rt2MI; |
771 | 214k | if (Offset == PairedOffset + OffsetStride) { |
772 | 29.0k | RtMI = &*Paired; |
773 | 29.0k | Rt2MI = &*I; |
774 | 29.0k | // Here we swapped the assumption made for SExtIdx. |
775 | 29.0k | // I.e., we turn ldp I, Paired into ldp Paired, I. |
776 | 29.0k | // Update the index accordingly. |
777 | 29.0k | if (SExtIdx != -1) |
778 | 28 | SExtIdx = (SExtIdx + 1) % 2; |
779 | 185k | } else { |
780 | 185k | RtMI = &*I; |
781 | 185k | Rt2MI = &*Paired; |
782 | 185k | } |
783 | 214k | int OffsetImm = getLdStOffsetOp(*RtMI).getImm(); |
784 | 214k | // Scale the immediate offset, if necessary. |
785 | 214k | if (TII->isUnscaledLdSt(RtMI->getOpcode())) { |
786 | 19.6k | assert(!(OffsetImm % getMemScale(*RtMI)) && |
787 | 19.6k | "Unscaled offset cannot be scaled."); |
788 | 19.6k | OffsetImm /= getMemScale(*RtMI); |
789 | 19.6k | } |
790 | 214k | |
791 | 214k | // Construct the new instruction. |
792 | 214k | MachineInstrBuilder MIB; |
793 | 214k | DebugLoc DL = I->getDebugLoc(); |
794 | 214k | MachineBasicBlock *MBB = I->getParent(); |
795 | 214k | MachineOperand RegOp0 = getLdStRegOp(*RtMI); |
796 | 214k | MachineOperand RegOp1 = getLdStRegOp(*Rt2MI); |
797 | 214k | // Kill flags may become invalid when moving stores for pairing. |
798 | 214k | if (RegOp0.isUse()) { |
799 | 115k | if (!MergeForward) { |
800 | 108k | // Clear kill flags on store if moving upwards. Example: |
801 | 108k | // STRWui %w0, ... |
802 | 108k | // USE %w1 |
803 | 108k | // STRWui kill %w1 ; need to clear kill flag when moving STRWui upwards |
804 | 108k | RegOp0.setIsKill(false); |
805 | 108k | RegOp1.setIsKill(false); |
806 | 108k | } else { |
807 | 6.90k | // Clear kill flags of the first stores register. Example: |
808 | 6.90k | // STRWui %w1, ... |
809 | 6.90k | // USE kill %w1 ; need to clear kill flag when moving STRWui downwards |
810 | 6.90k | // STRW %w0 |
811 | 6.90k | unsigned Reg = getLdStRegOp(*I).getReg(); |
812 | 6.90k | for (MachineInstr &MI : make_range(std::next(I), Paired)) |
813 | 15.7k | MI.clearRegisterKills(Reg, TRI); |
814 | 6.90k | } |
815 | 115k | } |
816 | 214k | MIB = BuildMI(*MBB, InsertionPoint, DL, TII->get(getMatchingPairOpcode(Opc))) |
817 | 214k | .add(RegOp0) |
818 | 214k | .add(RegOp1) |
819 | 214k | .add(BaseRegOp) |
820 | 214k | .addImm(OffsetImm) |
821 | 214k | .cloneMergedMemRefs({&*I, &*Paired}) |
822 | 214k | .setMIFlags(I->mergeFlagsWith(*Paired)); |
823 | 214k | |
824 | 214k | (void)MIB; |
825 | 214k | |
826 | 214k | LLVM_DEBUG( |
827 | 214k | dbgs() << "Creating pair load/store. Replacing instructions:\n "); |
828 | 214k | LLVM_DEBUG(I->print(dbgs())); |
829 | 214k | LLVM_DEBUG(dbgs() << " "); |
830 | 214k | LLVM_DEBUG(Paired->print(dbgs())); |
831 | 214k | LLVM_DEBUG(dbgs() << " with instruction:\n "); |
832 | 214k | if (SExtIdx != -1) { |
833 | 5.51k | // Generate the sign extension for the proper result of the ldp. |
834 | 5.51k | // I.e., with X1, that would be: |
835 | 5.51k | // %w1 = KILL %w1, implicit-def %x1 |
836 | 5.51k | // %x1 = SBFMXri killed %x1, 0, 31 |
837 | 5.51k | MachineOperand &DstMO = MIB->getOperand(SExtIdx); |
838 | 5.51k | // Right now, DstMO has the extended register, since it comes from an |
839 | 5.51k | // extended opcode. |
840 | 5.51k | unsigned DstRegX = DstMO.getReg(); |
841 | 5.51k | // Get the W variant of that register. |
842 | 5.51k | unsigned DstRegW = TRI->getSubReg(DstRegX, AArch64::sub_32); |
843 | 5.51k | // Update the result of LDP to use the W instead of the X variant. |
844 | 5.51k | DstMO.setReg(DstRegW); |
845 | 5.51k | LLVM_DEBUG(((MachineInstr *)MIB)->print(dbgs())); |
846 | 5.51k | LLVM_DEBUG(dbgs() << "\n"); |
847 | 5.51k | // Make the machine verifier happy by providing a definition for |
848 | 5.51k | // the X register. |
849 | 5.51k | // Insert this definition right after the generated LDP, i.e., before |
850 | 5.51k | // InsertionPoint. |
851 | 5.51k | MachineInstrBuilder MIBKill = |
852 | 5.51k | BuildMI(*MBB, InsertionPoint, DL, TII->get(TargetOpcode::KILL), DstRegW) |
853 | 5.51k | .addReg(DstRegW) |
854 | 5.51k | .addReg(DstRegX, RegState::Define); |
855 | 5.51k | MIBKill->getOperand(2).setImplicit(); |
856 | 5.51k | // Create the sign extension. |
857 | 5.51k | MachineInstrBuilder MIBSXTW = |
858 | 5.51k | BuildMI(*MBB, InsertionPoint, DL, TII->get(AArch64::SBFMXri), DstRegX) |
859 | 5.51k | .addReg(DstRegX) |
860 | 5.51k | .addImm(0) |
861 | 5.51k | .addImm(31); |
862 | 5.51k | (void)MIBSXTW; |
863 | 5.51k | LLVM_DEBUG(dbgs() << " Extend operand:\n "); |
864 | 5.51k | LLVM_DEBUG(((MachineInstr *)MIBSXTW)->print(dbgs())); |
865 | 208k | } else { |
866 | 208k | LLVM_DEBUG(((MachineInstr *)MIB)->print(dbgs())); |
867 | 208k | } |
868 | 214k | LLVM_DEBUG(dbgs() << "\n"); |
869 | 214k | |
870 | 214k | // Erase the old instructions. |
871 | 214k | I->eraseFromParent(); |
872 | 214k | Paired->eraseFromParent(); |
873 | 214k | |
874 | 214k | return NextI; |
875 | 214k | } |
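The SBFMXri with immr = 0 and imms = 31 emitted for the sign-extending case is the SXTW idiom: it takes the 32-bit result the LDP leaves in the W register and sign-extends it into the X register. A standalone sketch of that bit-level effect, not using any LLVM API:

#include <cassert>
#include <cstdint>

// What SBFM Xd, Xn, #0, #31 computes: sign-extend the low 32 bits to 64 bits.
static uint64_t sxtw(uint64_t X) {
  return static_cast<uint64_t>(
      static_cast<int64_t>(static_cast<int32_t>(X & 0xFFFFFFFFu)));
}

int main() {
  assert(sxtw(0x80000000u) == 0xFFFFFFFF80000000ull);   // negative word propagates its sign
  assert(sxtw(0x123456787FFFFFFFull) == 0x7FFFFFFFull); // high bits of the source are ignored
  return 0;
}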
876 | | |
877 | | MachineBasicBlock::iterator |
878 | | AArch64LoadStoreOpt::promoteLoadFromStore(MachineBasicBlock::iterator LoadI, |
879 | 226 | MachineBasicBlock::iterator StoreI) { |
880 | 226 | MachineBasicBlock::iterator NextI = LoadI; |
881 | 226 | ++NextI; |
882 | 226 | |
883 | 226 | int LoadSize = getMemScale(*LoadI); |
884 | 226 | int StoreSize = getMemScale(*StoreI); |
885 | 226 | unsigned LdRt = getLdStRegOp(*LoadI).getReg(); |
886 | 226 | const MachineOperand &StMO = getLdStRegOp(*StoreI); |
887 | 226 | unsigned StRt = getLdStRegOp(*StoreI).getReg(); |
888 | 226 | bool IsStoreXReg = TRI->getRegClass(AArch64::GPR64RegClassID)->contains(StRt); |
889 | 226 | |
890 | 226 | assert((IsStoreXReg || |
891 | 226 | TRI->getRegClass(AArch64::GPR32RegClassID)->contains(StRt)) && |
892 | 226 | "Unexpected RegClass"); |
893 | 226 | |
894 | 226 | MachineInstr *BitExtMI; |
895 | 226 | if (LoadSize == StoreSize && (LoadSize == 4 || LoadSize == 8)) {
896 | 150 | // Remove the load, if the destination register of the loads is the same
897 | 150 | // register for stored value.
898 | 150 | if (StRt == LdRt && LoadSize == 8) {
899 | 41 | for (MachineInstr &MI : make_range(StoreI->getIterator(), |
900 | 44 | LoadI->getIterator())) { |
901 | 44 | if (MI.killsRegister(StRt, TRI)) { |
902 | 41 | MI.clearRegisterKills(StRt, TRI); |
903 | 41 | break; |
904 | 41 | } |
905 | 44 | } |
906 | 41 | LLVM_DEBUG(dbgs() << "Remove load instruction:\n "); |
907 | 41 | LLVM_DEBUG(LoadI->print(dbgs())); |
908 | 41 | LLVM_DEBUG(dbgs() << "\n"); |
909 | 41 | LoadI->eraseFromParent(); |
910 | 41 | return NextI; |
911 | 41 | } |
912 | 109 | // Replace the load with a mov if the load and store are in the same size. |
913 | 109 | BitExtMI = |
914 | 109 | BuildMI(*LoadI->getParent(), LoadI, LoadI->getDebugLoc(), |
915 | 109 | TII->get(IsStoreXReg ? AArch64::ORRXrs : AArch64::ORRWrs), LdRt)
916 | 109 | .addReg(IsStoreXReg ? AArch64::XZR : AArch64::WZR)
917 | 109 | .add(StMO) |
918 | 109 | .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0)) |
919 | 109 | .setMIFlags(LoadI->getFlags()); |
920 | 109 | } else { |
921 | 76 | // FIXME: Currently we disable this transformation in big-endian targets as |
922 | 76 | // performance and correctness are verified only in little-endian. |
923 | 76 | if (!Subtarget->isLittleEndian()) |
924 | 3 | return NextI; |
925 | 73 | bool IsUnscaled = TII->isUnscaledLdSt(*LoadI); |
926 | 73 | assert(IsUnscaled == TII->isUnscaledLdSt(*StoreI) && |
927 | 73 | "Unsupported ld/st match"); |
928 | 73 | assert(LoadSize <= StoreSize && "Invalid load size"); |
929 | 73 | int UnscaledLdOffset = IsUnscaled |
930 | 73 | ? getLdStOffsetOp(*LoadI).getImm()
931 | 73 | : getLdStOffsetOp(*LoadI).getImm() * LoadSize;
932 | 73 | int UnscaledStOffset = IsUnscaled
933 | 73 | ? getLdStOffsetOp(*StoreI).getImm()
934 | 73 | : getLdStOffsetOp(*StoreI).getImm() * StoreSize;
935 | 73 | int Width = LoadSize * 8;
936 | 73 | unsigned DestReg = IsStoreXReg
937 | 73 | ? TRI->getMatchingSuperReg(LdRt, AArch64::sub_32,
938 | 32 | &AArch64::GPR64RegClass)
939 | 73 | : LdRt;
940 | 73 | |
941 | 73 | assert((UnscaledLdOffset >= UnscaledStOffset && |
942 | 73 | (UnscaledLdOffset + LoadSize) <= UnscaledStOffset + StoreSize) && |
943 | 73 | "Invalid offset"); |
944 | 73 | |
945 | 73 | int Immr = 8 * (UnscaledLdOffset - UnscaledStOffset); |
946 | 73 | int Imms = Immr + Width - 1; |
947 | 73 | if (UnscaledLdOffset == UnscaledStOffset) { |
948 | 34 | uint32_t AndMaskEncoded = ((IsStoreXReg ? 1 : 0) << 12) // N
949 | 34 | | ((Immr) << 6) // immr |
950 | 34 | | ((Imms) << 0) // imms |
951 | 34 | ; |
952 | 34 | |
953 | 34 | BitExtMI = |
954 | 34 | BuildMI(*LoadI->getParent(), LoadI, LoadI->getDebugLoc(), |
955 | 34 | TII->get(IsStoreXReg ? AArch64::ANDXri : AArch64::ANDWri),
956 | 34 | DestReg) |
957 | 34 | .add(StMO) |
958 | 34 | .addImm(AndMaskEncoded) |
959 | 34 | .setMIFlags(LoadI->getFlags()); |
960 | 39 | } else { |
961 | 39 | BitExtMI = |
962 | 39 | BuildMI(*LoadI->getParent(), LoadI, LoadI->getDebugLoc(), |
963 | 39 | TII->get(IsStoreXReg ? AArch64::UBFMXri : AArch64::UBFMWri),
964 | 39 | DestReg) |
965 | 39 | .add(StMO) |
966 | 39 | .addImm(Immr) |
967 | 39 | .addImm(Imms) |
968 | 39 | .setMIFlags(LoadI->getFlags()); |
969 | 39 | } |
970 | 73 | } |
971 | 226 | |
972 | 226 | // Clear kill flags between store and load. |
973 | 226 | for (MachineInstr &MI : make_range(StoreI->getIterator(), |
974 | 182 | BitExtMI->getIterator())) |
975 | 263 | if (MI.killsRegister(StRt, TRI)) { |
976 | 163 | MI.clearRegisterKills(StRt, TRI); |
977 | 163 | break; |
978 | 163 | } |
979 | 182 | |
980 | 182 | LLVM_DEBUG(dbgs() << "Promoting load by replacing :\n "); |
981 | 182 | LLVM_DEBUG(StoreI->print(dbgs())); |
982 | 182 | LLVM_DEBUG(dbgs() << " "); |
983 | 182 | LLVM_DEBUG(LoadI->print(dbgs())); |
984 | 182 | LLVM_DEBUG(dbgs() << " with instructions:\n "); |
985 | 182 | LLVM_DEBUG(StoreI->print(dbgs())); |
986 | 182 | LLVM_DEBUG(dbgs() << " "); |
987 | 182 | LLVM_DEBUG((BitExtMI)->print(dbgs())); |
988 | 182 | LLVM_DEBUG(dbgs() << "\n"); |
989 | 182 | |
990 | 182 | // Erase the old instructions. |
991 | 182 | LoadI->eraseFromParent(); |
992 | 182 | return NextI; |
993 | 226 | } |
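The Immr/Imms pair computed for the UBFM path selects, inside the stored register, exactly the bytes the narrower load would have read: the field starts 8 * (UnscaledLdOffset - UnscaledStOffset) bits up and is LoadSize * 8 bits wide. A self-contained sketch of the same extraction done with shifts and masks, assuming a hypothetical 2-byte load reading bytes 2-3 of a 4-byte store (little-endian):

#include <cassert>
#include <cstdint>

int main() {
  uint32_t Stored = 0xAABBCCDD;                         // value written by the 32-bit store
  int UnscaledStOffset = 0, UnscaledLdOffset = 2, LoadSize = 2;
  int Immr = 8 * (UnscaledLdOffset - UnscaledStOffset); // 16: lsb of the wanted field
  int Width = LoadSize * 8;                             // 16: field width in bits
  // UBFM-style extraction: shift the field down to bit 0 and mask to Width bits.
  uint32_t Loaded = (Stored >> Immr) & ((1u << Width) - 1);
  assert(Loaded == 0xAABBu);                            // what LDRHH at byte offset 2 returns
  return 0;
}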
994 | | |
995 | 3.45M | static bool inBoundsForPair(bool IsUnscaled, int Offset, int OffsetStride) { |
996 | 3.45M | // Convert the byte-offset used by unscaled into an "element" offset used |
997 | 3.45M | // by the scaled pair load/store instructions. |
998 | 3.45M | if (IsUnscaled) { |
999 | 199k | // If the byte-offset isn't a multiple of the stride, there's no point |
1000 | 199k | // trying to match it. |
1001 | 199k | if (Offset % OffsetStride) |
1002 | 89.6k | return false; |
1003 | 110k | Offset /= OffsetStride; |
1004 | 110k | } |
1005 | 3.45M | return Offset <= 63 && Offset >= -64;
1006 | 3.45M | } |
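Paired loads/stores encode a 7-bit signed immediate measured in elements, hence the [-64, 63] window tested here once unscaled byte offsets have been divided by the stride. A minimal sketch of the same check, assuming an 8-byte element (X-register LDP/STP):

#include <cassert>

// Mirrors inBoundsForPair for an unscaled input and an 8-byte stride.
static bool fitsPairImm(bool IsUnscaled, int Offset, int Stride) {
  if (IsUnscaled) {
    if (Offset % Stride)
      return false;
    Offset /= Stride;
  }
  return Offset <= 63 && Offset >= -64;
}

int main() {
  assert(fitsPairImm(true, 504, 8));  // byte 504 -> element 63: representable
  assert(!fitsPairImm(true, 512, 8)); // element 64: outside the 7-bit range
  assert(!fitsPairImm(true, 500, 8)); // not a multiple of the stride
  return 0;
}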
1007 | | |
1008 | | // Do alignment, specialized to power of 2 and for signed ints, |
1009 | | // avoiding having to do a C-style cast from uint_64t to int when |
1010 | | // using alignTo from include/llvm/Support/MathExtras.h. |
1011 | | // FIXME: Move this function to include/MathExtras.h? |
1012 | 34.8k | static int alignTo(int Num, int PowOf2) { |
1013 | 34.8k | return (Num + PowOf2 - 1) & ~(PowOf2 - 1); |
1014 | 34.8k | } |
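This helper rounds a signed offset up to the next multiple of a power of two with the usual add-then-mask trick, and it works for negative offsets as well. A quick standalone check of that behaviour:

#include <cassert>

// Same computation as the static alignTo above, outside the pass.
static int alignUpPow2(int Num, int PowOf2) {
  return (Num + PowOf2 - 1) & ~(PowOf2 - 1);
}

int main() {
  assert(alignUpPow2(5, 2) == 6);  // odd offset rounds up
  assert(alignUpPow2(6, 2) == 6);  // already aligned value is unchanged
  assert(alignUpPow2(-3, 4) == 0); // negative offsets round up toward zero
  return 0;
}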
1015 | | |
1016 | | static bool mayAlias(MachineInstr &MIa, MachineInstr &MIb, |
1017 | 626k | AliasAnalysis *AA) { |
1018 | 626k | // One of the instructions must modify memory. |
1019 | 626k | if (!MIa.mayStore() && !MIb.mayStore())
1020 | 6.82k | return false;
1021 | 619k |
1022 | 619k | // Both instructions must be memory operations.
1023 | 619k | if (!MIa.mayLoadOrStore() && !MIb.mayLoadOrStore())
1024 | 0 | return false; |
1025 | 619k | |
1026 | 619k | return MIa.mayAlias(AA, MIb, /*UseTBAA*/false); |
1027 | 619k | } |
1028 | | |
1029 | | static bool mayAlias(MachineInstr &MIa, |
1030 | | SmallVectorImpl<MachineInstr *> &MemInsns, |
1031 | 235k | AliasAnalysis *AA) { |
1032 | 235k | for (MachineInstr *MIb : MemInsns) |
1033 | 39.5k | if (mayAlias(MIa, *MIb, AA)) |
1034 | 12.9k | return true; |
1035 | 235k | |
1036 | 235k | return false;
1037 | 235k | } |
1038 | | |
1039 | | bool AArch64LoadStoreOpt::findMatchingStore( |
1040 | | MachineBasicBlock::iterator I, unsigned Limit, |
1041 | 2.27M | MachineBasicBlock::iterator &StoreI) { |
1042 | 2.27M | MachineBasicBlock::iterator B = I->getParent()->begin(); |
1043 | 2.27M | MachineBasicBlock::iterator MBBI = I; |
1044 | 2.27M | MachineInstr &LoadMI = *I; |
1045 | 2.27M | unsigned BaseReg = getLdStBaseOp(LoadMI).getReg(); |
1046 | 2.27M | |
1047 | 2.27M | // If the load is the first instruction in the block, there's obviously |
1048 | 2.27M | // not any matching store. |
1049 | 2.27M | if (MBBI == B) |
1050 | 821k | return false; |
1051 | 1.45M | |
1052 | 1.45M | // Track which register units have been modified and used between the first |
1053 | 1.45M | // insn and the second insn. |
1054 | 1.45M | ModifiedRegUnits.clear(); |
1055 | 1.45M | UsedRegUnits.clear(); |
1056 | 1.45M | |
1057 | 1.45M | unsigned Count = 0; |
1058 | 4.24M | do { |
1059 | 4.24M | --MBBI; |
1060 | 4.24M | MachineInstr &MI = *MBBI; |
1061 | 4.24M | |
1062 | 4.24M | // Don't count transient instructions towards the search limit since there |
1063 | 4.24M | // may be different numbers of them if e.g. debug information is present. |
1064 | 4.24M | if (!MI.isTransient()) |
1065 | 4.06M | ++Count; |
1066 | 4.24M | |
1067 | 4.24M | // If the load instruction reads directly from the address to which the |
1068 | 4.24M | // store instruction writes and the stored value is not modified, we can |
1069 | 4.24M | // promote the load. Since we do not handle stores with pre-/post-index, |
1070 | 4.24M | // it's unnecessary to check if BaseReg is modified by the store itself. |
1071 | 4.24M | if (MI.mayStore() && isMatchingStore(LoadMI, MI)586k && |
1072 | 4.24M | BaseReg == getLdStBaseOp(MI).getReg()297k && |
1073 | 4.24M | isLdOffsetInRangeOfSt(LoadMI, MI, TII)64.8k && |
1074 | 4.24M | ModifiedRegUnits.available(getLdStRegOp(MI).getReg())370 ) { |
1075 | 226 | StoreI = MBBI; |
1076 | 226 | return true; |
1077 | 226 | } |
1078 | 4.24M | |
1079 | 4.24M | if (MI.isCall()) |
1080 | 332k | return false; |
1081 | 3.90M | |
1082 | 3.90M | // Update modified / uses register units. |
1083 | 3.90M | LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits, UsedRegUnits, TRI); |
1084 | 3.90M | |
1085 | 3.90M | // Otherwise, if the base register is modified, we have no match, so |
1086 | 3.90M | // return early. |
1087 | 3.90M | if (!ModifiedRegUnits.available(BaseReg)) |
1088 | 385k | return false; |
1089 | 3.52M | |
1090 | 3.52M | // If we encounter a store aliased with the load, return early. |
1091 | 3.52M | if (MI.mayStore() && mayAlias(LoadMI, MI, AA))
1092 | 213k | return false;
1093 | 3.30M | } while (MBBI != B && Count < Limit);
1094 | 1.45M | return false;
1095 | 1.45M | } |
1096 | | |
1097 | | // Returns true if FirstMI and MI are candidates for merging or pairing. |
1098 | | // Otherwise, returns false. |
1099 | | static bool areCandidatesToMergeOrPair(MachineInstr &FirstMI, MachineInstr &MI, |
1100 | | LdStPairFlags &Flags, |
1101 | 11.1M | const AArch64InstrInfo *TII) { |
1102 | 11.1M | // If this is volatile or if pairing is suppressed, not a candidate. |
1103 | 11.1M | if (MI.hasOrderedMemoryRef() || TII->isLdStPairSuppressed(MI))
1104 | 962k | return false; |
1105 | 10.1M | |
1106 | 10.1M | // We should have already checked FirstMI for pair suppression and volatility. |
1107 | 10.1M | assert(!FirstMI.hasOrderedMemoryRef() && |
1108 | 10.1M | !TII->isLdStPairSuppressed(FirstMI) && |
1109 | 10.1M | "FirstMI shouldn't get here if either of these checks are true."); |
1110 | 10.1M | |
1111 | 10.1M | unsigned OpcA = FirstMI.getOpcode(); |
1112 | 10.1M | unsigned OpcB = MI.getOpcode(); |
1113 | 10.1M | |
1114 | 10.1M | // Opcodes match: nothing more to check. |
1115 | 10.1M | if (OpcA == OpcB) |
1116 | 1.46M | return true; |
1117 | 8.70M | |
1118 | 8.70M | // Try to match a sign-extended load/store with a zero-extended load/store. |
1119 | 8.70M | bool IsValidLdStrOpc, PairIsValidLdStrOpc; |
1120 | 8.70M | unsigned NonSExtOpc = getMatchingNonSExtOpcode(OpcA, &IsValidLdStrOpc); |
1121 | 8.70M | assert(IsValidLdStrOpc && |
1122 | 8.70M | "Given Opc should be a Load or Store with an immediate"); |
1123 | 8.70M | // OpcA will be the first instruction in the pair. |
1124 | 8.70M | if (NonSExtOpc == getMatchingNonSExtOpcode(OpcB, &PairIsValidLdStrOpc)) { |
1125 | 60.7k | Flags.setSExtIdx(NonSExtOpc == (unsigned)OpcA ? 1 : 0);
1126 | 60.7k | return true; |
1127 | 60.7k | } |
1128 | 8.64M | |
1129 | 8.64M | // If the second instruction isn't even a mergable/pairable load/store, bail |
1130 | 8.64M | // out. |
1131 | 8.64M | if (!PairIsValidLdStrOpc) |
1132 | 6.53M | return false; |
1133 | 2.10M | |
1134 | 2.10M | // FIXME: We don't support merging narrow stores with mixed scaled/unscaled |
1135 | 2.10M | // offsets. |
1136 | 2.10M | if (isNarrowStore(OpcA) || isNarrowStore(OpcB))
1137 | 173k | return false; |
1138 | 1.93M | |
1139 | 1.93M | // Try to match an unscaled load/store with a scaled load/store. |
1140 | 1.93M | return TII->isUnscaledLdSt(OpcA) != TII->isUnscaledLdSt(OpcB) && |
1141 | 1.93M | getMatchingPairOpcode(OpcA) == getMatchingPairOpcode(OpcB);
1142 | 1.93M | |
1143 | 1.93M | // FIXME: Can we also match a mixed sext/zext unscaled/scaled pair? |
1144 | 1.93M | } |
1145 | | |
1146 | | /// Scan the instructions looking for a load/store that can be combined with the |
1147 | | /// current instruction into a wider equivalent or a load/store pair. |
1148 | | MachineBasicBlock::iterator |
1149 | | AArch64LoadStoreOpt::findMatchingInsn(MachineBasicBlock::iterator I, |
1150 | | LdStPairFlags &Flags, unsigned Limit, |
1151 | 2.75M | bool FindNarrowMerge) { |
1152 | 2.75M | MachineBasicBlock::iterator E = I->getParent()->end(); |
1153 | 2.75M | MachineBasicBlock::iterator MBBI = I; |
1154 | 2.75M | MachineInstr &FirstMI = *I; |
1155 | 2.75M | ++MBBI; |
1156 | 2.75M | |
1157 | 2.75M | bool MayLoad = FirstMI.mayLoad(); |
1158 | 2.75M | bool IsUnscaled = TII->isUnscaledLdSt(FirstMI); |
1159 | 2.75M | unsigned Reg = getLdStRegOp(FirstMI).getReg(); |
1160 | 2.75M | unsigned BaseReg = getLdStBaseOp(FirstMI).getReg(); |
1161 | 2.75M | int Offset = getLdStOffsetOp(FirstMI).getImm(); |
1162 | 2.75M | int OffsetStride = IsUnscaled ? getMemScale(FirstMI) : 1;
1163 | 2.75M | bool IsPromotableZeroStore = isPromotableZeroStoreInst(FirstMI); |
1164 | 2.75M | |
1165 | 2.75M | // Track which register units have been modified and used between the first |
1166 | 2.75M | // insn (inclusive) and the second insn. |
1167 | 2.75M | ModifiedRegUnits.clear(); |
1168 | 2.75M | UsedRegUnits.clear(); |
1169 | 2.75M | |
1170 | 2.75M | // Remember any instructions that read/write memory between FirstMI and MI. |
1171 | 2.75M | SmallVector<MachineInstr *, 4> MemInsns; |
1172 | 2.75M | |
1173 | 12.7M | for (unsigned Count = 0; MBBI != E && Count < Limit; ++MBBI) {
1174 | 11.1M | MachineInstr &MI = *MBBI; |
1175 | 11.1M | |
1176 | 11.1M | // Don't count transient instructions towards the search limit since there |
1177 | 11.1M | // may be different numbers of them if e.g. debug information is present. |
1178 | 11.1M | if (!MI.isTransient()) |
1179 | 11.0M | ++Count; |
1180 | 11.1M | |
1181 | 11.1M | Flags.setSExtIdx(-1); |
1182 | 11.1M | if (areCandidatesToMergeOrPair(FirstMI, MI, Flags, TII) && |
1183 | 11.1M | getLdStOffsetOp(MI).isImm()) {
1184 | 1.58M | assert(MI.mayLoadOrStore() && "Expected memory operation."); |
1185 | 1.58M | // If we've found another instruction with the same opcode, check to see |
1186 | 1.58M | // if the base and offset are compatible with our starting instruction. |
1187 | 1.58M | // These instructions all have scaled immediate operands, so we just |
1188 | 1.58M | // check for +1/-1. Make sure to check the new instruction offset is |
1189 | 1.58M | // actually an immediate and not a symbolic reference destined for |
1190 | 1.58M | // a relocation. |
1191 | 1.58M | unsigned MIBaseReg = getLdStBaseOp(MI).getReg(); |
1192 | 1.58M | int MIOffset = getLdStOffsetOp(MI).getImm(); |
1193 | 1.58M | bool MIIsUnscaled = TII->isUnscaledLdSt(MI); |
1194 | 1.58M | if (IsUnscaled != MIIsUnscaled) { |
1195 | 59.8k | // We're trying to pair instructions that differ in how they are scaled. |
1196 | 59.8k | // If FirstMI is scaled then scale the offset of MI accordingly. |
1197 | 59.8k | // Otherwise, do the opposite (i.e., make MI's offset unscaled). |
1198 | 59.8k | int MemSize = getMemScale(MI); |
1199 | 59.8k | if (MIIsUnscaled) { |
1200 | 25.2k | // If the unscaled offset isn't a multiple of the MemSize, we can't |
1201 | 25.2k | // pair the operations together: bail and keep looking. |
1202 | 25.2k | if (MIOffset % MemSize) { |
1203 | 9.04k | LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits, |
1204 | 9.04k | UsedRegUnits, TRI); |
1205 | 9.04k | MemInsns.push_back(&MI); |
1206 | 9.04k | continue; |
1207 | 9.04k | } |
1208 | 16.2k | MIOffset /= MemSize; |
1209 | 34.5k | } else { |
1210 | 34.5k | MIOffset *= MemSize; |
1211 | 34.5k | } |
1212 | 59.8k | } |
1213 | 1.58M | |
1214 | 1.58M | if (BaseReg == MIBaseReg && ((Offset == MIOffset + OffsetStride) ||
1215 | 987k | (Offset + OffsetStride == MIOffset))) {
1216 | 338k | int MinOffset = Offset < MIOffset ? Offset : MIOffset;
1217 | 338k | if (FindNarrowMerge) { |
1218 | 10.9k | // If the alignment requirements of the scaled wide load/store |
1219 | 10.9k | // instruction can't express the offset of the scaled narrow input, |
1220 | 10.9k | // bail and keep looking. For promotable zero stores, allow only when |
1221 | 10.9k | // the stored value is the same (i.e., WZR). |
1222 | 10.9k | if ((!IsUnscaled && alignTo(MinOffset, 2) != MinOffset) ||
1223 | 10.9k | (IsPromotableZeroStore && Reg != getLdStRegOp(MI).getReg())) {
1224 | 3.11k | LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits, |
1225 | 3.11k | UsedRegUnits, TRI); |
1226 | 3.11k | MemInsns.push_back(&MI); |
1227 | 3.11k | continue; |
1228 | 3.11k | } |
1229 | 327k | } else { |
1230 | 327k | // Pairwise instructions have a 7-bit signed offset field. Single |
1231 | 327k | // insns have a 12-bit unsigned offset field. If the resultant |
1232 | 327k | // immediate offset of merging these instructions is out of range for |
1233 | 327k | // a pairwise instruction, bail and keep looking. |
1234 | 327k | if (!inBoundsForPair(IsUnscaled, MinOffset, OffsetStride)) { |
1235 | 1.58k | LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits, |
1236 | 1.58k | UsedRegUnits, TRI); |
1237 | 1.58k | MemInsns.push_back(&MI); |
1238 | 1.58k | continue; |
1239 | 1.58k | } |
1240 | 325k | // If the alignment requirements of the paired (scaled) instruction |
1241 | 325k | // can't express the offset of the unscaled input, bail and keep |
1242 | 325k | // looking. |
1243 | 325k | if (IsUnscaled && (alignTo(MinOffset, OffsetStride) != MinOffset)) {
1244 | 0 | LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits, |
1245 | 0 | UsedRegUnits, TRI); |
1246 | 0 | MemInsns.push_back(&MI); |
1247 | 0 | continue; |
1248 | 0 | } |
1249 | 333k | } |
1250 | 333k | // If the destination register of the loads is the same register, bail |
1251 | 333k | // and keep looking. A load-pair instruction with both destination |
1252 | 333k | // registers the same is UNPREDICTABLE and will result in an exception. |
1253 | 333k | if (MayLoad && Reg == getLdStRegOp(MI).getReg()) {
1254 | 37.0k | LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits, UsedRegUnits, |
1255 | 37.0k | TRI); |
1256 | 37.0k | MemInsns.push_back(&MI); |
1257 | 37.0k | continue; |
1258 | 37.0k | } |
1259 | 296k | |
1260 | 296k | // If the Rt of the second instruction was not modified or used between |
1261 | 296k | // the two instructions and none of the instructions between the second |
1262 | 296k | // and first alias with the second, we can combine the second into the |
1263 | 296k | // first. |
1264 | 296k | if (ModifiedRegUnits.available(getLdStRegOp(MI).getReg()) && |
1265 | 296k | !(MI.mayLoad() &&
1266 | 225k | !UsedRegUnits.available(getLdStRegOp(MI).getReg())) &&
1267 | 296k | !mayAlias(MI, MemInsns, AA)) {
1268 | 214k | Flags.setMergeForward(false); |
1269 | 214k | return MBBI; |
1270 | 214k | } |
1271 | 81.5k | |
1272 | 81.5k | // Likewise, if the Rt of the first instruction is not modified or used |
1273 | 81.5k | // between the two instructions and none of the instructions between the |
1274 | 81.5k | // first and the second alias with the first, we can combine the first |
1275 | 81.5k | // into the second. |
1276 | 81.5k | if (ModifiedRegUnits.available(getLdStRegOp(FirstMI).getReg()) && |
1277 | 81.5k | !(MayLoad &&
1278 | 21.2k | !UsedRegUnits.available(getLdStRegOp(FirstMI).getReg())) &&
1279 | 81.5k | !mayAlias(FirstMI, MemInsns, AA)) {
1280 | 7.41k | Flags.setMergeForward(true); |
1281 | 7.41k | return MBBI; |
1282 | 7.41k | } |
1283 | 10.8M | // Unable to combine these instructions due to interference in between. |
1284 | 10.8M | // Keep looking. |
1285 | 10.8M | } |
1286 | 1.57M | } |
1287 | 10.8M | |
1288 | 10.8M | // The instruction wasn't a matching load or store. Stop searching if we
1289 | 10.8M | // encounter a call instruction that might modify memory.
1290 | 10.8M | if (MI.isCall()) |
1291 | 733k | return E; |
1292 | 10.1M | |
1293 | 10.1M | // Update modified / uses register units. |
1294 | 10.1M | LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits, UsedRegUnits, TRI); |
1295 | 10.1M | |
1296 | 10.1M | // Otherwise, if the base register is modified, we have no match, so |
1297 | 10.1M | // return early. |
1298 | 10.1M | if (!ModifiedRegUnits.available(BaseReg)) |
1299 | 189k | return E; |
1300 | 9.93M | |
1301 | 9.93M | // Update list of instructions that read/write memory. |
1302 | 9.93M | if (MI.mayLoadOrStore()) |
1303 | 3.87M | MemInsns.push_back(&MI); |
1304 | 9.93M | } |
1305 | 2.75M | return E;
1306 | 2.75M | } |
1307 | | |
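To make the scaled/unscaled reconciliation at the top of the matching loop easier to follow, here is a minimal standalone sketch of the same idea. It uses plain ints instead of MachineOperands, and the helper name offsetsArePairable is invented for illustration; only the arithmetic mirrors the pass.

    // Minimal model of the offset-compatibility test used when pairing two
    // memory instructions. A "scaled" offset counts in units of the access
    // size; an "unscaled" offset counts in bytes. Names are illustrative.
    bool offsetsArePairable(int Offset, bool IsUnscaled, int MIOffset,
                            bool MIIsUnscaled, int MemSize) {
      if (IsUnscaled != MIIsUnscaled) {
        if (MIIsUnscaled) {
          // The candidate is unscaled but the first instruction is scaled:
          // its byte offset must be a multiple of the access size.
          if (MIOffset % MemSize != 0)
            return false;
          MIOffset /= MemSize;
        } else {
          // The first instruction is unscaled: convert the candidate to bytes.
          MIOffset *= MemSize;
        }
      }
      // Only slots exactly one stride apart can form a pair.
      int OffsetStride = IsUnscaled ? MemSize : 1;
      return Offset == MIOffset + OffsetStride ||
             Offset + OffsetStride == MIOffset;
    }

For example, a scaled ldr at element offset 2 and an ldur at byte offset 24 of an 8-byte access normalize to 2 and 3 and pass the adjacency test.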
1308 | | MachineBasicBlock::iterator |
1309 | | AArch64LoadStoreOpt::mergeUpdateInsn(MachineBasicBlock::iterator I, |
1310 | | MachineBasicBlock::iterator Update, |
1311 | 1.05k | bool IsPreIdx) { |
1312 | 1.05k | assert((Update->getOpcode() == AArch64::ADDXri || |
1313 | 1.05k | Update->getOpcode() == AArch64::SUBXri) && |
1314 | 1.05k | "Unexpected base register update instruction to merge!"); |
1315 | 1.05k | MachineBasicBlock::iterator NextI = I; |
1316 | 1.05k | // Return the instruction following the merged instruction, which is |
1317 | 1.05k | // the instruction following our unmerged load. Unless that's the add/sub |
1318 | 1.05k | // instruction we're merging, in which case it's the one after that. |
1319 | 1.05k | if (++NextI == Update) |
1320 | 304 | ++NextI; |
1321 | 1.05k | |
1322 | 1.05k | int Value = Update->getOperand(2).getImm(); |
1323 | 1.05k | assert(AArch64_AM::getShiftValue(Update->getOperand(3).getImm()) == 0 && |
1324 | 1.05k | "Can't merge 1 << 12 offset into pre-/post-indexed load / store"); |
1325 | 1.05k | if (Update->getOpcode() == AArch64::SUBXri) |
1326 | 103 | Value = -Value; |
1327 | 1.05k | |
1328 | 1.05k | unsigned NewOpc = IsPreIdx ? getPreIndexedOpcode(I->getOpcode())
1329 | 1.05k | : getPostIndexedOpcode(I->getOpcode());
1330 | 1.05k | MachineInstrBuilder MIB; |
1331 | 1.05k | if (!isPairedLdSt(*I)) { |
1332 | 818 | // Non-paired instruction. |
1333 | 818 | MIB = BuildMI(*I->getParent(), I, I->getDebugLoc(), TII->get(NewOpc)) |
1334 | 818 | .add(getLdStRegOp(*Update)) |
1335 | 818 | .add(getLdStRegOp(*I)) |
1336 | 818 | .add(getLdStBaseOp(*I)) |
1337 | 818 | .addImm(Value) |
1338 | 818 | .setMemRefs(I->memoperands()) |
1339 | 818 | .setMIFlags(I->mergeFlagsWith(*Update)); |
1340 | 818 | } else { |
1341 | 233 | // Paired instruction. |
1342 | 233 | int Scale = getMemScale(*I); |
1343 | 233 | MIB = BuildMI(*I->getParent(), I, I->getDebugLoc(), TII->get(NewOpc)) |
1344 | 233 | .add(getLdStRegOp(*Update)) |
1345 | 233 | .add(getLdStRegOp(*I, 0)) |
1346 | 233 | .add(getLdStRegOp(*I, 1)) |
1347 | 233 | .add(getLdStBaseOp(*I)) |
1348 | 233 | .addImm(Value / Scale) |
1349 | 233 | .setMemRefs(I->memoperands()) |
1350 | 233 | .setMIFlags(I->mergeFlagsWith(*Update)); |
1351 | 233 | } |
1352 | 1.05k | (void)MIB; |
1353 | 1.05k | |
1354 | 1.05k | if (IsPreIdx) { |
1355 | 218 | ++NumPreFolded; |
1356 | 218 | LLVM_DEBUG(dbgs() << "Creating pre-indexed load/store."); |
1357 | 833 | } else { |
1358 | 833 | ++NumPostFolded; |
1359 | 833 | LLVM_DEBUG(dbgs() << "Creating post-indexed load/store."); |
1360 | 833 | } |
1361 | 1.05k | LLVM_DEBUG(dbgs() << " Replacing instructions:\n "); |
1362 | 1.05k | LLVM_DEBUG(I->print(dbgs())); |
1363 | 1.05k | LLVM_DEBUG(dbgs() << " "); |
1364 | 1.05k | LLVM_DEBUG(Update->print(dbgs())); |
1365 | 1.05k | LLVM_DEBUG(dbgs() << " with instruction:\n "); |
1366 | 1.05k | LLVM_DEBUG(((MachineInstr *)MIB)->print(dbgs())); |
1367 | 1.05k | LLVM_DEBUG(dbgs() << "\n"); |
1368 | 1.05k | |
1369 | 1.05k | // Erase the old instructions for the block. |
1370 | 1.05k | I->eraseFromParent(); |
1371 | 1.05k | Update->eraseFromParent(); |
1372 | 1.05k | |
1373 | 1.05k | return NextI; |
1374 | 1.05k | } |
1375 | | |
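As a rough illustration of how the folded writeback immediate above is computed (this is not a helper in the pass; UpdateKind and foldedWritebackImm are invented for the example), the ADD/SUB immediate is negated for a subtract and, for paired opcodes, rescaled into the pair's unit size:

    // Illustrative-only model of the immediate computation in mergeUpdateInsn:
    // SUBXri contributes a negative update, and paired load/store opcodes
    // encode their immediate in units of the access size.
    enum class UpdateKind { Add, Sub };

    int foldedWritebackImm(UpdateKind Kind, int UpdateImm, bool IsPaired,
                           int Scale) {
      int Value = (Kind == UpdateKind::Sub) ? -UpdateImm : UpdateImm;
      // Non-paired pre/post-indexed forms take the byte value directly;
      // paired forms take it divided by the access size.
      return IsPaired ? Value / Scale : Value;
    }

For instance, folding sub x20, x20, #32 into an 8-byte ldp gives an immediate operand of -4, which prints as the byte offset #-32 in the post-indexed ldp x0, x1, [x20], #-32.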
1376 | | bool AArch64LoadStoreOpt::isMatchingUpdateInsn(MachineInstr &MemMI, |
1377 | | MachineInstr &MI, |
1378 | 18.9M | unsigned BaseReg, int Offset) { |
1379 | 18.9M | switch (MI.getOpcode()) { |
1380 | 18.9M | default: |
1381 | 18.0M | break; |
1382 | 18.9M | case AArch64::SUBXri: |
1383 | 913k | case AArch64::ADDXri: |
1384 | 913k | // Make sure it's a vanilla immediate operand, not a relocation or |
1385 | 913k | // anything else we can't handle. |
1386 | 913k | if (!MI.getOperand(2).isImm()) |
1387 | 241k | break; |
1388 | 672k | // Watch out for 1 << 12 shifted value. |
1389 | 672k | if (AArch64_AM::getShiftValue(MI.getOperand(3).getImm())) |
1390 | 12.0k | break; |
1391 | 660k | |
1392 | 660k | // The update instruction source and destination register must be the |
1393 | 660k | // same as the load/store base register. |
1394 | 660k | if (MI.getOperand(0).getReg() != BaseReg || |
1395 | 660k | MI.getOperand(1).getReg() != BaseReg)
1396 | 562k | break; |
1397 | 98.4k | |
1398 | 98.4k | bool IsPairedInsn = isPairedLdSt(MemMI); |
1399 | 98.4k | int UpdateOffset = MI.getOperand(2).getImm(); |
1400 | 98.4k | if (MI.getOpcode() == AArch64::SUBXri) |
1401 | 270 | UpdateOffset = -UpdateOffset; |
1402 | 98.4k | |
1403 | 98.4k | // For non-paired load/store instructions, the immediate must fit in a |
1404 | 98.4k | // signed 9-bit integer. |
1405 | 98.4k | if (!IsPairedInsn && (UpdateOffset > 255 || UpdateOffset < -256))
1406 | 1.16k | break; |
1407 | 97.2k | |
1408 | 97.2k | // For paired load/store instructions, the immediate must be a multiple of |
1409 | 97.2k | // the scaling factor. The scaled offset must also fit into a signed 7-bit |
1410 | 97.2k | // integer. |
1411 | 97.2k | if (IsPairedInsn) { |
1412 | 94.9k | int Scale = getMemScale(MemMI); |
1413 | 94.9k | if (UpdateOffset % Scale != 0) |
1414 | 10 | break; |
1415 | 94.9k | |
1416 | 94.9k | int ScaledOffset = UpdateOffset / Scale; |
1417 | 94.9k | if (ScaledOffset > 63 || ScaledOffset < -64)
1418 | 978 | break; |
1419 | 96.2k | } |
1420 | 96.2k | |
1421 | 96.2k | // If we have a non-zero Offset, we check that it matches the amount |
1422 | 96.2k | // we're adding to the register. |
1423 | 96.2k | if (!Offset || Offset == UpdateOffset95.2k ) |
1424 | 1.05k | return true; |
1425 | 95.2k | break; |
1426 | 18.9M | } |
1427 | 18.9M | return false; |
1428 | 18.9M | } |
1429 | | |
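The two range checks above can be stated compactly. The sketch below is a standalone restatement, not a function in this file (updateImmFits is a made-up name): non-paired pre/post-indexed forms need a signed 9-bit byte offset, paired forms a signed 7-bit offset in units of the access size.

    // Standalone restatement of the immediate-range checks applied to a
    // candidate ADDXri/SUBXri update (after negating a SUBXri's immediate).
    bool updateImmFits(int UpdateOffset, bool IsPaired, int Scale) {
      if (!IsPaired)
        return UpdateOffset <= 255 && UpdateOffset >= -256; // signed 9 bits
      if (UpdateOffset % Scale != 0)                        // must be a multiple
        return false;
      int Scaled = UpdateOffset / Scale;
      return Scaled <= 63 && Scaled >= -64;                 // signed 7 bits
    }

So an add of #260 cannot be folded into a plain ldr, while an add of #256 can still be folded into an 8-byte ldp (scaled offset 32).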
1430 | | MachineBasicBlock::iterator AArch64LoadStoreOpt::findMatchingUpdateInsnForward( |
1431 | 12.2M | MachineBasicBlock::iterator I, int UnscaledOffset, unsigned Limit) { |
1432 | 12.2M | MachineBasicBlock::iterator E = I->getParent()->end(); |
1433 | 12.2M | MachineInstr &MemMI = *I; |
1434 | 12.2M | MachineBasicBlock::iterator MBBI = I; |
1435 | 12.2M | |
1436 | 12.2M | unsigned BaseReg = getLdStBaseOp(MemMI).getReg(); |
1437 | 12.2M | int MIUnscaledOffset = getLdStOffsetOp(MemMI).getImm() * getMemScale(MemMI); |
1438 | 12.2M | |
1439 | 12.2M | // Scan forward looking for post-index opportunities. Updating instructions |
1440 | 12.2M | // can't be formed if the memory instruction doesn't have the offset we're |
1441 | 12.2M | // looking for. |
1442 | 12.2M | if (MIUnscaledOffset != UnscaledOffset) |
1443 | 5.11M | return E; |
1444 | 7.16M | |
1445 | 7.16M | // If the base register overlaps a destination register, we can't |
1446 | 7.16M | // merge the update. |
1447 | 7.16M | bool IsPairedInsn = isPairedLdSt(MemMI); |
1448 | 16.3M | for (unsigned i = 0, e = IsPairedInsn ? 2 : 1; i != e; ++i) {
1449 | 9.56M | unsigned DestReg = getLdStRegOp(MemMI, i).getReg(); |
1450 | 9.56M | if (DestReg == BaseReg || TRI->isSubRegister(BaseReg, DestReg))
1451 | 401k | return E; |
1452 | 9.56M | } |
1453 | 7.16M | |
1454 | 7.16M | // Track which register units have been modified and used between the first |
1455 | 7.16M | // insn (inclusive) and the second insn. |
1456 | 7.16M | ModifiedRegUnits.clear(); |
1457 | 6.76M | UsedRegUnits.clear(); |
1458 | 6.76M | ++MBBI; |
1459 | 18.4M | for (unsigned Count = 0; MBBI != E && Count < Limit; ++MBBI) {
1460 | 16.2M | MachineInstr &MI = *MBBI; |
1461 | 16.2M | |
1462 | 16.2M | // Don't count transient instructions towards the search limit since there |
1463 | 16.2M | // may be different numbers of them if e.g. debug information is present. |
1464 | 16.2M | if (!MI.isTransient()) |
1465 | 16.1M | ++Count; |
1466 | 16.2M | |
1467 | 16.2M | // If we found a match, return it. |
1468 | 16.2M | if (isMatchingUpdateInsn(*I, MI, BaseReg, UnscaledOffset)) |
1469 | 870 | return MBBI; |
1470 | 16.2M | |
1471 | 16.2M | // Update the status of what the instruction clobbered and used. |
1472 | 16.2M | LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits, UsedRegUnits, TRI); |
1473 | 16.2M | |
1474 | 16.2M | // Otherwise, if the base register is used or modified, we have no match, so |
1475 | 16.2M | // return early. |
1476 | 16.2M | if (!ModifiedRegUnits.available(BaseReg) || |
1477 | 16.2M | !UsedRegUnits.available(BaseReg))
1478 | 4.53M | return E; |
1479 | 16.2M | } |
1480 | 6.76M | return E;
1481 | 6.76M | } |
1482 | | |
1483 | | MachineBasicBlock::iterator AArch64LoadStoreOpt::findMatchingUpdateInsnBackward( |
1484 | 6.05M | MachineBasicBlock::iterator I, unsigned Limit) { |
1485 | 6.05M | MachineBasicBlock::iterator B = I->getParent()->begin(); |
1486 | 6.05M | MachineBasicBlock::iterator E = I->getParent()->end(); |
1487 | 6.05M | MachineInstr &MemMI = *I; |
1488 | 6.05M | MachineBasicBlock::iterator MBBI = I; |
1489 | 6.05M | |
1490 | 6.05M | unsigned BaseReg = getLdStBaseOp(MemMI).getReg(); |
1491 | 6.05M | int Offset = getLdStOffsetOp(MemMI).getImm(); |
1492 | 6.05M | |
1493 | 6.05M | // If the load/store is the first instruction in the block, there's obviously |
1494 | 6.05M | // not any matching update. Ditto if the memory offset isn't zero. |
1495 | 6.05M | if (MBBI == B || Offset != 0)
1496 | 5.16M | return E; |
1497 | 884k | // If the base register overlaps a destination register, we can't |
1498 | 884k | // merge the update. |
1499 | 884k | bool IsPairedInsn = isPairedLdSt(MemMI); |
1500 | 1.70M | for (unsigned i = 0, e = IsPairedInsn ? 2 : 1; i != e; ++i) {
1501 | 953k | unsigned DestReg = getLdStRegOp(MemMI, i).getReg(); |
1502 | 953k | if (DestReg == BaseReg || TRI->isSubRegister(BaseReg, DestReg))
1503 | 128k | return E; |
1504 | 953k | } |
1505 | 884k | |
1506 | 884k | // Track which register units have been modified and used between the first |
1507 | 884k | // insn (inclusive) and the second insn. |
1508 | 884k | ModifiedRegUnits.clear(); |
1509 | 755k | UsedRegUnits.clear(); |
1510 | 755k | unsigned Count = 0; |
1511 | 2.74M | do { |
1512 | 2.74M | --MBBI; |
1513 | 2.74M | MachineInstr &MI = *MBBI; |
1514 | 2.74M | |
1515 | 2.74M | // Don't count transient instructions towards the search limit since there |
1516 | 2.74M | // may be different numbers of them if e.g. debug information is present. |
1517 | 2.74M | if (!MI.isTransient()) |
1518 | 2.71M | ++Count; |
1519 | 2.74M | |
1520 | 2.74M | // If we found a match, return it. |
1521 | 2.74M | if (isMatchingUpdateInsn(*I, MI, BaseReg, Offset)) |
1522 | 181 | return MBBI; |
1523 | 2.74M | |
1524 | 2.74M | // Update the status of what the instruction clobbered and used. |
1525 | 2.74M | LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits, UsedRegUnits, TRI); |
1526 | 2.74M | |
1527 | 2.74M | // Otherwise, if the base register is used or modified, we have no match, so |
1528 | 2.74M | // return early. |
1529 | 2.74M | if (!ModifiedRegUnits.available(BaseReg) || |
1530 | 2.74M | !UsedRegUnits.available(BaseReg))
1531 | 547k | return E; |
1532 | 2.19M | } while (MBBI != B && Count < Limit);
1533 | 755k | return E;
1534 | 755k | } |
1535 | | |
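Both update scans above share the same skeleton; the sketch below captures it with placeholder types (Instr and its flags stand in for MachineInstr and the pass's queries), assuming the caller has already laid out the candidate window in scan order:

    #include <vector>

    // Simplified shape of the forward/backward update scans: transient
    // instructions don't count toward the limit, the first matching add/sub
    // wins, and any touch of the base register ends the search.
    struct Instr { bool Transient; bool MatchesUpdate; bool TouchesBase; };

    int findUpdate(const std::vector<Instr> &Window, unsigned Limit) {
      unsigned Count = 0;
      for (size_t I = 0; I < Window.size() && Count < Limit; ++I) {
        if (!Window[I].Transient)
          ++Count;
        if (Window[I].MatchesUpdate)
          return static_cast<int>(I);   // index of the foldable update
        if (Window[I].TouchesBase)
          return -1;                    // base used or modified: give up
      }
      return -1;                        // hit the limit or the block boundary
    }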
1536 | | bool AArch64LoadStoreOpt::tryToPromoteLoadFromStore( |
1537 | 2.61M | MachineBasicBlock::iterator &MBBI) { |
1538 | 2.61M | MachineInstr &MI = *MBBI; |
1539 | 2.61M | // If this is a volatile load, don't mess with it. |
1540 | 2.61M | if (MI.hasOrderedMemoryRef()) |
1541 | 327k | return false; |
1542 | 2.28M | |
1543 | 2.28M | // Make sure this is a reg+imm. |
1544 | 2.28M | // FIXME: It is possible to extend it to handle reg+reg cases. |
1545 | 2.28M | if (!getLdStOffsetOp(MI).isImm()) |
1546 | 15.2k | return false; |
1547 | 2.27M | |
1548 | 2.27M | // Look backward up to LdStLimit instructions. |
1549 | 2.27M | MachineBasicBlock::iterator StoreI; |
1550 | 2.27M | if (findMatchingStore(MBBI, LdStLimit, StoreI)) { |
1551 | 226 | ++NumLoadsFromStoresPromoted; |
1552 | 226 | // Promote the load. Keeping the iterator straight is a |
1553 | 226 | // pain, so we let the merge routine tell us what the next instruction |
1554 | 226 | // is after it's done mucking about. |
1555 | 226 | MBBI = promoteLoadFromStore(MBBI, StoreI); |
1556 | 226 | return true; |
1557 | 226 | } |
1558 | 2.27M | return false; |
1559 | 2.27M | } |
1560 | | |
1561 | | // Merge adjacent zero stores into a wider store. |
1562 | | bool AArch64LoadStoreOpt::tryToMergeZeroStInst( |
1563 | 57.4k | MachineBasicBlock::iterator &MBBI) { |
1564 | 57.4k | assert(isPromotableZeroStoreInst(*MBBI) && "Expected narrow store."); |
1565 | 57.4k | MachineInstr &MI = *MBBI; |
1566 | 57.4k | MachineBasicBlock::iterator E = MI.getParent()->end(); |
1567 | 57.4k | |
1568 | 57.4k | if (!TII->isCandidateToMergeOrPair(MI)) |
1569 | 940 | return false; |
1570 | 56.5k | |
1571 | 56.5k | // Look ahead up to LdStLimit instructions for a mergable instruction. |
1572 | 56.5k | LdStPairFlags Flags; |
1573 | 56.5k | MachineBasicBlock::iterator MergeMI = |
1574 | 56.5k | findMatchingInsn(MBBI, Flags, LdStLimit, /* FindNarrowMerge = */ true); |
1575 | 56.5k | if (MergeMI != E) { |
1576 | 7.74k | ++NumZeroStoresPromoted; |
1577 | 7.74k | |
1578 | 7.74k | // Keeping the iterator straight is a pain, so we let the merge routine tell |
1579 | 7.74k | // us what the next instruction is after it's done mucking about. |
1580 | 7.74k | MBBI = mergeNarrowZeroStores(MBBI, MergeMI, Flags); |
1581 | 7.74k | return true; |
1582 | 7.74k | } |
1583 | 48.8k | return false; |
1584 | 48.8k | } |
1585 | | |
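For the FindNarrowMerge path used here, the alignment restriction applied back in findMatchingInsn boils down to a condition on the lower of the two offsets; roughly, with an invented helper name and the companion check that both stores use the zero register omitted:

    // Rough model of the narrow-zero-store restriction: the wider store
    // covers two adjacent narrow slots, so for scaled offsets the lower
    // offset must land on an even slot; unscaled byte offsets were already
    // checked for divisibility by the access size.
    bool narrowZeroStoresMergeable(bool IsUnscaled, int MinOffset) {
      return IsUnscaled || MinOffset % 2 == 0;
    }

For example, strh wzr, [x0, #2] and strh wzr, [x0, #4] have a minimum offset of 1 in halfword units, so they are not merged into a single str wzr.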
1586 | | // Find loads and stores that can be merged into a single load or store pair |
1587 | | // instruction. |
1588 | 3.75M | bool AArch64LoadStoreOpt::tryToPairLdStInst(MachineBasicBlock::iterator &MBBI) { |
1589 | 3.75M | MachineInstr &MI = *MBBI; |
1590 | 3.75M | MachineBasicBlock::iterator E = MI.getParent()->end(); |
1591 | 3.75M | |
1592 | 3.75M | if (!TII->isCandidateToMergeOrPair(MI)) |
1593 | 629k | return false; |
1594 | 3.12M | |
1595 | 3.12M | // Early exit if the offset is not possible to match. (6 bits of positive |
1596 | 3.12M | // range, plus allow an extra one in case we find a later insn that matches |
1597 | 3.12M | // with Offset-1) |
1598 | 3.12M | bool IsUnscaled = TII->isUnscaledLdSt(MI); |
1599 | 3.12M | int Offset = getLdStOffsetOp(MI).getImm(); |
1600 | 3.12M | int OffsetStride = IsUnscaled ? getMemScale(MI) : 1;
1601 | 3.12M | // Allow one more for offset. |
1602 | 3.12M | if (Offset > 0) |
1603 | 2.24M | Offset -= OffsetStride; |
1604 | 3.12M | if (!inBoundsForPair(IsUnscaled, Offset, OffsetStride)) |
1605 | 431k | return false; |
1606 | 2.69M | |
1607 | 2.69M | // Look ahead up to LdStLimit instructions for a pairable instruction. |
1608 | 2.69M | LdStPairFlags Flags; |
1609 | 2.69M | MachineBasicBlock::iterator Paired = |
1610 | 2.69M | findMatchingInsn(MBBI, Flags, LdStLimit, /* FindNarrowMerge = */ false); |
1611 | 2.69M | if (Paired != E) { |
1612 | 214k | ++NumPairCreated; |
1613 | 214k | if (TII->isUnscaledLdSt(MI)) |
1614 | 19.4k | ++NumUnscaledPairCreated; |
1615 | 214k | // Keeping the iterator straight is a pain, so we let the merge routine tell |
1616 | 214k | // us what the next instruction is after it's done mucking about. |
1617 | 214k | MBBI = mergePairedInsns(MBBI, Paired, Flags); |
1618 | 214k | return true; |
1619 | 214k | } |
1620 | 2.48M | return false; |
1621 | 2.48M | } |
1622 | | |
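A plausible standalone reading of the early exit above (fitsPairImm stands in for the pass's inBoundsForPair; both names below are illustrative): the offset is shrunk by one stride so that a later instruction sitting one slot below can still match, then tested against the paired form's signed 7-bit field.

    // Sketch of the pre-filter: express the offset in access-size units if
    // necessary, then require it to fit a signed 7-bit pair immediate.
    bool fitsPairImm(bool IsUnscaled, int Offset, int OffsetStride) {
      if (IsUnscaled) {
        if (Offset % OffsetStride != 0)
          return false;
        Offset /= OffsetStride;
      }
      return Offset <= 63 && Offset >= -64;
    }

    bool worthScanningForPair(bool IsUnscaled, int Offset, int OffsetStride) {
      if (Offset > 0)
        Offset -= OffsetStride;   // a later insn may match at one slot below
      return fitsPairImm(IsUnscaled, Offset, OffsetStride);
    }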
1623 | | bool AArch64LoadStoreOpt::tryToMergeLdStUpdate |
1624 | 6.22M | (MachineBasicBlock::iterator &MBBI) { |
1625 | 6.22M | MachineInstr &MI = *MBBI; |
1626 | 6.22M | MachineBasicBlock::iterator E = MI.getParent()->end(); |
1627 | 6.22M | MachineBasicBlock::iterator Update; |
1628 | 6.22M | |
1629 | 6.22M | // Look forward to try to form a post-index instruction. For example, |
1630 | 6.22M | // ldr x0, [x20] |
1631 | 6.22M | // add x20, x20, #32 |
1632 | 6.22M | // merged into: |
1633 | 6.22M | // ldr x0, [x20], #32 |
1634 | 6.22M | Update = findMatchingUpdateInsnForward(MBBI, 0, UpdateLimit); |
1635 | 6.22M | if (Update != E) { |
1636 | 833 | // Merge the update into the ld/st. |
1637 | 833 | MBBI = mergeUpdateInsn(MBBI, Update, /*IsPreIdx=*/false); |
1638 | 833 | return true; |
1639 | 833 | } |
1640 | 6.22M | |
1641 | 6.22M | // Don't know how to handle unscaled pre/post-index versions below, so bail. |
1642 | 6.22M | if (TII->isUnscaledLdSt(MI.getOpcode())) |
1643 | 170k | return false; |
1644 | 6.05M | |
1645 | 6.05M | // Look back to try to find a pre-index instruction. For example, |
1646 | 6.05M | // add x0, x0, #8 |
1647 | 6.05M | // ldr x1, [x0] |
1648 | 6.05M | // merged into: |
1649 | 6.05M | // ldr x1, [x0, #8]! |
1650 | 6.05M | Update = findMatchingUpdateInsnBackward(MBBI, UpdateLimit); |
1651 | 6.05M | if (Update != E) { |
1652 | 181 | // Merge the update into the ld/st. |
1653 | 181 | MBBI = mergeUpdateInsn(MBBI, Update, /*IsPreIdx=*/true); |
1654 | 181 | return true; |
1655 | 181 | } |
1656 | 6.05M | |
1657 | 6.05M | // The immediate in the load/store is scaled by the size of the memory |
1658 | 6.05M | // operation. The immediate in the add we're looking for, |
1659 | 6.05M | // however, is not, so adjust here. |
1660 | 6.05M | int UnscaledOffset = getLdStOffsetOp(MI).getImm() * getMemScale(MI); |
1661 | 6.05M | |
1662 | 6.05M | // Look forward to try to find a pre-index instruction. For example,
1663 | 6.05M | // ldr x1, [x0, #64] |
1664 | 6.05M | // add x0, x0, #64 |
1665 | 6.05M | // merged into: |
1666 | 6.05M | // ldr x1, [x0, #64]! |
1667 | 6.05M | Update = findMatchingUpdateInsnForward(MBBI, UnscaledOffset, UpdateLimit); |
1668 | 6.05M | if (Update != E) { |
1669 | 37 | // Merge the update into the ld/st. |
1670 | 37 | MBBI = mergeUpdateInsn(MBBI, Update, /*IsPreIdx=*/true); |
1671 | 37 | return true; |
1672 | 37 | } |
1673 | 6.05M | |
1674 | 6.05M | return false; |
1675 | 6.05M | } |
1676 | | |
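The unscaled-offset computation that feeds the last forward scan is easy to get backwards, so a tiny worked model may help (names invented; not the pass's code): the load/store immediate operand is stored pre-scaled, while the ADD's immediate is in bytes.

    // The ldr/str immediate operand counts in access-size units, the ADDXri
    // immediate counts in bytes, so the forward scan compares against
    // Imm * MemScale.
    bool addMatchesAccessOffset(int LdStImm, int MemScale, int AddImm) {
      int UnscaledOffset = LdStImm * MemScale;  // ldr x1, [x0, #64] -> 8 * 8
      return AddImm == UnscaledOffset;          // add x0, x0, #64 then folds
    }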
1677 | | bool AArch64LoadStoreOpt::optimizeBlock(MachineBasicBlock &MBB, |
1678 | 4.14M | bool EnableNarrowZeroStOpt) { |
1679 | 4.14M | bool Modified = false; |
1680 | 4.14M | // Four transformations to do here:
1681 | 4.14M | // 1) Find loads that directly read from stores and promote them by |
1682 | 4.14M | // replacing with mov instructions. If the store is wider than the load, |
1683 | 4.14M | // the load will be replaced with a bitfield extract. |
1684 | 4.14M | // e.g., |
1685 | 4.14M | // str w1, [x0, #4] |
1686 | 4.14M | // ldrh w2, [x0, #6] |
1687 | 4.14M | // ; becomes |
1688 | 4.14M | // str w1, [x0, #4] |
1689 | 4.14M | // lsr w2, w1, #16 |
1690 | 4.14M | for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end(); |
1691 | 33.8M | MBBI != E;) { |
1692 | 29.6M | if (isPromotableLoadFromStore(*MBBI) && tryToPromoteLoadFromStore(MBBI))
1693 | 226 | Modified = true; |
1694 | 29.6M | else |
1695 | 29.6M | ++MBBI; |
1696 | 29.6M | } |
1697 | 4.14M | // 2) Merge adjacent zero stores into a wider store. |
1698 | 4.14M | // e.g., |
1699 | 4.14M | // strh wzr, [x0] |
1700 | 4.14M | // strh wzr, [x0, #2] |
1701 | 4.14M | // ; becomes |
1702 | 4.14M | // str wzr, [x0] |
1703 | 4.14M | // e.g., |
1704 | 4.14M | // str wzr, [x0] |
1705 | 4.14M | // str wzr, [x0, #4] |
1706 | 4.14M | // ; becomes |
1707 | 4.14M | // str xzr, [x0] |
1708 | 4.14M | if (EnableNarrowZeroStOpt) |
1709 | 4.14M | for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end(); |
1710 | 33.7M | MBBI != E;) { |
1711 | 29.6M | if (isPromotableZeroStoreInst(*MBBI) && tryToMergeZeroStInst(MBBI))
1712 | 7.74k | Modified = true; |
1713 | 29.6M | else |
1714 | 29.6M | ++MBBI; |
1715 | 29.6M | } |
1716 | 4.14M | // 3) Find loads and stores that can be merged into a single load or store |
1717 | 4.14M | // pair instruction. |
1718 | 4.14M | // e.g., |
1719 | 4.14M | // ldr x0, [x2] |
1720 | 4.14M | // ldr x1, [x2, #8] |
1721 | 4.14M | // ; becomes |
1722 | 4.14M | // ldp x0, x1, [x2] |
1723 | 4.14M | for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end(); |
1724 | 33.5M | MBBI != E;) { |
1725 | 29.4M | if (TII->isPairableLdStInst(*MBBI) && tryToPairLdStInst(MBBI))
1726 | 214k | Modified = true; |
1727 | 29.2M | else |
1728 | 29.2M | ++MBBI; |
1729 | 29.4M | } |
1730 | 4.14M | // 4) Find base register updates that can be merged into the load or store |
1731 | 4.14M | // as a base-reg writeback. |
1732 | 4.14M | // e.g., |
1733 | 4.14M | // ldr x0, [x2] |
1734 | 4.14M | // add x2, x2, #4 |
1735 | 4.14M | // ; becomes |
1736 | 4.14M | // ldr x0, [x2], #4 |
1737 | 4.14M | for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end(); |
1738 | 33.5M | MBBI != E;) { |
1739 | 29.4M | if (isMergeableLdStUpdate(*MBBI) && tryToMergeLdStUpdate(MBBI))
1740 | 1.05k | Modified = true; |
1741 | 29.4M | else |
1742 | 29.4M | ++MBBI; |
1743 | 29.4M | } |
1744 | 4.14M | |
1745 | 4.14M | return Modified; |
1746 | 4.14M | } |
1747 | | |
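Each of the four loops above follows the same iterator discipline, which is worth calling out: the transformation callbacks update MBBI themselves, so the loop only advances on the no-change path. A generic sketch with placeholder template parameters, not part of the pass:

    // Generic shape of the peephole loops in optimizeBlock. Try(It) returns
    // true and repositions It when it rewrites instructions; otherwise the
    // loop steps forward itself.
    template <typename Container, typename TryTransform>
    bool runPeephole(Container &Block, TryTransform Try) {
      bool Modified = false;
      for (auto It = Block.begin(), E = Block.end(); It != E;) {
        if (Try(It))
          Modified = true;
        else
          ++It;
      }
      return Modified;
    }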
1748 | 497k | bool AArch64LoadStoreOpt::runOnMachineFunction(MachineFunction &Fn) { |
1749 | 497k | if (skipFunction(Fn.getFunction())) |
1750 | 17 | return false; |
1751 | 497k | |
1752 | 497k | Subtarget = &static_cast<const AArch64Subtarget &>(Fn.getSubtarget()); |
1753 | 497k | TII = static_cast<const AArch64InstrInfo *>(Subtarget->getInstrInfo()); |
1754 | 497k | TRI = Subtarget->getRegisterInfo(); |
1755 | 497k | AA = &getAnalysis<AAResultsWrapperPass>().getAAResults(); |
1756 | 497k | |
1757 | 497k | // Resize the modified and used register unit trackers. We do this once |
1758 | 497k | // per function and then clear the register units each time we optimize a load |
1759 | 497k | // or store. |
1760 | 497k | ModifiedRegUnits.init(*TRI); |
1761 | 497k | UsedRegUnits.init(*TRI); |
1762 | 497k | |
1763 | 497k | bool Modified = false; |
1764 | 497k | bool enableNarrowZeroStOpt = !Subtarget->requiresStrictAlign(); |
1765 | 497k | for (auto &MBB : Fn) |
1766 | 4.14M | Modified |= optimizeBlock(MBB, enableNarrowZeroStOpt); |
1767 | 497k | |
1768 | 497k | return Modified; |
1769 | 497k | } |
1770 | | |
1771 | | // FIXME: Do we need/want a pre-alloc pass like ARM has to try to keep loads and |
1772 | | // stores near one another? Note: The pre-RA instruction scheduler already has |
1773 | | // hooks to try and schedule pairable loads/stores together to improve pairing |
1774 | | // opportunities. Thus, a pre-RA pairing pass may not be worth the effort.
1775 | | |
1776 | | // FIXME: When pairing store instructions it's very possible for this pass to |
1777 | | // hoist a store with a KILL marker above another use (without a KILL marker). |
1778 | | // The resulting IR is invalid, but nothing uses the KILL markers after this |
1779 | | // pass, so it's never caused a problem in practice. |
1780 | | |
1781 | | /// createAArch64LoadStoreOptimizationPass - returns an instance of the |
1782 | | /// load / store optimization pass. |
1783 | 15.9k | FunctionPass *llvm::createAArch64LoadStoreOptimizationPass() { |
1784 | 15.9k | return new AArch64LoadStoreOpt(); |
1785 | 15.9k | } |