/Users/buildslave/jenkins/sharedspace/clang-stage2-coverage-R@2/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp
Line | Count | Source (jump to first uncovered line) |
1 | | //===- AArch64LoadStoreOptimizer.cpp - AArch64 load/store opt. pass -------===// |
2 | | // |
3 | | // The LLVM Compiler Infrastructure |
4 | | // |
5 | | // This file is distributed under the University of Illinois Open Source |
6 | | // License. See LICENSE.TXT for details. |
7 | | // |
8 | | //===----------------------------------------------------------------------===// |
9 | | // |
10 | | // This file contains a pass that performs load / store related peephole |
11 | | // optimizations. This pass should be run after register allocation. |
12 | | // |
13 | | //===----------------------------------------------------------------------===// |
14 | | |
15 | | #include "AArch64InstrInfo.h" |
16 | | #include "AArch64Subtarget.h" |
17 | | #include "MCTargetDesc/AArch64AddressingModes.h" |
18 | | #include "llvm/ADT/BitVector.h" |
19 | | #include "llvm/ADT/SmallVector.h" |
20 | | #include "llvm/ADT/Statistic.h" |
21 | | #include "llvm/ADT/StringRef.h" |
22 | | #include "llvm/ADT/iterator_range.h" |
23 | | #include "llvm/Analysis/AliasAnalysis.h" |
24 | | #include "llvm/CodeGen/MachineBasicBlock.h" |
25 | | #include "llvm/CodeGen/MachineFunction.h" |
26 | | #include "llvm/CodeGen/MachineFunctionPass.h" |
27 | | #include "llvm/CodeGen/MachineInstr.h" |
28 | | #include "llvm/CodeGen/MachineInstrBuilder.h" |
29 | | #include "llvm/CodeGen/MachineOperand.h" |
30 | | #include "llvm/IR/DebugLoc.h" |
31 | | #include "llvm/MC/MCRegisterInfo.h" |
32 | | #include "llvm/Pass.h" |
33 | | #include "llvm/Support/CommandLine.h" |
34 | | #include "llvm/Support/Debug.h" |
35 | | #include "llvm/Support/ErrorHandling.h" |
36 | | #include "llvm/Support/raw_ostream.h" |
37 | | #include "llvm/Target/TargetRegisterInfo.h" |
38 | | #include <cassert> |
39 | | #include <cstdint> |
40 | | #include <iterator> |
41 | | #include <limits> |
42 | | |
43 | | using namespace llvm; |
44 | | |
45 | | #define DEBUG_TYPE "aarch64-ldst-opt" |
46 | | |
47 | | STATISTIC(NumPairCreated, "Number of load/store pair instructions generated"); |
48 | | STATISTIC(NumPostFolded, "Number of post-index updates folded"); |
49 | | STATISTIC(NumPreFolded, "Number of pre-index updates folded"); |
50 | | STATISTIC(NumUnscaledPairCreated, |
51 | | "Number of load/store from unscaled generated"); |
52 | | STATISTIC(NumZeroStoresPromoted, "Number of narrow zero stores promoted"); |
53 | | STATISTIC(NumLoadsFromStoresPromoted, "Number of loads from stores promoted"); |
54 | | |
55 | | // The LdStLimit limits how far we search for load/store pairs. |
56 | | static cl::opt<unsigned> LdStLimit("aarch64-load-store-scan-limit", |
57 | | cl::init(20), cl::Hidden); |
58 | | |
59 | | // The UpdateLimit limits how far we search for update instructions when we form |
60 | | // pre-/post-index instructions. |
61 | | static cl::opt<unsigned> UpdateLimit("aarch64-update-scan-limit", cl::init(100), |
62 | | cl::Hidden); |
63 | | |
64 | 13.7k | #define AARCH64_LOAD_STORE_OPT_NAME "AArch64 load / store optimization pass" |
65 | | |
66 | | namespace { |
67 | | |
// Bookkeeping passed between findMatchingInsn() and the merge routines,
// describing how a candidate pair of load/store instructions may be combined.
using LdStPairFlags = struct LdStPairFlags {
  // If a matching instruction is found, MergeForward is set to true if the
  // merge is to remove the first instruction and replace the second with
  // a pair-wise insn, and false if the reverse is true.
  bool MergeForward = false;

  // SExtIdx gives the index of the result of the load pair that must be
  // extended. The value of SExtIdx assumes that the paired load produces the
  // value in this order: (I, returned iterator), i.e., -1 means no value has
  // to be extended, 0 means I, and 1 means the returned iterator.
  int SExtIdx = -1;

  LdStPairFlags() = default;

  void setMergeForward(bool V = true) { MergeForward = V; }
  bool getMergeForward() const { return MergeForward; }

  void setSExtIdx(int V) { SExtIdx = V; }
  int getSExtIdx() const { return SExtIdx; }
};
88 | | |
89 | | struct AArch64LoadStoreOpt : public MachineFunctionPass { |
90 | | static char ID; |
91 | | |
92 | 13.7k | AArch64LoadStoreOpt() : MachineFunctionPass(ID) { |
93 | 13.7k | initializeAArch64LoadStoreOptPass(*PassRegistry::getPassRegistry()); |
94 | 13.7k | } |
95 | | |
96 | | AliasAnalysis *AA; |
97 | | const AArch64InstrInfo *TII; |
98 | | const TargetRegisterInfo *TRI; |
99 | | const AArch64Subtarget *Subtarget; |
100 | | |
101 | | // Track which registers have been modified and used. |
102 | | BitVector ModifiedRegs, UsedRegs; |
103 | | |
104 | 13.7k | void getAnalysisUsage(AnalysisUsage &AU) const override { |
105 | 13.7k | AU.addRequired<AAResultsWrapperPass>(); |
106 | 13.7k | MachineFunctionPass::getAnalysisUsage(AU); |
107 | 13.7k | } |
108 | | |
109 | | // Scan the instructions looking for a load/store that can be combined |
110 | | // with the current instruction into a load/store pair. |
111 | | // Return the matching instruction if one is found, else MBB->end(). |
112 | | MachineBasicBlock::iterator findMatchingInsn(MachineBasicBlock::iterator I, |
113 | | LdStPairFlags &Flags, |
114 | | unsigned Limit, |
115 | | bool FindNarrowMerge); |
116 | | |
117 | | // Scan the instructions looking for a store that writes to the address from |
118 | | // which the current load instruction reads. Return true if one is found. |
119 | | bool findMatchingStore(MachineBasicBlock::iterator I, unsigned Limit, |
120 | | MachineBasicBlock::iterator &StoreI); |
121 | | |
122 | | // Merge the two instructions indicated into a wider narrow store instruction. |
123 | | MachineBasicBlock::iterator |
124 | | mergeNarrowZeroStores(MachineBasicBlock::iterator I, |
125 | | MachineBasicBlock::iterator MergeMI, |
126 | | const LdStPairFlags &Flags); |
127 | | |
128 | | // Merge the two instructions indicated into a single pair-wise instruction. |
129 | | MachineBasicBlock::iterator |
130 | | mergePairedInsns(MachineBasicBlock::iterator I, |
131 | | MachineBasicBlock::iterator Paired, |
132 | | const LdStPairFlags &Flags); |
133 | | |
134 | | // Promote the load that reads directly from the address stored to. |
135 | | MachineBasicBlock::iterator |
136 | | promoteLoadFromStore(MachineBasicBlock::iterator LoadI, |
137 | | MachineBasicBlock::iterator StoreI); |
138 | | |
139 | | // Scan the instruction list to find a base register update that can |
140 | | // be combined with the current instruction (a load or store) using |
141 | | // pre or post indexed addressing with writeback. Scan forwards. |
142 | | MachineBasicBlock::iterator |
143 | | findMatchingUpdateInsnForward(MachineBasicBlock::iterator I, |
144 | | int UnscaledOffset, unsigned Limit); |
145 | | |
146 | | // Scan the instruction list to find a base register update that can |
147 | | // be combined with the current instruction (a load or store) using |
148 | | // pre or post indexed addressing with writeback. Scan backwards. |
149 | | MachineBasicBlock::iterator |
150 | | findMatchingUpdateInsnBackward(MachineBasicBlock::iterator I, unsigned Limit); |
151 | | |
152 | | // Find an instruction that updates the base register of the ld/st |
153 | | // instruction. |
154 | | bool isMatchingUpdateInsn(MachineInstr &MemMI, MachineInstr &MI, |
155 | | unsigned BaseReg, int Offset); |
156 | | |
157 | | // Merge a pre- or post-index base register update into a ld/st instruction. |
158 | | MachineBasicBlock::iterator |
159 | | mergeUpdateInsn(MachineBasicBlock::iterator I, |
160 | | MachineBasicBlock::iterator Update, bool IsPreIdx); |
161 | | |
162 | | // Find and merge zero store instructions. |
163 | | bool tryToMergeZeroStInst(MachineBasicBlock::iterator &MBBI); |
164 | | |
165 | | // Find and pair ldr/str instructions. |
166 | | bool tryToPairLdStInst(MachineBasicBlock::iterator &MBBI); |
167 | | |
168 | | // Find and promote load instructions which read directly from store. |
169 | | bool tryToPromoteLoadFromStore(MachineBasicBlock::iterator &MBBI); |
170 | | |
171 | | bool optimizeBlock(MachineBasicBlock &MBB, bool EnableNarrowZeroStOpt); |
172 | | |
173 | | bool runOnMachineFunction(MachineFunction &Fn) override; |
174 | | |
175 | 13.7k | MachineFunctionProperties getRequiredProperties() const override { |
176 | 13.7k | return MachineFunctionProperties().set( |
177 | 13.7k | MachineFunctionProperties::Property::NoVRegs); |
178 | 13.7k | } |
179 | | |
180 | 13.7k | StringRef getPassName() const override { return 13.7k AARCH64_LOAD_STORE_OPT_NAME13.7k ; } |
181 | | }; |
182 | | |
183 | | char AArch64LoadStoreOpt::ID = 0; |
184 | | |
185 | | } // end anonymous namespace |
186 | | |
187 | | INITIALIZE_PASS(AArch64LoadStoreOpt, "aarch64-ldst-opt", |
188 | | AARCH64_LOAD_STORE_OPT_NAME, false, false) |
189 | | |
190 | 28.6M | static bool isNarrowStore(unsigned Opc) { |
191 | 28.6M | switch (Opc) { |
192 | 28.0M | default: |
193 | 28.0M | return false; |
194 | 591k | case AArch64::STRBBui: |
195 | 591k | case AArch64::STURBBi: |
196 | 591k | case AArch64::STRHHui: |
197 | 591k | case AArch64::STURHHi: |
198 | 591k | return true; |
199 | 0 | } |
200 | 0 | } |
201 | | |
202 | | // Scaling factor for unscaled load or store. |
203 | 14.5M | static int getMemScale(MachineInstr &MI) { |
204 | 14.5M | switch (MI.getOpcode()) { |
205 | 0 | default: |
206 | 0 | llvm_unreachable("Opcode has unknown scale!"); |
207 | 1.55M | case AArch64::LDRBBui: |
208 | 1.55M | case AArch64::LDURBBi: |
209 | 1.55M | case AArch64::LDRSBWui: |
210 | 1.55M | case AArch64::LDURSBWi: |
211 | 1.55M | case AArch64::STRBBui: |
212 | 1.55M | case AArch64::STURBBi: |
213 | 1.55M | return 1; |
214 | 259k | case AArch64::LDRHHui: |
215 | 259k | case AArch64::LDURHHi: |
216 | 259k | case AArch64::LDRSHWui: |
217 | 259k | case AArch64::LDURSHWi: |
218 | 259k | case AArch64::STRHHui: |
219 | 259k | case AArch64::STURHHi: |
220 | 259k | return 2; |
221 | 2.56M | case AArch64::LDRSui: |
222 | 2.56M | case AArch64::LDURSi: |
223 | 2.56M | case AArch64::LDRSWui: |
224 | 2.56M | case AArch64::LDURSWi: |
225 | 2.56M | case AArch64::LDRWui: |
226 | 2.56M | case AArch64::LDURWi: |
227 | 2.56M | case AArch64::STRSui: |
228 | 2.56M | case AArch64::STURSi: |
229 | 2.56M | case AArch64::STRWui: |
230 | 2.56M | case AArch64::STURWi: |
231 | 2.56M | case AArch64::LDPSi: |
232 | 2.56M | case AArch64::LDPSWi: |
233 | 2.56M | case AArch64::LDPWi: |
234 | 2.56M | case AArch64::STPSi: |
235 | 2.56M | case AArch64::STPWi: |
236 | 2.56M | return 4; |
237 | 9.99M | case AArch64::LDRDui: |
238 | 9.99M | case AArch64::LDURDi: |
239 | 9.99M | case AArch64::LDRXui: |
240 | 9.99M | case AArch64::LDURXi: |
241 | 9.99M | case AArch64::STRDui: |
242 | 9.99M | case AArch64::STURDi: |
243 | 9.99M | case AArch64::STRXui: |
244 | 9.99M | case AArch64::STURXi: |
245 | 9.99M | case AArch64::LDPDi: |
246 | 9.99M | case AArch64::LDPXi: |
247 | 9.99M | case AArch64::STPDi: |
248 | 9.99M | case AArch64::STPXi: |
249 | 9.99M | return 8; |
250 | 202k | case AArch64::LDRQui: |
251 | 202k | case AArch64::LDURQi: |
252 | 202k | case AArch64::STRQui: |
253 | 202k | case AArch64::STURQi: |
254 | 202k | case AArch64::LDPQi: |
255 | 202k | case AArch64::STPQi: |
256 | 202k | return 16; |
257 | 0 | } |
258 | 0 | } |
259 | | |
260 | | static unsigned getMatchingNonSExtOpcode(unsigned Opc, |
261 | 12.2M | bool *IsValidLdStrOpc = nullptr) { |
262 | 12.2M | if (IsValidLdStrOpc) |
263 | 12.2M | *IsValidLdStrOpc = true; |
264 | 12.2M | switch (Opc) { |
265 | 4.35M | default: |
266 | 4.35M | if (IsValidLdStrOpc) |
267 | 4.35M | *IsValidLdStrOpc = false; |
268 | 4.35M | return std::numeric_limits<unsigned>::max(); |
269 | 7.64M | case AArch64::STRDui: |
270 | 7.64M | case AArch64::STURDi: |
271 | 7.64M | case AArch64::STRQui: |
272 | 7.64M | case AArch64::STURQi: |
273 | 7.64M | case AArch64::STRBBui: |
274 | 7.64M | case AArch64::STURBBi: |
275 | 7.64M | case AArch64::STRHHui: |
276 | 7.64M | case AArch64::STURHHi: |
277 | 7.64M | case AArch64::STRWui: |
278 | 7.64M | case AArch64::STURWi: |
279 | 7.64M | case AArch64::STRXui: |
280 | 7.64M | case AArch64::STURXi: |
281 | 7.64M | case AArch64::LDRDui: |
282 | 7.64M | case AArch64::LDURDi: |
283 | 7.64M | case AArch64::LDRQui: |
284 | 7.64M | case AArch64::LDURQi: |
285 | 7.64M | case AArch64::LDRWui: |
286 | 7.64M | case AArch64::LDURWi: |
287 | 7.64M | case AArch64::LDRXui: |
288 | 7.64M | case AArch64::LDURXi: |
289 | 7.64M | case AArch64::STRSui: |
290 | 7.64M | case AArch64::STURSi: |
291 | 7.64M | case AArch64::LDRSui: |
292 | 7.64M | case AArch64::LDURSi: |
293 | 7.64M | return Opc; |
294 | 288k | case AArch64::LDRSWui: |
295 | 288k | return AArch64::LDRWui; |
296 | 5.79k | case AArch64::LDURSWi: |
297 | 5.79k | return AArch64::LDURWi; |
298 | 0 | } |
299 | 0 | } |
300 | | |
301 | 6.37k | static unsigned getMatchingWideOpcode(unsigned Opc) { |
302 | 6.37k | switch (Opc) { |
303 | 0 | default: |
304 | 0 | llvm_unreachable("Opcode has no wide equivalent!"); |
305 | 2 | case AArch64::STRBBui: |
306 | 2 | return AArch64::STRHHui; |
307 | 3 | case AArch64::STRHHui: |
308 | 3 | return AArch64::STRWui; |
309 | 0 | case AArch64::STURBBi: |
310 | 0 | return AArch64::STURHHi; |
311 | 3 | case AArch64::STURHHi: |
312 | 3 | return AArch64::STURWi; |
313 | 87 | case AArch64::STURWi: |
314 | 87 | return AArch64::STURXi; |
315 | 6.27k | case AArch64::STRWui: |
316 | 6.27k | return AArch64::STRXui; |
317 | 0 | } |
318 | 0 | } |
319 | | |
320 | 737k | static unsigned getMatchingPairOpcode(unsigned Opc) { |
321 | 737k | switch (Opc) { |
322 | 0 | default: |
323 | 0 | llvm_unreachable("Opcode has no pairwise equivalent!"); |
324 | 1.80k | case AArch64::STRSui: |
325 | 1.80k | case AArch64::STURSi: |
326 | 1.80k | return AArch64::STPSi; |
327 | 110k | case AArch64::STRDui: |
328 | 110k | case AArch64::STURDi: |
329 | 110k | return AArch64::STPDi; |
330 | 9.85k | case AArch64::STRQui: |
331 | 9.85k | case AArch64::STURQi: |
332 | 9.85k | return AArch64::STPQi; |
333 | 162k | case AArch64::STRWui: |
334 | 162k | case AArch64::STURWi: |
335 | 162k | return AArch64::STPWi; |
336 | 182k | case AArch64::STRXui: |
337 | 182k | case AArch64::STURXi: |
338 | 182k | return AArch64::STPXi; |
339 | 4.22k | case AArch64::LDRSui: |
340 | 4.22k | case AArch64::LDURSi: |
341 | 4.22k | return AArch64::LDPSi; |
342 | 19.0k | case AArch64::LDRDui: |
343 | 19.0k | case AArch64::LDURDi: |
344 | 19.0k | return AArch64::LDPDi; |
345 | 16.5k | case AArch64::LDRQui: |
346 | 16.5k | case AArch64::LDURQi: |
347 | 16.5k | return AArch64::LDPQi; |
348 | 88.8k | case AArch64::LDRWui: |
349 | 88.8k | case AArch64::LDURWi: |
350 | 88.8k | return AArch64::LDPWi; |
351 | 127k | case AArch64::LDRXui: |
352 | 127k | case AArch64::LDURXi: |
353 | 127k | return AArch64::LDPXi; |
354 | 14.3k | case AArch64::LDRSWui: |
355 | 14.3k | case AArch64::LDURSWi: |
356 | 14.3k | return AArch64::LDPSWi; |
357 | 0 | } |
358 | 0 | } |
359 | | |
360 | | static unsigned isMatchingStore(MachineInstr &LoadInst, |
361 | 468k | MachineInstr &StoreInst) { |
362 | 468k | unsigned LdOpc = LoadInst.getOpcode(); |
363 | 468k | unsigned StOpc = StoreInst.getOpcode(); |
364 | 468k | switch (LdOpc) { |
365 | 0 | default: |
366 | 0 | llvm_unreachable("Unsupported load instruction!"); |
367 | 27.9k | case AArch64::LDRBBui: |
368 | 20.0k | return StOpc == AArch64::STRBBui || StOpc == AArch64::STRHHui || |
369 | 27.9k | StOpc == AArch64::STRWui18.0k || StOpc == AArch64::STRXui9.93k ; |
370 | 897 | case AArch64::LDURBBi: |
371 | 690 | return StOpc == AArch64::STURBBi || StOpc == AArch64::STURHHi || |
372 | 897 | StOpc == AArch64::STURWi680 || StOpc == AArch64::STURXi638 ; |
373 | 11.1k | case AArch64::LDRHHui: |
374 | 7.30k | return StOpc == AArch64::STRHHui || StOpc == AArch64::STRWui || |
375 | 5.17k | StOpc == AArch64::STRXui; |
376 | 2.87k | case AArch64::LDURHHi: |
377 | 2.84k | return StOpc == AArch64::STURHHi || StOpc == AArch64::STURWi || |
378 | 2.77k | StOpc == AArch64::STURXi; |
379 | 116k | case AArch64::LDRWui: |
380 | 69.6k | return StOpc == AArch64::STRWui || StOpc == AArch64::STRXui; |
381 | 4.54k | case AArch64::LDURWi: |
382 | 3.18k | return StOpc == AArch64::STURWi || StOpc == AArch64::STURXi; |
383 | 300k | case AArch64::LDRXui: |
384 | 300k | return StOpc == AArch64::STRXui; |
385 | 4.48k | case AArch64::LDURXi: |
386 | 4.48k | return StOpc == AArch64::STURXi; |
387 | 0 | } |
388 | 0 | } |
389 | | |
390 | 348 | static unsigned getPreIndexedOpcode(unsigned Opc) { |
391 | 348 | // FIXME: We don't currently support creating pre-indexed loads/stores when |
392 | 348 | // the load or store is the unscaled version. If we decide to perform such an |
393 | 348 | // optimization in the future the cases for the unscaled loads/stores will |
394 | 348 | // need to be added here. |
395 | 348 | switch (Opc) { |
396 | 0 | default: |
397 | 0 | llvm_unreachable("Opcode has no pre-indexed equivalent!"); |
398 | 6 | case AArch64::STRSui: |
399 | 6 | return AArch64::STRSpre; |
400 | 8 | case AArch64::STRDui: |
401 | 8 | return AArch64::STRDpre; |
402 | 28 | case AArch64::STRQui: |
403 | 28 | return AArch64::STRQpre; |
404 | 6 | case AArch64::STRBBui: |
405 | 6 | return AArch64::STRBBpre; |
406 | 4 | case AArch64::STRHHui: |
407 | 4 | return AArch64::STRHHpre; |
408 | 14 | case AArch64::STRWui: |
409 | 14 | return AArch64::STRWpre; |
410 | 53 | case AArch64::STRXui: |
411 | 53 | return AArch64::STRXpre; |
412 | 14 | case AArch64::LDRSui: |
413 | 14 | return AArch64::LDRSpre; |
414 | 6 | case AArch64::LDRDui: |
415 | 6 | return AArch64::LDRDpre; |
416 | 6 | case AArch64::LDRQui: |
417 | 6 | return AArch64::LDRQpre; |
418 | 28 | case AArch64::LDRBBui: |
419 | 28 | return AArch64::LDRBBpre; |
420 | 2 | case AArch64::LDRHHui: |
421 | 2 | return AArch64::LDRHHpre; |
422 | 6 | case AArch64::LDRWui: |
423 | 6 | return AArch64::LDRWpre; |
424 | 121 | case AArch64::LDRXui: |
425 | 121 | return AArch64::LDRXpre; |
426 | 0 | case AArch64::LDRSWui: |
427 | 0 | return AArch64::LDRSWpre; |
428 | 0 | case AArch64::LDPSi: |
429 | 0 | return AArch64::LDPSpre; |
430 | 0 | case AArch64::LDPSWi: |
431 | 0 | return AArch64::LDPSWpre; |
432 | 0 | case AArch64::LDPDi: |
433 | 0 | return AArch64::LDPDpre; |
434 | 0 | case AArch64::LDPQi: |
435 | 0 | return AArch64::LDPQpre; |
436 | 2 | case AArch64::LDPWi: |
437 | 2 | return AArch64::LDPWpre; |
438 | 0 | case AArch64::LDPXi: |
439 | 0 | return AArch64::LDPXpre; |
440 | 0 | case AArch64::STPSi: |
441 | 0 | return AArch64::STPSpre; |
442 | 0 | case AArch64::STPDi: |
443 | 0 | return AArch64::STPDpre; |
444 | 1 | case AArch64::STPQi: |
445 | 1 | return AArch64::STPQpre; |
446 | 4 | case AArch64::STPWi: |
447 | 4 | return AArch64::STPWpre; |
448 | 39 | case AArch64::STPXi: |
449 | 39 | return AArch64::STPXpre; |
450 | 0 | } |
451 | 0 | } |
452 | | |
453 | 3.13k | static unsigned getPostIndexedOpcode(unsigned Opc) { |
454 | 3.13k | switch (Opc) { |
455 | 0 | default: |
456 | 0 | llvm_unreachable("Opcode has no post-indexed wise equivalent!"); |
457 | 49 | case AArch64::STRSui: |
458 | 49 | case AArch64::STURSi: |
459 | 49 | return AArch64::STRSpost; |
460 | 247 | case AArch64::STRDui: |
461 | 247 | case AArch64::STURDi: |
462 | 247 | return AArch64::STRDpost; |
463 | 33 | case AArch64::STRQui: |
464 | 33 | case AArch64::STURQi: |
465 | 33 | return AArch64::STRQpost; |
466 | 54 | case AArch64::STRBBui: |
467 | 54 | return AArch64::STRBBpost; |
468 | 78 | case AArch64::STRHHui: |
469 | 78 | return AArch64::STRHHpost; |
470 | 243 | case AArch64::STRWui: |
471 | 243 | case AArch64::STURWi: |
472 | 243 | return AArch64::STRWpost; |
473 | 204 | case AArch64::STRXui: |
474 | 204 | case AArch64::STURXi: |
475 | 204 | return AArch64::STRXpost; |
476 | 67 | case AArch64::LDRSui: |
477 | 67 | case AArch64::LDURSi: |
478 | 67 | return AArch64::LDRSpost; |
479 | 326 | case AArch64::LDRDui: |
480 | 326 | case AArch64::LDURDi: |
481 | 326 | return AArch64::LDRDpost; |
482 | 117 | case AArch64::LDRQui: |
483 | 117 | case AArch64::LDURQi: |
484 | 117 | return AArch64::LDRQpost; |
485 | 350 | case AArch64::LDRBBui: |
486 | 350 | return AArch64::LDRBBpost; |
487 | 370 | case AArch64::LDRHHui: |
488 | 370 | return AArch64::LDRHHpost; |
489 | 162 | case AArch64::LDRWui: |
490 | 162 | case AArch64::LDURWi: |
491 | 162 | return AArch64::LDRWpost; |
492 | 47 | case AArch64::LDRXui: |
493 | 47 | case AArch64::LDURXi: |
494 | 47 | return AArch64::LDRXpost; |
495 | 0 | case AArch64::LDRSWui: |
496 | 0 | return AArch64::LDRSWpost; |
497 | 14 | case AArch64::LDPSi: |
498 | 14 | return AArch64::LDPSpost; |
499 | 1 | case AArch64::LDPSWi: |
500 | 1 | return AArch64::LDPSWpost; |
501 | 4 | case AArch64::LDPDi: |
502 | 4 | return AArch64::LDPDpost; |
503 | 102 | case AArch64::LDPQi: |
504 | 102 | return AArch64::LDPQpost; |
505 | 72 | case AArch64::LDPWi: |
506 | 72 | return AArch64::LDPWpost; |
507 | 6 | case AArch64::LDPXi: |
508 | 6 | return AArch64::LDPXpost; |
509 | 6 | case AArch64::STPSi: |
510 | 6 | return AArch64::STPSpost; |
511 | 476 | case AArch64::STPDi: |
512 | 476 | return AArch64::STPDpost; |
513 | 18 | case AArch64::STPQi: |
514 | 18 | return AArch64::STPQpost; |
515 | 18 | case AArch64::STPWi: |
516 | 18 | return AArch64::STPWpost; |
517 | 71 | case AArch64::STPXi: |
518 | 71 | return AArch64::STPXpost; |
519 | 0 | } |
520 | 0 | } |
521 | | |
522 | 73.9M | static bool isPairedLdSt(const MachineInstr &MI) { |
523 | 73.9M | switch (MI.getOpcode()) { |
524 | 53.8M | default: |
525 | 53.8M | return false; |
526 | 20.1M | case AArch64::LDPSi: |
527 | 20.1M | case AArch64::LDPSWi: |
528 | 20.1M | case AArch64::LDPDi: |
529 | 20.1M | case AArch64::LDPQi: |
530 | 20.1M | case AArch64::LDPWi: |
531 | 20.1M | case AArch64::LDPXi: |
532 | 20.1M | case AArch64::STPSi: |
533 | 20.1M | case AArch64::STPDi: |
534 | 20.1M | case AArch64::STPQi: |
535 | 20.1M | case AArch64::STPWi: |
536 | 20.1M | case AArch64::STPXi: |
537 | 20.1M | return true; |
538 | 0 | } |
539 | 0 | } |
540 | | |
541 | | static const MachineOperand &getLdStRegOp(const MachineInstr &MI, |
542 | 12.9M | unsigned PairedRegOp = 0) { |
543 | 12.9M | assert(PairedRegOp < 2 && "Unexpected register operand idx."); |
544 | 12.9M | unsigned Idx = isPairedLdSt(MI) ? PairedRegOp3.83M : 09.14M ; |
545 | 12.9M | return MI.getOperand(Idx); |
546 | 12.9M | } |
547 | | |
548 | 20.2M | static const MachineOperand &getLdStBaseOp(const MachineInstr &MI) { |
549 | 20.2M | unsigned Idx = isPairedLdSt(MI) ? 25.36M : 114.8M ; |
550 | 20.2M | return MI.getOperand(Idx); |
551 | 20.2M | } |
552 | | |
553 | 34.6M | static const MachineOperand &getLdStOffsetOp(const MachineInstr &MI) { |
554 | 34.6M | unsigned Idx = isPairedLdSt(MI) ? 38.94M : 225.6M ; |
555 | 34.6M | return MI.getOperand(Idx); |
556 | 34.6M | } |
557 | | |
558 | | static bool isLdOffsetInRangeOfSt(MachineInstr &LoadInst, |
559 | | MachineInstr &StoreInst, |
560 | 66.5k | const AArch64InstrInfo *TII) { |
561 | 66.5k | assert(isMatchingStore(LoadInst, StoreInst) && "Expect only matched ld/st."); |
562 | 66.5k | int LoadSize = getMemScale(LoadInst); |
563 | 66.5k | int StoreSize = getMemScale(StoreInst); |
564 | 66.5k | int UnscaledStOffset = TII->isUnscaledLdSt(StoreInst) |
565 | 2.32k | ? getLdStOffsetOp(StoreInst).getImm() |
566 | 64.2k | : getLdStOffsetOp(StoreInst).getImm() * StoreSize; |
567 | 66.5k | int UnscaledLdOffset = TII->isUnscaledLdSt(LoadInst) |
568 | 2.32k | ? getLdStOffsetOp(LoadInst).getImm() |
569 | 64.2k | : getLdStOffsetOp(LoadInst).getImm() * LoadSize; |
570 | 66.5k | return (UnscaledStOffset <= UnscaledLdOffset) && |
571 | 41.0k | (UnscaledLdOffset + LoadSize <= (UnscaledStOffset + StoreSize)); |
572 | 66.5k | } |
573 | | |
574 | 25.9M | static bool isPromotableZeroStoreInst(MachineInstr &MI) { |
575 | 25.9M | unsigned Opc = MI.getOpcode(); |
576 | 25.3M | return (Opc == AArch64::STRWui || Opc == AArch64::STURWi || |
577 | 25.3M | isNarrowStore(Opc)) && |
578 | 1.09M | getLdStRegOp(MI).getReg() == AArch64::WZR; |
579 | 25.9M | } |
580 | | |
581 | | MachineBasicBlock::iterator |
582 | | AArch64LoadStoreOpt::mergeNarrowZeroStores(MachineBasicBlock::iterator I, |
583 | | MachineBasicBlock::iterator MergeMI, |
584 | 6.37k | const LdStPairFlags &Flags) { |
585 | 6.37k | assert(isPromotableZeroStoreInst(*I) && isPromotableZeroStoreInst(*MergeMI) && |
586 | 6.37k | "Expected promotable zero stores."); |
587 | 6.37k | |
588 | 6.37k | MachineBasicBlock::iterator NextI = I; |
589 | 6.37k | ++NextI; |
590 | 6.37k | // If NextI is the second of the two instructions to be merged, we need |
591 | 6.37k | // to skip one further. Either way we merge will invalidate the iterator, |
592 | 6.37k | // and we don't need to scan the new instruction, as it's a pairwise |
593 | 6.37k | // instruction, which we're not considering for further action anyway. |
594 | 6.37k | if (NextI == MergeMI) |
595 | 6.29k | ++NextI; |
596 | 6.37k | |
597 | 6.37k | unsigned Opc = I->getOpcode(); |
598 | 6.37k | bool IsScaled = !TII->isUnscaledLdSt(Opc); |
599 | 6.37k | int OffsetStride = IsScaled ? 16.28k : getMemScale(*I)90 ; |
600 | 6.37k | |
601 | 6.37k | bool MergeForward = Flags.getMergeForward(); |
602 | 6.37k | // Insert our new paired instruction after whichever of the paired |
603 | 6.37k | // instructions MergeForward indicates. |
604 | 6.37k | MachineBasicBlock::iterator InsertionPoint = MergeForward ? MergeMI0 : I6.37k ; |
605 | 6.37k | // Also based on MergeForward is from where we copy the base register operand |
606 | 6.37k | // so we get the flags compatible with the input code. |
607 | 6.37k | const MachineOperand &BaseRegOp = |
608 | 6.37k | MergeForward ? getLdStBaseOp(*MergeMI)0 : getLdStBaseOp(*I)6.37k ; |
609 | 6.37k | |
610 | 6.37k | // Which register is Rt and which is Rt2 depends on the offset order. |
611 | 6.37k | MachineInstr *RtMI; |
612 | 6.37k | if (getLdStOffsetOp(*I).getImm() == |
613 | 6.37k | getLdStOffsetOp(*MergeMI).getImm() + OffsetStride) |
614 | 2.95k | RtMI = &*MergeMI; |
615 | 6.37k | else |
616 | 3.41k | RtMI = &*I; |
617 | 6.37k | |
618 | 6.37k | int OffsetImm = getLdStOffsetOp(*RtMI).getImm(); |
619 | 6.37k | // Change the scaled offset from small to large type. |
620 | 6.37k | if (IsScaled6.37k ) { |
621 | 6.28k | assert(((OffsetImm & 1) == 0) && "Unexpected offset to merge"); |
622 | 6.28k | OffsetImm /= 2; |
623 | 6.28k | } |
624 | 6.37k | |
625 | 6.37k | // Construct the new instruction. |
626 | 6.37k | DebugLoc DL = I->getDebugLoc(); |
627 | 6.37k | MachineBasicBlock *MBB = I->getParent(); |
628 | 6.37k | MachineInstrBuilder MIB; |
629 | 6.37k | MIB = BuildMI(*MBB, InsertionPoint, DL, TII->get(getMatchingWideOpcode(Opc))) |
630 | 6.37k | .addReg(isNarrowStore(Opc) ? AArch64::WZR8 : AArch64::XZR6.36k ) |
631 | 6.37k | .add(BaseRegOp) |
632 | 6.37k | .addImm(OffsetImm) |
633 | 6.37k | .setMemRefs(I->mergeMemRefsWith(*MergeMI)); |
634 | 6.37k | (void)MIB; |
635 | 6.37k | |
636 | 6.37k | DEBUG(dbgs() << "Creating wider store. Replacing instructions:\n "); |
637 | 6.37k | DEBUG(I->print(dbgs())); |
638 | 6.37k | DEBUG(dbgs() << " "); |
639 | 6.37k | DEBUG(MergeMI->print(dbgs())); |
640 | 6.37k | DEBUG(dbgs() << " with instruction:\n "); |
641 | 6.37k | DEBUG(((MachineInstr *)MIB)->print(dbgs())); |
642 | 6.37k | DEBUG(dbgs() << "\n"); |
643 | 6.37k | |
644 | 6.37k | // Erase the old instructions. |
645 | 6.37k | I->eraseFromParent(); |
646 | 6.37k | MergeMI->eraseFromParent(); |
647 | 6.37k | return NextI; |
648 | 6.37k | } |
649 | | |
650 | | MachineBasicBlock::iterator |
651 | | AArch64LoadStoreOpt::mergePairedInsns(MachineBasicBlock::iterator I, |
652 | | MachineBasicBlock::iterator Paired, |
653 | 354k | const LdStPairFlags &Flags) { |
654 | 354k | MachineBasicBlock::iterator NextI = I; |
655 | 354k | ++NextI; |
656 | 354k | // If NextI is the second of the two instructions to be merged, we need |
657 | 354k | // to skip one further. Either way we merge will invalidate the iterator, |
658 | 354k | // and we don't need to scan the new instruction, as it's a pairwise |
659 | 354k | // instruction, which we're not considering for further action anyway. |
660 | 354k | if (NextI == Paired) |
661 | 307k | ++NextI; |
662 | 354k | |
663 | 354k | int SExtIdx = Flags.getSExtIdx(); |
664 | 354k | unsigned Opc = |
665 | 354k | SExtIdx == -1 ? I->getOpcode()342k : getMatchingNonSExtOpcode(I->getOpcode())11.7k ; |
666 | 354k | bool IsUnscaled = TII->isUnscaledLdSt(Opc); |
667 | 354k | int OffsetStride = IsUnscaled ? getMemScale(*I)34.2k : 1320k ; |
668 | 354k | |
669 | 354k | bool MergeForward = Flags.getMergeForward(); |
670 | 354k | // Insert our new paired instruction after whichever of the paired |
671 | 354k | // instructions MergeForward indicates. |
672 | 354k | MachineBasicBlock::iterator InsertionPoint = MergeForward ? Paired15.7k : I338k ; |
673 | 354k | // Also based on MergeForward is from where we copy the base register operand |
674 | 354k | // so we get the flags compatible with the input code. |
675 | 354k | const MachineOperand &BaseRegOp = |
676 | 354k | MergeForward ? getLdStBaseOp(*Paired)15.7k : getLdStBaseOp(*I)338k ; |
677 | 354k | |
678 | 354k | int Offset = getLdStOffsetOp(*I).getImm(); |
679 | 354k | int PairedOffset = getLdStOffsetOp(*Paired).getImm(); |
680 | 354k | bool PairedIsUnscaled = TII->isUnscaledLdSt(Paired->getOpcode()); |
681 | 354k | if (IsUnscaled != PairedIsUnscaled354k ) { |
682 | 19.0k | // We're trying to pair instructions that differ in how they are scaled. If |
683 | 19.0k | // I is scaled then scale the offset of Paired accordingly. Otherwise, do |
684 | 19.0k | // the opposite (i.e., make Paired's offset unscaled). |
685 | 19.0k | int MemSize = getMemScale(*Paired); |
686 | 19.0k | if (PairedIsUnscaled19.0k ) { |
687 | 919 | // If the unscaled offset isn't a multiple of the MemSize, we can't |
688 | 919 | // pair the operations together. |
689 | 919 | assert(!(PairedOffset % getMemScale(*Paired)) && |
690 | 919 | "Offset should be a multiple of the stride!"); |
691 | 919 | PairedOffset /= MemSize; |
692 | 19.0k | } else { |
693 | 18.1k | PairedOffset *= MemSize; |
694 | 18.1k | } |
695 | 19.0k | } |
696 | 354k | |
697 | 354k | // Which register is Rt and which is Rt2 depends on the offset order. |
698 | 354k | MachineInstr *RtMI, *Rt2MI; |
699 | 354k | if (Offset == PairedOffset + OffsetStride354k ) { |
700 | 49.0k | RtMI = &*Paired; |
701 | 49.0k | Rt2MI = &*I; |
702 | 49.0k | // Here we swapped the assumption made for SExtIdx. |
703 | 49.0k | // I.e., we turn ldp I, Paired into ldp Paired, I. |
704 | 49.0k | // Update the index accordingly. |
705 | 49.0k | if (SExtIdx != -1) |
706 | 32 | SExtIdx = (SExtIdx + 1) % 2; |
707 | 354k | } else { |
708 | 305k | RtMI = &*I; |
709 | 305k | Rt2MI = &*Paired; |
710 | 305k | } |
711 | 354k | int OffsetImm = getLdStOffsetOp(*RtMI).getImm(); |
712 | 354k | // Scale the immediate offset, if necessary. |
713 | 354k | if (TII->isUnscaledLdSt(RtMI->getOpcode())354k ) { |
714 | 35.1k | assert(!(OffsetImm % getMemScale(*RtMI)) && |
715 | 35.1k | "Unscaled offset cannot be scaled."); |
716 | 35.1k | OffsetImm /= getMemScale(*RtMI); |
717 | 35.1k | } |
718 | 354k | |
719 | 354k | // Construct the new instruction. |
720 | 354k | MachineInstrBuilder MIB; |
721 | 354k | DebugLoc DL = I->getDebugLoc(); |
722 | 354k | MachineBasicBlock *MBB = I->getParent(); |
723 | 354k | MachineOperand RegOp0 = getLdStRegOp(*RtMI); |
724 | 354k | MachineOperand RegOp1 = getLdStRegOp(*Rt2MI); |
725 | 354k | // Kill flags may become invalid when moving stores for pairing. |
726 | 354k | if (RegOp0.isUse()354k ) { |
727 | 193k | if (!MergeForward193k ) { |
728 | 178k | // Clear kill flags on store if moving upwards. Example: |
729 | 178k | // STRWui %w0, ... |
730 | 178k | // USE %w1 |
731 | 178k | // STRWui kill %w1 ; need to clear kill flag when moving STRWui upwards |
732 | 178k | RegOp0.setIsKill(false); |
733 | 178k | RegOp1.setIsKill(false); |
734 | 193k | } else { |
735 | 15.0k | // Clear kill flags of the first stores register. Example: |
736 | 15.0k | // STRWui %w1, ... |
737 | 15.0k | // USE kill %w1 ; need to clear kill flag when moving STRWui downwards |
738 | 15.0k | // STRW %w0 |
739 | 15.0k | unsigned Reg = getLdStRegOp(*I).getReg(); |
740 | 15.0k | for (MachineInstr &MI : make_range(std::next(I), Paired)) |
741 | 43.9k | MI.clearRegisterKills(Reg, TRI); |
742 | 15.0k | } |
743 | 193k | } |
744 | 354k | MIB = BuildMI(*MBB, InsertionPoint, DL, TII->get(getMatchingPairOpcode(Opc))) |
745 | 354k | .add(RegOp0) |
746 | 354k | .add(RegOp1) |
747 | 354k | .add(BaseRegOp) |
748 | 354k | .addImm(OffsetImm) |
749 | 354k | .setMemRefs(I->mergeMemRefsWith(*Paired)); |
750 | 354k | |
751 | 354k | (void)MIB; |
752 | 354k | |
753 | 354k | DEBUG(dbgs() << "Creating pair load/store. Replacing instructions:\n "); |
754 | 354k | DEBUG(I->print(dbgs())); |
755 | 354k | DEBUG(dbgs() << " "); |
756 | 354k | DEBUG(Paired->print(dbgs())); |
757 | 354k | DEBUG(dbgs() << " with instruction:\n "); |
758 | 354k | if (SExtIdx != -1354k ) { |
759 | 11.7k | // Generate the sign extension for the proper result of the ldp. |
760 | 11.7k | // I.e., with X1, that would be: |
761 | 11.7k | // %W1<def> = KILL %W1, %X1<imp-def> |
762 | 11.7k | // %X1<def> = SBFMXri %X1<kill>, 0, 31 |
763 | 11.7k | MachineOperand &DstMO = MIB->getOperand(SExtIdx); |
764 | 11.7k | // Right now, DstMO has the extended register, since it comes from an |
765 | 11.7k | // extended opcode. |
766 | 11.7k | unsigned DstRegX = DstMO.getReg(); |
767 | 11.7k | // Get the W variant of that register. |
768 | 11.7k | unsigned DstRegW = TRI->getSubReg(DstRegX, AArch64::sub_32); |
769 | 11.7k | // Update the result of LDP to use the W instead of the X variant. |
770 | 11.7k | DstMO.setReg(DstRegW); |
771 | 11.7k | DEBUG(((MachineInstr *)MIB)->print(dbgs())); |
772 | 11.7k | DEBUG(dbgs() << "\n"); |
773 | 11.7k | // Make the machine verifier happy by providing a definition for |
774 | 11.7k | // the X register. |
775 | 11.7k | // Insert this definition right after the generated LDP, i.e., before |
776 | 11.7k | // InsertionPoint. |
777 | 11.7k | MachineInstrBuilder MIBKill = |
778 | 11.7k | BuildMI(*MBB, InsertionPoint, DL, TII->get(TargetOpcode::KILL), DstRegW) |
779 | 11.7k | .addReg(DstRegW) |
780 | 11.7k | .addReg(DstRegX, RegState::Define); |
781 | 11.7k | MIBKill->getOperand(2).setImplicit(); |
782 | 11.7k | // Create the sign extension. |
783 | 11.7k | MachineInstrBuilder MIBSXTW = |
784 | 11.7k | BuildMI(*MBB, InsertionPoint, DL, TII->get(AArch64::SBFMXri), DstRegX) |
785 | 11.7k | .addReg(DstRegX) |
786 | 11.7k | .addImm(0) |
787 | 11.7k | .addImm(31); |
788 | 11.7k | (void)MIBSXTW; |
789 | 11.7k | DEBUG(dbgs() << " Extend operand:\n "); |
790 | 11.7k | DEBUG(((MachineInstr *)MIBSXTW)->print(dbgs())); |
791 | 354k | } else { |
792 | 342k | DEBUG(((MachineInstr *)MIB)->print(dbgs())); |
793 | 342k | } |
794 | 354k | DEBUG(dbgs() << "\n"); |
795 | 354k | |
796 | 354k | // Erase the old instructions. |
797 | 354k | I->eraseFromParent(); |
798 | 354k | Paired->eraseFromParent(); |
799 | 354k | |
800 | 354k | return NextI; |
801 | 354k | } |
802 | | |
// Forward a just-stored value into a subsequent load from the same address:
// either erase the load entirely (same register, full-width 64-bit overlap)
// or replace it with a register move / bitfield extract of the stored value.
// Returns the iterator following the (now erased) load so the caller can
// resume scanning.
MachineBasicBlock::iterator
AArch64LoadStoreOpt::promoteLoadFromStore(MachineBasicBlock::iterator LoadI,
                                          MachineBasicBlock::iterator StoreI) {
  MachineBasicBlock::iterator NextI = LoadI;
  ++NextI;

  int LoadSize = getMemScale(*LoadI);
  int StoreSize = getMemScale(*StoreI);
  unsigned LdRt = getLdStRegOp(*LoadI).getReg();
  const MachineOperand &StMO = getLdStRegOp(*StoreI);
  unsigned StRt = getLdStRegOp(*StoreI).getReg();
  bool IsStoreXReg = TRI->getRegClass(AArch64::GPR64RegClassID)->contains(StRt);

  assert((IsStoreXReg ||
          TRI->getRegClass(AArch64::GPR32RegClassID)->contains(StRt)) &&
         "Unexpected RegClass");

  MachineInstr *BitExtMI;
  if (LoadSize == StoreSize && (LoadSize == 4 || LoadSize == 8)) {
    // Remove the load, if the destination register of the loads is the same
    // register for stored value.
    if (StRt == LdRt && LoadSize == 8) {
      // The first kill of StRt between the store and the load becomes stale
      // once the load is deleted; clear it so liveness stays consistent.
      for (MachineInstr &MI : make_range(StoreI->getIterator(),
                                         LoadI->getIterator())) {
        if (MI.killsRegister(StRt, TRI)) {
          MI.clearRegisterKills(StRt, TRI);
          break;
        }
      }
      DEBUG(dbgs() << "Remove load instruction:\n    ");
      DEBUG(LoadI->print(dbgs()));
      DEBUG(dbgs() << "\n");
      LoadI->eraseFromParent();
      return NextI;
    }
    // Replace the load with a mov if the load and store are in the same size.
    // ORR Xd, XZR, Xm (resp. Wd/WZR/Wm) is the canonical register move.
    BitExtMI =
        BuildMI(*LoadI->getParent(), LoadI, LoadI->getDebugLoc(),
                TII->get(IsStoreXReg ? AArch64::ORRXrs : AArch64::ORRWrs), LdRt)
            .addReg(IsStoreXReg ? AArch64::XZR : AArch64::WZR)
            .add(StMO)
            .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0));
  } else {
    // FIXME: Currently we disable this transformation in big-endian targets as
    // performance and correctness are verified only in little-endian.
    if (!Subtarget->isLittleEndian())
      return NextI;
    bool IsUnscaled = TII->isUnscaledLdSt(*LoadI);
    assert(IsUnscaled == TII->isUnscaledLdSt(*StoreI) &&
           "Unsupported ld/st match");
    assert(LoadSize <= StoreSize && "Invalid load size");
    // Normalize both offsets to byte units so they can be compared directly.
    int UnscaledLdOffset = IsUnscaled
                               ? getLdStOffsetOp(*LoadI).getImm()
                               : getLdStOffsetOp(*LoadI).getImm() * LoadSize;
    int UnscaledStOffset = IsUnscaled
                               ? getLdStOffsetOp(*StoreI).getImm()
                               : getLdStOffsetOp(*StoreI).getImm() * StoreSize;
    // Bitfield-extract parameters: pull Width bits of the stored value
    // starting at the load's byte offset within the store.
    int Width = LoadSize * 8;
    int Immr = 8 * (UnscaledLdOffset - UnscaledStOffset);
    int Imms = Immr + Width - 1;
    unsigned DestReg = IsStoreXReg
                           ? TRI->getMatchingSuperReg(LdRt, AArch64::sub_32,
                                                      &AArch64::GPR64RegClass)
                           : LdRt;

    assert((UnscaledLdOffset >= UnscaledStOffset &&
            (UnscaledLdOffset + LoadSize) <= UnscaledStOffset + StoreSize) &&
           "Invalid offset");

    Immr = 8 * (UnscaledLdOffset - UnscaledStOffset);
    Imms = Immr + Width - 1;
    if (UnscaledLdOffset == UnscaledStOffset) {
      // Load reads the low bits of the store: an AND with a width mask
      // suffices. Hand-encode the logical-immediate (N:immr:imms) field.
      uint32_t AndMaskEncoded = ((IsStoreXReg ? 1 : 0) << 12) // N
                                | ((Immr) << 6)               // immr
                                | ((Imms) << 0)               // imms
          ;

      BitExtMI =
          BuildMI(*LoadI->getParent(), LoadI, LoadI->getDebugLoc(),
                  TII->get(IsStoreXReg ? AArch64::ANDXri : AArch64::ANDWri),
                  DestReg)
              .add(StMO)
              .addImm(AndMaskEncoded);
    } else {
      // Load reads an interior slice of the store: unsigned bitfield extract.
      BitExtMI =
          BuildMI(*LoadI->getParent(), LoadI, LoadI->getDebugLoc(),
                  TII->get(IsStoreXReg ? AArch64::UBFMXri : AArch64::UBFMWri),
                  DestReg)
              .add(StMO)
              .addImm(Immr)
              .addImm(Imms);
    }
  }

  // Clear kill flags between store and load.
  for (MachineInstr &MI : make_range(StoreI->getIterator(),
                                     BitExtMI->getIterator()))
    if (MI.killsRegister(StRt, TRI)) {
      MI.clearRegisterKills(StRt, TRI);
      break;
    }

  DEBUG(dbgs() << "Promoting load by replacing :\n    ");
  DEBUG(StoreI->print(dbgs()));
  DEBUG(dbgs() << "    ");
  DEBUG(LoadI->print(dbgs()));
  DEBUG(dbgs() << "  with instructions:\n    ");
  DEBUG(StoreI->print(dbgs()));
  DEBUG(dbgs() << "    ");
  DEBUG((BitExtMI)->print(dbgs()));
  DEBUG(dbgs() << "\n");

  // Erase the old instructions.
  LoadI->eraseFromParent();
  return NextI;
}
919 | | |
920 | | /// trackRegDefsUses - Remember what registers the specified instruction uses |
921 | | /// and modifies. |
922 | | static void trackRegDefsUses(const MachineInstr &MI, BitVector &ModifiedRegs, |
923 | | BitVector &UsedRegs, |
924 | 24.5M | const TargetRegisterInfo *TRI) { |
925 | 90.6M | for (const MachineOperand &MO : MI.operands()) { |
926 | 90.6M | if (MO.isRegMask()) |
927 | 1.20M | ModifiedRegs.setBitsNotInMask(MO.getRegMask()); |
928 | 90.6M | |
929 | 90.6M | if (!MO.isReg()) |
930 | 29.0M | continue; |
931 | 61.6M | unsigned Reg = MO.getReg(); |
932 | 61.6M | if (!Reg) |
933 | 2 | continue; |
934 | 61.6M | if (61.6M MO.isDef()61.6M ) { |
935 | 23.5M | // WZR/XZR are not modified even when used as a destination register. |
936 | 23.5M | if (Reg != AArch64::WZR && 23.5M Reg != AArch64::XZR22.7M ) |
937 | 164M | for (MCRegAliasIterator AI(Reg, TRI, true); 22.2M AI.isValid()164M ; ++AI142M ) |
938 | 142M | ModifiedRegs.set(*AI); |
939 | 61.6M | } else { |
940 | 38.0M | assert(MO.isUse() && "Reg operand not a def and not a use?!?"); |
941 | 280M | for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid()280M ; ++AI242M ) |
942 | 242M | UsedRegs.set(*AI); |
943 | 38.0M | } |
944 | 90.6M | } |
945 | 24.5M | } |
946 | | |
// Return true if an element offset fits the signed 7-bit immediate field of a
// paired load/store. For unscaled instructions the incoming byte offset is
// first converted to element units; a byte offset that isn't a multiple of
// the stride can never be expressed, so it is rejected outright.
static bool inBoundsForPair(bool IsUnscaled, int Offset, int OffsetStride) {
  int ElementOffset = Offset;
  if (IsUnscaled) {
    if (ElementOffset % OffsetStride != 0)
      return false;
    ElementOffset /= OffsetStride;
  }
  // Paired instructions encode a signed 7-bit scaled immediate: [-64, 63].
  return ElementOffset >= -64 && ElementOffset <= 63;
}
959 | | |
// Do alignment, specialized to power of 2 and for signed ints,
// avoiding having to do a C-style cast from uint_64t to int when
// using alignTo from include/llvm/Support/MathExtras.h.
// FIXME: Move this function to include/MathExtras.h?
static int alignTo(int Num, int PowOf2) {
  int Mask = PowOf2 - 1;
  return (Num + Mask) & ~Mask;
}
967 | | |
968 | | static bool mayAlias(MachineInstr &MIa, MachineInstr &MIb, |
969 | 544k | AliasAnalysis *AA) { |
970 | 544k | // One of the instructions must modify memory. |
971 | 544k | if (!MIa.mayStore() && 544k !MIb.mayStore()492k ) |
972 | 16.3k | return false; |
973 | 527k | |
974 | 527k | // Both instructions must be memory operations. |
975 | 527k | if (527k !MIa.mayLoadOrStore() && 527k !MIb.mayLoadOrStore()0 ) |
976 | 0 | return false; |
977 | 527k | |
978 | 527k | return MIa.mayAlias(AA, MIb, /*UseTBAA*/false); |
979 | 527k | } |
980 | | |
981 | | static bool mayAlias(MachineInstr &MIa, |
982 | | SmallVectorImpl<MachineInstr *> &MemInsns, |
983 | 383k | AliasAnalysis *AA) { |
984 | 383k | for (MachineInstr *MIb : MemInsns) |
985 | 75.5k | if (75.5k mayAlias(MIa, *MIb, AA)75.5k ) |
986 | 22.6k | return true; |
987 | 361k | |
988 | 361k | return false; |
989 | 361k | } |
990 | | |
// Scan backwards from the load at \p I looking for a store to the same base
// register whose value can be forwarded to the load (see
// promoteLoadFromStore). On success, the store is returned via \p StoreI.
// The walk stops at the block entry, after \p Limit real instructions, at a
// call, when the base register is clobbered, or when an aliasing store is
// found first.
bool AArch64LoadStoreOpt::findMatchingStore(
    MachineBasicBlock::iterator I, unsigned Limit,
    MachineBasicBlock::iterator &StoreI) {
  MachineBasicBlock::iterator B = I->getParent()->begin();
  MachineBasicBlock::iterator MBBI = I;
  MachineInstr &LoadMI = *I;
  unsigned BaseReg = getLdStBaseOp(LoadMI).getReg();

  // If the load is the first instruction in the block, there's obviously
  // not any matching store.
  if (MBBI == B)
    return false;

  // Track which registers have been modified and used between the first insn
  // and the second insn.
  ModifiedRegs.reset();
  UsedRegs.reset();

  unsigned Count = 0;
  do {
    --MBBI;
    MachineInstr &MI = *MBBI;

    // Don't count transient instructions towards the search limit since there
    // may be different numbers of them if e.g. debug information is present.
    if (!MI.isTransient())
      ++Count;

    // If the load instruction reads directly from the address to which the
    // store instruction writes and the stored value is not modified, we can
    // promote the load. Since we do not handle stores with pre-/post-index,
    // it's unnecessary to check if BaseReg is modified by the store itself.
    if (MI.mayStore() && isMatchingStore(LoadMI, MI) &&
        BaseReg == getLdStBaseOp(MI).getReg() &&
        isLdOffsetInRangeOfSt(LoadMI, MI, TII) &&
        !ModifiedRegs[getLdStRegOp(MI).getReg()]) {
      StoreI = MBBI;
      return true;
    }

    // Calls may modify memory arbitrarily; give up.
    if (MI.isCall())
      return false;

    // Update modified / uses register lists.
    trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI);

    // Otherwise, if the base register is modified, we have no match, so
    // return early.
    if (ModifiedRegs[BaseReg])
      return false;

    // If we encounter a store aliased with the load, return early.
    if (MI.mayStore() && mayAlias(LoadMI, MI, AA))
      return false;
  } while (MBBI != B && Count < Limit);
  return false;
}
1048 | | |
// Returns true if FirstMI and MI are candidates for merging or pairing.
// Otherwise, returns false. As a side effect, when a sign-extending opcode is
// matched against its zero-extending counterpart, Flags records (via
// setSExtIdx) which slot of the eventual pair holds the sign-extending
// instruction.
static bool areCandidatesToMergeOrPair(MachineInstr &FirstMI, MachineInstr &MI,
                                       LdStPairFlags &Flags,
                                       const AArch64InstrInfo *TII) {
  // If this is volatile or if pairing is suppressed, not a candidate.
  if (MI.hasOrderedMemoryRef() || TII->isLdStPairSuppressed(MI))
    return false;

  // We should have already checked FirstMI for pair suppression and volatility.
  assert(!FirstMI.hasOrderedMemoryRef() &&
         !TII->isLdStPairSuppressed(FirstMI) &&
         "FirstMI shouldn't get here if either of these checks are true.");

  unsigned OpcA = FirstMI.getOpcode();
  unsigned OpcB = MI.getOpcode();

  // Opcodes match: nothing more to check.
  if (OpcA == OpcB)
    return true;

  // Try to match a sign-extended load/store with a zero-extended load/store.
  bool IsValidLdStrOpc, PairIsValidLdStrOpc;
  unsigned NonSExtOpc = getMatchingNonSExtOpcode(OpcA, &IsValidLdStrOpc);
  assert(IsValidLdStrOpc &&
         "Given Opc should be a Load or Store with an immediate");
  // OpcA will be the first instruction in the pair.
  if (NonSExtOpc == getMatchingNonSExtOpcode(OpcB, &PairIsValidLdStrOpc)) {
    // If OpcA is already the non-sext form, the sext instruction is MI
    // (index 1); otherwise it is FirstMI (index 0).
    Flags.setSExtIdx(NonSExtOpc == (unsigned)OpcA ? 1 : 0);
    return true;
  }

  // If the second instruction isn't even a mergable/pairable load/store, bail
  // out.
  if (!PairIsValidLdStrOpc)
    return false;

  // FIXME: We don't support merging narrow stores with mixed scaled/unscaled
  // offsets.
  if (isNarrowStore(OpcA) || isNarrowStore(OpcB))
    return false;

  // Try to match an unscaled load/store with a scaled load/store.
  return TII->isUnscaledLdSt(OpcA) != TII->isUnscaledLdSt(OpcB) &&
         getMatchingPairOpcode(OpcA) == getMatchingPairOpcode(OpcB);

  // FIXME: Can we also match a mixed sext/zext unscaled/scaled pair?
}
1097 | | |
/// Scan the instructions looking for a load/store that can be combined with the
/// current instruction into a wider equivalent or a load/store pair.
/// Returns an iterator to the matching instruction (and sets the merge
/// direction / sext index in \p Flags), or the block end iterator if no match
/// is found within \p Limit real instructions.
MachineBasicBlock::iterator
AArch64LoadStoreOpt::findMatchingInsn(MachineBasicBlock::iterator I,
                                      LdStPairFlags &Flags, unsigned Limit,
                                      bool FindNarrowMerge) {
  MachineBasicBlock::iterator E = I->getParent()->end();
  MachineBasicBlock::iterator MBBI = I;
  MachineInstr &FirstMI = *I;
  ++MBBI;

  bool MayLoad = FirstMI.mayLoad();
  bool IsUnscaled = TII->isUnscaledLdSt(FirstMI);
  unsigned Reg = getLdStRegOp(FirstMI).getReg();
  unsigned BaseReg = getLdStBaseOp(FirstMI).getReg();
  int Offset = getLdStOffsetOp(FirstMI).getImm();
  // For unscaled instructions adjacency is measured in bytes (the memory
  // access size); for scaled ones the immediate is already in element units.
  int OffsetStride = IsUnscaled ? getMemScale(FirstMI) : 1;
  bool IsPromotableZeroStore = isPromotableZeroStoreInst(FirstMI);

  // Track which registers have been modified and used between the first insn
  // (inclusive) and the second insn.
  ModifiedRegs.reset();
  UsedRegs.reset();

  // Remember any instructions that read/write memory between FirstMI and MI.
  SmallVector<MachineInstr *, 4> MemInsns;

  for (unsigned Count = 0; MBBI != E && Count < Limit; ++MBBI) {
    MachineInstr &MI = *MBBI;

    // Don't count transient instructions towards the search limit since there
    // may be different numbers of them if e.g. debug information is present.
    if (!MI.isTransient())
      ++Count;

    Flags.setSExtIdx(-1);
    if (areCandidatesToMergeOrPair(FirstMI, MI, Flags, TII) &&
        getLdStOffsetOp(MI).isImm()) {
      assert(MI.mayLoadOrStore() && "Expected memory operation.");
      // If we've found another instruction with the same opcode, check to see
      // if the base and offset are compatible with our starting instruction.
      // These instructions all have scaled immediate operands, so we just
      // check for +1/-1. Make sure to check the new instruction offset is
      // actually an immediate and not a symbolic reference destined for
      // a relocation.
      unsigned MIBaseReg = getLdStBaseOp(MI).getReg();
      int MIOffset = getLdStOffsetOp(MI).getImm();
      bool MIIsUnscaled = TII->isUnscaledLdSt(MI);
      if (IsUnscaled != MIIsUnscaled) {
        // We're trying to pair instructions that differ in how they are scaled.
        // If FirstMI is scaled then scale the offset of MI accordingly.
        // Otherwise, do the opposite (i.e., make MI's offset unscaled).
        int MemSize = getMemScale(MI);
        if (MIIsUnscaled) {
          // If the unscaled offset isn't a multiple of the MemSize, we can't
          // pair the operations together: bail and keep looking.
          if (MIOffset % MemSize) {
            trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI);
            MemInsns.push_back(&MI);
            continue;
          }
          MIOffset /= MemSize;
        } else {
          MIOffset *= MemSize;
        }
      }

      if (BaseReg == MIBaseReg && ((Offset == MIOffset + OffsetStride) ||
                                   (Offset + OffsetStride == MIOffset))) {
        int MinOffset = Offset < MIOffset ? Offset : MIOffset;
        if (FindNarrowMerge) {
          // If the alignment requirements of the scaled wide load/store
          // instruction can't express the offset of the scaled narrow input,
          // bail and keep looking. For promotable zero stores, allow only when
          // the stored value is the same (i.e., WZR).
          if ((!IsUnscaled && alignTo(MinOffset, 2) != MinOffset) ||
              (IsPromotableZeroStore && Reg != getLdStRegOp(MI).getReg())) {
            trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI);
            MemInsns.push_back(&MI);
            continue;
          }
        } else {
          // Pairwise instructions have a 7-bit signed offset field. Single
          // insns have a 12-bit unsigned offset field. If the resultant
          // immediate offset of merging these instructions is out of range for
          // a pairwise instruction, bail and keep looking.
          if (!inBoundsForPair(IsUnscaled, MinOffset, OffsetStride)) {
            trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI);
            MemInsns.push_back(&MI);
            continue;
          }
          // If the alignment requirements of the paired (scaled) instruction
          // can't express the offset of the unscaled input, bail and keep
          // looking.
          if (IsUnscaled && (alignTo(MinOffset, OffsetStride) != MinOffset)) {
            trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI);
            MemInsns.push_back(&MI);
            continue;
          }
        }
        // If the destination register of the loads is the same register, bail
        // and keep looking. A load-pair instruction with both destination
        // registers the same is UNPREDICTABLE and will result in an exception.
        if (MayLoad && Reg == getLdStRegOp(MI).getReg()) {
          trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI);
          MemInsns.push_back(&MI);
          continue;
        }

        // If the Rt of the second instruction was not modified or used between
        // the two instructions and none of the instructions between the second
        // and first alias with the second, we can combine the second into the
        // first.
        if (!ModifiedRegs[getLdStRegOp(MI).getReg()] &&
            !(MI.mayLoad() && UsedRegs[getLdStRegOp(MI).getReg()]) &&
            !mayAlias(MI, MemInsns, AA)) {
          Flags.setMergeForward(false);
          return MBBI;
        }

        // Likewise, if the Rt of the first instruction is not modified or used
        // between the two instructions and none of the instructions between the
        // first and the second alias with the first, we can combine the first
        // into the second.
        if (!ModifiedRegs[getLdStRegOp(FirstMI).getReg()] &&
            !(MayLoad && UsedRegs[getLdStRegOp(FirstMI).getReg()]) &&
            !mayAlias(FirstMI, MemInsns, AA)) {
          Flags.setMergeForward(true);
          return MBBI;
        }
        // Unable to combine these instructions due to interference in between.
        // Keep looking.
      }
    }

    // If the instruction wasn't a matching load or store. Stop searching if we
    // encounter a call instruction that might modify memory.
    if (MI.isCall())
      return E;

    // Update modified / uses register lists.
    trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI);

    // Otherwise, if the base register is modified, we have no match, so
    // return early.
    if (ModifiedRegs[BaseReg])
      return E;

    // Update list of instructions that read/write memory.
    if (MI.mayLoadOrStore())
      MemInsns.push_back(&MI);
  }
  return E;
}
1252 | | |
// Fold the base-register add/sub at \p Update into the load/store at \p I,
// producing a single pre-indexed (IsPreIdx) or post-indexed addressing-mode
// instruction. Both original instructions are erased; returns the iterator
// following the merged instruction.
MachineBasicBlock::iterator
AArch64LoadStoreOpt::mergeUpdateInsn(MachineBasicBlock::iterator I,
                                     MachineBasicBlock::iterator Update,
                                     bool IsPreIdx) {
  assert((Update->getOpcode() == AArch64::ADDXri ||
          Update->getOpcode() == AArch64::SUBXri) &&
         "Unexpected base register update instruction to merge!");
  MachineBasicBlock::iterator NextI = I;
  // Return the instruction following the merged instruction, which is
  // the instruction following our unmerged load. Unless that's the add/sub
  // instruction we're merging, in which case it's the one after that.
  if (++NextI == Update)
    ++NextI;

  int Value = Update->getOperand(2).getImm();
  assert(AArch64_AM::getShiftValue(Update->getOperand(3).getImm()) == 0 &&
         "Can't merge 1 << 12 offset into pre-/post-indexed load / store");
  // A SUB is folded as a negative increment.
  if (Update->getOpcode() == AArch64::SUBXri)
    Value = -Value;

  unsigned NewOpc = IsPreIdx ? getPreIndexedOpcode(I->getOpcode())
                             : getPostIndexedOpcode(I->getOpcode());
  MachineInstrBuilder MIB;
  if (!isPairedLdSt(*I)) {
    // Non-paired instruction. First operand of the writeback form is the
    // updated base register (taken from the add/sub's destination).
    MIB = BuildMI(*I->getParent(), I, I->getDebugLoc(), TII->get(NewOpc))
              .add(getLdStRegOp(*Update))
              .add(getLdStRegOp(*I))
              .add(getLdStBaseOp(*I))
              .addImm(Value)
              .setMemRefs(I->memoperands_begin(), I->memoperands_end());
  } else {
    // Paired instruction. The writeback immediate is scaled by the access
    // size, so convert the byte increment to element units.
    int Scale = getMemScale(*I);
    MIB = BuildMI(*I->getParent(), I, I->getDebugLoc(), TII->get(NewOpc))
              .add(getLdStRegOp(*Update))
              .add(getLdStRegOp(*I, 0))
              .add(getLdStRegOp(*I, 1))
              .add(getLdStBaseOp(*I))
              .addImm(Value / Scale)
              .setMemRefs(I->memoperands_begin(), I->memoperands_end());
  }
  (void)MIB;

  if (IsPreIdx)
    DEBUG(dbgs() << "Creating pre-indexed load/store.");
  else
    DEBUG(dbgs() << "Creating post-indexed load/store.");
  DEBUG(dbgs() << "    Replacing instructions:\n    ");
  DEBUG(I->print(dbgs()));
  DEBUG(dbgs() << "    ");
  DEBUG(Update->print(dbgs()));
  DEBUG(dbgs() << "  with instruction:\n    ");
  DEBUG(((MachineInstr *)MIB)->print(dbgs()));
  DEBUG(dbgs() << "\n");

  // Erase the old instructions for the block.
  I->eraseFromParent();
  Update->eraseFromParent();

  return NextI;
}
1315 | | |
// Return true if \p MI is an ADDXri/SUBXri that updates the base register of
// the memory instruction \p MemMI by an amount that a pre-/post-indexed form
// can encode. When \p Offset is non-zero the increment must also equal it;
// an Offset of zero accepts any encodable increment.
bool AArch64LoadStoreOpt::isMatchingUpdateInsn(MachineInstr &MemMI,
                                               MachineInstr &MI,
                                               unsigned BaseReg, int Offset) {
  switch (MI.getOpcode()) {
  default:
    break;
  case AArch64::SUBXri:
  case AArch64::ADDXri:
    // Make sure it's a vanilla immediate operand, not a relocation or
    // anything else we can't handle.
    if (!MI.getOperand(2).isImm())
      break;
    // Watch out for 1 << 12 shifted value.
    if (AArch64_AM::getShiftValue(MI.getOperand(3).getImm()))
      break;

    // The update instruction source and destination register must be the
    // same as the load/store base register.
    if (MI.getOperand(0).getReg() != BaseReg ||
        MI.getOperand(1).getReg() != BaseReg)
      break;

    bool IsPairedInsn = isPairedLdSt(MemMI);
    int UpdateOffset = MI.getOperand(2).getImm();
    // Treat SUB as a negative increment.
    if (MI.getOpcode() == AArch64::SUBXri)
      UpdateOffset = -UpdateOffset;

    // For non-paired load/store instructions, the immediate must fit in a
    // signed 9-bit integer.
    if (!IsPairedInsn && (UpdateOffset > 255 || UpdateOffset < -256))
      break;

    // For paired load/store instructions, the immediate must be a multiple of
    // the scaling factor. The scaled offset must also fit into a signed 7-bit
    // integer.
    if (IsPairedInsn) {
      int Scale = getMemScale(MemMI);
      if (UpdateOffset % Scale != 0)
        break;

      int ScaledOffset = UpdateOffset / Scale;
      if (ScaledOffset > 63 || ScaledOffset < -64)
        break;
    }

    // If we have a non-zero Offset, we check that it matches the amount
    // we're adding to the register.
    if (!Offset || Offset == UpdateOffset)
      return true;
    break;
  }
  return false;
}
1369 | | |
// Scan forward from the memory instruction at \p I for a base-register
// add/sub that can be folded into it as a post-index update of
// \p UnscaledOffset bytes. Returns the update instruction's iterator, or the
// block end iterator if none is found within \p Limit real instructions or
// the base register is read or written in between.
MachineBasicBlock::iterator AArch64LoadStoreOpt::findMatchingUpdateInsnForward(
    MachineBasicBlock::iterator I, int UnscaledOffset, unsigned Limit) {
  MachineBasicBlock::iterator E = I->getParent()->end();
  MachineInstr &MemMI = *I;
  MachineBasicBlock::iterator MBBI = I;

  unsigned BaseReg = getLdStBaseOp(MemMI).getReg();
  int MIUnscaledOffset = getLdStOffsetOp(MemMI).getImm() * getMemScale(MemMI);

  // Scan forward looking for post-index opportunities. Updating instructions
  // can't be formed if the memory instruction doesn't have the offset we're
  // looking for.
  if (MIUnscaledOffset != UnscaledOffset)
    return E;

  // If the base register overlaps a destination register, we can't
  // merge the update.
  bool IsPairedInsn = isPairedLdSt(MemMI);
  for (unsigned i = 0, e = IsPairedInsn ? 2 : 1; i != e; ++i) {
    unsigned DestReg = getLdStRegOp(MemMI, i).getReg();
    if (DestReg == BaseReg || TRI->isSubRegister(BaseReg, DestReg))
      return E;
  }

  // Track which registers have been modified and used between the first insn
  // (inclusive) and the second insn.
  ModifiedRegs.reset();
  UsedRegs.reset();
  ++MBBI;
  for (unsigned Count = 0; MBBI != E && Count < Limit; ++MBBI) {
    MachineInstr &MI = *MBBI;

    // Don't count transient instructions towards the search limit since there
    // may be different numbers of them if e.g. debug information is present.
    if (!MI.isTransient())
      ++Count;

    // If we found a match, return it.
    if (isMatchingUpdateInsn(*I, MI, BaseReg, UnscaledOffset))
      return MBBI;

    // Update the status of what the instruction clobbered and used.
    trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI);

    // Otherwise, if the base register is used or modified, we have no match, so
    // return early.
    if (ModifiedRegs[BaseReg] || UsedRegs[BaseReg])
      return E;
  }
  return E;
}
1421 | | |
/// Scan backward from the load/store at \p I (up to \p Limit non-transient
/// instructions) for a base-register update that can be folded into the
/// memory instruction as a pre-index writeback.
///
/// Only zero-offset accesses are candidates: the preceding add/sub supplies
/// the offset, e.g. "add x0, x0, #8; ldr x1, [x0]" -> "ldr x1, [x0, #8]!".
/// \returns an iterator to the matching update instruction, or the block's
/// end() iterator if no safe match is found.
MachineBasicBlock::iterator AArch64LoadStoreOpt::findMatchingUpdateInsnBackward(
    MachineBasicBlock::iterator I, unsigned Limit) {
  MachineBasicBlock::iterator B = I->getParent()->begin();
  MachineBasicBlock::iterator E = I->getParent()->end();
  MachineInstr &MemMI = *I;
  MachineBasicBlock::iterator MBBI = I;

  unsigned BaseReg = getLdStBaseOp(MemMI).getReg();
  int Offset = getLdStOffsetOp(MemMI).getImm();

  // If the load/store is the first instruction in the block, there's obviously
  // not any matching update. Ditto if the memory offset isn't zero.
  if (MBBI == B || Offset != 0)
    return E;
  // If the base register overlaps a destination register, we can't
  // merge the update (the load result would clobber the base). Paired loads
  // have two destination registers to check.
  bool IsPairedInsn = isPairedLdSt(MemMI);
  for (unsigned i = 0, e = IsPairedInsn ? 2 : 1; i != e; ++i) {
    unsigned DestReg = getLdStRegOp(MemMI, i).getReg();
    if (DestReg == BaseReg || TRI->isSubRegister(BaseReg, DestReg))
      return E;
  }

  // Track which registers have been modified and used between the first insn
  // (inclusive) and the second insn.
  ModifiedRegs.reset();
  UsedRegs.reset();
  unsigned Count = 0;
  do {
    --MBBI;
    MachineInstr &MI = *MBBI;

    // Don't count transient instructions towards the search limit since there
    // may be different numbers of them if e.g. debug information is present.
    if (!MI.isTransient())
      ++Count;

    // If we found a match, return it.
    if (isMatchingUpdateInsn(*I, MI, BaseReg, Offset))
      return MBBI;

    // Update the status of what the instruction clobbered and used.
    trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI);

    // Otherwise, if the base register is used or modified, we have no match, so
    // return early.
    if (ModifiedRegs[BaseReg] || UsedRegs[BaseReg])
      return E;
  } while (MBBI != B && Count < Limit);
  return E;
}
1473 | | |
1474 | | bool AArch64LoadStoreOpt::tryToPromoteLoadFromStore( |
1475 | 2.20M | MachineBasicBlock::iterator &MBBI) { |
1476 | 2.20M | MachineInstr &MI = *MBBI; |
1477 | 2.20M | // If this is a volatile load, don't mess with it. |
1478 | 2.20M | if (MI.hasOrderedMemoryRef()) |
1479 | 305k | return false; |
1480 | 1.89M | |
1481 | 1.89M | // Make sure this is a reg+imm. |
1482 | 1.89M | // FIXME: It is possible to extend it to handle reg+reg cases. |
1483 | 1.89M | if (1.89M !getLdStOffsetOp(MI).isImm()1.89M ) |
1484 | 17.6k | return false; |
1485 | 1.88M | |
1486 | 1.88M | // Look backward up to LdStLimit instructions. |
1487 | 1.88M | MachineBasicBlock::iterator StoreI; |
1488 | 1.88M | if (findMatchingStore(MBBI, LdStLimit, StoreI)1.88M ) { |
1489 | 141 | ++NumLoadsFromStoresPromoted; |
1490 | 141 | // Promote the load. Keeping the iterator straight is a |
1491 | 141 | // pain, so we let the merge routine tell us what the next instruction |
1492 | 141 | // is after it's done mucking about. |
1493 | 141 | MBBI = promoteLoadFromStore(MBBI, StoreI); |
1494 | 141 | return true; |
1495 | 141 | } |
1496 | 1.88M | return false; |
1497 | 1.88M | } |
1498 | | |
1499 | | // Merge adjacent zero stores into a wider store. |
1500 | | bool AArch64LoadStoreOpt::tryToMergeZeroStInst( |
1501 | 127k | MachineBasicBlock::iterator &MBBI) { |
1502 | 127k | assert(isPromotableZeroStoreInst(*MBBI) && "Expected narrow store."); |
1503 | 127k | MachineInstr &MI = *MBBI; |
1504 | 127k | MachineBasicBlock::iterator E = MI.getParent()->end(); |
1505 | 127k | |
1506 | 127k | if (!TII->isCandidateToMergeOrPair(MI)) |
1507 | 503 | return false; |
1508 | 126k | |
1509 | 126k | // Look ahead up to LdStLimit instructions for a mergable instruction. |
1510 | 126k | LdStPairFlags Flags; |
1511 | 126k | MachineBasicBlock::iterator MergeMI = |
1512 | 126k | findMatchingInsn(MBBI, Flags, LdStLimit, /* FindNarrowMerge = */ true); |
1513 | 126k | if (MergeMI != E126k ) { |
1514 | 6.37k | ++NumZeroStoresPromoted; |
1515 | 6.37k | |
1516 | 6.37k | // Keeping the iterator straight is a pain, so we let the merge routine tell |
1517 | 6.37k | // us what the next instruction is after it's done mucking about. |
1518 | 6.37k | MBBI = mergeNarrowZeroStores(MBBI, MergeMI, Flags); |
1519 | 6.37k | return true; |
1520 | 6.37k | } |
1521 | 120k | return false; |
1522 | 120k | } |
1523 | | |
1524 | | // Find loads and stores that can be merged into a single load or store pair |
1525 | | // instruction. |
1526 | 3.15M | bool AArch64LoadStoreOpt::tryToPairLdStInst(MachineBasicBlock::iterator &MBBI) { |
1527 | 3.15M | MachineInstr &MI = *MBBI; |
1528 | 3.15M | MachineBasicBlock::iterator E = MI.getParent()->end(); |
1529 | 3.15M | |
1530 | 3.15M | if (!TII->isCandidateToMergeOrPair(MI)) |
1531 | 617k | return false; |
1532 | 2.53M | |
1533 | 2.53M | // Early exit if the offset is not possible to match. (6 bits of positive |
1534 | 2.53M | // range, plus allow an extra one in case we find a later insn that matches |
1535 | 2.53M | // with Offset-1) |
1536 | 2.53M | bool IsUnscaled = TII->isUnscaledLdSt(MI); |
1537 | 2.53M | int Offset = getLdStOffsetOp(MI).getImm(); |
1538 | 2.53M | int OffsetStride = IsUnscaled ? getMemScale(MI)190k : 12.34M ; |
1539 | 2.53M | // Allow one more for offset. |
1540 | 2.53M | if (Offset > 0) |
1541 | 1.79M | Offset -= OffsetStride; |
1542 | 2.53M | if (!inBoundsForPair(IsUnscaled, Offset, OffsetStride)) |
1543 | 272k | return false; |
1544 | 2.26M | |
1545 | 2.26M | // Look ahead up to LdStLimit instructions for a pairable instruction. |
1546 | 2.26M | LdStPairFlags Flags; |
1547 | 2.26M | MachineBasicBlock::iterator Paired = |
1548 | 2.26M | findMatchingInsn(MBBI, Flags, LdStLimit, /* FindNarrowMerge = */ false); |
1549 | 2.26M | if (Paired != E2.26M ) { |
1550 | 354k | ++NumPairCreated; |
1551 | 354k | if (TII->isUnscaledLdSt(MI)) |
1552 | 34.2k | ++NumUnscaledPairCreated; |
1553 | 354k | // Keeping the iterator straight is a pain, so we let the merge routine tell |
1554 | 354k | // us what the next instruction is after it's done mucking about. |
1555 | 354k | MBBI = mergePairedInsns(MBBI, Paired, Flags); |
1556 | 354k | return true; |
1557 | 354k | } |
1558 | 1.91M | return false; |
1559 | 1.91M | } |
1560 | | |
/// Run all four load/store peephole transformations over \p MBB.
/// \p EnableNarrowZeroStOpt gates phase 2 (widening adjacent zero stores),
/// which is disabled when the subtarget requires strict alignment.
/// \returns true if any instruction in the block was changed.
bool AArch64LoadStoreOpt::optimizeBlock(MachineBasicBlock &MBB,
                                        bool EnableNarrowZeroStOpt) {
  bool Modified = false;
  // Four transformations to do here:
  // 1) Find loads that directly read from stores and promote them by
  //    replacing with mov instructions. If the store is wider than the load,
  //    the load will be replaced with a bitfield extract.
  //      e.g.,
  //        str w1, [x0, #4]
  //        ldrh w2, [x0, #6]
  //        ; becomes
  //        str w1, [x0, #4]
  //        lsr w2, w1, #16
  for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
       MBBI != E;) {
    MachineInstr &MI = *MBBI;
    switch (MI.getOpcode()) {
    default:
      // Just move on to the next instruction.
      ++MBBI;
      break;
    // Scaled instructions.
    case AArch64::LDRBBui:
    case AArch64::LDRHHui:
    case AArch64::LDRWui:
    case AArch64::LDRXui:
    // Unscaled instructions.
    case AArch64::LDURBBi:
    case AArch64::LDURHHi:
    case AArch64::LDURWi:
    case AArch64::LDURXi:
      if (tryToPromoteLoadFromStore(MBBI)) {
        Modified = true;
        break;
      }
      ++MBBI;
      break;
    }
  }
  // 2) Merge adjacent zero stores into a wider store.
  //      e.g.,
  //        strh wzr, [x0]
  //        strh wzr, [x0, #2]
  //        ; becomes
  //        str wzr, [x0]
  //      e.g.,
  //        str wzr, [x0]
  //        str wzr, [x0, #4]
  //        ; becomes
  //        str xzr, [x0]
  for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
       EnableNarrowZeroStOpt && MBBI != E;) {
    if (isPromotableZeroStoreInst(*MBBI)) {
      if (tryToMergeZeroStInst(MBBI)) {
        Modified = true;
      } else
        ++MBBI;
    } else
      ++MBBI;
  }

  // 3) Find loads and stores that can be merged into a single load or store
  //    pair instruction.
  //      e.g.,
  //        ldr x0, [x2]
  //        ldr x1, [x2, #8]
  //        ; becomes
  //        ldp x0, x1, [x2]
  for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
       MBBI != E;) {
    if (TII->isPairableLdStInst(*MBBI) && tryToPairLdStInst(MBBI))
      Modified = true;
    else
      ++MBBI;
  }
  // 4) Find base register updates that can be merged into the load or store
  //    as a base-reg writeback.
  //      e.g.,
  //        ldr x0, [x2]
  //        add x2, x2, #4
  //        ; becomes
  //        ldr x0, [x2], #4
  for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
       MBBI != E;) {
    MachineInstr &MI = *MBBI;
    // Do update merging. It's simpler to keep this separate from the above
    // switchs, though not strictly necessary.
    unsigned Opc = MI.getOpcode();
    switch (Opc) {
    default:
      // Just move on to the next instruction.
      ++MBBI;
      break;
    // Scaled instructions.
    case AArch64::STRSui:
    case AArch64::STRDui:
    case AArch64::STRQui:
    case AArch64::STRXui:
    case AArch64::STRWui:
    case AArch64::STRHHui:
    case AArch64::STRBBui:
    case AArch64::LDRSui:
    case AArch64::LDRDui:
    case AArch64::LDRQui:
    case AArch64::LDRXui:
    case AArch64::LDRWui:
    case AArch64::LDRHHui:
    case AArch64::LDRBBui:
    // Unscaled instructions.
    case AArch64::STURSi:
    case AArch64::STURDi:
    case AArch64::STURQi:
    case AArch64::STURWi:
    case AArch64::STURXi:
    case AArch64::LDURSi:
    case AArch64::LDURDi:
    case AArch64::LDURQi:
    case AArch64::LDURWi:
    case AArch64::LDURXi:
    // Paired instructions.
    case AArch64::LDPSi:
    case AArch64::LDPSWi:
    case AArch64::LDPDi:
    case AArch64::LDPQi:
    case AArch64::LDPWi:
    case AArch64::LDPXi:
    case AArch64::STPSi:
    case AArch64::STPDi:
    case AArch64::STPQi:
    case AArch64::STPWi:
    case AArch64::STPXi: {
      // Make sure this is a reg+imm (as opposed to an address reloc).
      if (!getLdStOffsetOp(MI).isImm()) {
        ++MBBI;
        break;
      }
      // Look forward to try to form a post-index instruction. For example,
      // ldr x0, [x20]
      // add x20, x20, #32
      // merged into:
      // ldr x0, [x20], #32
      MachineBasicBlock::iterator Update =
          findMatchingUpdateInsnForward(MBBI, 0, UpdateLimit);
      if (Update != E) {
        // Merge the update into the ld/st.
        MBBI = mergeUpdateInsn(MBBI, Update, /*IsPreIdx=*/false);
        Modified = true;
        ++NumPostFolded;
        break;
      }

      // Don't know how to handle unscaled pre/post-index versions below, so
      // move to the next instruction.
      if (TII->isUnscaledLdSt(Opc)) {
        ++MBBI;
        break;
      }

      // Look back to try to find a pre-index instruction. For example,
      // add x0, x0, #8
      // ldr x1, [x0]
      // merged into:
      // ldr x1, [x0, #8]!
      Update = findMatchingUpdateInsnBackward(MBBI, UpdateLimit);
      if (Update != E) {
        // Merge the update into the ld/st.
        MBBI = mergeUpdateInsn(MBBI, Update, /*IsPreIdx=*/true);
        Modified = true;
        ++NumPreFolded;
        break;
      }
      // The immediate in the load/store is scaled by the size of the memory
      // operation. The immediate in the add we're looking for,
      // however, is not, so adjust here.
      int UnscaledOffset = getLdStOffsetOp(MI).getImm() * getMemScale(MI);

      // Look forward for an update whose offset matches the access's own
      // offset; this also folds as a *pre*-index form. For example,
      // ldr x1, [x0, #64]
      // add x0, x0, #64
      // merged into:
      // ldr x1, [x0, #64]!
      Update = findMatchingUpdateInsnForward(MBBI, UnscaledOffset, UpdateLimit);
      if (Update != E) {
        // Merge the update into the ld/st.
        MBBI = mergeUpdateInsn(MBBI, Update, /*IsPreIdx=*/true);
        Modified = true;
        ++NumPreFolded;
        break;
      }

      // Nothing found. Just move to the next instruction.
      ++MBBI;
      break;
    }
    }
  }

  return Modified;
}
1760 | | |
1761 | 455k | bool AArch64LoadStoreOpt::runOnMachineFunction(MachineFunction &Fn) { |
1762 | 455k | if (skipFunction(*Fn.getFunction())) |
1763 | 1 | return false; |
1764 | 455k | |
1765 | 455k | Subtarget = &static_cast<const AArch64Subtarget &>(Fn.getSubtarget()); |
1766 | 455k | TII = static_cast<const AArch64InstrInfo *>(Subtarget->getInstrInfo()); |
1767 | 455k | TRI = Subtarget->getRegisterInfo(); |
1768 | 455k | AA = &getAnalysis<AAResultsWrapperPass>().getAAResults(); |
1769 | 455k | |
1770 | 455k | // Resize the modified and used register bitfield trackers. We do this once |
1771 | 455k | // per function and then clear the bitfield each time we optimize a load or |
1772 | 455k | // store. |
1773 | 455k | ModifiedRegs.resize(TRI->getNumRegs()); |
1774 | 455k | UsedRegs.resize(TRI->getNumRegs()); |
1775 | 455k | |
1776 | 455k | bool Modified = false; |
1777 | 455k | bool enableNarrowZeroStOpt = !Subtarget->requiresStrictAlign(); |
1778 | 455k | for (auto &MBB : Fn) |
1779 | 3.72M | Modified |= optimizeBlock(MBB, enableNarrowZeroStOpt); |
1780 | 455k | |
1781 | 455k | return Modified; |
1782 | 455k | } |
1783 | | |
1784 | | // FIXME: Do we need/want a pre-alloc pass like ARM has to try to keep loads and |
1785 | | // stores near one another? Note: The pre-RA instruction scheduler already has |
1786 | | // hooks to try and schedule pairable loads/stores together to improve pairing |
1787 | | // opportunities. Thus, a pre-RA pairing pass may not be worth the effort. |
1788 | | |
1789 | | // FIXME: When pairing store instructions it's very possible for this pass to |
1790 | | // hoist a store with a KILL marker above another use (without a KILL marker). |
1791 | | // The resulting IR is invalid, but nothing uses the KILL markers after this |
1792 | | // pass, so it's never caused a problem in practice. |
1793 | | |
/// createAArch64LoadStoreOptimizationPass - returns an instance of the
/// load / store optimization pass.
/// NOTE(review): the caller receives the newly allocated pass; presumably the
/// pass manager takes ownership, as is conventional for pass factories.
FunctionPass *llvm::createAArch64LoadStoreOptimizationPass() {
  return new AArch64LoadStoreOpt();
}