Coverage Report

Created: 2017-10-03 07:32

/Users/buildslave/jenkins/sharedspace/clang-stage2-coverage-R@2/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp

//===- AArch64LoadStoreOptimizer.cpp - AArch64 load/store opt. pass -------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This file contains a pass that performs load / store related peephole
// optimizations. This pass should be run after register allocation.
//
//===----------------------------------------------------------------------===//

#include "AArch64InstrInfo.h"
#include "AArch64Subtarget.h"
#include "MCTargetDesc/AArch64AddressingModes.h"
#include "llvm/ADT/BitVector.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/iterator_range.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/MC/MCRegisterInfo.h"
#include "llvm/Pass.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetRegisterInfo.h"
#include <cassert>
#include <cstdint>
#include <iterator>
#include <limits>

using namespace llvm;

#define DEBUG_TYPE "aarch64-ldst-opt"

STATISTIC(NumPairCreated, "Number of load/store pair instructions generated");
STATISTIC(NumPostFolded, "Number of post-index updates folded");
STATISTIC(NumPreFolded, "Number of pre-index updates folded");
STATISTIC(NumUnscaledPairCreated,
          "Number of load/store from unscaled generated");
STATISTIC(NumZeroStoresPromoted, "Number of narrow zero stores promoted");
STATISTIC(NumLoadsFromStoresPromoted, "Number of loads from stores promoted");

// The LdStLimit limits how far we search for load/store pairs.
static cl::opt<unsigned> LdStLimit("aarch64-load-store-scan-limit",
                                   cl::init(20), cl::Hidden);

// The UpdateLimit limits how far we search for update instructions when we
// form pre-/post-index instructions.
static cl::opt<unsigned> UpdateLimit("aarch64-update-scan-limit", cl::init(100),
                                     cl::Hidden);

#define AARCH64_LOAD_STORE_OPT_NAME "AArch64 load / store optimization pass"

namespace {

using LdStPairFlags = struct LdStPairFlags {
  // If a matching instruction is found, MergeForward is set to true if the
  // merge is to remove the first instruction and replace the second with
  // a pair-wise insn, and false if the reverse is true.
  bool MergeForward = false;

  // SExtIdx gives the index of the result of the load pair that must be
  // extended. The value of SExtIdx assumes that the paired load produces the
  // value in this order: (I, returned iterator), i.e., -1 means no value has
  // to be extended, 0 means I, and 1 means the returned iterator.
  int SExtIdx = -1;

  LdStPairFlags() = default;

  void setMergeForward(bool V = true) { MergeForward = V; }
  bool getMergeForward() const { return MergeForward; }

  void setSExtIdx(int V) { SExtIdx = V; }
  int getSExtIdx() const { return SExtIdx; }
};
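
// Illustrative example of the SExtIdx convention above (hypothetical
// registers): when pairing
//   ldrsw x0, [x2]       ; I (sign-extending)
//   ldr   w1, [x2, #4]   ; the returned iterator
// SExtIdx is 0: the first result of the emitted LDP must still be
// sign-extended, which mergePairedInsns() does with a KILL + SBFMXri pair.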

struct AArch64LoadStoreOpt : public MachineFunctionPass {
  static char ID;

  AArch64LoadStoreOpt() : MachineFunctionPass(ID) {
    initializeAArch64LoadStoreOptPass(*PassRegistry::getPassRegistry());
  }

  AliasAnalysis *AA;
  const AArch64InstrInfo *TII;
  const TargetRegisterInfo *TRI;
  const AArch64Subtarget *Subtarget;

  // Track which registers have been modified and used.
  BitVector ModifiedRegs, UsedRegs;

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addRequired<AAResultsWrapperPass>();
    MachineFunctionPass::getAnalysisUsage(AU);
  }

  // Scan the instructions looking for a load/store that can be combined
  // with the current instruction into a load/store pair.
  // Return the matching instruction if one is found, else MBB->end().
  MachineBasicBlock::iterator findMatchingInsn(MachineBasicBlock::iterator I,
                                               LdStPairFlags &Flags,
                                               unsigned Limit,
                                               bool FindNarrowMerge);

  // Scan the instructions looking for a store that writes to the address from
  // which the current load instruction reads. Return true if one is found.
  bool findMatchingStore(MachineBasicBlock::iterator I, unsigned Limit,
                         MachineBasicBlock::iterator &StoreI);

  // Merge the two narrow zero stores indicated into a single wider store.
  MachineBasicBlock::iterator
  mergeNarrowZeroStores(MachineBasicBlock::iterator I,
                        MachineBasicBlock::iterator MergeMI,
                        const LdStPairFlags &Flags);

  // Merge the two instructions indicated into a single pair-wise instruction.
  MachineBasicBlock::iterator
  mergePairedInsns(MachineBasicBlock::iterator I,
                   MachineBasicBlock::iterator Paired,
                   const LdStPairFlags &Flags);

  // Promote the load that reads directly from the address stored to.
  MachineBasicBlock::iterator
  promoteLoadFromStore(MachineBasicBlock::iterator LoadI,
                       MachineBasicBlock::iterator StoreI);

  // Scan the instruction list to find a base register update that can
  // be combined with the current instruction (a load or store) using
  // pre- or post-indexed addressing with writeback. Scan forwards.
  MachineBasicBlock::iterator
  findMatchingUpdateInsnForward(MachineBasicBlock::iterator I,
                                int UnscaledOffset, unsigned Limit);

  // Scan the instruction list to find a base register update that can
  // be combined with the current instruction (a load or store) using
  // pre- or post-indexed addressing with writeback. Scan backwards.
  MachineBasicBlock::iterator
  findMatchingUpdateInsnBackward(MachineBasicBlock::iterator I, unsigned Limit);

  // Find an instruction that updates the base register of the ld/st
  // instruction.
  bool isMatchingUpdateInsn(MachineInstr &MemMI, MachineInstr &MI,
                            unsigned BaseReg, int Offset);

  // Merge a pre- or post-index base register update into a ld/st instruction.
  MachineBasicBlock::iterator
  mergeUpdateInsn(MachineBasicBlock::iterator I,
                  MachineBasicBlock::iterator Update, bool IsPreIdx);

  // Find and merge zero store instructions.
  bool tryToMergeZeroStInst(MachineBasicBlock::iterator &MBBI);

  // Find and pair ldr/str instructions.
  bool tryToPairLdStInst(MachineBasicBlock::iterator &MBBI);

  // Find and promote load instructions which read directly from a store.
  bool tryToPromoteLoadFromStore(MachineBasicBlock::iterator &MBBI);

  bool optimizeBlock(MachineBasicBlock &MBB, bool EnableNarrowZeroStOpt);

  bool runOnMachineFunction(MachineFunction &Fn) override;

  MachineFunctionProperties getRequiredProperties() const override {
    return MachineFunctionProperties().set(
        MachineFunctionProperties::Property::NoVRegs);
  }

  StringRef getPassName() const override { return AARCH64_LOAD_STORE_OPT_NAME; }
};

char AArch64LoadStoreOpt::ID = 0;

} // end anonymous namespace

INITIALIZE_PASS(AArch64LoadStoreOpt, "aarch64-ldst-opt",
                AARCH64_LOAD_STORE_OPT_NAME, false, false)

static bool isNarrowStore(unsigned Opc) {
  switch (Opc) {
  default:
    return false;
  case AArch64::STRBBui:
  case AArch64::STURBBi:
  case AArch64::STRHHui:
  case AArch64::STURHHi:
    return true;
  }
}

// Scaling factor for unscaled load or store.
static int getMemScale(MachineInstr &MI) {
  switch (MI.getOpcode()) {
  default:
    llvm_unreachable("Opcode has unknown scale!");
  case AArch64::LDRBBui:
  case AArch64::LDURBBi:
  case AArch64::LDRSBWui:
  case AArch64::LDURSBWi:
  case AArch64::STRBBui:
  case AArch64::STURBBi:
    return 1;
  case AArch64::LDRHHui:
  case AArch64::LDURHHi:
  case AArch64::LDRSHWui:
  case AArch64::LDURSHWi:
  case AArch64::STRHHui:
  case AArch64::STURHHi:
    return 2;
  case AArch64::LDRSui:
  case AArch64::LDURSi:
  case AArch64::LDRSWui:
  case AArch64::LDURSWi:
  case AArch64::LDRWui:
  case AArch64::LDURWi:
  case AArch64::STRSui:
  case AArch64::STURSi:
  case AArch64::STRWui:
  case AArch64::STURWi:
  case AArch64::LDPSi:
  case AArch64::LDPSWi:
  case AArch64::LDPWi:
  case AArch64::STPSi:
  case AArch64::STPWi:
    return 4;
  case AArch64::LDRDui:
  case AArch64::LDURDi:
  case AArch64::LDRXui:
  case AArch64::LDURXi:
  case AArch64::STRDui:
  case AArch64::STURDi:
  case AArch64::STRXui:
  case AArch64::STURXi:
  case AArch64::LDPDi:
  case AArch64::LDPXi:
  case AArch64::STPDi:
  case AArch64::STPXi:
    return 8;
  case AArch64::LDRQui:
  case AArch64::LDURQi:
  case AArch64::STRQui:
  case AArch64::STURQi:
  case AArch64::LDPQi:
  case AArch64::STPQi:
    return 16;
  }
}

static unsigned getMatchingNonSExtOpcode(unsigned Opc,
                                         bool *IsValidLdStrOpc = nullptr) {
  if (IsValidLdStrOpc)
    *IsValidLdStrOpc = true;
  switch (Opc) {
  default:
    if (IsValidLdStrOpc)
      *IsValidLdStrOpc = false;
    return std::numeric_limits<unsigned>::max();
  case AArch64::STRDui:
  case AArch64::STURDi:
  case AArch64::STRQui:
  case AArch64::STURQi:
  case AArch64::STRBBui:
  case AArch64::STURBBi:
  case AArch64::STRHHui:
  case AArch64::STURHHi:
  case AArch64::STRWui:
  case AArch64::STURWi:
  case AArch64::STRXui:
  case AArch64::STURXi:
  case AArch64::LDRDui:
  case AArch64::LDURDi:
  case AArch64::LDRQui:
  case AArch64::LDURQi:
  case AArch64::LDRWui:
  case AArch64::LDURWi:
  case AArch64::LDRXui:
  case AArch64::LDURXi:
  case AArch64::STRSui:
  case AArch64::STURSi:
  case AArch64::LDRSui:
  case AArch64::LDURSi:
    return Opc;
  case AArch64::LDRSWui:
    return AArch64::LDRWui;
  case AArch64::LDURSWi:
    return AArch64::LDURWi;
  }
}

static unsigned getMatchingWideOpcode(unsigned Opc) {
  switch (Opc) {
  default:
    llvm_unreachable("Opcode has no wide equivalent!");
  case AArch64::STRBBui:
    return AArch64::STRHHui;
  case AArch64::STRHHui:
    return AArch64::STRWui;
  case AArch64::STURBBi:
    return AArch64::STURHHi;
  case AArch64::STURHHi:
    return AArch64::STURWi;
  case AArch64::STURWi:
    return AArch64::STURXi;
  case AArch64::STRWui:
    return AArch64::STRXui;
  }
}

static unsigned getMatchingPairOpcode(unsigned Opc) {
  switch (Opc) {
  default:
    llvm_unreachable("Opcode has no pairwise equivalent!");
  case AArch64::STRSui:
  case AArch64::STURSi:
    return AArch64::STPSi;
  case AArch64::STRDui:
  case AArch64::STURDi:
    return AArch64::STPDi;
  case AArch64::STRQui:
  case AArch64::STURQi:
    return AArch64::STPQi;
  case AArch64::STRWui:
  case AArch64::STURWi:
    return AArch64::STPWi;
  case AArch64::STRXui:
  case AArch64::STURXi:
    return AArch64::STPXi;
  case AArch64::LDRSui:
  case AArch64::LDURSi:
    return AArch64::LDPSi;
  case AArch64::LDRDui:
  case AArch64::LDURDi:
    return AArch64::LDPDi;
  case AArch64::LDRQui:
  case AArch64::LDURQi:
    return AArch64::LDPQi;
  case AArch64::LDRWui:
  case AArch64::LDURWi:
    return AArch64::LDPWi;
  case AArch64::LDRXui:
  case AArch64::LDURXi:
    return AArch64::LDPXi;
  case AArch64::LDRSWui:
  case AArch64::LDURSWi:
    return AArch64::LDPSWi;
  }
}

static unsigned isMatchingStore(MachineInstr &LoadInst,
                                MachineInstr &StoreInst) {
  unsigned LdOpc = LoadInst.getOpcode();
  unsigned StOpc = StoreInst.getOpcode();
  switch (LdOpc) {
  default:
    llvm_unreachable("Unsupported load instruction!");
  case AArch64::LDRBBui:
    return StOpc == AArch64::STRBBui || StOpc == AArch64::STRHHui ||
           StOpc == AArch64::STRWui || StOpc == AArch64::STRXui;
  case AArch64::LDURBBi:
    return StOpc == AArch64::STURBBi || StOpc == AArch64::STURHHi ||
           StOpc == AArch64::STURWi || StOpc == AArch64::STURXi;
  case AArch64::LDRHHui:
    return StOpc == AArch64::STRHHui || StOpc == AArch64::STRWui ||
           StOpc == AArch64::STRXui;
  case AArch64::LDURHHi:
    return StOpc == AArch64::STURHHi || StOpc == AArch64::STURWi ||
           StOpc == AArch64::STURXi;
  case AArch64::LDRWui:
    return StOpc == AArch64::STRWui || StOpc == AArch64::STRXui;
  case AArch64::LDURWi:
    return StOpc == AArch64::STURWi || StOpc == AArch64::STURXi;
  case AArch64::LDRXui:
    return StOpc == AArch64::STRXui;
  case AArch64::LDURXi:
    return StOpc == AArch64::STURXi;
  }
}

static unsigned getPreIndexedOpcode(unsigned Opc) {
  // FIXME: We don't currently support creating pre-indexed loads/stores when
  // the load or store is the unscaled version.  If we decide to perform such an
  // optimization in the future the cases for the unscaled loads/stores will
  // need to be added here.
  switch (Opc) {
  default:
    llvm_unreachable("Opcode has no pre-indexed equivalent!");
  case AArch64::STRSui:
    return AArch64::STRSpre;
  case AArch64::STRDui:
    return AArch64::STRDpre;
  case AArch64::STRQui:
    return AArch64::STRQpre;
  case AArch64::STRBBui:
    return AArch64::STRBBpre;
  case AArch64::STRHHui:
    return AArch64::STRHHpre;
  case AArch64::STRWui:
    return AArch64::STRWpre;
  case AArch64::STRXui:
    return AArch64::STRXpre;
  case AArch64::LDRSui:
    return AArch64::LDRSpre;
  case AArch64::LDRDui:
    return AArch64::LDRDpre;
  case AArch64::LDRQui:
    return AArch64::LDRQpre;
  case AArch64::LDRBBui:
    return AArch64::LDRBBpre;
  case AArch64::LDRHHui:
    return AArch64::LDRHHpre;
  case AArch64::LDRWui:
    return AArch64::LDRWpre;
  case AArch64::LDRXui:
    return AArch64::LDRXpre;
  case AArch64::LDRSWui:
    return AArch64::LDRSWpre;
  case AArch64::LDPSi:
    return AArch64::LDPSpre;
  case AArch64::LDPSWi:
    return AArch64::LDPSWpre;
  case AArch64::LDPDi:
    return AArch64::LDPDpre;
  case AArch64::LDPQi:
    return AArch64::LDPQpre;
  case AArch64::LDPWi:
    return AArch64::LDPWpre;
  case AArch64::LDPXi:
    return AArch64::LDPXpre;
  case AArch64::STPSi:
    return AArch64::STPSpre;
  case AArch64::STPDi:
    return AArch64::STPDpre;
  case AArch64::STPQi:
    return AArch64::STPQpre;
  case AArch64::STPWi:
    return AArch64::STPWpre;
  case AArch64::STPXi:
    return AArch64::STPXpre;
  }
}

static unsigned getPostIndexedOpcode(unsigned Opc) {
  switch (Opc) {
  default:
    llvm_unreachable("Opcode has no post-indexed equivalent!");
  case AArch64::STRSui:
  case AArch64::STURSi:
    return AArch64::STRSpost;
  case AArch64::STRDui:
  case AArch64::STURDi:
    return AArch64::STRDpost;
  case AArch64::STRQui:
  case AArch64::STURQi:
    return AArch64::STRQpost;
  case AArch64::STRBBui:
    return AArch64::STRBBpost;
  case AArch64::STRHHui:
    return AArch64::STRHHpost;
  case AArch64::STRWui:
  case AArch64::STURWi:
    return AArch64::STRWpost;
  case AArch64::STRXui:
  case AArch64::STURXi:
    return AArch64::STRXpost;
  case AArch64::LDRSui:
  case AArch64::LDURSi:
    return AArch64::LDRSpost;
  case AArch64::LDRDui:
  case AArch64::LDURDi:
    return AArch64::LDRDpost;
  case AArch64::LDRQui:
  case AArch64::LDURQi:
    return AArch64::LDRQpost;
  case AArch64::LDRBBui:
    return AArch64::LDRBBpost;
  case AArch64::LDRHHui:
    return AArch64::LDRHHpost;
  case AArch64::LDRWui:
  case AArch64::LDURWi:
    return AArch64::LDRWpost;
  case AArch64::LDRXui:
  case AArch64::LDURXi:
    return AArch64::LDRXpost;
  case AArch64::LDRSWui:
    return AArch64::LDRSWpost;
  case AArch64::LDPSi:
    return AArch64::LDPSpost;
  case AArch64::LDPSWi:
    return AArch64::LDPSWpost;
  case AArch64::LDPDi:
    return AArch64::LDPDpost;
  case AArch64::LDPQi:
    return AArch64::LDPQpost;
  case AArch64::LDPWi:
    return AArch64::LDPWpost;
  case AArch64::LDPXi:
    return AArch64::LDPXpost;
  case AArch64::STPSi:
    return AArch64::STPSpost;
  case AArch64::STPDi:
    return AArch64::STPDpost;
  case AArch64::STPQi:
    return AArch64::STPQpost;
  case AArch64::STPWi:
    return AArch64::STPWpost;
  case AArch64::STPXi:
    return AArch64::STPXpost;
  }
}
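
// For reference, the addressing forms the two helpers above map to
// (illustrative registers):
//   ldr x0, [x1, #16]!   ; pre-indexed:  x1 += 16, then load from x1
//   ldr x0, [x1], #16    ; post-indexed: load from x1, then x1 += 16
// Both forms write the updated address back to the base register.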

static bool isPairedLdSt(const MachineInstr &MI) {
  switch (MI.getOpcode()) {
  default:
    return false;
  case AArch64::LDPSi:
  case AArch64::LDPSWi:
  case AArch64::LDPDi:
  case AArch64::LDPQi:
  case AArch64::LDPWi:
  case AArch64::LDPXi:
  case AArch64::STPSi:
  case AArch64::STPDi:
  case AArch64::STPQi:
  case AArch64::STPWi:
  case AArch64::STPXi:
    return true;
  }
}

static const MachineOperand &getLdStRegOp(const MachineInstr &MI,
                                          unsigned PairedRegOp = 0) {
  assert(PairedRegOp < 2 && "Unexpected register operand idx.");
  unsigned Idx = isPairedLdSt(MI) ? PairedRegOp : 0;
  return MI.getOperand(Idx);
}

static const MachineOperand &getLdStBaseOp(const MachineInstr &MI) {
  unsigned Idx = isPairedLdSt(MI) ? 2 : 1;
  return MI.getOperand(Idx);
}

static const MachineOperand &getLdStOffsetOp(const MachineInstr &MI) {
  unsigned Idx = isPairedLdSt(MI) ? 3 : 2;
  return MI.getOperand(Idx);
}
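
// Operand layout assumed by the three accessors above: a non-paired
// load/store is (Rt, base, offset) while a paired one is (Rt, Rt2, base,
// offset), so the base and offset operand indices shift by one for the
// paired forms.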

static bool isLdOffsetInRangeOfSt(MachineInstr &LoadInst,
                                  MachineInstr &StoreInst,
                                  const AArch64InstrInfo *TII) {
  assert(isMatchingStore(LoadInst, StoreInst) && "Expect only matched ld/st.");
  int LoadSize = getMemScale(LoadInst);
  int StoreSize = getMemScale(StoreInst);
  int UnscaledStOffset = TII->isUnscaledLdSt(StoreInst)
                             ? getLdStOffsetOp(StoreInst).getImm()
                             : getLdStOffsetOp(StoreInst).getImm() * StoreSize;
  int UnscaledLdOffset = TII->isUnscaledLdSt(LoadInst)
                             ? getLdStOffsetOp(LoadInst).getImm()
                             : getLdStOffsetOp(LoadInst).getImm() * LoadSize;
  return (UnscaledStOffset <= UnscaledLdOffset) &&
         (UnscaledLdOffset + LoadSize <= (UnscaledStOffset + StoreSize));
}
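
// Worked example (hypothetical registers): for
//   str x1, [x0, #1]   ; scaled offset 1 * 8 -> covers bytes 8..15
//   ldr w2, [x0, #3]   ; scaled offset 3 * 4 -> covers bytes 12..15
// UnscaledStOffset = 8, StoreSize = 8, UnscaledLdOffset = 12, LoadSize = 4,
// so 8 <= 12 and 12 + 4 <= 8 + 8: the load reads only bytes the store wrote.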

static bool isPromotableZeroStoreInst(MachineInstr &MI) {
  unsigned Opc = MI.getOpcode();
  return (Opc == AArch64::STRWui || Opc == AArch64::STURWi ||
          isNarrowStore(Opc)) &&
         getLdStRegOp(MI).getReg() == AArch64::WZR;
}

MachineBasicBlock::iterator
AArch64LoadStoreOpt::mergeNarrowZeroStores(MachineBasicBlock::iterator I,
                                           MachineBasicBlock::iterator MergeMI,
                                           const LdStPairFlags &Flags) {
  assert(isPromotableZeroStoreInst(*I) && isPromotableZeroStoreInst(*MergeMI) &&
         "Expected promotable zero stores.");

  MachineBasicBlock::iterator NextI = I;
  ++NextI;
  // If NextI is the second of the two instructions to be merged, we need
  // to skip one further. Either way we merge will invalidate the iterator,
  // and we don't need to scan the new instruction, as it's a pairwise
  // instruction, which we're not considering for further action anyway.
  if (NextI == MergeMI)
    ++NextI;

  unsigned Opc = I->getOpcode();
  bool IsScaled = !TII->isUnscaledLdSt(Opc);
  int OffsetStride = IsScaled ? 1 : getMemScale(*I);

  bool MergeForward = Flags.getMergeForward();
  // Insert our new paired instruction after whichever of the paired
  // instructions MergeForward indicates.
  MachineBasicBlock::iterator InsertionPoint = MergeForward ? MergeMI : I;
  // MergeForward also determines which instruction we copy the base register
  // operand from, so we get flags compatible with the input code.
  const MachineOperand &BaseRegOp =
      MergeForward ? getLdStBaseOp(*MergeMI) : getLdStBaseOp(*I);

  // Which register is Rt and which is Rt2 depends on the offset order.
  MachineInstr *RtMI;
  if (getLdStOffsetOp(*I).getImm() ==
      getLdStOffsetOp(*MergeMI).getImm() + OffsetStride)
    RtMI = &*MergeMI;
  else
    RtMI = &*I;

  int OffsetImm = getLdStOffsetOp(*RtMI).getImm();
  // Change the scaled offset from small to large type.
  if (IsScaled) {
    assert(((OffsetImm & 1) == 0) && "Unexpected offset to merge");
    OffsetImm /= 2;
  }

  // Construct the new instruction.
  DebugLoc DL = I->getDebugLoc();
  MachineBasicBlock *MBB = I->getParent();
  MachineInstrBuilder MIB;
  MIB = BuildMI(*MBB, InsertionPoint, DL, TII->get(getMatchingWideOpcode(Opc)))
            .addReg(isNarrowStore(Opc) ? AArch64::WZR : AArch64::XZR)
            .add(BaseRegOp)
            .addImm(OffsetImm)
            .setMemRefs(I->mergeMemRefsWith(*MergeMI));
  (void)MIB;

  DEBUG(dbgs() << "Creating wider store. Replacing instructions:\n    ");
  DEBUG(I->print(dbgs()));
  DEBUG(dbgs() << "    ");
  DEBUG(MergeMI->print(dbgs()));
  DEBUG(dbgs() << "  with instruction:\n    ");
  DEBUG(((MachineInstr *)MIB)->print(dbgs()));
  DEBUG(dbgs() << "\n");

  // Erase the old instructions.
  I->eraseFromParent();
  MergeMI->eraseFromParent();
  return NextI;
}
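
// Illustrative transformation (hypothetical registers): two adjacent narrow
// zero stores such as
//   strh wzr, [x0]
//   strh wzr, [x0, #2]
// become a single wider zero store:
//   str wzr, [x0]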

MachineBasicBlock::iterator
AArch64LoadStoreOpt::mergePairedInsns(MachineBasicBlock::iterator I,
                                      MachineBasicBlock::iterator Paired,
                                      const LdStPairFlags &Flags) {
  MachineBasicBlock::iterator NextI = I;
  ++NextI;
  // If NextI is the second of the two instructions to be merged, we need
  // to skip one further. Either way we merge will invalidate the iterator,
  // and we don't need to scan the new instruction, as it's a pairwise
  // instruction, which we're not considering for further action anyway.
  if (NextI == Paired)
    ++NextI;

  int SExtIdx = Flags.getSExtIdx();
  unsigned Opc =
      SExtIdx == -1 ? I->getOpcode() : getMatchingNonSExtOpcode(I->getOpcode());
  bool IsUnscaled = TII->isUnscaledLdSt(Opc);
  int OffsetStride = IsUnscaled ? getMemScale(*I) : 1;

  bool MergeForward = Flags.getMergeForward();
  // Insert our new paired instruction after whichever of the paired
  // instructions MergeForward indicates.
  MachineBasicBlock::iterator InsertionPoint = MergeForward ? Paired : I;
  // MergeForward also determines which instruction we copy the base register
  // operand from, so we get flags compatible with the input code.
  const MachineOperand &BaseRegOp =
      MergeForward ? getLdStBaseOp(*Paired) : getLdStBaseOp(*I);

  int Offset = getLdStOffsetOp(*I).getImm();
  int PairedOffset = getLdStOffsetOp(*Paired).getImm();
  bool PairedIsUnscaled = TII->isUnscaledLdSt(Paired->getOpcode());
  if (IsUnscaled != PairedIsUnscaled) {
    // We're trying to pair instructions that differ in how they are scaled.  If
    // I is scaled then scale the offset of Paired accordingly.  Otherwise, do
    // the opposite (i.e., make Paired's offset unscaled).
    int MemSize = getMemScale(*Paired);
    if (PairedIsUnscaled) {
      // If the unscaled offset isn't a multiple of the MemSize, we can't
      // pair the operations together.
      assert(!(PairedOffset % getMemScale(*Paired)) &&
             "Offset should be a multiple of the stride!");
      PairedOffset /= MemSize;
    } else {
      PairedOffset *= MemSize;
    }
  }

  // Which register is Rt and which is Rt2 depends on the offset order.
  MachineInstr *RtMI, *Rt2MI;
  if (Offset == PairedOffset + OffsetStride) {
    RtMI = &*Paired;
    Rt2MI = &*I;
    // Here we swapped the assumption made for SExtIdx.
    // I.e., we turn ldp I, Paired into ldp Paired, I.
    // Update the index accordingly.
    if (SExtIdx != -1)
      SExtIdx = (SExtIdx + 1) % 2;
  } else {
    RtMI = &*I;
    Rt2MI = &*Paired;
  }
  int OffsetImm = getLdStOffsetOp(*RtMI).getImm();
  // Scale the immediate offset, if necessary.
  if (TII->isUnscaledLdSt(RtMI->getOpcode())) {
    assert(!(OffsetImm % getMemScale(*RtMI)) &&
           "Unscaled offset cannot be scaled.");
    OffsetImm /= getMemScale(*RtMI);
  }

  // Construct the new instruction.
  MachineInstrBuilder MIB;
  DebugLoc DL = I->getDebugLoc();
  MachineBasicBlock *MBB = I->getParent();
  MachineOperand RegOp0 = getLdStRegOp(*RtMI);
  MachineOperand RegOp1 = getLdStRegOp(*Rt2MI);
  // Kill flags may become invalid when moving stores for pairing.
  if (RegOp0.isUse()) {
    if (!MergeForward) {
      // Clear kill flags on store if moving upwards. Example:
      //   STRWui %w0, ...
      //   USE %w1
      //   STRWui kill %w1  ; need to clear kill flag when moving STRWui upwards
      RegOp0.setIsKill(false);
      RegOp1.setIsKill(false);
    } else {
      // Clear kill flags of the first store's register. Example:
      //   STRWui %w1, ...
      //   USE kill %w1   ; need to clear kill flag when moving STRWui downwards
      //   STRW %w0
      unsigned Reg = getLdStRegOp(*I).getReg();
      for (MachineInstr &MI : make_range(std::next(I), Paired))
        MI.clearRegisterKills(Reg, TRI);
    }
  }
  MIB = BuildMI(*MBB, InsertionPoint, DL, TII->get(getMatchingPairOpcode(Opc)))
            .add(RegOp0)
            .add(RegOp1)
            .add(BaseRegOp)
            .addImm(OffsetImm)
            .setMemRefs(I->mergeMemRefsWith(*Paired));

  (void)MIB;

  DEBUG(dbgs() << "Creating pair load/store. Replacing instructions:\n    ");
  DEBUG(I->print(dbgs()));
  DEBUG(dbgs() << "    ");
  DEBUG(Paired->print(dbgs()));
  DEBUG(dbgs() << "  with instruction:\n    ");
  if (SExtIdx != -1) {
    // Generate the sign extension for the proper result of the ldp.
    // I.e., with X1, that would be:
    // %W1<def> = KILL %W1, %X1<imp-def>
    // %X1<def> = SBFMXri %X1<kill>, 0, 31
    MachineOperand &DstMO = MIB->getOperand(SExtIdx);
    // Right now, DstMO has the extended register, since it comes from an
    // extended opcode.
    unsigned DstRegX = DstMO.getReg();
    // Get the W variant of that register.
    unsigned DstRegW = TRI->getSubReg(DstRegX, AArch64::sub_32);
    // Update the result of LDP to use the W instead of the X variant.
    DstMO.setReg(DstRegW);
    DEBUG(((MachineInstr *)MIB)->print(dbgs()));
    DEBUG(dbgs() << "\n");
    // Make the machine verifier happy by providing a definition for
    // the X register.
    // Insert this definition right after the generated LDP, i.e., before
    // InsertionPoint.
    MachineInstrBuilder MIBKill =
        BuildMI(*MBB, InsertionPoint, DL, TII->get(TargetOpcode::KILL), DstRegW)
            .addReg(DstRegW)
            .addReg(DstRegX, RegState::Define);
    MIBKill->getOperand(2).setImplicit();
    // Create the sign extension.
    MachineInstrBuilder MIBSXTW =
        BuildMI(*MBB, InsertionPoint, DL, TII->get(AArch64::SBFMXri), DstRegX)
            .addReg(DstRegX)
            .addImm(0)
            .addImm(31);
    (void)MIBSXTW;
    DEBUG(dbgs() << "  Extend operand:\n    ");
    DEBUG(((MachineInstr *)MIBSXTW)->print(dbgs()));
  } else {
    DEBUG(((MachineInstr *)MIB)->print(dbgs()));
  }
  DEBUG(dbgs() << "\n");

  // Erase the old instructions.
  I->eraseFromParent();
  Paired->eraseFromParent();

  return NextI;
}
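
// Illustrative transformation (hypothetical registers):
//   ldr x0, [x2]
//   ldr x1, [x2, #8]
// becomes
//   ldp x0, x1, [x2]
// In the sign-extended case (SExtIdx != -1), e.g. ldrsw paired with ldr, the
// pass emits an LDP of W registers followed by the KILL + SBFMXri (sxtw)
// sequence shown above.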

MachineBasicBlock::iterator
AArch64LoadStoreOpt::promoteLoadFromStore(MachineBasicBlock::iterator LoadI,
                                          MachineBasicBlock::iterator StoreI) {
  MachineBasicBlock::iterator NextI = LoadI;
  ++NextI;

  int LoadSize = getMemScale(*LoadI);
  int StoreSize = getMemScale(*StoreI);
  unsigned LdRt = getLdStRegOp(*LoadI).getReg();
  const MachineOperand &StMO = getLdStRegOp(*StoreI);
  unsigned StRt = getLdStRegOp(*StoreI).getReg();
  bool IsStoreXReg = TRI->getRegClass(AArch64::GPR64RegClassID)->contains(StRt);

  assert((IsStoreXReg ||
          TRI->getRegClass(AArch64::GPR32RegClassID)->contains(StRt)) &&
         "Unexpected RegClass");

  MachineInstr *BitExtMI;
  if (LoadSize == StoreSize && (LoadSize == 4 || LoadSize == 8)) {
    // Remove the load if its destination register is the same as the stored
    // value's register.
    if (StRt == LdRt && LoadSize == 8) {
      for (MachineInstr &MI : make_range(StoreI->getIterator(),
                                         LoadI->getIterator())) {
        if (MI.killsRegister(StRt, TRI)) {
          MI.clearRegisterKills(StRt, TRI);
          break;
        }
      }
      DEBUG(dbgs() << "Remove load instruction:\n    ");
      DEBUG(LoadI->print(dbgs()));
      DEBUG(dbgs() << "\n");
      LoadI->eraseFromParent();
      return NextI;
    }
    // Replace the load with a mov if the load and store are the same size.
    BitExtMI =
        BuildMI(*LoadI->getParent(), LoadI, LoadI->getDebugLoc(),
                TII->get(IsStoreXReg ? AArch64::ORRXrs : AArch64::ORRWrs), LdRt)
            .addReg(IsStoreXReg ? AArch64::XZR : AArch64::WZR)
            .add(StMO)
            .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0));
  } else {
    // FIXME: Currently we disable this transformation in big-endian targets as
    // performance and correctness are verified only in little-endian.
    if (!Subtarget->isLittleEndian())
      return NextI;
    bool IsUnscaled = TII->isUnscaledLdSt(*LoadI);
    assert(IsUnscaled == TII->isUnscaledLdSt(*StoreI) &&
           "Unsupported ld/st match");
    assert(LoadSize <= StoreSize && "Invalid load size");
    int UnscaledLdOffset = IsUnscaled
                               ? getLdStOffsetOp(*LoadI).getImm()
                               : getLdStOffsetOp(*LoadI).getImm() * LoadSize;
    int UnscaledStOffset = IsUnscaled
                               ? getLdStOffsetOp(*StoreI).getImm()
                               : getLdStOffsetOp(*StoreI).getImm() * StoreSize;
    int Width = LoadSize * 8;
    int Immr = 8 * (UnscaledLdOffset - UnscaledStOffset);
    int Imms = Immr + Width - 1;
    unsigned DestReg = IsStoreXReg
                           ? TRI->getMatchingSuperReg(LdRt, AArch64::sub_32,
                                                      &AArch64::GPR64RegClass)
                           : LdRt;

    assert((UnscaledLdOffset >= UnscaledStOffset &&
            (UnscaledLdOffset + LoadSize) <= UnscaledStOffset + StoreSize) &&
           "Invalid offset");

    Immr = 8 * (UnscaledLdOffset - UnscaledStOffset);
    Imms = Immr + Width - 1;
    if (UnscaledLdOffset == UnscaledStOffset) {
      uint32_t AndMaskEncoded = ((IsStoreXReg ? 1 : 0) << 12) // N
                                | ((Immr) << 6)               // immr
                                | ((Imms) << 0)               // imms
          ;

      BitExtMI =
          BuildMI(*LoadI->getParent(), LoadI, LoadI->getDebugLoc(),
                  TII->get(IsStoreXReg ? AArch64::ANDXri : AArch64::ANDWri),
                  DestReg)
              .add(StMO)
              .addImm(AndMaskEncoded);
    } else {
      BitExtMI =
          BuildMI(*LoadI->getParent(), LoadI, LoadI->getDebugLoc(),
                  TII->get(IsStoreXReg ? AArch64::UBFMXri : AArch64::UBFMWri),
                  DestReg)
              .add(StMO)
              .addImm(Immr)
              .addImm(Imms);
    }
  }

  // Clear kill flags between store and load.
  for (MachineInstr &MI : make_range(StoreI->getIterator(),
                                     BitExtMI->getIterator()))
    if (MI.killsRegister(StRt, TRI)) {
      MI.clearRegisterKills(StRt, TRI);
      break;
    }

  DEBUG(dbgs() << "Promoting load by replacing :\n    ");
  DEBUG(StoreI->print(dbgs()));
  DEBUG(dbgs() << "    ");
  DEBUG(LoadI->print(dbgs()));
  DEBUG(dbgs() << "  with instructions:\n    ");
  DEBUG(StoreI->print(dbgs()));
  DEBUG(dbgs() << "    ");
  DEBUG((BitExtMI)->print(dbgs()));
  DEBUG(dbgs() << "\n");

  // Erase the old instructions.
  LoadI->eraseFromParent();
  return NextI;
}
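
// Illustrative transformation (hypothetical registers): a load that reads a
// subrange of the bytes just stored can be computed from the stored register
// instead, e.g. on a little-endian target
//   str  w1, [x0]
//   ldrh w2, [x0, #2]
// becomes
//   str  w1, [x0]
//   ubfm w2, w1, #16, #31   ; i.e. lsr w2, w1, #16
// (Width = 16, Immr = 8 * (2 - 0) = 16, Imms = 16 + 16 - 1 = 31.)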

/// trackRegDefsUses - Remember what registers the specified instruction uses
/// and modifies.
static void trackRegDefsUses(const MachineInstr &MI, BitVector &ModifiedRegs,
                             BitVector &UsedRegs,
                             const TargetRegisterInfo *TRI) {
  for (const MachineOperand &MO : MI.operands()) {
    if (MO.isRegMask())
      ModifiedRegs.setBitsNotInMask(MO.getRegMask());

    if (!MO.isReg())
      continue;
    unsigned Reg = MO.getReg();
    if (!Reg)
      continue;
    if (MO.isDef()) {
      // WZR/XZR are not modified even when used as a destination register.
      if (Reg != AArch64::WZR && Reg != AArch64::XZR)
        for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid(); ++AI)
          ModifiedRegs.set(*AI);
    } else {
      assert(MO.isUse() && "Reg operand not a def and not a use?!?");
      for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid(); ++AI)
        UsedRegs.set(*AI);
    }
  }
}

static bool inBoundsForPair(bool IsUnscaled, int Offset, int OffsetStride) {
  // Convert the byte-offset used by unscaled into an "element" offset used
  // by the scaled pair load/store instructions.
  if (IsUnscaled) {
    // If the byte-offset isn't a multiple of the stride, there's no point
    // trying to match it.
    if (Offset % OffsetStride)
      return false;
    Offset /= OffsetStride;
  }
  return Offset <= 63 && Offset >= -64;
}
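
// Example of the range check: LDP/STP immediates are 7-bit signed, scaled by
// the element size. For 8-byte elements that is element offsets -64..63 as
// tested above, i.e. byte offsets -512..504.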

// Do alignment, specialized to power of 2 and for signed ints,
// avoiding having to do a C-style cast from uint64_t to int when
// using alignTo from include/llvm/Support/MathExtras.h.
// FIXME: Move this function to include/MathExtras.h?
static int alignTo(int Num, int PowOf2) {
  return (Num + PowOf2 - 1) & ~(PowOf2 - 1);
}
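
// E.g. alignTo(5, 4) == 8 and alignTo(-3, 4) == 0; callers use it to test
// whether an offset is already aligned (alignTo(Off, N) == Off).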

static bool mayAlias(MachineInstr &MIa, MachineInstr &MIb,
                     AliasAnalysis *AA) {
  // One of the instructions must modify memory.
  if (!MIa.mayStore() && !MIb.mayStore())
    return false;

  // Both instructions must be memory operations.
  if (!MIa.mayLoadOrStore() && !MIb.mayLoadOrStore())
    return false;

  return MIa.mayAlias(AA, MIb, /*UseTBAA*/false);
}

static bool mayAlias(MachineInstr &MIa,
                     SmallVectorImpl<MachineInstr *> &MemInsns,
                     AliasAnalysis *AA) {
  for (MachineInstr *MIb : MemInsns)
    if (mayAlias(MIa, *MIb, AA))
      return true;

  return false;
}

bool AArch64LoadStoreOpt::findMatchingStore(
    MachineBasicBlock::iterator I, unsigned Limit,
    MachineBasicBlock::iterator &StoreI) {
  MachineBasicBlock::iterator B = I->getParent()->begin();
  MachineBasicBlock::iterator MBBI = I;
  MachineInstr &LoadMI = *I;
  unsigned BaseReg = getLdStBaseOp(LoadMI).getReg();

  // If the load is the first instruction in the block, there's obviously
  // not any matching store.
  if (MBBI == B)
    return false;

  // Track which registers have been modified and used between the first insn
  // and the second insn.
  ModifiedRegs.reset();
  UsedRegs.reset();

  unsigned Count = 0;
  do {
    --MBBI;
    MachineInstr &MI = *MBBI;

    // Don't count transient instructions towards the search limit since there
    // may be different numbers of them if e.g. debug information is present.
    if (!MI.isTransient())
      ++Count;

    // If the load instruction reads directly from the address to which the
    // store instruction writes and the stored value is not modified, we can
    // promote the load. Since we do not handle stores with pre-/post-index,
    // it's unnecessary to check if BaseReg is modified by the store itself.
    if (MI.mayStore() && isMatchingStore(LoadMI, MI) &&
        BaseReg == getLdStBaseOp(MI).getReg() &&
        isLdOffsetInRangeOfSt(LoadMI, MI, TII) &&
        !ModifiedRegs[getLdStRegOp(MI).getReg()]) {
      StoreI = MBBI;
      return true;
    }

    if (MI.isCall())
      return false;

    // Update modified / uses register lists.
    trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI);

    // Otherwise, if the base register is modified, we have no match, so
    // return early.
    if (ModifiedRegs[BaseReg])
      return false;

    // If we encounter a store aliased with the load, return early.
    if (MI.mayStore() && mayAlias(LoadMI, MI, AA))
      return false;
  } while (MBBI != B && Count < Limit);
  return false;
}

// Returns true if FirstMI and MI are candidates for merging or pairing.
// Otherwise, returns false.
static bool areCandidatesToMergeOrPair(MachineInstr &FirstMI, MachineInstr &MI,
                                       LdStPairFlags &Flags,
                                       const AArch64InstrInfo *TII) {
  // If this is volatile or if pairing is suppressed, not a candidate.
  if (MI.hasOrderedMemoryRef() || TII->isLdStPairSuppressed(MI))
    return false;

  // We should have already checked FirstMI for pair suppression and volatility.
  assert(!FirstMI.hasOrderedMemoryRef() &&
         !TII->isLdStPairSuppressed(FirstMI) &&
         "FirstMI shouldn't get here if either of these checks are true.");

  unsigned OpcA = FirstMI.getOpcode();
  unsigned OpcB = MI.getOpcode();

  // Opcodes match: nothing more to check.
  if (OpcA == OpcB)
    return true;

  // Try to match a sign-extended load/store with a zero-extended load/store.
  bool IsValidLdStrOpc, PairIsValidLdStrOpc;
  unsigned NonSExtOpc = getMatchingNonSExtOpcode(OpcA, &IsValidLdStrOpc);
  assert(IsValidLdStrOpc &&
         "Given Opc should be a Load or Store with an immediate");
  // OpcA will be the first instruction in the pair.
  if (NonSExtOpc == getMatchingNonSExtOpcode(OpcB, &PairIsValidLdStrOpc)) {
    Flags.setSExtIdx(NonSExtOpc == (unsigned)OpcA ? 1 : 0);
    return true;
  }

  // If the second instruction isn't even a mergable/pairable load/store, bail
  // out.
  if (!PairIsValidLdStrOpc)
    return false;

  // FIXME: We don't support merging narrow stores with mixed scaled/unscaled
  // offsets.
  if (isNarrowStore(OpcA) || isNarrowStore(OpcB))
    return false;

  // Try to match an unscaled load/store with a scaled load/store.
  return TII->isUnscaledLdSt(OpcA) != TII->isUnscaledLdSt(OpcB) &&
         getMatchingPairOpcode(OpcA) == getMatchingPairOpcode(OpcB);

  // FIXME: Can we also match a mixed sext/zext unscaled/scaled pair?
}

/// Scan the instructions looking for a load/store that can be combined with the
/// current instruction into a wider equivalent or a load/store pair.
MachineBasicBlock::iterator
AArch64LoadStoreOpt::findMatchingInsn(MachineBasicBlock::iterator I,
                                      LdStPairFlags &Flags, unsigned Limit,
                                      bool FindNarrowMerge) {
  MachineBasicBlock::iterator E = I->getParent()->end();
  MachineBasicBlock::iterator MBBI = I;
  MachineInstr &FirstMI = *I;
  ++MBBI;

  bool MayLoad = FirstMI.mayLoad();
  bool IsUnscaled = TII->isUnscaledLdSt(FirstMI);
  unsigned Reg = getLdStRegOp(FirstMI).getReg();
  unsigned BaseReg = getLdStBaseOp(FirstMI).getReg();
  int Offset = getLdStOffsetOp(FirstMI).getImm();
  int OffsetStride = IsUnscaled ? getMemScale(FirstMI) : 1;
  bool IsPromotableZeroStore = isPromotableZeroStoreInst(FirstMI);

  // Track which registers have been modified and used between the first insn
  // (inclusive) and the second insn.
  ModifiedRegs.reset();
  UsedRegs.reset();

  // Remember any instructions that read/write memory between FirstMI and MI.
  SmallVector<MachineInstr *, 4> MemInsns;

  for (unsigned Count = 0; MBBI != E && Count < Limit; ++MBBI) {
    MachineInstr &MI = *MBBI;

    // Don't count transient instructions towards the search limit since there
    // may be different numbers of them if e.g. debug information is present.
    if (!MI.isTransient())
      ++Count;

    Flags.setSExtIdx(-1);
    if (areCandidatesToMergeOrPair(FirstMI, MI, Flags, TII) &&
        getLdStOffsetOp(MI).isImm()) {
      assert(MI.mayLoadOrStore() && "Expected memory operation.");
      // If we've found another instruction with the same opcode, check to see
      // if the base and offset are compatible with our starting instruction.
      // These instructions all have scaled immediate operands, so we just
      // check for +1/-1. Make sure to check the new instruction offset is
      // actually an immediate and not a symbolic reference destined for
      // a relocation.
      unsigned MIBaseReg = getLdStBaseOp(MI).getReg();
      int MIOffset = getLdStOffsetOp(MI).getImm();
      bool MIIsUnscaled = TII->isUnscaledLdSt(MI);
      if (IsUnscaled != MIIsUnscaled) {
        // We're trying to pair instructions that differ in how they are scaled.
        // If FirstMI is scaled then scale the offset of MI accordingly.
        // Otherwise, do the opposite (i.e., make MI's offset unscaled).
        int MemSize = getMemScale(MI);
        if (MIIsUnscaled) {
          // If the unscaled offset isn't a multiple of the MemSize, we can't
          // pair the operations together: bail and keep looking.
          if (MIOffset % MemSize) {
            trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI);
            MemInsns.push_back(&MI);
            continue;
          }
          MIOffset /= MemSize;
        } else {
          MIOffset *= MemSize;
        }
      }

      if (BaseReg == MIBaseReg && ((Offset == MIOffset + OffsetStride) ||
                                   (Offset + OffsetStride == MIOffset))) {
        int MinOffset = Offset < MIOffset ? Offset : MIOffset;
        if (FindNarrowMerge) {
          // If the alignment requirements of the scaled wide load/store
          // instruction can't express the offset of the scaled narrow input,
          // bail and keep looking. For promotable zero stores, allow only when
          // the stored value is the same (i.e., WZR).
          if ((!IsUnscaled && alignTo(MinOffset, 2) != MinOffset) ||
              (IsPromotableZeroStore && Reg != getLdStRegOp(MI).getReg())) {
            trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI);
            MemInsns.push_back(&MI);
            continue;
          }
        } else {
          // Pairwise instructions have a 7-bit signed offset field. Single
          // insns have a 12-bit unsigned offset field.  If the resultant
          // immediate offset of merging these instructions is out of range for
          // a pairwise instruction, bail and keep looking.
          if (!inBoundsForPair(IsUnscaled, MinOffset, OffsetStride)) {
            trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI);
            MemInsns.push_back(&MI);
            continue;
          }
          // If the alignment requirements of the paired (scaled) instruction
          // can't express the offset of the unscaled input, bail and keep
          // looking.
          if (IsUnscaled && (alignTo(MinOffset, OffsetStride) != MinOffset)) {
            trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI);
            MemInsns.push_back(&MI);
            continue;
          }
        }
        // If the destination register of the loads is the same register, bail
        // and keep looking. A load-pair instruction with both destination
        // registers the same is UNPREDICTABLE and will result in an exception.
        if (MayLoad && Reg == getLdStRegOp(MI).getReg()) {
          trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI);
          MemInsns.push_back(&MI);
          continue;
        }

        // If the Rt of the second instruction was not modified or used between
        // the two instructions and none of the instructions between the second
        // and first alias with the second, we can combine the second into the
        // first.
        if (!ModifiedRegs[getLdStRegOp(MI).getReg()] &&
            !(MI.mayLoad() && UsedRegs[getLdStRegOp(MI).getReg()]) &&
            !mayAlias(MI, MemInsns, AA)) {
          Flags.setMergeForward(false);
          return MBBI;
        }

        // Likewise, if the Rt of the first instruction is not modified or used
        // between the two instructions and none of the instructions between the
        // first and the second alias with the first, we can combine the first
        // into the second.
        if (!ModifiedRegs[getLdStRegOp(FirstMI).getReg()] &&
            !(MayLoad && UsedRegs[getLdStRegOp(FirstMI).getReg()]) &&
            !mayAlias(FirstMI, MemInsns, AA)) {
          Flags.setMergeForward(true);
          return MBBI;
        }
        // Unable to combine these instructions due to interference in between.
        // Keep looking.
      }
    }

    // If the instruction wasn't a matching load or store, stop searching if we
    // encounter a call instruction that might modify memory.
    if (MI.isCall())
      return E;

    // Update modified / uses register lists.
    trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI);

    // Otherwise, if the base register is modified, we have no match, so
    // return early.
    if (ModifiedRegs[BaseReg])
      return E;

    // Update list of instructions that read/write memory.
    if (MI.mayLoadOrStore())
      MemInsns.push_back(&MI);
  }
  return E;
}
1252
1253
MachineBasicBlock::iterator
1254
AArch64LoadStoreOpt::mergeUpdateInsn(MachineBasicBlock::iterator I,
1255
                                     MachineBasicBlock::iterator Update,
1256
3.48k
                                     bool IsPreIdx) {
1257
3.48k
  assert((Update->getOpcode() == AArch64::ADDXri ||
1258
3.48k
          Update->getOpcode() == AArch64::SUBXri) &&
1259
3.48k
         "Unexpected base register update instruction to merge!");
1260
3.48k
  MachineBasicBlock::iterator NextI = I;
1261
3.48k
  // Return the instruction following the merged instruction, which is
1262
3.48k
  // the instruction following our unmerged load. Unless that's the add/sub
1263
3.48k
  // instruction we're merging, in which case it's the one after that.
1264
3.48k
  if (++NextI == Update)
1265
970
    ++NextI;
1266
3.48k
1267
3.48k
  int Value = Update->getOperand(2).getImm();
1268
3.48k
  assert(AArch64_AM::getShiftValue(Update->getOperand(3).getImm()) == 0 &&
1269
3.48k
         "Can't merge 1 << 12 offset into pre-/post-indexed load / store");
1270
3.48k
  if (Update->getOpcode() == AArch64::SUBXri)
1271
327
    Value = -Value;
1272
3.48k
1273
348
  unsigned NewOpc = IsPreIdx ? getPreIndexedOpcode(I->getOpcode())
1274
3.13k
                             : getPostIndexedOpcode(I->getOpcode());
1275
3.48k
  MachineInstrBuilder MIB;
1276
3.48k
  if (!isPairedLdSt(*I)) {
1277
2.64k
    // Non-paired instruction.
1278
2.64k
    MIB = BuildMI(*I->getParent(), I, I->getDebugLoc(), TII->get(NewOpc))
1279
2.64k
              .add(getLdStRegOp(*Update))
1280
2.64k
              .add(getLdStRegOp(*I))
1281
2.64k
              .add(getLdStBaseOp(*I))
1282
2.64k
              .addImm(Value)
1283
2.64k
              .setMemRefs(I->memoperands_begin(), I->memoperands_end());
1284
3.48k
  } else {
1285
834
    // Paired instruction.
1286
834
    int Scale = getMemScale(*I);
1287
834
    MIB = BuildMI(*I->getParent(), I, I->getDebugLoc(), TII->get(NewOpc))
1288
834
              .add(getLdStRegOp(*Update))
1289
834
              .add(getLdStRegOp(*I, 0))
1290
834
              .add(getLdStRegOp(*I, 1))
1291
834
              .add(getLdStBaseOp(*I))
1292
834
              .addImm(Value / Scale)
1293
834
              .setMemRefs(I->memoperands_begin(), I->memoperands_end());
1294
834
  }
1295
3.48k
  (void)MIB;
1296
3.48k
1297
3.48k
  if (IsPreIdx)
1298
3.48k
    DEBUG(dbgs() << "Creating pre-indexed load/store.");
1299
3.48k
  else
1300
3.48k
    DEBUG(dbgs() << "Creating post-indexed load/store.");
1301
3.48k
  DEBUG(dbgs() << "    Replacing instructions:\n    ");
1302
3.48k
  DEBUG(I->print(dbgs()));
1303
3.48k
  DEBUG(dbgs() << "    ");
1304
3.48k
  DEBUG(Update->print(dbgs()));
1305
3.48k
  DEBUG(dbgs() << "  with instruction:\n    ");
1306
3.48k
  DEBUG(((MachineInstr *)MIB)->print(dbgs()));
1307
3.48k
  DEBUG(dbgs() << "\n");
1308
3.48k
1309
3.48k
  // Erase the old instructions for the block.
1310
3.48k
  I->eraseFromParent();
1311
3.48k
  Update->eraseFromParent();
1312
3.48k
1313
3.48k
  return NextI;
1314
3.48k
}
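
The immediate handling in mergeUpdateInsn is worth isolating: a SUBXri update contributes a negative step, and paired forms encode the writeback in units of the element size. A minimal sketch, assuming alignment was already verified by isMatchingUpdateInsn (the helper name is ours, not the pass's):

  #include <cassert>

  int writebackImmediate(bool IsSub, int UpdateImm, bool IsPaired, int Scale) {
    int Value = IsSub ? -UpdateImm : UpdateImm; // SUBXri steps backwards
    if (!IsPaired)
      return Value;       // LDR/STR writeback takes the raw byte offset
    assert(Value % Scale == 0 && "paired writeback offset must be scaled");
    return Value / Scale; // LDP/STP writeback is scaled by element size
  }
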
1315
1316
bool AArch64LoadStoreOpt::isMatchingUpdateInsn(MachineInstr &MemMI,
1317
                                               MachineInstr &MI,
1318
14.1M
                                               unsigned BaseReg, int Offset) {
1319
14.1M
  switch (MI.getOpcode()) {
1320
13.2M
  default:
1321
13.2M
    break;
1322
909k
  case AArch64::SUBXri:
1323
909k
  case AArch64::ADDXri:
1324
909k
    // Make sure it's a vanilla immediate operand, not a relocation or
1325
909k
    // anything else we can't handle.
1326
909k
    if (!MI.getOperand(2).isImm())
1327
329k
      break;
1328
580k
    // Watch out for 1 << 12 shifted value.
1329
580k
    if (AArch64_AM::getShiftValue(MI.getOperand(3).getImm()))
1330
1.58k
      break;
1331
578k
1332
578k
    // The update instruction source and destination register must be the
1333
578k
    // same as the load/store base register.
1334
578k
    if (MI.getOperand(0).getReg() != BaseReg ||
1335
90.0k
        MI.getOperand(1).getReg() != BaseReg)
1336
500k
      break;
1337
78.3k
1338
78.3k
    bool IsPairedInsn = isPairedLdSt(MemMI);
1339
78.3k
    int UpdateOffset = MI.getOperand(2).getImm();
1340
78.3k
    if (MI.getOpcode() == AArch64::SUBXri)
1341
3.93k
      UpdateOffset = -UpdateOffset;
1342
78.3k
1343
78.3k
    // For non-paired load/store instructions, the immediate must fit in a
1344
78.3k
    // signed 9-bit integer.
1345
78.3k
    if (!IsPairedInsn && (UpdateOffset > 255 || UpdateOffset < -256))
1346
1.76k
      break;
1347
76.5k
1348
76.5k
    // For paired load/store instructions, the immediate must be a multiple of
1349
76.5k
    // the scaling factor.  The scaled offset must also fit into a signed 7-bit
1350
76.5k
    // integer.
1351
76.5k
    if (IsPairedInsn) {
1352
69.0k
      int Scale = getMemScale(MemMI);
1353
69.0k
      if (UpdateOffset % Scale != 0)
1354
8
        break;
1355
69.0k
1356
69.0k
      int ScaledOffset = UpdateOffset / Scale;
1357
69.0k
      if (ScaledOffset > 63 || ScaledOffset < -64)
1358
73
        break;
1359
76.4k
    }
1360
76.4k
1361
76.4k
    // If we have a non-zero Offset, we check that it matches the amount
1362
76.4k
    // we're adding to the register.
1363
76.4k
    if (!Offset || Offset == UpdateOffset)
1364
3.48k
      return true;
1365
72.9k
    break;
1366
14.1M
  }
1367
14.1M
  return false;
1368
14.1M
}
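
The offset legality rules above collapse into one predicate: unpaired pre/post-index forms accept a signed 9-bit byte offset, while paired forms accept a signed 7-bit offset in units of the access size. A standalone restatement (function name is ours):

  bool isLegalWritebackOffset(bool IsPaired, int Offset, int Scale) {
    if (!IsPaired)
      return Offset >= -256 && Offset <= 255; // simm9 byte offset
    if (Offset % Scale != 0)
      return false;                           // must be scale-aligned
    int Scaled = Offset / Scale;
    return Scaled >= -64 && Scaled <= 63;     // simm7 scaled offset
  }
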
1369
1370
MachineBasicBlock::iterator AArch64LoadStoreOpt::findMatchingUpdateInsnForward(
1371
9.33M
    MachineBasicBlock::iterator I, int UnscaledOffset, unsigned Limit) {
1372
9.33M
  MachineBasicBlock::iterator E = I->getParent()->end();
1373
9.33M
  MachineInstr &MemMI = *I;
1374
9.33M
  MachineBasicBlock::iterator MBBI = I;
1375
9.33M
1376
9.33M
  unsigned BaseReg = getLdStBaseOp(MemMI).getReg();
1377
9.33M
  int MIUnscaledOffset = getLdStOffsetOp(MemMI).getImm() * getMemScale(MemMI);
1378
9.33M
1379
9.33M
  // Scan forward looking for post-index opportunities.  Updating instructions
1380
9.33M
  // can't be formed if the memory instruction doesn't have the offset we're
1381
9.33M
  // looking for.
1382
9.33M
  if (MIUnscaledOffset != UnscaledOffset)
1383
3.95M
    return E;
1384
5.37M
1385
5.37M
  // If the base register overlaps a destination register, we can't
1386
5.37M
  // merge the update.
1387
5.37M
  bool IsPairedInsn = isPairedLdSt(MemMI);
1388
12.3M
  for (unsigned i = 0, e = IsPairedInsn ? 2 : 1; i != e; ++i) {
1389
7.22M
    unsigned DestReg = getLdStRegOp(MemMI, i).getReg();
1390
7.22M
    if (DestReg == BaseReg || TRI->isSubRegister(BaseReg, DestReg))
1391
234k
      return E;
1392
7.22M
  }
1393
5.37M
1394
5.37M
  // Track which registers have been modified and used between the first insn
1395
5.37M
  // (inclusive) and the second insn.
1396
5.13M
  ModifiedRegs.reset();
1397
5.13M
  UsedRegs.reset();
1398
5.13M
  ++MBBI;
1399
14.4M
  for (unsigned Count = 0; MBBI != E && Count < Limit; ++MBBI) {
1400
12.5M
    MachineInstr &MI = *MBBI;
1401
12.5M
1402
12.5M
    // Don't count transient instructions towards the search limit since there
1403
12.5M
    // may be different numbers of them if e.g. debug information is present.
1404
12.5M
    if (!MI.isTransient())
1405
12.5M
      ++Count;
1406
12.5M
1407
12.5M
    // If we found a match, return it.
1408
12.5M
    if (isMatchingUpdateInsn(*I, MI, BaseReg, UnscaledOffset))
1409
3.19k
      return MBBI;
1410
12.5M
1411
12.5M
    // Update the status of what the instruction clobbered and used.
1412
12.5M
    trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI);
1413
12.5M
1414
12.5M
    // Otherwise, if the base register is used or modified, we have no match, so
1415
12.5M
    // return early.
1416
12.5M
    if (
ModifiedRegs[BaseReg] || 12.5M
UsedRegs[BaseReg]11.8M
)
1417
3.27M
      return E;
1418
12.5M
  }
1419
1.85M
  return E;
1420
9.33M
}
1421
1422
MachineBasicBlock::iterator AArch64LoadStoreOpt::findMatchingUpdateInsnBackward(
1423
4.57M
    MachineBasicBlock::iterator I, unsigned Limit) {
1424
4.57M
  MachineBasicBlock::iterator B = I->getParent()->begin();
1425
4.57M
  MachineBasicBlock::iterator E = I->getParent()->end();
1426
4.57M
  MachineInstr &MemMI = *I;
1427
4.57M
  MachineBasicBlock::iterator MBBI = I;
1428
4.57M
1429
4.57M
  unsigned BaseReg = getLdStBaseOp(MemMI).getReg();
1430
4.57M
  int Offset = getLdStOffsetOp(MemMI).getImm();
1431
4.57M
1432
4.57M
  // If the load/store is the first instruction in the block, there's obviously
1433
4.57M
  // not any matching update. Ditto if the memory offset isn't zero.
1434
4.57M
  if (MBBI == B || Offset != 0)
1435
3.96M
    return E;
1436
612k
  // If the base register overlaps a destination register, we can't
1437
612k
  // merge the update.
1438
612k
  bool IsPairedInsn = isPairedLdSt(MemMI);
1439
1.21M
  for (unsigned i = 0, e = IsPairedInsn ? 2 : 1; i != e; ++i) {
1440
668k
    unsigned DestReg = getLdStRegOp(MemMI, i).getReg();
1441
668k
    if (DestReg == BaseReg || TRI->isSubRegister(BaseReg, DestReg))
1442
62.8k
      return E;
1443
668k
  }
1444
612k
1445
612k
  // Track which registers have been modified and used between the first insn
1446
612k
  // (inclusive) and the second insn.
1447
549k
  ModifiedRegs.reset();
1448
549k
  UsedRegs.reset();
1449
549k
  unsigned Count = 0;
1450
1.59M
  do {
1451
1.59M
    --MBBI;
1452
1.59M
    MachineInstr &MI = *MBBI;
1453
1.59M
1454
1.59M
    // Don't count transient instructions towards the search limit since there
1455
1.59M
    // may be different numbers of them if e.g. debug information is present.
1456
1.59M
    if (!MI.isTransient())
1457
1.58M
      ++Count;
1458
1.59M
1459
1.59M
    // If we found a match, return it.
1460
1.59M
    if (isMatchingUpdateInsn(*I, MI, BaseReg, Offset))
1461
293
      return MBBI;
1462
1.59M
1463
1.59M
    // Update the status of what the instruction clobbered and used.
1464
1.59M
    trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI);
1465
1.59M
1466
1.59M
    // Otherwise, if the base register is used or modified, we have no match, so
1467
1.59M
    // return early.
1468
1.59M
    if (ModifiedRegs[BaseReg] || UsedRegs[BaseReg])
1469
408k
      return E;
1470
1.18M
  } while (MBBI != B && Count < Limit);
1471
141k
  return E;
1472
4.57M
}
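
Both update walks share one skeleton: only non-transient instructions consume the search budget, a match ends the walk successfully, and any def or use of the base register ends it unsuccessfully. A simplified, self-contained restatement (the Insn struct stands in for the real MachineInstr queries):

  #include <vector>

  struct Insn {
    bool Transient;   // e.g. debug info; exempt from the limit
    bool Matches;     // isMatchingUpdateInsn would accept it
    bool TouchesBase; // defines or uses the base register
  };

  // Returns the index of a matching update instruction, or -1 if none.
  int scanForUpdate(const std::vector<Insn> &Insns, unsigned Limit) {
    unsigned Count = 0;
    for (unsigned i = 0; i < Insns.size() && Count < Limit; ++i) {
      if (!Insns[i].Transient)
        ++Count;
      if (Insns[i].Matches)
        return static_cast<int>(i);
      if (Insns[i].TouchesBase)
        return -1; // base register interference; give up
    }
    return -1;
  }
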
1473
1474
bool AArch64LoadStoreOpt::tryToPromoteLoadFromStore(
1475
2.20M
    MachineBasicBlock::iterator &MBBI) {
1476
2.20M
  MachineInstr &MI = *MBBI;
1477
2.20M
  // If this is a volatile load, don't mess with it.
1478
2.20M
  if (MI.hasOrderedMemoryRef())
1479
305k
    return false;
1480
1.89M
1481
1.89M
  // Make sure this is a reg+imm.
1482
1.89M
  // FIXME: It is possible to extend it to handle reg+reg cases.
1483
1.89M
  if (!getLdStOffsetOp(MI).isImm())
1484
17.6k
    return false;
1485
1.88M
1486
1.88M
  // Look backward up to LdStLimit instructions.
1487
1.88M
  MachineBasicBlock::iterator StoreI;
1488
1.88M
  if (findMatchingStore(MBBI, LdStLimit, StoreI)) {
1489
141
    ++NumLoadsFromStoresPromoted;
1490
141
    // Promote the load. Keeping the iterator straight is a
1491
141
    // pain, so we let the merge routine tell us what the next instruction
1492
141
    // is after it's done mucking about.
1493
141
    MBBI = promoteLoadFromStore(MBBI, StoreI);
1494
141
    return true;
1495
141
  }
1496
1.88M
  return false;
1497
1.88M
}
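
When the matched store is wider than the load, promotion turns the load into a bitfield extract of the stored register; on little-endian AArch64 the shift amount is simply the byte distance times eight. A sketch of that arithmetic only (our helper, not the pass's API; the real work happens in promoteLoadFromStore, which handles many more cases):

  // str w1, [x0, #4] ; ldrh w2, [x0, #6]  =>  lsr w2, w1, #16
  int forwardedShiftAmount(int StoreByteOffset, int LoadByteOffset) {
    return (LoadByteOffset - StoreByteOffset) * 8;
  }
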
1498
1499
// Merge adjacent zero stores into a wider store.
1500
bool AArch64LoadStoreOpt::tryToMergeZeroStInst(
1501
127k
    MachineBasicBlock::iterator &MBBI) {
1502
127k
  assert(isPromotableZeroStoreInst(*MBBI) && "Expected narrow store.");
1503
127k
  MachineInstr &MI = *MBBI;
1504
127k
  MachineBasicBlock::iterator E = MI.getParent()->end();
1505
127k
1506
127k
  if (!TII->isCandidateToMergeOrPair(MI))
1507
503
    return false;
1508
126k
1509
126k
  // Look ahead up to LdStLimit instructions for a mergeable instruction.
1510
126k
  LdStPairFlags Flags;
1511
126k
  MachineBasicBlock::iterator MergeMI =
1512
126k
      findMatchingInsn(MBBI, Flags, LdStLimit, /* FindNarrowMerge = */ true);
1513
126k
  if (MergeMI != E) {
1514
6.37k
    ++NumZeroStoresPromoted;
1515
6.37k
1516
6.37k
    // Keeping the iterator straight is a pain, so we let the merge routine tell
1517
6.37k
    // us what the next instruction is after it's done mucking about.
1518
6.37k
    MBBI = mergeNarrowZeroStores(MBBI, MergeMI, Flags);
1519
6.37k
    return true;
1520
6.37k
  }
1521
120k
  return false;
1522
120k
}
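
The merge above only fires for zero stores covering back-to-back byte ranges, so a single store of twice the width (strh wzr -> str wzr, str wzr -> str xzr) can replace both. The adjacency requirement, reduced to offsets (simplified; findMatchingInsn also enforces register and aliasing constraints):

  bool zeroStoresAdjacent(int OffA, int OffB, int SizeBytes) {
    int Delta = OffB - OffA;
    return Delta == SizeBytes || Delta == -SizeBytes;
  }
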
1523
1524
// Find loads and stores that can be merged into a single load or store pair
1525
// instruction.
1526
3.15M
bool AArch64LoadStoreOpt::tryToPairLdStInst(MachineBasicBlock::iterator &MBBI) {
1527
3.15M
  MachineInstr &MI = *MBBI;
1528
3.15M
  MachineBasicBlock::iterator E = MI.getParent()->end();
1529
3.15M
1530
3.15M
  if (!TII->isCandidateToMergeOrPair(MI))
1531
617k
    return false;
1532
2.53M
1533
2.53M
  // Early exit if the offset is not possible to match. (6 bits of positive
1534
2.53M
  // range, plus allow an extra one in case we find a later insn that matches
1535
2.53M
  // with Offset-1)
1536
2.53M
  bool IsUnscaled = TII->isUnscaledLdSt(MI);
1537
2.53M
  int Offset = getLdStOffsetOp(MI).getImm();
1538
2.53M
  int OffsetStride = IsUnscaled ? getMemScale(MI) : 1;
1539
2.53M
  // Allow one more for offset.
1540
2.53M
  if (Offset > 0)
1541
1.79M
    Offset -= OffsetStride;
1542
2.53M
  if (!inBoundsForPair(IsUnscaled, Offset, OffsetStride))
1543
272k
    return false;
1544
2.26M
1545
2.26M
  // Look ahead up to LdStLimit instructions for a pairable instruction.
1546
2.26M
  LdStPairFlags Flags;
1547
2.26M
  MachineBasicBlock::iterator Paired =
1548
2.26M
      findMatchingInsn(MBBI, Flags, LdStLimit, /* FindNarrowMerge = */ false);
1549
2.26M
  if (Paired != E) {
1550
354k
    ++NumPairCreated;
1551
354k
    if (TII->isUnscaledLdSt(MI))
1552
34.2k
      ++NumUnscaledPairCreated;
1553
354k
    // Keeping the iterator straight is a pain, so we let the merge routine tell
1554
354k
    // us what the next instruction is after it's done mucking about.
1555
354k
    MBBI = mergePairedInsns(MBBI, Paired, Flags);
1556
354k
    return true;
1557
354k
  }
1558
1.91M
  return false;
1559
1.91M
}
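
The early exit above leans on paired immediates being signed 7-bit element offsets, so an unscaled byte offset must first be converted to elements. A sketch that mirrors the intent of inBoundsForPair (defined earlier in this file), not its exact text:

  bool pairOffsetInBounds(bool IsUnscaled, int Offset, int OffsetStride) {
    if (IsUnscaled) {
      if (Offset % OffsetStride != 0)
        return false;         // misaligned byte offset can never pair
      Offset /= OffsetStride; // bytes -> elements
    }
    return Offset >= -64 && Offset <= 63; // simm7 element offset
  }
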
1560
1561
bool AArch64LoadStoreOpt::optimizeBlock(MachineBasicBlock &MBB,
1562
3.72M
                                        bool EnableNarrowZeroStOpt) {
1563
3.72M
  bool Modified = false;
1564
3.72M
  // Four transformations to do here:
1565
3.72M
  // 1) Find loads that directly read from stores and promote them by
1566
3.72M
  //    replacing with mov instructions. If the store is wider than the load,
1567
3.72M
  //    the load will be replaced with a bitfield extract.
1568
3.72M
  //      e.g.,
1569
3.72M
  //        str w1, [x0, #4]
1570
3.72M
  //        ldrh w2, [x0, #6]
1571
3.72M
  //        ; becomes
1572
3.72M
  //        str w1, [x0, #4]
1573
3.72M
  //        lsr w2, w1, #16
1574
3.72M
  for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
1575
27.3M
       MBBI != E;) {
1576
23.5M
    MachineInstr &MI = *MBBI;
1577
23.5M
    switch (MI.getOpcode()) {
1578
21.3M
    default:
1579
21.3M
      // Just move on to the next instruction.
1580
21.3M
      ++MBBI;
1581
21.3M
      break;
1582
23.5M
    // Scaled instructions.
1583
2.20M
    case AArch64::LDRBBui:
1584
2.20M
    case AArch64::LDRHHui:
1585
2.20M
    case AArch64::LDRWui:
1586
2.20M
    case AArch64::LDRXui:
1587
2.20M
    // Unscaled instructions.
1588
2.20M
    case AArch64::LDURBBi:
1589
2.20M
    case AArch64::LDURHHi:
1590
2.20M
    case AArch64::LDURWi:
1591
2.20M
    case AArch64::LDURXi:
1592
2.20M
      if (tryToPromoteLoadFromStore(MBBI)) {
1593
141
        Modified = true;
1594
141
        break;
1595
141
      }
1596
2.20M
      ++MBBI;
1597
2.20M
      break;
1598
23.5M
    }
1599
23.5M
  }
1600
3.72M
  // 2) Merge adjacent zero stores into a wider store.
1601
3.72M
  //      e.g.,
1602
3.72M
  //        strh wzr, [x0]
1603
3.72M
  //        strh wzr, [x0, #2]
1604
3.72M
  //        ; becomes
1605
3.72M
  //        str wzr, [x0]
1606
3.72M
  //      e.g.,
1607
3.72M
  //        str wzr, [x0]
1608
3.72M
  //        str wzr, [x0, #4]
1609
3.72M
  //        ; becomes
1610
3.72M
  //        str xzr, [x0]
1611
3.72M
  for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
1612
27.2M
       EnableNarrowZeroStOpt && MBBI != E;) {
1613
23.5M
    if (isPromotableZeroStoreInst(*MBBI)) {
1614
127k
      if (tryToMergeZeroStInst(MBBI)) {
1615
6.37k
        Modified = true;
1616
6.37k
      } else
1617
120k
        ++MBBI;
1618
127k
    } else
1619
23.4M
      ++MBBI;
1620
23.5M
  }
1621
3.72M
1622
3.72M
  // 3) Find loads and stores that can be merged into a single load or store
1623
3.72M
  //    pair instruction.
1624
3.72M
  //      e.g.,
1625
3.72M
  //        ldr x0, [x2]
1626
3.72M
  //        ldr x1, [x2, #8]
1627
3.72M
  //        ; becomes
1628
3.72M
  //        ldp x0, x1, [x2]
1629
3.72M
  for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
1630
26.9M
       MBBI != E;) {
1631
23.2M
    if (TII->isPairableLdStInst(*MBBI) && tryToPairLdStInst(MBBI))
1632
354k
      Modified = true;
1633
23.2M
    else
1634
22.8M
      ++MBBI;
1635
23.2M
  }
1636
3.72M
  // 4) Find base register updates that can be merged into the load or store
1637
3.72M
  //    as a base-reg writeback.
1638
3.72M
  //      e.g.,
1639
3.72M
  //        ldr x0, [x2]
1640
3.72M
  //        add x2, x2, #4
1641
3.72M
  //        ; becomes
1642
3.72M
  //        ldr x0, [x2], #4
1643
3.72M
  for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
1644
26.9M
       MBBI != E;) {
1645
23.2M
    MachineInstr &MI = *MBBI;
1646
23.2M
    // Do update merging. It's simpler to keep this separate from the above
1647
23.2M
    // switches, though not strictly necessary.
1648
23.2M
    unsigned Opc = MI.getOpcode();
1649
23.2M
    switch (Opc) {
1650
18.0M
    default:
1651
18.0M
      // Just move on to the next instruction.
1652
18.0M
      ++MBBI;
1653
18.0M
      break;
1654
23.2M
    // Scaled instructions.
1655
5.16M
    case AArch64::STRSui:
1656
5.16M
    case AArch64::STRDui:
1657
5.16M
    case AArch64::STRQui:
1658
5.16M
    case AArch64::STRXui:
1659
5.16M
    case AArch64::STRWui:
1660
5.16M
    case AArch64::STRHHui:
1661
5.16M
    case AArch64::STRBBui:
1662
5.16M
    case AArch64::LDRSui:
1663
5.16M
    case AArch64::LDRDui:
1664
5.16M
    case AArch64::LDRQui:
1665
5.16M
    case AArch64::LDRXui:
1666
5.16M
    case AArch64::LDRWui:
1667
5.16M
    case AArch64::LDRHHui:
1668
5.16M
    case AArch64::LDRBBui:
1669
5.16M
    // Unscaled instructions.
1670
5.16M
    case AArch64::STURSi:
1671
5.16M
    case AArch64::STURDi:
1672
5.16M
    case AArch64::STURQi:
1673
5.16M
    case AArch64::STURWi:
1674
5.16M
    case AArch64::STURXi:
1675
5.16M
    case AArch64::LDURSi:
1676
5.16M
    case AArch64::LDURDi:
1677
5.16M
    case AArch64::LDURQi:
1678
5.16M
    case AArch64::LDURWi:
1679
5.16M
    case AArch64::LDURXi:
1680
5.16M
    // Paired instructions.
1681
5.16M
    case AArch64::LDPSi:
1682
5.16M
    case AArch64::LDPSWi:
1683
5.16M
    case AArch64::LDPDi:
1684
5.16M
    case AArch64::LDPQi:
1685
5.16M
    case AArch64::LDPWi:
1686
5.16M
    case AArch64::LDPXi:
1687
5.16M
    case AArch64::STPSi:
1688
5.16M
    case AArch64::STPDi:
1689
5.16M
    case AArch64::STPQi:
1690
5.16M
    case AArch64::STPWi:
1691
5.16M
    case AArch64::STPXi: {
1692
5.16M
      // Make sure this is a reg+imm (as opposed to an address reloc).
1693
5.16M
      if (!getLdStOffsetOp(MI).isImm()) {
1694
415k
        ++MBBI;
1695
415k
        break;
1696
415k
      }
1697
4.75M
      // Look forward to try to form a post-index instruction. For example,
1698
4.75M
      // ldr x0, [x20]
1699
4.75M
      // add x20, x20, #32
1700
4.75M
      //   merged into:
1701
4.75M
      // ldr x0, [x20], #32
1702
4.75M
      MachineBasicBlock::iterator Update =
1703
4.75M
          findMatchingUpdateInsnForward(MBBI, 0, UpdateLimit);
1704
4.75M
      if (Update != E) {
1705
3.13k
        // Merge the update into the ld/st.
1706
3.13k
        MBBI = mergeUpdateInsn(MBBI, Update, /*IsPreIdx=*/false);
1707
3.13k
        Modified = true;
1708
3.13k
        ++NumPostFolded;
1709
3.13k
        break;
1710
3.13k
      }
1711
4.75M
1712
4.75M
      // Don't know how to handle unscaled pre/post-index versions below, so
1713
4.75M
      // move to the next instruction.
1714
4.75M
      if (TII->isUnscaledLdSt(Opc)) {
1715
171k
        ++MBBI;
1716
171k
        break;
1717
171k
      }
1718
4.57M
1719
4.57M
      // Look back to try to find a pre-index instruction. For example,
1720
4.57M
      // add x0, x0, #8
1721
4.57M
      // ldr x1, [x0]
1722
4.57M
      //   merged into:
1723
4.57M
      // ldr x1, [x0, #8]!
1724
4.57M
      Update = findMatchingUpdateInsnBackward(MBBI, UpdateLimit);
1725
4.57M
      if (Update != E) {
1726
293
        // Merge the update into the ld/st.
1727
293
        MBBI = mergeUpdateInsn(MBBI, Update, /*IsPreIdx=*/true);
1728
293
        Modified = true;
1729
293
        ++NumPreFolded;
1730
293
        break;
1731
293
      }
1732
4.57M
      // The immediate in the load/store is scaled by the size of the memory
1733
4.57M
      // operation. The immediate in the add we're looking for,
1734
4.57M
      // however, is not, so adjust here.
1735
4.57M
      int UnscaledOffset = getLdStOffsetOp(MI).getImm() * getMemScale(MI);
1736
4.57M
1737
4.57M
      // Look forward to try to find a post-index instruction. For example,
1738
4.57M
      // ldr x1, [x0, #64]
1739
4.57M
      // add x0, x0, #64
1740
4.57M
      //   merged into:
1741
4.57M
      // ldr x1, [x0, #64]!
1742
4.57M
      Update = findMatchingUpdateInsnForward(MBBI, UnscaledOffset, UpdateLimit);
1743
4.57M
      if (Update != E) {
1744
55
        // Merge the update into the ld/st.
1745
55
        MBBI = mergeUpdateInsn(MBBI, Update, /*IsPreIdx=*/true);
1746
55
        Modified = true;
1747
55
        ++NumPreFolded;
1748
55
        break;
1749
55
      }
1750
4.57M
1751
4.57M
      // Nothing found. Just move to the next instruction.
1752
4.57M
      ++MBBI;
1753
4.57M
      break;
1754
4.57M
    }
1755
23.2M
    }
1756
23.2M
  }
1757
3.72M
1758
3.72M
  return Modified;
1759
3.72M
}
1760
1761
455k
bool AArch64LoadStoreOpt::runOnMachineFunction(MachineFunction &Fn) {
1762
455k
  if (skipFunction(*Fn.getFunction()))
1763
1
    return false;
1764
455k
1765
455k
  Subtarget = &static_cast<const AArch64Subtarget &>(Fn.getSubtarget());
1766
455k
  TII = static_cast<const AArch64InstrInfo *>(Subtarget->getInstrInfo());
1767
455k
  TRI = Subtarget->getRegisterInfo();
1768
455k
  AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
1769
455k
1770
455k
  // Resize the modified and used register bitfield trackers.  We do this once
1771
455k
  // per function and then clear the bitfield each time we optimize a load or
1772
455k
  // store.
1773
455k
  ModifiedRegs.resize(TRI->getNumRegs());
1774
455k
  UsedRegs.resize(TRI->getNumRegs());
1775
455k
1776
455k
  bool Modified = false;
1777
455k
  bool enableNarrowZeroStOpt = !Subtarget->requiresStrictAlign();
1778
455k
  for (auto &MBB : Fn)
1779
3.72M
    Modified |= optimizeBlock(MBB, enableNarrowZeroStOpt);
1780
455k
1781
455k
  return Modified;
1782
455k
}
1783
1784
// FIXME: Do we need/want a pre-alloc pass like ARM has to try to keep loads and
1785
// stores near one another?  Note: The pre-RA instruction scheduler already has
1786
// hooks to try and schedule pairable loads/stores together to improve pairing
1787
// opportunities.  Thus, a pre-RA pairing pass may not be worth the effort.
1788
1789
// FIXME: When pairing store instructions it's very possible for this pass to
1790
// hoist a store with a KILL marker above another use (without a KILL marker).
1791
// The resulting IR is invalid, but nothing uses the KILL markers after this
1792
// pass, so it's never caused a problem in practice.
1793
1794
/// createAArch64LoadStoreOptimizationPass - returns an instance of the
1795
/// load / store optimization pass.
1796
13.7k
FunctionPass *llvm::createAArch64LoadStoreOptimizationPass() {
1797
13.7k
  return new AArch64LoadStoreOpt();
1798
13.7k
}