Coverage Report

Created: 2019-07-24 05:18

/Users/buildslave/jenkins/workspace/clang-stage2-coverage-R/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp
//===- AArch64LoadStoreOptimizer.cpp - AArch64 load/store opt. pass -------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file contains a pass that performs load / store related peephole
// optimizations. This pass should be run after register allocation.
//
//===----------------------------------------------------------------------===//

#include "AArch64InstrInfo.h"
#include "AArch64Subtarget.h"
#include "MCTargetDesc/AArch64AddressingModes.h"
#include "llvm/ADT/BitVector.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/iterator_range.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/TargetRegisterInfo.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/MC/MCRegisterInfo.h"
#include "llvm/Pass.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/raw_ostream.h"
#include <cassert>
#include <cstdint>
#include <iterator>
#include <limits>

using namespace llvm;

#define DEBUG_TYPE "aarch64-ldst-opt"

STATISTIC(NumPairCreated, "Number of load/store pair instructions generated");
STATISTIC(NumPostFolded, "Number of post-index updates folded");
STATISTIC(NumPreFolded, "Number of pre-index updates folded");
STATISTIC(NumUnscaledPairCreated,
          "Number of load/store from unscaled generated");
STATISTIC(NumZeroStoresPromoted, "Number of narrow zero stores promoted");
STATISTIC(NumLoadsFromStoresPromoted, "Number of loads from stores promoted");

// The LdStLimit limits how far we search for load/store pairs.
static cl::opt<unsigned> LdStLimit("aarch64-load-store-scan-limit",
                                   cl::init(20), cl::Hidden);

// The UpdateLimit limits how far we search for update instructions when we form
// pre-/post-index instructions.
static cl::opt<unsigned> UpdateLimit("aarch64-update-scan-limit", cl::init(100),
                                     cl::Hidden);

#define AARCH64_LOAD_STORE_OPT_NAME "AArch64 load / store optimization pass"

namespace {

using LdStPairFlags = struct LdStPairFlags {
  // If a matching instruction is found, MergeForward is set to true if the
  // merge is to remove the first instruction and replace the second with
  // a pair-wise insn, and false if the reverse is true.
  bool MergeForward = false;

  // SExtIdx gives the index of the result of the load pair that must be
  // extended. The value of SExtIdx assumes that the paired load produces the
  // value in this order: (I, returned iterator), i.e., -1 means no value has
  // to be extended, 0 means I, and 1 means the returned iterator.
  int SExtIdx = -1;

  LdStPairFlags() = default;

  void setMergeForward(bool V = true) { MergeForward = V; }
  bool getMergeForward() const { return MergeForward; }

  void setSExtIdx(int V) { SExtIdx = V; }
  int getSExtIdx() const { return SExtIdx; }
};

struct AArch64LoadStoreOpt : public MachineFunctionPass {
  static char ID;

  AArch64LoadStoreOpt() : MachineFunctionPass(ID) {
    initializeAArch64LoadStoreOptPass(*PassRegistry::getPassRegistry());
  }

  AliasAnalysis *AA;
  const AArch64InstrInfo *TII;
  const TargetRegisterInfo *TRI;
  const AArch64Subtarget *Subtarget;

  // Track which register units have been modified and used.
  LiveRegUnits ModifiedRegUnits, UsedRegUnits;

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addRequired<AAResultsWrapperPass>();
    MachineFunctionPass::getAnalysisUsage(AU);
  }

  // Scan the instructions looking for a load/store that can be combined
  // with the current instruction into a load/store pair.
  // Return the matching instruction if one is found, else MBB->end().
  MachineBasicBlock::iterator findMatchingInsn(MachineBasicBlock::iterator I,
                                               LdStPairFlags &Flags,
                                               unsigned Limit,
                                               bool FindNarrowMerge);

  // Scan the instructions looking for a store that writes to the address from
  // which the current load instruction reads. Return true if one is found.
  bool findMatchingStore(MachineBasicBlock::iterator I, unsigned Limit,
                         MachineBasicBlock::iterator &StoreI);

  // Merge the two narrow stores indicated into a single wider store
  // instruction.
  MachineBasicBlock::iterator
  mergeNarrowZeroStores(MachineBasicBlock::iterator I,
                        MachineBasicBlock::iterator MergeMI,
                        const LdStPairFlags &Flags);

  // Merge the two instructions indicated into a single pair-wise instruction.
  MachineBasicBlock::iterator
  mergePairedInsns(MachineBasicBlock::iterator I,
                   MachineBasicBlock::iterator Paired,
                   const LdStPairFlags &Flags);

  // Promote the load that reads directly from the address stored to.
  MachineBasicBlock::iterator
  promoteLoadFromStore(MachineBasicBlock::iterator LoadI,
                       MachineBasicBlock::iterator StoreI);

  // Scan the instruction list to find a base register update that can
  // be combined with the current instruction (a load or store) using
  // pre or post indexed addressing with writeback. Scan forwards.
  MachineBasicBlock::iterator
  findMatchingUpdateInsnForward(MachineBasicBlock::iterator I,
                                int UnscaledOffset, unsigned Limit);

  // Scan the instruction list to find a base register update that can
  // be combined with the current instruction (a load or store) using
  // pre or post indexed addressing with writeback. Scan backwards.
  MachineBasicBlock::iterator
  findMatchingUpdateInsnBackward(MachineBasicBlock::iterator I, unsigned Limit);

  // Find an instruction that updates the base register of the ld/st
  // instruction.
  bool isMatchingUpdateInsn(MachineInstr &MemMI, MachineInstr &MI,
                            unsigned BaseReg, int Offset);

  // Merge a pre- or post-index base register update into a ld/st instruction.
  MachineBasicBlock::iterator
  mergeUpdateInsn(MachineBasicBlock::iterator I,
                  MachineBasicBlock::iterator Update, bool IsPreIdx);

  // Find and merge zero store instructions.
  bool tryToMergeZeroStInst(MachineBasicBlock::iterator &MBBI);

  // Find and pair ldr/str instructions.
  bool tryToPairLdStInst(MachineBasicBlock::iterator &MBBI);

  // Find and promote load instructions which read directly from a store.
  bool tryToPromoteLoadFromStore(MachineBasicBlock::iterator &MBBI);

  // Find and merge base register updates before or after a ld/st instruction.
  bool tryToMergeLdStUpdate(MachineBasicBlock::iterator &MBBI);

  bool optimizeBlock(MachineBasicBlock &MBB, bool EnableNarrowZeroStOpt);

  bool runOnMachineFunction(MachineFunction &Fn) override;

  MachineFunctionProperties getRequiredProperties() const override {
    return MachineFunctionProperties().set(
        MachineFunctionProperties::Property::NoVRegs);
  }

  StringRef getPassName() const override { return AARCH64_LOAD_STORE_OPT_NAME; }
};

char AArch64LoadStoreOpt::ID = 0;

} // end anonymous namespace

INITIALIZE_PASS(AArch64LoadStoreOpt, "aarch64-ldst-opt",
                AARCH64_LOAD_STORE_OPT_NAME, false, false)

static bool isNarrowStore(unsigned Opc) {
  switch (Opc) {
  default:
    return false;
  case AArch64::STRBBui:
  case AArch64::STURBBi:
  case AArch64::STRHHui:
  case AArch64::STURHHi:
    return true;
  }
}

// Scaling factor for unscaled load or store.
static int getMemScale(MachineInstr &MI) {
  switch (MI.getOpcode()) {
  default:
    llvm_unreachable("Opcode has unknown scale!");
  case AArch64::LDRBBui:
  case AArch64::LDURBBi:
  case AArch64::LDRSBWui:
  case AArch64::LDURSBWi:
  case AArch64::STRBBui:
  case AArch64::STURBBi:
    return 1;
  case AArch64::LDRHHui:
  case AArch64::LDURHHi:
  case AArch64::LDRSHWui:
  case AArch64::LDURSHWi:
  case AArch64::STRHHui:
  case AArch64::STURHHi:
    return 2;
  case AArch64::LDRSui:
  case AArch64::LDURSi:
  case AArch64::LDRSWui:
  case AArch64::LDURSWi:
  case AArch64::LDRWui:
  case AArch64::LDURWi:
  case AArch64::STRSui:
  case AArch64::STURSi:
  case AArch64::STRWui:
  case AArch64::STURWi:
  case AArch64::LDPSi:
  case AArch64::LDPSWi:
  case AArch64::LDPWi:
  case AArch64::STPSi:
  case AArch64::STPWi:
    return 4;
  case AArch64::LDRDui:
  case AArch64::LDURDi:
  case AArch64::LDRXui:
  case AArch64::LDURXi:
  case AArch64::STRDui:
  case AArch64::STURDi:
  case AArch64::STRXui:
  case AArch64::STURXi:
  case AArch64::LDPDi:
  case AArch64::LDPXi:
  case AArch64::STPDi:
  case AArch64::STPXi:
    return 8;
  case AArch64::LDRQui:
  case AArch64::LDURQi:
  case AArch64::STRQui:
  case AArch64::STURQi:
  case AArch64::LDPQi:
  case AArch64::STPQi:
    return 16;
  }
}
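
// Example (illustrative): getMemScale returns 4 for STRWui/STURWi, so the
// scaled form "str w0, [x1, #12]" encodes immediate 3 (12 / 4), while the
// unscaled "stur w0, [x1, #12]" encodes the byte offset 12 directly.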

static unsigned getMatchingNonSExtOpcode(unsigned Opc,
                                         bool *IsValidLdStrOpc = nullptr) {
  if (IsValidLdStrOpc)
    *IsValidLdStrOpc = true;
  switch (Opc) {
  default:
    if (IsValidLdStrOpc)
      *IsValidLdStrOpc = false;
    return std::numeric_limits<unsigned>::max();
  case AArch64::STRDui:
  case AArch64::STURDi:
  case AArch64::STRQui:
  case AArch64::STURQi:
  case AArch64::STRBBui:
  case AArch64::STURBBi:
  case AArch64::STRHHui:
  case AArch64::STURHHi:
  case AArch64::STRWui:
  case AArch64::STURWi:
  case AArch64::STRXui:
  case AArch64::STURXi:
  case AArch64::LDRDui:
  case AArch64::LDURDi:
  case AArch64::LDRQui:
  case AArch64::LDURQi:
  case AArch64::LDRWui:
  case AArch64::LDURWi:
  case AArch64::LDRXui:
  case AArch64::LDURXi:
  case AArch64::STRSui:
  case AArch64::STURSi:
  case AArch64::LDRSui:
  case AArch64::LDURSi:
    return Opc;
  case AArch64::LDRSWui:
    return AArch64::LDRWui;
  case AArch64::LDURSWi:
    return AArch64::LDURWi;
  }
}

static unsigned getMatchingWideOpcode(unsigned Opc) {
  switch (Opc) {
  default:
    llvm_unreachable("Opcode has no wide equivalent!");
  case AArch64::STRBBui:
    return AArch64::STRHHui;
  case AArch64::STRHHui:
    return AArch64::STRWui;
  case AArch64::STURBBi:
    return AArch64::STURHHi;
  case AArch64::STURHHi:
    return AArch64::STURWi;
  case AArch64::STURWi:
    return AArch64::STURXi;
  case AArch64::STRWui:
    return AArch64::STRXui;
  }
}

static unsigned getMatchingPairOpcode(unsigned Opc) {
  switch (Opc) {
  default:
    llvm_unreachable("Opcode has no pairwise equivalent!");
  case AArch64::STRSui:
  case AArch64::STURSi:
    return AArch64::STPSi;
  case AArch64::STRDui:
  case AArch64::STURDi:
    return AArch64::STPDi;
  case AArch64::STRQui:
  case AArch64::STURQi:
    return AArch64::STPQi;
  case AArch64::STRWui:
  case AArch64::STURWi:
    return AArch64::STPWi;
  case AArch64::STRXui:
  case AArch64::STURXi:
    return AArch64::STPXi;
  case AArch64::LDRSui:
  case AArch64::LDURSi:
    return AArch64::LDPSi;
  case AArch64::LDRDui:
  case AArch64::LDURDi:
    return AArch64::LDPDi;
  case AArch64::LDRQui:
  case AArch64::LDURQi:
    return AArch64::LDPQi;
  case AArch64::LDRWui:
  case AArch64::LDURWi:
    return AArch64::LDPWi;
  case AArch64::LDRXui:
  case AArch64::LDURXi:
    return AArch64::LDPXi;
  case AArch64::LDRSWui:
  case AArch64::LDURSWi:
    return AArch64::LDPSWi;
  }
}

static unsigned isMatchingStore(MachineInstr &LoadInst,
                                MachineInstr &StoreInst) {
  unsigned LdOpc = LoadInst.getOpcode();
  unsigned StOpc = StoreInst.getOpcode();
  switch (LdOpc) {
  default:
    llvm_unreachable("Unsupported load instruction!");
  case AArch64::LDRBBui:
    return StOpc == AArch64::STRBBui || StOpc == AArch64::STRHHui ||
           StOpc == AArch64::STRWui || StOpc == AArch64::STRXui;
  case AArch64::LDURBBi:
    return StOpc == AArch64::STURBBi || StOpc == AArch64::STURHHi ||
           StOpc == AArch64::STURWi || StOpc == AArch64::STURXi;
  case AArch64::LDRHHui:
    return StOpc == AArch64::STRHHui || StOpc == AArch64::STRWui ||
           StOpc == AArch64::STRXui;
  case AArch64::LDURHHi:
    return StOpc == AArch64::STURHHi || StOpc == AArch64::STURWi ||
           StOpc == AArch64::STURXi;
  case AArch64::LDRWui:
    return StOpc == AArch64::STRWui || StOpc == AArch64::STRXui;
  case AArch64::LDURWi:
    return StOpc == AArch64::STURWi || StOpc == AArch64::STURXi;
  case AArch64::LDRXui:
    return StOpc == AArch64::STRXui;
  case AArch64::LDURXi:
    return StOpc == AArch64::STURXi;
  }
}

static unsigned getPreIndexedOpcode(unsigned Opc) {
  // FIXME: We don't currently support creating pre-indexed loads/stores when
  // the load or store is the unscaled version.  If we decide to perform such an
  // optimization in the future the cases for the unscaled loads/stores will
  // need to be added here.
  switch (Opc) {
  default:
    llvm_unreachable("Opcode has no pre-indexed equivalent!");
  case AArch64::STRSui:
    return AArch64::STRSpre;
  case AArch64::STRDui:
    return AArch64::STRDpre;
  case AArch64::STRQui:
    return AArch64::STRQpre;
  case AArch64::STRBBui:
    return AArch64::STRBBpre;
  case AArch64::STRHHui:
    return AArch64::STRHHpre;
  case AArch64::STRWui:
    return AArch64::STRWpre;
  case AArch64::STRXui:
    return AArch64::STRXpre;
  case AArch64::LDRSui:
    return AArch64::LDRSpre;
  case AArch64::LDRDui:
    return AArch64::LDRDpre;
  case AArch64::LDRQui:
    return AArch64::LDRQpre;
  case AArch64::LDRBBui:
    return AArch64::LDRBBpre;
  case AArch64::LDRHHui:
    return AArch64::LDRHHpre;
  case AArch64::LDRWui:
    return AArch64::LDRWpre;
  case AArch64::LDRXui:
    return AArch64::LDRXpre;
  case AArch64::LDRSWui:
    return AArch64::LDRSWpre;
  case AArch64::LDPSi:
    return AArch64::LDPSpre;
  case AArch64::LDPSWi:
    return AArch64::LDPSWpre;
  case AArch64::LDPDi:
    return AArch64::LDPDpre;
  case AArch64::LDPQi:
    return AArch64::LDPQpre;
  case AArch64::LDPWi:
    return AArch64::LDPWpre;
  case AArch64::LDPXi:
    return AArch64::LDPXpre;
  case AArch64::STPSi:
    return AArch64::STPSpre;
  case AArch64::STPDi:
    return AArch64::STPDpre;
  case AArch64::STPQi:
    return AArch64::STPQpre;
  case AArch64::STPWi:
    return AArch64::STPWpre;
  case AArch64::STPXi:
    return AArch64::STPXpre;
  }
}

static unsigned getPostIndexedOpcode(unsigned Opc) {
  switch (Opc) {
  default:
    llvm_unreachable("Opcode has no post-indexed wise equivalent!");
  case AArch64::STRSui:
  case AArch64::STURSi:
    return AArch64::STRSpost;
  case AArch64::STRDui:
  case AArch64::STURDi:
    return AArch64::STRDpost;
  case AArch64::STRQui:
  case AArch64::STURQi:
    return AArch64::STRQpost;
  case AArch64::STRBBui:
    return AArch64::STRBBpost;
  case AArch64::STRHHui:
    return AArch64::STRHHpost;
  case AArch64::STRWui:
  case AArch64::STURWi:
    return AArch64::STRWpost;
  case AArch64::STRXui:
  case AArch64::STURXi:
    return AArch64::STRXpost;
  case AArch64::LDRSui:
  case AArch64::LDURSi:
    return AArch64::LDRSpost;
  case AArch64::LDRDui:
  case AArch64::LDURDi:
    return AArch64::LDRDpost;
  case AArch64::LDRQui:
  case AArch64::LDURQi:
    return AArch64::LDRQpost;
  case AArch64::LDRBBui:
    return AArch64::LDRBBpost;
  case AArch64::LDRHHui:
    return AArch64::LDRHHpost;
  case AArch64::LDRWui:
  case AArch64::LDURWi:
    return AArch64::LDRWpost;
  case AArch64::LDRXui:
  case AArch64::LDURXi:
    return AArch64::LDRXpost;
  case AArch64::LDRSWui:
    return AArch64::LDRSWpost;
  case AArch64::LDPSi:
    return AArch64::LDPSpost;
  case AArch64::LDPSWi:
    return AArch64::LDPSWpost;
  case AArch64::LDPDi:
    return AArch64::LDPDpost;
  case AArch64::LDPQi:
    return AArch64::LDPQpost;
  case AArch64::LDPWi:
    return AArch64::LDPWpost;
  case AArch64::LDPXi:
    return AArch64::LDPXpost;
  case AArch64::STPSi:
    return AArch64::STPSpost;
  case AArch64::STPDi:
    return AArch64::STPDpost;
  case AArch64::STPQi:
    return AArch64::STPQpost;
  case AArch64::STPWi:
    return AArch64::STPWpost;
  case AArch64::STPXi:
    return AArch64::STPXpost;
  }
}

static bool isPairedLdSt(const MachineInstr &MI) {
  switch (MI.getOpcode()) {
  default:
    return false;
  case AArch64::LDPSi:
  case AArch64::LDPSWi:
  case AArch64::LDPDi:
  case AArch64::LDPQi:
  case AArch64::LDPWi:
  case AArch64::LDPXi:
  case AArch64::STPSi:
  case AArch64::STPDi:
  case AArch64::STPQi:
  case AArch64::STPWi:
  case AArch64::STPXi:
    return true;
  }
}

static const MachineOperand &getLdStRegOp(const MachineInstr &MI,
                                          unsigned PairedRegOp = 0) {
  assert(PairedRegOp < 2 && "Unexpected register operand idx.");
  unsigned Idx = isPairedLdSt(MI) ? PairedRegOp : 0;
  return MI.getOperand(Idx);
}

static const MachineOperand &getLdStBaseOp(const MachineInstr &MI) {
  unsigned Idx = isPairedLdSt(MI) ? 2 : 1;
  return MI.getOperand(Idx);
}

static const MachineOperand &getLdStOffsetOp(const MachineInstr &MI) {
  unsigned Idx = isPairedLdSt(MI) ? 3 : 2;
  return MI.getOperand(Idx);
}
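
// Operand layout (illustrative): for the pair "ldp x0, x1, [x2, #24]" the
// operands are Rt/Rt2 at indices 0 and 1, the base at 2 and the immediate at
// 3; for the unpaired "ldr x0, [x1, #16]" they sit at 0, 1 and 2, which is
// exactly what the three accessors above select.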

static bool isLdOffsetInRangeOfSt(MachineInstr &LoadInst,
                                  MachineInstr &StoreInst,
                                  const AArch64InstrInfo *TII) {
  assert(isMatchingStore(LoadInst, StoreInst) && "Expect only matched ld/st.");
  int LoadSize = getMemScale(LoadInst);
  int StoreSize = getMemScale(StoreInst);
  int UnscaledStOffset = TII->isUnscaledLdSt(StoreInst)
                             ? getLdStOffsetOp(StoreInst).getImm()
                             : getLdStOffsetOp(StoreInst).getImm() * StoreSize;
  int UnscaledLdOffset = TII->isUnscaledLdSt(LoadInst)
                             ? getLdStOffsetOp(LoadInst).getImm()
                             : getLdStOffsetOp(LoadInst).getImm() * LoadSize;
  return (UnscaledStOffset <= UnscaledLdOffset) &&
         (UnscaledLdOffset + LoadSize <= (UnscaledStOffset + StoreSize));
}
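
// Worked example (illustrative): for "str x1, [x0, #16]" (8-byte store
// covering bytes [16, 24)) and a later "ldr w2, [x0, #20]" (4-byte load of
// bytes [20, 24)), the load reads only bytes the store wrote, so this
// returns true and the load can be fed from the stored value.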

static bool isPromotableZeroStoreInst(MachineInstr &MI) {
  unsigned Opc = MI.getOpcode();
  return (Opc == AArch64::STRWui || Opc == AArch64::STURWi ||
          isNarrowStore(Opc)) &&
         getLdStRegOp(MI).getReg() == AArch64::WZR;
}

static bool isPromotableLoadFromStore(MachineInstr &MI) {
  switch (MI.getOpcode()) {
  default:
    return false;
  // Scaled instructions.
  case AArch64::LDRBBui:
  case AArch64::LDRHHui:
  case AArch64::LDRWui:
  case AArch64::LDRXui:
  // Unscaled instructions.
  case AArch64::LDURBBi:
  case AArch64::LDURHHi:
  case AArch64::LDURWi:
  case AArch64::LDURXi:
    return true;
  }
}

static bool isMergeableLdStUpdate(MachineInstr &MI) {
  unsigned Opc = MI.getOpcode();
  switch (Opc) {
  default:
    return false;
  // Scaled instructions.
  case AArch64::STRSui:
  case AArch64::STRDui:
  case AArch64::STRQui:
  case AArch64::STRXui:
  case AArch64::STRWui:
  case AArch64::STRHHui:
  case AArch64::STRBBui:
  case AArch64::LDRSui:
  case AArch64::LDRDui:
  case AArch64::LDRQui:
  case AArch64::LDRXui:
  case AArch64::LDRWui:
  case AArch64::LDRHHui:
  case AArch64::LDRBBui:
  // Unscaled instructions.
  case AArch64::STURSi:
  case AArch64::STURDi:
  case AArch64::STURQi:
  case AArch64::STURWi:
  case AArch64::STURXi:
  case AArch64::LDURSi:
  case AArch64::LDURDi:
  case AArch64::LDURQi:
  case AArch64::LDURWi:
  case AArch64::LDURXi:
  // Paired instructions.
  case AArch64::LDPSi:
  case AArch64::LDPSWi:
  case AArch64::LDPDi:
  case AArch64::LDPQi:
  case AArch64::LDPWi:
  case AArch64::LDPXi:
  case AArch64::STPSi:
  case AArch64::STPDi:
  case AArch64::STPQi:
  case AArch64::STPWi:
  case AArch64::STPXi:
    // Make sure this is a reg+imm (as opposed to an address reloc).
    if (!getLdStOffsetOp(MI).isImm())
      return false;

    return true;
  }
}

MachineBasicBlock::iterator
AArch64LoadStoreOpt::mergeNarrowZeroStores(MachineBasicBlock::iterator I,
                                           MachineBasicBlock::iterator MergeMI,
                                           const LdStPairFlags &Flags) {
  assert(isPromotableZeroStoreInst(*I) && isPromotableZeroStoreInst(*MergeMI) &&
         "Expected promotable zero stores.");

  MachineBasicBlock::iterator NextI = I;
  ++NextI;
  // If NextI is the second of the two instructions to be merged, we need
  // to skip one further. Either way, the merge will invalidate the iterator,
  // and we don't need to scan the new instruction, as it's a pairwise
  // instruction, which we're not considering for further action anyway.
  if (NextI == MergeMI)
    ++NextI;

  unsigned Opc = I->getOpcode();
  bool IsScaled = !TII->isUnscaledLdSt(Opc);
  int OffsetStride = IsScaled ? 1 : getMemScale(*I);

  bool MergeForward = Flags.getMergeForward();
  // Insert our new paired instruction after whichever of the paired
  // instructions MergeForward indicates.
  MachineBasicBlock::iterator InsertionPoint = MergeForward ? MergeMI : I;
  // MergeForward also determines from which instruction we copy the base
  // register operand, so that we get flags compatible with the input code.
  const MachineOperand &BaseRegOp =
      MergeForward ? getLdStBaseOp(*MergeMI) : getLdStBaseOp(*I);

  // Which register is Rt and which is Rt2 depends on the offset order.
  MachineInstr *RtMI;
  if (getLdStOffsetOp(*I).getImm() ==
      getLdStOffsetOp(*MergeMI).getImm() + OffsetStride)
    RtMI = &*MergeMI;
  else
    RtMI = &*I;

  int OffsetImm = getLdStOffsetOp(*RtMI).getImm();
  // Change the scaled offset from small to large type.
  if (IsScaled) {
    assert(((OffsetImm & 1) == 0) && "Unexpected offset to merge");
    OffsetImm /= 2;
  }

  // Construct the new instruction.
  DebugLoc DL = I->getDebugLoc();
  MachineBasicBlock *MBB = I->getParent();
  MachineInstrBuilder MIB;
  MIB = BuildMI(*MBB, InsertionPoint, DL, TII->get(getMatchingWideOpcode(Opc)))
            .addReg(isNarrowStore(Opc) ? AArch64::WZR : AArch64::XZR)
            .add(BaseRegOp)
            .addImm(OffsetImm)
            .cloneMergedMemRefs({&*I, &*MergeMI})
            .setMIFlags(I->mergeFlagsWith(*MergeMI));
  (void)MIB;

  LLVM_DEBUG(dbgs() << "Creating wider store. Replacing instructions:\n    ");
  LLVM_DEBUG(I->print(dbgs()));
  LLVM_DEBUG(dbgs() << "    ");
  LLVM_DEBUG(MergeMI->print(dbgs()));
  LLVM_DEBUG(dbgs() << "  with instruction:\n    ");
  LLVM_DEBUG(((MachineInstr *)MIB)->print(dbgs()));
  LLVM_DEBUG(dbgs() << "\n");

  // Erase the old instructions.
  I->eraseFromParent();
  MergeMI->eraseFromParent();
  return NextI;
}
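
// Example (illustrative): the adjacent narrow zero stores "strh wzr, [x0]"
// and "strh wzr, [x0, #2]" merge into the single "str wzr, [x0]", zeroing
// the same four bytes; likewise two adjacent 32-bit zero stores become one
// "str xzr, ...".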

MachineBasicBlock::iterator
AArch64LoadStoreOpt::mergePairedInsns(MachineBasicBlock::iterator I,
                                      MachineBasicBlock::iterator Paired,
                                      const LdStPairFlags &Flags) {
  MachineBasicBlock::iterator NextI = I;
  ++NextI;
  // If NextI is the second of the two instructions to be merged, we need
  // to skip one further. Either way, the merge will invalidate the iterator,
  // and we don't need to scan the new instruction, as it's a pairwise
  // instruction, which we're not considering for further action anyway.
  if (NextI == Paired)
    ++NextI;

  int SExtIdx = Flags.getSExtIdx();
  unsigned Opc =
      SExtIdx == -1 ? I->getOpcode() : getMatchingNonSExtOpcode(I->getOpcode());
  bool IsUnscaled = TII->isUnscaledLdSt(Opc);
  int OffsetStride = IsUnscaled ? getMemScale(*I) : 1;

  bool MergeForward = Flags.getMergeForward();
  // Insert our new paired instruction after whichever of the paired
  // instructions MergeForward indicates.
  MachineBasicBlock::iterator InsertionPoint = MergeForward ? Paired : I;
  // MergeForward also determines from which instruction we copy the base
  // register operand, so that we get flags compatible with the input code.
  const MachineOperand &BaseRegOp =
      MergeForward ? getLdStBaseOp(*Paired) : getLdStBaseOp(*I);

  int Offset = getLdStOffsetOp(*I).getImm();
  int PairedOffset = getLdStOffsetOp(*Paired).getImm();
  bool PairedIsUnscaled = TII->isUnscaledLdSt(Paired->getOpcode());
  if (IsUnscaled != PairedIsUnscaled) {
    // We're trying to pair instructions that differ in how they are scaled.  If
    // I is scaled then scale the offset of Paired accordingly.  Otherwise, do
    // the opposite (i.e., make Paired's offset unscaled).
    int MemSize = getMemScale(*Paired);
    if (PairedIsUnscaled) {
      // If the unscaled offset isn't a multiple of the MemSize, we can't
      // pair the operations together.
      assert(!(PairedOffset % getMemScale(*Paired)) &&
             "Offset should be a multiple of the stride!");
      PairedOffset /= MemSize;
    } else {
      PairedOffset *= MemSize;
    }
  }

  // Which register is Rt and which is Rt2 depends on the offset order.
  MachineInstr *RtMI, *Rt2MI;
  if (Offset == PairedOffset + OffsetStride) {
    RtMI = &*Paired;
    Rt2MI = &*I;
    // Here we swapped the assumption made for SExtIdx.
    // I.e., we turn ldp I, Paired into ldp Paired, I.
    // Update the index accordingly.
    if (SExtIdx != -1)
      SExtIdx = (SExtIdx + 1) % 2;
  } else {
    RtMI = &*I;
    Rt2MI = &*Paired;
  }
  int OffsetImm = getLdStOffsetOp(*RtMI).getImm();
  // Scale the immediate offset, if necessary.
  if (TII->isUnscaledLdSt(RtMI->getOpcode())) {
    assert(!(OffsetImm % getMemScale(*RtMI)) &&
           "Unscaled offset cannot be scaled.");
    OffsetImm /= getMemScale(*RtMI);
  }

  // Construct the new instruction.
  MachineInstrBuilder MIB;
  DebugLoc DL = I->getDebugLoc();
  MachineBasicBlock *MBB = I->getParent();
  MachineOperand RegOp0 = getLdStRegOp(*RtMI);
  MachineOperand RegOp1 = getLdStRegOp(*Rt2MI);
  // Kill flags may become invalid when moving stores for pairing.
  if (RegOp0.isUse()) {
    if (!MergeForward) {
      // Clear kill flags on store if moving upwards. Example:
      //   STRWui %w0, ...
      //   USE %w1
      //   STRWui kill %w1  ; need to clear kill flag when moving STRWui upwards
      RegOp0.setIsKill(false);
      RegOp1.setIsKill(false);
    } else {
      // Clear kill flags of the first store's register. Example:
      //   STRWui %w1, ...
      //   USE kill %w1   ; need to clear kill flag when moving STRWui downwards
      //   STRW %w0
      unsigned Reg = getLdStRegOp(*I).getReg();
      for (MachineInstr &MI : make_range(std::next(I), Paired))
        MI.clearRegisterKills(Reg, TRI);
    }
  }
  MIB = BuildMI(*MBB, InsertionPoint, DL, TII->get(getMatchingPairOpcode(Opc)))
            .add(RegOp0)
            .add(RegOp1)
            .add(BaseRegOp)
            .addImm(OffsetImm)
            .cloneMergedMemRefs({&*I, &*Paired})
            .setMIFlags(I->mergeFlagsWith(*Paired));

  (void)MIB;

  LLVM_DEBUG(
      dbgs() << "Creating pair load/store. Replacing instructions:\n    ");
  LLVM_DEBUG(I->print(dbgs()));
  LLVM_DEBUG(dbgs() << "    ");
  LLVM_DEBUG(Paired->print(dbgs()));
  LLVM_DEBUG(dbgs() << "  with instruction:\n    ");
  if (SExtIdx != -1) {
    // Generate the sign extension for the proper result of the ldp.
    // I.e., with X1, that would be:
    // %w1 = KILL %w1, implicit-def %x1
    // %x1 = SBFMXri killed %x1, 0, 31
    MachineOperand &DstMO = MIB->getOperand(SExtIdx);
    // Right now, DstMO has the extended register, since it comes from an
    // extended opcode.
    unsigned DstRegX = DstMO.getReg();
    // Get the W variant of that register.
    unsigned DstRegW = TRI->getSubReg(DstRegX, AArch64::sub_32);
    // Update the result of LDP to use the W instead of the X variant.
    DstMO.setReg(DstRegW);
    LLVM_DEBUG(((MachineInstr *)MIB)->print(dbgs()));
    LLVM_DEBUG(dbgs() << "\n");
    // Make the machine verifier happy by providing a definition for
    // the X register.
    // Insert this definition right after the generated LDP, i.e., before
    // InsertionPoint.
    MachineInstrBuilder MIBKill =
        BuildMI(*MBB, InsertionPoint, DL, TII->get(TargetOpcode::KILL), DstRegW)
            .addReg(DstRegW)
            .addReg(DstRegX, RegState::Define);
    MIBKill->getOperand(2).setImplicit();
    // Create the sign extension.
    MachineInstrBuilder MIBSXTW =
        BuildMI(*MBB, InsertionPoint, DL, TII->get(AArch64::SBFMXri), DstRegX)
            .addReg(DstRegX)
            .addImm(0)
            .addImm(31);
    (void)MIBSXTW;
    LLVM_DEBUG(dbgs() << "  Extend operand:\n    ");
    LLVM_DEBUG(((MachineInstr *)MIBSXTW)->print(dbgs()));
  } else {
    LLVM_DEBUG(((MachineInstr *)MIB)->print(dbgs()));
  }
  LLVM_DEBUG(dbgs() << "\n");

  // Erase the old instructions.
  I->eraseFromParent();
  Paired->eraseFromParent();

  return NextI;
}
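
// Example (illustrative): pairing "ldrsw x1, [x0]" with "ldr w2, [x0, #4]"
// goes through the non-sext opcode LDRWui, so the result is
//   ldp w1, w2, [x0]                 ; SExtIdx selects w1
//   %w1 = KILL %w1, implicit-def %x1
//   %x1 = SBFMXri %x1, 0, 31         ; re-materialize the sign extension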

MachineBasicBlock::iterator
AArch64LoadStoreOpt::promoteLoadFromStore(MachineBasicBlock::iterator LoadI,
                                          MachineBasicBlock::iterator StoreI) {
  MachineBasicBlock::iterator NextI = LoadI;
  ++NextI;

  int LoadSize = getMemScale(*LoadI);
  int StoreSize = getMemScale(*StoreI);
  unsigned LdRt = getLdStRegOp(*LoadI).getReg();
  const MachineOperand &StMO = getLdStRegOp(*StoreI);
  unsigned StRt = getLdStRegOp(*StoreI).getReg();
  bool IsStoreXReg = TRI->getRegClass(AArch64::GPR64RegClassID)->contains(StRt);

  assert((IsStoreXReg ||
          TRI->getRegClass(AArch64::GPR32RegClassID)->contains(StRt)) &&
         "Unexpected RegClass");

  MachineInstr *BitExtMI;
  if (LoadSize == StoreSize && (LoadSize == 4 || LoadSize == 8)) {
    // Remove the load if its destination register is the same register the
    // stored value came from.
    if (StRt == LdRt && LoadSize == 8) {
      for (MachineInstr &MI : make_range(StoreI->getIterator(),
                                         LoadI->getIterator())) {
        if (MI.killsRegister(StRt, TRI)) {
          MI.clearRegisterKills(StRt, TRI);
          break;
        }
      }
      LLVM_DEBUG(dbgs() << "Remove load instruction:\n    ");
      LLVM_DEBUG(LoadI->print(dbgs()));
      LLVM_DEBUG(dbgs() << "\n");
      LoadI->eraseFromParent();
      return NextI;
    }
    // Replace the load with a mov if the load and store are of the same size.
    BitExtMI =
        BuildMI(*LoadI->getParent(), LoadI, LoadI->getDebugLoc(),
                TII->get(IsStoreXReg ? AArch64::ORRXrs : AArch64::ORRWrs), LdRt)
            .addReg(IsStoreXReg ? AArch64::XZR : AArch64::WZR)
            .add(StMO)
            .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0))
            .setMIFlags(LoadI->getFlags());
  } else {
    // FIXME: Currently we disable this transformation in big-endian targets as
    // performance and correctness are verified only in little-endian.
    if (!Subtarget->isLittleEndian())
      return NextI;
    bool IsUnscaled = TII->isUnscaledLdSt(*LoadI);
    assert(IsUnscaled == TII->isUnscaledLdSt(*StoreI) &&
           "Unsupported ld/st match");
    assert(LoadSize <= StoreSize && "Invalid load size");
    int UnscaledLdOffset = IsUnscaled
                               ? getLdStOffsetOp(*LoadI).getImm()
                               : getLdStOffsetOp(*LoadI).getImm() * LoadSize;
    int UnscaledStOffset = IsUnscaled
                               ? getLdStOffsetOp(*StoreI).getImm()
                               : getLdStOffsetOp(*StoreI).getImm() * StoreSize;
    int Width = LoadSize * 8;
    unsigned DestReg = IsStoreXReg
                           ? TRI->getMatchingSuperReg(LdRt, AArch64::sub_32,
                                                      &AArch64::GPR64RegClass)
                           : LdRt;

    assert((UnscaledLdOffset >= UnscaledStOffset &&
            (UnscaledLdOffset + LoadSize) <= UnscaledStOffset + StoreSize) &&
           "Invalid offset");

    int Immr = 8 * (UnscaledLdOffset - UnscaledStOffset);
    int Imms = Immr + Width - 1;
    if (UnscaledLdOffset == UnscaledStOffset) {
      uint32_t AndMaskEncoded = ((IsStoreXReg ? 1 : 0) << 12) // N
                                | ((Immr) << 6)               // immr
                                | ((Imms) << 0)               // imms
          ;

      BitExtMI =
          BuildMI(*LoadI->getParent(), LoadI, LoadI->getDebugLoc(),
                  TII->get(IsStoreXReg ? AArch64::ANDXri : AArch64::ANDWri),
                  DestReg)
              .add(StMO)
              .addImm(AndMaskEncoded)
              .setMIFlags(LoadI->getFlags());
    } else {
      BitExtMI =
          BuildMI(*LoadI->getParent(), LoadI, LoadI->getDebugLoc(),
                  TII->get(IsStoreXReg ? AArch64::UBFMXri : AArch64::UBFMWri),
                  DestReg)
              .add(StMO)
              .addImm(Immr)
              .addImm(Imms)
              .setMIFlags(LoadI->getFlags());
    }
  }

  // Clear kill flags between store and load.
  for (MachineInstr &MI : make_range(StoreI->getIterator(),
                                     BitExtMI->getIterator()))
    if (MI.killsRegister(StRt, TRI)) {
      MI.clearRegisterKills(StRt, TRI);
      break;
    }

  LLVM_DEBUG(dbgs() << "Promoting load by replacing :\n    ");
  LLVM_DEBUG(StoreI->print(dbgs()));
  LLVM_DEBUG(dbgs() << "    ");
  LLVM_DEBUG(LoadI->print(dbgs()));
  LLVM_DEBUG(dbgs() << "  with instructions:\n    ");
  LLVM_DEBUG(StoreI->print(dbgs()));
  LLVM_DEBUG(dbgs() << "    ");
  LLVM_DEBUG((BitExtMI)->print(dbgs()));
  LLVM_DEBUG(dbgs() << "\n");

  // Erase the old instructions.
  LoadI->eraseFromParent();
  return NextI;
}
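
// Worked example (illustrative): for "str x1, [x0]" followed by
// "ldr w2, [x0, #4]", Width = 32, Immr = 8 * (4 - 0) = 32 and
// Imms = 32 + 31 = 63, so the load is replaced by
// "%x2 = UBFMXri %x1, 32, 63" (i.e. lsr #32), extracting the stored word
// without touching memory.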

static bool inBoundsForPair(bool IsUnscaled, int Offset, int OffsetStride) {
  // Convert the byte-offset used by unscaled into an "element" offset used
  // by the scaled pair load/store instructions.
  if (IsUnscaled) {
    // If the byte-offset isn't a multiple of the stride, there's no point
    // trying to match it.
    if (Offset % OffsetStride)
      return false;
    Offset /= OffsetStride;
  }
  return Offset <= 63 && Offset >= -64;
}
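
// Example (illustrative): LDP/STP encode a 7-bit signed, scaled immediate,
// so element offsets 63 and -64 are pairable while 64 is not; an unscaled
// byte offset of 12 with an 8-byte stride is rejected outright because
// 12 % 8 != 0.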

// Do alignment, specialized to power of 2 and for signed ints,
// avoiding having to do a C-style cast from uint_64t to int when
// using alignTo from include/llvm/Support/MathExtras.h.
// FIXME: Move this function to include/MathExtras.h?
static int alignTo(int Num, int PowOf2) {
  return (Num + PowOf2 - 1) & ~(PowOf2 - 1);
}
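
// Example (illustrative): alignTo(5, 4) == (5 + 3) & ~3 == 8, while
// alignTo(8, 4) == 8; the bitmask trick only works because PowOf2 is a
// power of two.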

static bool mayAlias(MachineInstr &MIa, MachineInstr &MIb,
                     AliasAnalysis *AA) {
  // One of the instructions must modify memory.
  if (!MIa.mayStore() && !MIb.mayStore())
    return false;

  // Both instructions must be memory operations.
  if (!MIa.mayLoadOrStore() && !MIb.mayLoadOrStore())
    return false;

  return MIa.mayAlias(AA, MIb, /*UseTBAA*/false);
}

static bool mayAlias(MachineInstr &MIa,
                     SmallVectorImpl<MachineInstr *> &MemInsns,
                     AliasAnalysis *AA) {
  for (MachineInstr *MIb : MemInsns)
    if (mayAlias(MIa, *MIb, AA))
      return true;

  return false;
}

bool AArch64LoadStoreOpt::findMatchingStore(
    MachineBasicBlock::iterator I, unsigned Limit,
    MachineBasicBlock::iterator &StoreI) {
  MachineBasicBlock::iterator B = I->getParent()->begin();
  MachineBasicBlock::iterator MBBI = I;
  MachineInstr &LoadMI = *I;
  unsigned BaseReg = getLdStBaseOp(LoadMI).getReg();

  // If the load is the first instruction in the block, there's obviously
  // not any matching store.
  if (MBBI == B)
    return false;

  // Track which register units have been modified and used between the first
  // insn and the second insn.
  ModifiedRegUnits.clear();
  UsedRegUnits.clear();

  unsigned Count = 0;
  do {
    --MBBI;
    MachineInstr &MI = *MBBI;

    // Don't count transient instructions towards the search limit since there
    // may be different numbers of them if e.g. debug information is present.
    if (!MI.isTransient())
      ++Count;

    // If the load instruction reads directly from the address to which the
    // store instruction writes and the stored value is not modified, we can
    // promote the load. Since we do not handle stores with pre-/post-index,
    // it's unnecessary to check if BaseReg is modified by the store itself.
    if (MI.mayStore() && isMatchingStore(LoadMI, MI) &&
        BaseReg == getLdStBaseOp(MI).getReg() &&
        isLdOffsetInRangeOfSt(LoadMI, MI, TII) &&
        ModifiedRegUnits.available(getLdStRegOp(MI).getReg())) {
      StoreI = MBBI;
      return true;
    }

    if (MI.isCall())
      return false;

    // Update modified / uses register units.
    LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits, UsedRegUnits, TRI);

    // Otherwise, if the base register is modified, we have no match, so
    // return early.
    if (!ModifiedRegUnits.available(BaseReg))
      return false;

    // If we encounter a store aliased with the load, return early.
    if (MI.mayStore() && mayAlias(LoadMI, MI, AA))
      return false;
  } while (MBBI != B && Count < Limit);
  return false;
}

// Returns true if FirstMI and MI are candidates for merging or pairing.
// Otherwise, returns false.
static bool areCandidatesToMergeOrPair(MachineInstr &FirstMI, MachineInstr &MI,
                                       LdStPairFlags &Flags,
                                       const AArch64InstrInfo *TII) {
  // If this is volatile or if pairing is suppressed, not a candidate.
  if (MI.hasOrderedMemoryRef() || TII->isLdStPairSuppressed(MI))
    return false;

  // We should have already checked FirstMI for pair suppression and volatility.
  assert(!FirstMI.hasOrderedMemoryRef() &&
         !TII->isLdStPairSuppressed(FirstMI) &&
         "FirstMI shouldn't get here if either of these checks are true.");

  unsigned OpcA = FirstMI.getOpcode();
  unsigned OpcB = MI.getOpcode();

  // Opcodes match: nothing more to check.
  if (OpcA == OpcB)
    return true;

  // Try to match a sign-extended load/store with a zero-extended load/store.
  bool IsValidLdStrOpc, PairIsValidLdStrOpc;
  unsigned NonSExtOpc = getMatchingNonSExtOpcode(OpcA, &IsValidLdStrOpc);
  assert(IsValidLdStrOpc &&
         "Given Opc should be a Load or Store with an immediate");
  // OpcA will be the first instruction in the pair.
  if (NonSExtOpc == getMatchingNonSExtOpcode(OpcB, &PairIsValidLdStrOpc)) {
    Flags.setSExtIdx(NonSExtOpc == (unsigned)OpcA ? 1 : 0);
    return true;
  }

  // If the second instruction isn't even a mergeable/pairable load/store, bail
  // out.
  if (!PairIsValidLdStrOpc)
    return false;

  // FIXME: We don't support merging narrow stores with mixed scaled/unscaled
  // offsets.
  if (isNarrowStore(OpcA) || isNarrowStore(OpcB))
    return false;

  // Try to match an unscaled load/store with a scaled load/store.
  return TII->isUnscaledLdSt(OpcA) != TII->isUnscaledLdSt(OpcB) &&
         getMatchingPairOpcode(OpcA) == getMatchingPairOpcode(OpcB);

  // FIXME: Can we also match a mixed sext/zext unscaled/scaled pair?
}
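
// Example (illustrative): FirstMI = LDRSWui and MI = LDRWui both map to the
// non-sext opcode LDRWui, so they are pairing candidates; since OpcA is the
// sign-extending form, SExtIdx is set to 0, marking the first load's result
// (index 0 in the pair) as the one that must be sign-extended.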
1145
1146
/// Scan the instructions looking for a load/store that can be combined with the
1147
/// current instruction into a wider equivalent or a load/store pair.
1148
MachineBasicBlock::iterator
1149
AArch64LoadStoreOpt::findMatchingInsn(MachineBasicBlock::iterator I,
1150
                                      LdStPairFlags &Flags, unsigned Limit,
1151
2.75M
                                      bool FindNarrowMerge) {
1152
2.75M
  MachineBasicBlock::iterator E = I->getParent()->end();
1153
2.75M
  MachineBasicBlock::iterator MBBI = I;
1154
2.75M
  MachineInstr &FirstMI = *I;
1155
2.75M
  ++MBBI;
1156
2.75M
1157
2.75M
  bool MayLoad = FirstMI.mayLoad();
1158
2.75M
  bool IsUnscaled = TII->isUnscaledLdSt(FirstMI);
1159
2.75M
  unsigned Reg = getLdStRegOp(FirstMI).getReg();
1160
2.75M
  unsigned BaseReg = getLdStBaseOp(FirstMI).getReg();
1161
2.75M
  int Offset = getLdStOffsetOp(FirstMI).getImm();
1162
2.75M
  int OffsetStride = IsUnscaled ? 
getMemScale(FirstMI)88.2k
:
12.66M
;
1163
2.75M
  bool IsPromotableZeroStore = isPromotableZeroStoreInst(FirstMI);
1164
2.75M
1165
2.75M
  // Track which register units have been modified and used between the first
1166
2.75M
  // insn (inclusive) and the second insn.
1167
2.75M
  ModifiedRegUnits.clear();
1168
2.75M
  UsedRegUnits.clear();
1169
2.75M
1170
2.75M
  // Remember any instructions that read/write memory between FirstMI and MI.
1171
2.75M
  SmallVector<MachineInstr *, 4> MemInsns;
1172
2.75M
1173
12.7M
  for (unsigned Count = 0; MBBI != E && 
Count < Limit11.2M
;
++MBBI9.99M
) {
1174
11.1M
    MachineInstr &MI = *MBBI;
1175
11.1M
1176
11.1M
    // Don't count transient instructions towards the search limit since there
1177
11.1M
    // may be different numbers of them if e.g. debug information is present.
1178
11.1M
    if (!MI.isTransient())
1179
11.0M
      ++Count;
1180
11.1M
1181
11.1M
    Flags.setSExtIdx(-1);
1182
11.1M
    if (areCandidatesToMergeOrPair(FirstMI, MI, Flags, TII) &&
1183
11.1M
        
getLdStOffsetOp(MI).isImm()1.58M
) {
1184
1.58M
      assert(MI.mayLoadOrStore() && "Expected memory operation.");
1185
1.58M
      // If we've found another instruction with the same opcode, check to see
1186
1.58M
      // if the base and offset are compatible with our starting instruction.
1187
1.58M
      // These instructions all have scaled immediate operands, so we just
1188
1.58M
      // check for +1/-1. Make sure to check the new instruction offset is
1189
1.58M
      // actually an immediate and not a symbolic reference destined for
1190
1.58M
      // a relocation.
1191
1.58M
      unsigned MIBaseReg = getLdStBaseOp(MI).getReg();
1192
1.58M
      int MIOffset = getLdStOffsetOp(MI).getImm();
1193
1.58M
      bool MIIsUnscaled = TII->isUnscaledLdSt(MI);
1194
1.58M
      if (IsUnscaled != MIIsUnscaled) {
1195
59.8k
        // We're trying to pair instructions that differ in how they are scaled.
1196
59.8k
        // If FirstMI is scaled then scale the offset of MI accordingly.
1197
59.8k
        // Otherwise, do the opposite (i.e., make MI's offset unscaled).
1198
59.8k
        int MemSize = getMemScale(MI);
1199
59.8k
        if (MIIsUnscaled) {
1200
25.2k
          // If the unscaled offset isn't a multiple of the MemSize, we can't
1201
25.2k
          // pair the operations together: bail and keep looking.
1202
25.2k
          if (MIOffset % MemSize) {
1203
9.04k
            LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits,
1204
9.04k
                                              UsedRegUnits, TRI);
1205
9.04k
            MemInsns.push_back(&MI);
1206
9.04k
            continue;
1207
9.04k
          }
1208
16.2k
          MIOffset /= MemSize;
1209
34.5k
        } else {
1210
34.5k
          MIOffset *= MemSize;
1211
34.5k
        }
1212
59.8k
      }
1213
1.58M
1214
1.58M
      
if (1.57M
BaseReg == MIBaseReg1.57M
&&
(987k
(Offset == MIOffset + OffsetStride)987k
||
1215
987k
                                   
(Offset + OffsetStride == MIOffset)912k
)) {
1216
338k
        int MinOffset = Offset < MIOffset ? 
Offset263k
:
MIOffset74.8k
;
1217
338k
        if (FindNarrowMerge) {
1218
10.9k
          // If the alignment requirements of the scaled wide load/store
1219
10.9k
          // instruction can't express the offset of the scaled narrow input,
1220
10.9k
          // bail and keep looking. For promotable zero stores, allow only when
1221
10.9k
          // the stored value is the same (i.e., WZR).
1222
10.9k
          if ((!IsUnscaled && 
alignTo(MinOffset, 2) != MinOffset10.9k
) ||
1223
1223   10.9k              (IsPromotableZeroStore && Reg != getLdStRegOp(MI).getReg())) {
1224   3.11k             LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits,
1225   3.11k                                               UsedRegUnits, TRI);
1226   3.11k             MemInsns.push_back(&MI);
1227   3.11k             continue;
1228   3.11k           }
1229   327k          } else {
1230   327k            // Pairwise instructions have a 7-bit signed offset field. Single
1231   327k            // insns have a 12-bit unsigned offset field.  If the resultant
1232   327k            // immediate offset of merging these instructions is out of range for
1233   327k            // a pairwise instruction, bail and keep looking.
1234   327k            if (!inBoundsForPair(IsUnscaled, MinOffset, OffsetStride)) {
1235   1.58k             LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits,
1236   1.58k                                               UsedRegUnits, TRI);
1237   1.58k             MemInsns.push_back(&MI);
1238   1.58k             continue;
1239   1.58k           }
1240   325k            // If the alignment requirements of the paired (scaled) instruction
1241   325k            // can't express the offset of the unscaled input, bail and keep
1242   325k            // looking.
1243   325k            if (IsUnscaled && (alignTo(MinOffset, OffsetStride) != MinOffset)) {
1244   0                 LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits,
1245   0                                                   UsedRegUnits, TRI);
1246   0                 MemInsns.push_back(&MI);
1247   0                 continue;
1248   0               }
1249   333k          }
1250   333k          // If the destination register of the loads is the same register, bail
1251   333k          // and keep looking. A load-pair instruction with both destination
1252   333k          // registers the same is UNPREDICTABLE and will result in an exception.
1253   333k          if (MayLoad && Reg == getLdStRegOp(MI).getReg()) {
1254   37.0k           LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits, UsedRegUnits,
1255   37.0k                                             TRI);
1256   37.0k           MemInsns.push_back(&MI);
1257   37.0k           continue;
1258   37.0k         }
1259   296k
1260   296k          // If the Rt of the second instruction was not modified or used between
1261   296k          // the two instructions and none of the instructions between the second
1262   296k          // and first alias with the second, we can combine the second into the
1263   296k          // first.
1264   296k          if (ModifiedRegUnits.available(getLdStRegOp(MI).getReg()) &&
1265   296k              !(MI.mayLoad() &&
1266   225k                !UsedRegUnits.available(getLdStRegOp(MI).getReg())) &&
1267   296k              !mayAlias(MI, MemInsns, AA)) {
1268   214k            Flags.setMergeForward(false);
1269   214k            return MBBI;
1270   214k          }
1271   81.5k
1272   81.5k         // Likewise, if the Rt of the first instruction is not modified or used
1273   81.5k         // between the two instructions and none of the instructions between the
1274   81.5k         // first and the second alias with the first, we can combine the first
1275   81.5k         // into the second.
1276   81.5k         if (ModifiedRegUnits.available(getLdStRegOp(FirstMI).getReg()) &&
1277   81.5k             !(MayLoad &&
1278   21.2k               !UsedRegUnits.available(getLdStRegOp(FirstMI).getReg())) &&
1279   81.5k             !mayAlias(FirstMI, MemInsns, AA)) {
1280   7.41k           Flags.setMergeForward(true);
1281   7.41k           return MBBI;
1282   7.41k         }
1283   10.8M         // Unable to combine these instructions due to interference in between.
1284   10.8M         // Keep looking.
1285   10.8M       }
1286   1.57M     }
1287   10.8M
1288   10.8M     // The instruction wasn't a matching load or store. Stop searching if we
1289   10.8M     // encounter a call instruction that might modify memory.
1290   10.8M     if (MI.isCall())
1291   733k        return E;
1292   10.1M
1293   10.1M     // Update modified / used register units.
1294   10.1M     LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits, UsedRegUnits, TRI);
1295   10.1M
1296   10.1M     // Otherwise, if the base register is modified, we have no match, so
1297   10.1M     // return early.
1298   10.1M     if (!ModifiedRegUnits.available(BaseReg))
1299   189k        return E;
1300   9.93M
1301   9.93M     // Update list of instructions that read/write memory.
1302   9.93M     if (MI.mayLoadOrStore())
1303   3.87M       MemInsns.push_back(&MI);
1304   9.93M   }
1305   2.75M   return E;
1306   2.75M }
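
The range check that gates pairing above can be summarized in a standalone sketch. This is illustrative only: inBoundsForPair in this file is the authoritative helper, and the bounds here are inferred from the signed 7-bit pair field the comments describe.

    // Sketch: a paired (scaled) access encodes a signed 7-bit immediate in
    // units of the access size. An unscaled input must be stride-aligned
    // before it can be expressed in the scaled paired form at all.
    static bool pairOffsetFits(bool IsUnscaled, int Offset, int OffsetStride) {
      if (IsUnscaled) {
        if (Offset % OffsetStride != 0)
          return false;            // not expressible once scaled
        Offset /= OffsetStride;    // convert byte offset to scaled units
      }
      return Offset >= -64 && Offset <= 63;  // signed 7-bit field
    }
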
1307
1308          MachineBasicBlock::iterator
1309          AArch64LoadStoreOpt::mergeUpdateInsn(MachineBasicBlock::iterator I,
1310                                               MachineBasicBlock::iterator Update,
1311   1.05k                                       bool IsPreIdx) {
1312   1.05k    assert((Update->getOpcode() == AArch64::ADDXri ||
1313   1.05k            Update->getOpcode() == AArch64::SUBXri) &&
1314   1.05k           "Unexpected base register update instruction to merge!");
1315   1.05k    MachineBasicBlock::iterator NextI = I;
1316   1.05k    // Return the instruction following the merged instruction, which is
1317   1.05k    // the instruction following our unmerged load. Unless that's the add/sub
1318   1.05k    // instruction we're merging, in which case it's the one after that.
1319   1.05k    if (++NextI == Update)
1320   304        ++NextI;
1321   1.05k
1322   1.05k    int Value = Update->getOperand(2).getImm();
1323   1.05k    assert(AArch64_AM::getShiftValue(Update->getOperand(3).getImm()) == 0 &&
1324   1.05k           "Can't merge 1 << 12 offset into pre-/post-indexed load / store");
1325   1.05k    if (Update->getOpcode() == AArch64::SUBXri)
1326   103        Value = -Value;
1327   1.05k
1328   1.05k    unsigned NewOpc = IsPreIdx ? getPreIndexedOpcode(I->getOpcode())
1329   1.05k                               : getPostIndexedOpcode(I->getOpcode());
1330   1.05k    MachineInstrBuilder MIB;
1331   1.05k    if (!isPairedLdSt(*I)) {
1332   818        // Non-paired instruction.
1333   818        MIB = BuildMI(*I->getParent(), I, I->getDebugLoc(), TII->get(NewOpc))
1334   818                  .add(getLdStRegOp(*Update))
1335   818                  .add(getLdStRegOp(*I))
1336   818                  .add(getLdStBaseOp(*I))
1337   818                  .addImm(Value)
1338   818                  .setMemRefs(I->memoperands())
1339   818                  .setMIFlags(I->mergeFlagsWith(*Update));
1340   818      } else {
1341   233        // Paired instruction.
1342   233        int Scale = getMemScale(*I);
1343   233        MIB = BuildMI(*I->getParent(), I, I->getDebugLoc(), TII->get(NewOpc))
1344   233                  .add(getLdStRegOp(*Update))
1345   233                  .add(getLdStRegOp(*I, 0))
1346   233                  .add(getLdStRegOp(*I, 1))
1347   233                  .add(getLdStBaseOp(*I))
1348   233                  .addImm(Value / Scale)
1349   233                  .setMemRefs(I->memoperands())
1350   233                  .setMIFlags(I->mergeFlagsWith(*Update));
1351   233      }
1352   1.05k    (void)MIB;
1353   1.05k
1354   1.05k    if (IsPreIdx) {
1355   218        ++NumPreFolded;
1356   218        LLVM_DEBUG(dbgs() << "Creating pre-indexed load/store.");
1357   833      } else {
1358   833        ++NumPostFolded;
1359   833        LLVM_DEBUG(dbgs() << "Creating post-indexed load/store.");
1360   833      }
1361   1.05k    LLVM_DEBUG(dbgs() << "    Replacing instructions:\n    ");
1362   1.05k    LLVM_DEBUG(I->print(dbgs()));
1363   1.05k    LLVM_DEBUG(dbgs() << "    ");
1364   1.05k    LLVM_DEBUG(Update->print(dbgs()));
1365   1.05k    LLVM_DEBUG(dbgs() << "  with instruction:\n    ");
1366   1.05k    LLVM_DEBUG(((MachineInstr *)MIB)->print(dbgs()));
1367   1.05k    LLVM_DEBUG(dbgs() << "\n");
1368   1.05k
1369   1.05k    // Erase the old instructions for the block.
1370   1.05k    I->eraseFromParent();
1371   1.05k    Update->eraseFromParent();
1372   1.05k
1373   1.05k    return NextI;
1374   1.05k  }
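
An illustrative before/after for the rewrite this function performs (assumed example, not taken from the pass's tests). Note that the paired form divides the folded immediate by the memory scale, exactly as the .addImm(Value / Scale) above does:

    // Post-index fold (IsPreIdx == false), 8-byte pair, Scale == 8:
    //   stp x0, x1, [x2]
    //   add x2, x2, #16
    // becomes a single writeback instruction:
    //   stp x0, x1, [x2], #16    ; encoded pair immediate is 16 / 8 == 2
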
1375
1376          bool AArch64LoadStoreOpt::isMatchingUpdateInsn(MachineInstr &MemMI,
1377                                                         MachineInstr &MI,
1378   18.9M                                                 unsigned BaseReg, int Offset) {
1379   18.9M    switch (MI.getOpcode()) {
1380   18.9M    default:
1381   18.0M      break;
1382   18.9M    case AArch64::SUBXri:
1383   913k    case AArch64::ADDXri:
1384   913k      // Make sure it's a vanilla immediate operand, not a relocation or
1385   913k      // anything else we can't handle.
1386   913k      if (!MI.getOperand(2).isImm())
1387   241k        break;
1388   672k      // Watch out for 1 << 12 shifted value.
1389   672k      if (AArch64_AM::getShiftValue(MI.getOperand(3).getImm()))
1390   12.0k       break;
1391   660k
1392   660k      // The update instruction source and destination register must be the
1393   660k      // same as the load/store base register.
1394   660k      if (MI.getOperand(0).getReg() != BaseReg ||
1395   660k          MI.getOperand(1).getReg() != BaseReg)
1396   562k        break;
1397   98.4k
1398   98.4k     bool IsPairedInsn = isPairedLdSt(MemMI);
1399   98.4k     int UpdateOffset = MI.getOperand(2).getImm();
1400   98.4k     if (MI.getOpcode() == AArch64::SUBXri)
1401   270         UpdateOffset = -UpdateOffset;
1402   98.4k
1403   98.4k     // For non-paired load/store instructions, the immediate must fit in a
1404   98.4k     // signed 9-bit integer.
1405   98.4k     if (!IsPairedInsn && (UpdateOffset > 255 || UpdateOffset < -256))
1406   1.16k       break;
1407   97.2k
1408   97.2k     // For paired load/store instructions, the immediate must be a multiple of
1409   97.2k     // the scaling factor.  The scaled offset must also fit into a signed 7-bit
1410   97.2k     // integer.
1411   97.2k     if (IsPairedInsn) {
1412   94.9k       int Scale = getMemScale(MemMI);
1413   94.9k       if (UpdateOffset % Scale != 0)
1414   10            break;
1415   94.9k
1416   94.9k       int ScaledOffset = UpdateOffset / Scale;
1417   94.9k       if (ScaledOffset > 63 || ScaledOffset < -64)
1418   978           break;
1419   96.2k     }
1420   96.2k
1421   96.2k     // If we have a non-zero Offset, we check that it matches the amount
1422   96.2k     // we're adding to the register.
1423   96.2k     if (!Offset || Offset == UpdateOffset)
1424   1.05k       return true;
1425   95.2k     break;
1426   18.9M   }
1427   18.9M   return false;
1428   18.9M }
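
The two immediate ranges above can be mirrored in a small standalone sketch (a hypothetical helper, not part of the pass): a non-paired update must fit a signed 9-bit byte offset, while a paired update must be a multiple of the access size and fit a signed 7-bit scaled offset.

    static bool updateOffsetFits(bool IsPairedInsn, int UpdateOffset, int Scale) {
      if (!IsPairedInsn)
        return UpdateOffset >= -256 && UpdateOffset <= 255;  // signed 9-bit
      if (UpdateOffset % Scale != 0)
        return false;                                        // must be scale-aligned
      int ScaledOffset = UpdateOffset / Scale;
      return ScaledOffset >= -64 && ScaledOffset <= 63;      // signed 7-bit
    }

For example, an add of #504 to the base of an 8-byte pair scales to 63 and is accepted; an add of #512 scales to 64 and is rejected.
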
1429
1430          MachineBasicBlock::iterator AArch64LoadStoreOpt::findMatchingUpdateInsnForward(
1431   12.2M      MachineBasicBlock::iterator I, int UnscaledOffset, unsigned Limit) {
1432   12.2M    MachineBasicBlock::iterator E = I->getParent()->end();
1433   12.2M    MachineInstr &MemMI = *I;
1434   12.2M    MachineBasicBlock::iterator MBBI = I;
1435   12.2M
1436   12.2M    unsigned BaseReg = getLdStBaseOp(MemMI).getReg();
1437   12.2M    int MIUnscaledOffset = getLdStOffsetOp(MemMI).getImm() * getMemScale(MemMI);
1438   12.2M
1439   12.2M    // Scan forward looking for post-index opportunities.  Updating instructions
1440   12.2M    // can't be formed if the memory instruction doesn't have the offset we're
1441   12.2M    // looking for.
1442   12.2M    if (MIUnscaledOffset != UnscaledOffset)
1443   5.11M      return E;
1444   7.16M
1445   7.16M    // If the base register overlaps a destination register, we can't
1446   7.16M    // merge the update.
1447   7.16M    bool IsPairedInsn = isPairedLdSt(MemMI);
1448   16.3M    for (unsigned i = 0, e = IsPairedInsn ? 2 : 1; i != e; ++i) {
1449   9.56M      unsigned DestReg = getLdStRegOp(MemMI, i).getReg();
1450   9.56M      if (DestReg == BaseReg || TRI->isSubRegister(BaseReg, DestReg))
1451   401k         return E;
1452   9.56M    }
1453   7.16M
1454   7.16M    // Track which register units have been modified and used between the first
1455   7.16M    // insn (inclusive) and the second insn.
1456   7.16M    ModifiedRegUnits.clear();
1457   6.76M    UsedRegUnits.clear();
1458   6.76M    ++MBBI;
1459   18.4M    for (unsigned Count = 0; MBBI != E && Count < Limit; ++MBBI) {
1460   16.2M      MachineInstr &MI = *MBBI;
1461   16.2M
1462   16.2M      // Don't count transient instructions towards the search limit since there
1463   16.2M      // may be different numbers of them if e.g. debug information is present.
1464   16.2M      if (!MI.isTransient())
1465   16.1M        ++Count;
1466   16.2M
1467   16.2M      // If we found a match, return it.
1468   16.2M      if (isMatchingUpdateInsn(*I, MI, BaseReg, UnscaledOffset))
1469   870          return MBBI;
1470   16.2M
1471   16.2M      // Update the status of what the instruction clobbered and used.
1472   16.2M      LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits, UsedRegUnits, TRI);
1473   16.2M
1474   16.2M      // Otherwise, if the base register is used or modified, we have no match, so
1475   16.2M      // return early.
1476   16.2M      if (!ModifiedRegUnits.available(BaseReg) ||
1477   16.2M          !UsedRegUnits.available(BaseReg))
1478   4.53M        return E;
1479   16.2M    }
1480   6.76M    return E;
1481   6.76M  }
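
The shape of match this forward scan is looking for (illustrative; assumes no intervening use or def of the base register):

    //   ldr x0, [x20]         ; candidate, MIUnscaledOffset == UnscaledOffset
    //   ...                   ; up to UpdateLimit non-transient instructions
    //   add x20, x20, #32     ; matching update, foldable as ldr x0, [x20], #32
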
1482
1483          MachineBasicBlock::iterator AArch64LoadStoreOpt::findMatchingUpdateInsnBackward(
1484   6.05M      MachineBasicBlock::iterator I, unsigned Limit) {
1485   6.05M    MachineBasicBlock::iterator B = I->getParent()->begin();
1486   6.05M    MachineBasicBlock::iterator E = I->getParent()->end();
1487   6.05M    MachineInstr &MemMI = *I;
1488   6.05M    MachineBasicBlock::iterator MBBI = I;
1489   6.05M
1490   6.05M    unsigned BaseReg = getLdStBaseOp(MemMI).getReg();
1491   6.05M    int Offset = getLdStOffsetOp(MemMI).getImm();
1492   6.05M
1493   6.05M    // If the load/store is the first instruction in the block, there's obviously
1494   6.05M    // not any matching update. Ditto if the memory offset isn't zero.
1495   6.05M    if (MBBI == B || Offset != 0)
1496   5.16M      return E;
1497   884k     // If the base register overlaps a destination register, we can't
1498   884k     // merge the update.
1499   884k     bool IsPairedInsn = isPairedLdSt(MemMI);
1500   1.70M    for (unsigned i = 0, e = IsPairedInsn ? 2 : 1; i != e; ++i) {
1501   953k       unsigned DestReg = getLdStRegOp(MemMI, i).getReg();
1502   953k       if (DestReg == BaseReg || TRI->isSubRegister(BaseReg, DestReg))
1503   128k         return E;
1504   953k     }
1505   884k
1506   884k     // Track which register units have been modified and used between the first
1507   884k     // insn (inclusive) and the second insn.
1508   884k     ModifiedRegUnits.clear();
1509   755k     UsedRegUnits.clear();
1510   755k     unsigned Count = 0;
1511   2.74M    do {
1512   2.74M      --MBBI;
1513   2.74M      MachineInstr &MI = *MBBI;
1514   2.74M
1515   2.74M      // Don't count transient instructions towards the search limit since there
1516   2.74M      // may be different numbers of them if e.g. debug information is present.
1517   2.74M      if (!MI.isTransient())
1518   2.71M        ++Count;
1519   2.74M
1520   2.74M      // If we found a match, return it.
1521   2.74M      if (isMatchingUpdateInsn(*I, MI, BaseReg, Offset))
1522   181          return MBBI;
1523   2.74M
1524   2.74M      // Update the status of what the instruction clobbered and used.
1525   2.74M      LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits, UsedRegUnits, TRI);
1526   2.74M
1527   2.74M      // Otherwise, if the base register is used or modified, we have no match, so
1528   2.74M      // return early.
1529   2.74M      if (!ModifiedRegUnits.available(BaseReg) ||
1530   2.74M          !UsedRegUnits.available(BaseReg))
1531   547k         return E;
1532   2.19M    } while (MBBI != B && Count < Limit);
1533   755k     return E;
1534   755k   }
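
The backward scan handles the mirrored shape (illustrative example; the memory instruction's own offset must be zero, as checked above):

    //   add x0, x0, #8        ; matching update found scanning backward
    //   ...                   ; no intervening use or def of x0
    //   ldr x1, [x0]          ; Offset == 0, foldable as ldr x1, [x0, #8]!
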
1535
1536          bool AArch64LoadStoreOpt::tryToPromoteLoadFromStore(
1537   2.61M      MachineBasicBlock::iterator &MBBI) {
1538   2.61M    MachineInstr &MI = *MBBI;
1539   2.61M    // If this is a volatile load, don't mess with it.
1540   2.61M    if (MI.hasOrderedMemoryRef())
1541   327k       return false;
1542   2.28M
1543   2.28M    // Make sure this is a reg+imm.
1544   2.28M    // FIXME: It is possible to extend it to handle reg+reg cases.
1545   2.28M    if (!getLdStOffsetOp(MI).isImm())
1546   15.2k      return false;
1547   2.27M
1548   2.27M    // Look backward up to LdStLimit instructions.
1549   2.27M    MachineBasicBlock::iterator StoreI;
1550   2.27M    if (findMatchingStore(MBBI, LdStLimit, StoreI)) {
1551   226        ++NumLoadsFromStoresPromoted;
1552   226        // Promote the load. Keeping the iterator straight is a
1553   226        // pain, so we let the merge routine tell us what the next instruction
1554   226        // is after it's done mucking about.
1555   226        MBBI = promoteLoadFromStore(MBBI, StoreI);
1556   226        return true;
1557   226      }
1558   2.27M    return false;
1559   2.27M  }
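
An illustrative promotion (assumed example): when the load reads exactly the bytes a preceding store wrote, the load becomes a register move; a narrower load from inside the stored value becomes a bitfield extract, as the example in optimizeBlock below shows.

    //   str w1, [x0]
    //   ldr w2, [x0]          ; becomes: mov w2, w1
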
1560
1561          // Merge adjacent zero stores into a wider store.
1562          bool AArch64LoadStoreOpt::tryToMergeZeroStInst(
1563   57.4k      MachineBasicBlock::iterator &MBBI) {
1564   57.4k    assert(isPromotableZeroStoreInst(*MBBI) && "Expected narrow store.");
1565   57.4k    MachineInstr &MI = *MBBI;
1566   57.4k    MachineBasicBlock::iterator E = MI.getParent()->end();
1567   57.4k
1568   57.4k    if (!TII->isCandidateToMergeOrPair(MI))
1569   940        return false;
1570   56.5k
1571   56.5k    // Look ahead up to LdStLimit instructions for a mergable instruction.
1572   56.5k    LdStPairFlags Flags;
1573   56.5k    MachineBasicBlock::iterator MergeMI =
1574   56.5k        findMatchingInsn(MBBI, Flags, LdStLimit, /* FindNarrowMerge = */ true);
1575   56.5k    if (MergeMI != E) {
1576   7.74k      ++NumZeroStoresPromoted;
1577   7.74k
1578   7.74k      // Keeping the iterator straight is a pain, so we let the merge routine tell
1579   7.74k      // us what the next instruction is after it's done mucking about.
1580   7.74k      MBBI = mergeNarrowZeroStores(MBBI, MergeMI, Flags);
1581   7.74k      return true;
1582   7.74k    }
1583   48.8k    return false;
1584   48.8k  }
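
An illustrative narrow-zero-store merge (assumed example): only zero stores qualify, because the zero register lets the widened store produce the same bytes without having to combine two source registers.

    //   strb wzr, [x0]
    //   strb wzr, [x0, #1]    ; becomes: strh wzr, [x0]
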
1585
1586          // Find loads and stores that can be merged into a single load or store pair
1587          // instruction.
1588   3.75M  bool AArch64LoadStoreOpt::tryToPairLdStInst(MachineBasicBlock::iterator &MBBI) {
1589   3.75M    MachineInstr &MI = *MBBI;
1590   3.75M    MachineBasicBlock::iterator E = MI.getParent()->end();
1591   3.75M
1592   3.75M    if (!TII->isCandidateToMergeOrPair(MI))
1593   629k       return false;
1594   3.12M
1595   3.12M    // Early exit if the offset is not possible to match. (6 bits of positive
1596   3.12M    // range, plus allow an extra one in case we find a later insn that matches
1597   3.12M    // with Offset-1)
1598   3.12M    bool IsUnscaled = TII->isUnscaledLdSt(MI);
1599   3.12M    int Offset = getLdStOffsetOp(MI).getImm();
1600   3.12M    int OffsetStride = IsUnscaled ? getMemScale(MI) : 1;
1601   3.12M    // Allow one more for offset.
1602   3.12M    if (Offset > 0)
1603   2.24M      Offset -= OffsetStride;
1604   3.12M    if (!inBoundsForPair(IsUnscaled, Offset, OffsetStride))
1605   431k       return false;
1606   2.69M
1607   2.69M    // Look ahead up to LdStLimit instructions for a pairable instruction.
1608   2.69M    LdStPairFlags Flags;
1609   2.69M    MachineBasicBlock::iterator Paired =
1610   2.69M        findMatchingInsn(MBBI, Flags, LdStLimit, /* FindNarrowMerge = */ false);
1611   2.69M    if (Paired != E) {
1612   214k       ++NumPairCreated;
1613   214k       if (TII->isUnscaledLdSt(MI))
1614   19.4k        ++NumUnscaledPairCreated;
1615   214k       // Keeping the iterator straight is a pain, so we let the merge routine tell
1616   214k       // us what the next instruction is after it's done mucking about.
1617   214k       MBBI = mergePairedInsns(MBBI, Paired, Flags);
1618   214k       return true;
1619   214k     }
1620   2.48M    return false;
1621   2.48M  }
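
A worked instance of the early exit above, with assumed values, for an unscaled 8-byte store (OffsetStride == 8):

    // Offset == 512  ->  512 - 8 == 504  ->  504 / 8 == 63  -> in range, keep searching
    // Offset == 528  ->  528 - 8 == 520  ->  520 / 8 == 65  -> out of range, return false

The subtraction implements the "allow an extra one" note: a later instruction at Offset minus one stride could still anchor an in-range pair, so the bound is tested against the adjusted offset rather than the instruction's own.
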
1622
1623          bool AArch64LoadStoreOpt::tryToMergeLdStUpdate
1624   6.22M      (MachineBasicBlock::iterator &MBBI) {
1625   6.22M    MachineInstr &MI = *MBBI;
1626   6.22M    MachineBasicBlock::iterator E = MI.getParent()->end();
1627   6.22M    MachineBasicBlock::iterator Update;
1628   6.22M
1629   6.22M    // Look forward to try to form a post-index instruction. For example,
1630   6.22M    // ldr x0, [x20]
1631   6.22M    // add x20, x20, #32
1632   6.22M    //   merged into:
1633   6.22M    // ldr x0, [x20], #32
1634   6.22M    Update = findMatchingUpdateInsnForward(MBBI, 0, UpdateLimit);
1635   6.22M    if (Update != E) {
1636   833        // Merge the update into the ld/st.
1637   833        MBBI = mergeUpdateInsn(MBBI, Update, /*IsPreIdx=*/false);
1638   833        return true;
1639   833      }
1640   6.22M
1641   6.22M    // Don't know how to handle unscaled pre/post-index versions below, so bail.
1642   6.22M    if (TII->isUnscaledLdSt(MI.getOpcode()))
1643   170k       return false;
1644   6.05M
1645   6.05M    // Look back to try to find a pre-index instruction. For example,
1646   6.05M    // add x0, x0, #8
1647   6.05M    // ldr x1, [x0]
1648   6.05M    //   merged into:
1649   6.05M    // ldr x1, [x0, #8]!
1650   6.05M    Update = findMatchingUpdateInsnBackward(MBBI, UpdateLimit);
1651   6.05M    if (Update != E) {
1652   181        // Merge the update into the ld/st.
1653   181        MBBI = mergeUpdateInsn(MBBI, Update, /*IsPreIdx=*/true);
1654   181        return true;
1655   181      }
1656   6.05M
1657   6.05M    // The immediate in the load/store is scaled by the size of the memory
1658   6.05M    // operation. The immediate in the add we're looking for,
1659   6.05M    // however, is not, so adjust here.
1660   6.05M    int UnscaledOffset = getLdStOffsetOp(MI).getImm() * getMemScale(MI);
1661   6.05M
1662   6.05M    // Look forward to try to find a post-index instruction. For example,
1663   6.05M    // ldr x1, [x0, #64]
1664   6.05M    // add x0, x0, #64
1665   6.05M    //   merged into:
1666   6.05M    // ldr x1, [x0, #64]!
1667   6.05M    Update = findMatchingUpdateInsnForward(MBBI, UnscaledOffset, UpdateLimit);
1668   6.05M    if (Update != E) {
1669   37         // Merge the update into the ld/st.
1670   37         MBBI = mergeUpdateInsn(MBBI, Update, /*IsPreIdx=*/true);
1671   37         return true;
1672   37       }
1673   6.05M
1674   6.05M    return false;
1675   6.05M  }
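
One subtlety in the third search, shown as a sketch (a reading of the code above, not a quoted source comment): the update instruction is found by scanning forward, yet the merged form is pre-indexed (IsPreIdx == true), because the access already happens at base plus the scaled immediate; the writeback merely folds the later add into the same instruction.

    //   ldr x1, [x0, #64]     ; access at x0 + 64
    //   add x0, x0, #64       ; same amount -> fold as ldr x1, [x0, #64]!
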
1676
1677          bool AArch64LoadStoreOpt::optimizeBlock(MachineBasicBlock &MBB,
1678   4.14M                                          bool EnableNarrowZeroStOpt) {
1679   4.14M    bool Modified = false;
1680   4.14M    // Four transformations to do here:
1681   4.14M    // 1) Find loads that directly read from stores and promote them by
1682   4.14M    //    replacing with mov instructions. If the store is wider than the load,
1683   4.14M    //    the load will be replaced with a bitfield extract.
1684   4.14M    //      e.g.,
1685   4.14M    //        str w1, [x0, #4]
1686   4.14M    //        ldrh w2, [x0, #6]
1687   4.14M    //        ; becomes
1688   4.14M    //        str w1, [x0, #4]
1689   4.14M    //        lsr w2, w1, #16
1690   4.14M    for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
1691   33.8M         MBBI != E;) {
1692   29.6M      if (isPromotableLoadFromStore(*MBBI) && tryToPromoteLoadFromStore(MBBI))
1693   226          Modified = true;
1694   29.6M      else
1695   29.6M        ++MBBI;
1696   29.6M    }
1697   4.14M    // 2) Merge adjacent zero stores into a wider store.
1698   4.14M    //      e.g.,
1699   4.14M    //        strh wzr, [x0]
1700   4.14M    //        strh wzr, [x0, #2]
1701   4.14M    //        ; becomes
1702   4.14M    //        str wzr, [x0]
1703   4.14M    //      e.g.,
1704   4.14M    //        str wzr, [x0]
1705   4.14M    //        str wzr, [x0, #4]
1706   4.14M    //        ; becomes
1707   4.14M    //        str xzr, [x0]
1708   4.14M    if (EnableNarrowZeroStOpt)
1709   4.14M      for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
1710   33.7M           MBBI != E;) {
1711   29.6M        if (isPromotableZeroStoreInst(*MBBI) && tryToMergeZeroStInst(MBBI))
1712   7.74k          Modified = true;
1713   29.6M        else
1714   29.6M          ++MBBI;
1715   29.6M      }
1716   4.14M    // 3) Find loads and stores that can be merged into a single load or store
1717   4.14M    //    pair instruction.
1718   4.14M    //      e.g.,
1719   4.14M    //        ldr x0, [x2]
1720   4.14M    //        ldr x1, [x2, #8]
1721   4.14M    //        ; becomes
1722   4.14M    //        ldp x0, x1, [x2]
1723   4.14M    for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
1724   33.5M         MBBI != E;) {
1725   29.4M      if (TII->isPairableLdStInst(*MBBI) && tryToPairLdStInst(MBBI))
1726   214k         Modified = true;
1727   29.2M      else
1728   29.2M        ++MBBI;
1729   29.4M    }
1730   4.14M    // 4) Find base register updates that can be merged into the load or store
1731   4.14M    //    as a base-reg writeback.
1732   4.14M    //      e.g.,
1733   4.14M    //        ldr x0, [x2]
1734   4.14M    //        add x2, x2, #4
1735   4.14M    //        ; becomes
1736   4.14M    //        ldr x0, [x2], #4
1737   4.14M    for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
1738   33.5M         MBBI != E;) {
1739   29.4M      if (isMergeableLdStUpdate(*MBBI) && tryToMergeLdStUpdate(MBBI))
1740   1.05k        Modified = true;
1741   29.4M      else
1742   29.4M        ++MBBI;
1743   29.4M    }
1744   4.14M
1745   4.14M    return Modified;
1746   4.14M  }
1747
1748   497k   bool AArch64LoadStoreOpt::runOnMachineFunction(MachineFunction &Fn) {
1749   497k     if (skipFunction(Fn.getFunction()))
1750   17         return false;
1751   497k
1752   497k     Subtarget = &static_cast<const AArch64Subtarget &>(Fn.getSubtarget());
1753   497k     TII = static_cast<const AArch64InstrInfo *>(Subtarget->getInstrInfo());
1754   497k     TRI = Subtarget->getRegisterInfo();
1755   497k     AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
1756   497k
1757   497k     // Resize the modified and used register unit trackers.  We do this once
1758   497k     // per function and then clear the register units each time we optimize a load
1759   497k     // or store.
1760   497k     ModifiedRegUnits.init(*TRI);
1761   497k     UsedRegUnits.init(*TRI);
1762   497k
1763   497k     bool Modified = false;
1764   497k     bool enableNarrowZeroStOpt = !Subtarget->requiresStrictAlign();
1765   497k     for (auto &MBB : Fn)
1766   4.14M      Modified |= optimizeBlock(MBB, enableNarrowZeroStOpt);
1767   497k
1768   497k     return Modified;
1769   497k   }
1770
1771          // FIXME: Do we need/want a pre-alloc pass like ARM has to try to keep loads and
1772          // stores near one another?  Note: The pre-RA instruction scheduler already has
1773          // hooks to try and schedule pairable loads/stores together to improve pairing
1774          // opportunities.  Thus, a pre-RA pairing pass may not be worth the effort.
1775
1776          // FIXME: When pairing store instructions it's very possible for this pass to
1777          // hoist a store with a KILL marker above another use (without a KILL marker).
1778          // The resulting IR is invalid, but nothing uses the KILL markers after this
1779          // pass, so it's never caused a problem in practice.
1780
1781          /// createAArch64LoadStoreOptimizationPass - returns an instance of the
1782          /// load / store optimization pass.
1783   15.9k  FunctionPass *llvm::createAArch64LoadStoreOptimizationPass() {
1784   15.9k    return new AArch64LoadStoreOpt();
1785   15.9k  }