Coverage Report

Created: 2019-07-24 05:18

/Users/buildslave/jenkins/workspace/clang-stage2-coverage-R/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
Line | Count | Source
1
//===- SILoadStoreOptimizer.cpp -------------------------------------------===//
2
//
3
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4
// See https://llvm.org/LICENSE.txt for license information.
5
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6
//
7
//===----------------------------------------------------------------------===//
8
//
9
// This pass tries to fuse DS instructions with nearby immediate offsets.
10
// This will fuse operations such as
11
//  ds_read_b32 v0, v2 offset:16
12
//  ds_read_b32 v1, v2 offset:32
13
// ==>
14
//   ds_read2_b32 v[0:1], v2, offset0:4 offset1:8
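//  (In the fused read2/write2 form, offset0/offset1 are encoded in units of the
//   element size, so the byte offsets 16 and 32 above become 16/4 = 4 and
//   32/4 = 8 for b32 elements.)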
15
//
16
// The same is done for certain SMEM and VMEM opcodes, e.g.:
17
//  s_buffer_load_dword s4, s[0:3], 4
18
//  s_buffer_load_dword s5, s[0:3], 8
19
// ==>
20
//  s_buffer_load_dwordx2 s[4:5], s[0:3], 4
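//  (The merged load keeps the smaller of the two original offsets.)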
21
//
22
// This pass also tries to promote constant offset to the immediate by
23
// adjusting the base. It tries to use a base from the nearby instructions that
24
// allows it to have a 13-bit constant offset and then promotes the 13-bit offset
25
// to the immediate.
26
// E.g.
27
//  s_movk_i32 s0, 0x1800
28
//  v_add_co_u32_e32 v0, vcc, s0, v2
29
//  v_addc_co_u32_e32 v1, vcc, 0, v6, vcc
30
//
31
//  s_movk_i32 s0, 0x1000
32
//  v_add_co_u32_e32 v5, vcc, s0, v2
33
//  v_addc_co_u32_e32 v6, vcc, 0, v6, vcc
34
//  global_load_dwordx2 v[5:6], v[5:6], off
35
//  global_load_dwordx2 v[0:1], v[0:1], off
36
// =>
37
//  s_movk_i32 s0, 0x1000
38
//  v_add_co_u32_e32 v5, vcc, s0, v2
39
//  v_addc_co_u32_e32 v6, vcc, 0, v6, vcc
40
//  global_load_dwordx2 v[5:6], v[5:6], off
41
//  global_load_dwordx2 v[0:1], v[5:6], off offset:2048
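//  (The now-dead base computation from the first sequence is omitted above; it
//   is presumably removed by later dead-code elimination rather than by this
//   pass itself.)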
42
//
43
// Future improvements:
44
//
45
// - This currently relies on the scheduler to place loads and stores next to
46
//   each other, and then only merges adjacent pairs of instructions. It would
47
//   be good to be more flexible with interleaved instructions, and possibly run
48
//   before scheduling. It currently misses stores of constants because loading
49
//   the constant into the data register is placed between the stores, although
50
//   this is arguably a scheduling problem.
51
//
52
// - Live interval recomputing seems inefficient. This currently only matches
53
//   one pair, and recomputes live intervals and moves on to the next pair. It
54
//   would be better to compute a list of all merges that need to occur.
55
//
56
// - With a list of instructions to process, we can also merge more. If a
57
//   cluster of loads has offsets that are too large to fit in the 8-bit
58
//   offset fields, but are close enough together to fit in 8 bits, we can add to the base
59
//   pointer and use the new reduced offsets.
60
//
61
//===----------------------------------------------------------------------===//
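//
// Usage sketch (not from the original file; the exact invocation is an
// assumption and may vary between LLVM versions): the pass is registered under
// the name "si-load-store-opt" (see DEBUG_TYPE below), so it can typically be
// exercised in isolation on MIR input with something like:
//   llc -march=amdgcn -mcpu=gfx900 -run-pass=si-load-store-opt -o - input.mir
//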
62
63
#include "AMDGPU.h"
64
#include "AMDGPUSubtarget.h"
65
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
66
#include "SIInstrInfo.h"
67
#include "SIRegisterInfo.h"
68
#include "Utils/AMDGPUBaseInfo.h"
69
#include "llvm/ADT/ArrayRef.h"
70
#include "llvm/ADT/SmallVector.h"
71
#include "llvm/ADT/StringRef.h"
72
#include "llvm/Analysis/AliasAnalysis.h"
73
#include "llvm/CodeGen/MachineBasicBlock.h"
74
#include "llvm/CodeGen/MachineFunction.h"
75
#include "llvm/CodeGen/MachineFunctionPass.h"
76
#include "llvm/CodeGen/MachineInstr.h"
77
#include "llvm/CodeGen/MachineInstrBuilder.h"
78
#include "llvm/CodeGen/MachineOperand.h"
79
#include "llvm/CodeGen/MachineRegisterInfo.h"
80
#include "llvm/IR/DebugLoc.h"
81
#include "llvm/Pass.h"
82
#include "llvm/Support/Debug.h"
83
#include "llvm/Support/MathExtras.h"
84
#include "llvm/Support/raw_ostream.h"
85
#include <algorithm>
86
#include <cassert>
87
#include <cstdlib>
88
#include <iterator>
89
#include <utility>
90
91
using namespace llvm;
92
93
#define DEBUG_TYPE "si-load-store-opt"
94
95
namespace {
96
enum InstClassEnum {
97
  UNKNOWN,
98
  DS_READ,
99
  DS_WRITE,
100
  S_BUFFER_LOAD_IMM,
101
  BUFFER_LOAD_OFFEN = AMDGPU::BUFFER_LOAD_DWORD_OFFEN,
102
  BUFFER_LOAD_OFFSET = AMDGPU::BUFFER_LOAD_DWORD_OFFSET,
103
  BUFFER_STORE_OFFEN = AMDGPU::BUFFER_STORE_DWORD_OFFEN,
104
  BUFFER_STORE_OFFSET = AMDGPU::BUFFER_STORE_DWORD_OFFSET,
105
  BUFFER_LOAD_OFFEN_exact = AMDGPU::BUFFER_LOAD_DWORD_OFFEN_exact,
106
  BUFFER_LOAD_OFFSET_exact = AMDGPU::BUFFER_LOAD_DWORD_OFFSET_exact,
107
  BUFFER_STORE_OFFEN_exact = AMDGPU::BUFFER_STORE_DWORD_OFFEN_exact,
108
  BUFFER_STORE_OFFSET_exact = AMDGPU::BUFFER_STORE_DWORD_OFFSET_exact,
109
};
110
111
enum RegisterEnum {
112
  SBASE = 0x1,
113
  SRSRC = 0x2,
114
  SOFFSET = 0x4,
115
  VADDR = 0x8,
116
  ADDR = 0x10,
117
};
118
119
class SILoadStoreOptimizer : public MachineFunctionPass {
120
  struct CombineInfo {
121
    MachineBasicBlock::iterator I;
122
    MachineBasicBlock::iterator Paired;
123
    unsigned EltSize;
124
    unsigned Offset0;
125
    unsigned Offset1;
126
    unsigned Width0;
127
    unsigned Width1;
128
    unsigned BaseOff;
129
    InstClassEnum InstClass;
130
    bool GLC0;
131
    bool GLC1;
132
    bool SLC0;
133
    bool SLC1;
134
    bool DLC0;
135
    bool DLC1;
136
    bool UseST64;
137
    SmallVector<MachineInstr *, 8> InstsToMove;
138
  };
139
140
  struct BaseRegisters {
141
    unsigned LoReg = 0;
142
    unsigned HiReg = 0;
143
144
    unsigned LoSubReg = 0;
145
    unsigned HiSubReg = 0;
146
  };
147
148
  struct MemAddress {
149
    BaseRegisters Base;
150
    int64_t Offset = 0;
151
  };
152
153
  using MemInfoMap = DenseMap<MachineInstr *, MemAddress>;
154
155
private:
156
  const GCNSubtarget *STM = nullptr;
157
  const SIInstrInfo *TII = nullptr;
158
  const SIRegisterInfo *TRI = nullptr;
159
  MachineRegisterInfo *MRI = nullptr;
160
  AliasAnalysis *AA = nullptr;
161
  bool OptimizeAgain;
162
163
  static bool offsetsCanBeCombined(CombineInfo &CI);
164
  static bool widthsFit(const GCNSubtarget &STM, const CombineInfo &CI);
165
  static unsigned getNewOpcode(const CombineInfo &CI);
166
  static std::pair<unsigned, unsigned> getSubRegIdxs(const CombineInfo &CI);
167
  const TargetRegisterClass *getTargetRegisterClass(const CombineInfo &CI);
168
  unsigned getOpcodeWidth(const MachineInstr &MI);
169
  InstClassEnum getInstClass(unsigned Opc);
170
  unsigned getRegs(unsigned Opc);
171
172
  bool findMatchingInst(CombineInfo &CI);
173
174
  unsigned read2Opcode(unsigned EltSize) const;
175
  unsigned read2ST64Opcode(unsigned EltSize) const;
176
  MachineBasicBlock::iterator mergeRead2Pair(CombineInfo &CI);
177
178
  unsigned write2Opcode(unsigned EltSize) const;
179
  unsigned write2ST64Opcode(unsigned EltSize) const;
180
  MachineBasicBlock::iterator mergeWrite2Pair(CombineInfo &CI);
181
  MachineBasicBlock::iterator mergeSBufferLoadImmPair(CombineInfo &CI);
182
  MachineBasicBlock::iterator mergeBufferLoadPair(CombineInfo &CI);
183
  MachineBasicBlock::iterator mergeBufferStorePair(CombineInfo &CI);
184
185
  void updateBaseAndOffset(MachineInstr &I, unsigned NewBase,
186
                           int32_t NewOffset);
187
  unsigned computeBase(MachineInstr &MI, const MemAddress &Addr);
188
  MachineOperand createRegOrImm(int32_t Val, MachineInstr &MI);
189
  Optional<int32_t> extractConstOffset(const MachineOperand &Op);
190
  void processBaseWithConstOffset(const MachineOperand &Base, MemAddress &Addr);
191
  /// Promotes constant offset to the immediate by adjusting the base. It
192
  /// tries to use a base from the nearby instructions that allows it to have
193
  /// a 13-bit constant offset which gets promoted to the immediate.
194
  bool promoteConstantOffsetToImm(MachineInstr &CI,
195
                                  MemInfoMap &Visited,
196
                                  SmallPtrSet<MachineInstr *, 4> &Promoted);
197
198
public:
199
  static char ID;
200
201
2.39k
  SILoadStoreOptimizer() : MachineFunctionPass(ID) {
202
2.39k
    initializeSILoadStoreOptimizerPass(*PassRegistry::getPassRegistry());
203
2.39k
  }
204
205
  bool optimizeBlock(MachineBasicBlock &MBB);
206
207
  bool runOnMachineFunction(MachineFunction &MF) override;
208
209
27.6k
  StringRef getPassName() const override { return "SI Load Store Optimizer"; }
210
211
2.37k
  void getAnalysisUsage(AnalysisUsage &AU) const override {
212
2.37k
    AU.setPreservesCFG();
213
2.37k
    AU.addRequired<AAResultsWrapperPass>();
214
2.37k
215
2.37k
    MachineFunctionPass::getAnalysisUsage(AU);
216
2.37k
  }
217
};
218
219
} // end anonymous namespace.
220
221
101k
INITIALIZE_PASS_BEGIN(SILoadStoreOptimizer, DEBUG_TYPE,
222
101k
                      "SI Load Store Optimizer", false, false)
223
101k
INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
224
101k
INITIALIZE_PASS_END(SILoadStoreOptimizer, DEBUG_TYPE, "SI Load Store Optimizer",
225
                    false, false)
226
227
char SILoadStoreOptimizer::ID = 0;
228
229
char &llvm::SILoadStoreOptimizerID = SILoadStoreOptimizer::ID;
230
231
0
FunctionPass *llvm::createSILoadStoreOptimizerPass() {
232
0
  return new SILoadStoreOptimizer();
233
0
}
234
235
static void moveInstsAfter(MachineBasicBlock::iterator I,
236
3.63k
                           ArrayRef<MachineInstr *> InstsToMove) {
237
3.63k
  MachineBasicBlock *MBB = I->getParent();
238
3.63k
  ++I;
239
3.63k
  for (MachineInstr *MI : InstsToMove) {
240
981
    MI->removeFromParent();
241
981
    MBB->insert(I, MI);
242
981
  }
243
3.63k
}
244
245
static void addDefsUsesToList(const MachineInstr &MI,
246
                              DenseSet<unsigned> &RegDefs,
247
35.9k
                              DenseSet<unsigned> &PhysRegUses) {
248
232k
  for (const MachineOperand &Op : MI.operands()) {
249
232k
    if (Op.isReg()) {
250
126k
      if (Op.isDef())
251
23.3k
        RegDefs.insert(Op.getReg());
252
103k
      else if (Op.readsReg() &&
253
103k
               TargetRegisterInfo::isPhysicalRegister(Op.getReg()))
254
39.5k
        PhysRegUses.insert(Op.getReg());
255
126k
    }
256
232k
  }
257
35.9k
}
258
259
static bool memAccessesCanBeReordered(MachineBasicBlock::iterator A,
260
                                      MachineBasicBlock::iterator B,
261
49.1k
                                      AliasAnalysis *AA) {
262
49.1k
  // RAW or WAR - cannot reorder
263
49.1k
  // WAW - cannot reorder
264
49.1k
  // RAR - safe to reorder
265
49.1k
  return !(A->mayStore() || B->mayStore()) || !A->mayAlias(AA, *B, true);
266
49.1k
}
267
268
// Add MI and its defs to the lists if MI reads one of the defs that are
269
// already in the list. Returns true in that case.
270
static bool addToListsIfDependent(MachineInstr &MI, DenseSet<unsigned> &RegDefs,
271
                                  DenseSet<unsigned> &PhysRegUses,
272
208k
                                  SmallVectorImpl<MachineInstr *> &Insts) {
273
1.04M
  for (MachineOperand &Use : MI.operands()) {
274
1.04M
    // If one of the defs is read, then there is a use of Def between I and the
275
1.04M
    // instruction that I will potentially be merged with. We will need to move
276
1.04M
    // this instruction after the merged instructions.
277
1.04M
    //
278
1.04M
    // Similarly, if there is a def which is read by an instruction that is to
279
1.04M
    // be moved for merging, then we need to move the def-instruction as well.
280
1.04M
    // This can only happen for physical registers such as M0; virtual
281
1.04M
    // registers are in SSA form.
282
1.04M
    if (Use.isReg() &&
283
1.04M
        ((Use.readsReg() && RegDefs.count(Use.getReg())) ||
284
615k
         (Use.isDef() && RegDefs.count(Use.getReg())) ||
285
615k
         (Use.isDef() && TargetRegisterInfo::isPhysicalRegister(Use.getReg()) &&
286
597k
          PhysRegUses.count(Use.getReg())))) {
287
18.1k
      Insts.push_back(&MI);
288
18.1k
      addDefsUsesToList(MI, RegDefs, PhysRegUses);
289
18.1k
      return true;
290
18.1k
    }
291
1.04M
  }
292
208k
293
208k
  return false;
294
208k
}
295
296
static bool canMoveInstsAcrossMemOp(MachineInstr &MemOp,
297
                                    ArrayRef<MachineInstr *> InstsToMove,
298
44.6k
                                    AliasAnalysis *AA) {
299
44.6k
  assert(MemOp.mayLoadOrStore());
300
44.6k
301
47.9k
  for (MachineInstr *InstToMove : InstsToMove) {
302
47.9k
    if (!InstToMove->mayLoadOrStore())
303
47.8k
      continue;
304
106
    if (!memAccessesCanBeReordered(MemOp, *InstToMove, AA))
305
63
      return false;
306
106
  }
307
44.6k
  return true;
308
44.6k
}
309
310
7.59k
bool SILoadStoreOptimizer::offsetsCanBeCombined(CombineInfo &CI) {
311
7.59k
  // XXX - Would the same offset be OK? Is there any reason this would happen or
312
7.59k
  // be useful?
313
7.59k
  if (CI.Offset0 == CI.Offset1)
314
30
    return false;
315
7.56k
316
7.56k
  // This won't be valid if the offset isn't aligned.
317
7.56k
  if ((CI.Offset0 % CI.EltSize != 0) || (CI.Offset1 % CI.EltSize != 0))
318
0
    return false;
319
7.56k
320
7.56k
  unsigned EltOffset0 = CI.Offset0 / CI.EltSize;
321
7.56k
  unsigned EltOffset1 = CI.Offset1 / CI.EltSize;
322
7.56k
  CI.UseST64 = false;
323
7.56k
  CI.BaseOff = 0;
324
7.56k
325
7.56k
  // Handle SMEM and VMEM instructions.
326
7.56k
  if ((CI.InstClass != DS_READ) && (CI.InstClass != DS_WRITE)) {
327
4.32k
    return (EltOffset0 + CI.Width0 == EltOffset1 ||
328
4.32k
            EltOffset1 + CI.Width1 == EltOffset0) &&
329
4.32k
           CI.GLC0 == CI.GLC1 && CI.DLC0 == CI.DLC1 &&
330
4.32k
           (CI.InstClass == S_BUFFER_LOAD_IMM || CI.SLC0 == CI.SLC1);
331
4.32k
  }
332
3.23k
333
3.23k
  // If the offset in elements doesn't fit in 8-bits, we might be able to use
334
3.23k
  // the stride 64 versions.
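  // Worked example (illustrative numbers, not from the original comment): with
  // EltSize = 4, byte offsets 0x1000 and 0x1100 give element offsets 1024 and
  // 1088, which do not fit in 8 bits; both are multiples of 64, so the encoded
  // offsets become 1024/64 = 16 and 1088/64 = 17 and the ST64 form can be used.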
335
3.23k
  if ((EltOffset0 % 64 == 0) && (EltOffset1 % 64) == 0 &&
336
3.23k
      isUInt<8>(EltOffset0 / 64) && isUInt<8>(EltOffset1 / 64)) {
337
40
    CI.Offset0 = EltOffset0 / 64;
338
40
    CI.Offset1 = EltOffset1 / 64;
339
40
    CI.UseST64 = true;
340
40
    return true;
341
40
  }
342
3.19k
343
3.19k
  // Check if the new offsets fit in the reduced 8-bit range.
344
3.19k
  if (isUInt<8>(EltOffset0) && isUInt<8>(EltOffset1)) {
345
3.11k
    CI.Offset0 = EltOffset0;
346
3.11k
    CI.Offset1 = EltOffset1;
347
3.11k
    return true;
348
3.11k
  }
349
73
350
73
  // Try to shift base address to decrease offsets.
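  // Worked example (illustrative numbers, not from the original comment): with
  // EltSize = 4, byte offsets 0x404 and 0x408 give element offsets 257 and 258,
  // which do not fit in 8 bits. BaseOff becomes 0x404, and the re-based element
  // offsets 257 - 0x404/4 = 0 and 258 - 0x404/4 = 1 fit in the reduced range.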
351
73
  unsigned OffsetDiff = std::abs((int)EltOffset1 - (int)EltOffset0);
352
73
  CI.BaseOff = std::min(CI.Offset0, CI.Offset1);
353
73
354
73
  if ((OffsetDiff % 64 == 0) && isUInt<8>(OffsetDiff / 64)) {
355
24
    CI.Offset0 = (EltOffset0 - CI.BaseOff / CI.EltSize) / 64;
356
24
    CI.Offset1 = (EltOffset1 - CI.BaseOff / CI.EltSize) / 64;
357
24
    CI.UseST64 = true;
358
24
    return true;
359
24
  }
360
49
361
49
  if (isUInt<8>(OffsetDiff)) {
362
40
    CI.Offset0 = EltOffset0 - CI.BaseOff / CI.EltSize;
363
40
    CI.Offset1 = EltOffset1 - CI.BaseOff / CI.EltSize;
364
40
    return true;
365
40
  }
366
9
367
9
  return false;
368
9
}
369
370
bool SILoadStoreOptimizer::widthsFit(const GCNSubtarget &STM,
371
36.2k
                                     const CombineInfo &CI) {
372
36.2k
  const unsigned Width = (CI.Width0 + CI.Width1);
373
36.2k
  switch (CI.InstClass) {
374
36.2k
  default:
375
24.8k
    return (Width <= 4) && (STM.hasDwordx3LoadStores() || (Width != 3));
376
36.2k
  case S_BUFFER_LOAD_IMM:
377
11.3k
    switch (Width) {
378
11.3k
    default:
379
8.19k
      return false;
380
11.3k
    case 2:
381
3.17k
    case 4:
382
3.17k
      return true;
383
11.3k
    }
384
36.2k
  }
385
36.2k
}
386
387
72.5k
unsigned SILoadStoreOptimizer::getOpcodeWidth(const MachineInstr &MI) {
388
72.5k
  const unsigned Opc = MI.getOpcode();
389
72.5k
390
72.5k
  if (TII->isMUBUF(MI)) {
391
43.2k
    return AMDGPU::getMUBUFDwords(Opc);
392
43.2k
  }
393
29.2k
394
29.2k
  switch (Opc) {
395
29.2k
  default:
396
6.49k
    return 0;
397
29.2k
  case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
398
11.2k
    return 1;
399
29.2k
  case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
400
9.32k
    return 2;
401
29.2k
  case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
402
2.16k
    return 4;
403
29.2k
  }
404
29.2k
}
405
406
763k
InstClassEnum SILoadStoreOptimizer::getInstClass(unsigned Opc) {
407
763k
  if (TII->isMUBUF(Opc)) {
408
75.0k
    const int baseOpcode = AMDGPU::getMUBUFBaseOpcode(Opc);
409
75.0k
410
75.0k
    // If we couldn't identify the opcode, bail out.
411
75.0k
    if (baseOpcode == -1) {
412
0
      return UNKNOWN;
413
0
    }
414
75.0k
415
75.0k
    switch (baseOpcode) {
416
75.0k
    default:
417
5.88k
      return UNKNOWN;
418
75.0k
    case AMDGPU::BUFFER_LOAD_DWORD_OFFEN:
419
5.18k
      return BUFFER_LOAD_OFFEN;
420
75.0k
    case AMDGPU::BUFFER_LOAD_DWORD_OFFSET:
421
6.92k
      return BUFFER_LOAD_OFFSET;
422
75.0k
    case AMDGPU::BUFFER_STORE_DWORD_OFFEN:
423
11.8k
      return BUFFER_STORE_OFFEN;
424
75.0k
    case AMDGPU::BUFFER_STORE_DWORD_OFFSET:
425
44.4k
      return BUFFER_STORE_OFFSET;
426
75.0k
    case AMDGPU::BUFFER_LOAD_DWORD_OFFEN_exact:
427
0
      return BUFFER_LOAD_OFFEN_exact;
428
75.0k
    case AMDGPU::BUFFER_LOAD_DWORD_OFFSET_exact:
429
0
      return BUFFER_LOAD_OFFSET_exact;
430
75.0k
    case AMDGPU::BUFFER_STORE_DWORD_OFFEN_exact:
431
442
      return BUFFER_STORE_OFFEN_exact;
432
75.0k
    case AMDGPU::BUFFER_STORE_DWORD_OFFSET_exact:
433
272
      return BUFFER_STORE_OFFSET_exact;
434
688k
    }
435
688k
  }
436
688k
437
688k
  switch (Opc) {
438
688k
  default:
439
662k
    return UNKNOWN;
440
688k
  case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
441
12.8k
  case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
442
12.8k
  case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
443
12.8k
    return S_BUFFER_LOAD_IMM;
444
12.8k
  case AMDGPU::DS_READ_B32:
445
5.36k
  case AMDGPU::DS_READ_B64:
446
5.36k
  case AMDGPU::DS_READ_B32_gfx9:
447
5.36k
  case AMDGPU::DS_READ_B64_gfx9:
448
5.36k
    return DS_READ;
449
8.55k
  case AMDGPU::DS_WRITE_B32:
450
8.55k
  case AMDGPU::DS_WRITE_B64:
451
8.55k
  case AMDGPU::DS_WRITE_B32_gfx9:
452
8.55k
  case AMDGPU::DS_WRITE_B64_gfx9:
453
8.55k
    return DS_WRITE;
454
688k
  }
455
688k
}
456
457
25.5k
unsigned SILoadStoreOptimizer::getRegs(unsigned Opc) {
458
25.5k
  if (TII->isMUBUF(Opc)) {
459
20.0k
    unsigned result = 0;
460
20.0k
461
20.0k
    if (AMDGPU::getMUBUFHasVAddr(Opc)) {
462
7.29k
      result |= VADDR;
463
7.29k
    }
464
20.0k
465
20.0k
    if (AMDGPU::getMUBUFHasSrsrc(Opc)) {
466
20.0k
      result |= SRSRC;
467
20.0k
    }
468
20.0k
469
20.0k
    if (AMDGPU::getMUBUFHasSoffset(Opc)) {
470
20.0k
      result |= SOFFSET;
471
20.0k
    }
472
20.0k
473
20.0k
    return result;
474
20.0k
  }
475
5.54k
476
5.54k
  switch (Opc) {
477
5.54k
  default:
478
0
    return 0;
479
5.54k
  case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
480
748
  case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
481
748
  case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
482
748
    return SBASE;
483
4.79k
  case AMDGPU::DS_READ_B32:
484
4.79k
  case AMDGPU::DS_READ_B64:
485
4.79k
  case AMDGPU::DS_READ_B32_gfx9:
486
4.79k
  case AMDGPU::DS_READ_B64_gfx9:
487
4.79k
  case AMDGPU::DS_WRITE_B32:
488
4.79k
  case AMDGPU::DS_WRITE_B64:
489
4.79k
  case AMDGPU::DS_WRITE_B32_gfx9:
490
4.79k
  case AMDGPU::DS_WRITE_B64_gfx9:
491
4.79k
    return ADDR;
492
5.54k
  }
493
5.54k
}
494
495
25.3k
bool SILoadStoreOptimizer::findMatchingInst(CombineInfo &CI) {
496
25.3k
  MachineBasicBlock *MBB = CI.I->getParent();
497
25.3k
  MachineBasicBlock::iterator E = MBB->end();
498
25.3k
  MachineBasicBlock::iterator MBBI = CI.I;
499
25.3k
500
25.3k
  const unsigned Opc = CI.I->getOpcode();
501
25.3k
  const InstClassEnum InstClass = getInstClass(Opc);
502
25.3k
503
25.3k
  if (InstClass == UNKNOWN) {
504
0
    return false;
505
0
  }
506
25.3k
507
25.3k
  const unsigned Regs = getRegs(Opc);
508
25.3k
509
25.3k
  unsigned AddrOpName[5] = {0};
510
25.3k
  int AddrIdx[5];
511
25.3k
  const MachineOperand *AddrReg[5];
512
25.3k
  unsigned NumAddresses = 0;
513
25.3k
514
25.3k
  if (Regs & ADDR) {
515
4.79k
    AddrOpName[NumAddresses++] = AMDGPU::OpName::addr;
516
4.79k
  }
517
25.3k
518
25.3k
  if (Regs & SBASE) {
519
748
    AddrOpName[NumAddresses++] = AMDGPU::OpName::sbase;
520
748
  }
521
25.3k
522
25.3k
  if (Regs & SRSRC) {
523
19.8k
    AddrOpName[NumAddresses++] = AMDGPU::OpName::srsrc;
524
19.8k
  }
525
25.3k
526
25.3k
  if (Regs & SOFFSET) {
527
19.8k
    AddrOpName[NumAddresses++] = AMDGPU::OpName::soffset;
528
19.8k
  }
529
25.3k
530
25.3k
  if (Regs & VADDR) {
531
7.17k
    AddrOpName[NumAddresses++] = AMDGPU::OpName::vaddr;
532
7.17k
  }
533
25.3k
534
40.7k
  for (unsigned i = 0; i < NumAddresses; i++) {
535
31.0k
    AddrIdx[i] = AMDGPU::getNamedOperandIdx(CI.I->getOpcode(), AddrOpName[i]);
536
31.0k
    AddrReg[i] = &CI.I->getOperand(AddrIdx[i]);
537
31.0k
538
31.0k
    // We only ever merge operations with the same base address register, so
539
31.0k
    // don't bother scanning forward if there are no other uses.
540
31.0k
    if (AddrReg[i]->isReg() &&
541
31.0k
        (TargetRegisterInfo::isPhysicalRegister(AddrReg[i]->getReg()) ||
542
25.7k
         MRI->hasOneNonDBGUse(AddrReg[i]->getReg())))
543
15.7k
      return false;
544
31.0k
  }
545
25.3k
546
25.3k
  ++MBBI;
547
9.64k
548
9.64k
  DenseSet<unsigned> RegDefsToMove;
549
9.64k
  DenseSet<unsigned> PhysRegUsesToMove;
550
9.64k
  addDefsUsesToList(*CI.I, RegDefsToMove, PhysRegUsesToMove);
551
9.64k
552
222k
  for (; MBBI != E; ++MBBI) {
553
216k
    const bool IsDS = (InstClass == DS_READ) || (InstClass == DS_WRITE);
554
216k
555
216k
    if ((getInstClass(MBBI->getOpcode()) != InstClass) ||
556
216k
        (IsDS && (MBBI->getOpcode() != Opc))) {
557
179k
      // This is not a matching DS instruction, but we can keep looking as
558
179k
      // long as one of these conditions are met:
559
179k
      // 1. It is safe to move I down past MBBI.
560
179k
      // 2. It is safe to move MBBI down past the instruction that I will
561
179k
      //    be merged into.
562
179k
563
179k
      if (MBBI->hasUnmodeledSideEffects()) {
564
142
        // We can't re-order this instruction with respect to other memory
565
142
        // operations, so we fail both conditions mentioned above.
566
142
        return false;
567
142
      }
568
179k
569
179k
      if (MBBI->mayLoadOrStore() &&
570
179k
          (!memAccessesCanBeReordered(*CI.I, *MBBI, AA) ||
571
16.3k
           !canMoveInstsAcrossMemOp(*MBBI, CI.InstsToMove, AA))) {
572
8.08k
        // We fail condition #1, but we may still be able to satisfy condition
573
8.08k
        // #2.  Add this instruction to the move list and then we will check
574
8.08k
        // if condition #2 holds once we have selected the matching instruction.
575
8.08k
        CI.InstsToMove.push_back(&*MBBI);
576
8.08k
        addDefsUsesToList(*MBBI, RegDefsToMove, PhysRegUsesToMove);
577
8.08k
        continue;
578
8.08k
      }
579
171k
580
171k
      // When we match I with another DS instruction we will be moving I down
581
171k
      // to the location of the matched instruction; any uses of I will need to
582
171k
      // be moved down as well.
583
171k
      addToListsIfDependent(*MBBI, RegDefsToMove, PhysRegUsesToMove,
584
171k
                            CI.InstsToMove);
585
171k
      continue;
586
171k
    }
587
36.5k
588
36.5k
    // Don't merge volatiles.
589
36.5k
    if (MBBI->hasOrderedMemoryRef())
590
6
      return false;
591
36.5k
592
36.5k
    // Handle a case like
593
36.5k
    //   DS_WRITE_B32 addr, v, idx0
594
36.5k
    //   w = DS_READ_B32 addr, idx0
595
36.5k
    //   DS_WRITE_B32 addr, f(w), idx1
596
36.5k
    // where the DS_READ_B32 ends up in InstsToMove and therefore prevents
597
36.5k
    // merging of the two writes.
598
36.5k
    if (addToListsIfDependent(*MBBI, RegDefsToMove, PhysRegUsesToMove,
599
36.5k
                              CI.InstsToMove))
600
132
      continue;
601
36.3k
602
36.3k
    bool Match = true;
603
94.6k
    for (unsigned i = 0; i < NumAddresses; i++) {
604
58.4k
      const MachineOperand &AddrRegNext = MBBI->getOperand(AddrIdx[i]);
605
58.4k
606
58.4k
      if (AddrReg[i]->isImm() || AddrRegNext.isImm()) {
607
21.6k
        if (AddrReg[i]->isImm() != AddrRegNext.isImm() ||
608
21.6k
            AddrReg[i]->getImm() != AddrRegNext.getImm()) {
609
0
          Match = false;
610
0
          break;
611
0
        }
612
21.6k
        continue;
613
21.6k
      }
614
36.7k
615
36.7k
      // Check same base pointer. Be careful of subregisters, which can occur
616
36.7k
      // with vectors of pointers.
617
36.7k
      if (AddrReg[i]->getReg() != AddrRegNext.getReg() ||
618
36.7k
          AddrReg[i]->getSubReg() != AddrRegNext.getSubReg()) {
619
106
        Match = false;
620
106
        break;
621
106
      }
622
36.7k
    }
623
36.3k
624
36.3k
    if (Match) {
625
36.2k
      int OffsetIdx =
626
36.2k
          AMDGPU::getNamedOperandIdx(CI.I->getOpcode(), AMDGPU::OpName::offset);
627
36.2k
      CI.Offset0 = CI.I->getOperand(OffsetIdx).getImm();
628
36.2k
      CI.Width0 = getOpcodeWidth(*CI.I);
629
36.2k
      CI.Offset1 = MBBI->getOperand(OffsetIdx).getImm();
630
36.2k
      CI.Width1 = getOpcodeWidth(*MBBI);
631
36.2k
      CI.Paired = MBBI;
632
36.2k
633
36.2k
      if ((CI.InstClass == DS_READ) || (CI.InstClass == DS_WRITE)) {
634
3.24k
        CI.Offset0 &= 0xffff;
635
3.24k
        CI.Offset1 &= 0xffff;
636
33.0k
      } else {
637
33.0k
        CI.GLC0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::glc)->getImm();
638
33.0k
        CI.GLC1 = TII->getNamedOperand(*MBBI, AMDGPU::OpName::glc)->getImm();
639
33.0k
        if (CI.InstClass != S_BUFFER_LOAD_IMM) {
640
21.6k
          CI.SLC0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::slc)->getImm();
641
21.6k
          CI.SLC1 = TII->getNamedOperand(*MBBI, AMDGPU::OpName::slc)->getImm();
642
21.6k
        }
643
33.0k
        CI.DLC0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::dlc)->getImm();
644
33.0k
        CI.DLC1 = TII->getNamedOperand(*MBBI, AMDGPU::OpName::dlc)->getImm();
645
33.0k
      }
646
36.2k
647
36.2k
      // Check both offsets fit in the reduced range.
648
36.2k
      // We also need to go through the list of instructions that we plan to
649
36.2k
      // move and make sure they are all safe to move down past the merged
650
36.2k
      // instruction.
651
36.2k
      if (widthsFit(*STM, CI) && offsetsCanBeCombined(CI))
652
3.64k
        if (canMoveInstsAcrossMemOp(*MBBI, CI.InstsToMove, AA))
653
3.63k
          return true;
654
32.7k
    }
655
32.7k
656
32.7k
    // We've found a load/store that we couldn't merge for some reason.
657
32.7k
    // We could potentially keep looking, but we'd need to make sure that
658
32.7k
    // it was safe to move I and also all the instructions in InstsToMove
659
32.7k
    // down past this instruction.
660
32.7k
    // Check if we can move I across MBBI and if we can move all of I's users.
661
32.7k
    if (!memAccessesCanBeReordered(*CI.I, *MBBI, AA) ||
662
32.7k
        !canMoveInstsAcrossMemOp(*MBBI, CI.InstsToMove, AA))
663
86
      break;
664
32.7k
  }
665
9.64k
  return false;
666
9.64k
}
667
668
1.00k
unsigned SILoadStoreOptimizer::read2Opcode(unsigned EltSize) const {
669
1.00k
  if (STM->ldsRequiresM0Init())
670
755
    return (EltSize == 4) ? AMDGPU::DS_READ2_B32 : AMDGPU::DS_READ2_B64;
671
246
  return (EltSize == 4) ? AMDGPU::DS_READ2_B32_gfx9 : AMDGPU::DS_READ2_B64_gfx9;
672
246
}
673
674
36
unsigned SILoadStoreOptimizer::read2ST64Opcode(unsigned EltSize) const {
675
36
  if (STM->ldsRequiresM0Init())
676
20
    return (EltSize == 4) ? AMDGPU::DS_READ2ST64_B32 : AMDGPU::DS_READ2ST64_B64;
677
16
678
16
  return (EltSize == 4) ? AMDGPU::DS_READ2ST64_B32_gfx9
679
16
                        : AMDGPU::DS_READ2ST64_B64_gfx9;
680
16
}
681
682
MachineBasicBlock::iterator
683
1.03k
SILoadStoreOptimizer::mergeRead2Pair(CombineInfo &CI) {
684
1.03k
  MachineBasicBlock *MBB = CI.I->getParent();
685
1.03k
686
1.03k
  // Be careful, since the addresses could be subregisters themselves in weird
687
1.03k
  // cases, like vectors of pointers.
688
1.03k
  const auto *AddrReg = TII->getNamedOperand(*CI.I, AMDGPU::OpName::addr);
689
1.03k
690
1.03k
  const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdst);
691
1.03k
  const auto *Dest1 = TII->getNamedOperand(*CI.Paired, AMDGPU::OpName::vdst);
692
1.03k
693
1.03k
  unsigned NewOffset0 = CI.Offset0;
694
1.03k
  unsigned NewOffset1 = CI.Offset1;
695
1.03k
  unsigned Opc =
696
1.03k
      CI.UseST64 ? read2ST64Opcode(CI.EltSize) : read2Opcode(CI.EltSize);
697
1.03k
698
1.03k
  unsigned SubRegIdx0 = (CI.EltSize == 4) ? AMDGPU::sub0 : AMDGPU::sub0_sub1;
700
1.03k
  unsigned SubRegIdx1 = (CI.EltSize == 4) ? AMDGPU::sub1 : AMDGPU::sub2_sub3;
700
1.03k
701
1.03k
  if (NewOffset0 > NewOffset1) {
702
476
    // Canonicalize the merged instruction so the smaller offset comes first.
703
476
    std::swap(NewOffset0, NewOffset1);
704
476
    std::swap(SubRegIdx0, SubRegIdx1);
705
476
  }
706
1.03k
707
1.03k
  assert((isUInt<8>(NewOffset0) && isUInt<8>(NewOffset1)) &&
708
1.03k
         (NewOffset0 != NewOffset1) && "Computed offset doesn't fit");
709
1.03k
710
1.03k
  const MCInstrDesc &Read2Desc = TII->get(Opc);
711
1.03k
712
1.03k
  const TargetRegisterClass *SuperRC =
713
1.03k
      (CI.EltSize == 4) ? &AMDGPU::VReg_64RegClass : &AMDGPU::VReg_128RegClass;
714
1.03k
  unsigned DestReg = MRI->createVirtualRegister(SuperRC);
715
1.03k
716
1.03k
  DebugLoc DL = CI.I->getDebugLoc();
717
1.03k
718
1.03k
  unsigned BaseReg = AddrReg->getReg();
719
1.03k
  unsigned BaseSubReg = AddrReg->getSubReg();
720
1.03k
  unsigned BaseRegFlags = 0;
721
1.03k
  if (CI.BaseOff) {
722
31
    unsigned ImmReg = MRI->createVirtualRegister(&AMDGPU::SGPR_32RegClass);
723
31
    BuildMI(*MBB, CI.Paired, DL, TII->get(AMDGPU::S_MOV_B32), ImmReg)
724
31
        .addImm(CI.BaseOff);
725
31
726
31
    BaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
727
31
    BaseRegFlags = RegState::Kill;
728
31
729
31
    TII->getAddNoCarry(*MBB, CI.Paired, DL, BaseReg)
730
31
        .addReg(ImmReg)
731
31
        .addReg(AddrReg->getReg(), 0, BaseSubReg)
732
31
        .addImm(0); // clamp bit
733
31
    BaseSubReg = 0;
734
31
  }
735
1.03k
736
1.03k
  MachineInstrBuilder Read2 =
737
1.03k
      BuildMI(*MBB, CI.Paired, DL, Read2Desc, DestReg)
738
1.03k
          .addReg(BaseReg, BaseRegFlags, BaseSubReg) // addr
739
1.03k
          .addImm(NewOffset0)                        // offset0
740
1.03k
          .addImm(NewOffset1)                        // offset1
741
1.03k
          .addImm(0)                                 // gds
742
1.03k
          .cloneMergedMemRefs({&*CI.I, &*CI.Paired});
743
1.03k
744
1.03k
  (void)Read2;
745
1.03k
746
1.03k
  const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
747
1.03k
748
1.03k
  // Copy to the old destination registers.
749
1.03k
  BuildMI(*MBB, CI.Paired, DL, CopyDesc)
750
1.03k
      .add(*Dest0) // Copy to same destination including flags and sub reg.
751
1.03k
      .addReg(DestReg, 0, SubRegIdx0);
752
1.03k
  MachineInstr *Copy1 = BuildMI(*MBB, CI.Paired, DL, CopyDesc)
753
1.03k
                            .add(*Dest1)
754
1.03k
                            .addReg(DestReg, RegState::Kill, SubRegIdx1);
755
1.03k
756
1.03k
  moveInstsAfter(Copy1, CI.InstsToMove);
757
1.03k
758
1.03k
  MachineBasicBlock::iterator Next = std::next(CI.I);
759
1.03k
  CI.I->eraseFromParent();
760
1.03k
  CI.Paired->eraseFromParent();
761
1.03k
762
1.03k
  LLVM_DEBUG(dbgs() << "Inserted read2: " << *Read2 << '\n');
763
1.03k
  return Next;
764
1.03k
}
765
766
2.15k
unsigned SILoadStoreOptimizer::write2Opcode(unsigned EltSize) const {
767
2.15k
  if (STM->ldsRequiresM0Init())
768
1.45k
    return (EltSize == 4) ? AMDGPU::DS_WRITE2_B32 : AMDGPU::DS_WRITE2_B64;
769
694
  return (EltSize == 4) ? AMDGPU::DS_WRITE2_B32_gfx9
770
694
                        : AMDGPU::DS_WRITE2_B64_gfx9;
771
694
}
772
773
28
unsigned SILoadStoreOptimizer::write2ST64Opcode(unsigned EltSize) const {
774
28
  if (STM->ldsRequiresM0Init())
775
14
    return (EltSize == 4) ? AMDGPU::DS_WRITE2ST64_B32
776
14
                          : AMDGPU::DS_WRITE2ST64_B64;
777
14
778
14
  return (EltSize == 4) ? AMDGPU::DS_WRITE2ST64_B32_gfx9
779
14
                        : AMDGPU::DS_WRITE2ST64_B64_gfx9;
780
14
}
781
782
MachineBasicBlock::iterator
783
2.17k
SILoadStoreOptimizer::mergeWrite2Pair(CombineInfo &CI) {
784
2.17k
  MachineBasicBlock *MBB = CI.I->getParent();
785
2.17k
786
2.17k
  // Be sure to use .addOperand(), and not .addReg() with these. We want to be
787
2.17k
  // sure we preserve the subregister index and any register flags set on them.
788
2.17k
  const MachineOperand *AddrReg =
789
2.17k
      TII->getNamedOperand(*CI.I, AMDGPU::OpName::addr);
790
2.17k
  const MachineOperand *Data0 =
791
2.17k
      TII->getNamedOperand(*CI.I, AMDGPU::OpName::data0);
792
2.17k
  const MachineOperand *Data1 =
793
2.17k
      TII->getNamedOperand(*CI.Paired, AMDGPU::OpName::data0);
794
2.17k
795
2.17k
  unsigned NewOffset0 = CI.Offset0;
796
2.17k
  unsigned NewOffset1 = CI.Offset1;
797
2.17k
  unsigned Opc =
798
2.17k
      CI.UseST64 ? write2ST64Opcode(CI.EltSize) : write2Opcode(CI.EltSize);
799
2.17k
800
2.17k
  if (NewOffset0 > NewOffset1) {
801
1.06k
    // Canonicalize the merged instruction so the smaller offset comes first.
802
1.06k
    std::swap(NewOffset0, NewOffset1);
803
1.06k
    std::swap(Data0, Data1);
804
1.06k
  }
805
2.17k
806
2.17k
  assert((isUInt<8>(NewOffset0) && isUInt<8>(NewOffset1)) &&
807
2.17k
         (NewOffset0 != NewOffset1) && "Computed offset doesn't fit");
808
2.17k
809
2.17k
  const MCInstrDesc &Write2Desc = TII->get(Opc);
810
2.17k
  DebugLoc DL = CI.I->getDebugLoc();
811
2.17k
812
2.17k
  unsigned BaseReg = AddrReg->getReg();
813
2.17k
  unsigned BaseSubReg = AddrReg->getSubReg();
814
2.17k
  unsigned BaseRegFlags = 0;
815
2.17k
  if (CI.BaseOff) {
816
33
    unsigned ImmReg = MRI->createVirtualRegister(&AMDGPU::SGPR_32RegClass);
817
33
    BuildMI(*MBB, CI.Paired, DL, TII->get(AMDGPU::S_MOV_B32), ImmReg)
818
33
        .addImm(CI.BaseOff);
819
33
820
33
    BaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
821
33
    BaseRegFlags = RegState::Kill;
822
33
823
33
    TII->getAddNoCarry(*MBB, CI.Paired, DL, BaseReg)
824
33
        .addReg(ImmReg)
825
33
        .addReg(AddrReg->getReg(), 0, BaseSubReg)
826
33
        .addImm(0); // clamp bit
827
33
    BaseSubReg = 0;
828
33
  }
829
2.17k
830
2.17k
  MachineInstrBuilder Write2 =
831
2.17k
      BuildMI(*MBB, CI.Paired, DL, Write2Desc)
832
2.17k
          .addReg(BaseReg, BaseRegFlags, BaseSubReg) // addr
833
2.17k
          .add(*Data0)                               // data0
834
2.17k
          .add(*Data1)                               // data1
835
2.17k
          .addImm(NewOffset0)                        // offset0
836
2.17k
          .addImm(NewOffset1)                        // offset1
837
2.17k
          .addImm(0)                                 // gds
838
2.17k
          .cloneMergedMemRefs({&*CI.I, &*CI.Paired});
839
2.17k
840
2.17k
  moveInstsAfter(Write2, CI.InstsToMove);
841
2.17k
842
2.17k
  MachineBasicBlock::iterator Next = std::next(CI.I);
843
2.17k
  CI.I->eraseFromParent();
844
2.17k
  CI.Paired->eraseFromParent();
845
2.17k
846
2.17k
  LLVM_DEBUG(dbgs() << "Inserted write2 inst: " << *Write2 << '\n');
847
2.17k
  return Next;
848
2.17k
}
849
850
MachineBasicBlock::iterator
851
199
SILoadStoreOptimizer::mergeSBufferLoadImmPair(CombineInfo &CI) {
852
199
  MachineBasicBlock *MBB = CI.I->getParent();
853
199
  DebugLoc DL = CI.I->getDebugLoc();
854
199
  const unsigned Opcode = getNewOpcode(CI);
855
199
856
199
  const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI);
857
199
858
199
  unsigned DestReg = MRI->createVirtualRegister(SuperRC);
859
199
  unsigned MergedOffset = std::min(CI.Offset0, CI.Offset1);
860
199
861
199
  BuildMI(*MBB, CI.Paired, DL, TII->get(Opcode), DestReg)
862
199
      .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::sbase))
863
199
      .addImm(MergedOffset) // offset
864
199
      .addImm(CI.GLC0)      // glc
865
199
      .addImm(CI.DLC0)      // dlc
866
199
      .cloneMergedMemRefs({&*CI.I, &*CI.Paired});
867
199
868
199
  std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI);
869
199
  const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
870
199
  const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);
871
199
872
199
  // Copy to the old destination registers.
873
199
  const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
874
199
  const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::sdst);
875
199
  const auto *Dest1 = TII->getNamedOperand(*CI.Paired, AMDGPU::OpName::sdst);
876
199
877
199
  BuildMI(*MBB, CI.Paired, DL, CopyDesc)
878
199
      .add(*Dest0) // Copy to same destination including flags and sub reg.
879
199
      .addReg(DestReg, 0, SubRegIdx0);
880
199
  MachineInstr *Copy1 = BuildMI(*MBB, CI.Paired, DL, CopyDesc)
881
199
                            .add(*Dest1)
882
199
                            .addReg(DestReg, RegState::Kill, SubRegIdx1);
883
199
884
199
  moveInstsAfter(Copy1, CI.InstsToMove);
885
199
886
199
  MachineBasicBlock::iterator Next = std::next(CI.I);
887
199
  CI.I->eraseFromParent();
888
199
  CI.Paired->eraseFromParent();
889
199
  return Next;
890
199
}
891
892
MachineBasicBlock::iterator
893
128
SILoadStoreOptimizer::mergeBufferLoadPair(CombineInfo &CI) {
894
128
  MachineBasicBlock *MBB = CI.I->getParent();
895
128
  DebugLoc DL = CI.I->getDebugLoc();
896
128
897
128
  const unsigned Opcode = getNewOpcode(CI);
898
128
899
128
  const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI);
900
128
901
128
  // Copy to the new source register.
902
128
  unsigned DestReg = MRI->createVirtualRegister(SuperRC);
903
128
  unsigned MergedOffset = std::min(CI.Offset0, CI.Offset1);
904
128
905
128
  auto MIB = BuildMI(*MBB, CI.Paired, DL, TII->get(Opcode), DestReg);
906
128
907
128
  const unsigned Regs = getRegs(Opcode);
908
128
909
128
  if (Regs & VADDR)
910
76
    MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));
911
128
912
128
  MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
913
128
      .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
914
128
      .addImm(MergedOffset) // offset
915
128
      .addImm(CI.GLC0)      // glc
916
128
      .addImm(CI.SLC0)      // slc
917
128
      .addImm(0)            // tfe
918
128
      .addImm(CI.DLC0)      // dlc
919
128
      .cloneMergedMemRefs({&*CI.I, &*CI.Paired});
920
128
921
128
  std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI);
922
128
  const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
923
128
  const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);
924
128
925
128
  // Copy to the old destination registers.
926
128
  const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
927
128
  const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata);
928
128
  const auto *Dest1 = TII->getNamedOperand(*CI.Paired, AMDGPU::OpName::vdata);
929
128
930
128
  BuildMI(*MBB, CI.Paired, DL, CopyDesc)
931
128
      .add(*Dest0) // Copy to same destination including flags and sub reg.
932
128
      .addReg(DestReg, 0, SubRegIdx0);
933
128
  MachineInstr *Copy1 = BuildMI(*MBB, CI.Paired, DL, CopyDesc)
934
128
                            .add(*Dest1)
935
128
                            .addReg(DestReg, RegState::Kill, SubRegIdx1);
936
128
937
128
  moveInstsAfter(Copy1, CI.InstsToMove);
938
128
939
128
  MachineBasicBlock::iterator Next = std::next(CI.I);
940
128
  CI.I->eraseFromParent();
941
128
  CI.Paired->eraseFromParent();
942
128
  return Next;
943
128
}
944
945
417
unsigned SILoadStoreOptimizer::getNewOpcode(const CombineInfo &CI) {
946
417
  const unsigned Width = CI.Width0 + CI.Width1;
947
417
948
417
  switch (CI.InstClass) {
949
417
  default:
950
218
    return AMDGPU::getMUBUFOpcode(CI.InstClass, Width);
951
417
  case UNKNOWN:
952
0
    llvm_unreachable("Unknown instruction class");
953
417
  case S_BUFFER_LOAD_IMM:
954
199
    switch (Width) {
955
199
    default:
956
0
      return 0;
957
199
    case 2:
958
154
      return AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM;
959
199
    case 4:
960
45
      return AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM;
961
199
    }
962
417
  }
963
417
}
964
965
std::pair<unsigned, unsigned>
966
417
SILoadStoreOptimizer::getSubRegIdxs(const CombineInfo &CI) {
967
417
  if (CI.Offset0 > CI.Offset1) {
968
2
    switch (CI.Width0) {
969
2
    default:
970
0
      return std::make_pair(0, 0);
971
2
    case 1:
972
2
      switch (CI.Width1) {
973
2
      default:
974
0
        return std::make_pair(0, 0);
975
2
      case 1:
976
0
        return std::make_pair(AMDGPU::sub1, AMDGPU::sub0);
977
2
      case 2:
978
2
        return std::make_pair(AMDGPU::sub2, AMDGPU::sub0_sub1);
979
2
      case 3:
980
0
        return std::make_pair(AMDGPU::sub3, AMDGPU::sub0_sub1_sub2);
981
0
      }
982
0
    case 2:
983
0
      switch (CI.Width1) {
984
0
      default:
985
0
        return std::make_pair(0, 0);
986
0
      case 1:
987
0
        return std::make_pair(AMDGPU::sub1_sub2, AMDGPU::sub0);
988
0
      case 2:
989
0
        return std::make_pair(AMDGPU::sub2_sub3, AMDGPU::sub0_sub1);
990
0
      }
991
0
    case 3:
992
0
      switch (CI.Width1) {
993
0
      default:
994
0
        return std::make_pair(0, 0);
995
0
      case 1:
996
0
        return std::make_pair(AMDGPU::sub1_sub2_sub3, AMDGPU::sub0);
997
415
      }
998
415
    }
999
415
  } else {
1000
415
    switch (CI.Width0) {
1001
415
    default:
1002
0
      return std::make_pair(0, 0);
1003
415
    case 1:
1004
277
      switch (CI.Width1) {
1005
277
      default:
1006
0
        return std::make_pair(0, 0);
1007
277
      case 1:
1008
275
        return std::make_pair(AMDGPU::sub0, AMDGPU::sub1);
1009
277
      case 2:
1010
2
        return std::make_pair(AMDGPU::sub0, AMDGPU::sub1_sub2);
1011
277
      case 3:
1012
0
        return std::make_pair(AMDGPU::sub0, AMDGPU::sub1_sub2_sub3);
1013
0
      }
1014
119
    case 2:
1015
119
      switch (CI.Width1) {
1016
119
      default:
1017
0
        return std::make_pair(0, 0);
1018
119
      case 1:
1019
25
        return std::make_pair(AMDGPU::sub0_sub1, AMDGPU::sub2);
1020
119
      case 2:
1021
94
        return std::make_pair(AMDGPU::sub0_sub1, AMDGPU::sub2_sub3);
1022
0
      }
1023
19
    case 3:
1024
19
      switch (CI.Width1) {
1025
19
      default:
1026
0
        return std::make_pair(0, 0);
1027
19
      case 1:
1028
19
        return std::make_pair(AMDGPU::sub0_sub1_sub2, AMDGPU::sub3);
1029
19
      }
1030
415
    }
1031
415
  }
1032
417
}
1033
1034
const TargetRegisterClass *
1035
417
SILoadStoreOptimizer::getTargetRegisterClass(const CombineInfo &CI) {
1036
417
  if (CI.InstClass == S_BUFFER_LOAD_IMM) {
1037
199
    switch (CI.Width0 + CI.Width1) {
1038
199
    default:
1039
0
      return nullptr;
1040
199
    case 2:
1041
154
      return &AMDGPU::SReg_64_XEXECRegClass;
1042
199
    case 4:
1043
45
      return &AMDGPU::SReg_128RegClass;
1044
199
    case 8:
1045
0
      return &AMDGPU::SReg_256RegClass;
1046
199
    case 16:
1047
0
      return &AMDGPU::SReg_512RegClass;
1048
218
    }
1049
218
  } else {
1050
218
    switch (CI.Width0 + CI.Width1) {
1051
218
    default:
1052
0
      return nullptr;
1053
218
    case 2:
1054
121
      return &AMDGPU::VReg_64RegClass;
1055
218
    case 3:
1056
29
      return &AMDGPU::VReg_96RegClass;
1057
218
    case 4:
1058
68
      return &AMDGPU::VReg_128RegClass;
1059
218
    }
1060
218
  }
1061
417
}
1062
1063
MachineBasicBlock::iterator
1064
90
SILoadStoreOptimizer::mergeBufferStorePair(CombineInfo &CI) {
1065
90
  MachineBasicBlock *MBB = CI.I->getParent();
1066
90
  DebugLoc DL = CI.I->getDebugLoc();
1067
90
1068
90
  const unsigned Opcode = getNewOpcode(CI);
1069
90
1070
90
  std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI);
1071
90
  const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
1072
90
  const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);
1073
90
1074
90
  // Copy to the new source register.
1075
90
  const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI);
1076
90
  unsigned SrcReg = MRI->createVirtualRegister(SuperRC);
1077
90
1078
90
  const auto *Src0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata);
1079
90
  const auto *Src1 = TII->getNamedOperand(*CI.Paired, AMDGPU::OpName::vdata);
1080
90
1081
90
  BuildMI(*MBB, CI.Paired, DL, TII->get(AMDGPU::REG_SEQUENCE), SrcReg)
1082
90
      .add(*Src0)
1083
90
      .addImm(SubRegIdx0)
1084
90
      .add(*Src1)
1085
90
      .addImm(SubRegIdx1);
1086
90
1087
90
  auto MIB = BuildMI(*MBB, CI.Paired, DL, TII->get(Opcode))
1088
90
                 .addReg(SrcReg, RegState::Kill);
1089
90
1090
90
  const unsigned Regs = getRegs(Opcode);
1091
90
1092
90
  if (Regs & VADDR)
1093
47
    MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));
1094
90
1095
90
  MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
1096
90
      .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
1097
90
      .addImm(std::min(CI.Offset0, CI.Offset1)) // offset
1098
90
      .addImm(CI.GLC0)      // glc
1099
90
      .addImm(CI.SLC0)      // slc
1100
90
      .addImm(0)            // tfe
1101
90
      .addImm(CI.DLC0)      // dlc
1102
90
      .cloneMergedMemRefs({&*CI.I, &*CI.Paired});
1103
90
1104
90
  moveInstsAfter(MIB, CI.InstsToMove);
1105
90
1106
90
  MachineBasicBlock::iterator Next = std::next(CI.I);
1107
90
  CI.I->eraseFromParent();
1108
90
  CI.Paired->eraseFromParent();
1109
90
  return Next;
1110
90
}
1111
1112
MachineOperand
1113
30
SILoadStoreOptimizer::createRegOrImm(int32_t Val, MachineInstr &MI) {
1114
30
  APInt V(32, Val, true);
1115
30
  if (TII->isInlineConstant(V))
1116
17
    return MachineOperand::CreateImm(Val);
1117
13
1118
13
  unsigned Reg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
1119
13
  MachineInstr *Mov =
1120
13
  BuildMI(*MI.getParent(), MI.getIterator(), MI.getDebugLoc(),
1121
13
          TII->get(AMDGPU::S_MOV_B32), Reg)
1122
13
    .addImm(Val);
1123
13
  (void)Mov;
1124
13
  LLVM_DEBUG(dbgs() << "    "; Mov->dump());
1125
13
  return MachineOperand::CreateReg(Reg, false);
1126
13
}
1127
1128
// Compute base address using Addr and return the final register.
1129
unsigned SILoadStoreOptimizer::computeBase(MachineInstr &MI,
1130
15
                                           const MemAddress &Addr) {
1131
15
  MachineBasicBlock *MBB = MI.getParent();
1132
15
  MachineBasicBlock::iterator MBBI = MI.getIterator();
1133
15
  DebugLoc DL = MI.getDebugLoc();
1134
15
1135
15
  assert((TRI->getRegSizeInBits(Addr.Base.LoReg, *MRI) == 32 ||
1136
15
          Addr.Base.LoSubReg) &&
1137
15
         "Expected 32-bit Base-Register-Low!!");
1138
15
1139
15
  assert((TRI->getRegSizeInBits(Addr.Base.HiReg, *MRI) == 32 ||
1140
15
          Addr.Base.HiSubReg) &&
1141
15
         "Expected 32-bit Base-Register-Hi!!");
1142
15
1143
15
  LLVM_DEBUG(dbgs() << "  Re-Computed Anchor-Base:\n");
1144
15
  MachineOperand OffsetLo = createRegOrImm(static_cast<int32_t>(Addr.Offset), MI);
1145
15
  MachineOperand OffsetHi =
1146
15
    createRegOrImm(static_cast<int32_t>(Addr.Offset >> 32), MI);
1147
15
1148
15
  const auto *CarryRC = TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID);
1149
15
  unsigned CarryReg = MRI->createVirtualRegister(CarryRC);
1150
15
  unsigned DeadCarryReg = MRI->createVirtualRegister(CarryRC);
1151
15
1152
15
  unsigned DestSub0 = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
1153
15
  unsigned DestSub1 = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
1154
15
  MachineInstr *LoHalf =
1155
15
    BuildMI(*MBB, MBBI, DL, TII->get(AMDGPU::V_ADD_I32_e64), DestSub0)
1156
15
      .addReg(CarryReg, RegState::Define)
1157
15
      .addReg(Addr.Base.LoReg, 0, Addr.Base.LoSubReg)
1158
15
      .add(OffsetLo)
1159
15
      .addImm(0); // clamp bit
1160
15
  (void)LoHalf;
1161
15
  LLVM_DEBUG(dbgs() << "    "; LoHalf->dump(););
1162
15
1163
15
  MachineInstr *HiHalf =
1164
15
  BuildMI(*MBB, MBBI, DL, TII->get(AMDGPU::V_ADDC_U32_e64), DestSub1)
1165
15
    .addReg(DeadCarryReg, RegState::Define | RegState::Dead)
1166
15
    .addReg(Addr.Base.HiReg, 0, Addr.Base.HiSubReg)
1167
15
    .add(OffsetHi)
1168
15
    .addReg(CarryReg, RegState::Kill)
1169
15
    .addImm(0); // clamp bit
1170
15
  (void)HiHalf;
1171
15
  LLVM_DEBUG(dbgs() << "    "; HiHalf->dump(););
1172
15
1173
15
  unsigned FullDestReg = MRI->createVirtualRegister(&AMDGPU::VReg_64RegClass);
1174
15
  MachineInstr *FullBase =
1175
15
    BuildMI(*MBB, MBBI, DL, TII->get(TargetOpcode::REG_SEQUENCE), FullDestReg)
1176
15
      .addReg(DestSub0)
1177
15
      .addImm(AMDGPU::sub0)
1178
15
      .addReg(DestSub1)
1179
15
      .addImm(AMDGPU::sub1);
1180
15
  (void)FullBase;
1181
15
  LLVM_DEBUG(dbgs() << "    "; FullBase->dump(); dbgs() << "\n";);
1182
15
1183
15
  return FullDestReg;
1184
15
}
1185
1186
// Update base and offset with the NewBase and NewOffset in MI.
1187
void SILoadStoreOptimizer::updateBaseAndOffset(MachineInstr &MI,
1188
                                               unsigned NewBase,
1189
44
                                               int32_t NewOffset) {
1190
44
  TII->getNamedOperand(MI, AMDGPU::OpName::vaddr)->setReg(NewBase);
1191
44
  TII->getNamedOperand(MI, AMDGPU::OpName::offset)->setImm(NewOffset);
1192
44
}
1193
1194
Optional<int32_t>
1195
2.09k
SILoadStoreOptimizer::extractConstOffset(const MachineOperand &Op) {
1196
2.09k
  if (Op.isImm())
1197
2
    return Op.getImm();
1198
2.09k
1199
2.09k
  if (!Op.isReg())
1200
0
    return None;
1201
2.09k
1202
2.09k
  MachineInstr *Def = MRI->getUniqueVRegDef(Op.getReg());
1203
2.09k
  if (!Def || Def->getOpcode() != AMDGPU::S_MOV_B32 ||
1204
2.09k
      
!Def->getOperand(1).isImm()47
)
1205
2.04k
    return None;
1206
47
1207
47
  return Def->getOperand(1).getImm();
1208
47
}
1209
1210
// Analyzes Base and extracts:
1211
//  - 32bit base registers, subregisters
1212
//  - 64bit constant offset
1213
// Expecting base computation as:
1214
//   %OFFSET0:sgpr_32 = S_MOV_B32 8000
1215
//   %LO:vgpr_32, %c:sreg_64_xexec =
1216
//       V_ADD_I32_e64 %BASE_LO:vgpr_32, %103:sgpr_32,
1217
//   %HI:vgpr_32, = V_ADDC_U32_e64 %BASE_HI:vgpr_32, 0, killed %c:sreg_64_xexec
1218
//   %Base:vreg_64 =
1219
//       REG_SEQUENCE %LO:vgpr_32, %subreg.sub0, %HI:vgpr_32, %subreg.sub1
1220
void SILoadStoreOptimizer::processBaseWithConstOffset(const MachineOperand &Base,
1221
1.71k
                                                      MemAddress &Addr) {
1222
1.71k
  if (!Base.isReg())
1223
0
    return;
1224
1.71k
1225
1.71k
  MachineInstr *Def = MRI->getUniqueVRegDef(Base.getReg());
1226
1.71k
  if (!Def || Def->getOpcode() != AMDGPU::REG_SEQUENCE
1227
1.71k
      || 
Def->getNumOperands() != 51.41k
)
1228
292
    return;
1229
1.41k
1230
1.41k
  MachineOperand BaseLo = Def->getOperand(1);
1231
1.41k
  MachineOperand BaseHi = Def->getOperand(3);
1232
1.41k
  if (!BaseLo.isReg() || !BaseHi.isReg())
1233
0
    return;
1234
1.41k
1235
1.41k
  MachineInstr *BaseLoDef = MRI->getUniqueVRegDef(BaseLo.getReg());
1236
1.41k
  MachineInstr *BaseHiDef = MRI->getUniqueVRegDef(BaseHi.getReg());
1237
1.41k
1238
1.41k
  if (!BaseLoDef || BaseLoDef->getOpcode() != AMDGPU::V_ADD_I32_e64 ||
1239
1.41k
      !BaseHiDef || BaseHiDef->getOpcode() != AMDGPU::V_ADDC_U32_e64)
1240
370
    return;
1241
1.04k
1242
1.04k
  const auto *Src0 = TII->getNamedOperand(*BaseLoDef, AMDGPU::OpName::src0);
1243
1.04k
  const auto *Src1 = TII->getNamedOperand(*BaseLoDef, AMDGPU::OpName::src1);
1244
1.04k
1245
1.04k
  auto Offset0P = extractConstOffset(*Src0);
1246
1.04k
  if (Offset0P)
1247
1
    BaseLo = *Src1;
1248
1.04k
  else {
1249
1.04k
    if (!(Offset0P = extractConstOffset(*Src1)))
1250
1.00k
      return;
1251
48
    BaseLo = *Src0;
1252
48
  }
1253
1.04k
1254
1.04k
  Src0 = TII->getNamedOperand(*BaseHiDef, AMDGPU::OpName::src0);
1255
49
  Src1 = TII->getNamedOperand(*BaseHiDef, AMDGPU::OpName::src1);
1256
49
1257
49
  if (Src0->isImm())
1258
0
    std::swap(Src0, Src1);
1259
49
1260
49
  if (!Src1->isImm())
1261
0
    return;
1262
49
1263
49
  uint64_t Offset1 = Src1->getImm();
1264
49
  BaseHi = *Src0;
1265
49
1266
49
  Addr.Base.LoReg = BaseLo.getReg();
1267
49
  Addr.Base.HiReg = BaseHi.getReg();
1268
49
  Addr.Base.LoSubReg = BaseLo.getSubReg();
1269
49
  Addr.Base.HiSubReg = BaseHi.getSubReg();
1270
49
  Addr.Offset = (*Offset0P & 0x00000000ffffffff) | (Offset1 << 32);
1271
49
}
1272
1273
bool SILoadStoreOptimizer::promoteConstantOffsetToImm(
1274
    MachineInstr &MI,
1275
    MemInfoMap &Visited,
1276
543k
    SmallPtrSet<MachineInstr *, 4> &AnchorList) {
1277
543k
1278
543k
  // TODO: Support flat and scratch.
1279
543k
  if (AMDGPU::getGlobalSaddrOp(MI.getOpcode()) < 0 ||
1280
543k
      TII->getNamedOperand(MI, AMDGPU::OpName::vdata) != NULL)
1281
543k
    return false;
1282
2.42k
1283
2.42k
  // TODO: Support Store.
1284
2.42k
  if (!MI.mayLoad())
1285
0
    return false;
1286
2.42k
1287
2.42k
  if (AnchorList.count(&MI))
1288
15
    return false;
1289
2.41k
1290
2.41k
  LLVM_DEBUG(dbgs() << "\nTryToPromoteConstantOffsetToImmFor "; MI.dump());
1291
2.41k
1292
2.41k
  if (TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm()) {
1293
728
    LLVM_DEBUG(dbgs() << "  Const-offset is already promoted.\n";);
1294
728
    return false;
1295
728
  }
1296
1.68k
1297
1.68k
  // Step1: Find the base-registers and a 64bit constant offset.
1298
1.68k
  MachineOperand &Base = *TII->getNamedOperand(MI, AMDGPU::OpName::vaddr);
1299
1.68k
  MemAddress MAddr;
1300
1.68k
  if (Visited.find(&MI) == Visited.end()) {
1301
1.67k
    processBaseWithConstOffset(Base, MAddr);
1302
1.67k
    Visited[&MI] = MAddr;
1303
1.67k
  } else
1304
10
    MAddr = Visited[&MI];
1305
1.68k
1306
1.68k
  if (MAddr.Offset == 0) {
1307
1.66k
    LLVM_DEBUG(dbgs() << "  Failed to extract constant-offset or there are no"
1308
1.66k
                         " constant offsets that can be promoted.\n";);
1309
1.66k
    return false;
1310
1.66k
  }
1311
20
1312
20
  LLVM_DEBUG(dbgs() << "  BASE: {" << MAddr.Base.HiReg << ", "
1313
20
             << MAddr.Base.LoReg << "} Offset: " << MAddr.Offset << "\n\n";);
1314
20
1315
20
  // Step2: Traverse through MI's basic block and find an anchor (that has the
1317
20
  // same base-registers) with the highest 13-bit distance from MI's offset.
1317
20
  // E.g. (64bit loads)
1318
20
  // bb:
1319
20
  //   addr1 = &a + 4096;   load1 = load(addr1,  0)
1320
20
  //   addr2 = &a + 6144;   load2 = load(addr2,  0)
1321
20
  //   addr3 = &a + 8192;   load3 = load(addr3,  0)
1322
20
  //   addr4 = &a + 10240;  load4 = load(addr4,  0)
1323
20
  //   addr5 = &a + 12288;  load5 = load(addr5,  0)
1324
20
  //
1325
20
  // Starting from the first load, the optimization will try to find a new base
1326
20
  // from which (&a + 4096) is within 13-bit distance. Both &a + 6144 and &a + 8192
1327
20
  // are within 13-bit distance from &a + 4096. The heuristic picks &a + 8192
1328
20
  // as the new base (anchor) because the maximum distance can presumably
1329
20
  //   accommodate more intermediate bases.
1330
20
  //
1331
20
  // Step3: move (&a + 8192) above load1. Compute and promote offsets from
1332
20
  // (&a + 8192) for load1, load2, load4.
1333
20
  //   addr = &a + 8192
1334
20
  //   load1 = load(addr,       -4096)
1335
20
  //   load2 = load(addr,       -2048)
1336
20
  //   load3 = load(addr,       0)
1337
20
  //   load4 = load(addr,       2048)
1338
20
  //   addr5 = &a + 12288;  load5 = load(addr5,  0)
1339
20
  //
1340
20
  MachineInstr *AnchorInst = nullptr;
1341
20
  MemAddress AnchorAddr;
1342
20
  uint32_t MaxDist = std::numeric_limits<uint32_t>::min();
1343
20
  SmallVector<std::pair<MachineInstr *, int64_t>, 4> InstsWCommonBase;
1344
20
1345
20
  MachineBasicBlock *MBB = MI.getParent();
1346
20
  MachineBasicBlock::iterator E = MBB->end();
1347
20
  MachineBasicBlock::iterator MBBI = MI.getIterator();
1348
20
  ++MBBI;
1349
20
  const SITargetLowering *TLI =
1350
20
    static_cast<const SITargetLowering *>(STM->getTargetLowering());
1351
20
1352
473
  for ( ; MBBI != E; 
++MBBI453
) {
1353
453
    MachineInstr &MINext = *MBBI;
1354
453
    // TODO: Support finding an anchor(with same base) from store addresses or
1355
453
    // any other load addresses where the opcodes are different.
1356
453
    if (MINext.getOpcode() != MI.getOpcode() ||
1357
453
        TII->getNamedOperand(MINext, AMDGPU::OpName::offset)->getImm())
1358
405
      continue;
1359
48
1360
48
    const MachineOperand &BaseNext =
1361
48
      *TII->getNamedOperand(MINext, AMDGPU::OpName::vaddr);
1362
48
    MemAddress MAddrNext;
1363
48
    if (Visited.find(&MINext) == Visited.end()) {
1364
39
      processBaseWithConstOffset(BaseNext, MAddrNext);
1365
39
      Visited[&MINext] = MAddrNext;
1366
39
    } else
1367
9
      MAddrNext = Visited[&MINext];
1368
48
1369
48
    if (MAddrNext.Base.LoReg != MAddr.Base.LoReg ||
1370
48
        MAddrNext.Base.HiReg != MAddr.Base.HiReg ||
1371
48
        MAddrNext.Base.LoSubReg != MAddr.Base.LoSubReg ||
1372
48
        MAddrNext.Base.HiSubReg != MAddr.Base.HiSubReg)
1373
7
      continue;
1374
41
1375
41
    InstsWCommonBase.push_back(std::make_pair(&MINext, MAddrNext.Offset));
1376
41
1377
41
    int64_t Dist = MAddr.Offset - MAddrNext.Offset;
1378
41
    TargetLoweringBase::AddrMode AM;
1379
41
    AM.HasBaseReg = true;
1380
41
    AM.BaseOffs = Dist;
1381
41
    if (TLI->isLegalGlobalAddressingMode(AM) &&
1382
41
        (uint32_t)std::abs(Dist) > MaxDist) {
1383
26
      MaxDist = std::abs(Dist);
1384
26
1385
26
      AnchorAddr = MAddrNext;
1386
26
      AnchorInst = &MINext;
1387
26
    }
1388
41
  }
1389
20
1390
20
  if (AnchorInst) {
1391
15
    LLVM_DEBUG(dbgs() << "  Anchor-Inst(with max-distance from Offset): ";
1392
15
               AnchorInst->dump());
1393
15
    LLVM_DEBUG(dbgs() << "  Anchor-Offset from BASE: "
1394
15
               <<  AnchorAddr.Offset << "\n\n");
1395
15
1396
15
    // Instead of moving up, just re-compute anchor-instruction's base address.
1397
15
    unsigned Base = computeBase(MI, AnchorAddr);
1398
15
1399
15
    updateBaseAndOffset(MI, Base, MAddr.Offset - AnchorAddr.Offset);
1400
15
    LLVM_DEBUG(dbgs() << "  After promotion: "; MI.dump(););
1401
15
1402
37
    for (auto P : InstsWCommonBase) {
1403
37
      TargetLoweringBase::AddrMode AM;
1404
37
      AM.HasBaseReg = true;
1405
37
      AM.BaseOffs = P.second - AnchorAddr.Offset;
1406
37
1407
37
      if (TLI->isLegalGlobalAddressingMode(AM)) {
1408
29
        LLVM_DEBUG(dbgs() << "  Promote Offset(" << P.second;
1409
29
                   dbgs() << ")"; P.first->dump());
1410
29
        updateBaseAndOffset(*P.first, Base, P.second - AnchorAddr.Offset);
1411
29
        LLVM_DEBUG(dbgs() << "     After promotion: "; P.first->dump());
1412
29
      }
1413
37
    }
1414
15
    AnchorList.insert(AnchorInst);
1415
15
    return true;
1416
15
  }
1417
5
1418
5
  return false;
1419
5
}
1420
1421
// Scan through looking for adjacent LDS operations with constant offsets from
1422
// the same base register. We rely on the scheduler to do the hard work of
1423
// clustering nearby loads, and assume these are all adjacent.
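The DS_READ and DS_WRITE cases in the function below set CI.EltSize to 8 for the B64 opcodes and 4 otherwise because the paired ds_read2/ds_write2 forms encode their two offsets in units of the element size. A minimal sketch of that arithmetic follows, with assumed example byte offsets (40 and 48 for 64-bit elements); the actual conversion is presumably handled by the merge helpers (mergeRead2Pair/mergeWrite2Pair), which are not shown in this section.

// Standalone sketch, not part of the pass: convert element-aligned byte
// offsets into the element-sized offset0/offset1 fields of a merged
// ds_read2/ds_write2 instruction.
#include <cassert>
#include <cstdio>

int main() {
  const unsigned EltSize = 8;                  // B64 opcodes; 4 for B32
  const unsigned ByteOff0 = 40, ByteOff1 = 48; // assumed, element-aligned
  assert(ByteOff0 % EltSize == 0 && ByteOff1 % EltSize == 0);
  // Prints "offset0:5 offset1:6" for the two 64-bit elements.
  std::printf("offset0:%u offset1:%u\n", ByteOff0 / EltSize, ByteOff1 / EltSize);
  return 0;
}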
1424
28.7k
bool SILoadStoreOptimizer::optimizeBlock(MachineBasicBlock &MBB) {
1425
28.7k
  bool Modified = false;
1426
28.7k
1427
28.7k
  // Maps each visited instruction to its decomposed base registers and constant offset.
1428
28.7k
  MemInfoMap Visited;
1429
28.7k
  // Contains the list of instructions for which constant offsets are being
1430
28.7k
  // promoted to the IMM.
1431
28.7k
  SmallPtrSet<MachineInstr *, 4> AnchorList;
1432
28.7k
1433
571k
  for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); I != E;) {
1434
543k
    MachineInstr &MI = *I;
1435
543k
1436
543k
    if (promoteConstantOffsetToImm(MI, Visited, AnchorList))
1437
15
      Modified = true;
1438
543k
1439
543k
    // Don't combine if volatile.
1440
543k
    if (MI.hasOrderedMemoryRef()) {
1441
20.9k
      ++I;
1442
20.9k
      continue;
1443
20.9k
    }
1444
522k
1445
522k
    const unsigned Opc = MI.getOpcode();
1446
522k
1447
522k
    CombineInfo CI;
1448
522k
    CI.I = I;
1449
522k
    CI.InstClass = getInstClass(Opc);
1450
522k
1451
522k
    switch (CI.InstClass) {
1452
522k
    default:
1453
496k
      break;
1454
522k
    case DS_READ:
1455
1.97k
      CI.EltSize =
1456
1.97k
          (Opc == AMDGPU::DS_READ_B64 || Opc == AMDGPU::DS_READ_B64_gfx9) ? 8
1457
1.97k
                                                                          : 4;
1458
1.97k
      if (findMatchingInst(CI)) {
1459
1.03k
        Modified = true;
1460
1.03k
        I = mergeRead2Pair(CI);
1461
1.03k
      } else {
1462
933
        ++I;
1463
933
      }
1464
1.97k
      continue;
1465
522k
    case DS_WRITE:
1466
2.82k
      CI.EltSize =
1467
2.82k
          (Opc == AMDGPU::DS_WRITE_B64 || Opc == AMDGPU::DS_WRITE_B64_gfx9) ? 8
1468
2.82k
                                                                            : 4;
1469
2.82k
      if (findMatchingInst(CI)) {
1470
2.17k
        Modified = true;
1471
2.17k
        I = mergeWrite2Pair(CI);
1472
2.17k
      } else {
1473
647
        ++I;
1474
647
      }
1475
2.82k
      continue;
1476
522k
    case S_BUFFER_LOAD_IMM:
1477
748
      CI.EltSize = AMDGPU::getSMRDEncodedOffset(*STM, 4);
1478
748
      if (findMatchingInst(CI)) {
1479
199
        Modified = true;
1480
199
        I = mergeSBufferLoadImmPair(CI);
1481
199
        OptimizeAgain |= (CI.Width0 + CI.Width1) < 16;
1482
549
      } else {
1483
549
        ++I;
1484
549
      }
1485
748
      continue;
1486
522k
    case BUFFER_LOAD_OFFEN:
1487
4.91k
    case BUFFER_LOAD_OFFSET:
1488
4.91k
    case BUFFER_LOAD_OFFEN_exact:
1489
4.91k
    case BUFFER_LOAD_OFFSET_exact:
1490
4.91k
      CI.EltSize = 4;
1491
4.91k
      if (findMatchingInst(CI)) {
1492
128
        Modified = true;
1493
128
        I = mergeBufferLoadPair(CI);
1494
128
        OptimizeAgain |= (CI.Width0 + CI.Width1) < 4;
1495
4.78k
      } else {
1496
4.78k
        ++I;
1497
4.78k
      }
1498
4.91k
      continue;
1499
14.8k
    case BUFFER_STORE_OFFEN:
1500
14.8k
    case BUFFER_STORE_OFFSET:
1501
14.8k
    case BUFFER_STORE_OFFEN_exact:
1502
14.8k
    case BUFFER_STORE_OFFSET_exact:
1503
14.8k
      CI.EltSize = 4;
1504
14.8k
      if (findMatchingInst(CI)) {
1505
90
        Modified = true;
1506
90
        I = mergeBufferStorePair(CI);
1507
90
        OptimizeAgain |= (CI.Width0 + CI.Width1) < 4;
1508
14.8k
      } else {
1509
14.8k
        ++I;
1510
14.8k
      }
1511
14.8k
      continue;
1512
496k
    }
1513
496k
1514
496k
    ++I;
1515
496k
  }
1516
28.7k
1517
28.7k
  return Modified;
1518
28.7k
}
1519
1520
25.2k
bool SILoadStoreOptimizer::runOnMachineFunction(MachineFunction &MF) {
1521
25.2k
  if (skipFunction(MF.getFunction()))
1522
8
    return false;
1523
25.2k
1524
25.2k
  STM = &MF.getSubtarget<GCNSubtarget>();
1525
25.2k
  if (!STM->loadStoreOptEnabled())
1526
1
    return false;
1527
25.2k
1528
25.2k
  TII = STM->getInstrInfo();
1529
25.2k
  TRI = &TII->getRegisterInfo();
1530
25.2k
1531
25.2k
  MRI = &MF.getRegInfo();
1532
25.2k
  AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
1533
25.2k
1534
25.2k
  assert(MRI->isSSA() && "Must be run on SSA");
1535
25.2k
1536
25.2k
  LLVM_DEBUG(dbgs() << "Running SILoadStoreOptimizer\n");
1537
25.2k
1538
25.2k
  bool Modified = false;
1539
25.2k
1540
28.6k
  for (MachineBasicBlock &MBB : MF) {
1541
28.7k
    do {
1542
28.7k
      OptimizeAgain = false;
1543
28.7k
      Modified |= optimizeBlock(MBB);
1544
28.7k
    } while (OptimizeAgain);
1545
28.6k
  }
1546
25.2k
1547
25.2k
  return Modified;
1548
25.2k
}
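runOnMachineFunction re-runs optimizeBlock on a block as long as OptimizeAgain is set, and optimizeBlock raises that flag whenever a merge produced a result narrower than the widest encoding it tracks (Width0 + Width1 < 16 for S_BUFFER_LOAD_IMM, < 4 for the buffer cases). A minimal standalone sketch of that fixed-point behaviour, with assumed widths and a simplified pairwise merge, is shown below; the real pass tracks widths per CombineInfo rather than in a vector.

// Standalone sketch, not part of the pass: repeatedly merge adjacent "loads"
// pairwise, re-running while a merge left room to grow, the way the
// OptimizeAgain do-while loop above keeps reinvoking optimizeBlock.
#include <cstdio>
#include <vector>

int main() {
  std::vector<int> Widths(8, 1);   // eight adjacent dword loads (assumed)
  const int MaxWidth = 16;         // widest s_buffer_load form (dwordx16)
  bool OptimizeAgain = false;
  do {
    OptimizeAgain = false;
    std::vector<int> Merged;
    for (size_t I = 0; I < Widths.size(); I += 2) {
      if (I + 1 < Widths.size() && Widths[I] + Widths[I + 1] <= MaxWidth) {
        Merged.push_back(Widths[I] + Widths[I + 1]);
        OptimizeAgain |= Merged.back() < MaxWidth;  // mirrors the Width0 + Width1 check
      } else {
        Merged.push_back(Widths[I]);
      }
    }
    Widths = Merged;
  } while (OptimizeAgain && Widths.size() > 1);
  // Prints "1 load(s) of width 8": pairs become x2, then x4, then x8.
  std::printf("%zu load(s) of width %d\n", Widths.size(), Widths[0]);
  return 0;
}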