Coverage Report

Created: 2017-10-03 07:32

/Users/buildslave/jenkins/sharedspace/clang-stage2-coverage-R@2/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
//===- SILoadStoreOptimizer.cpp -------------------------------------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This pass tries to fuse DS instructions with nearby immediate offsets.
// This will fuse operations such as
//  ds_read_b32 v0, v2 offset:16
//  ds_read_b32 v1, v2 offset:32
// ==>
//   ds_read2_b32 v[0:1], v2, offset0:4 offset1:8
//
// Future improvements:
//
// - This currently relies on the scheduler to place loads and stores next to
//   each other, and then only merges adjacent pairs of instructions. It would
//   be good to be more flexible with interleaved instructions, and possibly run
//   before scheduling. It currently misses stores of constants because loading
//   the constant into the data register is placed between the stores, although
//   this is arguably a scheduling problem.
//
// - Live interval recomputation seems inefficient. This currently only matches
//   one pair, recomputes live intervals, and moves on to the next pair. It
//   would be better to compute a list of all merges that need to occur.
//
// - With a list of instructions to process, we can also merge more. If a
//   cluster of loads has offsets that are too large to fit in the 8-bit
//   offsets, but are close enough together to fit in 8 bits, we can add to the
//   base pointer and use the new reduced offsets.
//
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "AMDGPUSubtarget.h"
#include "SIInstrInfo.h"
#include "SIRegisterInfo.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/DenseSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/Pass.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include <algorithm>
#include <cassert>
#include <cstdlib>
#include <iterator>
#include <utility>

using namespace llvm;

#define DEBUG_TYPE "si-load-store-opt"

namespace {

class SILoadStoreOptimizer : public MachineFunctionPass {
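  // State for one candidate merge: the instruction being considered (I), the
  // instruction it will be paired with, the element size of the accesses, the
  // encoded offsets, an optional common base offset, whether the ST64 variants
  // are used, and the instructions that must be moved below the merged
  // instruction to preserve dependencies.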
  using CombineInfo = struct {
    MachineBasicBlock::iterator I;
    MachineBasicBlock::iterator Paired;
    unsigned EltSize;
    unsigned Offset0;
    unsigned Offset1;
    unsigned BaseOff;
    bool UseST64;
    SmallVector<MachineInstr*, 8> InstsToMove;
  };

private:
  const SIInstrInfo *TII = nullptr;
  const SIRegisterInfo *TRI = nullptr;
  MachineRegisterInfo *MRI = nullptr;
  AliasAnalysis *AA = nullptr;

  static bool offsetsCanBeCombined(CombineInfo &CI);

  bool findMatchingDSInst(CombineInfo &CI);

  MachineBasicBlock::iterator mergeRead2Pair(CombineInfo &CI);

  MachineBasicBlock::iterator mergeWrite2Pair(CombineInfo &CI);

public:
  static char ID;

  SILoadStoreOptimizer() : MachineFunctionPass(ID) {
    initializeSILoadStoreOptimizerPass(*PassRegistry::getPassRegistry());
  }

  bool optimizeBlock(MachineBasicBlock &MBB);

  bool runOnMachineFunction(MachineFunction &MF) override;

  StringRef getPassName() const override { return "SI Load / Store Optimizer"; }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.setPreservesCFG();
    AU.addRequired<AAResultsWrapperPass>();

    MachineFunctionPass::getAnalysisUsage(AU);
  }
};

} // end anonymous namespace.

INITIALIZE_PASS_BEGIN(SILoadStoreOptimizer, DEBUG_TYPE,
                      "SI Load / Store Optimizer", false, false)
INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
INITIALIZE_PASS_END(SILoadStoreOptimizer, DEBUG_TYPE,
                    "SI Load / Store Optimizer", false, false)

char SILoadStoreOptimizer::ID = 0;

char &llvm::SILoadStoreOptimizerID = SILoadStoreOptimizer::ID;

FunctionPass *llvm::createSILoadStoreOptimizerPass() {
  return new SILoadStoreOptimizer();
}

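// Move each instruction in InstsToMove out of its current position and
// reinsert it immediately after I, preserving the order of the list.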
static void moveInstsAfter(MachineBasicBlock::iterator I,
                           ArrayRef<MachineInstr*> InstsToMove) {
  MachineBasicBlock *MBB = I->getParent();
  ++I;
  for (MachineInstr *MI : InstsToMove) {
    MI->removeFromParent();
    MBB->insert(I, MI);
  }
}

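// Record the registers explicitly defined by MI in Defs so that later
// instructions reading any of them can be detected as dependent on MI.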
static void addDefsToList(const MachineInstr &MI, DenseSet<unsigned> &Defs) {
  // XXX: Should this be looking for implicit defs?
  for (const MachineOperand &Def : MI.defs())
    Defs.insert(Def.getReg());
}

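// Two memory accesses can be reordered if neither of them writes memory, or
// if alias analysis proves the two accesses are trivially disjoint.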
static bool memAccessesCanBeReordered(MachineBasicBlock::iterator A,
                                      MachineBasicBlock::iterator B,
                                      const SIInstrInfo *TII,
                                      AliasAnalysis *AA) {
  // RAW or WAR - cannot reorder
  // WAW - cannot reorder
  // RAR - safe to reorder
  return !(A->mayStore() || B->mayStore()) ||
    TII->areMemAccessesTriviallyDisjoint(*A, *B, AA);
}

// Add MI and its defs to the lists if MI reads one of the defs that are
// already in the list. Returns true in that case.
static bool
addToListsIfDependent(MachineInstr &MI,
                      DenseSet<unsigned> &Defs,
                      SmallVectorImpl<MachineInstr*> &Insts) {
  for (MachineOperand &Use : MI.operands()) {
    // If one of the defs is read, then there is a use of Def between I and the
    // instruction that I will potentially be merged with. We will need to move
    // this instruction after the merged instructions.

    if (Use.isReg() && Use.readsReg() && Defs.count(Use.getReg())) {
      Insts.push_back(&MI);
      addDefsToList(MI, Defs);
      return true;
    }
  }

  return false;
}

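// Return true when every memory instruction in InstsToMove can be reordered
// with respect to MemOp; instructions that do not touch memory are ignored.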
static bool
canMoveInstsAcrossMemOp(MachineInstr &MemOp,
                        ArrayRef<MachineInstr*> InstsToMove,
                        const SIInstrInfo *TII,
                        AliasAnalysis *AA) {
  assert(MemOp.mayLoadOrStore());

  for (MachineInstr *InstToMove : InstsToMove) {
    if (!InstToMove->mayLoadOrStore())
      continue;
    if (!memAccessesCanBeReordered(MemOp, *InstToMove, TII, AA))
      return false;
  }
  return true;
}

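// Decide whether the two byte offsets in CI can be encoded in a single
// read2/write2: either directly as 8-bit element offsets, as 8-bit offsets in
// units of 64 elements (the ST64 forms), or after subtracting a common base
// offset that the merge routines add back to the address register. For
// example, with EltSize == 4, byte offsets 0 and 256 become element offsets 0
// and 64, which the ST64 form encodes as offset0:0 offset1:1.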
bool SILoadStoreOptimizer::offsetsCanBeCombined(CombineInfo &CI) {
  // XXX - Would the same offset be OK? Is there any reason this would happen or
  // be useful?
  if (CI.Offset0 == CI.Offset1)
    return false;

  // This won't be valid if the offset isn't aligned.
  if ((CI.Offset0 % CI.EltSize != 0) || (CI.Offset1 % CI.EltSize != 0))
    return false;

  unsigned EltOffset0 = CI.Offset0 / CI.EltSize;
  unsigned EltOffset1 = CI.Offset1 / CI.EltSize;
  CI.UseST64 = false;
  CI.BaseOff = 0;

  // If the offset in elements doesn't fit in 8 bits, we might be able to use
  // the stride 64 versions.
  if ((EltOffset0 % 64 == 0) && (EltOffset1 % 64) == 0 &&
      isUInt<8>(EltOffset0 / 64) && isUInt<8>(EltOffset1 / 64)) {
    CI.Offset0 = EltOffset0 / 64;
    CI.Offset1 = EltOffset1 / 64;
    CI.UseST64 = true;
    return true;
  }

  // Check if the new offsets fit in the reduced 8-bit range.
  if (isUInt<8>(EltOffset0) && isUInt<8>(EltOffset1)) {
    CI.Offset0 = EltOffset0;
    CI.Offset1 = EltOffset1;
    return true;
  }

  // Try to shift base address to decrease offsets.
  unsigned OffsetDiff = std::abs((int)EltOffset1 - (int)EltOffset0);
  CI.BaseOff = std::min(CI.Offset0, CI.Offset1);

  if ((OffsetDiff % 64 == 0) && isUInt<8>(OffsetDiff / 64)) {
    CI.Offset0 = (EltOffset0 - CI.BaseOff / CI.EltSize) / 64;
    CI.Offset1 = (EltOffset1 - CI.BaseOff / CI.EltSize) / 64;
    CI.UseST64 = true;
    return true;
  }

  if (isUInt<8>(OffsetDiff)) {
    CI.Offset0 = EltOffset0 - CI.BaseOff / CI.EltSize;
    CI.Offset1 = EltOffset1 - CI.BaseOff / CI.EltSize;
    return true;
  }

  return false;
}

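// Scan forward from CI.I for another DS instruction with the same opcode and
// base address whose offset can be combined with CI.I's. On success, fill in
// CI.Paired, the decoded offsets, and the list of intervening instructions
// that must be moved below the merged instruction, and return true.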
bool SILoadStoreOptimizer::findMatchingDSInst(CombineInfo &CI) {
  MachineBasicBlock *MBB = CI.I->getParent();
  MachineBasicBlock::iterator E = MBB->end();
  MachineBasicBlock::iterator MBBI = CI.I;

  int AddrIdx = AMDGPU::getNamedOperandIdx(CI.I->getOpcode(),
                                           AMDGPU::OpName::addr);
  const MachineOperand &AddrReg0 = CI.I->getOperand(AddrIdx);

  // We only ever merge operations with the same base address register, so don't
  // bother scanning forward if there are no other uses.
  if (TargetRegisterInfo::isPhysicalRegister(AddrReg0.getReg()) ||
      MRI->hasOneNonDBGUse(AddrReg0.getReg()))
    return false;

  ++MBBI;

  DenseSet<unsigned> DefsToMove;
  addDefsToList(*CI.I, DefsToMove);

  for ( ; MBBI != E; ++MBBI) {
    if (MBBI->getOpcode() != CI.I->getOpcode()) {
      // This is not a matching DS instruction, but we can keep looking as
      // long as one of these conditions is met:
      // 1. It is safe to move I down past MBBI.
      // 2. It is safe to move MBBI down past the instruction that I will
      //    be merged into.

      if (MBBI->hasUnmodeledSideEffects()) {
        // We can't re-order this instruction with respect to other memory
        // operations, so we fail both conditions mentioned above.
        return false;
      }

      if (MBBI->mayLoadOrStore() &&
          !memAccessesCanBeReordered(*CI.I, *MBBI, TII, AA)) {
        // We fail condition #1, but we may still be able to satisfy condition
        // #2.  Add this instruction to the move list and then we will check
        // if condition #2 holds once we have selected the matching instruction.
        CI.InstsToMove.push_back(&*MBBI);
        addDefsToList(*MBBI, DefsToMove);
        continue;
      }

      // When we match I with another DS instruction we will be moving I down
      // to the location of the matched instruction, so any uses of I will need
      // to be moved down as well.
      addToListsIfDependent(*MBBI, DefsToMove, CI.InstsToMove);
      continue;
    }

    // Don't merge volatiles.
    if (MBBI->hasOrderedMemoryRef())
      return false;

    // Handle a case like
    //   DS_WRITE_B32 addr, v, idx0
    //   w = DS_READ_B32 addr, idx0
    //   DS_WRITE_B32 addr, f(w), idx1
    // where the DS_READ_B32 ends up in InstsToMove and therefore prevents
    // merging of the two writes.
    if (addToListsIfDependent(*MBBI, DefsToMove, CI.InstsToMove))
      continue;

    const MachineOperand &AddrReg1 = MBBI->getOperand(AddrIdx);

    // Check same base pointer. Be careful of subregisters, which can occur with
    // vectors of pointers.
    if (AddrReg0.getReg() == AddrReg1.getReg() &&
        AddrReg0.getSubReg() == AddrReg1.getSubReg()) {
      int OffsetIdx = AMDGPU::getNamedOperandIdx(CI.I->getOpcode(),
                                                 AMDGPU::OpName::offset);
      CI.Offset0 = CI.I->getOperand(OffsetIdx).getImm() & 0xffff;
      CI.Offset1 = MBBI->getOperand(OffsetIdx).getImm() & 0xffff;
      CI.Paired = MBBI;

      // Check that both offsets fit in the reduced range.
      // We also need to go through the list of instructions that we plan to
      // move and make sure they are all safe to move down past the merged
      // instruction.
      if (offsetsCanBeCombined(CI))
        if (canMoveInstsAcrossMemOp(*MBBI, CI.InstsToMove, TII, AA))
          return true;
    }

    // We've found a load/store that we couldn't merge for some reason.
    // We could potentially keep looking, but we'd need to make sure that
    // it was safe to move I and also all the instructions in InstsToMove
    // down past this instruction.
    // Check if we can move I across MBBI and if we can move all I's users.
    if (!memAccessesCanBeReordered(*CI.I, *MBBI, TII, AA) ||
        !canMoveInstsAcrossMemOp(*MBBI, CI.InstsToMove, TII, AA))
      break;
  }
  return false;
}

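// Replace the two reads in CI with a single DS_READ2_B32/B64 (or the ST64
// variant) into a fresh super-register, materializing a base-offset add when
// needed, and copy the sub-registers back to the original destinations.
// Returns the iterator from which scanning should continue.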
MachineBasicBlock::iterator SILoadStoreOptimizer::mergeRead2Pair(
  CombineInfo &CI) {
  MachineBasicBlock *MBB = CI.I->getParent();

  // Be careful, since the addresses could be subregisters themselves in weird
  // cases, like vectors of pointers.
  const auto *AddrReg = TII->getNamedOperand(*CI.I, AMDGPU::OpName::addr);

  const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdst);
  const auto *Dest1 = TII->getNamedOperand(*CI.Paired, AMDGPU::OpName::vdst);

  unsigned NewOffset0 = CI.Offset0;
  unsigned NewOffset1 = CI.Offset1;
  unsigned Opc = (CI.EltSize == 4) ? AMDGPU::DS_READ2_B32
                                   : AMDGPU::DS_READ2_B64;

  if (CI.UseST64)
    Opc = (CI.EltSize == 4) ? AMDGPU::DS_READ2ST64_B32
                            : AMDGPU::DS_READ2ST64_B64;

  unsigned SubRegIdx0 = (CI.EltSize == 4) ? AMDGPU::sub0 : AMDGPU::sub0_sub1;
  unsigned SubRegIdx1 = (CI.EltSize == 4) ? AMDGPU::sub1 : AMDGPU::sub2_sub3;

  if (NewOffset0 > NewOffset1) {
    // Canonicalize the merged instruction so the smaller offset comes first.
    std::swap(NewOffset0, NewOffset1);
    std::swap(SubRegIdx0, SubRegIdx1);
  }

  assert((isUInt<8>(NewOffset0) && isUInt<8>(NewOffset1)) &&
         (NewOffset0 != NewOffset1) &&
         "Computed offset doesn't fit");

  const MCInstrDesc &Read2Desc = TII->get(Opc);

  const TargetRegisterClass *SuperRC
    = (CI.EltSize == 4) ? &AMDGPU::VReg_64RegClass : &AMDGPU::VReg_128RegClass;
  unsigned DestReg = MRI->createVirtualRegister(SuperRC);

  DebugLoc DL = CI.I->getDebugLoc();

  unsigned BaseReg = AddrReg->getReg();
  unsigned BaseRegFlags = 0;
  if (CI.BaseOff) {
    BaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    BaseRegFlags = RegState::Kill;
    BuildMI(*MBB, CI.Paired, DL, TII->get(AMDGPU::V_ADD_I32_e32), BaseReg)
           .addImm(CI.BaseOff)
           .addReg(AddrReg->getReg());
  }

  MachineInstrBuilder Read2 =
    BuildMI(*MBB, CI.Paired, DL, Read2Desc, DestReg)
      .addReg(BaseReg, BaseRegFlags) // addr
      .addImm(NewOffset0)            // offset0
      .addImm(NewOffset1)            // offset1
      .addImm(0)                     // gds
      .setMemRefs(CI.I->mergeMemRefsWith(*CI.Paired));

  (void)Read2;

  const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);

  // Copy to the old destination registers.
  BuildMI(*MBB, CI.Paired, DL, CopyDesc)
      .add(*Dest0) // Copy to same destination including flags and sub reg.
      .addReg(DestReg, 0, SubRegIdx0);
  MachineInstr *Copy1 = BuildMI(*MBB, CI.Paired, DL, CopyDesc)
                            .add(*Dest1)
                            .addReg(DestReg, RegState::Kill, SubRegIdx1);

  moveInstsAfter(Copy1, CI.InstsToMove);

  MachineBasicBlock::iterator Next = std::next(CI.I);
  CI.I->eraseFromParent();
  CI.Paired->eraseFromParent();

  DEBUG(dbgs() << "Inserted read2: " << *Read2 << '\n');
  return Next;
}

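// Replace the two writes in CI with a single DS_WRITE2_B32/B64 (or the ST64
// variant) that stores both original data operands, materializing a
// base-offset add when needed. Returns the iterator from which scanning
// should continue.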
MachineBasicBlock::iterator SILoadStoreOptimizer::mergeWrite2Pair(
  CombineInfo &CI) {
  MachineBasicBlock *MBB = CI.I->getParent();

  // Be sure to use .add(), and not .addReg(), with these. We want to be
  // sure we preserve the subregister index and any register flags set on them.
  const MachineOperand *Addr = TII->getNamedOperand(*CI.I, AMDGPU::OpName::addr);
  const MachineOperand *Data0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::data0);
  const MachineOperand *Data1
    = TII->getNamedOperand(*CI.Paired, AMDGPU::OpName::data0);

  unsigned NewOffset0 = CI.Offset0;
  unsigned NewOffset1 = CI.Offset1;
  unsigned Opc = (CI.EltSize == 4) ? AMDGPU::DS_WRITE2_B32
                                   : AMDGPU::DS_WRITE2_B64;

  if (CI.UseST64)
    Opc = (CI.EltSize == 4) ? AMDGPU::DS_WRITE2ST64_B32
                            : AMDGPU::DS_WRITE2ST64_B64;

  if (NewOffset0 > NewOffset1) {
    // Canonicalize the merged instruction so the smaller offset comes first.
    std::swap(NewOffset0, NewOffset1);
    std::swap(Data0, Data1);
  }

  assert((isUInt<8>(NewOffset0) && isUInt<8>(NewOffset1)) &&
         (NewOffset0 != NewOffset1) &&
         "Computed offset doesn't fit");

  const MCInstrDesc &Write2Desc = TII->get(Opc);
  DebugLoc DL = CI.I->getDebugLoc();

  unsigned BaseReg = Addr->getReg();
  unsigned BaseRegFlags = 0;
  if (CI.BaseOff) {
    BaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    BaseRegFlags = RegState::Kill;
    BuildMI(*MBB, CI.Paired, DL, TII->get(AMDGPU::V_ADD_I32_e32), BaseReg)
           .addImm(CI.BaseOff)
           .addReg(Addr->getReg());
  }

  MachineInstrBuilder Write2 =
    BuildMI(*MBB, CI.Paired, DL, Write2Desc)
      .addReg(BaseReg, BaseRegFlags) // addr
      .add(*Data0)                   // data0
      .add(*Data1)                   // data1
      .addImm(NewOffset0)            // offset0
      .addImm(NewOffset1)            // offset1
      .addImm(0)                     // gds
      .setMemRefs(CI.I->mergeMemRefsWith(*CI.Paired));

  moveInstsAfter(Write2, CI.InstsToMove);

  MachineBasicBlock::iterator Next = std::next(CI.I);
  CI.I->eraseFromParent();
  CI.Paired->eraseFromParent();

  DEBUG(dbgs() << "Inserted write2 inst: " << *Write2 << '\n');
  return Next;
}

// Scan through looking for adjacent LDS operations with constant offsets from
// the same base register. We rely on the scheduler to do the hard work of
// clustering nearby loads, and assume these are all adjacent.
bool SILoadStoreOptimizer::optimizeBlock(MachineBasicBlock &MBB) {
  bool Modified = false;

  for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); I != E;) {
    MachineInstr &MI = *I;

    // Don't combine if volatile.
    if (MI.hasOrderedMemoryRef()) {
      ++I;
      continue;
    }

    CombineInfo CI;
    CI.I = I;
    unsigned Opc = MI.getOpcode();
    if (Opc == AMDGPU::DS_READ_B32 || Opc == AMDGPU::DS_READ_B64) {
      CI.EltSize = (Opc == AMDGPU::DS_READ_B64) ? 8 : 4;
      if (findMatchingDSInst(CI)) {
        Modified = true;
        I = mergeRead2Pair(CI);
      } else {
        ++I;
      }

      continue;
    } else if (Opc == AMDGPU::DS_WRITE_B32 || Opc == AMDGPU::DS_WRITE_B64) {
      CI.EltSize = (Opc == AMDGPU::DS_WRITE_B64) ? 8 : 4;
      if (findMatchingDSInst(CI)) {
        Modified = true;
        I = mergeWrite2Pair(CI);
      } else {
        ++I;
      }

      continue;
    }

    ++I;
  }

  return Modified;
}

bool SILoadStoreOptimizer::runOnMachineFunction(MachineFunction &MF) {
  if (skipFunction(*MF.getFunction()))
    return false;

  const SISubtarget &STM = MF.getSubtarget<SISubtarget>();
  if (!STM.loadStoreOptEnabled())
    return false;

  TII = STM.getInstrInfo();
  TRI = &TII->getRegisterInfo();

  MRI = &MF.getRegInfo();
  AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();

  assert(MRI->isSSA() && "Must be run on SSA");

  DEBUG(dbgs() << "Running SILoadStoreOptimizer\n");

  bool Modified = false;

  for (MachineBasicBlock &MBB : MF)
    Modified |= optimizeBlock(MBB);

  return Modified;
}