Coverage Report

Created: 2019-07-24 05:18

/Users/buildslave/jenkins/workspace/clang-stage2-coverage-R/llvm/lib/Target/X86/X86AvoidStoreForwardingBlocks.cpp
//===- X86AvoidStoreForwardingBlocks.cpp - Avoid HW Store Forward Block --===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// If a load follows a store and reloads data that the store has written to
// memory, Intel microarchitectures can in many cases forward the data directly
// from the store to the load. This "store forwarding" saves cycles by enabling
// the load to directly obtain the data instead of accessing the data from
// cache or memory.
// A "store forward block" occurs in cases where a store cannot be forwarded to
// the load. The most typical case of store forward block on Intel Core
// microarchitecture is that a small store cannot be forwarded to a large load.
// The estimated penalty for a store forward block is ~13 cycles.
//
// This pass tries to recognize and handle cases where "store forward block"
// is created by the compiler when lowering memcpy calls to a sequence
// of a load and a store.
//
// The pass currently only handles cases where memcpy is lowered to
// XMM/YMM registers; it tries to break the memcpy into smaller copies.
// Breaking the memcpy should be possible since there is no atomicity
// guarantee for loads and stores to XMM/YMM.
//
// It could be better for performance to solve the problem by loading
// to XMM/YMM then inserting the partial store before storing back from XMM/YMM
// to memory, but this will result in a more conservative optimization since it
// requires we prove that all memory accesses between the blocking store and the
// load must alias/don't alias before we can move the store, whereas the
// transformation done here is correct regardless of other memory accesses.
//===----------------------------------------------------------------------===//

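// Illustrative sketch of the transformation (simplified pseudo-MIR, not taken
// from the original sources): suppose a 4-byte store writes into the middle of
// a 16-byte region that is then reloaded and re-stored as a single XMM copy:
//
//   MOV32mr  %base, 1, $noreg, 4, $noreg, %val   ; 4-byte store to base+4
//   %x = MOVUPSrm %base, 1, $noreg, 0, $noreg    ; 16-byte load, blocked
//   MOVUPSmr %dst, 1, $noreg, 0, $noreg, %x      ; 16-byte store
//
// The XMM copy is rewritten into GPR-sized pieces (here 4 + 4 + 8 bytes) so
// that no individual reload spans the smaller store, and store forwarding can
// succeed for each piece.
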
#include "X86InstrInfo.h"
37
#include "X86Subtarget.h"
38
#include "llvm/CodeGen/MachineBasicBlock.h"
39
#include "llvm/CodeGen/MachineFunction.h"
40
#include "llvm/CodeGen/MachineFunctionPass.h"
41
#include "llvm/CodeGen/MachineInstr.h"
42
#include "llvm/CodeGen/MachineInstrBuilder.h"
43
#include "llvm/CodeGen/MachineOperand.h"
44
#include "llvm/CodeGen/MachineRegisterInfo.h"
45
#include "llvm/IR/DebugInfoMetadata.h"
46
#include "llvm/IR/DebugLoc.h"
47
#include "llvm/IR/Function.h"
48
#include "llvm/MC/MCInstrDesc.h"
49
50
using namespace llvm;
51
52
#define DEBUG_TYPE "x86-avoid-SFB"
53
54
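// The two hidden command-line knobs below control this pass. As a usage
// sketch (flag spellings taken from the definitions that follow),
// "llc -x86-disable-avoid-SFB" turns the transformation off entirely and
// "llc -x86-sfb-inspection-limit=<N>" changes how far backwards the pass
// scans for blocking stores.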
static cl::opt<bool> DisableX86AvoidStoreForwardBlocks(
    "x86-disable-avoid-SFB", cl::Hidden,
    cl::desc("X86: Disable Store Forwarding Blocks fixup."), cl::init(false));

static cl::opt<unsigned> X86AvoidSFBInspectionLimit(
    "x86-sfb-inspection-limit",
    cl::desc("X86: Number of instructions backward to "
             "inspect for store forwarding blocks."),
    cl::init(20), cl::Hidden);

namespace {

using DisplacementSizeMap = std::map<int64_t, unsigned>;

class X86AvoidSFBPass : public MachineFunctionPass {
public:
  static char ID;
  X86AvoidSFBPass() : MachineFunctionPass(ID) { }

  StringRef getPassName() const override {
    return "X86 Avoid Store Forwarding Blocks";
  }

  bool runOnMachineFunction(MachineFunction &MF) override;

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    MachineFunctionPass::getAnalysisUsage(AU);
    AU.addRequired<AAResultsWrapperPass>();
  }

private:
  MachineRegisterInfo *MRI;
  const X86InstrInfo *TII;
  const X86RegisterInfo *TRI;
  SmallVector<std::pair<MachineInstr *, MachineInstr *>, 2>
      BlockedLoadsStoresPairs;
  SmallVector<MachineInstr *, 2> ForRemoval;
  AliasAnalysis *AA;

  /// Returns pairs of a Load followed by a Store to memory which look
  /// like a memcpy.
  void findPotentiallylBlockedCopies(MachineFunction &MF);
  /// Break the memcpy's load and store into smaller copies
  /// such that each memory load that was blocked by a smaller store
  /// would now be copied separately.
  void breakBlockedCopies(MachineInstr *LoadInst, MachineInstr *StoreInst,
                          const DisplacementSizeMap &BlockingStoresDispSizeMap);
  /// Break a copy of size Size to smaller copies.
  void buildCopies(int Size, MachineInstr *LoadInst, int64_t LdDispImm,
                   MachineInstr *StoreInst, int64_t StDispImm,
                   int64_t LMMOffset, int64_t SMMOffset);

  void buildCopy(MachineInstr *LoadInst, unsigned NLoadOpcode, int64_t LoadDisp,
                 MachineInstr *StoreInst, unsigned NStoreOpcode,
                 int64_t StoreDisp, unsigned Size, int64_t LMMOffset,
                 int64_t SMMOffset);

  bool alias(const MachineMemOperand &Op1, const MachineMemOperand &Op2) const;

  unsigned getRegSizeInBytes(MachineInstr *Inst);
};

} // end anonymous namespace

char X86AvoidSFBPass::ID = 0;

INITIALIZE_PASS_BEGIN(X86AvoidSFBPass, DEBUG_TYPE,
                      "X86 avoid store forwarding blocks", false, false)
INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
INITIALIZE_PASS_END(X86AvoidSFBPass, DEBUG_TYPE,
                    "X86 avoid store forwarding blocks", false, false)

FunctionPass *llvm::createX86AvoidStoreForwardingBlocks() {
  return new X86AvoidSFBPass();
}

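// Opcode predicates recognizing the plain XMM/YMM vector loads that a lowered
// memcpy produces (unaligned and aligned SSE, AVX and AVX-512 128/256-bit
// variants).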
static bool isXMMLoadOpcode(unsigned Opcode) {
  return Opcode == X86::MOVUPSrm || Opcode == X86::MOVAPSrm ||
         Opcode == X86::VMOVUPSrm || Opcode == X86::VMOVAPSrm ||
         Opcode == X86::VMOVUPDrm || Opcode == X86::VMOVAPDrm ||
         Opcode == X86::VMOVDQUrm || Opcode == X86::VMOVDQArm ||
         Opcode == X86::VMOVUPSZ128rm || Opcode == X86::VMOVAPSZ128rm ||
         Opcode == X86::VMOVUPDZ128rm || Opcode == X86::VMOVAPDZ128rm ||
         Opcode == X86::VMOVDQU64Z128rm || Opcode == X86::VMOVDQA64Z128rm ||
         Opcode == X86::VMOVDQU32Z128rm || Opcode == X86::VMOVDQA32Z128rm;
}
static bool isYMMLoadOpcode(unsigned Opcode) {
  return Opcode == X86::VMOVUPSYrm || Opcode == X86::VMOVAPSYrm ||
         Opcode == X86::VMOVUPDYrm || Opcode == X86::VMOVAPDYrm ||
         Opcode == X86::VMOVDQUYrm || Opcode == X86::VMOVDQAYrm ||
         Opcode == X86::VMOVUPSZ256rm || Opcode == X86::VMOVAPSZ256rm ||
         Opcode == X86::VMOVUPDZ256rm || Opcode == X86::VMOVAPDZ256rm ||
         Opcode == X86::VMOVDQU64Z256rm || Opcode == X86::VMOVDQA64Z256rm ||
         Opcode == X86::VMOVDQU32Z256rm || Opcode == X86::VMOVDQA32Z256rm;
}

static bool isPotentialBlockedMemCpyLd(unsigned Opcode) {
  return isXMMLoadOpcode(Opcode) || isYMMLoadOpcode(Opcode);
}

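// Returns true if the store opcode matches the given load opcode in width and
// data domain, i.e. the load/store pair looks like a lowered memcpy.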
static bool isPotentialBlockedMemCpyPair(int LdOpcode, int StOpcode) {
  switch (LdOpcode) {
  case X86::MOVUPSrm:
  case X86::MOVAPSrm:
    return StOpcode == X86::MOVUPSmr || StOpcode == X86::MOVAPSmr;
  case X86::VMOVUPSrm:
  case X86::VMOVAPSrm:
    return StOpcode == X86::VMOVUPSmr || StOpcode == X86::VMOVAPSmr;
  case X86::VMOVUPDrm:
  case X86::VMOVAPDrm:
    return StOpcode == X86::VMOVUPDmr || StOpcode == X86::VMOVAPDmr;
  case X86::VMOVDQUrm:
  case X86::VMOVDQArm:
    return StOpcode == X86::VMOVDQUmr || StOpcode == X86::VMOVDQAmr;
  case X86::VMOVUPSZ128rm:
  case X86::VMOVAPSZ128rm:
    return StOpcode == X86::VMOVUPSZ128mr || StOpcode == X86::VMOVAPSZ128mr;
  case X86::VMOVUPDZ128rm:
  case X86::VMOVAPDZ128rm:
    return StOpcode == X86::VMOVUPDZ128mr || StOpcode == X86::VMOVAPDZ128mr;
  case X86::VMOVUPSYrm:
  case X86::VMOVAPSYrm:
    return StOpcode == X86::VMOVUPSYmr || StOpcode == X86::VMOVAPSYmr;
  case X86::VMOVUPDYrm:
  case X86::VMOVAPDYrm:
    return StOpcode == X86::VMOVUPDYmr || StOpcode == X86::VMOVAPDYmr;
  case X86::VMOVDQUYrm:
  case X86::VMOVDQAYrm:
    return StOpcode == X86::VMOVDQUYmr || StOpcode == X86::VMOVDQAYmr;
  case X86::VMOVUPSZ256rm:
  case X86::VMOVAPSZ256rm:
    return StOpcode == X86::VMOVUPSZ256mr || StOpcode == X86::VMOVAPSZ256mr;
  case X86::VMOVUPDZ256rm:
  case X86::VMOVAPDZ256rm:
    return StOpcode == X86::VMOVUPDZ256mr || StOpcode == X86::VMOVAPDZ256mr;
  case X86::VMOVDQU64Z128rm:
  case X86::VMOVDQA64Z128rm:
    return StOpcode == X86::VMOVDQU64Z128mr || StOpcode == X86::VMOVDQA64Z128mr;
  case X86::VMOVDQU32Z128rm:
  case X86::VMOVDQA32Z128rm:
    return StOpcode == X86::VMOVDQU32Z128mr || StOpcode == X86::VMOVDQA32Z128mr;
  case X86::VMOVDQU64Z256rm:
  case X86::VMOVDQA64Z256rm:
    return StOpcode == X86::VMOVDQU64Z256mr || StOpcode == X86::VMOVDQA64Z256mr;
  case X86::VMOVDQU32Z256rm:
  case X86::VMOVDQA32Z256rm:
    return StOpcode == X86::VMOVDQU32Z256mr || StOpcode == X86::VMOVDQA32Z256mr;
  default:
    return false;
  }
}

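// Returns true if Opcode is a store narrow enough to block forwarding into a
// load of LoadOpcode's width: any GPR-sized store, plus XMM-sized stores when
// the load is YMM-sized.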
static bool isPotentialBlockingStoreInst(int Opcode, int LoadOpcode) {
  bool PBlock = false;
  PBlock |= Opcode == X86::MOV64mr || Opcode == X86::MOV64mi32 ||
            Opcode == X86::MOV32mr || Opcode == X86::MOV32mi ||
            Opcode == X86::MOV16mr || Opcode == X86::MOV16mi ||
            Opcode == X86::MOV8mr || Opcode == X86::MOV8mi;
  if (isYMMLoadOpcode(LoadOpcode))
    PBlock |= Opcode == X86::VMOVUPSmr || Opcode == X86::VMOVAPSmr ||
              Opcode == X86::VMOVUPDmr || Opcode == X86::VMOVAPDmr ||
              Opcode == X86::VMOVDQUmr || Opcode == X86::VMOVDQAmr ||
              Opcode == X86::VMOVUPSZ128mr || Opcode == X86::VMOVAPSZ128mr ||
              Opcode == X86::VMOVUPDZ128mr || Opcode == X86::VMOVAPDZ128mr ||
              Opcode == X86::VMOVDQU64Z128mr ||
              Opcode == X86::VMOVDQA64Z128mr ||
              Opcode == X86::VMOVDQU32Z128mr || Opcode == X86::VMOVDQA32Z128mr;
  return PBlock;
}

static const int MOV128SZ = 16;
static const int MOV64SZ = 8;
static const int MOV32SZ = 4;
static const int MOV16SZ = 2;
static const int MOV8SZ = 1;

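// Map a 256-bit (YMM) load or store opcode to its unaligned 128-bit (XMM)
// counterpart; used when splitting a YMM copy into 16-byte pieces.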
static unsigned getYMMtoXMMLoadOpcode(unsigned LoadOpcode) {
  switch (LoadOpcode) {
  case X86::VMOVUPSYrm:
  case X86::VMOVAPSYrm:
    return X86::VMOVUPSrm;
  case X86::VMOVUPDYrm:
  case X86::VMOVAPDYrm:
    return X86::VMOVUPDrm;
  case X86::VMOVDQUYrm:
  case X86::VMOVDQAYrm:
    return X86::VMOVDQUrm;
  case X86::VMOVUPSZ256rm:
  case X86::VMOVAPSZ256rm:
    return X86::VMOVUPSZ128rm;
  case X86::VMOVUPDZ256rm:
  case X86::VMOVAPDZ256rm:
    return X86::VMOVUPDZ128rm;
  case X86::VMOVDQU64Z256rm:
  case X86::VMOVDQA64Z256rm:
    return X86::VMOVDQU64Z128rm;
  case X86::VMOVDQU32Z256rm:
  case X86::VMOVDQA32Z256rm:
    return X86::VMOVDQU32Z128rm;
  default:
    llvm_unreachable("Unexpected Load Instruction Opcode");
  }
  return 0;
}

static unsigned getYMMtoXMMStoreOpcode(unsigned StoreOpcode) {
  switch (StoreOpcode) {
  case X86::VMOVUPSYmr:
  case X86::VMOVAPSYmr:
    return X86::VMOVUPSmr;
  case X86::VMOVUPDYmr:
  case X86::VMOVAPDYmr:
    return X86::VMOVUPDmr;
  case X86::VMOVDQUYmr:
  case X86::VMOVDQAYmr:
    return X86::VMOVDQUmr;
  case X86::VMOVUPSZ256mr:
  case X86::VMOVAPSZ256mr:
    return X86::VMOVUPSZ128mr;
  case X86::VMOVUPDZ256mr:
  case X86::VMOVAPDZ256mr:
    return X86::VMOVUPDZ128mr;
  case X86::VMOVDQU64Z256mr:
  case X86::VMOVDQA64Z256mr:
    return X86::VMOVDQU64Z128mr;
  case X86::VMOVDQU32Z256mr:
  case X86::VMOVDQA32Z256mr:
    return X86::VMOVDQU32Z128mr;
  default:
    llvm_unreachable("Unexpected Store Instruction Opcode");
  }
  return 0;
}

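// Addressing helpers: getAddrOffset returns the index of the first address
// operand of a memory instruction, and getBaseOperand/getDispOperand return
// its base-register and displacement operands.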
static int getAddrOffset(MachineInstr *MI) {
  const MCInstrDesc &Descl = MI->getDesc();
  int AddrOffset = X86II::getMemoryOperandNo(Descl.TSFlags);
  assert(AddrOffset != -1 && "Expected Memory Operand");
  AddrOffset += X86II::getOperandBias(Descl);
  return AddrOffset;
}

static MachineOperand &getBaseOperand(MachineInstr *MI) {
  int AddrOffset = getAddrOffset(MI);
  return MI->getOperand(AddrOffset + X86::AddrBaseReg);
}

static MachineOperand &getDispOperand(MachineInstr *MI) {
  int AddrOffset = getAddrOffset(MI);
  return MI->getOperand(AddrOffset + X86::AddrDisp);
}

// Relevant addressing modes contain only base register and immediate
// displacement or frameindex and immediate displacement.
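// For instance, a frame index or a plain base-plus-displacement address such
// as 16(%rsp) is accepted, while scaled-index forms such as (%rdi,%rcx,4),
// non-immediate displacements, and segment-relative accesses are rejected.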
// TODO: Consider expanding to other addressing modes in the future
static bool isRelevantAddressingMode(MachineInstr *MI) {
  int AddrOffset = getAddrOffset(MI);
  MachineOperand &Base = getBaseOperand(MI);
  MachineOperand &Disp = getDispOperand(MI);
  MachineOperand &Scale = MI->getOperand(AddrOffset + X86::AddrScaleAmt);
  MachineOperand &Index = MI->getOperand(AddrOffset + X86::AddrIndexReg);
  MachineOperand &Segment = MI->getOperand(AddrOffset + X86::AddrSegmentReg);

  if (!((Base.isReg() && Base.getReg() != X86::NoRegister) || Base.isFI()))
    return false;
  if (!Disp.isImm())
    return false;
  if (Scale.getImm() != 1)
    return false;
  if (!(Index.isReg() && Index.getReg() == X86::NoRegister))
    return false;
  if (!(Segment.isReg() && Segment.getReg() == X86::NoRegister))
    return false;
  return true;
}

// Collect potentially blocking stores.
// Limit the number of instructions backwards we want to inspect
// since the effect of store block won't be visible if the store
// and load instructions have enough instructions in between to
// keep the core busy.
static SmallVector<MachineInstr *, 2>
findPotentialBlockers(MachineInstr *LoadInst) {
  SmallVector<MachineInstr *, 2> PotentialBlockers;
  unsigned BlockCount = 0;
  const unsigned InspectionLimit = X86AvoidSFBInspectionLimit;
  for (auto PBInst = std::next(MachineBasicBlock::reverse_iterator(LoadInst)),
            E = LoadInst->getParent()->rend();
       PBInst != E; ++PBInst) {
    if (PBInst->isMetaInstruction())
      continue;
    BlockCount++;
    if (BlockCount >= InspectionLimit)
      break;
    MachineInstr &MI = *PBInst;
    if (MI.getDesc().isCall())
      return PotentialBlockers;
    PotentialBlockers.push_back(&MI);
  }
  // If we didn't reach the instruction limit, try the predecessor blocks.
  // Ideally we should traverse the predecessor blocks in depth with some
  // coloring algorithm, but for now let's just look at the first order
  // predecessors.
  if (BlockCount < InspectionLimit) {
    MachineBasicBlock *MBB = LoadInst->getParent();
    int LimitLeft = InspectionLimit - BlockCount;
    for (MachineBasicBlock::pred_iterator PB = MBB->pred_begin(),
                                          PE = MBB->pred_end();
         PB != PE; ++PB) {
      MachineBasicBlock *PMBB = *PB;
      int PredCount = 0;
      for (MachineBasicBlock::reverse_iterator PBInst = PMBB->rbegin(),
                                               PME = PMBB->rend();
           PBInst != PME; ++PBInst) {
        if (PBInst->isMetaInstruction())
          continue;
        PredCount++;
        if (PredCount >= LimitLeft)
          break;
        if (PBInst->getDesc().isCall())
          break;
        PotentialBlockers.push_back(&*PBInst);
      }
    }
  }
  return PotentialBlockers;
}

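// Emit one narrow load/store pair of Size bytes: load from LoadBase+LoadDisp
// into a fresh virtual register, then store it to StoreBase+StoreDisp, with
// the original machine memory operands rebased by LMMOffset/SMMOffset.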
void X86AvoidSFBPass::buildCopy(MachineInstr *LoadInst, unsigned NLoadOpcode,
                                int64_t LoadDisp, MachineInstr *StoreInst,
                                unsigned NStoreOpcode, int64_t StoreDisp,
                                unsigned Size, int64_t LMMOffset,
                                int64_t SMMOffset) {
  MachineOperand &LoadBase = getBaseOperand(LoadInst);
  MachineOperand &StoreBase = getBaseOperand(StoreInst);
  MachineBasicBlock *MBB = LoadInst->getParent();
  MachineMemOperand *LMMO = *LoadInst->memoperands_begin();
  MachineMemOperand *SMMO = *StoreInst->memoperands_begin();

  unsigned Reg1 = MRI->createVirtualRegister(
      TII->getRegClass(TII->get(NLoadOpcode), 0, TRI, *(MBB->getParent())));
  MachineInstr *NewLoad =
      BuildMI(*MBB, LoadInst, LoadInst->getDebugLoc(), TII->get(NLoadOpcode),
              Reg1)
          .add(LoadBase)
          .addImm(1)
          .addReg(X86::NoRegister)
          .addImm(LoadDisp)
          .addReg(X86::NoRegister)
          .addMemOperand(
              MBB->getParent()->getMachineMemOperand(LMMO, LMMOffset, Size));
  if (LoadBase.isReg())
    getBaseOperand(NewLoad).setIsKill(false);
  LLVM_DEBUG(NewLoad->dump());
  // If the load and store are consecutive, use the loadInst location to
  // reduce register pressure.
  MachineInstr *StInst = StoreInst;
  auto PrevInstrIt = skipDebugInstructionsBackward(
      std::prev(MachineBasicBlock::instr_iterator(StoreInst)),
      MBB->instr_begin());
  if (PrevInstrIt.getNodePtr() == LoadInst)
    StInst = LoadInst;
  MachineInstr *NewStore =
      BuildMI(*MBB, StInst, StInst->getDebugLoc(), TII->get(NStoreOpcode))
          .add(StoreBase)
          .addImm(1)
          .addReg(X86::NoRegister)
          .addImm(StoreDisp)
          .addReg(X86::NoRegister)
          .addReg(Reg1)
          .addMemOperand(
              MBB->getParent()->getMachineMemOperand(SMMO, SMMOffset, Size));
  if (StoreBase.isReg())
    getBaseOperand(NewStore).setIsKill(false);
  MachineOperand &StoreSrcVReg = StoreInst->getOperand(X86::AddrNumOperands);
  assert(StoreSrcVReg.isReg() && "Expected virtual register");
  NewStore->getOperand(X86::AddrNumOperands).setIsKill(StoreSrcVReg.isKill());
  LLVM_DEBUG(NewStore->dump());
}

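// Split a copy of Size bytes into the largest pieces that still fit: 16-byte
// XMM pieces for YMM copies, then 8/4/2/1-byte GPR moves. For example, a
// 13-byte remainder is emitted as an 8-byte, a 4-byte and a 1-byte copy.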
void X86AvoidSFBPass::buildCopies(int Size, MachineInstr *LoadInst,
                                  int64_t LdDispImm, MachineInstr *StoreInst,
                                  int64_t StDispImm, int64_t LMMOffset,
                                  int64_t SMMOffset) {
  int LdDisp = LdDispImm;
  int StDisp = StDispImm;
  while (Size > 0) {
    if ((Size - MOV128SZ >= 0) && isYMMLoadOpcode(LoadInst->getOpcode())) {
      Size = Size - MOV128SZ;
      buildCopy(LoadInst, getYMMtoXMMLoadOpcode(LoadInst->getOpcode()), LdDisp,
                StoreInst, getYMMtoXMMStoreOpcode(StoreInst->getOpcode()),
                StDisp, MOV128SZ, LMMOffset, SMMOffset);
      LdDisp += MOV128SZ;
      StDisp += MOV128SZ;
      LMMOffset += MOV128SZ;
      SMMOffset += MOV128SZ;
      continue;
    }
    if (Size - MOV64SZ >= 0) {
      Size = Size - MOV64SZ;
      buildCopy(LoadInst, X86::MOV64rm, LdDisp, StoreInst, X86::MOV64mr, StDisp,
                MOV64SZ, LMMOffset, SMMOffset);
      LdDisp += MOV64SZ;
      StDisp += MOV64SZ;
      LMMOffset += MOV64SZ;
      SMMOffset += MOV64SZ;
      continue;
    }
    if (Size - MOV32SZ >= 0) {
      Size = Size - MOV32SZ;
      buildCopy(LoadInst, X86::MOV32rm, LdDisp, StoreInst, X86::MOV32mr, StDisp,
                MOV32SZ, LMMOffset, SMMOffset);
      LdDisp += MOV32SZ;
      StDisp += MOV32SZ;
      LMMOffset += MOV32SZ;
      SMMOffset += MOV32SZ;
      continue;
    }
    if (Size - MOV16SZ >= 0) {
      Size = Size - MOV16SZ;
      buildCopy(LoadInst, X86::MOV16rm, LdDisp, StoreInst, X86::MOV16mr, StDisp,
                MOV16SZ, LMMOffset, SMMOffset);
      LdDisp += MOV16SZ;
      StDisp += MOV16SZ;
      LMMOffset += MOV16SZ;
      SMMOffset += MOV16SZ;
      continue;
    }
    if (Size - MOV8SZ >= 0) {
      Size = Size - MOV8SZ;
      buildCopy(LoadInst, X86::MOV8rm, LdDisp, StoreInst, X86::MOV8mr, StDisp,
                MOV8SZ, LMMOffset, SMMOffset);
      LdDisp += MOV8SZ;
      StDisp += MOV8SZ;
      LMMOffset += MOV8SZ;
      SMMOffset += MOV8SZ;
      continue;
    }
  }
  assert(Size == 0 && "Wrong size division");
}

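// Transfer the kill flags of the original load/store base registers to the
// last of the newly built narrow copies, since the original pair is about to
// be removed.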
static void updateKillStatus(MachineInstr *LoadInst, MachineInstr *StoreInst) {
  MachineOperand &LoadBase = getBaseOperand(LoadInst);
  MachineOperand &StoreBase = getBaseOperand(StoreInst);
  auto StorePrevNonDbgInstr = skipDebugInstructionsBackward(
          std::prev(MachineBasicBlock::instr_iterator(StoreInst)),
          LoadInst->getParent()->instr_begin()).getNodePtr();
  if (LoadBase.isReg()) {
    MachineInstr *LastLoad = LoadInst->getPrevNode();
    // If the original load and store to xmm/ymm were consecutive
    // then the partial copies were also created in
    // a consecutive order to reduce register pressure,
    // and the location of the last load is before the last store.
    if (StorePrevNonDbgInstr == LoadInst)
      LastLoad = LoadInst->getPrevNode()->getPrevNode();
    getBaseOperand(LastLoad).setIsKill(LoadBase.isKill());
  }
  if (StoreBase.isReg()) {
    MachineInstr *StInst = StoreInst;
    if (StorePrevNonDbgInstr == LoadInst)
      StInst = LoadInst;
    getBaseOperand(StInst->getPrevNode()).setIsKill(StoreBase.isKill());
  }
}

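// Conservatively assume the two memory operands may alias when either lacks
// an underlying IR value; otherwise query AliasAnalysis on the overlapping
// ranges.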
bool X86AvoidSFBPass::alias(const MachineMemOperand &Op1,
                            const MachineMemOperand &Op2) const {
  if (!Op1.getValue() || !Op2.getValue())
    return true;

  int64_t MinOffset = std::min(Op1.getOffset(), Op2.getOffset());
  int64_t Overlapa = Op1.getSize() + Op1.getOffset() - MinOffset;
  int64_t Overlapb = Op2.getSize() + Op2.getOffset() - MinOffset;

  AliasResult AAResult =
      AA->alias(MemoryLocation(Op1.getValue(), Overlapa, Op1.getAAInfo()),
                MemoryLocation(Op2.getValue(), Overlapb, Op2.getAAInfo()));
  return AAResult != NoAlias;
}

void X86AvoidSFBPass::findPotentiallylBlockedCopies(MachineFunction &MF) {
  for (auto &MBB : MF)
    for (auto &MI : MBB) {
      if (!isPotentialBlockedMemCpyLd(MI.getOpcode()))
        continue;
      int DefVR = MI.getOperand(0).getReg();
      if (!MRI->hasOneNonDBGUse(DefVR))
        continue;
      for (auto UI = MRI->use_nodbg_begin(DefVR), UE = MRI->use_nodbg_end();
           UI != UE;) {
        MachineOperand &StoreMO = *UI++;
        MachineInstr &StoreMI = *StoreMO.getParent();
        // Skip cases where the memcpy may overlap.
        if (StoreMI.getParent() == MI.getParent() &&
            isPotentialBlockedMemCpyPair(MI.getOpcode(), StoreMI.getOpcode()) &&
            isRelevantAddressingMode(&MI) &&
            isRelevantAddressingMode(&StoreMI)) {
          assert(MI.hasOneMemOperand() &&
                 "Expected one memory operand for load instruction");
          assert(StoreMI.hasOneMemOperand() &&
                 "Expected one memory operand for store instruction");
          if (!alias(**MI.memoperands_begin(), **StoreMI.memoperands_begin()))
            BlockedLoadsStoresPairs.push_back(std::make_pair(&MI, &StoreMI));
        }
      }
    }
}

unsigned X86AvoidSFBPass::getRegSizeInBytes(MachineInstr *LoadInst) {
  auto TRC = TII->getRegClass(TII->get(LoadInst->getOpcode()), 0, TRI,
                              *LoadInst->getParent()->getParent());
  return TRI->getRegSizeInBits(*TRC) / 8;
}

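// Rewrite the blocked copy: for each blocking store, in increasing
// displacement order, copy the bytes preceding it and the bytes it covers as
// separate pieces, then copy the tail that remains after the last blocker.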
void X86AvoidSFBPass::breakBlockedCopies(
    MachineInstr *LoadInst, MachineInstr *StoreInst,
    const DisplacementSizeMap &BlockingStoresDispSizeMap) {
  int64_t LdDispImm = getDispOperand(LoadInst).getImm();
  int64_t StDispImm = getDispOperand(StoreInst).getImm();
  int64_t LMMOffset = 0;
  int64_t SMMOffset = 0;

  int64_t LdDisp1 = LdDispImm;
  int64_t LdDisp2 = 0;
  int64_t StDisp1 = StDispImm;
  int64_t StDisp2 = 0;
  unsigned Size1 = 0;
  unsigned Size2 = 0;
  int64_t LdStDelta = StDispImm - LdDispImm;

  for (auto DispSizePair : BlockingStoresDispSizeMap) {
    LdDisp2 = DispSizePair.first;
    StDisp2 = DispSizePair.first + LdStDelta;
    Size2 = DispSizePair.second;
    // Avoid copying overlapping areas.
    if (LdDisp2 < LdDisp1) {
      int OverlapDelta = LdDisp1 - LdDisp2;
      LdDisp2 += OverlapDelta;
      StDisp2 += OverlapDelta;
      Size2 -= OverlapDelta;
    }
    Size1 = LdDisp2 - LdDisp1;

    // Build a copy for the point until the current blocking store's
    // displacement.
    buildCopies(Size1, LoadInst, LdDisp1, StoreInst, StDisp1, LMMOffset,
                SMMOffset);
    // Build a copy for the current blocking store.
    buildCopies(Size2, LoadInst, LdDisp2, StoreInst, StDisp2, LMMOffset + Size1,
                SMMOffset + Size1);
    LdDisp1 = LdDisp2 + Size2;
    StDisp1 = StDisp2 + Size2;
    LMMOffset += Size1 + Size2;
    SMMOffset += Size1 + Size2;
  }
  unsigned Size3 = (LdDispImm + getRegSizeInBytes(LoadInst)) - LdDisp1;
  buildCopies(Size3, LoadInst, LdDisp1, StoreInst, StDisp1, LMMOffset,
              LMMOffset);
}

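// A store is considered blocking only when it uses the same base (register or
// frame index) as the load and its displacement falls inside the loaded
// range; e.g. a 4-byte store at displacement 8 blocks a 16-byte load at
// displacement 0 because 0 <= 8 <= 0 + (16 - 4).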
static bool hasSameBaseOpValue(MachineInstr *LoadInst,
                               MachineInstr *StoreInst) {
  MachineOperand &LoadBase = getBaseOperand(LoadInst);
  MachineOperand &StoreBase = getBaseOperand(StoreInst);
  if (LoadBase.isReg() != StoreBase.isReg())
    return false;
  if (LoadBase.isReg())
    return LoadBase.getReg() == StoreBase.getReg();
  return LoadBase.getIndex() == StoreBase.getIndex();
}

static bool isBlockingStore(int64_t LoadDispImm, unsigned LoadSize,
                            int64_t StoreDispImm, unsigned StoreSize) {
  return ((StoreDispImm >= LoadDispImm) &&
          (StoreDispImm <= LoadDispImm + (LoadSize - StoreSize)));
}

// Keep track of all stores blocking a load
static void
updateBlockingStoresDispSizeMap(DisplacementSizeMap &BlockingStoresDispSizeMap,
                                int64_t DispImm, unsigned Size) {
  if (BlockingStoresDispSizeMap.count(DispImm)) {
    // Choose the smallest blocking store starting at this displacement.
    if (BlockingStoresDispSizeMap[DispImm] > Size)
      BlockingStoresDispSizeMap[DispImm] = Size;
  } else
    BlockingStoresDispSizeMap[DispImm] = Size;
}

// Remove blocking stores contained in each other.
static void
removeRedundantBlockingStores(DisplacementSizeMap &BlockingStoresDispSizeMap) {
  if (BlockingStoresDispSizeMap.size() <= 1)
    return;

  SmallVector<std::pair<int64_t, unsigned>, 0> DispSizeStack;
  for (auto DispSizePair : BlockingStoresDispSizeMap) {
    int64_t CurrDisp = DispSizePair.first;
    unsigned CurrSize = DispSizePair.second;
    while (DispSizeStack.size()) {
      int64_t PrevDisp = DispSizeStack.back().first;
      unsigned PrevSize = DispSizeStack.back().second;
      if (CurrDisp + CurrSize > PrevDisp + PrevSize)
        break;
      DispSizeStack.pop_back();
    }
    DispSizeStack.push_back(DispSizePair);
  }
  BlockingStoresDispSizeMap.clear();
  for (auto Disp : DispSizeStack)
    BlockingStoresDispSizeMap.insert(Disp);
}

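// Main driver: runs only on 64-bit targets, collects load/store pairs that
// look like a lowered memcpy, gathers the smaller stores blocking each load,
// and replaces the blocked pair with a sequence of narrower copies.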
bool X86AvoidSFBPass::runOnMachineFunction(MachineFunction &MF) {
  bool Changed = false;

  if (DisableX86AvoidStoreForwardBlocks || skipFunction(MF.getFunction()) ||
      !MF.getSubtarget<X86Subtarget>().is64Bit())
    return false;

  MRI = &MF.getRegInfo();
  assert(MRI->isSSA() && "Expected MIR to be in SSA form");
  TII = MF.getSubtarget<X86Subtarget>().getInstrInfo();
  TRI = MF.getSubtarget<X86Subtarget>().getRegisterInfo();
  AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
  LLVM_DEBUG(dbgs() << "Start X86AvoidStoreForwardBlocks\n";);
  // Look for a load then a store to XMM/YMM which look like a memcpy
  findPotentiallylBlockedCopies(MF);

  for (auto LoadStoreInstPair : BlockedLoadsStoresPairs) {
    MachineInstr *LoadInst = LoadStoreInstPair.first;
    int64_t LdDispImm = getDispOperand(LoadInst).getImm();
    DisplacementSizeMap BlockingStoresDispSizeMap;

    SmallVector<MachineInstr *, 2> PotentialBlockers =
        findPotentialBlockers(LoadInst);
    for (auto PBInst : PotentialBlockers) {
      if (!isPotentialBlockingStoreInst(PBInst->getOpcode(),
                                        LoadInst->getOpcode()) ||
          !isRelevantAddressingMode(PBInst))
        continue;
      int64_t PBstDispImm = getDispOperand(PBInst).getImm();
      assert(PBInst->hasOneMemOperand() && "Expected One Memory Operand");
      unsigned PBstSize = (*PBInst->memoperands_begin())->getSize();
      // This check doesn't cover all cases, but it will suffice for now.
      // TODO: take branch probability into consideration, if the blocking
      // store is in an unreached block, breaking the memcopy could lose
      // performance.
      if (hasSameBaseOpValue(LoadInst, PBInst) &&
          isBlockingStore(LdDispImm, getRegSizeInBytes(LoadInst), PBstDispImm,
                          PBstSize))
        updateBlockingStoresDispSizeMap(BlockingStoresDispSizeMap, PBstDispImm,
                                        PBstSize);
    }

    if (BlockingStoresDispSizeMap.empty())
      continue;

    // We found a store forward block, break the memcpy's load and store
    // into smaller copies such that each smaller store that was causing
    // a store block would now be copied separately.
    MachineInstr *StoreInst = LoadStoreInstPair.second;
    LLVM_DEBUG(dbgs() << "Blocked load and store instructions: \n");
    LLVM_DEBUG(LoadInst->dump());
    LLVM_DEBUG(StoreInst->dump());
    LLVM_DEBUG(dbgs() << "Replaced with:\n");
    removeRedundantBlockingStores(BlockingStoresDispSizeMap);
    breakBlockedCopies(LoadInst, StoreInst, BlockingStoresDispSizeMap);
    updateKillStatus(LoadInst, StoreInst);
    ForRemoval.push_back(LoadInst);
    ForRemoval.push_back(StoreInst);
    Changed = true;
  }
  for (auto RemovedInst : ForRemoval) {
    RemovedInst->eraseFromParent();
  }
  ForRemoval.clear();
  BlockedLoadsStoresPairs.clear();
  LLVM_DEBUG(dbgs() << "End X86AvoidStoreForwardBlocks\n";);

  return Changed;
}