Coverage Report

Created: 2017-10-03 07:32

/Users/buildslave/jenkins/sharedspace/clang-stage2-coverage-R@2/llvm/lib/Target/AArch64/AArch64VectorByElementOpt.cpp
Line
Count
Source (jump to first uncovered line)
1
//=- AArch64VectorByElementOpt.cpp - AArch64 vector by element inst opt pass =//
2
//
3
//                     The LLVM Compiler Infrastructure
4
//
5
// This file is distributed under the University of Illinois Open Source
6
// License. See LICENSE.TXT for details.
7
//
8
//===----------------------------------------------------------------------===//
9
//
10
// This file contains a pass that performs optimization for vector by element
11
// SIMD instructions.
12
//
13
// Certain SIMD instructions with vector element operand are not efficient.
14
// Rewrite them into SIMD instructions with vector operands. This rewrite
15
// is driven by the latency of the instructions.
16
//
17
// Example:
18
//    fmla v0.4s, v1.4s, v2.s[1]
19
//    is rewritten into
20
//    dup v3.4s, v2.s[1]
21
//    fmla v0.4s, v1.4s, v3.4s
22
//
23
//===----------------------------------------------------------------------===//
24
25
#include "AArch64InstrInfo.h"
26
#include "llvm/ADT/SmallVector.h"
27
#include "llvm/ADT/Statistic.h"
28
#include "llvm/ADT/StringRef.h"
29
#include "llvm/CodeGen/MachineBasicBlock.h"
30
#include "llvm/CodeGen/MachineFunction.h"
31
#include "llvm/CodeGen/MachineFunctionPass.h"
32
#include "llvm/CodeGen/MachineInstr.h"
33
#include "llvm/CodeGen/MachineInstrBuilder.h"
34
#include "llvm/CodeGen/MachineOperand.h"
35
#include "llvm/CodeGen/MachineRegisterInfo.h"
36
#include "llvm/CodeGen/TargetSchedule.h"
37
#include "llvm/MC/MCInstrDesc.h"
38
#include "llvm/MC/MCSchedule.h"
39
#include "llvm/Pass.h"
40
#include "llvm/Target/TargetInstrInfo.h"
41
#include "llvm/Target/TargetSubtargetInfo.h"
42
#include <map>
43
44
using namespace llvm;
45
46
#define DEBUG_TYPE "aarch64-vectorbyelement-opt"

// Counter reported under -stats: number of vector-by-element SIMD
// instructions this pass rewrote into DUP + full-vector form.
STATISTIC(NumModifiedInstr,
          "Number of vector by element instructions modified");

// Human-readable pass name, shared by getPassName() and INITIALIZE_PASS.
#define AARCH64_VECTOR_BY_ELEMENT_OPT_NAME                                     \
  "AArch64 vector by element instruction optimization pass"
53
54
namespace {

/// Machine-function pass that rewrites inefficient SIMD instructions with a
/// vector-element operand (e.g. indexed FMLA) into a DUP of the lane followed
/// by the full-vector form of the instruction, when the subtarget's schedule
/// model says the two-instruction sequence has lower latency. See the file
/// header comment for an example.
struct AArch64VectorByElementOpt : public MachineFunctionPass {
  static char ID;

  // Per-function state; initialized in runOnMachineFunction.
  const TargetInstrInfo *TII;
  MachineRegisterInfo *MRI;
  TargetSchedModel SchedModel;

  AArch64VectorByElementOpt() : MachineFunctionPass(ID) {
    initializeAArch64VectorByElementOptPass(*PassRegistry::getPassRegistry());
  }

  /// Based only on latency of instructions, determine if it is cost efficient
  /// to replace the instruction InstDesc by the two instructions InstDescRep1
  /// and InstDescRep2.
  /// Return true if replacement is recommended.
  bool
  shouldReplaceInstruction(MachineFunction *MF, const MCInstrDesc *InstDesc,
                           const MCInstrDesc *InstDescRep1,
                           const MCInstrDesc *InstDescRep2,
                           std::map<unsigned, bool> &VecInstElemTable) const;

  /// Determine if we need to exit the vector by element instruction
  /// optimization pass early. This makes sure that Targets with no need
  /// for this optimization do not spend any compile time on this pass.
  /// This check is done by comparing the latency of an indexed FMLA
  /// instruction to the latency of the DUP + the latency of a vector
  /// FMLA instruction. We do not check on other related instructions such
  /// as FMLS as we assume that if the situation shows up for one
  /// instruction, then it is likely to show up for the related ones.
  /// Return true if early exit of the pass is recommended.
  bool earlyExitVectElement(MachineFunction *MF);

  /// Check whether an equivalent DUP instruction has already been
  /// created or not.
  /// Return true when the dup instruction already exists. In this case,
  /// DestReg will point to the destination of the already created DUP.
  bool reuseDUP(MachineInstr &MI, unsigned DupOpcode, unsigned SrcReg,
                unsigned LaneNumber, unsigned *DestReg) const;

  /// Certain SIMD instructions with vector element operand are not efficient.
  /// Rewrite them into SIMD instructions with vector operands. This rewrite
  /// is driven by the latency of the instructions.
  /// Return true if the SIMD instruction is modified.
  bool optimizeVectElement(MachineInstr &MI,
                           std::map<unsigned, bool> *VecInstElemTable) const;

  bool runOnMachineFunction(MachineFunction &Fn) override;

  StringRef getPassName() const override {
    return AARCH64_VECTOR_BY_ELEMENT_OPT_NAME;
  }
};

char AArch64VectorByElementOpt::ID = 0;

} // end anonymous namespace
112
113
// Register the pass with the LLVM PassRegistry under the command-line
// name "aarch64-vectorbyelement-opt" (not a CFG-only pass, not analysis).
INITIALIZE_PASS(AArch64VectorByElementOpt, "aarch64-vectorbyelement-opt",
                AARCH64_VECTOR_BY_ELEMENT_OPT_NAME, false, false)
115
116
/// Based only on latency of instructions, determine if it is cost efficient
117
/// to replace the instruction InstDesc by the two instructions InstDescRep1
118
/// and InstDescRep2. Note that it is assumed in this fuction that an
119
/// instruction of type InstDesc is always replaced by the same two
120
/// instructions as results are cached here.
121
/// Return true if replacement is recommended.
122
bool AArch64VectorByElementOpt::shouldReplaceInstruction(
123
    MachineFunction *MF, const MCInstrDesc *InstDesc,
124
    const MCInstrDesc *InstDescRep1, const MCInstrDesc *InstDescRep2,
125
446k
    std::map<unsigned, bool> &VecInstElemTable) const {
126
446k
  // Check if replacment decision is alredy available in the cached table.
127
446k
  // if so, return it.
128
446k
  if (!VecInstElemTable.empty() &&
129
2
      VecInstElemTable.find(InstDesc->getOpcode()) != VecInstElemTable.end())
130
0
    return VecInstElemTable[InstDesc->getOpcode()];
131
446k
132
446k
  unsigned SCIdx = InstDesc->getSchedClass();
133
446k
  unsigned SCIdxRep1 = InstDescRep1->getSchedClass();
134
446k
  unsigned SCIdxRep2 = InstDescRep2->getSchedClass();
135
446k
  const MCSchedClassDesc *SCDesc =
136
446k
      SchedModel.getMCSchedModel()->getSchedClassDesc(SCIdx);
137
446k
  const MCSchedClassDesc *SCDescRep1 =
138
446k
      SchedModel.getMCSchedModel()->getSchedClassDesc(SCIdxRep1);
139
446k
  const MCSchedClassDesc *SCDescRep2 =
140
446k
      SchedModel.getMCSchedModel()->getSchedClassDesc(SCIdxRep2);
141
446k
142
446k
  // If a subtarget does not define resources for any of the instructions
143
446k
  // of interest, then return false for no replacement.
144
446k
  if (
!SCDesc->isValid() || 446k
SCDesc->isVariant()446k
||
!SCDescRep1->isValid()446k
||
145
446k
      
SCDescRep1->isVariant()446k
||
!SCDescRep2->isValid()446k
||
146
446k
      
SCDescRep2->isVariant()446k
) {
147
0
    VecInstElemTable[InstDesc->getOpcode()] = false;
148
0
    return false;
149
0
  }
150
446k
151
446k
  
if (446k
SchedModel.computeInstrLatency(InstDesc->getOpcode()) >
152
446k
      SchedModel.computeInstrLatency(InstDescRep1->getOpcode()) +
153
446k
          SchedModel.computeInstrLatency(InstDescRep2->getOpcode())) {
154
496
    VecInstElemTable[InstDesc->getOpcode()] = true;
155
496
    return true;
156
496
  }
157
445k
  VecInstElemTable[InstDesc->getOpcode()] = false;
158
445k
  return false;
159
445k
}
160
161
/// Determine if we need to exit the vector by element instruction
162
/// optimization pass early. This makes sure that Targets with no need
163
/// for this optimization do not spent any compile time on this pass.
164
/// This check is done by comparing the latency of an indexed FMLA
165
/// instruction to the latency of the DUP + the latency of a vector
166
/// FMLA instruction. We do not check on other related instructions such
167
/// as FMLS as we assume that if the situation shows up for one
168
/// instruction, then it is likely to show up for the related ones.
169
/// Return true if early exit of the pass is recommended.
170
446k
bool AArch64VectorByElementOpt::earlyExitVectElement(MachineFunction *MF) {
171
446k
  std::map<unsigned, bool> VecInstElemTable;
172
446k
  const MCInstrDesc *IndexMulMCID = &TII->get(AArch64::FMLAv4i32_indexed);
173
446k
  const MCInstrDesc *DupMCID = &TII->get(AArch64::DUPv4i32lane);
174
446k
  const MCInstrDesc *MulMCID = &TII->get(AArch64::FMULv4f32);
175
446k
176
446k
  if (!shouldReplaceInstruction(MF, IndexMulMCID, DupMCID, MulMCID,
177
446k
                                VecInstElemTable))
178
445k
    return true;
179
447
  return false;
180
447
}
181
182
/// Check whether an equivalent DUP instruction has already been
183
/// created or not.
184
/// Return true when the dup instruction already exists. In this case,
185
/// DestReg will point to the destination of the already created DUP.
186
bool AArch64VectorByElementOpt::reuseDUP(MachineInstr &MI, unsigned DupOpcode,
187
                                         unsigned SrcReg, unsigned LaneNumber,
188
49
                                         unsigned *DestReg) const {
189
49
  for (MachineBasicBlock::iterator MII = MI, MIE = MI.getParent()->begin();
190
215
       
MII != MIE215
;) {
191
167
    MII--;
192
167
    MachineInstr *CurrentMI = &*MII;
193
167
194
167
    if (CurrentMI->getOpcode() == DupOpcode &&
195
2
        CurrentMI->getNumOperands() == 3 &&
196
2
        CurrentMI->getOperand(1).getReg() == SrcReg &&
197
167
        
CurrentMI->getOperand(2).getImm() == LaneNumber2
) {
198
1
      *DestReg = CurrentMI->getOperand(0).getReg();
199
1
      return true;
200
1
    }
201
167
  }
202
49
203
48
  return false;
204
49
}
205
206
/// Certain SIMD instructions with vector element operand are not efficient.
/// Rewrite them into SIMD instructions with vector operands. This rewrite
/// is driven by the latency of the instructions.
/// The instructions of concern are for the time being fmla, fmls, fmul,
/// and fmulx and hence they are hardcoded.
///
/// Example:
///    fmla v0.4s, v1.4s, v2.s[1]
///    is rewritten into
///    dup v3.4s, v2.s[1]           // dup not necessary if redundant
///    fmla v0.4s, v1.4s, v3.4s
/// Return true if the SIMD instruction is modified.
bool AArch64VectorByElementOpt::optimizeVectElement(
    MachineInstr &MI, std::map<unsigned, bool> *VecInstElemTable) const {
  const MCInstrDesc *MulMCID, *DupMCID;
  // Default register class for the DUP destination; overridden to FPR64
  // for the 64-bit (2x32) forms below.
  const TargetRegisterClass *RC = &AArch64::FPR128RegClass;

  // Map each indexed opcode of interest to its lane-DUP opcode and its
  // full-vector replacement opcode. Anything else is left untouched.
  switch (MI.getOpcode()) {
  default:
    return false;

  // 4X32 instructions
  case AArch64::FMLAv4i32_indexed:
    DupMCID = &TII->get(AArch64::DUPv4i32lane);
    MulMCID = &TII->get(AArch64::FMLAv4f32);
    break;
  case AArch64::FMLSv4i32_indexed:
    DupMCID = &TII->get(AArch64::DUPv4i32lane);
    MulMCID = &TII->get(AArch64::FMLSv4f32);
    break;
  case AArch64::FMULXv4i32_indexed:
    DupMCID = &TII->get(AArch64::DUPv4i32lane);
    MulMCID = &TII->get(AArch64::FMULXv4f32);
    break;
  case AArch64::FMULv4i32_indexed:
    DupMCID = &TII->get(AArch64::DUPv4i32lane);
    MulMCID = &TII->get(AArch64::FMULv4f32);
    break;

  // 2X64 instructions
  case AArch64::FMLAv2i64_indexed:
    DupMCID = &TII->get(AArch64::DUPv2i64lane);
    MulMCID = &TII->get(AArch64::FMLAv2f64);
    break;
  case AArch64::FMLSv2i64_indexed:
    DupMCID = &TII->get(AArch64::DUPv2i64lane);
    MulMCID = &TII->get(AArch64::FMLSv2f64);
    break;
  case AArch64::FMULXv2i64_indexed:
    DupMCID = &TII->get(AArch64::DUPv2i64lane);
    MulMCID = &TII->get(AArch64::FMULXv2f64);
    break;
  case AArch64::FMULv2i64_indexed:
    DupMCID = &TII->get(AArch64::DUPv2i64lane);
    MulMCID = &TII->get(AArch64::FMULv2f64);
    break;

  // 2X32 instructions
  case AArch64::FMLAv2i32_indexed:
    RC = &AArch64::FPR64RegClass;
    DupMCID = &TII->get(AArch64::DUPv2i32lane);
    MulMCID = &TII->get(AArch64::FMLAv2f32);
    break;
  case AArch64::FMLSv2i32_indexed:
    RC = &AArch64::FPR64RegClass;
    DupMCID = &TII->get(AArch64::DUPv2i32lane);
    MulMCID = &TII->get(AArch64::FMLSv2f32);
    break;
  case AArch64::FMULXv2i32_indexed:
    RC = &AArch64::FPR64RegClass;
    DupMCID = &TII->get(AArch64::DUPv2i32lane);
    MulMCID = &TII->get(AArch64::FMULXv2f32);
    break;
  case AArch64::FMULv2i32_indexed:
    RC = &AArch64::FPR64RegClass;
    DupMCID = &TII->get(AArch64::DUPv2i32lane);
    MulMCID = &TII->get(AArch64::FMULv2f32);
    break;
  }

  // Only rewrite when the schedule model says the two-instruction form is
  // faster (result is cached per opcode in VecInstElemTable).
  if (!shouldReplaceInstruction(MI.getParent()->getParent(),
                                &TII->get(MI.getOpcode()), DupMCID, MulMCID,
                                *VecInstElemTable))
    return false;

  const DebugLoc &DL = MI.getDebugLoc();
  MachineBasicBlock &MBB = *MI.getParent();
  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();

  // get the operands of the current SIMD arithmetic instruction.
  unsigned MulDest = MI.getOperand(0).getReg();
  unsigned SrcReg0 = MI.getOperand(1).getReg();
  unsigned Src0IsKill = getKillRegState(MI.getOperand(1).isKill());
  unsigned SrcReg1 = MI.getOperand(2).getReg();
  unsigned Src1IsKill = getKillRegState(MI.getOperand(2).isKill());
  unsigned DupDest;

  // Instructions of interest have either 4 or 5 operands.
  // 5 operands: accumulating forms (fmla/fmls) — dest, accumulator,
  // multiplicand, indexed register, lane number.
  if (MI.getNumOperands() == 5) {
    unsigned SrcReg2 = MI.getOperand(3).getReg();
    unsigned Src2IsKill = getKillRegState(MI.getOperand(3).isKill());
    unsigned LaneNumber = MI.getOperand(4).getImm();

    // Create a new DUP instruction. Note that if an equivalent DUP instruction
    // has already been created before, then use that one instead of creating
    // a new one.
    if (!reuseDUP(MI, DupMCID->getOpcode(), SrcReg2, LaneNumber, &DupDest)) {
      DupDest = MRI.createVirtualRegister(RC);
      BuildMI(MBB, MI, DL, *DupMCID, DupDest)
          .addReg(SrcReg2, Src2IsKill)
          .addImm(LaneNumber);
    }
    BuildMI(MBB, MI, DL, *MulMCID, MulDest)
        .addReg(SrcReg0, Src0IsKill)
        .addReg(SrcReg1, Src1IsKill)
        .addReg(DupDest, Src2IsKill);
  } else if (MI.getNumOperands() == 4) {
    // 4 operands: non-accumulating forms (fmul/fmulx) — dest, multiplicand,
    // indexed register, lane number.
    unsigned LaneNumber = MI.getOperand(3).getImm();
    if (!reuseDUP(MI, DupMCID->getOpcode(), SrcReg1, LaneNumber, &DupDest)) {
      DupDest = MRI.createVirtualRegister(RC);
      BuildMI(MBB, MI, DL, *DupMCID, DupDest)
          .addReg(SrcReg1, Src1IsKill)
          .addImm(LaneNumber);
    }
    BuildMI(MBB, MI, DL, *MulMCID, MulDest)
        .addReg(SrcReg0, Src0IsKill)
        .addReg(DupDest, Src1IsKill);
  } else {
    return false;
  }

  // The original MI is left in place here; the caller collects and erases
  // modified instructions after the walk over the function.
  ++NumModifiedInstr;
  return true;
}
340
341
456k
/// Pass entry point: set up the per-function state (instruction info,
/// register info, schedule model), bail out cheaply when the subtarget has
/// no schedule model or never profits from the rewrite, then walk every
/// instruction and replace profitable vector-by-element ops. Replaced
/// originals are collected and erased after the walk.
bool AArch64VectorByElementOpt::runOnMachineFunction(MachineFunction &MF) {
  // Honor optnone / -opt-bisect-limit.
  if (skipFunction(*MF.getFunction()))
    return false;

  const TargetSubtargetInfo &Subtarget = MF.getSubtarget();
  TII = Subtarget.getInstrInfo();
  MRI = &MF.getRegInfo();
  const AArch64InstrInfo *AAII =
      static_cast<const AArch64InstrInfo *>(Subtarget.getInstrInfo());
  if (!AAII)
    return false;
  SchedModel.init(Subtarget.getSchedModel(), &Subtarget, AAII);
  if (!SchedModel.hasInstrSchedModel())
    return false;

  // A simple check to exit this pass early for targets that do not need it.
  if (earlyExitVectElement(&MF))
    return false;

  bool Modified = false;
  std::map<unsigned, bool> ReplacementCache;
  SmallVector<MachineInstr *, 8> ReplacedInstrs;

  for (MachineBasicBlock &Block : MF) {
    for (MachineBasicBlock::iterator I = Block.begin(), E = Block.end();
         I != E;) {
      // Advance before rewriting so insertions in front of Inst are safe.
      MachineInstr &Inst = *I++;
      if (optimizeVectElement(Inst, &ReplacementCache)) {
        // Inst has been replaced by new instructions; queue the original
        // for removal once the walk is done.
        ReplacedInstrs.push_back(&Inst);
        Modified = true;
      }
    }
  }

  for (MachineInstr *Dead : ReplacedInstrs)
    Dead->eraseFromParent();

  return Modified;
}
383
384
/// createAArch64VectorByElementOptPass - returns a new instance of the
/// vector by element optimization pass; the caller (pass manager) takes
/// ownership of the returned pass.
FunctionPass *llvm::createAArch64VectorByElementOptPass() {
  return new AArch64VectorByElementOpt();
}