Coverage Report

Created: 2019-07-24 05:18

/Users/buildslave/jenkins/workspace/clang-stage2-coverage-R/llvm/lib/Target/X86/X86CmovConversion.cpp
Line
Count
Source (jump to first uncovered line)
1
//====- X86CmovConversion.cpp - Convert Cmov to Branch --------------------===//
2
//
3
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4
// See https://llvm.org/LICENSE.txt for license information.
5
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6
//
7
//===----------------------------------------------------------------------===//
8
//
9
/// \file
10
/// This file implements a pass that converts X86 cmov instructions into
11
/// branches when profitable. This pass is conservative. It transforms if and
12
/// only if it can guarantee a gain with high confidence.
13
///
14
/// Thus, the optimization applies under the following conditions:
15
///   1. Consider as candidates only CMOVs in innermost loops (assume that
16
///      most hotspots are represented by these loops).
17
///   2. Given a group of CMOV instructions that are using the same EFLAGS def
18
///      instruction:
19
///      a. Consider them as candidates only if all have the same code condition
20
///         or the opposite one to prevent generating more than one conditional
21
///         jump per EFLAGS def instruction.
22
///      b. Consider them as candidates only if all are profitable to be
23
///         converted (assume that one bad conversion may cause a degradation).
24
///   3. Apply conversion only for loops that are found profitable and only for
25
///      CMOV candidates that were found profitable.
26
///      a. A loop is considered profitable only if conversion will reduce its
27
///         depth cost by some threshold.
28
///      b. CMOV is considered profitable if the cost of its condition is higher
29
///         than the average cost of its true-value and false-value by 25% of
30
///         branch-misprediction-penalty. This assures no degradation even with
31
///         25% branch misprediction.
32
///
33
/// Note: This pass is assumed to run on SSA machine code.
34
//
35
//===----------------------------------------------------------------------===//
36
//
37
//  External interfaces:
38
//      FunctionPass *llvm::createX86CmovConverterPass();
39
//      bool X86CmovConverterPass::runOnMachineFunction(MachineFunction &MF);
40
//
41
//===----------------------------------------------------------------------===//
42
43
#include "X86.h"
44
#include "X86InstrInfo.h"
45
#include "llvm/ADT/ArrayRef.h"
46
#include "llvm/ADT/DenseMap.h"
47
#include "llvm/ADT/STLExtras.h"
48
#include "llvm/ADT/SmallPtrSet.h"
49
#include "llvm/ADT/SmallVector.h"
50
#include "llvm/ADT/Statistic.h"
51
#include "llvm/CodeGen/MachineBasicBlock.h"
52
#include "llvm/CodeGen/MachineFunction.h"
53
#include "llvm/CodeGen/MachineFunctionPass.h"
54
#include "llvm/CodeGen/MachineInstr.h"
55
#include "llvm/CodeGen/MachineInstrBuilder.h"
56
#include "llvm/CodeGen/MachineLoopInfo.h"
57
#include "llvm/CodeGen/MachineOperand.h"
58
#include "llvm/CodeGen/MachineRegisterInfo.h"
59
#include "llvm/CodeGen/TargetInstrInfo.h"
60
#include "llvm/CodeGen/TargetRegisterInfo.h"
61
#include "llvm/CodeGen/TargetSchedule.h"
62
#include "llvm/CodeGen/TargetSubtargetInfo.h"
63
#include "llvm/IR/DebugLoc.h"
64
#include "llvm/MC/MCSchedule.h"
65
#include "llvm/Pass.h"
66
#include "llvm/Support/CommandLine.h"
67
#include "llvm/Support/Debug.h"
68
#include "llvm/Support/raw_ostream.h"
69
#include <algorithm>
70
#include <cassert>
71
#include <iterator>
72
#include <utility>
73
74
using namespace llvm;
75
76
#define DEBUG_TYPE "x86-cmov-conversion"
77
78
STATISTIC(NumOfSkippedCmovGroups, "Number of unsupported CMOV-groups");
79
STATISTIC(NumOfCmovGroupCandidate, "Number of CMOV-group candidates");
80
STATISTIC(NumOfLoopCandidate, "Number of CMOV-conversion profitable loops");
81
STATISTIC(NumOfOptimizedCmovGroups, "Number of optimized CMOV-groups");
82
83
// This internal switch can be used to turn off the cmov/branch optimization.
84
static cl::opt<bool>
85
    EnableCmovConverter("x86-cmov-converter",
86
                        cl::desc("Enable the X86 cmov-to-branch optimization."),
87
                        cl::init(true), cl::Hidden);
88
89
static cl::opt<unsigned>
90
    GainCycleThreshold("x86-cmov-converter-threshold",
91
                       cl::desc("Minimum gain per loop (in cycles) threshold."),
92
                       cl::init(4), cl::Hidden);
93
94
static cl::opt<bool> ForceMemOperand(
95
    "x86-cmov-converter-force-mem-operand",
96
    cl::desc("Convert cmovs to branches whenever they have memory operands."),
97
    cl::init(true), cl::Hidden);
98
99
namespace {
100
101
/// Converts X86 cmov instructions into branches when profitable.
102
class X86CmovConverterPass : public MachineFunctionPass {
103
public:
104
11.3k
  X86CmovConverterPass() : MachineFunctionPass(ID) { }
105
106
146k
  StringRef getPassName() const override { return "X86 cmov Conversion"; }
107
  bool runOnMachineFunction(MachineFunction &MF) override;
108
  void getAnalysisUsage(AnalysisUsage &AU) const override;
109
110
  /// Pass identification, replacement for typeid.
111
  static char ID;
112
113
private:
114
  MachineRegisterInfo *MRI;
115
  const TargetInstrInfo *TII;
116
  const TargetRegisterInfo *TRI;
117
  TargetSchedModel TSchedModel;
118
119
  /// List of consecutive CMOV instructions.
120
  using CmovGroup = SmallVector<MachineInstr *, 2>;
121
  using CmovGroups = SmallVector<CmovGroup, 2>;
122
123
  /// Collect all CMOV-group-candidates in \p CurrLoop and update \p
124
  /// CmovInstGroups accordingly.
125
  ///
126
  /// \param Blocks List of blocks to process.
127
  /// \param CmovInstGroups List of consecutive CMOV instructions in CurrLoop.
128
  /// \returns true iff it found any CMOV-group-candidate.
129
  bool collectCmovCandidates(ArrayRef<MachineBasicBlock *> Blocks,
130
                             CmovGroups &CmovInstGroups,
131
                             bool IncludeLoads = false);
132
133
  /// Check if it is profitable to transform each CMOV-group-candidates into
134
  /// branch. Remove all groups that are not profitable from \p CmovInstGroups.
135
  ///
136
  /// \param Blocks List of blocks to process.
137
  /// \param CmovInstGroups List of consecutive CMOV instructions in CurrLoop.
138
  /// \returns true iff any CMOV-group-candidate remain.
139
  bool checkForProfitableCmovCandidates(ArrayRef<MachineBasicBlock *> Blocks,
140
                                        CmovGroups &CmovInstGroups);
141
142
  /// Convert the given list of consecutive CMOV instructions into a branch.
143
  ///
144
  /// \param Group Consecutive CMOV instructions to be converted into branch.
145
  void convertCmovInstsToBranches(SmallVectorImpl<MachineInstr *> &Group) const;
146
};
147
148
} // end anonymous namespace
149
150
// Definition of the static pass identifier declared in X86CmovConverterPass;
// the pass constructor hands ID to the MachineFunctionPass base class.
char X86CmovConverterPass::ID = 0;
151
152
11.3k
/// Registers the analyses this pass uses: whatever the MachineFunctionPass
/// base registers, plus MachineLoopInfo as a required analysis.
void X86CmovConverterPass::getAnalysisUsage(AnalysisUsage &AU) const {
  // Base-class registrations come first.
  MachineFunctionPass::getAnalysisUsage(AU);

  // runOnMachineFunction() fetches MachineLoopInfo to walk the loop nest.
  AU.addRequired<MachineLoopInfo>();
}
156
157
135k
/// Runs the cmov-to-branch conversion on \p MF.
///
/// The work is done in two phases:
///   1. If ForceMemOperand is set, every CMOV-group-candidate in the function
///      that contains at least one CMOV that may load is converted.
///   2. For every innermost loop, CMOV-group-candidates are collected, checked
///      for profitability and, if any remain, converted.
///
/// \returns true iff at least one CMOV group was converted.
bool X86CmovConverterPass::runOnMachineFunction(MachineFunction &MF) {
  // Nothing to do when the function is skipped or the pass is switched off.
  if (skipFunction(MF.getFunction()) || !EnableCmovConverter)
    return false;

  LLVM_DEBUG(dbgs() << "********** " << getPassName() << " : " << MF.getName()
                    << "**********\n");

  // Cache the per-function objects the helper methods rely on.
  MachineLoopInfo &MLI = getAnalysis<MachineLoopInfo>();
  const TargetSubtargetInfo &STI = MF.getSubtarget();
  MRI = &MF.getRegInfo();
  TII = STI.getInstrInfo();
  TRI = STI.getRegisterInfo();
  TSchedModel.init(&STI);

  bool Changed = false;

  // Phase 1: memory-operand CMOVs.
  //
  // Before dealing with the more subtle register-register CMOVs inside
  // potentially hot loops, quickly get rid of all CMOVs with a memory operand.
  // Such a CMOV risks a stall waiting for the load to complete, something that
  // speculative execution behind a branch is better suited to handle on modern
  // x86 chips.
  if (ForceMemOperand) {
    SmallVector<MachineBasicBlock *, 4> AllBlocks;
    for (MachineBasicBlock &MBB : MF)
      AllBlocks.push_back(&MBB);

    CmovGroups MemCandidates;
    if (collectCmovCandidates(AllBlocks, MemCandidates,
                              /*IncludeLoads*/ true)) {
      for (CmovGroup &Group : MemCandidates) {
        // Only groups with at least one memory-operand CMOV are rewritten
        // here. On x86, a CMOV will dramatically amplify any memory latency by
        // blocking speculative execution, so these are always converted.
        const bool HasLoadingCmov = llvm::any_of(
            Group, [](MachineInstr *I) { return I->mayLoad(); });
        if (HasLoadingCmov) {
          Changed = true;
          convertCmovInstsToBranches(Group);
        }
      }
    }
  }

  //===--------------------------------------------------------------------===//
  // Phase 2: Register-operand Conversion Algorithm
  // ---------
  //   For each innermost loop
  //     collectCmovCandidates() {
  //       Find all CMOV-group-candidates.
  //     }
  //
  //     checkForProfitableCmovCandidates() {
  //       * Calculate both loop-depth and optimized-loop-depth.
  //       * Use these depths to check for loop transformation profitability.
  //       * Check for CMOV-group-candidate transformation profitability.
  //     }
  //
  //     For each profitable CMOV-group-candidate
  //       convertCmovInstsToBranches() {
  //           * Create FalseBB, SinkBB, Conditional branch to SinkBB.
  //           * Replace each CMOV instruction with a PHI instruction in SinkBB.
  //       }
  //
  // Note: For more details, see each function description.
  //===--------------------------------------------------------------------===//

  // Flatten the loop nest in pre-order. The list grows while it is being
  // walked, so it is indexed and its size is re-read on every step.
  SmallVector<MachineLoop *, 4> Loops(MLI.begin(), MLI.end());
  for (size_t Idx = 0; Idx != Loops.size(); ++Idx) {
    const auto &Children = Loops[Idx]->getSubLoops();
    Loops.append(Children.begin(), Children.end());
  }

  for (MachineLoop *CurrLoop : Loops) {
    // Only innermost loops are optimized.
    if (!CurrLoop->getSubLoops().empty())
      continue;

    // Groups of consecutive CMOV instructions found in this loop.
    CmovGroups LoopCandidates;
    const bool HasProfitableGroups =
        collectCmovCandidates(CurrLoop->getBlocks(), LoopCandidates) &&
        checkForProfitableCmovCandidates(CurrLoop->getBlocks(),
                                         LoopCandidates);
    if (!HasProfitableGroups)
      continue;

    Changed = true;
    for (CmovGroup &Group : LoopCandidates)
      convertCmovInstsToBranches(Group);
  }

  return Changed;
}
252
253
bool X86CmovConverterPass::collectCmovCandidates(
254
    ArrayRef<MachineBasicBlock *> Blocks, CmovGroups &CmovInstGroups,
255
145k
    bool IncludeLoads) {
256
145k
  //===--------------------------------------------------------------------===//
257
145k
  // Collect all CMOV-group-candidates and add them into CmovInstGroups.
258
145k
  //
259
145k
  // CMOV-group:
260
145k
  //   CMOV instructions, in same MBB, that uses same EFLAGS def instruction.
261
145k
  //
262
145k
  // CMOV-group-candidate:
263
145k
  //   CMOV-group where all the CMOV instructions are
264
145k
  //     1. consecutive.
265
145k
  //     2. have same condition code or opposite one.
266
145k
  //     3. have only operand registers (X86::CMOVrr).
267
145k
  //===--------------------------------------------------------------------===//
268
145k
  // List of possible improvement (TODO's):
269
145k
  // --------------------------------------
270
145k
  //   TODO: Add support for X86::CMOVrm instructions.
271
145k
  //   TODO: Add support for X86::SETcc instructions.
272
145k
  //   TODO: Add support for CMOV-groups with non consecutive CMOV instructions.
273
145k
  //===--------------------------------------------------------------------===//
274
145k
275
145k
  // Current processed CMOV-Group.
276
145k
  CmovGroup Group;
277
409k
  for (auto *MBB : Blocks) {
278
409k
    Group.clear();
279
409k
    // Condition code of first CMOV instruction current processed range and its
280
409k
    // opposite condition code.
281
409k
    X86::CondCode FirstCC = X86::COND_INVALID, FirstOppCC = X86::COND_INVALID,
282
409k
                  MemOpCC = X86::COND_INVALID;
283
409k
    // Indicator of a non CMOVrr instruction in the current processed range.
284
409k
    bool FoundNonCMOVInst = false;
285
409k
    // Indicator for current processed CMOV-group if it should be skipped.
286
409k
    bool SkipGroup = false;
287
409k
288
3.47M
    for (auto &I : *MBB) {
289
3.47M
      // Skip debug instructions.
290
3.47M
      if (I.isDebugInstr())
291
4.70k
        continue;
292
3.47M
      X86::CondCode CC = X86::getCondFromCMov(I);
293
3.47M
      // Check if we found a X86::CMOVrr instruction.
294
3.47M
      if (CC != X86::COND_INVALID && 
(12.6k
IncludeLoads12.6k
||
!I.mayLoad()2.04k
)) {
295
12.6k
        if (Group.empty()) {
296
11.1k
          // We found first CMOV in the range, reset flags.
297
11.1k
          FirstCC = CC;
298
11.1k
          FirstOppCC = X86::GetOppositeBranchCondition(CC);
299
11.1k
          // Clear out the prior group's memory operand CC.
300
11.1k
          MemOpCC = X86::COND_INVALID;
301
11.1k
          FoundNonCMOVInst = false;
302
11.1k
          SkipGroup = false;
303
11.1k
        }
304
12.6k
        Group.push_back(&I);
305
12.6k
        // Check if it is a non-consecutive CMOV instruction or it has different
306
12.6k
        // condition code than FirstCC or FirstOppCC.
307
12.6k
        if (FoundNonCMOVInst || 
(12.1k
CC != FirstCC12.1k
&&
CC != FirstOppCC54
))
308
559
          // Mark the SKipGroup indicator to skip current processed CMOV-Group.
309
559
          SkipGroup = true;
310
12.6k
        if (I.mayLoad()) {
311
1.24k
          if (MemOpCC == X86::COND_INVALID)
312
1.12k
            // The first memory operand CMOV.
313
1.12k
            MemOpCC = CC;
314
119
          else if (CC != MemOpCC)
315
1
            // Can't handle mixed conditions with memory operands.
316
1
            SkipGroup = true;
317
1.24k
        }
318
12.6k
        // Check if we were relying on zero-extending behavior of the CMOV.
319
12.6k
        if (!SkipGroup &&
320
12.6k
            llvm::any_of(
321
12.1k
                MRI->use_nodbg_instructions(I.defs().begin()->getReg()),
322
20.0k
                [&](MachineInstr &UseI) {
323
20.0k
                  return UseI.getOpcode() == X86::SUBREG_TO_REG;
324
20.0k
                }))
325
69
          // FIXME: We should model the cost of using an explicit MOV to handle
326
69
          // the zero-extension rather than just refusing to handle this.
327
69
          SkipGroup = true;
328
12.6k
        continue;
329
12.6k
      }
330
3.46M
      // If Group is empty, keep looking for first CMOV in the range.
331
3.46M
      if (Group.empty())
332
3.44M
        continue;
333
18.6k
334
18.6k
      // We found a non X86::CMOVrr instruction.
335
18.6k
      FoundNonCMOVInst = true;
336
18.6k
      // Check if this instruction define EFLAGS, to determine end of processed
337
18.6k
      // range, as there would be no more instructions using current EFLAGS def.
338
18.6k
      if (I.definesRegister(X86::EFLAGS)) {
339
8.49k
        // Check if current processed CMOV-group should not be skipped and add
340
8.49k
        // it as a CMOV-group-candidate.
341
8.49k
        if (!SkipGroup)
342
8.00k
          CmovInstGroups.push_back(Group);
343
490
        else
344
490
          ++NumOfSkippedCmovGroups;
345
8.49k
        Group.clear();
346
8.49k
      }
347
18.6k
    }
348
409k
    // End of basic block is considered end of range, check if current processed
349
409k
    // CMOV-group should not be skipped and add it as a CMOV-group-candidate.
350
409k
    if (Group.empty())
351
406k
      continue;
352
2.66k
    if (!SkipGroup)
353
2.58k
      CmovInstGroups.push_back(Group);
354
83
    else
355
83
      ++NumOfSkippedCmovGroups;
356
2.66k
  }
357
145k
358
145k
  NumOfCmovGroupCandidate += CmovInstGroups.size();
359
145k
  return !CmovInstGroups.empty();
360
145k
}
361
362
/// \returns Depth of CMOV instruction as if it was converted into branch.
/// \param TrueOpDepth depth cost of CMOV true value operand.
/// \param FalseOpDepth depth cost of CMOV false value operand.
static unsigned getDepthOfOptCmov(unsigned TrueOpDepth, unsigned FalseOpDepth) {
  //===--------------------------------------------------------------------===//
  // With no info about branch weight, we assume 50% for each value operand.
  // Thus, depth of optimized CMOV instruction is the rounded up average of
  // its True-Operand-Value-Depth and False-Operand-Value-Depth.
  //
  // The average is computed from the two halves plus a rounding bit (set when
  // either depth is odd) instead of (True + False + 1) / 2, so the
  // intermediate sum cannot wrap around for large unsigned inputs.
  //===--------------------------------------------------------------------===//
  return TrueOpDepth / 2 + FalseOpDepth / 2 +
         ((TrueOpDepth | FalseOpDepth) & 1u);
}
373
374
bool X86CmovConverterPass::checkForProfitableCmovCandidates(
375
860
    ArrayRef<MachineBasicBlock *> Blocks, CmovGroups &CmovInstGroups) {
376
860
  struct DepthInfo {
377
860
    /// Depth of original loop.
378
860
    unsigned Depth;
379
860
    /// Depth of optimized loop.
380
860
    unsigned OptDepth;
381
860
  };
382
860
  /// Number of loop iterations to calculate depth for ?!
383
860
  static const unsigned LoopIterations = 2;
384
860
  DenseMap<MachineInstr *, DepthInfo> DepthMap;
385
860
  DepthInfo LoopDepth[LoopIterations] = {{0, 0}, {0, 0}};
386
860
  enum { PhyRegType = 0, VirRegType = 1, RegTypeNum = 2 };
387
860
  /// For each register type maps the register to its last def instruction.
388
860
  DenseMap<unsigned, MachineInstr *> RegDefMaps[RegTypeNum];
389
860
  /// Maps register operand to its def instruction, which can be nullptr if it
390
860
  /// is unknown (e.g., operand is defined outside the loop).
391
860
  DenseMap<MachineOperand *, MachineInstr *> OperandToDefMap;
392
860
393
860
  // Set depth of unknown instruction (i.e., nullptr) to zero.
394
860
  DepthMap[nullptr] = {0, 0};
395
860
396
860
  SmallPtrSet<MachineInstr *, 4> CmovInstructions;
397
860
  for (auto &Group : CmovInstGroups)
398
1.66k
    CmovInstructions.insert(Group.begin(), Group.end());
399
860
400
860
  //===--------------------------------------------------------------------===//
401
860
  // Step 1: Calculate instruction depth and loop depth.
402
860
  // Optimized-Loop:
403
860
  //   loop with CMOV-group-candidates converted into branches.
404
860
  //
405
860
  // Instruction-Depth:
406
860
  //   instruction latency + max operand depth.
407
860
  //     * For CMOV instruction in optimized loop the depth is calculated as:
408
860
  //       CMOV latency + getDepthOfOptCmov(True-Op-Depth, False-Op-depth)
409
860
  // TODO: Find a better way to estimate the latency of the branch instruction
410
860
  //       rather than using the CMOV latency.
411
860
  //
412
860
  // Loop-Depth:
413
860
  //   max instruction depth of all instructions in the loop.
414
860
  // Note: instruction with max depth represents the critical-path in the loop.
415
860
  //
416
860
  // Loop-Depth[i]:
417
860
  //   Loop-Depth calculated for first `i` iterations.
418
860
  //   Note: it is enough to calculate depth for up to two iterations.
419
860
  //
420
860
  // Depth-Diff[i]:
421
860
  //   Number of cycles saved in first 'i` iterations by optimizing the loop.
422
860
  //===--------------------------------------------------------------------===//
423
2.58k
  for (unsigned I = 0; I < LoopIterations; 
++I1.72k
) {
424
1.72k
    DepthInfo &MaxDepth = LoopDepth[I];
425
9.76k
    for (auto *MBB : Blocks) {
426
9.76k
      // Clear physical registers Def map.
427
9.76k
      RegDefMaps[PhyRegType].clear();
428
92.1k
      for (MachineInstr &MI : *MBB) {
429
92.1k
        // Skip debug instructions.
430
92.1k
        if (MI.isDebugInstr())
431
8
          continue;
432
92.0k
        unsigned MIDepth = 0;
433
92.0k
        unsigned MIDepthOpt = 0;
434
92.0k
        bool IsCMOV = CmovInstructions.count(&MI);
435
324k
        for (auto &MO : MI.uses()) {
436
324k
          // Checks for "isUse()" as "uses()" returns also implicit definitions.
437
324k
          if (!MO.isReg() || 
!MO.isUse()212k
)
438
163k
            continue;
439
160k
          unsigned Reg = MO.getReg();
440
160k
          auto &RDM = RegDefMaps[TargetRegisterInfo::isVirtualRegister(Reg)];
441
160k
          if (MachineInstr *DefMI = RDM.lookup(Reg)) {
442
108k
            OperandToDefMap[&MO] = DefMI;
443
108k
            DepthInfo Info = DepthMap.lookup(DefMI);
444
108k
            MIDepth = std::max(MIDepth, Info.Depth);
445
108k
            if (!IsCMOV)
446
97.7k
              MIDepthOpt = std::max(MIDepthOpt, Info.OptDepth);
447
108k
          }
448
160k
        }
449
92.0k
450
92.0k
        if (IsCMOV)
451
3.72k
          MIDepthOpt = getDepthOfOptCmov(
452
3.72k
              DepthMap[OperandToDefMap.lookup(&MI.getOperand(1))].OptDepth,
453
3.72k
              DepthMap[OperandToDefMap.lookup(&MI.getOperand(2))].OptDepth);
454
92.0k
455
92.0k
        // Iterates over all operands to handle implicit definitions as well.
456
383k
        for (auto &MO : MI.operands()) {
457
383k
          if (!MO.isReg() || 
!MO.isDef()271k
)
458
272k
            continue;
459
110k
          unsigned Reg = MO.getReg();
460
110k
          RegDefMaps[TargetRegisterInfo::isVirtualRegister(Reg)][Reg] = &MI;
461
110k
        }
462
92.0k
463
92.0k
        unsigned Latency = TSchedModel.computeInstrLatency(&MI);
464
92.0k
        DepthMap[&MI] = {MIDepth += Latency, MIDepthOpt += Latency};
465
92.0k
        MaxDepth.Depth = std::max(MaxDepth.Depth, MIDepth);
466
92.0k
        MaxDepth.OptDepth = std::max(MaxDepth.OptDepth, MIDepthOpt);
467
92.0k
      }
468
9.76k
    }
469
1.72k
  }
470
860
471
860
  unsigned Diff[LoopIterations] = {LoopDepth[0].Depth - LoopDepth[0].OptDepth,
472
860
                                   LoopDepth[1].Depth - LoopDepth[1].OptDepth};
473
860
474
860
  //===--------------------------------------------------------------------===//
475
860
  // Step 2: Check if Loop worth to be optimized.
476
860
  // Worth-Optimize-Loop:
477
860
  //   case 1: Diff[1] == Diff[0]
478
860
  //           Critical-path is iteration independent - there is no dependency
479
860
  //           of critical-path instructions on critical-path instructions of
480
860
  //           previous iteration.
481
860
  //           Thus, it is enough to check gain percent of 1st iteration -
482
860
  //           To be conservative, the optimized loop need to have a depth of
483
860
  //           12.5% cycles less than original loop, per iteration.
484
860
  //
485
860
  //   case 2: Diff[1] > Diff[0]
486
860
  //           Critical-path is iteration dependent - there is dependency of
487
860
  //           critical-path instructions on critical-path instructions of
488
860
  //           previous iteration.
489
860
  //           Thus, check the gain percent of the 2nd iteration (similar to the
490
860
  //           previous case), but it is also required to check the gradient of
491
860
  //           the gain - the change in Depth-Diff compared to the change in
492
860
  //           Loop-Depth between 1st and 2nd iterations.
493
860
  //           To be conservative, the gradient need to be at least 50%.
494
860
  //
495
860
  //   In addition, In order not to optimize loops with very small gain, the
496
860
  //   gain (in cycles) after 2nd iteration should not be less than a given
497
860
  //   threshold. Thus, the check (Diff[1] >= GainCycleThreshold) must apply.
498
860
  //
499
860
  // If loop is not worth optimizing, remove all CMOV-group-candidates.
500
860
  //===--------------------------------------------------------------------===//
501
860
  if (Diff[1] < GainCycleThreshold)
502
330
    return false;
503
530
504
530
  bool WorthOptLoop = false;
505
530
  if (Diff[1] == Diff[0])
506
90
    WorthOptLoop = Diff[0] * 8 >= LoopDepth[0].Depth;
507
440
  else if (Diff[1] > Diff[0])
508
439
    WorthOptLoop =
509
439
        (Diff[1] - Diff[0]) * 2 >= (LoopDepth[1].Depth - LoopDepth[0].Depth) &&
510
439
        
(Diff[1] * 8 >= LoopDepth[1].Depth)340
;
511
530
512
530
  if (!WorthOptLoop)
513
127
    return false;
514
403
515
403
  ++NumOfLoopCandidate;
516
403
517
403
  //===--------------------------------------------------------------------===//
518
403
  // Step 3: Check for each CMOV-group-candidate if it worth to be optimized.
519
403
  // Worth-Optimize-Group:
520
403
  //   Iff it worths to optimize all CMOV instructions in the group.
521
403
  //
522
403
  // Worth-Optimize-CMOV:
523
403
  //   Predicted branch is faster than CMOV by the difference between depth of
524
403
  //   condition operand and depth of taken (predicted) value operand.
525
403
  //   To be conservative, the gain of such CMOV transformation should cover at
526
403
  //   at least 25% of branch-misprediction-penalty.
527
403
  //===--------------------------------------------------------------------===//
528
403
  unsigned MispredictPenalty = TSchedModel.getMCSchedModel()->MispredictPenalty;
529
403
  CmovGroups TempGroups;
530
403
  std::swap(TempGroups, CmovInstGroups);
531
916
  for (auto &Group : TempGroups) {
532
916
    bool WorthOpGroup = true;
533
1.00k
    for (auto *MI : Group) {
534
1.00k
      // Avoid CMOV instruction which value is used as a pointer to load from.
535
1.00k
      // This is another conservative check to avoid converting CMOV instruction
536
1.00k
      // used with tree-search like algorithm, where the branch is unpredicted.
537
1.00k
      auto UIs = MRI->use_instructions(MI->defs().begin()->getReg());
538
1.00k
      if (UIs.begin() != UIs.end() && ++UIs.begin() == UIs.end()) {
539
476
        unsigned Op = UIs.begin()->getOpcode();
540
476
        if (Op == X86::MOV64rm || Op == X86::MOV32rm) {
541
2
          WorthOpGroup = false;
542
2
          break;
543
2
        }
544
1.00k
      }
545
1.00k
546
1.00k
      unsigned CondCost =
547
1.00k
          DepthMap[OperandToDefMap.lookup(&MI->getOperand(4))].Depth;
548
1.00k
      unsigned ValCost = getDepthOfOptCmov(
549
1.00k
          DepthMap[OperandToDefMap.lookup(&MI->getOperand(1))].Depth,
550
1.00k
          DepthMap[OperandToDefMap.lookup(&MI->getOperand(2))].Depth);
551
1.00k
      if (ValCost > CondCost || 
(CondCost - ValCost) * 4 < MispredictPenalty909
) {
552
181
        WorthOpGroup = false;
553
181
        break;
554
181
      }
555
1.00k
    }
556
916
557
916
    if (WorthOpGroup)
558
733
      CmovInstGroups.push_back(Group);
559
916
  }
560
403
561
403
  return !CmovInstGroups.empty();
562
403
}
563
564
1.84k
/// \returns true if EFLAGS has to be treated as live after \p MI: either a
/// later instruction in the same block reads it before any redefinition, or
/// the end of the block is reached without a redefinition and a successor has
/// it as a live-in.
static bool checkEFLAGSLive(MachineInstr *MI) {
  // An explicit kill on MI settles the question.
  if (MI->killsRegister(X86::EFLAGS))
    return false;

  // The EFLAGS operand of MI might be missing a kill marker, so work out
  // whether EFLAGS should be live after MI by scanning the rest of the block
  // for the first use or def of EFLAGS.
  MachineBasicBlock *BB = MI->getParent();
  MachineBasicBlock::iterator Pos = MI;
  for (MachineInstr &Later : llvm::make_range(std::next(Pos), BB->end())) {
    if (Later.readsRegister(X86::EFLAGS))
      return true;
    if (Later.definesRegister(X86::EFLAGS))
      return false;
  }

  // End of block reached: EFLAGS is live iff it is live into some successor.
  return llvm::any_of(BB->successors(), [](MachineBasicBlock *Succ) {
    return Succ->isLiveIn(X86::EFLAGS);
  });
}
589
590
/// Given /p First CMOV instruction and /p Last CMOV instruction representing a
591
/// group of CMOV instructions, which may contain debug instructions in between,
592
/// move all debug instructions to after the last CMOV instruction, making the
593
/// CMOV group consecutive.
594
1.84k
static void packCmovGroup(MachineInstr *First, MachineInstr *Last) {
595
1.84k
  assert(X86::getCondFromCMov(*Last) != X86::COND_INVALID &&
596
1.84k
         "Last instruction in a CMOV group must be a CMOV instruction");
597
1.84k
598
1.84k
  SmallVector<MachineInstr *, 2> DBGInstructions;
599
2.06k
  for (auto I = First->getIterator(), E = Last->getIterator(); I != E; 
I++218
) {
600
218
    if (I->isDebugInstr())
601
2
      DBGInstructions.push_back(&*I);
602
218
  }
603
1.84k
604
1.84k
  // Splice the debug instruction after the cmov group.
605
1.84k
  MachineBasicBlock *MBB = First->getParent();
606
1.84k
  for (auto *MI : DBGInstructions)
607
2
    MBB->insertAfter(Last, MI->removeFromParent());
608
1.84k
}
609
610
/// Replace the CMOV instructions in \p Group with explicit control flow.
///
/// The block containing the group is split after the last CMOV: a conditional
/// jump (JCC_1) is appended to the original block, a new "false" block is
/// placed on the fall-through path, and a new "sink" block receives the rest
/// of the original block together with its successor edges. Each CMOV becomes
/// a PHI at the top of the sink block, and CMOVs with a memory operand have
/// their load unfolded into the false block first.
///
/// \param Group non-empty list of CMOV instructions (asserted). Presumably all
///        of them live in the same basic block and share one EFLAGS def, as
///        described in the file header -- that is established by the callers,
///        not checked here.
void X86CmovConverterPass::convertCmovInstsToBranches(
    SmallVectorImpl<MachineInstr *> &Group) const {
  assert(!Group.empty() && "No CMOV instructions to convert");
  ++NumOfOptimizedCmovGroups;

  // If the CMOV group is not packed, e.g., there are debug instructions between
  // first CMOV and last CMOV, then pack the group and make the CMOV instruction
  // consecutive by moving the debug instructions to after the last CMOV.
  packCmovGroup(Group.front(), Group.back());

  // To convert a CMOVcc instruction, we actually have to insert the diamond
  // control-flow pattern.  The incoming instruction knows the destination vreg
  // to set, the condition code register to branch on, the true/false values to
  // select between, and a branch opcode to use.

  // Before
  // -----
  // MBB:
  //   cond = cmp ...
  //   v1 = CMOVge t1, f1, cond
  //   v2 = CMOVlt t2, f2, cond
  //   v3 = CMOVge v1, f3, cond
  //
  // After
  // -----
  // MBB:
  //   cond = cmp ...
  //   jge %SinkMBB
  //
  // FalseMBB:
  //   jmp %SinkMBB
  //
  // SinkMBB:
  //   %v1 = phi[%f1, %FalseMBB], [%t1, %MBB]
  //   %v2 = phi[%t2, %FalseMBB], [%f2, %MBB] ; For CMOV with OppCC switch
  //                                          ; true-value with false-value
  //   %v3 = phi[%f3, %FalseMBB], [%t1, %MBB] ; Phi instruction cannot use
  //                                          ; previous Phi instruction result

  // The first CMOV supplies the debug location and the initial branch
  // condition; the last CMOV marks where the block is split.
  MachineInstr &MI = *Group.front();
  MachineInstr *LastCMOV = Group.back();
  DebugLoc DL = MI.getDebugLoc();

  X86::CondCode CC = X86::CondCode(X86::getCondFromCMov(MI));
  X86::CondCode OppCC = X86::GetOppositeBranchCondition(CC);
  // Potentially swap the condition codes so that any memory operand to a CMOV
  // is in the *false* position instead of the *true* position. We can invert
  // any non-memory operand CMOV instructions to cope with this and we ensure
  // memory operand CMOVs are only included with a single condition code.
  if (llvm::any_of(Group, [&](MachineInstr *I) {
        return I->mayLoad() && X86::getCondFromCMov(*I) == CC;
      }))
    std::swap(CC, OppCC);

  MachineBasicBlock *MBB = MI.getParent();
  // Insertion position in the function: the block that follows MBB, so both
  // new blocks are placed directly after MBB in layout order.
  MachineFunction::iterator It = ++MBB->getIterator();
  MachineFunction *F = MBB->getParent();
  const BasicBlock *BB = MBB->getBasicBlock();

  // Both new blocks are created against the same IR basic block as MBB.
  // Inserting FalseMBB and then SinkMBB before the same position yields the
  // layout MBB, FalseMBB, SinkMBB.
  MachineBasicBlock *FalseMBB = F->CreateMachineBasicBlock(BB);
  MachineBasicBlock *SinkMBB = F->CreateMachineBasicBlock(BB);
  F->insert(It, FalseMBB);
  F->insert(It, SinkMBB);

  // If EFLAGS is still live after the last CMOV (as reported by
  // checkEFLAGSLive), then claim that it's live into the false and sink
  // blocks.
  if (checkEFLAGSLive(LastCMOV)) {
    FalseMBB->addLiveIn(X86::EFLAGS);
    SinkMBB->addLiveIn(X86::EFLAGS);
  }

  // Transfer the remainder of MBB (everything after the last CMOV) and its
  // successor edges to SinkMBB.
  SinkMBB->splice(SinkMBB->begin(), MBB,
                  std::next(MachineBasicBlock::iterator(LastCMOV)), MBB->end());
  SinkMBB->transferSuccessorsAndUpdatePHIs(MBB);

  // Add the false and sink blocks as its successors.
  MBB->addSuccessor(FalseMBB);
  MBB->addSuccessor(SinkMBB);

  // Create the conditional branch instruction: jump to SinkMBB when CC holds.
  BuildMI(MBB, DL, TII->get(X86::JCC_1)).addMBB(SinkMBB).addImm(CC);

  // Add the sink block to the false block successors.
  FalseMBB->addSuccessor(SinkMBB);

  // MIB is only read inside the LLVM_DEBUG output further down.
  MachineInstrBuilder MIB;
  // [MIItBegin, MIItEnd) is the range of CMOVs still sitting in MBB. MIItEnd
  // is taken after the splice and after the branch was built, so it
  // presumably refers to the JCC_1 that now follows the last CMOV -- confirm
  // against the BuildMI overload used above.
  MachineBasicBlock::iterator MIItBegin = MachineBasicBlock::iterator(MI);
  MachineBasicBlock::iterator MIItEnd =
      std::next(MachineBasicBlock::iterator(LastCMOV));
  MachineBasicBlock::iterator FalseInsertionPoint = FalseMBB->begin();
  MachineBasicBlock::iterator SinkInsertionPoint = SinkMBB->begin();

  // First we need to insert an explicit load on the false path for any memory
  // operand. We also need to potentially do register rewriting here, but it is
  // simpler as the memory operands are always on the false path so we can
  // simply take that input, whatever it is.
  //
  // FalseBBRegRewriteTable maps the result register of each CMOV seen so far
  // to the register holding that value on the false path.
  DenseMap<unsigned, unsigned> FalseBBRegRewriteTable;
  for (MachineBasicBlock::iterator MIIt = MIItBegin; MIIt != MIItEnd;) {
    // Advance before touching the instruction: a loading CMOV is erased at the
    // bottom of this iteration. NOTE: this `MI` shadows the function-level
    // `MI` (the first CMOV of the group).
    auto &MI = *MIIt++;
    // Skip any CMOVs in this group which don't load from memory.
    if (!MI.mayLoad()) {
      // Remember the false-side register input: operand 1 when the CMOV uses
      // the branch condition CC, operand 2 when it uses the opposite one.
      unsigned FalseReg =
          MI.getOperand(X86::getCondFromCMov(MI) == CC ? 1 : 2).getReg();
      // Walk back through any intermediate cmovs referenced.
      while (true) {
        auto FRIt = FalseBBRegRewriteTable.find(FalseReg);
        if (FRIt == FalseBBRegRewriteTable.end())
          break;
        FalseReg = FRIt->second;
      }
      FalseBBRegRewriteTable[MI.getOperand(0).getReg()] = FalseReg;
      continue;
    }

    // The condition must be the *opposite* of the one we've decided to branch
    // on as the branch will go *around* the load and the load should happen
    // when the CMOV condition is false.
    assert(X86::getCondFromCMov(MI) == OppCC &&
           "Can only handle memory-operand cmov instructions with a condition "
           "opposite to the selected branch direction.");

    // The goal is to rewrite the cmov from:
    //
    //   MBB:
    //     %A = CMOVcc %B (tied), (mem)
    //
    // to
    //
    //   MBB:
    //     %A = CMOVcc %B (tied), %C
    //   FalseMBB:
    //     %C = MOV (mem)
    //
    // Which will allow the next loop to rewrite the CMOV in terms of a PHI:
    //
    //   MBB:
    //     JMP!cc SinkMBB
    //   FalseMBB:
    //     %C = MOV (mem)
    //   SinkMBB:
    //     %A = PHI [ %C, FalseMBB ], [ %B, MBB]

    // Get a fresh register to use as the destination of the MOV.
    const TargetRegisterClass *RC = MRI->getRegClass(MI.getOperand(0).getReg());
    unsigned TmpReg = MRI->createVirtualRegister(RC);

    // Unfold the load only; the resulting instructions are returned in NewMIs
    // and the last of them is expected to be the register-form CMOV (checked
    // by the assert below).
    SmallVector<MachineInstr *, 4> NewMIs;
    bool Unfolded = TII->unfoldMemoryOperand(*MBB->getParent(), MI, TmpReg,
                                             /*UnfoldLoad*/ true,
                                             /*UnfoldStore*/ false, NewMIs);
    // Unfolded is otherwise only read by the assert.
    (void)Unfolded;
    assert(Unfolded && "Should never fail to unfold a loading cmov!");

    // Move the new CMOV to just before the old one and reset any impacted
    // iterator.
    auto *NewCMOV = NewMIs.pop_back_val();
    assert(X86::getCondFromCMov(*NewCMOV) == OppCC &&
           "Last new instruction isn't the expected CMOV!");
    LLVM_DEBUG(dbgs() << "\tRewritten cmov: "; NewCMOV->dump());
    MBB->insert(MachineBasicBlock::iterator(MI), NewCMOV);
    // If the old CMOV was the start of the group range, the range must now
    // start at its replacement, because the old one is erased below.
    if (&*MIItBegin == &MI)
      MIItBegin = MachineBasicBlock::iterator(NewCMOV);

    // Sink whatever instructions were needed to produce the unfolded operand
    // into the false block.
    for (auto *NewMI : NewMIs) {
      LLVM_DEBUG(dbgs() << "\tRewritten load instr: "; NewMI->dump());
      FalseMBB->insert(FalseInsertionPoint, NewMI);
      // Re-map any operands that are from other cmovs to the inputs for this block.
      for (auto &MOp : NewMI->uses()) {
        if (!MOp.isReg())
          continue;
        // NOTE: this `It` shadows the function-level MachineFunction iterator
        // of the same name.
        auto It = FalseBBRegRewriteTable.find(MOp.getReg());
        if (It == FalseBBRegRewriteTable.end())
          continue;

        MOp.setReg(It->second);
        // This might have been a kill when it referenced the cmov result, but
        // it won't necessarily be once rewritten.
        // FIXME: We could potentially improve this by tracking whether the
        // operand to the cmov was also a kill, and then skipping the PHI node
        // construction below.
        MOp.setIsKill(false);
      }
    }
    // Remove the original memory-operand CMOV; NewCMOV has taken its place.
    MBB->erase(MachineBasicBlock::iterator(MI),
               std::next(MachineBasicBlock::iterator(MI)));

    // Record that, on the false path, the value of the rewritten CMOV's result
    // register is the freshly loaded TmpReg.
    FalseBBRegRewriteTable[NewCMOV->getOperand(0).getReg()] = TmpReg;
  }

  // As we are creating the PHIs, we have to be careful if there is more than
  // one.  Later CMOVs may reference the results of earlier CMOVs, but later
  // PHIs have to reference the individual true/false inputs from earlier PHIs.
  // That also means that PHI construction must work forward from earlier to
  // later, and that the code must maintain a mapping from earlier PHI's
  // destination registers, and the registers that went into the PHI.
  //
  // RegRewriteTable maps a PHI's destination register to the pair
  // (input from FalseMBB, input from MBB).
  DenseMap<unsigned, std::pair<unsigned, unsigned>> RegRewriteTable;

  for (MachineBasicBlock::iterator MIIt = MIItBegin; MIIt != MIItEnd; ++MIIt) {
    unsigned DestReg = MIIt->getOperand(0).getReg();
    unsigned Op1Reg = MIIt->getOperand(1).getReg();
    unsigned Op2Reg = MIIt->getOperand(2).getReg();

    // If this CMOV we are processing is the opposite condition from the jump we
    // generated, then we have to swap the operands for the PHI that is going to
    // be generated.
    if (X86::getCondFromCMov(*MIIt) == OppCC)
      std::swap(Op1Reg, Op2Reg);

    // An input produced by an earlier CMOV of this group is replaced by the
    // matching input of that CMOV's PHI: the FalseMBB-side value for Op1Reg...
    auto Op1Itr = RegRewriteTable.find(Op1Reg);
    if (Op1Itr != RegRewriteTable.end())
      Op1Reg = Op1Itr->second.first;

    // ...and the MBB-side value for Op2Reg.
    auto Op2Itr = RegRewriteTable.find(Op2Reg);
    if (Op2Itr != RegRewriteTable.end())
      Op2Reg = Op2Itr->second.second;

    //  SinkMBB:
    //   %Result = phi [ %FalseValue, FalseMBB ], [ %TrueValue, MBB ]
    //  ...
    MIB = BuildMI(*SinkMBB, SinkInsertionPoint, DL, TII->get(X86::PHI), DestReg)
              .addReg(Op1Reg)
              .addMBB(FalseMBB)
              .addReg(Op2Reg)
              .addMBB(MBB);
    // MIB is only read by the LLVM_DEBUG statements below; the cast presumably
    // keeps builds where that macro expands to nothing free of unused-value
    // warnings.
    (void)MIB;
    LLVM_DEBUG(dbgs() << "\tFrom: "; MIIt->dump());
    LLVM_DEBUG(dbgs() << "\tTo: "; MIB->dump());

    // Add this PHI to the rewrite table.
    RegRewriteTable[DestReg] = std::make_pair(Op1Reg, Op2Reg);
  }

  // Now remove the CMOV(s).
  MBB->erase(MIItBegin, MIItEnd);
}
850
851
102k
// Pass registration boilerplate: declares X86CmovConverterPass under
// DEBUG_TYPE with the description "X86 cmov Conversion" and lists
// MachineLoopInfo as a dependency of the pass.
// NOTE(review): the two trailing `false` arguments are presumably the
// CFG-only and is-analysis flags of LLVM's INITIALIZE_PASS macros -- confirm
// against PassSupport.h.
INITIALIZE_PASS_BEGIN(X86CmovConverterPass, DEBUG_TYPE, "X86 cmov Conversion",
                      false, false)
INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo)
INITIALIZE_PASS_END(X86CmovConverterPass, DEBUG_TYPE, "X86 cmov Conversion",
                    false, false)
856
857
11.3k
/// Factory for the X86 CMOV conversion pass.
///
/// \returns a pointer to a newly heap-allocated X86CmovConverterPass.
FunctionPass *llvm::createX86CmovConverterPass() {
  FunctionPass *const Pass = new X86CmovConverterPass();
  return Pass;
}