Coverage Report

Created: 2017-10-03 07:32

/Users/buildslave/jenkins/sharedspace/clang-stage2-coverage-R@2/llvm/lib/CodeGen/MachinePipeliner.cpp
Line
Count
Source (jump to first uncovered line)
1
//===- MachinePipeliner.cpp - Machine Software Pipeliner Pass -------------===//
2
//
3
//                     The LLVM Compiler Infrastructure
4
//
5
// This file is distributed under the University of Illinois Open Source
6
// License. See LICENSE.TXT for details.
7
//
8
//===----------------------------------------------------------------------===//
9
//
10
// An implementation of the Swing Modulo Scheduling (SMS) software pipeliner.
11
//
12
// Software pipelining (SWP) is an instruction scheduling technique for loops
13
// that overlap loop iterations and explioits ILP via a compiler transformation.
14
//
15
// Swing Modulo Scheduling is an implementation of software pipelining
16
// that generates schedules that are near optimal in terms of initiation
17
// interval, register requirements, and stage count. See the papers:
18
//
19
// "Swing Modulo Scheduling: A Lifetime-Sensitive Approach", by J. Llosa,
20
// A. Gonzalez, E. Ayguade, and M. Valero. In PACT '96 Processings of the 1996
21
// Conference on Parallel Architectures and Compilation Techiniques.
22
//
23
// "Lifetime-Sensitive Modulo Scheduling in a Production Environment", by J.
24
// Llosa, E. Ayguade, A. Gonzalez, M. Valero, and J. Eckhardt. In IEEE
25
// Transactions on Computers, Vol. 50, No. 3, 2001.
26
//
27
// "An Implementation of Swing Modulo Scheduling With Extensions for
28
// Superblocks", by T. Lattner, Master's Thesis, University of Illinois at
29
// Urbana-Chambpain, 2005.
30
//
31
//
32
// The SMS algorithm consists of three main steps after computing the minimal
33
// initiation interval (MII).
34
// 1) Analyze the dependence graph and compute information about each
35
//    instruction in the graph.
36
// 2) Order the nodes (instructions) by priority based upon the heuristics
37
//    described in the algorithm.
38
// 3) Attempt to schedule the nodes in the specified order using the MII.
39
//
40
// This SMS implementation is a target-independent back-end pass. When enabled,
41
// the pass runs just prior to the register allocation pass, while the machine
42
// IR is in SSA form. If software pipelining is successful, then the original
43
// loop is replaced by the optimized loop. The optimized loop contains one or
44
// more prolog blocks, the pipelined kernel, and one or more epilog blocks. If
45
// the instructions cannot be scheduled in a given MII, we increase the MII by
46
// one and try again.
47
//
48
// The SMS implementation is an extension of the ScheduleDAGInstrs class. We
49
// represent loop carried dependences in the DAG as order edges to the Phi
50
// nodes. We also perform several passes over the DAG to eliminate unnecessary
51
// edges that inhibit the ability to pipeline. The implementation uses the
52
// DFAPacketizer class to compute the minimum initiation interval and the check
53
// where an instruction may be inserted in the pipelined schedule.
54
//
55
// In order for the SMS pass to work, several target specific hooks need to be
56
// implemented to get information about the loop structure and to rewrite
57
// instructions.
58
//
59
//===----------------------------------------------------------------------===//
60
61
#include "llvm/ADT/ArrayRef.h"
62
#include "llvm/ADT/BitVector.h"
63
#include "llvm/ADT/DenseMap.h"
64
#include "llvm/ADT/MapVector.h"
65
#include "llvm/ADT/PriorityQueue.h"
66
#include "llvm/ADT/SetVector.h"
67
#include "llvm/ADT/SmallPtrSet.h"
68
#include "llvm/ADT/SmallSet.h"
69
#include "llvm/ADT/SmallVector.h"
70
#include "llvm/ADT/Statistic.h"
71
#include "llvm/ADT/iterator_range.h"
72
#include "llvm/Analysis/AliasAnalysis.h"
73
#include "llvm/Analysis/MemoryLocation.h"
74
#include "llvm/Analysis/ValueTracking.h"
75
#include "llvm/CodeGen/DFAPacketizer.h"
76
#include "llvm/CodeGen/LiveIntervalAnalysis.h"
77
#include "llvm/CodeGen/MachineBasicBlock.h"
78
#include "llvm/CodeGen/MachineDominators.h"
79
#include "llvm/CodeGen/MachineFunction.h"
80
#include "llvm/CodeGen/MachineFunctionPass.h"
81
#include "llvm/CodeGen/MachineInstr.h"
82
#include "llvm/CodeGen/MachineInstrBuilder.h"
83
#include "llvm/CodeGen/MachineLoopInfo.h"
84
#include "llvm/CodeGen/MachineMemOperand.h"
85
#include "llvm/CodeGen/MachineOperand.h"
86
#include "llvm/CodeGen/MachineRegisterInfo.h"
87
#include "llvm/CodeGen/RegisterClassInfo.h"
88
#include "llvm/CodeGen/RegisterPressure.h"
89
#include "llvm/CodeGen/ScheduleDAG.h"
90
#include "llvm/CodeGen/ScheduleDAGInstrs.h"
91
#include "llvm/CodeGen/ScheduleDAGMutation.h"
92
#include "llvm/IR/Attributes.h"
93
#include "llvm/IR/DebugLoc.h"
94
#include "llvm/IR/Function.h"
95
#include "llvm/MC/LaneBitmask.h"
96
#include "llvm/MC/MCInstrDesc.h"
97
#include "llvm/MC/MCInstrItineraries.h"
98
#include "llvm/MC/MCRegisterInfo.h"
99
#include "llvm/Pass.h"
100
#include "llvm/Support/CommandLine.h"
101
#include "llvm/Support/Compiler.h"
102
#include "llvm/Support/Debug.h"
103
#include "llvm/Support/MathExtras.h"
104
#include "llvm/Support/raw_ostream.h"
105
#include "llvm/Target/TargetInstrInfo.h"
106
#include "llvm/Target/TargetOpcodes.h"
107
#include "llvm/Target/TargetRegisterInfo.h"
108
#include "llvm/Target/TargetSubtargetInfo.h"
109
#include <algorithm>
110
#include <cassert>
111
#include <climits>
112
#include <cstdint>
113
#include <deque>
114
#include <functional>
115
#include <iterator>
116
#include <map>
117
#include <memory>
118
#include <tuple>
119
#include <utility>
120
#include <vector>
121
122
using namespace llvm;
123
124
#define DEBUG_TYPE "pipeliner"
125
126
STATISTIC(NumTrytoPipeline, "Number of loops that we attempt to pipeline");
127
STATISTIC(NumPipelined, "Number of loops software pipelined");
128
129
/// A command line option to turn software pipelining on or off.
130
static cl::opt<bool> EnableSWP("enable-pipeliner", cl::Hidden, cl::init(true),
131
                               cl::ZeroOrMore,
132
                               cl::desc("Enable Software Pipelining"));
133
134
/// A command line option to enable SWP at -Os.
135
static cl::opt<bool> EnableSWPOptSize("enable-pipeliner-opt-size",
136
                                      cl::desc("Enable SWP at Os."), cl::Hidden,
137
                                      cl::init(false));
138
139
/// A command line argument to limit minimum initial interval for pipelining.
140
static cl::opt<int> SwpMaxMii("pipeliner-max-mii",
141
                              cl::desc("Size limit for the the MII."),
142
                              cl::Hidden, cl::init(27));
143
144
/// A command line argument to limit the number of stages in the pipeline.
145
static cl::opt<int>
146
    SwpMaxStages("pipeliner-max-stages",
147
                 cl::desc("Maximum stages allowed in the generated scheduled."),
148
                 cl::Hidden, cl::init(3));
149
150
/// A command line option to disable the pruning of chain dependences due to
151
/// an unrelated Phi.
152
static cl::opt<bool>
153
    SwpPruneDeps("pipeliner-prune-deps",
154
                 cl::desc("Prune dependences between unrelated Phi nodes."),
155
                 cl::Hidden, cl::init(true));
156
157
/// A command line option to disable the pruning of loop carried order
158
/// dependences.
159
static cl::opt<bool>
160
    SwpPruneLoopCarried("pipeliner-prune-loop-carried",
161
                        cl::desc("Prune loop carried order dependences."),
162
                        cl::Hidden, cl::init(true));
163
164
#ifndef NDEBUG
165
static cl::opt<int> SwpLoopLimit("pipeliner-max", cl::Hidden, cl::init(-1));
166
#endif
167
168
static cl::opt<bool> SwpIgnoreRecMII("pipeliner-ignore-recmii",
169
                                     cl::ReallyHidden, cl::init(false),
170
                                     cl::ZeroOrMore, cl::desc("Ignore RecMII"));
171
172
namespace {
173
174
class NodeSet;
175
class SMSchedule;
176
177
/// The main class in the implementation of the target independent
178
/// software pipeliner pass.
179
class MachinePipeliner : public MachineFunctionPass {
180
public:
181
  MachineFunction *MF = nullptr;
182
  const MachineLoopInfo *MLI = nullptr;
183
  const MachineDominatorTree *MDT = nullptr;
184
  const InstrItineraryData *InstrItins;
185
  const TargetInstrInfo *TII = nullptr;
186
  RegisterClassInfo RegClassInfo;
187
188
#ifndef NDEBUG
189
  static int NumTries;
190
#endif
191
192
  /// Cache the target analysis information about the loop.
193
  struct LoopInfo {
194
    MachineBasicBlock *TBB = nullptr;
195
    MachineBasicBlock *FBB = nullptr;
196
    SmallVector<MachineOperand, 4> BrCond;
197
    MachineInstr *LoopInductionVar = nullptr;
198
    MachineInstr *LoopCompare = nullptr;
199
  };
200
  LoopInfo LI;
201
202
  static char ID;
203
204
405
  MachinePipeliner() : MachineFunctionPass(ID) {
205
405
    initializeMachinePipelinerPass(*PassRegistry::getPassRegistry());
206
405
  }
207
208
  bool runOnMachineFunction(MachineFunction &MF) override;
209
210
401
  void getAnalysisUsage(AnalysisUsage &AU) const override {
211
401
    AU.addRequired<AAResultsWrapperPass>();
212
401
    AU.addPreserved<AAResultsWrapperPass>();
213
401
    AU.addRequired<MachineLoopInfo>();
214
401
    AU.addRequired<MachineDominatorTree>();
215
401
    AU.addRequired<LiveIntervals>();
216
401
    MachineFunctionPass::getAnalysisUsage(AU);
217
401
  }
218
219
private:
220
  bool canPipelineLoop(MachineLoop &L);
221
  bool scheduleLoop(MachineLoop &L);
222
  bool swingModuloScheduler(MachineLoop &L);
223
};
224
225
/// This class builds the dependence graph for the instructions in a loop,
226
/// and attempts to schedule the instructions using the SMS algorithm.
227
class SwingSchedulerDAG : public ScheduleDAGInstrs {
228
  MachinePipeliner &Pass;
229
  /// The minimum initiation interval between iterations for this schedule.
230
  unsigned MII = 0;
231
  /// Set to true if a valid pipelined schedule is found for the loop.
232
  bool Scheduled = false;
233
  MachineLoop &Loop;
234
  LiveIntervals &LIS;
235
  const RegisterClassInfo &RegClassInfo;
236
237
  /// A toplogical ordering of the SUnits, which is needed for changing
238
  /// dependences and iterating over the SUnits.
239
  ScheduleDAGTopologicalSort Topo;
240
241
  struct NodeInfo {
242
    int ASAP = 0;
243
    int ALAP = 0;
244
245
826
    NodeInfo() = default;
246
  };
247
  /// Computed properties for each node in the graph.
248
  std::vector<NodeInfo> ScheduleInfo;
249
250
  enum OrderKind { BottomUp = 0, TopDown = 1 };
251
  /// Computed node ordering for scheduling.
252
  SetVector<SUnit *> NodeOrder;
253
254
  using NodeSetType = SmallVector<NodeSet, 8>;
255
  using ValueMapTy = DenseMap<unsigned, unsigned>;
256
  using MBBVectorTy = SmallVectorImpl<MachineBasicBlock *>;
257
  using InstrMapTy = DenseMap<MachineInstr *, MachineInstr *>;
258
259
  /// Instructions to change when emitting the final schedule.
260
  DenseMap<SUnit *, std::pair<unsigned, int64_t>> InstrChanges;
261
262
  /// We may create a new instruction, so remember it because it
263
  /// must be deleted when the pass is finished.
264
  SmallPtrSet<MachineInstr *, 4> NewMIs;
265
266
  /// Ordered list of DAG postprocessing steps.
267
  std::vector<std::unique_ptr<ScheduleDAGMutation>> Mutations;
268
269
  /// Helper class to implement Johnson's circuit finding algorithm.
270
  class Circuits {
271
    std::vector<SUnit> &SUnits;
272
    SetVector<SUnit *> Stack;
273
    BitVector Blocked;
274
    SmallVector<SmallPtrSet<SUnit *, 4>, 10> B;
275
    SmallVector<SmallVector<int, 4>, 16> AdjK;
276
    unsigned NumPaths;
277
    static unsigned MaxPaths;
278
279
  public:
280
    Circuits(std::vector<SUnit> &SUs)
281
113
        : SUnits(SUs), Blocked(SUs.size()), B(SUs.size()), AdjK(SUs.size()) {}
282
283
    /// Reset the data structures used in the circuit algorithm.
284
1.36k
    void reset() {
285
1.36k
      Stack.clear();
286
1.36k
      Blocked.reset();
287
1.36k
      B.assign(SUnits.size(), SmallPtrSet<SUnit *, 4>());
288
1.36k
      NumPaths = 0;
289
1.36k
    }
290
291
    void createAdjacencyStructure(SwingSchedulerDAG *DAG);
292
    bool circuit(int V, int S, NodeSetType &NodeSets, bool HasBackedge = false);
293
    void unblock(int U);
294
  };
295
296
public:
297
  SwingSchedulerDAG(MachinePipeliner &P, MachineLoop &L, LiveIntervals &lis,
298
                    const RegisterClassInfo &rci)
299
      : ScheduleDAGInstrs(*P.MF, P.MLI, false), Pass(P), Loop(L), LIS(lis),
300
113
        RegClassInfo(rci), Topo(SUnits, &ExitSU) {
301
113
    P.MF->getSubtarget().getSMSMutations(Mutations);
302
113
  }
303
304
  void schedule() override;
305
  void finishBlock() override;
306
307
  /// Return true if the loop kernel has been scheduled.
308
113
  bool hasNewSchedule() { return Scheduled; }
309
310
  /// Return the earliest time an instruction may be scheduled.
311
3.72k
  int getASAP(SUnit *Node) { return ScheduleInfo[Node->NodeNum].ASAP; }
312
313
  /// Return the latest time an instruction my be scheduled.
314
2.89k
  int getALAP(SUnit *Node) { return ScheduleInfo[Node->NodeNum].ALAP; }
315
316
  /// The mobility function, which the the number of slots in which
317
  /// an instruction may be scheduled.
318
1.86k
  int getMOV(SUnit *Node) { return getALAP(Node) - getASAP(Node); }
319
320
  /// The depth, in the dependence graph, for a node.
321
1.41k
  int getDepth(SUnit *Node) { return Node->getDepth(); }
322
323
  /// The height, in the dependence graph, for a node.
324
4.48k
  int getHeight(SUnit *Node) { return Node->getHeight(); }
325
326
  /// Return true if the dependence is a back-edge in the data dependence graph.
327
  /// Since the DAG doesn't contain cycles, we represent a cycle in the graph
328
  /// using an anti dependence from a Phi to an instruction.
329
25.6k
  bool isBackedge(SUnit *Source, const SDep &Dep) {
330
25.6k
    if (Dep.getKind() != SDep::Anti)
331
23.6k
      return false;
332
2.02k
    
return Source->getInstr()->isPHI() || 2.02k
Dep.getSUnit()->getInstr()->isPHI()1.82k
;
333
25.6k
  }
334
335
  /// Return true if the dependence is an order dependence between non-Phis.
336
55.7k
  static bool isOrder(SUnit *Source, const SDep &Dep) {
337
55.7k
    if (Dep.getKind() != SDep::Order)
338
2.34k
      return false;
339
53.4k
    return (!Source->getInstr()->isPHI() &&
340
53.4k
            !Dep.getSUnit()->getInstr()->isPHI());
341
55.7k
  }
342
343
  bool isLoopCarriedOrder(SUnit *Source, const SDep &Dep, bool isSucc = true);
344
345
  /// The latency of the dependence.
346
3.91k
  unsigned getLatency(SUnit *Source, const SDep &Dep) {
347
3.91k
    // Anti dependences represent recurrences, so use the latency of the
348
3.91k
    // instruction on the back-edge.
349
3.91k
    if (
Dep.getKind() == SDep::Anti3.91k
) {
350
240
      if (Source->getInstr()->isPHI())
351
204
        return Dep.getSUnit()->Latency;
352
36
      
if (36
Dep.getSUnit()->getInstr()->isPHI()36
)
353
27
        return Source->Latency;
354
9
      return Dep.getLatency();
355
9
    }
356
3.67k
    return Dep.getLatency();
357
3.67k
  }
358
359
  /// The distance function, which indicates that operation V of iteration I
360
  /// depends on operations U of iteration I-distance.
361
3.91k
  unsigned getDistance(SUnit *U, SUnit *V, const SDep &Dep) {
362
3.91k
    // Instructions that feed a Phi have a distance of 1. Computing larger
363
3.91k
    // values for arrays requires data dependence information.
364
3.91k
    if (
V->getInstr()->isPHI() && 3.91k
Dep.getKind() == SDep::Anti269
)
365
231
      return 1;
366
3.68k
    return 0;
367
3.68k
  }
368
369
  /// Set the Minimum Initiation Interval for this schedule attempt.
370
0
  void setMII(unsigned mii) { MII = mii; }
371
372
  MachineInstr *applyInstrChange(MachineInstr *MI, SMSchedule &Schedule,
373
                                 bool UpdateDAG = false);
374
375
  /// Return the new base register that was stored away for the changed
376
  /// instruction.
377
180
  unsigned getInstrBaseReg(SUnit *SU) {
378
180
    DenseMap<SUnit *, std::pair<unsigned, int64_t>>::iterator It =
379
180
        InstrChanges.find(SU);
380
180
    if (It != InstrChanges.end())
381
16
      return It->second.first;
382
164
    return 0;
383
164
  }
384
385
0
  void addMutation(std::unique_ptr<ScheduleDAGMutation> Mutation) {
386
0
    Mutations.push_back(std::move(Mutation));
387
0
  }
388
389
private:
390
  void addLoopCarriedDependences(AliasAnalysis *AA);
391
  void updatePhiDependences();
392
  void changeDependences();
393
  unsigned calculateResMII();
394
  unsigned calculateRecMII(NodeSetType &RecNodeSets);
395
  void findCircuits(NodeSetType &NodeSets);
396
  void fuseRecs(NodeSetType &NodeSets);
397
  void removeDuplicateNodes(NodeSetType &NodeSets);
398
  void computeNodeFunctions(NodeSetType &NodeSets);
399
  void registerPressureFilter(NodeSetType &NodeSets);
400
  void colocateNodeSets(NodeSetType &NodeSets);
401
  void checkNodeSets(NodeSetType &NodeSets);
402
  void groupRemainingNodes(NodeSetType &NodeSets);
403
  void addConnectedNodes(SUnit *SU, NodeSet &NewSet,
404
                         SetVector<SUnit *> &NodesAdded);
405
  void computeNodeOrder(NodeSetType &NodeSets);
406
  bool schedulePipeline(SMSchedule &Schedule);
407
  void generatePipelinedLoop(SMSchedule &Schedule);
408
  void generateProlog(SMSchedule &Schedule, unsigned LastStage,
409
                      MachineBasicBlock *KernelBB, ValueMapTy *VRMap,
410
                      MBBVectorTy &PrologBBs);
411
  void generateEpilog(SMSchedule &Schedule, unsigned LastStage,
412
                      MachineBasicBlock *KernelBB, ValueMapTy *VRMap,
413
                      MBBVectorTy &EpilogBBs, MBBVectorTy &PrologBBs);
414
  void generateExistingPhis(MachineBasicBlock *NewBB, MachineBasicBlock *BB1,
415
                            MachineBasicBlock *BB2, MachineBasicBlock *KernelBB,
416
                            SMSchedule &Schedule, ValueMapTy *VRMap,
417
                            InstrMapTy &InstrMap, unsigned LastStageNum,
418
                            unsigned CurStageNum, bool IsLast);
419
  void generatePhis(MachineBasicBlock *NewBB, MachineBasicBlock *BB1,
420
                    MachineBasicBlock *BB2, MachineBasicBlock *KernelBB,
421
                    SMSchedule &Schedule, ValueMapTy *VRMap,
422
                    InstrMapTy &InstrMap, unsigned LastStageNum,
423
                    unsigned CurStageNum, bool IsLast);
424
  void removeDeadInstructions(MachineBasicBlock *KernelBB,
425
                              MBBVectorTy &EpilogBBs);
426
  void splitLifetimes(MachineBasicBlock *KernelBB, MBBVectorTy &EpilogBBs,
427
                      SMSchedule &Schedule);
428
  void addBranches(MBBVectorTy &PrologBBs, MachineBasicBlock *KernelBB,
429
                   MBBVectorTy &EpilogBBs, SMSchedule &Schedule,
430
                   ValueMapTy *VRMap);
431
  bool computeDelta(MachineInstr &MI, unsigned &Delta);
432
  void updateMemOperands(MachineInstr &NewMI, MachineInstr &OldMI,
433
                         unsigned Num);
434
  MachineInstr *cloneInstr(MachineInstr *OldMI, unsigned CurStageNum,
435
                           unsigned InstStageNum);
436
  MachineInstr *cloneAndChangeInstr(MachineInstr *OldMI, unsigned CurStageNum,
437
                                    unsigned InstStageNum,
438
                                    SMSchedule &Schedule);
439
  void updateInstruction(MachineInstr *NewMI, bool LastDef,
440
                         unsigned CurStageNum, unsigned InstStageNum,
441
                         SMSchedule &Schedule, ValueMapTy *VRMap);
442
  MachineInstr *findDefInLoop(unsigned Reg);
443
  unsigned getPrevMapVal(unsigned StageNum, unsigned PhiStage, unsigned LoopVal,
444
                         unsigned LoopStage, ValueMapTy *VRMap,
445
                         MachineBasicBlock *BB);
446
  void rewritePhiValues(MachineBasicBlock *NewBB, unsigned StageNum,
447
                        SMSchedule &Schedule, ValueMapTy *VRMap,
448
                        InstrMapTy &InstrMap);
449
  void rewriteScheduledInstr(MachineBasicBlock *BB, SMSchedule &Schedule,
450
                             InstrMapTy &InstrMap, unsigned CurStageNum,
451
                             unsigned PhiNum, MachineInstr *Phi,
452
                             unsigned OldReg, unsigned NewReg,
453
                             unsigned PrevReg = 0);
454
  bool canUseLastOffsetValue(MachineInstr *MI, unsigned &BasePos,
455
                             unsigned &OffsetPos, unsigned &NewBase,
456
                             int64_t &NewOffset);
457
  void postprocessDAG();
458
};
459
460
/// A NodeSet contains a set of SUnit DAG nodes with additional information
461
/// that assigns a priority to the set.
462
class NodeSet {
463
  SetVector<SUnit *> Nodes;
464
  bool HasRecurrence = false;
465
  unsigned RecMII = 0;
466
  int MaxMOV = 0;
467
  int MaxDepth = 0;
468
  unsigned Colocate = 0;
469
  SUnit *ExceedPressure = nullptr;
470
471
public:
472
  using iterator = SetVector<SUnit *>::const_iterator;
473
474
112
  NodeSet() = default;
475
254
  NodeSet(iterator S, iterator E) : Nodes(S, E), HasRecurrence(true) {}
476
477
490
  bool insert(SUnit *SU) { return Nodes.insert(SU); }
478
479
31
  void insert(iterator S, iterator E) { Nodes.insert(S, E); }
480
481
222
  template <typename UnaryPredicate> bool remove_if(UnaryPredicate P) {
482
222
    return Nodes.remove_if(P);
483
222
  }
484
485
5.35k
  unsigned count(SUnit *SU) const { return Nodes.count(SU); }
486
487
0
  bool hasRecurrence() { return HasRecurrence; };
488
489
464
  unsigned size() const { return Nodes.size(); }
490
491
1.00k
  bool empty() const { return Nodes.empty(); }
492
493
686
  SUnit *getNode(unsigned i) const { return Nodes[i]; };
494
495
260
  void setRecMII(unsigned mii) { RecMII = mii; };
496
497
14
  void setColocate(unsigned c) { Colocate = c; };
498
499
1
  void setExceedPressure(SUnit *SU) { ExceedPressure = SU; }
500
501
482
  bool isExceedSU(SUnit *SU) { return ExceedPressure == SU; }
502
503
179
  int compareRecMII(NodeSet &RHS) { return RecMII - RHS.RecMII; }
504
505
6
  int getRecMII() { return RecMII; }
506
507
  /// Summarize node functions for the entire node set.
508
210
  void computeNodeSetInfo(SwingSchedulerDAG *SSD) {
509
515
    for (SUnit *SU : *this) {
510
515
      MaxMOV = std::max(MaxMOV, SSD->getMOV(SU));
511
515
      MaxDepth = std::max(MaxDepth, SSD->getDepth(SU));
512
515
    }
513
210
  }
514
515
113
  void clear() {
516
113
    Nodes.clear();
517
113
    RecMII = 0;
518
113
    HasRecurrence = false;
519
113
    MaxMOV = 0;
520
113
    MaxDepth = 0;
521
113
    Colocate = 0;
522
113
    ExceedPressure = nullptr;
523
113
  }
524
525
918
  operator SetVector<SUnit *> &() { return Nodes; }
526
527
  /// Sort the node sets by importance. First, rank them by recurrence MII,
528
  /// then by mobility (least mobile done first), and finally by depth.
529
  /// Each node set may contain a colocate value which is used as the first
530
  /// tie breaker, if it's set.
531
155
  bool operator>(const NodeSet &RHS) const {
532
155
    if (
RecMII == RHS.RecMII155
) {
533
108
      if (
Colocate != 0 && 108
RHS.Colocate != 09
&&
Colocate != RHS.Colocate7
)
534
0
        return Colocate < RHS.Colocate;
535
108
      
if (108
MaxMOV == RHS.MaxMOV108
)
536
70
        return MaxDepth > RHS.MaxDepth;
537
38
      return MaxMOV < RHS.MaxMOV;
538
38
    }
539
47
    return RecMII > RHS.RecMII;
540
47
  }
541
542
0
  bool operator==(const NodeSet &RHS) const {
543
0
    return RecMII == RHS.RecMII && MaxMOV == RHS.MaxMOV &&
544
0
           MaxDepth == RHS.MaxDepth;
545
0
  }
546
547
0
  bool operator!=(const NodeSet &RHS) const { return !operator==(RHS); }
548
549
667
  iterator begin() { return Nodes.begin(); }
550
667
  iterator end() { return Nodes.end(); }
551
552
0
  void print(raw_ostream &os) const {
553
0
    os << "Num nodes " << size() << " rec " << RecMII << " mov " << MaxMOV
554
0
       << " depth " << MaxDepth << " col " << Colocate << "\n";
555
0
    for (const auto &I : Nodes)
556
0
      os << "   SU(" << I->NodeNum << ") " << *(I->getInstr());
557
0
    os << "\n";
558
0
  }
559
560
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
561
  LLVM_DUMP_METHOD void dump() const { print(dbgs()); }
562
#endif
563
};
564
565
/// This class repesents the scheduled code.  The main data structure is a
566
/// map from scheduled cycle to instructions.  During scheduling, the
567
/// data structure explicitly represents all stages/iterations.   When
568
/// the algorithm finshes, the schedule is collapsed into a single stage,
569
/// which represents instructions from different loop iterations.
570
///
571
/// The SMS algorithm allows negative values for cycles, so the first cycle
572
/// in the schedule is the smallest cycle value.
573
class SMSchedule {
574
private:
575
  /// Map from execution cycle to instructions.
576
  DenseMap<int, std::deque<SUnit *>> ScheduledInstrs;
577
578
  /// Map from instruction to execution cycle.
579
  std::map<SUnit *, int> InstrToCycle;
580
581
  /// Map for each register and the max difference between its uses and def.
582
  /// The first element in the pair is the max difference in stages. The
583
  /// second is true if the register defines a Phi value and loop value is
584
  /// scheduled before the Phi.
585
  std::map<unsigned, std::pair<unsigned, bool>> RegToStageDiff;
586
587
  /// Keep track of the first cycle value in the schedule.  It starts
588
  /// as zero, but the algorithm allows negative values.
589
  int FirstCycle = 0;
590
591
  /// Keep track of the last cycle value in the schedule.
592
  int LastCycle = 0;
593
594
  /// The initiation interval (II) for the schedule.
595
  int InitiationInterval = 0;
596
597
  /// Target machine information.
598
  const TargetSubtargetInfo &ST;
599
600
  /// Virtual register information.
601
  MachineRegisterInfo &MRI;
602
603
  std::unique_ptr<DFAPacketizer> Resources;
604
605
public:
606
  SMSchedule(MachineFunction *mf)
607
      : ST(mf->getSubtarget()), MRI(mf->getRegInfo()),
608
112
        Resources(ST.getInstrInfo()->CreateTargetScheduleState(ST)) {}
609
610
170
  void reset() {
611
170
    ScheduledInstrs.clear();
612
170
    InstrToCycle.clear();
613
170
    RegToStageDiff.clear();
614
170
    FirstCycle = 0;
615
170
    LastCycle = 0;
616
170
    InitiationInterval = 0;
617
170
  }
618
619
  /// Set the initiation interval for this schedule.
620
165
  void setInitiationInterval(int ii) { InitiationInterval = ii; }
621
622
  /// Return the first cycle in the completed schedule.  This
623
  /// can be a negative value.
624
1.90k
  int getFirstCycle() const { return FirstCycle; }
625
626
  /// Return the last cycle in the finalized schedule.
627
586
  int getFinalCycle() const { return FirstCycle + InitiationInterval - 1; }
628
629
  /// Return the cycle of the earliest scheduled instruction in the dependence
630
  /// chain.
631
  int earliestCycleInChain(const SDep &Dep);
632
633
  /// Return the cycle of the latest scheduled instruction in the dependence
634
  /// chain.
635
  int latestCycleInChain(const SDep &Dep);
636
637
  void computeStart(SUnit *SU, int *MaxEarlyStart, int *MinLateStart,
638
                    int *MinEnd, int *MaxStart, int II, SwingSchedulerDAG *DAG);
639
  bool insert(SUnit *SU, int StartCycle, int EndCycle, int II);
640
641
  /// Iterators for the cycle to instruction map.
642
  using sched_iterator = DenseMap<int, std::deque<SUnit *>>::iterator;
643
  using const_sched_iterator =
644
      DenseMap<int, std::deque<SUnit *>>::const_iterator;
645
646
  /// Return true if the instruction is scheduled at the specified stage.
647
1.32k
  bool isScheduledAtStage(SUnit *SU, unsigned StageNum) {
648
1.32k
    return (stageScheduled(SU) == (int)StageNum);
649
1.32k
  }
650
651
  /// Return the stage for a scheduled instruction.  Return -1 if
652
  /// the instruction has not been scheduled.
653
10.9k
  int stageScheduled(SUnit *SU) const {
654
10.9k
    std::map<SUnit *, int>::const_iterator it = InstrToCycle.find(SU);
655
10.9k
    if (it == InstrToCycle.end())
656
511
      return -1;
657
10.4k
    return (it->second - FirstCycle) / InitiationInterval;
658
10.4k
  }
659
660
  /// Return the cycle for a scheduled instruction. This function normalizes
661
  /// the first cycle to be 0.
662
2.83k
  unsigned cycleScheduled(SUnit *SU) const {
663
2.83k
    std::map<SUnit *, int>::const_iterator it = InstrToCycle.find(SU);
664
2.83k
    assert(it != InstrToCycle.end() && "Instruction hasn't been scheduled.");
665
2.83k
    return (it->second - FirstCycle) % InitiationInterval;
666
2.83k
  }
667
668
  /// Return the maximum stage count needed for this schedule.
669
3.65k
  unsigned getMaxStageCount() {
670
3.65k
    return (LastCycle - FirstCycle) / InitiationInterval;
671
3.65k
  }
672
673
  /// Return the max. number of stages/iterations that can occur between a
674
  /// register definition and its uses.
675
1.00k
  unsigned getStagesForReg(int Reg, unsigned CurStage) {
676
1.00k
    std::pair<unsigned, bool> Stages = RegToStageDiff[Reg];
677
1.00k
    if (
CurStage > getMaxStageCount() && 1.00k
Stages.first == 0546
&&
Stages.second244
)
678
2
      return 1;
679
1.00k
    return Stages.first;
680
1.00k
  }
681
682
  /// The number of stages for a Phi is a little different than other
683
  /// instructions. The minimum value computed in RegToStageDiff is 1
684
  /// because we assume the Phi is needed for at least 1 iteration.
685
  /// This is not the case if the loop value is scheduled prior to the
686
  /// Phi in the same stage.  This function returns the number of stages
687
  /// or iterations needed between the Phi definition and any uses.
688
172
  unsigned getStagesForPhi(int Reg) {
689
172
    std::pair<unsigned, bool> Stages = RegToStageDiff[Reg];
690
172
    if (Stages.second)
691
2
      return Stages.first;
692
170
    return Stages.first - 1;
693
170
  }
694
695
  /// Return the instructions that are scheduled at the specified cycle.
696
8.30k
  std::deque<SUnit *> &getInstructions(int cycle) {
697
8.30k
    return ScheduledInstrs[cycle];
698
8.30k
  }
699
700
  bool isValidSchedule(SwingSchedulerDAG *SSD);
701
  void finalizeSchedule(SwingSchedulerDAG *SSD);
702
  bool orderDependence(SwingSchedulerDAG *SSD, SUnit *SU,
703
                       std::deque<SUnit *> &Insts);
704
  bool isLoopCarried(SwingSchedulerDAG *SSD, MachineInstr &Phi);
705
  bool isLoopCarriedDefOfUse(SwingSchedulerDAG *SSD, MachineInstr *Inst,
706
                             MachineOperand &MO);
707
  void print(raw_ostream &os) const;
708
  void dump() const;
709
};
710
711
} // end anonymous namespace
712
713
unsigned SwingSchedulerDAG::Circuits::MaxPaths = 5;
714
char MachinePipeliner::ID = 0;
715
#ifndef NDEBUG
716
int MachinePipeliner::NumTries = 0;
717
#endif
718
char &llvm::MachinePipelinerID = MachinePipeliner::ID;
719
720
36.7k
INITIALIZE_PASS_BEGIN36.7k
(MachinePipeliner, DEBUG_TYPE,
721
36.7k
                      "Modulo Software Pipelining", false, false)
722
36.7k
INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
723
36.7k
INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo)
724
36.7k
INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
725
36.7k
INITIALIZE_PASS_DEPENDENCY(LiveIntervals)
726
36.7k
INITIALIZE_PASS_END(MachinePipeliner, DEBUG_TYPE,
727
                    "Modulo Software Pipelining", false, false)
728
729
/// The "main" function for implementing Swing Modulo Scheduling.
730
860
bool MachinePipeliner::runOnMachineFunction(MachineFunction &mf) {
731
860
  if (skipFunction(*mf.getFunction()))
732
0
    return false;
733
860
734
860
  
if (860
!EnableSWP860
)
735
7
    return false;
736
853
737
853
  
if (853
mf.getFunction()->getAttributes().hasAttribute(
738
853
          AttributeList::FunctionIndex, Attribute::OptimizeForSize) &&
739
15
      !EnableSWPOptSize.getPosition())
740
15
    return false;
741
838
742
838
  MF = &mf;
743
838
  MLI = &getAnalysis<MachineLoopInfo>();
744
838
  MDT = &getAnalysis<MachineDominatorTree>();
745
838
  TII = MF->getSubtarget().getInstrInfo();
746
838
  RegClassInfo.runOnMachineFunction(*MF);
747
838
748
838
  for (auto &L : *MLI)
749
182
    scheduleLoop(*L);
750
860
751
860
  return false;
752
860
}
753
754
/// Attempt to perform the SMS algorithm on the specified loop. This function is
755
/// the main entry point for the algorithm.  The function identifies candidate
756
/// loops, calculates the minimum initiation interval, and attempts to schedule
757
/// the loop.
758
196
bool MachinePipeliner::scheduleLoop(MachineLoop &L) {
759
196
  bool Changed = false;
760
196
  for (auto &InnerLoop : L)
761
14
    Changed |= scheduleLoop(*InnerLoop);
762
196
763
#ifndef NDEBUG
764
  // Stop trying after reaching the limit (if any).
765
  int Limit = SwpLoopLimit;
766
  if (Limit >= 0) {
767
    if (NumTries >= SwpLoopLimit)
768
      return Changed;
769
    NumTries++;
770
  }
771
#endif
772
773
196
  if (!canPipelineLoop(L))
774
83
    return Changed;
775
113
776
113
  ++NumTrytoPipeline;
777
113
778
113
  Changed = swingModuloScheduler(L);
779
113
780
113
  return Changed;
781
113
}
782
783
/// Return true if the loop can be software pipelined.  The algorithm is
784
/// restricted to loops with a single basic block.  Make sure that the
785
/// branch in the loop can be analyzed.
786
196
bool MachinePipeliner::canPipelineLoop(MachineLoop &L) {
787
196
  if (L.getNumBlocks() != 1)
788
33
    return false;
789
163
790
163
  // Check if the branch can't be understood because we can't do pipelining
791
163
  // if that's the case.
792
163
  LI.TBB = nullptr;
793
163
  LI.FBB = nullptr;
794
163
  LI.BrCond.clear();
795
163
  if (TII->analyzeBranch(*L.getHeader(), LI.TBB, LI.FBB, LI.BrCond))
796
0
    return false;
797
163
798
163
  LI.LoopInductionVar = nullptr;
799
163
  LI.LoopCompare = nullptr;
800
163
  if (TII->analyzeLoop(L, LI.LoopInductionVar, LI.LoopCompare))
801
49
    return false;
802
114
803
114
  
if (114
!L.getLoopPreheader()114
)
804
0
    return false;
805
114
806
114
  // If any of the Phis contain subregs, then we can't pipeline
807
114
  // because we don't know how to maintain subreg information in the
808
114
  // VMap structure.
809
114
  MachineBasicBlock *MBB = L.getHeader();
810
114
  for (MachineBasicBlock::iterator BBI = MBB->instr_begin(),
811
114
                                   BBE = MBB->getFirstNonPHI();
812
313
       
BBI != BBE313
;
++BBI199
)
813
598
    
for (unsigned i = 1; 200
i != BBI->getNumOperands()598
;
i += 2398
)
814
399
      
if (399
BBI->getOperand(i).getSubReg() != 0399
)
815
1
        return false;
816
114
817
113
  return true;
818
196
}
819
820
/// The SMS algorithm consists of the following main steps:
821
/// 1. Computation and analysis of the dependence graph.
822
/// 2. Ordering of the nodes (instructions).
823
/// 3. Attempt to Schedule the loop.
824
113
bool MachinePipeliner::swingModuloScheduler(MachineLoop &L) {
825
113
  assert(L.getBlocks().size() == 1 && "SMS works on single blocks only.");
826
113
827
113
  SwingSchedulerDAG SMS(*this, L, getAnalysis<LiveIntervals>(), RegClassInfo);
828
113
829
113
  MachineBasicBlock *MBB = L.getHeader();
830
113
  // The kernel should not include any terminator instructions.  These
831
113
  // will be added back later.
832
113
  SMS.startBlock(MBB);
833
113
834
113
  // Compute the number of 'real' instructions in the basic block by
835
113
  // ignoring terminators.
836
113
  unsigned size = MBB->size();
837
113
  for (MachineBasicBlock::iterator I = MBB->getFirstTerminator(),
838
113
                                   E = MBB->instr_end();
839
339
       
I != E339
;
++I, --size226
)
840
226
    ;
841
113
842
113
  SMS.enterRegion(MBB, MBB->begin(), MBB->getFirstTerminator(), size);
843
113
  SMS.schedule();
844
113
  SMS.exitRegion();
845
113
846
113
  SMS.finishBlock();
847
113
  return SMS.hasNewSchedule();
848
113
}
849
850
/// We override the schedule function in ScheduleDAGInstrs to implement the
851
/// scheduling part of the Swing Modulo Scheduling algorithm.
852
113
void SwingSchedulerDAG::schedule() {
853
113
  AliasAnalysis *AA = &Pass.getAnalysis<AAResultsWrapperPass>().getAAResults();
854
113
  buildSchedGraph(AA);
855
113
  addLoopCarriedDependences(AA);
856
113
  updatePhiDependences();
857
113
  Topo.InitDAGTopologicalSorting();
858
113
  postprocessDAG();
859
113
  changeDependences();
860
113
  DEBUG({
861
113
    for (unsigned su = 0, e = SUnits.size(); su != e; ++su)
862
113
      SUnits[su].dumpAll(this);
863
113
  });
864
113
865
113
  NodeSetType NodeSets;
866
113
  findCircuits(NodeSets);
867
113
868
113
  // Calculate the MII.
869
113
  unsigned ResMII = calculateResMII();
870
113
  unsigned RecMII = calculateRecMII(NodeSets);
871
113
872
113
  fuseRecs(NodeSets);
873
113
874
113
  // This flag is used for testing and can cause correctness problems.
875
113
  if (SwpIgnoreRecMII)
876
0
    RecMII = 0;
877
113
878
113
  MII = std::max(ResMII, RecMII);
879
113
  DEBUG(dbgs() << "MII = " << MII << " (rec=" << RecMII << ", res=" << ResMII
880
113
               << ")\n");
881
113
882
113
  // Can't schedule a loop without a valid MII.
883
113
  if (MII == 0)
884
0
    return;
885
113
886
113
  // Don't pipeline large loops.
887
113
  
if (113
SwpMaxMii != -1 && 113
(int)MII > SwpMaxMii113
)
888
1
    return;
889
112
890
112
  computeNodeFunctions(NodeSets);
891
112
892
112
  registerPressureFilter(NodeSets);
893
112
894
112
  colocateNodeSets(NodeSets);
895
112
896
112
  checkNodeSets(NodeSets);
897
112
898
112
  DEBUG({
899
112
    for (auto &I : NodeSets) {
900
112
      dbgs() << "  Rec NodeSet ";
901
112
      I.dump();
902
112
    }
903
112
  });
904
112
905
112
  std::sort(NodeSets.begin(), NodeSets.end(), std::greater<NodeSet>());
906
112
907
112
  groupRemainingNodes(NodeSets);
908
112
909
112
  removeDuplicateNodes(NodeSets);
910
112
911
112
  DEBUG({
912
112
    for (auto &I : NodeSets) {
913
112
      dbgs() << "  NodeSet ";
914
112
      I.dump();
915
112
    }
916
112
  });
917
112
918
112
  computeNodeOrder(NodeSets);
919
112
920
112
  SMSchedule Schedule(Pass.MF);
921
112
  Scheduled = schedulePipeline(Schedule);
922
112
923
112
  if (!Scheduled)
924
31
    return;
925
81
926
81
  unsigned numStages = Schedule.getMaxStageCount();
927
81
  // No need to generate pipeline if there are no overlapped iterations.
928
81
  if (numStages == 0)
929
0
    return;
930
81
931
81
  // Check that the maximum stage count is less than user-defined limit.
932
81
  
if (81
SwpMaxStages > -1 && 81
(int)numStages > SwpMaxStages81
)
933
0
    return;
934
81
935
81
  generatePipelinedLoop(Schedule);
936
81
  ++NumPipelined;
937
81
}
938
939
/// Clean up after the software pipeliner runs.
940
113
void SwingSchedulerDAG::finishBlock() {
941
113
  for (MachineInstr *I : NewMIs)
942
7
    MF.DeleteMachineInstr(I);
943
113
  NewMIs.clear();
944
113
945
113
  // Call the superclass.
946
113
  ScheduleDAGInstrs::finishBlock();
947
113
}
948
949
/// Return the register values for  the operands of a Phi instruction.
950
/// This function assume the instruction is a Phi.
951
static void getPhiRegs(MachineInstr &Phi, MachineBasicBlock *Loop,
952
1.22k
                       unsigned &InitVal, unsigned &LoopVal) {
953
1.22k
  assert(Phi.isPHI() && "Expecting a Phi.");
954
1.22k
955
1.22k
  InitVal = 0;
956
1.22k
  LoopVal = 0;
957
3.66k
  for (unsigned i = 1, e = Phi.getNumOperands(); 
i != e3.66k
;
i += 22.44k
)
958
2.44k
    
if (2.44k
Phi.getOperand(i + 1).getMBB() != Loop2.44k
)
959
1.22k
      InitVal = Phi.getOperand(i).getReg();
960
2.44k
    else
961
1.22k
      LoopVal = Phi.getOperand(i).getReg();
962
1.22k
963
1.22k
  assert(InitVal != 0 && LoopVal != 0 && "Unexpected Phi structure.");
964
1.22k
}
965
966
/// Return the Phi register value that comes from the incoming block.
967
127
static unsigned getInitPhiReg(MachineInstr &Phi, MachineBasicBlock *LoopBB) {
968
127
  for (unsigned i = 1, e = Phi.getNumOperands(); 
i != e127
;
i += 20
)
969
127
    
if (127
Phi.getOperand(i + 1).getMBB() != LoopBB127
)
970
127
      return Phi.getOperand(i).getReg();
971
0
  return 0;
972
127
}
973
974
/// Return the Phi register value that comes the the loop block.
975
1.51k
static unsigned getLoopPhiReg(MachineInstr &Phi, MachineBasicBlock *LoopBB) {
976
3.04k
  for (unsigned i = 1, e = Phi.getNumOperands(); 
i != e3.04k
;
i += 21.53k
)
977
3.02k
    
if (3.02k
Phi.getOperand(i + 1).getMBB() == LoopBB3.02k
)
978
1.49k
      return Phi.getOperand(i).getReg();
979
19
  return 0;
980
1.51k
}
981
982
/// Return true if SUb can be reached from SUa following the chain edges.
983
49
static bool isSuccOrder(SUnit *SUa, SUnit *SUb) {
984
49
  SmallPtrSet<SUnit *, 8> Visited;
985
49
  SmallVector<SUnit *, 8> Worklist;
986
49
  Worklist.push_back(SUa);
987
125
  while (
!Worklist.empty()125
) {
988
99
    const SUnit *SU = Worklist.pop_back_val();
989
122
    for (auto &SI : SU->Succs) {
990
122
      SUnit *SuccSU = SI.getSUnit();
991
122
      if (
SI.getKind() == SDep::Order122
) {
992
73
        if (Visited.count(SuccSU))
993
0
          continue;
994
73
        
if (73
SuccSU == SUb73
)
995
23
          return true;
996
50
        Worklist.push_back(SuccSU);
997
50
        Visited.insert(SuccSU);
998
50
      }
999
122
    }
1000
99
  }
1001
26
  return false;
1002
49
}
1003
1004
/// Return true if the instruction causes a chain between memory
1005
/// references before and after it.
1006
1.36k
static bool isDependenceBarrier(MachineInstr &MI, AliasAnalysis *AA) {
1007
1.36k
  return MI.isCall() || MI.hasUnmodeledSideEffects() ||
1008
1.16k
         (MI.hasOrderedMemoryRef() &&
1009
1.16k
          
(!MI.mayLoad() || 8
!MI.isDereferenceableInvariantLoad(AA)8
));
1010
1.36k
}
1011
1012
/// Return the underlying objects for the memory references of an instruction.
1013
/// This function calls the code in ValueTracking, but first checks that the
1014
/// instruction has a memory operand.
1015
static void getUnderlyingObjects(MachineInstr *MI,
1016
                                 SmallVectorImpl<Value *> &Objs,
1017
766
                                 const DataLayout &DL) {
1018
766
  if (!MI->hasOneMemOperand())
1019
40
    return;
1020
726
  MachineMemOperand *MM = *MI->memoperands_begin();
1021
726
  if (!MM->getValue())
1022
526
    return;
1023
200
  GetUnderlyingObjects(const_cast<Value *>(MM->getValue()), Objs, DL);
1024
200
}
1025
1026
/// Add a chain edge between a load and store if the store can be an
1027
/// alias of the load on a subsequent iteration, i.e., a loop carried
1028
/// dependence. This code is very similar to the code in ScheduleDAGInstrs
1029
/// but that code doesn't create loop carried dependences.
1030
113
void SwingSchedulerDAG::addLoopCarriedDependences(AliasAnalysis *AA) {
1031
113
  MapVector<Value *, SmallVector<SUnit *, 4>> PendingLoads;
1032
1.36k
  for (auto &SU : SUnits) {
1033
1.36k
    MachineInstr &MI = *SU.getInstr();
1034
1.36k
    if (isDependenceBarrier(MI, AA))
1035
207
      PendingLoads.clear();
1036
1.15k
    else 
if (1.15k
MI.mayLoad()1.15k
) {
1037
419
      SmallVector<Value *, 4> Objs;
1038
419
      getUnderlyingObjects(&MI, Objs, MF.getDataLayout());
1039
121
      for (auto V : Objs) {
1040
121
        SmallVector<SUnit *, 4> &SUs = PendingLoads[V];
1041
121
        SUs.push_back(&SU);
1042
121
      }
1043
1.15k
    } else 
if (735
MI.mayStore()735
) {
1044
347
      SmallVector<Value *, 4> Objs;
1045
347
      getUnderlyingObjects(&MI, Objs, MF.getDataLayout());
1046
79
      for (auto V : Objs) {
1047
79
        MapVector<Value *, SmallVector<SUnit *, 4>>::iterator I =
1048
79
            PendingLoads.find(V);
1049
79
        if (I == PendingLoads.end())
1050
56
          continue;
1051
23
        
for (auto Load : I->second) 23
{
1052
49
          if (isSuccOrder(Load, &SU))
1053
23
            continue;
1054
26
          MachineInstr &LdMI = *Load->getInstr();
1055
26
          // First, perform the cheaper check that compares the base register.
1056
26
          // If they are the same and the load offset is less than the store
1057
26
          // offset, then mark the dependence as loop carried potentially.
1058
26
          unsigned BaseReg1, BaseReg2;
1059
26
          int64_t Offset1, Offset2;
1060
26
          if (!TII->getMemOpBaseRegImmOfs(LdMI, BaseReg1, Offset1, TRI) ||
1061
26
              
!TII->getMemOpBaseRegImmOfs(MI, BaseReg2, Offset2, TRI)26
) {
1062
0
            SU.addPred(SDep(Load, SDep::Barrier));
1063
0
            continue;            
1064
0
          }
1065
26
          
if (26
BaseReg1 == BaseReg2 && 26
(int)Offset1 < (int)Offset226
) {
1066
13
            assert(TII->areMemAccessesTriviallyDisjoint(LdMI, MI, AA) &&
1067
13
                   "What happened to the chain edge?");
1068
13
            SU.addPred(SDep(Load, SDep::Barrier));
1069
13
            continue;
1070
13
          }
1071
13
          // Second, the more expensive check that uses alias analysis on the
1072
13
          // base registers. If they alias, and the load offset is less than
1073
13
          // the store offset, the mark the dependence as loop carried.
1074
13
          
if (13
!AA13
) {
1075
0
            SU.addPred(SDep(Load, SDep::Barrier));
1076
0
            continue;
1077
0
          }
1078
13
          MachineMemOperand *MMO1 = *LdMI.memoperands_begin();
1079
13
          MachineMemOperand *MMO2 = *MI.memoperands_begin();
1080
13
          if (
!MMO1->getValue() || 13
!MMO2->getValue()13
) {
1081
0
            SU.addPred(SDep(Load, SDep::Barrier));
1082
0
            continue;
1083
0
          }
1084
13
          
if (13
MMO1->getValue() == MMO2->getValue() &&
1085
13
              
MMO1->getOffset() <= MMO2->getOffset()13
) {
1086
0
            SU.addPred(SDep(Load, SDep::Barrier));
1087
0
            continue;
1088
0
          }
1089
13
          AliasResult AAResult = AA->alias(
1090
13
              MemoryLocation(MMO1->getValue(), MemoryLocation::UnknownSize,
1091
13
                             MMO1->getAAInfo()),
1092
13
              MemoryLocation(MMO2->getValue(), MemoryLocation::UnknownSize,
1093
13
                             MMO2->getAAInfo()));
1094
13
1095
13
          if (AAResult != NoAlias)
1096
13
            SU.addPred(SDep(Load, SDep::Barrier));
1097
49
        }
1098
79
      }
1099
1.15k
    }
1100
1.36k
  }
1101
113
}
1102
1103
/// Update the phi dependences to the DAG because ScheduleDAGInstrs no longer
1104
/// processes dependences for PHIs. This function adds true dependences
1105
/// from a PHI to a use, and a loop carried dependence from the use to the
1106
/// PHI. The loop carried dependence is represented as an anti dependence
1107
/// edge. This function also removes chain dependences between unrelated
1108
/// PHIs.
1109
113
void SwingSchedulerDAG::updatePhiDependences() {
1110
113
  SmallVector<SDep, 4> RemoveDeps;
1111
113
  const TargetSubtargetInfo &ST = MF.getSubtarget<TargetSubtargetInfo>();
1112
113
1113
113
  // Iterate over each DAG node.
1114
1.36k
  for (SUnit &I : SUnits) {
1115
1.36k
    RemoveDeps.clear();
1116
1.36k
    // Set to true if the instruction has an operand defined by a Phi.
1117
1.36k
    unsigned HasPhiUse = 0;
1118
1.36k
    unsigned HasPhiDef = 0;
1119
1.36k
    MachineInstr *MI = I.getInstr();
1120
1.36k
    // Iterate over each operand, and we process the definitions.
1121
1.36k
    for (MachineInstr::mop_iterator MOI = MI->operands_begin(),
1122
1.36k
                                    MOE = MI->operands_end();
1123
6.05k
         
MOI != MOE6.05k
;
++MOI4.69k
) {
1124
4.69k
      if (!MOI->isReg())
1125
1.88k
        continue;
1126
2.81k
      unsigned Reg = MOI->getReg();
1127
2.81k
      if (
MOI->isDef()2.81k
) {
1128
1.05k
        // If the register is used by a Phi, then create an anti dependence.
1129
1.05k
        for (MachineRegisterInfo::use_instr_iterator
1130
1.05k
                 UI = MRI.use_instr_begin(Reg),
1131
1.05k
                 UE = MRI.use_instr_end();
1132
2.40k
             
UI != UE2.40k
;
++UI1.35k
) {
1133
1.35k
          MachineInstr *UseMI = &*UI;
1134
1.35k
          SUnit *SU = getSUnit(UseMI);
1135
1.35k
          if (
SU != nullptr && 1.35k
UseMI->isPHI()1.32k
) {
1136
195
            if (
!MI->isPHI()195
) {
1137
189
              SDep Dep(SU, SDep::Anti, Reg);
1138
189
              I.addPred(Dep);
1139
195
            } else {
1140
6
              HasPhiDef = Reg;
1141
6
              // Add a chain edge to a dependent Phi that isn't an existing
1142
6
              // predecessor.
1143
6
              if (
SU->NodeNum < I.NodeNum && 6
!I.isPred(SU)1
)
1144
0
                I.addPred(SDep(SU, SDep::Barrier));
1145
6
            }
1146
195
          }
1147
1.35k
        }
1148
2.81k
      } else 
if (1.75k
MOI->isUse()1.75k
) {
1149
1.75k
        // If the register is defined by a Phi, then create a true dependence.
1150
1.75k
        MachineInstr *DefMI = MRI.getUniqueVRegDef(Reg);
1151
1.75k
        if (DefMI == nullptr)
1152
12
          continue;
1153
1.74k
        SUnit *SU = getSUnit(DefMI);
1154
1.74k
        if (
SU != nullptr && 1.74k
DefMI->isPHI()1.32k
) {
1155
386
          if (
!MI->isPHI()386
) {
1156
380
            SDep Dep(SU, SDep::Data, Reg);
1157
380
            Dep.setLatency(0);
1158
380
            ST.adjustSchedDependency(SU, &I, Dep);
1159
380
            I.addPred(Dep);
1160
386
          } else {
1161
6
            HasPhiUse = Reg;
1162
6
            // Add a chain edge to a dependent Phi that isn't an existing
1163
6
            // predecessor.
1164
6
            if (
SU->NodeNum < I.NodeNum && 6
!I.isPred(SU)5
)
1165
0
              I.addPred(SDep(SU, SDep::Barrier));
1166
6
          }
1167
386
        }
1168
1.75k
      }
1169
4.69k
    }
1170
1.36k
    // Remove order dependences from an unrelated Phi.
1171
1.36k
    if (!SwpPruneDeps)
1172
0
      continue;
1173
1.36k
    
for (auto &PI : I.Preds) 1.36k
{
1174
88.9k
      MachineInstr *PMI = PI.getSUnit()->getInstr();
1175
88.9k
      if (
PMI->isPHI() && 88.9k
PI.getKind() == SDep::Order1.40k
) {
1176
835
        if (
I.getInstr()->isPHI()835
) {
1177
85
          if (PMI->getOperand(0).getReg() == HasPhiUse)
1178
5
            continue;
1179
80
          
if (80
getLoopPhiReg(*PMI, PMI->getParent()) == HasPhiDef80
)
1180
1
            continue;
1181
829
        }
1182
829
        RemoveDeps.push_back(PI);
1183
829
      }
1184
88.9k
    }
1185
2.19k
    for (int i = 0, e = RemoveDeps.size(); 
i != e2.19k
;
++i829
)
1186
829
      I.removePred(RemoveDeps[i]);
1187
1.36k
  }
1188
113
}
1189
1190
/// Iterate over each DAG node and see if we can change any dependences
1191
/// in order to reduce the recurrence MII.
1192
113
void SwingSchedulerDAG::changeDependences() {
1193
113
  // See if an instruction can use a value from the previous iteration.
1194
113
  // If so, we update the base and offset of the instruction and change
1195
113
  // the dependences.
1196
1.36k
  for (SUnit &I : SUnits) {
1197
1.36k
    unsigned BasePos = 0, OffsetPos = 0, NewBase = 0;
1198
1.36k
    int64_t NewOffset = 0;
1199
1.36k
    if (!canUseLastOffsetValue(I.getInstr(), BasePos, OffsetPos, NewBase,
1200
1.36k
                               NewOffset))
1201
1.35k
      continue;
1202
10
1203
10
    // Get the MI and SUnit for the instruction that defines the original base.
1204
10
    unsigned OrigBase = I.getInstr()->getOperand(BasePos).getReg();
1205
10
    MachineInstr *DefMI = MRI.getUniqueVRegDef(OrigBase);
1206
10
    if (!DefMI)
1207
0
      continue;
1208
10
    SUnit *DefSU = getSUnit(DefMI);
1209
10
    if (!DefSU)
1210
0
      continue;
1211
10
    // Get the MI and SUnit for the instruction that defins the new base.
1212
10
    MachineInstr *LastMI = MRI.getUniqueVRegDef(NewBase);
1213
10
    if (!LastMI)
1214
0
      continue;
1215
10
    SUnit *LastSU = getSUnit(LastMI);
1216
10
    if (!LastSU)
1217
0
      continue;
1218
10
1219
10
    
if (10
Topo.IsReachable(&I, LastSU)10
)
1220
1
      continue;
1221
9
1222
9
    // Remove the dependence. The value now depends on a prior iteration.
1223
9
    SmallVector<SDep, 4> Deps;
1224
18
    for (SUnit::pred_iterator P = I.Preds.begin(), E = I.Preds.end(); P != E;
1225
9
         ++P)
1226
9
      
if (9
P->getSUnit() == DefSU9
)
1227
9
        Deps.push_back(*P);
1228
18
    for (int i = 0, e = Deps.size(); 
i != e18
;
i++9
) {
1229
9
      Topo.RemovePred(&I, Deps[i].getSUnit());
1230
9
      I.removePred(Deps[i]);
1231
9
    }
1232
9
    // Remove the chain dependence between the instructions.
1233
9
    Deps.clear();
1234
9
    for (auto &P : LastSU->Preds)
1235
38
      
if (38
P.getSUnit() == &I && 38
P.getKind() == SDep::Order9
)
1236
9
        Deps.push_back(P);
1237
18
    for (int i = 0, e = Deps.size(); 
i != e18
;
i++9
) {
1238
9
      Topo.RemovePred(LastSU, Deps[i].getSUnit());
1239
9
      LastSU->removePred(Deps[i]);
1240
9
    }
1241
1.36k
1242
1.36k
    // Add a dependence between the new instruction and the instruction
1243
1.36k
    // that defines the new base.
1244
1.36k
    SDep Dep(&I, SDep::Anti, NewBase);
1245
1.36k
    LastSU->addPred(Dep);
1246
1.36k
1247
1.36k
    // Remember the base and offset information so that we can update the
1248
1.36k
    // instruction during code generation.
1249
1.36k
    InstrChanges[&I] = std::make_pair(NewBase, NewOffset);
1250
1.36k
  }
1251
113
}
1252
1253
namespace {
1254
1255
// FuncUnitSorter - Comparison operator used to sort instructions by
1256
// the number of functional unit choices.
1257
struct FuncUnitSorter {
1258
  const InstrItineraryData *InstrItins;
1259
  DenseMap<unsigned, unsigned> Resources;
1260
1261
113
  FuncUnitSorter(const InstrItineraryData *IID) : InstrItins(IID) {}
1262
1263
  // Compute the number of functional unit alternatives needed
1264
  // at each stage, and take the minimum value. We prioritize the
1265
  // instructions by the least number of choices first.
1266
21.1k
  unsigned minFuncUnits(const MachineInstr *Inst, unsigned &F) const {
1267
21.1k
    unsigned schedClass = Inst->getDesc().getSchedClass();
1268
21.1k
    unsigned min = UINT_MAX;
1269
21.1k
    for (const InstrStage *IS = InstrItins->beginStage(schedClass),
1270
21.1k
                          *IE = InstrItins->endStage(schedClass);
1271
44.9k
         
IS != IE44.9k
;
++IS23.7k
) {
1272
23.7k
      unsigned funcUnits = IS->getUnits();
1273
23.7k
      unsigned numAlternatives = countPopulation(funcUnits);
1274
23.7k
      if (
numAlternatives < min23.7k
) {
1275
21.9k
        min = numAlternatives;
1276
21.9k
        F = funcUnits;
1277
21.9k
      }
1278
23.7k
    }
1279
21.1k
    return min;
1280
21.1k
  }
1281
1282
  // Compute the critical resources needed by the instruction. This
1283
  // function records the functional units needed by instructions that
1284
  // must use only one functional unit. We use this as a tie breaker
1285
  // for computing the resource MII. The instrutions that require
1286
  // the same, highly used, functional unit have high priority.
1287
1.16k
  void calcCriticalResources(MachineInstr &MI) {
1288
1.16k
    unsigned SchedClass = MI.getDesc().getSchedClass();
1289
1.16k
    for (const InstrStage *IS = InstrItins->beginStage(SchedClass),
1290
1.16k
                          *IE = InstrItins->endStage(SchedClass);
1291
2.54k
         
IS != IE2.54k
;
++IS1.37k
) {
1292
1.37k
      unsigned FuncUnits = IS->getUnits();
1293
1.37k
      if (countPopulation(FuncUnits) == 1)
1294
197
        Resources[FuncUnits]++;
1295
1.37k
    }
1296
1.16k
  }
1297
1298
  /// Return true if IS1 has less priority than IS2.
1299
10.5k
  bool operator()(const MachineInstr *IS1, const MachineInstr *IS2) const {
1300
10.5k
    unsigned F1 = 0, F2 = 0;
1301
10.5k
    unsigned MFUs1 = minFuncUnits(IS1, F1);
1302
10.5k
    unsigned MFUs2 = minFuncUnits(IS2, F2);
1303
10.5k
    if (
MFUs1 == 1 && 10.5k
MFUs2 == 1837
)
1304
428
      return Resources.lookup(F1) < Resources.lookup(F2);
1305
10.1k
    return MFUs1 > MFUs2;
1306
10.1k
  }
1307
};
1308
1309
} // end anonymous namespace
1310
1311
/// Calculate the resource constrained minimum initiation interval for the
1312
/// specified loop. We use the DFA to model the resources needed for
1313
/// each instruction, and we ignore dependences. A different DFA is created
1314
/// for each cycle that is required. When adding a new instruction, we attempt
1315
/// to add it to each existing DFA, until a legal space is found. If the
1316
/// instruction cannot be reserved in an existing DFA, we create a new one.
1317
113
unsigned SwingSchedulerDAG::calculateResMII() {
1318
113
  SmallVector<DFAPacketizer *, 8> Resources;
1319
113
  MachineBasicBlock *MBB = Loop.getHeader();
1320
113
  Resources.push_back(TII->CreateTargetScheduleState(MF.getSubtarget()));
1321
113
1322
113
  // Sort the instructions by the number of available choices for scheduling,
1323
113
  // least to most. Use the number of critical resources as the tie breaker.
1324
113
  FuncUnitSorter FUS =
1325
113
      FuncUnitSorter(MF.getSubtarget().getInstrItineraryData());
1326
113
  for (MachineBasicBlock::iterator I = MBB->getFirstNonPHI(),
1327
113
                                   E = MBB->getFirstTerminator();
1328
1.27k
       
I != E1.27k
;
++I1.16k
)
1329
1.16k
    FUS.calcCriticalResources(*I);
1330
113
  PriorityQueue<MachineInstr *, std::vector<MachineInstr *>, FuncUnitSorter>
1331
113
      FuncUnitOrder(FUS);
1332
113
1333
113
  for (MachineBasicBlock::iterator I = MBB->getFirstNonPHI(),
1334
113
                                   E = MBB->getFirstTerminator();
1335
1.27k
       
I != E1.27k
;
++I1.16k
)
1336
1.16k
    FuncUnitOrder.push(&*I);
1337
113
1338
1.27k
  while (
!FuncUnitOrder.empty()1.27k
) {
1339
1.16k
    MachineInstr *MI = FuncUnitOrder.top();
1340
1.16k
    FuncUnitOrder.pop();
1341
1.16k
    if (TII->isZeroCost(MI->getOpcode()))
1342
20
      continue;
1343
1.14k
    // Attempt to reserve the instruction in an existing DFA. At least one
1344
1.14k
    // DFA is needed for each cycle.
1345
1.14k
    unsigned NumCycles = getSUnit(MI)->Latency;
1346
1.14k
    unsigned ReservedCycles = 0;
1347
1.14k
    SmallVectorImpl<DFAPacketizer *>::iterator RI = Resources.begin();
1348
1.14k
    SmallVectorImpl<DFAPacketizer *>::iterator RE = Resources.end();
1349
2.29k
    for (unsigned C = 0; 
C < NumCycles2.29k
;
++C1.14k
)
1350
72.3k
      
while (1.14k
RI != RE72.3k
) {
1351
72.0k
        if (
(*RI++)->canReserveResources(*MI)72.0k
) {
1352
790
          ++ReservedCycles;
1353
790
          break;
1354
790
        }
1355
1.14k
      }
1356
1.14k
    // Start reserving resources using existing DFAs.
1357
1.93k
    for (unsigned C = 0; 
C < ReservedCycles1.93k
;
++C790
) {
1358
790
      --RI;
1359
790
      (*RI)->reserveResources(*MI);
1360
790
    }
1361
1.14k
    // Add new DFAs, if needed, to reserve resources.
1362
1.50k
    for (unsigned C = ReservedCycles; 
C < NumCycles1.50k
;
++C355
) {
1363
355
      DFAPacketizer *NewResource =
1364
355
          TII->CreateTargetScheduleState(MF.getSubtarget());
1365
355
      assert(NewResource->canReserveResources(*MI) && "Reserve error.");
1366
355
      NewResource->reserveResources(*MI);
1367
355
      Resources.push_back(NewResource);
1368
355
    }
1369
1.16k
  }
1370
113
  int Resmii = Resources.size();
1371
113
  // Delete the memory for each of the DFAs that were created earlier.
1372
468
  for (DFAPacketizer *RI : Resources) {
1373
468
    DFAPacketizer *D = RI;
1374
468
    delete D;
1375
468
  }
1376
113
  Resources.clear();
1377
113
  return Resmii;
1378
113
}
1379
1380
/// Calculate the recurrence-constrainted minimum initiation interval.
1381
/// Iterate over each circuit.  Compute the delay(c) and distance(c)
1382
/// for each circuit. The II needs to satisfy the inequality
1383
/// delay(c) - II*distance(c) <= 0. For each circuit, choose the smallest
1384
/// II that satistifies the inequality, and the RecMII is the maximum
1385
/// of those values.
1386
113
unsigned SwingSchedulerDAG::calculateRecMII(NodeSetType &NodeSets) {
1387
113
  unsigned RecMII = 0;
1388
113
1389
254
  for (NodeSet &Nodes : NodeSets) {
1390
254
    if (Nodes.empty())
1391
0
      continue;
1392
254
1393
254
    unsigned Delay = Nodes.size() - 1;
1394
254
    unsigned Distance = 1;
1395
254
1396
254
    // ii = ceil(delay / distance)
1397
254
    unsigned CurMII = (Delay + Distance - 1) / Distance;
1398
254
    Nodes.setRecMII(CurMII);
1399
254
    if (CurMII > RecMII)
1400
135
      RecMII = CurMII;
1401
254
  }
1402
113
1403
113
  return RecMII;
1404
113
}
1405
1406
/// Swap all the anti dependences in the DAG. That means it is no longer a DAG,
1407
/// but we do this to find the circuits, and then change them back.
1408
226
static void swapAntiDependences(std::vector<SUnit> &SUnits) {
1409
226
  SmallVector<std::pair<SUnit *, SDep>, 8> DepsAdded;
1410
2.94k
  for (unsigned i = 0, e = SUnits.size(); 
i != e2.94k
;
++i2.72k
) {
1411
2.72k
    SUnit *SU = &SUnits[i];
1412
2.72k
    for (SUnit::pred_iterator IP = SU->Preds.begin(), EP = SU->Preds.end();
1413
178k
         
IP != EP178k
;
++IP176k
) {
1414
176k
      if (IP->getKind() != SDep::Anti)
1415
175k
        continue;
1416
396
      DepsAdded.push_back(std::make_pair(SU, *IP));
1417
396
    }
1418
2.72k
  }
1419
226
  for (SmallVector<std::pair<SUnit *, SDep>, 8>::iterator I = DepsAdded.begin(),
1420
226
                                                          E = DepsAdded.end();
1421
622
       
I != E622
;
++I396
) {
1422
396
    // Remove this anti dependency and add one in the reverse direction.
1423
396
    SUnit *SU = I->first;
1424
396
    SDep &D = I->second;
1425
396
    SUnit *TargetSU = D.getSUnit();
1426
396
    unsigned Reg = D.getReg();
1427
396
    unsigned Lat = D.getLatency();
1428
396
    SU->removePred(D);
1429
396
    SDep Dep(SU, SDep::Anti, Reg);
1430
396
    Dep.setLatency(Lat);
1431
396
    TargetSU->addPred(Dep);
1432
396
  }
1433
226
}
1434
1435
/// Create the adjacency structure of the nodes in the graph.
1436
void SwingSchedulerDAG::Circuits::createAdjacencyStructure(
1437
113
    SwingSchedulerDAG *DAG) {
1438
113
  BitVector Added(SUnits.size());
1439
1.47k
  for (int i = 0, e = SUnits.size(); 
i != e1.47k
;
++i1.36k
) {
1440
1.36k
    Added.reset();
1441
1.36k
    // Add any successor to the adjacency matrix and exclude duplicates.
1442
88.0k
    for (auto &SI : SUnits[i].Succs) {
1443
88.0k
      // Do not process a boundary node and a back-edge is processed only
1444
88.0k
      // if it goes to a Phi.
1445
88.0k
      if (SI.getSUnit()->isBoundaryNode() ||
1446
88.0k
          
(SI.getKind() == SDep::Anti && 88.0k
!SI.getSUnit()->getInstr()->isPHI()198
))
1447
9
        continue;
1448
88.0k
      int N = SI.getSUnit()->NodeNum;
1449
88.0k
      if (
!Added.test(N)88.0k
) {
1450
87.7k
        AdjK[i].push_back(N);
1451
87.7k
        Added.set(N);
1452
87.7k
      }
1453
88.0k
    }
1454
1.36k
    // A chain edge between a store and a load is treated as a back-edge in the
1455
1.36k
    // adjacency matrix.
1456
88.0k
    for (auto &PI : SUnits[i].Preds) {
1457
88.0k
      if (!SUnits[i].getInstr()->mayStore() ||
1458
52.9k
          !DAG->isLoopCarriedOrder(&SUnits[i], PI, false))
1459
53.5k
        continue;
1460
34.5k
      
if (34.5k
PI.getKind() == SDep::Order && 34.5k
PI.getSUnit()->getInstr()->mayLoad()34.5k
) {
1461
34.5k
        int N = PI.getSUnit()->NodeNum;
1462
34.5k
        if (
!Added.test(N)34.5k
) {
1463
34.5k
          AdjK[i].push_back(N);
1464
34.5k
          Added.set(N);
1465
34.5k
        }
1466
34.5k
      }
1467
88.0k
    }
1468
1.36k
  }
1469
113
}
1470
1471
/// Identify an elementary circuit in the dependence graph starting at the
1472
/// specified node.
1473
bool SwingSchedulerDAG::Circuits::circuit(int V, int S, NodeSetType &NodeSets,
1474
148k
                                          bool HasBackedge) {
1475
148k
  SUnit *SV = &SUnits[V];
1476
148k
  bool F = false;
1477
148k
  Stack.insert(SV);
1478
148k
  Blocked.set(V);
1479
148k
1480
18.5M
  for (auto W : AdjK[V]) {
1481
18.5M
    if (NumPaths > MaxPaths)
1482
67.3k
      break;
1483
18.5M
    
if (18.5M
W < S18.5M
)
1484
3.06M
      continue;
1485
15.4M
    
if (15.4M
W == S15.4M
) {
1486
1.88k
      if (!HasBackedge)
1487
254
        NodeSets.push_back(NodeSet(Stack.begin(), Stack.end()));
1488
1.88k
      F = true;
1489
1.88k
      ++NumPaths;
1490
1.88k
      break;
1491
15.4M
    } else 
if (15.4M
!Blocked.test(W)15.4M
) {
1492
147k
      if (
circuit(W, S, NodeSets, W < V ? 147k
true70.7k
:
HasBackedge76.2k
))
1493
70.6k
        F = true;
1494
15.4M
    }
1495
18.5M
  }
1496
148k
1497
148k
  if (F)
1498
71.1k
    unblock(V);
1499
77.2k
  else {
1500
13.9M
    for (auto W : AdjK[V]) {
1501
13.9M
      if (W < S)
1502
3.06M
        continue;
1503
10.9M
      
if (10.9M
B[W].count(SV) == 010.9M
)
1504
10.9M
        B[W].insert(SV);
1505
13.9M
    }
1506
77.2k
  }
1507
148k
  Stack.pop_back();
1508
148k
  return F;
1509
148k
}
1510
1511
/// Unblock a node in the circuit finding algorithm.
1512
72.0k
void SwingSchedulerDAG::Circuits::unblock(int U) {
1513
72.0k
  Blocked.reset(U);
1514
72.0k
  SmallPtrSet<SUnit *, 4> &BU = B[U];
1515
124k
  while (
!BU.empty()124k
) {
1516
52.2k
    SmallPtrSet<SUnit *, 4>::iterator SI = BU.begin();
1517
52.2k
    assert(SI != BU.end() && "Invalid B set.");
1518
52.2k
    SUnit *W = *SI;
1519
52.2k
    BU.erase(W);
1520
52.2k
    if (Blocked.test(W->NodeNum))
1521
972
      unblock(W->NodeNum);
1522
52.2k
  }
1523
72.0k
}
1524
1525
/// Identify all the elementary circuits in the dependence graph using
1526
/// Johnson's circuit algorithm.
1527
113
void SwingSchedulerDAG::findCircuits(NodeSetType &NodeSets) {
1528
113
  // Swap all the anti dependences in the DAG. That means it is no longer a DAG,
1529
113
  // but we do this to find the circuits, and then change them back.
1530
113
  swapAntiDependences(SUnits);
1531
113
1532
113
  Circuits Cir(SUnits);
1533
113
  // Create the adjacency structure.
1534
113
  Cir.createAdjacencyStructure(this);
1535
1.47k
  for (int i = 0, e = SUnits.size(); 
i != e1.47k
;
++i1.36k
) {
1536
1.36k
    Cir.reset();
1537
1.36k
    Cir.circuit(i, i, NodeSets);
1538
1.36k
  }
1539
113
1540
113
  // Change the dependences back so that we've created a DAG again.
1541
113
  swapAntiDependences(SUnits);
1542
113
}
1543
1544
/// Return true for DAG nodes that we ignore when computing the cost functions.
1545
/// We ignore the back-edge recurrence in order to avoid unbounded recurison
1546
/// in the calculation of the ASAP, ALAP, etc functions.
1547
11.4k
static bool ignoreDependence(const SDep &D, bool isPred) {
1548
11.4k
  if (D.isArtificial())
1549
0
    return true;
1550
11.4k
  
return D.getKind() == SDep::Anti && 11.4k
isPred1.77k
;
1551
11.4k
}
1552
1553
/// Compute several functions need to order the nodes for scheduling.
1554
///  ASAP - Earliest time to schedule a node.
1555
///  ALAP - Latest time to schedule a node.
1556
///  MOV - Mobility function, difference between ALAP and ASAP.
1557
///  D - Depth of each node.
1558
///  H - Height of each node.
1559
112
void SwingSchedulerDAG::computeNodeFunctions(NodeSetType &NodeSets) {
1560
112
  ScheduleInfo.resize(SUnits.size());
1561
112
1562
112
  DEBUG({
1563
112
    for (ScheduleDAGTopologicalSort::const_iterator I = Topo.begin(),
1564
112
                                                    E = Topo.end();
1565
112
         I != E; ++I) {
1566
112
      SUnit *SU = &SUnits[*I];
1567
112
      SU->dump(this);
1568
112
    }
1569
112
  });
1570
112
1571
112
  int maxASAP = 0;
1572
112
  // Compute ASAP.
1573
112
  for (ScheduleDAGTopologicalSort::const_iterator I = Topo.begin(),
1574
112
                                                  E = Topo.end();
1575
938
       
I != E938
;
++I826
) {
1576
826
    int asap = 0;
1577
826
    SUnit *SU = &SUnits[*I];
1578
826
    for (SUnit::const_pred_iterator IP = SU->Preds.begin(),
1579
826
                                    EP = SU->Preds.end();
1580
2.05k
         
IP != EP2.05k
;
++IP1.22k
) {
1581
1.22k
      if (ignoreDependence(*IP, true))
1582
195
        continue;
1583
1.03k
      SUnit *pred = IP->getSUnit();
1584
1.03k
      asap = std::max(asap, (int)(getASAP(pred) + getLatency(SU, *IP) -
1585
1.03k
                                  getDistance(pred, SU, *IP) * MII));
1586
1.03k
    }
1587
826
    maxASAP = std::max(maxASAP, asap);
1588
826
    ScheduleInfo[*I].ASAP = asap;
1589
826
  }
1590
112
1591
112
  // Compute ALAP and MOV.
1592
112
  for (ScheduleDAGTopologicalSort::const_reverse_iterator I = Topo.rbegin(),
1593
112
                                                          E = Topo.rend();
1594
938
       
I != E938
;
++I826
) {
1595
826
    int alap = maxASAP;
1596
826
    SUnit *SU = &SUnits[*I];
1597
826
    for (SUnit::const_succ_iterator IS = SU->Succs.begin(),
1598
826
                                    ES = SU->Succs.end();
1599
2.05k
         
IS != ES2.05k
;
++IS1.22k
) {
1600
1.22k
      if (ignoreDependence(*IS, true))
1601
195
        continue;
1602
1.03k
      SUnit *succ = IS->getSUnit();
1603
1.03k
      alap = std::min(alap, (int)(getALAP(succ) - getLatency(SU, *IS) +
1604
1.03k
                                  getDistance(SU, succ, *IS) * MII));
1605
1.03k
    }
1606
826
1607
826
    ScheduleInfo[*I].ALAP = alap;
1608
826
  }
1609
112
1610
112
  // After computing the node functions, compute the summary for each node set.
1611
112
  for (NodeSet &I : NodeSets)
1612
210
    I.computeNodeSetInfo(this);
1613
112
1614
112
  DEBUG({
1615
112
    for (unsigned i = 0; i < SUnits.size(); i++) {
1616
112
      dbgs() << "\tNode " << i << ":\n";
1617
112
      dbgs() << "\t   ASAP = " << getASAP(&SUnits[i]) << "\n";
1618
112
      dbgs() << "\t   ALAP = " << getALAP(&SUnits[i]) << "\n";
1619
112
      dbgs() << "\t   MOV  = " << getMOV(&SUnits[i]) << "\n";
1620
112
      dbgs() << "\t   D    = " << getDepth(&SUnits[i]) << "\n";
1621
112
      dbgs() << "\t   H    = " << getHeight(&SUnits[i]) << "\n";
1622
112
    }
1623
112
  });
1624
112
}
1625
1626
/// Compute the Pred_L(O) set, as defined in the paper. The set is defined
1627
/// as the predecessors of the elements of NodeOrder that are not also in
1628
/// NodeOrder.
1629
static bool pred_L(SetVector<SUnit *> &NodeOrder,
1630
                   SmallSetVector<SUnit *, 8> &Preds,
1631
460
                   const NodeSet *S = nullptr) {
1632
460
  Preds.clear();
1633
460
  for (SetVector<SUnit *>::iterator I = NodeOrder.begin(), E = NodeOrder.end();
1634
2.56k
       
I != E2.56k
;
++I2.10k
) {
1635
2.10k
    for (SUnit::pred_iterator PI = (*I)->Preds.begin(), PE = (*I)->Preds.end();
1636
5.54k
         
PI != PE5.54k
;
++PI3.44k
) {
1637
3.44k
      if (
S && 3.44k
S->count(PI->getSUnit()) == 01.04k
)
1638
638
        continue;
1639
2.81k
      
if (2.81k
ignoreDependence(*PI, true)2.81k
)
1640
435
        continue;
1641
2.37k
      
if (2.37k
NodeOrder.count(PI->getSUnit()) == 02.37k
)
1642
215
        Preds.insert(PI->getSUnit());
1643
3.44k
    }
1644
2.10k
    // Back-edges are predecessors with an anti-dependence.
1645
2.10k
    for (SUnit::const_succ_iterator IS = (*I)->Succs.begin(),
1646
2.10k
                                    ES = (*I)->Succs.end();
1647
5.44k
         
IS != ES5.44k
;
++IS3.34k
) {
1648
3.34k
      if (IS->getKind() != SDep::Anti)
1649
2.86k
        continue;
1650
476
      
if (476
S && 476
S->count(IS->getSUnit()) == 0123
)
1651
101
        continue;
1652
375
      
if (375
NodeOrder.count(IS->getSUnit()) == 0375
)
1653
0
        Preds.insert(IS->getSUnit());
1654
3.34k
    }
1655
2.10k
  }
1656
460
  return !Preds.empty();
1657
460
}
1658
1659
/// Compute the Succ_L(O) set, as defined in the paper. The set is defined
1660
/// as the successors of the elements of NodeOrder that are not also in
1661
/// NodeOrder.
1662
static bool succ_L(SetVector<SUnit *> &NodeOrder,
1663
                   SmallSetVector<SUnit *, 8> &Succs,
1664
1.23k
                   const NodeSet *S = nullptr) {
1665
1.23k
  Succs.clear();
1666
1.23k
  for (SetVector<SUnit *>::iterator I = NodeOrder.begin(), E = NodeOrder.end();
1667
5.11k
       
I != E5.11k
;
++I3.87k
) {
1668
3.87k
    for (SUnit::succ_iterator SI = (*I)->Succs.begin(), SE = (*I)->Succs.end();
1669
10.7k
         
SI != SE10.7k
;
++SI6.83k
) {
1670
6.83k
      if (
S && 6.83k
S->count(SI->getSUnit()) == 01.56k
)
1671
917
        continue;
1672
5.91k
      
if (5.91k
ignoreDependence(*SI, false)5.91k
)
1673
0
        continue;
1674
5.91k
      
if (5.91k
NodeOrder.count(SI->getSUnit()) == 05.91k
)
1675
1.23k
        Succs.insert(SI->getSUnit());
1676
6.83k
    }
1677
3.87k
    for (SUnit::const_pred_iterator PI = (*I)->Preds.begin(),
1678
3.87k
                                    PE = (*I)->Preds.end();
1679
10.7k
         
PI != PE10.7k
;
++PI6.92k
) {
1680
6.92k
      if (PI->getKind() != SDep::Anti)
1681
5.67k
        continue;
1682
1.25k
      
if (1.25k
S && 1.25k
S->count(PI->getSUnit()) == 0269
)
1683
109
        continue;
1684
1.14k
      
if (1.14k
NodeOrder.count(PI->getSUnit()) == 01.14k
)
1685
183
        Succs.insert(PI->getSUnit());
1686
6.92k
    }
1687
3.87k
  }
1688
1.23k
  return !Succs.empty();
1689
1.23k
}
1690
1691
/// Return true if there is a path from the specified node to any of the nodes
1692
/// in DestNodes. Keep track and return the nodes in any path.
1693
static bool computePath(SUnit *Cur, SetVector<SUnit *> &Path,
1694
                        SetVector<SUnit *> &DestNodes,
1695
                        SetVector<SUnit *> &Exclude,
1696
1.74k
                        SmallPtrSet<SUnit *, 8> &Visited) {
1697
1.74k
  if (Cur->isBoundaryNode())
1698
0
    return false;
1699
1.74k
  
if (1.74k
Exclude.count(Cur) != 01.74k
)
1700
143
    return false;
1701
1.59k
  
if (1.59k
DestNodes.count(Cur) != 01.59k
)
1702
219
    return true;
1703
1.37k
  
if (1.37k
!Visited.insert(Cur).second1.37k
)
1704
344
    return Path.count(Cur) != 0;
1705
1.03k
  bool FoundPath = false;
1706
1.03k
  for (auto &SI : Cur->Succs)
1707
1.28k
    FoundPath |= computePath(SI.getSUnit(), Path, DestNodes, Exclude, Visited);
1708
1.03k
  for (auto &PI : Cur->Preds)
1709
2.12k
    
if (2.12k
PI.getKind() == SDep::Anti2.12k
)
1710
58
      FoundPath |=
1711
58
          computePath(PI.getSUnit(), Path, DestNodes, Exclude, Visited);
1712
1.03k
  if (FoundPath)
1713
149
    Path.insert(Cur);
1714
1.74k
  return FoundPath;
1715
1.74k
}
1716
1717
/// Return true if Set1 is a subset of Set2.
1718
235
template <class S1Ty, class S2Ty> static bool isSubset(S1Ty &Set1, S2Ty &Set2) {
1719
464
  for (typename S1Ty::iterator I = Set1.begin(), E = Set1.end(); 
I != E464
;
++I229
)
1720
334
    
if (334
Set2.count(*I) == 0334
)
1721
105
      return false;
1722
130
  return true;
1723
235
}
MachinePipeliner.cpp:bool isSubset<llvm::SmallSetVector<llvm::SUnit*, 8u>, llvm::SmallSetVector<llvm::SUnit*, 8u> >(llvm::SmallSetVector<llvm::SUnit*, 8u>&, llvm::SmallSetVector<llvm::SUnit*, 8u>&)
Line
Count
Source
1718
67
template <class S1Ty, class S2Ty> static bool isSubset(S1Ty &Set1, S2Ty &Set2) {
1719
79
  for (typename S1Ty::iterator I = Set1.begin(), E = Set1.end(); 
I != E79
;
++I12
)
1720
69
    
if (69
Set2.count(*I) == 069
)
1721
57
      return false;
1722
10
  return true;
1723
67
}
MachinePipeliner.cpp:bool isSubset<llvm::SmallSetVector<llvm::SUnit*, 8u>, (anonymous namespace)::NodeSet>(llvm::SmallSetVector<llvm::SUnit*, 8u>&, (anonymous namespace)::NodeSet&)
Line
Count
Source
1718
168
template <class S1Ty, class S2Ty> static bool isSubset(S1Ty &Set1, S2Ty &Set2) {
1719
385
  for (typename S1Ty::iterator I = Set1.begin(), E = Set1.end(); 
I != E385
;
++I217
)
1720
265
    
if (265
Set2.count(*I) == 0265
)
1721
48
      return false;
1722
120
  return true;
1723
168
}
1724
1725
/// Compute the live-out registers for the instructions in a node-set.
1726
/// The live-out registers are those that are defined in the node-set,
1727
/// but not used. Except for use operands of Phis.
1728
static void computeLiveOuts(MachineFunction &MF, RegPressureTracker &RPTracker,
1729
25
                            NodeSet &NS) {
1730
25
  const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
1731
25
  MachineRegisterInfo &MRI = MF.getRegInfo();
1732
25
  SmallVector<RegisterMaskPair, 8> LiveOutRegs;
1733
25
  SmallSet<unsigned, 4> Uses;
1734
145
  for (SUnit *SU : NS) {
1735
145
    const MachineInstr *MI = SU->getInstr();
1736
145
    if (MI->isPHI())
1737
5
      continue;
1738
140
    for (const MachineOperand &MO : MI->operands())
1739
502
      
if (502
MO.isReg() && 502
MO.isUse()418
) {
1740
279
        unsigned Reg = MO.getReg();
1741
279
        if (TargetRegisterInfo::isVirtualRegister(Reg))
1742
270
          Uses.insert(Reg);
1743
9
        else 
if (9
MRI.isAllocatable(Reg)9
)
1744
0
          
for (MCRegUnitIterator Units(Reg, TRI); 0
Units.isValid()0
;
++Units0
)
1745
0
            Uses.insert(*Units);
1746
502
      }
1747
145
  }
1748
25
  for (SUnit *SU : NS)
1749
145
    for (const MachineOperand &MO : SU->getInstr()->operands())
1750
527
      
if (527
MO.isReg() && 527
MO.isDef()433
&&
!MO.isDead()144
) {
1751
136
        unsigned Reg = MO.getReg();
1752
136
        if (
TargetRegisterInfo::isVirtualRegister(Reg)136
) {
1753
136
          if (!Uses.count(Reg))
1754
23
            LiveOutRegs.push_back(RegisterMaskPair(Reg,
1755
23
                                                   LaneBitmask::getNone()));
1756
0
        } else 
if (0
MRI.isAllocatable(Reg)0
) {
1757
0
          for (MCRegUnitIterator Units(Reg, TRI); 
Units.isValid()0
;
++Units0
)
1758
0
            
if (0
!Uses.count(*Units)0
)
1759
0
              LiveOutRegs.push_back(RegisterMaskPair(*Units,
1760
0
                                                     LaneBitmask::getNone()));
1761
0
        }
1762
145
      }
1763
25
  RPTracker.addLiveRegs(LiveOutRegs);
1764
25
}
1765
1766
/// A heuristic to filter nodes in recurrent node-sets if the register
1767
/// pressure of a set is too high.
1768
112
void SwingSchedulerDAG::registerPressureFilter(NodeSetType &NodeSets) {
1769
210
  for (auto &NS : NodeSets) {
1770
210
    // Skip small node-sets since they won't cause register pressure problems.
1771
210
    if (NS.size() <= 2)
1772
185
      continue;
1773
25
    IntervalPressure RecRegPressure;
1774
25
    RegPressureTracker RecRPTracker(RecRegPressure);
1775
25
    RecRPTracker.init(&MF, &RegClassInfo, &LIS, BB, BB->end(), false, true);
1776
25
    computeLiveOuts(MF, RecRPTracker, NS);
1777
25
    RecRPTracker.closeBottom();
1778
25
1779
25
    std::vector<SUnit *> SUnits(NS.begin(), NS.end());
1780
427
    std::sort(SUnits.begin(), SUnits.end(), [](const SUnit *A, const SUnit *B) {
1781
427
      return A->NodeNum > B->NodeNum;
1782
427
    });
1783
25
1784
143
    for (auto &SU : SUnits) {
1785
143
      // Since we're computing the register pressure for a subset of the
1786
143
      // instructions in a block, we need to set the tracker for each
1787
143
      // instruction in the node-set. The tracker is set to the instruction
1788
143
      // just after the one we're interested in.
1789
143
      MachineBasicBlock::const_iterator CurInstI = SU->getInstr();
1790
143
      RecRPTracker.setPos(std::next(CurInstI));
1791
143
1792
143
      RegPressureDelta RPDelta;
1793
143
      ArrayRef<PressureChange> CriticalPSets;
1794
143
      RecRPTracker.getMaxUpwardPressureDelta(SU->getInstr(), nullptr, RPDelta,
1795
143
                                             CriticalPSets,
1796
143
                                             RecRegPressure.MaxSetPressure);
1797
143
      if (
RPDelta.Excess.isValid()143
) {
1798
1
        DEBUG(dbgs() << "Excess register pressure: SU(" << SU->NodeNum << ") "
1799
1
                     << TRI->getRegPressureSetName(RPDelta.Excess.getPSet())
1800
1
                     << ":" << RPDelta.Excess.getUnitInc());
1801
1
        NS.setExceedPressure(SU);
1802
1
        break;
1803
1
      }
1804
142
      RecRPTracker.recede();
1805
142
    }
1806
210
  }
1807
112
}
1808
1809
/// A heuristic to colocate node sets that have the same set of
1810
/// successors.
1811
112
void SwingSchedulerDAG::colocateNodeSets(NodeSetType &NodeSets) {
1812
112
  unsigned Colocate = 0;
1813
322
  for (int i = 0, e = NodeSets.size(); 
i < e322
;
++i210
) {
1814
210
    NodeSet &N1 = NodeSets[i];
1815
210
    SmallSetVector<SUnit *, 8> S1;
1816
210
    if (
N1.empty() || 210
!succ_L(N1, S1)210
)
1817
53
      continue;
1818
291
    
for (int j = i + 1; 157
j < e291
;
++j134
) {
1819
141
      NodeSet &N2 = NodeSets[j];
1820
141
      if (N1.compareRecMII(N2) != 0)
1821
45
        continue;
1822
96
      SmallSetVector<SUnit *, 8> S2;
1823
96
      if (
N2.empty() || 96
!succ_L(N2, S2)96
)
1824
29
        continue;
1825
67
      
if (67
isSubset(S1, S2) && 67
S1.size() == S2.size()10
) {
1826
7
        N1.setColocate(++Colocate);
1827
7
        N2.setColocate(Colocate);
1828
7
        break;
1829
7
      }
1830
141
    }
1831
210
  }
1832
112
}
1833
1834
/// Check if the existing node-sets are profitable. If not, then ignore the
1835
/// recurrent node-sets, and attempt to schedule all nodes together. This is
1836
/// a heuristic. If the MII is large and there is a non-recurrent node with
1837
/// a large depth compared to the MII, then it's best to try and schedule
1838
/// all instruction together instead of starting with the recurrent node-sets.
1839
112
void SwingSchedulerDAG::checkNodeSets(NodeSetType &NodeSets) {
1840
112
  // Look for loops with a large MII.
1841
112
  if (MII <= 20)
1842
112
    return;
1843
0
  // Check if the node-set contains only a simple add recurrence.
1844
0
  for (auto &NS : NodeSets)
1845
0
    
if (0
NS.size() > 20
)
1846
0
      return;
1847
0
  // If the depth of any instruction is significantly larger than the MII, then
1848
0
  // ignore the recurrent node-sets and treat all instructions equally.
1849
0
  for (auto &SU : SUnits)
1850
0
    
if (0
SU.getDepth() > MII * 1.50
) {
1851
0
      NodeSets.clear();
1852
0
      DEBUG(dbgs() << "Clear recurrence node-sets\n");
1853
0
      return;
1854
0
    }
1855
0
}
1856
1857
/// Add the nodes that do not belong to a recurrence set into groups
1858
/// based upon connected componenets.
1859
112
void SwingSchedulerDAG::groupRemainingNodes(NodeSetType &NodeSets) {
1860
112
  SetVector<SUnit *> NodesAdded;
1861
112
  SmallPtrSet<SUnit *, 8> Visited;
1862
112
  // Add the nodes that are on a path between the previous node sets and
1863
112
  // the current node set.
1864
210
  for (NodeSet &I : NodeSets) {
1865
210
    SmallSetVector<SUnit *, 8> N;
1866
210
    // Add the nodes from the current node set to the previous node set.
1867
210
    if (
succ_L(I, N)210
) {
1868
157
      SetVector<SUnit *> Path;
1869
290
      for (SUnit *NI : N) {
1870
290
        Visited.clear();
1871
290
        computePath(NI, Path, NodesAdded, I, Visited);
1872
290
      }
1873
157
      if (!Path.empty())
1874
23
        I.insert(Path.begin(), Path.end());
1875
157
    }
1876
210
    // Add the nodes from the previous node set to the current node set.
1877
210
    N.clear();
1878
210
    if (
succ_L(NodesAdded, N)210
) {
1879
56
      SetVector<SUnit *> Path;
1880
112
      for (SUnit *NI : N) {
1881
112
        Visited.clear();
1882
112
        computePath(NI, Path, I, NodesAdded, Visited);
1883
112
      }
1884
56
      if (!Path.empty())
1885
8
        I.insert(Path.begin(), Path.end());
1886
56
    }
1887
210
    NodesAdded.insert(I.begin(), I.end());
1888
210
  }
1889
112
1890
112
  // Create a new node set with the connected nodes of any successor of a node
1891
112
  // in a recurrent set.
1892
112
  NodeSet NewSet;
1893
112
  SmallSetVector<SUnit *, 8> N;
1894
112
  if (succ_L(NodesAdded, N))
1895
59
    for (SUnit *I : N)
1896
109
      addConnectedNodes(I, NewSet, NodesAdded);
1897
112
  if (!NewSet.empty())
1898
59
    NodeSets.push_back(NewSet);
1899
112
1900
112
  // Create a new node set with the connected nodes of any predecessor of a node
1901
112
  // in a recurrent set.
1902
112
  NewSet.clear();
1903
112
  if (pred_L(NodesAdded, N))
1904
4
    for (SUnit *I : N)
1905
4
      addConnectedNodes(I, NewSet, NodesAdded);
1906
112
  if (!NewSet.empty())
1907
4
    NodeSets.push_back(NewSet);
1908
112
1909
112
  // Create new nodes sets with the connected nodes any any remaining node that
1910
112
  // has no predecessor.
1911
938
  for (unsigned i = 0; 
i < SUnits.size()938
;
++i826
) {
1912
826
    SUnit *SU = &SUnits[i];
1913
826
    if (
NodesAdded.count(SU) == 0826
) {
1914
1
      NewSet.clear();
1915
1
      addConnectedNodes(SU, NewSet, NodesAdded);
1916
1
      if (!NewSet.empty())
1917
1
        NodeSets.push_back(NewSet);
1918
1
    }
1919
826
  }
1920
112
}
1921
1922
/// Add the node to the set, and add all is its connected nodes to the set.
1923
void SwingSchedulerDAG::addConnectedNodes(SUnit *SU, NodeSet &NewSet,
1924
300
                                          SetVector<SUnit *> &NodesAdded) {
1925
300
  NewSet.insert(SU);
1926
300
  NodesAdded.insert(SU);
1927
362
  for (auto &SI : SU->Succs) {
1928
362
    SUnit *Successor = SI.getSUnit();
1929
362
    if (
!SI.isArtificial() && 362
NodesAdded.count(Successor) == 0362
)
1930
102
      addConnectedNodes(Successor, NewSet, NodesAdded);
1931
362
  }
1932
460
  for (auto &PI : SU->Preds) {
1933
460
    SUnit *Predecessor = PI.getSUnit();
1934
460
    if (
!PI.isArtificial() && 460
NodesAdded.count(Predecessor) == 0460
)
1935
84
      addConnectedNodes(Predecessor, NewSet, NodesAdded);
1936
460
  }
1937
300
}
1938
1939
/// Return true if Set1 contains elements in Set2. The elements in common
1940
/// are returned in a different container.
1941
static bool isIntersect(SmallSetVector<SUnit *, 8> &Set1, const NodeSet &Set2,
1942
143
                        SmallSetVector<SUnit *, 8> &Result) {
1943
143
  Result.clear();
1944
200
  for (unsigned i = 0, e = Set1.size(); 
i != e200
;
++i57
) {
1945
57
    SUnit *SU = Set1[i];
1946
57
    if (Set2.count(SU) != 0)
1947
13
      Result.insert(SU);
1948
57
  }
1949
143
  return !Result.empty();
1950
143
}
1951
1952
/// Merge the recurrence node sets that have the same initial node.
1953
113
void SwingSchedulerDAG::fuseRecs(NodeSetType &NodeSets) {
1954
329
  for (NodeSetType::iterator I = NodeSets.begin(), E = NodeSets.end(); I != E;
1955
216
       
++I216
) {
1956
216
    NodeSet &NI = *I;
1957
559
    for (NodeSetType::iterator J = I + 1; 
J != E559
;) {
1958
343
      NodeSet &NJ = *J;
1959
343
      if (
NI.getNode(0)->NodeNum == NJ.getNode(0)->NodeNum343
) {
1960
38
        if (NJ.compareRecMII(NI) > 0)
1961
6
          NI.setRecMII(NJ.getRecMII());
1962
228
        for (NodeSet::iterator NII = J->begin(), ENI = J->end(); NII != ENI;
1963
190
             ++NII)
1964
190
          I->insert(*NII);
1965
38
        NodeSets.erase(J);
1966
38
        E = NodeSets.end();
1967
343
      } else {
1968
305
        ++J;
1969
305
      }
1970
343
    }
1971
216
  }
1972
113
}
1973
1974
/// Remove nodes that have been scheduled in previous NodeSets.
1975
112
void SwingSchedulerDAG::removeDuplicateNodes(NodeSetType &NodeSets) {
1976
375
  for (NodeSetType::iterator I = NodeSets.begin(), E = NodeSets.end(); I != E;
1977
263
       ++I)
1978
485
    
for (NodeSetType::iterator J = I + 1; 263
J != E485
;) {
1979
797
      J->remove_if([&](SUnit *SUJ) { return I->count(SUJ); });
1980
222
1981
222
      if (
J->empty()222
) {
1982
11
        NodeSets.erase(J);
1983
11
        E = NodeSets.end();
1984
222
      } else {
1985
211
        ++J;
1986
211
      }
1987
263
    }
1988
112
}
1989
1990
/// Return true if Inst1 defines a value that is used in Inst2.
1991
1.21k
static bool hasDataDependence(SUnit *Inst1, SUnit *Inst2) {
1992
1.21k
  for (auto &SI : Inst1->Succs)
1993
2.00k
    
if (2.00k
SI.getSUnit() == Inst2 && 2.00k
SI.getKind() == SDep::Data34
)
1994
28
      return true;
1995
1.19k
  return false;
1996
1.19k
}
1997
1998
/// Compute an ordered list of the dependence graph nodes, which
1999
/// indicates the order that the nodes will be scheduled.  This is a
2000
/// two-level algorithm. First, a partial order is created, which
2001
/// consists of a list of sets ordered from highest to lowest priority.
2002
112
void SwingSchedulerDAG::computeNodeOrder(NodeSetType &NodeSets) {
2003
112
  SmallSetVector<SUnit *, 8> R;
2004
112
  NodeOrder.clear();
2005
112
2006
263
  for (auto &Nodes : NodeSets) {
2007
263
    DEBUG(dbgs() << "NodeSet size " << Nodes.size() << "\n");
2008
263
    OrderKind Order;
2009
263
    SmallSetVector<SUnit *, 8> N;
2010
263
    if (
pred_L(NodeOrder, N) && 263
isSubset(N, Nodes)69
) {
2011
45
      R.insert(N.begin(), N.end());
2012
45
      Order = BottomUp;
2013
45
      DEBUG(dbgs() << "  Bottom up (preds) ");
2014
263
    } else 
if (218
succ_L(NodeOrder, N) && 218
isSubset(N, Nodes)99
) {
2015
75
      R.insert(N.begin(), N.end());
2016
75
      Order = TopDown;
2017
75
      DEBUG(dbgs() << "  Top down (succs) ");
2018
218
    } else 
if (143
isIntersect(N, Nodes, R)143
) {
2019
9
      // If some of the successors are in the existing node-set, then use the
2020
9
      // top-down ordering.
2021
9
      Order = TopDown;
2022
9
      DEBUG(dbgs() << "  Top down (intersect) ");
2023
143
    } else 
if (134
NodeSets.size() == 1134
) {
2024
1
      for (auto &N : Nodes)
2025
1
        
if (1
N->Succs.size() == 01
)
2026
1
          R.insert(N);
2027
1
      Order = BottomUp;
2028
1
      DEBUG(dbgs() << "  Bottom up (all) ");
2029
134
    } else {
2030
133
      // Find the node with the highest ASAP.
2031
133
      SUnit *maxASAP = nullptr;
2032
356
      for (SUnit *SU : Nodes) {
2033
356
        if (
maxASAP == nullptr || 356
getASAP(SU) >= getASAP(maxASAP)223
)
2034
319
          maxASAP = SU;
2035
356
      }
2036
133
      R.insert(maxASAP);
2037
133
      Order = BottomUp;
2038
133
      DEBUG(dbgs() << "  Bottom up (default) ");
2039
218
    }
2040
263
2041
531
    while (
!R.empty()531
) {
2042
268
      if (
Order == TopDown268
) {
2043
85
        // Choose the node with the maximum height.  If more than one, choose
2044
85
        // the node with the lowest MOV. If still more than one, check if there
2045
85
        // is a dependence between the instructions.
2046
429
        while (
!R.empty()429
) {
2047
344
          SUnit *maxHeight = nullptr;
2048
1.56k
          for (SUnit *I : R) {
2049
1.56k
            if (
maxHeight == nullptr || 1.56k
getHeight(I) > getHeight(maxHeight)1.22k
)
2050
542
              maxHeight = I;
2051
1.02k
            else 
if (1.02k
getHeight(I) == getHeight(maxHeight) &&
2052
604
                     getMOV(I) < getMOV(maxHeight) &&
2053
10
                     !hasDataDependence(maxHeight, I))
2054
7
              maxHeight = I;
2055
1.01k
            else 
if (1.01k
hasDataDependence(I, maxHeight)1.01k
)
2056
13
              maxHeight = I;
2057
1.56k
          }
2058
344
          NodeOrder.insert(maxHeight);
2059
344
          DEBUG(dbgs() << maxHeight->NodeNum << " ");
2060
344
          R.remove(maxHeight);
2061
389
          for (const auto &I : maxHeight->Succs) {
2062
389
            if (Nodes.count(I.getSUnit()) == 0)
2063
38
              continue;
2064
351
            
if (351
NodeOrder.count(I.getSUnit()) != 0351
)
2065
51
              continue;
2066
300
            
if (300
ignoreDependence(I, false)300
)
2067
0
              continue;
2068
300
            R.insert(I.getSUnit());
2069
300
          }
2070
344
          // Back-edges are predecessors with an anti-dependence.
2071
559
          for (const auto &I : maxHeight->Preds) {
2072
559
            if (I.getKind() != SDep::Anti)
2073
536
              continue;
2074
23
            
if (23
Nodes.count(I.getSUnit()) == 023
)
2075
1
              continue;
2076
22
            
if (22
NodeOrder.count(I.getSUnit()) != 022
)
2077
0
              continue;
2078
22
            R.insert(I.getSUnit());
2079
22
          }
2080
344
        }
2081
85
        Order = BottomUp;
2082
85
        DEBUG(dbgs() << "\n   Switching order to bottom up ");
2083
85
        SmallSetVector<SUnit *, 8> N;
2084
85
        if (pred_L(NodeOrder, N, &Nodes))
2085
4
          R.insert(N.begin(), N.end());
2086
268
      } else {
2087
183
        // Choose the node with the maximum depth.  If more than one, choose
2088
183
        // the node with the lowest MOV. If there is still more than one, check
2089
183
        // for a dependence between the instructions.
2090
665
        while (
!R.empty()665
) {
2091
482
          SUnit *maxDepth = nullptr;
2092
740
          for (SUnit *I : R) {
2093
740
            if (
maxDepth == nullptr || 740
getDepth(I) > getDepth(maxDepth)258
)
2094
549
              maxDepth = I;
2095
191
            else 
if (191
getDepth(I) == getDepth(maxDepth) &&
2096
71
                     getMOV(I) < getMOV(maxDepth) &&
2097
24
                     !hasDataDependence(I, maxDepth))
2098
22
              maxDepth = I;
2099
169
            else 
if (169
hasDataDependence(maxDepth, I)169
)
2100
10
              maxDepth = I;
2101
740
          }
2102
482
          NodeOrder.insert(maxDepth);
2103
482
          DEBUG(dbgs() << maxDepth->NodeNum << " ");
2104
482
          R.remove(maxDepth);
2105
482
          if (
Nodes.isExceedSU(maxDepth)482
) {
2106
0
            Order = TopDown;
2107
0
            R.clear();
2108
0
            R.insert(Nodes.getNode(0));
2109
0
            break;
2110
0
          }
2111
482
          
for (const auto &I : maxDepth->Preds) 482
{
2112
667
            if (Nodes.count(I.getSUnit()) == 0)
2113
148
              continue;
2114
519
            
if (519
NodeOrder.count(I.getSUnit()) != 0519
)
2115
60
              continue;
2116
459
            
if (459
I.getKind() == SDep::Anti459
)
2117
115
              continue;
2118
344
            R.insert(I.getSUnit());
2119
344
          }
2120
482
          // Back-edges are predecessors with an anti-dependence.
2121
837
          for (const auto &I : maxDepth->Succs) {
2122
837
            if (I.getKind() != SDep::Anti)
2123
678
              continue;
2124
159
            
if (159
Nodes.count(I.getSUnit()) == 0159
)
2125
15
              continue;
2126
144
            
if (144
NodeOrder.count(I.getSUnit()) != 0144
)
2127
115
              continue;
2128
29
            R.insert(I.getSUnit());
2129
29
          }
2130
482
        }
2131
183
        Order = TopDown;
2132
183
        DEBUG(dbgs() << "\n   Switching order to top down ");
2133
183
        SmallSetVector<SUnit *, 8> N;
2134
183
        if (succ_L(NodeOrder, N, &Nodes))
2135
1
          R.insert(N.begin(), N.end());
2136
183
      }
2137
268
    }
2138
263
    DEBUG(dbgs() << "\nDone with Nodeset\n");
2139
263
  }
2140
112
2141
112
  DEBUG({
2142
112
    dbgs() << "Node order: ";
2143
112
    for (SUnit *I : NodeOrder)
2144
112
      dbgs() << " " << I->NodeNum << " ";
2145
112
    dbgs() << "\n";
2146
112
  });
2147
112
}
2148
2149
/// Process the nodes in the computed order and create the pipelined schedule
2150
/// of the instructions, if possible. Return true if a schedule is found.
2151
112
bool SwingSchedulerDAG::schedulePipeline(SMSchedule &Schedule) {
2152
112
  if (NodeOrder.empty())
2153
0
    return false;
2154
112
2155
112
  bool scheduleFound = false;
2156
112
  // Keep increasing II until a valid schedule is found.
2157
277
  for (unsigned II = MII; 
II < MII + 10 && 277
!scheduleFound272
;
++II165
) {
2158
165
    Schedule.reset();
2159
165
    Schedule.setInitiationInterval(II);
2160
165
    DEBUG(dbgs() << "Try to schedule with " << II << "\n");
2161
165
2162
165
    SetVector<SUnit *>::iterator NI = NodeOrder.begin();
2163
165
    SetVector<SUnit *>::iterator NE = NodeOrder.end();
2164
1.41k
    do {
2165
1.41k
      SUnit *SU = *NI;
2166
1.41k
2167
1.41k
      // Compute the schedule time for the instruction, which is based
2168
1.41k
      // upon the scheduled time for any predecessors/successors.
2169
1.41k
      int EarlyStart = INT_MIN;
2170
1.41k
      int LateStart = INT_MAX;
2171
1.41k
      // These values are set when the size of the schedule window is limited
2172
1.41k
      // due to chain dependences.
2173
1.41k
      int SchedEnd = INT_MAX;
2174
1.41k
      int SchedStart = INT_MIN;
2175
1.41k
      Schedule.computeStart(SU, &EarlyStart, &LateStart, &SchedEnd, &SchedStart,
2176
1.41k
                            II, this);
2177
1.41k
      DEBUG({
2178
1.41k
        dbgs() << "Inst (" << SU->NodeNum << ") ";
2179
1.41k
        SU->getInstr()->dump();
2180
1.41k
        dbgs() << "\n";
2181
1.41k
      });
2182
1.41k
      DEBUG({
2183
1.41k
        dbgs() << "\tes: " << EarlyStart << " ls: " << LateStart
2184
1.41k
               << " me: " << SchedEnd << " ms: " << SchedStart << "\n";
2185
1.41k
      });
2186
1.41k
2187
1.41k
      if (
EarlyStart > LateStart || 1.41k
SchedEnd < EarlyStart1.38k
||
2188
1.38k
          SchedStart > LateStart)
2189
35
        scheduleFound = false;
2190
1.38k
      else 
if (1.38k
EarlyStart != INT_MIN && 1.38k
LateStart == INT_MAX913
) {
2191
644
        SchedEnd = std::min(SchedEnd, EarlyStart + (int)II - 1);
2192
644
        scheduleFound = Schedule.insert(SU, EarlyStart, SchedEnd, II);
2193
1.38k
      } else 
if (739
EarlyStart == INT_MIN && 739
LateStart != INT_MAX470
) {
2194
280
        SchedStart = std::max(SchedStart, LateStart - (int)II + 1);
2195
280
        scheduleFound = Schedule.insert(SU, LateStart, SchedStart, II);
2196
739
      } else 
if (459
EarlyStart != INT_MIN && 459
LateStart != INT_MAX269
) {
2197
269
        SchedEnd =
2198
269
            std::min(SchedEnd, std::min(LateStart, EarlyStart + (int)II - 1));
2199
269
        // When scheduling a Phi it is better to start at the late cycle and go
2200
269
        // backwards. The default order may insert the Phi too far away from
2201
269
        // its first dependence.
2202
269
        if (SU->getInstr()->isPHI())
2203
196
          scheduleFound = Schedule.insert(SU, SchedEnd, EarlyStart, II);
2204
269
        else
2205
73
          scheduleFound = Schedule.insert(SU, EarlyStart, SchedEnd, II);
2206
459
      } else {
2207
190
        int FirstCycle = Schedule.getFirstCycle();
2208
190
        scheduleFound = Schedule.insert(SU, FirstCycle + getASAP(SU),
2209
190
                                        FirstCycle + getASAP(SU) + II - 1, II);
2210
190
      }
2211
1.41k
      // Even if we find a schedule, make sure the schedule doesn't exceed the
2212
1.41k
      // allowable number of stages. We keep trying if this happens.
2213
1.41k
      if (scheduleFound)
2214
1.36k
        
if (1.36k
SwpMaxStages > -1 &&
2215
1.36k
            Schedule.getMaxStageCount() > (unsigned)SwpMaxStages)
2216
0
          scheduleFound = false;
2217
1.41k
2218
1.41k
      DEBUG({
2219
1.41k
        if (!scheduleFound)
2220
1.41k
          dbgs() << "\tCan't schedule\n";
2221
1.41k
      });
2222
1.41k
    } while (
++NI != NE && 1.41k
scheduleFound1.31k
);
2223
165
2224
165
    // If a schedule is found, check if it is a valid schedule too.
2225
165
    if (scheduleFound)
2226
107
      scheduleFound = Schedule.isValidSchedule(this);
2227
165
  }
2228
112
2229
112
  DEBUG(dbgs() << "Schedule Found? " << scheduleFound << "\n");
2230
112
2231
112
  if (scheduleFound)
2232
107
    Schedule.finalizeSchedule(this);
2233
112
  else
2234
5
    Schedule.reset();
2235
112
2236
107
  return scheduleFound && Schedule.getMaxStageCount() > 0;
2237
112
}
2238
2239
/// Given a schedule for the loop, generate a new version of the loop,
2240
/// and replace the old version.  This function generates a prolog
2241
/// that contains the initial iterations in the pipeline, and kernel
2242
/// loop, and the epilogue that contains the code for the final
2243
/// iterations.
2244
81
void SwingSchedulerDAG::generatePipelinedLoop(SMSchedule &Schedule) {
2245
81
  // Create a new basic block for the kernel and add it to the CFG.
2246
81
  MachineBasicBlock *KernelBB = MF.CreateMachineBasicBlock(BB->getBasicBlock());
2247
81
2248
81
  unsigned MaxStageCount = Schedule.getMaxStageCount();
2249
81
2250
81
  // Remember the registers that are used in different stages. The index is
2251
81
  // the iteration, or stage, that the instruction is scheduled in.  This is
2252
81
  // a map between register names in the orignal block and the names created
2253
81
  // in each stage of the pipelined loop.
2254
81
  ValueMapTy *VRMap = new ValueMapTy[(MaxStageCount + 1) * 2];
2255
81
  InstrMapTy InstrMap;
2256
81
2257
81
  SmallVector<MachineBasicBlock *, 4> PrologBBs;
2258
81
  // Generate the prolog instructions that set up the pipeline.
2259
81
  generateProlog(Schedule, MaxStageCount, KernelBB, VRMap, PrologBBs);
2260
81
  MF.insert(BB->getIterator(), KernelBB);
2261
81
2262
81
  // Rearrange the instructions to generate the new, pipelined loop,
2263
81
  // and update register names as needed.
2264
81
  for (int Cycle = Schedule.getFirstCycle(),
2265
81
           LastCycle = Schedule.getFinalCycle();
2266
205
       
Cycle <= LastCycle205
;
++Cycle124
) {
2267
124
    std::deque<SUnit *> &CycleInstrs = Schedule.getInstructions(Cycle);
2268
124
    // This inner loop schedules each instruction in the cycle.
2269
487
    for (SUnit *CI : CycleInstrs) {
2270
487
      if (CI->getInstr()->isPHI())
2271
133
        continue;
2272
354
      unsigned StageNum = Schedule.stageScheduled(getSUnit(CI->getInstr()));
2273
354
      MachineInstr *NewMI = cloneInstr(CI->getInstr(), MaxStageCount, StageNum);
2274
354
      updateInstruction(NewMI, false, MaxStageCount, StageNum, Schedule, VRMap);
2275
354
      KernelBB->push_back(NewMI);
2276
354
      InstrMap[NewMI] = CI->getInstr();
2277
354
    }
2278
124
  }
2279
81
2280
81
  // Copy any terminator instructions to the new kernel, and update
2281
81
  // names as needed.
2282
81
  for (MachineBasicBlock::iterator I = BB->getFirstTerminator(),
2283
81
                                   E = BB->instr_end();
2284
243
       
I != E243
;
++I162
) {
2285
162
    MachineInstr *NewMI = MF.CloneMachineInstr(&*I);
2286
162
    updateInstruction(NewMI, false, MaxStageCount, 0, Schedule, VRMap);
2287
162
    KernelBB->push_back(NewMI);
2288
162
    InstrMap[NewMI] = &*I;
2289
162
  }
2290
81
2291
81
  KernelBB->transferSuccessors(BB);
2292
81
  KernelBB->replaceSuccessor(BB, KernelBB);
2293
81
2294
81
  generateExistingPhis(KernelBB, PrologBBs.back(), KernelBB, KernelBB, Schedule,
2295
81
                       VRMap, InstrMap, MaxStageCount, MaxStageCount, false);
2296
81
  generatePhis(KernelBB, PrologBBs.back(), KernelBB, KernelBB, Schedule, VRMap,
2297
81
               InstrMap, MaxStageCount, MaxStageCount, false);
2298
81
2299
81
  DEBUG(dbgs() << "New block\n"; KernelBB->dump(););
2300
81
2301
81
  SmallVector<MachineBasicBlock *, 4> EpilogBBs;
2302
81
  // Generate the epilog instructions to complete the pipeline.
2303
81
  generateEpilog(Schedule, MaxStageCount, KernelBB, VRMap, EpilogBBs,
2304
81
                 PrologBBs);
2305
81
2306
81
  // We need this step because the register allocation doesn't handle some
2307
81
  // situations well, so we insert copies to help out.
2308
81
  splitLifetimes(KernelBB, EpilogBBs, Schedule);
2309
81
2310
81
  // Remove dead instructions due to loop induction variables.
2311
81
  removeDeadInstructions(KernelBB, EpilogBBs);
2312
81
2313
81
  // Add branches between prolog and epilog blocks.
2314
81
  addBranches(PrologBBs, KernelBB, EpilogBBs, Schedule, VRMap);
2315
81
2316
81
  // Remove the original loop since it's no longer referenced.
2317
81
  BB->clear();
2318
81
  BB->eraseFromParent();
2319
81
2320
81
  delete[] VRMap;
2321
81
}
2322
2323
/// Generate the pipeline prolog code.
2324
void SwingSchedulerDAG::generateProlog(SMSchedule &Schedule, unsigned LastStage,
2325
                                       MachineBasicBlock *KernelBB,
2326
                                       ValueMapTy *VRMap,
2327
81
                                       MBBVectorTy &PrologBBs) {
2328
81
  MachineBasicBlock *PreheaderBB = MLI->getLoopFor(BB)->getLoopPreheader();
2329
81
  assert(PreheaderBB != nullptr &&
2330
81
         "Need to add code to handle loops w/o preheader");
2331
81
  MachineBasicBlock *PredBB = PreheaderBB;
2332
81
  InstrMapTy InstrMap;
2333
81
2334
81
  // Generate a basic block for each stage, not including the last stage,
2335
81
  // which will be generated in the kernel. Each basic block may contain
2336
81
  // instructions from multiple stages/iterations.
2337
170
  for (unsigned i = 0; 
i < LastStage170
;
++i89
) {
2338
89
    // Create and insert the prolog basic block prior to the original loop
2339
89
    // basic block.  The original loop is removed later.
2340
89
    MachineBasicBlock *NewBB = MF.CreateMachineBasicBlock(BB->getBasicBlock());
2341
89
    PrologBBs.push_back(NewBB);
2342
89
    MF.insert(BB->getIterator(), NewBB);
2343
89
    NewBB->transferSuccessors(PredBB);
2344
89
    PredBB->addSuccessor(NewBB);
2345
89
    PredBB = NewBB;
2346
89
2347
89
    // Generate instructions for each appropriate stage. Process instructions
2348
89
    // in original program order.
2349
188
    for (int StageNum = i; 
StageNum >= 0188
;
--StageNum99
) {
2350
99
      for (MachineBasicBlock::iterator BBI = BB->instr_begin(),
2351
99
                                       BBE = BB->getFirstTerminator();
2352
760
           
BBI != BBE760
;
++BBI661
) {
2353
661
        if (
Schedule.isScheduledAtStage(getSUnit(&*BBI), (unsigned)StageNum)661
) {
2354
383
          if (BBI->isPHI())
2355
115
            continue;
2356
268
          MachineInstr *NewMI =
2357
268
              cloneAndChangeInstr(&*BBI, i, (unsigned)StageNum, Schedule);
2358
268
          updateInstruction(NewMI, false, i, (unsigned)StageNum, Schedule,
2359
268
                            VRMap);
2360
268
          NewBB->push_back(NewMI);
2361
268
          InstrMap[NewMI] = &*BBI;
2362
268
        }
2363
661
      }
2364
99
    }
2365
89
    rewritePhiValues(NewBB, i, Schedule, VRMap, InstrMap);
2366
89
    DEBUG({
2367
89
      dbgs() << "prolog:\n";
2368
89
      NewBB->dump();
2369
89
    });
2370
89
  }
2371
81
2372
81
  PredBB->replaceSuccessor(BB, KernelBB);
2373
81
2374
81
  // Check if we need to remove the branch from the preheader to the original
2375
81
  // loop, and replace it with a branch to the new loop.
2376
81
  unsigned numBranches = TII->removeBranch(*PreheaderBB);
2377
81
  if (
numBranches81
) {
2378
15
    SmallVector<MachineOperand, 0> Cond;
2379
15
    TII->insertBranch(*PreheaderBB, PrologBBs[0], nullptr, Cond, DebugLoc());
2380
15
  }
2381
81
}
2382
2383
/// Generate the pipeline epilog code. The epilog code finishes the iterations
2384
/// that were started in either the prolog or the kernel.  We create a basic
2385
/// block for each stage that needs to complete.
2386
void SwingSchedulerDAG::generateEpilog(SMSchedule &Schedule, unsigned LastStage,
2387
                                       MachineBasicBlock *KernelBB,
2388
                                       ValueMapTy *VRMap,
2389
                                       MBBVectorTy &EpilogBBs,
2390
81
                                       MBBVectorTy &PrologBBs) {
2391
81
  // We need to change the branch from the kernel to the first epilog block, so
2392
81
  // this call to analyze branch uses the kernel rather than the original BB.
2393
81
  MachineBasicBlock *TBB = nullptr, *FBB = nullptr;
2394
81
  SmallVector<MachineOperand, 4> Cond;
2395
81
  bool checkBranch = TII->analyzeBranch(*KernelBB, TBB, FBB, Cond);
2396
81
  assert(!checkBranch && "generateEpilog must be able to analyze the branch");
2397
81
  if (checkBranch)
2398
0
    return;
2399
81
2400
81
  MachineBasicBlock::succ_iterator LoopExitI = KernelBB->succ_begin();
2401
81
  if (*LoopExitI == KernelBB)
2402
55
    ++LoopExitI;
2403
81
  assert(LoopExitI != KernelBB->succ_end() && "Expecting a successor");
2404
81
  MachineBasicBlock *LoopExitBB = *LoopExitI;
2405
81
2406
81
  MachineBasicBlock *PredBB = KernelBB;
2407
81
  MachineBasicBlock *EpilogStart = LoopExitBB;
2408
81
  InstrMapTy InstrMap;
2409
81
2410
81
  // Generate a basic block for each stage, not including the last stage,
2411
81
  // which was generated for the kernel.  Each basic block may contain
2412
81
  // instructions from multiple stages/iterations.
2413
81
  int EpilogStage = LastStage + 1;
2414
170
  for (unsigned i = LastStage; 
i >= 1170
;
--i, ++EpilogStage89
) {
2415
89
    MachineBasicBlock *NewBB = MF.CreateMachineBasicBlock();
2416
89
    EpilogBBs.push_back(NewBB);
2417
89
    MF.insert(BB->getIterator(), NewBB);
2418
89
2419
89
    PredBB->replaceSuccessor(LoopExitBB, NewBB);
2420
89
    NewBB->addSuccessor(LoopExitBB);
2421
89
2422
89
    if (EpilogStart == LoopExitBB)
2423
81
      EpilogStart = NewBB;
2424
89
2425
89
    // Add instructions to the epilog depending on the current block.
2426
89
    // Process instructions in original program order.
2427
188
    for (unsigned StageNum = i; 
StageNum <= LastStage188
;
++StageNum99
) {
2428
859
      for (auto &BBI : *BB) {
2429
859
        if (BBI.isPHI())
2430
192
          continue;
2431
667
        MachineInstr *In = &BBI;
2432
667
        if (
Schedule.isScheduledAtStage(getSUnit(In), StageNum)667
) {
2433
138
          MachineInstr *NewMI = cloneInstr(In, EpilogStage - LastStage, 0);
2434
138
          updateInstruction(NewMI, i == 1, EpilogStage, 0, Schedule, VRMap);
2435
138
          NewBB->push_back(NewMI);
2436
138
          InstrMap[NewMI] = In;
2437
138
        }
2438
859
      }
2439
99
    }
2440
89
    generateExistingPhis(NewBB, PrologBBs[i - 1], PredBB, KernelBB, Schedule,
2441
89
                         VRMap, InstrMap, LastStage, EpilogStage, i == 1);
2442
89
    generatePhis(NewBB, PrologBBs[i - 1], PredBB, KernelBB, Schedule, VRMap,
2443
89
                 InstrMap, LastStage, EpilogStage, i == 1);
2444
89
    PredBB = NewBB;
2445
89
2446
89
    DEBUG({
2447
89
      dbgs() << "epilog:\n";
2448
89
      NewBB->dump();
2449
89
    });
2450
89
  }
2451
81
2452
81
  // Fix any Phi nodes in the loop exit block.
2453
90
  for (MachineInstr &MI : *LoopExitBB) {
2454
90
    if (!MI.isPHI())
2455
81
      break;
2456
30
    
for (unsigned i = 2, e = MI.getNumOperands() + 1; 9
i != e30
;
i += 221
) {
2457
21
      MachineOperand &MO = MI.getOperand(i);
2458
21
      if (MO.getMBB() == BB)
2459
9
        MO.setMBB(PredBB);
2460
21
    }
2461
90
  }
2462
81
2463
81
  // Create a branch to the new epilog from the kernel.
2464
81
  // Remove the original branch and add a new branch to the epilog.
2465
81
  TII->removeBranch(*KernelBB);
2466
81
  TII->insertBranch(*KernelBB, KernelBB, EpilogStart, Cond, DebugLoc());
2467
81
  // Add a branch to the loop exit.
2468
81
  if (
EpilogBBs.size() > 081
) {
2469
81
    MachineBasicBlock *LastEpilogBB = EpilogBBs.back();
2470
81
    SmallVector<MachineOperand, 4> Cond1;
2471
81
    TII->insertBranch(*LastEpilogBB, LoopExitBB, nullptr, Cond1, DebugLoc());
2472
81
  }
2473
81
}
2474
2475
/// Replace all uses of FromReg that appear outside the specified
2476
/// basic block with ToReg.
2477
static void replaceRegUsesAfterLoop(unsigned FromReg, unsigned ToReg,
2478
                                    MachineBasicBlock *MBB,
2479
                                    MachineRegisterInfo &MRI,
2480
318
                                    LiveIntervals &LIS) {
2481
318
  for (MachineRegisterInfo::use_iterator I = MRI.use_begin(FromReg),
2482
318
                                         E = MRI.use_end();
2483
789
       
I != E789
;) {
2484
471
    MachineOperand &O = *I;
2485
471
    ++I;
2486
471
    if (O.getParent()->getParent() != MBB)
2487
29
      O.setReg(ToReg);
2488
471
  }
2489
318
  if (!LIS.hasInterval(ToReg))
2490
318
    LIS.createEmptyInterval(ToReg);
2491
318
}
2492
2493
/// Return true if the register has a use that occurs outside the
2494
/// specified loop.
2495
static bool hasUseAfterLoop(unsigned Reg, MachineBasicBlock *BB,
2496
155
                            MachineRegisterInfo &MRI) {
2497
155
  for (MachineRegisterInfo::use_iterator I = MRI.use_begin(Reg),
2498
155
                                         E = MRI.use_end();
2499
321
       
I != E321
;
++I166
)
2500
166
    
if (166
I->getParent()->getParent() != BB166
)
2501
0
      return true;
2502
155
  return false;
2503
155
}
2504
2505
/// Generate Phis for the specific block in the generated pipelined code.
2506
/// This function looks at the Phis from the original code to guide the
2507
/// creation of new Phis.
2508
void SwingSchedulerDAG::generateExistingPhis(
2509
    MachineBasicBlock *NewBB, MachineBasicBlock *BB1, MachineBasicBlock *BB2,
2510
    MachineBasicBlock *KernelBB, SMSchedule &Schedule, ValueMapTy *VRMap,
2511
    InstrMapTy &InstrMap, unsigned LastStageNum, unsigned CurStageNum,
2512
170
    bool IsLast) {
2513
170
  // Compute the stage number for the initial value of the Phi, which
2514
170
  // comes from the prolog. The prolog to use depends on to which kernel/
2515
170
  // epilog that we're adding the Phi.
2516
170
  unsigned PrologStage = 0;
2517
170
  unsigned PrevStage = 0;
2518
170
  bool InKernel = (LastStageNum == CurStageNum);
2519
170
  if (
InKernel170
) {
2520
81
    PrologStage = LastStageNum - 1;
2521
81
    PrevStage = CurStageNum;
2522
170
  } else {
2523
89
    PrologStage = LastStageNum - (CurStageNum - LastStageNum);
2524
89
    PrevStage = LastStageNum + (CurStageNum - LastStageNum) - 1;
2525
89
  }
2526
170
2527
170
  for (MachineBasicBlock::iterator BBI = BB->instr_begin(),
2528
170
                                   BBE = BB->getFirstNonPHI();
2529
463
       
BBI != BBE463
;
++BBI293
) {
2530
293
    unsigned Def = BBI->getOperand(0).getReg();
2531
293
2532
293
    unsigned InitVal = 0;
2533
293
    unsigned LoopVal = 0;
2534
293
    getPhiRegs(*BBI, BB, InitVal, LoopVal);
2535
293
2536
293
    unsigned PhiOp1 = 0;
2537
293
    // The Phi value from the loop body typically is defined in the loop, but
2538
293
    // not always. So, we need to check if the value is defined in the loop.
2539
293
    unsigned PhiOp2 = LoopVal;
2540
293
    if (VRMap[LastStageNum].count(LoopVal))
2541
293
      PhiOp2 = VRMap[LastStageNum][LoopVal];
2542
293
2543
293
    int StageScheduled = Schedule.stageScheduled(getSUnit(&*BBI));
2544
293
    int LoopValStage =
2545
293
        Schedule.stageScheduled(getSUnit(MRI.getVRegDef(LoopVal)));
2546
293
    unsigned NumStages = Schedule.getStagesForReg(Def, CurStageNum);
2547
293
    if (
NumStages == 0293
) {
2548
2
      // We don't need to generate a Phi anymore, but we need to rename any uses
2549
2
      // of the Phi value.
2550
2
      unsigned NewReg = VRMap[PrevStage][LoopVal];
2551
2
      rewriteScheduledInstr(NewBB, Schedule, InstrMap, CurStageNum, 0, &*BBI,
2552
2
                            Def, NewReg);
2553
2
      if (VRMap[CurStageNum].count(LoopVal))
2554
2
        VRMap[CurStageNum][Def] = VRMap[CurStageNum][LoopVal];
2555
2
    }
2556
293
    // Adjust the number of Phis needed depending on the number of prologs left,
2557
293
    // and the distance from where the Phi is first scheduled.
2558
293
    unsigned NumPhis = NumStages;
2559
293
    if (
!InKernel && 293
(int)PrologStage < LoopValStage160
)
2560
293
      // The NumPhis is the maximum number of new Phis needed during the steady
2561
293
      // state. If the Phi has not been scheduled in current prolog, then we
2562
293
      // need to generate less Phis.
2563
47
      NumPhis = std::max((int)NumPhis - (int)(LoopValStage - PrologStage), 1);
2564
293
    // The number of Phis cannot exceed the number of prolog stages. Each
2565
293
    // stage can potentially define two values.
2566
293
    NumPhis = std::min(NumPhis, PrologStage + 2);
2567
293
2568
293
    unsigned NewReg = 0;
2569
293
2570
293
    unsigned AccessStage = (LoopValStage != -1) ? 
LoopValStage293
:
StageScheduled0
;
2571
293
    // In the epilog, we may need to look back one stage to get the correct
2572
293
    // Phi name because the epilog and prolog blocks execute the same stage.
2573
293
    // The correct name is from the previous block only when the Phi has
2574
293
    // been completely scheduled prior to the epilog, and Phi value is not
2575
293
    // needed in multiple stages.
2576
293
    int StageDiff = 0;
2577
293
    if (
!InKernel && 293
StageScheduled >= LoopValStage160
&&
AccessStage == 0158
&&
2578
112
        NumPhis == 1)
2579
106
      StageDiff = 1;
2580
293
    // Adjust the computations below when the phi and the loop definition
2581
293
    // are scheduled in different stages.
2582
293
    if (
InKernel && 293
LoopValStage != -1133
&&
StageScheduled > LoopValStage133
)
2583
0
      StageDiff = StageScheduled - LoopValStage;
2584
596
    for (unsigned np = 0; 
np < NumPhis596
;
++np303
) {
2585
303
      // If the Phi hasn't been scheduled, then use the initial Phi operand
2586
303
      // value. Otherwise, use the scheduled version of the instruction. This
2587
303
      // is a little complicated when a Phi references another Phi.
2588
303
      if (
np > PrologStage || 303
StageScheduled >= (int)LastStageNum291
)
2589
92
        PhiOp1 = InitVal;
2590
303
      // Check if the Phi has already been scheduled in a prolog stage.
2591
211
      else 
if (211
PrologStage >= AccessStage + StageDiff + np &&
2592
120
               VRMap[PrologStage - StageDiff - np].count(LoopVal) != 0)
2593
112
        PhiOp1 = VRMap[PrologStage - StageDiff - np][LoopVal];
2594
211
      // Check if the Phi has already been scheduled, but the loop intruction
2595
211
      // is either another Phi, or doesn't occur in the loop.
2596
99
      else 
if (99
PrologStage >= AccessStage + StageDiff + np99
) {
2597
8
        // If the Phi references another Phi, we need to examine the other
2598
8
        // Phi to get the correct value.
2599
8
        PhiOp1 = LoopVal;
2600
8
        MachineInstr *InstOp1 = MRI.getVRegDef(PhiOp1);
2601
8
        int Indirects = 1;
2602
12
        while (
InstOp1 && 12
InstOp1->isPHI()12
&&
InstOp1->getParent() == BB8
) {
2603
8
          int PhiStage = Schedule.stageScheduled(getSUnit(InstOp1));
2604
8
          if ((int)(PrologStage - StageDiff - np) < PhiStage + Indirects)
2605
4
            PhiOp1 = getInitPhiReg(*InstOp1, BB);
2606
8
          else
2607
4
            PhiOp1 = getLoopPhiReg(*InstOp1, BB);
2608
8
          InstOp1 = MRI.getVRegDef(PhiOp1);
2609
8
          int PhiOpStage = Schedule.stageScheduled(getSUnit(InstOp1));
2610
8
          int StageAdj = (PhiOpStage != -1 ? 
PhiStage - PhiOpStage4
:
04
);
2611
8
          if (
PhiOpStage != -1 && 8
PrologStage - StageAdj >= Indirects + np4
&&
2612
8
              
VRMap[PrologStage - StageAdj - Indirects - np].count(PhiOp1)4
) {
2613
4
            PhiOp1 = VRMap[PrologStage - StageAdj - Indirects - np][PhiOp1];
2614
4
            break;
2615
4
          }
2616
4
          ++Indirects;
2617
4
        }
2618
8
      } else
2619
91
        PhiOp1 = InitVal;
2620
303
      // If this references a generated Phi in the kernel, get the Phi operand
2621
303
      // from the incoming block.
2622
303
      if (MachineInstr *InstOp1 = MRI.getVRegDef(PhiOp1))
2623
303
        
if (303
InstOp1->isPHI() && 303
InstOp1->getParent() == KernelBB11
)
2624
1
          PhiOp1 = getInitPhiReg(*InstOp1, KernelBB);
2625
303
2626
303
      MachineInstr *PhiInst = MRI.getVRegDef(LoopVal);
2627
303
      bool LoopDefIsPhi = PhiInst && PhiInst->isPHI();
2628
303
      // In the epilog, a map lookup is needed to get the value from the kernel,
2629
303
      // or previous epilog block. How is does this depends on if the
2630
303
      // instruction is scheduled in the previous block.
2631
303
      if (
!InKernel303
) {
2632
166
        int StageDiffAdj = 0;
2633
166
        if (
LoopValStage != -1 && 166
StageScheduled > LoopValStage166
)
2634
0
          StageDiffAdj = StageScheduled - LoopValStage;
2635
166
        // Use the loop value defined in the kernel, unless the kernel
2636
166
        // contains the last definition of the Phi.
2637
166
        if (
np == 0 && 166
PrevStage == LastStageNum160
&&
2638
133
            
(StageScheduled != 0 || 133
LoopValStage != 096
) &&
2639
39
            VRMap[PrevStage - StageDiffAdj].count(LoopVal))
2640
39
          PhiOp2 = VRMap[PrevStage - StageDiffAdj][LoopVal];
2641
166
        // Use the value defined by the Phi. We add one because we switch
2642
166
        // from looking at the loop value to the Phi definition.
2643
127
        else 
if (127
np > 0 && 127
PrevStage == LastStageNum6
&&
2644
6
                 VRMap[PrevStage - np + 1].count(Def))
2645
6
          PhiOp2 = VRMap[PrevStage - np + 1][Def];
2646
127
        // Use the loop value defined in the kernel.
2647
121
        else 
if (121
(unsigned)LoopValStage + StageDiffAdj > PrologStage + 1 &&
2648
8
                 VRMap[PrevStage - StageDiffAdj - np].count(LoopVal))
2649
8
          PhiOp2 = VRMap[PrevStage - StageDiffAdj - np][LoopVal];
2650
121
        // Use the value defined by the Phi, unless we're generating the first
2651
121
        // epilog and the Phi refers to a Phi in a different stage.
2652
113
        else 
if (113
VRMap[PrevStage - np].count(Def) &&
2653
113
                 
(!LoopDefIsPhi || 113
PrevStage != LastStageNum8
))
2654
109
          PhiOp2 = VRMap[PrevStage - np][Def];
2655
166
      }
2656
303
2657
303
      // Check if we can reuse an existing Phi. This occurs when a Phi
2658
303
      // references another Phi, and the other Phi is scheduled in an
2659
303
      // earlier stage. We can try to reuse an existing Phi up until the last
2660
303
      // stage of the current Phi.
2661
303
      if (
LoopDefIsPhi && 303
(int)PrologStage >= StageScheduled12
) {
2662
12
        int LVNumStages = Schedule.getStagesForPhi(LoopVal);
2663
12
        int StageDiff = (StageScheduled - LoopValStage);
2664
12
        LVNumStages -= StageDiff;
2665
12
        if (
LVNumStages > (int)np12
) {
2666
0
          NewReg = PhiOp2;
2667
0
          unsigned ReuseStage = CurStageNum;
2668
0
          if (Schedule.isLoopCarried(this, *PhiInst))
2669
0
            ReuseStage -= LVNumStages;
2670
0
          // Check if the Phi to reuse has been generated yet. If not, then
2671
0
          // there is nothing to reuse.
2672
0
          if (
VRMap[ReuseStage].count(LoopVal)0
) {
2673
0
            NewReg = VRMap[ReuseStage][LoopVal];
2674
0
2675
0
            rewriteScheduledInstr(NewBB, Schedule, InstrMap, CurStageNum, np,
2676
0
                                  &*BBI, Def, NewReg);
2677
0
            // Update the map with the new Phi name.
2678
0
            VRMap[CurStageNum - np][Def] = NewReg;
2679
0
            PhiOp2 = NewReg;
2680
0
            if (VRMap[LastStageNum - np - 1].count(LoopVal))
2681
0
              PhiOp2 = VRMap[LastStageNum - np - 1][LoopVal];
2682
0
2683
0
            if (
IsLast && 0
np == NumPhis - 10
)
2684
0
              replaceRegUsesAfterLoop(Def, NewReg, BB, MRI, LIS);
2685
0
            continue;
2686
0
          }
2687
12
        } else 
if (12
InKernel && 12
StageDiff > 04
&&
2688
0
                   VRMap[CurStageNum - StageDiff - np].count(LoopVal))
2689
0
          PhiOp2 = VRMap[CurStageNum - StageDiff - np][LoopVal];
2690
12
      }
2691
303
2692
303
      const TargetRegisterClass *RC = MRI.getRegClass(Def);
2693
303
      NewReg = MRI.createVirtualRegister(RC);
2694
303
2695
303
      MachineInstrBuilder NewPhi =
2696
303
          BuildMI(*NewBB, NewBB->getFirstNonPHI(), DebugLoc(),
2697
303
                  TII->get(TargetOpcode::PHI), NewReg);
2698
303
      NewPhi.addReg(PhiOp1).addMBB(BB1);
2699
303
      NewPhi.addReg(PhiOp2).addMBB(BB2);
2700
303
      if (np == 0)
2701
291
        InstrMap[NewPhi] = &*BBI;
2702
303
2703
303
      // We define the Phis after creating the new pipelined code, so
2704
303
      // we need to rename the Phi values in scheduled instructions.
2705
303
2706
303
      unsigned PrevReg = 0;
2707
303
      if (
InKernel && 303
VRMap[PrevStage - np].count(LoopVal)137
)
2708
137
        PrevReg = VRMap[PrevStage - np][LoopVal];
2709
303
      rewriteScheduledInstr(NewBB, Schedule, InstrMap, CurStageNum, np, &*BBI,
2710
303
                            Def, NewReg, PrevReg);
2711
303
      // If the Phi has been scheduled, use the new name for rewriting.
2712
303
      if (
VRMap[CurStageNum - np].count(Def)303
) {
2713
6
        unsigned R = VRMap[CurStageNum - np][Def];
2714
6
        rewriteScheduledInstr(NewBB, Schedule, InstrMap, CurStageNum, np, &*BBI,
2715
6
                              R, NewReg);
2716
6
      }
2717
303
2718
303
      // Check if we need to rename any uses that occurs after the loop. The
2719
303
      // register to replace depends on whether the Phi is scheduled in the
2720
303
      // epilog.
2721
303
      if (
IsLast && 303
np == NumPhis - 1139
)
2722
133
        replaceRegUsesAfterLoop(Def, NewReg, BB, MRI, LIS);
2723
303
2724
303
      // In the kernel, a dependent Phi uses the value from this Phi.
2725
303
      if (InKernel)
2726
137
        PhiOp2 = NewReg;
2727
303
2728
303
      // Update the map with the new Phi name.
2729
303
      VRMap[CurStageNum - np][Def] = NewReg;
2730
303
    }
2731
293
2732
293
    while (
NumPhis++ < NumStages293
) {
2733
0
      rewriteScheduledInstr(NewBB, Schedule, InstrMap, CurStageNum, NumPhis,
2734
0
                            &*BBI, Def, NewReg, 0);
2735
0
    }
2736
293
2737
293
    // Check if we need to rename a Phi that has been eliminated due to
2738
293
    // scheduling.
2739
293
    if (
NumStages == 0 && 293
IsLast2
&&
VRMap[CurStageNum].count(LoopVal)0
)
2740
0
      replaceRegUsesAfterLoop(Def, VRMap[CurStageNum][LoopVal], BB, MRI, LIS);
2741
293
  }
2742
170
}
2743
2744
/// Generate Phis for the specified block in the generated pipelined code.
2745
/// These are new Phis needed because the definition is scheduled after the
2746
/// use in the pipelened sequence.
2747
void SwingSchedulerDAG::generatePhis(
2748
    MachineBasicBlock *NewBB, MachineBasicBlock *BB1, MachineBasicBlock *BB2,
2749
    MachineBasicBlock *KernelBB, SMSchedule &Schedule, ValueMapTy *VRMap,
2750
    InstrMapTy &InstrMap, unsigned LastStageNum, unsigned CurStageNum,
2751
170
    bool IsLast) {
2752
170
  // Compute the stage number that contains the initial Phi value, and
2753
170
  // the Phi from the previous stage.
2754
170
  unsigned PrologStage = 0;
2755
170
  unsigned PrevStage = 0;
2756
170
  unsigned StageDiff = CurStageNum - LastStageNum;
2757
170
  bool InKernel = (StageDiff == 0);
2758
170
  if (
InKernel170
) {
2759
81
    PrologStage = LastStageNum - 1;
2760
81
    PrevStage = CurStageNum;
2761
170
  } else {
2762
89
    PrologStage = LastStageNum - StageDiff;
2763
89
    PrevStage = LastStageNum + StageDiff - 1;
2764
89
  }
2765
170
2766
170
  for (MachineBasicBlock::iterator BBI = BB->getFirstNonPHI(),
2767
170
                                   BBE = BB->instr_end();
2768
1.27k
       
BBI != BBE1.27k
;
++BBI1.10k
) {
2769
4.85k
    for (unsigned i = 0, e = BBI->getNumOperands(); 
i != e4.85k
;
++i3.75k
) {
2770
3.75k
      MachineOperand &MO = BBI->getOperand(i);
2771
3.75k
      if (
!MO.isReg() || 3.75k
!MO.isDef()2.81k
||
2772
1.25k
          !TargetRegisterInfo::isVirtualRegister(MO.getReg()))
2773
3.04k
        continue;
2774
712
2775
712
      int StageScheduled = Schedule.stageScheduled(getSUnit(&*BBI));
2776
712
      assert(StageScheduled != -1 && "Expecting scheduled instruction.");
2777
712
      unsigned Def = MO.getReg();
2778
712
      unsigned NumPhis = Schedule.getStagesForReg(Def, CurStageNum);
2779
712
      // An instruction scheduled in stage 0 and is used after the loop
2780
712
      // requires a phi in the epilog for the last definition from either
2781
712
      // the kernel or prolog.
2782
712
      if (
!InKernel && 712
NumPhis == 0386
&&
StageScheduled == 0242
&&
2783
155
          hasUseAfterLoop(Def, BB, MRI))
2784
0
        NumPhis = 1;
2785
712
      if (
!InKernel && 712
(unsigned)StageScheduled > PrologStage386
)
2786
88
        continue;
2787
624
2788
624
      unsigned PhiOp2 = VRMap[PrevStage][Def];
2789
624
      if (MachineInstr *InstOp2 = MRI.getVRegDef(PhiOp2))
2790
609
        
if (609
InstOp2->isPHI() && 609
InstOp2->getParent() == NewBB21
)
2791
0
          PhiOp2 = getLoopPhiReg(*InstOp2, BB2);
2792
624
      // The number of Phis can't exceed the number of prolog stages. The
2793
624
      // prolog stage number is zero based.
2794
624
      if (NumPhis > PrologStage + 1 - StageScheduled)
2795
2
        NumPhis = PrologStage + 1 - StageScheduled;
2796
877
      for (unsigned np = 0; 
np < NumPhis877
;
++np253
) {
2797
253
        unsigned PhiOp1 = VRMap[PrologStage][Def];
2798
253
        if (np <= PrologStage)
2799
253
          PhiOp1 = VRMap[PrologStage - np][Def];
2800
253
        if (MachineInstr *
InstOp1253
= MRI.getVRegDef(PhiOp1)) {
2801
253
          if (
InstOp1->isPHI() && 253
InstOp1->getParent() == KernelBB118
)
2802
118
            PhiOp1 = getInitPhiReg(*InstOp1, KernelBB);
2803
253
          if (
InstOp1->isPHI() && 253
InstOp1->getParent() == NewBB118
)
2804
0
            PhiOp1 = getInitPhiReg(*InstOp1, NewBB);
2805
253
        }
2806
253
        if (!InKernel)
2807
137
          PhiOp2 = VRMap[PrevStage - np][Def];
2808
253
2809
253
        const TargetRegisterClass *RC = MRI.getRegClass(Def);
2810
253
        unsigned NewReg = MRI.createVirtualRegister(RC);
2811
253
2812
253
        MachineInstrBuilder NewPhi =
2813
253
            BuildMI(*NewBB, NewBB->getFirstNonPHI(), DebugLoc(),
2814
253
                    TII->get(TargetOpcode::PHI), NewReg);
2815
253
        NewPhi.addReg(PhiOp1).addMBB(BB1);
2816
253
        NewPhi.addReg(PhiOp2).addMBB(BB2);
2817
253
        if (np == 0)
2818
249
          InstrMap[NewPhi] = &*BBI;
2819
253
2820
253
        // Rewrite uses and update the map. The actions depend upon whether
2821
253
        // we generating code for the kernel or epilog blocks.
2822
253
        if (
InKernel253
) {
2823
116
          rewriteScheduledInstr(NewBB, Schedule, InstrMap, CurStageNum, np,
2824
116
                                &*BBI, PhiOp1, NewReg);
2825
116
          rewriteScheduledInstr(NewBB, Schedule, InstrMap, CurStageNum, np,
2826
116
                                &*BBI, PhiOp2, NewReg);
2827
116
2828
116
          PhiOp2 = NewReg;
2829
116
          VRMap[PrevStage - np - 1][Def] = NewReg;
2830
253
        } else {
2831
137
          VRMap[CurStageNum - np][Def] = NewReg;
2832
137
          if (np == NumPhis - 1)
2833
135
            rewriteScheduledInstr(NewBB, Schedule, InstrMap, CurStageNum, np,
2834
135
                                  &*BBI, Def, NewReg);
2835
137
        }
2836
253
        if (
IsLast && 253
np == NumPhis - 1106
)
2837
106
          replaceRegUsesAfterLoop(Def, NewReg, BB, MRI, LIS);
2838
253
      }
2839
3.75k
    }
2840
1.10k
  }
2841
170
}
2842
2843
/// Remove instructions that generate values with no uses.
2844
/// Typically, these are induction variable operations that generate values
2845
/// used in the loop itself.  A dead instruction has a definition with
2846
/// no uses, or uses that occur in the original loop only.
2847
void SwingSchedulerDAG::removeDeadInstructions(MachineBasicBlock *KernelBB,
2848
81
                                               MBBVectorTy &EpilogBBs) {
2849
81
  // For each epilog block, check that the value defined by each instruction
2850
81
  // is used.  If not, delete it.
2851
81
  for (MBBVectorTy::reverse_iterator MBB = EpilogBBs.rbegin(),
2852
81
                                     MBE = EpilogBBs.rend();
2853
170
       
MBB != MBE170
;
++MBB89
)
2854
89
    for (MachineBasicBlock::reverse_instr_iterator MI = (*MBB)->instr_rbegin(),
2855
89
                                                   ME = (*MBB)->instr_rend();
2856
611
         
MI != ME611
;) {
2857
522
      // From DeadMachineInstructionElem. Don't delete inline assembly.
2858
522
      if (
MI->isInlineAsm()522
) {
2859
1
        ++MI;
2860
1
        continue;
2861
1
      }
2862
521
      bool SawStore = false;
2863
521
      // Check if it's safe to remove the instruction due to side effects.
2864
521
      // We can, and want to, remove Phis here.
2865
521
      if (
!MI->isSafeToMove(nullptr, SawStore) && 521
!MI->isPHI()447
) {
2866
144
        ++MI;
2867
144
        continue;
2868
144
      }
2869
377
      bool used = true;
2870
377
      for (MachineInstr::mop_iterator MOI = MI->operands_begin(),
2871
377
                                      MOE = MI->operands_end();
2872
981
           
MOI != MOE981
;
++MOI604
) {
2873
859
        if (
!MOI->isReg() || 859
!MOI->isDef()620
)
2874
482
          continue;
2875
377
        unsigned reg = MOI->getReg();
2876
377
        unsigned realUses = 0;
2877
377
        for (MachineRegisterInfo::use_iterator UI = MRI.use_begin(reg),
2878
377
                                               EI = MRI.use_end();
2879
377
             
UI != EI377
;
++UI0
) {
2880
255
          // Check if there are any uses that occur only in the original
2881
255
          // loop.  If so, that's not a real use.
2882
255
          if (
UI->getParent()->getParent() != BB255
) {
2883
255
            realUses++;
2884
255
            used = true;
2885
255
            break;
2886
255
          }
2887
255
        }
2888
377
        if (realUses > 0)
2889
255
          break;
2890
122
        used = false;
2891
122
      }
2892
377
      if (
!used377
) {
2893
122
        MI++->eraseFromParent();
2894
122
        continue;
2895
122
      }
2896
255
      ++MI;
2897
255
    }
2898
81
  // In the kernel block, check if we can remove a Phi that generates a value
2899
81
  // used in an instruction removed in the epilog block.
2900
81
  for (MachineBasicBlock::iterator BBI = KernelBB->instr_begin(),
2901
81
                                   BBE = KernelBB->getFirstNonPHI();
2902
334
       
BBI != BBE334
;) {
2903
253
    MachineInstr *MI = &*BBI;
2904
253
    ++BBI;
2905
253
    unsigned reg = MI->getOperand(0).getReg();
2906
253
    if (
MRI.use_begin(reg) == MRI.use_end()253
) {
2907
0
      MI->eraseFromParent();
2908
0
    }
2909
253
  }
2910
81
}
2911
2912
/// For loop carried definitions, we split the lifetime of a virtual register
2913
/// that has uses past the definition in the next iteration. A copy with a new
2914
/// virtual register is inserted before the definition, which helps with
2915
/// generating a better register assignment.
2916
///
2917
///   v1 = phi(a, v2)     v1 = phi(a, v2)
2918
///   v2 = phi(b, v3)     v2 = phi(b, v3)
2919
///   v3 = ..             v4 = copy v1
2920
///   .. = V1             v3 = ..
2921
///                       .. = v4
2922
void SwingSchedulerDAG::splitLifetimes(MachineBasicBlock *KernelBB,
2923
                                       MBBVectorTy &EpilogBBs,
2924
81
                                       SMSchedule &Schedule) {
2925
81
  const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
2926
81
  for (MachineBasicBlock::iterator BBI = KernelBB->instr_begin(),
2927
81
                                   BBF = KernelBB->getFirstNonPHI();
2928
334
       
BBI != BBF334
;
++BBI253
) {
2929
253
    unsigned Def = BBI->getOperand(0).getReg();
2930
253
    // Check for any Phi definition that used as an operand of another Phi
2931
253
    // in the same block.
2932
253
    for (MachineRegisterInfo::use_instr_iterator I = MRI.use_instr_begin(Def),
2933
253
                                                 E = MRI.use_instr_end();
2934
701
         
I != E701
;
++I448
) {
2935
454
      if (
I->isPHI() && 454
I->getParent() == KernelBB102
) {
2936
12
        // Get the loop carried definition.
2937
12
        unsigned LCDef = getLoopPhiReg(*BBI, KernelBB);
2938
12
        if (!LCDef)
2939
0
          continue;
2940
12
        MachineInstr *MI = MRI.getVRegDef(LCDef);
2941
12
        if (
!MI || 12
MI->getParent() != KernelBB12
||
MI->isPHI()12
)
2942
0
          continue;
2943
12
        // Search through the rest of the block looking for uses of the Phi
2944
12
        // definition. If one occurs, then split the lifetime.
2945
12
        unsigned SplitReg = 0;
2946
12
        for (auto &BBJ : make_range(MachineBasicBlock::instr_iterator(MI),
2947
12
                                    KernelBB->instr_end()))
2948
84
          
if (84
BBJ.readsRegister(Def)84
) {
2949
12
            // We split the lifetime when we find the first use.
2950
12
            if (
SplitReg == 012
) {
2951
6
              SplitReg = MRI.createVirtualRegister(MRI.getRegClass(Def));
2952
6
              BuildMI(*KernelBB, MI, MI->getDebugLoc(),
2953
6
                      TII->get(TargetOpcode::COPY), SplitReg)
2954
6
                  .addReg(Def);
2955
6
            }
2956
84
            BBJ.substituteRegister(Def, SplitReg, 0, *TRI);
2957
84
          }
2958
12
        if (!SplitReg)
2959
6
          continue;
2960
6
        // Search through each of the epilog blocks for any uses to be renamed.
2961
6
        for (auto &Epilog : EpilogBBs)
2962
6
          for (auto &I : *Epilog)
2963
56
            
if (56
I.readsRegister(Def)56
)
2964
12
              I.substituteRegister(Def, SplitReg, 0, *TRI);
2965
12
        break;
2966
12
      }
2967
454
    }
2968
253
  }
2969
81
}
2970
2971
/// Remove the incoming block from the Phis in a basic block.
2972
17
static void removePhis(MachineBasicBlock *BB, MachineBasicBlock *Incoming) {
2973
48
  for (MachineInstr &MI : *BB) {
2974
48
    if (!MI.isPHI())
2975
17
      break;
2976
33
    
for (unsigned i = 1, e = MI.getNumOperands(); 31
i != e33
;
i += 22
)
2977
33
      
if (33
MI.getOperand(i + 1).getMBB() == Incoming33
) {
2978
31
        MI.RemoveOperand(i + 1);
2979
31
        MI.RemoveOperand(i);
2980
31
        break;
2981
31
      }
2982
48
  }
2983
17
}
2984
2985
/// Create branches from each prolog basic block to the appropriate epilog
2986
/// block.  These edges are needed if the loop ends before reaching the
2987
/// kernel.
2988
void SwingSchedulerDAG::addBranches(MBBVectorTy &PrologBBs,
2989
                                    MachineBasicBlock *KernelBB,
2990
                                    MBBVectorTy &EpilogBBs,
2991
81
                                    SMSchedule &Schedule, ValueMapTy *VRMap) {
2992
81
  assert(PrologBBs.size() == EpilogBBs.size() && "Prolog/Epilog mismatch");
2993
81
  MachineInstr *IndVar = Pass.LI.LoopInductionVar;
2994
81
  MachineInstr *Cmp = Pass.LI.LoopCompare;
2995
81
  MachineBasicBlock *LastPro = KernelBB;
2996
81
  MachineBasicBlock *LastEpi = KernelBB;
2997
81
2998
81
  // Start from the blocks connected to the kernel and work "out"
2999
81
  // to the first prolog and the last epilog blocks.
3000
81
  SmallVector<MachineInstr *, 4> PrevInsts;
3001
81
  unsigned MaxIter = PrologBBs.size() - 1;
3002
81
  unsigned LC = UINT_MAX;
3003
81
  unsigned LCMin = UINT_MAX;
3004
170
  for (unsigned i = 0, j = MaxIter; 
i <= MaxIter170
;
++i, --j89
) {
3005
89
    // Add branches to the prolog that go to the corresponding
3006
89
    // epilog, and the fall-thru prolog/kernel block.
3007
89
    MachineBasicBlock *Prolog = PrologBBs[j];
3008
89
    MachineBasicBlock *Epilog = EpilogBBs[i];
3009
89
    // We've executed one iteration, so decrement the loop count and check for
3010
89
    // the loop end.
3011
89
    SmallVector<MachineOperand, 4> Cond;
3012
89
    // Check if the LOOP0 has already been removed. If so, then there is no need
3013
89
    // to reduce the trip count.
3014
89
    if (LC != 0)
3015
89
      LC = TII->reduceLoopCount(*Prolog, IndVar, *Cmp, Cond, PrevInsts, j,
3016
89
                                MaxIter);
3017
89
3018
89
    // Record the value of the first trip count, which is used to determine if
3019
89
    // branches and blocks can be removed for constant trip counts.
3020
89
    if (LCMin == UINT_MAX)
3021
81
      LCMin = LC;
3022
89
3023
89
    unsigned numAdded = 0;
3024
89
    if (
TargetRegisterInfo::isVirtualRegister(LC)89
) {
3025
72
      Prolog->addSuccessor(Epilog);
3026
72
      numAdded = TII->insertBranch(*Prolog, Epilog, LastPro, Cond, DebugLoc());
3027
89
    } else 
if (17
j >= LCMin17
) {
3028
1
      Prolog->addSuccessor(Epilog);
3029
1
      Prolog->removeSuccessor(LastPro);
3030
1
      LastEpi->removeSuccessor(Epilog);
3031
1
      numAdded = TII->insertBranch(*Prolog, Epilog, nullptr, Cond, DebugLoc());
3032
1
      removePhis(Epilog, LastEpi);
3033
1
      // Remove the blocks that are no longer referenced.
3034
1
      if (
LastPro != LastEpi1
) {
3035
0
        LastEpi->clear();
3036
0
        LastEpi->eraseFromParent();
3037
0
      }
3038
1
      LastPro->clear();
3039
1
      LastPro->eraseFromParent();
3040
17
    } else {
3041
16
      numAdded = TII->insertBranch(*Prolog, LastPro, nullptr, Cond, DebugLoc());
3042
16
      removePhis(Epilog, Prolog);
3043
16
    }
3044
89
    LastPro = Prolog;
3045
89
    LastEpi = Epilog;
3046
89
    for (MachineBasicBlock::reverse_instr_iterator I = Prolog->instr_rbegin(),
3047
89
                                                   E = Prolog->instr_rend();
3048
250
         
I != E && 250
numAdded > 0250
;
++I, --numAdded161
)
3049
161
      updateInstruction(&*I, false, j, 0, Schedule, VRMap);
3050
89
  }
3051
81
}
3052
3053
/// Return true if we can compute the amount the instruction changes
3054
/// during each iteration. Set Delta to the amount of the change.
3055
35.7k
bool SwingSchedulerDAG::computeDelta(MachineInstr &MI, unsigned &Delta) {
3056
35.7k
  const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
3057
35.7k
  unsigned BaseReg;
3058
35.7k
  int64_t Offset;
3059
35.7k
  if (!TII->getMemOpBaseRegImmOfs(MI, BaseReg, Offset, TRI))
3060
34.4k
    return false;
3061
1.29k
3062
1.29k
  MachineRegisterInfo &MRI = MF.getRegInfo();
3063
1.29k
  // Check if there is a Phi. If so, get the definition in the loop.
3064
1.29k
  MachineInstr *BaseDef = MRI.getVRegDef(BaseReg);
3065
1.29k
  if (
BaseDef && 1.29k
BaseDef->isPHI()1.29k
) {
3066
1.13k
    BaseReg = getLoopPhiReg(*BaseDef, MI.getParent());
3067
1.13k
    BaseDef = MRI.getVRegDef(BaseReg);
3068
1.13k
  }
3069
1.29k
  if (!BaseDef)
3070
7
    return false;
3071
1.28k
3072
1.28k
  int D = 0;
3073
1.28k
  if (
!TII->getIncrementValue(*BaseDef, D) && 1.28k
D >= 0162
)
3074
162
    return false;
3075
1.12k
3076
1.12k
  Delta = D;
3077
1.12k
  return true;
3078
1.12k
}
3079
3080
/// Update the memory operand with a new offset when the pipeliner
3081
/// generates a new copy of the instruction that refers to a
3082
/// different memory location.
3083
void SwingSchedulerDAG::updateMemOperands(MachineInstr &NewMI,
3084
760
                                          MachineInstr &OldMI, unsigned Num) {
3085
760
  if (Num == 0)
3086
354
    return;
3087
406
  // If the instruction has memory operands, then adjust the offset
3088
406
  // when the instruction appears in different stages.
3089
406
  unsigned NumRefs = NewMI.memoperands_end() - NewMI.memoperands_begin();
3090
406
  if (NumRefs == 0)
3091
249
    return;
3092
157
  MachineInstr::mmo_iterator NewMemRefs = MF.allocateMemRefsArray(NumRefs);
3093
157
  unsigned Refs = 0;
3094
193
  for (MachineMemOperand *MMO : NewMI.memoperands()) {
3095
193
    if (
MMO->isVolatile() || 193
(MMO->isInvariant() && 193
MMO->isDereferenceable()0
) ||
3096
193
        
(!MMO->getValue())193
) {
3097
8
      NewMemRefs[Refs++] = MMO;
3098
8
      continue;
3099
8
    }
3100
185
    unsigned Delta;
3101
185
    if (
computeDelta(OldMI, Delta)185
) {
3102
77
      int64_t AdjOffset = Delta * Num;
3103
77
      NewMemRefs[Refs++] =
3104
77
          MF.getMachineMemOperand(MMO, AdjOffset, MMO->getSize());
3105
77
    } else
3106
108
      NewMemRefs[Refs++] = MF.getMachineMemOperand(MMO, 0, UINT64_MAX);
3107
193
  }
3108
760
  NewMI.setMemRefs(NewMemRefs, NewMemRefs + NumRefs);
3109
760
}
3110
3111
/// Clone the instruction for the new pipelined loop and update the
3112
/// memory operands, if needed.
3113
MachineInstr *SwingSchedulerDAG::cloneInstr(MachineInstr *OldMI,
3114
                                            unsigned CurStageNum,
3115
492
                                            unsigned InstStageNum) {
3116
492
  MachineInstr *NewMI = MF.CloneMachineInstr(OldMI);
3117
492
  // Check for tied operands in inline asm instructions. This should be handled
3118
492
  // elsewhere, but I'm not sure of the best solution.
3119
492
  if (OldMI->isInlineAsm())
3120
8
    
for (unsigned i = 0, e = OldMI->getNumOperands(); 2
i != e8
;
++i6
) {
3121
8
      const auto &MO = OldMI->getOperand(i);
3122
8
      if (
MO.isReg() && 8
MO.isUse()2
)
3123
2
        break;
3124
6
      unsigned UseIdx;
3125
6
      if (OldMI->isRegTiedToUseOperand(i, &UseIdx))
3126
0
        NewMI->tieOperands(i, UseIdx);
3127
2
    }
3128
492
  updateMemOperands(*NewMI, *OldMI, CurStageNum - InstStageNum);
3129
492
  return NewMI;
3130
492
}
3131
3132
/// Clone the instruction for the new pipelined loop. If needed, this
3133
/// function updates the instruction using the values saved in the
3134
/// InstrChanges structure.
3135
MachineInstr *SwingSchedulerDAG::cloneAndChangeInstr(MachineInstr *OldMI,
3136
                                                     unsigned CurStageNum,
3137
                                                     unsigned InstStageNum,
3138
268
                                                     SMSchedule &Schedule) {
3139
268
  MachineInstr *NewMI = MF.CloneMachineInstr(OldMI);
3140
268
  DenseMap<SUnit *, std::pair<unsigned, int64_t>>::iterator It =
3141
268
      InstrChanges.find(getSUnit(OldMI));
3142
268
  if (
It != InstrChanges.end()268
) {
3143
7
    std::pair<unsigned, int64_t> RegAndOffset = It->second;
3144
7
    unsigned BasePos, OffsetPos;
3145
7
    if (!TII->getBaseAndOffsetPosition(*OldMI, BasePos, OffsetPos))
3146
0
      return nullptr;
3147
7
    int64_t NewOffset = OldMI->getOperand(OffsetPos).getImm();
3148
7
    MachineInstr *LoopDef = findDefInLoop(RegAndOffset.first);
3149
7
    if (Schedule.stageScheduled(getSUnit(LoopDef)) > (signed)InstStageNum)
3150
7
      NewOffset += RegAndOffset.second * (CurStageNum - InstStageNum);
3151
7
    NewMI->getOperand(OffsetPos).setImm(NewOffset);
3152
7
  }
3153
268
  updateMemOperands(*NewMI, *OldMI, CurStageNum - InstStageNum);
3154
268
  return NewMI;
3155
268
}
3156
3157
/// Update the machine instruction with new virtual registers.  This
3158
/// function may change the defintions and/or uses.
3159
void SwingSchedulerDAG::updateInstruction(MachineInstr *NewMI, bool LastDef,
3160
                                          unsigned CurStageNum,
3161
                                          unsigned InstrStageNum,
3162
                                          SMSchedule &Schedule,
3163
1.08k
                                          ValueMapTy *VRMap) {
3164
4.60k
  for (unsigned i = 0, e = NewMI->getNumOperands(); 
i != e4.60k
;
++i3.52k
) {
3165
3.52k
    MachineOperand &MO = NewMI->getOperand(i);
3166
3.52k
    if (
!MO.isReg() || 3.52k
!TargetRegisterInfo::isVirtualRegister(MO.getReg())2.59k
)
3167
1.54k
      continue;
3168
1.98k
    unsigned reg = MO.getReg();
3169
1.98k
    if (
MO.isDef()1.98k
) {
3170
712
      // Create a new virtual register for the definition.
3171
712
      const TargetRegisterClass *RC = MRI.getRegClass(reg);
3172
712
      unsigned NewReg = MRI.createVirtualRegister(RC);
3173
712
      MO.setReg(NewReg);
3174
712
      VRMap[CurStageNum][reg] = NewReg;
3175
712
      if (LastDef)
3176
79
        replaceRegUsesAfterLoop(reg, NewReg, BB, MRI, LIS);
3177
1.98k
    } else 
if (1.27k
MO.isUse()1.27k
) {
3178
1.27k
      MachineInstr *Def = MRI.getVRegDef(reg);
3179
1.27k
      // Compute the stage that contains the last definition for instruction.
3180
1.27k
      int DefStageNum = Schedule.stageScheduled(getSUnit(Def));
3181
1.27k
      unsigned StageNum = CurStageNum;
3182
1.27k
      if (
DefStageNum != -1 && 1.27k
(int)InstrStageNum > DefStageNum999
) {
3183
150
        // Compute the difference in stages between the defintion and the use.
3184
150
        unsigned StageDiff = (InstrStageNum - DefStageNum);
3185
150
        // Make an adjustment to get the last definition.
3186
150
        StageNum -= StageDiff;
3187
150
      }
3188
1.27k
      if (VRMap[StageNum].count(reg))
3189
371
        MO.setReg(VRMap[StageNum][reg]);
3190
1.27k
    }
3191
3.52k
  }
3192
1.08k
}
3193
3194
/// Return the instruction in the loop that defines the register.
3195
/// If the definition is a Phi, then follow the Phi operand to
3196
/// the instruction in the loop.
3197
16
MachineInstr *SwingSchedulerDAG::findDefInLoop(unsigned Reg) {
3198
16
  SmallPtrSet<MachineInstr *, 8> Visited;
3199
16
  MachineInstr *Def = MRI.getVRegDef(Reg);
3200
25
  while (
Def->isPHI()25
) {
3201
9
    if (!Visited.insert(Def).second)
3202
0
      break;
3203
18
    
for (unsigned i = 1, e = Def->getNumOperands(); 9
i < e18
;
i += 29
)
3204
18
      
if (18
Def->getOperand(i + 1).getMBB() == BB18
) {
3205
9
        Def = MRI.getVRegDef(Def->getOperand(i).getReg());
3206
9
        break;
3207
9
      }
3208
9
  }
3209
16
  return Def;
3210
16
}
3211
3212
/// Return the new name for the value from the previous stage.
3213
unsigned SwingSchedulerDAG::getPrevMapVal(unsigned StageNum, unsigned PhiStage,
3214
                                          unsigned LoopVal, unsigned LoopStage,
3215
                                          ValueMapTy *VRMap,
3216
160
                                          MachineBasicBlock *BB) {
3217
160
  unsigned PrevVal = 0;
3218
160
  if (
StageNum > PhiStage160
) {
3219
18
    MachineInstr *LoopInst = MRI.getVRegDef(LoopVal);
3220
18
    if (
PhiStage == LoopStage && 18
VRMap[StageNum - 1].count(LoopVal)18
)
3221
18
      // The name is defined in the previous stage.
3222
14
      PrevVal = VRMap[StageNum - 1][LoopVal];
3223
4
    else 
if (4
VRMap[StageNum].count(LoopVal)4
)
3224
4
      // The previous name is defined in the current stage when the instruction
3225
4
      // order is swapped.
3226
0
      PrevVal = VRMap[StageNum][LoopVal];
3227
4
    else 
if (4
!LoopInst->isPHI() || 4
LoopInst->getParent() != BB4
)
3228
4
      // The loop value hasn't yet been scheduled.
3229
0
      PrevVal = LoopVal;
3230
4
    else 
if (4
StageNum == PhiStage + 14
)
3231
4
      // The loop value is another phi, which has not been scheduled.
3232
4
      PrevVal = getInitPhiReg(*LoopInst, BB);
3233
0
    else 
if (0
StageNum > PhiStage + 1 && 0
LoopInst->getParent() == BB0
)
3234
0
      // The loop value is another phi, which has been scheduled.
3235
0
      PrevVal =
3236
0
          getPrevMapVal(StageNum - 1, PhiStage, getLoopPhiReg(*LoopInst, BB),
3237
0
                        LoopStage, VRMap, BB);
3238
18
  }
3239
160
  return PrevVal;
3240
160
}
3241
3242
/// Rewrite the Phi values in the specified block to use the mappings
3243
/// from the initial operand. Once the Phi is scheduled, we switch
3244
/// to using the loop value instead of the Phi value, so those names
3245
/// do not need to be rewritten.
3246
void SwingSchedulerDAG::rewritePhiValues(MachineBasicBlock *NewBB,
3247
                                         unsigned StageNum,
3248
                                         SMSchedule &Schedule,
3249
                                         ValueMapTy *VRMap,
3250
89
                                         InstrMapTy &InstrMap) {
3251
89
  for (MachineBasicBlock::iterator BBI = BB->instr_begin(),
3252
89
                                   BBE = BB->getFirstNonPHI();
3253
249
       
BBI != BBE249
;
++BBI160
) {
3254
160
    unsigned InitVal = 0;
3255
160
    unsigned LoopVal = 0;
3256
160
    getPhiRegs(*BBI, BB, InitVal, LoopVal);
3257
160
    unsigned PhiDef = BBI->getOperand(0).getReg();
3258
160
3259
160
    unsigned PhiStage =
3260
160
        (unsigned)Schedule.stageScheduled(getSUnit(MRI.getVRegDef(PhiDef)));
3261
160
    unsigned LoopStage =
3262
160
        (unsigned)Schedule.stageScheduled(getSUnit(MRI.getVRegDef(LoopVal)));
3263
160
    unsigned NumPhis = Schedule.getStagesForPhi(PhiDef);
3264
160
    if (NumPhis > StageNum)
3265
6
      NumPhis = StageNum;
3266
320
    for (unsigned np = 0; 
np <= NumPhis320
;
++np160
) {
3267
160
      unsigned NewVal =
3268
160
          getPrevMapVal(StageNum - np, PhiStage, LoopVal, LoopStage, VRMap, BB);
3269
160
      if (!NewVal)
3270
142
        NewVal = InitVal;
3271
160
      rewriteScheduledInstr(NewBB, Schedule, InstrMap, StageNum - np, np, &*BBI,
3272
160
                            PhiDef, NewVal);
3273
160
    }
3274
160
  }
3275
89
}
3276
3277
/// Rewrite a previously scheduled instruction to use the register value
3278
/// from the new instruction. Make sure the instruction occurs in the
3279
/// basic block, and we don't change the uses in the new instruction.
3280
void SwingSchedulerDAG::rewriteScheduledInstr(
3281
    MachineBasicBlock *BB, SMSchedule &Schedule, InstrMapTy &InstrMap,
3282
    unsigned CurStageNum, unsigned PhiNum, MachineInstr *Phi, unsigned OldReg,
3283
838
    unsigned NewReg, unsigned PrevReg) {
3284
838
  bool InProlog = (CurStageNum < Schedule.getMaxStageCount());
3285
838
  int StagePhi = Schedule.stageScheduled(getSUnit(Phi)) + PhiNum;
3286
838
  // Rewrite uses that have been scheduled already to use the new
3287
838
  // Phi register.
3288
838
  for (MachineRegisterInfo::use_iterator UI = MRI.use_begin(OldReg),
3289
838
                                         EI = MRI.use_end();
3290
2.91k
       
UI != EI2.91k
;) {
3291
2.07k
    MachineOperand &UseOp = *UI;
3292
2.07k
    MachineInstr *UseMI = UseOp.getParent();
3293
2.07k
    ++UI;
3294
2.07k
    if (UseMI->getParent() != BB)
3295
1.04k
      continue;
3296
1.02k
    
if (1.02k
UseMI->isPHI()1.02k
) {
3297
254
      if (
!Phi->isPHI() && 254
UseMI->getOperand(0).getReg() == NewReg242
)
3298
232
        continue;
3299
22
      
if (22
getLoopPhiReg(*UseMI, BB) != OldReg22
)
3300
17
        continue;
3301
780
    }
3302
780
    InstrMapTy::iterator OrigInstr = InstrMap.find(UseMI);
3303
780
    assert(OrigInstr != InstrMap.end() && "Instruction not scheduled.");
3304
780
    SUnit *OrigMISU = getSUnit(OrigInstr->second);
3305
780
    int StageSched = Schedule.stageScheduled(OrigMISU);
3306
780
    int CycleSched = Schedule.cycleScheduled(OrigMISU);
3307
780
    unsigned ReplaceReg = 0;
3308
780
    // This is the stage for the scheduled instruction.
3309
780
    if (
StagePhi == StageSched && 780
Phi->isPHI()502
) {
3310
495
      int CyclePhi = Schedule.cycleScheduled(getSUnit(Phi));
3311
495
      if (
PrevReg && 495
InProlog229
)
3312
0
        ReplaceReg = PrevReg;
3313
495
      else 
if (495
PrevReg && 495
!Schedule.isLoopCarried(this, *Phi)229
&&
3314
0
               
(CyclePhi <= CycleSched || 0
OrigMISU->getInstr()->isPHI()0
))
3315
0
        ReplaceReg = PrevReg;
3316
495
      else
3317
495
        ReplaceReg = NewReg;
3318
495
    }
3319
780
    // The scheduled instruction occurs before the scheduled Phi, and the
3320
780
    // Phi is not loop carried.
3321
780
    if (
!InProlog && 780
StagePhi + 1 == StageSched570
&&
3322
262
        !Schedule.isLoopCarried(this, *Phi))
3323
234
      ReplaceReg = NewReg;
3324
780
    if (
StagePhi > StageSched && 780
Phi->isPHI()14
)
3325
14
      ReplaceReg = NewReg;
3326
780
    if (
!InProlog && 780
!Phi->isPHI()570
&&
StagePhi < StageSched243
)
3327
236
      ReplaceReg = NewReg;
3328
780
    if (
ReplaceReg780
) {
3329
745
      MRI.constrainRegClass(ReplaceReg, MRI.getRegClass(OldReg));
3330
745
      UseOp.setReg(ReplaceReg);
3331
745
    }
3332
2.07k
  }
3333
838
}
3334
3335
/// Check if we can change the instruction to use an offset value from the
3336
/// previous iteration. If so, return true and set the base and offset values
3337
/// so that we can rewrite the load, if necessary.
3338
///   v1 = Phi(v0, v3)
3339
///   v2 = load v1, 0
3340
///   v3 = post_store v1, 4, x
3341
/// This function enables the load to be rewritten as v2 = load v3, 4.
3342
bool SwingSchedulerDAG::canUseLastOffsetValue(MachineInstr *MI,
3343
                                              unsigned &BasePos,
3344
                                              unsigned &OffsetPos,
3345
                                              unsigned &NewBase,
3346
1.36k
                                              int64_t &Offset) {
3347
1.36k
  // Get the load instruction.
3348
1.36k
  if (TII->isPostIncrement(*MI))
3349
66
    return false;
3350
1.29k
  unsigned BasePosLd, OffsetPosLd;
3351
1.29k
  if (!TII->getBaseAndOffsetPosition(*MI, BasePosLd, OffsetPosLd))
3352
1.14k
    return false;
3353
155
  unsigned BaseReg = MI->getOperand(BasePosLd).getReg();
3354
155
3355
155
  // Look for the Phi instruction.
3356
155
  MachineRegisterInfo &MRI = MI->getParent()->getParent()->getRegInfo();
3357
155
  MachineInstr *Phi = MRI.getVRegDef(BaseReg);
3358
155
  if (
!Phi || 155
!Phi->isPHI()155
)
3359
76
    return false;
3360
79
  // Get the register defined in the loop block.
3361
79
  unsigned PrevReg = getLoopPhiReg(*Phi, MI->getParent());
3362
79
  if (!PrevReg)
3363
0
    return false;
3364
79
3365
79
  // Check for the post-increment load/store instruction.
3366
79
  MachineInstr *PrevDef = MRI.getVRegDef(PrevReg);
3367
79
  if (
!PrevDef || 79
PrevDef == MI79
)
3368
0
    return false;
3369
79
3370
79
  
if (79
!TII->isPostIncrement(*PrevDef)79
)
3371
69
    return false;
3372
10
3373
10
  unsigned BasePos1 = 0, OffsetPos1 = 0;
3374
10
  if (!TII->getBaseAndOffsetPosition(*PrevDef, BasePos1, OffsetPos1))
3375
0
    return false;
3376
10
3377
10
  // Make sure offset values are both positive or both negative.
3378
10
  int64_t LoadOffset = MI->getOperand(OffsetPosLd).getImm();
3379
10
  int64_t StoreOffset = PrevDef->getOperand(OffsetPos1).getImm();
3380
10
  if ((LoadOffset >= 0) != (StoreOffset >= 0))
3381
0
    return false;
3382
10
3383
10
  // Set the return value once we determine that we return true.
3384
10
  BasePos = BasePosLd;
3385
10
  OffsetPos = OffsetPosLd;
3386
10
  NewBase = PrevReg;
3387
10
  Offset = StoreOffset;
3388
10
  return true;
3389
10
}
3390
3391
/// Apply changes to the instruction if needed. The changes are need
3392
/// to improve the scheduling and depend up on the final schedule.
3393
MachineInstr *SwingSchedulerDAG::applyInstrChange(MachineInstr *MI,
3394
                                                  SMSchedule &Schedule,
3395
644
                                                  bool UpdateDAG) {
3396
644
  SUnit *SU = getSUnit(MI);
3397
644
  DenseMap<SUnit *, std::pair<unsigned, int64_t>>::iterator It =
3398
644
      InstrChanges.find(SU);
3399
644
  if (
It != InstrChanges.end()644
) {
3400
9
    std::pair<unsigned, int64_t> RegAndOffset = It->second;
3401
9
    unsigned BasePos, OffsetPos;
3402
9
    if (!TII->getBaseAndOffsetPosition(*MI, BasePos, OffsetPos))
3403
0
      return nullptr;
3404
9
    unsigned BaseReg = MI->getOperand(BasePos).getReg();
3405
9
    MachineInstr *LoopDef = findDefInLoop(BaseReg);
3406
9
    int DefStageNum = Schedule.stageScheduled(getSUnit(LoopDef));
3407
9
    int DefCycleNum = Schedule.cycleScheduled(getSUnit(LoopDef));
3408
9
    int BaseStageNum = Schedule.stageScheduled(SU);
3409
9
    int BaseCycleNum = Schedule.cycleScheduled(SU);
3410
9
    if (
BaseStageNum < DefStageNum9
) {
3411
7
      MachineInstr *NewMI = MF.CloneMachineInstr(MI);
3412
7
      int OffsetDiff = DefStageNum - BaseStageNum;
3413
7
      if (
DefCycleNum < BaseCycleNum7
) {
3414
0
        NewMI->getOperand(BasePos).setReg(RegAndOffset.first);
3415
0
        if (OffsetDiff > 0)
3416
0
          --OffsetDiff;
3417
0
      }
3418
7
      int64_t NewOffset =
3419
7
          MI->getOperand(OffsetPos).getImm() + RegAndOffset.second * OffsetDiff;
3420
7
      NewMI->getOperand(OffsetPos).setImm(NewOffset);
3421
7
      if (
UpdateDAG7
) {
3422
7
        SU->setInstr(NewMI);
3423
7
        MISUnitMap[NewMI] = SU;
3424
7
      }
3425
7
      NewMIs.insert(NewMI);
3426
7
      return NewMI;
3427
7
    }
3428
637
  }
3429
637
  return nullptr;
3430
637
}
3431
3432
/// Return true for an order dependence that is loop carried potentially.
3433
/// An order dependence is loop carried if the destination defines a value
3434
/// that may be used by the source in a subsequent iteration.
3435
bool SwingSchedulerDAG::isLoopCarriedOrder(SUnit *Source, const SDep &Dep,
3436
54.6k
                                           bool isSucc) {
3437
54.6k
  if (
!isOrder(Source, Dep) || 54.6k
Dep.isArtificial()52.6k
)
3438
1.90k
    return false;
3439
52.6k
3440
52.6k
  
if (52.6k
!SwpPruneLoopCarried52.6k
)
3441
0
    return true;
3442
52.6k
3443
52.6k
  MachineInstr *SI = Source->getInstr();
3444
52.6k
  MachineInstr *DI = Dep.getSUnit()->getInstr();
3445
52.6k
  if (!isSucc)
3446
52.5k
    std::swap(SI, DI);
3447
52.6k
  assert(SI != nullptr && DI != nullptr && "Expecting SUnit with an MI.");
3448
52.6k
3449
52.6k
  // Assume ordered loads and stores may have a loop carried dependence.
3450
52.6k
  if (
SI->hasUnmodeledSideEffects() || 52.6k
DI->hasUnmodeledSideEffects()52.6k
||
3451
52.6k
      
SI->hasOrderedMemoryRef()52.6k
||
DI->hasOrderedMemoryRef()52.6k
)
3452
48
    return true;
3453
52.6k
3454
52.6k
  // Only chain dependences between a load and store can be loop carried.
3455
52.6k
  
if (52.6k
!DI->mayStore() || 52.6k
!SI->mayLoad()52.6k
)
3456
17.9k
    return false;
3457
34.6k
3458
34.6k
  unsigned DeltaS, DeltaD;
3459
34.6k
  if (
!computeDelta(*SI, DeltaS) || 34.6k
!computeDelta(*DI, DeltaD)923
)
3460
34.5k
    return true;
3461
123
3462
123
  unsigned BaseRegS, BaseRegD;
3463
123
  int64_t OffsetS, OffsetD;
3464
123
  const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
3465
123
  if (!TII->getMemOpBaseRegImmOfs(*SI, BaseRegS, OffsetS, TRI) ||
3466
123
      !TII->getMemOpBaseRegImmOfs(*DI, BaseRegD, OffsetD, TRI))
3467
0
    return true;
3468
123
3469
123
  
if (123
BaseRegS != BaseRegD123
)
3470
49
    return true;
3471
74
3472
74
  uint64_t AccessSizeS = (*SI->memoperands_begin())->getSize();
3473
74
  uint64_t AccessSizeD = (*DI->memoperands_begin())->getSize();
3474
74
3475
74
  // This is the main test, which checks the offset values and the loop
3476
74
  // increment value to determine if the accesses may be loop carried.
3477
74
  if (OffsetS >= OffsetD)
3478
48
    return OffsetS + AccessSizeS > DeltaS;
3479
74
  else
3480
26
    return OffsetD + AccessSizeD > DeltaD;
3481
0
3482
0
  return true;
3483
0
}
3484
3485
113
void SwingSchedulerDAG::postprocessDAG() {
3486
113
  for (auto &M : Mutations)
3487
226
    M->apply(this);
3488
113
}
3489
3490
/// Try to schedule the node at the specified StartCycle and continue
3491
/// until the node is schedule or the EndCycle is reached.  This function
3492
/// returns true if the node is scheduled.  This routine may search either
3493
/// forward or backward for a place to insert the instruction based upon
3494
/// the relative values of StartCycle and EndCycle.
3495
1.38k
bool SMSchedule::insert(SUnit *SU, int StartCycle, int EndCycle, int II) {
3496
1.38k
  bool forward = true;
3497
1.38k
  if (StartCycle > EndCycle)
3498
313
    forward = false;
3499
1.38k
3500
1.38k
  // The terminating condition depends on the direction.
3501
1.38k
  int termCycle = forward ? 
EndCycle + 11.07k
:
EndCycle - 1313
;
3502
2.03k
  for (int curCycle = StartCycle; curCycle != termCycle;
3503
1.38k
       
forward ? 654
++curCycle652
:
--curCycle2
) {
3504
2.01k
3505
2.01k
    // Add the already scheduled instructions at the specified cycle to the DFA.
3506
2.01k
    Resources->clearResources();
3507
2.01k
    for (int checkCycle = FirstCycle + ((curCycle - FirstCycle) % II);
3508
4.10k
         
checkCycle <= LastCycle4.10k
;
checkCycle += II2.08k
) {
3509
2.08k
      std::deque<SUnit *> &cycleInstrs = ScheduledInstrs[checkCycle];
3510
2.08k
3511
2.08k
      for (std::deque<SUnit *>::iterator I = cycleInstrs.begin(),
3512
2.08k
                                         E = cycleInstrs.end();
3513
5.27k
           
I != E5.27k
;
++I3.18k
) {
3514
3.18k
        if (ST.getInstrInfo()->isZeroCost((*I)->getInstr()->getOpcode()))
3515
676
          continue;
3516
3.18k
        assert(Resources->canReserveResources(*(*I)->getInstr()) &&
3517
2.51k
               "These instructions have already been scheduled.");
3518
2.51k
        Resources->reserveResources(*(*I)->getInstr());
3519
2.51k
      }
3520
2.08k
    }
3521
2.01k
    if (ST.getInstrInfo()->isZeroCost(SU->getInstr()->getOpcode()) ||
3522
2.01k
        
Resources->canReserveResources(*SU->getInstr())1.74k
) {
3523
1.36k
      DEBUG({
3524
1.36k
        dbgs() << "\tinsert at cycle " << curCycle << " ";
3525
1.36k
        SU->getInstr()->dump();
3526
1.36k
      });
3527
1.36k
3528
1.36k
      ScheduledInstrs[curCycle].push_back(SU);
3529
1.36k
      InstrToCycle.insert(std::make_pair(SU, curCycle));
3530
1.36k
      if (curCycle > LastCycle)
3531
340
        LastCycle = curCycle;
3532
1.36k
      if (curCycle < FirstCycle)
3533
6
        FirstCycle = curCycle;
3534
1.36k
      return true;
3535
1.36k
    }
3536
654
    
DEBUG654
({
3537
654
      dbgs() << "\tfailed to insert at cycle " << curCycle << " ";
3538
654
      SU->getInstr()->dump();
3539
654
    });
3540
654
  }
3541
23
  return false;
3542
1.38k
}
3543
3544
// Return the cycle of the earliest scheduled instruction in the chain.
3545
11
int SMSchedule::earliestCycleInChain(const SDep &Dep) {
3546
11
  SmallPtrSet<SUnit *, 8> Visited;
3547
11
  SmallVector<SDep, 8> Worklist;
3548
11
  Worklist.push_back(Dep);
3549
11
  int EarlyCycle = INT_MAX;
3550
248
  while (
!Worklist.empty()248
) {
3551
237
    const SDep &Cur = Worklist.pop_back_val();
3552
237
    SUnit *PrevSU = Cur.getSUnit();
3553
237
    if (Visited.count(PrevSU))
3554
97
      continue;
3555
140
    std::map<SUnit *, int>::const_iterator it = InstrToCycle.find(PrevSU);
3556
140
    if (it == InstrToCycle.end())
3557
15
      continue;
3558
125
    EarlyCycle = std::min(EarlyCycle, it->second);
3559
125
    for (const auto &PI : PrevSU->Preds)
3560
353
      
if (353
SwingSchedulerDAG::isOrder(PrevSU, PI)353
)
3561
226
        Worklist.push_back(PI);
3562
237
    Visited.insert(PrevSU);
3563
237
  }
3564
11
  return EarlyCycle;
3565
11
}
3566
3567
// Return the cycle of the latest scheduled instruction in the chain.
3568
86
int SMSchedule::latestCycleInChain(const SDep &Dep) {
3569
86
  SmallPtrSet<SUnit *, 8> Visited;
3570
86
  SmallVector<SDep, 8> Worklist;
3571
86
  Worklist.push_back(Dep);
3572
86
  int LateCycle = INT_MIN;
3573
671
  while (
!Worklist.empty()671
) {
3574
585
    const SDep &Cur = Worklist.pop_back_val();
3575
585
    SUnit *SuccSU = Cur.getSUnit();
3576
585
    if (Visited.count(SuccSU))
3577
136
      continue;
3578
449
    std::map<SUnit *, int>::const_iterator it = InstrToCycle.find(SuccSU);
3579
449
    if (it == InstrToCycle.end())
3580
158
      continue;
3581
291
    LateCycle = std::max(LateCycle, it->second);
3582
291
    for (const auto &SI : SuccSU->Succs)
3583
814
      
if (814
SwingSchedulerDAG::isOrder(SuccSU, SI)814
)
3584
499
        Worklist.push_back(SI);
3585
585
    Visited.insert(SuccSU);
3586
585
  }
3587
86
  return LateCycle;
3588
86
}
3589
3590
/// If an instruction has a use that spans multiple iterations, then
3591
/// return true. These instructions are characterized by having a back-ege
3592
/// to a Phi, which contains a reference to another Phi.
3593
14.3k
static SUnit *multipleIterations(SUnit *SU, SwingSchedulerDAG *DAG) {
3594
14.3k
  for (auto &P : SU->Preds)
3595
23.7k
    
if (23.7k
DAG->isBackedge(SU, P) && 23.7k
P.getSUnit()->getInstr()->isPHI()1.76k
)
3596
1.76k
      for (auto &S : P.getSUnit()->Succs)
3597
8.90k
        
if (8.90k
S.getKind() == SDep::Order && 8.90k
S.getSUnit()->getInstr()->isPHI()140
)
3598
140
          return P.getSUnit();
3599
14.2k
  return nullptr;
3600
14.2k
}
3601
3602
/// Compute the scheduling start slot for the instruction.  The start slot
3603
/// depends on any predecessor or successor nodes scheduled already.
3604
void SMSchedule::computeStart(SUnit *SU, int *MaxEarlyStart, int *MinLateStart,
3605
                              int *MinEnd, int *MaxStart, int II,
3606
1.41k
                              SwingSchedulerDAG *DAG) {
3607
1.41k
  // Iterate over each instruction that has been scheduled already.  The start
3608
1.41k
  // slot computuation depends on whether the previously scheduled instruction
3609
1.41k
  // is a predecessor or successor of the specified instruction.
3610
9.59k
  for (int cycle = getFirstCycle(); 
cycle <= LastCycle9.59k
;
++cycle8.18k
) {
3611
8.18k
3612
8.18k
    // Iterate over each instruction in the current cycle.
3613
9.22k
    for (SUnit *I : getInstructions(cycle)) {
3614
9.22k
      // Because we're processing a DAG for the dependences, we recognize
3615
9.22k
      // the back-edge in recurrences by anti dependences.
3616
23.5k
      for (unsigned i = 0, e = (unsigned)SU->Preds.size(); 
i != e23.5k
;
++i14.3k
) {
3617
14.3k
        const SDep &Dep = SU->Preds[i];
3618
14.3k
        if (
Dep.getSUnit() == I14.3k
) {
3619
977
          if (
!DAG->isBackedge(SU, Dep)977
) {
3620
950
            int EarlyStart = cycle + DAG->getLatency(SU, Dep) -
3621
950
                             DAG->getDistance(Dep.getSUnit(), SU, Dep) * II;
3622
950
            *MaxEarlyStart = std::max(*MaxEarlyStart, EarlyStart);
3623
950
            if (
DAG->isLoopCarriedOrder(SU, Dep, false)950
) {
3624
11
              int End = earliestCycleInChain(Dep) + (II - 1);
3625
11
              *MinEnd = std::min(*MinEnd, End);
3626
11
            }
3627
977
          } else {
3628
27
            int LateStart = cycle - DAG->getLatency(SU, Dep) +
3629
27
                            DAG->getDistance(SU, Dep.getSUnit(), Dep) * II;
3630
27
            *MinLateStart = std::min(*MinLateStart, LateStart);
3631
27
          }
3632
977
        }
3633
14.3k
        // For instruction that requires multiple iterations, make sure that
3634
14.3k
        // the dependent instruction is not scheduled past the definition.
3635
14.3k
        SUnit *BE = multipleIterations(I, DAG);
3636
14.3k
        if (
BE && 14.3k
Dep.getSUnit() == BE140
&&
!SU->getInstr()->isPHI()24
&&
3637
8
            !SU->isPred(I))
3638
8
          *MinLateStart = std::min(*MinLateStart, cycle);
3639
14.3k
      }
3640
25.1k
      for (unsigned i = 0, e = (unsigned)SU->Succs.size(); 
i != e25.1k
;
++i15.9k
)
3641
15.9k
        
if (15.9k
SU->Succs[i].getSUnit() == I15.9k
) {
3642
879
          const SDep &Dep = SU->Succs[i];
3643
879
          if (
!DAG->isBackedge(SU, Dep)879
) {
3644
675
            int LateStart = cycle - DAG->getLatency(SU, Dep) +
3645
675
                            DAG->getDistance(SU, Dep.getSUnit(), Dep) * II;
3646
675
            *MinLateStart = std::min(*MinLateStart, LateStart);
3647
675
            if (
DAG->isLoopCarriedOrder(SU, Dep)675
) {
3648
86
              int Start = latestCycleInChain(Dep) + 1 - II;
3649
86
              *MaxStart = std::max(*MaxStart, Start);
3650
86
            }
3651
879
          } else {
3652
204
            int EarlyStart = cycle + DAG->getLatency(SU, Dep) -
3653
204
                             DAG->getDistance(Dep.getSUnit(), SU, Dep) * II;
3654
204
            *MaxEarlyStart = std::max(*MaxEarlyStart, EarlyStart);
3655
204
          }
3656
15.9k
        }
3657
9.22k
    }
3658
8.18k
  }
3659
1.41k
}
3660
3661
/// Order the instructions within a cycle so that the definitions occur
3662
/// before the uses. Returns true if the instruction is added to the start
3663
/// of the list, or false if added to the end.
3664
bool SMSchedule::orderDependence(SwingSchedulerDAG *SSD, SUnit *SU,
3665
732
                                 std::deque<SUnit *> &Insts) {
3666
732
  MachineInstr *MI = SU->getInstr();
3667
732
  bool OrderBeforeUse = false;
3668
732
  bool OrderAfterDef = false;
3669
732
  bool OrderBeforeDef = false;
3670
732
  unsigned MoveDef = 0;
3671
732
  unsigned MoveUse = 0;
3672
732
  int StageInst1 = stageScheduled(SU);
3673
732
3674
732
  unsigned Pos = 0;
3675
1.32k
  for (std::deque<SUnit *>::iterator I = Insts.begin(), E = Insts.end(); I != E;
3676
732
       
++I, ++Pos597
) {
3677
597
    // Relative order of Phis does not matter.
3678
597
    if (
MI->isPHI() && 597
(*I)->getInstr()->isPHI()81
)
3679
76
      continue;
3680
2.17k
    
for (unsigned i = 0, e = MI->getNumOperands(); 521
i < e2.17k
;
++i1.65k
) {
3681
1.65k
      MachineOperand &MO = MI->getOperand(i);
3682
1.65k
      if (
!MO.isReg() || 1.65k
!TargetRegisterInfo::isVirtualRegister(MO.getReg())1.22k
)
3683
463
        continue;
3684
1.19k
      unsigned Reg = MO.getReg();
3685
1.19k
      unsigned BasePos, OffsetPos;
3686
1.19k
      if (ST.getInstrInfo()->getBaseAndOffsetPosition(*MI, BasePos, OffsetPos))
3687
323
        
if (323
MI->getOperand(BasePos).getReg() == Reg323
)
3688
180
          
if (unsigned 180
NewReg180
= SSD->getInstrBaseReg(SU))
3689
16
            Reg = NewReg;
3690
1.19k
      bool Reads, Writes;
3691
1.19k
      std::tie(Reads, Writes) =
3692
1.19k
          (*I)->getInstr()->readsWritesVirtualRegister(Reg);
3693
1.19k
      if (
MO.isDef() && 1.19k
Reads456
&&
stageScheduled(*I) <= StageInst1120
) {
3694
31
        OrderBeforeUse = true;
3695
31
        MoveUse = Pos;
3696
1.19k
      } else 
if (1.16k
MO.isDef() && 1.16k
Reads425
&&
stageScheduled(*I) > StageInst189
) {
3697
89
        // Add the instruction after the scheduled instruction.
3698
89
        OrderAfterDef = true;
3699
89
        MoveDef = Pos;
3700
1.16k
      } else 
if (1.07k
MO.isUse() && 1.07k
Writes739
&&
stageScheduled(*I) == StageInst159
) {
3701
9
        if (
cycleScheduled(*I) == cycleScheduled(SU) && 9
!(*I)->isSucc(SU)9
) {
3702
0
          OrderBeforeUse = true;
3703
0
          MoveUse = Pos;
3704
9
        } else {
3705
9
          OrderAfterDef = true;
3706
9
          MoveDef = Pos;
3707
9
        }
3708
1.07k
      } else 
if (1.06k
MO.isUse() && 1.06k
Writes730
&&
stageScheduled(*I) > StageInst150
) {
3709
7
        OrderBeforeUse = true;
3710
7
        MoveUse = Pos;
3711
7
        if (
MoveUse != 07
) {
3712
7
          OrderAfterDef = true;
3713
7
          MoveDef = Pos - 1;
3714
7
        }
3715
1.06k
      } else 
if (1.05k
MO.isUse() && 1.05k
Writes723
&&
stageScheduled(*I) < StageInst143
) {
3716
43
        // Add the instruction before the scheduled instruction.
3717
43
        OrderBeforeUse = true;
3718
43
        MoveUse = Pos;
3719
1.05k
      } else 
if (1.01k
MO.isUse() && 1.01k
stageScheduled(*I) == StageInst1680
&&
3720
1.01k
                 
isLoopCarriedDefOfUse(SSD, (*I)->getInstr(), MO)349
) {
3721
57
        OrderBeforeDef = true;
3722
57
        MoveUse = Pos;
3723
57
      }
3724
1.65k
    }
3725
521
    // Check for order dependences between instructions. Make sure the source
3726
521
    // is ordered before the destination.
3727
521
    for (auto &S : SU->Succs)
3728
420
      
if (420
S.getKind() == SDep::Order420
) {
3729
69
        if (
S.getSUnit() == *I && 69
stageScheduled(*I) == StageInst110
) {
3730
10
          OrderBeforeUse = true;
3731
10
          MoveUse = Pos;
3732
10
        }
3733
420
      } else 
if (351
TargetRegisterInfo::isPhysicalRegister(S.getReg())351
) {
3734
0
        if (
cycleScheduled(SU) != cycleScheduled(S.getSUnit())0
) {
3735
0
          if (
S.isAssignedRegDep()0
) {
3736
0
            OrderAfterDef = true;
3737
0
            MoveDef = Pos;
3738
0
          }
3739
0
        } else {
3740
0
          OrderBeforeUse = true;
3741
0
          MoveUse = Pos;
3742
0
        }
3743
420
      }
3744
521
    for (auto &P : SU->Preds)
3745
774
      
if (774
P.getKind() == SDep::Order774
) {
3746
43
        if (
P.getSUnit() == *I && 43
stageScheduled(*I) == StageInst11
) {
3747
1
          OrderAfterDef = true;
3748
1
          MoveDef = Pos;
3749
1
        }
3750
774
      } else 
if (731
TargetRegisterInfo::isPhysicalRegister(P.getReg())731
) {
3751
0
        if (
cycleScheduled(SU) != cycleScheduled(P.getSUnit())0
) {
3752
0
          if (
P.isAssignedRegDep()0
) {
3753
0
            OrderBeforeUse = true;
3754
0
            MoveUse = Pos;
3755
0
          }
3756
0
        } else {
3757
0
          OrderAfterDef = true;
3758
0
          MoveDef = Pos;
3759
0
        }
3760
774
      }
3761
597
  }
3762
732
3763
732
  // A circular dependence.
3764
732
  if (
OrderAfterDef && 732
OrderBeforeUse99
&&
MoveUse == MoveDef7
)
3765
0
    OrderBeforeUse = false;
3766
732
3767
732
  // OrderAfterDef takes precedences over OrderBeforeDef. The latter is due
3768
732
  // to a loop-carried dependence.
3769
732
  if (OrderBeforeDef)
3770
56
    
OrderBeforeUse = !OrderAfterDef || 56
(MoveUse > MoveDef)37
;
3771
732
3772
732
  // The uncommon case when the instruction order needs to be updated because
3773
732
  // there is both a use and def.
3774
732
  if (
OrderBeforeUse && 732
OrderAfterDef135
) {
3775
44
    SUnit *UseSU = Insts.at(MoveUse);
3776
44
    SUnit *DefSU = Insts.at(MoveDef);
3777
44
    if (
MoveUse > MoveDef44
) {
3778
44
      Insts.erase(Insts.begin() + MoveUse);
3779
44
      Insts.erase(Insts.begin() + MoveDef);
3780
44
    } else {
3781
0
      Insts.erase(Insts.begin() + MoveDef);
3782
0
      Insts.erase(Insts.begin() + MoveUse);
3783
0
    }
3784
44
    if (
orderDependence(SSD, UseSU, Insts)44
) {
3785
0
      Insts.push_front(SU);
3786
0
      orderDependence(SSD, DefSU, Insts);
3787
0
      return true;
3788
0
    }
3789
44
    Insts.pop_back();
3790
44
    Insts.push_back(SU);
3791
44
    Insts.push_back(UseSU);
3792
44
    orderDependence(SSD, DefSU, Insts);
3793
44
    return false;
3794
44
  }
3795
688
  // Put the new instruction first if there is a use in the list. Otherwise,
3796
688
  // put it at the end of the list.
3797
688
  
if (688
OrderBeforeUse688
)
3798
91
    Insts.push_front(SU);
3799
688
  else
3800
597
    Insts.push_back(SU);
3801
732
  return OrderBeforeUse;
3802
732
}
3803
3804
/// Return true if the scheduled Phi has a loop carried operand.
3805
1.00k
bool SMSchedule::isLoopCarried(SwingSchedulerDAG *SSD, MachineInstr &Phi) {
3806
1.00k
  if (!Phi.isPHI())
3807
234
    return false;
3808
1.00k
  assert(Phi.isPHI() && "Expecing a Phi.");
3809
770
  SUnit *DefSU = SSD->getSUnit(&Phi);
3810
770
  unsigned DefCycle = cycleScheduled(DefSU);
3811
770
  int DefStage = stageScheduled(DefSU);
3812
770
3813
770
  unsigned InitVal = 0;
3814
770
  unsigned LoopVal = 0;
3815
770
  getPhiRegs(Phi, Phi.getParent(), InitVal, LoopVal);
3816
770
  SUnit *UseSU = SSD->getSUnit(MRI.getVRegDef(LoopVal));
3817
770
  if (!UseSU)
3818
6
    return true;
3819
764
  
if (764
UseSU->getInstr()->isPHI()764
)
3820
14
    return true;
3821
750
  unsigned LoopCycle = cycleScheduled(UseSU);
3822
750
  int LoopStage = stageScheduled(UseSU);
3823
724
  return (LoopCycle > DefCycle) || (LoopStage <= DefStage);
3824
1.00k
}
3825
3826
/// Return true if the instruction is a definition that is loop carried
3827
/// and defines the use on the next iteration.
3828
///        v1 = phi(v2, v3)
3829
///  (Def) v3 = op v1
3830
///  (MO)   = v1
3831
/// If MO appears before Def, then then v1 and v3 may get assigned to the same
3832
/// register.
3833
bool SMSchedule::isLoopCarriedDefOfUse(SwingSchedulerDAG *SSD,
3834
349
                                       MachineInstr *Def, MachineOperand &MO) {
3835
349
  if (!MO.isReg())
3836
0
    return false;
3837
349
  
if (349
Def->isPHI()349
)
3838
0
    return false;
3839
349
  MachineInstr *Phi = MRI.getVRegDef(MO.getReg());
3840
349
  if (
!Phi || 349
!Phi->isPHI()349
||
Phi->getParent() != Def->getParent()184
)
3841
166
    return false;
3842
183
  
if (183
!isLoopCarried(SSD, *Phi)183
)
3843
2
    return false;
3844
181
  unsigned LoopReg = getLoopPhiReg(*Phi, Phi->getParent());
3845
608
  for (unsigned i = 0, e = Def->getNumOperands(); 
i != e608
;
++i427
) {
3846
484
    MachineOperand &DMO = Def->getOperand(i);
3847
484
    if (
!DMO.isReg() || 484
!DMO.isDef()381
)
3848
296
      continue;
3849
188
    
if (188
DMO.getReg() == LoopReg188
)
3850
57
      return true;
3851
484
  }
3852
124
  return false;
3853
349
}
3854
3855
// Check if the generated schedule is valid. This function checks if
3856
// an instruction that uses a physical register is scheduled in a
3857
// different stage than the definition. The pipeliner does not handle
3858
// physical register values that may cross a basic block boundary.
3859
107
bool SMSchedule::isValidSchedule(SwingSchedulerDAG *SSD) {
3860
751
  for (int i = 0, e = SSD->SUnits.size(); 
i < e751
;
++i644
) {
3861
644
    SUnit &SU = SSD->SUnits[i];
3862
644
    if (!SU.hasPhysRegDefs)
3863
644
      continue;
3864
0
    int StageDef = stageScheduled(&SU);
3865
0
    assert(StageDef != -1 && "Instruction should have been scheduled.");
3866
0
    for (auto &SI : SU.Succs)
3867
0
      
if (0
SI.isAssignedRegDep()0
)
3868
0
        
if (0
ST.getRegisterInfo()->isPhysicalRegister(SI.getReg())0
)
3869
0
          
if (0
stageScheduled(SI.getSUnit()) != StageDef0
)
3870
0
            return false;
3871
644
  }
3872
107
  return true;
3873
107
}
3874
3875
/// After the schedule has been formed, call this function to combine
3876
/// the instructions from the different stages/cycles.  That is, this
3877
/// function creates a schedule that represents a single iteration.
3878
107
void SMSchedule::finalizeSchedule(SwingSchedulerDAG *SSD) {
3879
107
  // Move all instructions to the first stage from later stages.
3880
291
  for (int cycle = getFirstCycle(); 
cycle <= getFinalCycle()291
;
++cycle184
) {
3881
324
    for (int stage = 1, lastStage = getMaxStageCount(); stage <= lastStage;
3882
184
         
++stage140
) {
3883
140
      std::deque<SUnit *> &cycleInstrs =
3884
140
          ScheduledInstrs[cycle + (stage * InitiationInterval)];
3885
140
      for (std::deque<SUnit *>::reverse_iterator I = cycleInstrs.rbegin(),
3886
140
                                                 E = cycleInstrs.rend();
3887
306
           
I != E306
;
++I166
)
3888
166
        ScheduledInstrs[cycle].push_front(*I);
3889
140
    }
3890
184
  }
3891
107
  // Iterate over the definitions in each instruction, and compute the
3892
107
  // stage difference for each use.  Keep the maximum value.
3893
644
  for (auto &I : InstrToCycle) {
3894
644
    int DefStage = stageScheduled(I.first);
3895
644
    MachineInstr *MI = I.first->getInstr();
3896
3.10k
    for (unsigned i = 0, e = MI->getNumOperands(); 
i < e3.10k
;
++i2.45k
) {
3897
2.45k
      MachineOperand &Op = MI->getOperand(i);
3898
2.45k
      if (
!Op.isReg() || 2.45k
!Op.isDef()1.71k
)
3899
1.83k
        continue;
3900
617
3901
617
      unsigned Reg = Op.getReg();
3902
617
      unsigned MaxDiff = 0;
3903
617
      bool PhiIsSwapped = false;
3904
617
      for (MachineRegisterInfo::use_iterator UI = MRI.use_begin(Reg),
3905
617
                                             EI = MRI.use_end();
3906
1.43k
           
UI != EI1.43k
;
++UI817
) {
3907
817
        MachineOperand &UseOp = *UI;
3908
817
        MachineInstr *UseMI = UseOp.getParent();
3909
817
        SUnit *SUnitUse = SSD->getSUnit(UseMI);
3910
817
        int UseStage = stageScheduled(SUnitUse);
3911
817
        unsigned Diff = 0;
3912
817
        if (
UseStage != -1 && 817
UseStage >= DefStage782
)
3913
773
          Diff = UseStage - DefStage;
3914
817
        if (
MI->isPHI()817
) {
3915
330
          if (isLoopCarried(SSD, *MI))
3916
327
            ++Diff;
3917
330
          else
3918
3
            PhiIsSwapped = true;
3919
330
        }
3920
817
        MaxDiff = std::max(Diff, MaxDiff);
3921
817
      }
3922
2.45k
      RegToStageDiff[Reg] = std::make_pair(MaxDiff, PhiIsSwapped);
3923
2.45k
    }
3924
644
  }
3925
107
3926
107
  // Erase all the elements in the later stages. Only one iteration should
3927
107
  // remain in the scheduled list, and it contains all the instructions.
3928
221
  for (int cycle = getFinalCycle() + 1; 
cycle <= LastCycle221
;
++cycle114
)
3929
114
    ScheduledInstrs.erase(cycle);
3930
107
3931
107
  // Change the registers in instruction as specified in the InstrChanges
3932
107
  // map. We need to use the new registers to create the correct order.
3933
751
  for (int i = 0, e = SSD->SUnits.size(); 
i != e751
;
++i644
) {
3934
644
    SUnit *SU = &SSD->SUnits[i];
3935
644
    SSD->applyInstrChange(SU->getInstr(), *this, true);
3936
644
  }
3937
107
3938
107
  // Reorder the instructions in each cycle to fix and improve the
3939
107
  // generated code.
3940
291
  for (int Cycle = getFirstCycle(), E = getFinalCycle(); 
Cycle <= E291
;
++Cycle184
) {
3941
184
    std::deque<SUnit *> &cycleInstrs = ScheduledInstrs[Cycle];
3942
184
    std::deque<SUnit *> newOrderZC;
3943
184
    // Put the zero-cost, pseudo instructions at the start of the cycle.
3944
828
    for (unsigned i = 0, e = cycleInstrs.size(); 
i < e828
;
++i644
) {
3945
644
      SUnit *SU = cycleInstrs[i];
3946
644
      if (ST.getInstrInfo()->isZeroCost(SU->getInstr()->getOpcode()))
3947
193
        orderDependence(SSD, SU, newOrderZC);
3948
644
    }
3949
184
    std::deque<SUnit *> newOrderI;
3950
184
    // Then, add the regular instructions back.
3951
828
    for (unsigned i = 0, e = cycleInstrs.size(); 
i < e828
;
++i644
) {
3952
644
      SUnit *SU = cycleInstrs[i];
3953
644
      if (!ST.getInstrInfo()->isZeroCost(SU->getInstr()->getOpcode()))
3954
451
        orderDependence(SSD, SU, newOrderI);
3955
644
    }
3956
184
    // Replace the old order with the new order.
3957
184
    cycleInstrs.swap(newOrderZC);
3958
184
    cycleInstrs.insert(cycleInstrs.end(), newOrderI.begin(), newOrderI.end());
3959
184
  }
3960
107
3961
107
  DEBUG(dump(););
3962
107
}
3963
3964
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
3965
/// Print the schedule information to the given output.
3966
void SMSchedule::print(raw_ostream &os) const {
3967
  // Iterate over each cycle.
3968
  for (int cycle = getFirstCycle(); cycle <= getFinalCycle(); ++cycle) {
3969
    // Iterate over each instruction in the cycle.
3970
    const_sched_iterator cycleInstrs = ScheduledInstrs.find(cycle);
3971
    for (SUnit *CI : cycleInstrs->second) {
3972
      os << "cycle " << cycle << " (" << stageScheduled(CI) << ") ";
3973
      os << "(" << CI->NodeNum << ") ";
3974
      CI->getInstr()->print(os);
3975
      os << "\n";
3976
    }
3977
  }
3978
}
3979
3980
/// Utility function used for debugging to print the schedule.
3981
LLVM_DUMP_METHOD void SMSchedule::dump() const { print(dbgs()); }
3982
#endif