Coverage Report

Created: 2019-07-24 05:18

/Users/buildslave/jenkins/workspace/clang-stage2-coverage-R/llvm/lib/Target/AMDGPU/SIMachineScheduler.cpp
Line
Count
Source (jump to first uncovered line)
1
//===-- SIMachineScheduler.cpp - SI Scheduler Interface -------------------===//
2
//
3
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4
// See https://llvm.org/LICENSE.txt for license information.
5
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6
//
7
//===----------------------------------------------------------------------===//
8
//
9
/// \file
10
/// SI Machine Scheduler interface
11
//
12
//===----------------------------------------------------------------------===//
13
14
#include "SIMachineScheduler.h"
15
#include "AMDGPU.h"
16
#include "SIInstrInfo.h"
17
#include "SIRegisterInfo.h"
18
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
19
#include "llvm/ADT/STLExtras.h"
20
#include "llvm/ADT/SmallVector.h"
21
#include "llvm/CodeGen/LiveInterval.h"
22
#include "llvm/CodeGen/LiveIntervals.h"
23
#include "llvm/CodeGen/MachineInstr.h"
24
#include "llvm/CodeGen/MachineRegisterInfo.h"
25
#include "llvm/CodeGen/MachineScheduler.h"
26
#include "llvm/CodeGen/RegisterPressure.h"
27
#include "llvm/CodeGen/SlotIndexes.h"
28
#include "llvm/CodeGen/TargetRegisterInfo.h"
29
#include "llvm/Support/Debug.h"
30
#include "llvm/Support/ErrorHandling.h"
31
#include "llvm/Support/raw_ostream.h"
32
#include <algorithm>
33
#include <cassert>
34
#include <map>
35
#include <set>
36
#include <utility>
37
#include <vector>
38
39
using namespace llvm;
40
41
#define DEBUG_TYPE "machine-scheduler"
42
43
// This scheduler implements a different scheduling algorithm than
44
// GenericScheduler.
45
//
46
// There are several specific architecture behaviours that can't be modelled
47
// for GenericScheduler:
48
// . When accessing the result of an SGPR load instruction, you have to wait
49
// for all the SGPR load instructions before your current instruction to
50
// have finished.
51
// . When accessing the result of an VGPR load instruction, you have to wait
52
// for all the VGPR load instructions previous to the VGPR load instruction
53
// you are interested in to finish.
54
// . The lower the register pressure, the better load latencies are hidden
55
//
56
// Moreover some specificities (like the fact that a lot of instructions in the shader
57
// have few dependencies) makes the generic scheduler have some unpredictable
58
// behaviours. For example when register pressure becomes high, it can either
59
// manage to prevent register pressure from going too high, or it can
60
// increase register pressure even more than if it hadn't taken register
61
// pressure into account.
62
//
63
// Also some other bad behaviours are generated, like loading at the beginning
64
// of the shader a constant in VGPR you won't need until the end of the shader.
65
//
66
// The scheduling problem for SI can distinguish three main parts:
67
// . Hiding high latencies (texture sampling, etc)
68
// . Hiding low latencies (SGPR constant loading, etc)
69
// . Keeping register usage low for better latency hiding and general
70
//   performance
71
//
72
// Some other things can also affect performance, but are hard to predict
73
// (cache usage, the fact the HW can issue several instructions from different
74
// wavefronts if different types, etc)
75
//
76
// This scheduler tries to solve the scheduling problem by dividing it into
77
// simpler sub-problems. It divides the instructions into blocks, schedules
78
// locally inside the blocks where it takes care of low latencies, and then
79
// chooses the order of the blocks by taking care of high latencies.
80
// Dividing the instructions into blocks helps control keeping register
81
// usage low.
82
//
83
// First the instructions are put into blocks.
84
//   We want the blocks help control register usage and hide high latencies
85
//   later. To help control register usage, we typically want all local
86
//   computations, when for example you create a result that can be consumed
87
//   right away, to be contained in a block. Block inputs and outputs would
88
//   typically be important results that are needed in several locations of
89
//   the shader. Since we do want blocks to help hide high latencies, we want
90
//   the instructions inside the block to have a minimal set of dependencies
91
//   on high latencies. It will make it easy to pick blocks to hide specific
92
//   high latencies.
93
//   The block creation algorithm is divided into several steps, and several
94
//   variants can be tried during the scheduling process.
95
//
96
// Second the order of the instructions inside the blocks is chosen.
97
//   At that step we do take into account only register usage and hiding
98
//   low latency instructions
99
//
100
// Third the block order is chosen, there we try to hide high latencies
101
// and keep register usage low.
102
//
103
// After the third step, a pass is done to improve the hiding of low
104
// latencies.
105
//
106
// Actually when talking about 'low latency' or 'high latency' it includes
107
// both the latency to get the cache (or global mem) data go to the register,
108
// and the bandwidth limitations.
109
// Increasing the number of active wavefronts helps hide the former, but it
110
// doesn't solve the latter, thus why even if wavefront count is high, we have
111
// to try have as many instructions hiding high latencies as possible.
112
// The OpenCL doc says for example latency of 400 cycles for a global mem access,
113
// which is hidden by 10 instructions if the wavefront count is 10.
114
115
// Some figures taken from AMD docs:
116
// Both texture and constant L1 caches are 4-way associative with 64 bytes
117
// lines.
118
// Constant cache is shared with 4 CUs.
119
// For texture sampling, the address generation unit receives 4 texture
120
// addresses per cycle, thus we could expect texture sampling latency to be
121
// equivalent to 4 instructions in the very best case (a VGPR is 64 work items,
122
// instructions in a wavefront group are executed every 4 cycles),
123
// or 16 instructions if the other wavefronts associated to the 3 other VALUs
124
// of the CU do texture sampling too. (Don't take these figures too seriously,
125
// as I'm not 100% sure of the computation)
126
// Data exports should get similar latency.
127
// For constant loading, the cache is shared with 4 CUs.
128
// The doc says "a throughput of 16B/cycle for each of the 4 Compute Unit"
129
// I guess if the other CU don't read the cache, it can go up to 64B/cycle.
130
// It means a simple s_buffer_load should take one instruction to hide, as
131
// well as a s_buffer_loadx2 and potentially a s_buffer_loadx8 if on the same
132
// cache line.
133
//
134
// As of today the driver doesn't preload the constants in cache, thus the
135
// first loads get extra latency. The doc says global memory access can be
136
// 300-600 cycles. We do not specially take that into account when scheduling
137
// As we expect the driver to be able to preload the constants soon.
138
139
// common code //
140
141
#ifndef NDEBUG
142
143
// Debug-only: map a SIScheduleCandReason enum value to a short printable tag
// used by traceCandidate().
static const char *getReasonStr(SIScheduleCandReason Reason) {
  switch (Reason) {
  case NoCand:         return "NOCAND";
  case RegUsage:       return "REGUSAGE";
  case Latency:        return "LATENCY";
  case Successor:      return "SUCCESSOR";
  case Depth:          return "DEPTH";
  case NodeOrder:      return "ORDER";
  }
  // All enumerators are handled above; reaching here means a new reason was
  // added without updating this function.
  llvm_unreachable("Unknown reason!");
}
154
155
#endif
156
157
namespace llvm {
158
namespace SISched {
159
static bool tryLess(int TryVal, int CandVal,
160
                    SISchedulerCandidate &TryCand,
161
                    SISchedulerCandidate &Cand,
162
85
                    SIScheduleCandReason Reason) {
163
85
  if (TryVal < CandVal) {
164
4
    TryCand.Reason = Reason;
165
4
    return true;
166
4
  }
167
81
  if (TryVal > CandVal) {
168
0
    if (Cand.Reason > Reason)
169
0
      Cand.Reason = Reason;
170
0
    return true;
171
0
  }
172
81
  Cand.setRepeat(Reason);
173
81
  return false;
174
81
}
175
176
static bool tryGreater(int TryVal, int CandVal,
177
                       SISchedulerCandidate &TryCand,
178
                       SISchedulerCandidate &Cand,
179
44
                       SIScheduleCandReason Reason) {
180
44
  if (TryVal > CandVal) {
181
5
    TryCand.Reason = Reason;
182
5
    return true;
183
5
  }
184
39
  if (TryVal < CandVal) {
185
0
    if (Cand.Reason > Reason)
186
0
      Cand.Reason = Reason;
187
0
    return true;
188
0
  }
189
39
  Cand.setRepeat(Reason);
190
39
  return false;
191
39
}
192
} // end namespace SISched
193
} // end namespace llvm
194
195
// SIScheduleBlock //
196
197
63
// Append \p SU to this block and remember the index it occupies so that
// NodeNum2Index can later translate a DAG node number into a block-local slot.
void SIScheduleBlock::addUnit(SUnit *SU) {
  const unsigned Slot = SUnits.size();   // Index the SU will occupy.
  NodeNum2Index[SU->NodeNum] = Slot;
  SUnits.push_back(SU);
}
201
202
#ifndef NDEBUG
203
// Debug-only: print the node number of a scheduling candidate together with
// the reason it was (or was not) selected.
void SIScheduleBlock::traceCandidate(const SISchedCandidate &Cand) {

  dbgs() << "  SU(" << Cand.SU->NodeNum << ") " << getReasonStr(Cand.Reason);
  dbgs() << '\n';
}
208
#endif
209
210
// Compare TryCand against the current best Cand for top-down in-block
// scheduling. Criteria are applied in strict priority order; the first
// decisive one sets TryCand.Reason (meaning TryCand should replace Cand)
// or returns with Cand still preferred.
void SIScheduleBlock::tryCandidateTopDown(SISchedCandidate &Cand,
                                          SISchedCandidate &TryCand) {
  // Initialize the candidate if needed.
  if (!Cand.isValid()) {
    TryCand.Reason = NodeOrder;
    return;
  }

  // Under SGPR pressure (above the arbitrary threshold of 60), prefer the
  // candidate that results in lower SGPR usage.
  if (Cand.SGPRUsage > 60 &&
      SISched::tryLess(TryCand.SGPRUsage, Cand.SGPRUsage,
                       TryCand, Cand, RegUsage))
    return;

  // Schedule low latency instructions as top as possible.
  // Order of priority is:
  // . Low latency instructions which do not depend on other low latency
  //   instructions we haven't waited for
  // . Other instructions which do not depend on low latency instructions
  //   we haven't waited for
  // . Low latencies
  // . All other instructions
  // Goal is to get: low latency instructions - independent instructions
  //     - (eventually some more low latency instructions)
  //     - instructions that depend on the first low latency instructions.
  // If in the block there is a lot of constant loads, the SGPR usage
  // could go quite high, thus above the arbitrary limit of 60 will encourage
  // use the already loaded constants (in order to release some SGPRs) before
  // loading more.
  if (SISched::tryLess(TryCand.HasLowLatencyNonWaitedParent,
                       Cand.HasLowLatencyNonWaitedParent,
                       TryCand, Cand, SIScheduleCandReason::Depth))
    return;

  if (SISched::tryGreater(TryCand.IsLowLatency, Cand.IsLowLatency,
                          TryCand, Cand, SIScheduleCandReason::Depth))
    return;

  // Among low latency instructions, prefer the smaller memory offset.
  if (TryCand.IsLowLatency &&
      SISched::tryLess(TryCand.LowLatencyOffset, Cand.LowLatencyOffset,
                       TryCand, Cand, SIScheduleCandReason::Depth))
    return;

  // Lower predicted VGPR usage wins next.
  if (SISched::tryLess(TryCand.VGPRUsage, Cand.VGPRUsage,
                       TryCand, Cand, RegUsage))
    return;

  // Fall through to original instruction order.
  if (TryCand.SU->NodeNum < Cand.SU->NodeNum) {
    TryCand.Reason = NodeOrder;
  }
}
261
262
63
// Select the next SU to schedule from TopReadySUs: build a candidate for each
// ready node (predicting register pressure after it) and keep the best one
// according to tryCandidateTopDown().
SUnit* SIScheduleBlock::pickNode() {
  SISchedCandidate TopCand;

  for (SUnit* SU : TopReadySUs) {
    SISchedCandidate TryCand;
    std::vector<unsigned> pressure;
    std::vector<unsigned> MaxPressure;
    // Predict register usage after this instruction.
    TryCand.SU = SU;
    TopRPTracker.getDownwardPressure(SU->getInstr(), pressure, MaxPressure);
    TryCand.SGPRUsage = pressure[DAG->getSGPRSetID()];
    TryCand.VGPRUsage = pressure[DAG->getVGPRSetID()];
    TryCand.IsLowLatency = DAG->IsLowLatencySU[SU->NodeNum];
    TryCand.LowLatencyOffset = DAG->LowLatencyOffset[SU->NodeNum];
    TryCand.HasLowLatencyNonWaitedParent =
      HasLowLatencyNonWaitedParent[NodeNum2Index[SU->NodeNum]];
    tryCandidateTopDown(TopCand, TryCand);
    // A non-NoCand reason means TryCand beat the incumbent.
    if (TryCand.Reason != NoCand)
      TopCand.setBest(TryCand);
  }

  return TopCand.SU;
}
285
286
287
// Schedule something valid.
288
21
// Produce a quick, dependency-valid (but not optimized) schedule of the block:
// repeatedly emit the first ready SU. Used before register-pressure
// information is available.
void SIScheduleBlock::fastSchedule() {
  TopReadySUs.clear();
  if (Scheduled)
    undoSchedule();

  // Seed the ready list with SUs that have no unscheduled predecessors.
  for (SUnit* SU : SUnits) {
    if (!SU->NumPredsLeft)
      TopReadySUs.push_back(SU);
  }

  // nodeScheduled() pops the SU from TopReadySUs and releases its successors,
  // so this loop terminates when every SU has been emitted.
  while (!TopReadySUs.empty()) {
    SUnit *SU = TopReadySUs[0];
    ScheduledSUnits.push_back(SU);
    nodeScheduled(SU);
  }

  Scheduled = true;
}
306
307
// Returns if the register was set between first and last.
308
// Returns if the register was set between first and last.
// Scans all defining instructions of \p Reg (skipping debug values) and
// reports whether any definition's slot index falls inside [First, Last].
static bool isDefBetween(unsigned Reg,
                           SlotIndex First, SlotIndex Last,
                           const MachineRegisterInfo *MRI,
                           const LiveIntervals *LIS) {
  for (MachineRegisterInfo::def_instr_iterator
       UI = MRI->def_instr_begin(Reg),
       UE = MRI->def_instr_end(); UI != UE; ++UI) {
    const MachineInstr* MI = &*UI;
    // Debug values are not real definitions.
    if (MI->isDebugValue())
      continue;
    SlotIndex InstSlot = LIS->getInstructionIndex(*MI).getRegSlot();
    if (InstSlot >= First && InstSlot <= Last)
      return true;
  }
  return false;
}
324
325
// Compute the block's live-in/live-out register sets and pressure figures by
// replaying the current (fast) schedule through a RegPressureTracker.
// BeginBlock/EndBlock delimit the block's instructions in the MBB.
void SIScheduleBlock::initRegPressure(MachineBasicBlock::iterator BeginBlock,
                                      MachineBasicBlock::iterator EndBlock) {
  IntervalPressure Pressure, BotPressure;
  RegPressureTracker RPTracker(Pressure), BotRPTracker(BotPressure);
  LiveIntervals *LIS = DAG->getLIS();
  MachineRegisterInfo *MRI = DAG->getMRI();
  DAG->initRPTracker(TopRPTracker);
  DAG->initRPTracker(BotRPTracker);
  DAG->initRPTracker(RPTracker);

  // Goes through all SU. RPTracker captures what had to be alive for the SUs
  // to execute, and what is still alive at the end.
  for (SUnit* SU : ScheduledSUnits) {
    RPTracker.setPos(SU->getInstr());
    RPTracker.advance();
  }

  // Close the RPTracker to finalize live ins/outs.
  RPTracker.closeRegion();

  // Initialize the live ins and live outs.
  TopRPTracker.addLiveRegs(RPTracker.getPressure().LiveInRegs);
  BotRPTracker.addLiveRegs(RPTracker.getPressure().LiveOutRegs);

  // Do not Track Physical Registers, because it messes up.
  for (const auto &RegMaskPair : RPTracker.getPressure().LiveInRegs) {
    if (TargetRegisterInfo::isVirtualRegister(RegMaskPair.RegUnit))
      LiveInRegs.insert(RegMaskPair.RegUnit);
  }
  LiveOutRegs.clear();
  // There are several possibilities to distinguish:
  // 1) Reg is not input to any instruction in the block, but is output of one
  // 2) 1) + read in the block and not needed after it
  // 3) 1) + read in the block but needed in another block
  // 4) Reg is input of an instruction but another block will read it too
  // 5) Reg is input of an instruction and then rewritten in the block.
  //    result is not read in the block (implies used in another block)
  // 6) Reg is input of an instruction and then rewritten in the block.
  //    result is read in the block and not needed in another block
  // 7) Reg is input of an instruction and then rewritten in the block.
  //    result is read in the block but also needed in another block
  // LiveInRegs will contain all the regs in situation 4, 5, 6, 7
  // We want LiveOutRegs to contain only Regs whose content will be read after
  // in another block, and whose content was written in the current block,
  // that is we want it to get 1, 3, 5, 7
  // Since we made the MIs of a block to be packed all together before
  // scheduling, then the LiveIntervals were correct, and the RPTracker was
  // able to correctly handle 5 vs 6, 2 vs 3.
  // (Note: This is not sufficient for RPTracker to not do mistakes for case 4)
  // The RPTracker's LiveOutRegs has 1, 3, (some correct or incorrect)4, 5, 7
  // Comparing to LiveInRegs is not sufficient to differentiate 4 vs 5, 7
  // The use of findDefBetween removes the case 4.
  for (const auto &RegMaskPair : RPTracker.getPressure().LiveOutRegs) {
    unsigned Reg = RegMaskPair.RegUnit;
    if (TargetRegisterInfo::isVirtualRegister(Reg) &&
        isDefBetween(Reg, LIS->getInstructionIndex(*BeginBlock).getRegSlot(),
                     LIS->getInstructionIndex(*EndBlock).getRegSlot(), MRI,
                     LIS)) {
      LiveOutRegs.insert(Reg);
    }
  }

  // Pressure = sum_alive_registers register size
  // Internally llvm will represent some registers as big 128 bits registers
  // for example, but they actually correspond to 4 actual 32 bits registers.
  // Thus Pressure is not equal to num_alive_registers * constant.
  LiveInPressure = TopPressure.MaxSetPressure;
  LiveOutPressure = BotPressure.MaxSetPressure;

  // Prepares TopRPTracker for top down scheduling.
  TopRPTracker.closeTop();
}
397
398
// Schedule the block "for real": first derive live-in/live-out pressure from a
// fast schedule, then rebuild the schedule top-down with pickNode(), tracking
// register pressure as each SU is placed.
void SIScheduleBlock::schedule(MachineBasicBlock::iterator BeginBlock,
                               MachineBasicBlock::iterator EndBlock) {
  // A valid (fast) schedule is needed so the pressure tracker can replay it.
  if (!Scheduled)
    fastSchedule();

  // PreScheduling phase to set LiveIn and LiveOut.
  initRegPressure(BeginBlock, EndBlock);
  undoSchedule();

  // Schedule for real now.

  TopReadySUs.clear();

  for (SUnit* SU : SUnits) {
    if (!SU->NumPredsLeft)
      TopReadySUs.push_back(SU);
  }

  while (!TopReadySUs.empty()) {
    SUnit *SU = pickNode();
    ScheduledSUnits.push_back(SU);
    TopRPTracker.setPos(SU->getInstr());
    TopRPTracker.advance();
    nodeScheduled(SU);
  }

  // TODO: compute InternalAdditionnalPressure.
  InternalAdditionnalPressure.resize(TopPressure.MaxSetPressure.size());

  // Check everything is right.
#ifndef NDEBUG
  assert(SUnits.size() == ScheduledSUnits.size() &&
            TopReadySUs.empty());
  for (SUnit* SU : SUnits) {
    assert(SU->isScheduled &&
              SU->NumPredsLeft == 0);
  }
#endif

  Scheduled = true;
}
439
440
21
// Revert the effects of a previous schedule of this block: restore the
// predecessor counts consumed by in-block edges, clear per-SU state, and drop
// the emitted order so the block can be scheduled again.
void SIScheduleBlock::undoSchedule() {
  for (SUnit* SU : SUnits) {
    SU->isScheduled = false;
    for (SDep& Succ : SU->Succs) {
      // Only edges to SUs inside this block were released during scheduling.
      if (BC->isSUInBlock(Succ.getSUnit(), ID))
        undoReleaseSucc(SU, &Succ);
    }
  }
  HasLowLatencyNonWaitedParent.assign(SUnits.size(), 0);
  ScheduledSUnits.clear();
  Scheduled = false;
}
452
453
43
// Inverse of releaseSucc(): restore the predecessor count that scheduling SU
// consumed on the successor side of \p SuccEdge.
void SIScheduleBlock::undoReleaseSucc(SUnit *SU, SDep *SuccEdge) {
  SUnit *SuccSU = SuccEdge->getSUnit();

  // Weak edges are accounted in a separate counter from hard dependencies.
  if (SuccEdge->isWeak())
    ++SuccSU->WeakPredsLeft;
  else
    ++SuccSU->NumPredsLeft;
}
462
463
117
// Mark one dependency of \p SuccEdge's target as satisfied because \p SU has
// been scheduled, decrementing the appropriate pending-predecessor counter.
void SIScheduleBlock::releaseSucc(SUnit *SU, SDep *SuccEdge) {
  SUnit *SuccSU = SuccEdge->getSUnit();

  // Weak edges use their own counter and never gate readiness here.
  if (SuccEdge->isWeak()) {
    --SuccSU->WeakPredsLeft;
    return;
  }
#ifndef NDEBUG
  // Releasing a successor whose count is already zero indicates a DAG
  // bookkeeping bug.
  if (SuccSU->NumPredsLeft == 0) {
    dbgs() << "*** Scheduling failed! ***\n";
    DAG->dumpNode(*SuccSU);
    dbgs() << " has been released too many times!\n";
    llvm_unreachable(nullptr);
  }
#endif

  --SuccSU->NumPredsLeft;
}
481
482
/// Release Successors of the SU that are in the block or not.
483
189
/// Release Successors of the SU that are in the block or not.
/// When \p InOrOutBlock is true only in-block successors are released (and
/// newly-ready ones are queued); when false only out-of-block edges are
/// released, which is used to sever external links before local scheduling.
void SIScheduleBlock::releaseSuccessors(SUnit *SU, bool InOrOutBlock) {
  for (SDep& Succ : SU->Succs) {
    SUnit *SuccSU = Succ.getSUnit();

    // Skip artificial nodes beyond the real SU range.
    if (SuccSU->NodeNum >= DAG->SUnits.size())
        continue;

    // Only process edges on the requested side of the block boundary.
    if (BC->isSUInBlock(SuccSU, ID) != InOrOutBlock)
      continue;

    releaseSucc(SU, &Succ);
    // An in-block successor with no remaining predecessors becomes ready.
    if (SuccSU->NumPredsLeft == 0 && InOrOutBlock)
      TopReadySUs.push_back(SuccSU);
  }
}
498
499
126
// Finalize the scheduling of \p SU inside the block: remove it from the ready
// list, release its in-block successors, and maintain the low-latency
// wait-tracking state used by tryCandidateTopDown().
void SIScheduleBlock::nodeScheduled(SUnit *SU) {
  // Is in TopReadySUs
  assert (!SU->NumPredsLeft);
  std::vector<SUnit *>::iterator I = llvm::find(TopReadySUs, SU);
  if (I == TopReadySUs.end()) {
    dbgs() << "Data Structure Bug in SI Scheduler\n";
    llvm_unreachable(nullptr);
  }
  TopReadySUs.erase(I);

  releaseSuccessors(SU, true);
  // Scheduling this node will trigger a wait,
  // thus propagate to other instructions that they do not need to wait either.
  if (HasLowLatencyNonWaitedParent[NodeNum2Index[SU->NodeNum]])
    HasLowLatencyNonWaitedParent.assign(SUnits.size(), 0);

  // If SU is itself a low latency instruction, its in-block successors now
  // have a low latency parent whose result has not been waited for yet.
  if (DAG->IsLowLatencySU[SU->NodeNum]) {
     for (SDep& Succ : SU->Succs) {
      std::map<unsigned, unsigned>::iterator I =
        NodeNum2Index.find(Succ.getSUnit()->NodeNum);
      if (I != NodeNum2Index.end())
        HasLowLatencyNonWaitedParent[I->second] = 1;
    }
  }
  SU->isScheduled = true;
}
525
526
21
// Finish block construction: sever dependency edges that leave the block,
// flag the block if it contains any high latency instruction, and size the
// low-latency tracking vector.
void SIScheduleBlock::finalizeUnits() {
  // We remove links from outside blocks to enable scheduling inside the block.
  for (SUnit* SU : SUnits) {
    releaseSuccessors(SU, false);
    if (DAG->IsHighLatencySU[SU->NodeNum])
      HighLatencyBlock = true;
  }
  HasLowLatencyNonWaitedParent.resize(SUnits.size(), 0);
}
535
536
// we maintain ascending order of IDs
537
31
// Record \p Pred as a predecessor block, ignoring duplicates, and assert the
// block graph stays acyclic (Pred must not also be a successor).
void SIScheduleBlock::addPred(SIScheduleBlock *Pred) {
  unsigned PredID = Pred->getID();

  // Check if not already a predecessor.
  for (SIScheduleBlock* P : Preds) {
    if (PredID == P->getID())
      return;
  }
  Preds.push_back(Pred);

  assert(none_of(Succs,
                 [=](std::pair<SIScheduleBlock*,
                     SIScheduleBlockLinkKind> S) {
                   return PredID == S.first->getID();
                    }) &&
         "Loop in the Block Graph!");
}
554
555
// Record \p Succ as a successor block with link kind \p Kind. A duplicate
// edge is not added, but its kind is upgraded from NoData to Data if needed.
// Also counts high latency successors and asserts the graph stays acyclic.
void SIScheduleBlock::addSucc(SIScheduleBlock *Succ,
                              SIScheduleBlockLinkKind Kind) {
  unsigned SuccID = Succ->getID();

  // Check if not already a successor.
  for (std::pair<SIScheduleBlock*, SIScheduleBlockLinkKind> &S : Succs) {
    if (SuccID == S.first->getID()) {
      // Keep the strongest link kind seen for this edge.
      if (S.second == SIScheduleBlockLinkKind::NoData &&
          Kind == SIScheduleBlockLinkKind::Data)
        S.second = Kind;
      return;
    }
  }
  if (Succ->isHighLatencyBlock())
    ++NumHighLatencySuccessors;
  Succs.push_back(std::make_pair(Succ, Kind));

  assert(none_of(Preds,
                 [=](SIScheduleBlock *P) { return SuccID == P->getID(); }) &&
         "Loop in the Block Graph!");
}
576
577
#ifndef NDEBUG
578
void SIScheduleBlock::printDebug(bool full) {
579
  dbgs() << "Block (" << ID << ")\n";
580
  if (!full)
581
    return;
582
583
  dbgs() << "\nContains High Latency Instruction: "
584
         << HighLatencyBlock << '\n';
585
  dbgs() << "\nDepends On:\n";
586
  for (SIScheduleBlock* P : Preds) {
587
    P->printDebug(false);
588
  }
589
590
  dbgs() << "\nSuccessors:\n";
591
  for (std::pair<SIScheduleBlock*, SIScheduleBlockLinkKind> S : Succs) {
592
    if (S.second == SIScheduleBlockLinkKind::Data)
593
      dbgs() << "(Data Dep) ";
594
    S.first->printDebug(false);
595
  }
596
597
  if (Scheduled) {
598
    dbgs() << "LiveInPressure " << LiveInPressure[DAG->getSGPRSetID()] << ' '
599
           << LiveInPressure[DAG->getVGPRSetID()] << '\n';
600
    dbgs() << "LiveOutPressure " << LiveOutPressure[DAG->getSGPRSetID()] << ' '
601
           << LiveOutPressure[DAG->getVGPRSetID()] << "\n\n";
602
    dbgs() << "LiveIns:\n";
603
    for (unsigned Reg : LiveInRegs)
604
      dbgs() << printVRegOrUnit(Reg, DAG->getTRI()) << ' ';
605
606
    dbgs() << "\nLiveOuts:\n";
607
    for (unsigned Reg : LiveOutRegs)
608
      dbgs() << printVRegOrUnit(Reg, DAG->getTRI()) << ' ';
609
  }
610
611
  dbgs() << "\nInstructions:\n";
612
  if (!Scheduled) {
613
    for (const SUnit* SU : SUnits)
614
      DAG->dumpNode(*SU);
615
  } else {
616
    for (const SUnit* SU : SUnits)
617
      DAG->dumpNode(*SU);
618
  }
619
620
  dbgs() << "///////////////////////\n";
621
}
622
#endif
623
624
// SIScheduleBlockCreator //
625
626
// Construct a block creator bound to the scheduling DAG it will partition.
SIScheduleBlockCreator::SIScheduleBlockCreator(SIScheduleDAGMI *DAG) :
DAG(DAG) {
}
629
630
9
// Out-of-line defaulted destructor.
SIScheduleBlockCreator::~SIScheduleBlockCreator() = default;
631
632
// Return the block decomposition for \p BlockVariant, computing and caching it
// on first request (creation, topological sort, intra-block scheduling and
// statistics), and serving subsequent requests from the Blocks cache.
SIScheduleBlocks
SIScheduleBlockCreator::getBlocks(SISchedulerBlockCreatorVariant BlockVariant) {
  std::map<SISchedulerBlockCreatorVariant, SIScheduleBlocks>::iterator B =
    Blocks.find(BlockVariant);
  if (B == Blocks.end()) {
    SIScheduleBlocks Res;
    createBlocksForVariant(BlockVariant);
    topologicalSort();
    scheduleInsideBlocks();
    fillStats();
    Res.Blocks = CurrentBlocks;
    Res.TopDownIndex2Block = TopDownIndex2Block;
    Res.TopDownBlock2Index = TopDownBlock2Index;
    Blocks[BlockVariant] = Res;
    return Res;
  } else {
    // Cached result from a previous call with the same variant.
    return B->second;
  }
}
651
652
316
// Query whether \p SU was assigned to the block with identifier \p ID.
bool SIScheduleBlockCreator::isSUInBlock(SUnit *SU, unsigned ID) {
  // Nodes past the real SU range (artificial) belong to no block.
  if (SU->NodeNum >= DAG->SUnits.size())
    return false;
  return ID == CurrentBlocks[Node2CurrentBlock[SU->NodeNum]]->getID();
}
657
658
9
void SIScheduleBlockCreator::colorHighLatenciesAlone() {
659
9
  unsigned DAGSize = DAG->SUnits.size();
660
9
661
72
  for (unsigned i = 0, e = DAGSize; i != e; 
++i63
) {
662
63
    SUnit *SU = &DAG->SUnits[i];
663
63
    if (DAG->IsHighLatencySU[SU->NodeNum]) {
664
3
      CurrentColoring[SU->NodeNum] = NextReservedID++;
665
3
    }
666
63
  }
667
9
}
668
669
static bool
670
0
hasDataDependencyPred(const SUnit &SU, const SUnit &FromSU) {
671
0
  for (const auto &PredDep : SU.Preds) {
672
0
    if (PredDep.getSUnit() == &FromSU &&
673
0
        PredDep.getKind() == llvm::SDep::Data)
674
0
      return true;
675
0
  }
676
0
  return false;
677
0
}
678
679
0
// Block-creation variant: cluster high latency instructions into small groups
// (2-4 depending on how many there are) that share one reserved color, pulling
// in any order-dependency-only intermediate SUs, while refusing to merge
// instructions with data dependencies between them.
void SIScheduleBlockCreator::colorHighLatenciesGroups() {
  unsigned DAGSize = DAG->SUnits.size();
  unsigned NumHighLatencies = 0;
  unsigned GroupSize;
  int Color = NextReservedID;
  unsigned Count = 0;
  std::set<unsigned> FormingGroup;

  // Count high latency instructions to pick a group size.
  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
    SUnit *SU = &DAG->SUnits[i];
    if (DAG->IsHighLatencySU[SU->NodeNum])
      ++NumHighLatencies;
  }

  if (NumHighLatencies == 0)
    return;

  if (NumHighLatencies <= 6)
    GroupSize = 2;
  else if (NumHighLatencies <= 12)
    GroupSize = 3;
  else
    GroupSize = 4;

  // Walk SUs in topological (top-down) order, growing the current group.
  for (unsigned SUNum : DAG->TopDownIndex2SU) {
    const SUnit &SU = DAG->SUnits[SUNum];
    if (DAG->IsHighLatencySU[SU.NodeNum]) {
      unsigned CompatibleGroup = true;
      int ProposedColor = Color;
      std::vector<int> AdditionalElements;

      // We don't want to put in the same block
      // two high latency instructions that depend
      // on each other.
      // One way would be to check canAddEdge
      // in both directions, but that currently is not
      // enough because there the high latency order is
      // enforced (via links).
      // Instead, look at the dependencies between the
      // high latency instructions and deduce if it is
      // a data dependency or not.
      for (unsigned j : FormingGroup) {
        bool HasSubGraph;
        std::vector<int> SubGraph;
        // By construction (topological order), if SU and
        // DAG->SUnits[j] are linked, DAG->SUnits[j] is necessarily
        // in the parent graph of SU.
#ifndef NDEBUG
        SubGraph = DAG->GetTopo()->GetSubGraph(SU, DAG->SUnits[j],
                                               HasSubGraph);
        assert(!HasSubGraph);
#endif
        SubGraph = DAG->GetTopo()->GetSubGraph(DAG->SUnits[j], SU,
                                               HasSubGraph);
        if (!HasSubGraph)
          continue; // No dependencies between each other
        else if (SubGraph.size() > 5) {
          // Too many elements would be required to be added to the block.
          CompatibleGroup = false;
          break;
        }
        else {
          // Check the type of dependency
          for (unsigned k : SubGraph) {
            // If in the path to join the two instructions,
            // there is another high latency instruction,
            // or instructions colored for another block
            // abort the merge.
            if (DAG->IsHighLatencySU[k] ||
                (CurrentColoring[k] != ProposedColor &&
                 CurrentColoring[k] != 0)) {
              CompatibleGroup = false;
              break;
            }
            // If one of the SU in the subgraph depends on the result of SU j,
            // there'll be a data dependency.
            if (hasDataDependencyPred(DAG->SUnits[k], DAG->SUnits[j])) {
              CompatibleGroup = false;
              break;
            }
          }
          if (!CompatibleGroup)
            break;
          // Same check for the SU
          if (hasDataDependencyPred(SU, DAG->SUnits[j])) {
            CompatibleGroup = false;
            break;
          }
          // Add all the required instructions to the block
          // These cannot live in another block (because they
          // depend (order dependency) on one of the
          // instruction in the block, and are required for the
          // high latency instruction we add.
          AdditionalElements.insert(AdditionalElements.end(),
                                    SubGraph.begin(), SubGraph.end());
        }
      }
      if (CompatibleGroup) {
        FormingGroup.insert(SU.NodeNum);
        for (unsigned j : AdditionalElements)
          CurrentColoring[j] = ProposedColor;
        CurrentColoring[SU.NodeNum] = ProposedColor;
        ++Count;
      }
      // Found one incompatible instruction,
      // or has filled a big enough group.
      // -> start a new one.
      if (!CompatibleGroup) {
        FormingGroup.clear();
        Color = ++NextReservedID;
        ProposedColor = Color;
        FormingGroup.insert(SU.NodeNum);
        CurrentColoring[SU.NodeNum] = ProposedColor;
        Count = 0;
      } else if (Count == GroupSize) {
        FormingGroup.clear();
        Color = ++NextReservedID;
        ProposedColor = Color;
        Count = 0;
      }
    }
  }
}
802
803
9
// Compute, for every SU, which combination of reserved (high latency) colors
// it transitively depends on (top-down pass) and is depended on by (bottom-up
// pass). The results are stored in CurrentTopDownReservedDependencyColoring
// and CurrentBottomUpReservedDependencyColoring; SUs sharing the exact same
// combination of reserved ancestors/descendants receive the same color.
void SIScheduleBlockCreator::colorComputeReservedDependencies() {
  unsigned DAGSize = DAG->SUnits.size();
  // Maps a set of parent (resp. child) colors to the single color assigned
  // to that combination. Reused (after clearing) for the bottom-up pass.
  std::map<std::set<unsigned>, unsigned> ColorCombinations;

  CurrentTopDownReservedDependencyColoring.clear();
  CurrentBottomUpReservedDependencyColoring.clear();

  // Color 0 means "depends on no reserved instruction".
  CurrentTopDownReservedDependencyColoring.resize(DAGSize, 0);
  CurrentBottomUpReservedDependencyColoring.resize(DAGSize, 0);

  // Traverse TopDown, and give different colors to SUs depending
  // on which combination of High Latencies they depend on.

  for (unsigned SUNum : DAG->TopDownIndex2SU) {
    SUnit *SU = &DAG->SUnits[SUNum];
    std::set<unsigned> SUColors;

    // Already given.
    if (CurrentColoring[SU->NodeNum]) {
      CurrentTopDownReservedDependencyColoring[SU->NodeNum] =
        CurrentColoring[SU->NodeNum];
      continue;
    }

    // Collect the reserved colors of all (strong, in-region) predecessors.
    for (SDep& PredDep : SU->Preds) {
      SUnit *Pred = PredDep.getSUnit();
      if (PredDep.isWeak() || Pred->NodeNum >= DAGSize)
        continue;
      if (CurrentTopDownReservedDependencyColoring[Pred->NodeNum] > 0)
        SUColors.insert(CurrentTopDownReservedDependencyColoring[Pred->NodeNum]);
    }
    // Color 0 by default.
    if (SUColors.empty())
      continue;
    // Same color as parents. Colors > DAGSize are non-reserved combination
    // colors (NextNonReservedID starts at DAGSize + 1 in
    // createBlocksForVariant), so this only propagates combination colors,
    // never a reserved color itself.
    if (SUColors.size() == 1 && *SUColors.begin() > DAGSize)
      CurrentTopDownReservedDependencyColoring[SU->NodeNum] =
        *SUColors.begin();
    else {
      std::map<std::set<unsigned>, unsigned>::iterator Pos =
        ColorCombinations.find(SUColors);
      if (Pos != ColorCombinations.end()) {
          CurrentTopDownReservedDependencyColoring[SU->NodeNum] = Pos->second;
      } else {
        // New combination of reserved colors: allocate a fresh color for it.
        CurrentTopDownReservedDependencyColoring[SU->NodeNum] =
          NextNonReservedID;
        ColorCombinations[SUColors] = NextNonReservedID++;
      }
    }
  }

  ColorCombinations.clear();

  // Same as before, but BottomUp.

  for (unsigned SUNum : DAG->BottomUpIndex2SU) {
    SUnit *SU = &DAG->SUnits[SUNum];
    std::set<unsigned> SUColors;

    // Already given.
    if (CurrentColoring[SU->NodeNum]) {
      CurrentBottomUpReservedDependencyColoring[SU->NodeNum] =
        CurrentColoring[SU->NodeNum];
      continue;
    }

    // Collect the reserved colors of all (strong, in-region) successors.
    for (SDep& SuccDep : SU->Succs) {
      SUnit *Succ = SuccDep.getSUnit();
      if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize)
        continue;
      if (CurrentBottomUpReservedDependencyColoring[Succ->NodeNum] > 0)
        SUColors.insert(CurrentBottomUpReservedDependencyColoring[Succ->NodeNum]);
    }
    // Keep color 0.
    if (SUColors.empty())
      continue;
    // Same color as children (combination colors only, see above).
    if (SUColors.size() == 1 && *SUColors.begin() > DAGSize)
      CurrentBottomUpReservedDependencyColoring[SU->NodeNum] =
        *SUColors.begin();
    else {
      std::map<std::set<unsigned>, unsigned>::iterator Pos =
        ColorCombinations.find(SUColors);
      if (Pos != ColorCombinations.end()) {
        CurrentBottomUpReservedDependencyColoring[SU->NodeNum] = Pos->second;
      } else {
        CurrentBottomUpReservedDependencyColoring[SU->NodeNum] =
          NextNonReservedID;
        ColorCombinations[SUColors] = NextNonReservedID++;
      }
    }
  }
}
896
897
9
void SIScheduleBlockCreator::colorAccordingToReservedDependencies() {
898
9
  unsigned DAGSize = DAG->SUnits.size();
899
9
  std::map<std::pair<unsigned, unsigned>, unsigned> ColorCombinations;
900
9
901
9
  // Every combination of colors given by the top down
902
9
  // and bottom up Reserved node dependency
903
9
904
72
  for (unsigned i = 0, e = DAGSize; i != e; 
++i63
) {
905
63
    SUnit *SU = &DAG->SUnits[i];
906
63
    std::pair<unsigned, unsigned> SUColors;
907
63
908
63
    // High latency instructions: already given.
909
63
    if (CurrentColoring[SU->NodeNum])
910
3
      continue;
911
60
912
60
    SUColors.first = CurrentTopDownReservedDependencyColoring[SU->NodeNum];
913
60
    SUColors.second = CurrentBottomUpReservedDependencyColoring[SU->NodeNum];
914
60
915
60
    std::map<std::pair<unsigned, unsigned>, unsigned>::iterator Pos =
916
60
      ColorCombinations.find(SUColors);
917
60
    if (Pos != ColorCombinations.end()) {
918
45
      CurrentColoring[SU->NodeNum] = Pos->second;
919
45
    } else {
920
15
      CurrentColoring[SU->NodeNum] = NextNonReservedID;
921
15
      ColorCombinations[SUColors] = NextNonReservedID++;
922
15
    }
923
60
  }
924
9
}
925
926
9
// For SUs that neither depend on nor feed any reserved (high latency)
// instruction, try to merge them into the unique successor block they feed;
// otherwise give them a fresh color. Changes are staged in PendingColoring
// and committed at the end so decisions within the pass see consistent state.
void SIScheduleBlockCreator::colorEndsAccordingToDependencies() {
  unsigned DAGSize = DAG->SUnits.size();
  std::vector<int> PendingColoring = CurrentColoring;

  assert(DAGSize >= 1 &&
         CurrentBottomUpReservedDependencyColoring.size() == DAGSize &&
         CurrentTopDownReservedDependencyColoring.size() == DAGSize);
  // If there is no reserved block at all, do nothing. We don't want
  // everything in one block.
  if (*std::max_element(CurrentBottomUpReservedDependencyColoring.begin(),
                        CurrentBottomUpReservedDependencyColoring.end()) == 0 &&
      *std::max_element(CurrentTopDownReservedDependencyColoring.begin(),
                        CurrentTopDownReservedDependencyColoring.end()) == 0)
    return;

  for (unsigned SUNum : DAG->BottomUpIndex2SU) {
    SUnit *SU = &DAG->SUnits[SUNum];
    std::set<unsigned> SUColors;
    std::set<unsigned> SUColorsPending;

    // Colors <= DAGSize are reserved (high latency) colors: skip.
    if (CurrentColoring[SU->NodeNum] <= (int)DAGSize)
      continue;

    // Only SUs with no reserved dependency in either direction are moved.
    if (CurrentBottomUpReservedDependencyColoring[SU->NodeNum] > 0 ||
        CurrentTopDownReservedDependencyColoring[SU->NodeNum] > 0)
      continue;

    for (SDep& SuccDep : SU->Succs) {
      SUnit *Succ = SuccDep.getSUnit();
      if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize)
        continue;
      // SUColors: current colors of successors that do touch a reserved
      // dependency. SUColorsPending: staged colors of all successors.
      if (CurrentBottomUpReservedDependencyColoring[Succ->NodeNum] > 0 ||
          CurrentTopDownReservedDependencyColoring[Succ->NodeNum] > 0)
        SUColors.insert(CurrentColoring[Succ->NodeNum]);
      SUColorsPending.insert(PendingColoring[Succ->NodeNum]);
    }
    // If there is only one child/parent block, and that block
    // is not among the ones we are removing in this path, then
    // merge the instruction to that block
    if (SUColors.size() == 1 && SUColorsPending.size() == 1)
      PendingColoring[SU->NodeNum] = *SUColors.begin();
    else // TODO: Attribute new colors depending on color
         // combination of children.
      PendingColoring[SU->NodeNum] = NextNonReservedID++;
  }
  CurrentColoring = PendingColoring;
}
973
974
975
0
// Force each color group to occupy a consecutive run of SUs in NodeNum
// order: if a color reappears after a different color was seen in between,
// the reappearing run is split off (new color, or merged with the
// immediately preceding SU's color when the previous SU already matches).
void SIScheduleBlockCreator::colorForceConsecutiveOrderInGroup() {
  unsigned DAGSize = DAG->SUnits.size();
  unsigned PreviousColor;
  // Colors whose initial consecutive run has already ended.
  std::set<unsigned> SeenColors;

  if (DAGSize <= 1)
    return;

  PreviousColor = CurrentColoring[0];

  for (unsigned i = 1, e = DAGSize; i != e; ++i) {
    SUnit *SU = &DAG->SUnits[i];
    unsigned CurrentColor = CurrentColoring[i];
    // Saved before the update below, so the split decision uses the color
    // of the immediately preceding SU.
    unsigned PreviousColorSave = PreviousColor;
    assert(i == SU->NodeNum);

    if (CurrentColor != PreviousColor)
      SeenColors.insert(PreviousColor);
    PreviousColor = CurrentColor;

    // Reserved colors (<= DAGSize) are never split.
    if (CurrentColoring[SU->NodeNum] <= (int)DAGSize)
      continue;

    // Color's first run is still ongoing: nothing to do.
    if (SeenColors.find(CurrentColor) == SeenColors.end())
      continue;

    if (PreviousColorSave != CurrentColor)
      CurrentColoring[i] = NextNonReservedID++;
    else
      // Previous SU was just re-colored; stay in its (new) group.
      CurrentColoring[i] = CurrentColoring[i-1];
  }
}
1007
1008
9
// Merge constant-load-like SUs (no predecessor, or low latency) into the
// single group consuming their result, so the load sits in the block that
// uses it rather than forming a block of its own.
void SIScheduleBlockCreator::colorMergeConstantLoadsNextGroup() {
  unsigned DAGSize = DAG->SUnits.size();

  for (unsigned SUNum : DAG->BottomUpIndex2SU) {
    SUnit *SU = &DAG->SUnits[SUNum];
    std::set<unsigned> SUColors;

    // Reserved (high latency) colors are never merged.
    if (CurrentColoring[SU->NodeNum] <= (int)DAGSize)
      continue;

    // No predecessor: Vgpr constant loading.
    // Low latency instructions usually have a predecessor (the address)
    if (SU->Preds.size() > 0 && !DAG->IsLowLatencySU[SU->NodeNum])
      continue;

    for (SDep& SuccDep : SU->Succs) {
      SUnit *Succ = SuccDep.getSUnit();
      if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize)
        continue;
      SUColors.insert(CurrentColoring[Succ->NodeNum]);
    }
    // Merge only when every consumer lives in the same group.
    if (SUColors.size() == 1)
      CurrentColoring[SU->NodeNum] = *SUColors.begin();
  }
}
1033
1034
0
void SIScheduleBlockCreator::colorMergeIfPossibleNextGroup() {
1035
0
  unsigned DAGSize = DAG->SUnits.size();
1036
0
1037
0
  for (unsigned SUNum : DAG->BottomUpIndex2SU) {
1038
0
    SUnit *SU = &DAG->SUnits[SUNum];
1039
0
    std::set<unsigned> SUColors;
1040
0
1041
0
    if (CurrentColoring[SU->NodeNum] <= (int)DAGSize)
1042
0
      continue;
1043
0
1044
0
    for (SDep& SuccDep : SU->Succs) {
1045
0
       SUnit *Succ = SuccDep.getSUnit();
1046
0
      if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize)
1047
0
        continue;
1048
0
      SUColors.insert(CurrentColoring[Succ->NodeNum]);
1049
0
    }
1050
0
    if (SUColors.size() == 1)
1051
0
      CurrentColoring[SU->NodeNum] = *SUColors.begin();
1052
0
  }
1053
0
}
1054
1055
9
// Same merge as colorMergeIfPossibleNextGroup, but only into reserved
// groups: a non-reserved SU joins its successors' group only when all its
// successors share one color AND that color is a reserved one (<= DAGSize).
void SIScheduleBlockCreator::colorMergeIfPossibleNextGroupOnlyForReserved() {
  unsigned DAGSize = DAG->SUnits.size();

  for (unsigned SUNum : DAG->BottomUpIndex2SU) {
    SUnit *SU = &DAG->SUnits[SUNum];
    std::set<unsigned> SUColors;

    // Reserved SUs themselves are never moved.
    if (CurrentColoring[SU->NodeNum] <= (int)DAGSize)
      continue;

    for (SDep& SuccDep : SU->Succs) {
      SUnit *Succ = SuccDep.getSUnit();
      if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize)
        continue;
      SUColors.insert(CurrentColoring[Succ->NodeNum]);
    }
    // Merge only towards a reserved (high latency) group.
    if (SUColors.size() == 1 && *SUColors.begin() <= DAGSize)
      CurrentColoring[SU->NodeNum] = *SUColors.begin();
  }
}
1075
1076
0
// Dissolve singleton groups: a non-reserved SU that is alone in its color
// group is merged into its successors' group when all successors share one
// color. ColorCount is updated as merges happen, so a group that becomes a
// singleton during the pass can itself be merged later in the traversal.
void SIScheduleBlockCreator::colorMergeIfPossibleSmallGroupsToNextGroup() {
  unsigned DAGSize = DAG->SUnits.size();
  // Number of SUs currently carrying each color.
  std::map<unsigned, unsigned> ColorCount;

  for (unsigned SUNum : DAG->BottomUpIndex2SU) {
    SUnit *SU = &DAG->SUnits[SUNum];
    unsigned color = CurrentColoring[SU->NodeNum];
     ++ColorCount[color];
  }

  for (unsigned SUNum : DAG->BottomUpIndex2SU) {
    SUnit *SU = &DAG->SUnits[SUNum];
    unsigned color = CurrentColoring[SU->NodeNum];
    std::set<unsigned> SUColors;

    // Reserved (high latency) colors are never merged away.
    if (CurrentColoring[SU->NodeNum] <= (int)DAGSize)
      continue;

    // Only singleton groups are candidates.
    if (ColorCount[color] > 1)
      continue;

    for (SDep& SuccDep : SU->Succs) {
      SUnit *Succ = SuccDep.getSUnit();
      if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize)
        continue;
      SUColors.insert(CurrentColoring[Succ->NodeNum]);
    }
    if (SUColors.size() == 1 && *SUColors.begin() != color) {
      // Keep the counts live so later iterations see the new group sizes.
      --ColorCount[color];
      CurrentColoring[SU->NodeNum] = *SUColors.begin();
      ++ColorCount[*SUColors.begin()];
    }
  }
}
1110
1111
0
// Intended to split oversized blocks into smaller ones; not implemented yet.
// Called as part of the block-creation pipeline, currently a no-op.
void SIScheduleBlockCreator::cutHugeBlocks() {
  // TODO
}
1114
1115
9
void SIScheduleBlockCreator::regroupNoUserInstructions() {
1116
9
  unsigned DAGSize = DAG->SUnits.size();
1117
9
  int GroupID = NextNonReservedID++;
1118
9
1119
63
  for (unsigned SUNum : DAG->BottomUpIndex2SU) {
1120
63
    SUnit *SU = &DAG->SUnits[SUNum];
1121
63
    bool hasSuccessor = false;
1122
63
1123
63
    if (CurrentColoring[SU->NodeNum] <= (int)DAGSize)
1124
3
      continue;
1125
60
1126
84
    
for (SDep& SuccDep : SU->Succs)60
{
1127
84
       SUnit *Succ = SuccDep.getSUnit();
1128
84
      if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize)
1129
20
        continue;
1130
64
      hasSuccessor = true;
1131
64
    }
1132
60
    if (!hasSuccessor)
1133
15
      CurrentColoring[SU->NodeNum] = GroupID;
1134
60
  }
1135
9
}
1136
1137
9
// Group all EXP instructions into one block so exports are scheduled last,
// but only when that grouping is provably safe (no non-EXP instruction is
// required between two exports of the group).
void SIScheduleBlockCreator::colorExports() {
  unsigned ExportColor = NextNonReservedID++;
  SmallVector<unsigned, 8> ExpGroup;

  // Put all exports together in a block.
  // The block will naturally end up being scheduled last,
  // thus putting exports at the end of the schedule, which
  // is better for performance.
  // However we must ensure, for safety, the exports can be put
  // together in the same block without any other instruction.
  // This could happen, for example, when scheduling after regalloc
  // if reloading a spilled register from memory using the same
  // register than used in a previous export.
  // If that happens, do not regroup the exports.
  for (unsigned SUNum : DAG->TopDownIndex2SU) {
    const SUnit &SU = DAG->SUnits[SUNum];
    if (SIInstrInfo::isEXP(*SU.getInstr())) {
      // Check the EXP can be added to the group safely,
      // ie without needing any other instruction.
      // The EXP is allowed to depend on other EXP
      // (they will be in the same group).
      for (unsigned j : ExpGroup) {
        bool HasSubGraph;
        std::vector<int> SubGraph;
        // By construction (topological order), if SU and
        // DAG->SUnits[j] are linked, DAG->SUnits[j] is necessary
        // in the parent graph of SU.
#ifndef NDEBUG
        // Sanity check: no dependency in the other direction can exist.
        SubGraph = DAG->GetTopo()->GetSubGraph(SU, DAG->SUnits[j],
                                               HasSubGraph);
        assert(!HasSubGraph);
#endif
        SubGraph = DAG->GetTopo()->GetSubGraph(DAG->SUnits[j], SU,
                                               HasSubGraph);
        if (!HasSubGraph)
          continue; // No dependencies between each other

        // SubGraph contains all the instructions required
        // between EXP SUnits[j] and EXP SU.
        for (unsigned k : SubGraph) {
          if (!SIInstrInfo::isEXP(*DAG->SUnits[k].getInstr()))
            // Other instructions than EXP would be required in the group.
            // Abort the grouping.
            return;
        }
      }

      ExpGroup.push_back(SUNum);
    }
  }

  // The group can be formed. Give the color.
  for (unsigned j : ExpGroup)
    CurrentColoring[j] = ExportColor;
}
1192
1193
9
// Entry point of block creation: run the coloring passes selected by
// \p BlockVariant, then materialize one SIScheduleBlock per final color,
// wire inter-block dependencies from the SU dependency edges, and finalize
// each block for intra-block scheduling.
void SIScheduleBlockCreator::createBlocksForVariant(SISchedulerBlockCreatorVariant BlockVariant) {
  unsigned DAGSize = DAG->SUnits.size();
  // Maps a final color to its block index in CurrentBlocks.
  std::map<unsigned,unsigned> RealID;

  CurrentBlocks.clear();
  CurrentColoring.clear();
  CurrentColoring.resize(DAGSize, 0);
  Node2CurrentBlock.clear();

  // Restore links previous scheduling variant has overridden.
  DAG->restoreSULinksLeft();

  // Reserved colors are 1..DAGSize; combination colors start above DAGSize.
  NextReservedID = 1;
  NextNonReservedID = DAGSize + 1;

  LLVM_DEBUG(dbgs() << "Coloring the graph\n");

  // The pass order below is significant: later passes refine and merge the
  // colors produced by earlier ones.
  if (BlockVariant == SISchedulerBlockCreatorVariant::LatenciesGrouped)
    colorHighLatenciesGroups();
  else
    colorHighLatenciesAlone();
  colorComputeReservedDependencies();
  colorAccordingToReservedDependencies();
  colorEndsAccordingToDependencies();
  if (BlockVariant == SISchedulerBlockCreatorVariant::LatenciesAlonePlusConsecutive)
    colorForceConsecutiveOrderInGroup();
  regroupNoUserInstructions();
  colorMergeConstantLoadsNextGroup();
  colorMergeIfPossibleNextGroupOnlyForReserved();
  colorExports();

  // Put SUs of same color into same block
  Node2CurrentBlock.resize(DAGSize, -1);
  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
    SUnit *SU = &DAG->SUnits[i];
    unsigned Color = CurrentColoring[SU->NodeNum];
    if (RealID.find(Color) == RealID.end()) {
      // First SU of this color: create the corresponding block.
      int ID = CurrentBlocks.size();
      BlockPtrs.push_back(llvm::make_unique<SIScheduleBlock>(DAG, this, ID));
      CurrentBlocks.push_back(BlockPtrs.rbegin()->get());
      RealID[Color] = ID;
    }
    CurrentBlocks[RealID[Color]]->addUnit(SU);
    Node2CurrentBlock[SU->NodeNum] = RealID[Color];
  }

  // Build dependencies between blocks.
  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
    SUnit *SU = &DAG->SUnits[i];
    int SUID = Node2CurrentBlock[i];
    for (SDep& SuccDep : SU->Succs) {
      SUnit *Succ = SuccDep.getSUnit();
      if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize)
        continue;
      // Cross-block edge: record it, tagged by whether data flows.
      if (Node2CurrentBlock[Succ->NodeNum] != SUID)
        CurrentBlocks[SUID]->addSucc(CurrentBlocks[Node2CurrentBlock[Succ->NodeNum]],
                                     SuccDep.isCtrl() ? NoData : Data);
    }
    for (SDep& PredDep : SU->Preds) {
      SUnit *Pred = PredDep.getSUnit();
      if (PredDep.isWeak() || Pred->NodeNum >= DAGSize)
        continue;
      if (Node2CurrentBlock[Pred->NodeNum] != SUID)
        CurrentBlocks[SUID]->addPred(CurrentBlocks[Node2CurrentBlock[Pred->NodeNum]]);
    }
  }

  // Free root and leafs of all blocks to enable scheduling inside them.
  for (unsigned i = 0, e = CurrentBlocks.size(); i != e; ++i) {
    SIScheduleBlock *Block = CurrentBlocks[i];
    Block->finalizeUnits();
  }
  LLVM_DEBUG(dbgs() << "Blocks created:\n\n";
             for (unsigned i = 0, e = CurrentBlocks.size(); i != e; ++i) {
               SIScheduleBlock *Block = CurrentBlocks[i];
               Block->printDebug(true);
             });
}
1271
1272
// Two functions taken from Codegen/MachineScheduler.cpp
1273
1274
/// Non-const version.
1275
static MachineBasicBlock::iterator
1276
nextIfDebug(MachineBasicBlock::iterator I,
1277
52
            MachineBasicBlock::const_iterator End) {
1278
52
  for (; I != End; 
++I0
) {
1279
43
    if (!I->isDebugInstr())
1280
43
      break;
1281
43
  }
1282
52
  return I;
1283
52
}
1284
1285
9
// Topologically sort the created blocks, filling TopDownIndex2Block,
// TopDownBlock2Index and BottomUpIndex2Block. Uses a Kahn-style worklist
// seeded from blocks with no successors; note that TopDownBlock2Index
// temporarily holds the remaining out-degree of each block before being
// overwritten with the final index.
void SIScheduleBlockCreator::topologicalSort() {
  unsigned DAGSize = CurrentBlocks.size();
  std::vector<int> WorkList;

  LLVM_DEBUG(dbgs() << "Topological Sort\n");

  WorkList.reserve(DAGSize);
  TopDownIndex2Block.resize(DAGSize);
  TopDownBlock2Index.resize(DAGSize);
  BottomUpIndex2Block.resize(DAGSize);

  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
    SIScheduleBlock *Block = CurrentBlocks[i];
    unsigned Degree = Block->getSuccs().size();
    // Stash the out-degree here; it becomes the index once it hits zero.
    TopDownBlock2Index[i] = Degree;
    if (Degree == 0) {
      WorkList.push_back(i);
    }
  }

  // Indices are assigned from the back (leaves get the highest indices).
  int Id = DAGSize;
  while (!WorkList.empty()) {
    int i = WorkList.back();
    SIScheduleBlock *Block = CurrentBlocks[i];
    WorkList.pop_back();
    TopDownBlock2Index[i] = --Id;
    TopDownIndex2Block[Id] = i;
    for (SIScheduleBlock* Pred : Block->getPreds()) {
      if (!--TopDownBlock2Index[Pred->getID()])
        WorkList.push_back(Pred->getID());
    }
  }

#ifndef NDEBUG
  // Check correctness of the ordering.
  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
    SIScheduleBlock *Block = CurrentBlocks[i];
    for (SIScheduleBlock* Pred : Block->getPreds()) {
      assert(TopDownBlock2Index[i] > TopDownBlock2Index[Pred->getID()] &&
      "Wrong Top Down topological sorting");
    }
  }
#endif

  // The bottom-up order is simply the reverse of the top-down order.
  BottomUpIndex2Block = std::vector<int>(TopDownIndex2Block.rbegin(),
                                         TopDownIndex2Block.rend());
}
1332
1333
9
// Schedule the instructions inside each block. First a fast schedule is
// applied and the MIs are physically reordered (so each block is a
// contiguous instruction range, which the register-usage tracking relies
// on), then the real intra-block scheduling runs, and finally the original
// MI order is restored (the block scheduler records order separately).
void SIScheduleBlockCreator::scheduleInsideBlocks() {
  unsigned DAGSize = CurrentBlocks.size();

  LLVM_DEBUG(dbgs() << "\nScheduling Blocks\n\n");

  // We do schedule a valid scheduling such that a Block corresponds
  // to a range of instructions.
  LLVM_DEBUG(dbgs() << "First phase: Fast scheduling for Reg Liveness\n");
  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
    SIScheduleBlock *Block = CurrentBlocks[i];
    Block->fastSchedule();
  }

  // Note: the following code, and the part restoring previous position
  // is by far the most expensive operation of the Scheduler.

  // Do not update CurrentTop.
  MachineBasicBlock::iterator CurrentTopFastSched = DAG->getCurrentTop();
  // Parallel arrays of each moved MI's old and new positions, used to undo
  // the moves afterwards.
  std::vector<MachineBasicBlock::iterator> PosOld;
  std::vector<MachineBasicBlock::iterator> PosNew;
  PosOld.reserve(DAG->SUnits.size());
  PosNew.reserve(DAG->SUnits.size());

  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
    int BlockIndice = TopDownIndex2Block[i];
    SIScheduleBlock *Block = CurrentBlocks[BlockIndice];
    std::vector<SUnit*> SUs = Block->getScheduledUnits();

    for (SUnit* SU : SUs) {
      MachineInstr *MI = SU->getInstr();
      MachineBasicBlock::iterator Pos = MI;
      PosOld.push_back(Pos);
      if (&*CurrentTopFastSched == MI) {
        // Already in place: just advance past it (and any debug instrs).
        PosNew.push_back(Pos);
        CurrentTopFastSched = nextIfDebug(++CurrentTopFastSched,
                                          DAG->getCurrentBottom());
      } else {
        // Update the instruction stream.
        DAG->getBB()->splice(CurrentTopFastSched, DAG->getBB(), MI);

        // Update LiveIntervals.
        // Note: Moving all instructions and calling handleMove every time
        // is the most cpu intensive operation of the scheduler.
        // It would gain a lot if there was a way to recompute the
        // LiveIntervals for the entire scheduling region.
        DAG->getLIS()->handleMove(*MI, /*UpdateFlags=*/true);
        PosNew.push_back(CurrentTopFastSched);
      }
    }
  }

  // Now we have Block of SUs == Block of MI.
  // We do the final schedule for the instructions inside the block.
  // The property that all the SUs of the Block are grouped together as MI
  // is used for correct reg usage tracking.
  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
    SIScheduleBlock *Block = CurrentBlocks[i];
    std::vector<SUnit*> SUs = Block->getScheduledUnits();
    Block->schedule((*SUs.begin())->getInstr(), (*SUs.rbegin())->getInstr());
  }

  LLVM_DEBUG(dbgs() << "Restoring MI Pos\n");
  // Restore old ordering (which prevents a LIS->handleMove bug).
  // Undone in reverse so each splice target is already back in place.
  for (unsigned i = PosOld.size(), e = 0; i != e; --i) {
    MachineBasicBlock::iterator POld = PosOld[i-1];
    MachineBasicBlock::iterator PNew = PosNew[i-1];
    if (PNew != POld) {
      // Update the instruction stream.
      DAG->getBB()->splice(POld, DAG->getBB(), PNew);

      // Update LiveIntervals.
      DAG->getLIS()->handleMove(*POld, /*UpdateFlags=*/true);
    }
  }

  LLVM_DEBUG(for (unsigned i = 0, e = CurrentBlocks.size(); i != e; ++i) {
    SIScheduleBlock *Block = CurrentBlocks[i];
    Block->printDebug(true);
  });
}
1413
1414
9
void SIScheduleBlockCreator::fillStats() {
1415
9
  unsigned DAGSize = CurrentBlocks.size();
1416
9
1417
30
  for (unsigned i = 0, e = DAGSize; i != e; 
++i21
) {
1418
21
    int BlockIndice = TopDownIndex2Block[i];
1419
21
    SIScheduleBlock *Block = CurrentBlocks[BlockIndice];
1420
21
    if (Block->getPreds().empty())
1421
9
      Block->Depth = 0;
1422
12
    else {
1423
12
      unsigned Depth = 0;
1424
17
      for (SIScheduleBlock *Pred : Block->getPreds()) {
1425
17
        if (Depth < Pred->Depth + Pred->getCost())
1426
12
          Depth = Pred->Depth + Pred->getCost();
1427
17
      }
1428
12
      Block->Depth = Depth;
1429
12
    }
1430
21
  }
1431
9
1432
30
  for (unsigned i = 0, e = DAGSize; i != e; 
++i21
) {
1433
21
    int BlockIndice = BottomUpIndex2Block[i];
1434
21
    SIScheduleBlock *Block = CurrentBlocks[BlockIndice];
1435
21
    if (Block->getSuccs().empty())
1436
9
      Block->Height = 0;
1437
12
    else {
1438
12
      unsigned Height = 0;
1439
12
      for (const auto &Succ : Block->getSuccs())
1440
17
        Height = std::max(Height, Succ.first->Height + Succ.first->getCost());
1441
12
      Block->Height = Height;
1442
12
    }
1443
21
  }
1444
9
}
1445
1446
// SIScheduleBlockScheduler //
1447
1448
// Construct the block scheduler and immediately run the block-level
// scheduling: compute live-out usage counts per block, initialize the
// ready list from blocks without predecessors, then repeatedly pick and
// schedule blocks until none remain (result in BlocksScheduled).
SIScheduleBlockScheduler::SIScheduleBlockScheduler(SIScheduleDAGMI *DAG,
                                                   SISchedulerBlockSchedulerVariant Variant,
                                                   SIScheduleBlocks  BlocksStruct) :
  DAG(DAG), Variant(Variant), Blocks(BlocksStruct.Blocks),
  LastPosWaitedHighLatency(0), NumBlockScheduled(0), VregCurrentUsage(0),
  SregCurrentUsage(0), maxVregUsage(0), maxSregUsage(0) {

  // Fill the usage of every output
  // Warning: while by construction we always have a link between two blocks
  // when one needs a result from the other, the number of users of an output
  // is not the sum of child blocks having as input the same virtual register.
  // Here is an example. A produces x and y. B eats x and produces x'.
  // C eats x' and y. The register coalescer may have attributed the same
  // virtual register to x and x'.
  // To count accurately, we do a topological sort. In case the register is
  // found for several parents, we increment the usage of the one with the
  // highest topological index.
  LiveOutRegsNumUsages.resize(Blocks.size());
  for (unsigned i = 0, e = Blocks.size(); i != e; ++i) {
    SIScheduleBlock *Block = Blocks[i];
    for (unsigned Reg : Block->getInRegs()) {
      bool Found = false;
      int topoInd = -1;
      for (SIScheduleBlock* Pred: Block->getPreds()) {
        std::set<unsigned> PredOutRegs = Pred->getOutRegs();
        std::set<unsigned>::iterator RegPos = PredOutRegs.find(Reg);

        if (RegPos != PredOutRegs.end()) {
          Found = true;
          // Keep the producer with the highest topological index.
          if (topoInd < BlocksStruct.TopDownBlock2Index[Pred->getID()]) {
            topoInd = BlocksStruct.TopDownBlock2Index[Pred->getID()];
          }
        }
      }

      if (!Found)
        continue;

      int PredID = BlocksStruct.TopDownIndex2Block[topoInd];
      ++LiveOutRegsNumUsages[PredID][Reg];
    }
  }

  LastPosHighLatencyParentScheduled.resize(Blocks.size(), 0);
  BlockNumPredsLeft.resize(Blocks.size());
  BlockNumSuccsLeft.resize(Blocks.size());

  for (unsigned i = 0, e = Blocks.size(); i != e; ++i) {
    SIScheduleBlock *Block = Blocks[i];
    BlockNumPredsLeft[i] = Block->getPreds().size();
    BlockNumSuccsLeft[i] = Block->getSuccs().size();
  }

#ifndef NDEBUG
  // Block IDs must match their position in Blocks.
  for (unsigned i = 0, e = Blocks.size(); i != e; ++i) {
    SIScheduleBlock *Block = Blocks[i];
    assert(Block->getID() == i);
  }
#endif

  std::set<unsigned> InRegs = DAG->getInRegs();
  addLiveRegs(InRegs);

  // Increase LiveOutRegsNumUsages for blocks
  // producing registers consumed in another
  // scheduling region.
  for (unsigned Reg : DAG->getOutRegs()) {
    for (unsigned i = 0, e = Blocks.size(); i != e; ++i) {
      // Do reverse traversal
      int ID = BlocksStruct.TopDownIndex2Block[Blocks.size()-1-i];
      SIScheduleBlock *Block = Blocks[ID];
      const std::set<unsigned> &OutRegs = Block->getOutRegs();

      if (OutRegs.find(Reg) == OutRegs.end())
        continue;

      // Only the last (topologically) producing block is charged.
      ++LiveOutRegsNumUsages[ID][Reg];
      break;
    }
  }

  // Fill LiveRegsConsumers for regs that were already
  // defined before scheduling.
  for (unsigned i = 0, e = Blocks.size(); i != e; ++i) {
    SIScheduleBlock *Block = Blocks[i];
    for (unsigned Reg : Block->getInRegs()) {
      bool Found = false;
      for (SIScheduleBlock* Pred: Block->getPreds()) {
        std::set<unsigned> PredOutRegs = Pred->getOutRegs();
        std::set<unsigned>::iterator RegPos = PredOutRegs.find(Reg);

        if (RegPos != PredOutRegs.end()) {
          Found = true;
          break;
        }
      }

      // No producing predecessor: the reg comes from outside the region.
      if (!Found)
        ++LiveRegsConsumers[Reg];
    }
  }

  // Seed the ready list with blocks that have no predecessors.
  for (unsigned i = 0, e = Blocks.size(); i != e; ++i) {
    SIScheduleBlock *Block = Blocks[i];
    if (BlockNumPredsLeft[i] == 0) {
      ReadyBlocks.push_back(Block);
    }
  }

  // Main loop: pick blocks until none are ready.
  while (SIScheduleBlock *Block = pickBlock()) {
    BlocksScheduled.push_back(Block);
    blockScheduled(Block);
  }

  LLVM_DEBUG(dbgs() << "Block Order:"; for (SIScheduleBlock *Block
                                            : BlocksScheduled) {
    dbgs() << ' ' << Block->getID();
  } dbgs() << '\n';);
}
1567
1568
// Latency-oriented block comparison: returns true when \p TryCand should
// replace \p Cand (setting TryCand.Reason). The order of the tie-breakers
// below defines the heuristic's priority and must not be reordered.
bool SIScheduleBlockScheduler::tryCandidateLatency(SIBlockSchedCandidate &Cand,
                                                   SIBlockSchedCandidate &TryCand) {
  // No candidate yet: anything wins.
  if (!Cand.isValid()) {
    TryCand.Reason = NodeOrder;
    return true;
  }

  // Try to hide high latencies.
  if (SISched::tryLess(TryCand.LastPosHighLatParentScheduled,
                 Cand.LastPosHighLatParentScheduled, TryCand, Cand, Latency))
    return true;
  // Schedule high latencies early so you can hide them better.
  if (SISched::tryGreater(TryCand.IsHighLatency, Cand.IsHighLatency,
                          TryCand, Cand, Latency))
    return true;
  // Among high latency blocks, prefer the one with the larger height.
  if (TryCand.IsHighLatency && SISched::tryGreater(TryCand.Height, Cand.Height,
                                                   TryCand, Cand, Depth))
    return true;
  // Prefer blocks unlocking more high latency successors.
  if (SISched::tryGreater(TryCand.NumHighLatencySuccessors,
                          Cand.NumHighLatencySuccessors,
                          TryCand, Cand, Successor))
    return true;
  return false;
}
1592
1593
// Compare two block candidates with a register-usage priority.
// Returns true when TryCand is preferred over Cand; the winning heuristic is
// recorded in TryCand.Reason. Tie-breaker order is significant.
bool SIScheduleBlockScheduler::tryCandidateRegUsage(SIBlockSchedCandidate &Cand,
                                                    SIBlockSchedCandidate &TryCand) {
  // The first valid candidate wins by default.
  if (!Cand.isValid()) {
    TryCand.Reason = NodeOrder;
    return true;
  }

  // Prefer a block that does not increase VGPR usage over one that does.
  if (SISched::tryLess(TryCand.VGPRUsageDiff > 0, Cand.VGPRUsageDiff > 0,
                       TryCand, Cand, RegUsage))
    return true;
  // Prefer a block that has successors to unlock.
  if (SISched::tryGreater(TryCand.NumSuccessors > 0,
                          Cand.NumSuccessors > 0,
                          TryCand, Cand, Successor))
    return true;
  // Break ties on the larger block Height.
  if (SISched::tryGreater(TryCand.Height, Cand.Height, TryCand, Cand, Depth))
    return true;
  // Finally prefer the smaller VGPR usage delta.
  if (SISched::tryLess(TryCand.VGPRUsageDiff, Cand.VGPRUsageDiff,
                       TryCand, Cand, RegUsage))
    return true;
  return false;
}
1614
1615
30
// Pick the next block to schedule among ReadyBlocks, or return nullptr when
// none remain. Updates the max VGPR/SGPR usage statistics as a side effect
// and removes the chosen block from ReadyBlocks.
SIScheduleBlock *SIScheduleBlockScheduler::pickBlock() {
  SIBlockSchedCandidate Cand;
  std::vector<SIScheduleBlock*>::iterator Best;
  SIScheduleBlock *Block;
  if (ReadyBlocks.empty())
    return nullptr;

  // Compute the current register pressure from the live set so the
  // candidate comparison below can react to it.
  DAG->fillVgprSgprCost(LiveRegs.begin(), LiveRegs.end(),
                        VregCurrentUsage, SregCurrentUsage);
  if (VregCurrentUsage > maxVregUsage)
    maxVregUsage = VregCurrentUsage;
  if (SregCurrentUsage > maxSregUsage)
    maxSregUsage = SregCurrentUsage;
  LLVM_DEBUG(dbgs() << "Picking New Blocks\n"; dbgs() << "Available: ";
             for (SIScheduleBlock *Block
                  : ReadyBlocks) dbgs()
             << Block->getID() << ' ';
             dbgs() << "\nCurrent Live:\n";
             for (unsigned Reg
                  : LiveRegs) dbgs()
             << printVRegOrUnit(Reg, DAG->getTRI()) << ' ';
             dbgs() << '\n';
             dbgs() << "Current VGPRs: " << VregCurrentUsage << '\n';
             dbgs() << "Current SGPRs: " << SregCurrentUsage << '\n';);

  Cand.Block = nullptr;
  // Tournament over all ready blocks: TryCand challenges the current best
  // Cand. Since Cand starts invalid, the first iteration always sets a
  // best, so Best is guaranteed initialized when the loop exits.
  for (std::vector<SIScheduleBlock*>::iterator I = ReadyBlocks.begin(),
       E = ReadyBlocks.end(); I != E; ++I) {
    SIBlockSchedCandidate TryCand;
    TryCand.Block = *I;
    TryCand.IsHighLatency = TryCand.Block->isHighLatencyBlock();
    TryCand.VGPRUsageDiff =
      checkRegUsageImpact(TryCand.Block->getInRegs(),
                          TryCand.Block->getOutRegs())[DAG->getVGPRSetID()];
    TryCand.NumSuccessors = TryCand.Block->getSuccs().size();
    TryCand.NumHighLatencySuccessors =
      TryCand.Block->getNumHighLatencySuccessors();
    // Distance (in scheduled blocks) since the last high-latency parent of
    // this block was scheduled, clamped at 0.
    TryCand.LastPosHighLatParentScheduled =
      (unsigned int) std::max<int> (0,
         LastPosHighLatencyParentScheduled[TryCand.Block->getID()] -
           LastPosWaitedHighLatency);
    TryCand.Height = TryCand.Block->Height;
    // Try not to increase VGPR usage too much, else we may spill.
    // 120 is an empirical VGPR-count threshold.
    if (VregCurrentUsage > 120 ||
        Variant != SISchedulerBlockSchedulerVariant::BlockLatencyRegUsage) {
      // Register usage first, latency as the fallback criterion (except for
      // the pure BlockRegUsage variant, which never considers latency).
      if (!tryCandidateRegUsage(Cand, TryCand) &&
          Variant != SISchedulerBlockSchedulerVariant::BlockRegUsage)
        tryCandidateLatency(Cand, TryCand);
    } else {
      // Latency first, register usage as the fallback criterion.
      if (!tryCandidateLatency(Cand, TryCand))
        tryCandidateRegUsage(Cand, TryCand);
    }
    if (TryCand.Reason != NoCand) {
      Cand.setBest(TryCand);
      Best = I;
      LLVM_DEBUG(dbgs() << "Best Current Choice: " << Cand.Block->getID() << ' '
                        << getReasonStr(Cand.Reason) << '\n');
    }
  }

  LLVM_DEBUG(dbgs() << "Picking: " << Cand.Block->getID() << '\n';
             dbgs() << "Is a block with high latency instruction: "
                    << (Cand.IsHighLatency ? "yes\n" : "no\n");
             dbgs() << "Position of last high latency dependency: "
                    << Cand.LastPosHighLatParentScheduled << '\n';
             dbgs() << "VGPRUsageDiff: " << Cand.VGPRUsageDiff << '\n';
             dbgs() << '\n';);

  Block = Cand.Block;
  ReadyBlocks.erase(Best);
  return Block;
}
1687
1688
// Tracking of currently alive registers to determine VGPR Usage.
1689
1690
30
void SIScheduleBlockScheduler::addLiveRegs(std::set<unsigned> &Regs) {
1691
61
  for (unsigned Reg : Regs) {
1692
61
    // For now only track virtual registers.
1693
61
    if (!TargetRegisterInfo::isVirtualRegister(Reg))
1694
16
      continue;
1695
45
    // If not already in the live set, then add it.
1696
45
    (void) LiveRegs.insert(Reg);
1697
45
  }
1698
30
}
1699
1700
// Decrement the consumer count of every register in Regs (the input live
// set of a just-scheduled block), and remove a register from the live set
// once its last consumer has been scheduled.
// NOTE(review): the Block parameter is not used in this body; kept for
// interface stability with callers.
void SIScheduleBlockScheduler::decreaseLiveRegs(SIScheduleBlock *Block,
                                       std::set<unsigned> &Regs) {
  for (unsigned Reg : Regs) {
    // For now only track virtual registers.
    std::set<unsigned>::iterator Pos = LiveRegs.find(Reg);
    // Note: the assert's operator[] lookups may default-insert 0, but the
    // preceding find() check guards the consumer-count access.
    assert (Pos != LiveRegs.end() && // Reg must be live.
               LiveRegsConsumers.find(Reg) != LiveRegsConsumers.end() &&
               LiveRegsConsumers[Reg] >= 1);
    --LiveRegsConsumers[Reg];
    if (LiveRegsConsumers[Reg] == 0)
      LiveRegs.erase(Pos);
  }
}
1713
1714
21
// Notify the successors of Parent that one of their predecessors has been
// scheduled: successors whose predecessor count drops to zero become ready,
// and data successors of a high-latency parent record the current schedule
// position so latency hiding can be measured later.
void SIScheduleBlockScheduler::releaseBlockSuccs(SIScheduleBlock *Parent) {
  bool ParentIsHighLat = Parent->isHighLatencyBlock();
  for (const auto &Succ : Parent->getSuccs()) {
    unsigned SuccID = Succ.first->getID();
    if (--BlockNumPredsLeft[SuccID] == 0)
      ReadyBlocks.push_back(Succ.first);

    if (ParentIsHighLat && Succ.second == SIScheduleBlockLinkKind::Data)
      LastPosHighLatencyParentScheduled[SuccID] = NumBlockScheduled;
  }
}
1724
1725
21
// Update all bookkeeping after Block has been scheduled: live-register
// tracking, successor readiness, consumer counts for the block's outputs,
// and the high-latency wait position.
void SIScheduleBlockScheduler::blockScheduled(SIScheduleBlock *Block) {
  decreaseLiveRegs(Block, Block->getInRegs());
  addLiveRegs(Block->getOutRegs());
  releaseBlockSuccs(Block);
  // Register the number of future consumers for every register this block
  // produces.
  for (const auto &RegP : LiveOutRegsNumUsages[Block->getID()]) {
    // We produce this register, thus it must not be previously alive.
    assert(LiveRegsConsumers.find(RegP.first) == LiveRegsConsumers.end() ||
           LiveRegsConsumers[RegP.first] == 0);
    LiveRegsConsumers[RegP.first] += RegP.second;
  }
  // Advance the "last waited high latency" watermark if this block had a
  // later high-latency parent.
  if (LastPosHighLatencyParentScheduled[Block->getID()] >
        (unsigned)LastPosWaitedHighLatency)
    LastPosWaitedHighLatency =
      LastPosHighLatencyParentScheduled[Block->getID()];
  ++NumBlockScheduled;
}
1744
1745
// Return, for every register pressure set, the pressure delta that would
// result from scheduling a block with the given input and output registers:
// inputs whose last remaining consumer this block is stop being live
// (pressure decreases), and all outputs become live (pressure increases).
std::vector<int>
SIScheduleBlockScheduler::checkRegUsageImpact(std::set<unsigned> &InRegs,
                                     std::set<unsigned> &OutRegs) {
  std::vector<int> DiffSetPressure;
  DiffSetPressure.assign(DAG->getTRI()->getNumRegPressureSets(), 0);

  for (unsigned Reg : InRegs) {
    // For now only track virtual registers.
    if (!TargetRegisterInfo::isVirtualRegister(Reg))
      continue;
    // If other consumers remain after this block, the register stays live
    // and contributes no pressure decrease. Note: operator[] default-inserts
    // a 0 count for registers not yet in the map.
    if (LiveRegsConsumers[Reg] > 1)
      continue;
    PSetIterator PSetI = DAG->getMRI()->getPressureSets(Reg);
    for (; PSetI.isValid(); ++PSetI) {
      DiffSetPressure[*PSetI] -= PSetI.getWeight();
    }
  }

  for (unsigned Reg : OutRegs) {
    // For now only track virtual registers.
    if (!TargetRegisterInfo::isVirtualRegister(Reg))
      continue;
    PSetIterator PSetI = DAG->getMRI()->getPressureSets(Reg);
    for (; PSetI.isValid(); ++PSetI) {
      DiffSetPressure[*PSetI] += PSetI.getWeight();
    }
  }

  return DiffSetPressure;
}
1775
1776
// SIScheduler //
1777
1778
struct SIScheduleBlockResult
1779
SIScheduler::scheduleVariant(SISchedulerBlockCreatorVariant BlockVariant,
1780
9
                             SISchedulerBlockSchedulerVariant ScheduleVariant) {
1781
9
  SIScheduleBlocks Blocks = BlockCreator.getBlocks(BlockVariant);
1782
9
  SIScheduleBlockScheduler Scheduler(DAG, ScheduleVariant, Blocks);
1783
9
  std::vector<SIScheduleBlock*> ScheduledBlocks;
1784
9
  struct SIScheduleBlockResult Res;
1785
9
1786
9
  ScheduledBlocks = Scheduler.getBlocks();
1787
9
1788
30
  for (unsigned b = 0; b < ScheduledBlocks.size(); 
++b21
) {
1789
21
    SIScheduleBlock *Block = ScheduledBlocks[b];
1790
21
    std::vector<SUnit*> SUs = Block->getScheduledUnits();
1791
21
1792
21
    for (SUnit* SU : SUs)
1793
63
      Res.SUs.push_back(SU->NodeNum);
1794
21
  }
1795
9
1796
9
  Res.MaxSGPRUsage = Scheduler.getSGPRUsage();
1797
9
  Res.MaxVGPRUsage = Scheduler.getVGPRUsage();
1798
9
  return Res;
1799
9
}
1800
1801
// SIScheduleDAGMI //
1802
1803
// Construct the SI scheduling DAG. A default GenericScheduler is passed to
// the ScheduleDAGMILive base because several inherited helpers require an
// initialized SchedImpl, even though this scheduler does not run it.
SIScheduleDAGMI::SIScheduleDAGMI(MachineSchedContext *C) :
  ScheduleDAGMILive(C, llvm::make_unique<GenericScheduler>(C)) {
  // Cache target-specific views of the instruction and register info.
  SITII = static_cast<const SIInstrInfo*>(TII);
  SITRI = static_cast<const SIRegisterInfo*>(TRI);

  // Pressure-set IDs used to classify registers as VGPR or SGPR.
  VGPRSetID = SITRI->getVGPRPressureSet();
  SGPRSetID = SITRI->getSGPRPressureSet();
}
1811
1812
4
// Defaulted destructor, defined out of line.
SIScheduleDAGMI::~SIScheduleDAGMI() = default;
1813
1814
// Code adapted from scheduleDAG.cpp
1815
// Does a topological sort over the SUs.
1816
// Both TopDown and BottomUp
1817
9
void SIScheduleDAGMI::topologicalSort() {
1818
9
  Topo.InitDAGTopologicalSorting();
1819
9
1820
9
  TopDownIndex2SU = std::vector<int>(Topo.begin(), Topo.end());
1821
9
  BottomUpIndex2SU = std::vector<int>(Topo.rbegin(), Topo.rend());
1822
9
}
1823
1824
// Move low latencies further from their user without
1825
// increasing SGPR usage (in general)
1826
// This is to be replaced by a better pass that would
1827
// take into account SGPR usage (based on VGPR Usage
1828
// and the corresponding wavefront count), that would
1829
// try to merge groups of loads if it make sense, etc
1830
9
// Hoist low-latency instructions (and the COPYs feeding them) earlier in
// the final schedule, as far from their first user as dependencies allow,
// by rotating entries of ScheduledSUnits while keeping ScheduledSUnitsInv
// consistent. The placement must stay after every predecessor, after the
// last user of a previous low-latency instruction, and after the previous
// low-latency instruction itself.
void SIScheduleDAGMI::moveLowLatencies() {
   unsigned DAGSize = SUnits.size();
   int LastLowLatencyUser = -1;   // Position of the latest low-latency user seen.
   int LastLowLatencyPos = -1;    // Position of the latest low-latency instr placed.

   for (unsigned i = 0, e = ScheduledSUnits.size(); i != e; ++i) {
    SUnit *SU = &SUnits[ScheduledSUnits[i]];
    bool IsLowLatencyUser = false;
    unsigned MinPos = 0;

    // MinPos = one past the latest scheduled predecessor; also detect
    // whether SU consumes a low-latency instruction's result.
    for (SDep& PredDep : SU->Preds) {
      SUnit *Pred = PredDep.getSUnit();
      if (SITII->isLowLatencyInstruction(*Pred->getInstr())) {
        IsLowLatencyUser = true;
      }
      if (Pred->NodeNum >= DAGSize)
        continue;
      unsigned PredPos = ScheduledSUnitsInv[Pred->NodeNum];
      if (PredPos >= MinPos)
        MinPos = PredPos + 1;
    }

    if (SITII->isLowLatencyInstruction(*SU->getInstr())) {
      // Earliest legal slot: after the last low-latency user, after the
      // previous low-latency instruction, and after all predecessors.
      unsigned BestPos = LastLowLatencyUser + 1;
      if ((int)BestPos <= LastLowLatencyPos)
        BestPos = LastLowLatencyPos + 1;
      if (BestPos < MinPos)
        BestPos = MinPos;
      if (BestPos < i) {
        // Rotate [BestPos, i] right by one to insert SU at BestPos,
        // updating the inverse map as entries shift.
        for (unsigned u = i; u > BestPos; --u) {
          ++ScheduledSUnitsInv[ScheduledSUnits[u-1]];
          ScheduledSUnits[u] = ScheduledSUnits[u-1];
        }
        ScheduledSUnits[BestPos] = SU->NodeNum;
        ScheduledSUnitsInv[SU->NodeNum] = BestPos;
      }
      LastLowLatencyPos = BestPos;
      if (IsLowLatencyUser)
        LastLowLatencyUser = BestPos;
    } else if (IsLowLatencyUser) {
      LastLowLatencyUser = i;
    // Moves COPY instructions on which depends
    // the low latency instructions too.
    } else if (SU->getInstr()->getOpcode() == AMDGPU::COPY) {
      bool CopyForLowLat = false;
      for (SDep& SuccDep : SU->Succs) {
        SUnit *Succ = SuccDep.getSUnit();
        if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize)
          continue;
        if (SITII->isLowLatencyInstruction(*Succ->getInstr())) {
          CopyForLowLat = true;
        }
      }
      if (!CopyForLowLat)
        continue;
      if (MinPos < i) {
        // Same right-rotation as above, moving the COPY to the earliest
        // slot its predecessors allow.
        for (unsigned u = i; u > MinPos; --u) {
          ++ScheduledSUnitsInv[ScheduledSUnits[u-1]];
          ScheduledSUnits[u] = ScheduledSUnits[u-1];
        }
        ScheduledSUnits[MinPos] = SU->NodeNum;
        ScheduledSUnitsInv[SU->NodeNum] = MinPos;
      }
    }
  }
}
1896
1897
9
void SIScheduleDAGMI::restoreSULinksLeft() {
1898
72
  for (unsigned i = 0, e = SUnits.size(); i != e; 
++i63
) {
1899
63
    SUnits[i].isScheduled = false;
1900
63
    SUnits[i].WeakPredsLeft = SUnitsLinksBackup[i].WeakPredsLeft;
1901
63
    SUnits[i].NumPredsLeft = SUnitsLinksBackup[i].NumPredsLeft;
1902
63
    SUnits[i].WeakSuccsLeft = SUnitsLinksBackup[i].WeakSuccsLeft;
1903
63
    SUnits[i].NumSuccsLeft = SUnitsLinksBackup[i].NumSuccsLeft;
1904
63
  }
1905
9
}
1906
1907
// Return the Vgpr and Sgpr usage corresponding to some virtual registers.
1908
template<typename _Iterator> void
1909
SIScheduleDAGMI::fillVgprSgprCost(_Iterator First, _Iterator End,
1910
21
                                  unsigned &VgprUsage, unsigned &SgprUsage) {
1911
21
  VgprUsage = 0;
1912
21
  SgprUsage = 0;
1913
58
  for (_Iterator RegI = First; RegI != End; 
++RegI37
) {
1914
37
    unsigned Reg = *RegI;
1915
37
    // For now only track virtual registers
1916
37
    if (!TargetRegisterInfo::isVirtualRegister(Reg))
1917
0
      continue;
1918
37
    PSetIterator PSetI = MRI.getPressureSets(Reg);
1919
113
    for (; PSetI.isValid(); 
++PSetI76
) {
1920
76
      if (*PSetI == VGPRSetID)
1921
16
        VgprUsage += PSetI.getWeight();
1922
60
      else if (*PSetI == SGPRSetID)
1923
21
        SgprUsage += PSetI.getWeight();
1924
76
    }
1925
37
  }
1926
21
}
1927
1928
// Main entry point of the SI scheduler for one region: build the DAG,
// gather latency stats per SUnit, run the block scheduler with one or more
// variants, keep the result with the lowest max VGPR usage when pressure is
// high, then emit the chosen order.
void SIScheduleDAGMI::schedule()
{
  SmallVector<SUnit*, 8> TopRoots, BotRoots;
  SIScheduleBlockResult Best, Temp;
  LLVM_DEBUG(dbgs() << "Preparing Scheduling\n");

  buildDAGWithRegPressure();
  LLVM_DEBUG(dump());

  topologicalSort();
  findRootsAndBiasEdges(TopRoots, BotRoots);
  // We reuse several ScheduleDAGMI and ScheduleDAGMILive
  // functions, but to make them happy we must initialize
  // the default Scheduler implementation (even if we do not
  // run it)
  SchedImpl->initialize(this);
  initQueues(TopRoots, BotRoots);

  // Fill some stats to help scheduling.

  // Snapshot dependency counters so restoreSULinksLeft() can reset them
  // between scheduling variants.
  SUnitsLinksBackup = SUnits;
  IsLowLatencySU.clear();
  LowLatencyOffset.clear();
  IsHighLatencySU.clear();

  IsLowLatencySU.resize(SUnits.size(), 0);
  LowLatencyOffset.resize(SUnits.size(), 0);
  IsHighLatencySU.resize(SUnits.size(), 0);

  // Classify every SUnit as low latency (recording its memory offset when
  // available) or high latency.
  for (unsigned i = 0, e = (unsigned)SUnits.size(); i != e; ++i) {
    SUnit *SU = &SUnits[i];
    const MachineOperand *BaseLatOp;
    int64_t OffLatReg;
    if (SITII->isLowLatencyInstruction(*SU->getInstr())) {
      IsLowLatencySU[i] = 1;
      if (SITII->getMemOperandWithOffset(*SU->getInstr(), BaseLatOp, OffLatReg,
                                         TRI))
        LowLatencyOffset[i] = OffLatReg;
    } else if (SITII->isHighLatencyInstruction(*SU->getInstr()))
      IsHighLatencySU[i] = 1;
  }

  SIScheduler Scheduler(this);
  Best = Scheduler.scheduleVariant(SISchedulerBlockCreatorVariant::LatenciesAlone,
                                   SISchedulerBlockSchedulerVariant::BlockLatencyRegUsage);

  // if VGPR usage is extremely high, try other good performing variants
  // which could lead to lower VGPR usage
  // (180 is an empirical VGPR-count threshold).
  if (Best.MaxVGPRUsage > 180) {
    static const std::pair<SISchedulerBlockCreatorVariant,
                           SISchedulerBlockSchedulerVariant>
        Variants[] = {
      { LatenciesAlone, BlockRegUsageLatency },
//      { LatenciesAlone, BlockRegUsage },
      { LatenciesGrouped, BlockLatencyRegUsage },
//      { LatenciesGrouped, BlockRegUsageLatency },
//      { LatenciesGrouped, BlockRegUsage },
      { LatenciesAlonePlusConsecutive, BlockLatencyRegUsage },
//      { LatenciesAlonePlusConsecutive, BlockRegUsageLatency },
//      { LatenciesAlonePlusConsecutive, BlockRegUsage }
    };
    for (std::pair<SISchedulerBlockCreatorVariant, SISchedulerBlockSchedulerVariant> v : Variants) {
      Temp = Scheduler.scheduleVariant(v.first, v.second);
      if (Temp.MaxVGPRUsage < Best.MaxVGPRUsage)
        Best = Temp;
    }
  }
  // if VGPR usage is still extremely high, we may spill. Try other variants
  // which are less performing, but that could lead to lower VGPR usage.
  // (200 is an empirical VGPR-count threshold.)
  if (Best.MaxVGPRUsage > 200) {
    static const std::pair<SISchedulerBlockCreatorVariant,
                           SISchedulerBlockSchedulerVariant>
        Variants[] = {
//      { LatenciesAlone, BlockRegUsageLatency },
      { LatenciesAlone, BlockRegUsage },
//      { LatenciesGrouped, BlockLatencyRegUsage },
      { LatenciesGrouped, BlockRegUsageLatency },
      { LatenciesGrouped, BlockRegUsage },
//      { LatenciesAlonePlusConsecutive, BlockLatencyRegUsage },
      { LatenciesAlonePlusConsecutive, BlockRegUsageLatency },
      { LatenciesAlonePlusConsecutive, BlockRegUsage }
    };
    for (std::pair<SISchedulerBlockCreatorVariant, SISchedulerBlockSchedulerVariant> v : Variants) {
      Temp = Scheduler.scheduleVariant(v.first, v.second);
      if (Temp.MaxVGPRUsage < Best.MaxVGPRUsage)
        Best = Temp;
    }
  }

  ScheduledSUnits = Best.SUs;
  ScheduledSUnitsInv.resize(SUnits.size());

  // Build the inverse map (SUnit number -> schedule position) needed by
  // moveLowLatencies().
  for (unsigned i = 0, e = (unsigned)SUnits.size(); i != e; ++i) {
    ScheduledSUnitsInv[ScheduledSUnits[i]] = i;
  }

  moveLowLatencies();

  // Tell the outside world about the result of the scheduling.

  assert(TopRPTracker.getPos() == RegionBegin && "bad initial Top tracker");
  TopRPTracker.setPos(CurrentTop);

  for (std::vector<unsigned>::iterator I = ScheduledSUnits.begin(),
       E = ScheduledSUnits.end(); I != E; ++I) {
    SUnit *SU = &SUnits[*I];

    scheduleMI(SU, true);

    LLVM_DEBUG(dbgs() << "Scheduling SU(" << SU->NodeNum << ") "
                      << *SU->getInstr());
  }

  assert(CurrentTop == CurrentBottom && "Nonempty unscheduled zone.");

  placeDebugValues();

  LLVM_DEBUG({
    dbgs() << "*** Final schedule for "
           << printMBBReference(*begin()->getParent()) << " ***\n";
    dumpSchedule();
    dbgs() << '\n';
  });
}