Coverage Report

Created: 2017-10-03 07:32

/Users/buildslave/jenkins/sharedspace/clang-stage2-coverage-R@2/llvm/lib/Target/AMDGPU/SIMachineScheduler.cpp
Line
Count
Source (jump to first uncovered line)
1
//===-- SIMachineScheduler.cpp - SI Scheduler Interface -------------------===//
2
//
3
//                     The LLVM Compiler Infrastructure
4
//
5
// This file is distributed under the University of Illinois Open Source
6
// License. See LICENSE.TXT for details.
7
//
8
//===----------------------------------------------------------------------===//
9
//
10
/// \file
11
/// \brief SI Machine Scheduler interface
12
//
13
//===----------------------------------------------------------------------===//
14
15
#include "SIMachineScheduler.h"
16
#include "AMDGPU.h"
17
#include "SIInstrInfo.h"
18
#include "SIRegisterInfo.h"
19
#include "llvm/ADT/STLExtras.h"
20
#include "llvm/ADT/SmallVector.h"
21
#include "llvm/CodeGen/LiveInterval.h"
22
#include "llvm/CodeGen/LiveIntervalAnalysis.h"
23
#include "llvm/CodeGen/MachineInstr.h"
24
#include "llvm/CodeGen/MachineRegisterInfo.h"
25
#include "llvm/CodeGen/MachineScheduler.h"
26
#include "llvm/CodeGen/RegisterPressure.h"
27
#include "llvm/CodeGen/SlotIndexes.h"
28
#include "llvm/Support/Debug.h"
29
#include "llvm/Support/ErrorHandling.h"
30
#include "llvm/Support/raw_ostream.h"
31
#include "llvm/Target/TargetRegisterInfo.h"
32
#include <algorithm>
33
#include <cassert>
34
#include <map>
35
#include <set>
36
#include <utility>
37
#include <vector>
38
39
using namespace llvm;
40
41
#define DEBUG_TYPE "machine-scheduler"
42
43
// This scheduler implements a different scheduling algorithm than
44
// GenericScheduler.
45
//
46
// There are several specific architecture behaviours that can't be modelled
47
// for GenericScheduler:
48
// . When accessing the result of an SGPR load instruction, you have to wait
49
// for all the SGPR load instructions before your current instruction to
50
// have finished.
51
// . When accessing the result of an VGPR load instruction, you have to wait
52
// for all the VGPR load instructions previous to the VGPR load instruction
53
// you are interested in to finish.
54
// . The less the register pressure, the best load latencies are hidden
55
//
56
// Moreover some specifities (like the fact a lot of instructions in the shader
57
// have few dependencies) makes the generic scheduler have some unpredictable
58
// behaviours. For example when register pressure becomes high, it can either
59
// manage to prevent register pressure from going too high, or it can
60
// increase register pressure even more than if it hadn't taken register
61
// pressure into account.
62
//
63
// Also some other bad behaviours are generated, like loading at the beginning
64
// of the shader a constant in VGPR you won't need until the end of the shader.
65
//
66
// The scheduling problem for SI can distinguish three main parts:
67
// . Hiding high latencies (texture sampling, etc)
68
// . Hiding low latencies (SGPR constant loading, etc)
69
// . Keeping register usage low for better latency hiding and general
70
//   performance
71
//
72
// Some other things can also affect performance, but are hard to predict
73
// (cache usage, the fact the HW can issue several instructions from different
74
// wavefronts if different types, etc)
75
//
76
// This scheduler tries to solve the scheduling problem by dividing it into
77
// simpler sub-problems. It divides the instructions into blocks, schedules
78
// locally inside the blocks where it takes care of low latencies, and then
79
// chooses the order of the blocks by taking care of high latencies.
80
// Dividing the instructions into blocks helps control keeping register
81
// usage low.
82
//
83
// First the instructions are put into blocks.
84
//   We want the blocks help control register usage and hide high latencies
85
//   later. To help control register usage, we typically want all local
86
//   computations, when for example you create a result that can be comsummed
87
//   right away, to be contained in a block. Block inputs and outputs would
88
//   typically be important results that are needed in several locations of
89
//   the shader. Since we do want blocks to help hide high latencies, we want
90
//   the instructions inside the block to have a minimal set of dependencies
91
//   on high latencies. It will make it easy to pick blocks to hide specific
92
//   high latencies.
93
//   The block creation algorithm is divided into several steps, and several
94
//   variants can be tried during the scheduling process.
95
//
96
// Second the order of the instructions inside the blocks is chosen.
97
//   At that step we do take into account only register usage and hiding
98
//   low latency instructions
99
//
100
// Third the block order is chosen, there we try to hide high latencies
101
// and keep register usage low.
102
//
103
// After the third step, a pass is done to improve the hiding of low
104
// latencies.
105
//
106
// Actually when talking about 'low latency' or 'high latency' it includes
107
// both the latency to get the cache (or global mem) data go to the register,
108
// and the bandwidth limitations.
109
// Increasing the number of active wavefronts helps hide the former, but it
110
// doesn't solve the latter, thus why even if wavefront count is high, we have
111
// to try have as many instructions hiding high latencies as possible.
112
// The OpenCL doc says for example latency of 400 cycles for a global mem access,
113
// which is hidden by 10 instructions if the wavefront count is 10.
114
115
// Some figures taken from AMD docs:
116
// Both texture and constant L1 caches are 4-way associative with 64 bytes
117
// lines.
118
// Constant cache is shared with 4 CUs.
119
// For texture sampling, the address generation unit receives 4 texture
120
// addresses per cycle, thus we could expect texture sampling latency to be
121
// equivalent to 4 instructions in the very best case (a VGPR is 64 work items,
122
// instructions in a wavefront group are executed every 4 cycles),
123
// or 16 instructions if the other wavefronts associated to the 3 other VALUs
124
// of the CU do texture sampling too. (Don't take these figures too seriously,
125
// as I'm not 100% sure of the computation)
126
// Data exports should get similar latency.
127
// For constant loading, the cache is shader with 4 CUs.
128
// The doc says "a throughput of 16B/cycle for each of the 4 Compute Unit"
129
// I guess if the other CU don't read the cache, it can go up to 64B/cycle.
130
// It means a simple s_buffer_load should take one instruction to hide, as
131
// well as a s_buffer_loadx2 and potentially a s_buffer_loadx8 if on the same
132
// cache line.
133
//
134
// As of today the driver doesn't preload the constants in cache, thus the
135
// first loads get extra latency. The doc says global memory access can be
136
// 300-600 cycles. We do not specially take that into account when scheduling
137
// As we expect the driver to be able to preload the constants soon.
138
139
// common code //
140
141
#ifndef NDEBUG
142
143
static const char *getReasonStr(SIScheduleCandReason Reason) {
144
  switch (Reason) {
145
  case NoCand:         return "NOCAND";
146
  case RegUsage:       return "REGUSAGE";
147
  case Latency:        return "LATENCY";
148
  case Successor:      return "SUCCESSOR";
149
  case Depth:          return "DEPTH";
150
  case NodeOrder:      return "ORDER";
151
  }
152
  llvm_unreachable("Unknown reason!");
153
}
154
155
#endif
156
157
static bool tryLess(int TryVal, int CandVal,
158
                    SISchedulerCandidate &TryCand,
159
                    SISchedulerCandidate &Cand,
160
17
                    SIScheduleCandReason Reason) {
161
17
  if (
TryVal < CandVal17
) {
162
2
    TryCand.Reason = Reason;
163
2
    return true;
164
2
  }
165
15
  
if (15
TryVal > CandVal15
) {
166
0
    if (Cand.Reason > Reason)
167
0
      Cand.Reason = Reason;
168
0
    return true;
169
0
  }
170
15
  Cand.setRepeat(Reason);
171
15
  return false;
172
15
}
173
174
static bool tryGreater(int TryVal, int CandVal,
175
                       SISchedulerCandidate &TryCand,
176
                       SISchedulerCandidate &Cand,
177
9
                       SIScheduleCandReason Reason) {
178
9
  if (
TryVal > CandVal9
) {
179
1
    TryCand.Reason = Reason;
180
1
    return true;
181
1
  }
182
8
  
if (8
TryVal < CandVal8
) {
183
0
    if (Cand.Reason > Reason)
184
0
      Cand.Reason = Reason;
185
0
    return true;
186
0
  }
187
8
  Cand.setRepeat(Reason);
188
8
  return false;
189
8
}
190
191
// SIScheduleBlock //
192
193
16
void SIScheduleBlock::addUnit(SUnit *SU) {
194
16
  NodeNum2Index[SU->NodeNum] = SUnits.size();
195
16
  SUnits.push_back(SU);
196
16
}
197
198
#ifndef NDEBUG
199
void SIScheduleBlock::traceCandidate(const SISchedCandidate &Cand) {
200
201
  dbgs() << "  SU(" << Cand.SU->NodeNum << ") " << getReasonStr(Cand.Reason);
202
  dbgs() << '\n';
203
}
204
#endif
205
206
void SIScheduleBlock::tryCandidateTopDown(SISchedCandidate &Cand,
207
25
                                          SISchedCandidate &TryCand) {
208
25
  // Initialize the candidate if needed.
209
25
  if (
!Cand.isValid()25
) {
210
16
    TryCand.Reason = NodeOrder;
211
16
    return;
212
16
  }
213
9
214
9
  
if (9
Cand.SGPRUsage > 60 &&
215
0
      tryLess(TryCand.SGPRUsage, Cand.SGPRUsage, TryCand, Cand, RegUsage))
216
0
    return;
217
9
218
9
  // Schedule low latency instructions as top as possible.
219
9
  // Order of priority is:
220
9
  // . Low latency instructions which do not depend on other low latency
221
9
  //   instructions we haven't waited for
222
9
  // . Other instructions which do not depend on low latency instructions
223
9
  //   we haven't waited for
224
9
  // . Low latencies
225
9
  // . All other instructions
226
9
  // Goal is to get: low latency instructions - independent instructions
227
9
  //     - (eventually some more low latency instructions)
228
9
  //     - instructions that depend on the first low latency instructions.
229
9
  // If in the block there is a lot of constant loads, the SGPR usage
230
9
  // could go quite high, thus above the arbitrary limit of 60 will encourage
231
9
  // use the already loaded constants (in order to release some SGPRs) before
232
9
  // loading more.
233
9
  
if (9
tryLess(TryCand.HasLowLatencyNonWaitedParent,
234
9
              Cand.HasLowLatencyNonWaitedParent,
235
9
              TryCand, Cand, SIScheduleCandReason::Depth))
236
0
    return;
237
9
238
9
  
if (9
tryGreater(TryCand.IsLowLatency, Cand.IsLowLatency,
239
9
                 TryCand, Cand, SIScheduleCandReason::Depth))
240
1
    return;
241
8
242
8
  
if (8
TryCand.IsLowLatency &&
243
0
      tryLess(TryCand.LowLatencyOffset, Cand.LowLatencyOffset,
244
0
              TryCand, Cand, SIScheduleCandReason::Depth))
245
0
    return;
246
8
247
8
  
if (8
tryLess(TryCand.VGPRUsage, Cand.VGPRUsage, TryCand, Cand, RegUsage)8
)
248
2
    return;
249
6
250
6
  // Fall through to original instruction order.
251
6
  
if (6
TryCand.SU->NodeNum < Cand.SU->NodeNum6
) {
252
0
    TryCand.Reason = NodeOrder;
253
0
  }
254
25
}
255
256
16
SUnit* SIScheduleBlock::pickNode() {
257
16
  SISchedCandidate TopCand;
258
16
259
25
  for (SUnit* SU : TopReadySUs) {
260
25
    SISchedCandidate TryCand;
261
25
    std::vector<unsigned> pressure;
262
25
    std::vector<unsigned> MaxPressure;
263
25
    // Predict register usage after this instruction.
264
25
    TryCand.SU = SU;
265
25
    TopRPTracker.getDownwardPressure(SU->getInstr(), pressure, MaxPressure);
266
25
    TryCand.SGPRUsage = pressure[DAG->getSGPRSetID()];
267
25
    TryCand.VGPRUsage = pressure[DAG->getVGPRSetID()];
268
25
    TryCand.IsLowLatency = DAG->IsLowLatencySU[SU->NodeNum];
269
25
    TryCand.LowLatencyOffset = DAG->LowLatencyOffset[SU->NodeNum];
270
25
    TryCand.HasLowLatencyNonWaitedParent =
271
25
      HasLowLatencyNonWaitedParent[NodeNum2Index[SU->NodeNum]];
272
25
    tryCandidateTopDown(TopCand, TryCand);
273
25
    if (TryCand.Reason != NoCand)
274
19
      TopCand.setBest(TryCand);
275
25
  }
276
16
277
16
  return TopCand.SU;
278
16
}
279
280
281
// Schedule something valid.
282
5
void SIScheduleBlock::fastSchedule() {
283
5
  TopReadySUs.clear();
284
5
  if (Scheduled)
285
0
    undoSchedule();
286
5
287
16
  for (SUnit* SU : SUnits) {
288
16
    if (!SU->NumPredsLeft)
289
10
      TopReadySUs.push_back(SU);
290
16
  }
291
5
292
21
  while (
!TopReadySUs.empty()21
) {
293
16
    SUnit *SU = TopReadySUs[0];
294
16
    ScheduledSUnits.push_back(SU);
295
16
    nodeScheduled(SU);
296
16
  }
297
5
298
5
  Scheduled = true;
299
5
}
300
301
// Returns if the register was set between first and last.
302
static bool isDefBetween(unsigned Reg,
303
                           SlotIndex First, SlotIndex Last,
304
                           const MachineRegisterInfo *MRI,
305
8
                           const LiveIntervals *LIS) {
306
8
  for (MachineRegisterInfo::def_instr_iterator
307
8
       UI = MRI->def_instr_begin(Reg),
308
9
       UE = MRI->def_instr_end(); 
UI != UE9
;
++UI1
) {
309
9
    const MachineInstr* MI = &*UI;
310
9
    if (MI->isDebugValue())
311
0
      continue;
312
9
    SlotIndex InstSlot = LIS->getInstructionIndex(*MI).getRegSlot();
313
9
    if (
InstSlot >= First && 9
InstSlot <= Last9
)
314
8
      return true;
315
9
  }
316
0
  return false;
317
8
}
318
319
void SIScheduleBlock::initRegPressure(MachineBasicBlock::iterator BeginBlock,
320
5
                                      MachineBasicBlock::iterator EndBlock) {
321
5
  IntervalPressure Pressure, BotPressure;
322
5
  RegPressureTracker RPTracker(Pressure), BotRPTracker(BotPressure);
323
5
  LiveIntervals *LIS = DAG->getLIS();
324
5
  MachineRegisterInfo *MRI = DAG->getMRI();
325
5
  DAG->initRPTracker(TopRPTracker);
326
5
  DAG->initRPTracker(BotRPTracker);
327
5
  DAG->initRPTracker(RPTracker);
328
5
329
5
  // Goes though all SU. RPTracker captures what had to be alive for the SUs
330
5
  // to execute, and what is still alive at the end.
331
16
  for (SUnit* SU : ScheduledSUnits) {
332
16
    RPTracker.setPos(SU->getInstr());
333
16
    RPTracker.advance();
334
16
  }
335
5
336
5
  // Close the RPTracker to finalize live ins/outs.
337
5
  RPTracker.closeRegion();
338
5
339
5
  // Initialize the live ins and live outs.
340
5
  TopRPTracker.addLiveRegs(RPTracker.getPressure().LiveInRegs);
341
5
  BotRPTracker.addLiveRegs(RPTracker.getPressure().LiveOutRegs);
342
5
343
5
  // Do not Track Physical Registers, because it messes up.
344
15
  for (const auto &RegMaskPair : RPTracker.getPressure().LiveInRegs) {
345
15
    if (TargetRegisterInfo::isVirtualRegister(RegMaskPair.RegUnit))
346
8
      LiveInRegs.insert(RegMaskPair.RegUnit);
347
15
  }
348
5
  LiveOutRegs.clear();
349
5
  // There is several possibilities to distinguish:
350
5
  // 1) Reg is not input to any instruction in the block, but is output of one
351
5
  // 2) 1) + read in the block and not needed after it
352
5
  // 3) 1) + read in the block but needed in another block
353
5
  // 4) Reg is input of an instruction but another block will read it too
354
5
  // 5) Reg is input of an instruction and then rewritten in the block.
355
5
  //    result is not read in the block (implies used in another block)
356
5
  // 6) Reg is input of an instruction and then rewritten in the block.
357
5
  //    result is read in the block and not needed in another block
358
5
  // 7) Reg is input of an instruction and then rewritten in the block.
359
5
  //    result is read in the block but also needed in another block
360
5
  // LiveInRegs will contains all the regs in situation 4, 5, 6, 7
361
5
  // We want LiveOutRegs to contain only Regs whose content will be read after
362
5
  // in another block, and whose content was written in the current block,
363
5
  // that is we want it to get 1, 3, 5, 7
364
5
  // Since we made the MIs of a block to be packed all together before
365
5
  // scheduling, then the LiveIntervals were correct, and the RPTracker was
366
5
  // able to correctly handle 5 vs 6, 2 vs 3.
367
5
  // (Note: This is not sufficient for RPTracker to not do mistakes for case 4)
368
5
  // The RPTracker's LiveOutRegs has 1, 3, (some correct or incorrect)4, 5, 7
369
5
  // Comparing to LiveInRegs is not sufficient to differenciate 4 vs 5, 7
370
5
  // The use of findDefBetween removes the case 4.
371
8
  for (const auto &RegMaskPair : RPTracker.getPressure().LiveOutRegs) {
372
8
    unsigned Reg = RegMaskPair.RegUnit;
373
8
    if (TargetRegisterInfo::isVirtualRegister(Reg) &&
374
8
        isDefBetween(Reg, LIS->getInstructionIndex(*BeginBlock).getRegSlot(),
375
8
                     LIS->getInstructionIndex(*EndBlock).getRegSlot(), MRI,
376
8
                     LIS)) {
377
8
      LiveOutRegs.insert(Reg);
378
8
    }
379
8
  }
380
5
381
5
  // Pressure = sum_alive_registers register size
382
5
  // Internally llvm will represent some registers as big 128 bits registers
383
5
  // for example, but they actually correspond to 4 actual 32 bits registers.
384
5
  // Thus Pressure is not equal to num_alive_registers * constant.
385
5
  LiveInPressure = TopPressure.MaxSetPressure;
386
5
  LiveOutPressure = BotPressure.MaxSetPressure;
387
5
388
5
  // Prepares TopRPTracker for top down scheduling.
389
5
  TopRPTracker.closeTop();
390
5
}
391
392
void SIScheduleBlock::schedule(MachineBasicBlock::iterator BeginBlock,
393
5
                               MachineBasicBlock::iterator EndBlock) {
394
5
  if (!Scheduled)
395
0
    fastSchedule();
396
5
397
5
  // PreScheduling phase to set LiveIn and LiveOut.
398
5
  initRegPressure(BeginBlock, EndBlock);
399
5
  undoSchedule();
400
5
401
5
  // Schedule for real now.
402
5
403
5
  TopReadySUs.clear();
404
5
405
16
  for (SUnit* SU : SUnits) {
406
16
    if (!SU->NumPredsLeft)
407
10
      TopReadySUs.push_back(SU);
408
16
  }
409
5
410
21
  while (
!TopReadySUs.empty()21
) {
411
16
    SUnit *SU = pickNode();
412
16
    ScheduledSUnits.push_back(SU);
413
16
    TopRPTracker.setPos(SU->getInstr());
414
16
    TopRPTracker.advance();
415
16
    nodeScheduled(SU);
416
16
  }
417
5
418
5
  // TODO: compute InternalAdditionnalPressure.
419
5
  InternalAdditionnalPressure.resize(TopPressure.MaxSetPressure.size());
420
5
421
5
  // Check everything is right.
422
#ifndef NDEBUG
423
  assert(SUnits.size() == ScheduledSUnits.size() &&
424
            TopReadySUs.empty());
425
  for (SUnit* SU : SUnits) {
426
    assert(SU->isScheduled &&
427
              SU->NumPredsLeft == 0);
428
  }
429
#endif
430
431
5
  Scheduled = true;
432
5
}
433
434
5
void SIScheduleBlock::undoSchedule() {
435
16
  for (SUnit* SU : SUnits) {
436
16
    SU->isScheduled = false;
437
24
    for (SDep& Succ : SU->Succs) {
438
24
      if (BC->isSUInBlock(Succ.getSUnit(), ID))
439
12
        undoReleaseSucc(SU, &Succ);
440
24
    }
441
16
  }
442
5
  HasLowLatencyNonWaitedParent.assign(SUnits.size(), 0);
443
5
  ScheduledSUnits.clear();
444
5
  Scheduled = false;
445
5
}
446
447
12
void SIScheduleBlock::undoReleaseSucc(SUnit *SU, SDep *SuccEdge) {
448
12
  SUnit *SuccSU = SuccEdge->getSUnit();
449
12
450
12
  if (
SuccEdge->isWeak()12
) {
451
0
    ++SuccSU->WeakPredsLeft;
452
0
    return;
453
0
  }
454
12
  ++SuccSU->NumPredsLeft;
455
12
}
456
457
33
void SIScheduleBlock::releaseSucc(SUnit *SU, SDep *SuccEdge) {
458
33
  SUnit *SuccSU = SuccEdge->getSUnit();
459
33
460
33
  if (
SuccEdge->isWeak()33
) {
461
0
    --SuccSU->WeakPredsLeft;
462
0
    return;
463
0
  }
464
#ifndef NDEBUG
465
  if (SuccSU->NumPredsLeft == 0) {
466
    dbgs() << "*** Scheduling failed! ***\n";
467
    SuccSU->dump(DAG);
468
    dbgs() << " has been released too many times!\n";
469
    llvm_unreachable(nullptr);
470
  }
471
#endif
472
473
33
  --SuccSU->NumPredsLeft;
474
33
}
475
476
/// Release Successors of the SU that are in the block or not.
477
48
void SIScheduleBlock::releaseSuccessors(SUnit *SU, bool InOrOutBlock) {
478
72
  for (SDep& Succ : SU->Succs) {
479
72
    SUnit *SuccSU = Succ.getSUnit();
480
72
481
72
    if (SuccSU->NodeNum >= DAG->SUnits.size())
482
9
        continue;
483
63
484
63
    
if (63
BC->isSUInBlock(SuccSU, ID) != InOrOutBlock63
)
485
30
      continue;
486
33
487
33
    releaseSucc(SU, &Succ);
488
33
    if (
SuccSU->NumPredsLeft == 0 && 33
InOrOutBlock16
)
489
12
      TopReadySUs.push_back(SuccSU);
490
72
  }
491
48
}
492
493
32
void SIScheduleBlock::nodeScheduled(SUnit *SU) {
494
32
  // Is in TopReadySUs
495
32
  assert (!SU->NumPredsLeft);
496
32
  std::vector<SUnit *>::iterator I = llvm::find(TopReadySUs, SU);
497
32
  if (
I == TopReadySUs.end()32
) {
498
0
    dbgs() << "Data Structure Bug in SI Scheduler\n";
499
0
    llvm_unreachable(nullptr);
500
0
  }
501
32
  TopReadySUs.erase(I);
502
32
503
32
  releaseSuccessors(SU, true);
504
32
  // Scheduling this node will trigger a wait,
505
32
  // thus propagate to other instructions that they do not need to wait either.
506
32
  if (HasLowLatencyNonWaitedParent[NodeNum2Index[SU->NodeNum]])
507
0
    HasLowLatencyNonWaitedParent.assign(SUnits.size(), 0);
508
32
509
32
  if (
DAG->IsLowLatencySU[SU->NodeNum]32
) {
510
4
     for (SDep& Succ : SU->Succs) {
511
4
      std::map<unsigned, unsigned>::iterator I =
512
4
        NodeNum2Index.find(Succ.getSUnit()->NodeNum);
513
4
      if (I != NodeNum2Index.end())
514
0
        HasLowLatencyNonWaitedParent[I->second] = 1;
515
4
    }
516
4
  }
517
32
  SU->isScheduled = true;
518
32
}
519
520
5
void SIScheduleBlock::finalizeUnits() {
521
5
  // We remove links from outside blocks to enable scheduling inside the block.
522
16
  for (SUnit* SU : SUnits) {
523
16
    releaseSuccessors(SU, false);
524
16
    if (DAG->IsHighLatencySU[SU->NodeNum])
525
1
      HighLatencyBlock = true;
526
16
  }
527
5
  HasLowLatencyNonWaitedParent.resize(SUnits.size(), 0);
528
5
}
529
530
// we maintain ascending order of IDs
531
9
void SIScheduleBlock::addPred(SIScheduleBlock *Pred) {
532
9
  unsigned PredID = Pred->getID();
533
9
534
9
  // Check if not already predecessor.
535
6
  for (SIScheduleBlock* P : Preds) {
536
6
    if (PredID == P->getID())
537
5
      return;
538
4
  }
539
4
  Preds.push_back(Pred);
540
4
541
4
  assert(none_of(Succs,
542
4
                 [=](std::pair<SIScheduleBlock*,
543
4
                     SIScheduleBlockLinkKind> S) {
544
4
                   return PredID == S.first->getID();
545
4
                    }) &&
546
4
         "Loop in the Block Graph!");
547
4
}
548
549
void SIScheduleBlock::addSucc(SIScheduleBlock *Succ,
550
9
                              SIScheduleBlockLinkKind Kind) {
551
9
  unsigned SuccID = Succ->getID();
552
9
553
9
  // Check if not already predecessor.
554
6
  for (std::pair<SIScheduleBlock*, SIScheduleBlockLinkKind> &S : Succs) {
555
6
    if (
SuccID == S.first->getID()6
) {
556
5
      if (S.second == SIScheduleBlockLinkKind::NoData &&
557
0
          Kind == SIScheduleBlockLinkKind::Data)
558
0
        S.second = Kind;
559
5
      return;
560
5
    }
561
4
  }
562
4
  
if (4
Succ->isHighLatencyBlock()4
)
563
0
    ++NumHighLatencySuccessors;
564
9
  Succs.push_back(std::make_pair(Succ, Kind));
565
9
566
9
  assert(none_of(Preds,
567
9
                 [=](SIScheduleBlock *P) { return SuccID == P->getID(); }) &&
568
9
         "Loop in the Block Graph!");
569
9
}
570
571
#ifndef NDEBUG
572
void SIScheduleBlock::printDebug(bool full) {
573
  dbgs() << "Block (" << ID << ")\n";
574
  if (!full)
575
    return;
576
577
  dbgs() << "\nContains High Latency Instruction: "
578
         << HighLatencyBlock << '\n';
579
  dbgs() << "\nDepends On:\n";
580
  for (SIScheduleBlock* P : Preds) {
581
    P->printDebug(false);
582
  }
583
584
  dbgs() << "\nSuccessors:\n";
585
  for (std::pair<SIScheduleBlock*, SIScheduleBlockLinkKind> S : Succs) {
586
    if (S.second == SIScheduleBlockLinkKind::Data)
587
      dbgs() << "(Data Dep) ";
588
    S.first->printDebug(false);
589
  }
590
591
  if (Scheduled) {
592
    dbgs() << "LiveInPressure " << LiveInPressure[DAG->getSGPRSetID()] << ' '
593
           << LiveInPressure[DAG->getVGPRSetID()] << '\n';
594
    dbgs() << "LiveOutPressure " << LiveOutPressure[DAG->getSGPRSetID()] << ' '
595
           << LiveOutPressure[DAG->getVGPRSetID()] << "\n\n";
596
    dbgs() << "LiveIns:\n";
597
    for (unsigned Reg : LiveInRegs)
598
      dbgs() << PrintVRegOrUnit(Reg, DAG->getTRI()) << ' ';
599
600
    dbgs() << "\nLiveOuts:\n";
601
    for (unsigned Reg : LiveOutRegs)
602
      dbgs() << PrintVRegOrUnit(Reg, DAG->getTRI()) << ' ';
603
  }
604
605
  dbgs() << "\nInstructions:\n";
606
  if (!Scheduled) {
607
    for (SUnit* SU : SUnits) {
608
      SU->dump(DAG);
609
    }
610
  } else {
611
    for (SUnit* SU : SUnits) {
612
      SU->dump(DAG);
613
    }
614
  }
615
616
  dbgs() << "///////////////////////\n";
617
}
618
#endif
619
620
// SIScheduleBlockCreator //
621
622
SIScheduleBlockCreator::SIScheduleBlockCreator(SIScheduleDAGMI *DAG) :
623
2
DAG(DAG) {
624
2
}
625
626
2
SIScheduleBlockCreator::~SIScheduleBlockCreator() = default;
627
628
SIScheduleBlocks
629
2
SIScheduleBlockCreator::getBlocks(SISchedulerBlockCreatorVariant BlockVariant) {
630
2
  std::map<SISchedulerBlockCreatorVariant, SIScheduleBlocks>::iterator B =
631
2
    Blocks.find(BlockVariant);
632
2
  if (
B == Blocks.end()2
) {
633
2
    SIScheduleBlocks Res;
634
2
    createBlocksForVariant(BlockVariant);
635
2
    topologicalSort();
636
2
    scheduleInsideBlocks();
637
2
    fillStats();
638
2
    Res.Blocks = CurrentBlocks;
639
2
    Res.TopDownIndex2Block = TopDownIndex2Block;
640
2
    Res.TopDownBlock2Index = TopDownBlock2Index;
641
2
    Blocks[BlockVariant] = Res;
642
2
    return Res;
643
0
  } else {
644
0
    return B->second;
645
0
  }
646
0
}
647
648
87
bool SIScheduleBlockCreator::isSUInBlock(SUnit *SU, unsigned ID) {
649
87
  if (SU->NodeNum >= DAG->SUnits.size())
650
3
    return false;
651
84
  return CurrentBlocks[Node2CurrentBlock[SU->NodeNum]]->getID() == ID;
652
84
}
653
654
2
void SIScheduleBlockCreator::colorHighLatenciesAlone() {
655
2
  unsigned DAGSize = DAG->SUnits.size();
656
2
657
18
  for (unsigned i = 0, e = DAGSize; 
i != e18
;
++i16
) {
658
16
    SUnit *SU = &DAG->SUnits[i];
659
16
    if (
DAG->IsHighLatencySU[SU->NodeNum]16
) {
660
1
      CurrentColoring[SU->NodeNum] = NextReservedID++;
661
1
    }
662
16
  }
663
2
}
664
665
static bool
666
0
hasDataDependencyPred(const SUnit &SU, const SUnit &FromSU) {
667
0
  for (const auto &PredDep : SU.Preds) {
668
0
    if (PredDep.getSUnit() == &FromSU &&
669
0
        PredDep.getKind() == llvm::SDep::Data)
670
0
      return true;
671
0
  }
672
0
  return false;
673
0
}
674
675
0
void SIScheduleBlockCreator::colorHighLatenciesGroups() {
676
0
  unsigned DAGSize = DAG->SUnits.size();
677
0
  unsigned NumHighLatencies = 0;
678
0
  unsigned GroupSize;
679
0
  int Color = NextReservedID;
680
0
  unsigned Count = 0;
681
0
  std::set<unsigned> FormingGroup;
682
0
683
0
  for (unsigned i = 0, e = DAGSize; 
i != e0
;
++i0
) {
684
0
    SUnit *SU = &DAG->SUnits[i];
685
0
    if (DAG->IsHighLatencySU[SU->NodeNum])
686
0
      ++NumHighLatencies;
687
0
  }
688
0
689
0
  if (NumHighLatencies == 0)
690
0
    return;
691
0
692
0
  
if (0
NumHighLatencies <= 60
)
693
0
    GroupSize = 2;
694
0
  else 
if (0
NumHighLatencies <= 120
)
695
0
    GroupSize = 3;
696
0
  else
697
0
    GroupSize = 4;
698
0
699
0
  for (unsigned SUNum : DAG->TopDownIndex2SU) {
700
0
    const SUnit &SU = DAG->SUnits[SUNum];
701
0
    if (
DAG->IsHighLatencySU[SU.NodeNum]0
) {
702
0
      unsigned CompatibleGroup = true;
703
0
      int ProposedColor = Color;
704
0
      std::vector<int> AdditionalElements;
705
0
706
0
      // We don't want to put in the same block
707
0
      // two high latency instructions that depend
708
0
      // on each other.
709
0
      // One way would be to check canAddEdge
710
0
      // in both directions, but that currently is not
711
0
      // enough because there the high latency order is
712
0
      // enforced (via links).
713
0
      // Instead, look at the dependencies between the
714
0
      // high latency instructions and deduce if it is
715
0
      // a data dependency or not.
716
0
      for (unsigned j : FormingGroup) {
717
0
        bool HasSubGraph;
718
0
        std::vector<int> SubGraph;
719
0
        // By construction (topological order), if SU and
720
0
        // DAG->SUnits[j] are linked, DAG->SUnits[j] is neccessary
721
0
        // in the parent graph of SU.
722
#ifndef NDEBUG
723
        SubGraph = DAG->GetTopo()->GetSubGraph(SU, DAG->SUnits[j],
724
                                               HasSubGraph);
725
        assert(!HasSubGraph);
726
#endif
727
        SubGraph = DAG->GetTopo()->GetSubGraph(DAG->SUnits[j], SU,
728
0
                                               HasSubGraph);
729
0
        if (!HasSubGraph)
730
0
          continue; // No dependencies between each other
731
0
        else 
if (0
SubGraph.size() > 50
) {
732
0
          // Too many elements would be required to be added to the block.
733
0
          CompatibleGroup = false;
734
0
          break;
735
0
        }
736
0
        else {
737
0
          // Check the type of dependency
738
0
          for (unsigned k : SubGraph) {
739
0
            // If in the path to join the two instructions,
740
0
            // there is another high latency instruction,
741
0
            // or instructions colored for another block
742
0
            // abort the merge.
743
0
            if (DAG->IsHighLatencySU[k] ||
744
0
                (CurrentColoring[k] != ProposedColor &&
745
0
                 
CurrentColoring[k] != 00
)) {
746
0
              CompatibleGroup = false;
747
0
              break;
748
0
            }
749
0
            // If one of the SU in the subgraph depends on the result of SU j,
750
0
            // there'll be a data dependency.
751
0
            
if (0
hasDataDependencyPred(DAG->SUnits[k], DAG->SUnits[j])0
) {
752
0
              CompatibleGroup = false;
753
0
              break;
754
0
            }
755
0
          }
756
0
          if (!CompatibleGroup)
757
0
            break;
758
0
          // Same check for the SU
759
0
          
if (0
hasDataDependencyPred(SU, DAG->SUnits[j])0
) {
760
0
            CompatibleGroup = false;
761
0
            break;
762
0
          }
763
0
          // Add all the required instructions to the block
764
0
          // These cannot live in another block (because they
765
0
          // depend (order dependency) on one of the
766
0
          // instruction in the block, and are required for the
767
0
          // high latency instruction we add.
768
0
          AdditionalElements.insert(AdditionalElements.end(),
769
0
                                    SubGraph.begin(), SubGraph.end());
770
0
        }
771
0
      }
772
0
      if (
CompatibleGroup0
) {
773
0
        FormingGroup.insert(SU.NodeNum);
774
0
        for (unsigned j : AdditionalElements)
775
0
          CurrentColoring[j] = ProposedColor;
776
0
        CurrentColoring[SU.NodeNum] = ProposedColor;
777
0
        ++Count;
778
0
      }
779
0
      // Found one incompatible instruction,
780
0
      // or has filled a big enough group.
781
0
      // -> start a new one.
782
0
      if (
!CompatibleGroup0
) {
783
0
        FormingGroup.clear();
784
0
        Color = ++NextReservedID;
785
0
        ProposedColor = Color;
786
0
        FormingGroup.insert(SU.NodeNum);
787
0
        CurrentColoring[SU.NodeNum] = ProposedColor;
788
0
        Count = 0;
789
0
      } else 
if (0
Count == GroupSize0
) {
790
0
        FormingGroup.clear();
791
0
        Color = ++NextReservedID;
792
0
        ProposedColor = Color;
793
0
        Count = 0;
794
0
      }
795
0
    }
796
0
  }
797
0
}
798
799
2
// Assign two auxiliary colorings to every SU: one propagated top-down and one
// bottom-up, where a node's color identifies the exact combination of
// reserved (high-latency) groups it transitively depends on (top-down) or
// feeds into (bottom-up). Color 0 means "no reserved dependency".
// Both colorings are later combined by colorAccordingToReservedDependencies().
void SIScheduleBlockCreator::colorComputeReservedDependencies() {
  unsigned DAGSize = DAG->SUnits.size();
  // Maps a set of parent/child colors to the unique color representing that
  // combination, so equal combinations share a color.
  std::map<std::set<unsigned>, unsigned> ColorCombinations;

  CurrentTopDownReservedDependencyColoring.clear();
  CurrentBottomUpReservedDependencyColoring.clear();

  CurrentTopDownReservedDependencyColoring.resize(DAGSize, 0);
  CurrentBottomUpReservedDependencyColoring.resize(DAGSize, 0);

  // Traverse TopDown, and give different colors to SUs depending
  // on which combination of High Latencies they depend on.

  for (unsigned SUNum : DAG->TopDownIndex2SU) {
    SUnit *SU = &DAG->SUnits[SUNum];
    std::set<unsigned> SUColors;

    // Already given (reserved/high-latency SU): its coloring is its own color.
    if (CurrentColoring[SU->NodeNum]) {
      CurrentTopDownReservedDependencyColoring[SU->NodeNum] =
        CurrentColoring[SU->NodeNum];
      continue;
    }

    // Collect the non-zero top-down colors of all strong, in-range
    // predecessors.
    for (SDep& PredDep : SU->Preds) {
      SUnit *Pred = PredDep.getSUnit();
      if (PredDep.isWeak() || Pred->NodeNum >= DAGSize)
        continue;
      if (CurrentTopDownReservedDependencyColoring[Pred->NodeNum] > 0)
        SUColors.insert(CurrentTopDownReservedDependencyColoring[Pred->NodeNum]);
    }
    // Color 0 by default.
    if (SUColors.empty())
      continue;
    // Same color than parents.
    // A single parent color > DAGSize is a non-reserved combination color;
    // inherit it directly instead of creating a new one.
    if (SUColors.size() == 1 && *SUColors.begin() > DAGSize)
      CurrentTopDownReservedDependencyColoring[SU->NodeNum] =
        *SUColors.begin();
    else {
      std::map<std::set<unsigned>, unsigned>::iterator Pos =
        ColorCombinations.find(SUColors);
      if (Pos != ColorCombinations.end()) {
          CurrentTopDownReservedDependencyColoring[SU->NodeNum] = Pos->second;
      } else {
        // First time this combination of parent colors is seen: allocate a
        // fresh non-reserved color for it.
        CurrentTopDownReservedDependencyColoring[SU->NodeNum] =
          NextNonReservedID;
        ColorCombinations[SUColors] = NextNonReservedID++;
      }
    }
  }

  ColorCombinations.clear();

  // Same as before, but BottomUp.

  for (unsigned SUNum : DAG->BottomUpIndex2SU) {
    SUnit *SU = &DAG->SUnits[SUNum];
    std::set<unsigned> SUColors;

    // Already given.
    if (CurrentColoring[SU->NodeNum]) {
      CurrentBottomUpReservedDependencyColoring[SU->NodeNum] =
        CurrentColoring[SU->NodeNum];
      continue;
    }

    // Collect the non-zero bottom-up colors of all strong, in-range
    // successors.
    for (SDep& SuccDep : SU->Succs) {
      SUnit *Succ = SuccDep.getSUnit();
      if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize)
        continue;
      if (CurrentBottomUpReservedDependencyColoring[Succ->NodeNum] > 0)
        SUColors.insert(CurrentBottomUpReservedDependencyColoring[Succ->NodeNum]);
    }
    // Keep color 0.
    if (SUColors.empty())
      continue;
    // Same color than parents.
    if (SUColors.size() == 1 && *SUColors.begin() > DAGSize)
      CurrentBottomUpReservedDependencyColoring[SU->NodeNum] =
        *SUColors.begin();
    else {
      std::map<std::set<unsigned>, unsigned>::iterator Pos =
        ColorCombinations.find(SUColors);
      if (Pos != ColorCombinations.end()) {
        CurrentBottomUpReservedDependencyColoring[SU->NodeNum] = Pos->second;
      } else {
        CurrentBottomUpReservedDependencyColoring[SU->NodeNum] =
          NextNonReservedID;
        ColorCombinations[SUColors] = NextNonReservedID++;
      }
    }
  }
}
892
893
2
void SIScheduleBlockCreator::colorAccordingToReservedDependencies() {
894
2
  unsigned DAGSize = DAG->SUnits.size();
895
2
  std::map<std::pair<unsigned, unsigned>, unsigned> ColorCombinations;
896
2
897
2
  // Every combination of colors given by the top down
898
2
  // and bottom up Reserved node dependency
899
2
900
18
  for (unsigned i = 0, e = DAGSize; 
i != e18
;
++i16
) {
901
16
    SUnit *SU = &DAG->SUnits[i];
902
16
    std::pair<unsigned, unsigned> SUColors;
903
16
904
16
    // High latency instructions: already given.
905
16
    if (CurrentColoring[SU->NodeNum])
906
1
      continue;
907
15
908
15
    SUColors.first = CurrentTopDownReservedDependencyColoring[SU->NodeNum];
909
15
    SUColors.second = CurrentBottomUpReservedDependencyColoring[SU->NodeNum];
910
15
911
15
    std::map<std::pair<unsigned, unsigned>, unsigned>::iterator Pos =
912
15
      ColorCombinations.find(SUColors);
913
15
    if (
Pos != ColorCombinations.end()15
) {
914
12
      CurrentColoring[SU->NodeNum] = Pos->second;
915
15
    } else {
916
3
      CurrentColoring[SU->NodeNum] = NextNonReservedID;
917
3
      ColorCombinations[SUColors] = NextNonReservedID++;
918
3
    }
919
16
  }
920
2
}
921
922
2
// For SUs that have no reserved dependency at all (both auxiliary colorings
// are 0), try to merge each one into the single block of its successors.
// Updates are staged in PendingColoring and committed at the end so that
// decisions made during the traversal see a consistent snapshot.
void SIScheduleBlockCreator::colorEndsAccordingToDependencies() {
  unsigned DAGSize = DAG->SUnits.size();
  std::vector<int> PendingColoring = CurrentColoring;

  assert(DAGSize >= 1 &&
         CurrentBottomUpReservedDependencyColoring.size() == DAGSize &&
         CurrentTopDownReservedDependencyColoring.size() == DAGSize);
  // If there is no reserved block at all, do nothing. We don't want
  // everything in one block.
  if (*std::max_element(CurrentBottomUpReservedDependencyColoring.begin(),
                        CurrentBottomUpReservedDependencyColoring.end()) == 0 &&
      *std::max_element(CurrentTopDownReservedDependencyColoring.begin(),
                        CurrentTopDownReservedDependencyColoring.end()) == 0)
    return;

  for (unsigned SUNum : DAG->BottomUpIndex2SU) {
    SUnit *SU = &DAG->SUnits[SUNum];
    std::set<unsigned> SUColors;
    std::set<unsigned> SUColorsPending;

    // Colors <= DAGSize are reserved IDs; leave those SUs alone.
    if (CurrentColoring[SU->NodeNum] <= (int)DAGSize)
      continue;

    // Only SUs with no reserved dependency in either direction are
    // candidates for merging.
    if (CurrentBottomUpReservedDependencyColoring[SU->NodeNum] > 0 ||
        CurrentTopDownReservedDependencyColoring[SU->NodeNum] > 0)
      continue;

    for (SDep& SuccDep : SU->Succs) {
      SUnit *Succ = SuccDep.getSUnit();
      if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize)
        continue;
      // SUColors tracks only successors that do have reserved dependencies;
      // SUColorsPending tracks all successors under the staged coloring.
      if (CurrentBottomUpReservedDependencyColoring[Succ->NodeNum] > 0 ||
          CurrentTopDownReservedDependencyColoring[Succ->NodeNum] > 0)
        SUColors.insert(CurrentColoring[Succ->NodeNum]);
      SUColorsPending.insert(PendingColoring[Succ->NodeNum]);
    }
    // If there is only one child/parent block, and that block
    // is not among the ones we are removing in this path, then
    // merge the instruction to that block
    if (SUColors.size() == 1 && SUColorsPending.size() == 1)
      PendingColoring[SU->NodeNum] = *SUColors.begin();
    else // TODO: Attribute new colors depending on color
         // combination of children.
      PendingColoring[SU->NodeNum] = NextNonReservedID++;
  }
  // Commit the staged decisions.
  CurrentColoring = PendingColoring;
}
969
970
971
0
// Ensure every color forms one consecutive run in NodeNum order: if a color
// reappears after having been interrupted by another color, the reappearing
// SUs are recolored (fresh color, or the immediately preceding SU's color)
// so no group is split into non-adjacent pieces.
void SIScheduleBlockCreator::colorForceConsecutiveOrderInGroup() {
  unsigned DAGSize = DAG->SUnits.size();
  unsigned PreviousColor;
  // Colors whose run has already ended at least once.
  std::set<unsigned> SeenColors;

  if (DAGSize <= 1)
    return;

  PreviousColor = CurrentColoring[0];

  for (unsigned i = 1, e = DAGSize; i != e; ++i) {
    SUnit *SU = &DAG->SUnits[i];
    unsigned CurrentColor = CurrentColoring[i];
    // Saved before PreviousColor is advanced; used below to decide whether
    // this SU continues the previous run.
    unsigned PreviousColorSave = PreviousColor;
    assert(i == SU->NodeNum);

    // A color change means the previous color's run has ended.
    if (CurrentColor != PreviousColor)
      SeenColors.insert(PreviousColor);
    PreviousColor = CurrentColor;

    // Reserved colors (<= DAGSize) are never rewritten here.
    if (CurrentColoring[SU->NodeNum] <= (int)DAGSize)
      continue;

    // First run of this color: nothing to fix yet.
    if (SeenColors.find(CurrentColor) == SeenColors.end())
      continue;

    // The color resurfaced after being interrupted: detach this SU from the
    // earlier run, either starting a new group or joining its neighbor.
    if (PreviousColorSave != CurrentColor)
      CurrentColoring[i] = NextNonReservedID++;
    else
      CurrentColoring[i] = CurrentColoring[i-1];
  }
}
1003
1004
2
void SIScheduleBlockCreator::colorMergeConstantLoadsNextGroup() {
1005
2
  unsigned DAGSize = DAG->SUnits.size();
1006
2
1007
16
  for (unsigned SUNum : DAG->BottomUpIndex2SU) {
1008
16
    SUnit *SU = &DAG->SUnits[SUNum];
1009
16
    std::set<unsigned> SUColors;
1010
16
1011
16
    if (CurrentColoring[SU->NodeNum] <= (int)DAGSize)
1012
1
      continue;
1013
15
1014
15
    // No predecessor: Vgpr constant loading.
1015
15
    // Low latency instructions usually have a predecessor (the address)
1016
15
    
if (15
SU->Preds.size() > 0 && 15
!DAG->IsLowLatencySU[SU->NodeNum]10
)
1017
8
      continue;
1018
7
1019
7
    
for (SDep& SuccDep : SU->Succs) 7
{
1020
9
      SUnit *Succ = SuccDep.getSUnit();
1021
9
      if (
SuccDep.isWeak() || 9
Succ->NodeNum >= DAGSize9
)
1022
2
        continue;
1023
7
      SUColors.insert(CurrentColoring[Succ->NodeNum]);
1024
7
    }
1025
7
    if (SUColors.size() == 1)
1026
4
      CurrentColoring[SU->NodeNum] = *SUColors.begin();
1027
16
  }
1028
2
}
1029
1030
0
void SIScheduleBlockCreator::colorMergeIfPossibleNextGroup() {
1031
0
  unsigned DAGSize = DAG->SUnits.size();
1032
0
1033
0
  for (unsigned SUNum : DAG->BottomUpIndex2SU) {
1034
0
    SUnit *SU = &DAG->SUnits[SUNum];
1035
0
    std::set<unsigned> SUColors;
1036
0
1037
0
    if (CurrentColoring[SU->NodeNum] <= (int)DAGSize)
1038
0
      continue;
1039
0
1040
0
    
for (SDep& SuccDep : SU->Succs) 0
{
1041
0
       SUnit *Succ = SuccDep.getSUnit();
1042
0
      if (
SuccDep.isWeak() || 0
Succ->NodeNum >= DAGSize0
)
1043
0
        continue;
1044
0
      SUColors.insert(CurrentColoring[Succ->NodeNum]);
1045
0
    }
1046
0
    if (SUColors.size() == 1)
1047
0
      CurrentColoring[SU->NodeNum] = *SUColors.begin();
1048
0
  }
1049
0
}
1050
1051
2
void SIScheduleBlockCreator::colorMergeIfPossibleNextGroupOnlyForReserved() {
1052
2
  unsigned DAGSize = DAG->SUnits.size();
1053
2
1054
16
  for (unsigned SUNum : DAG->BottomUpIndex2SU) {
1055
16
    SUnit *SU = &DAG->SUnits[SUNum];
1056
16
    std::set<unsigned> SUColors;
1057
16
1058
16
    if (CurrentColoring[SU->NodeNum] <= (int)DAGSize)
1059
1
      continue;
1060
15
1061
15
    
for (SDep& SuccDep : SU->Succs) 15
{
1062
21
       SUnit *Succ = SuccDep.getSUnit();
1063
21
      if (
SuccDep.isWeak() || 21
Succ->NodeNum >= DAGSize21
)
1064
3
        continue;
1065
18
      SUColors.insert(CurrentColoring[Succ->NodeNum]);
1066
18
    }
1067
15
    if (
SUColors.size() == 1 && 15
*SUColors.begin() <= DAGSize9
)
1068
0
      CurrentColoring[SU->NodeNum] = *SUColors.begin();
1069
16
  }
1070
2
}
1071
1072
0
// Dissolve singleton groups: an SU that is the only member of its color is
// merged into its successors' group when they all share one (different)
// color. ColorCount is kept up to date during the traversal so later SUs see
// the effect of earlier merges.
void SIScheduleBlockCreator::colorMergeIfPossibleSmallGroupsToNextGroup() {
  unsigned DAGSize = DAG->SUnits.size();
  // Number of SUs currently carrying each color.
  std::map<unsigned, unsigned> ColorCount;

  // First pass: count group sizes.
  for (unsigned SUNum : DAG->BottomUpIndex2SU) {
    SUnit *SU = &DAG->SUnits[SUNum];
    unsigned color = CurrentColoring[SU->NodeNum];
     ++ColorCount[color];
  }

  // Second pass: merge singletons into their successors' group.
  for (unsigned SUNum : DAG->BottomUpIndex2SU) {
    SUnit *SU = &DAG->SUnits[SUNum];
    unsigned color = CurrentColoring[SU->NodeNum];
    std::set<unsigned> SUColors;

    // Reserved colors (<= DAGSize) are never merged away.
    if (CurrentColoring[SU->NodeNum] <= (int)DAGSize)
      continue;

    // Only groups of size 1 are candidates.
    if (ColorCount[color] > 1)
      continue;

    for (SDep& SuccDep : SU->Succs) {
       SUnit *Succ = SuccDep.getSUnit();
      if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize)
        continue;
      SUColors.insert(CurrentColoring[Succ->NodeNum]);
    }
    // Merge only when all successors agree on one color different from the
    // SU's own; keep the counts consistent for subsequent iterations.
    if (SUColors.size() == 1 && *SUColors.begin() != color) {
      --ColorCount[color];
      CurrentColoring[SU->NodeNum] = *SUColors.begin();
      ++ColorCount[*SUColors.begin()];
    }
  }
}
1106
1107
0
// Placeholder: intended to split oversized blocks into smaller ones, but not
// implemented yet. Currently a no-op.
void SIScheduleBlockCreator::cutHugeBlocks() {
  // TODO
}
1110
1111
2
void SIScheduleBlockCreator::regroupNoUserInstructions() {
1112
2
  unsigned DAGSize = DAG->SUnits.size();
1113
2
  int GroupID = NextNonReservedID++;
1114
2
1115
16
  for (unsigned SUNum : DAG->BottomUpIndex2SU) {
1116
16
    SUnit *SU = &DAG->SUnits[SUNum];
1117
16
    bool hasSuccessor = false;
1118
16
1119
16
    if (CurrentColoring[SU->NodeNum] <= (int)DAGSize)
1120
1
      continue;
1121
15
1122
15
    
for (SDep& SuccDep : SU->Succs) 15
{
1123
21
       SUnit *Succ = SuccDep.getSUnit();
1124
21
      if (
SuccDep.isWeak() || 21
Succ->NodeNum >= DAGSize21
)
1125
3
        continue;
1126
18
      hasSuccessor = true;
1127
18
    }
1128
15
    if (!hasSuccessor)
1129
4
      CurrentColoring[SU->NodeNum] = GroupID;
1130
16
  }
1131
2
}
1132
1133
2
// Group all EXP (export) instructions into one block so they are scheduled
// last, which is better for performance — but only when that grouping is
// safe, i.e. no non-EXP instruction would be forced into the group.
void SIScheduleBlockCreator::colorExports() {
  unsigned ExportColor = NextNonReservedID++;
  SmallVector<unsigned, 8> ExpGroup;

  // Put all exports together in a block.
  // The block will naturally end up being scheduled last,
  // thus putting exports at the end of the schedule, which
  // is better for performance.
  // However we must ensure, for safety, the exports can be put
  // together in the same block without any other instruction.
  // This could happen, for example, when scheduling after regalloc
  // if reloading a spilled register from memory using the same
  // register than used in a previous export.
  // If that happens, do not regroup the exports.
  for (unsigned SUNum : DAG->TopDownIndex2SU) {
    const SUnit &SU = DAG->SUnits[SUNum];
    if (SIInstrInfo::isEXP(*SU.getInstr())) {
      // Check the EXP can be added to the group safely,
      // ie without needing any other instruction.
      // The EXP is allowed to depend on other EXP
      // (they will be in the same group).
      for (unsigned j : ExpGroup) {
        bool HasSubGraph;
        std::vector<int> SubGraph;
        // By construction (topological order), if SU and
        // DAG->SUnits[j] are linked, DAG->SUnits[j] is neccessary
        // in the parent graph of SU.
#ifndef NDEBUG
        // Debug-only sanity check: SU must never be an ancestor of the
        // earlier export SUnits[j].
        SubGraph = DAG->GetTopo()->GetSubGraph(SU, DAG->SUnits[j],
                                               HasSubGraph);
        assert(!HasSubGraph);
#endif
        SubGraph = DAG->GetTopo()->GetSubGraph(DAG->SUnits[j], SU,
                                               HasSubGraph);
        if (!HasSubGraph)
          continue; // No dependencies between each other

        // SubGraph contains all the instructions required
        // between EXP SUnits[j] and EXP SU.
        for (unsigned k : SubGraph) {
          if (!SIInstrInfo::isEXP(*DAG->SUnits[k].getInstr()))
            // Other instructions than EXP would be required in the group.
            // Abort the groupping.
            return;
        }
      }

      ExpGroup.push_back(SUNum);
    }
  }

  // The group can be formed. Give the color.
  for (unsigned j : ExpGroup)
    CurrentColoring[j] = ExportColor;
}
1188
1189
2
// Driver of block creation: run the coloring pipeline selected by
// BlockVariant, then materialize one SIScheduleBlock per final color, and
// build the inter-block dependency edges from the SU-level edges.
// NOTE(review): the order of the coloring passes below is significant — each
// pass refines the CurrentColoring produced by the previous ones.
void SIScheduleBlockCreator::createBlocksForVariant(SISchedulerBlockCreatorVariant BlockVariant) {
  unsigned DAGSize = DAG->SUnits.size();
  // Maps a final color to the index of its block in CurrentBlocks.
  std::map<unsigned,unsigned> RealID;

  CurrentBlocks.clear();
  CurrentColoring.clear();
  CurrentColoring.resize(DAGSize, 0);
  Node2CurrentBlock.clear();

  // Restore links previous scheduling variant has overridden.
  DAG->restoreSULinksLeft();

  // Reserved IDs start at 1; non-reserved IDs start above DAGSize so the two
  // ranges never collide (tests elsewhere compare colors against DAGSize).
  NextReservedID = 1;
  NextNonReservedID = DAGSize + 1;

  DEBUG(dbgs() << "Coloring the graph\n");

  if (BlockVariant == SISchedulerBlockCreatorVariant::LatenciesGrouped)
    colorHighLatenciesGroups();
  else
    colorHighLatenciesAlone();
  colorComputeReservedDependencies();
  colorAccordingToReservedDependencies();
  colorEndsAccordingToDependencies();
  if (BlockVariant == SISchedulerBlockCreatorVariant::LatenciesAlonePlusConsecutive)
    colorForceConsecutiveOrderInGroup();
  regroupNoUserInstructions();
  colorMergeConstantLoadsNextGroup();
  colorMergeIfPossibleNextGroupOnlyForReserved();
  colorExports();

  // Put SUs of same color into same block
  Node2CurrentBlock.resize(DAGSize, -1);
  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
    SUnit *SU = &DAG->SUnits[i];
    unsigned Color = CurrentColoring[SU->NodeNum];
    if (RealID.find(Color) == RealID.end()) {
      // First SU with this color: create the block lazily.
      int ID = CurrentBlocks.size();
      BlockPtrs.push_back(llvm::make_unique<SIScheduleBlock>(DAG, this, ID));
      CurrentBlocks.push_back(BlockPtrs.rbegin()->get());
      RealID[Color] = ID;
    }
    CurrentBlocks[RealID[Color]]->addUnit(SU);
    Node2CurrentBlock[SU->NodeNum] = RealID[Color];
  }

  // Build dependencies between blocks.
  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
    SUnit *SU = &DAG->SUnits[i];
    int SUID = Node2CurrentBlock[i];
     for (SDep& SuccDep : SU->Succs) {
       SUnit *Succ = SuccDep.getSUnit();
      if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize)
        continue;
      // Cross-block edge: record it, tagging control edges as NoData.
      if (Node2CurrentBlock[Succ->NodeNum] != SUID)
        CurrentBlocks[SUID]->addSucc(CurrentBlocks[Node2CurrentBlock[Succ->NodeNum]],
                                     SuccDep.isCtrl() ? NoData : Data);
    }
    for (SDep& PredDep : SU->Preds) {
      SUnit *Pred = PredDep.getSUnit();
      if (PredDep.isWeak() || Pred->NodeNum >= DAGSize)
        continue;
      if (Node2CurrentBlock[Pred->NodeNum] != SUID)
        CurrentBlocks[SUID]->addPred(CurrentBlocks[Node2CurrentBlock[Pred->NodeNum]]);
    }
  }

  // Free root and leafs of all blocks to enable scheduling inside them.
  for (unsigned i = 0, e = CurrentBlocks.size(); i != e; ++i) {
    SIScheduleBlock *Block = CurrentBlocks[i];
    Block->finalizeUnits();
  }
  DEBUG(
    dbgs() << "Blocks created:\n\n";
    for (unsigned i = 0, e = CurrentBlocks.size(); i != e; ++i) {
      SIScheduleBlock *Block = CurrentBlocks[i];
      Block->printDebug(true);
    }
  );
}
1269
1270
// Two functions taken from Codegen/MachineScheduler.cpp
1271
1272
/// Non-const version.
1273
/// Non-const version.
/// Advance I past any debug-value instructions, stopping at End or at the
/// first non-debug instruction.
static MachineBasicBlock::iterator
nextIfDebug(MachineBasicBlock::iterator I,
            MachineBasicBlock::const_iterator End) {
  while (I != End && I->isDebugValue())
    ++I;
  return I;
}
1282
1283
2
// Topologically sort the blocks using Kahn's algorithm driven from the
// leaves (blocks with no successors), filling TopDownIndex2Block,
// TopDownBlock2Index and BottomUpIndex2Block.
void SIScheduleBlockCreator::topologicalSort() {
  unsigned DAGSize = CurrentBlocks.size();
  std::vector<int> WorkList;

  DEBUG(dbgs() << "Topological Sort\n");

  WorkList.reserve(DAGSize);
  TopDownIndex2Block.resize(DAGSize);
  TopDownBlock2Index.resize(DAGSize);
  BottomUpIndex2Block.resize(DAGSize);

  // TopDownBlock2Index temporarily holds the outstanding successor count of
  // each block; it is overwritten with the final index as blocks are popped.
  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
    SIScheduleBlock *Block = CurrentBlocks[i];
    unsigned Degree = Block->getSuccs().size();
    TopDownBlock2Index[i] = Degree;
    if (Degree == 0) {
      WorkList.push_back(i);
    }
  }

  // Assign indices from the back: leaves get the highest indices, so the
  // resulting order is top-down.
  int Id = DAGSize;
  while (!WorkList.empty()) {
    int i = WorkList.back();
    SIScheduleBlock *Block = CurrentBlocks[i];
    WorkList.pop_back();
    TopDownBlock2Index[i] = --Id;
    TopDownIndex2Block[Id] = i;
    for (SIScheduleBlock* Pred : Block->getPreds()) {
      // A predecessor becomes ready once all its successors are placed.
      if (!--TopDownBlock2Index[Pred->getID()])
        WorkList.push_back(Pred->getID());
    }
  }

#ifndef NDEBUG
  // Check correctness of the ordering.
  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
    SIScheduleBlock *Block = CurrentBlocks[i];
    for (SIScheduleBlock* Pred : Block->getPreds()) {
      assert(TopDownBlock2Index[i] > TopDownBlock2Index[Pred->getID()] &&
      "Wrong Top Down topological sorting");
    }
  }
#endif

  // The bottom-up order is simply the reverse of the top-down order.
  BottomUpIndex2Block = std::vector<int>(TopDownIndex2Block.rbegin(),
                                         TopDownIndex2Block.rend());
}
1330
1331
2
// Schedule the instructions inside each block in three phases:
// 1) a fast per-block schedule to get liveness right,
// 2) physically reorder the MachineInstrs so each block occupies a
//    contiguous instruction range (needed for register usage tracking),
//    then run the real per-block schedule on that range,
// 3) restore the original instruction order (which also works around a
//    LiveIntervals::handleMove issue — see comment below).
void SIScheduleBlockCreator::scheduleInsideBlocks() {
  unsigned DAGSize = CurrentBlocks.size();

  DEBUG(dbgs() << "\nScheduling Blocks\n\n");

  // We do schedule a valid scheduling such that a Block corresponds
  // to a range of instructions.
  DEBUG(dbgs() << "First phase: Fast scheduling for Reg Liveness\n");
  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
    SIScheduleBlock *Block = CurrentBlocks[i];
    Block->fastSchedule();
  }

  // Note: the following code, and the part restoring previous position
  // is by far the most expensive operation of the Scheduler.

  // Do not update CurrentTop.
  MachineBasicBlock::iterator CurrentTopFastSched = DAG->getCurrentTop();
  // PosOld/PosNew record, per moved instruction, its position before and
  // after the splice, so phase 3 can undo the reordering exactly.
  std::vector<MachineBasicBlock::iterator> PosOld;
  std::vector<MachineBasicBlock::iterator> PosNew;
  PosOld.reserve(DAG->SUnits.size());
  PosNew.reserve(DAG->SUnits.size());

  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
    int BlockIndice = TopDownIndex2Block[i];
    SIScheduleBlock *Block = CurrentBlocks[BlockIndice];
    std::vector<SUnit*> SUs = Block->getScheduledUnits();

    for (SUnit* SU : SUs) {
      MachineInstr *MI = SU->getInstr();
      MachineBasicBlock::iterator Pos = MI;
      PosOld.push_back(Pos);
      if (&*CurrentTopFastSched == MI) {
        // Already in place: just advance past it (skipping debug values).
        PosNew.push_back(Pos);
        CurrentTopFastSched = nextIfDebug(++CurrentTopFastSched,
                                          DAG->getCurrentBottom());
      } else {
        // Update the instruction stream.
        DAG->getBB()->splice(CurrentTopFastSched, DAG->getBB(), MI);

        // Update LiveIntervals.
        // Note: Moving all instructions and calling handleMove every time
        // is the most cpu intensive operation of the scheduler.
        // It would gain a lot if there was a way to recompute the
        // LiveIntervals for the entire scheduling region.
        DAG->getLIS()->handleMove(*MI, /*UpdateFlags=*/true);
        PosNew.push_back(CurrentTopFastSched);
      }
    }
  }

  // Now we have Block of SUs == Block of MI.
  // We do the final schedule for the instructions inside the block.
  // The property that all the SUs of the Block are grouped together as MI
  // is used for correct reg usage tracking.
  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
    SIScheduleBlock *Block = CurrentBlocks[i];
    std::vector<SUnit*> SUs = Block->getScheduledUnits();
    Block->schedule((*SUs.begin())->getInstr(), (*SUs.rbegin())->getInstr());
  }

  DEBUG(dbgs() << "Restoring MI Pos\n");
  // Restore old ordering (which prevents a LIS->handleMove bug).
  // Iterate in reverse so earlier restores don't invalidate later positions.
  for (unsigned i = PosOld.size(), e = 0; i != e; --i) {
    MachineBasicBlock::iterator POld = PosOld[i-1];
    MachineBasicBlock::iterator PNew = PosNew[i-1];
    if (PNew != POld) {
      // Update the instruction stream.
      DAG->getBB()->splice(POld, DAG->getBB(), PNew);

      // Update LiveIntervals.
      DAG->getLIS()->handleMove(*POld, /*UpdateFlags=*/true);
    }
  }

  DEBUG(
    for (unsigned i = 0, e = CurrentBlocks.size(); i != e; ++i) {
      SIScheduleBlock *Block = CurrentBlocks[i];
      Block->printDebug(true);
    }
  );
}
1413
1414
2
void SIScheduleBlockCreator::fillStats() {
1415
2
  unsigned DAGSize = CurrentBlocks.size();
1416
2
1417
7
  for (unsigned i = 0, e = DAGSize; 
i != e7
;
++i5
) {
1418
5
    int BlockIndice = TopDownIndex2Block[i];
1419
5
    SIScheduleBlock *Block = CurrentBlocks[BlockIndice];
1420
5
    if (Block->getPreds().empty())
1421
2
      Block->Depth = 0;
1422
3
    else {
1423
3
      unsigned Depth = 0;
1424
4
      for (SIScheduleBlock *Pred : Block->getPreds()) {
1425
4
        if (Depth < Pred->Depth + Pred->getCost())
1426
3
          Depth = Pred->Depth + Pred->getCost();
1427
4
      }
1428
3
      Block->Depth = Depth;
1429
3
    }
1430
5
  }
1431
2
1432
7
  for (unsigned i = 0, e = DAGSize; 
i != e7
;
++i5
) {
1433
5
    int BlockIndice = BottomUpIndex2Block[i];
1434
5
    SIScheduleBlock *Block = CurrentBlocks[BlockIndice];
1435
5
    if (Block->getSuccs().empty())
1436
2
      Block->Height = 0;
1437
3
    else {
1438
3
      unsigned Height = 0;
1439
3
      for (const auto &Succ : Block->getSuccs())
1440
4
        Height = std::max(Height, Succ.first->Height + Succ.first->getCost());
1441
3
      Block->Height = Height;
1442
3
    }
1443
5
  }
1444
2
}
1445
1446
// SIScheduleBlockScheduler //
1447
1448
// Build the block-level scheduling state (live-out usage counts, ready list)
// and immediately run the block scheduler, producing BlocksScheduled.
SIScheduleBlockScheduler::SIScheduleBlockScheduler(SIScheduleDAGMI *DAG,
                                                   SISchedulerBlockSchedulerVariant Variant,
                                                   SIScheduleBlocks  BlocksStruct) :
  DAG(DAG), Variant(Variant), Blocks(BlocksStruct.Blocks),
  LastPosWaitedHighLatency(0), NumBlockScheduled(0), VregCurrentUsage(0),
  SregCurrentUsage(0), maxVregUsage(0), maxSregUsage(0) {

  // Fill the usage of every output
  // Warning: while by construction we always have a link between two blocks
  // when one needs a result from the other, the number of users of an output
  // is not the sum of child blocks having as input the same virtual register.
  // Here is an example. A produces x and y. B eats x and produces x'.
  // C eats x' and y. The register coalescer may have attributed the same
  // virtual register to x and x'.
  // To count accurately, we do a topological sort. In case the register is
  // found for several parents, we increment the usage of the one with the
  // highest topological index.
  LiveOutRegsNumUsages.resize(Blocks.size());
  for (unsigned i = 0, e = Blocks.size(); i != e; ++i) {
    SIScheduleBlock *Block = Blocks[i];
    for (unsigned Reg : Block->getInRegs()) {
      bool Found = false;
      // Topological index of the latest predecessor producing Reg.
      int topoInd = -1;
      for (SIScheduleBlock* Pred: Block->getPreds()) {
        std::set<unsigned> PredOutRegs = Pred->getOutRegs();
        std::set<unsigned>::iterator RegPos = PredOutRegs.find(Reg);

        if (RegPos != PredOutRegs.end()) {
          Found = true;
          if (topoInd < BlocksStruct.TopDownBlock2Index[Pred->getID()]) {
            topoInd = BlocksStruct.TopDownBlock2Index[Pred->getID()];
          }
        }
      }

      if (!Found)
        continue;

      // Attribute the use to the topologically-latest producer.
      int PredID = BlocksStruct.TopDownIndex2Block[topoInd];
      ++LiveOutRegsNumUsages[PredID][Reg];
    }
  }

  LastPosHighLatencyParentScheduled.resize(Blocks.size(), 0);
  BlockNumPredsLeft.resize(Blocks.size());
  BlockNumSuccsLeft.resize(Blocks.size());

  for (unsigned i = 0, e = Blocks.size(); i != e; ++i) {
    SIScheduleBlock *Block = Blocks[i];
    BlockNumPredsLeft[i] = Block->getPreds().size();
    BlockNumSuccsLeft[i] = Block->getSuccs().size();
  }

#ifndef NDEBUG
  for (unsigned i = 0, e = Blocks.size(); i != e; ++i) {
    SIScheduleBlock *Block = Blocks[i];
    assert(Block->getID() == i);
  }
#endif

  std::set<unsigned> InRegs = DAG->getInRegs();
  addLiveRegs(InRegs);

  // Increase LiveOutRegsNumUsages for blocks
  // producing registers consumed in another
  // scheduling region.
  for (unsigned Reg : DAG->getOutRegs()) {
    for (unsigned i = 0, e = Blocks.size(); i != e; ++i) {
      // Do reverse traversal
      int ID = BlocksStruct.TopDownIndex2Block[Blocks.size()-1-i];
      SIScheduleBlock *Block = Blocks[ID];
      const std::set<unsigned> &OutRegs = Block->getOutRegs();

      if (OutRegs.find(Reg) == OutRegs.end())
        continue;

      // Only the last (topologically) producer is charged.
      ++LiveOutRegsNumUsages[ID][Reg];
      break;
    }
  }

  // Fill LiveRegsConsumers for regs that were already
  // defined before scheduling.
  for (unsigned i = 0, e = Blocks.size(); i != e; ++i) {
    SIScheduleBlock *Block = Blocks[i];
    for (unsigned Reg : Block->getInRegs()) {
      bool Found = false;
      for (SIScheduleBlock* Pred: Block->getPreds()) {
        std::set<unsigned> PredOutRegs = Pred->getOutRegs();
        std::set<unsigned>::iterator RegPos = PredOutRegs.find(Reg);

        if (RegPos != PredOutRegs.end()) {
          Found = true;
          break;
        }
      }

      // No predecessor produces Reg: it was live before this region.
      if (!Found)
        ++LiveRegsConsumers[Reg];
    }
  }

  // Seed the ready list with blocks that have no predecessors.
  for (unsigned i = 0, e = Blocks.size(); i != e; ++i) {
    SIScheduleBlock *Block = Blocks[i];
    if (BlockNumPredsLeft[i] == 0) {
      ReadyBlocks.push_back(Block);
    }
  }

  // Greedily pick blocks until none remain ready.
  while (SIScheduleBlock *Block = pickBlock()) {
    BlocksScheduled.push_back(Block);
    blockScheduled(Block);
  }

  DEBUG(
    dbgs() << "Block Order:";
    for (SIScheduleBlock* Block : BlocksScheduled) {
      dbgs() << ' ' << Block->getID();
    }
    dbgs() << '\n';
  );
}
1570
1571
// Latency-oriented candidate comparison: returns true when TryCand should
// replace Cand, setting TryCand.Reason. The order of the tie-breakers below
// is the heuristic's priority order — do not reorder.
bool SIScheduleBlockScheduler::tryCandidateLatency(SIBlockSchedCandidate &Cand,
                                                   SIBlockSchedCandidate &TryCand) {
  // First candidate examined always wins by default.
  if (!Cand.isValid()) {
    TryCand.Reason = NodeOrder;
    return true;
  }

  // Try to hide high latencies.
  if (tryLess(TryCand.LastPosHighLatParentScheduled,
              Cand.LastPosHighLatParentScheduled, TryCand, Cand, Latency))
    return true;
  // Schedule high latencies early so you can hide them better.
  if (tryGreater(TryCand.IsHighLatency, Cand.IsHighLatency,
                 TryCand, Cand, Latency))
    return true;
  // Among high-latency candidates, prefer the taller one (more work below
  // it to overlap with).
  if (TryCand.IsHighLatency && tryGreater(TryCand.Height, Cand.Height,
                                          TryCand, Cand, Depth))
    return true;
  // Prefer candidates unlocking more high-latency successors.
  if (tryGreater(TryCand.NumHighLatencySuccessors,
                 Cand.NumHighLatencySuccessors,
                 TryCand, Cand, Successor))
    return true;
  return false;
}
1595
1596
bool SIScheduleBlockScheduler::tryCandidateRegUsage(SIBlockSchedCandidate &Cand,
1597
0
                                                    SIBlockSchedCandidate &TryCand) {
1598
0
  if (
!Cand.isValid()0
) {
1599
0
    TryCand.Reason = NodeOrder;
1600
0
    return true;
1601
0
  }
1602
0
1603
0
  
if (0
tryLess(TryCand.VGPRUsageDiff > 0, Cand.VGPRUsageDiff > 0,
1604
0
              TryCand, Cand, RegUsage))
1605
0
    return true;
1606
0
  
if (0
tryGreater(TryCand.NumSuccessors > 0,
1607
0
                 Cand.NumSuccessors > 0,
1608
0
                 TryCand, Cand, Successor))
1609
0
    return true;
1610
0
  
if (0
tryGreater(TryCand.Height, Cand.Height, TryCand, Cand, Depth)0
)
1611
0
    return true;
1612
0
  
if (0
tryLess(TryCand.VGPRUsageDiff, Cand.VGPRUsageDiff,
1613
0
              TryCand, Cand, RegUsage))
1614
0
    return true;
1615
0
  return false;
1616
0
}
1617
1618
7
// Pick the next block to schedule among ReadyBlocks, remove it from the
// ready list and return it. Returns nullptr when no block is ready.
SIScheduleBlock *SIScheduleBlockScheduler::pickBlock() {
  SIBlockSchedCandidate Cand;
  std::vector<SIScheduleBlock*>::iterator BestIt;

  if (ReadyBlocks.empty())
    return nullptr;

  // Refresh the register usage of the currently live set; track the maxima.
  DAG->fillVgprSgprCost(LiveRegs.begin(), LiveRegs.end(),
                        VregCurrentUsage, SregCurrentUsage);
  maxVregUsage = std::max(maxVregUsage, VregCurrentUsage);
  maxSregUsage = std::max(maxSregUsage, SregCurrentUsage);
  DEBUG(
    dbgs() << "Picking New Blocks\n";
    dbgs() << "Available: ";
    for (SIScheduleBlock* Block : ReadyBlocks)
      dbgs() << Block->getID() << ' ';
    dbgs() << "\nCurrent Live:\n";
    for (unsigned Reg : LiveRegs)
      dbgs() << PrintVRegOrUnit(Reg, DAG->getTRI()) << ' ';
    dbgs() << '\n';
    dbgs() << "Current VGPRs: " << VregCurrentUsage << '\n';
    dbgs() << "Current SGPRs: " << SregCurrentUsage << '\n';
  );

  Cand.Block = nullptr;
  for (std::vector<SIScheduleBlock*>::iterator It = ReadyBlocks.begin(),
       End = ReadyBlocks.end(); It != End; ++It) {
    // Fill the candidate's scheduling statistics.
    SIBlockSchedCandidate Try;
    Try.Block = *It;
    Try.IsHighLatency = Try.Block->isHighLatencyBlock();
    Try.VGPRUsageDiff =
      checkRegUsageImpact(Try.Block->getInRegs(),
                          Try.Block->getOutRegs())[DAG->getVGPRSetID()];
    Try.NumSuccessors = Try.Block->getSuccs().size();
    Try.NumHighLatencySuccessors = Try.Block->getNumHighLatencySuccessors();
    Try.LastPosHighLatParentScheduled =
      (unsigned int) std::max<int> (0,
         LastPosHighLatencyParentScheduled[Try.Block->getID()] -
           LastPosWaitedHighLatency);
    Try.Height = Try.Block->Height;
    // Try not to increase VGPR usage too much, else we may spill.
    if (VregCurrentUsage > 120 ||
        Variant != SISchedulerBlockSchedulerVariant::BlockLatencyRegUsage) {
      // Register usage first; latency only as a tie-breaker (and not at all
      // for the pure register-usage variant).
      if (!tryCandidateRegUsage(Cand, Try) &&
          Variant != SISchedulerBlockSchedulerVariant::BlockRegUsage)
        tryCandidateLatency(Cand, Try);
    } else {
      // Latency first; register usage as a tie-breaker.
      if (!tryCandidateLatency(Cand, Try))
        tryCandidateRegUsage(Cand, Try);
    }
    if (Try.Reason != NoCand) {
      Cand.setBest(Try);
      BestIt = It;
      DEBUG(dbgs() << "Best Current Choice: " << Cand.Block->getID() << ' '
                   << getReasonStr(Cand.Reason) << '\n');
    }
  }

  DEBUG(
    dbgs() << "Picking: " << Cand.Block->getID() << '\n';
    dbgs() << "Is a block with high latency instruction: "
      << (Cand.IsHighLatency ? "yes\n" : "no\n");
    dbgs() << "Position of last high latency dependency: "
           << Cand.LastPosHighLatParentScheduled << '\n';
    dbgs() << "VGPRUsageDiff: " << Cand.VGPRUsageDiff << '\n';
    dbgs() << '\n';
  );

  ReadyBlocks.erase(BestIt);
  return Cand.Block;
}
1693
1694
// Tracking of currently alive registers to determine VGPR Usage.
1695
1696
7
void SIScheduleBlockScheduler::addLiveRegs(std::set<unsigned> &Regs) {
1697
19
  for (unsigned Reg : Regs) {
1698
19
    // For now only track virtual registers.
1699
19
    if (!TargetRegisterInfo::isVirtualRegister(Reg))
1700
7
      continue;
1701
12
    // If not already in the live set, then add it.
1702
12
    (void) LiveRegs.insert(Reg);
1703
12
  }
1704
7
}
1705
1706
void SIScheduleBlockScheduler::decreaseLiveRegs(SIScheduleBlock *Block,
1707
5
                                       std::set<unsigned> &Regs) {
1708
8
  for (unsigned Reg : Regs) {
1709
8
    // For now only track virtual registers.
1710
8
    std::set<unsigned>::iterator Pos = LiveRegs.find(Reg);
1711
8
    assert (Pos != LiveRegs.end() && // Reg must be live.
1712
8
               LiveRegsConsumers.find(Reg) != LiveRegsConsumers.end() &&
1713
8
               LiveRegsConsumers[Reg] >= 1);
1714
8
    --LiveRegsConsumers[Reg];
1715
8
    if (LiveRegsConsumers[Reg] == 0)
1716
8
      LiveRegs.erase(Pos);
1717
8
  }
1718
5
}
1719
1720
5
// Parent has just been scheduled: update its successors' predecessor counts
// and remember where their latest high-latency data dependency landed.
void SIScheduleBlockScheduler::releaseBlockSuccs(SIScheduleBlock *Parent) {
  const bool ParentIsHighLatency = Parent->isHighLatencyBlock();
  for (const auto &Succ : Parent->getSuccs()) {
    SIScheduleBlock *SuccBlock = Succ.first;
    // One fewer predecessor to wait for; ready once the count hits zero.
    if (--BlockNumPredsLeft[SuccBlock->getID()] == 0)
      ReadyBlocks.push_back(SuccBlock);

    if (ParentIsHighLatency && Succ.second == SIScheduleBlockLinkKind::Data)
      LastPosHighLatencyParentScheduled[SuccBlock->getID()] = NumBlockScheduled;
  }
}
1730
1731
5
// Commit the scheduling of Block: update live registers, release its
// successors and register the consumer counts of its outputs.
void SIScheduleBlockScheduler::blockScheduled(SIScheduleBlock *Block) {
  decreaseLiveRegs(Block, Block->getInRegs());
  addLiveRegs(Block->getOutRegs());
  releaseBlockSuccs(Block);
  for (const auto &RegUse : LiveOutRegsNumUsages[Block->getID()]) {
    // We produce this register, thus it must not be previously alive.
    assert(LiveRegsConsumers.find(RegUse.first) == LiveRegsConsumers.end() ||
           LiveRegsConsumers[RegUse.first] == 0);
    LiveRegsConsumers[RegUse.first] += RegUse.second;
  }
  // Track the furthest high-latency parent position we have waited past.
  unsigned LastHighLatPos = LastPosHighLatencyParentScheduled[Block->getID()];
  if (LastHighLatPos > (unsigned)LastPosWaitedHighLatency)
    LastPosWaitedHighLatency = LastHighLatPos;
  ++NumBlockScheduled;
}
1750
1751
std::vector<int>
1752
SIScheduleBlockScheduler::checkRegUsageImpact(std::set<unsigned> &InRegs,
1753
5
                                     std::set<unsigned> &OutRegs) {
1754
5
  std::vector<int> DiffSetPressure;
1755
5
  DiffSetPressure.assign(DAG->getTRI()->getNumRegPressureSets(), 0);
1756
5
1757
8
  for (unsigned Reg : InRegs) {
1758
8
    // For now only track virtual registers.
1759
8
    if (!TargetRegisterInfo::isVirtualRegister(Reg))
1760
0
      continue;
1761
8
    
if (8
LiveRegsConsumers[Reg] > 18
)
1762
0
      continue;
1763
8
    PSetIterator PSetI = DAG->getMRI()->getPressureSets(Reg);
1764
21
    for (; 
PSetI.isValid()21
;
++PSetI13
) {
1765
13
      DiffSetPressure[*PSetI] -= PSetI.getWeight();
1766
13
    }
1767
8
  }
1768
5
1769
8
  for (unsigned Reg : OutRegs) {
1770
8
    // For now only track virtual registers.
1771
8
    if (!TargetRegisterInfo::isVirtualRegister(Reg))
1772
0
      continue;
1773
8
    PSetIterator PSetI = DAG->getMRI()->getPressureSets(Reg);
1774
21
    for (; 
PSetI.isValid()21
;
++PSetI13
) {
1775
13
      DiffSetPressure[*PSetI] += PSetI.getWeight();
1776
13
    }
1777
8
  }
1778
5
1779
5
  return DiffSetPressure;
1780
5
}
1781
1782
// SIScheduler //
1783
1784
struct SIScheduleBlockResult
1785
SIScheduler::scheduleVariant(SISchedulerBlockCreatorVariant BlockVariant,
1786
2
                             SISchedulerBlockSchedulerVariant ScheduleVariant) {
1787
2
  SIScheduleBlocks Blocks = BlockCreator.getBlocks(BlockVariant);
1788
2
  SIScheduleBlockScheduler Scheduler(DAG, ScheduleVariant, Blocks);
1789
2
  std::vector<SIScheduleBlock*> ScheduledBlocks;
1790
2
  struct SIScheduleBlockResult Res;
1791
2
1792
2
  ScheduledBlocks = Scheduler.getBlocks();
1793
2
1794
7
  for (unsigned b = 0; 
b < ScheduledBlocks.size()7
;
++b5
) {
1795
5
    SIScheduleBlock *Block = ScheduledBlocks[b];
1796
5
    std::vector<SUnit*> SUs = Block->getScheduledUnits();
1797
5
1798
5
    for (SUnit* SU : SUs)
1799
16
      Res.SUs.push_back(SU->NodeNum);
1800
5
  }
1801
2
1802
2
  Res.MaxSGPRUsage = Scheduler.getSGPRUsage();
1803
2
  Res.MaxVGPRUsage = Scheduler.getVGPRUsage();
1804
2
  return Res;
1805
2
}
1806
1807
// SIScheduleDAGMI //
1808
1809
SIScheduleDAGMI::SIScheduleDAGMI(MachineSchedContext *C) :
  ScheduleDAGMILive(C, llvm::make_unique<GenericScheduler>(C)) {
  // Cache the SI-specific instruction/register info so we do not have to
  // cast the generic TII/TRI pointers at every use.
  SITII = static_cast<const SIInstrInfo*>(TII);
  SITRI = static_cast<const SIRegisterInfo*>(TRI);

  // Pressure-set indices used to classify register usage as VGPR or SGPR.
  VGPRSetID = SITRI->getVGPRPressureSet();
  SGPRSetID = SITRI->getSGPRPressureSet();
}
1817
1818
1
// Defaulted out-of-line destructor.
SIScheduleDAGMI::~SIScheduleDAGMI() = default;
1819
1820
// Code adapted from scheduleDAG.cpp
1821
// Does a topological sort over the SUs.
1822
// Both TopDown and BottomUp
1823
2
void SIScheduleDAGMI::topologicalSort() {
1824
2
  Topo.InitDAGTopologicalSorting();
1825
2
1826
2
  TopDownIndex2SU = std::vector<int>(Topo.begin(), Topo.end());
1827
2
  BottomUpIndex2SU = std::vector<int>(Topo.rbegin(), Topo.rend());
1828
2
}
1829
1830
// Move low latencies further from their user without
1831
// increasing SGPR usage (in general)
1832
// This is to be replaced by a better pass that would
1833
// take into account SGPR usage (based on VGPR Usage
1834
// and the corresponding wavefront count), that would
1835
// try to merge groups of loads if it make sense, etc
1836
2
// Hoist low-latency instructions (and the COPYs feeding them) to the
// earliest legal position in ScheduledSUnits, keeping the inverse map
// ScheduledSUnitsInv (NodeNum -> position) in sync.
void SIScheduleDAGMI::moveLowLatencies() {
   unsigned DAGSize = SUnits.size();
   // Position of the last scheduled consumer of a low-latency result
   // (-1: none seen yet).
   int LastLowLatencyUser = -1;
   // Position of the last low-latency instruction placed (-1: none yet).
   int LastLowLatencyPos = -1;

   for (unsigned i = 0, e = ScheduledSUnits.size(); i != e; ++i) {
    SUnit *SU = &SUnits[ScheduledSUnits[i]];
    bool IsLowLatencyUser = false;
    // Earliest position SU may legally occupy: one past its latest
    // already-placed predecessor.
    unsigned MinPos = 0;

    for (SDep& PredDep : SU->Preds) {
      SUnit *Pred = PredDep.getSUnit();
      if (SITII->isLowLatencyInstruction(*Pred->getInstr())) {
        IsLowLatencyUser = true;
      }
      // Predecessors outside the original DAG have no entry in
      // ScheduledSUnitsInv — skip them.
      if (Pred->NodeNum >= DAGSize)
        continue;
      unsigned PredPos = ScheduledSUnitsInv[Pred->NodeNum];
      if (PredPos >= MinPos)
        MinPos = PredPos + 1;
    }

    if (SITII->isLowLatencyInstruction(*SU->getInstr())) {
      // Target slot: just after the last low-latency user and after any
      // previously placed low-latency instruction, but never before SU's
      // own predecessors.
      unsigned BestPos = LastLowLatencyUser + 1;
      if ((int)BestPos <= LastLowLatencyPos)
        BestPos = LastLowLatencyPos + 1;
      if (BestPos < MinPos)
        BestPos = MinPos;
      if (BestPos < i) {
        // Rotate SU from slot i down to BestPos: shift the intervening
        // instructions one slot later, updating the inverse map as we go.
        for (unsigned u = i; u > BestPos; --u) {
          ++ScheduledSUnitsInv[ScheduledSUnits[u-1]];
          ScheduledSUnits[u] = ScheduledSUnits[u-1];
        }
        ScheduledSUnits[BestPos] = SU->NodeNum;
        ScheduledSUnitsInv[SU->NodeNum] = BestPos;
      }
      LastLowLatencyPos = BestPos;
      if (IsLowLatencyUser)
        LastLowLatencyUser = BestPos;
    } else if (IsLowLatencyUser) {
      LastLowLatencyUser = i;
    // Moves COPY instructions on which depends
    // the low latency instructions too.
    } else if (SU->getInstr()->getOpcode() == AMDGPU::COPY) {
      bool CopyForLowLat = false;
      for (SDep& SuccDep : SU->Succs) {
        SUnit *Succ = SuccDep.getSUnit();
        if (SITII->isLowLatencyInstruction(*Succ->getInstr())) {
          CopyForLowLat = true;
        }
      }
      // Only move COPYs that actually feed a low-latency instruction.
      if (!CopyForLowLat)
        continue;
      if (MinPos < i) {
        // Same rotation as above: move the COPY to its earliest legal slot
        // so the low-latency instructions it feeds can be hoisted too.
        for (unsigned u = i; u > MinPos; --u) {
          ++ScheduledSUnitsInv[ScheduledSUnits[u-1]];
          ScheduledSUnits[u] = ScheduledSUnits[u-1];
        }
        ScheduledSUnits[MinPos] = SU->NodeNum;
        ScheduledSUnitsInv[SU->NodeNum] = MinPos;
      }
    }
  }
}
1900
1901
2
void SIScheduleDAGMI::restoreSULinksLeft() {
1902
18
  for (unsigned i = 0, e = SUnits.size(); 
i != e18
;
++i16
) {
1903
16
    SUnits[i].isScheduled = false;
1904
16
    SUnits[i].WeakPredsLeft = SUnitsLinksBackup[i].WeakPredsLeft;
1905
16
    SUnits[i].NumPredsLeft = SUnitsLinksBackup[i].NumPredsLeft;
1906
16
    SUnits[i].WeakSuccsLeft = SUnitsLinksBackup[i].WeakSuccsLeft;
1907
16
    SUnits[i].NumSuccsLeft = SUnitsLinksBackup[i].NumSuccsLeft;
1908
16
  }
1909
2
}
1910
1911
// Return the Vgpr and Sgpr usage corresponding to some virtual registers.
1912
template<typename _Iterator> void
1913
SIScheduleDAGMI::fillVgprSgprCost(_Iterator First, _Iterator End,
1914
5
                                  unsigned &VgprUsage, unsigned &SgprUsage) {
1915
5
  VgprUsage = 0;
1916
5
  SgprUsage = 0;
1917
15
  for (_Iterator RegI = First; 
RegI != End15
;
++RegI10
) {
1918
10
    unsigned Reg = *RegI;
1919
10
    // For now only track virtual registers
1920
10
    if (!TargetRegisterInfo::isVirtualRegister(Reg))
1921
0
      continue;
1922
10
    PSetIterator PSetI = MRI.getPressureSets(Reg);
1923
27
    for (; 
PSetI.isValid()27
;
++PSetI17
) {
1924
17
      if (*PSetI == VGPRSetID)
1925
6
        VgprUsage += PSetI.getWeight();
1926
11
      else 
if (11
*PSetI == SGPRSetID11
)
1927
4
        SgprUsage += PSetI.getWeight();
1928
17
    }
1929
10
  }
1930
5
}
1931
1932
// Main entry point: build the DAG for the region, classify instructions by
// latency, run the block scheduler (retrying cheaper variants when VGPR
// usage is too high), then emit the chosen order.
void SIScheduleDAGMI::schedule()
{
  SmallVector<SUnit*, 8> TopRoots, BotRoots;
  SIScheduleBlockResult Best, Temp;
  DEBUG(dbgs() << "Preparing Scheduling\n");

  buildDAGWithRegPressure();
  DEBUG(
    for(SUnit& SU : SUnits)
       SU.dumpAll(this)
  );

  topologicalSort();
  findRootsAndBiasEdges(TopRoots, BotRoots);
  // We reuse several ScheduleDAGMI and ScheduleDAGMILive
  // functions, but to make them happy we must initialize
  // the default Scheduler implementation (even if we do not
  // run it)
  SchedImpl->initialize(this);
  initQueues(TopRoots, BotRoots);

  // Fill some stats to help scheduling.

  // Snapshot the pred/succ-left counters; scheduling a variant consumes
  // them and restoreSULinksLeft() restores from this backup.
  SUnitsLinksBackup = SUnits;
  IsLowLatencySU.clear();
  LowLatencyOffset.clear();
  IsHighLatencySU.clear();

  IsLowLatencySU.resize(SUnits.size(), 0);
  LowLatencyOffset.resize(SUnits.size(), 0);
  IsHighLatencySU.resize(SUnits.size(), 0);

  // Classify every SU as low/high latency and record the memory offset of
  // low-latency instructions when it can be determined.
  for (unsigned i = 0, e = (unsigned)SUnits.size(); i != e; ++i) {
    SUnit *SU = &SUnits[i];
    unsigned BaseLatReg;
    int64_t OffLatReg;
    if (SITII->isLowLatencyInstruction(*SU->getInstr())) {
      IsLowLatencySU[i] = 1;
      if (SITII->getMemOpBaseRegImmOfs(*SU->getInstr(), BaseLatReg, OffLatReg,
                                       TRI))
        LowLatencyOffset[i] = OffLatReg;
    } else if (SITII->isHighLatencyInstruction(*SU->getInstr()))
      IsHighLatencySU[i] = 1;
  }

  SIScheduler Scheduler(this);
  Best = Scheduler.scheduleVariant(SISchedulerBlockCreatorVariant::LatenciesAlone,
                                   SISchedulerBlockSchedulerVariant::BlockLatencyRegUsage);

  // if VGPR usage is extremely high, try other good performing variants
  // which could lead to lower VGPR usage
  if (Best.MaxVGPRUsage > 180) {
    static const std::pair<SISchedulerBlockCreatorVariant,
                           SISchedulerBlockSchedulerVariant>
        Variants[] = {
      { LatenciesAlone, BlockRegUsageLatency },
//      { LatenciesAlone, BlockRegUsage },
      { LatenciesGrouped, BlockLatencyRegUsage },
//      { LatenciesGrouped, BlockRegUsageLatency },
//      { LatenciesGrouped, BlockRegUsage },
      { LatenciesAlonePlusConsecutive, BlockLatencyRegUsage },
//      { LatenciesAlonePlusConsecutive, BlockRegUsageLatency },
//      { LatenciesAlonePlusConsecutive, BlockRegUsage }
    };
    for (std::pair<SISchedulerBlockCreatorVariant, SISchedulerBlockSchedulerVariant> v : Variants) {
      Temp = Scheduler.scheduleVariant(v.first, v.second);
      if (Temp.MaxVGPRUsage < Best.MaxVGPRUsage)
        Best = Temp;
    }
  }
  // if VGPR usage is still extremely high, we may spill. Try other variants
  // which are less performing, but that could lead to lower VGPR usage.
  if (Best.MaxVGPRUsage > 200) {
    static const std::pair<SISchedulerBlockCreatorVariant,
                           SISchedulerBlockSchedulerVariant>
        Variants[] = {
//      { LatenciesAlone, BlockRegUsageLatency },
      { LatenciesAlone, BlockRegUsage },
//      { LatenciesGrouped, BlockLatencyRegUsage },
      { LatenciesGrouped, BlockRegUsageLatency },
      { LatenciesGrouped, BlockRegUsage },
//      { LatenciesAlonePlusConsecutive, BlockLatencyRegUsage },
      { LatenciesAlonePlusConsecutive, BlockRegUsageLatency },
      { LatenciesAlonePlusConsecutive, BlockRegUsage }
    };
    for (std::pair<SISchedulerBlockCreatorVariant, SISchedulerBlockSchedulerVariant> v : Variants) {
      Temp = Scheduler.scheduleVariant(v.first, v.second);
      if (Temp.MaxVGPRUsage < Best.MaxVGPRUsage)
        Best = Temp;
    }
  }

  // Adopt the winning order and build the inverse (NodeNum -> position) map
  // needed by moveLowLatencies().
  ScheduledSUnits = Best.SUs;
  ScheduledSUnitsInv.resize(SUnits.size());

  for (unsigned i = 0, e = (unsigned)SUnits.size(); i != e; ++i) {
    ScheduledSUnitsInv[ScheduledSUnits[i]] = i;
  }

  moveLowLatencies();

  // Tell the outside world about the result of the scheduling.

  assert(TopRPTracker.getPos() == RegionBegin && "bad initial Top tracker");
  TopRPTracker.setPos(CurrentTop);

  for (std::vector<unsigned>::iterator I = ScheduledSUnits.begin(),
       E = ScheduledSUnits.end(); I != E; ++I) {
    SUnit *SU = &SUnits[*I];

    scheduleMI(SU, true);

    DEBUG(dbgs() << "Scheduling SU(" << SU->NodeNum << ") "
                 << *SU->getInstr());
  }

  assert(CurrentTop == CurrentBottom && "Nonempty unscheduled zone.");

  placeDebugValues();

  DEBUG({
      unsigned BBNum = begin()->getParent()->getNumber();
      dbgs() << "*** Final schedule for BB#" << BBNum << " ***\n";
      dumpSchedule();
      dbgs() << '\n';
    });
}