/Users/buildslave/jenkins/workspace/clang-stage2-coverage-R/llvm/lib/Target/AMDGPU/GCNILPSched.cpp
Line | Count | Source (jump to first uncovered line) |
1 | | //===---------------------------- GCNILPSched.cpp - -----------------------===// |
2 | | // |
3 | | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
4 | | // See https://llvm.org/LICENSE.txt for license information. |
5 | | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
6 | | // |
7 | | //===----------------------------------------------------------------------===// |
8 | | // |
9 | | /// \file |
10 | | // |
11 | | //===----------------------------------------------------------------------===// |
12 | | |
13 | | #include "llvm/CodeGen/ScheduleDAG.h" |
14 | | |
15 | | using namespace llvm; |
16 | | |
17 | | #define DEBUG_TYPE "machine-scheduler" |
18 | | |
19 | | namespace { |
20 | | |
21 | | class GCNILPScheduler { |
22 | | struct Candidate : ilist_node<Candidate> { |
23 | | SUnit *SU; |
24 | | |
25 | | Candidate(SUnit *SU_) |
26 | 454 | : SU(SU_) {} |
27 | | }; |
28 | | |
29 | | SpecificBumpPtrAllocator<Candidate> Alloc; |
30 | | typedef simple_ilist<Candidate> Queue; |
31 | | Queue PendingQueue; |
32 | | Queue AvailQueue; |
33 | | unsigned CurQueueId = 0; |
34 | | |
35 | | std::vector<unsigned> SUNumbers; |
36 | | |
37 | | /// CurCycle - The current scheduler state corresponds to this cycle. |
38 | | unsigned CurCycle = 0; |
39 | | |
40 | | unsigned getNodePriority(const SUnit *SU) const; |
41 | | |
42 | | const SUnit *pickBest(const SUnit *left, const SUnit *right); |
43 | | Candidate* pickCandidate(); |
44 | | |
45 | | void releasePending(); |
46 | | void advanceToCycle(unsigned NextCycle); |
47 | | void releasePredecessors(const SUnit* SU); |
48 | | |
49 | | public: |
50 | | std::vector<const SUnit*> schedule(ArrayRef<const SUnit*> TopRoots, |
51 | | const ScheduleDAG &DAG); |
52 | | }; |
53 | | } // namespace |
54 | | |
55 | | /// CalcNodeSethiUllmanNumber - Compute Sethi Ullman number. |
56 | | /// Smaller number is the higher priority. |
57 | | static unsigned |
58 | 1.41k | CalcNodeSethiUllmanNumber(const SUnit *SU, std::vector<unsigned> &SUNumbers) { |
59 | 1.41k | unsigned &SethiUllmanNumber = SUNumbers[SU->NodeNum]; |
60 | 1.41k | if (SethiUllmanNumber != 0) |
61 | 958 | return SethiUllmanNumber; |
62 | 454 | |
63 | 454 | unsigned Extra = 0; |
64 | 4.50k | for (const SDep &Pred : SU->Preds) { |
65 | 4.50k | if (Pred.isCtrl()) continue3.55k ; // ignore chain preds |
66 | 958 | SUnit *PredSU = Pred.getSUnit(); |
67 | 958 | unsigned PredSethiUllman = CalcNodeSethiUllmanNumber(PredSU, SUNumbers); |
68 | 958 | if (PredSethiUllman > SethiUllmanNumber) { |
69 | 476 | SethiUllmanNumber = PredSethiUllman; |
70 | 476 | Extra = 0; |
71 | 476 | } |
72 | 482 | else if (PredSethiUllman == SethiUllmanNumber) |
73 | 446 | ++Extra; |
74 | 958 | } |
75 | 454 | |
76 | 454 | SethiUllmanNumber += Extra; |
77 | 454 | |
78 | 454 | if (SethiUllmanNumber == 0) |
79 | 4 | SethiUllmanNumber = 1; |
80 | 454 | |
81 | 454 | return SethiUllmanNumber; |
82 | 454 | } |
83 | | |
84 | | // Lower priority means schedule further down. For bottom-up scheduling, lower |
85 | | // priority SUs are scheduled before higher priority SUs. |
86 | 19.4k | unsigned GCNILPScheduler::getNodePriority(const SUnit *SU) const { |
87 | 19.4k | assert(SU->NodeNum < SUNumbers.size()); |
88 | 19.4k | if (SU->NumSuccs == 0 && SU->NumPreds != 0106 ) |
89 | 106 | // If SU does not have a register use, i.e. it doesn't produce a value |
90 | 106 | // that would be consumed (e.g. store), then it terminates a chain of |
91 | 106 | // computation. Give it a large SethiUllman number so it will be |
92 | 106 | // scheduled right before its predecessors that it doesn't lengthen |
93 | 106 | // their live ranges. |
94 | 106 | return 0xffff; |
95 | 19.3k | |
96 | 19.3k | if (SU->NumPreds == 0 && SU->NumSuccs != 04 ) |
97 | 4 | // If SU does not have a register def, schedule it close to its uses |
98 | 4 | // because it does not lengthen any live ranges. |
99 | 4 | return 0; |
100 | 19.3k | |
101 | 19.3k | return SUNumbers[SU->NodeNum]; |
102 | 19.3k | } |
103 | | |
104 | | /// closestSucc - Returns the scheduled cycle of the successor which is |
105 | | /// closest to the current cycle. |
106 | 17.6k | static unsigned closestSucc(const SUnit *SU) { |
107 | 17.6k | unsigned MaxHeight = 0; |
108 | 290k | for (const SDep &Succ : SU->Succs) { |
109 | 290k | if (Succ.isCtrl()) continue267k ; // ignore chain succs |
110 | 22.9k | unsigned Height = Succ.getSUnit()->getHeight(); |
111 | 22.9k | // If there are bunch of CopyToRegs stacked up, they should be considered |
112 | 22.9k | // to be at the same position. |
113 | 22.9k | if (Height > MaxHeight) |
114 | 16.1k | MaxHeight = Height; |
115 | 22.9k | } |
116 | 17.6k | return MaxHeight; |
117 | 17.6k | } |
118 | | |
119 | | /// calcMaxScratches - Returns an cost estimate of the worse case requirement |
120 | | /// for scratch registers, i.e. number of data dependencies. |
121 | 17.6k | static unsigned calcMaxScratches(const SUnit *SU) { |
122 | 17.6k | unsigned Scratches = 0; |
123 | 36.7k | for (const SDep &Pred : SU->Preds) { |
124 | 36.7k | if (Pred.isCtrl()) continue1.48k ; // ignore chain preds |
125 | 35.2k | Scratches++; |
126 | 35.2k | } |
127 | 17.6k | return Scratches; |
128 | 17.6k | } |
129 | | |
130 | | // Return -1 if left has higher priority, 1 if right has higher priority. |
131 | | // Return 0 if latency-based priority is equivalent. |
132 | 8.80k | static int BUCompareLatency(const SUnit *left, const SUnit *right) { |
133 | 8.80k | // Scheduling an instruction that uses a VReg whose postincrement has not yet |
134 | 8.80k | // been scheduled will induce a copy. Model this as an extra cycle of latency. |
135 | 8.80k | int LHeight = (int)left->getHeight(); |
136 | 8.80k | int RHeight = (int)right->getHeight(); |
137 | 8.80k | |
138 | 8.80k | // If either node is scheduling for latency, sort them by height/depth |
139 | 8.80k | // and latency. |
140 | 8.80k | |
141 | 8.80k | // If neither instruction stalls (!LStall && !RStall) and HazardRecognizer |
142 | 8.80k | // is enabled, grouping instructions by cycle, then its height is already |
143 | 8.80k | // covered so only its depth matters. We also reach this point if both stall |
144 | 8.80k | // but have the same height. |
145 | 8.80k | if (LHeight != RHeight) |
146 | 0 | return LHeight > RHeight ? 1 : -1; |
147 | 8.80k | |
148 | 8.80k | int LDepth = left->getDepth(); |
149 | 8.80k | int RDepth = right->getDepth(); |
150 | 8.80k | if (LDepth != RDepth) { |
151 | 0 | LLVM_DEBUG(dbgs() << " Comparing latency of SU (" << left->NodeNum |
152 | 0 | << ") depth " << LDepth << " vs SU (" << right->NodeNum |
153 | 0 | << ") depth " << RDepth << "\n"); |
154 | 0 | return LDepth < RDepth ? 1 : -1; |
155 | 0 | } |
156 | 8.80k | if (left->Latency != right->Latency) |
157 | 0 | return left->Latency > right->Latency ? 1 : -1; |
158 | 8.80k | |
159 | 8.80k | return 0; |
160 | 8.80k | } |
161 | | |
162 | | const SUnit *GCNILPScheduler::pickBest(const SUnit *left, const SUnit *right) |
163 | 9.78k | { |
164 | 9.78k | // TODO: add register pressure lowering checks |
165 | 9.78k | |
166 | 9.78k | bool const DisableSchedCriticalPath = false; |
167 | 9.78k | int MaxReorderWindow = 6; |
168 | 9.78k | if (!DisableSchedCriticalPath) { |
169 | 9.78k | int spread = (int)left->getDepth() - (int)right->getDepth(); |
170 | 9.78k | if (std::abs(spread) > MaxReorderWindow) { |
171 | 76 | LLVM_DEBUG(dbgs() << "Depth of SU(" << left->NodeNum << "): " |
172 | 76 | << left->getDepth() << " != SU(" << right->NodeNum |
173 | 76 | << "): " << right->getDepth() << "\n"); |
174 | 76 | return left->getDepth() < right->getDepth() ? right : left0 ; |
175 | 76 | } |
176 | 9.71k | } |
177 | 9.71k | |
178 | 9.71k | bool const DisableSchedHeight = false; |
179 | 9.71k | if (!DisableSchedHeight && left->getHeight() != right->getHeight()) { |
180 | 838 | int spread = (int)left->getHeight() - (int)right->getHeight(); |
181 | 838 | if (std::abs(spread) > MaxReorderWindow) |
182 | 0 | return left->getHeight() > right->getHeight() ? right : left; |
183 | 9.71k | } |
184 | 9.71k | |
185 | 9.71k | // Prioritize by Sethi-Ulmann number and push CopyToReg nodes down. |
186 | 9.71k | unsigned LPriority = getNodePriority(left); |
187 | 9.71k | unsigned RPriority = getNodePriority(right); |
188 | 9.71k | |
189 | 9.71k | if (LPriority != RPriority) |
190 | 900 | return LPriority > RPriority ? right240 : left660 ; |
191 | 8.81k | |
192 | 8.81k | // Try schedule def + use closer when Sethi-Ullman numbers are the same. |
193 | 8.81k | // e.g. |
194 | 8.81k | // t1 = op t2, c1 |
195 | 8.81k | // t3 = op t4, c2 |
196 | 8.81k | // |
197 | 8.81k | // and the following instructions are both ready. |
198 | 8.81k | // t2 = op c3 |
199 | 8.81k | // t4 = op c4 |
200 | 8.81k | // |
201 | 8.81k | // Then schedule t2 = op first. |
202 | 8.81k | // i.e. |
203 | 8.81k | // t4 = op c4 |
204 | 8.81k | // t2 = op c3 |
205 | 8.81k | // t1 = op t2, c1 |
206 | 8.81k | // t3 = op t4, c2 |
207 | 8.81k | // |
208 | 8.81k | // This creates more short live intervals. |
209 | 8.81k | unsigned LDist = closestSucc(left); |
210 | 8.81k | unsigned RDist = closestSucc(right); |
211 | 8.81k | if (LDist != RDist) |
212 | 6 | return LDist < RDist ? right : left0 ; |
213 | 8.80k | |
214 | 8.80k | // How many registers becomes live when the node is scheduled. |
215 | 8.80k | unsigned LScratch = calcMaxScratches(left); |
216 | 8.80k | unsigned RScratch = calcMaxScratches(right); |
217 | 8.80k | if (LScratch != RScratch) |
218 | 0 | return LScratch > RScratch ? right : left; |
219 | 8.80k | |
220 | 8.80k | bool const DisableSchedCycles = false; |
221 | 8.80k | if (!DisableSchedCycles) { |
222 | 8.80k | int result = BUCompareLatency(left, right); |
223 | 8.80k | if (result != 0) |
224 | 0 | return result > 0 ? right : left; |
225 | 8.80k | return left; |
226 | 8.80k | } |
227 | 0 | else { |
228 | 0 | if (left->getHeight() != right->getHeight()) |
229 | 0 | return (left->getHeight() > right->getHeight()) ? right : left; |
230 | 0 | |
231 | 0 | if (left->getDepth() != right->getDepth()) |
232 | 0 | return (left->getDepth() < right->getDepth()) ? right : left; |
233 | 0 | } |
234 | 0 | |
235 | 0 | assert(left->NodeQueueId && right->NodeQueueId && |
236 | 0 | "NodeQueueId cannot be zero"); |
237 | 0 | return (left->NodeQueueId > right->NodeQueueId) ? right : left; |
238 | 0 | } |
239 | | |
240 | 454 | GCNILPScheduler::Candidate* GCNILPScheduler::pickCandidate() { |
241 | 454 | if (AvailQueue.empty()) |
242 | 0 | return nullptr; |
243 | 454 | auto Best = AvailQueue.begin(); |
244 | 10.2k | for (auto I = std::next(AvailQueue.begin()), E = AvailQueue.end(); I != E; ++I9.78k ) { |
245 | 9.78k | auto NewBestSU = pickBest(Best->SU, I->SU); |
246 | 9.78k | if (NewBestSU != Best->SU) { |
247 | 322 | assert(NewBestSU == I->SU); |
248 | 322 | Best = I; |
249 | 322 | } |
250 | 9.78k | } |
251 | 454 | return &*Best; |
252 | 454 | } |
253 | | |
254 | 66 | void GCNILPScheduler::releasePending() { |
255 | 66 | // Check to see if any of the pending instructions are ready to issue. If |
256 | 66 | // so, add them to the available queue. |
257 | 518 | for(auto I = PendingQueue.begin(), E = PendingQueue.end(); I != E;) { |
258 | 452 | auto &C = *I++; |
259 | 452 | if (C.SU->getHeight() <= CurCycle) { |
260 | 452 | PendingQueue.remove(C); |
261 | 452 | AvailQueue.push_back(C); |
262 | 452 | C.SU->NodeQueueId = CurQueueId++; |
263 | 452 | } |
264 | 452 | } |
265 | 66 | } |
266 | | |
267 | | /// Move the scheduler state forward by the specified number of Cycles. |
268 | 520 | void GCNILPScheduler::advanceToCycle(unsigned NextCycle) { |
269 | 520 | if (NextCycle <= CurCycle) |
270 | 454 | return; |
271 | 66 | CurCycle = NextCycle; |
272 | 66 | releasePending(); |
273 | 66 | } |
274 | | |
275 | 456 | void GCNILPScheduler::releasePredecessors(const SUnit* SU) { |
276 | 4.50k | for (const auto &PredEdge : SU->Preds) { |
277 | 4.50k | auto PredSU = PredEdge.getSUnit(); |
278 | 4.50k | if (PredEdge.isWeak()) |
279 | 0 | continue; |
280 | 4.50k | assert(PredSU->isBoundaryNode() || PredSU->NumSuccsLeft > 0); |
281 | 4.50k | |
282 | 4.50k | PredSU->setHeightToAtLeast(SU->getHeight() + PredEdge.getLatency()); |
283 | 4.50k | |
284 | 4.50k | if (!PredSU->isBoundaryNode() && --PredSU->NumSuccsLeft == 0) |
285 | 452 | PendingQueue.push_front(*new (Alloc.Allocate()) Candidate(PredSU)); |
286 | 4.50k | } |
287 | 456 | } |
288 | | |
289 | | std::vector<const SUnit*> |
290 | | GCNILPScheduler::schedule(ArrayRef<const SUnit*> BotRoots, |
291 | 2 | const ScheduleDAG &DAG) { |
292 | 2 | auto &SUnits = const_cast<ScheduleDAG&>(DAG).SUnits; |
293 | 2 | |
294 | 2 | std::vector<SUnit> SUSavedCopy; |
295 | 2 | SUSavedCopy.resize(SUnits.size()); |
296 | 2 | |
297 | 2 | // we cannot save only those fields we touch: some of them are private |
298 | 2 | // so save units verbatim: this assumes SUnit should have value semantics |
299 | 2 | for (const SUnit &SU : SUnits) |
300 | 454 | SUSavedCopy[SU.NodeNum] = SU; |
301 | 2 | |
302 | 2 | SUNumbers.assign(SUnits.size(), 0); |
303 | 2 | for (const SUnit &SU : SUnits) |
304 | 454 | CalcNodeSethiUllmanNumber(&SU, SUNumbers); |
305 | 2 | |
306 | 2 | for (auto SU : BotRoots) { |
307 | 2 | AvailQueue.push_back( |
308 | 2 | *new (Alloc.Allocate()) Candidate(const_cast<SUnit*>(SU))); |
309 | 2 | } |
310 | 2 | releasePredecessors(&DAG.ExitSU); |
311 | 2 | |
312 | 2 | std::vector<const SUnit*> Schedule; |
313 | 2 | Schedule.reserve(SUnits.size()); |
314 | 456 | while (true) { |
315 | 456 | if (AvailQueue.empty() && !PendingQueue.empty()68 ) { |
316 | 66 | auto EarliestSU = std::min_element( |
317 | 66 | PendingQueue.begin(), PendingQueue.end(), |
318 | 386 | [=](const Candidate& C1, const Candidate& C2) { |
319 | 386 | return C1.SU->getHeight() < C2.SU->getHeight(); |
320 | 386 | })->SU; |
321 | 66 | advanceToCycle(std::max(CurCycle + 1, EarliestSU->getHeight())); |
322 | 66 | } |
323 | 456 | if (AvailQueue.empty()) |
324 | 2 | break; |
325 | 454 | |
326 | 454 | LLVM_DEBUG(dbgs() << "\n=== Picking candidate\n" |
327 | 454 | "Ready queue:"; |
328 | 454 | for (auto &C |
329 | 454 | : AvailQueue) dbgs() |
330 | 454 | << ' ' << C.SU->NodeNum; |
331 | 454 | dbgs() << '\n';); |
332 | 454 | |
333 | 454 | auto C = pickCandidate(); |
334 | 454 | assert(C); |
335 | 454 | AvailQueue.remove(*C); |
336 | 454 | auto SU = C->SU; |
337 | 454 | LLVM_DEBUG(dbgs() << "Selected "; DAG.dumpNode(*SU)); |
338 | 454 | |
339 | 454 | advanceToCycle(SU->getHeight()); |
340 | 454 | |
341 | 454 | releasePredecessors(SU); |
342 | 454 | Schedule.push_back(SU); |
343 | 454 | SU->isScheduled = true; |
344 | 454 | } |
345 | 2 | assert(SUnits.size() == Schedule.size()); |
346 | 2 | |
347 | 2 | std::reverse(Schedule.begin(), Schedule.end()); |
348 | 2 | |
349 | 2 | // restore units |
350 | 2 | for (auto &SU : SUnits) |
351 | 454 | SU = SUSavedCopy[SU.NodeNum]; |
352 | 2 | |
353 | 2 | return Schedule; |
354 | 2 | } |
355 | | |
356 | | namespace llvm { |
357 | | std::vector<const SUnit*> makeGCNILPScheduler(ArrayRef<const SUnit*> BotRoots, |
358 | 2 | const ScheduleDAG &DAG) { |
359 | 2 | GCNILPScheduler S; |
360 | 2 | return S.schedule(BotRoots, DAG); |
361 | 2 | } |
362 | | } |