/Users/buildslave/jenkins/workspace/clang-stage2-coverage-R/llvm/lib/CodeGen/ExecutionDepsFix.cpp
Line | Count | Source (jump to first uncovered line) |
1 | | //===- ExecutionDepsFix.cpp - Fix execution dependecy issues ----*- C++ -*-===// |
2 | | // |
3 | | // The LLVM Compiler Infrastructure |
4 | | // |
5 | | // This file is distributed under the University of Illinois Open Source |
6 | | // License. See LICENSE.TXT for details. |
7 | | // |
8 | | //===----------------------------------------------------------------------===// |
9 | | |
10 | | #include "llvm/CodeGen/ExecutionDepsFix.h" |
11 | | |
12 | | #include "llvm/ADT/PostOrderIterator.h" |
13 | | #include "llvm/ADT/iterator_range.h" |
14 | | #include "llvm/CodeGen/LivePhysRegs.h" |
15 | | #include "llvm/CodeGen/MachineFunctionPass.h" |
16 | | #include "llvm/CodeGen/MachineRegisterInfo.h" |
17 | | #include "llvm/CodeGen/RegisterClassInfo.h" |
18 | | #include "llvm/CodeGen/TargetInstrInfo.h" |
19 | | #include "llvm/CodeGen/TargetSubtargetInfo.h" |
20 | | #include "llvm/Support/Allocator.h" |
21 | | #include "llvm/Support/Debug.h" |
22 | | #include "llvm/Support/raw_ostream.h" |
23 | | |
24 | | using namespace llvm; |
25 | | |
26 | | #define DEBUG_TYPE "execution-deps-fix" |
27 | | |
28 | | /// Translate TRI register number to a list of indices into our smaller tables |
29 | | /// of interesting registers. |
30 | | iterator_range<SmallVectorImpl<int>::const_iterator> |
31 | 3.13M | ExecutionDepsFix::regIndices(unsigned Reg) const { |
32 | 3.13M | assert(Reg < AliasMap.size() && "Invalid register"); |
33 | 3.13M | const auto &Entry = AliasMap[Reg]; |
34 | 3.13M | return make_range(Entry.begin(), Entry.end()); |
35 | 3.13M | } |
36 | | |
37 | 310k | DomainValue *ExecutionDepsFix::alloc(int domain) { |
38 | 310k | DomainValue *dv = Avail.empty() ? |
39 | 146k | new(Allocator.Allocate()) DomainValue : |
40 | 310k | Avail.pop_back_val()164k ; |
41 | 310k | if (domain >= 0) |
42 | 261k | dv->addDomain(domain); |
43 | 310k | assert(dv->Refs == 0 && "Reference count wasn't cleared"); |
44 | 310k | assert(!dv->Next && "Chained DomainValue shouldn't have been recycled"); |
45 | 310k | return dv; |
46 | 310k | } |
47 | | |
48 | | /// Release a reference to DV. When the last reference is released, |
49 | | /// collapse if needed. |
50 | 2.33M | void ExecutionDepsFix::release(DomainValue *DV) { |
51 | 2.64M | while (DV) { |
52 | 482k | assert(DV->Refs && "Bad DomainValue"); |
53 | 482k | if (--DV->Refs) |
54 | 171k | return; |
55 | 310k | |
56 | 310k | // There are no more DV references. Collapse any contained instructions. |
57 | 310k | if (DV->AvailableDomains && !DV->isCollapsed()307k ) |
58 | 22.6k | collapse(DV, DV->getFirstDomain()); |
59 | 310k | |
60 | 310k | DomainValue *Next = DV->Next; |
61 | 310k | DV->clear(); |
62 | 310k | Avail.push_back(DV); |
63 | 310k | // Also release the next DomainValue in the chain. |
64 | 310k | DV = Next; |
65 | 310k | } |
66 | 2.33M | } |
67 | | |
68 | | /// Follow the chain of dead DomainValues until a live DomainValue is reached. |
69 | | /// Update the referenced pointer when necessary. |
70 | 17.9M | DomainValue *ExecutionDepsFix::resolve(DomainValue *&DVRef) { |
71 | 17.9M | DomainValue *DV = DVRef; |
72 | 17.9M | if (!DV || !DV->Next248k ) |
73 | 17.9M | return DV; |
74 | 1.07k | |
75 | 1.07k | // DV has a chain. Find the end. |
76 | 1.08k | do 1.07k DV = DV->Next; |
77 | 1.08k | while (DV->Next); |
78 | 1.07k | |
79 | 1.07k | // Update DVRef to point to DV. |
80 | 1.07k | retain(DV); |
81 | 1.07k | release(DVRef); |
82 | 1.07k | DVRef = DV; |
83 | 1.07k | return DV; |
84 | 1.07k | } |
85 | | |
86 | | /// Set LiveRegs[rx] = dv, updating reference counts. |
87 | 478k | void ExecutionDepsFix::setLiveReg(int rx, DomainValue *dv) { |
88 | 478k | assert(unsigned(rx) < NumRegs && "Invalid index"); |
89 | 478k | assert(LiveRegs && "Must enter basic block first."); |
90 | 478k | |
91 | 478k | if (LiveRegs[rx].Value == dv) |
92 | 0 | return; |
93 | 478k | if (LiveRegs[rx].Value) |
94 | 15.7k | release(LiveRegs[rx].Value); |
95 | 478k | LiveRegs[rx].Value = retain(dv); |
96 | 478k | } |
97 | | |
98 | | // Kill register rx, recycle or collapse any DomainValue. |
99 | 279k | void ExecutionDepsFix::kill(int rx) { |
100 | 279k | assert(unsigned(rx) < NumRegs && "Invalid index"); |
101 | 279k | assert(LiveRegs && "Must enter basic block first."); |
102 | 279k | if (!LiveRegs[rx].Value) |
103 | 107k | return; |
104 | 171k | |
105 | 171k | release(LiveRegs[rx].Value); |
106 | 171k | LiveRegs[rx].Value = nullptr; |
107 | 171k | } |
108 | | |
109 | | /// Force register rx into domain. |
110 | 455k | void ExecutionDepsFix::force(int rx, unsigned domain) { |
111 | 455k | assert(unsigned(rx) < NumRegs && "Invalid index"); |
112 | 455k | assert(LiveRegs && "Must enter basic block first."); |
113 | 455k | if (DomainValue *dv = LiveRegs[rx].Value) { |
114 | 207k | if (dv->isCollapsed()) |
115 | 184k | dv->addDomain(domain); |
116 | 23.1k | else if (dv->hasDomain(domain)) |
117 | 23.0k | collapse(dv, domain); |
118 | 83 | else { |
119 | 83 | // This is an incompatible open DomainValue. Collapse it to whatever and |
120 | 83 | // force the new value into domain. This costs a domain crossing. |
121 | 83 | collapse(dv, dv->getFirstDomain()); |
122 | 83 | assert(LiveRegs[rx].Value && "Not live after collapse?"); |
123 | 83 | LiveRegs[rx].Value->addDomain(domain); |
124 | 83 | } |
125 | 247k | } else { |
126 | 247k | // Set up basic collapsed DomainValue. |
127 | 247k | setLiveReg(rx, alloc(domain)); |
128 | 247k | } |
129 | 455k | } |
130 | | |
131 | | /// Collapse open DomainValue into given domain. If there are multiple |
132 | | /// registers using dv, they each get a unique collapsed DomainValue. |
133 | 46.1k | void ExecutionDepsFix::collapse(DomainValue *dv, unsigned domain) { |
134 | 46.1k | assert(dv->hasDomain(domain) && "Cannot collapse"); |
135 | 46.1k | |
136 | 46.1k | // Collapse all the instructions. |
137 | 114k | while (!dv->Instrs.empty()) |
138 | 68.7k | TII->setExecutionDomain(*dv->Instrs.pop_back_val(), domain); |
139 | 46.1k | dv->setSingleDomain(domain); |
140 | 46.1k | |
141 | 46.1k | // If there are multiple users, give them new, unique DomainValues. |
142 | 46.1k | if (LiveRegs && dv->Refs > 132.1k ) |
143 | 210k | for (unsigned rx = 0; 6.38k rx != NumRegs; ++rx204k ) |
144 | 204k | if (LiveRegs[rx].Value == dv) |
145 | 13.5k | setLiveReg(rx, alloc(domain)); |
146 | 46.1k | } |
147 | | |
148 | | /// All instructions and registers in B are moved to A, and B is released. |
149 | 26.8k | bool ExecutionDepsFix::merge(DomainValue *A, DomainValue *B) { |
150 | 26.8k | assert(!A->isCollapsed() && "Cannot merge into collapsed"); |
151 | 26.8k | assert(!B->isCollapsed() && "Cannot merge from collapsed"); |
152 | 26.8k | if (A == B) |
153 | 23.6k | return true; |
154 | 3.14k | // Restrict to the domains that A and B have in common. |
155 | 3.14k | unsigned common = A->getCommonDomains(B->AvailableDomains); |
156 | 3.14k | if (!common) |
157 | 0 | return false; |
158 | 3.14k | A->AvailableDomains = common; |
159 | 3.14k | A->Instrs.append(B->Instrs.begin(), B->Instrs.end()); |
160 | 3.14k | |
161 | 3.14k | // Clear the old DomainValue so we won't try to swizzle instructions twice. |
162 | 3.14k | B->clear(); |
163 | 3.14k | // All uses of B are referred to A. |
164 | 3.14k | B->Next = retain(A); |
165 | 3.14k | |
166 | 103k | for (unsigned rx = 0; rx != NumRegs; ++rx100k ) { |
167 | 100k | assert(LiveRegs && "no space allocated for live registers"); |
168 | 100k | if (LiveRegs[rx].Value == B) |
169 | 2.18k | setLiveReg(rx, A); |
170 | 100k | } |
171 | 3.14k | return true; |
172 | 3.14k | } |
173 | | |
174 | | /// Set up LiveRegs by merging predecessor live-out values. |
175 | 459k | void ExecutionDepsFix::enterBasicBlock(MachineBasicBlock *MBB) { |
176 | 459k | // Reset instruction counter in each basic block. |
177 | 459k | CurInstr = 0; |
178 | 459k | |
179 | 459k | // Set up UndefReads to track undefined register reads. |
180 | 459k | UndefReads.clear(); |
181 | 459k | LiveRegSet.clear(); |
182 | 459k | |
183 | 459k | // Set up LiveRegs to represent registers entering MBB. |
184 | 459k | if (!LiveRegs) |
185 | 459k | LiveRegs = new LiveReg[NumRegs]; |
186 | 459k | |
187 | 459k | // Default values are 'nothing happened a long time ago'. |
188 | 15.1M | for (unsigned rx = 0; rx != NumRegs; ++rx14.7M ) { |
189 | 14.7M | LiveRegs[rx].Value = nullptr; |
190 | 14.7M | LiveRegs[rx].Def = -(1 << 20); |
191 | 14.7M | } |
192 | 459k | |
193 | 459k | // This is the entry block. |
194 | 459k | if (MBB->pred_empty()) { |
195 | 251k | for (const auto &LI : MBB->liveins()) { |
196 | 251k | for (int rx : regIndices(LI.PhysReg)) { |
197 | 95.7k | // Treat function live-ins as if they were defined just before the first |
198 | 95.7k | // instruction. Usually, function arguments are set up immediately |
199 | 95.7k | // before the call. |
200 | 95.7k | LiveRegs[rx].Def = -1; |
201 | 95.7k | } |
202 | 251k | } |
203 | 94.2k | DEBUG(dbgs() << printMBBReference(*MBB) << ": entry\n"); |
204 | 94.2k | return; |
205 | 94.2k | } |
206 | 365k | |
207 | 365k | // Try to coalesce live-out registers from predecessors. |
208 | 365k | for (MachineBasicBlock::const_pred_iterator pi = MBB->pred_begin(), |
209 | 938k | pe = MBB->pred_end(); pi != pe; ++pi573k ) { |
210 | 573k | auto fi = MBBInfos.find(*pi); |
211 | 573k | assert(fi != MBBInfos.end() && |
212 | 573k | "Should have pre-allocated MBBInfos for all MBBs"); |
213 | 573k | LiveReg *Incoming = fi->second.OutRegs; |
214 | 573k | // Incoming is null if this is a backedge from a BB |
215 | 573k | // we haven't processed yet |
216 | 573k | if (Incoming == nullptr) { |
217 | 14.0k | continue; |
218 | 14.0k | } |
219 | 559k | |
220 | 18.4M | for (unsigned rx = 0; 559k rx != NumRegs; ++rx17.9M ) { |
221 | 17.9M | // Use the most recent predecessor def for each register. |
222 | 17.9M | LiveRegs[rx].Def = std::max(LiveRegs[rx].Def, Incoming[rx].Def); |
223 | 17.9M | |
224 | 17.9M | DomainValue *pdv = resolve(Incoming[rx].Value); |
225 | 17.9M | if (!pdv) |
226 | 17.6M | continue; |
227 | 248k | if (!LiveRegs[rx].Value) { |
228 | 154k | setLiveReg(rx, pdv); |
229 | 154k | continue; |
230 | 154k | } |
231 | 93.6k | |
232 | 93.6k | // We have a live DomainValue from more than one predecessor. |
233 | 93.6k | if (LiveRegs[rx].Value->isCollapsed()) { |
234 | 68.1k | // We are already collapsed, but predecessor is not. Force it. |
235 | 68.1k | unsigned Domain = LiveRegs[rx].Value->getFirstDomain(); |
236 | 68.1k | if (!pdv->isCollapsed() && pdv->hasDomain(Domain)301 ) |
237 | 301 | collapse(pdv, Domain); |
238 | 68.1k | continue; |
239 | 68.1k | } |
240 | 25.5k | |
241 | 25.5k | // Currently open, merge in predecessor. |
242 | 25.5k | if (!pdv->isCollapsed()) |
243 | 25.3k | merge(LiveRegs[rx].Value, pdv); |
244 | 158 | else |
245 | 158 | force(rx, pdv->getFirstDomain()); |
246 | 25.5k | } |
247 | 559k | } |
248 | 365k | DEBUG( |
249 | 365k | dbgs() << printMBBReference(*MBB) |
250 | 365k | << (!isBlockDone(MBB) ? ": incomplete\n" : ": all preds known\n")); |
251 | 365k | } |
252 | | |
253 | 459k | void ExecutionDepsFix::leaveBasicBlock(MachineBasicBlock *MBB) { |
254 | 459k | assert(LiveRegs && "Must enter basic block first."); |
255 | 459k | LiveReg *OldOutRegs = MBBInfos[MBB].OutRegs; |
256 | 459k | // Save register clearances at end of MBB - used by enterBasicBlock(). |
257 | 459k | MBBInfos[MBB].OutRegs = LiveRegs; |
258 | 459k | |
259 | 459k | // While processing the basic block, we kept `Def` relative to the start |
260 | 459k | // of the basic block for convenience. However, future use of this information |
261 | 459k | // only cares about the clearance from the end of the block, so adjust |
262 | 459k | // everything to be relative to the end of the basic block. |
263 | 15.1M | for (unsigned i = 0, e = NumRegs; i != e; ++i14.7M ) |
264 | 14.7M | LiveRegs[i].Def -= CurInstr; |
265 | 459k | if (OldOutRegs) { |
266 | 58.6k | // This must be the second pass. |
267 | 58.6k | // Release all the DomainValues instead of keeping them. |
268 | 1.93M | for (unsigned i = 0, e = NumRegs; i != e; ++i1.87M ) |
269 | 1.87M | release(OldOutRegs[i].Value); |
270 | 58.6k | delete[] OldOutRegs; |
271 | 58.6k | } |
272 | 459k | LiveRegs = nullptr; |
273 | 459k | } |
274 | | |
275 | 2.76M | bool ExecutionDepsFix::visitInstr(MachineInstr *MI) { |
276 | 2.76M | // Update instructions with explicit execution domains. |
277 | 2.76M | std::pair<uint16_t, uint16_t> DomP = TII->getExecutionDomain(*MI); |
278 | 2.76M | if (DomP.first) { |
279 | 262k | if (DomP.second) |
280 | 106k | visitSoftInstr(MI, DomP.second); |
281 | 156k | else |
282 | 156k | visitHardInstr(MI, DomP.first); |
283 | 262k | } |
284 | 2.76M | |
285 | 2.76M | return !DomP.first; |
286 | 2.76M | } |
287 | | |
288 | | /// \brief Helps avoid false dependencies on undef registers by updating the |
289 | | /// machine instructions' undef operand to use a register that the instruction |
290 | | /// is truly dependent on, or use a register with clearance higher than Pref. |
291 | | /// Returns true if it was able to find a true dependency, thus not requiring |
292 | | /// a dependency breaking instruction regardless of clearance. |
293 | | bool ExecutionDepsFix::pickBestRegisterForUndef(MachineInstr *MI, |
294 | 1.46k | unsigned OpIdx, unsigned Pref) { |
295 | 1.46k | MachineOperand &MO = MI->getOperand(OpIdx); |
296 | 1.46k | assert(MO.isUndef() && "Expected undef machine operand"); |
297 | 1.46k | |
298 | 1.46k | unsigned OriginalReg = MO.getReg(); |
299 | 1.46k | |
300 | 1.46k | // Update only undef operands that are mapped to one register. |
301 | 1.46k | if (AliasMap[OriginalReg].size() != 1) |
302 | 0 | return false; |
303 | 1.46k | |
304 | 1.46k | // Get the undef operand's register class |
305 | 1.46k | const TargetRegisterClass *OpRC = |
306 | 1.46k | TII->getRegClass(MI->getDesc(), OpIdx, TRI, *MF); |
307 | 1.46k | |
308 | 1.46k | // If the instruction has a true dependency, we can hide the false depdency |
309 | 1.46k | // behind it. |
310 | 4.94k | for (MachineOperand &CurrMO : MI->operands()) { |
311 | 4.94k | if (!CurrMO.isReg() || CurrMO.isDef()4.66k || CurrMO.isUndef()3.20k || |
312 | 4.94k | !OpRC->contains(CurrMO.getReg())1.72k ) |
313 | 4.54k | continue; |
314 | 401 | // We found a true dependency - replace the undef register with the true |
315 | 401 | // dependency. |
316 | 401 | MO.setReg(CurrMO.getReg()); |
317 | 401 | return true; |
318 | 401 | } |
319 | 1.46k | |
320 | 1.46k | // Go over all registers in the register class and find the register with |
321 | 1.46k | // max clearance or clearance higher than Pref. |
322 | 1.46k | unsigned MaxClearance = 0; |
323 | 1.06k | unsigned MaxClearanceReg = OriginalReg; |
324 | 1.06k | ArrayRef<MCPhysReg> Order = RegClassInfo.getOrder(OpRC); |
325 | 4.13k | for (auto Reg : Order) { |
326 | 4.13k | assert(AliasMap[Reg].size() == 1 && |
327 | 4.13k | "Reg is expected to be mapped to a single index"); |
328 | 4.13k | int RCrx = *regIndices(Reg).begin(); |
329 | 4.13k | unsigned Clearance = CurInstr - LiveRegs[RCrx].Def; |
330 | 4.13k | if (Clearance <= MaxClearance) |
331 | 1.93k | continue; |
332 | 2.20k | MaxClearance = Clearance; |
333 | 2.20k | MaxClearanceReg = Reg; |
334 | 2.20k | |
335 | 2.20k | if (MaxClearance > Pref) |
336 | 1.05k | break; |
337 | 2.20k | } |
338 | 1.06k | |
339 | 1.06k | // Update the operand if we found a register with better clearance. |
340 | 1.06k | if (MaxClearanceReg != OriginalReg) |
341 | 897 | MO.setReg(MaxClearanceReg); |
342 | 1.06k | |
343 | 1.06k | return false; |
344 | 1.46k | } |
345 | | |
346 | | /// \brief Return true to if it makes sense to break dependence on a partial def |
347 | | /// or undef use. |
348 | | bool ExecutionDepsFix::shouldBreakDependence(MachineInstr *MI, unsigned OpIdx, |
349 | 1.71k | unsigned Pref) { |
350 | 1.71k | unsigned reg = MI->getOperand(OpIdx).getReg(); |
351 | 1.75k | for (int rx : regIndices(reg)) { |
352 | 1.75k | unsigned Clearance = CurInstr - LiveRegs[rx].Def; |
353 | 1.75k | DEBUG(dbgs() << "Clearance: " << Clearance << ", want " << Pref); |
354 | 1.75k | |
355 | 1.75k | if (Pref > Clearance) { |
356 | 324 | DEBUG(dbgs() << ": Break dependency.\n"); |
357 | 324 | continue; |
358 | 324 | } |
359 | 1.43k | DEBUG(dbgs() << ": OK .\n"); |
360 | 1.43k | return false; |
361 | 1.43k | } |
362 | 1.71k | return true282 ; |
363 | 1.71k | } |
364 | | |
365 | | // Update def-ages for registers defined by MI. |
366 | | // If Kill is set, also kill off DomainValues clobbered by the defs. |
367 | | // |
368 | | // Also break dependencies on partial defs and undef uses. |
369 | | void ExecutionDepsFix::processDefs(MachineInstr *MI, bool breakDependency, |
370 | 3.08M | bool Kill) { |
371 | 3.08M | assert(!MI->isDebugValue() && "Won't process debug values"); |
372 | 3.08M | |
373 | 3.08M | // Break dependence on undef uses. Do this before updating LiveRegs below. |
374 | 3.08M | unsigned OpNum; |
375 | 3.08M | if (breakDependency) { |
376 | 2.76M | unsigned Pref = TII->getUndefRegClearance(*MI, OpNum, TRI); |
377 | 2.76M | if (Pref) { |
378 | 1.46k | bool HadTrueDependency = pickBestRegisterForUndef(MI, OpNum, Pref); |
379 | 1.46k | // We don't need to bother trying to break a dependency if this |
380 | 1.46k | // instruction has a true dependency on that register through another |
381 | 1.46k | // operand - we'll have to wait for it to be available regardless. |
382 | 1.46k | if (!HadTrueDependency && shouldBreakDependence(MI, OpNum, Pref)1.06k ) |
383 | 10 | UndefReads.push_back(std::make_pair(MI, OpNum)); |
384 | 1.46k | } |
385 | 2.76M | } |
386 | 3.08M | const MCInstrDesc &MCID = MI->getDesc(); |
387 | 3.08M | for (unsigned i = 0, |
388 | 3.08M | e = MI->isVariadic() ? MI->getNumOperands()129k : MCID.getNumDefs()2.95M ; |
389 | 5.14M | i != e; ++i2.05M ) { |
390 | 2.05M | MachineOperand &MO = MI->getOperand(i); |
391 | 2.05M | if (!MO.isReg()) |
392 | 124k | continue; |
393 | 1.93M | if (MO.isUse()) |
394 | 189k | continue; |
395 | 1.74M | for (int rx : regIndices(MO.getReg())) { |
396 | 285k | // This instruction explicitly defines rx. |
397 | 285k | DEBUG(dbgs() << printReg(RC->getRegister(rx), TRI) << ":\t" << CurInstr |
398 | 285k | << '\t' << *MI); |
399 | 285k | |
400 | 285k | if (breakDependency) { |
401 | 273k | // Check clearance before partial register updates. |
402 | 273k | // Call breakDependence before setting LiveRegs[rx].Def. |
403 | 273k | unsigned Pref = TII->getPartialRegUpdateClearance(*MI, i, TRI); |
404 | 273k | if (Pref && shouldBreakDependence(MI, i, Pref)653 ) |
405 | 272 | TII->breakPartialRegDependency(*MI, i, TRI); |
406 | 273k | } |
407 | 285k | |
408 | 285k | // How many instructions since rx was last written? |
409 | 285k | LiveRegs[rx].Def = CurInstr; |
410 | 285k | |
411 | 285k | // Kill off domains redefined by generic instructions. |
412 | 285k | if (Kill) |
413 | 47.7k | kill(rx); |
414 | 285k | } |
415 | 1.74M | } |
416 | 3.08M | ++CurInstr; |
417 | 3.08M | } |
418 | | |
419 | | /// \break Break false dependencies on undefined register reads. |
420 | | /// |
421 | | /// Walk the block backward computing precise liveness. This is expensive, so we |
422 | | /// only do it on demand. Note that the occurrence of undefined register reads |
423 | | /// that should be broken is very rare, but when they occur we may have many in |
424 | | /// a single block. |
425 | 400k | void ExecutionDepsFix::processUndefReads(MachineBasicBlock *MBB) { |
426 | 400k | if (UndefReads.empty()) |
427 | 400k | return; |
428 | 10 | |
429 | 10 | // Collect this block's live out register units. |
430 | 10 | LiveRegSet.init(*TRI); |
431 | 10 | // We do not need to care about pristine registers as they are just preserved |
432 | 10 | // but not actually used in the function. |
433 | 10 | LiveRegSet.addLiveOutsNoPristines(*MBB); |
434 | 10 | |
435 | 10 | MachineInstr *UndefMI = UndefReads.back().first; |
436 | 10 | unsigned OpIdx = UndefReads.back().second; |
437 | 10 | |
438 | 79 | for (MachineInstr &I : make_range(MBB->rbegin(), MBB->rend())) { |
439 | 79 | // Update liveness, including the current instruction's defs. |
440 | 79 | LiveRegSet.stepBackward(I); |
441 | 79 | |
442 | 79 | if (UndefMI == &I) { |
443 | 10 | if (!LiveRegSet.contains(UndefMI->getOperand(OpIdx).getReg())) |
444 | 9 | TII->breakPartialRegDependency(*UndefMI, OpIdx, TRI); |
445 | 10 | |
446 | 10 | UndefReads.pop_back(); |
447 | 10 | if (UndefReads.empty()) |
448 | 10 | return; |
449 | 0 | |
450 | 0 | UndefMI = UndefReads.back().first; |
451 | 0 | OpIdx = UndefReads.back().second; |
452 | 0 | } |
453 | 79 | } |
454 | 10 | } |
455 | | |
456 | | // A hard instruction only works in one domain. All input registers will be |
457 | | // forced into that domain. |
458 | 193k | void ExecutionDepsFix::visitHardInstr(MachineInstr *mi, unsigned domain) { |
459 | 193k | // Collapse all uses. |
460 | 193k | for (unsigned i = mi->getDesc().getNumDefs(), |
461 | 753k | e = mi->getDesc().getNumOperands(); i != e; ++i559k ) { |
462 | 559k | MachineOperand &mo = mi->getOperand(i); |
463 | 559k | if (!mo.isReg()) continue140k ; |
464 | 418k | for (int rx : regIndices(mo.getReg())) { |
465 | 284k | force(rx, domain); |
466 | 284k | } |
467 | 418k | } |
468 | 193k | |
469 | 193k | // Kill all defs and force them. |
470 | 380k | for (unsigned i = 0, e = mi->getDesc().getNumDefs(); i != e; ++i186k ) { |
471 | 186k | MachineOperand &mo = mi->getOperand(i); |
472 | 186k | if (!mo.isReg()) continue0 ; |
473 | 186k | for (int rx : regIndices(mo.getReg())) { |
474 | 171k | kill(rx); |
475 | 171k | force(rx, domain); |
476 | 171k | } |
477 | 186k | } |
478 | 193k | } |
479 | | |
480 | | // A soft instruction can be changed to work in other domains given by mask. |
481 | 106k | void ExecutionDepsFix::visitSoftInstr(MachineInstr *mi, unsigned mask) { |
482 | 106k | // Bitmask of available domains for this instruction after taking collapsed |
483 | 106k | // operands into account. |
484 | 106k | unsigned available = mask; |
485 | 106k | |
486 | 106k | // Scan the explicit use operands for incoming domains. |
487 | 106k | SmallVector<int, 4> used; |
488 | 106k | if (LiveRegs) |
489 | 106k | for (unsigned i = mi->getDesc().getNumDefs(), |
490 | 500k | e = mi->getDesc().getNumOperands(); i != e; ++i394k ) { |
491 | 394k | MachineOperand &mo = mi->getOperand(i); |
492 | 394k | if (!mo.isReg()) continue121k ; |
493 | 273k | for (int rx : regIndices(mo.getReg())) { |
494 | 108k | DomainValue *dv = LiveRegs[rx].Value; |
495 | 108k | if (dv == nullptr) |
496 | 31.7k | continue; |
497 | 77.2k | // Bitmask of domains that dv and available have in common. |
498 | 77.2k | unsigned common = dv->getCommonDomains(available); |
499 | 77.2k | // Is it possible to use this collapsed register for free? |
500 | 77.2k | if (dv->isCollapsed()) { |
501 | 52.7k | // Restrict available domains to the ones in common with the operand. |
502 | 52.7k | // If there are no common domains, we must pay the cross-domain |
503 | 52.7k | // penalty for this operand. |
504 | 52.7k | if (common) available = common52.6k ; |
505 | 52.7k | } else if (24.4k common24.4k ) |
506 | 24.4k | // Open DomainValue is compatible, save it for merging. |
507 | 24.4k | used.push_back(rx); |
508 | 3 | else |
509 | 3 | // Open DomainValue is not compatible with instruction. It is useless |
510 | 3 | // now. |
511 | 3 | kill(rx); |
512 | 77.2k | } |
513 | 273k | } |
514 | 106k | |
515 | 106k | // If the collapsed operands force a single domain, propagate the collapse. |
516 | 106k | if (isPowerOf2_32(available)) { |
517 | 37.2k | unsigned domain = countTrailingZeros(available); |
518 | 37.2k | TII->setExecutionDomain(*mi, domain); |
519 | 37.2k | visitHardInstr(mi, domain); |
520 | 37.2k | return; |
521 | 37.2k | } |
522 | 68.7k | |
523 | 68.7k | // Kill off any remaining uses that don't match available, and build a list of |
524 | 68.7k | // incoming DomainValues that we want to merge. |
525 | 68.7k | SmallVector<const LiveReg *, 4> Regs; |
526 | 68.7k | for (int rx : used) { |
527 | 21.9k | assert(LiveRegs && "no space allocated for live registers"); |
528 | 21.9k | const LiveReg &LR = LiveRegs[rx]; |
529 | 21.9k | // This useless DomainValue could have been missed above. |
530 | 21.9k | if (!LR.Value->getCommonDomains(available)) { |
531 | 0 | kill(rx); |
532 | 0 | continue; |
533 | 0 | } |
534 | 21.9k | // Sorted insertion. |
535 | 21.9k | auto I = std::upper_bound(Regs.begin(), Regs.end(), &LR, |
536 | 21.9k | [](const LiveReg *LHS, const LiveReg *RHS) { |
537 | 2.52k | return LHS->Def < RHS->Def; |
538 | 2.52k | }); |
539 | 21.9k | Regs.insert(I, &LR); |
540 | 21.9k | } |
541 | 68.7k | |
542 | 68.7k | // doms are now sorted in order of appearance. Try to merge them all, giving |
543 | 68.7k | // priority to the latest ones. |
544 | 68.7k | DomainValue *dv = nullptr; |
545 | 90.7k | while (!Regs.empty()) { |
546 | 21.9k | if (!dv) { |
547 | 19.4k | dv = Regs.pop_back_val()->Value; |
548 | 19.4k | // Force the first dv to match the current instruction. |
549 | 19.4k | dv->AvailableDomains = dv->getCommonDomains(available); |
550 | 19.4k | assert(dv->AvailableDomains && "Domain should have been filtered"); |
551 | 19.4k | continue; |
552 | 19.4k | } |
553 | 2.52k | |
554 | 2.52k | DomainValue *Latest = Regs.pop_back_val()->Value; |
555 | 2.52k | // Skip already merged values. |
556 | 2.52k | if (Latest == dv || Latest->Next1.47k ) |
557 | 1.04k | continue; |
558 | 1.47k | if (merge(dv, Latest)) |
559 | 1.47k | continue; |
560 | 0 | |
561 | 0 | // If latest didn't merge, it is useless now. Kill all registers using it. |
562 | 0 | for (int i : used) { |
563 | 0 | assert(LiveRegs && "no space allocated for live registers"); |
564 | 0 | if (LiveRegs[i].Value == Latest) |
565 | 0 | kill(i); |
566 | 0 | } |
567 | 0 | } |
568 | 68.7k | |
569 | 68.7k | // dv is the DomainValue we are going to use for this instruction. |
570 | 68.7k | if (!dv) { |
571 | 49.3k | dv = alloc(); |
572 | 49.3k | dv->AvailableDomains = available; |
573 | 49.3k | } |
574 | 68.7k | dv->Instrs.push_back(mi); |
575 | 68.7k | |
576 | 68.7k | // Finally set all defs and non-collapsed uses to dv. We must iterate through |
577 | 68.7k | // all the operators, including imp-def ones. |
578 | 68.7k | for (MachineInstr::mop_iterator ii = mi->operands_begin(), |
579 | 68.7k | ee = mi->operands_end(); |
580 | 422k | ii != ee; ++ii353k ) { |
581 | 353k | MachineOperand &mo = *ii; |
582 | 353k | if (!mo.isReg()) continue101k ; |
583 | 252k | for (int rx : regIndices(mo.getReg())) { |
584 | 109k | if (!LiveRegs[rx].Value || (64.0k mo.isDef()64.0k && LiveRegs[rx].Value != dv22.7k )) { |
585 | 60.0k | kill(rx); |
586 | 60.0k | setLiveReg(rx, dv); |
587 | 60.0k | } |
588 | 109k | } |
589 | 252k | } |
590 | 68.7k | } |
591 | | |
592 | | void ExecutionDepsFix::processBasicBlock(MachineBasicBlock *MBB, |
593 | 459k | bool PrimaryPass) { |
594 | 459k | enterBasicBlock(MBB); |
595 | 459k | // If this block is not done, it makes little sense to make any decisions |
596 | 459k | // based on clearance information. We need to make a second pass anyway, |
597 | 459k | // and by then we'll have better information, so we can avoid doing the work |
598 | 459k | // to try and break dependencies now. |
599 | 459k | bool breakDependency = isBlockDone(MBB); |
600 | 3.08M | for (MachineInstr &MI : *MBB) { |
601 | 3.08M | if (!MI.isDebugValue()) { |
602 | 3.08M | bool Kill = false; |
603 | 3.08M | if (PrimaryPass) |
604 | 2.76M | Kill = visitInstr(&MI); |
605 | 3.08M | processDefs(&MI, breakDependency, Kill); |
606 | 3.08M | } |
607 | 3.08M | } |
608 | 459k | if (breakDependency) |
609 | 400k | processUndefReads(MBB); |
610 | 459k | leaveBasicBlock(MBB); |
611 | 459k | } |
612 | | |
613 | 2.46M | bool ExecutionDepsFix::isBlockDone(MachineBasicBlock *MBB) { |
614 | 2.46M | return MBBInfos[MBB].PrimaryCompleted && |
615 | 2.46M | MBBInfos[MBB].IncomingCompleted == MBBInfos[MBB].PrimaryIncoming1.50M && |
616 | 2.46M | MBBInfos[MBB].IncomingProcessed == MBB->pred_size()1.30M ; |
617 | 2.46M | } |
618 | | |
619 | 123k | bool ExecutionDepsFix::runOnMachineFunction(MachineFunction &mf) { |
620 | 123k | if (skipFunction(mf.getFunction())) |
621 | 74 | return false; |
622 | 123k | MF = &mf; |
623 | 123k | TII = MF->getSubtarget().getInstrInfo(); |
624 | 123k | TRI = MF->getSubtarget().getRegisterInfo(); |
625 | 123k | RegClassInfo.runOnMachineFunction(mf); |
626 | 123k | LiveRegs = nullptr; |
627 | 123k | assert(NumRegs == RC->getNumRegs() && "Bad regclass"); |
628 | 123k | |
629 | 123k | DEBUG(dbgs() << "********** FIX EXECUTION DEPENDENCIES: " |
630 | 123k | << TRI->getRegClassName(RC) << " **********\n"); |
631 | 123k | |
632 | 123k | // If no relevant registers are used in the function, we can skip it |
633 | 123k | // completely. |
634 | 123k | bool anyregs = false; |
635 | 123k | const MachineRegisterInfo &MRI = mf.getRegInfo(); |
636 | 1.05M | for (unsigned Reg : *RC) { |
637 | 1.05M | if (MRI.isPhysRegUsed(Reg)) { |
638 | 94.2k | anyregs = true; |
639 | 94.2k | break; |
640 | 94.2k | } |
641 | 1.05M | } |
642 | 123k | if (!anyregs) return false28.8k ; |
643 | 94.2k | |
644 | 94.2k | // Initialize the AliasMap on the first use. |
645 | 94.2k | if (AliasMap.empty()) { |
646 | 8.25k | // Given a PhysReg, AliasMap[PhysReg] returns a list of indices into RC and |
647 | 8.25k | // therefore the LiveRegs array. |
648 | 8.25k | AliasMap.resize(TRI->getNumRegs()); |
649 | 272k | for (unsigned i = 0, e = RC->getNumRegs(); i != e; ++i264k ) |
650 | 264k | for (MCRegAliasIterator AI(RC->getRegister(i), TRI, true); |
651 | 3.16M | AI.isValid(); ++AI2.90M ) |
652 | 2.90M | AliasMap[*AI].push_back(i); |
653 | 8.25k | } |
654 | 94.2k | |
655 | 94.2k | // Initialize the MMBInfos |
656 | 400k | for (auto &MBB : mf) { |
657 | 400k | MBBInfo InitialInfo; |
658 | 400k | MBBInfos.insert(std::make_pair(&MBB, InitialInfo)); |
659 | 400k | } |
660 | 94.2k | |
661 | 94.2k | /* |
662 | 94.2k | * We want to visit every instruction in every basic block in order to update |
663 | 94.2k | * it's execution domain or break any false dependencies. However, for the |
664 | 94.2k | * dependency breaking, we need to know clearances from all predecessors |
665 | 94.2k | * (including any backedges). One way to do so would be to do two complete |
666 | 94.2k | * passes over all basic blocks/instructions, the first for recording |
667 | 94.2k | * clearances, the second to break the dependencies. However, for functions |
668 | 94.2k | * without backedges, or functions with a lot of straight-line code, and |
669 | 94.2k | * a small loop, that would be a lot of unnecessary work (since only the |
670 | 94.2k | * BBs that are part of the loop require two passes). As an example, |
671 | 94.2k | * consider the following loop. |
672 | 94.2k | * |
673 | 94.2k | * |
674 | 94.2k | * PH -> A -> B (xmm<Undef> -> xmm<Def>) -> C -> D -> EXIT |
675 | 94.2k | * ^ | |
676 | 94.2k | * +----------------------------------+ |
677 | 94.2k | * |
678 | 94.2k | * The iteration order is as follows: |
679 | 94.2k | * Naive: PH A B C D A' B' C' D' |
680 | 94.2k | * Optimized: PH A B C A' B' C' D |
681 | 94.2k | * |
682 | 94.2k | * Note that we avoid processing D twice, because we can entirely process |
683 | 94.2k | * the predecessors before getting to D. We call a block that is ready |
684 | 94.2k | * for its second round of processing `done` (isBlockDone). Once we finish |
685 | 94.2k | * processing some block, we update the counters in MBBInfos and re-process |
686 | 94.2k | * any successors that are now done. |
687 | 94.2k | */ |
688 | 94.2k | |
689 | 94.2k | MachineBasicBlock *Entry = &*MF->begin(); |
690 | 94.2k | ReversePostOrderTraversal<MachineBasicBlock*> RPOT(Entry); |
691 | 94.2k | SmallVector<MachineBasicBlock *, 4> Workqueue; |
692 | 94.2k | for (ReversePostOrderTraversal<MachineBasicBlock*>::rpo_iterator |
693 | 494k | MBBI = RPOT.begin(), MBBE = RPOT.end(); MBBI != MBBE; ++MBBI400k ) { |
694 | 400k | MachineBasicBlock *MBB = *MBBI; |
695 | 400k | // N.B: IncomingProcessed and IncomingCompleted were already updated while |
696 | 400k | // processing this block's predecessors. |
697 | 400k | MBBInfos[MBB].PrimaryCompleted = true; |
698 | 400k | MBBInfos[MBB].PrimaryIncoming = MBBInfos[MBB].IncomingProcessed; |
699 | 400k | bool Primary = true; |
700 | 400k | Workqueue.push_back(MBB); |
701 | 860k | while (!Workqueue.empty()) { |
702 | 459k | MachineBasicBlock *ActiveMBB = &*Workqueue.back(); |
703 | 459k | Workqueue.pop_back(); |
704 | 459k | processBasicBlock(ActiveMBB, Primary); |
705 | 459k | bool Done = isBlockDone(ActiveMBB); |
706 | 577k | for (auto *Succ : ActiveMBB->successors()) { |
707 | 577k | if (!isBlockDone(Succ)) { |
708 | 563k | if (Primary) { |
709 | 478k | MBBInfos[Succ].IncomingProcessed++; |
710 | 478k | } |
711 | 563k | if (Done) { |
712 | 464k | MBBInfos[Succ].IncomingCompleted++; |
713 | 464k | } |
714 | 563k | if (isBlockDone(Succ)) { |
715 | 58.6k | Workqueue.push_back(Succ); |
716 | 58.6k | } |
717 | 563k | } |
718 | 577k | } |
719 | 459k | Primary = false; |
720 | 459k | } |
721 | 400k | } |
722 | 94.2k | |
723 | 94.2k | // We need to go through again and finalize any blocks that are not done yet. |
724 | 94.2k | // This is possible if blocks have dead predecessors, so we didn't visit them |
725 | 94.2k | // above. |
726 | 94.2k | for (ReversePostOrderTraversal<MachineBasicBlock *>::rpo_iterator |
727 | 94.2k | MBBI = RPOT.begin(), |
728 | 94.2k | MBBE = RPOT.end(); |
729 | 494k | MBBI != MBBE; ++MBBI400k ) { |
730 | 400k | MachineBasicBlock *MBB = *MBBI; |
731 | 400k | if (!isBlockDone(MBB)) { |
732 | 0 | processBasicBlock(MBB, false); |
733 | 0 | // Don't update successors here. We'll get to them anyway through this |
734 | 0 | // loop. |
735 | 0 | } |
736 | 400k | } |
737 | 94.2k | |
738 | 94.2k | // Clear the LiveOuts vectors and collapse any remaining DomainValues. |
739 | 94.2k | for (ReversePostOrderTraversal<MachineBasicBlock*>::rpo_iterator |
740 | 494k | MBBI = RPOT.begin(), MBBE = RPOT.end(); MBBI != MBBE; ++MBBI400k ) { |
741 | 400k | auto FI = MBBInfos.find(*MBBI); |
742 | 400k | if (FI == MBBInfos.end() || !FI->second.OutRegs400k ) |
743 | 0 | continue; |
744 | 13.2M | for (unsigned i = 0, e = NumRegs; 400k i != e; ++i12.8M ) |
745 | 12.8M | if (FI->second.OutRegs[i].Value) |
746 | 266k | release(FI->second.OutRegs[i].Value); |
747 | 400k | delete[] FI->second.OutRegs; |
748 | 400k | } |
749 | 94.2k | MBBInfos.clear(); |
750 | 94.2k | UndefReads.clear(); |
751 | 94.2k | Avail.clear(); |
752 | 94.2k | Allocator.DestroyAll(); |
753 | 94.2k | |
754 | 94.2k | return false; |
755 | 94.2k | } |