/Users/buildslave/jenkins/sharedspace/clang-stage2-coverage-R@2/llvm/lib/Analysis/BlockFrequencyInfoImpl.cpp
Line | Count | Source |
1 | | //===- BlockFrequencyInfoImpl.cpp - Block Frequency Info Implementation ---===// |
2 | | // |
3 | | // The LLVM Compiler Infrastructure |
4 | | // |
5 | | // This file is distributed under the University of Illinois Open Source |
6 | | // License. See LICENSE.TXT for details. |
7 | | // |
8 | | //===----------------------------------------------------------------------===// |
9 | | // |
10 | | // Loops should be simplified before this analysis. |
11 | | // |
12 | | //===----------------------------------------------------------------------===// |
13 | | |
14 | | #include "llvm/Analysis/BlockFrequencyInfoImpl.h" |
15 | | #include "llvm/ADT/APInt.h" |
16 | | #include "llvm/ADT/DenseMap.h" |
17 | | #include "llvm/ADT/GraphTraits.h" |
18 | | #include "llvm/ADT/None.h" |
19 | | #include "llvm/ADT/SCCIterator.h" |
20 | | #include "llvm/IR/Function.h" |
21 | | #include "llvm/Support/BlockFrequency.h" |
22 | | #include "llvm/Support/BranchProbability.h" |
23 | | #include "llvm/Support/Compiler.h" |
24 | | #include "llvm/Support/Debug.h" |
25 | | #include "llvm/Support/MathExtras.h" |
26 | | #include "llvm/Support/ScaledNumber.h" |
27 | | #include "llvm/Support/raw_ostream.h" |
28 | | #include <algorithm> |
29 | | #include <cassert> |
30 | | #include <cstddef> |
31 | | #include <cstdint> |
32 | | #include <iterator> |
33 | | #include <list> |
34 | | #include <numeric> |
35 | | #include <utility> |
36 | | #include <vector> |
37 | | |
38 | | using namespace llvm; |
39 | | using namespace llvm::bfi_detail; |
40 | | |
41 | | #define DEBUG_TYPE "block-freq" |
42 | | |
43 | 41.2M | ScaledNumber<uint64_t> BlockMass::toScaled() const { |
44 | 41.2M | if (isFull()) |
45 | 11.3M | return ScaledNumber<uint64_t>(1, 0); |
46 | 29.9M | return ScaledNumber<uint64_t>(getMass() + 1, -64); |
47 | 29.9M | } |
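// Worked example (illustrative): a mass of 2^63 - 1 converts to
// (2^63) * 2^-64 = 0.5, while the isFull() special case maps UINT64_MAX
// directly to 1.0, sidestepping the wraparound of getMass() + 1 at 2^64.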
48 | | |
49 | | #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) |
50 | | LLVM_DUMP_METHOD void BlockMass::dump() const { print(dbgs()); } |
51 | | #endif |
52 | | |
53 | 0 | static char getHexDigit(int N) { |
54 | 0 | assert(N < 16); |
55 | 0 | if (N < 10) |
56 | 0 | return '0' + N; |
57 | 0 | return 'a' + N - 10; |
58 | 0 | } |
59 | | |
60 | 0 | raw_ostream &BlockMass::print(raw_ostream &OS) const { |
61 | 0 | for (int Digits = 0; Digits < 16; ++Digits) |
62 | 0 | OS << getHexDigit(Mass >> (60 - Digits * 4) & 0xf); |
63 | 0 | return OS; |
64 | 0 | } |
65 | | |
66 | | namespace { |
67 | | |
68 | | using BlockNode = BlockFrequencyInfoImplBase::BlockNode; |
69 | | using Distribution = BlockFrequencyInfoImplBase::Distribution; |
70 | | using WeightList = BlockFrequencyInfoImplBase::Distribution::WeightList; |
71 | | using Scaled64 = BlockFrequencyInfoImplBase::Scaled64; |
72 | | using LoopData = BlockFrequencyInfoImplBase::LoopData; |
73 | | using Weight = BlockFrequencyInfoImplBase::Weight; |
74 | | using FrequencyData = BlockFrequencyInfoImplBase::FrequencyData; |
75 | | |
76 | | /// \brief Dithering mass distributer. |
77 | | /// |
78 | | /// This class splits up a single mass into portions by weight, dithering to |
79 | | /// spread out error. No mass is lost. The dithering precision depends on the |
80 | | /// precision of the product of \a BlockMass and \a BranchProbability. |
81 | | /// |
82 | | /// The distribution algorithm follows. |
83 | | /// |
84 | | /// 1. Initialize by saving the sum of the weights in \a RemWeight and the |
85 | | /// mass to distribute in \a RemMass. |
86 | | /// |
87 | | /// 2. For each portion: |
88 | | /// |
89 | | /// 1. Construct a branch probability, P, as the portion's weight divided |
90 | | /// by the current value of \a RemWeight. |
91 | | /// 2. Calculate the portion's mass as \a RemMass times P. |
92 | | /// 3. Update \a RemWeight and \a RemMass at each portion by subtracting |
93 | | /// the current portion's weight and mass. |
94 | | struct DitheringDistributer { |
95 | | uint32_t RemWeight; |
96 | | BlockMass RemMass; |
97 | | |
98 | | DitheringDistributer(Distribution &Dist, const BlockMass &Mass); |
99 | | |
100 | | BlockMass takeMass(uint32_t Weight); |
101 | | }; |
102 | | |
103 | | } // end anonymous namespace |
104 | | |
105 | | DitheringDistributer::DitheringDistributer(Distribution &Dist, |
106 | 38.1M | const BlockMass &Mass) { |
107 | 38.1M | Dist.normalize(); |
108 | 38.1M | RemWeight = Dist.Total; |
109 | 38.1M | RemMass = Mass; |
110 | 38.1M | } |
111 | | |
112 | 49.8M | BlockMass DitheringDistributer::takeMass(uint32_t Weight) { |
113 | 49.8M | assert(Weight && "invalid weight"); |
114 | 49.8M | assert(Weight <= RemWeight); |
115 | 49.8M | BlockMass Mass = RemMass * BranchProbability(Weight, RemWeight); |
116 | 49.8M | |
117 | 49.8M | // Decrement totals (dither). |
118 | 49.8M | RemWeight -= Weight; |
119 | 49.8M | RemMass -= Mass; |
120 | 49.8M | return Mass; |
121 | 49.8M | } |
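// A minimal standalone sketch of the dithering scheme documented above:
// RemMass and RemWeight shrink together, so the rounding error from each
// portion is absorbed by the ones that follow, and the final portion takes
// exactly what is left. A plain uint64_t stands in for BlockMass, a 128-bit
// product (GCC/Clang extension) for BlockMass * BranchProbability, and
// takePortion is an illustrative name, not part of this file.
#include <cassert>
#include <cstdint>
#include <initializer_list>

static uint64_t takePortion(uint64_t &RemMass, uint32_t &RemWeight,
                            uint32_t Weight) {
  assert(Weight && Weight <= RemWeight);
  uint64_t Portion = (unsigned __int128)RemMass * Weight / RemWeight;
  RemWeight -= Weight; // dither: later portions see the reduced totals
  RemMass -= Portion;
  return Portion;
}

int main() {
  uint64_t Mass = UINT64_MAX; // the "full" mass
  uint32_t TotalWeight = 3 + 5 + 7;
  uint64_t Sum = 0;
  for (uint32_t W : {3u, 5u, 7u})
    Sum += takePortion(Mass, TotalWeight, W);
  assert(Mass == 0 && Sum == UINT64_MAX); // no mass was lost
  return 0;
}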
122 | | |
123 | | void Distribution::add(const BlockNode &Node, uint64_t Amount, |
124 | 50.2M | Weight::DistType Type) { |
125 | 50.2M | assert(Amount && "invalid weight of 0"); |
126 | 50.2M | uint64_t NewTotal = Total + Amount; |
127 | 50.2M | |
128 | 50.2M | // Check for overflow. It should be impossible to overflow twice. |
129 | 50.2M | bool IsOverflow = NewTotal < Total; |
130 | 50.2M | assert(!(DidOverflow && IsOverflow) && "unexpected repeated overflow"); |
131 | 50.2M | DidOverflow |= IsOverflow; |
132 | 50.2M | |
133 | 50.2M | // Update the total. |
134 | 50.2M | Total = NewTotal; |
135 | 50.2M | |
136 | 50.2M | // Save the weight. |
137 | 50.2M | Weights.push_back(Weight(Type, Node, Amount)); |
138 | 50.2M | } |
139 | | |
140 | 383k | static void combineWeight(Weight &W, const Weight &OtherW) { |
141 | 383k | assert(OtherW.TargetNode.isValid()); |
142 | 383k | if (!W.Amount) { |
143 | 13.1k | W = OtherW; |
144 | 13.1k | return; |
145 | 13.1k | } |
146 | 383k | assert(W.Type == OtherW.Type); |
147 | 370k | assert(W.TargetNode == OtherW.TargetNode); |
148 | 370k | assert(OtherW.Amount && "Expected non-zero weight"); |
149 | 370k | if (W.Amount > W.Amount + OtherW.Amount) |
150 | 370k | // Saturate on overflow. |
151 | 2 | W.Amount = UINT64_MAX; |
152 | 370k | else |
153 | 370k | W.Amount += OtherW.Amount; |
154 | 383k | } |
155 | | |
156 | 17.4M | static void combineWeightsBySorting(WeightList &Weights) { |
157 | 17.4M | // Sort so edges to the same node are adjacent. |
158 | 17.4M | std::sort(Weights.begin(), Weights.end(), |
159 | 17.4M | [](const Weight &L, |
160 | 19.2M | const Weight &R) { return L.TargetNode < R.TargetNode; }); |
161 | 17.4M | |
162 | 17.4M | // Combine adjacent edges. |
163 | 17.4M | WeightList::iterator O = Weights.begin(); |
164 | 52.4M | for (WeightList::const_iterator I = O, L = O, E = Weights.end(); I != E; |
165 | 35.0M | ++O, (I = L)) { |
166 | 35.0M | *O = *I; |
167 | 35.0M | |
168 | 35.0M | // Find the adjacent weights to the same node. |
169 | 35.4M | for (++L; L != E && I->TargetNode == L->TargetNode; ++L) |
170 | 368k | combineWeight(*O, *L); |
171 | 35.0M | } |
172 | 17.4M | |
173 | 17.4M | // Erase extra entries. |
174 | 17.4M | Weights.erase(O, Weights.end()); |
175 | 17.4M | } |
176 | | |
177 | 85 | static void combineWeightsByHashing(WeightList &Weights) { |
178 | 85 | // Collect weights into a DenseMap. |
179 | 85 | using HashTable = DenseMap<BlockNode::IndexType, Weight>; |
180 | 85 | |
181 | 85 | HashTable Combined(NextPowerOf2(2 * Weights.size())); |
182 | 85 | for (const Weight &W : Weights) |
183 | 15.2k | combineWeight(Combined[W.TargetNode.Index], W); |
184 | 85 | |
185 | 85 | // Check whether anything changed. |
186 | 85 | if (Weights.size() == Combined.size()) |
187 | 63 | return; |
188 | 22 | |
189 | 22 | // Fill in the new weights. |
190 | 22 | Weights.clear(); |
191 | 22 | Weights.reserve(Combined.size()); |
192 | 22 | for (const auto &I : Combined) |
193 | 2.23k | Weights.push_back(I.second); |
194 | 85 | } |
195 | | |
196 | 17.4M | static void combineWeights(WeightList &Weights) { |
197 | 17.4M | // Use a hash table for many successors to keep this linear. |
198 | 17.4M | if (Weights.size() > 128) { |
199 | 85 | combineWeightsByHashing(Weights); |
200 | 85 | return; |
201 | 85 | } |
202 | 17.4M | |
203 | 17.4M | combineWeightsBySorting(Weights); |
204 | 17.4M | } |
205 | | |
206 | 1.43M | static uint64_t shiftRightAndRound(uint64_t N, int Shift) { |
207 | 1.43M | assert(Shift >= 0); |
208 | 1.43M | assert(Shift < 64); |
209 | 1.43M | if (!Shift) |
210 | 0 | return N; |
211 | 1.43M | return (N >> Shift) + (UINT64_C(1) & N >> (Shift - 1)); |
212 | 1.43M | } |
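// Worked examples (illustrative): the added term is the bit just below the
// cut, so the shift rounds to nearest with halves rounding up:
//   shiftRightAndRound(11, 2) == 3   // 11/4 = 2.75
//   shiftRightAndRound(9, 2)  == 2   //  9/4 = 2.25
//   shiftRightAndRound(10, 2) == 3   // 10/4 = 2.50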
213 | | |
214 | 38.1M | void Distribution::normalize() { |
215 | 38.1M | // Early exit for termination nodes. |
216 | 38.1M | if (Weights.empty()) |
217 | 5.95M | return; |
218 | 32.2M | |
219 | 32.2M | // Only bother if there are multiple successors. |
220 | 32.2M | if (Weights.size() > 1) |
221 | 17.4M | combineWeights(Weights); |
222 | 32.2M | |
223 | 32.2M | // Early exit when combined into a single successor. |
224 | 32.2M | if (Weights.size() == 1) { |
225 | 15.0M | Total = 1; |
226 | 15.0M | Weights.front().Amount = 1; |
227 | 15.0M | return; |
228 | 15.0M | } |
229 | 17.2M | |
230 | 17.2M | // Determine how much to shift right so that the total fits into 32-bits. |
231 | 17.2M | // |
232 | 17.2M | // If we shift at all, shift by 1 extra. Otherwise, the lower limit of 1 |
233 | 17.2M | // for each weight can cause a 32-bit overflow. |
234 | 17.2M | int Shift = 0; |
235 | 17.2M | if (DidOverflow) |
236 | 0 | Shift = 33; |
237 | 17.2M | else if (Total > UINT32_MAX) |
238 | 647k | Shift = 33 - countLeadingZeros(Total); |
239 | 17.2M | |
240 | 17.2M | // Early exit if nothing needs to be scaled. |
241 | 17.2M | if (!Shift) { |
242 | 16.5M | // If we didn't overflow then combineWeights() shouldn't have changed the |
243 | 16.5M | // sum of the weights, but let's double-check. |
244 | 16.5M | assert(Total == std::accumulate(Weights.begin(), Weights.end(), UINT64_C(0), |
245 | 16.5M | [](uint64_t Sum, const Weight &W) { |
246 | 16.5M | return Sum + W.Amount; |
247 | 16.5M | }) && |
248 | 16.5M | "Expected total to be correct"); |
249 | 16.5M | return; |
250 | 16.5M | } |
251 | 647k | |
252 | 647k | // Recompute the total through accumulation (rather than shifting it) so that |
253 | 647k | // it's accurate after shifting and any changes combineWeights() made above. |
254 | 647k | Total = 0; |
255 | 647k | |
256 | 647k | // Sum the weights to each node and shift right if necessary. |
257 | 1.43M | for (Weight &W : Weights) { |
258 | 1.43M | // Scale down below UINT32_MAX. Since Shift is larger than necessary, we |
259 | 1.43M | // can round here without concern about overflow. |
260 | 1.43M | assert(W.TargetNode.isValid()); |
261 | 1.43M | W.Amount = std::max(UINT64_C(1), shiftRightAndRound(W.Amount, Shift)); |
262 | 1.43M | assert(W.Amount <= UINT32_MAX); |
263 | 1.43M | |
264 | 1.43M | // Update the total. |
265 | 1.43M | Total += W.Amount; |
266 | 1.43M | } |
267 | 38.1M | assert(Total <= UINT32_MAX); |
268 | 38.1M | } |
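// Worked example (illustrative): for Total = 2^36, countLeadingZeros(Total)
// is 27, so Shift = 33 - 27 = 6 and the rescaled total is about 2^30. That
// one extra bit of shifting leaves headroom for the per-weight lower bound
// of 1 applied above without pushing Total back over UINT32_MAX.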
269 | | |
270 | 9.75M | void BlockFrequencyInfoImplBase::clear() { |
271 | 9.75M | // Swap with a default-constructed std::vector, since std::vector<>::clear() |
272 | 9.75M | // does not actually clear heap storage. |
273 | 9.75M | std::vector<FrequencyData>().swap(Freqs); |
274 | 9.75M | std::vector<WorkingData>().swap(Working); |
275 | 9.75M | Loops.clear(); |
276 | 9.75M | } |
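// The swap idiom above is the standard way to actually release a vector's
// heap allocation; clear() only resets the size. A minimal standalone
// illustration (the capacity outcomes reflect common implementations rather
// than a strict guarantee of the C++ standard):
#include <cassert>
#include <vector>

int main() {
  std::vector<int> V(1000);
  V.clear();                    // size becomes 0; capacity is typically kept
  assert(V.capacity() >= 1000); // holds on common implementations
  std::vector<int>().swap(V);   // swapping with an empty temporary frees it
  assert(V.capacity() == 0);    // V now owns the temporary's empty buffer
  return 0;
}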
277 | | |
278 | | /// \brief Clear all memory not needed downstream. |
279 | | /// |
280 | | /// Releases all memory not used downstream. In particular, saves Freqs. |
281 | 4.87M | static void cleanup(BlockFrequencyInfoImplBase &BFI) { |
282 | 4.87M | std::vector<FrequencyData> SavedFreqs(std::move(BFI.Freqs)); |
283 | 4.87M | BFI.clear(); |
284 | 4.87M | BFI.Freqs = std::move(SavedFreqs); |
285 | 4.87M | } |
286 | | |
287 | | bool BlockFrequencyInfoImplBase::addToDist(Distribution &Dist, |
288 | | const LoopData *OuterLoop, |
289 | | const BlockNode &Pred, |
290 | | const BlockNode &Succ, |
291 | 50.2M | uint64_t Weight) { |
292 | 50.2M | if (!Weight) |
293 | 1.66M | Weight = 1; |
294 | 50.2M | |
295 | 50.2M | auto isLoopHeader = [&OuterLoop](const BlockNode &Node) { |
296 | 18.3M | return OuterLoop && OuterLoop->isHeader(Node); |
297 | 50.2M | }; |
298 | 50.2M | |
299 | 50.2M | BlockNode Resolved = Working[Succ.Index].getResolvedNode(); |
300 | 50.2M | |
301 | | #ifndef NDEBUG |
302 | | auto debugSuccessor = [&](const char *Type) { |
303 | | dbgs() << " =>" |
304 | | << " [" << Type << "] weight = " << Weight; |
305 | | if (!isLoopHeader(Resolved)) |
306 | | dbgs() << ", succ = " << getBlockName(Succ); |
307 | | if (Resolved != Succ) |
308 | | dbgs() << ", resolved = " << getBlockName(Resolved); |
309 | | dbgs() << "\n"; |
310 | | }; |
311 | | (void)debugSuccessor; |
312 | | #endif |
313 | | |
314 | 50.2M | if (isLoopHeader(Resolved)) { |
315 | 3.17M | DEBUG(debugSuccessor("backedge")); |
316 | 3.17M | Dist.addBackedge(Resolved, Weight); |
317 | 3.17M | return true; |
318 | 3.17M | } |
319 | 47.0M | |
320 | 47.0M | if (Working[Resolved.Index].getContainingLoop() != OuterLoop) { |
321 | 4.16M | DEBUG(debugSuccessor(" exit ")); |
322 | 4.16M | Dist.addExit(Resolved, Weight); |
323 | 4.16M | return true; |
324 | 4.16M | } |
325 | 42.9M | |
326 | 42.9M | if (Resolved < Pred) { |
327 | 4.68k | if (!isLoopHeader(Pred)) { |
328 | 584 | // If OuterLoop is an irreducible loop, we can't actually handle this. |
329 | 584 | assert((!OuterLoop || !OuterLoop->isIrreducible()) && |
330 | 584 | "unhandled irreducible control flow"); |
331 | 584 | |
332 | 584 | // Irreducible backedge. Abort. |
333 | 584 | DEBUG(debugSuccessor("abort!!!")); |
334 | 584 | return false; |
335 | 584 | } |
336 | 4.09k | |
337 | 4.09k | // If "Pred" is a loop header, then this isn't really a backedge; rather, |
338 | 4.09k | // OuterLoop must be irreducible. These false backedges can come only from |
339 | 4.09k | // secondary loop headers. |
340 | 4.68k | assert(OuterLoop && OuterLoop->isIrreducible() && !isLoopHeader(Resolved) && |
341 | 4.09k | "unhandled irreducible control flow"); |
342 | 4.09k | } |
343 | 42.9M | |
344 | 42.9M | DEBUG(debugSuccessor(" local ")); |
345 | 42.9M | Dist.addLocal(Resolved, Weight); |
346 | 42.9M | return true; |
347 | 50.2M | } |
348 | | |
349 | | bool BlockFrequencyInfoImplBase::addLoopSuccessorsToDist( |
350 | 3.11M | const LoopData *OuterLoop, LoopData &Loop, Distribution &Dist) { |
351 | 3.11M | // Copy the exit map into Dist. |
352 | 3.11M | for (const auto &I : Loop.Exits) |
353 | 4.14M | if (!addToDist(Dist, OuterLoop, Loop.getHeader(), I.first, |
354 | 4.14M | I.second.getMass())) |
355 | 4.14M | // Irreducible backedge. |
356 | 48 | return false; |
357 | 3.11M | |
358 | 3.11M | return true; |
359 | 3.11M | } |
360 | | |
361 | | /// \brief Compute the loop scale for a loop. |
362 | 3.11M | void BlockFrequencyInfoImplBase::computeLoopScale(LoopData &Loop) { |
363 | 3.11M | // Compute loop scale. |
364 | 3.11M | DEBUG(dbgs() << "compute-loop-scale: " << getLoopName(Loop) << "\n"); |
365 | 3.11M | |
366 | 3.11M | // Infinite loops need special handling. If we give the back edge an infinite |
367 | 3.11M | // mass, they may saturate all the other scales in the function down to 1, |
368 | 3.11M | // making all the other region temperatures look exactly the same. Choose an |
369 | 3.11M | // arbitrary scale to avoid these issues. |
370 | 3.11M | // |
371 | 3.11M | // FIXME: An alternate way would be to select a symbolic scale which is later |
372 | 3.11M | // replaced to be the maximum of all computed scales plus 1. This would |
373 | 3.11M | // appropriately describe the loop as having a large scale, without skewing |
374 | 3.11M | // the final frequency computation. |
375 | 3.11M | const Scaled64 InfiniteLoopScale(1, 12); |
376 | 3.11M | |
377 | 3.11M | // LoopScale == 1 / ExitMass |
378 | 3.11M | // ExitMass == HeadMass - BackedgeMass |
379 | 3.11M | BlockMass TotalBackedgeMass; |
380 | 3.11M | for (auto &Mass : Loop.BackedgeMass) |
381 | 3.11M | TotalBackedgeMass += Mass; |
382 | 3.11M | BlockMass ExitMass = BlockMass::getFull() - TotalBackedgeMass; |
383 | 3.11M | |
384 | 3.11M | // Block scale stores the inverse of the scale. If this is an infinite loop, |
385 | 3.11M | // its exit mass will be zero. In this case, use an arbitrary scale for the |
386 | 3.11M | // loop scale. |
387 | 3.11M | Loop.Scale = |
388 | 3.11M | ExitMass.isEmpty() ? InfiniteLoopScale : ExitMass.toScaled().inverse(); |
389 | 3.11M | |
390 | 3.11M | DEBUG(dbgs() << " - exit-mass = " << ExitMass << " (" << BlockMass::getFull() |
391 | 3.11M | << " - " << TotalBackedgeMass << ")\n" |
392 | 3.11M | << " - scale = " << Loop.Scale << "\n"); |
393 | 3.11M | } |
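// Worked example (illustrative): if 3/4 of the header's mass returns along
// the backedges, TotalBackedgeMass = 3/4, ExitMass = 1 - 3/4 = 1/4, and
// Loop.Scale = 4, i.e. the loop body is four times as hot as the loop's
// entry. An infinite loop (ExitMass == 0) instead gets the fixed scale 2^12.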
394 | | |
395 | | /// \brief Package up a loop. |
396 | 3.11M | void BlockFrequencyInfoImplBase::packageLoop(LoopData &Loop) { |
397 | 3.11M | DEBUG(dbgs() << "packaging-loop: " << getLoopName(Loop) << "\n"); |
398 | 3.11M | |
399 | 3.11M | // Clear the subloop exits to prevent quadratic memory usage. |
400 | 10.9M | for (const BlockNode &M : Loop.Nodes) { |
401 | 10.9M | if (auto *Loop = Working[M.Index].getPackagedLoop()) |
402 | 895k | Loop->Exits.clear(); |
403 | 10.9M | DEBUG(dbgs() << " - node: " << getBlockName(M.Index) << "\n"); |
404 | 10.9M | } |
405 | 3.11M | Loop.IsPackaged = true; |
406 | 3.11M | } |
407 | | |
408 | | #ifndef NDEBUG |
409 | | static void debugAssign(const BlockFrequencyInfoImplBase &BFI, |
410 | | const DitheringDistributer &D, const BlockNode &T, |
411 | | const BlockMass &M, const char *Desc) { |
412 | | dbgs() << " => assign " << M << " (" << D.RemMass << ")"; |
413 | | if (Desc) |
414 | | dbgs() << " [" << Desc << "]"; |
415 | | if (T.isValid()) |
416 | | dbgs() << " to " << BFI.getBlockName(T); |
417 | | dbgs() << "\n"; |
418 | | } |
419 | | #endif |
420 | | |
421 | | void BlockFrequencyInfoImplBase::distributeMass(const BlockNode &Source, |
422 | | LoopData *OuterLoop, |
423 | 38.1M | Distribution &Dist) { |
424 | 38.1M | BlockMass Mass = Working[Source.Index].getMass(); |
425 | 38.1M | DEBUG(dbgs() << " => mass: " << Mass << "\n"); |
426 | 38.1M | |
427 | 38.1M | // Distribute mass to successors as laid out in Dist. |
428 | 38.1M | DitheringDistributer D(Dist, Mass); |
429 | 38.1M | |
430 | 49.8M | for (const Weight &W : Dist.Weights) { |
431 | 49.8M | // Check for a local edge (non-backedge and non-exit). |
432 | 49.8M | BlockMass Taken = D.takeMass(W.Amount); |
433 | 49.8M | if (W.Type == Weight::Local) { |
434 | 42.5M | Working[W.TargetNode.Index].getMass() += Taken; |
435 | 42.5M | DEBUG(debugAssign(*this, D, W.TargetNode, Taken, nullptr)); |
436 | 42.5M | continue; |
437 | 42.5M | } |
438 | 7.31M | |
439 | 7.31M | // Backedges and exits only make sense if we're processing a loop. |
440 | 49.8M | assert(OuterLoop && "backedge or exit outside of loop"); |
441 | 7.31M | |
442 | 7.31M | // Check for a backedge. |
443 | 7.31M | if (W.Type == Weight::Backedge) { |
444 | 3.17M | OuterLoop->BackedgeMass[OuterLoop->getHeaderIndex(W.TargetNode)] += Taken; |
445 | 3.17M | DEBUG(debugAssign(*this, D, W.TargetNode, Taken, "back")); |
446 | 3.17M | continue; |
447 | 3.17M | } |
448 | 4.14M | |
449 | 4.14M | // This must be an exit. |
450 | 7.31M | assert(W.Type == Weight::Exit); |
451 | 4.14M | OuterLoop->Exits.push_back(std::make_pair(W.TargetNode, Taken)); |
452 | 4.14M | DEBUG(debugAssign(*this, D, W.TargetNode, Taken, "exit")); |
453 | 49.8M | } |
454 | 38.1M | } |
455 | | |
456 | | static void convertFloatingToInteger(BlockFrequencyInfoImplBase &BFI, |
457 | 4.87M | const Scaled64 &Min, const Scaled64 &Max) { |
458 | 4.87M | // Scale the Factor to a size that creates integers. Ideally, integers would |
459 | 4.87M | // be scaled so that Max == UINT64_MAX so that they can be best |
460 | 4.87M | // differentiated. However, in the presence of large frequency values, small |
461 | 4.87M | // frequencies are scaled down to 1, making it impossible to differentiate |
462 | 4.87M | // small, unequal numbers. When the spread between Min and Max frequencies |
463 | 4.87M | // fits well within MaxBits, we make the scale be at least 8. |
464 | 4.87M | const unsigned MaxBits = 64; |
465 | 4.87M | const unsigned SpreadBits = (Max / Min).lg(); |
466 | 4.87M | Scaled64 ScalingFactor; |
467 | 4.87M | if (SpreadBits <= MaxBits - 3) { |
468 | 4.87M | // If the values are small enough, make the scaling factor at least 8 to |
469 | 4.87M | // allow distinguishing small values. |
470 | 4.87M | ScalingFactor = Min.inverse(); |
471 | 4.87M | ScalingFactor <<= 3; |
472 | 4.87M | } else { |
473 | 2.60k | // If the values need more than MaxBits to be represented, saturate small |
474 | 2.60k | // frequency values down to 1 by using a scaling factor that benefits large |
475 | 2.60k | // frequency values. |
476 | 2.60k | ScalingFactor = Scaled64(1, MaxBits) / Max; |
477 | 2.60k | } |
478 | 4.87M | |
479 | 4.87M | // Translate the floats to integers. |
480 | 4.87M | DEBUG(dbgs() << "float-to-int: min = " << Min << ", max = " << Max |
481 | 4.87M | << ", factor = " << ScalingFactor << "\n"); |
482 | 39.9M | for (size_t Index = 0; Index < BFI.Freqs.size(); ++Index) { |
483 | 35.0M | Scaled64 Scaled = BFI.Freqs[Index].Scaled * ScalingFactor; |
484 | 35.0M | BFI.Freqs[Index].Integer = std::max(UINT64_C(1), Scaled.toInt<uint64_t>()); |
485 | 35.0M | DEBUG(dbgs() << " - " << BFI.getBlockName(Index) << ": float = " |
486 | 35.0M | << BFI.Freqs[Index].Scaled << ", scaled = " << Scaled |
487 | 35.0M | << ", int = " << BFI.Freqs[Index].Integer << "\n"); |
488 | 35.0M | } |
489 | 4.87M | } |
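// Worked example (illustrative): with Min = 2^-4 and Max = 2^20, SpreadBits
// = lg(2^24) = 24 <= 61, so ScalingFactor = Min.inverse() << 3 = 2^7; the
// smallest frequency maps to 8 and the largest to 2^27, keeping nearby
// small frequencies distinguishable. If the spread exceeded 61 bits, the
// factor 2^64 / Max would be used and the smallest values would clamp to 1.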
490 | | |
491 | | /// \brief Unwrap a loop package. |
492 | | /// |
493 | | /// Visits all the members of a loop, adjusting their BlockData according to |
494 | | /// the loop's pseudo-node. |
495 | 3.11M | static void unwrapLoop(BlockFrequencyInfoImplBase &BFI, LoopData &Loop) { |
496 | 3.11M | DEBUG(dbgs() << "unwrap-loop-package: " << BFI.getLoopName(Loop) |
497 | 3.11M | << ": mass = " << Loop.Mass << ", scale = " << Loop.Scale |
498 | 3.11M | << "\n"); |
499 | 3.11M | Loop.Scale *= Loop.Mass.toScaled(); |
500 | 3.11M | Loop.IsPackaged = false; |
501 | 3.11M | DEBUG(dbgs() << " => combined-scale = " << Loop.Scale << "\n"); |
502 | 3.11M | |
503 | 3.11M | // Propagate the head scale through the loop. Since members are visited in |
504 | 3.11M | // RPO, the head scale will be updated by the loop scale first, and then the |
505 | 3.11M | // final head scale will be used to update the rest of the members. |
506 | 10.9M | for (const BlockNode &N : Loop.Nodes) { |
507 | 10.9M | const auto &Working = BFI.Working[N.Index]; |
508 | 895k | Scaled64 &F = Working.isAPackage() ? Working.getPackagedLoop()->Scale |
509 | 10.0M | : BFI.Freqs[N.Index].Scaled; |
510 | 10.9M | Scaled64 New = Loop.Scale * F; |
511 | 10.9M | DEBUG(dbgs() << " - " << BFI.getBlockName(N) << ": " << F << " => " << New |
512 | 10.9M | << "\n"); |
513 | 10.9M | F = New; |
514 | 10.9M | } |
515 | 3.11M | } |
516 | | |
517 | 4.87M | void BlockFrequencyInfoImplBase::unwrapLoops() { |
518 | 4.87M | // Set initial frequencies from loop-local masses. |
519 | 39.9M | for (size_t Index = 0; Index < Working.size(); ++Index) |
520 | 35.0M | Freqs[Index].Scaled = Working[Index].Mass.toScaled(); |
521 | 4.87M | |
522 | 4.87M | for (LoopData &Loop : Loops) |
523 | 3.11M | unwrapLoop(*this, Loop); |
524 | 4.87M | } |
525 | | |
526 | 4.87M | void BlockFrequencyInfoImplBase::finalizeMetrics() { |
527 | 4.87M | // Unwrap loop packages in reverse post-order, tracking min and max |
528 | 4.87M | // frequencies. |
529 | 4.87M | auto Min = Scaled64::getLargest(); |
530 | 4.87M | auto Max = Scaled64::getZero(); |
531 | 39.9M | for (size_t Index = 0; Index < Working.size(); ++Index) { |
532 | 35.0M | // Update min/max scale. |
533 | 35.0M | Min = std::min(Min, Freqs[Index].Scaled); |
534 | 35.0M | Max = std::max(Max, Freqs[Index].Scaled); |
535 | 35.0M | } |
536 | 4.87M | |
537 | 4.87M | // Convert to integers. |
538 | 4.87M | convertFloatingToInteger(*this, Min, Max); |
539 | 4.87M | |
540 | 4.87M | // Clean up data structures. |
541 | 4.87M | cleanup(*this); |
542 | 4.87M | |
543 | 4.87M | // Print out the final stats. |
544 | 4.87M | DEBUG(dump()); |
545 | 4.87M | } |
546 | | |
547 | | BlockFrequency |
548 | 67.0M | BlockFrequencyInfoImplBase::getBlockFreq(const BlockNode &Node) const { |
549 | 67.0M | if (!Node.isValid()) |
550 | 524k | return 0; |
551 | 66.4M | return Freqs[Node.Index].Integer; |
552 | 66.4M | } |
553 | | |
554 | | Optional<uint64_t> |
555 | | BlockFrequencyInfoImplBase::getBlockProfileCount(const Function &F, |
556 | 876 | const BlockNode &Node) const { |
557 | 876 | return getProfileCountFromFreq(F, getBlockFreq(Node).getFrequency()); |
558 | 876 | } |
559 | | |
560 | | Optional<uint64_t> |
561 | | BlockFrequencyInfoImplBase::getProfileCountFromFreq(const Function &F, |
562 | 946 | uint64_t Freq) const { |
563 | 946 | auto EntryCount = F.getEntryCount(); |
564 | 946 | if (!EntryCount) |
565 | 737 | return None; |
566 | 209 | // Use 128 bit APInt to do the arithmetic to avoid overflow. |
567 | 209 | APInt BlockCount(128, EntryCount.getValue()); |
568 | 209 | APInt BlockFreq(128, Freq); |
569 | 209 | APInt EntryFreq(128, getEntryFreq()); |
570 | 209 | BlockCount *= BlockFreq; |
571 | 209 | BlockCount = BlockCount.udiv(EntryFreq); |
572 | 209 | return BlockCount.getLimitedValue(); |
573 | 209 | } |
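// Worked example (illustrative): with an entry count of 10^9, a block
// frequency of 2^40, and an entry frequency of 2^16, the 64-bit product
// 10^9 * 2^40 (roughly 2^70) would overflow, but the 128-bit intermediate
// gives 10^9 * 2^24 (roughly 2^54), which fits back into a uint64_t.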
574 | | |
575 | | Scaled64 |
576 | 435 | BlockFrequencyInfoImplBase::getFloatingBlockFreq(const BlockNode &Node) const { |
577 | 435 | if (!Node.isValid()) |
578 | 0 | return Scaled64::getZero(); |
579 | 435 | return Freqs[Node.Index].Scaled; |
580 | 435 | } |
581 | | |
582 | | void BlockFrequencyInfoImplBase::setBlockFreq(const BlockNode &Node, |
583 | 1.92k | uint64_t Freq) { |
584 | 1.92k | assert(Node.isValid() && "Expected valid node"); |
585 | 1.92k | assert(Node.Index < Freqs.size() && "Expected legal index"); |
586 | 1.92k | Freqs[Node.Index].Integer = Freq; |
587 | 1.92k | } |
588 | | |
589 | | std::string |
590 | 0 | BlockFrequencyInfoImplBase::getBlockName(const BlockNode &Node) const { |
591 | 0 | return {}; |
592 | 0 | } |
593 | | |
594 | | std::string |
595 | 0 | BlockFrequencyInfoImplBase::getLoopName(const LoopData &Loop) const { |
596 | 0 | return getBlockName(Loop.getHeader()) + (Loop.isIrreducible() ? "**" : "*"); |
597 | 0 | } |
598 | | |
599 | | raw_ostream & |
600 | | BlockFrequencyInfoImplBase::printBlockFreq(raw_ostream &OS, |
601 | 0 | const BlockNode &Node) const { |
602 | 0 | return OS << getFloatingBlockFreq(Node); |
603 | 0 | } |
604 | | |
605 | | raw_ostream & |
606 | | BlockFrequencyInfoImplBase::printBlockFreq(raw_ostream &OS, |
607 | 0 | const BlockFrequency &Freq) const { |
608 | 0 | Scaled64 Block(Freq.getFrequency(), 0); |
609 | 0 | Scaled64 Entry(getEntryFreq(), 0); |
610 | 0 |
611 | 0 | return OS << Block / Entry; |
612 | 0 | } |
613 | | |
614 | 207 | void IrreducibleGraph::addNodesInLoop(const BFIBase::LoopData &OuterLoop) { |
615 | 207 | Start = OuterLoop.getHeader(); |
616 | 207 | Nodes.reserve(OuterLoop.Nodes.size()); |
617 | 207 | for (auto N : OuterLoop.Nodes) |
618 | 11.8k | addNode(N); |
619 | 207 | indexNodes(); |
620 | 207 | } |
621 | | |
622 | 377 | void IrreducibleGraph::addNodesInFunction() { |
623 | 377 | Start = 0; |
624 | 12.5k | for (uint32_t Index = 0; Index < BFI.Working.size(); ++Index) |
625 | 12.1k | if (!BFI.Working[Index].isPackaged()) |
626 | 9.72k | addNode(Index); |
627 | 377 | indexNodes(); |
628 | 377 | } |
629 | | |
630 | 584 | void IrreducibleGraph::indexNodes() { |
631 | 584 | for (auto &I : Nodes) |
632 | 21.5k | Lookup[I.Node.Index] = &I; |
633 | 584 | } |
634 | | |
635 | | void IrreducibleGraph::addEdge(IrrNode &Irr, const BlockNode &Succ, |
636 | 42.9k | const BFIBase::LoopData *OuterLoop) { |
637 | 42.9k | if (OuterLoop && OuterLoop->isHeader(Succ)) |
638 | 353 | return; |
639 | 42.6k | auto L = Lookup.find(Succ.Index); |
640 | 42.6k | if (L == Lookup.end()) |
641 | 2.75k | return; |
642 | 39.8k | IrrNode &SuccIrr = *L->second; |
643 | 39.8k | Irr.Edges.push_back(&SuccIrr); |
644 | 39.8k | SuccIrr.Edges.push_front(&Irr); |
645 | 39.8k | ++SuccIrr.NumIn; |
646 | 39.8k | } |
647 | | |
648 | | namespace llvm { |
649 | | |
650 | | template <> struct GraphTraits<IrreducibleGraph> { |
651 | | using GraphT = bfi_detail::IrreducibleGraph; |
652 | | using NodeRef = const GraphT::IrrNode *; |
653 | | using ChildIteratorType = GraphT::IrrNode::iterator; |
654 | | |
655 | 584 | static NodeRef getEntryNode(const GraphT &G) { return G.StartIrr; } |
656 | 21.5k | static ChildIteratorType child_begin(NodeRef N) { return N->succ_begin(); } |
657 | 61.4k | static ChildIteratorType child_end(NodeRef N) { return N->succ_end(); } |
658 | | }; |
659 | | |
660 | | } // end namespace llvm |
661 | | |
662 | | /// \brief Find extra irreducible headers. |
663 | | /// |
664 | | /// Find entry blocks and other blocks with backedges, which exist when \c G |
665 | | /// contains irreducible sub-SCCs. |
666 | | static void findIrreducibleHeaders( |
667 | | const BlockFrequencyInfoImplBase &BFI, |
668 | | const IrreducibleGraph &G, |
669 | | const std::vector<const IrreducibleGraph::IrrNode *> &SCC, |
670 | 804 | LoopData::NodeList &Headers, LoopData::NodeList &Others) { |
671 | 804 | // Map from each node in the SCC to whether it is an entry block. |
672 | 804 | SmallDenseMap<const IrreducibleGraph::IrrNode *, bool, 8> InSCC; |
673 | 804 | |
674 | 804 | // InSCC also acts as the set of nodes in the graph. Seed it. |
675 | 804 | for (const auto *I : SCC) |
676 | 8.80k | InSCC[I] = false; |
677 | 804 | |
678 | 9.61k | for (auto I = InSCC.begin(), E = InSCC.end(); I != E; ++I) { |
679 | 8.80k | auto &Irr = *I->first; |
680 | 22.5k | for (const auto *P : make_range(Irr.pred_begin(), Irr.pred_end())) { |
681 | 22.5k | if (InSCC.count(P)) |
682 | 19.9k | continue; |
683 | 2.67k | |
684 | 2.67k | // This is an entry block. |
685 | 2.67k | I->second = true; |
686 | 2.67k | Headers.push_back(Irr.Node); |
687 | 2.67k | DEBUG(dbgs() << " => entry = " << BFI.getBlockName(Irr.Node) << "\n"); |
688 | 22.5k | break; |
689 | 22.5k | } |
690 | 8.80k | } |
691 | 804 | assert(Headers.size() >= 2 && |
692 | 804 | "Expected irreducible CFG; -loop-info is likely invalid"); |
693 | 804 | if (Headers.size() == InSCC.size()) { |
694 | 184 | // Every block is a header. |
695 | 184 | std::sort(Headers.begin(), Headers.end()); |
696 | 184 | return; |
697 | 184 | } |
698 | 620 | |
699 | 620 | // Look for extra headers from irreducible sub-SCCs. |
700 | 620 | for (const auto &I : InSCC) { |
701 | 8.21k | // Entry blocks are already headers. |
702 | 8.21k | if (I.second) |
703 | 2.08k | continue; |
704 | 6.13k | |
705 | 6.13k | auto &Irr = *I.first; |
706 | 15.2k | for (const auto *P : make_range(Irr.pred_begin(), Irr.pred_end())) { |
707 | 15.2k | // Skip forward edges. |
708 | 15.2k | if (P->Node < Irr.Node) |
709 | 10.8k | continue; |
710 | 4.33k | |
711 | 4.33k | // Skip predecessors from entry blocks. These can have inverted |
712 | 4.33k | // ordering. |
713 | 4.33k | if (InSCC.lookup(P)) |
714 | 4.09k | continue; |
715 | 234 | |
716 | 234 | // Store the extra header. |
717 | 234 | Headers.push_back(Irr.Node); |
718 | 234 | DEBUG(dbgs() << " => extra = " << BFI.getBlockName(Irr.Node) << "\n"); |
719 | 15.2k | break; |
720 | 15.2k | } |
721 | 6.13k | if (Headers.back() == Irr.Node) |
722 | 6.13k | // Added this as a header. |
723 | 234 | continue; |
724 | 5.89k | |
725 | 5.89k | // This is not a header. |
726 | 5.89k | Others.push_back(Irr.Node); |
727 | 5.89k | DEBUG(dbgs() << " => other = " << BFI.getBlockName(Irr.Node) << "\n"); |
728 | 8.21k | } |
729 | 804 | std::sort(Headers.begin(), Headers.end()); |
730 | 804 | std::sort(Others.begin(), Others.end()); |
731 | 804 | } |
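// Illustrative irreducible CFG: A -> B, A -> C, B -> C, C -> B. The SCC
// {B, C} is entered at both B and C, so both become headers of the
// synthetic loop; neither dominates the other, which is what makes the
// region irreducible.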
732 | | |
733 | | static void createIrreducibleLoop( |
734 | | BlockFrequencyInfoImplBase &BFI, const IrreducibleGraph &G, |
735 | | LoopData *OuterLoop, std::list<LoopData>::iterator Insert, |
736 | 804 | const std::vector<const IrreducibleGraph::IrrNode *> &SCC) { |
737 | 804 | // Translate the SCC into RPO. |
738 | 804 | DEBUG(dbgs() << " - found-scc\n"); |
739 | 804 | |
740 | 804 | LoopData::NodeList Headers; |
741 | 804 | LoopData::NodeList Others; |
742 | 804 | findIrreducibleHeaders(BFI, G, SCC, Headers, Others); |
743 | 804 | |
744 | 804 | auto Loop = BFI.Loops.emplace(Insert, OuterLoop, Headers.begin(), |
745 | 804 | Headers.end(), Others.begin(), Others.end()); |
746 | 804 | |
747 | 804 | // Update loop hierarchy. |
748 | 804 | for (const auto &N : Loop->Nodes) |
749 | 8.80k | if (BFI.Working[N.Index].isLoopHeader()) |
750 | 892 | BFI.Working[N.Index].Loop->Parent = &*Loop; |
751 | 8.80k | else |
752 | 7.91k | BFI.Working[N.Index].Loop = &*Loop; |
753 | 804 | } |
754 | | |
755 | | iterator_range<std::list<LoopData>::iterator> |
756 | | BlockFrequencyInfoImplBase::analyzeIrreducible( |
757 | | const IrreducibleGraph &G, LoopData *OuterLoop, |
758 | 584 | std::list<LoopData>::iterator Insert) { |
759 | 584 | assert((OuterLoop == nullptr) == (Insert == Loops.begin())); |
760 | 584 | auto Prev = OuterLoop ? std::prev(Insert) : Loops.end(); |
761 | 584 | |
762 | 14.1k | for (auto I = scc_begin(G); !I.isAtEnd(); ++I) { |
763 | 13.5k | if (I->size() < 2) |
764 | 12.7k | continue; |
765 | 804 | |
766 | 804 | // Translate the SCC into RPO. |
767 | 804 | createIrreducibleLoop(*this, G, OuterLoop, Insert, *I); |
768 | 804 | } |
769 | 584 | |
770 | 584 | if (OuterLoop) |
771 | 207 | return make_range(std::next(Prev), Insert); |
772 | 377 | return make_range(Loops.begin(), Insert); |
773 | 377 | } |
774 | | |
775 | | void |
776 | 207 | BlockFrequencyInfoImplBase::updateLoopWithIrreducible(LoopData &OuterLoop) { |
777 | 207 | OuterLoop.Exits.clear(); |
778 | 207 | for (auto &Mass : OuterLoop.BackedgeMass) |
779 | 207 | Mass = BlockMass::getEmpty(); |
780 | 207 | auto O = OuterLoop.Nodes.begin() + 1; |
781 | 11.8k | for (auto I = O, E = OuterLoop.Nodes.end(); I != E; ++I) |
782 | 11.6k | if (!Working[I->Index].isPackaged()) |
783 | 7.19k | *O++ = *I; |
784 | 207 | OuterLoop.Nodes.erase(O, OuterLoop.Nodes.end()); |
785 | 207 | } |
786 | | |
787 | 804 | void BlockFrequencyInfoImplBase::adjustLoopHeaderMass(LoopData &Loop) { |
788 | 804 | assert(Loop.isIrreducible() && "this only makes sense on irreducible loops"); |
789 | 804 | |
790 | 804 | // Since the loop has more than one header block, the mass flowing back into |
791 | 804 | // each header will be different. Adjust the mass in each header loop to |
792 | 804 | // reflect the masses flowing through back edges. |
793 | 804 | // |
794 | 804 | // To do this, we distribute the initial mass using the backedge masses |
795 | 804 | // as weights for the distribution. |
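//
// Worked example (illustrative): if two headers' backedge masses arrive in
// a 1:3 ratio, the full loop mass is dithered into portions of 1/4 and 3/4,
// so the second header begins the next iteration with three times the mass
// of the first.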
796 | 804 | BlockMass LoopMass = BlockMass::getFull(); |
797 | 804 | Distribution Dist; |
798 | 804 | |
799 | 804 | DEBUG(dbgs() << "adjust-loop-header-mass:\n"); |
800 | 3.71k | for (uint32_t H = 0; H < Loop.NumHeaders; ++H) { |
801 | 2.91k | auto &HeaderNode = Loop.Nodes[H]; |
802 | 2.91k | auto &BackedgeMass = Loop.BackedgeMass[Loop.getHeaderIndex(HeaderNode)]; |
803 | 2.91k | DEBUG(dbgs() << " - Add back edge mass for node " |
804 | 2.91k | << getBlockName(HeaderNode) << ": " << BackedgeMass << "\n"); |
805 | 2.91k | if (BackedgeMass.getMass() > 0) |
806 | 2.91k | Dist.addLocal(HeaderNode, BackedgeMass.getMass()); |
807 | 2.91k | else |
808 | 2.91k | DEBUG(dbgs() << " Nothing added. Back edge mass is zero\n"); |
809 | 2.91k | } |
810 | 804 | |
811 | 804 | DitheringDistributer D(Dist, LoopMass); |
812 | 804 | |
813 | 804 | DEBUG(dbgs() << " Distribute loop mass " << LoopMass |
814 | 804 | << " to headers using above weights\n"); |
815 | 2.91k | for (const Weight &W : Dist.Weights) { |
816 | 2.91k | BlockMass Taken = D.takeMass(W.Amount); |
817 | 2.91k | assert(W.Type == Weight::Local && "all weights should be local"); |
818 | 2.91k | Working[W.TargetNode.Index].getMass() = Taken; |
819 | 2.91k | DEBUG(debugAssign(*this, D, W.TargetNode, Taken, nullptr)); |
820 | 2.91k | } |
821 | 804 | } |