Coverage Report

Created: 2017-10-03 07:32

/Users/buildslave/jenkins/sharedspace/clang-stage2-coverage-R@2/llvm/lib/Analysis/BlockFrequencyInfoImpl.cpp
Line
Count
Source (jump to first uncovered line)
1
//===- BlockFrequencyImplInfo.cpp - Block Frequency Info Implementation ---===//
2
//
3
//                     The LLVM Compiler Infrastructure
4
//
5
// This file is distributed under the University of Illinois Open Source
6
// License. See LICENSE.TXT for details.
7
//
8
//===----------------------------------------------------------------------===//
9
//
10
// Loops should be simplified before this analysis.
11
//
12
//===----------------------------------------------------------------------===//
13
14
#include "llvm/Analysis/BlockFrequencyInfoImpl.h"
15
#include "llvm/ADT/APInt.h"
16
#include "llvm/ADT/DenseMap.h"
17
#include "llvm/ADT/GraphTraits.h"
18
#include "llvm/ADT/None.h"
19
#include "llvm/ADT/SCCIterator.h"
20
#include "llvm/IR/Function.h"
21
#include "llvm/Support/BlockFrequency.h"
22
#include "llvm/Support/BranchProbability.h"
23
#include "llvm/Support/Compiler.h"
24
#include "llvm/Support/Debug.h"
25
#include "llvm/Support/ScaledNumber.h"
26
#include "llvm/Support/MathExtras.h"
27
#include "llvm/Support/raw_ostream.h"
28
#include <algorithm>
29
#include <cassert>
30
#include <cstddef>
31
#include <cstdint>
32
#include <iterator>
33
#include <list>
34
#include <numeric>
35
#include <utility>
36
#include <vector>
37
38
using namespace llvm;
39
using namespace llvm::bfi_detail;
40
41
#define DEBUG_TYPE "block-freq"
42
43
41.2M
ScaledNumber<uint64_t> BlockMass::toScaled() const {
44
41.2M
  if (isFull())
45
11.3M
    return ScaledNumber<uint64_t>(1, 0);
46
29.9M
  return ScaledNumber<uint64_t>(getMass() + 1, -64);
47
29.9M
}
48
49
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
50
LLVM_DUMP_METHOD void BlockMass::dump() const { print(dbgs()); }
51
#endif
52
53
0
static char getHexDigit(int N) {
54
0
  assert(N < 16);
55
0
  if (N < 10)
56
0
    return '0' + N;
57
0
  return 'a' + N - 10;
58
0
}
59
60
0
raw_ostream &BlockMass::print(raw_ostream &OS) const {
61
0
  for (int Digits = 0; 
Digits < 160
;
++Digits0
)
62
0
    OS << getHexDigit(Mass >> (60 - Digits * 4) & 0xf);
63
0
  return OS;
64
0
}
65
66
namespace {
67
68
using BlockNode = BlockFrequencyInfoImplBase::BlockNode;
69
using Distribution = BlockFrequencyInfoImplBase::Distribution;
70
using WeightList = BlockFrequencyInfoImplBase::Distribution::WeightList;
71
using Scaled64 = BlockFrequencyInfoImplBase::Scaled64;
72
using LoopData = BlockFrequencyInfoImplBase::LoopData;
73
using Weight = BlockFrequencyInfoImplBase::Weight;
74
using FrequencyData = BlockFrequencyInfoImplBase::FrequencyData;
75
76
/// \brief Dithering mass distributer.
77
///
78
/// This class splits up a single mass into portions by weight, dithering to
79
/// spread out error.  No mass is lost.  The dithering precision depends on the
80
/// precision of the product of \a BlockMass and \a BranchProbability.
81
///
82
/// The distribution algorithm follows.
83
///
84
///  1. Initialize by saving the sum of the weights in \a RemWeight and the
85
///     mass to distribute in \a RemMass.
86
///
87
///  2. For each portion:
88
///
89
///      1. Construct a branch probability, P, as the portion's weight divided
90
///         by the current value of \a RemWeight.
91
///      2. Calculate the portion's mass as \a RemMass times P.
92
///      3. Update \a RemWeight and \a RemMass at each portion by subtracting
93
///         the current portion's weight and mass.
94
struct DitheringDistributer {
95
  uint32_t RemWeight;
96
  BlockMass RemMass;
97
98
  DitheringDistributer(Distribution &Dist, const BlockMass &Mass);
99
100
  BlockMass takeMass(uint32_t Weight);
101
};
102
103
} // end anonymous namespace
104
105
DitheringDistributer::DitheringDistributer(Distribution &Dist,
106
38.1M
                                           const BlockMass &Mass) {
107
38.1M
  Dist.normalize();
108
38.1M
  RemWeight = Dist.Total;
109
38.1M
  RemMass = Mass;
110
38.1M
}
111
112
49.8M
BlockMass DitheringDistributer::takeMass(uint32_t Weight) {
113
49.8M
  assert(Weight && "invalid weight");
114
49.8M
  assert(Weight <= RemWeight);
115
49.8M
  BlockMass Mass = RemMass * BranchProbability(Weight, RemWeight);
116
49.8M
117
49.8M
  // Decrement totals (dither).
118
49.8M
  RemWeight -= Weight;
119
49.8M
  RemMass -= Mass;
120
49.8M
  return Mass;
121
49.8M
}
122
123
void Distribution::add(const BlockNode &Node, uint64_t Amount,
124
50.2M
                       Weight::DistType Type) {
125
50.2M
  assert(Amount && "invalid weight of 0");
126
50.2M
  uint64_t NewTotal = Total + Amount;
127
50.2M
128
50.2M
  // Check for overflow.  It should be impossible to overflow twice.
129
50.2M
  bool IsOverflow = NewTotal < Total;
130
50.2M
  assert(!(DidOverflow && IsOverflow) && "unexpected repeated overflow");
131
50.2M
  DidOverflow |= IsOverflow;
132
50.2M
133
50.2M
  // Update the total.
134
50.2M
  Total = NewTotal;
135
50.2M
136
50.2M
  // Save the weight.
137
50.2M
  Weights.push_back(Weight(Type, Node, Amount));
138
50.2M
}
139
140
383k
static void combineWeight(Weight &W, const Weight &OtherW) {
141
383k
  assert(OtherW.TargetNode.isValid());
142
383k
  if (
!W.Amount383k
) {
143
13.1k
    W = OtherW;
144
13.1k
    return;
145
13.1k
  }
146
383k
  assert(W.Type == OtherW.Type);
147
370k
  assert(W.TargetNode == OtherW.TargetNode);
148
370k
  assert(OtherW.Amount && "Expected non-zero weight");
149
370k
  if (W.Amount > W.Amount + OtherW.Amount)
150
370k
    // Saturate on overflow.
151
2
    W.Amount = UINT64_MAX;
152
370k
  else
153
370k
    W.Amount += OtherW.Amount;
154
383k
}
155
156
17.4M
static void combineWeightsBySorting(WeightList &Weights) {
157
17.4M
  // Sort so edges to the same node are adjacent.
158
17.4M
  std::sort(Weights.begin(), Weights.end(),
159
17.4M
            [](const Weight &L,
160
19.2M
               const Weight &R) { return L.TargetNode < R.TargetNode; });
161
17.4M
162
17.4M
  // Combine adjacent edges.
163
17.4M
  WeightList::iterator O = Weights.begin();
164
52.4M
  for (WeightList::const_iterator I = O, L = O, E = Weights.end(); I != E;
165
35.0M
       
++O, (I = L)35.0M
) {
166
35.0M
    *O = *I;
167
35.0M
168
35.0M
    // Find the adjacent weights to the same node.
169
35.4M
    for (++L; 
L != E && 35.4M
I->TargetNode == L->TargetNode18.0M
;
++L368k
)
170
368k
      combineWeight(*O, *L);
171
35.0M
  }
172
17.4M
173
17.4M
  // Erase extra entries.
174
17.4M
  Weights.erase(O, Weights.end());
175
17.4M
}
176
177
85
static void combineWeightsByHashing(WeightList &Weights) {
178
85
  // Collect weights into a DenseMap.
179
85
  using HashTable = DenseMap<BlockNode::IndexType, Weight>;
180
85
181
85
  HashTable Combined(NextPowerOf2(2 * Weights.size()));
182
85
  for (const Weight &W : Weights)
183
15.2k
    combineWeight(Combined[W.TargetNode.Index], W);
184
85
185
85
  // Check whether anything changed.
186
85
  if (Weights.size() == Combined.size())
187
63
    return;
188
22
189
22
  // Fill in the new weights.
190
22
  Weights.clear();
191
22
  Weights.reserve(Combined.size());
192
22
  for (const auto &I : Combined)
193
2.23k
    Weights.push_back(I.second);
194
85
}
195
196
17.4M
static void combineWeights(WeightList &Weights) {
197
17.4M
  // Use a hash table for many successors to keep this linear.
198
17.4M
  if (
Weights.size() > 12817.4M
) {
199
85
    combineWeightsByHashing(Weights);
200
85
    return;
201
85
  }
202
17.4M
203
17.4M
  combineWeightsBySorting(Weights);
204
17.4M
}
205
206
1.43M
static uint64_t shiftRightAndRound(uint64_t N, int Shift) {
207
1.43M
  assert(Shift >= 0);
208
1.43M
  assert(Shift < 64);
209
1.43M
  if (!Shift)
210
0
    return N;
211
1.43M
  return (N >> Shift) + (UINT64_C(1) & N >> (Shift - 1));
212
1.43M
}
213
214
38.1M
void Distribution::normalize() {
215
38.1M
  // Early exit for termination nodes.
216
38.1M
  if (Weights.empty())
217
5.95M
    return;
218
32.2M
219
32.2M
  // Only bother if there are multiple successors.
220
32.2M
  
if (32.2M
Weights.size() > 132.2M
)
221
17.4M
    combineWeights(Weights);
222
32.2M
223
32.2M
  // Early exit when combined into a single successor.
224
32.2M
  if (
Weights.size() == 132.2M
) {
225
15.0M
    Total = 1;
226
15.0M
    Weights.front().Amount = 1;
227
15.0M
    return;
228
15.0M
  }
229
17.2M
230
17.2M
  // Determine how much to shift right so that the total fits into 32-bits.
231
17.2M
  //
232
17.2M
  // If we shift at all, shift by 1 extra.  Otherwise, the lower limit of 1
233
17.2M
  // for each weight can cause a 32-bit overflow.
234
17.2M
  int Shift = 0;
235
17.2M
  if (DidOverflow)
236
0
    Shift = 33;
237
17.2M
  else 
if (17.2M
Total > UINT32_MAX17.2M
)
238
647k
    Shift = 33 - countLeadingZeros(Total);
239
17.2M
240
17.2M
  // Early exit if nothing needs to be scaled.
241
17.2M
  if (
!Shift17.2M
) {
242
16.5M
    // If we didn't overflow then combineWeights() shouldn't have changed the
243
16.5M
    // sum of the weights, but let's double-check.
244
16.5M
    assert(Total == std::accumulate(Weights.begin(), Weights.end(), UINT64_C(0),
245
16.5M
                                    [](uint64_t Sum, const Weight &W) {
246
16.5M
                      return Sum + W.Amount;
247
16.5M
                    }) &&
248
16.5M
           "Expected total to be correct");
249
16.5M
    return;
250
16.5M
  }
251
647k
252
647k
  // Recompute the total through accumulation (rather than shifting it) so that
253
647k
  // it's accurate after shifting and any changes combineWeights() made above.
254
647k
  Total = 0;
255
647k
256
647k
  // Sum the weights to each node and shift right if necessary.
257
1.43M
  for (Weight &W : Weights) {
258
1.43M
    // Scale down below UINT32_MAX.  Since Shift is larger than necessary, we
259
1.43M
    // can round here without concern about overflow.
260
1.43M
    assert(W.TargetNode.isValid());
261
1.43M
    W.Amount = std::max(UINT64_C(1), shiftRightAndRound(W.Amount, Shift));
262
1.43M
    assert(W.Amount <= UINT32_MAX);
263
1.43M
264
1.43M
    // Update the total.
265
1.43M
    Total += W.Amount;
266
1.43M
  }
267
38.1M
  assert(Total <= UINT32_MAX);
268
38.1M
}
269
270
9.75M
void BlockFrequencyInfoImplBase::clear() {
271
9.75M
  // Swap with a default-constructed std::vector, since std::vector<>::clear()
272
9.75M
  // does not actually clear heap storage.
273
9.75M
  std::vector<FrequencyData>().swap(Freqs);
274
9.75M
  std::vector<WorkingData>().swap(Working);
275
9.75M
  Loops.clear();
276
9.75M
}
277
278
/// \brief Clear all memory not needed downstream.
279
///
280
/// Releases all memory not used downstream.  In particular, saves Freqs.
281
4.87M
static void cleanup(BlockFrequencyInfoImplBase &BFI) {
282
4.87M
  std::vector<FrequencyData> SavedFreqs(std::move(BFI.Freqs));
283
4.87M
  BFI.clear();
284
4.87M
  BFI.Freqs = std::move(SavedFreqs);
285
4.87M
}
286
287
bool BlockFrequencyInfoImplBase::addToDist(Distribution &Dist,
288
                                           const LoopData *OuterLoop,
289
                                           const BlockNode &Pred,
290
                                           const BlockNode &Succ,
291
50.2M
                                           uint64_t Weight) {
292
50.2M
  if (!Weight)
293
1.66M
    Weight = 1;
294
50.2M
295
50.2M
  auto isLoopHeader = [&OuterLoop](const BlockNode &Node) {
296
18.3M
    return OuterLoop && OuterLoop->isHeader(Node);
297
50.2M
  };
298
50.2M
299
50.2M
  BlockNode Resolved = Working[Succ.Index].getResolvedNode();
300
50.2M
301
#ifndef NDEBUG
302
  auto debugSuccessor = [&](const char *Type) {
303
    dbgs() << "  =>"
304
           << " [" << Type << "] weight = " << Weight;
305
    if (!isLoopHeader(Resolved))
306
      dbgs() << ", succ = " << getBlockName(Succ);
307
    if (Resolved != Succ)
308
      dbgs() << ", resolved = " << getBlockName(Resolved);
309
    dbgs() << "\n";
310
  };
311
  (void)debugSuccessor;
312
#endif
313
314
50.2M
  if (
isLoopHeader(Resolved)50.2M
) {
315
3.17M
    DEBUG(debugSuccessor("backedge"));
316
3.17M
    Dist.addBackedge(Resolved, Weight);
317
3.17M
    return true;
318
3.17M
  }
319
47.0M
320
47.0M
  
if (47.0M
Working[Resolved.Index].getContainingLoop() != OuterLoop47.0M
) {
321
4.16M
    DEBUG(debugSuccessor("  exit  "));
322
4.16M
    Dist.addExit(Resolved, Weight);
323
4.16M
    return true;
324
4.16M
  }
325
42.9M
326
42.9M
  
if (42.9M
Resolved < Pred42.9M
) {
327
4.68k
    if (
!isLoopHeader(Pred)4.68k
) {
328
584
      // If OuterLoop is an irreducible loop, we can't actually handle this.
329
584
      assert((!OuterLoop || !OuterLoop->isIrreducible()) &&
330
584
             "unhandled irreducible control flow");
331
584
332
584
      // Irreducible backedge.  Abort.
333
584
      DEBUG(debugSuccessor("abort!!!"));
334
584
      return false;
335
584
    }
336
4.09k
337
4.09k
    // If "Pred" is a loop header, then this isn't really a backedge; rather,
338
4.09k
    // OuterLoop must be irreducible.  These false backedges can come only from
339
4.09k
    // secondary loop headers.
340
4.68k
    assert(OuterLoop && OuterLoop->isIrreducible() && !isLoopHeader(Resolved) &&
341
4.09k
           "unhandled irreducible control flow");
342
4.09k
  }
343
42.9M
344
42.9M
  
DEBUG42.9M
(debugSuccessor(" local "));
345
42.9M
  Dist.addLocal(Resolved, Weight);
346
42.9M
  return true;
347
50.2M
}
348
349
bool BlockFrequencyInfoImplBase::addLoopSuccessorsToDist(
350
3.11M
    const LoopData *OuterLoop, LoopData &Loop, Distribution &Dist) {
351
3.11M
  // Copy the exit map into Dist.
352
3.11M
  for (const auto &I : Loop.Exits)
353
4.14M
    
if (4.14M
!addToDist(Dist, OuterLoop, Loop.getHeader(), I.first,
354
4.14M
                   I.second.getMass()))
355
4.14M
      // Irreducible backedge.
356
48
      return false;
357
3.11M
358
3.11M
  return true;
359
3.11M
}
360
361
/// \brief Compute the loop scale for a loop.
362
3.11M
void BlockFrequencyInfoImplBase::computeLoopScale(LoopData &Loop) {
363
3.11M
  // Compute loop scale.
364
3.11M
  DEBUG(dbgs() << "compute-loop-scale: " << getLoopName(Loop) << "\n");
365
3.11M
366
3.11M
  // Infinite loops need special handling. If we give the back edge an infinite
367
3.11M
  // mass, they may saturate all the other scales in the function down to 1,
368
3.11M
  // making all the other region temperatures look exactly the same. Choose an
369
3.11M
  // arbitrary scale to avoid these issues.
370
3.11M
  //
371
3.11M
  // FIXME: An alternate way would be to select a symbolic scale which is later
372
3.11M
  // replaced to be the maximum of all computed scales plus 1. This would
373
3.11M
  // appropriately describe the loop as having a large scale, without skewing
374
3.11M
  // the final frequency computation.
375
3.11M
  const Scaled64 InfiniteLoopScale(1, 12);
376
3.11M
377
3.11M
  // LoopScale == 1 / ExitMass
378
3.11M
  // ExitMass == HeadMass - BackedgeMass
379
3.11M
  BlockMass TotalBackedgeMass;
380
3.11M
  for (auto &Mass : Loop.BackedgeMass)
381
3.11M
    TotalBackedgeMass += Mass;
382
3.11M
  BlockMass ExitMass = BlockMass::getFull() - TotalBackedgeMass;
383
3.11M
384
3.11M
  // Block scale stores the inverse of the scale. If this is an infinite loop,
385
3.11M
  // its exit mass will be zero. In this case, use an arbitrary scale for the
386
3.11M
  // loop scale.
387
3.11M
  Loop.Scale =
388
3.11M
      ExitMass.isEmpty() ? 
InfiniteLoopScale1.94k
:
ExitMass.toScaled().inverse()3.10M
;
389
3.11M
390
3.11M
  DEBUG(dbgs() << " - exit-mass = " << ExitMass << " (" << BlockMass::getFull()
391
3.11M
               << " - " << TotalBackedgeMass << ")\n"
392
3.11M
               << " - scale = " << Loop.Scale << "\n");
393
3.11M
}
394
395
/// \brief Package up a loop.
396
3.11M
void BlockFrequencyInfoImplBase::packageLoop(LoopData &Loop) {
397
3.11M
  DEBUG(dbgs() << "packaging-loop: " << getLoopName(Loop) << "\n");
398
3.11M
399
3.11M
  // Clear the subloop exits to prevent quadratic memory usage.
400
10.9M
  for (const BlockNode &M : Loop.Nodes) {
401
10.9M
    if (auto *Loop = Working[M.Index].getPackagedLoop())
402
895k
      Loop->Exits.clear();
403
10.9M
    DEBUG(dbgs() << " - node: " << getBlockName(M.Index) << "\n");
404
10.9M
  }
405
3.11M
  Loop.IsPackaged = true;
406
3.11M
}
407
408
#ifndef NDEBUG
409
static void debugAssign(const BlockFrequencyInfoImplBase &BFI,
410
                        const DitheringDistributer &D, const BlockNode &T,
411
                        const BlockMass &M, const char *Desc) {
412
  dbgs() << "  => assign " << M << " (" << D.RemMass << ")";
413
  if (Desc)
414
    dbgs() << " [" << Desc << "]";
415
  if (T.isValid())
416
    dbgs() << " to " << BFI.getBlockName(T);
417
  dbgs() << "\n";
418
}
419
#endif
420
421
void BlockFrequencyInfoImplBase::distributeMass(const BlockNode &Source,
422
                                                LoopData *OuterLoop,
423
38.1M
                                                Distribution &Dist) {
424
38.1M
  BlockMass Mass = Working[Source.Index].getMass();
425
38.1M
  DEBUG(dbgs() << "  => mass:  " << Mass << "\n");
426
38.1M
427
38.1M
  // Distribute mass to successors as laid out in Dist.
428
38.1M
  DitheringDistributer D(Dist, Mass);
429
38.1M
430
49.8M
  for (const Weight &W : Dist.Weights) {
431
49.8M
    // Check for a local edge (non-backedge and non-exit).
432
49.8M
    BlockMass Taken = D.takeMass(W.Amount);
433
49.8M
    if (
W.Type == Weight::Local49.8M
) {
434
42.5M
      Working[W.TargetNode.Index].getMass() += Taken;
435
42.5M
      DEBUG(debugAssign(*this, D, W.TargetNode, Taken, nullptr));
436
42.5M
      continue;
437
42.5M
    }
438
7.31M
439
7.31M
    // Backedges and exits only make sense if we're processing a loop.
440
49.8M
    assert(OuterLoop && "backedge or exit outside of loop");
441
7.31M
442
7.31M
    // Check for a backedge.
443
7.31M
    if (
W.Type == Weight::Backedge7.31M
) {
444
3.17M
      OuterLoop->BackedgeMass[OuterLoop->getHeaderIndex(W.TargetNode)] += Taken;
445
3.17M
      DEBUG(debugAssign(*this, D, W.TargetNode, Taken, "back"));
446
3.17M
      continue;
447
3.17M
    }
448
4.14M
449
4.14M
    // This must be an exit.
450
7.31M
    assert(W.Type == Weight::Exit);
451
4.14M
    OuterLoop->Exits.push_back(std::make_pair(W.TargetNode, Taken));
452
4.14M
    DEBUG(debugAssign(*this, D, W.TargetNode, Taken, "exit"));
453
49.8M
  }
454
38.1M
}
455
456
static void convertFloatingToInteger(BlockFrequencyInfoImplBase &BFI,
457
4.87M
                                     const Scaled64 &Min, const Scaled64 &Max) {
458
4.87M
  // Scale the Factor to a size that creates integers.  Ideally, integers would
459
4.87M
  // be scaled so that Max == UINT64_MAX so that they can be best
460
4.87M
  // differentiated.  However, in the presence of large frequency values, small
461
4.87M
  // frequencies are scaled down to 1, making it impossible to differentiate
462
4.87M
  // small, unequal numbers. When the spread between Min and Max frequencies
463
4.87M
  // fits well within MaxBits, we make the scale be at least 8.
464
4.87M
  const unsigned MaxBits = 64;
465
4.87M
  const unsigned SpreadBits = (Max / Min).lg();
466
4.87M
  Scaled64 ScalingFactor;
467
4.87M
  if (
SpreadBits <= MaxBits - 34.87M
) {
468
4.87M
    // If the values are small enough, make the scaling factor at least 8 to
469
4.87M
    // allow distinguishing small values.
470
4.87M
    ScalingFactor = Min.inverse();
471
4.87M
    ScalingFactor <<= 3;
472
4.87M
  } else {
473
2.60k
    // If the values need more than MaxBits to be represented, saturate small
474
2.60k
    // frequency values down to 1 by using a scaling factor that benefits large
475
2.60k
    // frequency values.
476
2.60k
    ScalingFactor = Scaled64(1, MaxBits) / Max;
477
2.60k
  }
478
4.87M
479
4.87M
  // Translate the floats to integers.
480
4.87M
  DEBUG(dbgs() << "float-to-int: min = " << Min << ", max = " << Max
481
4.87M
               << ", factor = " << ScalingFactor << "\n");
482
39.9M
  for (size_t Index = 0; 
Index < BFI.Freqs.size()39.9M
;
++Index35.0M
) {
483
35.0M
    Scaled64 Scaled = BFI.Freqs[Index].Scaled * ScalingFactor;
484
35.0M
    BFI.Freqs[Index].Integer = std::max(UINT64_C(1), Scaled.toInt<uint64_t>());
485
35.0M
    DEBUG(dbgs() << " - " << BFI.getBlockName(Index) << ": float = "
486
35.0M
                 << BFI.Freqs[Index].Scaled << ", scaled = " << Scaled
487
35.0M
                 << ", int = " << BFI.Freqs[Index].Integer << "\n");
488
35.0M
  }
489
4.87M
}
490
491
/// \brief Unwrap a loop package.
492
///
493
/// Visits all the members of a loop, adjusting their BlockData according to
494
/// the loop's pseudo-node.
495
3.11M
static void unwrapLoop(BlockFrequencyInfoImplBase &BFI, LoopData &Loop) {
496
3.11M
  DEBUG(dbgs() << "unwrap-loop-package: " << BFI.getLoopName(Loop)
497
3.11M
               << ": mass = " << Loop.Mass << ", scale = " << Loop.Scale
498
3.11M
               << "\n");
499
3.11M
  Loop.Scale *= Loop.Mass.toScaled();
500
3.11M
  Loop.IsPackaged = false;
501
3.11M
  DEBUG(dbgs() << "  => combined-scale = " << Loop.Scale << "\n");
502
3.11M
503
3.11M
  // Propagate the head scale through the loop.  Since members are visited in
504
3.11M
  // RPO, the head scale will be updated by the loop scale first, and then the
505
3.11M
  // final head scale will be used for updated the rest of the members.
506
10.9M
  for (const BlockNode &N : Loop.Nodes) {
507
10.9M
    const auto &Working = BFI.Working[N.Index];
508
895k
    Scaled64 &F = Working.isAPackage() ? Working.getPackagedLoop()->Scale
509
10.0M
                                       : BFI.Freqs[N.Index].Scaled;
510
10.9M
    Scaled64 New = Loop.Scale * F;
511
10.9M
    DEBUG(dbgs() << " - " << BFI.getBlockName(N) << ": " << F << " => " << New
512
10.9M
                 << "\n");
513
10.9M
    F = New;
514
10.9M
  }
515
3.11M
}
516
517
4.87M
void BlockFrequencyInfoImplBase::unwrapLoops() {
518
4.87M
  // Set initial frequencies from loop-local masses.
519
39.9M
  for (size_t Index = 0; 
Index < Working.size()39.9M
;
++Index35.0M
)
520
35.0M
    Freqs[Index].Scaled = Working[Index].Mass.toScaled();
521
4.87M
522
4.87M
  for (LoopData &Loop : Loops)
523
3.11M
    unwrapLoop(*this, Loop);
524
4.87M
}
525
526
4.87M
void BlockFrequencyInfoImplBase::finalizeMetrics() {
527
4.87M
  // Unwrap loop packages in reverse post-order, tracking min and max
528
4.87M
  // frequencies.
529
4.87M
  auto Min = Scaled64::getLargest();
530
4.87M
  auto Max = Scaled64::getZero();
531
39.9M
  for (size_t Index = 0; 
Index < Working.size()39.9M
;
++Index35.0M
) {
532
35.0M
    // Update min/max scale.
533
35.0M
    Min = std::min(Min, Freqs[Index].Scaled);
534
35.0M
    Max = std::max(Max, Freqs[Index].Scaled);
535
35.0M
  }
536
4.87M
537
4.87M
  // Convert to integers.
538
4.87M
  convertFloatingToInteger(*this, Min, Max);
539
4.87M
540
4.87M
  // Clean up data structures.
541
4.87M
  cleanup(*this);
542
4.87M
543
4.87M
  // Print out the final stats.
544
4.87M
  DEBUG(dump());
545
4.87M
}
546
547
BlockFrequency
548
67.0M
BlockFrequencyInfoImplBase::getBlockFreq(const BlockNode &Node) const {
549
67.0M
  if (!Node.isValid())
550
524k
    return 0;
551
66.4M
  return Freqs[Node.Index].Integer;
552
66.4M
}
553
554
Optional<uint64_t>
555
BlockFrequencyInfoImplBase::getBlockProfileCount(const Function &F,
556
876
                                                 const BlockNode &Node) const {
557
876
  return getProfileCountFromFreq(F, getBlockFreq(Node).getFrequency());
558
876
}
559
560
Optional<uint64_t>
561
BlockFrequencyInfoImplBase::getProfileCountFromFreq(const Function &F,
562
946
                                                    uint64_t Freq) const {
563
946
  auto EntryCount = F.getEntryCount();
564
946
  if (!EntryCount)
565
737
    return None;
566
209
  // Use 128 bit APInt to do the arithmetic to avoid overflow.
567
209
  APInt BlockCount(128, EntryCount.getValue());
568
209
  APInt BlockFreq(128, Freq);
569
209
  APInt EntryFreq(128, getEntryFreq());
570
209
  BlockCount *= BlockFreq;
571
209
  BlockCount = BlockCount.udiv(EntryFreq);
572
209
  return BlockCount.getLimitedValue();
573
209
}
574
575
Scaled64
576
435
BlockFrequencyInfoImplBase::getFloatingBlockFreq(const BlockNode &Node) const {
577
435
  if (!Node.isValid())
578
0
    return Scaled64::getZero();
579
435
  return Freqs[Node.Index].Scaled;
580
435
}
581
582
void BlockFrequencyInfoImplBase::setBlockFreq(const BlockNode &Node,
583
1.92k
                                              uint64_t Freq) {
584
1.92k
  assert(Node.isValid() && "Expected valid node");
585
1.92k
  assert(Node.Index < Freqs.size() && "Expected legal index");
586
1.92k
  Freqs[Node.Index].Integer = Freq;
587
1.92k
}
588
589
std::string
590
0
BlockFrequencyInfoImplBase::getBlockName(const BlockNode &Node) const {
591
0
  return {};
592
0
}
593
594
std::string
595
0
BlockFrequencyInfoImplBase::getLoopName(const LoopData &Loop) const {
596
0
  return getBlockName(Loop.getHeader()) + (Loop.isIrreducible() ? 
"**"0
:
"*"0
);
597
0
}
598
599
raw_ostream &
600
BlockFrequencyInfoImplBase::printBlockFreq(raw_ostream &OS,
601
0
                                           const BlockNode &Node) const {
602
0
  return OS << getFloatingBlockFreq(Node);
603
0
}
604
605
raw_ostream &
606
BlockFrequencyInfoImplBase::printBlockFreq(raw_ostream &OS,
607
0
                                           const BlockFrequency &Freq) const {
608
0
  Scaled64 Block(Freq.getFrequency(), 0);
609
0
  Scaled64 Entry(getEntryFreq(), 0);
610
0
611
0
  return OS << Block / Entry;
612
0
}
613
614
207
void IrreducibleGraph::addNodesInLoop(const BFIBase::LoopData &OuterLoop) {
615
207
  Start = OuterLoop.getHeader();
616
207
  Nodes.reserve(OuterLoop.Nodes.size());
617
207
  for (auto N : OuterLoop.Nodes)
618
11.8k
    addNode(N);
619
207
  indexNodes();
620
207
}
621
622
377
void IrreducibleGraph::addNodesInFunction() {
623
377
  Start = 0;
624
12.5k
  for (uint32_t Index = 0; 
Index < BFI.Working.size()12.5k
;
++Index12.1k
)
625
12.1k
    
if (12.1k
!BFI.Working[Index].isPackaged()12.1k
)
626
9.72k
      addNode(Index);
627
377
  indexNodes();
628
377
}
629
630
584
void IrreducibleGraph::indexNodes() {
631
584
  for (auto &I : Nodes)
632
21.5k
    Lookup[I.Node.Index] = &I;
633
584
}
634
635
void IrreducibleGraph::addEdge(IrrNode &Irr, const BlockNode &Succ,
636
42.9k
                               const BFIBase::LoopData *OuterLoop) {
637
42.9k
  if (
OuterLoop && 42.9k
OuterLoop->isHeader(Succ)20.0k
)
638
353
    return;
639
42.6k
  auto L = Lookup.find(Succ.Index);
640
42.6k
  if (L == Lookup.end())
641
2.75k
    return;
642
39.8k
  IrrNode &SuccIrr = *L->second;
643
39.8k
  Irr.Edges.push_back(&SuccIrr);
644
39.8k
  SuccIrr.Edges.push_front(&Irr);
645
39.8k
  ++SuccIrr.NumIn;
646
39.8k
}
647
648
namespace llvm {
649
650
template <> struct GraphTraits<IrreducibleGraph> {
651
  using GraphT = bfi_detail::IrreducibleGraph;
652
  using NodeRef = const GraphT::IrrNode *;
653
  using ChildIteratorType = GraphT::IrrNode::iterator;
654
655
584
  static NodeRef getEntryNode(const GraphT &G) { return G.StartIrr; }
656
21.5k
  static ChildIteratorType child_begin(NodeRef N) { return N->succ_begin(); }
657
61.4k
  static ChildIteratorType child_end(NodeRef N) { return N->succ_end(); }
658
};
659
660
} // end namespace llvm
661
662
/// \brief Find extra irreducible headers.
663
///
664
/// Find entry blocks and other blocks with backedges, which exist when \c G
665
/// contains irreducible sub-SCCs.
666
static void findIrreducibleHeaders(
667
    const BlockFrequencyInfoImplBase &BFI,
668
    const IrreducibleGraph &G,
669
    const std::vector<const IrreducibleGraph::IrrNode *> &SCC,
670
804
    LoopData::NodeList &Headers, LoopData::NodeList &Others) {
671
804
  // Map from nodes in the SCC to whether it's an entry block.
672
804
  SmallDenseMap<const IrreducibleGraph::IrrNode *, bool, 8> InSCC;
673
804
674
804
  // InSCC also acts the set of nodes in the graph.  Seed it.
675
804
  for (const auto *I : SCC)
676
8.80k
    InSCC[I] = false;
677
804
678
9.61k
  for (auto I = InSCC.begin(), E = InSCC.end(); 
I != E9.61k
;
++I8.80k
) {
679
8.80k
    auto &Irr = *I->first;
680
22.5k
    for (const auto *P : make_range(Irr.pred_begin(), Irr.pred_end())) {
681
22.5k
      if (InSCC.count(P))
682
19.9k
        continue;
683
2.67k
684
2.67k
      // This is an entry block.
685
2.67k
      I->second = true;
686
2.67k
      Headers.push_back(Irr.Node);
687
2.67k
      DEBUG(dbgs() << "  => entry = " << BFI.getBlockName(Irr.Node) << "\n");
688
22.5k
      break;
689
22.5k
    }
690
8.80k
  }
691
804
  assert(Headers.size() >= 2 &&
692
804
         "Expected irreducible CFG; -loop-info is likely invalid");
693
804
  if (
Headers.size() == InSCC.size()804
) {
694
184
    // Every block is a header.
695
184
    std::sort(Headers.begin(), Headers.end());
696
184
    return;
697
184
  }
698
620
699
620
  // Look for extra headers from irreducible sub-SCCs.
700
620
  
for (const auto &I : InSCC) 620
{
701
8.21k
    // Entry blocks are already headers.
702
8.21k
    if (I.second)
703
2.08k
      continue;
704
6.13k
705
6.13k
    auto &Irr = *I.first;
706
15.2k
    for (const auto *P : make_range(Irr.pred_begin(), Irr.pred_end())) {
707
15.2k
      // Skip forward edges.
708
15.2k
      if (P->Node < Irr.Node)
709
10.8k
        continue;
710
4.33k
711
4.33k
      // Skip predecessors from entry blocks.  These can have inverted
712
4.33k
      // ordering.
713
4.33k
      
if (4.33k
InSCC.lookup(P)4.33k
)
714
4.09k
        continue;
715
234
716
234
      // Store the extra header.
717
234
      Headers.push_back(Irr.Node);
718
234
      DEBUG(dbgs() << "  => extra = " << BFI.getBlockName(Irr.Node) << "\n");
719
15.2k
      break;
720
15.2k
    }
721
6.13k
    if (Headers.back() == Irr.Node)
722
6.13k
      // Added this as a header.
723
234
      continue;
724
5.89k
725
5.89k
    // This is not a header.
726
5.89k
    Others.push_back(Irr.Node);
727
5.89k
    DEBUG(dbgs() << "  => other = " << BFI.getBlockName(Irr.Node) << "\n");
728
8.21k
  }
729
804
  std::sort(Headers.begin(), Headers.end());
730
804
  std::sort(Others.begin(), Others.end());
731
804
}
732
733
static void createIrreducibleLoop(
734
    BlockFrequencyInfoImplBase &BFI, const IrreducibleGraph &G,
735
    LoopData *OuterLoop, std::list<LoopData>::iterator Insert,
736
804
    const std::vector<const IrreducibleGraph::IrrNode *> &SCC) {
737
804
  // Translate the SCC into RPO.
738
804
  DEBUG(dbgs() << " - found-scc\n");
739
804
740
804
  LoopData::NodeList Headers;
741
804
  LoopData::NodeList Others;
742
804
  findIrreducibleHeaders(BFI, G, SCC, Headers, Others);
743
804
744
804
  auto Loop = BFI.Loops.emplace(Insert, OuterLoop, Headers.begin(),
745
804
                                Headers.end(), Others.begin(), Others.end());
746
804
747
804
  // Update loop hierarchy.
748
804
  for (const auto &N : Loop->Nodes)
749
8.80k
    
if (8.80k
BFI.Working[N.Index].isLoopHeader()8.80k
)
750
892
      BFI.Working[N.Index].Loop->Parent = &*Loop;
751
8.80k
    else
752
7.91k
      BFI.Working[N.Index].Loop = &*Loop;
753
804
}
754
755
iterator_range<std::list<LoopData>::iterator>
756
BlockFrequencyInfoImplBase::analyzeIrreducible(
757
    const IrreducibleGraph &G, LoopData *OuterLoop,
758
584
    std::list<LoopData>::iterator Insert) {
759
584
  assert((OuterLoop == nullptr) == (Insert == Loops.begin()));
760
584
  auto Prev = OuterLoop ? 
std::prev(Insert)207
:
Loops.end()377
;
761
584
762
14.1k
  for (auto I = scc_begin(G); 
!I.isAtEnd()14.1k
;
++I13.5k
) {
763
13.5k
    if (I->size() < 2)
764
12.7k
      continue;
765
804
766
804
    // Translate the SCC into RPO.
767
804
    createIrreducibleLoop(*this, G, OuterLoop, Insert, *I);
768
804
  }
769
584
770
584
  if (OuterLoop)
771
207
    return make_range(std::next(Prev), Insert);
772
377
  return make_range(Loops.begin(), Insert);
773
377
}
774
775
void
776
207
BlockFrequencyInfoImplBase::updateLoopWithIrreducible(LoopData &OuterLoop) {
777
207
  OuterLoop.Exits.clear();
778
207
  for (auto &Mass : OuterLoop.BackedgeMass)
779
207
    Mass = BlockMass::getEmpty();
780
207
  auto O = OuterLoop.Nodes.begin() + 1;
781
11.8k
  for (auto I = O, E = OuterLoop.Nodes.end(); 
I != E11.8k
;
++I11.6k
)
782
11.6k
    
if (11.6k
!Working[I->Index].isPackaged()11.6k
)
783
7.19k
      *O++ = *I;
784
207
  OuterLoop.Nodes.erase(O, OuterLoop.Nodes.end());
785
207
}
786
787
804
void BlockFrequencyInfoImplBase::adjustLoopHeaderMass(LoopData &Loop) {
788
804
  assert(Loop.isIrreducible() && "this only makes sense on irreducible loops");
789
804
790
804
  // Since the loop has more than one header block, the mass flowing back into
791
804
  // each header will be different. Adjust the mass in each header loop to
792
804
  // reflect the masses flowing through back edges.
793
804
  //
794
804
  // To do this, we distribute the initial mass using the backedge masses
795
804
  // as weights for the distribution.
796
804
  BlockMass LoopMass = BlockMass::getFull();
797
804
  Distribution Dist;
798
804
799
804
  DEBUG(dbgs() << "adjust-loop-header-mass:\n");
800
3.71k
  for (uint32_t H = 0; 
H < Loop.NumHeaders3.71k
;
++H2.91k
) {
801
2.91k
    auto &HeaderNode = Loop.Nodes[H];
802
2.91k
    auto &BackedgeMass = Loop.BackedgeMass[Loop.getHeaderIndex(HeaderNode)];
803
2.91k
    DEBUG(dbgs() << " - Add back edge mass for node "
804
2.91k
                 << getBlockName(HeaderNode) << ": " << BackedgeMass << "\n");
805
2.91k
    if (BackedgeMass.getMass() > 0)
806
2.91k
      Dist.addLocal(HeaderNode, BackedgeMass.getMass());
807
2.91k
    else
808
2.91k
      DEBUG(dbgs() << "   Nothing added. Back edge mass is zero\n");
809
2.91k
  }
810
804
811
804
  DitheringDistributer D(Dist, LoopMass);
812
804
813
804
  DEBUG(dbgs() << " Distribute loop mass " << LoopMass
814
804
               << " to headers using above weights\n");
815
2.91k
  for (const Weight &W : Dist.Weights) {
816
2.91k
    BlockMass Taken = D.takeMass(W.Amount);
817
2.91k
    assert(W.Type == Weight::Local && "all weights should be local");
818
2.91k
    Working[W.TargetNode.Index].getMass() = Taken;
819
2.91k
    DEBUG(debugAssign(*this, D, W.TargetNode, Taken, nullptr));
820
2.91k
  }
821
804
}