Coverage Report

Created: 2019-07-24 05:18

/Users/buildslave/jenkins/workspace/clang-stage2-coverage-R/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp

//===- LoopStrengthReduce.cpp - Strength Reduce IVs in Loops --------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This transformation analyzes and transforms the induction variables (and
// computations derived from them) into forms suitable for efficient execution
// on the target.
//
// This pass performs a strength reduction on array references inside loops
// that have the loop induction variable as one or more of their components;
// it rewrites expressions to take advantage of scaled-index addressing modes
// available on the target, and it performs a variety of other optimizations
// related to loop induction variables.
//
// Terminology note: this code has a lot of handling for "post-increment" or
// "post-inc" users. This is not talking about post-increment addressing modes;
// it is instead talking about code like this:
//
//   %i = phi [ 0, %entry ], [ %i.next, %latch ]
//   ...
//   %i.next = add %i, 1
//   %c = icmp eq %i.next, %n
//
// The SCEV for %i is {0,+,1}<%L>. The SCEV for %i.next is {1,+,1}<%L>, however
// it's useful to think about these as the same register, with some uses using
// the value of the register before the add and some using it after. In this
// example, the icmp is a post-increment user, since it uses %i.next, which is
// the value of the induction variable after the increment. The other common
// case of post-increment users is users outside the loop.
//
// TODO: More sophistication in the way Formulae are generated and filtered.
//
// TODO: Handle multiple loops at a time.
//
// TODO: Should the addressing mode BaseGV be changed to a ConstantExpr instead
//       of a GlobalValue?
//
// TODO: When truncation is free, truncate ICmp users' operands to make it a
//       smaller encoding (on x86 at least).
//
// TODO: When a negated register is used by an add (such as in a list of
//       multiple base registers, or as the increment expression in an addrec),
//       we may not actually need both reg and (-1 * reg) in registers; the
//       negation can be implemented by using a sub instead of an add. The
//       lack of support for taking this into consideration when making
//       register pressure decisions is partly worked around by the "Special"
//       use kind.
//
//===----------------------------------------------------------------------===//

#include "llvm/Transforms/Scalar/LoopStrengthReduce.h"
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/DenseSet.h"
#include "llvm/ADT/Hashing.h"
#include "llvm/ADT/PointerIntPair.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/SmallBitVector.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/iterator_range.h"
#include "llvm/Analysis/IVUsers.h"
#include "llvm/Analysis/LoopAnalysisManager.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/LoopPass.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/ScalarEvolutionExpander.h"
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
#include "llvm/Analysis/ScalarEvolutionNormalization.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Config/llvm-config.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Constant.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/GlobalValue.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/OperandTraits.h"
#include "llvm/IR/Operator.h"
#include "llvm/IR/PassManager.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Use.h"
#include "llvm/IR/User.h"
#include "llvm/IR/Value.h"
#include "llvm/IR/ValueHandle.h"
#include "llvm/Pass.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Utils.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include <algorithm>
#include <cassert>
#include <cstddef>
#include <cstdint>
#include <cstdlib>
#include <iterator>
#include <limits>
#include <map>
#include <numeric>
#include <utility>

using namespace llvm;

#define DEBUG_TYPE "loop-reduce"

/// MaxIVUsers is an arbitrary threshold that provides an early opportunity to
/// bail out. This threshold is far beyond the number of users that LSR can
/// conceivably solve, so it should not affect generated code, but catches the
/// worst cases before LSR burns too much compile time and stack space.
static const unsigned MaxIVUsers = 200;

// Temporary flag to clean up congruent phis after LSR phi expansion.
// It was originally meant to be removed after the v3.0 release, but it is now
// needed for IV chains and is enabled by default.
static cl::opt<bool> EnablePhiElim(
  "enable-lsr-phielim", cl::Hidden, cl::init(true),
  cl::desc("Enable LSR phi elimination"));

// The flag adds the instruction count to the solution cost comparison.
static cl::opt<bool> InsnsCost(
  "lsr-insns-cost", cl::Hidden, cl::init(true),
  cl::desc("Add instruction count to a LSR cost model"));

// Flag to choose how to narrow a complex LSR solution.
static cl::opt<bool> LSRExpNarrow(
  "lsr-exp-narrow", cl::Hidden, cl::init(false),
  cl::desc("Narrow LSR complex solution using"
           " expectation of registers number"));

// Flag to narrow the search space by filtering non-optimal formulae with
// the same ScaledReg and Scale.
static cl::opt<bool> FilterSameScaledReg(
    "lsr-filter-same-scaled-reg", cl::Hidden, cl::init(true),
    cl::desc("Narrow LSR search space by filtering non-optimal formulae"
             " with the same ScaledReg and Scale"));

static cl::opt<bool> EnableBackedgeIndexing(
  "lsr-backedge-indexing", cl::Hidden, cl::init(true),
  cl::desc("Enable the generation of cross iteration indexed memops"));

static cl::opt<unsigned> ComplexityLimit(
  "lsr-complexity-limit", cl::Hidden,
  cl::init(std::numeric_limits<uint16_t>::max()),
  cl::desc("LSR search space complexity limit"));

static cl::opt<unsigned> SetupCostDepthLimit(
    "lsr-setupcost-depth-limit", cl::Hidden, cl::init(7),
    cl::desc("The limit on recursion depth for LSRs setup cost"));

#ifndef NDEBUG
// Stress test IV chain generation.
static cl::opt<bool> StressIVChain(
  "stress-ivchain", cl::Hidden, cl::init(false),
  cl::desc("Stress test LSR IV chains"));
#else
static bool StressIVChain = false;
#endif

namespace {

struct MemAccessTy {
  /// Used in situations where the accessed memory type is unknown.
  static const unsigned UnknownAddressSpace =
      std::numeric_limits<unsigned>::max();

  Type *MemTy = nullptr;
  unsigned AddrSpace = UnknownAddressSpace;

  MemAccessTy() = default;
  MemAccessTy(Type *Ty, unsigned AS) : MemTy(Ty), AddrSpace(AS) {}

  bool operator==(MemAccessTy Other) const {
    return MemTy == Other.MemTy && AddrSpace == Other.AddrSpace;
  }

  bool operator!=(MemAccessTy Other) const { return !(*this == Other); }

  static MemAccessTy getUnknown(LLVMContext &Ctx,
                                unsigned AS = UnknownAddressSpace) {
    return MemAccessTy(Type::getVoidTy(Ctx), AS);
  }

  Type *getType() { return MemTy; }
};

/// This class holds data which is used to order reuse candidates.
class RegSortData {
public:
  /// This represents the set of LSRUse indices which reference
  /// a particular register.
  SmallBitVector UsedByIndices;

  void print(raw_ostream &OS) const;
  void dump() const;
};

} // end anonymous namespace

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void RegSortData::print(raw_ostream &OS) const {
  OS << "[NumUses=" << UsedByIndices.count() << ']';
}

LLVM_DUMP_METHOD void RegSortData::dump() const {
  print(errs()); errs() << '\n';
}
#endif

namespace {

/// Map register candidates to information about how they are used.
class RegUseTracker {
  using RegUsesTy = DenseMap<const SCEV *, RegSortData>;

  RegUsesTy RegUsesMap;
  SmallVector<const SCEV *, 16> RegSequence;

public:
  void countRegister(const SCEV *Reg, size_t LUIdx);
  void dropRegister(const SCEV *Reg, size_t LUIdx);
  void swapAndDropUse(size_t LUIdx, size_t LastLUIdx);

  bool isRegUsedByUsesOtherThan(const SCEV *Reg, size_t LUIdx) const;

  const SmallBitVector &getUsedByIndices(const SCEV *Reg) const;

  void clear();

  using iterator = SmallVectorImpl<const SCEV *>::iterator;
  using const_iterator = SmallVectorImpl<const SCEV *>::const_iterator;

  iterator begin() { return RegSequence.begin(); }
  iterator end()   { return RegSequence.end(); }
  const_iterator begin() const { return RegSequence.begin(); }
  const_iterator end() const   { return RegSequence.end(); }
};

} // end anonymous namespace

void
RegUseTracker::countRegister(const SCEV *Reg, size_t LUIdx) {
  std::pair<RegUsesTy::iterator, bool> Pair =
    RegUsesMap.insert(std::make_pair(Reg, RegSortData()));
  RegSortData &RSD = Pair.first->second;
  if (Pair.second)
    RegSequence.push_back(Reg);
  RSD.UsedByIndices.resize(std::max(RSD.UsedByIndices.size(), LUIdx + 1));
  RSD.UsedByIndices.set(LUIdx);
}

void
RegUseTracker::dropRegister(const SCEV *Reg, size_t LUIdx) {
  RegUsesTy::iterator It = RegUsesMap.find(Reg);
  assert(It != RegUsesMap.end());
  RegSortData &RSD = It->second;
  assert(RSD.UsedByIndices.size() > LUIdx);
  RSD.UsedByIndices.reset(LUIdx);
}

void
RegUseTracker::swapAndDropUse(size_t LUIdx, size_t LastLUIdx) {
  assert(LUIdx <= LastLUIdx);

  // Update RegUses. The data structure is not optimized for this purpose;
  // we must iterate through it and update each of the bit vectors.
  for (auto &Pair : RegUsesMap) {
    SmallBitVector &UsedByIndices = Pair.second.UsedByIndices;
    if (LUIdx < UsedByIndices.size())
      UsedByIndices[LUIdx] =
        LastLUIdx < UsedByIndices.size() ? UsedByIndices[LastLUIdx] : false;
    UsedByIndices.resize(std::min(UsedByIndices.size(), LastLUIdx));
  }
}

bool
RegUseTracker::isRegUsedByUsesOtherThan(const SCEV *Reg, size_t LUIdx) const {
  RegUsesTy::const_iterator I = RegUsesMap.find(Reg);
  if (I == RegUsesMap.end())
    return false;
  const SmallBitVector &UsedByIndices = I->second.UsedByIndices;
  int i = UsedByIndices.find_first();
  if (i == -1) return false;
  if ((size_t)i != LUIdx) return true;
  return UsedByIndices.find_next(i) != -1;
}

const SmallBitVector &RegUseTracker::getUsedByIndices(const SCEV *Reg) const {
  RegUsesTy::const_iterator I = RegUsesMap.find(Reg);
  assert(I != RegUsesMap.end() && "Unknown register!");
  return I->second.UsedByIndices;
}

void RegUseTracker::clear() {
  RegUsesMap.clear();
  RegSequence.clear();
}

namespace {

/// This class holds information that describes a formula for computing a value
/// that satisfies a use. It may include broken-out immediates and scaled
/// registers.
struct Formula {
  /// Global base address used for complex addressing.
  GlobalValue *BaseGV = nullptr;

  /// Base offset for complex addressing.
  int64_t BaseOffset = 0;

  /// Whether any complex addressing has a base register.
  bool HasBaseReg = false;

  /// The scale of any complex addressing.
  int64_t Scale = 0;

  /// The list of "base" registers for this use. When this is non-empty, the
  /// canonical representation of a formula is
  /// 1. BaseRegs.size > 1 implies ScaledReg != NULL and
  /// 2. ScaledReg != NULL implies Scale != 1 || !BaseRegs.empty().
  /// 3. The reg containing the recurrent expr related to the current loop in
  /// the formula should be put in the ScaledReg.
  /// #1 enforces that the scaled register is always used when at least two
  /// registers are needed by the formula: e.g., reg1 + reg2 is reg1 + 1 * reg2.
  /// #2 enforces that 1 * reg is reg.
  /// #3 ensures invariant regs with respect to the current loop can be combined
  /// together in LSR codegen.
  /// This invariant can be temporarily broken while building a formula.
  /// However, every formula inserted into the LSRInstance must be in canonical
  /// form.
  SmallVector<const SCEV *, 4> BaseRegs;

  /// The 'scaled' register for this use. This should be non-null when Scale is
  /// not zero.
  const SCEV *ScaledReg = nullptr;

  /// An additional constant offset which is added near the use. This requires
  /// a temporary register, but the offset itself can live in an add immediate
  /// field rather than a register.
  int64_t UnfoldedOffset = 0;

  Formula() = default;

  void initialMatch(const SCEV *S, Loop *L, ScalarEvolution &SE);

  bool isCanonical(const Loop &L) const;

  void canonicalize(const Loop &L);

  bool unscale();

  bool hasZeroEnd() const;

  size_t getNumRegs() const;
  Type *getType() const;

  void deleteBaseReg(const SCEV *&S);

  bool referencesReg(const SCEV *S) const;
  bool hasRegsUsedByUsesOtherThan(size_t LUIdx,
                                  const RegUseTracker &RegUses) const;

  void print(raw_ostream &OS) const;
  void dump() const;
};

} // end anonymous namespace

/// Recursion helper for initialMatch.
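/// Expressions that properly dominate the loop header are collected in
/// \p Good (they can be kept together in a single, loop-invariant base
/// register); subexpressions that cannot be decomposed any further are
/// collected in \p Bad.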
static void DoInitialMatch(const SCEV *S, Loop *L,
                           SmallVectorImpl<const SCEV *> &Good,
                           SmallVectorImpl<const SCEV *> &Bad,
                           ScalarEvolution &SE) {
  // Collect expressions which properly dominate the loop header.
  if (SE.properlyDominates(S, L->getHeader())) {
    Good.push_back(S);
    return;
  }

  // Look at add operands.
  if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(S)) {
    for (const SCEV *S : Add->operands())
      DoInitialMatch(S, L, Good, Bad, SE);
    return;
  }

  // Look at addrec operands.
  if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S))
    if (!AR->getStart()->isZero() && AR->isAffine()) {
      DoInitialMatch(AR->getStart(), L, Good, Bad, SE);
      DoInitialMatch(SE.getAddRecExpr(SE.getConstant(AR->getType(), 0),
                                      AR->getStepRecurrence(SE),
                                      // FIXME: AR->getNoWrapFlags()
                                      AR->getLoop(), SCEV::FlagAnyWrap),
                     L, Good, Bad, SE);
      return;
    }

  // Handle a multiplication by -1 (negation) if it didn't fold.
  if (const SCEVMulExpr *Mul = dyn_cast<SCEVMulExpr>(S))
    if (Mul->getOperand(0)->isAllOnesValue()) {
      SmallVector<const SCEV *, 4> Ops(Mul->op_begin()+1, Mul->op_end());
      const SCEV *NewMul = SE.getMulExpr(Ops);

      SmallVector<const SCEV *, 4> MyGood;
      SmallVector<const SCEV *, 4> MyBad;
      DoInitialMatch(NewMul, L, MyGood, MyBad, SE);
      const SCEV *NegOne = SE.getSCEV(ConstantInt::getAllOnesValue(
        SE.getEffectiveSCEVType(NewMul->getType())));
      for (const SCEV *S : MyGood)
        Good.push_back(SE.getMulExpr(NegOne, S));
      for (const SCEV *S : MyBad)
        Bad.push_back(SE.getMulExpr(NegOne, S));
      return;
    }

  // Ok, we can't do anything interesting. Just stuff the whole thing into a
  // register and hope for the best.
  Bad.push_back(S);
}

/// Incorporate loop-variant parts of S into this Formula, attempting to keep
/// all loop-invariant and loop-computable values in a single base register.
void Formula::initialMatch(const SCEV *S, Loop *L, ScalarEvolution &SE) {
  SmallVector<const SCEV *, 4> Good;
  SmallVector<const SCEV *, 4> Bad;
  DoInitialMatch(S, L, Good, Bad, SE);
  if (!Good.empty()) {
    const SCEV *Sum = SE.getAddExpr(Good);
    if (!Sum->isZero())
      BaseRegs.push_back(Sum);
    HasBaseReg = true;
  }
  if (!Bad.empty()) {
    const SCEV *Sum = SE.getAddExpr(Bad);
    if (!Sum->isZero())
      BaseRegs.push_back(Sum);
    HasBaseReg = true;
  }
  canonicalize(*L);
}

/// Check whether or not this formula satisfies the canonical
/// representation.
/// \see Formula::BaseRegs.
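/// For example, a formula with BaseRegs = {reg1, reg2} and no ScaledReg is not
/// canonical; its canonical form keeps one of the registers (preferably the
/// addrec for the current loop) in ScaledReg with Scale == 1, i.e.
/// reg1 + 1*reg2.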
465
5.93M
bool Formula::isCanonical(const Loop &L) const {
466
5.93M
  if (!ScaledReg)
467
3.59M
    return BaseRegs.size() <= 1;
468
2.33M
469
2.33M
  if (Scale != 1)
470
75.4k
    return true;
471
2.26M
472
2.26M
  if (Scale == 1 && BaseRegs.empty())
473
0
    return false;
474
2.26M
475
2.26M
  const SCEVAddRecExpr *SAR = dyn_cast<const SCEVAddRecExpr>(ScaledReg);
476
2.26M
  if (SAR && 
SAR->getLoop() == &L1.86M
)
477
1.81M
    return true;
478
446k
479
446k
  // If ScaledReg is not a recurrent expr, or it is but its loop is not current
480
446k
  // loop, meanwhile BaseRegs contains a recurrent expr reg related with current
481
446k
  // loop, we want to swap the reg in BaseRegs with ScaledReg.
482
446k
  auto I =
483
939k
      find_if(make_range(BaseRegs.begin(), BaseRegs.end()), [&](const SCEV *S) {
484
939k
        return isa<const SCEVAddRecExpr>(S) &&
485
939k
               
(cast<SCEVAddRecExpr>(S)->getLoop() == &L)494k
;
486
939k
      });
487
446k
  return I == BaseRegs.end();
488
446k
}
489
490
/// Helper method to morph a formula into its canonical representation.
491
/// \see Formula::BaseRegs.
492
/// Every formula having more than one base register, must use the ScaledReg
493
/// field. Otherwise, we would have to do special cases everywhere in LSR
494
/// to treat reg1 + reg2 + ... the same way as reg1 + 1*reg2 + ...
495
/// On the other hand, 1*reg should be canonicalized into reg.
496
5.93M
void Formula::canonicalize(const Loop &L) {
497
5.93M
  if (isCanonical(L))
498
3.13M
    return;
499
2.79M
  // So far we did not need this case. This is easy to implement but it is
500
2.79M
  // useless to maintain dead code. Beside it could hurt compile time.
501
2.79M
  assert(!BaseRegs.empty() && "1*reg => reg, should not be needed.");
502
2.79M
503
2.79M
  // Keep the invariant sum in BaseRegs and one of the variant sum in ScaledReg.
504
2.79M
  if (!ScaledReg) {
505
2.35M
    ScaledReg = BaseRegs.back();
506
2.35M
    BaseRegs.pop_back();
507
2.35M
    Scale = 1;
508
2.35M
  }
509
2.79M
510
2.79M
  // If ScaledReg is an invariant with respect to L, find the reg from
511
2.79M
  // BaseRegs containing the recurrent expr related with Loop L. Swap the
512
2.79M
  // reg with ScaledReg.
513
2.79M
  const SCEVAddRecExpr *SAR = dyn_cast<const SCEVAddRecExpr>(ScaledReg);
514
2.79M
  if (!SAR || 
SAR->getLoop() != &L1.29M
) {
515
1.75M
    auto I = find_if(make_range(BaseRegs.begin(), BaseRegs.end()),
516
2.25M
                     [&](const SCEV *S) {
517
2.25M
                       return isa<const SCEVAddRecExpr>(S) &&
518
2.25M
                              
(cast<SCEVAddRecExpr>(S)->getLoop() == &L)1.80M
;
519
2.25M
                     });
520
1.75M
    if (I != BaseRegs.end())
521
1.75M
      std::swap(ScaledReg, *I);
522
1.75M
  }
523
2.79M
}
524
525
/// Get rid of the scale in the formula.
526
/// In other words, this method morphes reg1 + 1*reg2 into reg1 + reg2.
527
/// \return true if it was possible to get rid of the scale, false otherwise.
528
/// \note After this operation the formula may not be in the canonical form.
529
21.3M
bool Formula::unscale() {
530
21.3M
  if (Scale != 1)
531
7.42M
    return false;
532
13.9M
  Scale = 0;
533
13.9M
  BaseRegs.push_back(ScaledReg);
534
13.9M
  ScaledReg = nullptr;
535
13.9M
  return true;
536
13.9M
}
537
538
1.59M
bool Formula::hasZeroEnd() const {
539
1.59M
  if (UnfoldedOffset || 
BaseOffset1.54M
)
540
228k
    return false;
541
1.36M
  if (BaseRegs.size() != 1 || 
ScaledReg1.28M
)
542
637k
    return false;
543
724k
  return true;
544
724k
}
545
546
/// Return the total number of register operands used by this formula. This does
547
/// not include register uses implied by non-constant addrec strides.
548
22.2M
size_t Formula::getNumRegs() const {
549
22.2M
  return !!ScaledReg + BaseRegs.size();
550
22.2M
}
551
552
/// Return the type of this formula, if it has one, or null otherwise. This type
553
/// is meaningless except for the bit size.
554
6.51M
Type *Formula::getType() const {
555
6.51M
  return !BaseRegs.empty() ? 
BaseRegs.front()->getType()6.49M
:
556
6.51M
         
ScaledReg 21.4k
?
ScaledReg->getType()21.4k
:
557
21.4k
         
BaseGV 0
?
BaseGV->getType()0
:
558
0
         nullptr;
559
6.51M
}
560
561
/// Delete the given base reg from the BaseRegs list.
562
1.14M
void Formula::deleteBaseReg(const SCEV *&S) {
563
1.14M
  if (&S != &BaseRegs.back())
564
15.4k
    std::swap(S, BaseRegs.back());
565
1.14M
  BaseRegs.pop_back();
566
1.14M
}
567
568
/// Test if this formula references the given register.
569
324k
bool Formula::referencesReg(const SCEV *S) const {
570
324k
  return S == ScaledReg || 
is_contained(BaseRegs, S)315k
;
571
324k
}
572
573
/// Test whether this formula uses registers which are used by uses other than
574
/// the use with the given index.
575
bool Formula::hasRegsUsedByUsesOtherThan(size_t LUIdx,
576
81.3k
                                         const RegUseTracker &RegUses) const {
577
81.3k
  if (ScaledReg)
578
49.7k
    if (RegUses.isRegUsedByUsesOtherThan(ScaledReg, LUIdx))
579
19.6k
      return true;
580
61.7k
  for (const SCEV *BaseReg : BaseRegs)
581
70.2k
    if (RegUses.isRegUsedByUsesOtherThan(BaseReg, LUIdx))
582
22.7k
      return true;
583
61.7k
  
return false38.9k
;
584
61.7k
}
585
586
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
587
void Formula::print(raw_ostream &OS) const {
588
  bool First = true;
589
  if (BaseGV) {
590
    if (!First) OS << " + "; else First = false;
591
    BaseGV->printAsOperand(OS, /*PrintType=*/false);
592
  }
593
  if (BaseOffset != 0) {
594
    if (!First) OS << " + "; else First = false;
595
    OS << BaseOffset;
596
  }
597
  for (const SCEV *BaseReg : BaseRegs) {
598
    if (!First) OS << " + "; else First = false;
599
    OS << "reg(" << *BaseReg << ')';
600
  }
601
  if (HasBaseReg && BaseRegs.empty()) {
602
    if (!First) OS << " + "; else First = false;
603
    OS << "**error: HasBaseReg**";
604
  } else if (!HasBaseReg && !BaseRegs.empty()) {
605
    if (!First) OS << " + "; else First = false;
606
    OS << "**error: !HasBaseReg**";
607
  }
608
  if (Scale != 0) {
609
    if (!First) OS << " + "; else First = false;
610
    OS << Scale << "*reg(";
611
    if (ScaledReg)
612
      OS << *ScaledReg;
613
    else
614
      OS << "<unknown>";
615
    OS << ')';
616
  }
617
  if (UnfoldedOffset != 0) {
618
    if (!First) OS << " + ";
619
    OS << "imm(" << UnfoldedOffset << ')';
620
  }
621
}
622
623
LLVM_DUMP_METHOD void Formula::dump() const {
624
  print(errs()); errs() << '\n';
625
}
626
#endif
627
628
/// Return true if the given addrec can be sign-extended without changing its
629
/// value.
630
188k
static bool isAddRecSExtable(const SCEVAddRecExpr *AR, ScalarEvolution &SE) {
631
188k
  Type *WideTy =
632
188k
    IntegerType::get(SE.getContext(), SE.getTypeSizeInBits(AR->getType()) + 1);
633
188k
  return isa<SCEVAddRecExpr>(SE.getSignExtendExpr(AR, WideTy));
634
188k
}
635
636
/// Return true if the given add can be sign-extended without changing its
637
/// value.
638
25.6k
static bool isAddSExtable(const SCEVAddExpr *A, ScalarEvolution &SE) {
639
25.6k
  Type *WideTy =
640
25.6k
    IntegerType::get(SE.getContext(), SE.getTypeSizeInBits(A->getType()) + 1);
641
25.6k
  return isa<SCEVAddExpr>(SE.getSignExtendExpr(A, WideTy));
642
25.6k
}
643
644
/// Return true if the given mul can be sign-extended without changing its
645
/// value.
646
134k
static bool isMulSExtable(const SCEVMulExpr *M, ScalarEvolution &SE) {
647
134k
  Type *WideTy =
648
134k
    IntegerType::get(SE.getContext(),
649
134k
                     SE.getTypeSizeInBits(M->getType()) * M->getNumOperands());
650
134k
  return isa<SCEVMulExpr>(SE.getSignExtendExpr(M, WideTy));
651
134k
}
652
653
/// Return an expression for LHS /s RHS, if it can be determined and if the
654
/// remainder is known to be zero, or null otherwise. If IgnoreSignificantBits
655
/// is true, expressions like (X * Y) /s Y are simplified to Y, ignoring that
656
/// the multiplication may overflow, which is useful when the result will be
657
/// used in a context where the most significant bits are ignored.
658
static const SCEV *getExactSDiv(const SCEV *LHS, const SCEV *RHS,
659
                                ScalarEvolution &SE,
660
3.32M
                                bool IgnoreSignificantBits = false) {
661
3.32M
  // Handle the trivial case, which works for any SCEV type.
662
3.32M
  if (LHS == RHS)
663
318k
    return SE.getConstant(LHS->getType(), 1);
664
3.00M
665
3.00M
  // Handle a few RHS special cases.
666
3.00M
  const SCEVConstant *RC = dyn_cast<SCEVConstant>(RHS);
667
3.00M
  if (RC) {
668
2.99M
    const APInt &RA = RC->getAPInt();
669
2.99M
    // Handle x /s -1 as x * -1, to give ScalarEvolution a chance to do
670
2.99M
    // some folding.
671
2.99M
    if (RA.isAllOnesValue())
672
615k
      return SE.getMulExpr(LHS, RC);
673
2.38M
    // Handle x /s 1 as x.
674
2.38M
    if (RA == 1)
675
675k
      return LHS;
676
1.71M
  }
677
1.71M
678
1.71M
  // Check for a division of a constant by a constant.
679
1.71M
  if (const SCEVConstant *C = dyn_cast<SCEVConstant>(LHS)) {
680
688k
    if (!RC)
681
2.03k
      return nullptr;
682
686k
    const APInt &LA = C->getAPInt();
683
686k
    const APInt &RA = RC->getAPInt();
684
686k
    if (LA.srem(RA) != 0)
685
51.7k
      return nullptr;
686
634k
    return SE.getConstant(LA.sdiv(RA));
687
634k
  }
688
1.02M
689
1.02M
  // Distribute the sdiv over addrec operands, if the addrec doesn't overflow.
690
1.02M
  if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(LHS)) {
691
566k
    if ((IgnoreSignificantBits || 
isAddRecSExtable(AR, SE)188k
) &&
AR->isAffine()462k
) {
692
462k
      const SCEV *Step = getExactSDiv(AR->getStepRecurrence(SE), RHS, SE,
693
462k
                                      IgnoreSignificantBits);
694
462k
      if (!Step) 
return nullptr2.19k
;
695
460k
      const SCEV *Start = getExactSDiv(AR->getStart(), RHS, SE,
696
460k
                                       IgnoreSignificantBits);
697
460k
      if (!Start) 
return nullptr136k
;
698
323k
      // FlagNW is independent of the start value, step direction, and is
699
323k
      // preserved with smaller magnitude steps.
700
323k
      // FIXME: AR->getNoWrapFlags(SCEV::FlagNW)
701
323k
      return SE.getAddRecExpr(Start, Step, AR->getLoop(), SCEV::FlagAnyWrap);
702
323k
    }
703
103k
    return nullptr;
704
103k
  }
705
456k
706
456k
  // Distribute the sdiv over add operands, if the add doesn't overflow.
707
456k
  if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(LHS)) {
708
95.9k
    if (IgnoreSignificantBits || 
isAddSExtable(Add, SE)25.6k
) {
709
75.8k
      SmallVector<const SCEV *, 8> Ops;
710
163k
      for (const SCEV *S : Add->operands()) {
711
163k
        const SCEV *Op = getExactSDiv(S, RHS, SE, IgnoreSignificantBits);
712
163k
        if (!Op) 
return nullptr59.4k
;
713
103k
        Ops.push_back(Op);
714
103k
      }
715
75.8k
      
return SE.getAddExpr(Ops)16.4k
;
716
20.0k
    }
717
20.0k
    return nullptr;
718
20.0k
  }
719
360k
720
360k
  // Check for a multiply operand that we can pull RHS out of.
721
360k
  if (const SCEVMulExpr *Mul = dyn_cast<SCEVMulExpr>(LHS)) {
722
234k
    if (IgnoreSignificantBits || 
isMulSExtable(Mul, SE)134k
) {
723
135k
      SmallVector<const SCEV *, 4> Ops;
724
135k
      bool Found = false;
725
271k
      for (const SCEV *S : Mul->operands()) {
726
271k
        if (!Found)
727
137k
          if (const SCEV *Q = getExactSDiv(S, RHS, SE,
728
132k
                                           IgnoreSignificantBits)) {
729
132k
            S = Q;
730
132k
            Found = true;
731
132k
          }
732
271k
        Ops.push_back(S);
733
271k
      }
734
135k
      return Found ? 
SE.getMulExpr(Ops)132k
:
nullptr2.37k
;
735
135k
    }
736
99.3k
    return nullptr;
737
99.3k
  }
738
125k
739
125k
  // Otherwise we don't know.
740
125k
  return nullptr;
741
125k
}
742
743
/// If S involves the addition of a constant integer value, return that integer
744
/// value, and mutate S to point to a new SCEV with that value excluded.
745
20.9M
static int64_t ExtractImmediate(const SCEV *&S, ScalarEvolution &SE) {
746
20.9M
  if (const SCEVConstant *C = dyn_cast<SCEVConstant>(S)) {
747
7.38M
    if (C->getAPInt().getMinSignedBits() <= 64) {
748
7.38M
      S = SE.getConstant(C->getType(), 0);
749
7.38M
      return C->getValue()->getSExtValue();
750
7.38M
    }
751
13.5M
  } else if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(S)) {
752
2.31M
    SmallVector<const SCEV *, 8> NewOps(Add->op_begin(), Add->op_end());
753
2.31M
    int64_t Result = ExtractImmediate(NewOps.front(), SE);
754
2.31M
    if (Result != 0)
755
1.85M
      S = SE.getAddExpr(NewOps);
756
2.31M
    return Result;
757
11.2M
  } else if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S)) {
758
6.21M
    SmallVector<const SCEV *, 8> NewOps(AR->op_begin(), AR->op_end());
759
6.21M
    int64_t Result = ExtractImmediate(NewOps.front(), SE);
760
6.21M
    if (Result != 0)
761
1.90M
      S = SE.getAddRecExpr(NewOps, AR->getLoop(),
762
1.90M
                           // FIXME: AR->getNoWrapFlags(SCEV::FlagNW)
763
1.90M
                           SCEV::FlagAnyWrap);
764
6.21M
    return Result;
765
6.21M
  }
766
5.04M
  return 0;
767
5.04M
}
768
769
/// If S involves the addition of a GlobalValue address, return that symbol, and
770
/// mutate S to point to a new SCEV with that value excluded.
771
15.0M
static GlobalValue *ExtractSymbol(const SCEV *&S, ScalarEvolution &SE) {
772
15.0M
  if (const SCEVUnknown *U = dyn_cast<SCEVUnknown>(S)) {
773
3.36M
    if (GlobalValue *GV = dyn_cast<GlobalValue>(U->getValue())) {
774
759k
      S = SE.getConstant(GV->getType(), 0);
775
759k
      return GV;
776
759k
    }
777
11.6M
  } else if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(S)) {
778
1.13M
    SmallVector<const SCEV *, 8> NewOps(Add->op_begin(), Add->op_end());
779
1.13M
    GlobalValue *Result = ExtractSymbol(NewOps.back(), SE);
780
1.13M
    if (Result)
781
212k
      S = SE.getAddExpr(NewOps);
782
1.13M
    return Result;
783
10.5M
  } else if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S)) {
784
4.21M
    SmallVector<const SCEV *, 8> NewOps(AR->op_begin(), AR->op_end());
785
4.21M
    GlobalValue *Result = ExtractSymbol(NewOps.front(), SE);
786
4.21M
    if (Result)
787
118k
      S = SE.getAddRecExpr(NewOps, AR->getLoop(),
788
118k
                           // FIXME: AR->getNoWrapFlags(SCEV::FlagNW)
789
118k
                           SCEV::FlagAnyWrap);
790
4.21M
    return Result;
791
4.21M
  }
792
8.92M
  return nullptr;
793
8.92M
}
794
795
/// Returns true if the specified instruction is using the specified value as an
796
/// address.
797
static bool isAddressUse(const TargetTransformInfo &TTI,
798
503k
                         Instruction *Inst, Value *OperandVal) {
799
503k
  bool isAddress = isa<LoadInst>(Inst);
800
503k
  if (StoreInst *SI = dyn_cast<StoreInst>(Inst)) {
801
209k
    if (SI->getPointerOperand() == OperandVal)
802
205k
      isAddress = true;
803
294k
  } else if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(Inst)) {
804
3.59k
    // Addressing modes can also be folded into prefetches and a variety
805
3.59k
    // of intrinsics.
806
3.59k
    switch (II->getIntrinsicID()) {
807
3.59k
    case Intrinsic::memset:
808
1.85k
    case Intrinsic::prefetch:
809
1.85k
      if (II->getArgOperand(0) == OperandVal)
810
1.85k
        isAddress = true;
811
1.85k
      break;
812
1.85k
    case Intrinsic::memmove:
813
1.32k
    case Intrinsic::memcpy:
814
1.32k
      if (II->getArgOperand(0) == OperandVal ||
815
1.32k
          
II->getArgOperand(1) == OperandVal681
)
816
1.32k
        isAddress = true;
817
1.32k
      break;
818
1.32k
    default: {
819
413
      MemIntrinsicInfo IntrInfo;
820
413
      if (TTI.getTgtMemIntrinsic(II, IntrInfo)) {
821
243
        if (IntrInfo.PtrVal == OperandVal)
822
243
          isAddress = true;
823
243
      }
824
413
    }
825
3.59k
    }
826
290k
  } else if (AtomicRMWInst *RMW = dyn_cast<AtomicRMWInst>(Inst)) {
827
16
    if (RMW->getPointerOperand() == OperandVal)
828
16
      isAddress = true;
829
290k
  } else if (AtomicCmpXchgInst *CmpX = dyn_cast<AtomicCmpXchgInst>(Inst)) {
830
10
    if (CmpX->getPointerOperand() == OperandVal)
831
10
      isAddress = true;
832
10
  }
833
503k
  return isAddress;
834
503k
}
835
836
/// Return the type of the memory being accessed.
837
static MemAccessTy getAccessType(const TargetTransformInfo &TTI,
838
339k
                                 Instruction *Inst, Value *OperandVal) {
839
339k
  MemAccessTy AccessTy(Inst->getType(), MemAccessTy::UnknownAddressSpace);
840
339k
  if (const StoreInst *SI = dyn_cast<StoreInst>(Inst)) {
841
205k
    AccessTy.MemTy = SI->getOperand(0)->getType();
842
205k
    AccessTy.AddrSpace = SI->getPointerAddressSpace();
843
205k
  } else 
if (const LoadInst *133k
LI133k
= dyn_cast<LoadInst>(Inst)) {
844
129k
    AccessTy.AddrSpace = LI->getPointerAddressSpace();
845
129k
  } else 
if (const AtomicRMWInst *3.45k
RMW3.45k
= dyn_cast<AtomicRMWInst>(Inst)) {
846
16
    AccessTy.AddrSpace = RMW->getPointerAddressSpace();
847
3.43k
  } else if (const AtomicCmpXchgInst *CmpX = dyn_cast<AtomicCmpXchgInst>(Inst)) {
848
10
    AccessTy.AddrSpace = CmpX->getPointerAddressSpace();
849
3.42k
  } else if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(Inst)) {
850
3.42k
    switch (II->getIntrinsicID()) {
851
3.42k
    case Intrinsic::prefetch:
852
1.85k
    case Intrinsic::memset:
853
1.85k
      AccessTy.AddrSpace = II->getArgOperand(0)->getType()->getPointerAddressSpace();
854
1.85k
      AccessTy.MemTy = OperandVal->getType();
855
1.85k
      break;
856
1.85k
    case Intrinsic::memmove:
857
1.32k
    case Intrinsic::memcpy:
858
1.32k
      AccessTy.AddrSpace = OperandVal->getType()->getPointerAddressSpace();
859
1.32k
      AccessTy.MemTy = OperandVal->getType();
860
1.32k
      break;
861
1.32k
    default: {
862
243
      MemIntrinsicInfo IntrInfo;
863
243
      if (TTI.getTgtMemIntrinsic(II, IntrInfo) && IntrInfo.PtrVal) {
864
243
        AccessTy.AddrSpace
865
243
          = IntrInfo.PtrVal->getType()->getPointerAddressSpace();
866
243
      }
867
243
868
243
      break;
869
339k
    }
870
3.42k
    }
871
3.42k
  }
872
339k
873
339k
  // All pointers have the same requirements, so canonicalize them to an
874
339k
  // arbitrary pointer type to minimize variation.
875
339k
  if (PointerType *PTy = dyn_cast<PointerType>(AccessTy.MemTy))
876
34.5k
    AccessTy.MemTy = PointerType::get(IntegerType::get(PTy->getContext(), 1),
877
34.5k
                                      PTy->getAddressSpace());
878
339k
879
339k
  return AccessTy;
880
339k
}
881
882
/// Return true if this AddRec is already a phi in its loop.
883
1.10M
static bool isExistingPhi(const SCEVAddRecExpr *AR, ScalarEvolution &SE) {
884
2.34M
  for (PHINode &PN : AR->getLoop()->getHeader()->phis()) {
885
2.34M
    if (SE.isSCEVable(PN.getType()) &&
886
2.34M
        (SE.getEffectiveSCEVType(PN.getType()) ==
887
2.31M
         SE.getEffectiveSCEVType(AR->getType())) &&
888
2.34M
        
SE.getSCEV(&PN) == AR1.43M
)
889
101k
      return true;
890
2.34M
  }
891
1.10M
  
return false1.00M
;
892
1.10M
}
893
894
/// Check if expanding this expression is likely to incur significant cost. This
895
/// is tricky because SCEV doesn't track which expressions are actually computed
896
/// by the current IR.
897
///
898
/// We currently allow expansion of IV increments that involve adds,
899
/// multiplication by constants, and AddRecs from existing phis.
900
///
901
/// TODO: Allow UDivExpr if we can find an existing IV increment that is an
902
/// obvious multiple of the UDivExpr.
903
static bool isHighCostExpansion(const SCEV *S,
904
                                SmallPtrSetImpl<const SCEV*> &Processed,
905
336k
                                ScalarEvolution &SE) {
906
336k
  // Zero/One operand expressions
907
336k
  switch (S->getSCEVType()) {
908
336k
  case scUnknown:
909
331k
  case scConstant:
910
331k
    return false;
911
331k
  case scTruncate:
912
11
    return isHighCostExpansion(cast<SCEVTruncateExpr>(S)->getOperand(),
913
11
                               Processed, SE);
914
331k
  case scZeroExtend:
915
176
    return isHighCostExpansion(cast<SCEVZeroExtendExpr>(S)->getOperand(),
916
176
                               Processed, SE);
917
331k
  case scSignExtend:
918
1.44k
    return isHighCostExpansion(cast<SCEVSignExtendExpr>(S)->getOperand(),
919
1.44k
                               Processed, SE);
920
3.96k
  }
921
3.96k
922
3.96k
  if (!Processed.insert(S).second)
923
5
    return false;
924
3.95k
925
3.95k
  if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(S)) {
926
2.47k
    for (const SCEV *S : Add->operands()) {
927
2.47k
      if (isHighCostExpansion(S, Processed, SE))
928
516
        return true;
929
2.47k
    }
930
1.33k
    
return false821
;
931
2.62k
  }
932
2.62k
933
2.62k
  if (const SCEVMulExpr *Mul = dyn_cast<SCEVMulExpr>(S)) {
934
2.20k
    if (Mul->getNumOperands() == 2) {
935
2.19k
      // Multiplication by a constant is ok
936
2.19k
      if (isa<SCEVConstant>(Mul->getOperand(0)))
937
1.84k
        return isHighCostExpansion(Mul->getOperand(1), Processed, SE);
938
346
939
346
      // If we have the value of one operand, check if an existing
940
346
      // multiplication already generates this expression.
941
346
      if (const SCEVUnknown *U = dyn_cast<SCEVUnknown>(Mul->getOperand(1))) {
942
173
        Value *UVal = U->getValue();
943
394
        for (User *UR : UVal->users()) {
944
394
          // If U is a constant, it may be used by a ConstantExpr.
945
394
          Instruction *UI = dyn_cast<Instruction>(UR);
946
394
          if (UI && UI->getOpcode() == Instruction::Mul &&
947
394
              
SE.isSCEVable(UI->getType())173
) {
948
173
            return SE.getSCEV(UI) == Mul;
949
173
          }
950
394
        }
951
173
      }
952
346
    }
953
2.20k
  }
954
2.62k
955
2.62k
  
if (const SCEVAddRecExpr *603
AR603
= dyn_cast<SCEVAddRecExpr>(S)) {
956
241
    if (isExistingPhi(AR, SE))
957
82
      return false;
958
521
  }
959
521
960
521
  // Fow now, consider any other type of expression (div/mul/min/max) high cost.
961
521
  return true;
962
521
}
963
964
/// If any of the instructions in the specified set are trivially dead, delete
965
/// them and see if this makes any of their operands subsequently dead.
966
static bool
967
103k
DeleteTriviallyDeadInstructions(SmallVectorImpl<WeakTrackingVH> &DeadInsts) {
968
103k
  bool Changed = false;
969
103k
970
930k
  while (!DeadInsts.empty()) {
971
826k
    Value *V = DeadInsts.pop_back_val();
972
826k
    Instruction *I = dyn_cast_or_null<Instruction>(V);
973
826k
974
826k
    if (!I || 
!isInstructionTriviallyDead(I)776k
)
975
249k
      continue;
976
576k
977
576k
    for (Use &O : I->operands())
978
1.12M
      if (Instruction *U = dyn_cast<Instruction>(O)) {
979
679k
        O = nullptr;
980
679k
        if (U->use_empty())
981
245k
          DeadInsts.emplace_back(U);
982
679k
      }
983
576k
984
576k
    I->eraseFromParent();
985
576k
    Changed = true;
986
576k
  }
987
103k
988
103k
  return Changed;
989
103k
}
990
991
namespace {
992
993
class LSRUse;
994
995
} // end anonymous namespace
996
997
/// Check if the addressing mode defined by \p F is completely
998
/// folded in \p LU at isel time.
999
/// This includes address-mode folding and special icmp tricks.
1000
/// This function returns true if \p LU can accommodate what \p F
1001
/// defines and up to 1 base + 1 scaled + offset.
1002
/// In other words, if \p F has several base registers, this function may
1003
/// still return true. Therefore, users still need to account for
1004
/// additional base registers and/or unfolded offsets to derive an
1005
/// accurate cost model.
1006
static bool isAMCompletelyFolded(const TargetTransformInfo &TTI,
1007
                                 const LSRUse &LU, const Formula &F);
1008
1009
// Get the cost of the scaling factor used in F for LU.
1010
static unsigned getScalingFactorCost(const TargetTransformInfo &TTI,
1011
                                     const LSRUse &LU, const Formula &F,
1012
                                     const Loop &L);
1013
1014
namespace {
1015
1016
/// This class is used to measure and compare candidate formulae.
1017
class Cost {
1018
  const Loop *L = nullptr;
1019
  ScalarEvolution *SE = nullptr;
1020
  const TargetTransformInfo *TTI = nullptr;
1021
  TargetTransformInfo::LSRCost C;
1022
1023
public:
1024
  Cost() = delete;
1025
  Cost(const Loop *L, ScalarEvolution &SE, const TargetTransformInfo &TTI) :
1026
6.66M
    L(L), SE(&SE), TTI(&TTI) {
1027
6.66M
    C.Insns = 0;
1028
6.66M
    C.NumRegs = 0;
1029
6.66M
    C.AddRecCost = 0;
1030
6.66M
    C.NumIVMuls = 0;
1031
6.66M
    C.NumBaseAdds = 0;
1032
6.66M
    C.ImmCost = 0;
1033
6.66M
    C.SetupCost = 0;
1034
6.66M
    C.ScaleCost = 0;
1035
6.66M
  }
1036
1037
  bool isLess(Cost &Other);
1038
1039
  void Lose();
1040
1041
#ifndef NDEBUG
1042
  // Once any of the metrics loses, they must all remain losers.
1043
  bool isValid() {
1044
    return ((C.Insns | C.NumRegs | C.AddRecCost | C.NumIVMuls | C.NumBaseAdds
1045
             | C.ImmCost | C.SetupCost | C.ScaleCost) != ~0u)
1046
      || ((C.Insns & C.NumRegs & C.AddRecCost & C.NumIVMuls & C.NumBaseAdds
1047
           & C.ImmCost & C.SetupCost & C.ScaleCost) == ~0u);
1048
  }
1049
#endif
1050
1051
30.1M
  bool isLoser() {
1052
30.1M
    assert(isValid() && "invalid cost");
1053
30.1M
    return C.NumRegs == ~0u;
1054
30.1M
  }
1055
1056
  void RateFormula(const Formula &F,
1057
                   SmallPtrSetImpl<const SCEV *> &Regs,
1058
                   const DenseSet<const SCEV *> &VisitedRegs,
1059
                   const LSRUse &LU,
1060
                   SmallPtrSetImpl<const SCEV *> *LoserRegs = nullptr);
1061
1062
  void print(raw_ostream &OS) const;
1063
  void dump() const;
1064
1065
private:
1066
  void RateRegister(const Formula &F, const SCEV *Reg,
1067
                    SmallPtrSetImpl<const SCEV *> &Regs);
1068
  void RatePrimaryRegister(const Formula &F, const SCEV *Reg,
1069
                           SmallPtrSetImpl<const SCEV *> &Regs,
1070
                           SmallPtrSetImpl<const SCEV *> *LoserRegs);
1071
};
1072
1073
/// An operand value in an instruction which is to be replaced with some
1074
/// equivalent, possibly strength-reduced, replacement.
1075
struct LSRFixup {
1076
  /// The instruction which will be updated.
1077
  Instruction *UserInst = nullptr;
1078
1079
  /// The operand of the instruction which will be replaced. The operand may be
1080
  /// used more than once; every instance will be replaced.
1081
  Value *OperandValToReplace = nullptr;
1082
1083
  /// If this user is to use the post-incremented value of an induction
1084
  /// variable, this set is non-empty and holds the loops associated with the
1085
  /// induction variable.
1086
  PostIncLoopSet PostIncLoops;
1087
1088
  /// A constant offset to be added to the LSRUse expression.  This allows
1089
  /// multiple fixups to share the same LSRUse with different offsets, for
1090
  /// example in an unrolled loop.
1091
  int64_t Offset = 0;
1092
1093
506k
  LSRFixup() = default;
1094
1095
  bool isUseFullyOutsideLoop(const Loop *L) const;
1096
1097
  void print(raw_ostream &OS) const;
1098
  void dump() const;
1099
};
1100
1101
/// A DenseMapInfo implementation for holding DenseMaps and DenseSets of sorted
1102
/// SmallVectors of const SCEV*.
1103
struct UniquifierDenseMapInfo {
1104
28.5M
  static SmallVector<const SCEV *, 4> getEmptyKey() {
1105
28.5M
    SmallVector<const SCEV *, 4>  V;
1106
28.5M
    V.push_back(reinterpret_cast<const SCEV *>(-1));
1107
28.5M
    return V;
1108
28.5M
  }
1109
1110
20.6M
  static SmallVector<const SCEV *, 4> getTombstoneKey() {
1111
20.6M
    SmallVector<const SCEV *, 4> V;
1112
20.6M
    V.push_back(reinterpret_cast<const SCEV *>(-2));
1113
20.6M
    return V;
1114
20.6M
  }
1115
1116
19.5M
  static unsigned getHashValue(const SmallVector<const SCEV *, 4> &V) {
1117
19.5M
    return static_cast<unsigned>(hash_combine_range(V.begin(), V.end()));
1118
19.5M
  }
1119
1120
  static bool isEqual(const SmallVector<const SCEV *, 4> &LHS,
1121
124M
                      const SmallVector<const SCEV *, 4> &RHS) {
1122
124M
    return LHS == RHS;
1123
124M
  }
1124
};
1125
1126
/// This class holds the state that LSR keeps for each use in IVUsers, as well
1127
/// as uses invented by LSR itself. It includes information about what kinds of
1128
/// things can be folded into the user, information about the user itself, and
1129
/// information about how the use may be satisfied.  TODO: Represent multiple
1130
/// users of the same expression in common?
1131
class LSRUse {
1132
  DenseSet<SmallVector<const SCEV *, 4>, UniquifierDenseMapInfo> Uniquifier;
1133
1134
public:
1135
  /// An enum for a kind of use, indicating what types of scaled and immediate
1136
  /// operands it might support.
1137
  enum KindType {
1138
    Basic,   ///< A normal use, with no folding.
1139
    Special, ///< A special case of basic, allowing -1 scales.
1140
    Address, ///< An address use; folding according to TargetLowering
1141
    ICmpZero ///< An equality icmp with both operands folded into one.
1142
    // TODO: Add a generic icmp too?
1143
  };
1144
1145
  using SCEVUseKindPair = PointerIntPair<const SCEV *, 2, KindType>;
1146
1147
  KindType Kind;
1148
  MemAccessTy AccessTy;
1149
1150
  /// The list of operands which are to be replaced.
1151
  SmallVector<LSRFixup, 8> Fixups;
1152
1153
  /// Keep track of the min and max offsets of the fixups.
1154
  int64_t MinOffset = std::numeric_limits<int64_t>::max();
1155
  int64_t MaxOffset = std::numeric_limits<int64_t>::min();
1156
1157
  /// This records whether all of the fixups using this LSRUse are outside of
1158
  /// the loop, in which case some special-case heuristics may be used.
1159
  bool AllFixupsOutsideLoop = true;
1160
1161
  /// RigidFormula is set to true to guarantee that this use will be associated
1162
  /// with a single formula--the one that initially matched. Some SCEV
1163
  /// expressions cannot be expanded. This allows LSR to consider the registers
1164
  /// used by those expressions without the need to expand them later after
1165
  /// changing the formula.
1166
  bool RigidFormula = false;
1167
1168
  /// This records the widest use type for any fixup using this
1169
  /// LSRUse. FindUseWithSimilarFormula can't consider uses with different max
1170
  /// fixup widths to be equivalent, because the narrower one may be relying on
1171
  /// the implicit truncation to truncate away bogus bits.
1172
  Type *WidestFixupType = nullptr;
1173
1174
  /// A list of ways to build a value that can satisfy this user.  After the
1175
  /// list is populated, one of these is selected heuristically and used to
1176
  /// formulate a replacement for OperandValToReplace in UserInst.
1177
  SmallVector<Formula, 12> Formulae;
1178
1179
  /// The set of register candidates used by all formulae in this LSRUse.
1180
  SmallPtrSet<const SCEV *, 4> Regs;
1181
1182
456k
  LSRUse(KindType K, MemAccessTy AT) : Kind(K), AccessTy(AT) {}
1183
1184
506k
  LSRFixup &getNewFixup() {
1185
506k
    Fixups.push_back(LSRFixup());
1186
506k
    return Fixups.back();
1187
506k
  }
1188
1189
145k
  void pushFixup(LSRFixup &f) {
1190
145k
    Fixups.push_back(f);
1191
145k
    if (f.Offset > MaxOffset)
1192
768
      MaxOffset = f.Offset;
1193
145k
    if (f.Offset < MinOffset)
1194
586
      MinOffset = f.Offset;
1195
145k
  }
1196
1197
  bool HasFormulaWithSameRegs(const Formula &F) const;
1198
  float getNotSelectedProbability(const SCEV *Reg) const;
1199
  bool InsertFormula(const Formula &F, const Loop &L);
1200
  void DeleteFormula(Formula &F);
1201
  void RecomputeRegs(size_t LUIdx, RegUseTracker &Reguses);
1202
1203
  void print(raw_ostream &OS) const;
1204
  void dump() const;
1205
};
1206
1207
} // end anonymous namespace
1208
1209
static bool isAMCompletelyFolded(const TargetTransformInfo &TTI,
1210
                                 LSRUse::KindType Kind, MemAccessTy AccessTy,
1211
                                 GlobalValue *BaseGV, int64_t BaseOffset,
1212
                                 bool HasBaseReg, int64_t Scale,
1213
                                 Instruction *Fixup = nullptr);
1214
1215
60.7M
static unsigned getSetupCost(const SCEV *Reg, unsigned Depth) {
1216
60.7M
  if (isa<SCEVUnknown>(Reg) || 
isa<SCEVConstant>(Reg)44.6M
)
1217
32.8M
    return 1;
1218
27.8M
  if (Depth == 0)
1219
1.04M
    return 0;
1220
26.8M
  if (const auto *S = dyn_cast<SCEVAddRecExpr>(Reg))
1221
8.86M
    return getSetupCost(S->getStart(), Depth - 1);
1222
17.9M
  if (auto S = dyn_cast<SCEVCastExpr>(Reg))
1223
4.43M
    return getSetupCost(S->getOperand(), Depth - 1);
1224
13.5M
  if (auto S = dyn_cast<SCEVNAryExpr>(Reg))
1225
12.8M
    return std::accumulate(S->op_begin(), S->op_end(), 0,
1226
30.8M
                           [&](unsigned i, const SCEV *Reg) {
1227
30.8M
                             return i + getSetupCost(Reg, Depth - 1);
1228
30.8M
                           });
1229
697k
  if (auto S = dyn_cast<SCEVUDivExpr>(Reg))
1230
697k
    return getSetupCost(S->getLHS(), Depth - 1) +
1231
697k
           getSetupCost(S->getRHS(), Depth - 1);
1232
0
  return 0;
1233
0
}
1234
1235
/// Tally up interesting quantities from the given register.
1236
void Cost::RateRegister(const Formula &F, const SCEV *Reg,
1237
16.3M
                        SmallPtrSetImpl<const SCEV *> &Regs) {
1238
16.3M
  if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(Reg)) {
1239
8.97M
    // If this is an addrec for another loop, it should be an invariant
1240
8.97M
    // with respect to L since L is the innermost loop (at least
1241
8.97M
    // for now LSR only handles innermost loops).
1242
8.97M
    if (AR->getLoop() != L) {
1243
1.10M
      // If the AddRec exists, consider it's register free and leave it alone.
1244
1.10M
      if (isExistingPhi(AR, *SE))
1245
101k
        return;
1246
1.00M
1247
1.00M
      // It is bad to allow LSR for current loop to add induction variables
1248
1.00M
      // for its sibling loops.
1249
1.00M
      if (!AR->getLoop()->contains(L)) {
1250
15.9k
        Lose();
1251
15.9k
        return;
1252
15.9k
      }
1253
989k
1254
989k
      // Otherwise, it will be an invariant with respect to Loop L.
1255
989k
      ++C.NumRegs;
1256
989k
      return;
1257
989k
    }
1258
7.87M
1259
7.87M
    unsigned LoopCost = 1;
1260
7.87M
    if (TTI->isIndexedLoadLegal(TTI->MIM_PostInc, AR->getType()) ||
1261
7.87M
        
TTI->isIndexedStoreLegal(TTI->MIM_PostInc, AR->getType())727k
) {
1262
7.14M
1263
7.14M
      // If the step size matches the base offset, we could use pre-indexed
1264
7.14M
      // addressing.
1265
7.14M
      if (TTI->shouldFavorBackedgeIndex(L)) {
1266
58.3k
        if (auto *Step = dyn_cast<SCEVConstant>(AR->getStepRecurrence(*SE)))
1267
58.3k
          if (Step->getAPInt() == F.BaseOffset)
1268
12.5k
            LoopCost = 0;
1269
58.3k
      }
1270
7.14M
1271
7.14M
      if (TTI->shouldFavorPostInc()) {
1272
7.78k
        const SCEV *LoopStep = AR->getStepRecurrence(*SE);
1273
7.78k
        if (isa<SCEVConstant>(LoopStep)) {
1274
7.39k
          const SCEV *LoopStart = AR->getStart();
1275
7.39k
          if (!isa<SCEVConstant>(LoopStart) &&
1276
7.39k
              
SE->isLoopInvariant(LoopStart, L)3.67k
)
1277
3.67k
            LoopCost = 0;
1278
7.39k
        }
1279
7.78k
      }
1280
7.14M
    }
1281
7.87M
    C.AddRecCost += LoopCost;
1282
7.87M
1283
7.87M
    // Add the step value register, if it needs one.
1284
7.87M
    // TODO: The non-affine case isn't precisely modeled here.
1285
7.87M
    if (!AR->isAffine() || 
!isa<SCEVConstant>(AR->getOperand(1))7.87M
) {
1286
650k
      if (!Regs.count(AR->getOperand(1))) {
1287
649k
        RateRegister(F, AR->getOperand(1), Regs);
1288
649k
        if (isLoser())
1289
3
          return;
1290
15.2M
      }
1291
650k
    }
1292
7.87M
  }
1293
15.2M
  ++C.NumRegs;
1294
15.2M
1295
15.2M
  // Rough heuristic; favor registers which don't require extra setup
1296
15.2M
  // instructions in the preheader.
1297
15.2M
  C.SetupCost += getSetupCost(Reg, SetupCostDepthLimit);
1298
15.2M
  // Ensure we don't, even with the recusion limit, produce invalid costs.
1299
15.2M
  C.SetupCost = std::min<unsigned>(C.SetupCost, 1 << 16);
1300
15.2M
1301
15.2M
  C.NumIVMuls += isa<SCEVMulExpr>(Reg) &&
1302
15.2M
               
SE->hasComputableLoopEvolution(Reg, L)1.89M
;
1303
15.2M
}
1304
1305
/// Record this register in the set. If we haven't seen it before, rate
1306
/// it. Optional LoserRegs provides a way to declare any formula that refers to
1307
/// one of those regs an instant loser.
1308
void Cost::RatePrimaryRegister(const Formula &F, const SCEV *Reg,
1309
                               SmallPtrSetImpl<const SCEV *> &Regs,
1310
16.8M
                               SmallPtrSetImpl<const SCEV *> *LoserRegs) {
1311
16.8M
  if (LoserRegs && 
LoserRegs->count(Reg)8.43M
) {
1312
66.4k
    Lose();
1313
66.4k
    return;
1314
66.4k
  }
1315
16.8M
  if (Regs.insert(Reg).second) {
1316
15.6M
    RateRegister(F, Reg, Regs);
1317
15.6M
    if (LoserRegs && 
isLoser()8.36M
)
1318
15.9k
      LoserRegs->insert(Reg);
1319
15.6M
  }
1320
16.8M
}
1321
1322
void Cost::RateFormula(const Formula &F,
1323
                       SmallPtrSetImpl<const SCEV *> &Regs,
1324
                       const DenseSet<const SCEV *> &VisitedRegs,
1325
                       const LSRUse &LU,
1326
8.93M
                       SmallPtrSetImpl<const SCEV *> *LoserRegs) {
1327
8.93M
  assert(F.isCanonical(*L) && "Cost is accurate only for canonical formula");
1328
8.93M
  // Tally up the registers.
1329
8.93M
  unsigned PrevAddRecCost = C.AddRecCost;
1330
8.93M
  unsigned PrevNumRegs = C.NumRegs;
1331
8.93M
  unsigned PrevNumBaseAdds = C.NumBaseAdds;
1332
8.93M
  if (const SCEV *ScaledReg = F.ScaledReg) {
1333
6.03M
    if (VisitedRegs.count(ScaledReg)) {
1334
65.1k
      Lose();
1335
65.1k
      return;
1336
65.1k
    }
1337
5.96M
    RatePrimaryRegister(F, ScaledReg, Regs, LoserRegs);
1338
5.96M
    if (isLoser())
1339
14.7k
      return;
1340
8.85M
  }
1341
10.9M
  for (const SCEV *BaseReg : F.BaseRegs) {
1342
10.9M
    if (VisitedRegs.count(BaseReg)) {
1343
17.8k
      Lose();
1344
17.8k
      return;
1345
17.8k
    }
1346
10.9M
    RatePrimaryRegister(F, BaseReg, Regs, LoserRegs);
1347
10.9M
    if (isLoser())
1348
67.5k
      return;
1349
10.9M
  }
1350
8.85M
1351
8.85M
  // Determine how many (unfolded) adds we'll need inside the loop.
1352
8.85M
  size_t NumBaseParts = F.getNumRegs();
1353
8.76M
  if (NumBaseParts > 1)
1354
5.83M
    // Do not count the base and a possible second register if the target
1355
5.83M
    // allows folding 2 registers.
1356
5.83M
    C.NumBaseAdds +=
1357
5.83M
        NumBaseParts - (1 + (F.Scale && isAMCompletelyFolded(*TTI, LU, F)));
1358
8.76M
  C.NumBaseAdds += (F.UnfoldedOffset != 0);
1359
8.76M
1360
8.76M
  // Accumulate non-free scaling amounts.
1361
8.76M
  C.ScaleCost += getScalingFactorCost(*TTI, LU, F, *L);
1362
8.76M
1363
8.76M
  // Tally up the non-zero immediates.
1364
19.2M
  for (const LSRFixup &Fixup : LU.Fixups) {
1365
19.2M
    int64_t O = Fixup.Offset;
1366
19.2M
    int64_t Offset = (uint64_t)O + F.BaseOffset;
1367
19.2M
    if (F.BaseGV)
1368
1.62k
      C.ImmCost += 64; // Handle symbolic values conservatively.
1369
19.2M
                     // TODO: This should probably be the pointer size.
1370
19.2M
    else if (Offset != 0)
1371
11.4M
      C.ImmCost += APInt(64, Offset, true).getMinSignedBits();
1372
19.2M
1373
19.2M
    // Check with target if this offset with this instruction is
1374
19.2M
    // specifically not supported.
1375
19.2M
    if (LU.Kind == LSRUse::Address && Offset != 0 &&
1376
19.2M
        !isAMCompletelyFolded(*TTI, LSRUse::Address, LU.AccessTy, F.BaseGV,
1377
11.2M
                              Offset, F.HasBaseReg, F.Scale, Fixup.UserInst))
1378
6.80M
      C.NumBaseAdds++;
1379
19.2M
  }
1380
8.76M
1381
8.76M
  // If we don't count instruction cost exit here.
1382
8.76M
  if (!InsnsCost) {
1383
312
    assert(isValid() && "invalid cost");
1384
312
    return;
1385
312
  }
1386
8.76M
1387
8.76M
  // Treat every new register that exceeds TTI.getNumberOfRegisters() - 1 as
1388
8.76M
  // additional instruction (at least fill).
1389
8.76M
  unsigned TTIRegNum = TTI->getNumberOfRegisters(false) - 1;
1390
8.76M
  if (C.NumRegs > TTIRegNum) {
1391
31.3k
    // Cost already exceeded TTIRegNum, then only newly added register can add
1392
31.3k
    // new instructions.
1393
31.3k
    if (PrevNumRegs > TTIRegNum)
1394
23.5k
      C.Insns += (C.NumRegs - PrevNumRegs);
1395
7.84k
    else
1396
7.84k
      C.Insns += (C.NumRegs - TTIRegNum);
1397
31.3k
  }
1398
8.76M
1399
8.76M
  // If ICmpZero formula ends with not 0, it could not be replaced by
1400
8.76M
  // just add or sub. We'll need to compare final result of AddRec.
1401
8.76M
  // That means we'll need an additional instruction. But if the target can
1402
8.76M
  // macro-fuse a compare with a branch, don't count this extra instruction.
1403
8.76M
  // For -10 + {0, +, 1}:
1404
8.76M
  // i = i + 1;
1405
8.76M
  // cmp i, 10
1406
8.76M
  //
1407
8.76M
  // For {-10, +, 1}:
1408
8.76M
  // i = i + 1;
1409
8.76M
  if (LU.Kind == LSRUse::ICmpZero && !F.hasZeroEnd() &&
1410
8.76M
      !TTI->canMacroFuseCmp())
1411
771k
    C.Insns++;
1412
8.76M
  // Each new AddRec adds 1 instruction to calculation.
1413
8.76M
  C.Insns += (C.AddRecCost - PrevAddRecCost);
1414
8.76M
1415
8.76M
  // BaseAdds adds instructions for unfolded registers.
1416
8.76M
  if (LU.Kind != LSRUse::ICmpZero)
1417
7.17M
    C.Insns += C.NumBaseAdds - PrevNumBaseAdds;
1418
8.76M
  assert(isValid() && "invalid cost");
1419
8.76M
}
1420
1421
/// Set this cost to a losing value.
1422
268k
void Cost::Lose() {
1423
268k
  C.Insns = std::numeric_limits<unsigned>::max();
1424
268k
  C.NumRegs = std::numeric_limits<unsigned>::max();
1425
268k
  C.AddRecCost = std::numeric_limits<unsigned>::max();
1426
268k
  C.NumIVMuls = std::numeric_limits<unsigned>::max();
1427
268k
  C.NumBaseAdds = std::numeric_limits<unsigned>::max();
1428
268k
  C.ImmCost = std::numeric_limits<unsigned>::max();
1429
268k
  C.SetupCost = std::numeric_limits<unsigned>::max();
1430
268k
  C.ScaleCost = std::numeric_limits<unsigned>::max();
1431
268k
}
1432
1433
/// Choose the lower cost.
1434
4.65M
bool Cost::isLess(Cost &Other) {
1435
4.65M
  if (InsnsCost.getNumOccurrences() > 0 && InsnsCost &&
1436
4.65M
      C.Insns != Other.C.Insns)
1437
26
    return C.Insns < Other.C.Insns;
1438
4.65M
  return TTI->isLSRCostLess(C, Other.C);
1439
4.65M
}
1440
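If the target does not override it, TTI->isLSRCostLess typically reduces to a cheapest-first lexicographic comparison of the cost components tallied above. A minimal, self-contained sketch of that ordering, using a stand-in struct that mirrors the fields set in Cost::Lose(); this is an assumption about the common default, not the exact in-tree definition:

    #include <tuple>

    struct LSRCostSketch {   // stand-in mirroring the fields set in Cost::Lose()
      unsigned Insns, NumRegs, AddRecCost, NumIVMuls, NumBaseAdds,
               ImmCost, SetupCost, ScaleCost;
    };

    // Cheapest-first lexicographic ordering; Insns is only consulted when the
    // InsnsCost option explicitly requests it, mirroring Cost::isLess() above.
    bool lexicographicallyLess(const LSRCostSketch &A, const LSRCostSketch &B) {
      return std::tie(A.NumRegs, A.AddRecCost, A.NumIVMuls, A.NumBaseAdds,
                      A.ScaleCost, A.ImmCost, A.SetupCost) <
             std::tie(B.NumRegs, B.AddRecCost, B.NumIVMuls, B.NumBaseAdds,
                      B.ScaleCost, B.ImmCost, B.SetupCost);
    }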
1441
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1442
void Cost::print(raw_ostream &OS) const {
1443
  if (InsnsCost)
1444
    OS << C.Insns << " instruction" << (C.Insns == 1 ? " " : "s ");
1445
  OS << C.NumRegs << " reg" << (C.NumRegs == 1 ? "" : "s");
1446
  if (C.AddRecCost != 0)
1447
    OS << ", with addrec cost " << C.AddRecCost;
1448
  if (C.NumIVMuls != 0)
1449
    OS << ", plus " << C.NumIVMuls << " IV mul"
1450
       << (C.NumIVMuls == 1 ? "" : "s");
1451
  if (C.NumBaseAdds != 0)
1452
    OS << ", plus " << C.NumBaseAdds << " base add"
1453
       << (C.NumBaseAdds == 1 ? "" : "s");
1454
  if (C.ScaleCost != 0)
1455
    OS << ", plus " << C.ScaleCost << " scale cost";
1456
  if (C.ImmCost != 0)
1457
    OS << ", plus " << C.ImmCost << " imm cost";
1458
  if (C.SetupCost != 0)
1459
    OS << ", plus " << C.SetupCost << " setup cost";
1460
}
1461
1462
LLVM_DUMP_METHOD void Cost::dump() const {
1463
  print(errs()); errs() << '\n';
1464
}
1465
#endif
1466
1467
/// Test whether this fixup always uses its value outside of the given loop.
1468
618k
bool LSRFixup::isUseFullyOutsideLoop(const Loop *L) const {
1469
618k
  // PHI nodes use their value in their incoming blocks.
1470
618k
  if (const PHINode *PN = dyn_cast<PHINode>(UserInst)) {
1471
134k
    for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i)
1472
96.7k
      if (PN->getIncomingValue(i) == OperandValToReplace &&
1473
96.7k
          L->contains(PN->getIncomingBlock(i)))
1474
2.07k
        return false;
1475
39.4k
    return true;
1476
578k
  }
1477
578k
1478
578k
  return !L->contains(UserInst);
1479
578k
}
1480
1481
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1482
void LSRFixup::print(raw_ostream &OS) const {
1483
  OS << "UserInst=";
1484
  // Store is common and interesting enough to be worth special-casing.
1485
  if (StoreInst *Store = dyn_cast<StoreInst>(UserInst)) {
1486
    OS << "store ";
1487
    Store->getOperand(0)->printAsOperand(OS, /*PrintType=*/false);
1488
  } else if (UserInst->getType()->isVoidTy())
1489
    OS << UserInst->getOpcodeName();
1490
  else
1491
    UserInst->printAsOperand(OS, /*PrintType=*/false);
1492
1493
  OS << ", OperandValToReplace=";
1494
  OperandValToReplace->printAsOperand(OS, /*PrintType=*/false);
1495
1496
  for (const Loop *PIL : PostIncLoops) {
1497
    OS << ", PostIncLoop=";
1498
    PIL->getHeader()->printAsOperand(OS, /*PrintType=*/false);
1499
  }
1500
1501
  if (Offset != 0)
1502
    OS << ", Offset=" << Offset;
1503
}
1504
1505
LLVM_DUMP_METHOD void LSRFixup::dump() const {
1506
  print(errs()); errs() << '\n';
1507
}
1508
#endif
1509
1510
/// Test whether this use has a formula with the same registers as the given
1511
/// formula.
1512
2.13M
bool LSRUse::HasFormulaWithSameRegs(const Formula &F) const {
1513
2.13M
  SmallVector<const SCEV *, 4> Key = F.BaseRegs;
1514
2.13M
  if (F.ScaledReg) Key.push_back(F.ScaledReg);
1515
2.13M
  // Unstable sort by host order ok, because this is only used for uniquifying.
1516
2.13M
  llvm::sort(Key);
1517
2.13M
  return Uniquifier.count(Key);
1518
2.13M
}
1519
1520
/// The function returns a probability of selecting formula without Reg.
1521
0
float LSRUse::getNotSelectedProbability(const SCEV *Reg) const {
1522
0
  unsigned FNum = 0;
1523
0
  for (const Formula &F : Formulae)
1524
0
    if (F.referencesReg(Reg))
1525
0
      FNum++;
1526
0
  return ((float)(Formulae.size() - FNum)) / Formulae.size();
1527
0
}
1528
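For example, if a use has five formulae and two of them reference Reg, the function returns (5 - 2) / 5 = 0.6: three out of five formulae avoid Reg.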
1529
/// If the given formula has not yet been inserted, add it to the list, and
1530
/// return true. Return false otherwise.  The formula must be in canonical form.
1531
12.5M
bool LSRUse::InsertFormula(const Formula &F, const Loop &L) {
1532
12.5M
  assert(F.isCanonical(L) && "Invalid canonical representation");
1533
12.5M
1534
12.5M
  if (!Formulae.empty() && RigidFormula)
1535
447
    return false;
1536
12.5M
1537
12.5M
  SmallVector<const SCEV *, 4> Key = F.BaseRegs;
1538
12.5M
  if (F.ScaledReg) Key.push_back(F.ScaledReg);
1539
12.5M
  // Unstable sort by host order ok, because this is only used for uniquifying.
1540
12.5M
  llvm::sort(Key);
1541
12.5M
1542
12.5M
  if (!Uniquifier.insert(Key).second)
1543
8.42M
    return false;
1544
4.10M
1545
4.10M
  // Using a register to hold the value of 0 is not profitable.
1546
4.10M
  assert((!F.ScaledReg || !F.ScaledReg->isZero()) &&
1547
4.10M
         "Zero allocated in a scaled register!");
1548
#ifndef NDEBUG
1549
  for (const SCEV *BaseReg : F.BaseRegs)
1550
    assert(!BaseReg->isZero() && "Zero allocated in a base register!");
1551
#endif
1552
1553
4.10M
  // Add the formula to the list.
1554
4.10M
  Formulae.push_back(F);
1555
4.10M
1556
4.10M
  // Record registers now being used by this use.
1557
4.10M
  Regs.insert(F.BaseRegs.begin(), F.BaseRegs.end());
1558
4.10M
  if (F.ScaledReg)
1559
2.91M
    Regs.insert(F.ScaledReg);
1560
4.10M
1561
4.10M
  return true;
1562
4.10M
}
1563
1564
/// Remove the given formula from this use's list.
1565
1.13M
void LSRUse::DeleteFormula(Formula &F) {
1566
1.13M
  if (&F != &Formulae.back())
1567
1.02M
    std::swap(F, Formulae.back());
1568
1.13M
  Formulae.pop_back();
1569
1.13M
}
1570
1571
/// Recompute the Regs field, and update RegUses.
1572
174k
void LSRUse::RecomputeRegs(size_t LUIdx, RegUseTracker &RegUses) {
1573
174k
  // Now that we've filtered out some formulae, recompute the Regs set.
1574
174k
  SmallPtrSet<const SCEV *, 4> OldRegs = std::move(Regs);
1575
174k
  Regs.clear();
1576
892k
  for (const Formula &F : Formulae) {
1577
892k
    if (F.ScaledReg) Regs.insert(F.ScaledReg);
1578
892k
    Regs.insert(F.BaseRegs.begin(), F.BaseRegs.end());
1579
892k
  }
1580
174k
1581
174k
  // Update the RegTracker.
1582
174k
  for (const SCEV *S : OldRegs)
1583
1.70M
    if (!Regs.count(S))
1584
720k
      RegUses.dropRegister(S, LUIdx);
1585
174k
}
1586
1587
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1588
void LSRUse::print(raw_ostream &OS) const {
1589
  OS << "LSR Use: Kind=";
1590
  switch (Kind) {
1591
  case Basic:    OS << "Basic"; break;
1592
  case Special:  OS << "Special"; break;
1593
  case ICmpZero: OS << "ICmpZero"; break;
1594
  case Address:
1595
    OS << "Address of ";
1596
    if (AccessTy.MemTy->isPointerTy())
1597
      OS << "pointer"; // the full pointer type could be really verbose
1598
    else {
1599
      OS << *AccessTy.MemTy;
1600
    }
1601
1602
    OS << " in addrspace(" << AccessTy.AddrSpace << ')';
1603
  }
1604
1605
  OS << ", Offsets={";
1606
  bool NeedComma = false;
1607
  for (const LSRFixup &Fixup : Fixups) {
1608
    if (NeedComma) OS << ',';
1609
    OS << Fixup.Offset;
1610
    NeedComma = true;
1611
  }
1612
  OS << '}';
1613
1614
  if (AllFixupsOutsideLoop)
1615
    OS << ", all-fixups-outside-loop";
1616
1617
  if (WidestFixupType)
1618
    OS << ", widest fixup type: " << *WidestFixupType;
1619
}
1620
1621
LLVM_DUMP_METHOD void LSRUse::dump() const {
1622
  print(errs()); errs() << '\n';
1623
}
1624
#endif
1625
1626
static bool isAMCompletelyFolded(const TargetTransformInfo &TTI,
1627
                                 LSRUse::KindType Kind, MemAccessTy AccessTy,
1628
                                 GlobalValue *BaseGV, int64_t BaseOffset,
1629
                                 bool HasBaseReg, int64_t Scale,
1630
63.8M
                                 Instruction *Fixup/*= nullptr*/) {
1631
63.8M
  switch (Kind) {
1632
63.8M
  case LSRUse::Address:
1633
53.2M
    return TTI.isLegalAddressingMode(AccessTy.MemTy, BaseGV, BaseOffset,
1634
53.2M
                                     HasBaseReg, Scale, AccessTy.AddrSpace, Fixup);
1635
63.8M
1636
63.8M
  case LSRUse::ICmpZero:
1637
6.96M
    // There's not even a target hook for querying whether it would be legal to
1638
6.96M
    // fold a GV into an ICmp.
1639
6.96M
    if (BaseGV)
1640
36
      return false;
1641
6.96M
1642
6.96M
    // ICmp only has two operands; don't allow more than two non-trivial parts.
1643
6.96M
    if (Scale != 0 && HasBaseReg && BaseOffset != 0)
1644
286k
      return false;
1645
6.67M
1646
6.67M
    // ICmp only supports no scale or a -1 scale, as we can "fold" a -1 scale by
1647
6.67M
    // putting the scaled register in the other operand of the icmp.
1648
6.67M
    if (Scale != 0 && Scale != -1)
1649
1.93M
      return false;
1650
4.73M
1651
4.73M
    // If we have low-level target information, ask the target if it can fold an
1652
4.73M
    // integer immediate on an icmp.
1653
4.73M
    if (BaseOffset != 0) {
1654
536k
      // We have one of:
1655
536k
      // ICmpZero     BaseReg + BaseOffset => ICmp BaseReg, -BaseOffset
1656
536k
      // ICmpZero -1*ScaleReg + BaseOffset => ICmp ScaleReg, BaseOffset
1657
536k
      // Offs is the ICmp immediate.
1658
536k
      if (Scale == 0)
1659
385k
        // The cast does the right thing with
1660
385k
        // std::numeric_limits<int64_t>::min().
1661
385k
        BaseOffset = -(uint64_t)BaseOffset;
1662
536k
      return TTI.isLegalICmpImmediate(BaseOffset);
1663
536k
    }
1664
4.19M
1665
4.19M
    // ICmpZero BaseReg + -1*ScaleReg => ICmp BaseReg, ScaleReg
1666
4.19M
    return true;
1667
4.19M
1668
4.19M
  case LSRUse::Basic:
1669
2.51M
    // Only handle single-register values.
1670
2.51M
    return !BaseGV && Scale == 0 && BaseOffset == 0;
1671
4.19M
1672
4.19M
  case LSRUse::Special:
1673
1.19M
    // Special case Basic to handle -1 scales.
1674
1.19M
    return !BaseGV && (Scale == 0 || Scale == -1) && BaseOffset == 0;
1675
0
  }
1676
0
1677
0
  llvm_unreachable("Invalid LSRUse Kind!");
1678
0
}
1679
1680
static bool isAMCompletelyFolded(const TargetTransformInfo &TTI,
1681
                                 int64_t MinOffset, int64_t MaxOffset,
1682
                                 LSRUse::KindType Kind, MemAccessTy AccessTy,
1683
                                 GlobalValue *BaseGV, int64_t BaseOffset,
1684
34.0M
                                 bool HasBaseReg, int64_t Scale) {
1685
34.0M
  // Check for overflow.
1686
34.0M
  if (((int64_t)((uint64_t)BaseOffset + MinOffset) > BaseOffset) !=
1687
34.0M
      (MinOffset > 0))
1688
0
    return false;
1689
34.0M
  MinOffset = (uint64_t)BaseOffset + MinOffset;
1690
34.0M
  if (((int64_t)((uint64_t)BaseOffset + MaxOffset) > BaseOffset) !=
1691
34.0M
      (MaxOffset > 0))
1692
0
    return false;
1693
34.0M
  MaxOffset = (uint64_t)BaseOffset + MaxOffset;
1694
34.0M
1695
34.0M
  return isAMCompletelyFolded(TTI, Kind, AccessTy, BaseGV, MinOffset,
1696
34.0M
                              HasBaseReg, Scale) &&
1697
34.0M
         isAMCompletelyFolded(TTI, Kind, AccessTy, BaseGV, MaxOffset,
1698
18.3M
                              HasBaseReg, Scale);
1699
34.0M
}
1700
1701
static bool isAMCompletelyFolded(const TargetTransformInfo &TTI,
1702
                                 int64_t MinOffset, int64_t MaxOffset,
1703
                                 LSRUse::KindType Kind, MemAccessTy AccessTy,
1704
5.88M
                                 const Formula &F, const Loop &L) {
1705
5.88M
  // For the purpose of isAMCompletelyFolded either having a canonical formula
1706
5.88M
  // or a scale not equal to zero is correct.
1707
5.88M
  // Problems may arise from non canonical formulae having a scale == 0.
1708
5.88M
  // Strictly speaking it would be best to just rely on canonical formulae.
1709
5.88M
  // However, when we generate the scaled formulae, we first check that the
1710
5.88M
  // scaling factor is profitable before computing the actual ScaledReg for
1711
5.88M
  // compile time sake.
1712
5.88M
  assert((F.isCanonical(L) || F.Scale != 0));
1713
5.88M
  return isAMCompletelyFolded(TTI, MinOffset, MaxOffset, Kind, AccessTy,
1714
5.88M
                              F.BaseGV, F.BaseOffset, F.HasBaseReg, F.Scale);
1715
5.88M
}
1716
1717
/// Test whether we know how to expand the current formula.
1718
static bool isLegalUse(const TargetTransformInfo &TTI, int64_t MinOffset,
1719
                       int64_t MaxOffset, LSRUse::KindType Kind,
1720
                       MemAccessTy AccessTy, GlobalValue *BaseGV,
1721
17.6M
                       int64_t BaseOffset, bool HasBaseReg, int64_t Scale) {
1722
17.6M
  // We know how to expand completely foldable formulae.
1723
17.6M
  return isAMCompletelyFolded(TTI, MinOffset, MaxOffset, Kind, AccessTy, BaseGV,
1724
17.6M
                              BaseOffset, HasBaseReg, Scale) ||
1725
17.6M
         // Or formulae that use a base register produced by a sum of base
1726
17.6M
         // registers.
1727
17.6M
         (Scale == 1 &&
1728
10.5M
          isAMCompletelyFolded(TTI, MinOffset, MaxOffset, Kind, AccessTy,
1729
2.81M
                               BaseGV, BaseOffset, true, 0));
1730
17.6M
}
1731
1732
static bool isLegalUse(const TargetTransformInfo &TTI, int64_t MinOffset,
1733
                       int64_t MaxOffset, LSRUse::KindType Kind,
1734
17.6M
                       MemAccessTy AccessTy, const Formula &F) {
1735
17.6M
  return isLegalUse(TTI, MinOffset, MaxOffset, Kind, AccessTy, F.BaseGV,
1736
17.6M
                    F.BaseOffset, F.HasBaseReg, F.Scale);
1737
17.6M
}
1738
1739
static bool isAMCompletelyFolded(const TargetTransformInfo &TTI,
1740
5.93M
                                 const LSRUse &LU, const Formula &F) {
1741
5.93M
  // Target may want to look at the user instructions.
1742
5.93M
  if (LU.Kind == LSRUse::Address && TTI.LSRWithInstrQueries()) {
1743
360
    for (const LSRFixup &Fixup : LU.Fixups)
1744
751
      if (!isAMCompletelyFolded(TTI, LSRUse::Address, LU.AccessTy, F.BaseGV,
1745
751
                                (F.BaseOffset + Fixup.Offset), F.HasBaseReg,
1746
751
                                F.Scale, Fixup.UserInst))
1747
192
        return false;
1748
360
    return true;
1749
5.93M
  }
1750
5.93M
1751
5.93M
  return isAMCompletelyFolded(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind,
1752
5.93M
                              LU.AccessTy, F.BaseGV, F.BaseOffset, F.HasBaseReg,
1753
5.93M
                              F.Scale);
1754
5.93M
}
1755
1756
static unsigned getScalingFactorCost(const TargetTransformInfo &TTI,
1757
                                     const LSRUse &LU, const Formula &F,
1758
8.76M
                                     const Loop &L) {
1759
8.76M
  if (!F.Scale)
1760
2.88M
    return 0;
1761
5.88M
1762
5.88M
  // If the use is not completely folded in that instruction, we will have to
1763
5.88M
  // pay an extra cost only for scale != 1.
1764
5.88M
  if (!isAMCompletelyFolded(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind,
1765
5.88M
                            LU.AccessTy, F, L))
1766
2.26M
    return F.Scale != 1;
1767
3.61M
1768
3.61M
  switch (LU.Kind) {
1769
3.61M
  case LSRUse::Address: {
1770
3.04M
    // Check the scaling factor cost with both the min and max offsets.
1771
3.04M
    int ScaleCostMinOffset = TTI.getScalingFactorCost(
1772
3.04M
        LU.AccessTy.MemTy, F.BaseGV, F.BaseOffset + LU.MinOffset, F.HasBaseReg,
1773
3.04M
        F.Scale, LU.AccessTy.AddrSpace);
1774
3.04M
    int ScaleCostMaxOffset = TTI.getScalingFactorCost(
1775
3.04M
        LU.AccessTy.MemTy, F.BaseGV, F.BaseOffset + LU.MaxOffset, F.HasBaseReg,
1776
3.04M
        F.Scale, LU.AccessTy.AddrSpace);
1777
3.04M
1778
3.04M
    assert(ScaleCostMinOffset >= 0 && ScaleCostMaxOffset >= 0 &&
1779
3.04M
           "Legal addressing mode has an illegal cost!");
1780
3.04M
    return std::max(ScaleCostMinOffset, ScaleCostMaxOffset);
1781
3.61M
  }
1782
3.61M
  case LSRUse::ICmpZero:
1783
572k
  case LSRUse::Basic:
1784
572k
  case LSRUse::Special:
1785
572k
    // The use is completely folded, i.e., everything is folded into the
1786
572k
    // instruction.
1787
572k
    return 0;
1788
0
  }
1789
0
1790
0
  llvm_unreachable("Invalid LSRUse Kind!");
1791
0
}
1792
1793
static bool isAlwaysFoldable(const TargetTransformInfo &TTI,
1794
                             LSRUse::KindType Kind, MemAccessTy AccessTy,
1795
                             GlobalValue *BaseGV, int64_t BaseOffset,
1796
538k
                             bool HasBaseReg) {
1797
538k
  // Fast-path: zero is always foldable.
1798
538k
  if (BaseOffset == 0 && !BaseGV) return true;
1799
318k
1800
318k
  // Conservatively, create an address with an immediate and a
1801
318k
  // base and a scale.
1802
318k
  int64_t Scale = Kind == LSRUse::ICmpZero ? -1 : 1;
1803
318k
1804
318k
  // Canonicalize a scale of 1 to a base register if the formula doesn't
1805
318k
  // already have a base register.
1806
318k
  if (!HasBaseReg && Scale == 1) {
1807
17.5k
    Scale = 0;
1808
17.5k
    HasBaseReg = true;
1809
17.5k
  }
1810
318k
1811
318k
  return isAMCompletelyFolded(TTI, Kind, AccessTy, BaseGV, BaseOffset,
1812
318k
                              HasBaseReg, Scale);
1813
318k
}
1814
1815
static bool isAlwaysFoldable(const TargetTransformInfo &TTI,
1816
                             ScalarEvolution &SE, int64_t MinOffset,
1817
                             int64_t MaxOffset, LSRUse::KindType Kind,
1818
                             MemAccessTy AccessTy, const SCEV *S,
1819
5.20M
                             bool HasBaseReg) {
1820
5.20M
  // Fast-path: zero is always foldable.
1821
5.20M
  if (S->isZero()) return true;
1822
5.20M
1823
5.20M
  // Conservatively, create an address with an immediate and a
1824
5.20M
  // base and a scale.
1825
5.20M
  int64_t BaseOffset = ExtractImmediate(S, SE);
1826
5.20M
  GlobalValue *BaseGV = ExtractSymbol(S, SE);
1827
5.20M
1828
5.20M
  // If there's anything else involved, it's not foldable.
1829
5.20M
  if (!S->isZero()) return false;
1830
1.77M
1831
1.77M
  // Fast-path: zero is always foldable.
1832
1.77M
  if (BaseOffset == 0 && !BaseGV) return true;
1833
1.77M
1834
1.77M
  // Conservatively, create an address with an immediate and a
1835
1.77M
  // base and a scale.
1836
1.77M
  int64_t Scale = Kind == LSRUse::ICmpZero ? -1 : 1;
1837
1.77M
1838
1.77M
  return isAMCompletelyFolded(TTI, MinOffset, MaxOffset, Kind, AccessTy, BaseGV,
1839
1.77M
                              BaseOffset, HasBaseReg, Scale);
1840
1.77M
}
1841
1842
namespace {
1843
1844
/// An individual increment in a Chain of IV increments.  Relate an IV user to
1845
/// an expression that computes the IV it uses from the IV used by the previous
1846
/// link in the Chain.
1847
///
1848
/// For the head of a chain, IncExpr holds the absolute SCEV expression for the
1849
/// original IVOperand. The head of the chain's IVOperand is only valid during
1850
/// chain collection, before LSR replaces IV users. During chain generation,
1851
/// IncExpr can be used to find the new IVOperand that computes the same
1852
/// expression.
1853
struct IVInc {
1854
  Instruction *UserInst;
1855
  Value* IVOperand;
1856
  const SCEV *IncExpr;
1857
1858
  IVInc(Instruction *U, Value *O, const SCEV *E)
1859
523k
      : UserInst(U), IVOperand(O), IncExpr(E) {}
1860
};
1861
1862
// The list of IV increments in program order.  We typically add the head of a
1863
// chain without finding subsequent links.
1864
struct IVChain {
1865
  SmallVector<IVInc, 1> Incs;
1866
  const SCEV *ExprBase = nullptr;
1867
1868
0
  IVChain() = default;
1869
  IVChain(const IVInc &Head, const SCEV *Base)
1870
193k
      : Incs(1, Head), ExprBase(Base) {}
1871
1872
  using const_iterator = SmallVectorImpl<IVInc>::const_iterator;
1873
1874
  // Return the first increment in the chain.
1875
135k
  const_iterator begin() const {
1876
135k
    assert(!Incs.empty());
1877
135k
    return std::next(Incs.begin());
1878
135k
  }
1879
135k
  const_iterator end() const {
1880
135k
    return Incs.end();
1881
135k
  }
1882
1883
  // Returns true if this chain contains any increments.
1884
193k
  bool hasIncs() const { return Incs.size() >= 2; }
1885
1886
  // Add an IVInc to the end of this chain.
1887
330k
  void add(const IVInc &X) { Incs.push_back(X); }
1888
1889
  // Returns the last UserInst in the chain.
1890
348k
  Instruction *tailUserInst() const { return Incs.back().UserInst; }
1891
1892
  // Returns true if IncExpr can be profitably added to this chain.
1893
  bool isProfitableIncrement(const SCEV *OperExpr,
1894
                             const SCEV *IncExpr,
1895
                             ScalarEvolution&);
1896
};
1897
1898
/// Helper for CollectChains to track multiple IV increment uses.  Distinguish
1899
/// between FarUsers that definitely cross IV increments and NearUsers that may
1900
/// be used between IV increments.
1901
struct ChainUsers {
1902
  SmallPtrSet<Instruction*, 4> FarUsers;
1903
  SmallPtrSet<Instruction*, 4> NearUsers;
1904
};
1905
1906
/// This class holds state for the main loop strength reduction logic.
1907
class LSRInstance {
1908
  IVUsers &IU;
1909
  ScalarEvolution &SE;
1910
  DominatorTree &DT;
1911
  LoopInfo &LI;
1912
  AssumptionCache &AC;
1913
  TargetLibraryInfo &LibInfo;
1914
  const TargetTransformInfo &TTI;
1915
  Loop *const L;
1916
  bool FavorBackedgeIndex = false;
1917
  bool Changed = false;
1918
1919
  /// This is the insert position that the current loop's induction variable
1920
  /// increment should be placed. In simple loops, this is the latch block's
1921
  /// terminator. But in more complicated cases, this is a position which will
1922
  /// dominate all the in-loop post-increment users.
1923
  Instruction *IVIncInsertPos = nullptr;
1924
1925
  /// Interesting factors between use strides.
1926
  ///
1927
  /// We explicitly use a SetVector which contains a SmallSet, instead of the
1928
  /// default, a SmallDenseSet, because we need to use the full range of
1929
  /// int64_ts, and there's currently no good way of doing that with
1930
  /// SmallDenseSet.
1931
  SetVector<int64_t, SmallVector<int64_t, 8>, SmallSet<int64_t, 8>> Factors;
1932
1933
  /// Interesting use types, to facilitate truncation reuse.
1934
  SmallSetVector<Type *, 4> Types;
1935
1936
  /// The list of interesting uses.
1937
  mutable SmallVector<LSRUse, 16> Uses;
1938
1939
  /// Track which uses use which register candidates.
1940
  RegUseTracker RegUses;
1941
1942
  // Limit the number of chains to avoid quadratic behavior. We don't expect to
1943
  // have more than a few IV increment chains in a loop. Missing a Chain falls
1944
  // back to normal LSR behavior for those uses.
1945
  static const unsigned MaxChains = 8;
1946
1947
  /// IV users can form a chain of IV increments.
1948
  SmallVector<IVChain, MaxChains> IVChainVec;
1949
1950
  /// IV users that belong to profitable IVChains.
1951
  SmallPtrSet<Use*, MaxChains> IVIncSet;
1952
1953
  void OptimizeShadowIV();
1954
  bool FindIVUserForCond(ICmpInst *Cond, IVStrideUse *&CondUse);
1955
  ICmpInst *OptimizeMax(ICmpInst *Cond, IVStrideUse* &CondUse);
1956
  void OptimizeLoopTermCond();
1957
1958
  void ChainInstruction(Instruction *UserInst, Instruction *IVOper,
1959
                        SmallVectorImpl<ChainUsers> &ChainUsersVec);
1960
  void FinalizeChain(IVChain &Chain);
1961
  void CollectChains();
1962
  void GenerateIVChain(const IVChain &Chain, SCEVExpander &Rewriter,
1963
                       SmallVectorImpl<WeakTrackingVH> &DeadInsts);
1964
1965
  void CollectInterestingTypesAndFactors();
1966
  void CollectFixupsAndInitialFormulae();
1967
1968
  // Support for sharing of LSRUses between LSRFixups.
1969
  using UseMapTy = DenseMap<LSRUse::SCEVUseKindPair, size_t>;
1970
  UseMapTy UseMap;
1971
1972
  bool reconcileNewOffset(LSRUse &LU, int64_t NewOffset, bool HasBaseReg,
1973
                          LSRUse::KindType Kind, MemAccessTy AccessTy);
1974
1975
  std::pair<size_t, int64_t> getUse(const SCEV *&Expr, LSRUse::KindType Kind,
1976
                                    MemAccessTy AccessTy);
1977
1978
  void DeleteUse(LSRUse &LU, size_t LUIdx);
1979
1980
  LSRUse *FindUseWithSimilarFormula(const Formula &F, const LSRUse &OrigLU);
1981
1982
  void InsertInitialFormula(const SCEV *S, LSRUse &LU, size_t LUIdx);
1983
  void InsertSupplementalFormula(const SCEV *S, LSRUse &LU, size_t LUIdx);
1984
  void CountRegisters(const Formula &F, size_t LUIdx);
1985
  bool InsertFormula(LSRUse &LU, unsigned LUIdx, const Formula &F);
1986
1987
  void CollectLoopInvariantFixupsAndFormulae();
1988
1989
  void GenerateReassociations(LSRUse &LU, unsigned LUIdx, Formula Base,
1990
                              unsigned Depth = 0);
1991
1992
  void GenerateReassociationsImpl(LSRUse &LU, unsigned LUIdx,
1993
                                  const Formula &Base, unsigned Depth,
1994
                                  size_t Idx, bool IsScaledReg = false);
1995
  void GenerateCombinations(LSRUse &LU, unsigned LUIdx, Formula Base);
1996
  void GenerateSymbolicOffsetsImpl(LSRUse &LU, unsigned LUIdx,
1997
                                   const Formula &Base, size_t Idx,
1998
                                   bool IsScaledReg = false);
1999
  void GenerateSymbolicOffsets(LSRUse &LU, unsigned LUIdx, Formula Base);
2000
  void GenerateConstantOffsetsImpl(LSRUse &LU, unsigned LUIdx,
2001
                                   const Formula &Base,
2002
                                   const SmallVectorImpl<int64_t> &Worklist,
2003
                                   size_t Idx, bool IsScaledReg = false);
2004
  void GenerateConstantOffsets(LSRUse &LU, unsigned LUIdx, Formula Base);
2005
  void GenerateICmpZeroScales(LSRUse &LU, unsigned LUIdx, Formula Base);
2006
  void GenerateScales(LSRUse &LU, unsigned LUIdx, Formula Base);
2007
  void GenerateTruncates(LSRUse &LU, unsigned LUIdx, Formula Base);
2008
  void GenerateCrossUseConstantOffsets();
2009
  void GenerateAllReuseFormulae();
2010
2011
  void FilterOutUndesirableDedicatedRegisters();
2012
2013
  size_t EstimateSearchSpaceComplexity() const;
2014
  void NarrowSearchSpaceByDetectingSupersets();
2015
  void NarrowSearchSpaceByCollapsingUnrolledCode();
2016
  void NarrowSearchSpaceByRefilteringUndesirableDedicatedRegisters();
2017
  void NarrowSearchSpaceByFilterFormulaWithSameScaledReg();
2018
  void NarrowSearchSpaceByDeletingCostlyFormulas();
2019
  void NarrowSearchSpaceByPickingWinnerRegs();
2020
  void NarrowSearchSpaceUsingHeuristics();
2021
2022
  void SolveRecurse(SmallVectorImpl<const Formula *> &Solution,
2023
                    Cost &SolutionCost,
2024
                    SmallVectorImpl<const Formula *> &Workspace,
2025
                    const Cost &CurCost,
2026
                    const SmallPtrSet<const SCEV *, 16> &CurRegs,
2027
                    DenseSet<const SCEV *> &VisitedRegs) const;
2028
  void Solve(SmallVectorImpl<const Formula *> &Solution) const;
2029
2030
  BasicBlock::iterator
2031
    HoistInsertPosition(BasicBlock::iterator IP,
2032
                        const SmallVectorImpl<Instruction *> &Inputs) const;
2033
  BasicBlock::iterator
2034
    AdjustInsertPositionForExpand(BasicBlock::iterator IP,
2035
                                  const LSRFixup &LF,
2036
                                  const LSRUse &LU,
2037
                                  SCEVExpander &Rewriter) const;
2038
2039
  Value *Expand(const LSRUse &LU, const LSRFixup &LF, const Formula &F,
2040
                BasicBlock::iterator IP, SCEVExpander &Rewriter,
2041
                SmallVectorImpl<WeakTrackingVH> &DeadInsts) const;
2042
  void RewriteForPHI(PHINode *PN, const LSRUse &LU, const LSRFixup &LF,
2043
                     const Formula &F, SCEVExpander &Rewriter,
2044
                     SmallVectorImpl<WeakTrackingVH> &DeadInsts) const;
2045
  void Rewrite(const LSRUse &LU, const LSRFixup &LF, const Formula &F,
2046
               SCEVExpander &Rewriter,
2047
               SmallVectorImpl<WeakTrackingVH> &DeadInsts) const;
2048
  void ImplementSolution(const SmallVectorImpl<const Formula *> &Solution);
2049
2050
public:
2051
  LSRInstance(Loop *L, IVUsers &IU, ScalarEvolution &SE, DominatorTree &DT,
2052
              LoopInfo &LI, const TargetTransformInfo &TTI, AssumptionCache &AC,
2053
              TargetLibraryInfo &LibInfo);
2054
2055
208k
  bool getChanged() const { return Changed; }
2056
2057
  void print_factors_and_types(raw_ostream &OS) const;
2058
  void print_fixups(raw_ostream &OS) const;
2059
  void print_uses(raw_ostream &OS) const;
2060
  void print(raw_ostream &OS) const;
2061
  void dump() const;
2062
};
2063
2064
} // end anonymous namespace
2065
2066
/// If IV is used in a int-to-float cast inside the loop then try to eliminate
2067
/// the cast operation.
2068
125k
void LSRInstance::OptimizeShadowIV() {
2069
125k
  const SCEV *BackedgeTakenCount = SE.getBackedgeTakenCount(L);
2070
125k
  if (isa<SCEVCouldNotCompute>(BackedgeTakenCount))
2071
59.7k
    return;
2072
65.3k
2073
65.3k
  for (IVUsers::const_iterator UI = IU.begin(), E = IU.end();
2074
431k
       UI != E; /* empty */) {
2075
366k
    IVUsers::const_iterator CandidateUI = UI;
2076
366k
    ++UI;
2077
366k
    Instruction *ShadowUse = CandidateUI->getUser();
2078
366k
    Type *DestTy = nullptr;
2079
366k
    bool IsSigned = false;
2080
366k
2081
366k
    /* If shadow use is a int->float cast then insert a second IV
2082
366k
       to eliminate this cast.
2083
366k
2084
366k
         for (unsigned i = 0; i < n; ++i)
2085
366k
           foo((double)i);
2086
366k
2087
366k
       is transformed into
2088
366k
2089
366k
         double d = 0.0;
2090
366k
         for (unsigned i = 0; i < n; ++i, ++d)
2091
366k
           foo(d);
2092
366k
    */
2093
366k
    if (UIToFPInst *UCast = dyn_cast<UIToFPInst>(CandidateUI->getUser())) {
2094
20
      IsSigned = false;
2095
20
      DestTy = UCast->getDestTy();
2096
20
    }
2097
366k
    else if (SIToFPInst *SCast = dyn_cast<SIToFPInst>(CandidateUI->getUser())) {
2098
3.21k
      IsSigned = true;
2099
3.21k
      DestTy = SCast->getDestTy();
2100
3.21k
    }
2101
366k
    if (!DestTy) continue;
2102
3.23k
2103
3.23k
    // If target does not support DestTy natively then do not apply
2104
3.23k
    // this transformation.
2105
3.23k
    if (!TTI.isTypeLegal(DestTy)) continue;
2106
3.23k
2107
3.23k
    PHINode *PH = dyn_cast<PHINode>(ShadowUse->getOperand(0));
2108
3.23k
    if (!PH) continue;
2109
122
    if (PH->getNumIncomingValues() != 2) continue;
2110
122
2111
122
    // If the calculation in integers overflows, the result in FP type will
2112
122
    // differ. So we only can do this transformation if we are guaranteed to not
2113
122
    // deal with overflowing values
2114
122
    const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(SE.getSCEV(PH));
2115
122
    if (!AR) continue;
2116
122
    if (IsSigned && !AR->hasNoSignedWrap()) continue;
2117
121
    if (!IsSigned && !AR->hasNoUnsignedWrap()) continue;
2118
113
2119
113
    Type *SrcTy = PH->getType();
2120
113
    int Mantissa = DestTy->getFPMantissaWidth();
2121
113
    if (Mantissa == -1) continue;
2122
113
    if ((int)SE.getTypeSizeInBits(SrcTy) > Mantissa)
2123
43
      continue;
2124
70
2125
70
    unsigned Entry, Latch;
2126
70
    if (PH->getIncomingBlock(0) == L->getLoopPreheader()) {
2127
46
      Entry = 0;
2128
46
      Latch = 1;
2129
46
    } else {
2130
24
      Entry = 1;
2131
24
      Latch = 0;
2132
24
    }
2133
70
2134
70
    ConstantInt *Init = dyn_cast<ConstantInt>(PH->getIncomingValue(Entry));
2135
70
    if (!Init) continue;
2136
37
    Constant *NewInit = ConstantFP::get(DestTy, IsSigned ?
2137
35
                                        (double)Init->getSExtValue() :
2138
37
                                        (double)Init->getZExtValue());
2139
37
2140
37
    BinaryOperator *Incr =
2141
37
      dyn_cast<BinaryOperator>(PH->getIncomingValue(Latch));
2142
37
    if (!Incr) continue;
2143
37
    if (Incr->getOpcode() != Instruction::Add
2144
37
        && Incr->getOpcode() != Instruction::Sub)
2145
0
      continue;
2146
37
2147
37
    /* Initialize new IV, double d = 0.0 in above example. */
2148
37
    ConstantInt *C = nullptr;
2149
37
    if (Incr->getOperand(0) == PH)
2150
37
      C = dyn_cast<ConstantInt>(Incr->getOperand(1));
2151
0
    else if (Incr->getOperand(1) == PH)
2152
0
      C = dyn_cast<ConstantInt>(Incr->getOperand(0));
2153
0
    else
2154
0
      continue;
2155
37
2156
37
    if (!C) continue;
2157
37
2158
37
    // Ignore negative constants, as the code below doesn't handle them
2159
37
    // correctly. TODO: Remove this restriction.
2160
37
    if (!C->getValue().isStrictlyPositive()) continue;
2161
35
2162
35
    /* Add new PHINode. */
2163
35
    PHINode *NewPH = PHINode::Create(DestTy, 2, "IV.S.", PH);
2164
35
2165
35
    /* create new increment. '++d' in above example. */
2166
35
    Constant *CFP = ConstantFP::get(DestTy, C->getZExtValue());
2167
35
    BinaryOperator *NewIncr =
2168
35
      BinaryOperator::Create(Incr->getOpcode() == Instruction::Add ?
2169
35
                               Instruction::FAdd : Instruction::FSub,
2170
35
                             NewPH, CFP, "IV.S.next.", Incr);
2171
35
2172
35
    NewPH->addIncoming(NewInit, PH->getIncomingBlock(Entry));
2173
35
    NewPH->addIncoming(NewIncr, PH->getIncomingBlock(Latch));
2174
35
2175
35
    /* Remove cast operation */
2176
35
    ShadowUse->replaceAllUsesWith(NewPH);
2177
35
    ShadowUse->eraseFromParent();
2178
35
    Changed = true;
2179
35
    break;
2180
35
  }
2181
65.3k
}
2182
2183
/// If Cond has an operand that is an expression of an IV, set the IV user and
2184
/// stride information and return true, otherwise return false.
2185
137k
bool LSRInstance::FindIVUserForCond(ICmpInst *Cond, IVStrideUse *&CondUse) {
2186
137k
  for (IVStrideUse &U : IU)
2187
517k
    if (U.getUser() == Cond) {
2188
108k
      // NOTE: we could handle setcc instructions with multiple uses here, but
2189
108k
      // InstCombine does it as well for simple uses, it's not clear that it
2190
108k
      // occurs enough in real life to handle.
2191
108k
      CondUse = &U;
2192
108k
      return true;
2193
108k
    }
2194
137k
  return false;
2195
137k
}
2196
2197
/// Rewrite the loop's terminating condition if it uses a max computation.
2198
///
2199
/// This is a narrow solution to a specific, but acute, problem. For loops
2200
/// like this:
2201
///
2202
///   i = 0;
2203
///   do {
2204
///     p[i] = 0.0;
2205
///   } while (++i < n);
2206
///
2207
/// the trip count isn't just 'n', because 'n' might not be positive. And
2208
/// unfortunately this can come up even for loops where the user didn't use
2209
/// a C do-while loop. For example, seemingly well-behaved top-test loops
2210
/// will commonly be lowered like this:
2211
///
2212
///   if (n > 0) {
2213
///     i = 0;
2214
///     do {
2215
///       p[i] = 0.0;
2216
///     } while (++i < n);
2217
///   }
2218
///
2219
/// and then it's possible for subsequent optimization to obscure the if
2220
/// test in such a way that indvars can't find it.
2221
///
2222
/// When indvars can't find the if test in loops like this, it creates a
2223
/// max expression, which allows it to give the loop a canonical
2224
/// induction variable:
2225
///
2226
///   i = 0;
2227
///   max = n < 1 ? 1 : n;
2228
///   do {
2229
///     p[i] = 0.0;
2230
///   } while (++i != max);
2231
///
2232
/// Canonical induction variables are necessary because the loop passes
2233
/// are designed around them. The most obvious example of this is the
2234
/// LoopInfo analysis, which doesn't remember trip count values. It
2235
/// expects to be able to rediscover the trip count each time it is
2236
/// needed, and it does this using a simple analysis that only succeeds if
2237
/// the loop has a canonical induction variable.
2238
///
2239
/// However, when it comes time to generate code, the maximum operation
2240
/// can be quite costly, especially if it's inside of an outer loop.
2241
///
2242
/// This function solves this problem by detecting this type of loop and
2243
/// rewriting their conditions from ICMP_NE back to ICMP_SLT, and deleting
2244
/// the instructions for the maximum computation.
2245
108k
ICmpInst *LSRInstance::OptimizeMax(ICmpInst *Cond, IVStrideUse* &CondUse) {
2246
108k
  // Check that the loop matches the pattern we're looking for.
2247
108k
  if (Cond->getPredicate() != CmpInst::ICMP_EQ &&
2248
108k
      
Cond->getPredicate() != CmpInst::ICMP_NE36.5k
)
2249
36.4k
    return Cond;
2250
71.8k
2251
71.8k
  SelectInst *Sel = dyn_cast<SelectInst>(Cond->getOperand(1));
2252
71.8k
  if (!Sel || !Sel->hasOneUse()) return Cond;
2253
31
2254
31
  const SCEV *BackedgeTakenCount = SE.getBackedgeTakenCount(L);
2255
31
  if (isa<SCEVCouldNotCompute>(BackedgeTakenCount))
2256
1
    return Cond;
2257
30
  const SCEV *One = SE.getConstant(BackedgeTakenCount->getType(), 1);
2258
30
2259
30
  // Add one to the backedge-taken count to get the trip count.
2260
30
  const SCEV *IterationCount = SE.getAddExpr(One, BackedgeTakenCount);
2261
30
  if (IterationCount != SE.getSCEV(Sel)) return Cond;
2262
21
2263
21
  // Check for a max calculation that matches the pattern. There's no check
2264
21
  // for ICMP_ULE here because the comparison would be with zero, which
2265
21
  // isn't interesting.
2266
21
  CmpInst::Predicate Pred = ICmpInst::BAD_ICMP_PREDICATE;
2267
21
  const SCEVNAryExpr *Max = nullptr;
2268
21
  if (const SCEVSMaxExpr *S = dyn_cast<SCEVSMaxExpr>(BackedgeTakenCount)) {
2269
2
    Pred = ICmpInst::ICMP_SLE;
2270
2
    Max = S;
2271
19
  } else if (const SCEVSMaxExpr *S = dyn_cast<SCEVSMaxExpr>(IterationCount)) {
2272
7
    Pred = ICmpInst::ICMP_SLT;
2273
7
    Max = S;
2274
12
  } else if (const SCEVUMaxExpr *U = dyn_cast<SCEVUMaxExpr>(IterationCount)) {
2275
9
    Pred = ICmpInst::ICMP_ULT;
2276
9
    Max = U;
2277
9
  } else {
2278
3
    // No match; bail.
2279
3
    return Cond;
2280
3
  }
2281
18
2282
18
  // To handle a max with more than two operands, this optimization would
2283
18
  // require additional checking and setup.
2284
18
  if (Max->getNumOperands() != 2)
2285
1
    return Cond;
2286
17
2287
17
  const SCEV *MaxLHS = Max->getOperand(0);
2288
17
  const SCEV *MaxRHS = Max->getOperand(1);
2289
17
2290
17
  // ScalarEvolution canonicalizes constants to the left. For < and >, look
2291
17
  // for a comparison with 1. For <= and >=, a comparison with zero.
2292
17
  if (!MaxLHS ||
2293
17
      (ICmpInst::isTrueWhenEqual(Pred) ? !MaxLHS->isZero() : (MaxLHS != One)))
2294
0
    return Cond;
2295
17
2296
17
  // Check the relevant induction variable for conformance to
2297
17
  // the pattern.
2298
17
  const SCEV *IV = SE.getSCEV(Cond->getOperand(0));
2299
17
  const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(IV);
2300
17
  if (!AR || !AR->isAffine() ||
2301
17
      AR->getStart() != One ||
2302
17
      AR->getStepRecurrence(SE) != One)
2303
0
    return Cond;
2304
17
2305
17
  assert(AR->getLoop() == L &&
2306
17
         "Loop condition operand is an addrec in a different loop!");
2307
17
2308
17
  // Check the right operand of the select, and remember it, as it will
2309
17
  // be used in the new comparison instruction.
2310
17
  Value *NewRHS = nullptr;
2311
17
  if (ICmpInst::isTrueWhenEqual(Pred)) {
2312
2
    // Look for n+1, and grab n.
2313
2
    if (AddOperator *BO = dyn_cast<AddOperator>(Sel->getOperand(1)))
2314
2
      if (ConstantInt *BO1 = dyn_cast<ConstantInt>(BO->getOperand(1)))
2315
2
         if (BO1->isOne() && SE.getSCEV(BO->getOperand(0)) == MaxRHS)
2316
2
           NewRHS = BO->getOperand(0);
2317
2
    if (AddOperator *BO = dyn_cast<AddOperator>(Sel->getOperand(2)))
2318
0
      if (ConstantInt *BO1 = dyn_cast<ConstantInt>(BO->getOperand(1)))
2319
0
        if (BO1->isOne() && SE.getSCEV(BO->getOperand(0)) == MaxRHS)
2320
0
          NewRHS = BO->getOperand(0);
2321
2
    if (!NewRHS)
2322
0
      return Cond;
2323
15
  } else if (SE.getSCEV(Sel->getOperand(1)) == MaxRHS)
2324
2
    NewRHS = Sel->getOperand(1);
2325
13
  else if (SE.getSCEV(Sel->getOperand(2)) == MaxRHS)
2326
11
    NewRHS = Sel->getOperand(2);
2327
2
  else if (const SCEVUnknown *SU = dyn_cast<SCEVUnknown>(MaxRHS))
2328
2
    NewRHS = SU->getValue();
2329
0
  else
2330
0
    // Max doesn't match expected pattern.
2331
0
    return Cond;
2332
17
2333
17
  // Determine the new comparison opcode. It may be signed or unsigned,
2334
17
  // and the original comparison may be either equality or inequality.
2335
17
  if (Cond->getPredicate() == CmpInst::ICMP_EQ)
2336
11
    Pred = CmpInst::getInversePredicate(Pred);
2337
17
2338
17
  // Ok, everything looks ok to change the condition into an SLT or SGE and
2339
17
  // delete the max calculation.
2340
17
  ICmpInst *NewCond =
2341
17
    new ICmpInst(Cond, Pred, Cond->getOperand(0), NewRHS, "scmp");
2342
17
2343
17
  // Delete the max calculation instructions.
2344
17
  Cond->replaceAllUsesWith(NewCond);
2345
17
  CondUse->setUser(NewCond);
2346
17
  Instruction *Cmp = cast<Instruction>(Sel->getOperand(0));
2347
17
  Cond->eraseFromParent();
2348
17
  Sel->eraseFromParent();
2349
17
  if (Cmp->use_empty())
2350
14
    Cmp->eraseFromParent();
2351
17
  return NewCond;
2352
17
}
2353
2354
/// Change loop terminating condition to use the postinc iv when possible.
2355
void
2356
125k
LSRInstance::OptimizeLoopTermCond() {
2357
125k
  SmallPtrSet<Instruction *, 4> PostIncs;
2358
125k
2359
125k
  // We need a different set of heuristics for rotated and non-rotated loops.
2360
125k
  // If a loop is rotated then the latch is also the backedge, so inserting
2361
125k
  // post-inc expressions just before the latch is ideal. To reduce live ranges
2362
125k
  // it also makes sense to rewrite terminating conditions to use post-inc
2363
125k
  // expressions.
2364
125k
  //
2365
125k
  // If the loop is not rotated then the latch is not a backedge; the latch
2366
125k
  // check is done in the loop head. Adding post-inc expressions before the
2367
125k
  // latch will cause overlapping live-ranges of pre-inc and post-inc expressions
2368
125k
  // in the loop body. In this case we do *not* want to use post-inc expressions
2369
125k
  // in the latch check, and we want to insert post-inc expressions before
2370
125k
  // the backedge.
2371
125k
  BasicBlock *LatchBlock = L->getLoopLatch();
2372
125k
  SmallVector<BasicBlock*, 8> ExitingBlocks;
2373
125k
  L->getExitingBlocks(ExitingBlocks);
2374
160k
  if (llvm::all_of(ExitingBlocks, [&LatchBlock](const BasicBlock *BB) {
2375
160k
        return LatchBlock != BB;
2376
160k
      })) {
2377
2.06k
    // The backedge doesn't exit the loop; treat this as a head-tested loop.
2378
2.06k
    IVIncInsertPos = LatchBlock->getTerminator();
2379
2.06k
    return;
2380
2.06k
  }
2381
122k
2382
122k
  // Otherwise treat this as a rotated loop.
2383
157k
  for (BasicBlock *ExitingBlock : ExitingBlocks) {
2384
157k
    // Get the terminating condition for the loop if possible.  If we
2385
157k
    // can, we want to change it to use a post-incremented version of its
2386
157k
    // induction variable, to allow coalescing the live ranges for the IV into
2387
157k
    // one register value.
2388
157k
2389
157k
    BranchInst *TermBr = dyn_cast<BranchInst>(ExitingBlock->getTerminator());
2390
157k
    if (!TermBr)
2391
7.34k
      continue;
2392
149k
    // FIXME: Overly conservative, termination condition could be an 'or' etc..
2393
149k
    if (TermBr->isUnconditional() || !isa<ICmpInst>(TermBr->getCondition()))
2394
12.2k
      continue;
2395
137k
2396
137k
    // Search IVUsesByStride to find Cond's IVUse if there is one.
2397
137k
    IVStrideUse *CondUse = nullptr;
2398
137k
    ICmpInst *Cond = cast<ICmpInst>(TermBr->getCondition());
2399
137k
    if (!FindIVUserForCond(Cond, CondUse))
2400
29.2k
      continue;
2401
108k
2402
108k
    // If the trip count is computed in terms of a max (due to ScalarEvolution
2403
108k
    // being unable to find a sufficient guard, for example), change the loop
2404
108k
    // comparison to use SLT or ULT instead of NE.
2405
108k
    // One consequence of doing this now is that it disrupts the count-down
2406
108k
    // optimization. That's not always a bad thing though, because in such
2407
108k
    // cases it may still be worthwhile to avoid a max.
2408
108k
    Cond = OptimizeMax(Cond, CondUse);
2409
108k
2410
108k
    // If this exiting block dominates the latch block, it may also use
2411
108k
    // the post-inc value if it won't be shared with other uses.
2412
108k
    // Check for dominance.
2413
108k
    if (!DT.dominates(ExitingBlock, LatchBlock))
2414
139
      continue;
2415
108k
2416
108k
    // Conservatively avoid trying to use the post-inc value in non-latch
2417
108k
    // exits if there may be pre-inc users in intervening blocks.
2418
108k
    if (LatchBlock != ExitingBlock)
2419
2.32k
      for (IVUsers::const_iterator UI = IU.begin(), E = IU.end(); UI != E; ++UI)
2420
2.24k
        // Test if the use is reachable from the exiting block. This dominator
2421
2.24k
        // query is a conservative approximation of reachability.
2422
2.24k
        if (&*UI != CondUse &&
2423
2.24k
            !DT.properlyDominates(UI->getUser()->getParent(), ExitingBlock)) {
2424
1.76k
          // Conservatively assume there may be reuse if the quotient of their
2425
1.76k
          // strides could be a legal scale.
2426
1.76k
          const SCEV *A = IU.getStride(*CondUse, L);
2427
1.76k
          const SCEV *B = IU.getStride(*UI, L);
2428
1.76k
          if (!A || !B) continue;
2429
1.76k
          if (SE.getTypeSizeInBits(A->getType()) !=
2430
1.76k
              SE.getTypeSizeInBits(B->getType())) {
2431
280
            if (SE.getTypeSizeInBits(A->getType()) >
2432
280
                SE.getTypeSizeInBits(B->getType()))
2433
179
              B = SE.getSignExtendExpr(B, A->getType());
2434
101
            else
2435
101
              A = SE.getSignExtendExpr(A, B->getType());
2436
280
          }
2437
1.76k
          if (const SCEVConstant *D =
2438
1.64k
                dyn_cast_or_null<SCEVConstant>(getExactSDiv(B, A, SE))) {
2439
1.64k
            const ConstantInt *C = D->getValue();
2440
1.64k
            // Stride of one or negative one can have reuse with non-addresses.
2441
1.64k
            if (C->isOne() || C->isMinusOne())
2442
1.05k
              goto decline_post_inc;
2443
592
            // Avoid weird situations.
2444
592
            if (C->getValue().getMinSignedBits() >= 64 ||
2445
592
                C->getValue().isMinSignedValue())
2446
0
              goto decline_post_inc;
2447
592
            // Check for possible scaled-address reuse.
2448
592
            if (isAddressUse(TTI, UI->getUser(), UI->getOperandValToReplace())) {
2449
412
              MemAccessTy AccessTy = getAccessType(
2450
412
                  TTI, UI->getUser(), UI->getOperandValToReplace());
2451
412
              int64_t Scale = C->getSExtValue();
2452
412
              if (TTI.isLegalAddressingMode(AccessTy.MemTy, /*BaseGV=*/nullptr,
2453
412
                                            /*BaseOffset=*/0,
2454
412
                                            /*HasBaseReg=*/false, Scale,
2455
412
                                            AccessTy.AddrSpace))
2456
224
                goto decline_post_inc;
2457
188
              Scale = -Scale;
2458
188
              if (TTI.isLegalAddressingMode(AccessTy.MemTy, /*BaseGV=*/nullptr,
2459
188
                                            /*BaseOffset=*/0,
2460
188
                                            /*HasBaseReg=*/false, Scale,
2461
188
                                            AccessTy.AddrSpace))
2462
20
                goto decline_post_inc;
2463
188
            }
2464
592
          }
2465
1.76k
        }
2466
108k
2467
108k
    
LLVM_DEBUG106k
(dbgs() << " Change loop exiting icmp to use postinc iv: "
2468
106k
                      << *Cond << '\n');
2469
106k
2470
106k
    // It's possible for the setcc instruction to be anywhere in the loop, and
2471
106k
    // possible for it to have multiple users.  If it is not immediately before
2472
106k
    // the exiting block branch, move it.
2473
106k
    if (&*++BasicBlock::iterator(Cond) != TermBr) {
2474
5.10k
      if (Cond->hasOneUse()) {
2475
5.09k
        Cond->moveBefore(TermBr);
2476
5.09k
      } else {
2477
15
        // Clone the terminating condition and insert into the loopend.
2478
15
        ICmpInst *OldCond = Cond;
2479
15
        Cond = cast<ICmpInst>(Cond->clone());
2480
15
        Cond->setName(L->getHeader()->getName() + ".termcond");
2481
15
        ExitingBlock->getInstList().insert(TermBr->getIterator(), Cond);
2482
15
2483
15
        // Clone the IVUse, as the old use still exists!
2484
15
        CondUse = &IU.AddUser(Cond, CondUse->getOperandValToReplace());
2485
15
        TermBr->replaceUsesOfWith(OldCond, Cond);
2486
15
      }
2487
5.10k
    }
2488
106k
2489
106k
    // If we get to here, we know that we can transform the setcc instruction to
2490
106k
    // use the post-incremented version of the IV, allowing us to coalesce the
2491
106k
    // live ranges for the IV correctly.
2492
106k
    CondUse->transformToPostInc(L);
2493
106k
    Changed = true;
2494
106k
2495
106k
    PostIncs.insert(Cond);
2496
108k
  decline_post_inc:;
2497
108k
  }
2498
122k
2499
122k
  // Determine an insertion point for the loop induction variable increment. It
2500
122k
  // must dominate all the post-inc comparisons we just set up, and it must
2501
122k
  // dominate the loop latch edge.
2502
122k
  IVIncInsertPos = L->getLoopLatch()->getTerminator();
2503
122k
  for (Instruction *Inst : PostIncs) {
2504
106k
    BasicBlock *BB =
2505
106k
      DT.findNearestCommonDominator(IVIncInsertPos->getParent(),
2506
106k
                                    Inst->getParent());
2507
106k
    if (BB == Inst->getParent())
2508
106k
      IVIncInsertPos = Inst;
2509
30
    else if (BB != IVIncInsertPos->getParent())
2510
0
      IVIncInsertPos = BB->getTerminator();
2511
106k
  }
2512
122k
}
2513
2514
/// Determine if the given use can accommodate a fixup at the given offset and
2515
/// other details. If so, update the use and return true.
2516
bool LSRInstance::reconcileNewOffset(LSRUse &LU, int64_t NewOffset,
2517
                                     bool HasBaseReg, LSRUse::KindType Kind,
2518
190k
                                     MemAccessTy AccessTy) {
2519
190k
  int64_t NewMinOffset = LU.MinOffset;
2520
190k
  int64_t NewMaxOffset = LU.MaxOffset;
2521
190k
  MemAccessTy NewAccessTy = AccessTy;
2522
190k
2523
190k
  // Check for a mismatched kind. It's tempting to collapse mismatched kinds to
2524
190k
  // something conservative, however this can pessimize in the case that one of
2525
190k
  // the uses will have all its uses outside the loop, for example.
2526
190k
  if (LU.Kind != Kind)
2527
0
    return false;
2528
190k
2529
190k
  // Check for a mismatched access type, and fall back conservatively as needed.
2530
190k
  // TODO: Be less conservative when the type is similar and can use the same
2531
190k
  // addressing modes.
2532
190k
  if (Kind == LSRUse::Address) {
2533
173k
    if (AccessTy.MemTy != LU.AccessTy.MemTy) {
2534
3.62k
      NewAccessTy = MemAccessTy::getUnknown(AccessTy.MemTy->getContext(),
2535
3.62k
                                            AccessTy.AddrSpace);
2536
3.62k
    }
2537
173k
  }
2538
190k
2539
190k
  // Conservatively assume HasBaseReg is true for now.
2540
190k
  if (NewOffset < LU.MinOffset) {
2541
15.1k
    if (!isAlwaysFoldable(TTI, Kind, NewAccessTy, /*BaseGV=*/nullptr,
2542
15.1k
                          LU.MaxOffset - NewOffset, HasBaseReg))
2543
2
      return false;
2544
15.1k
    NewMinOffset = NewOffset;
2545
175k
  } else if (NewOffset > LU.MaxOffset) {
2546
12.7k
    if (!isAlwaysFoldable(TTI, Kind, NewAccessTy, /*BaseGV=*/nullptr,
2547
12.7k
                          NewOffset - LU.MinOffset, HasBaseReg))
2548
18
      return false;
2549
12.7k
    NewMaxOffset = NewOffset;
2550
12.7k
  }
2551
190k
2552
190k
  // Update the use.
2553
190k
  LU.MinOffset = NewMinOffset;
2554
190k
  LU.MaxOffset = NewMaxOffset;
2555
190k
  LU.AccessTy = NewAccessTy;
2556
190k
  return true;
2557
190k
}
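// --- Editor's illustration (not part of LoopStrengthReduce.cpp) ---
// A minimal, self-contained sketch of the range-reconciliation idea above:
// a use keeps one [MinOffset, MaxOffset] window, and a new fixup offset is
// only admitted if the widened window would still be foldable. The CanFold
// predicate below is a stand-in assumption for isAlwaysFoldable/TTI.
#include <cstdint>
#include <functional>

struct OffsetRange { int64_t Min, Max; };

static bool reconcileOffset(OffsetRange &R, int64_t NewOffset,
                            const std::function<bool(int64_t)> &CanFold) {
  OffsetRange Widened = R;
  if (NewOffset < R.Min) {
    // The widest distance any fixup would need to reach is Max - NewOffset.
    if (!CanFold(R.Max - NewOffset))
      return false;
    Widened.Min = NewOffset;
  } else if (NewOffset > R.Max) {
    if (!CanFold(NewOffset - R.Min))
      return false;
    Widened.Max = NewOffset;
  }
  R = Widened;  // only update the use once the widened range is known legal
  return true;
}
// --- end of editor's illustration ---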
2558
2559
/// Return an LSRUse index and an offset value for a fixup which needs the given
2560
/// expression, with the given kind and optional access type.  Either reuse an
2561
/// existing use or create a new one, as needed.
2562
std::pair<size_t, int64_t> LSRInstance::getUse(const SCEV *&Expr,
2563
                                               LSRUse::KindType Kind,
2564
506k
                                               MemAccessTy AccessTy) {
2565
506k
  const SCEV *Copy = Expr;
2566
506k
  int64_t Offset = ExtractImmediate(Expr, SE);
2567
506k
2568
506k
  // Basic uses can't accept any offset, for example.
2569
506k
  if (!isAlwaysFoldable(TTI, Kind, AccessTy, /*BaseGV=*/ nullptr,
2570
506k
                        Offset, /*HasBaseReg=*/ true)) {
2571
263k
    Expr = Copy;
2572
263k
    Offset = 0;
2573
263k
  }
2574
506k
2575
506k
  std::pair<UseMapTy::iterator, bool> P =
2576
506k
    UseMap.insert(std::make_pair(LSRUse::SCEVUseKindPair(Expr, Kind), 0));
2577
506k
  if (!P.second) {
2578
50.0k
    // A use already existed with this base.
2579
50.0k
    size_t LUIdx = P.first->second;
2580
50.0k
    LSRUse &LU = Uses[LUIdx];
2581
50.0k
    if (reconcileNewOffset(LU, Offset, /*HasBaseReg=*/true, Kind, AccessTy))
2582
50.0k
      // Reuse this use.
2583
50.0k
      return std::make_pair(LUIdx, Offset);
2584
456k
  }
2585
456k
2586
456k
  // Create a new use.
2587
456k
  size_t LUIdx = Uses.size();
2588
456k
  P.first->second = LUIdx;
2589
456k
  Uses.push_back(LSRUse(Kind, AccessTy));
2590
456k
  LSRUse &LU = Uses[LUIdx];
2591
456k
2592
456k
  LU.MinOffset = Offset;
2593
456k
  LU.MaxOffset = Offset;
2594
456k
  return std::make_pair(LUIdx, Offset);
2595
456k
}
2596
2597
/// Delete the given use from the Uses list.
2598
140k
void LSRInstance::DeleteUse(LSRUse &LU, size_t LUIdx) {
2599
140k
  if (&LU != &Uses.back())
2600
137k
    std::swap(LU, Uses.back());
2601
140k
  Uses.pop_back();
2602
140k
2603
140k
  // Update RegUses.
2604
140k
  RegUses.swapAndDropUse(LUIdx, Uses.size());
2605
140k
}
2606
2607
/// Look for a use distinct from OrigLU which has a formula that has the same
2608
/// registers as the given formula.
2609
LSRUse *
2610
LSRInstance::FindUseWithSimilarFormula(const Formula &OrigF,
2611
246k
                                       const LSRUse &OrigLU) {
2612
246k
  // Search all uses for the formula. This could be more clever.
2613
2.87M
  for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
2614
2.77M
    LSRUse &LU = Uses[LUIdx];
2615
2.77M
    // Check whether this use is close enough to OrigLU, to see whether it's
2616
2.77M
    // worthwhile looking through its formulae.
2617
2.77M
    // Ignore ICmpZero uses because they may contain formulae generated by
2618
2.77M
    // GenerateICmpZeroScales, in which case adding fixup offsets may
2619
2.77M
    // be invalid.
2620
2.77M
    if (&LU != &OrigLU &&
2621
2.77M
        LU.Kind != LSRUse::ICmpZero &&
2622
2.77M
        LU.Kind == OrigLU.Kind && OrigLU.AccessTy == LU.AccessTy &&
2623
2.77M
        LU.WidestFixupType == OrigLU.WidestFixupType &&
2624
2.77M
        LU.HasFormulaWithSameRegs(OrigF)) {
2625
1.25M
      // Scan through this use's formulae.
2626
16.8M
      for (const Formula &F : LU.Formulae) {
2627
16.8M
        // Check to see if this formula has the same registers and symbols
2628
16.8M
        // as OrigF.
2629
16.8M
        if (F.BaseRegs == OrigF.BaseRegs &&
2630
16.8M
            F.ScaledReg == OrigF.ScaledReg &&
2631
16.8M
            F.BaseGV == OrigF.BaseGV &&
2632
16.8M
            F.Scale == OrigF.Scale &&
2633
16.8M
            F.UnfoldedOffset == OrigF.UnfoldedOffset) {
2634
1.20M
          if (F.BaseOffset == 0)
2635
140k
            return &LU;
2636
1.06M
          // This is the formula where all the registers and symbols matched;
2637
1.06M
          // there aren't going to be any others. Since we declined it, we
2638
1.06M
          // can skip the rest of the formulae and proceed to the next LSRUse.
2639
1.06M
          break;
2640
1.06M
        }
2641
16.8M
      }
2642
1.25M
    }
2643
2.77M
  }
2644
246k
2645
246k
  // Nothing looked good.
2646
246k
  return nullptr;
2647
246k
}
2648
2649
103k
void LSRInstance::CollectInterestingTypesAndFactors() {
2650
103k
  SmallSetVector<const SCEV *, 4> Strides;
2651
103k
2652
103k
  // Collect interesting types and strides.
2653
103k
  SmallVector<const SCEV *, 4> Worklist;
2654
502k
  for (const IVStrideUse &U : IU) {
2655
502k
    const SCEV *Expr = IU.getExpr(U);
2656
502k
2657
502k
    // Collect interesting types.
2658
502k
    Types.insert(SE.getEffectiveSCEVType(Expr->getType()));
2659
502k
2660
502k
    // Add strides for mentioned loops.
2661
502k
    Worklist.push_back(Expr);
2662
1.70M
    do {
2663
1.70M
      const SCEV *S = Worklist.pop_back_val();
2664
1.70M
      if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S)) {
2665
524k
        if (AR->getLoop() == L)
2666
502k
          Strides.insert(AR->getStepRecurrence(SE));
2667
524k
        Worklist.push_back(AR->getStart());
2668
1.18M
      } else if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(S)) {
2669
308k
        Worklist.append(Add->op_begin(), Add->op_end());
2670
308k
      }
2671
1.70M
    } while (!Worklist.empty());
2672
502k
  }
2673
103k
2674
103k
  // Compute interesting factors from the set of interesting strides.
2675
103k
  for (SmallSetVector<const SCEV *, 4>::const_iterator
2676
296k
       I = Strides.begin(), E = Strides.end(); I != E; ++I)
2677
192k
    for (SmallSetVector<const SCEV *, 4>::const_iterator NewStrideIter =
2678
309k
         std::next(I); NewStrideIter != E; ++NewStrideIter) {
2679
116k
      const SCEV *OldStride = *I;
2680
116k
      const SCEV *NewStride = *NewStrideIter;
2681
116k
2682
116k
      if (SE.getTypeSizeInBits(OldStride->getType()) !=
2683
116k
          SE.getTypeSizeInBits(NewStride->getType())) {
2684
36.3k
        if (SE.getTypeSizeInBits(OldStride->getType()) >
2685
36.3k
            SE.getTypeSizeInBits(NewStride->getType()))
2686
20.3k
          NewStride = SE.getSignExtendExpr(NewStride, OldStride->getType());
2687
15.9k
        else
2688
15.9k
          OldStride = SE.getSignExtendExpr(OldStride, NewStride->getType());
2689
36.3k
      }
2690
116k
      if (const SCEVConstant *Factor =
2691
68.4k
            dyn_cast_or_null<SCEVConstant>(getExactSDiv(NewStride, OldStride,
2692
68.4k
                                                        SE, true))) {
2693
68.4k
        if (Factor->getAPInt().getMinSignedBits() <= 64)
2694
68.4k
          Factors.insert(Factor->getAPInt().getSExtValue());
2695
68.4k
      } else if (const SCEVConstant *Factor =
2696
46.4k
                   dyn_cast_or_null<SCEVConstant>(getExactSDiv(OldStride,
2697
46.4k
                                                               NewStride,
2698
46.4k
                                                               SE, true))) {
2699
46.4k
        if (Factor->getAPInt().getMinSignedBits() <= 64)
2700
46.4k
          Factors.insert(Factor->getAPInt().getSExtValue());
2701
46.4k
      }
2702
116k
    }
2703
103k
2704
103k
  // If all uses use the same type, don't bother looking for truncation-based
2705
103k
  // reuse.
2706
103k
  if (Types.size() == 1)
2707
83.7k
    Types.clear();
2708
103k
2709
103k
  LLVM_DEBUG(print_factors_and_types(dbgs()));
2710
103k
}
2711
2712
/// Helper for CollectChains that finds an IV operand (computed by an AddRec in
2713
/// this loop) within [OI,OE) or returns OE. If IVUsers mapped Instructions to
2714
/// IVStrideUses, we could partially skip this.
2715
static User::op_iterator
2716
findIVOperand(User::op_iterator OI, User::op_iterator OE,
2717
841k
              Loop *L, ScalarEvolution &SE) {
2718
1.20M
  for(; OI != OE; ++OI) {
2719
776k
    if (Instruction *Oper = dyn_cast<Instruction>(*OI)) {
2720
696k
      if (!SE.isSCEVable(Oper->getType()))
2721
159k
        continue;
2722
536k
2723
536k
      if (const SCEVAddRecExpr *AR =
2724
422k
          dyn_cast<SCEVAddRecExpr>(SE.getSCEV(Oper))) {
2725
422k
        if (AR->getLoop() == L)
2726
417k
          break;
2727
422k
      }
2728
536k
    }
2729
776k
  }
2730
841k
  return OI;
2731
841k
}
2732
2733
/// IVChain logic must consistently peek base TruncInst operands, so wrap it in
2734
/// a convenient helper.
2735
903k
static Value *getWideOperand(Value *Oper) {
2736
903k
  if (TruncInst *Trunc = dyn_cast<TruncInst>(Oper))
2737
23.3k
    return Trunc->getOperand(0);
2738
879k
  return Oper;
2739
879k
}
2740
2741
/// Return true if we allow an IV chain to include both types.
2742
337k
static bool isCompatibleIVType(Value *LVal, Value *RVal) {
2743
337k
  Type *LType = LVal->getType();
2744
337k
  Type *RType = RVal->getType();
2745
337k
  return (LType == RType) || (LType->isPointerTy() && RType->isPointerTy() &&
2746
15.5k
                              // Different address spaces means (possibly)
2747
15.5k
                              // different types of the pointer implementation,
2748
15.5k
                              // e.g. i16 vs i32 so disallow that.
2749
15.5k
                              (LType->getPointerAddressSpace() ==
2750
13.4k
                               RType->getPointerAddressSpace()));
2751
337k
}
2752
2753
/// Return an approximation of this SCEV expression's "base", or NULL for any
2754
/// constant. Returning the expression itself is conservative. Returning a
2755
/// deeper subexpression is more precise and valid as long as it isn't less
2756
/// complex than another subexpression. For expressions involving multiple
2757
/// unscaled values, we need to return the pointer-type SCEVUnknown. This avoids
2758
/// forming chains across objects, such as: PrevOper==a[i], IVOper==b[i],
2759
/// IVInc==b-a.
2760
///
2761
/// Since SCEVUnknown is the rightmost type, and pointers are the rightmost
2762
/// SCEVUnknown, we simply return the rightmost SCEV operand.
2763
1.12M
static const SCEV *getExprBase(const SCEV *S) {
2764
1.12M
  switch (S->getSCEVType()) {
2765
1.12M
  default: // including scUnknown.
2766
80.5k
    return S;
2767
1.12M
  case scConstant:
2768
170k
    return nullptr;
2769
1.12M
  case scTruncate:
2770
144
    return getExprBase(cast<SCEVTruncateExpr>(S)->getOperand());
2771
1.12M
  case scZeroExtend:
2772
1.91k
    return getExprBase(cast<SCEVZeroExtendExpr>(S)->getOperand());
2773
1.12M
  case scSignExtend:
2774
2.05k
    return getExprBase(cast<SCEVSignExtendExpr>(S)->getOperand());
2775
1.12M
  case scAddExpr: {
2776
314k
    // Skip over scaled operands (scMulExpr) to follow add operands as long as
2777
314k
    // there's nothing more complex.
2778
314k
    // FIXME: not sure if we want to recognize negation.
2779
314k
    const SCEVAddExpr *Add = cast<SCEVAddExpr>(S);
2780
314k
    for (std::reverse_iterator<SCEVAddExpr::op_iterator> I(Add->op_end()),
2781
317k
           E(Add->op_begin()); I != E; ++I) {
2782
317k
      const SCEV *SubExpr = *I;
2783
317k
      if (SubExpr->getSCEVType() == scAddExpr)
2784
0
        return getExprBase(SubExpr);
2785
317k
2786
317k
      if (SubExpr->getSCEVType() != scMulExpr)
2787
314k
        return SubExpr;
2788
317k
    }
2789
314k
    return S; // all operands are scaled, be conservative.
2790
314k
  }
2791
558k
  case scAddRecExpr:
2792
558k
    return getExprBase(cast<SCEVAddRecExpr>(S)->getStart());
2793
1.12M
  }
2794
1.12M
}
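// --- Editor's illustration (not part of LoopStrengthReduce.cpp) ---
// The two source loops below are a hedged sketch of why getExprBase compares
// expression "bases". In f() both accesses hang off the same pointer p, so
// chaining the second address to the first only needs a constant offset; in
// g() the accesses use distinct bases a and b, and a chain linking them would
// have to materialize the loop-invariant difference b - a (the doc comment's
// PrevOper==a[i], IVOper==b[i], IVInc==b-a case), which is usually not worth it.
void f(int *p, int n) {
  for (int i = 0; i < n; ++i)
    p[i] += p[i + 4];   // same base object: good chain candidate
}
void g(int *a, int *b, int n) {
  for (int i = 0; i < n; ++i)
    a[i] += b[i];       // different bases: increment would be b - a
}
// --- end of editor's illustration ---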
2795
2796
/// Return true if the chain increment is profitable to expand into a loop
2797
/// invariant value, which may require its own register. A profitable chain
2798
/// increment will be an offset relative to the same base. We allow such offsets
2799
/// to potentially be used as chain increment as long as it's not obviously
2800
/// expensive to expand using real instructions.
2801
bool IVChain::isProfitableIncrement(const SCEV *OperExpr,
2802
                                    const SCEV *IncExpr,
2803
331k
                                    ScalarEvolution &SE) {
2804
331k
  // Aggressively form chains when -stress-ivchain.
2805
331k
  if (StressIVChain)
2806
0
    return true;
2807
331k
2808
331k
  // Do not replace a constant offset from IV head with a nonconstant IV
2809
331k
  // increment.
2810
331k
  if (!isa<SCEVConstant>(IncExpr)) {
2811
2.26k
    const SCEV *HeadExpr = SE.getSCEV(getWideOperand(Incs[0].IVOperand));
2812
2.26k
    if (isa<SCEVConstant>(SE.getMinusSCEV(OperExpr, HeadExpr)))
2813
384
      return false;
2814
331k
  }
2815
331k
2816
331k
  SmallPtrSet<const SCEV*, 8> Processed;
2817
331k
  return !isHighCostExpansion(IncExpr, Processed, SE);
2818
331k
}
2819
2820
/// Return true if the number of registers needed for the chain is estimated to
2821
/// be less than the number required for the individual IV users. First prohibit
2822
/// any IV users that keep the IV live across increments (the Users set should
2823
/// be empty). Next count the number and type of increments in the chain.
2824
///
2825
/// Chaining IVs can lead to considerable code bloat if ISEL doesn't
2826
/// effectively use postinc addressing modes. Only consider it profitable if the
2827
/// increments can be computed in fewer registers when chained.
2828
///
2829
/// TODO: Consider IVInc free if it's already used in another chains.
2830
static bool
2831
isProfitableChain(IVChain &Chain, SmallPtrSetImpl<Instruction*> &Users,
2832
193k
                  ScalarEvolution &SE) {
2833
193k
  if (StressIVChain)
2834
0
    return true;
2835
193k
2836
193k
  if (!Chain.hasIncs())
2837
55.8k
    return false;
2838
137k
2839
137k
  if (!Users.empty()) {
2840
4.56k
    LLVM_DEBUG(dbgs() << "Chain: " << *Chain.Incs[0].UserInst << " users:\n";
2841
4.56k
               for (Instruction *Inst
2842
4.56k
                    : Users) { dbgs() << "  " << *Inst << "\n"; });
2843
4.56k
    return false;
2844
4.56k
  }
2845
132k
  assert(!Chain.Incs.empty() && "empty IV chains are not allowed");
2846
132k
2847
132k
  // The chain itself may require a register, so initialize cost to 1.
2848
132k
  int cost = 1;
2849
132k
2850
132k
  // A complete chain likely eliminates the need for keeping the original IV in
2851
132k
  // a register. LSR does not currently know how to form a complete chain unless
2852
132k
  // the header phi already exists.
2853
132k
  if (isa<PHINode>(Chain.tailUserInst())
2854
132k
      && SE.getSCEV(Chain.tailUserInst()) == Chain.Incs[0].IncExpr) {
2855
29.1k
    --cost;
2856
29.1k
  }
2857
132k
  const SCEV *LastIncExpr = nullptr;
2858
132k
  unsigned NumConstIncrements = 0;
2859
132k
  unsigned NumVarIncrements = 0;
2860
132k
  unsigned NumReusedIncrements = 0;
2861
322k
  for (const IVInc &Inc : Chain) {
2862
322k
    if (Inc.IncExpr->isZero())
2863
106k
      continue;
2864
215k
2865
215k
    // Incrementing by zero or some constant is neutral. We assume constants can
2866
215k
    // be folded into an addressing mode or an add's immediate operand.
2867
215k
    if (isa<SCEVConstant>(Inc.IncExpr)) {
2868
214k
      ++NumConstIncrements;
2869
214k
      continue;
2870
214k
    }
2871
1.04k
2872
1.04k
    if (Inc.IncExpr == LastIncExpr)
2873
109
      ++NumReusedIncrements;
2874
932
    else
2875
932
      ++NumVarIncrements;
2876
1.04k
2877
1.04k
    LastIncExpr = Inc.IncExpr;
2878
1.04k
  }
2879
132k
  // An IV chain with a single increment is handled by LSR's postinc
2880
132k
  // uses. However, a chain with multiple increments requires keeping the IV's
2881
132k
  // value live longer than it needs to be if chained.
2882
132k
  if (NumConstIncrements > 1)
2883
11.4k
    --cost;
2884
132k
2885
132k
  // Materializing increment expressions in the preheader that didn't exist in
2886
132k
  // the original code may cost a register. For example, sign-extended array
2887
132k
  // indices can produce ridiculous increments like this:
2888
132k
  // IV + ((sext i32 (2 * %s) to i64) + (-1 * (sext i32 %s to i64)))
2889
132k
  cost += NumVarIncrements;
2890
132k
2891
132k
  // Reusing variable increments likely saves a register to hold the multiple of
2892
132k
  // the stride.
2893
132k
  cost -= NumReusedIncrements;
2894
132k
2895
132k
  LLVM_DEBUG(dbgs() << "Chain: " << *Chain.Incs[0].UserInst << " Cost: " << cost
2896
132k
                    << "\n");
2897
132k
2898
132k
  return cost < 0;
2899
132k
}
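// --- Editor's worked example for the cost accounting above ---
// Assume a chain that ends in the header phi and whose links are three
// non-zero constant increments (no variable or reused increments):
//   cost =  1    // the chain itself may need a register
//         - 1    // complete chain: the header phi already exists
//         - 1    // NumConstIncrements (3) > 1
//         + 0    // NumVarIncrements
//         - 0    // NumReusedIncrements
//        = -1    // cost < 0, so the chain is considered profitable
// --- end of editor's example ---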
2900
2901
/// Add this IV user to an existing chain or make it the head of a new chain.
2902
void LSRInstance::ChainInstruction(Instruction *UserInst, Instruction *IVOper,
2903
565k
                                   SmallVectorImpl<ChainUsers> &ChainUsersVec) {
2904
565k
  // When IVs are used as types of varying widths, they are generally converted
2905
565k
  // to a wider type with some uses remaining narrow under a (free) trunc.
2906
565k
  Value *const NextIV = getWideOperand(IVOper);
2907
565k
  const SCEV *const OperExpr = SE.getSCEV(NextIV);
2908
565k
  const SCEV *const OperExprBase = getExprBase(OperExpr);
2909
565k
2910
565k
  // Visit all existing chains. Check if its IVOper can be computed as a
2911
565k
  // profitable loop invariant increment from the last link in the Chain.
2912
565k
  unsigned ChainIdx = 0, NChains = IVChainVec.size();
2913
565k
  const SCEV *LastIncExpr = nullptr;
2914
952k
  for (; ChainIdx < NChains; ++ChainIdx) {
2915
717k
    IVChain &Chain = IVChainVec[ChainIdx];
2916
717k
2917
717k
    // Prune the solution space aggressively by checking that both IV operands
2918
717k
    // are expressions that operate on the same unscaled SCEVUnknown. This
2919
717k
    // "base" will be canceled by the subsequent getMinusSCEV call. Checking
2920
717k
    // first avoids creating extra SCEV expressions.
2921
717k
    if (!StressIVChain && Chain.ExprBase != OperExprBase)
2922
383k
      continue;
2923
333k
2924
333k
    Value *PrevIV = getWideOperand(Chain.Incs.back().IVOperand);
2925
333k
    if (!isCompatibleIVType(PrevIV, NextIV))
2926
746
      continue;
2927
332k
2928
332k
    // A phi node terminates a chain.
2929
332k
    if (isa<PHINode>(UserInst) && isa<PHINode>(Chain.tailUserInst()))
2930
364
      continue;
2931
332k
2932
332k
    // The increment must be loop-invariant so it can be kept in a register.
2933
332k
    const SCEV *PrevExpr = SE.getSCEV(PrevIV);
2934
332k
    const SCEV *IncExpr = SE.getMinusSCEV(OperExpr, PrevExpr);
2935
332k
    if (!SE.isLoopInvariant(IncExpr, L))
2936
1.20k
      continue;
2937
331k
2938
331k
    if (Chain.isProfitableIncrement(OperExpr, IncExpr, SE)) {
2939
330k
      LastIncExpr = IncExpr;
2940
330k
      break;
2941
330k
    }
2942
331k
  }
2943
565k
  // If we haven't found a chain, create a new one, unless we hit the max. Don't
2944
565k
  // bother for phi nodes, because they must be last in the chain.
2945
565k
  if (ChainIdx == NChains) {
2946
235k
    if (isa<PHINode>(UserInst))
2947
40.9k
      return;
2948
194k
    if (NChains >= MaxChains && !StressIVChain) {
2949
1.21k
      LLVM_DEBUG(dbgs() << "IV Chain Limit\n");
2950
1.21k
      return;
2951
1.21k
    }
2952
193k
    LastIncExpr = OperExpr;
2953
193k
    // IVUsers may have skipped over sign/zero extensions. We don't currently
2954
193k
    // attempt to form chains involving extensions unless they can be hoisted
2955
193k
    // into this loop's AddRec.
2956
193k
    if (!isa<SCEVAddRecExpr>(LastIncExpr))
2957
2
      return;
2958
193k
    ++NChains;
2959
193k
    IVChainVec.push_back(IVChain(IVInc(UserInst, IVOper, LastIncExpr),
2960
193k
                                 OperExprBase));
2961
193k
    ChainUsersVec.resize(NChains);
2962
193k
    LLVM_DEBUG(dbgs() << "IV Chain#" << ChainIdx << " Head: (" << *UserInst
2963
193k
                      << ") IV=" << *LastIncExpr << "\n");
2964
330k
  } else {
2965
330k
    LLVM_DEBUG(dbgs() << "IV Chain#" << ChainIdx << "  Inc: (" << *UserInst
2966
330k
                      << ") IV+" << *LastIncExpr << "\n");
2967
330k
    // Add this IV user to the end of the chain.
2968
330k
    IVChainVec[ChainIdx].add(IVInc(UserInst, IVOper, LastIncExpr));
2969
330k
  }
2970
565k
  IVChain &Chain = IVChainVec[ChainIdx];
2971
523k
2972
523k
  SmallPtrSet<Instruction*,4> &NearUsers = ChainUsersVec[ChainIdx].NearUsers;
2973
523k
  // This chain's NearUsers become FarUsers.
2974
523k
  if (!LastIncExpr->isZero()) {
2975
414k
    ChainUsersVec[ChainIdx].FarUsers.insert(NearUsers.begin(),
2976
414k
                                            NearUsers.end());
2977
414k
    NearUsers.clear();
2978
414k
  }
2979
523k
2980
523k
  // All other uses of IVOperand become near uses of the chain.
2981
523k
  // We currently ignore intermediate values within SCEV expressions, assuming
2982
523k
  // they will eventually be used by the current chain, or can be computed
2983
523k
  // from one of the chain increments. To be more precise we could
2984
523k
  // transitively follow its user and only add leaf IV users to the set.
2985
873k
  for (User *U : IVOper->users()) {
2986
873k
    Instruction *OtherUse = dyn_cast<Instruction>(U);
2987
873k
    if (!OtherUse)
2988
0
      continue;
2989
873k
    // Uses in the chain will no longer be uses if the chain is formed.
2990
873k
    // Include the head of the chain in this iteration (not Chain.begin()).
2991
873k
    IVChain::const_iterator IncIter = Chain.Incs.begin();
2992
873k
    IVChain::const_iterator IncEnd = Chain.Incs.end();
2993
8.98M
    for( ; IncIter != IncEnd; ++IncIter) {
2994
8.76M
      if (IncIter->UserInst == OtherUse)
2995
651k
        break;
2996
8.76M
    }
2997
873k
    if (IncIter != IncEnd)
2998
651k
      continue;
2999
222k
3000
222k
    if (SE.isSCEVable(OtherUse->getType())
3001
222k
        && !isa<SCEVUnknown>(SE.getSCEV(OtherUse))
3002
222k
        && IU.isIVUserOrOperand(OtherUse)) {
3003
173k
      continue;
3004
173k
    }
3005
48.9k
    NearUsers.insert(OtherUse);
3006
48.9k
  }
3007
523k
3008
523k
  // Since this user is part of the chain, it's no longer considered a use
3009
523k
  // of the chain.
3010
523k
  ChainUsersVec[ChainIdx].FarUsers.erase(UserInst);
3011
523k
}
3012
3013
/// Populate the vector of Chains.
3014
///
3015
/// This decreases ILP at the architecture level. Targets with ample registers,
3016
/// multiple memory ports, and no register renaming probably don't want
3017
/// this. However, such targets should probably disable LSR altogether.
3018
///
3019
/// The job of LSR is to make a reasonable choice of induction variables across
3020
/// the loop. Subsequent passes can easily "unchain" computation exposing more
3021
/// ILP *within the loop* if the target wants it.
3022
///
3023
/// Finding the best IV chain is potentially a scheduling problem. Since LSR
3024
/// will not reorder memory operations, it will recognize this as a chain, but
3025
/// will generate redundant IV increments. Ideally this would be corrected later
3026
/// by a smart scheduler:
3027
///        = A[i]
3028
///        = A[i+x]
3029
/// A[i]   =
3030
/// A[i+x] =
3031
///
3032
/// TODO: Walk the entire domtree within this loop, not just the path to the
3033
/// loop latch. This will discover chains on side paths, but requires
3034
/// maintaining multiple copies of the Chains state.
3035
103k
void LSRInstance::CollectChains() {
3036
103k
  LLVM_DEBUG(dbgs() << "Collecting IV Chains.\n");
3037
103k
  SmallVector<ChainUsers, 8> ChainUsersVec;
3038
103k
3039
103k
  SmallVector<BasicBlock *,8> LatchPath;
3040
103k
  BasicBlock *LoopHeader = L->getHeader();
3041
103k
  for (DomTreeNode *Rung = DT.getNode(L->getLoopLatch());
3042
139k
       Rung->getBlock() != LoopHeader; 
Rung = Rung->getIDom()35.7k
) {
3043
35.7k
    LatchPath.push_back(Rung->getBlock());
3044
35.7k
  }
3045
103k
  LatchPath.push_back(LoopHeader);
3046
103k
3047
103k
  // Walk the instruction stream from the loop header to the loop latch.
3048
139k
  for (BasicBlock *BB : reverse(LatchPath)) {
3049
1.95M
    for (Instruction &I : *BB) {
3050
1.95M
      // Skip instructions that weren't seen by IVUsers analysis.
3051
1.95M
      if (isa<PHINode>(I) || !IU.isIVUserOrOperand(&I))
3052
883k
        continue;
3053
1.06M
3054
1.06M
      // Ignore users that are part of a SCEV expression. This way we only
3055
1.06M
      // consider leaf IV Users. This effectively rediscovers a portion of
3056
1.06M
      // IVUsers analysis but in program order this time.
3057
1.06M
      if (SE.isSCEVable(I.getType()) && !isa<SCEVUnknown>(SE.getSCEV(&I)))
3058
643k
          continue;
3059
424k
3060
424k
      // Remove this instruction from any NearUsers set it may be in.
3061
424k
      for (unsigned ChainIdx = 0, NChains = IVChainVec.size();
3062
909k
           ChainIdx < NChains; ++ChainIdx) {
3063
485k
        ChainUsersVec[ChainIdx].NearUsers.erase(&I);
3064
485k
      }
3065
424k
      // Search for operands that can be chained.
3066
424k
      SmallPtrSet<Instruction*, 4> UniqueOperands;
3067
424k
      User::op_iterator IVOpEnd = I.op_end();
3068
424k
      User::op_iterator IVOpIter = findIVOperand(I.op_begin(), IVOpEnd, L, SE);
3069
840k
      while (IVOpIter != IVOpEnd) {
3070
416k
        Instruction *IVOpInst = cast<Instruction>(*IVOpIter);
3071
416k
        if (UniqueOperands.insert(IVOpInst).second)
3072
416k
          ChainInstruction(&I, IVOpInst, ChainUsersVec);
3073
416k
        IVOpIter = findIVOperand(std::next(IVOpIter), IVOpEnd, L, SE);
3074
416k
      }
3075
424k
    } // Continue walking down the instructions.
3076
139k
  } // Continue walking down the domtree.
3077
103k
  // Visit phi backedges to determine if the chain can generate the IV postinc.
3078
166k
  for (PHINode &PN : L->getHeader()->phis()) {
3079
166k
    if (!SE.isSCEVable(PN.getType()))
3080
16.5k
      continue;
3081
149k
3082
149k
    Instruction *IncV =
3083
149k
        dyn_cast<Instruction>(PN.getIncomingValueForBlock(L->getLoopLatch()));
3084
149k
    if (IncV)
3085
149k
      ChainInstruction(&PN, IncV, ChainUsersVec);
3086
149k
  }
3087
103k
  // Remove any unprofitable chains.
3088
103k
  unsigned ChainIdx = 0;
3089
103k
  for (unsigned UsersIdx = 0, NChains = IVChainVec.size();
3090
296k
       UsersIdx < NChains; ++UsersIdx) {
3091
193k
    if (!isProfitableChain(IVChainVec[UsersIdx],
3092
193k
                           ChainUsersVec[UsersIdx].FarUsers, SE))
3093
191k
      continue;
3094
1.31k
    // Preserve the chain at UsesIdx.
3095
1.31k
    if (ChainIdx != UsersIdx)
3096
190
      IVChainVec[ChainIdx] = IVChainVec[UsersIdx];
3097
1.31k
    FinalizeChain(IVChainVec[ChainIdx]);
3098
1.31k
    ++ChainIdx;
3099
1.31k
  }
3100
103k
  IVChainVec.resize(ChainIdx);
3101
103k
}
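// --- Editor's illustration (not part of LoopStrengthReduce.cpp) ---
// A source loop with the A[i] / A[i+x] shape from the comment above. All four
// accesses share the base A, so as CollectChains walks the blocks from the
// header toward the latch it can, under the cost model in isProfitableChain,
// link them into one chain instead of leaving four independent address
// computations (a sketch of the intent, not a guarantee for any target).
void swap_strided(int *A, int x, int n) {
  for (int i = 0; i + x < n; i += 2 * x) {
    int t0 = A[i];
    int t1 = A[i + x];
    A[i] = t1;
    A[i + x] = t0;
  }
}
// --- end of editor's illustration ---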
3102
3103
1.31k
void LSRInstance::FinalizeChain(IVChain &Chain) {
3104
1.31k
  assert(!Chain.Incs.empty() && "empty IV chains are not allowed");
3105
1.31k
  LLVM_DEBUG(dbgs() << "Final Chain: " << *Chain.Incs[0].UserInst << "\n");
3106
1.31k
  
3107
7.40k
  for (const IVInc &Inc : Chain) {
3108
7.40k
    LLVM_DEBUG(dbgs() << "        Inc: " << *Inc.UserInst << "\n");
3109
7.40k
    auto UseI = find(Inc.UserInst->operands(), Inc.IVOperand);
3110
7.40k
    assert(UseI != Inc.UserInst->op_end() && "cannot find IV operand");
3111
7.40k
    IVIncSet.insert(UseI);
3112
7.40k
  }
3113
1.31k
}
3114
3115
/// Return true if the IVInc can be folded into an addressing mode.
3116
static bool canFoldIVIncExpr(const SCEV *IncExpr, Instruction *UserInst,
3117
6.63k
                             Value *Operand, const TargetTransformInfo &TTI) {
3118
6.63k
  const SCEVConstant *IncConst = dyn_cast<SCEVConstant>(IncExpr);
3119
6.63k
  if (!IncConst || !isAddressUse(TTI, UserInst, Operand))
3120
1.77k
    return false;
3121
4.86k
3122
4.86k
  if (IncConst->getAPInt().getMinSignedBits() > 64)
3123
0
    return false;
3124
4.86k
3125
4.86k
  MemAccessTy AccessTy = getAccessType(TTI, UserInst, Operand);
3126
4.86k
  int64_t IncOffset = IncConst->getValue()->getSExtValue();
3127
4.86k
  if (!isAlwaysFoldable(TTI, LSRUse::Address, AccessTy, /*BaseGV=*/nullptr,
3128
4.86k
                        IncOffset, /*HasBaseReg=*/false))
3129
10
    return false;
3130
4.85k
3131
4.85k
  return true;
3132
4.85k
}
3133
3134
/// Generate an add or subtract for each IVInc in a chain to materialize the IV
3135
/// user's operand from the previous IV user's operand.
3136
void LSRInstance::GenerateIVChain(const IVChain &Chain, SCEVExpander &Rewriter,
3137
1.31k
                                  SmallVectorImpl<WeakTrackingVH> &DeadInsts) {
3138
1.31k
  // Find the new IVOperand for the head of the chain. It may have been replaced
3139
1.31k
  // by LSR.
3140
1.31k
  const IVInc &Head = Chain.Incs[0];
3141
1.31k
  User::op_iterator IVOpEnd = Head.UserInst->op_end();
3142
1.31k
  // findIVOperand returns IVOpEnd if it can no longer find a valid IV user.
3143
1.31k
  User::op_iterator IVOpIter = findIVOperand(Head.UserInst->op_begin(),
3144
1.31k
                                             IVOpEnd, L, SE);
3145
1.31k
  Value *IVSrc = nullptr;
3146
1.40k
  while (IVOpIter != IVOpEnd) {
3147
1.39k
    IVSrc = getWideOperand(*IVOpIter);
3148
1.39k
3149
1.39k
    // If this operand computes the expression that the chain needs, we may use
3150
1.39k
    // it. (Check this after setting IVSrc which is used below.)
3151
1.39k
    //
3152
1.39k
    // Note that if Head.IncExpr is wider than IVSrc, then this phi is too
3153
1.39k
    // narrow for the chain, so we can no longer use it. We do allow using a
3154
1.39k
    // wider phi, assuming the LSR checked for free truncation. In that case we
3155
1.39k
    // should already have a truncate on this operand such that
3156
1.39k
    // getSCEV(IVSrc) == IncExpr.
3157
1.39k
    if (SE.getSCEV(*IVOpIter) == Head.IncExpr
3158
1.39k
        || SE.getSCEV(IVSrc) == Head.IncExpr) {
3159
1.30k
      break;
3160
1.30k
    }
3161
90
    IVOpIter = findIVOperand(std::next(IVOpIter), IVOpEnd, L, SE);
3162
90
  }
3163
1.31k
  if (IVOpIter == IVOpEnd) {
3164
7
    // Gracefully give up on this chain.
3165
7
    LLVM_DEBUG(dbgs() << "Concealed chain head: " << *Head.UserInst << "\n");
3166
7
    return;
3167
7
  }
3168
1.30k
3169
1.30k
  LLVM_DEBUG(dbgs() << "Generate chain at: " << *IVSrc << "\n");
3170
1.30k
  Type *IVTy = IVSrc->getType();
3171
1.30k
  Type *IntTy = SE.getEffectiveSCEVType(IVTy);
3172
1.30k
  const SCEV *LeftOverExpr = nullptr;
3173
7.32k
  for (const IVInc &Inc : Chain) {
3174
7.32k
    Instruction *InsertPt = Inc.UserInst;
3175
7.32k
    if (isa<PHINode>(InsertPt))
3176
1.30k
      InsertPt = L->getLoopLatch()->getTerminator();
3177
7.32k
3178
7.32k
    // IVOper will replace the current IV User's operand. IVSrc is the IV
3179
7.32k
    // value currently held in a register.
3180
7.32k
    Value *IVOper = IVSrc;
3181
7.32k
    if (!Inc.IncExpr->isZero()) {
3182
6.55k
      // IncExpr was the result of subtraction of two narrow values, so must
3183
6.55k
      // be signed.
3184
6.55k
      const SCEV *IncExpr = SE.getNoopOrSignExtend(Inc.IncExpr, IntTy);
3185
6.55k
      LeftOverExpr = LeftOverExpr ?
3186
4.76k
        SE.getAddExpr(LeftOverExpr, IncExpr) : IncExpr;
3187
6.55k
    }
3188
7.32k
    if (LeftOverExpr && !LeftOverExpr->isZero()) {
3189
6.63k
      // Expand the IV increment.
3190
6.63k
      Rewriter.clearPostInc();
3191
6.63k
      Value *IncV = Rewriter.expandCodeFor(LeftOverExpr, IntTy, InsertPt);
3192
6.63k
      const SCEV *IVOperExpr = SE.getAddExpr(SE.getUnknown(IVSrc),
3193
6.63k
                                             SE.getUnknown(IncV));
3194
6.63k
      IVOper = Rewriter.expandCodeFor(IVOperExpr, IVTy, InsertPt);
3195
6.63k
3196
6.63k
      // If an IV increment can't be folded, use it as the next IV value.
3197
6.63k
      if (!canFoldIVIncExpr(LeftOverExpr, Inc.UserInst, Inc.IVOperand, TTI)) {
3198
1.78k
        assert(IVTy == IVOper->getType() && "inconsistent IV increment type");
3199
1.78k
        IVSrc = IVOper;
3200
1.78k
        LeftOverExpr = nullptr;
3201
1.78k
      }
3202
6.63k
    }
3203
7.32k
    Type *OperTy = Inc.IVOperand->getType();
3204
7.32k
    if (IVTy != OperTy) {
3205
2.51k
      assert(SE.getTypeSizeInBits(IVTy) >= SE.getTypeSizeInBits(OperTy) &&
3206
2.51k
             "cannot extend a chained IV");
3207
2.51k
      IRBuilder<> Builder(InsertPt);
3208
2.51k
      IVOper = Builder.CreateTruncOrBitCast(IVOper, OperTy, "lsr.chain");
3209
2.51k
    }
3210
7.32k
    Inc.UserInst->replaceUsesOfWith(Inc.IVOperand, IVOper);
3211
7.32k
    DeadInsts.emplace_back(Inc.IVOperand);
3212
7.32k
  }
3213
1.30k
  // If LSR created a new, wider phi, we may also replace its postinc. We only
3214
1.30k
  // do this if we also found a wide value for the head of the chain.
3215
1.30k
  if (isa<PHINode>(Chain.tailUserInst())) {
3216
3.57k
    for (PHINode &Phi : L->getHeader()->phis()) {
3217
3.57k
      if (!isCompatibleIVType(&Phi, IVSrc))
3218
1.32k
        continue;
3219
2.24k
      Instruction *PostIncV = dyn_cast<Instruction>(
3220
2.24k
          Phi.getIncomingValueForBlock(L->getLoopLatch()));
3221
2.24k
      if (!PostIncV || (SE.getSCEV(PostIncV) != SE.getSCEV(IVSrc)))
3222
933
        continue;
3223
1.30k
      Value *IVOper = IVSrc;
3224
1.30k
      Type *PostIncTy = PostIncV->getType();
3225
1.30k
      if (IVTy != PostIncTy) {
3226
762
        assert(PostIncTy->isPointerTy() && "mixing int/ptr IV types");
3227
762
        IRBuilder<> Builder(L->getLoopLatch()->getTerminator());
3228
762
        Builder.SetCurrentDebugLocation(PostIncV->getDebugLoc());
3229
762
        IVOper = Builder.CreatePointerCast(IVSrc, PostIncTy, "lsr.chain");
3230
762
      }
3231
1.30k
      Phi.replaceUsesOfWith(PostIncV, IVOper);
3232
1.30k
      DeadInsts.emplace_back(PostIncV);
3233
1.30k
    }
3234
1.30k
  }
3235
1.30k
}
3236
3237
103k
void LSRInstance::CollectFixupsAndInitialFormulae() {
3238
103k
  BranchInst *ExitBranch = nullptr;
3239
103k
  bool SaveCmp = TTI.canSaveCmp(L, &ExitBranch, &SE, &LI, &DT, &AC, &LibInfo);
3240
103k
3241
502k
  for (const IVStrideUse &U : IU) {
3242
502k
    Instruction *UserInst = U.getUser();
3243
502k
    // Skip IV users that are part of profitable IV Chains.
3244
502k
    User::op_iterator UseI =
3245
502k
        find(UserInst->operands(), U.getOperandValToReplace());
3246
502k
    assert(UseI != UserInst->op_end() && "cannot find IV operand");
3247
502k
    if (IVIncSet.count(UseI)) {
3248
6.09k
      LLVM_DEBUG(dbgs() << "Use is in profitable chain: " << **UseI << '\n');
3249
6.09k
      continue;
3250
6.09k
    }
3251
496k
3252
496k
    LSRUse::KindType Kind = LSRUse::Basic;
3253
496k
    MemAccessTy AccessTy;
3254
496k
    if (isAddressUse(TTI, UserInst, U.getOperandValToReplace())) {
3255
333k
      Kind = LSRUse::Address;
3256
333k
      AccessTy = getAccessType(TTI, UserInst, U.getOperandValToReplace());
3257
333k
    }
3258
496k
3259
496k
    const SCEV *S = IU.getExpr(U);
3260
496k
    PostIncLoopSet TmpPostIncLoops = U.getPostIncLoops();
3261
496k
3262
496k
    // Equality (== and !=) ICmps are special. We can rewrite (i == N) as
3263
496k
    // (N - i == 0), and this allows (N - i) to be the expression that we work
3264
496k
    // with rather than just N or i, so we can consider the register
3265
496k
    // requirements for both N and i at the same time. Limiting this code to
3266
496k
    // equality icmps is not a problem because all interesting loops use
3267
496k
    // equality icmps, thanks to IndVarSimplify.
3268
496k
    if (ICmpInst *CI = dyn_cast<ICmpInst>(UserInst))
3269
105k
      if (CI->isEquality()) {
3270
68.3k
        // If CI can be saved in some target, like replaced inside hardware loop
3271
68.3k
        // in PowerPC, no need to generate initial formulae for it.
3272
68.3k
        if (SaveCmp && CI == dyn_cast<ICmpInst>(ExitBranch->getCondition()))
3273
147
          continue;
3274
68.2k
        // Swap the operands if needed to put the OperandValToReplace on the
3275
68.2k
        // left, for consistency.
3276
68.2k
        Value *NV = CI->getOperand(1);
3277
68.2k
        if (NV == U.getOperandValToReplace()) {
3278
489
          CI->setOperand(1, CI->getOperand(0));
3279
489
          CI->setOperand(0, NV);
3280
489
          NV = CI->getOperand(1);
3281
489
          Changed = true;
3282
489
        }
3283
68.2k
3284
68.2k
        // x == y  -->  x - y == 0
3285
68.2k
        const SCEV *N = SE.getSCEV(NV);
3286
68.2k
        if (SE.isLoopInvariant(N, L) && isSafeToExpand(N, SE)) {
3287
67.3k
          // S is normalized, so normalize N before folding it into S
3288
67.3k
          // to keep the result normalized.
3289
67.3k
          N = normalizeForPostIncUse(N, TmpPostIncLoops, SE);
3290
67.3k
          Kind = LSRUse::ICmpZero;
3291
67.3k
          S = SE.getMinusSCEV(N, S);
3292
67.3k
        }
3293
68.2k
3294
68.2k
        // -1 and the negations of all interesting strides (except the negation
3295
68.2k
        // of -1) are now also interesting.
3296
142k
        for (size_t i = 0, e = Factors.size(); i != e; 
++i73.8k
)
3297
73.8k
          if (Factors[i] != -1)
3298
71.0k
            Factors.insert(-(uint64_t)Factors[i]);
3299
68.2k
        Factors.insert(-1);
3300
68.2k
      }
3301
496k
3302
496k
    // Get or create an LSRUse.
3303
496k
    std::pair<size_t, int64_t> P = getUse(S, Kind, AccessTy);
3304
496k
    size_t LUIdx = P.first;
3305
496k
    int64_t Offset = P.second;
3306
496k
    LSRUse &LU = Uses[LUIdx];
3307
496k
3308
496k
    // Record the fixup.
3309
496k
    LSRFixup &LF = LU.getNewFixup();
3310
496k
    LF.UserInst = UserInst;
3311
496k
    LF.OperandValToReplace = U.getOperandValToReplace();
3312
496k
    LF.PostIncLoops = TmpPostIncLoops;
3313
496k
    LF.Offset = Offset;
3314
496k
    LU.AllFixupsOutsideLoop &= LF.isUseFullyOutsideLoop(L);
3315
496k
3316
496k
    if (!LU.WidestFixupType ||
3317
496k
        SE.getTypeSizeInBits(LU.WidestFixupType) <
3318
50.0k
        SE.getTypeSizeInBits(LF.OperandValToReplace->getType()))
3319
446k
      LU.WidestFixupType = LF.OperandValToReplace->getType();
3320
496k
3321
496k
    // If this is the first use of this LSRUse, give it a formula.
3322
496k
    if (LU.Formulae.empty()) {
3323
446k
      InsertInitialFormula(S, LU, LUIdx);
3324
446k
      CountRegisters(LU.Formulae.back(), LUIdx);
3325
446k
    }
3326
496k
  }
3327
103k
3328
103k
  LLVM_DEBUG(print_fixups(dbgs()));
3329
103k
}
3330
3331
/// Insert a formula for the given expression into the given use, separating out
3332
/// loop-variant portions from loop-invariant and loop-computable portions.
3333
void
3334
446k
LSRInstance::InsertInitialFormula(const SCEV *S, LSRUse &LU, size_t LUIdx) {
3335
446k
  // Mark uses whose expressions cannot be expanded.
3336
446k
  if (!isSafeToExpand(S, SE))
3337
83
    LU.RigidFormula = true;
3338
446k
3339
446k
  Formula F;
3340
446k
  F.initialMatch(S, L, SE);
3341
446k
  bool Inserted = InsertFormula(LU, LUIdx, F);
3342
446k
  assert(Inserted && "Initial formula already exists!"); (void)Inserted;
3343
446k
}
3344
3345
/// Insert a simple single-register formula for the given expression into the
3346
/// given use.
3347
void
3348
LSRInstance::InsertSupplementalFormula(const SCEV *S,
3349
9.86k
                                       LSRUse &LU, size_t LUIdx) {
3350
9.86k
  Formula F;
3351
9.86k
  F.BaseRegs.push_back(S);
3352
9.86k
  F.HasBaseReg = true;
3353
9.86k
  bool Inserted = InsertFormula(LU, LUIdx, F);
3354
9.86k
  assert(Inserted && "Supplemental formula already exists!"); (void)Inserted;
3355
9.86k
}
3356
3357
/// Note which registers are used by the given formula, updating RegUses.
3358
4.55M
void LSRInstance::CountRegisters(const Formula &F, size_t LUIdx) {
3359
4.55M
  if (F.ScaledReg)
3360
2.94M
    RegUses.countRegister(F.ScaledReg, LUIdx);
3361
4.55M
  for (const SCEV *BaseReg : F.BaseRegs)
3362
5.61M
    RegUses.countRegister(BaseReg, LUIdx);
3363
4.55M
}
3364
3365
/// If the given formula has not yet been inserted, add it to the list, and
3366
/// return true. Return false otherwise.
3367
12.5M
bool LSRInstance::InsertFormula(LSRUse &LU, unsigned LUIdx, const Formula &F) {
3368
12.5M
  // Do not insert formula that we will not be able to expand.
3369
12.5M
  assert(isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind, LU.AccessTy, F) &&
3370
12.5M
         "Formula is illegal");
3371
12.5M
3372
12.5M
  if (!LU.InsertFormula(F, *L))
3373
8.42M
    return false;
3374
4.10M
3375
4.10M
  CountRegisters(F, LUIdx);
3376
4.10M
  return true;
3377
4.10M
}
3378
3379
/// Check for other uses of loop-invariant values which we're tracking. These
3380
/// other uses will pin these values in registers, making them less profitable
3381
/// for elimination.
3382
/// TODO: This currently misses non-constant addrec step registers.
3383
/// TODO: Should this give more weight to users inside the loop?
3384
void
3385
103k
LSRInstance::CollectLoopInvariantFixupsAndFormulae() {
3386
103k
  SmallVector<const SCEV *, 8> Worklist(RegUses.begin(), RegUses.end());
3387
103k
  SmallPtrSet<const SCEV *, 32> Visited;
3388
103k
3389
2.19M
  while (!Worklist.empty()) {
3390
2.08M
    const SCEV *S = Worklist.pop_back_val();
3391
2.08M
3392
2.08M
    // Don't process the same SCEV twice
3393
2.08M
    if (!Visited.insert(S).second)
3394
565k
      continue;
3395
1.52M
3396
1.52M
    if (const SCEVNAryExpr *N = dyn_cast<SCEVNAryExpr>(S))
3397
752k
      Worklist.append(N->op_begin(), N->op_end());
3398
769k
    else if (const SCEVCastExpr *C = dyn_cast<SCEVCastExpr>(S))
3399
57.5k
      Worklist.push_back(C->getOperand());
3400
711k
    else if (const SCEVUDivExpr *D = dyn_cast<SCEVUDivExpr>(S)) {
3401
11.5k
      Worklist.push_back(D->getLHS());
3402
11.5k
      Worklist.push_back(D->getRHS());
3403
700k
    } else if (const SCEVUnknown *US = dyn_cast<SCEVUnknown>(S)) {
3404
213k
      const Value *V = US->getValue();
3405
213k
      if (const Instruction *Inst = dyn_cast<Instruction>(V)) {
3406
140k
        // Look for instructions defined outside the loop.
3407
140k
        if (L->contains(Inst)) continue;
3408
72.7k
      } else if (isa<UndefValue>(V))
3409
100
        // Undef doesn't have a live range, so it doesn't matter.
3410
100
        continue;
3411
2.57M
      for (const Use &U : V->uses()) {
3412
2.57M
        const Instruction *UserInst = dyn_cast<Instruction>(U.getUser());
3413
2.57M
        // Ignore non-instructions.
3414
2.57M
        if (!UserInst)
3415
278k
          continue;
3416
2.29M
        // Ignore instructions in other functions (as can happen with
3417
2.29M
        // Constants).
3418
2.29M
        if (UserInst->getParent()->getParent() != L->getHeader()->getParent())
3419
328k
          continue;
3420
1.96M
        // Ignore instructions not dominated by the loop.
3421
1.96M
        const BasicBlock *UseBB = !isa<PHINode>(UserInst) ?
3422
1.88M
          UserInst->getParent() :
3423
1.96M
          cast<PHINode>(UserInst)->getIncomingBlock(
3424
86.4k
            PHINode::getIncomingValueNumForOperand(U.getOperandNo()));
3425
1.96M
        if (!DT.dominates(L->getHeader(), UseBB))
3426
1.82M
          continue;
3427
143k
        // Don't bother if the instruction is in a BB which ends in an EHPad.
3428
143k
        if (UseBB->getTerminator()->isEHPad())
3429
2
          continue;
3430
143k
        // Don't bother rewriting PHIs in catchswitch blocks.
3431
143k
        if (isa<CatchSwitchInst>(UserInst->getParent()->getTerminator()))
3432
1
          continue;
3433
143k
        // Ignore uses which are part of other SCEV expressions, to avoid
3434
143k
        // analyzing them multiple times.
3435
143k
        if (SE.isSCEVable(UserInst->getType())) {
3436
142k
          const SCEV *UserS = SE.getSCEV(const_cast<Instruction *>(UserInst));
3437
142k
          // If the user is a no-op, look through to its uses.
3438
142k
          if (!isa<SCEVUnknown>(UserS))
3439
124k
            continue;
3440
18.1k
          if (UserS == US) {
3441
355
            Worklist.push_back(
3442
355
              SE.getUnknown(const_cast<Instruction *>(UserInst)));
3443
355
            continue;
3444
355
          }
3445
19.0k
        }
3446
19.0k
        // Ignore icmp instructions which are already being analyzed.
3447
19.0k
        if (const ICmpInst *ICI = dyn_cast<ICmpInst>(UserInst)) {
3448
10.2k
          unsigned OtherIdx = !U.getOperandNo();
3449
10.2k
          Value *OtherOp = const_cast<Value *>(ICI->getOperand(OtherIdx));
3450
10.2k
          if (SE.hasComputableLoopEvolution(SE.getSCEV(OtherOp), L))
3451
9.23k
            continue;
3452
9.86k
        }
3453
9.86k
3454
9.86k
        std::pair<size_t, int64_t> P = getUse(
3455
9.86k
            S, LSRUse::Basic, MemAccessTy());
3456
9.86k
        size_t LUIdx = P.first;
3457
9.86k
        int64_t Offset = P.second;
3458
9.86k
        LSRUse &LU = Uses[LUIdx];
3459
9.86k
        LSRFixup &LF = LU.getNewFixup();
3460
9.86k
        LF.UserInst = const_cast<Instruction *>(UserInst);
3461
9.86k
        LF.OperandValToReplace = U;
3462
9.86k
        LF.Offset = Offset;
3463
9.86k
        LU.AllFixupsOutsideLoop &= LF.isUseFullyOutsideLoop(L);
3464
9.86k
        if (!LU.WidestFixupType ||
3465
9.86k
            SE.getTypeSizeInBits(LU.WidestFixupType) <
3466
0
            SE.getTypeSizeInBits(LF.OperandValToReplace->getType()))
3467
9.86k
          LU.WidestFixupType = LF.OperandValToReplace->getType();
3468
9.86k
        InsertSupplementalFormula(US, LU, LUIdx);
3469
9.86k
        CountRegisters(LU.Formulae.back(), Uses.size() - 1);
3470
9.86k
        break;
3471
9.86k
      }
3472
188k
    }
3473
1.52M
  }
3474
103k
}
3475
3476
/// Split S into subexpressions which can be pulled out into separate
3477
/// registers. If C is non-null, multiply each subexpression by C.
3478
///
3479
/// Return remainder expression after factoring the subexpressions captured by
3480
/// Ops. If Ops is complete, return NULL.
3481
static const SCEV *CollectSubexprs(const SCEV *S, const SCEVConstant *C,
3482
                                   SmallVectorImpl<const SCEV *> &Ops,
3483
                                   const Loop *L,
3484
                                   ScalarEvolution &SE,
3485
7.04M
                                   unsigned Depth = 0) {
3486
7.04M
  // Arbitrarily cap recursion to protect compile time.
3487
7.04M
  if (Depth >= 3)
3488
241k
    return S;
3489
6.79M
3490
6.79M
  if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(S)) {
3491
825k
    // Break out add operands.
3492
1.80M
    for (const SCEV *S : Add->operands()) {
3493
1.80M
      const SCEV *Remainder = CollectSubexprs(S, C, Ops, L, SE, Depth+1);
3494
1.80M
      if (Remainder)
3495
1.47M
        Ops.push_back(C ? SE.getMulExpr(C, Remainder) : Remainder);
3496
1.80M
    }
3497
825k
    return nullptr;
3498
5.97M
  } else if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S)) {
3499
1.97M
    // Split a non-zero base out of an addrec.
3500
1.97M
    if (AR->getStart()->isZero() || !AR->isAffine())
3501
904k
      return S;
3502
1.07M
3503
1.07M
    const SCEV *Remainder = CollectSubexprs(AR->getStart(),
3504
1.07M
                                            C, Ops, L, SE, Depth+1);
3505
1.07M
    // Split the non-zero AddRec unless it is part of a nested recurrence that
3506
1.07M
    // does not pertain to this loop.
3507
1.07M
    if (Remainder && (AR->getLoop() == L || !isa<SCEVAddRecExpr>(Remainder))) {
3508
555k
      Ops.push_back(C ? SE.getMulExpr(C, Remainder) : Remainder);
3509
555k
      Remainder = nullptr;
3510
555k
    }
3511
1.07M
    if (Remainder != AR->getStart()) {
3512
1.07M
      if (!Remainder)
3513
1.07M
        Remainder = SE.getConstant(AR->getType(), 0);
3514
1.07M
      return SE.getAddRecExpr(Remainder,
3515
1.07M
                              AR->getStepRecurrence(SE),
3516
1.07M
                              AR->getLoop(),
3517
1.07M
                              //FIXME: AR->getNoWrapFlags(SCEV::FlagNW)
3518
1.07M
                              SCEV::FlagAnyWrap);
3519
1.07M
    }
3520
3.99M
  } else if (const SCEVMulExpr *Mul = dyn_cast<SCEVMulExpr>(S)) {
3521
697k
    // Break (C * (a + b + c)) into C*a + C*b + C*c.
3522
697k
    if (Mul->getNumOperands() != 2)
3523
3.22k
      return S;
3524
693k
    if (const SCEVConstant *Op0 =
3525
686k
        dyn_cast<SCEVConstant>(Mul->getOperand(0))) {
3526
686k
      C = C ? cast<SCEVConstant>(SE.getMulExpr(C, Op0)) : Op0;
3527
686k
      const SCEV *Remainder =
3528
686k
        CollectSubexprs(Mul->getOperand(1), C, Ops, L, SE, Depth+1);
3529
686k
      if (Remainder)
3530
682k
        Ops.push_back(SE.getMulExpr(C, Remainder));
3531
686k
      return nullptr;
3532
686k
    }
3533
3.30M
  }
3534
3.30M
  return S;
3535
3.30M
}
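// --- Editor's worked example for CollectSubexprs (hedged sketch) ---
// Assume the incoming expression is the affine addrec {(%a + 16),+,4}<%L>
// with C == nullptr. The start (%a + 16) is an add, so its operands %a and 16
// are pushed onto Ops as separate candidate registers, and the function
// returns the stripped recurrence {0,+,4}<%L> as the remainder, i.e. the part
// that still varies with the loop and may deserve its own (scaled) register.
// --- end of editor's example ---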
3536
3537
/// Return true if the SCEV represents a value that may end up as a
3538
/// post-increment operation.
3539
static bool mayUsePostIncMode(const TargetTransformInfo &TTI,
3540
                              LSRUse &LU, const SCEV *S, const Loop *L,
3541
3.63k
                              ScalarEvolution &SE) {
3542
3.63k
  if (LU.Kind != LSRUse::Address ||
3543
3.63k
      !LU.AccessTy.getType()->isIntOrIntVectorTy())
3544
1.54k
    return false;
3545
2.08k
  const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S);
3546
2.08k
  if (!AR)
3547
755
    return false;
3548
1.32k
  const SCEV *LoopStep = AR->getStepRecurrence(SE);
3549
1.32k
  if (!isa<SCEVConstant>(LoopStep))
3550
156
    return false;
3551
1.17k
  if (LU.AccessTy.getType()->getScalarSizeInBits() !=
3552
1.17k
      LoopStep->getType()->getScalarSizeInBits())
3553
878
    return false;
3554
293
  // Check if a post-indexed load/store can be used.
3555
293
  if (TTI.isIndexedLoadLegal(TTI.MIM_PostInc, AR->getType()) ||
3556
293
      TTI.isIndexedStoreLegal(TTI.MIM_PostInc, AR->getType())) {
3557
293
    const SCEV *LoopStart = AR->getStart();
3558
293
    if (!isa<SCEVConstant>(LoopStart) && SE.isLoopInvariant(LoopStart, L))
3559
262
      return true;
3560
31
  }
3561
31
  return false;
3562
31
}
3563
3564
/// Helper function for LSRInstance::GenerateReassociations.
3565
void LSRInstance::GenerateReassociationsImpl(LSRUse &LU, unsigned LUIdx,
3566
                                             const Formula &Base,
3567
                                             unsigned Depth, size_t Idx,
3568
3.47M
                                             bool IsScaledReg) {
3569
3.47M
  const SCEV *BaseReg = IsScaledReg ? Base.ScaledReg : Base.BaseRegs[Idx];
3570
3.47M
  // Don't generate reassociations for the base register of a value that
3571
3.47M
  // may generate a post-increment operator. The reason is that the
3572
3.47M
  // reassociations cause extra base+register formula to be created,
3573
3.47M
  // and possibly chosen, but the post-increment is more efficient.
3574
3.47M
  if (TTI.shouldFavorPostInc() && mayUsePostIncMode(TTI, LU, BaseReg, L, SE))
3575
242
    return;
3576
3.47M
  SmallVector<const SCEV *, 8> AddOps;
3577
3.47M
  const SCEV *Remainder = CollectSubexprs(BaseReg, nullptr, AddOps, L, SE);
3578
3.47M
  if (Remainder)
3579
2.80M
    AddOps.push_back(Remainder);
3580
3.47M
3581
3.47M
  if (AddOps.size() == 1)
3582
2.07M
    return;
3583
1.40M
3584
1.40M
  for (SmallVectorImpl<const SCEV *>::const_iterator J = AddOps.begin(),
3585
1.40M
                                                     JE = AddOps.end();
3586
4.85M
       J != JE; ++J) {
3587
3.45M
    // Loop-variant "unknown" values are uninteresting; we won't be able to
3588
3.45M
    // do anything meaningful with them.
3589
3.45M
    if (isa<SCEVUnknown>(*J) && !SE.isLoopInvariant(*J, L))
3590
1.52k
      continue;
3591
3.45M
3592
3.45M
    // Don't pull a constant into a register if the constant could be folded
3593
3.45M
    // into an immediate field.
3594
3.45M
    if (isAlwaysFoldable(TTI, SE, LU.MinOffset, LU.MaxOffset, LU.Kind,
3595
3.45M
                         LU.AccessTy, *J, Base.getNumRegs() > 1))
3596
213k
      continue;
3597
3.23M
3598
3.23M
    // Collect all operands except *J.
3599
3.23M
    SmallVector<const SCEV *, 8> InnerAddOps(
3600
3.23M
        ((const SmallVector<const SCEV *, 8> &)AddOps).begin(), J);
3601
3.23M
    InnerAddOps.append(std::next(J),
3602
3.23M
                       ((const SmallVector<const SCEV *, 8> &)AddOps).end());
3603
3.23M
3604
3.23M
    // Don't leave just a constant behind in a register if the constant could
3605
3.23M
    // be folded into an immediate field.
3606
3.23M
    if (InnerAddOps.size() == 1 &&
3607
3.23M
        isAlwaysFoldable(TTI, SE, LU.MinOffset, LU.MaxOffset, LU.Kind,
3608
1.74M
                         LU.AccessTy, InnerAddOps[0], Base.getNumRegs() > 1))
3609
10.0k
      continue;
3610
3.22M
3611
3.22M
    const SCEV *InnerSum = SE.getAddExpr(InnerAddOps);
3612
3.22M
    if (InnerSum->isZero())
3613
1
      continue;
3614
3.22M
    Formula F = Base;
3615
3.22M
3616
3.22M
    // Add the remaining pieces of the add back into the new formula.
3617
3.22M
    const SCEVConstant *InnerSumSC = dyn_cast<SCEVConstant>(InnerSum);
3618
3.22M
    if (InnerSumSC && SE.getTypeSizeInBits(InnerSumSC->getType()) <= 64 &&
3619
3.22M
        TTI.isLegalAddImmediate((uint64_t)F.UnfoldedOffset +
3620
499k
                                InnerSumSC->getValue()->getZExtValue())) {
3621
292k
      F.UnfoldedOffset =
3622
292k
          (uint64_t)F.UnfoldedOffset + InnerSumSC->getValue()->getZExtValue();
3623
292k
      if (IsScaledReg)
3624
118k
        F.ScaledReg = nullptr;
3625
174k
      else
3626
174k
        F.BaseRegs.erase(F.BaseRegs.begin() + Idx);
3627
2.93M
    } else if (IsScaledReg)
3628
1.22M
      F.ScaledReg = InnerSum;
3629
1.71M
    else
3630
1.71M
      F.BaseRegs[Idx] = InnerSum;
3631
3.22M
3632
3.22M
    // Add J as its own register, or an unfolded immediate.
3633
3.22M
    const SCEVConstant *SC = dyn_cast<SCEVConstant>(*J);
3634
3.22M
    if (SC && SE.getTypeSizeInBits(SC->getType()) <= 64 &&
3635
3.22M
        TTI.isLegalAddImmediate((uint64_t)F.UnfoldedOffset +
3636
732k
                                SC->getValue()->getZExtValue()))
3637
514k
      F.UnfoldedOffset =
3638
514k
          (uint64_t)F.UnfoldedOffset + SC->getValue()->getZExtValue();
3639
2.71M
    else
3640
2.71M
      F.BaseRegs.push_back(*J);
3641
3.22M
    // We may have changed the number of register in base regs, adjust the
3642
3.22M
    // formula accordingly.
3643
3.22M
    F.canonicalize(*L);
3644
3.22M
3645
3.22M
    if (InsertFormula(LU, LUIdx, F))
3646
1.46M
      // If that formula hadn't been seen before, recurse to find more like
3647
1.46M
      // it.
3648
1.46M
      // Add a check on Log16(AddOps.size()) (same as Log2_32(AddOps.size()) >> 2)
3649
1.46M
      // Because just Depth is not enough to bound compile time.
3650
1.46M
      // This means that every time AddOps.size() is greater than 16^x we will add
3651
1.46M
      // x to Depth.
3652
1.46M
      GenerateReassociations(LU, LUIdx, LU.Formulae.back(),
3653
1.46M
                             Depth + 1 + (Log2_32(AddOps.size()) >> 2));
3654
3.22M
  }
3655
1.40M
}
3656
3657
/// Split out subexpressions from adds and the bases of addrecs.
3658
void LSRInstance::GenerateReassociations(LSRUse &LU, unsigned LUIdx,
3659
1.92M
                                         Formula Base, unsigned Depth) {
3660
1.92M
  assert(Base.isCanonical(*L) && "Input must be in the canonical form");
3661
1.92M
  // Arbitrarily cap recursion to protect compile time.
3662
1.92M
  if (Depth >= 3)
3663
143k
    return;
3664
1.78M
3665
3.92M
  for (size_t i = 0, e = Base.BaseRegs.size(); i != e; ++i)
3666
2.14M
    GenerateReassociationsImpl(LU, LUIdx, Base, Depth, i);
3667
1.78M
3668
1.78M
  if (Base.Scale == 1)
3669
1.33M
    GenerateReassociationsImpl(LU, LUIdx, Base, Depth,
3670
1.33M
                               /* Idx */ -1, /* IsScaledReg */ true);
3671
1.78M
}
3672
3673
///  Generate a formula consisting of all of the loop-dominating registers added
3674
/// into a single register.
3675
void LSRInstance::GenerateCombinations(LSRUse &LU, unsigned LUIdx,
3676
1.92M
                                       Formula Base) {
3677
1.92M
  // This method is only interesting on a plurality of registers.
3678
1.92M
  if (Base.BaseRegs.size() + (Base.Scale == 1) +
3679
1.92M
      (Base.UnfoldedOffset != 0) <= 1)
3680
425k
    return;
3681
1.50M
3682
1.50M
  // Flatten the representation, i.e., reg1 + 1*reg2 => reg1 + reg2, before
3683
1.50M
  // processing the formula.
3684
1.50M
  Base.unscale();
3685
1.50M
  SmallVector<const SCEV *, 4> Ops;
3686
1.50M
  Formula NewBase = Base;
3687
1.50M
  NewBase.BaseRegs.clear();
3688
1.50M
  Type *CombinedIntegerType = nullptr;
3689
3.53M
  for (const SCEV *BaseReg : Base.BaseRegs) {
3690
3.53M
    if (SE.properlyDominates(BaseReg, L->getHeader()) &&
3691
3.53M
        
        !SE.hasComputableLoopEvolution(BaseReg, L)) {
3692
1.96M
      if (!CombinedIntegerType)
3693
1.43M
        CombinedIntegerType = SE.getEffectiveSCEVType(BaseReg->getType());
3694
1.96M
      Ops.push_back(BaseReg);
3695
1.96M
    }
3696
1.56M
    else
3697
1.56M
      NewBase.BaseRegs.push_back(BaseReg);
3698
3.53M
  }
3699
1.50M
3700
1.50M
  // If no register is relevant, we're done.
3701
1.50M
  if (Ops.size() == 0)
3702
68.0k
    return;
3703
1.43M
3704
1.43M
  // Utility function for generating the required variants of the combined
3705
1.43M
  // registers.
3706
1.43M
  auto GenerateFormula = [&](const SCEV *Sum) {
3707
810k
    Formula F = NewBase;
3708
810k
3709
810k
    // TODO: If Sum is zero, it probably means ScalarEvolution missed an
3710
810k
    // opportunity to fold something. For now, just ignore such cases
3711
810k
    // rather than proceed with zero in a register.
3712
810k
    if (Sum->isZero())
3713
0
      return;
3714
810k
3715
810k
    F.BaseRegs.push_back(Sum);
3716
810k
    F.canonicalize(*L);
3717
810k
    (void)InsertFormula(LU, LUIdx, F);
3718
810k
  };
3719
1.43M
3720
1.43M
  // If we collected at least two registers, generate a formula combining them.
3721
1.43M
  if (Ops.size() > 1) {
3722
487k
    SmallVector<const SCEV *, 4> OpsCopy(Ops); // Don't let SE modify Ops.
3723
487k
    GenerateFormula(SE.getAddExpr(OpsCopy));
3724
487k
  }
3725
1.43M
3726
1.43M
  // If we have an unfolded offset, generate a formula combining it with the
3727
1.43M
  // registers collected.
3728
1.43M
  if (NewBase.UnfoldedOffset) {
3729
322k
    assert(CombinedIntegerType && "Missing a type for the unfolded offset");
3730
322k
    Ops.push_back(SE.getConstant(CombinedIntegerType, NewBase.UnfoldedOffset,
3731
322k
                                 true));
3732
322k
    NewBase.UnfoldedOffset = 0;
3733
322k
    GenerateFormula(SE.getAddExpr(Ops));
3734
322k
  }
3735
1.43M
}
3736
3737
/// Helper function for LSRInstance::GenerateSymbolicOffsets.
3738
void LSRInstance::GenerateSymbolicOffsetsImpl(LSRUse &LU, unsigned LUIdx,
3739
                                              const Formula &Base, size_t Idx,
3740
4.48M
                                              bool IsScaledReg) {
3741
4.48M
  const SCEV *G = IsScaledReg ? Base.ScaledReg : Base.BaseRegs[Idx];
3742
4.48M
  GlobalValue *GV = ExtractSymbol(G, SE);
3743
4.48M
  if (G->isZero() || !GV)
3744
4.26M
    return;
3745
222k
  Formula F = Base;
3746
222k
  F.BaseGV = GV;
3747
222k
  if (!isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind, LU.AccessTy, F))
3748
222k
    return;
3749
174
  if (IsScaledReg)
3750
10
    F.ScaledReg = G;
3751
164
  else
3752
164
    F.BaseRegs[Idx] = G;
3753
174
  (void)InsertFormula(LU, LUIdx, F);
3754
174
}
3755
3756
/// Generate reuse formulae using symbolic offsets.
3757
void LSRInstance::GenerateSymbolicOffsets(LSRUse &LU, unsigned LUIdx,
3758
2.18M
                                          Formula Base) {
3759
2.18M
  // We can't add a symbolic offset if the address already contains one.
3760
2.18M
  if (Base.BaseGV) return;
3761
2.18M
3762
4.93M
  
  for (size_t i = 0, e = Base.BaseRegs.size(); i != e; ++i)
3763
2.74M
    GenerateSymbolicOffsetsImpl(LU, LUIdx, Base, i);
3764
2.18M
  if (Base.Scale == 1)
3765
1.73M
    GenerateSymbolicOffsetsImpl(LU, LUIdx, Base, /* Idx */ -1,
3766
1.73M
                                /* IsScaledReg */ true);
3767
2.18M
}
3768
3769
/// Helper function for LSRInstance::GenerateConstantOffsets.
3770
void LSRInstance::GenerateConstantOffsetsImpl(
3771
    LSRUse &LU, unsigned LUIdx, const Formula &Base,
3772
4.48M
    const SmallVectorImpl<int64_t> &Worklist, size_t Idx, bool IsScaledReg) {
3773
4.48M
3774
4.52M
  auto GenerateOffset = [&](const SCEV *G, int64_t Offset) {
3775
4.52M
    Formula F = Base;
3776
4.52M
    F.BaseOffset = (uint64_t)Base.BaseOffset - Offset;
3777
4.52M
3778
4.52M
    if (isLegalUse(TTI, LU.MinOffset - Offset, LU.MaxOffset - Offset, LU.Kind,
3779
4.52M
                   LU.AccessTy, F)) {
3780
4.52M
      // Add the offset to the base register.
3781
4.52M
      const SCEV *NewG = SE.getAddExpr(SE.getConstant(G->getType(), Offset), G);
3782
4.52M
      // If it cancelled out, drop the base register, otherwise update it.
3783
4.52M
      if (NewG->isZero()) {
3784
1
        if (IsScaledReg) {
3785
0
          F.Scale = 0;
3786
0
          F.ScaledReg = nullptr;
3787
0
        } else
3788
1
          F.deleteBaseReg(F.BaseRegs[Idx]);
3789
1
        F.canonicalize(*L);
3790
4.52M
      } else if (IsScaledReg)
3791
1.75M
        F.ScaledReg = NewG;
3792
2.77M
      else
3793
2.77M
        F.BaseRegs[Idx] = NewG;
3794
4.52M
3795
4.52M
      (void)InsertFormula(LU, LUIdx, F);
3796
4.52M
    }
3797
4.52M
  };
3798
4.48M
3799
4.48M
  const SCEV *G = IsScaledReg ? Base.ScaledReg : Base.BaseRegs[Idx];
3800
4.48M
3801
4.48M
  // With constant offsets and constant steps, we can generate pre-inc
3802
4.48M
  // accesses by having the offset equal the step. So, for access #0 with a
3803
4.48M
  // step of 8, we generate a G - 8 base which would require the first access
3804
4.48M
  // to be ((G - 8) + 8),+,8. The pre-indexed access then updates the pointer
3805
4.48M
  // for itself and hopefully becomes the base for other accesses. This means
3806
4.48M
  // that a single pre-indexed access can be generated to become the new
3807
4.48M
  // base pointer for each iteration of the loop, resulting in no extra add/sub
3808
4.48M
  // instructions for pointer updating.
3809
4.48M
  if (FavorBackedgeIndex && LU.Kind == LSRUse::Address) {
3810
7.28k
    if (auto *GAR = dyn_cast<SCEVAddRecExpr>(G)) {
3811
3.95k
      if (auto *StepRec =
3812
3.90k
          dyn_cast<SCEVConstant>(GAR->getStepRecurrence(SE))) {
3813
3.90k
        const APInt &StepInt = StepRec->getAPInt();
3814
3.90k
        int64_t Step = StepInt.isNegative() ?
3815
2.95k
          
          StepInt.getSExtValue() : StepInt.getZExtValue();
3816
3.90k
3817
3.90k
        for (int64_t Offset : Worklist) {
3818
3.90k
          Offset -= Step;
3819
3.90k
          GenerateOffset(G, Offset);
3820
3.90k
        }
3821
3.90k
      }
3822
3.95k
    }
3823
7.28k
  }
3824
4.48M
  for (int64_t Offset : Worklist)
3825
4.52M
    GenerateOffset(G, Offset);
3826
4.48M
3827
4.48M
  int64_t Imm = ExtractImmediate(G, SE);
3828
4.48M
  if (G->isZero() || Imm == 0)
3829
3.28M
    return;
3830
1.20M
  Formula F = Base;
3831
1.20M
  F.BaseOffset = (uint64_t)F.BaseOffset + Imm;
3832
1.20M
  if (!isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind, LU.AccessTy, F))
3833
387k
    return;
3834
817k
  if (IsScaledReg)
3835
324k
    F.ScaledReg = G;
3836
493k
  else
3837
493k
    F.BaseRegs[Idx] = G;
3838
817k
  (void)InsertFormula(LU, LUIdx, F);
3839
817k
}
3840
3841
/// GenerateConstantOffsets - Generate reuse formulae using constant offsets.
3842
void LSRInstance::GenerateConstantOffsets(LSRUse &LU, unsigned LUIdx,
3843
2.18M
                                          Formula Base) {
3844
2.18M
  // TODO: For now, just add the min and max offset, because it usually isn't
3845
2.18M
  // worthwhile looking at everything inbetween.
3846
2.18M
  SmallVector<int64_t, 2> Worklist;
3847
2.18M
  Worklist.push_back(LU.MinOffset);
3848
2.18M
  if (LU.MaxOffset != LU.MinOffset)
3849
17.2k
    Worklist.push_back(LU.MaxOffset);
3850
2.18M
3851
4.93M
  for (size_t i = 0, e = Base.BaseRegs.size(); i != e; 
++i2.74M
)
3852
2.74M
    GenerateConstantOffsetsImpl(LU, LUIdx, Base, Worklist, i);
3853
2.18M
  if (Base.Scale == 1)
3854
1.73M
    GenerateConstantOffsetsImpl(LU, LUIdx, Base, Worklist, /* Idx */ -1,
3855
1.73M
                                /* IsScaledReg */ true);
3856
2.18M
}
3857
3858
/// For ICmpZero, check to see if we can scale up the comparison. For example, x
3859
/// == y -> x*c == y*c.
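// A hand-written illustration (not output of this pass) of the rewrite that
// GenerateICmpZeroScales enables: a compare against zero is scale-invariant, so
// the exit test can be multiplied by the same stride the addressing wants,
// letting one scaled induction variable feed both. The function names are
// invented for this sketch and sizeof(int) == 4 is assumed.
long sumCounterIV(const int *p, long n) {
  long sum = 0;
  for (long i = -n; i != 0; ++i)   // counter IV, compared against zero
    sum += p[i + n];               // address still needs (i + n) scaled by 4
  return sum;
}

long sumScaledIV(const int *p, long n) {
  long sum = 0;
  const char *end = reinterpret_cast<const char *>(p) + 4 * n;
  for (long iv = -4 * n; iv != 0; iv += 4)   // same zero test, scaled by 4
    sum += *reinterpret_cast<const int *>(end + iv);
  return sum;
}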
3860
void LSRInstance::GenerateICmpZeroScales(LSRUse &LU, unsigned LUIdx,
3861
2.40M
                                         Formula Base) {
3862
2.40M
  if (LU.Kind != LSRUse::ICmpZero) return;
3863
195k
3864
195k
  // Determine the integer type for the base formula.
3865
195k
  Type *IntTy = Base.getType();
3866
195k
  if (!IntTy) return;
3867
195k
  if (SE.getTypeSizeInBits(IntTy) > 64) return;
3868
195k
3869
195k
  // Don't do this if there is more than one offset.
3870
195k
  if (LU.MinOffset != LU.MaxOffset) return;
3871
195k
3872
195k
  // Check if transformation is valid. It is illegal to multiply pointer.
3873
195k
  if (Base.ScaledReg && Base.ScaledReg->getType()->isPointerTy())
3874
6.64k
    return;
3875
188k
  for (const SCEV *BaseReg : Base.BaseRegs)
3876
209k
    if (BaseReg->getType()->isPointerTy())
3877
25.3k
      return;
3878
188k
  assert(!Base.BaseGV && "ICmpZero use is not legal!");
3879
163k
3880
163k
  // Check each interesting stride.
3881
492k
  for (int64_t Factor : Factors) {
3882
492k
    // Check that the multiplication doesn't overflow.
3883
492k
    if (Base.BaseOffset == std::numeric_limits<int64_t>::min() && Factor == -1)
3884
0
      continue;
3885
492k
    int64_t NewBaseOffset = (uint64_t)Base.BaseOffset * Factor;
3886
492k
    if (NewBaseOffset / Factor != Base.BaseOffset)
3887
2
      continue;
3888
492k
    // If the offset will be truncated at this use, check that it is in bounds.
3889
492k
    if (!IntTy->isPointerTy() &&
3890
492k
        !ConstantInt::isValueValidForType(IntTy, NewBaseOffset))
3891
139
      continue;
3892
492k
3893
492k
    // Check that multiplying with the use offset doesn't overflow.
3894
492k
    int64_t Offset = LU.MinOffset;
3895
492k
    if (Offset == std::numeric_limits<int64_t>::min() && Factor == -1)
3896
0
      continue;
3897
492k
    Offset = (uint64_t)Offset * Factor;
3898
492k
    if (Offset / Factor != LU.MinOffset)
3899
0
      continue;
3900
492k
    // If the offset will be truncated at this use, check that it is in bounds.
3901
492k
    if (!IntTy->isPointerTy() &&
3902
492k
        !ConstantInt::isValueValidForType(IntTy, Offset))
3903
0
      continue;
3904
492k
3905
492k
    Formula F = Base;
3906
492k
    F.BaseOffset = NewBaseOffset;
3907
492k
3908
492k
    // Check that this scale is legal.
3909
492k
    if (!isLegalUse(TTI, Offset, Offset, LU.Kind, LU.AccessTy, F))
3910
865
      continue;
3911
491k
3912
491k
    // Compensate for the use having MinOffset built into it.
3913
491k
    F.BaseOffset = (uint64_t)F.BaseOffset + Offset - LU.MinOffset;
3914
491k
3915
491k
    const SCEV *FactorS = SE.getConstant(IntTy, Factor);
3916
491k
3917
491k
    // Check that multiplying with each base register doesn't overflow.
3918
810k
    for (size_t i = 0, e = F.BaseRegs.size(); i != e; ++i) {
3919
515k
      F.BaseRegs[i] = SE.getMulExpr(F.BaseRegs[i], FactorS);
3920
515k
      if (getExactSDiv(F.BaseRegs[i], FactorS, SE) != Base.BaseRegs[i])
3921
196k
        goto next;
3922
515k
    }
3923
491k
3924
491k
    // Check that multiplying with the scaled register doesn't overflow.
3925
491k
    
    if (F.ScaledReg) {
3926
145k
      F.ScaledReg = SE.getMulExpr(F.ScaledReg, FactorS);
3927
145k
      if (getExactSDiv(F.ScaledReg, FactorS, SE) != Base.ScaledReg)
3928
27.0k
        continue;
3929
268k
    }
3930
268k
3931
268k
    // Check that multiplying with the unfolded offset doesn't overflow.
3932
268k
    if (F.UnfoldedOffset != 0) {
3933
7.55k
      if (F.UnfoldedOffset == std::numeric_limits<int64_t>::min() &&
3934
7.55k
          
          Factor == -1)
3935
0
        continue;
3936
7.55k
      F.UnfoldedOffset = (uint64_t)F.UnfoldedOffset * Factor;
3937
7.55k
      if (F.UnfoldedOffset / Factor != Base.UnfoldedOffset)
3938
0
        continue;
3939
7.55k
      // If the offset will be truncated, check that it is in bounds.
3940
7.55k
      if (!IntTy->isPointerTy() &&
3941
7.55k
          !ConstantInt::isValueValidForType(IntTy, F.UnfoldedOffset))
3942
193
        continue;
3943
267k
    }
3944
267k
3945
267k
    // If we make it here and it's legal, add it.
3946
267k
    (void)InsertFormula(LU, LUIdx, F);
3947
464k
  next:;
3948
464k
  }
3949
163k
}
3950
3951
/// Generate stride factor reuse formulae by making use of scaled-offset address
3952
/// modes, for example.
3953
2.64M
void LSRInstance::GenerateScales(LSRUse &LU, unsigned LUIdx, Formula Base) {
3954
2.64M
  // Determine the integer type for the base formula.
3955
2.64M
  Type *IntTy = Base.getType();
3956
2.64M
  if (!IntTy) return;
3957
2.64M
3958
2.64M
  // If this Formula already has a scaled register, we can't add another one.
3959
2.64M
  // Try to unscale the formula to generate a better scale.
3960
2.64M
  if (Base.Scale != 0 && !Base.unscale())
3961
0
    return;
3962
2.64M
3963
2.64M
  assert(Base.Scale == 0 && "unscale did not do its job!");
3964
2.64M
3965
2.64M
  // Check each interesting stride.
3966
7.17M
  for (int64_t Factor : Factors) {
3967
7.17M
    Base.Scale = Factor;
3968
7.17M
    Base.HasBaseReg = Base.BaseRegs.size() > 1;
3969
7.17M
    // Check whether this scale is going to be legal.
3970
7.17M
    if (!isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind, LU.AccessTy,
3971
7.17M
                    Base)) {
3972
5.73M
      // As a special-case, handle special out-of-loop Basic users specially.
3973
5.73M
      // TODO: Reconsider this special case.
3974
5.73M
      if (LU.Kind == LSRUse::Basic &&
3975
5.73M
          isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LSRUse::Special,
3976
264k
                     LU.AccessTy, Base) &&
3977
5.73M
          
          LU.AllFixupsOutsideLoop)
3978
24.2k
        LU.Kind = LSRUse::Special;
3979
5.71M
      else
3980
5.71M
        continue;
3981
1.46M
    }
3982
1.46M
    // For an ICmpZero, negating a solitary base register won't lead to
3983
1.46M
    // new solutions.
3984
1.46M
    if (LU.Kind == LSRUse::ICmpZero &&
3985
1.46M
        
        !Base.HasBaseReg && Base.BaseOffset == 0 && !Base.BaseGV)
3986
198k
      continue;
3987
1.26M
    // For each addrec base reg, if its loop is current loop, apply the scale.
3988
3.78M
    
    for (size_t i = 0, e = Base.BaseRegs.size(); i != e; ++i) {
3989
2.51M
      const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(Base.BaseRegs[i]);
3990
2.51M
      if (AR && (AR->getLoop() == L || LU.AllFixupsOutsideLoop)) {
3991
1.26M
        const SCEV *FactorS = SE.getConstant(IntTy, Factor);
3992
1.26M
        if (FactorS->isZero())
3993
0
          continue;
3994
1.26M
        // Divide out the factor, ignoring high bits, since we'll be
3995
1.26M
        // scaling the value back up in the end.
3996
1.26M
        if (const SCEV *Quotient = getExactSDiv(AR, FactorS, SE, true)) {
3997
1.14M
          // TODO: This could be optimized to avoid all the copying.
3998
1.14M
          Formula F = Base;
3999
1.14M
          F.ScaledReg = Quotient;
4000
1.14M
          F.deleteBaseReg(F.BaseRegs[i]);
4001
1.14M
          // The canonical representation of 1*reg is reg, which is already in
4002
1.14M
          // Base. In that case, do not try to insert the formula, it will be
4003
1.14M
          // rejected anyway.
4004
1.14M
          if (F.Scale == 1 && (F.BaseRegs.empty() ||
4005
567k
                               
                               (AR->getLoop() != L && LU.AllFixupsOutsideLoop)))
4006
155k
            continue;
4007
985k
          // If AllFixupsOutsideLoop is true and F.Scale is 1, we may generate
4008
985k
          // non canonical Formula with ScaledReg's loop not being L.
4009
985k
          if (F.Scale == 1 && LU.AllFixupsOutsideLoop)
4010
13.2k
            F.canonicalize(*L);
4011
985k
          (void)InsertFormula(LU, LUIdx, F);
4012
985k
        }
4013
1.26M
      }
4014
2.51M
    }
4015
1.26M
  }
4016
2.64M
}
4017
4018
/// Generate reuse formulae from different IV types.
4019
3.17M
void LSRInstance::GenerateTruncates(LSRUse &LU, unsigned LUIdx, Formula Base) {
4020
3.17M
  // Don't bother truncating symbolic values.
4021
3.17M
  if (Base.BaseGV) return;
4022
3.17M
4023
3.17M
  // Determine the integer type for the base formula.
4024
3.17M
  Type *DstTy = Base.getType();
4025
3.17M
  if (!DstTy) return;
4026
3.17M
  DstTy = SE.getEffectiveSCEVType(DstTy);
4027
3.17M
4028
3.17M
  for (Type *SrcTy : Types) {
4029
2.52M
    if (SrcTy != DstTy && TTI.isTruncateFree(SrcTy, DstTy)) {
4030
81.3k
      Formula F = Base;
4031
81.3k
4032
81.3k
      // Sometimes SCEV is able to prove zero during ext transform. It may
4033
81.3k
      // happen if SCEV did not do all possible transforms while creating the
4034
81.3k
      // initial node (maybe due to depth limitations), but it can do them while
4035
81.3k
      // taking ext.
4036
81.3k
      if (F.ScaledReg) {
4037
49.7k
        const SCEV *NewScaledReg = SE.getAnyExtendExpr(F.ScaledReg, SrcTy);
4038
49.7k
        if (NewScaledReg->isZero())
4039
0
         continue;
4040
49.7k
        F.ScaledReg = NewScaledReg;
4041
49.7k
      }
4042
81.3k
      bool HasZeroBaseReg = false;
4043
101k
      for (const SCEV *&BaseReg : F.BaseRegs) {
4044
101k
        const SCEV *NewBaseReg = SE.getAnyExtendExpr(BaseReg, SrcTy);
4045
101k
        if (NewBaseReg->isZero()) {
4046
5
          HasZeroBaseReg = true;
4047
5
          break;
4048
5
        }
4049
101k
        BaseReg = NewBaseReg;
4050
101k
      }
4051
81.3k
      if (HasZeroBaseReg)
4052
5
        continue;
4053
81.3k
4054
81.3k
      // TODO: This assumes we've done basic processing on all uses and
4055
81.3k
      // have an idea what the register usage is.
4056
81.3k
      if (!F.hasRegsUsedByUsesOtherThan(LUIdx, RegUses))
4057
38.9k
        continue;
4058
42.4k
4059
42.4k
      F.canonicalize(*L);
4060
42.4k
      (void)InsertFormula(LU, LUIdx, F);
4061
42.4k
    }
4062
2.52M
  }
4063
3.17M
}
4064
4065
namespace {
4066
4067
/// Helper class for GenerateCrossUseConstantOffsets. It's used to defer
4068
/// modifications so that the search phase doesn't have to worry about the data
4069
/// structures moving underneath it.
4070
struct WorkItem {
4071
  size_t LUIdx;
4072
  int64_t Imm;
4073
  const SCEV *OrigReg;
4074
4075
  WorkItem(size_t LI, int64_t I, const SCEV *R)
4076
1.62M
      : LUIdx(LI), Imm(I), OrigReg(R) {}
4077
4078
  void print(raw_ostream &OS) const;
4079
  void dump() const;
4080
};
4081
4082
} // end anonymous namespace
4083
4084
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
4085
void WorkItem::print(raw_ostream &OS) const {
4086
  OS << "in formulae referencing " << *OrigReg << " in use " << LUIdx
4087
     << " , add offset " << Imm;
4088
}
4089
4090
LLVM_DUMP_METHOD void WorkItem::dump() const {
4091
  print(errs()); errs() << '\n';
4092
}
4093
#endif
4094
4095
/// Look for registers which are a constant distance apart and try to form reuse
4096
/// opportunities between them.
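// A hand-written example (not pass output) of the reuse opportunity this routine
// searches for: the two loads below are based on registers that are a constant
// 8 bytes apart ({p,+,4} and {p+8,+,4}). Rewriting the second use as "first
// register + 8" lets both share one induction variable, with the fixed distance
// folded into an addressing-mode immediate where that is legal. sizeof(int) == 4
// is assumed.
#include <cstddef>

int sumPairs(const int *p, std::size_t n) {
  int sum = 0;
  for (std::size_t i = 0; i + 2 < n; ++i)
    sum += p[i] + p[i + 2];   // second access = same IV plus a constant offset
  return sum;
}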
4097
103k
void LSRInstance::GenerateCrossUseConstantOffsets() {
4098
103k
  // Group the registers by their value without any added constant offset.
4099
103k
  using ImmMapTy = std::map<int64_t, const SCEV *>;
4100
103k
4101
103k
  DenseMap<const SCEV *, ImmMapTy> Map;
4102
103k
  DenseMap<const SCEV *, SmallBitVector> UsedByIndicesMap;
4103
103k
  SmallVector<const SCEV *, 8> Sequence;
4104
2.23M
  for (const SCEV *Use : RegUses) {
4105
2.23M
    const SCEV *Reg = Use; // Make a copy for ExtractImmediate to modify.
4106
2.23M
    int64_t Imm = ExtractImmediate(Reg, SE);
4107
2.23M
    auto Pair = Map.insert(std::make_pair(Reg, ImmMapTy()));
4108
2.23M
    if (Pair.second)
4109
1.02M
      Sequence.push_back(Reg);
4110
2.23M
    Pair.first->second.insert(std::make_pair(Imm, Use));
4111
2.23M
    UsedByIndicesMap[Reg] |= RegUses.getUsedByIndices(Use);
4112
2.23M
  }
4113
103k
4114
103k
  // Now examine each set of registers with the same base value. Build up
4115
103k
  // a list of work to do and do the work in a separate step so that we're
4116
103k
  // not adding formulae and register counts while we're searching.
4117
103k
  SmallVector<WorkItem, 32> WorkItems;
4118
103k
  SmallSet<std::pair<size_t, int64_t>, 32> UniqueItems;
4119
1.02M
  for (const SCEV *Reg : Sequence) {
4120
1.02M
    const ImmMapTy &Imms = Map.find(Reg)->second;
4121
1.02M
4122
1.02M
    // It's not worthwhile looking for reuse if there's only one offset.
4123
1.02M
    if (Imms.size() == 1)
4124
622k
      continue;
4125
406k
4126
406k
    LLVM_DEBUG(dbgs() << "Generating cross-use offsets for " << *Reg << ':';
4127
406k
               for (const auto &Entry
4128
406k
                    : Imms) dbgs()
4129
406k
               << ' ' << Entry.first;
4130
406k
               dbgs() << '\n');
4131
406k
4132
406k
    // Examine each offset.
4133
406k
    for (ImmMapTy::const_iterator J = Imms.begin(), JE = Imms.end();
4134
2.01M
         J != JE; 
++J1.60M
) {
4135
1.60M
      const SCEV *OrigReg = J->second;
4136
1.60M
4137
1.60M
      int64_t JImm = J->first;
4138
1.60M
      const SmallBitVector &UsedByIndices = RegUses.getUsedByIndices(OrigReg);
4139
1.60M
4140
1.60M
      if (!isa<SCEVConstant>(OrigReg) &&
4141
1.60M
          
          UsedByIndicesMap[Reg].count() == 1) {
4142
364k
        LLVM_DEBUG(dbgs() << "Skipping cross-use reuse for " << *OrigReg
4143
364k
                          << '\n');
4144
364k
        continue;
4145
364k
      }
4146
1.24M
4147
1.24M
      // Conservatively examine offsets between this orig reg a few selected
4148
1.24M
      // other orig regs.
4149
1.24M
      int64_t First = Imms.begin()->first;
4150
1.24M
      int64_t Last = std::prev(Imms.end())->first;
4151
1.24M
      // Compute (First + Last)  / 2 without overflow using the fact that
4152
1.24M
      // First + Last = 2 * (First & Last) + (First ^ Last).
4153
1.24M
      int64_t Avg = (First & Last) + ((First ^ Last) >> 1);
4154
1.24M
      // If the result is negative and First is odd and Last even (or vice versa),
4155
1.24M
      // we rounded towards -inf. Add 1 in that case, to round towards 0.
4156
1.24M
      Avg = Avg + ((First ^ Last) & ((uint64_t)Avg >> 63));
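// A minimal standalone sketch (not part of this pass) of the two lines above.
// Since a + b == 2*(a & b) + (a ^ b), the midpoint can be formed without ever
// computing First + Last (which could overflow); the second line rounds the
// result toward zero instead of toward -inf when the average is negative. It
// assumes arithmetic right shift of signed values, as the code above does.
#include <cassert>
#include <cstdint>

static std::int64_t midpointNoOverflow(std::int64_t First, std::int64_t Last) {
  std::int64_t Avg = (First & Last) + ((First ^ Last) >> 1); // rounds toward -inf
  Avg += (First ^ Last) & ((std::uint64_t)Avg >> 63);        // nudge back toward 0
  return Avg;
}

int main() {
  assert(midpointNoOverflow(INT64_MAX, INT64_MAX - 2) == INT64_MAX - 1);
  assert(midpointNoOverflow(-7, 2) == -2);  // matches C++'s (-7 + 2) / 2
  return 0;
}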
4157
1.24M
      ImmMapTy::const_iterator OtherImms[] = {
4158
1.24M
          Imms.begin(), std::prev(Imms.end()),
4159
1.24M
         Imms.lower_bound(Avg)};
4160
4.97M
      for (size_t i = 0, e = array_lengthof(OtherImms); i != e; ++i) {
4161
3.72M
        ImmMapTy::const_iterator M = OtherImms[i];
4162
3.72M
        if (M == J || M == JE) continue;
4163
3.04M
4164
3.04M
        // Compute the difference between the two.
4165
3.04M
        int64_t Imm = (uint64_t)JImm - M->first;
4166
3.04M
        for (unsigned LUIdx : UsedByIndices.set_bits())
4167
4.82M
          // Make a memo of this use, offset, and register tuple.
4168
4.82M
          if (UniqueItems.insert(std::make_pair(LUIdx, Imm)).second)
4169
1.62M
            WorkItems.push_back(WorkItem(LUIdx, Imm, OrigReg));
4170
3.04M
      }
4171
1.24M
    }
4172
406k
  }
4173
103k
4174
103k
  Map.clear();
4175
103k
  Sequence.clear();
4176
103k
  UsedByIndicesMap.clear();
4177
103k
  UniqueItems.clear();
4178
103k
4179
103k
  // Now iterate through the worklist and add new formulae.
4180
1.62M
  for (const WorkItem &WI : WorkItems) {
4181
1.62M
    size_t LUIdx = WI.LUIdx;
4182
1.62M
    LSRUse &LU = Uses[LUIdx];
4183
1.62M
    int64_t Imm = WI.Imm;
4184
1.62M
    const SCEV *OrigReg = WI.OrigReg;
4185
1.62M
4186
1.62M
    Type *IntTy = SE.getEffectiveSCEVType(OrigReg->getType());
4187
1.62M
    const SCEV *NegImmS = SE.getSCEV(ConstantInt::get(IntTy, -(uint64_t)Imm));
4188
1.62M
    unsigned BitWidth = SE.getTypeSizeInBits(IntTy);
4189
1.62M
4190
1.62M
    // TODO: Use a more targeted data structure.
4191
19.5M
    for (size_t L = 0, LE = LU.Formulae.size(); L != LE; ++L) {
4192
17.9M
      Formula F = LU.Formulae[L];
4193
17.9M
      // FIXME: The code for the scaled and unscaled registers looks
4194
17.9M
      // very similar but slightly different. Investigate if they
4195
17.9M
      // could be merged. That way, we would not have to unscale the
4196
17.9M
      // Formula.
4197
17.9M
      F.unscale();
4198
17.9M
      // Use the immediate in the scaled register.
4199
17.9M
      if (F.ScaledReg == OrigReg) {
4200
242k
        int64_t Offset = (uint64_t)F.BaseOffset + Imm * (uint64_t)F.Scale;
4201
242k
        // Don't create 50 + reg(-50).
4202
242k
        if (F.referencesReg(SE.getSCEV(
4203
242k
                   ConstantInt::get(IntTy, -(uint64_t)Offset))))
4204
9.04k
          continue;
4205
233k
        Formula NewF = F;
4206
233k
        NewF.BaseOffset = Offset;
4207
233k
        if (!isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind, LU.AccessTy,
4208
233k
                        NewF))
4209
201k
          continue;
4210
31.8k
        NewF.ScaledReg = SE.getAddExpr(NegImmS, NewF.ScaledReg);
4211
31.8k
4212
31.8k
        // If the new scale is a constant in a register, and adding the constant
4213
31.8k
        // value to the immediate would produce a value closer to zero than the
4214
31.8k
        // immediate itself, then the formula isn't worthwhile.
4215
31.8k
        if (const SCEVConstant *C = dyn_cast<SCEVConstant>(NewF.ScaledReg))
4216
0
          if (C->getValue()->isNegative() != (NewF.BaseOffset < 0) &&
4217
0
              (C->getAPInt().abs() * APInt(BitWidth, F.Scale))
4218
0
                  .ule(std::abs(NewF.BaseOffset)))
4219
0
            continue;
4220
31.8k
4221
31.8k
        // OK, looks good.
4222
31.8k
        NewF.canonicalize(*this->L);
4223
31.8k
        (void)InsertFormula(LU, LUIdx, NewF);
4224
17.6M
      } else {
4225
17.6M
        // Use the immediate in a base register.
4226
49.5M
        for (size_t N = 0, NE = F.BaseRegs.size(); N != NE; ++N) {
4227
33.2M
          const SCEV *BaseReg = F.BaseRegs[N];
4228
33.2M
          if (BaseReg != OrigReg)
4229
30.9M
            continue;
4230
2.24M
          Formula NewF = F;
4231
2.24M
          NewF.BaseOffset = (uint64_t)NewF.BaseOffset + Imm;
4232
2.24M
          if (!isLegalUse(TTI, LU.MinOffset, LU.MaxOffset,
4233
2.24M
                          LU.Kind, LU.AccessTy, NewF)) {
4234
1.38M
            if (TTI.shouldFavorPostInc() &&
4235
1.38M
                
                mayUsePostIncMode(TTI, LU, OrigReg, this->L, SE))
4236
20
              continue;
4237
1.38M
            if (!TTI.isLegalAddImmediate((uint64_t)NewF.UnfoldedOffset + Imm))
4238
884k
              continue;
4239
503k
            NewF = F;
4240
503k
            NewF.UnfoldedOffset = (uint64_t)NewF.UnfoldedOffset + Imm;
4241
503k
          }
4242
2.24M
          NewF.BaseRegs[N] = SE.getAddExpr(NegImmS, BaseReg);
4243
1.35M
4244
1.35M
          // If the new formula has a constant in a register, and adding the
4245
1.35M
          // constant value to the immediate would produce a value closer to
4246
1.35M
          // zero than the immediate itself, then the formula isn't worthwhile.
4247
1.35M
          for (const SCEV *NewReg : NewF.BaseRegs)
4248
2.26M
            if (const SCEVConstant *C = dyn_cast<SCEVConstant>(NewReg))
4249
120k
              if ((C->getAPInt() + NewF.BaseOffset)
4250
120k
                      .abs()
4251
120k
                      .slt(std::abs(NewF.BaseOffset)) &&
4252
120k
                  (C->getAPInt() + NewF.BaseOffset).countTrailingZeros() >=
4253
7.68k
                      countTrailingZeros<uint64_t>(NewF.BaseOffset))
4254
813
                goto skip_formula;
4255
1.35M
4256
1.35M
          // Ok, looks good.
4257
1.35M
          NewF.canonicalize(*this->L);
4258
1.35M
          (void)InsertFormula(LU, LUIdx, NewF);
4259
1.35M
          break;
4260
813
        skip_formula:;
4261
813
        }
4262
17.6M
      }
4263
17.9M
    }
4264
1.62M
  }
4265
103k
}
4266
4267
/// Generate formulae for each use.
4268
void
4269
103k
LSRInstance::GenerateAllReuseFormulae() {
4270
103k
  // This is split into multiple loops so that hasRegsUsedByUsesOtherThan
4271
103k
  // queries are more precise.
4272
559k
  for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
4273
456k
    LSRUse &LU = Uses[LUIdx];
4274
912k
    for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i)
4275
456k
      GenerateReassociations(LU, LUIdx, LU.Formulae[i]);
4276
2.38M
    for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i)
4277
1.92M
      GenerateCombinations(LU, LUIdx, LU.Formulae[i]);
4278
456k
  }
4279
559k
  for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
4280
456k
    LSRUse &LU = Uses[LUIdx];
4281
2.64M
    for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i)
4282
2.18M
      GenerateSymbolicOffsets(LU, LUIdx, LU.Formulae[i]);
4283
2.64M
    for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i)
4284
2.18M
      GenerateConstantOffsets(LU, LUIdx, LU.Formulae[i]);
4285
2.85M
    for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i)
4286
2.40M
      GenerateICmpZeroScales(LU, LUIdx, LU.Formulae[i]);
4287
3.09M
    for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i)
4288
2.64M
      GenerateScales(LU, LUIdx, LU.Formulae[i]);
4289
456k
  }
4290
559k
  for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
4291
456k
    LSRUse &LU = Uses[LUIdx];
4292
3.62M
    for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i)
4293
3.17M
      GenerateTruncates(LU, LUIdx, LU.Formulae[i]);
4294
456k
  }
4295
103k
4296
103k
  GenerateCrossUseConstantOffsets();
4297
103k
4298
103k
  LLVM_DEBUG(dbgs() << "\n"
4299
103k
                       "After generating reuse formulae:\n";
4300
103k
             print_uses(dbgs()));
4301
103k
}
4302
4303
/// If there are multiple formulae with the same set of registers used
4304
/// by other uses, pick the best one and delete the others.
4305
105k
void LSRInstance::FilterOutUndesirableDedicatedRegisters() {
4306
105k
  DenseSet<const SCEV *> VisitedRegs;
4307
105k
  SmallPtrSet<const SCEV *, 16> Regs;
4308
105k
  SmallPtrSet<const SCEV *, 16> LoserRegs;
4309
#ifndef NDEBUG
4310
  bool ChangedFormulae = false;
4311
#endif
4312
4313
105k
  // Collect the best formula for each unique set of shared registers. This
4314
105k
  // is reset for each use.
4315
105k
  using BestFormulaeTy =
4316
105k
      DenseMap<SmallVector<const SCEV *, 4>, size_t, UniquifierDenseMapInfo>;
4317
105k
4318
105k
  BestFormulaeTy BestFormulae;
4319
105k
4320
576k
  for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
4321
471k
    LSRUse &LU = Uses[LUIdx];
4322
471k
    LLVM_DEBUG(dbgs() << "Filtering for use "; LU.print(dbgs());
4323
471k
               dbgs() << '\n');
4324
471k
4325
471k
    bool Any = false;
4326
471k
    for (size_t FIdx = 0, NumForms = LU.Formulae.size();
4327
4.74M
         FIdx != NumForms; ++FIdx) {
4328
4.27M
      Formula &F = LU.Formulae[FIdx];
4329
4.27M
4330
4.27M
      // Some formulas are instant losers. For example, they may depend on
4331
4.27M
      // nonexistent AddRecs from other loops. These need to be filtered
4332
4.27M
      // immediately, otherwise heuristics could choose them over others leading
4333
4.27M
      // to an unsatisfactory solution. Passing LoserRegs into RateFormula here
4334
4.27M
      // avoids the need to recompute this information across formulae using the
4335
4.27M
      // same bad AddRec. Passing LoserRegs is also essential unless we remove
4336
4.27M
      // the corresponding bad register from the Regs set.
4337
4.27M
      Cost CostF(L, SE, TTI);
4338
4.27M
      Regs.clear();
4339
4.27M
      CostF.RateFormula(F, Regs, VisitedRegs, LU, &LoserRegs);
4340
4.27M
      if (CostF.isLoser()) {
4341
82.3k
        // During initial formula generation, undesirable formulae are generated
4342
82.3k
        // by uses within other loops that have some non-trivial address mode or
4343
82.3k
        // use the postinc form of the IV. LSR needs to provide these formulae
4344
82.3k
        // as the basis of rediscovering the desired formula that uses an AddRec
4345
82.3k
        // corresponding to the existing phi. Once all formulae have been
4346
82.3k
        // generated, these initial losers may be pruned.
4347
82.3k
        LLVM_DEBUG(dbgs() << "  Filtering loser "; F.print(dbgs());
4348
82.3k
                   dbgs() << "\n");
4349
82.3k
      }
4350
4.19M
      else {
4351
4.19M
        SmallVector<const SCEV *, 4> Key;
4352
5.28M
        for (const SCEV *Reg : F.BaseRegs) {
4353
5.28M
          if (RegUses.isRegUsedByUsesOtherThan(Reg, LUIdx))
4354
2.76M
            Key.push_back(Reg);
4355
5.28M
        }
4356
4.19M
        if (F.ScaledReg &&
4357
4.19M
            
            RegUses.isRegUsedByUsesOtherThan(F.ScaledReg, LUIdx))
4358
2.05M
          Key.push_back(F.ScaledReg);
4359
4.19M
        // Unstable sort by host order ok, because this is only used for
4360
4.19M
        // uniquifying.
4361
4.19M
        llvm::sort(Key);
4362
4.19M
4363
4.19M
        std::pair<BestFormulaeTy::const_iterator, bool> P =
4364
4.19M
          BestFormulae.insert(std::make_pair(Key, FIdx));
4365
4.19M
        if (P.second)
4366
3.29M
          continue;
4367
894k
4368
894k
        Formula &Best = LU.Formulae[P.first->second];
4369
894k
4370
894k
        Cost CostBest(L, SE, TTI);
4371
894k
        Regs.clear();
4372
894k
        CostBest.RateFormula(Best, Regs, VisitedRegs, LU);
4373
894k
        if (CostF.isLess(CostBest))
4374
85.8k
          std::swap(F, Best);
4375
894k
        LLVM_DEBUG(dbgs() << "  Filtering out formula "; F.print(dbgs());
4376
894k
                   dbgs() << "\n"
4377
894k
                             "    in favor of formula ";
4378
894k
                   Best.print(dbgs()); dbgs() << '\n');
4379
894k
      }
4380
#ifndef NDEBUG
4381
      ChangedFormulae = true;
4382
#endif
4383
977k
      LU.DeleteFormula(F);
4384
977k
      --FIdx;
4385
977k
      --NumForms;
4386
977k
      Any = true;
4387
977k
    }
4388
471k
4389
471k
    // Now that we've filtered out some formulae, recompute the Regs set.
4390
471k
    if (Any)
4391
150k
      LU.RecomputeRegs(LUIdx, RegUses);
4392
471k
4393
471k
    // Reset this to prepare for the next use.
4394
471k
    BestFormulae.clear();
4395
471k
  }
4396
105k
4397
105k
  LLVM_DEBUG(if (ChangedFormulae) {
4398
105k
    dbgs() << "\n"
4399
105k
              "After filtering out undesirable candidates:\n";
4400
105k
    print_uses(dbgs());
4401
105k
  });
4402
105k
}
4403
4404
/// Estimate the worst-case number of solutions the solver might have to
4405
/// consider. It almost never considers this many solutions because it prunes the
4406
/// search space, but the pruning isn't always sufficient.
4407
519k
size_t LSRInstance::EstimateSearchSpaceComplexity() const {
4408
519k
  size_t Power = 1;
4409
1.57M
  for (const LSRUse &LU : Uses) {
4410
1.57M
    size_t FSize = LU.Formulae.size();
4411
1.57M
    if (FSize >= ComplexityLimit) {
4412
0
      Power = ComplexityLimit;
4413
0
      break;
4414
0
    }
4415
1.57M
    Power *= FSize;
4416
1.57M
    if (Power >= ComplexityLimit)
4417
15.9k
      break;
4418
1.57M
  }
4419
519k
  return Power;
4420
519k
}
4421
4422
/// When one formula uses a superset of the registers of another formula, it
4423
/// won't help reduce register pressure (though it may not necessarily hurt
4424
/// register pressure); remove it to simplify the system.
4425
103k
void LSRInstance::NarrowSearchSpaceByDetectingSupersets() {
4426
103k
  if (EstimateSearchSpaceComplexity() >= ComplexityLimit) {
4427
5.91k
    LLVM_DEBUG(dbgs() << "The search space is too complex.\n");
4428
5.91k
4429
5.91k
    LLVM_DEBUG(dbgs() << "Narrowing the search space by eliminating formulae "
4430
5.91k
                         "which use a superset of registers used by other "
4431
5.91k
                         "formulae.\n");
4432
5.91k
4433
176k
    for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
4434
170k
      LSRUse &LU = Uses[LUIdx];
4435
170k
      bool Any = false;
4436
2.29M
      for (size_t i = 0, e = LU.Formulae.size(); i != e; ++i) {
4437
2.12M
        Formula &F = LU.Formulae[i];
4438
2.12M
        // Look for a formula with a constant or GV in a register. If the use
4439
2.12M
        // also has a formula with that same value in an immediate field,
4440
2.12M
        // delete the one that uses a register.
4441
2.12M
        for (SmallVectorImpl<const SCEV *>::const_iterator
4442
4.90M
             I = F.BaseRegs.begin(), E = F.BaseRegs.end(); I != E; ++I) {
4443
2.79M
          if (const SCEVConstant *C = dyn_cast<SCEVConstant>(*I)) {
4444
187k
            Formula NewF = F;
4445
187k
            //FIXME: Formulas should store bitwidth to do wrapping properly.
4446
187k
            //       See PR41034.
4447
187k
            NewF.BaseOffset += (uint64_t)C->getValue()->getSExtValue();
4448
187k
            NewF.BaseRegs.erase(NewF.BaseRegs.begin() +
4449
187k
                                (I - F.BaseRegs.begin()));
4450
187k
            if (LU.HasFormulaWithSameRegs(NewF)) {
4451
6.45k
              LLVM_DEBUG(dbgs() << "  Deleting "; F.print(dbgs());
4452
6.45k
                         dbgs() << '\n');
4453
6.45k
              LU.DeleteFormula(F);
4454
6.45k
              --i;
4455
6.45k
              --e;
4456
6.45k
              Any = true;
4457
6.45k
              break;
4458
6.45k
            }
4459
2.60M
          } else if (const SCEVUnknown *U = dyn_cast<SCEVUnknown>(*I)) {
4460
680k
            if (GlobalValue *GV = dyn_cast<GlobalValue>(U->getValue()))
4461
268k
              if (!F.BaseGV) {
4462
268k
                Formula NewF = F;
4463
268k
                NewF.BaseGV = GV;
4464
268k
                NewF.BaseRegs.erase(NewF.BaseRegs.begin() +
4465
268k
                                    (I - F.BaseRegs.begin()));
4466
268k
                if (LU.HasFormulaWithSameRegs(NewF)) {
4467
0
                  LLVM_DEBUG(dbgs() << "  Deleting "; F.print(dbgs());
4468
0
                             dbgs() << '\n');
4469
0
                  LU.DeleteFormula(F);
4470
0
                  --i;
4471
0
                  --e;
4472
0
                  Any = true;
4473
0
                  break;
4474
0
                }
4475
268k
              }
4476
680k
          }
4477
2.79M
        }
4478
2.12M
      }
4479
170k
      if (Any)
4480
1.03k
        LU.RecomputeRegs(LUIdx, RegUses);
4481
170k
    }
4482
5.91k
4483
5.91k
    LLVM_DEBUG(dbgs() << "After pre-selection:\n"; print_uses(dbgs()));
4484
5.91k
  }
4485
103k
}
4486
4487
/// When there are many registers for expressions like A, A+1, A+2, etc.,
4488
/// allocate a single register for them.
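// A hand-written illustration (not pass output) of the pattern this heuristic
// targets. After 4x unrolling, the body references A, A+1, A+2 and A+3 (here
// p[i..i+3]); instead of keeping a separate register for every "A + k", the
// uses that differ only by a constant offset are folded onto one base register
// and the small constants move into the addressing-mode immediates.
int sumUnrolledBy4(const int *p, int n) {
  int sum = 0;
  for (int i = 0; i + 4 <= n; i += 4)
    sum += p[i] + p[i + 1] + p[i + 2] + p[i + 3];
  return sum;
}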
4489
103k
void LSRInstance::NarrowSearchSpaceByCollapsingUnrolledCode() {
4490
103k
  if (EstimateSearchSpaceComplexity() < ComplexityLimit) 
4491
97.6k
    return;
4492
5.90k
4493
5.90k
  LLVM_DEBUG(
4494
5.90k
      dbgs() << "The search space is too complex.\n"
4495
5.90k
                "Narrowing the search space by assuming that uses separated "
4496
5.90k
                "by a constant offset will use the same registers.\n");
4497
5.90k
4498
5.90k
  // This is especially useful for unrolled loops.
4499
5.90k
4500
176k
  for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
4501
170k
    LSRUse &LU = Uses[LUIdx];
4502
1.56M
    for (const Formula &F : LU.Formulae) {
4503
1.56M
      if (F.BaseOffset == 0 || (F.Scale != 0 && F.Scale != 1))
4504
1.32M
        continue;
4505
246k
4506
246k
      LSRUse *LUThatHas = FindUseWithSimilarFormula(F, LU);
4507
246k
      if (!LUThatHas)
4508
106k
        continue;
4509
140k
4510
140k
      if (!reconcileNewOffset(*LUThatHas, F.BaseOffset, /*HasBaseReg=*/ false,
4511
140k
                              LU.Kind, LU.AccessTy))
4512
20
        continue;
4513
140k
4514
140k
      LLVM_DEBUG(dbgs() << "  Deleting use "; LU.print(dbgs()); dbgs() << '\n');
4515
140k
4516
140k
      LUThatHas->AllFixupsOutsideLoop &= LU.AllFixupsOutsideLoop;
4517
140k
4518
140k
      // Transfer the fixups of LU to LUThatHas.
4519
145k
      for (LSRFixup &Fixup : LU.Fixups) {
4520
145k
        Fixup.Offset += F.BaseOffset;
4521
145k
        LUThatHas->pushFixup(Fixup);
4522
145k
        LLVM_DEBUG(dbgs() << "New fixup has offset " << Fixup.Offset << '\n');
4523
145k
      }
4524
140k
4525
140k
      // Delete formulae from the new use which are no longer legal.
4526
140k
      bool Any = false;
4527
1.39M
      for (size_t i = 0, e = LUThatHas->Formulae.size(); i != e; ++i) {
4528
1.25M
        Formula &F = LUThatHas->Formulae[i];
4529
1.25M
        if (!isLegalUse(TTI, LUThatHas->MinOffset, LUThatHas->MaxOffset,
4530
1.25M
                        LUThatHas->Kind, LUThatHas->AccessTy, F)) {
4531
20.7k
          LLVM_DEBUG(dbgs() << "  Deleting "; F.print(dbgs()); dbgs() << '\n');
4532
20.7k
          LUThatHas->DeleteFormula(F);
4533
20.7k
          --i;
4534
20.7k
          --e;
4535
20.7k
          Any = true;
4536
20.7k
        }
4537
1.25M
      }
4538
140k
4539
140k
      if (Any)
4540
3.74k
        LUThatHas->RecomputeRegs(LUThatHas - &Uses.front(), RegUses);
4541
140k
4542
140k
      // Delete the old use.
4543
140k
      DeleteUse(LU, LUIdx);
4544
140k
      --LUIdx;
4545
140k
      --NumUses;
4546
140k
      break;
4547
140k
    }
4548
170k
  }
4549
5.90k
4550
5.90k
  LLVM_DEBUG(dbgs() << "After pre-selection:\n"; print_uses(dbgs()));
4551
5.90k
}
4552
4553
/// Call FilterOutUndesirableDedicatedRegisters again, if necessary, now that
4554
/// we've done more filtering, as it may be able to find more formulae to
4555
/// eliminate.
4556
103k
void LSRInstance::NarrowSearchSpaceByRefilteringUndesirableDedicatedRegisters(){
4557
103k
  if (EstimateSearchSpaceComplexity() >= ComplexityLimit) {
4558
1.52k
    LLVM_DEBUG(dbgs() << "The search space is too complex.\n");
4559
1.52k
4560
1.52k
    LLVM_DEBUG(dbgs() << "Narrowing the search space by re-filtering out "
4561
1.52k
                         "undesirable dedicated registers.\n");
4562
1.52k
4563
1.52k
    FilterOutUndesirableDedicatedRegisters();
4564
1.52k
4565
1.52k
    LLVM_DEBUG(dbgs() << "After pre-selection:\n"; print_uses(dbgs()));
4566
1.52k
  }
4567
103k
}
4568
4569
/// If a LSRUse has multiple formulae with the same ScaledReg and Scale.
4570
/// Pick the best one and delete the others.
4571
/// This narrowing heuristic is to keep as many formulae with different
4572
/// Scale and ScaledReg pair as possible while narrowing the search space.
4573
/// The benefit is that it is more likely to find out a better solution
4574
/// from a formulae set with more Scale and ScaledReg variations than
4575
/// a formulae set with the same Scale and ScaledReg. The picking winner
4576
/// reg heuristic will often keep the formulae with the same Scale and
4577
/// ScaledReg and filter others, and we want to avoid that if possible.
4578
103k
void LSRInstance::NarrowSearchSpaceByFilterFormulaWithSameScaledReg() {
4579
103k
  if (EstimateSearchSpaceComplexity() < ComplexityLimit)
4580
102k
    return;
4581
1.44k
4582
1.44k
  LLVM_DEBUG(
4583
1.44k
      dbgs() << "The search space is too complex.\n"
4584
1.44k
                "Narrowing the search space by choosing the best Formula "
4585
1.44k
                "from the Formulae with the same Scale and ScaledReg.\n");
4586
1.44k
4587
1.44k
  // Map the "Scale * ScaledReg" pair to the best formula of current LSRUse.
4588
1.44k
  using BestFormulaeTy = DenseMap<std::pair<const SCEV *, int64_t>, size_t>;
4589
1.44k
4590
1.44k
  BestFormulaeTy BestFormulae;
4591
#ifndef NDEBUG
4592
  bool ChangedFormulae = false;
4593
#endif
4594
  DenseSet<const SCEV *> VisitedRegs;
4595
1.44k
  SmallPtrSet<const SCEV *, 16> Regs;
4596
1.44k
4597
16.5k
  for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
4598
15.1k
    LSRUse &LU = Uses[LUIdx];
4599
15.1k
    LLVM_DEBUG(dbgs() << "Filtering for use "; LU.print(dbgs());
4600
15.1k
               dbgs() << '\n');
4601
15.1k
4602
15.1k
    // Return true if Formula FA is better than Formula FB.
4603
60.7k
    auto IsBetterThan = [&](Formula &FA, Formula &FB) {
4604
60.7k
      // First we will try to choose the Formula with fewer new registers.
4605
60.7k
      // For a register used by current Formula, the more the register is
4606
60.7k
      // shared among LSRUses, the less we increase the register number
4607
60.7k
      // counter of the formula.
4608
60.7k
      size_t FARegNum = 0;
4609
105k
      for (const SCEV *Reg : FA.BaseRegs) {
4610
105k
        const SmallBitVector &UsedByIndices = RegUses.getUsedByIndices(Reg);
4611
105k
        FARegNum += (NumUses - UsedByIndices.count() + 1);
4612
105k
      }
4613
60.7k
      size_t FBRegNum = 0;
4614
95.9k
      for (const SCEV *Reg : FB.BaseRegs) {
4615
95.9k
        const SmallBitVector &UsedByIndices = RegUses.getUsedByIndices(Reg);
4616
95.9k
        FBRegNum += (NumUses - UsedByIndices.count() + 1);
4617
95.9k
      }
4618
60.7k
      if (FARegNum != FBRegNum)
4619
55.7k
        return FARegNum < FBRegNum;
4620
4.94k
4621
4.94k
      // If the new register numbers are the same, choose the Formula with
4622
4.94k
      // less Cost.
4623
4.94k
      Cost CostFA(L, SE, TTI);
4624
4.94k
      Cost CostFB(L, SE, TTI);
4625
4.94k
      Regs.clear();
4626
4.94k
      CostFA.RateFormula(FA, Regs, VisitedRegs, LU);
4627
4.94k
      Regs.clear();
4628
4.94k
      CostFB.RateFormula(FB, Regs, VisitedRegs, LU);
4629
4.94k
      return CostFA.isLess(CostFB);
4630
4.94k
    };
4631
15.1k
4632
15.1k
    bool Any = false;
4633
175k
    for (size_t FIdx = 0, NumForms = LU.Formulae.size(); FIdx != NumForms;
4634
160k
         ++FIdx) {
4635
160k
      Formula &F = LU.Formulae[FIdx];
4636
160k
      if (!F.ScaledReg)
4637
31.8k
        continue;
4638
128k
      auto P = BestFormulae.insert({{F.ScaledReg, F.Scale}, FIdx});
4639
128k
      if (P.second)
4640
68.2k
        continue;
4641
60.7k
4642
60.7k
      Formula &Best = LU.Formulae[P.first->second];
4643
60.7k
      if (IsBetterThan(F, Best))
4644
22.2k
        std::swap(F, Best);
4645
60.7k
      LLVM_DEBUG(dbgs() << "  Filtering out formula "; F.print(dbgs());
4646
60.7k
                 dbgs() << "\n"
4647
60.7k
                           "    in favor of formula ";
4648
60.7k
                 Best.print(dbgs()); dbgs() << '\n');
4649
#ifndef NDEBUG
4650
      ChangedFormulae = true;
4651
#endif
4652
      LU.DeleteFormula(F);
4653
60.7k
      --FIdx;
4654
60.7k
      --NumForms;
4655
60.7k
      Any = true;
4656
60.7k
    }
4657
15.1k
    if (Any)
4658
7.45k
      LU.RecomputeRegs(LUIdx, RegUses);
4659
15.1k
4660
15.1k
    // Reset this to prepare for the next use.
4661
15.1k
    BestFormulae.clear();
4662
15.1k
  }
4663
1.44k
4664
1.44k
  LLVM_DEBUG(if (ChangedFormulae) {
4665
1.44k
    dbgs() << "\n"
4666
1.44k
              "After filtering out undesirable candidates:\n";
4667
1.44k
    print_uses(dbgs());
4668
1.44k
  });
4669
1.44k
}
4670
4671
/// This function deletes formulas with a high expected number of registers.
4672
/// Assuming we don't know the value of each formula (already delete
4673
/// all inefficient), generate probability of not selecting for each
4674
/// register.
4675
/// For example,
4676
/// Use1:
4677
///  reg(a) + reg({0,+,1})
4678
///  reg(a) + reg({-1,+,1}) + 1
4679
///  reg({a,+,1})
4680
/// Use2:
4681
///  reg(b) + reg({0,+,1})
4682
///  reg(b) + reg({-1,+,1}) + 1
4683
///  reg({b,+,1})
4684
/// Use3:
4685
///  reg(c) + reg(b) + reg({0,+,1})
4686
///  reg(c) + reg({b,+,1})
4687
///
4688
/// Probability of not selecting
4689
///                 Use1   Use2    Use3
4690
/// reg(a)         (1/3) *   1   *   1
4691
/// reg(b)           1   * (1/3) * (1/2)
4692
/// reg({0,+,1})   (2/3) * (2/3) * (1/2)
4693
/// reg({-1,+,1})  (2/3) * (2/3) *   1
4694
/// reg({a,+,1})   (2/3) *   1   *   1
4695
/// reg({b,+,1})     1   * (2/3) * (2/3)
4696
/// reg(c)           1   *   1   *   0
4697
///
4698
/// Now count registers number mathematical expectation for each formula:
4699
/// Note that for each use we exclude probability if not selecting for the use.
4700
/// For example for Use1 probability for reg(a) would be just 1 * 1 (excluding
4701
/// probability 1/3 of not selecting for Use1).
4702
/// Use1:
4703
///  reg(a) + reg({0,+,1})          1 + 1/3       -- to be deleted
4704
///  reg(a) + reg({-1,+,1}) + 1     1 + 4/9       -- to be deleted
4705
///  reg({a,+,1})                   1
4706
/// Use2:
4707
///  reg(b) + reg({0,+,1})          1/2 + 1/3     -- to be deleted
4708
///  reg(b) + reg({-1,+,1}) + 1     1/2 + 2/3     -- to be deleted
4709
///  reg({b,+,1})                   2/3
4710
/// Use3:
4711
///  reg(c) + reg(b) + reg({0,+,1}) 1 + 1/3 + 4/9 -- to be deleted
4712
///  reg(c) + reg({b,+,1})          1 + 2/3
4713
0
void LSRInstance::NarrowSearchSpaceByDeletingCostlyFormulas() {
4714
0
  if (EstimateSearchSpaceComplexity() < ComplexityLimit)
4715
0
    return;
4716
0
  // Ok, we have too many formulae on our hands to conveniently handle.
4717
0
  // Use a rough heuristic to thin out the list.
4718
0
4719
0
  // Set of Regs wich will be 100% used in final solution.
4720
0
  // Used in each formula of a solution (in example above this is reg(c)).
4721
0
  // We can skip them in calculations.
4722
0
  SmallPtrSet<const SCEV *, 4> UniqRegs;
4723
0
  LLVM_DEBUG(dbgs() << "The search space is too complex.\n");
4724
0
4725
0
  // Map each register to probability of not selecting
4726
0
  DenseMap <const SCEV *, float> RegNumMap;
4727
0
  for (const SCEV *Reg : RegUses) {
4728
0
    if (UniqRegs.count(Reg))
4729
0
      continue;
4730
0
    float PNotSel = 1;
4731
0
    for (const LSRUse &LU : Uses) {
4732
0
      if (!LU.Regs.count(Reg))
4733
0
        continue;
4734
0
      float P = LU.getNotSelectedProbability(Reg);
4735
0
      if (P != 0.0)
4736
0
        PNotSel *= P;
4737
0
      else
4738
0
        UniqRegs.insert(Reg);
4739
0
    }
4740
0
    RegNumMap.insert(std::make_pair(Reg, PNotSel));
4741
0
  }
4742
0
4743
0
  LLVM_DEBUG(
4744
0
      dbgs() << "Narrowing the search space by deleting costly formulas\n");
4745
0
4746
0
  // Delete formulas where registers number expectation is high.
4747
0
  for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
4748
0
    LSRUse &LU = Uses[LUIdx];
4749
0
    // If nothing to delete - continue.
4750
0
    if (LU.Formulae.size() < 2)
4751
0
      continue;
4752
0
    // This is temporary solution to test performance. Float should be
4753
0
    // replaced with round independent type (based on integers) to avoid
4754
0
    // different results for different target builds.
4755
0
    float FMinRegNum = LU.Formulae[0].getNumRegs();
4756
0
    float FMinARegNum = LU.Formulae[0].getNumRegs();
4757
0
    size_t MinIdx = 0;
4758
0
    for (size_t i = 0, e = LU.Formulae.size(); i != e; ++i) {
4759
0
      Formula &F = LU.Formulae[i];
4760
0
      float FRegNum = 0;
4761
0
      float FARegNum = 0;
4762
0
      for (const SCEV *BaseReg : F.BaseRegs) {
4763
0
        if (UniqRegs.count(BaseReg))
4764
0
          continue;
4765
0
        FRegNum += RegNumMap[BaseReg] / LU.getNotSelectedProbability(BaseReg);
4766
0
        if (isa<SCEVAddRecExpr>(BaseReg))
4767
0
          FARegNum +=
4768
0
              RegNumMap[BaseReg] / LU.getNotSelectedProbability(BaseReg);
4769
0
      }
4770
0
      if (const SCEV *ScaledReg = F.ScaledReg) {
4771
0
        if (!UniqRegs.count(ScaledReg)) {
4772
0
          FRegNum +=
4773
0
              RegNumMap[ScaledReg] / LU.getNotSelectedProbability(ScaledReg);
4774
0
          if (isa<SCEVAddRecExpr>(ScaledReg))
4775
0
            FARegNum +=
4776
0
                RegNumMap[ScaledReg] / LU.getNotSelectedProbability(ScaledReg);
4777
0
        }
4778
0
      }
4779
0
      if (FMinRegNum > FRegNum ||
4780
0
          (FMinRegNum == FRegNum && FMinARegNum > FARegNum)) {
4781
0
        FMinRegNum = FRegNum;
4782
0
        FMinARegNum = FARegNum;
4783
0
        MinIdx = i;
4784
0
      }
4785
0
    }
4786
0
    LLVM_DEBUG(dbgs() << "  The formula "; LU.Formulae[MinIdx].print(dbgs());
4787
0
               dbgs() << " with min reg num " << FMinRegNum << '\n');
4788
0
    if (MinIdx != 0)
4789
0
      std::swap(LU.Formulae[MinIdx], LU.Formulae[0]);
4790
0
    while (LU.Formulae.size() != 1) {
4791
0
      LLVM_DEBUG(dbgs() << "  Deleting "; LU.Formulae.back().print(dbgs());
4792
0
                 dbgs() << '\n');
4793
0
      LU.Formulae.pop_back();
4794
0
    }
4795
0
    LU.RecomputeRegs(LUIdx, RegUses);
4796
0
    assert(LU.Formulae.size() == 1 && "Should be exactly 1 min regs formula");
4797
0
    Formula &F = LU.Formulae[0];
4798
0
    LLVM_DEBUG(dbgs() << "  Leaving only "; F.print(dbgs()); dbgs() << '\n');
4799
0
    // When we choose the formula, the regs become unique.
4800
0
    UniqRegs.insert(F.BaseRegs.begin(), F.BaseRegs.end());
4801
0
    if (F.ScaledReg)
4802
0
      UniqRegs.insert(F.ScaledReg);
4803
0
  }
4804
0
  LLVM_DEBUG(dbgs() << "After pre-selection:\n"; print_uses(dbgs()));
4805
0
}
4806
4807
/// Pick a register which seems likely to be profitable, and then in any use
4808
/// which has any reference to that register, delete all formulae which do not
4809
/// reference that register.
4810
103k
void LSRInstance::NarrowSearchSpaceByPickingWinnerRegs() {
4811
103k
  // With all other options exhausted, loop until the system is simple
4812
103k
  // enough to handle.
4813
103k
  SmallPtrSet<const SCEV *, 4> Taken;
4814
104k
  while (EstimateSearchSpaceComplexity() >= ComplexityLimit) {
4815
1.19k
    // Ok, we have too many formulae on our hands to conveniently handle.
4816
1.19k
    // Use a rough heuristic to thin out the list.
4817
1.19k
    LLVM_DEBUG(dbgs() << "The search space is too complex.\n");
4818
1.19k
4819
1.19k
    // Pick the register which is used by the most LSRUses, which is likely
4820
1.19k
    // to be a good reuse register candidate.
4821
1.19k
    const SCEV *Best = nullptr;
4822
1.19k
    unsigned BestNum = 0;
4823
113k
    for (const SCEV *Reg : RegUses) {
4824
113k
      if (Taken.count(Reg))
4825
65
        continue;
4826
113k
      if (!Best) {
4827
1.19k
        Best = Reg;
4828
1.19k
        BestNum = RegUses.getUsedByIndices(Reg).count();
4829
112k
      } else {
4830
112k
        unsigned Count = RegUses.getUsedByIndices(Reg).count();
4831
112k
        if (Count > BestNum) {
4832
1.99k
          Best = Reg;
4833
1.99k
          BestNum = Count;
4834
1.99k
        }
4835
112k
      }
4836
113k
    }
4837
1.19k
4838
1.19k
    LLVM_DEBUG(dbgs() << "Narrowing the search space by assuming " << *Best
4839
1.19k
                      << " will yield profitable reuse.\n");
4840
1.19k
    Taken.insert(Best);
4841
1.19k
4842
1.19k
    // In any use with formulae which references this register, delete formulae
4843
1.19k
    // which don't reference it.
4844
16.3k
    for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
4845
15.1k
      LSRUse &LU = Uses[LUIdx];
4846
15.1k
      if (!LU.Regs.count(Best)) continue;
4847
11.8k
4848
11.8k
      bool Any = false;
4849
94.0k
      for (size_t i = 0, e = LU.Formulae.size(); i != e; ++i) {
4850
82.1k
        Formula &F = LU.Formulae[i];
4851
82.1k
        if (!F.referencesReg(Best)) {
4852
69.3k
          LLVM_DEBUG(dbgs() << "  Deleting "; F.print(dbgs()); dbgs() << '\n');
4853
69.3k
          LU.DeleteFormula(F);
4854
69.3k
          --e;
4855
69.3k
          --i;
4856
69.3k
          Any = true;
4857
69.3k
          assert(e != 0 && "Use has no formulae left! Is Regs inconsistent?");
4858
69.3k
          continue;
4859
69.3k
        }
4860
82.1k
      }
4861
11.8k
4862
11.8k
      if (Any)
4863
11.5k
        LU.RecomputeRegs(LUIdx, RegUses);
4864
11.8k
    }
4865
1.19k
4866
1.19k
    LLVM_DEBUG(dbgs() << "After pre-selection:\n"; print_uses(dbgs()));
4867
1.19k
  }
4868
103k
}
/// If there are an extraordinary number of formulae to choose from, use some
/// rough heuristics to prune down the number of formulae. This keeps the main
/// solver from taking an extraordinary amount of time in some worst-case
/// scenarios.
void LSRInstance::NarrowSearchSpaceUsingHeuristics() {
  NarrowSearchSpaceByDetectingSupersets();
  NarrowSearchSpaceByCollapsingUnrolledCode();
  NarrowSearchSpaceByRefilteringUndesirableDedicatedRegisters();
  if (FilterSameScaledReg)
    NarrowSearchSpaceByFilterFormulaWithSameScaledReg();
  if (LSRExpNarrow)
    NarrowSearchSpaceByDeletingCostlyFormulas();
  else
    NarrowSearchSpaceByPickingWinnerRegs();
}
/// This is the recursive solver.
void LSRInstance::SolveRecurse(SmallVectorImpl<const Formula *> &Solution,
                               Cost &SolutionCost,
                               SmallVectorImpl<const Formula *> &Workspace,
                               const Cost &CurCost,
                               const SmallPtrSet<const SCEV *, 16> &CurRegs,
                               DenseSet<const SCEV *> &VisitedRegs) const {
  // Some ideas:
  //  - prune more:
  //    - use more aggressive filtering
  //    - sort the formulae so that the most profitable solutions are found
  //      first
  //    - sort the uses too
  //  - search faster:
  //    - don't compute a cost and then compare; compare while computing a cost
  //      and bail early.
  //    - track register sets with SmallBitVector

  const LSRUse &LU = Uses[Workspace.size()];

  // If this use references any register that's already a part of the
  // in-progress solution, consider it a requirement that a formula must
  // reference that register in order to be considered. This prunes out
  // unprofitable searching.
  SmallSetVector<const SCEV *, 4> ReqRegs;
  for (const SCEV *S : CurRegs)
    if (LU.Regs.count(S))
      ReqRegs.insert(S);

  SmallPtrSet<const SCEV *, 16> NewRegs;
  Cost NewCost(L, SE, TTI);
  for (const Formula &F : LU.Formulae) {
    // Ignore formulae which may not be ideal in terms of register reuse of
    // ReqRegs. The formula should use all required registers before
    // introducing new ones.
    int NumReqRegsToFind = std::min(F.getNumRegs(), ReqRegs.size());
    for (const SCEV *Reg : ReqRegs) {
      if ((F.ScaledReg && F.ScaledReg == Reg) ||
          is_contained(F.BaseRegs, Reg)) {
        --NumReqRegsToFind;
        if (NumReqRegsToFind == 0)
          break;
      }
    }
    if (NumReqRegsToFind != 0) {
      // If none of the formulae satisfied the required registers, then we could
      // clear ReqRegs and try again. Currently, we simply give up in this case.
      continue;
    }

    // Evaluate the cost of the current formula. If it's already worse than
    // the current best, prune the search at that point.
    NewCost = CurCost;
    NewRegs = CurRegs;
    NewCost.RateFormula(F, NewRegs, VisitedRegs, LU);
    if (NewCost.isLess(SolutionCost)) {
      Workspace.push_back(&F);
      if (Workspace.size() != Uses.size()) {
        SolveRecurse(Solution, SolutionCost, Workspace, NewCost,
                     NewRegs, VisitedRegs);
        if (F.getNumRegs() == 1 && Workspace.size() == 1)
          VisitedRegs.insert(F.ScaledReg ? F.ScaledReg : F.BaseRegs[0]);
      } else {
        LLVM_DEBUG(dbgs() << "New best at "; NewCost.print(dbgs());
                   dbgs() << ".\nRegs:\n";
                   for (const SCEV *S : NewRegs) dbgs()
                      << "- " << *S << "\n";
                   dbgs() << '\n');

        SolutionCost = NewCost;
        Solution = Workspace;
      }
      Workspace.pop_back();
    }
  }
}
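As a small illustration of the pruning test at the top of the formula loop above, here is a sketch on the same toy model used earlier (Formula as a set of register names; satisfiesRequiredRegs is a hypothetical name, not an LSR API): a candidate is only worth rating if it references at least min(|F|, |ReqRegs|) of the registers the in-progress solution already requires, i.e. it reuses required registers before introducing new ones.

#include <algorithm>
#include <cstddef>
#include <set>
#include <string>

using Formula = std::set<std::string>;

// Mirrors the NumReqRegsToFind check in SolveRecurse on the toy model.
bool satisfiesRequiredRegs(const Formula &F,
                           const std::set<std::string> &ReqRegs) {
  std::size_t NumReqRegsToFind = std::min(F.size(), ReqRegs.size());
  for (const std::string &Reg : ReqRegs) {
    if (F.count(Reg) && --NumReqRegsToFind == 0)
      break;
  }
  return NumReqRegsToFind == 0;
}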
/// Choose one formula from each use. Return the results in the given Solution
/// vector.
void LSRInstance::Solve(SmallVectorImpl<const Formula *> &Solution) const {
  SmallVector<const Formula *, 8> Workspace;
  Cost SolutionCost(L, SE, TTI);
  SolutionCost.Lose();
  Cost CurCost(L, SE, TTI);
  SmallPtrSet<const SCEV *, 16> CurRegs;
  DenseSet<const SCEV *> VisitedRegs;
  Workspace.reserve(Uses.size());

  // SolveRecurse does all the work.
  SolveRecurse(Solution, SolutionCost, Workspace, CurCost,
               CurRegs, VisitedRegs);
  if (Solution.empty()) {
    LLVM_DEBUG(dbgs() << "\nNo Satisfactory Solution\n");
    return;
  }

  // Ok, we've now made all our decisions.
  LLVM_DEBUG(dbgs() << "\n"
                       "The chosen solution requires ";
             SolutionCost.print(dbgs()); dbgs() << ":\n";
             for (size_t i = 0, e = Uses.size(); i != e; ++i) {
               dbgs() << "  ";
               Uses[i].print(dbgs());
               dbgs() << "\n"
                         "    ";
               Solution[i]->print(dbgs());
               dbgs() << '\n';
             });

  assert(Solution.size() == Uses.size() && "Malformed solution!");
}
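To make the shape of SolveRecurse and Solve concrete, here is a self-contained sketch on the toy model from the earlier examples (not the pass's actual Cost machinery: cost is simplified to the number of distinct registers, and the required-register filter shown above is omitted for brevity; solveRecurse and solve are hypothetical names). The structure mirrors the real code: try each candidate for the current use, prune when the partial cost is already no better than the best known solution, recurse, and record a new best at a full assignment; the driver seeds the best cost with a "losing" sentinel, playing the role of SolutionCost.Lose().

#include <cstddef>
#include <set>
#include <string>
#include <vector>

// Toy model repeated here so the sketch stands alone.
using Formula = std::set<std::string>;
using Use = std::vector<Formula>;

// Branch-and-bound over one formula per use; "cost" is the number of
// distinct registers the partial solution needs so far.
static void solveRecurse(const std::vector<Use> &Uses, std::size_t UseIdx,
                         std::vector<const Formula *> &Workspace,
                         std::set<std::string> &CurRegs,
                         std::size_t &BestCost,
                         std::vector<const Formula *> &Solution) {
  const Use &LU = Uses[UseIdx];
  for (const Formula &F : LU) {
    // Rate this formula: registers it adds on top of the ones already chosen.
    std::set<std::string> NewRegs = CurRegs;
    NewRegs.insert(F.begin(), F.end());
    if (NewRegs.size() >= BestCost)
      continue; // Prune: already no better than the best complete solution.
    Workspace.push_back(&F);
    if (UseIdx + 1 != Uses.size()) {
      std::set<std::string> Saved = CurRegs;
      CurRegs = NewRegs;
      solveRecurse(Uses, UseIdx + 1, Workspace, CurRegs, BestCost, Solution);
      CurRegs = Saved;
    } else {
      // Complete assignment that beat the previous best: record it.
      BestCost = NewRegs.size();
      Solution = Workspace;
    }
    Workspace.pop_back();
  }
}

// Driver corresponding to LSRInstance::Solve: seed the best cost with a
// "losing" sentinel so the first feasible assignment always wins.
std::vector<const Formula *> solve(const std::vector<Use> &Uses) {
  std::vector<const Formula *> Solution, Workspace;
  if (Uses.empty())
    return Solution;
  std::set<std::string> CurRegs;
  std::size_t BestCost = static_cast<std::size_t>(-1); // plays Lose()
  solveRecurse(Uses, 0, Workspace, CurRegs, BestCost, Solution);
  return Solution;
}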
/// Helper for AdjustInsertPositionForExpand. Climb up the dominator tree as
/// far as we can go while still being dominated by the input positions. This
/// helps canonicalize the insert position, which encourages sharing.
BasicBlock::iterator
LSRInstance::HoistInsertPosition(BasicBlock::iterator IP,
                                 const SmallVectorImpl<Instruction *> &Inputs)
                                                                         const {
  Instruction *Tentative = &*IP;
  while (true) {
    bool AllDominate = true;
    Instruction *BetterPos = nullptr;
    // Don't bother attempting to insert before a catchswitch; its basic block
    // cannot have other non-PHI instructions.
    if (isa<CatchSwitchInst>(Tentative))
      return IP;

    for (Instruction *Inst : Inputs) {
      if (Inst == Tentative || !DT.dominates(Inst, Tentative)) {
        AllDominate = false;
        break;
      }
      // Attempt to find an insert position in the middle of the block,
      // instead of at the end, so that it can be used for other expansions.
      if (Tentative->getParent() == Inst->getParent() &&
          (!BetterPos || !DT.dominates(Inst, BetterPos)))
        BetterPos = &*std::next(BasicBlock::iterator(Inst));
    }
    if (!AllDominate)
      break;
    if (BetterPos)
      IP = BetterPos->getIterator();
    else
      IP = Tentative->getIterator();

    const Loop *IPLoop = LI.getLoopFor(IP->getParent());
    unsigned IPLoopDepth = IPLoop ? IPLoop->getLoopDepth() : 0;

    BasicBlock *IDom;
    for (DomTreeNode *Rung = DT.getNode(IP->getParent()); ; ) {
      if (!Rung) return IP;
      Rung = Rung->getIDom();
      if (!Rung) return IP;
      IDom = Rung->getBlock();

      // Don't climb into a loop though.
      const Loop *IDomLoop = LI.getLoopFor(IDom);
      unsigned IDomDepth = IDomLoop ? IDomLoop->getLoopDepth() : 0;
      if (IDomDepth <= IPLoopDepth &&
          (IDomDepth != IPLoopDepth || IDomLoop == IPLoop))
        break;
    }

    Tentative = IDom->getTerminator();
  }

  return IP;
}
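A rough standalone sketch of the hoisting walk above, deliberately not using the LLVM APIs: ToyBlock, dominates, and hoistInsertBlock are hypothetical names; blocks carry only an immediate-dominator index and a loop depth, the "better position within a block" refinement is dropped, and the "same loop at equal depth" test is approximated by comparing depths alone.

#include <vector>

// Toy CFG model: each block records its immediate dominator (or -1 for the
// entry block) and its loop-nest depth. "Positions" are reduced to blocks.
struct ToyBlock {
  int IDom = -1;
  unsigned LoopDepth = 0;
};

// A dominates B iff A is an ancestor of B in the dominator tree (or A == B).
static bool dominates(const std::vector<ToyBlock> &Blocks, int A, int B) {
  for (int Cur = B; Cur != -1; Cur = Blocks[Cur].IDom)
    if (Cur == A)
      return true;
  return false;
}

// Starting from the block holding the tentative insert position, walk up the
// immediate-dominator chain as long as (a) every input still dominates the
// candidate and (b) the climb does not enter a deeper loop.
int hoistInsertBlock(const std::vector<ToyBlock> &Blocks, int IP,
                     const std::vector<int> &InputBlocks) {
  int Tentative = IP;
  while (true) {
    for (int In : InputBlocks)
      if (!dominates(Blocks, In, Tentative))
        return IP;          // An input no longer dominates; keep the last good spot.
    IP = Tentative;         // This position is acceptable; try to go higher.
    int IDom = Blocks[IP].IDom;
    // Skip dominators that sit in a deeper loop than the accepted position.
    while (IDom != -1 && Blocks[IDom].LoopDepth > Blocks[IP].LoopDepth)
      IDom = Blocks[IDom].IDom;
    if (IDom == -1)
      return IP;            // Ran off the top of the dominator tree.
    Tentative = IDom;
  }
}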
/// Determine an input position which will be dominated by the operands and
/// which will dominate the result.
BasicBlock::iterator
LSRInstance::AdjustInsertPositionForExpand(BasicBlock::iterator LowestIP,
                                           const LSRFixup &LF,
                                           const LSRUse &LU,
                                           SCEVExpander &Rewriter) const {
  // Collect some instructions which must be dominated by the
  // expanding replacement. These must be dominated by any operands that
  // will be required in the expansion.
  SmallVector<Instruction *, 4> Inputs;
  if (Instruction *I = dyn_cast<Instruction>(LF.OperandValToReplace))
    Inputs.push_back(I);
  if (LU.Kind == LSRUse::ICmpZero)
    if (Instruction *I =
          dyn_cast<Instruction>(cast<ICmpInst>(LF.UserInst)->getOperand(1)))
      Inputs.push_back(I);
  if (LF.PostIncLoops.count(L)) {
    if (LF.isUseFullyOutsideLoop(L))
      Inputs.push_back(L->getLoopLatch()->getTerminator());
    else
      Inputs.push_back(IVIncInsertPos);
  }
  // The expansion must also be dominated by the increment positions of any
  // loops for which it is using post-inc mode.
  for (const Loop *PIL : LF.PostIncLoops) {
    if (PIL == L) continue;

    // Be dominated by the loop exit.
    SmallVector<BasicBlock *, 4> ExitingBlocks;
    PIL->getExitingBlocks(ExitingBlocks);
    if (!ExitingBlocks.empty()) {
      BasicBlock *BB = ExitingBlocks[0];
      for (unsigned i = 1, e = ExitingBlocks.size(); i != e; ++i)
        BB = DT.findNearestCommonDominator(BB, ExitingBlocks[i]);
      Inputs.push_back(BB->getTerminator());
    }
  }

  assert(!isa<PHINode>(LowestIP) && !LowestIP->isEHPad()
         && !isa<DbgInfoIntrinsic>(LowestIP) &&
         "Insertion point must be a normal instruction");

  // Then, climb up the immediate dominator tree as far as we can go while
  // still being dominated by the input positions.
  BasicBlock::iterator IP = HoistInsertPosition(LowestIP, Inputs);

  // Don't insert instructions before PHI nodes.
  while (isa<PHINode>