Coverage Report

Created: 2019-07-24 05:18

/Users/buildslave/jenkins/workspace/clang-stage2-coverage-R/llvm/lib/Analysis/LoopAccessAnalysis.cpp
//===- LoopAccessAnalysis.cpp - Loop Access Analysis Implementation --------==//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// The implementation of the loop memory dependence analysis that was
// originally developed for the loop vectorizer.
//
//===----------------------------------------------------------------------===//

#include "llvm/Analysis/LoopAccessAnalysis.h"
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/DepthFirstIterator.h"
#include "llvm/ADT/EquivalenceClasses.h"
#include "llvm/ADT/PointerIntPair.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/iterator_range.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/AliasSetTracker.h"
#include "llvm/Analysis/LoopAnalysisManager.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/MemoryLocation.h"
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/ScalarEvolutionExpander.h"
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/Analysis/VectorUtils.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Operator.h"
#include "llvm/IR/PassManager.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Value.h"
#include "llvm/IR/ValueHandle.h"
#include "llvm/Pass.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/raw_ostream.h"
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <cstdlib>
#include <iterator>
#include <utility>
#include <vector>

using namespace llvm;

#define DEBUG_TYPE "loop-accesses"

static cl::opt<unsigned, true>
VectorizationFactor("force-vector-width", cl::Hidden,
                    cl::desc("Sets the SIMD width. Zero is autoselect."),
                    cl::location(VectorizerParams::VectorizationFactor));
unsigned VectorizerParams::VectorizationFactor;

static cl::opt<unsigned, true>
VectorizationInterleave("force-vector-interleave", cl::Hidden,
                        cl::desc("Sets the vectorization interleave count. "
                                 "Zero is autoselect."),
                        cl::location(
                            VectorizerParams::VectorizationInterleave));
unsigned VectorizerParams::VectorizationInterleave;

static cl::opt<unsigned, true> RuntimeMemoryCheckThreshold(
    "runtime-memory-check-threshold", cl::Hidden,
    cl::desc("When performing memory disambiguation checks at runtime do not "
             "generate more than this number of comparisons (default = 8)."),
    cl::location(VectorizerParams::RuntimeMemoryCheckThreshold), cl::init(8));
unsigned VectorizerParams::RuntimeMemoryCheckThreshold;

/// The maximum number of iterations used to merge memory checks.
static cl::opt<unsigned> MemoryCheckMergeThreshold(
    "memory-check-merge-threshold", cl::Hidden,
    cl::desc("Maximum number of comparisons done when trying to merge "
             "runtime memory checks. (default = 100)"),
    cl::init(100));

/// Maximum SIMD width.
const unsigned VectorizerParams::MaxVectorWidth = 64;

/// We collect dependences up to this threshold.
static cl::opt<unsigned>
    MaxDependences("max-dependences", cl::Hidden,
                   cl::desc("Maximum number of dependences collected by "
                            "loop-access analysis (default = 100)"),
                   cl::init(100));

/// This enables versioning on the strides of symbolically striding memory
/// accesses in code like the following.
///   for (i = 0; i < N; ++i)
///     A[i * Stride1] += B[i * Stride2] ...
///
/// Will be roughly translated to
///    if (Stride1 == 1 && Stride2 == 1) {
///      for (i = 0; i < N; i+=4)
///       A[i:i+3] += ...
///    } else
///      ...
static cl::opt<bool> EnableMemAccessVersioning(
    "enable-mem-access-versioning", cl::init(true), cl::Hidden,
    cl::desc("Enable symbolic stride memory access versioning"));

/// Enable store-to-load forwarding conflict detection. This option can
/// be disabled for correctness testing.
static cl::opt<bool> EnableForwardingConflictDetection(
    "store-to-load-forwarding-conflict-detection", cl::Hidden,
    cl::desc("Enable conflict detection in loop-access analysis"),
    cl::init(true));

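
// A hedged aside: the flags above are ordinary llvm::cl options, so they can
// be toggled on the opt command line (e.g. `opt -loop-accesses -analyze
// -max-dependences=200 in.ll`), and a client pass could register a similar
// budget of its own as sketched below; the option name here is hypothetical.
#if 0 // illustration only, not part of the original file
static cl::opt<unsigned> MyRuntimeCheckBudget(
    "my-runtime-check-budget", cl::Hidden,
    cl::desc("Maximum number of runtime checks to emit (default = 8)"),
    cl::init(8));
#endif
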
bool VectorizerParams::isInterleaveForced() {
  return ::VectorizationInterleave.getNumOccurrences() > 0;
}

Value *llvm::stripIntegerCast(Value *V) {
  if (auto *CI = dyn_cast<CastInst>(V))
    if (CI->getOperand(0)->getType()->isIntegerTy())
      return CI->getOperand(0);
  return V;
}

const SCEV *llvm::replaceSymbolicStrideSCEV(PredicatedScalarEvolution &PSE,
                                            const ValueToValueMap &PtrToStride,
                                            Value *Ptr, Value *OrigPtr) {
  const SCEV *OrigSCEV = PSE.getSCEV(Ptr);

  // If there is an entry in the map return the SCEV of the pointer with the
  // symbolic stride replaced by one.
  ValueToValueMap::const_iterator SI =
      PtrToStride.find(OrigPtr ? OrigPtr : Ptr);
  if (SI != PtrToStride.end()) {
    Value *StrideVal = SI->second;

    // Strip casts.
    StrideVal = stripIntegerCast(StrideVal);

    ScalarEvolution *SE = PSE.getSE();
    const auto *U = cast<SCEVUnknown>(SE->getSCEV(StrideVal));
    const auto *CT =
        static_cast<const SCEVConstant *>(SE->getOne(StrideVal->getType()));

    PSE.addPredicate(*SE->getEqualPredicate(U, CT));
    auto *Expr = PSE.getSCEV(Ptr);

    LLVM_DEBUG(dbgs() << "LAA: Replacing SCEV: " << *OrigSCEV
                      << " by: " << *Expr << "\n");
    return Expr;
  }

  // Otherwise, just return the SCEV of the original pointer.
  return OrigSCEV;
}
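
// A minimal usage sketch (not part of the original file): callers such as the
// loop vectorizer record symbolic strides in a ValueToValueMap keyed by the
// pointer, then version the loop on the predicate this call adds to PSE.
// `Ptr` and `StrideVal` are hypothetical values from the client.
#if 0 // illustration only
ValueToValueMap SymbolicStrides;
SymbolicStrides[Ptr] = StrideVal; // Ptr's access strides by StrideVal.
const SCEV *Speculated =
    replaceSymbolicStrideSCEV(PSE, SymbolicStrides, Ptr);
// PSE now carries the assumption "StrideVal == 1"; the versioned loop must
// branch on that test at runtime.
#endif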

/// Calculate the Start and End points of a memory access.
/// Let's assume A is the first access and B is a memory access on the N-th
/// loop iteration. Then B is calculated as:
///   B = A + Step*N .
/// The Step value may be positive or negative.
/// N is the calculated back-edge taken count:
///     N = (TripCount > 0) ? RoundDown(TripCount - 1, VF) : 0
/// The Start and End points are calculated in the following way:
/// Start = UMIN(A, B); End = UMAX(A, B) + SizeOfElt,
/// where SizeOfElt is the size of a single memory access in bytes.
///
/// There is no conflict when the intervals are disjoint:
/// NoConflict = (P2.Start >= P1.End) || (P1.Start >= P2.End)
void RuntimePointerChecking::insert(Loop *Lp, Value *Ptr, bool WritePtr,
                                    unsigned DepSetId, unsigned ASId,
                                    const ValueToValueMap &Strides,
                                    PredicatedScalarEvolution &PSE) {
  // Get the stride-replaced SCEV.
  const SCEV *Sc = replaceSymbolicStrideSCEV(PSE, Strides, Ptr);
  ScalarEvolution *SE = PSE.getSE();

  const SCEV *ScStart;
  const SCEV *ScEnd;

  if (SE->isLoopInvariant(Sc, Lp))
    ScStart = ScEnd = Sc;
  else {
    const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(Sc);
    assert(AR && "Invalid addrec expression");
    const SCEV *Ex = PSE.getBackedgeTakenCount();

    ScStart = AR->getStart();
    ScEnd = AR->evaluateAtIteration(Ex, *SE);
    const SCEV *Step = AR->getStepRecurrence(*SE);

    // For expressions with negative step, the upper bound is ScStart and the
    // lower bound is ScEnd.
    if (const auto *CStep = dyn_cast<SCEVConstant>(Step)) {
      if (CStep->getValue()->isNegative())
        std::swap(ScStart, ScEnd);
    } else {
      // Fallback case: the step is not constant, but we can still
      // get the upper and lower bounds of the interval by using min/max
      // expressions.
      ScStart = SE->getUMinExpr(ScStart, ScEnd);
      ScEnd = SE->getUMaxExpr(AR->getStart(), ScEnd);
    }
    // Add the size of the pointed element to ScEnd.
    unsigned EltSize =
      Ptr->getType()->getPointerElementType()->getScalarSizeInBits() / 8;
    const SCEV *EltSizeSCEV = SE->getConstant(ScEnd->getType(), EltSize);
    ScEnd = SE->getAddExpr(ScEnd, EltSizeSCEV);
  }

  Pointers.emplace_back(Ptr, ScStart, ScEnd, WritePtr, DepSetId, ASId, Sc);
}
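
// A minimal sketch of the disjointness test described above, with the SCEV
// bounds lowered to plain integers (names hypothetical): two half-open
// intervals [Start, End) do not conflict iff one begins at or after the
// other ends.
#if 0 // illustration only
static bool noConflict(uint64_t Start1, uint64_t End1,
                       uint64_t Start2, uint64_t End2) {
  return Start2 >= End1 || Start1 >= End2;
}
#endif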

SmallVector<RuntimePointerChecking::PointerCheck, 4>
RuntimePointerChecking::generateChecks() const {
  SmallVector<PointerCheck, 4> Checks;

  for (unsigned I = 0; I < CheckingGroups.size(); ++I) {
    for (unsigned J = I + 1; J < CheckingGroups.size(); ++J) {
      const RuntimePointerChecking::CheckingPtrGroup &CGI = CheckingGroups[I];
      const RuntimePointerChecking::CheckingPtrGroup &CGJ = CheckingGroups[J];

      if (needsChecking(CGI, CGJ))
        Checks.push_back(std::make_pair(&CGI, &CGJ));
    }
  }
  return Checks;
}

void RuntimePointerChecking::generateChecks(
    MemoryDepChecker::DepCandidates &DepCands, bool UseDependencies) {
  assert(Checks.empty() && "Checks is not empty");
  groupChecks(DepCands, UseDependencies);
  Checks = generateChecks();
}

bool RuntimePointerChecking::needsChecking(const CheckingPtrGroup &M,
                                           const CheckingPtrGroup &N) const {
  for (unsigned I = 0, EI = M.Members.size(); EI != I; ++I)
    for (unsigned J = 0, EJ = N.Members.size(); EJ != J; ++J)
      if (needsChecking(M.Members[I], N.Members[J]))
        return true;
  return false;
}

/// Compare \p I and \p J and return the minimum.
/// Return nullptr in case we couldn't find an answer.
static const SCEV *getMinFromExprs(const SCEV *I, const SCEV *J,
                                   ScalarEvolution *SE) {
  const SCEV *Diff = SE->getMinusSCEV(J, I);
  const SCEVConstant *C = dyn_cast<const SCEVConstant>(Diff);

  if (!C)
    return nullptr;
  if (C->getValue()->isNegative())
    return J;
  return I;
}
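
// A worked example (hypothetical values), assuming a ScalarEvolution SE and
// some i64 value n: the minimum of {n, n + 4} is n regardless of argument
// order, and nullptr is returned when the difference does not fold to a
// constant.
#if 0 // illustration only
const SCEV *N = SE.getSCEV(SomeInt); // some i64 value n
const SCEV *NPlus4 = SE.getAddExpr(N, SE.getConstant(N->getType(), 4));
assert(getMinFromExprs(N, NPlus4, &SE) == N); // Diff = +4, keep I
assert(getMinFromExprs(NPlus4, N, &SE) == N); // Diff = -4, take J
#endif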

bool RuntimePointerChecking::CheckingPtrGroup::addPointer(unsigned Index) {
  const SCEV *Start = RtCheck.Pointers[Index].Start;
  const SCEV *End = RtCheck.Pointers[Index].End;

  // Compare the starts and ends with the known minimum and maximum
  // of this set. We need to know how we compare against the min/max
  // of the set in order to be able to emit memchecks.
  const SCEV *Min0 = getMinFromExprs(Start, Low, RtCheck.SE);
  if (!Min0)
    return false;

  const SCEV *Min1 = getMinFromExprs(End, High, RtCheck.SE);
  if (!Min1)
    return false;

  // Update the low bound expression if we've found a new min value.
  if (Min0 == Start)
    Low = Start;

  // Update the high bound expression if we've found a new max value.
  if (Min1 != End)
    High = End;

  Members.push_back(Index);
  return true;
}

void RuntimePointerChecking::groupChecks(
    MemoryDepChecker::DepCandidates &DepCands, bool UseDependencies) {
  // We build the groups from the dependency-candidate equivalence classes
  // because:
  //    - We know that pointers in the same equivalence class share
  //      the same underlying object and therefore there is a chance
  //      that we can compare pointers
  //    - We wouldn't be able to merge two pointers for which we need
  //      to emit a memcheck. The classes in DepCands are already
  //      conveniently built such that no two pointers in the same
  //      class need checking against each other.

  // We use the following (greedy) algorithm to construct the groups.
  // For every pointer in the equivalence class:
  //   For each existing group:
  //   - if the difference between this pointer and the min/max bounds
  //     of the group is a constant, then make the pointer part of the
  //     group and update the min/max bounds of that group as required.

  CheckingGroups.clear();

  // If we need to check two pointers to the same underlying object
  // with a non-constant difference, we shouldn't perform any pointer
  // grouping with those pointers. This is because we can easily get
  // into cases where the resulting check would return false, even when
  // the accesses are safe.
  //
  // The following example shows this:
  // for (i = 0; i < 1000; ++i)
  //   a[5000 + i * m] = a[i] + a[i + 9000]
  //
  // Here grouping gives a check of (5000, 5000 + 1000 * m) against
  // (0, 10000) which is always false. However, if m is 1, there is no
  // dependence. Not grouping the checks for a[i] and a[i + 9000] allows
  // us to perform an accurate check in this case.
  //
  // The above case requires that we have an UnknownDependence between
  // accesses to the same underlying object. This cannot happen unless
  // FoundNonConstantDistanceDependence is set, and therefore UseDependencies
  // is also false. In this case we will use the fallback path and create
  // separate checking groups for all pointers.

  // If we don't have the dependency partitions, construct a new
  // checking pointer group for each pointer. This is also required
  // for correctness, because in this case we can have checking between
  // pointers to the same underlying object.
  if (!UseDependencies) {
    for (unsigned I = 0; I < Pointers.size(); ++I)
      CheckingGroups.push_back(CheckingPtrGroup(I, *this));
    return;
  }

  unsigned TotalComparisons = 0;

  DenseMap<Value *, unsigned> PositionMap;
  for (unsigned Index = 0; Index < Pointers.size(); ++Index)
    PositionMap[Pointers[Index].PointerValue] = Index;

  // We need to keep track of what pointers we've already seen so we
  // don't process them twice.
  SmallSet<unsigned, 2> Seen;

  // Go through all equivalence classes, get the "pointer check groups"
  // and add them to the overall solution. We use the order in which accesses
  // appear in 'Pointers' to enforce determinism.
  for (unsigned I = 0; I < Pointers.size(); ++I) {
    // We've seen this pointer before, and therefore already processed
    // its equivalence class.
    if (Seen.count(I))
      continue;

    MemoryDepChecker::MemAccessInfo Access(Pointers[I].PointerValue,
                                           Pointers[I].IsWritePtr);

    SmallVector<CheckingPtrGroup, 2> Groups;
    auto LeaderI = DepCands.findValue(DepCands.getLeaderValue(Access));

    // Because DepCands is constructed by visiting accesses in the order in
    // which they appear in alias sets (which is deterministic) and the
    // iteration order within an equivalence class member is only dependent on
    // the order in which unions and insertions are performed on the
    // equivalence class, the iteration order is deterministic.
    for (auto MI = DepCands.member_begin(LeaderI), ME = DepCands.member_end();
         MI != ME; ++MI) {
      unsigned Pointer = PositionMap[MI->getPointer()];
      bool Merged = false;
      // Mark this pointer as seen.
      Seen.insert(Pointer);

      // Go through all the existing sets and see if we can find one
      // which can include this pointer.
      for (CheckingPtrGroup &Group : Groups) {
        // Don't perform more than a certain number of comparisons.
        // This should limit the cost of grouping the pointers to something
        // reasonable.  If we do end up hitting this threshold, the algorithm
        // will create separate groups for all remaining pointers.
        if (TotalComparisons > MemoryCheckMergeThreshold)
          break;

        TotalComparisons++;

        if (Group.addPointer(Pointer)) {
          Merged = true;
          break;
        }
      }

      if (!Merged)
        // We couldn't add this pointer to any existing set, or the threshold
        // for the number of comparisons has been reached. Create a new group
        // to hold the current pointer.
        Groups.push_back(CheckingPtrGroup(Pointer, *this));
    }

    // We've computed the grouped checks for this partition.
    // Save the results and continue with the next one.
    llvm::copy(Groups, std::back_inserter(CheckingGroups));
  }
}
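
// A condensed sketch of the greedy scheme above (editor's illustration).
// Constant offsets stand in for the SCEV start/end expressions, so here the
// "distance is a constant" merge test always succeeds and every pointer joins
// the first group, widening its bounds; the real code falls through to a new
// group when getMinFromExprs cannot fold the distance.
#if 0 // illustration only
struct SimpleGroup {
  int64_t Low, High;
  std::vector<unsigned> Members;
};

static void addToGroups(std::vector<SimpleGroup> &Groups, unsigned Idx,
                        int64_t Start, int64_t End) {
  for (SimpleGroup &G : Groups) {
    G.Low = std::min(G.Low, Start); // widen the group bounds
    G.High = std::max(G.High, End);
    G.Members.push_back(Idx);
    return;
  }
  Groups.push_back({Start, End, {Idx}}); // no group yet: start one
}
#endif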

bool RuntimePointerChecking::arePointersInSamePartition(
    const SmallVectorImpl<int> &PtrToPartition, unsigned PtrIdx1,
    unsigned PtrIdx2) {
  return (PtrToPartition[PtrIdx1] != -1 &&
          PtrToPartition[PtrIdx1] == PtrToPartition[PtrIdx2]);
}

bool RuntimePointerChecking::needsChecking(unsigned I, unsigned J) const {
  const PointerInfo &PointerI = Pointers[I];
  const PointerInfo &PointerJ = Pointers[J];

  // No need to check if two readonly pointers intersect.
  if (!PointerI.IsWritePtr && !PointerJ.IsWritePtr)
    return false;

  // Only need to check pointers between two different dependency sets.
  if (PointerI.DependencySetId == PointerJ.DependencySetId)
    return false;

  // Only need to check pointers in the same alias set.
  if (PointerI.AliasSetId != PointerJ.AliasSetId)
    return false;

  return true;
}

void RuntimePointerChecking::printChecks(
    raw_ostream &OS, const SmallVectorImpl<PointerCheck> &Checks,
    unsigned Depth) const {
  unsigned N = 0;
  for (const auto &Check : Checks) {
    const auto &First = Check.first->Members, &Second = Check.second->Members;

    OS.indent(Depth) << "Check " << N++ << ":\n";

    OS.indent(Depth + 2) << "Comparing group (" << Check.first << "):\n";
    for (unsigned K = 0; K < First.size(); ++K)
      OS.indent(Depth + 2) << *Pointers[First[K]].PointerValue << "\n";

    OS.indent(Depth + 2) << "Against group (" << Check.second << "):\n";
    for (unsigned K = 0; K < Second.size(); ++K)
      OS.indent(Depth + 2) << *Pointers[Second[K]].PointerValue << "\n";
  }
}

void RuntimePointerChecking::print(raw_ostream &OS, unsigned Depth) const {

  OS.indent(Depth) << "Run-time memory checks:\n";
  printChecks(OS, Checks, Depth);

  OS.indent(Depth) << "Grouped accesses:\n";
  for (unsigned I = 0; I < CheckingGroups.size(); ++I) {
    const auto &CG = CheckingGroups[I];

    OS.indent(Depth + 2) << "Group " << &CG << ":\n";
    OS.indent(Depth + 4) << "(Low: " << *CG.Low << " High: " << *CG.High
                         << ")\n";
    for (unsigned J = 0; J < CG.Members.size(); ++J) {
      OS.indent(Depth + 6) << "Member: " << *Pointers[CG.Members[J]].Expr
                           << "\n";
    }
  }
}

namespace {

/// Analyses memory accesses in a loop.
///
/// Checks whether run-time pointer checks are needed and builds sets for data
/// dependence checking.
class AccessAnalysis {
public:
  /// Read or write access location.
  typedef PointerIntPair<Value *, 1, bool> MemAccessInfo;
  typedef SmallVector<MemAccessInfo, 8> MemAccessInfoList;

  AccessAnalysis(const DataLayout &Dl, Loop *TheLoop, AliasAnalysis *AA,
                 LoopInfo *LI, MemoryDepChecker::DepCandidates &DA,
                 PredicatedScalarEvolution &PSE)
      : DL(Dl), TheLoop(TheLoop), AST(*AA), LI(LI), DepCands(DA),
        IsRTCheckAnalysisNeeded(false), PSE(PSE) {}

  /// Register a load and whether it is only read from.
  void addLoad(MemoryLocation &Loc, bool IsReadOnly) {
    Value *Ptr = const_cast<Value*>(Loc.Ptr);
    AST.add(Ptr, LocationSize::unknown(), Loc.AATags);
    Accesses.insert(MemAccessInfo(Ptr, false));
    if (IsReadOnly)
      ReadOnlyPtr.insert(Ptr);
  }

  /// Register a store.
  void addStore(MemoryLocation &Loc) {
    Value *Ptr = const_cast<Value*>(Loc.Ptr);
    AST.add(Ptr, LocationSize::unknown(), Loc.AATags);
    Accesses.insert(MemAccessInfo(Ptr, true));
  }

  /// Check if we can emit a run-time no-alias check for \p Access.
  ///
  /// Returns true if we can emit a run-time no-alias check for \p Access.
  /// If we can check this access, this also adds it to a dependence set and
  /// adds a run-time check for it to \p RtCheck. If \p Assume is true,
  /// we will attempt to use additional run-time checks in order to get
  /// the bounds of the pointer.
  bool createCheckForAccess(RuntimePointerChecking &RtCheck,
                            MemAccessInfo Access,
                            const ValueToValueMap &Strides,
                            DenseMap<Value *, unsigned> &DepSetId,
                            Loop *TheLoop, unsigned &RunningDepId,
                            unsigned ASId, bool ShouldCheckStride,
                            bool Assume);

  /// Check whether we can check the pointers at runtime for
  /// non-intersection.
  ///
  /// Returns true if we need no check or if we do and we can generate them
  /// (i.e. the pointers have computable bounds).
  bool canCheckPtrAtRT(RuntimePointerChecking &RtCheck, ScalarEvolution *SE,
                       Loop *TheLoop, const ValueToValueMap &Strides,
                       bool ShouldCheckWrap = false);

  /// Goes over all memory accesses, checks whether a RT check is needed
  /// and builds sets of dependent accesses.
  void buildDependenceSets() {
    processMemAccesses();
  }

  /// Initial processing of memory accesses determined that we need to
  /// perform dependency checking.
  ///
  /// Note that this can later be cleared if we retry memcheck analysis without
  /// dependency checking (i.e. FoundNonConstantDistanceDependence).
  bool isDependencyCheckNeeded() { return !CheckDeps.empty(); }

  /// We decided that no dependence analysis would be used.  Reset the state.
  void resetDepChecks(MemoryDepChecker &DepChecker) {
    CheckDeps.clear();
    DepChecker.clearDependences();
  }

  MemAccessInfoList &getDependenciesToCheck() { return CheckDeps; }

private:
  typedef SetVector<MemAccessInfo> PtrAccessSet;

  /// Go over all memory accesses and check whether runtime pointer checks
  /// are needed and build sets of dependency check candidates.
  void processMemAccesses();

  /// Set of all accesses.
  PtrAccessSet Accesses;

  const DataLayout &DL;

  /// The loop being checked.
  const Loop *TheLoop;

  /// List of accesses that need a further dependence check.
  MemAccessInfoList CheckDeps;

  /// Set of pointers that are read only.
  SmallPtrSet<Value*, 16> ReadOnlyPtr;

  /// An alias set tracker to partition the access set by underlying object and
  /// intrinsic property (such as TBAA metadata).
  AliasSetTracker AST;

  LoopInfo *LI;

  /// Sets of potentially dependent accesses - members of one set share an
  /// underlying pointer. The set "CheckDeps" identifies which sets really need
  /// a dependence check.
  MemoryDepChecker::DepCandidates &DepCands;

  /// Initial processing of memory accesses determined that we may need
  /// to add memchecks.  Perform the analysis to determine the necessary checks.
  ///
  /// Note that this is different from isDependencyCheckNeeded.  When we retry
  /// memcheck analysis without dependency checking
  /// (i.e. FoundNonConstantDistanceDependence), isDependencyCheckNeeded is
  /// cleared while this remains set if we have potentially dependent accesses.
  bool IsRTCheckAnalysisNeeded;

  /// The SCEV predicate containing all the SCEV-related assumptions.
  PredicatedScalarEvolution &PSE;
};

} // end anonymous namespace

/// Check whether a pointer can participate in a runtime bounds check.
/// If \p Assume, try harder to prove that we can compute the bounds of \p Ptr
/// by adding run-time checks (overflow checks) if necessary.
static bool hasComputableBounds(PredicatedScalarEvolution &PSE,
                                const ValueToValueMap &Strides, Value *Ptr,
                                Loop *L, bool Assume) {
  const SCEV *PtrScev = replaceSymbolicStrideSCEV(PSE, Strides, Ptr);

  // The bounds for a loop-invariant pointer are trivial.
  if (PSE.getSE()->isLoopInvariant(PtrScev, L))
    return true;

  const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(PtrScev);

  if (!AR && Assume)
    AR = PSE.getAsAddRec(Ptr);

  if (!AR)
    return false;

  return AR->isAffine();
}
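
// For intuition (hypothetical SCEVs): an affine recurrence such as
// {%base,+,4}<%loop> yields computable [start, end] bounds, while a
// non-affine one such as {%base,+,{0,+,4}}<%loop> is rejected even though it
// is an AddRec.
#if 0 // illustration only
const SCEV *S = PSE.getSCEV(Ptr);
if (const auto *AR = dyn_cast<SCEVAddRecExpr>(S)) {
  bool Bounded = AR->isAffine(); // exactly {Start,+,Step}
  (void)Bounded;
}
#endif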

/// Check whether a pointer address cannot wrap.
static bool isNoWrap(PredicatedScalarEvolution &PSE,
                     const ValueToValueMap &Strides, Value *Ptr, Loop *L) {
  const SCEV *PtrScev = PSE.getSCEV(Ptr);
  if (PSE.getSE()->isLoopInvariant(PtrScev, L))
    return true;

  int64_t Stride = getPtrStride(PSE, Ptr, L, Strides);
  if (Stride == 1 || PSE.hasNoOverflow(Ptr, SCEVWrapPredicate::IncrementNUSW))
    return true;

  return false;
}

bool AccessAnalysis::createCheckForAccess(RuntimePointerChecking &RtCheck,
                                          MemAccessInfo Access,
                                          const ValueToValueMap &StridesMap,
                                          DenseMap<Value *, unsigned> &DepSetId,
                                          Loop *TheLoop, unsigned &RunningDepId,
                                          unsigned ASId, bool ShouldCheckWrap,
                                          bool Assume) {
  Value *Ptr = Access.getPointer();

  if (!hasComputableBounds(PSE, StridesMap, Ptr, TheLoop, Assume))
    return false;

  // When we run after a failing dependency check we have to make sure
  // we don't have wrapping pointers.
  if (ShouldCheckWrap && !isNoWrap(PSE, StridesMap, Ptr, TheLoop)) {
    auto *Expr = PSE.getSCEV(Ptr);
    if (!Assume || !isa<SCEVAddRecExpr>(Expr))
      return false;
    PSE.setNoOverflow(Ptr, SCEVWrapPredicate::IncrementNUSW);
  }

  // The id of the dependence set.
  unsigned DepId;

  if (isDependencyCheckNeeded()) {
    Value *Leader = DepCands.getLeaderValue(Access).getPointer();
    unsigned &LeaderId = DepSetId[Leader];
    if (!LeaderId)
      LeaderId = RunningDepId++;
    DepId = LeaderId;
  } else
    // Each access has its own dependence set.
    DepId = RunningDepId++;

  bool IsWrite = Access.getInt();
  RtCheck.insert(TheLoop, Ptr, IsWrite, DepId, ASId, StridesMap, PSE);
  LLVM_DEBUG(dbgs() << "LAA: Found a runtime check ptr:" << *Ptr << '\n');

  return true;
}

bool AccessAnalysis::canCheckPtrAtRT(RuntimePointerChecking &RtCheck,
                                     ScalarEvolution *SE, Loop *TheLoop,
                                     const ValueToValueMap &StridesMap,
                                     bool ShouldCheckWrap) {
  // Find pointers with computable bounds. We are going to use this information
  // to place a runtime bound check.
  bool CanDoRT = true;

  bool NeedRTCheck = false;
  if (!IsRTCheckAnalysisNeeded) return true;

  bool IsDepCheckNeeded = isDependencyCheckNeeded();

  // We assign a consecutive id to accesses from different alias sets.
  // Accesses between different groups don't need to be checked.
  unsigned ASId = 1;
  for (auto &AS : AST) {
    int NumReadPtrChecks = 0;
    int NumWritePtrChecks = 0;
    bool CanDoAliasSetRT = true;

    // We assign consecutive ids to accesses from different dependence sets.
    // Accesses within the same set don't need a runtime check.
    unsigned RunningDepId = 1;
    DenseMap<Value *, unsigned> DepSetId;

    SmallVector<MemAccessInfo, 4> Retries;

    for (auto A : AS) {
      Value *Ptr = A.getValue();
      bool IsWrite = Accesses.count(MemAccessInfo(Ptr, true));
      MemAccessInfo Access(Ptr, IsWrite);

      if (IsWrite)
        ++NumWritePtrChecks;
      else
        ++NumReadPtrChecks;

      if (!createCheckForAccess(RtCheck, Access, StridesMap, DepSetId, TheLoop,
                                RunningDepId, ASId, ShouldCheckWrap, false)) {
        LLVM_DEBUG(dbgs() << "LAA: Can't find bounds for ptr:" << *Ptr << '\n');
        Retries.push_back(Access);
        CanDoAliasSetRT = false;
      }
    }

    // If we have at least two writes or one write and a read then we need to
    // check them.  But there is no need for checks if there is only one
    // dependence set for this alias set.
    //
    // Note that this function computes CanDoRT and NeedRTCheck independently.
    // For example CanDoRT=false, NeedRTCheck=false means that we have a pointer
    // for which we couldn't find the bounds but we don't actually need to emit
    // any checks so it does not matter.
    bool NeedsAliasSetRTCheck = false;
    if (!(IsDepCheckNeeded && CanDoAliasSetRT && RunningDepId == 2))
      NeedsAliasSetRTCheck = (NumWritePtrChecks >= 2 ||
                             (NumReadPtrChecks >= 1 && NumWritePtrChecks >= 1));

    // We need to perform run-time alias checks, but some pointers had bounds
    // that couldn't be checked.
    if (NeedsAliasSetRTCheck && !CanDoAliasSetRT) {
      // Reset the CanDoSetRt flag and retry all accesses that have failed.
      // We know that we need these checks, so we can now be more aggressive
      // and add further checks if required (overflow checks).
      CanDoAliasSetRT = true;
      for (auto Access : Retries)
        if (!createCheckForAccess(RtCheck, Access, StridesMap, DepSetId,
                                  TheLoop, RunningDepId, ASId,
                                  ShouldCheckWrap, /*Assume=*/true)) {
          CanDoAliasSetRT = false;
          break;
        }
    }

    CanDoRT &= CanDoAliasSetRT;
    NeedRTCheck |= NeedsAliasSetRTCheck;
    ++ASId;
  }

  // If the pointers that we would use for the bounds comparison have different
  // address spaces, assume the values aren't directly comparable, so we can't
  // use them for the runtime check. We also have to assume they could
  // overlap. In the future there should be metadata for whether address spaces
  // are disjoint.
  unsigned NumPointers = RtCheck.Pointers.size();
  for (unsigned i = 0; i < NumPointers; ++i) {
    for (unsigned j = i + 1; j < NumPointers; ++j) {
      // Only need to check pointers between two different dependency sets.
      if (RtCheck.Pointers[i].DependencySetId ==
          RtCheck.Pointers[j].DependencySetId)
        continue;
      // Only need to check pointers in the same alias set.
      if (RtCheck.Pointers[i].AliasSetId != RtCheck.Pointers[j].AliasSetId)
        continue;

      Value *PtrI = RtCheck.Pointers[i].PointerValue;
      Value *PtrJ = RtCheck.Pointers[j].PointerValue;

      unsigned ASi = PtrI->getType()->getPointerAddressSpace();
      unsigned ASj = PtrJ->getType()->getPointerAddressSpace();
      if (ASi != ASj) {
        LLVM_DEBUG(
            dbgs() << "LAA: Runtime check would require comparison between"
                      " different address spaces\n");
        return false;
      }
    }
  }

  if (NeedRTCheck && CanDoRT)
    RtCheck.generateChecks(DepCands, IsDepCheckNeeded);

  LLVM_DEBUG(dbgs() << "LAA: We need to do " << RtCheck.getNumberOfChecks()
                    << " pointer comparisons.\n");

  RtCheck.Need = NeedRTCheck;

  bool CanDoRTIfNeeded = !NeedRTCheck || CanDoRT;
  if (!CanDoRTIfNeeded)
    RtCheck.reset();
  return CanDoRTIfNeeded;
}

void AccessAnalysis::processMemAccesses() {
  // We process the set twice: first we process read-write pointers, last we
  // process read-only pointers. This allows us to skip dependence tests for
  // read-only pointers.

  LLVM_DEBUG(dbgs() << "LAA: Processing memory accesses...\n");
  LLVM_DEBUG(dbgs() << "  AST: "; AST.dump());
  LLVM_DEBUG(dbgs() << "LAA:   Accesses(" << Accesses.size() << "):\n");
  LLVM_DEBUG({
    for (auto A : Accesses)
      dbgs() << "\t" << *A.getPointer() << " (" <<
                (A.getInt() ? "write" : (ReadOnlyPtr.count(A.getPointer()) ?
                                         "read-only" : "read")) << ")\n";
  });

  // The AliasSetTracker has nicely partitioned our pointers by metadata
  // compatibility and potential for underlying-object overlap. As a result, we
  // only need to check for potential pointer dependencies within each alias
  // set.
  for (auto &AS : AST) {
    // Note that both the alias-set tracker and the alias sets themselves use
    // linked lists internally and so the iteration order here is deterministic
    // (matching the original instruction order within each set).

    bool SetHasWrite = false;

    // Map of pointers to last access encountered.
    typedef DenseMap<const Value*, MemAccessInfo> UnderlyingObjToAccessMap;
    UnderlyingObjToAccessMap ObjToLastAccess;

    // Set of accesses to check after all writes have been processed.
    PtrAccessSet DeferredAccesses;

    // Iterate over each alias set twice, once to process read/write pointers,
    // and then to process read-only pointers.
    for (int SetIteration = 0; SetIteration < 2; ++SetIteration) {
      bool UseDeferred = SetIteration > 0;
      PtrAccessSet &S = UseDeferred ? DeferredAccesses : Accesses;

      for (auto AV : AS) {
        Value *Ptr = AV.getValue();

        // For a single memory access in AliasSetTracker, Accesses may contain
        // both read and write, and they both need to be handled for CheckDeps.
        for (auto AC : S) {
          if (AC.getPointer() != Ptr)
            continue;

          bool IsWrite = AC.getInt();

          // If we're using the deferred access set, then it contains only
          // reads.
          bool IsReadOnlyPtr = ReadOnlyPtr.count(Ptr) && !IsWrite;
          if (UseDeferred && !IsReadOnlyPtr)
            continue;
          // Otherwise, the pointer must be in the PtrAccessSet, either as a
          // read or a write.
          assert(((IsReadOnlyPtr && UseDeferred) || IsWrite ||
                  S.count(MemAccessInfo(Ptr, false))) &&
                 "Alias-set pointer not in the access set?");

          MemAccessInfo Access(Ptr, IsWrite);
          DepCands.insert(Access);

          // Memorize read-only pointers for later processing and skip them in
          // the first round (they need to be checked after we have seen all
          // write pointers). Note: we also mark pointers that are not
          // consecutive as "read-only" pointers (so that we check
          // "a[b[i]] +="). Hence, we need the second check for "!IsWrite".
          if (!UseDeferred && IsReadOnlyPtr) {
            DeferredAccesses.insert(Access);
            continue;
          }

          // If this is a write, check other reads and writes for conflicts. If
          // this is a read, only check other writes for conflicts (but only if
          // there is no other write to the ptr - this is an optimization to
          // catch "a[i] = a[i] + " without having to do a dependence check).
          if ((IsWrite || IsReadOnlyPtr) && SetHasWrite) {
            CheckDeps.push_back(Access);
            IsRTCheckAnalysisNeeded = true;
          }

          if (IsWrite)
            SetHasWrite = true;

          // Create sets of pointers connected by a shared alias set and
          // underlying object.
          typedef SmallVector<const Value *, 16> ValueVector;
          ValueVector TempObjects;

          GetUnderlyingObjects(Ptr, TempObjects, DL, LI);
          LLVM_DEBUG(dbgs()
                     << "Underlying objects for pointer " << *Ptr << "\n");
          for (const Value *UnderlyingObj : TempObjects) {
            // nullptr never aliases, so don't join sets for pointers that have
            // "null" in their UnderlyingObjects list.
            if (isa<ConstantPointerNull>(UnderlyingObj) &&
                !NullPointerIsDefined(
                    TheLoop->getHeader()->getParent(),
                    UnderlyingObj->getType()->getPointerAddressSpace()))
              continue;

            UnderlyingObjToAccessMap::iterator Prev =
                ObjToLastAccess.find(UnderlyingObj);
            if (Prev != ObjToLastAccess.end())
              DepCands.unionSets(Access, Prev->second);

            ObjToLastAccess[UnderlyingObj] = Access;
            LLVM_DEBUG(dbgs() << "  " << *UnderlyingObj << "\n");
          }
        }
      }
    }
  }
}
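
// A minimal sketch of the DepCands bookkeeping above (editor's illustration,
// with hypothetical pointers PtrA/PtrB): EquivalenceClasses starts every
// access in its own class, and unionSets merges the classes of two accesses
// that share an underlying object.
#if 0 // illustration only
MemoryDepChecker::DepCandidates DepCands;
MemoryDepChecker::MemAccessInfo WriteA(PtrA, /*IsWrite=*/true);
MemoryDepChecker::MemAccessInfo ReadB(PtrB, /*IsWrite=*/false);
DepCands.insert(WriteA);
DepCands.insert(ReadB);
DepCands.unionSets(WriteA, ReadB); // same underlying object => one class
#endif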

static bool isInBoundsGep(Value *Ptr) {
  if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Ptr))
    return GEP->isInBounds();
  return false;
}

/// Return true if an AddRec pointer \p Ptr is unsigned non-wrapping,
/// i.e. monotonically increasing/decreasing.
static bool isNoWrapAddRec(Value *Ptr, const SCEVAddRecExpr *AR,
                           PredicatedScalarEvolution &PSE, const Loop *L) {
  // FIXME: This should probably only return true for NUW.
  if (AR->getNoWrapFlags(SCEV::NoWrapMask))
    return true;

  // Scalar evolution does not propagate the non-wrapping flags to values that
  // are derived from a non-wrapping induction variable because non-wrapping
  // could be flow-sensitive.
  //
  // Look through the potentially overflowing instruction to try to prove
  // non-wrapping for the *specific* value of Ptr.

  // The arithmetic implied by an inbounds GEP can't overflow.
  auto *GEP = dyn_cast<GetElementPtrInst>(Ptr);
  if (!GEP || !GEP->isInBounds())
    return false;

  // Make sure there is only one non-const index and analyze that.
  Value *NonConstIndex = nullptr;
  for (Value *Index : make_range(GEP->idx_begin(), GEP->idx_end()))
    if (!isa<ConstantInt>(Index)) {
      if (NonConstIndex)
        return false;
      NonConstIndex = Index;
    }
  if (!NonConstIndex)
    // The recurrence is on the pointer, ignore for now.
    return false;

  // The index in GEP is signed.  It is non-wrapping if it's derived from a NSW
  // AddRec using a NSW operation.
  if (auto *OBO = dyn_cast<OverflowingBinaryOperator>(NonConstIndex))
    if (OBO->hasNoSignedWrap() &&
        // Assume the other operand is constant so that the AddRec can be
        // easily found.
        isa<ConstantInt>(OBO->getOperand(1))) {
      auto *OpScev = PSE.getSCEV(OBO->getOperand(0));

      if (auto *OpAR = dyn_cast<SCEVAddRecExpr>(OpScev))
        return OpAR->getLoop() == L && OpAR->getNoWrapFlags(SCEV::FlagNSW);
    }

  return false;
}

/// Check whether the access through \p Ptr has a constant stride.
int64_t llvm::getPtrStride(PredicatedScalarEvolution &PSE, Value *Ptr,
                           const Loop *Lp, const ValueToValueMap &StridesMap,
                           bool Assume, bool ShouldCheckWrap) {
  Type *Ty = Ptr->getType();
  assert(Ty->isPointerTy() && "Unexpected non-ptr");

  // Make sure that the pointer does not point to aggregate types.
  auto *PtrTy = cast<PointerType>(Ty);
  if (PtrTy->getElementType()->isAggregateType()) {
    LLVM_DEBUG(dbgs() << "LAA: Bad stride - Not a pointer to a scalar type"
                      << *Ptr << "\n");
    return 0;
  }

  const SCEV *PtrScev = replaceSymbolicStrideSCEV(PSE, StridesMap, Ptr);

  const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(PtrScev);
  if (Assume && !AR)
    AR = PSE.getAsAddRec(Ptr);

  if (!AR) {
    LLVM_DEBUG(dbgs() << "LAA: Bad stride - Not an AddRecExpr pointer " << *Ptr
                      << " SCEV: " << *PtrScev << "\n");
    return 0;
  }

  // The access function must stride over the innermost loop.
  if (Lp != AR->getLoop()) {
    LLVM_DEBUG(dbgs() << "LAA: Bad stride - Not striding over innermost loop "
                      << *Ptr << " SCEV: " << *AR << "\n");
    return 0;
  }

  // The address calculation must not wrap. Otherwise, a dependence could be
  // inverted.
  // An inbounds getelementptr that is an AddRec with a unit stride
  // cannot wrap per definition. The unit stride requirement is checked later.
  // A getelementptr without an inbounds attribute and unit stride would have
  // to access the pointer value "0" which is undefined behavior in address
  // space 0, therefore we can also vectorize this case.
  bool IsInBoundsGEP = isInBoundsGep(Ptr);
  bool IsNoWrapAddRec = !ShouldCheckWrap ||
    PSE.hasNoOverflow(Ptr, SCEVWrapPredicate::IncrementNUSW) ||
    isNoWrapAddRec(Ptr, AR, PSE, Lp);
  if (!IsNoWrapAddRec && !IsInBoundsGEP &&
      NullPointerIsDefined(Lp->getHeader()->getParent(),
                           PtrTy->getAddressSpace())) {
    if (Assume) {
      PSE.setNoOverflow(Ptr, SCEVWrapPredicate::IncrementNUSW);
      IsNoWrapAddRec = true;
      LLVM_DEBUG(dbgs() << "LAA: Pointer may wrap in the address space:\n"
                        << "LAA:   Pointer: " << *Ptr << "\n"
                        << "LAA:   SCEV: " << *AR << "\n"
                        << "LAA:   Added an overflow assumption\n");
    } else {
      LLVM_DEBUG(
          dbgs() << "LAA: Bad stride - Pointer may wrap in the address space "
                 << *Ptr << " SCEV: " << *AR << "\n");
      return 0;
    }
  }

  // Check the step is constant.
  const SCEV *Step = AR->getStepRecurrence(*PSE.getSE());

  // Calculate the pointer stride and check if it is constant.
  const SCEVConstant *C = dyn_cast<SCEVConstant>(Step);
  if (!C) {
    LLVM_DEBUG(dbgs() << "LAA: Bad stride - Not a constant strided " << *Ptr
                      << " SCEV: " << *AR << "\n");
    return 0;
  }

  auto &DL = Lp->getHeader()->getModule()->getDataLayout();
  int64_t Size = DL.getTypeAllocSize(PtrTy->getElementType());
  const APInt &APStepVal = C->getAPInt();

  // Huge step value - give up.
  if (APStepVal.getBitWidth() > 64)
    return 0;

  int64_t StepVal = APStepVal.getSExtValue();

  // Strided access.
  int64_t Stride = StepVal / Size;
  int64_t Rem = StepVal % Size;
  if (Rem)
    return 0;

  // If the SCEV could wrap but we have an inbounds gep with a unit stride we
  // know we can't "wrap around the address space". In case of address space
  // zero we know that this won't happen without triggering undefined behavior.
  if (!IsNoWrapAddRec && Stride != 1 && Stride != -1 &&
      (IsInBoundsGEP || !NullPointerIsDefined(Lp->getHeader()->getParent(),
                                              PtrTy->getAddressSpace()))) {
    if (Assume) {
      // We can avoid this case by adding a run-time check.
      LLVM_DEBUG(dbgs() << "LAA: Non unit strided pointer which is not either "
                        << "inbounds or in address space 0 may wrap:\n"
                        << "LAA:   Pointer: " << *Ptr << "\n"
                        << "LAA:   SCEV: " << *AR << "\n"
                        << "LAA:   Added an overflow assumption\n");
      PSE.setNoOverflow(Ptr, SCEVWrapPredicate::IncrementNUSW);
    } else
      return 0;
  }

  return Stride;
}
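
// A worked example of the final division (hypothetical numbers): for i32
// elements Size = 4, so a byte step of +8 gives Stride = 2 (every other
// element) with Rem = 0, while a byte step of +6 gives Rem = 2 and the
// function returns 0 because the step is not a whole number of elements.
#if 0 // illustration only
int64_t Size = 4;                // allocation size of the element type
int64_t StepVal = 8;             // byte step of the AddRec per iteration
int64_t Stride = StepVal / Size; // 2
int64_t Rem = StepVal % Size;    // 0, so the stride is meaningful
#endif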

bool llvm::sortPtrAccesses(ArrayRef<Value *> VL, const DataLayout &DL,
                           ScalarEvolution &SE,
                           SmallVectorImpl<unsigned> &SortedIndices) {
  assert(llvm::all_of(
             VL, [](const Value *V) { return V->getType()->isPointerTy(); }) &&
         "Expected list of pointer operands.");
  SmallVector<std::pair<int64_t, Value *>, 4> OffValPairs;
  OffValPairs.reserve(VL.size());

  // Walk over the pointers, and map each of them to an offset relative to
  // the first pointer in the array.
  Value *Ptr0 = VL[0];
  const SCEV *Scev0 = SE.getSCEV(Ptr0);
  Value *Obj0 = GetUnderlyingObject(Ptr0, DL);

  llvm::SmallSet<int64_t, 4> Offsets;
  for (auto *Ptr : VL) {
    // TODO: Outline this code as a special, more time consuming, version of
    // computeConstantDifference() function.
    if (Ptr->getType()->getPointerAddressSpace() !=
        Ptr0->getType()->getPointerAddressSpace())
      return false;
    // If a pointer refers to a different underlying object, bail - the
    // pointers are by definition incomparable.
    Value *CurrObj = GetUnderlyingObject(Ptr, DL);
    if (CurrObj != Obj0)
      return false;

    const SCEV *Scev = SE.getSCEV(Ptr);
    const auto *Diff = dyn_cast<SCEVConstant>(SE.getMinusSCEV(Scev, Scev0));
    // The pointers may not have a constant offset from each other, or SCEV
    // may just not be smart enough to figure out they do. Regardless,
    // there's nothing we can do.
    if (!Diff)
      return false;

    // Check if a pointer with the same offset was already found.
    int64_t Offset = Diff->getAPInt().getSExtValue();
    if (!Offsets.insert(Offset).second)
      return false;
    OffValPairs.emplace_back(Offset, Ptr);
  }
  SortedIndices.clear();
  SortedIndices.resize(VL.size());
  std::iota(SortedIndices.begin(), SortedIndices.end(), 0);

  // Sort the indices by the offset of the corresponding memory access.
  llvm::stable_sort(SortedIndices, [&](unsigned Left, unsigned Right) {
    return OffValPairs[Left].first < OffValPairs[Right].first;
  });

  // Check if the order is consecutive already.
  if (llvm::all_of(SortedIndices, [&SortedIndices](const unsigned I) {
        return I == SortedIndices[I];
      }))
    SortedIndices.clear();

  return true;
}
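
// A hedged usage sketch: a caller (the SLP vectorizer is one user) passes the
// pointer operands of a candidate bundle; on success, an empty SortedIndices
// means the pointers were already in increasing-offset order. `PtrList`, `DL`
// and `SE` are assumed to be in scope.
#if 0 // illustration only
SmallVector<unsigned, 4> SortedIndices;
if (sortPtrAccesses(PtrList, DL, SE, SortedIndices)) {
  if (SortedIndices.empty()) {
    // Already consecutive in offset order.
  } else {
    // PtrList[SortedIndices[0]] has the lowest offset, and so on.
  }
}
#endif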
1159
1160
/// Take the address space operand from the Load/Store instruction.
1161
/// Returns -1 if this is not a valid Load/Store instruction.
1162
5.54M
static unsigned getAddressSpaceOperand(Value *I) {
1163
5.54M
  if (LoadInst *L = dyn_cast<LoadInst>(I))
1164
231k
    return L->getPointerAddressSpace();
1165
5.30M
  if (StoreInst *S = dyn_cast<StoreInst>(I))
1166
5.30M
    return S->getPointerAddressSpace();
1167
0
  return -1;
1168
0
}
1169
1170
/// Returns true if the memory operations \p A and \p B are consecutive.
1171
bool llvm::isConsecutiveAccess(Value *A, Value *B, const DataLayout &DL,
1172
2.77M
                               ScalarEvolution &SE, bool CheckType) {
1173
2.77M
  Value *PtrA = getLoadStorePointerOperand(A);
1174
2.77M
  Value *PtrB = getLoadStorePointerOperand(B);
1175
2.77M
  unsigned ASA = getAddressSpaceOperand(A);
1176
2.77M
  unsigned ASB = getAddressSpaceOperand(B);
1177
2.77M
1178
2.77M
  // Check that the address spaces match and that the pointers are valid.
1179
2.77M
  if (!PtrA || !PtrB || (ASA != ASB))
1180
4
    return false;
1181
2.77M
1182
2.77M
  // Make sure that A and B are different pointers.
1183
2.77M
  if (PtrA == PtrB)
1184
42.5k
    return false;
1185
2.72M
1186
2.72M
  // Make sure that A and B have the same type if required.
1187
2.72M
  if (CheckType && 
PtrA->getType() != PtrB->getType()2.69M
)
1188
947k
    return false;
1189
1.78M
1190
1.78M
  unsigned IdxWidth = DL.getIndexSizeInBits(ASA);
1191
1.78M
  Type *Ty = cast<PointerType>(PtrA->getType())->getElementType();
1192
1.78M
1193
1.78M
  APInt OffsetA(IdxWidth, 0), OffsetB(IdxWidth, 0);
1194
1.78M
  PtrA = PtrA->stripAndAccumulateInBoundsConstantOffsets(DL, OffsetA);
1195
1.78M
  PtrB = PtrB->stripAndAccumulateInBoundsConstantOffsets(DL, OffsetB);
1196
1.78M
1197
1.78M
  // Retrieve the address space again as pointer stripping now tracks through
1198
1.78M
  // `addrspacecast`.
1199
1.78M
  ASA = cast<PointerType>(PtrA->getType())->getAddressSpace();
1200
1.78M
  ASB = cast<PointerType>(PtrB->getType())->getAddressSpace();
1201
1.78M
  // Check that the address spaces match and that the pointers are valid.
1202
1.78M
  if (ASA != ASB)
1203
0
    return false;
1204
1.78M
1205
1.78M
  IdxWidth = DL.getIndexSizeInBits(ASA);
1206
1.78M
  OffsetA = OffsetA.sextOrTrunc(IdxWidth);
1207
1.78M
  OffsetB = OffsetB.sextOrTrunc(IdxWidth);
1208
1.78M
1209
1.78M
  APInt Size(IdxWidth, DL.getTypeStoreSize(Ty));
1210
1.78M
1211
1.78M
  //  OffsetDelta = OffsetB - OffsetA;
1212
1.78M
  const SCEV *OffsetSCEVA = SE.getConstant(OffsetA);
1213
1.78M
  const SCEV *OffsetSCEVB = SE.getConstant(OffsetB);
1214
1.78M
  const SCEV *OffsetDeltaSCEV = SE.getMinusSCEV(OffsetSCEVB, OffsetSCEVA);
1215
1.78M
  const SCEVConstant *OffsetDeltaC = dyn_cast<SCEVConstant>(OffsetDeltaSCEV);
1216
1.78M
  const APInt &OffsetDelta = OffsetDeltaC->getAPInt();
1217
1.78M
  // Check if they are based on the same pointer. That makes the offsets
1218
1.78M
  // sufficient.
1219
1.78M
  if (PtrA == PtrB)
1220
1.63M
    return OffsetDelta == Size;
1221
142k
1222
142k
  // Compute the base pointer delta needed to make the final delta
1223
142k
  // equal to the size.
1224
142k
  // BaseDelta = Size - OffsetDelta;
1225
142k
  const SCEV *SizeSCEV = SE.getConstant(Size);
1226
142k
  const SCEV *BaseDelta = SE.getMinusSCEV(SizeSCEV, OffsetDeltaSCEV);
1227
142k
1228
142k
  // Otherwise, compute the distance between the base pointers with SCEV.
1229
142k
  const SCEV *PtrSCEVA = SE.getSCEV(PtrA);
1230
142k
  const SCEV *PtrSCEVB = SE.getSCEV(PtrB);
1231
142k
  const SCEV *X = SE.getAddExpr(PtrSCEVA, BaseDelta);
1232
142k
  return X == PtrSCEVB;
1233
142k
}
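
To make the same-base fast path above concrete: for two i32 loads from a[0] and a[1], the stripped base pointers coincide, OffsetA = 0, OffsetB = 4 and Size = 4, so OffsetDelta == Size and the accesses are consecutive. A minimal sketch of that arithmetic, detached from SCEV (hypothetical helper, for illustration only):

#include <cstdint>

// Same-base fast path: B immediately follows A iff the byte offset of B
// minus the byte offset of A equals the store size of the element type.
bool consecutiveSameBase(int64_t OffsetA, int64_t OffsetB, uint64_t Size) {
  return static_cast<uint64_t>(OffsetB - OffsetA) == Size;
}
// consecutiveSameBase(0, 4, 4) -> true  (a[0] then a[1], 4-byte elements)
// consecutiveSameBase(0, 8, 4) -> false (a[0] then a[2])
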
1234
1235
MemoryDepChecker::VectorizationSafetyStatus
1236
216k
MemoryDepChecker::Dependence::isSafeForVectorization(DepType Type) {
1237
216k
  switch (Type) {
1238
216k
  case NoDep:
1239
197k
  case Forward:
1240
197k
  case BackwardVectorizable:
1241
197k
    return VectorizationSafetyStatus::Safe;
1242
197k
1243
197k
  case Unknown:
1244
18.9k
    return VectorizationSafetyStatus::PossiblySafeWithRtChecks;
1245
197k
  case ForwardButPreventsForwarding:
1246
510
  case Backward:
1247
510
  case BackwardVectorizableButPreventsForwarding:
1248
510
    return VectorizationSafetyStatus::Unsafe;
1249
0
  }
1250
0
  llvm_unreachable("unexpected DepType!");
1251
0
}
1252
1253
5.71k
bool MemoryDepChecker::Dependence::isBackward() const {
1254
5.71k
  switch (Type) {
1255
5.71k
  case NoDep:
1256
3.60k
  case Forward:
1257
3.60k
  case ForwardButPreventsForwarding:
1258
3.60k
  case Unknown:
1259
3.60k
    return false;
1260
3.60k
1261
3.60k
  case BackwardVectorizable:
1262
2.11k
  case Backward:
1263
2.11k
  case BackwardVectorizableButPreventsForwarding:
1264
2.11k
    return true;
1265
0
  }
1266
0
  llvm_unreachable("unexpected DepType!");
1267
0
}
1268
1269
43
bool MemoryDepChecker::Dependence::isPossiblyBackward() const {
1270
43
  return isBackward() || Type == Unknown;
1271
43
}
1272
1273
0
bool MemoryDepChecker::Dependence::isForward() const {
1274
0
  switch (Type) {
1275
0
  case Forward:
1276
0
  case ForwardButPreventsForwarding:
1277
0
    return true;
1278
0
1279
0
  case NoDep:
1280
0
  case Unknown:
1281
0
  case BackwardVectorizable:
1282
0
  case Backward:
1283
0
  case BackwardVectorizableButPreventsForwarding:
1284
0
    return false;
1285
0
  }
1286
0
  llvm_unreachable("unexpected DepType!");
1287
0
}
1288
1289
bool MemoryDepChecker::couldPreventStoreLoadForward(uint64_t Distance,
1290
2.67k
                                                    uint64_t TypeByteSize) {
1291
2.67k
  // If loads occur at a distance that is not a multiple of a feasible vector
1292
2.67k
  // factor, store-load forwarding does not take place.
1293
2.67k
  // Positive dependences might cause trouble because vectorizing them might
1294
2.67k
  // prevent store-load forwarding, making vectorized code run a lot slower.
1295
2.67k
  //   a[i] = a[i-3] ^ a[i-8];
1296
2.67k
  //   The stores to a[i:i+1] don't align with the loads from a[i-3:i-2] and
1297
2.67k
  //   hence on your typical architecture store-load forwarding does not take
1298
2.67k
  //   place. Vectorizing in such cases does not make sense.
1299
2.67k
  // Store-load forwarding distance.
1300
2.67k
1301
2.67k
  // After this many iterations store-to-load forwarding conflicts should not
1302
2.67k
  // cause any slowdowns.
1303
2.67k
  const uint64_t NumItersForStoreLoadThroughMemory = 8 * TypeByteSize;
1304
2.67k
  // Maximum vector factor.
1305
2.67k
  uint64_t MaxVFWithoutSLForwardIssues = std::min(
1306
2.67k
      VectorizerParams::MaxVectorWidth * TypeByteSize, MaxSafeDepDistBytes);
1307
2.67k
1308
2.67k
  // Compute the smallest VF at which the store and load would be misaligned.
1309
13.0k
  for (uint64_t VF = 2 * TypeByteSize; VF <= MaxVFWithoutSLForwardIssues;
1310
10.7k
       VF *= 2) {
1311
10.7k
    // If the number of vector iterations between the store and the load is
1313
10.7k
    // small, we could incur conflicts.
1313
10.7k
    if (Distance % VF && Distance / VF < NumItersForStoreLoadThroughMemory) {
1314
338
      MaxVFWithoutSLForwardIssues = (VF >>= 1);
1315
338
      break;
1316
338
    }
1317
10.7k
  }
1318
2.67k
1319
2.67k
  if (MaxVFWithoutSLForwardIssues < 2 * TypeByteSize) {
1320
216
    LLVM_DEBUG(
1321
216
        dbgs() << "LAA: Distance " << Distance
1322
216
               << " that could cause a store-load forwarding conflict\n");
1323
216
    return true;
1324
216
  }
1325
2.45k
1326
2.45k
  if (MaxVFWithoutSLForwardIssues < MaxSafeDepDistBytes &&
1327
2.45k
      MaxVFWithoutSLForwardIssues !=
1328
400
          VectorizerParams::MaxVectorWidth * TypeByteSize)
1329
122
    MaxSafeDepDistBytes = MaxVFWithoutSLForwardIssues;
1330
2.45k
  return false;
1331
2.45k
}
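
A worked example of the scan above: with TypeByteSize = 4 and Distance = 12, the loop starts at VF = 8; 12 % 8 != 0 and 12 / 8 = 1 < 32 (= 8 * TypeByteSize), so MaxVFWithoutSLForwardIssues is halved to 4, which is below 2 * TypeByteSize, and a conflict is reported. A standalone sketch of the same scan; the width of 64 stands in for VectorizerParams::MaxVectorWidth and the MaxSafeDepDistBytes clamp is omitted, both assumptions for illustration:

#include <cstdint>

bool mayPreventForwarding(uint64_t Distance, uint64_t TypeByteSize) {
  const uint64_t NumIters = 8 * TypeByteSize; // forwarding horizon, as above
  uint64_t MaxVF = 64 * TypeByteSize;         // assumed MaxVectorWidth = 64
  for (uint64_t VF = 2 * TypeByteSize; VF <= MaxVF; VF *= 2)
    if (Distance % VF && Distance / VF < NumIters) {
      MaxVF = VF / 2; // smallest misaligned VF found; halve and stop
      break;
    }
  return MaxVF < 2 * TypeByteSize;
}
// mayPreventForwarding(12, 4) -> true: at VF = 8 the distance is misaligned
// and only one vector iteration separates the store from the load.
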
1332
1333
216k
void MemoryDepChecker::mergeInStatus(VectorizationSafetyStatus S) {
1334
216k
  if (Status < S)
1335
1.78k
    Status = S;
1336
216k
}
1337
1338
/// Given a non-constant (unknown) dependence-distance \p Dist between two
1339
/// memory accesses that have the same stride, whose absolute value is given
1340
/// in \p Stride, and that have the same type size \p TypeByteSize,
1341
/// in a loop whose backedge-taken count is \p BackedgeTakenCount, check if it is
1342
/// possible to prove statically that the dependence distance is larger
1343
/// than the range that the accesses will travel through the execution of
1344
/// the loop. If so, return true; false otherwise. This is useful for
1345
/// example in loops such as the following (PR31098):
1346
///     for (i = 0; i < D; ++i) {
1347
///                = out[i];
1348
///       out[i+D] =
1349
///     }
1350
static bool isSafeDependenceDistance(const DataLayout &DL, ScalarEvolution &SE,
1351
                                     const SCEV &BackedgeTakenCount,
1352
                                     const SCEV &Dist, uint64_t Stride,
1353
3.75k
                                     uint64_t TypeByteSize) {
1354
3.75k
1355
3.75k
  // If we can prove that
1356
3.75k
  //      (**) |Dist| > BackedgeTakenCount * Step
1357
3.75k
  // where Step is the absolute stride of the memory accesses in bytes,
1358
3.75k
  // then there is no dependence.
1359
3.75k
  //
1360
3.75k
  // Rationale:
1361
3.75k
  // We basically want to check if the absolute distance (|Dist/Step|)
1362
3.75k
  // is >= the loop iteration count (or > BackedgeTakenCount).
1363
3.75k
  // This is equivalent to the Strong SIV Test (Practical Dependence Testing,
1364
3.75k
  // Section 4.2.1); Note, that for vectorization it is sufficient to prove
1365
3.75k
  // that the dependence distance is >= VF; This is checked elsewhere.
1366
3.75k
  // But in some cases we can prune unknown dependence distances early, and
1367
3.75k
  // even before selecting the VF, and without a runtime test, by comparing
1368
3.75k
  // the distance against the loop iteration count. Since the vectorized code
1369
3.75k
  // will be executed only if LoopCount >= VF, proving distance >= LoopCount
1370
3.75k
  // also guarantees that distance >= VF.
1371
3.75k
  //
1372
3.75k
  const uint64_t ByteStride = Stride * TypeByteSize;
1373
3.75k
  const SCEV *Step = SE.getConstant(BackedgeTakenCount.getType(), ByteStride);
1374
3.75k
  const SCEV *Product = SE.getMulExpr(&BackedgeTakenCount, Step);
1375
3.75k
1376
3.75k
  const SCEV *CastedDist = &Dist;
1377
3.75k
  const SCEV *CastedProduct = Product;
1378
3.75k
  uint64_t DistTypeSize = DL.getTypeAllocSize(Dist.getType());
1379
3.75k
  uint64_t ProductTypeSize = DL.getTypeAllocSize(Product->getType());
1380
3.75k
1381
3.75k
  // The dependence distance can be positive/negative, so we sign extend Dist;
1382
3.75k
  // The multiplication of the absolute stride in bytes and the
1383
3.75k
  // backedgeTakenCount is non-negative, so we zero extend Product.
1384
3.75k
  if (DistTypeSize > ProductTypeSize)
1385
1.50k
    CastedProduct = SE.getZeroExtendExpr(Product, Dist.getType());
1386
2.25k
  else
1387
2.25k
    CastedDist = SE.getNoopOrSignExtend(&Dist, Product->getType());
1388
3.75k
1389
3.75k
  // Is  Dist - (BackedgeTakenCount * Step) > 0 ?
1390
3.75k
  // (If so, then we have proven (**) because |Dist| >= Dist)
1391
3.75k
  const SCEV *Minus = SE.getMinusSCEV(CastedDist, CastedProduct);
1392
3.75k
  if (SE.isKnownPositive(Minus))
1393
20
    return true;
1394
3.73k
1395
3.73k
  // Second try: Is  -Dist - (BackedgeTakenCount * Step) > 0 ?
1396
3.73k
  // (If so, then we have proven (**) because |Dist| >= -1*Dist)
1397
3.73k
  const SCEV *NegDist = SE.getNegativeSCEV(CastedDist);
1398
3.73k
  Minus = SE.getMinusSCEV(NegDist, CastedProduct);
1399
3.73k
  if (SE.isKnownPositive(Minus))
1400
16
    return true;
1401
3.72k
1402
3.72k
  return false;
1403
3.72k
}
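
Plugging the PR31098 loop above into the test: with i32 accesses (TypeByteSize = 4, Stride = 1), Dist = 4*D and BackedgeTakenCount = D - 1, so Dist - BTC * Step = 4D - 4(D - 1) = 4 > 0 and the first isKnownPositive check already proves independence, with no runtime test. A constant-folded sketch of the (**) inequality (illustration only; the real code works on SCEVs, so Dist need not be a compile-time constant):

#include <cstdint>
#include <cstdlib>

bool safeDistance(int64_t Dist, uint64_t BTC, uint64_t ByteStride) {
  // (**) |Dist| > BackedgeTakenCount * Step
  return static_cast<uint64_t>(std::llabs(Dist)) > BTC * ByteStride;
}
// For out[i+D] vs out[i] with D = 100 and i32 elements:
// safeDistance(400, 99, 4) -> 400 > 396 -> true.
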
1404
1405
/// Check the dependence for two accesses with the same stride \p Stride.
1406
/// \p Distance is the positive distance and \p TypeByteSize is type size in
1407
/// bytes.
1408
///
1409
/// \returns true if they are independent.
1410
static bool areStridedAccessesIndependent(uint64_t Distance, uint64_t Stride,
1411
140k
                                          uint64_t TypeByteSize) {
1412
140k
  assert(Stride > 1 && "The stride must be greater than 1");
1413
140k
  assert(TypeByteSize > 0 && "The type size in byte must be non-zero");
1414
140k
  assert(Distance > 0 && "The distance must be non-zero");
1415
140k
1416
140k
  // Skip if the distance is not a multiple of the type byte size.
1417
140k
  if (Distance % TypeByteSize)
1418
52
    return false;
1419
140k
1420
140k
  uint64_t ScaledDist = Distance / TypeByteSize;
1421
140k
1422
140k
  // No dependence if the scaled distance is not a multiple of the stride.
1423
140k
  // E.g.
1424
140k
  //      for (i = 0; i < 1024 ; i += 4)
1425
140k
  //        A[i+2] = A[i] + 1;
1426
140k
  //
1427
140k
  // Two accesses in memory (scaled distance is 2, stride is 4):
1428
140k
  //     | A[0] |      |      |      | A[4] |      |      |      |
1429
140k
  //     |      |      | A[2] |      |      |      | A[6] |      |
1430
140k
  //
1431
140k
  // E.g.
1432
140k
  //      for (i = 0; i < 1024 ; i += 3)
1433
140k
  //        A[i+4] = A[i] + 1;
1434
140k
  //
1435
140k
  // Two accesses in memory (scaled distance is 4, stride is 3):
1436
140k
  //     | A[0] |      |      | A[3] |      |      | A[6] |      |      |
1437
140k
  //     |      |      |      |      | A[4] |      |      | A[7] |      |
1438
140k
  return ScaledDist % Stride;
1439
140k
}
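
Restated on plain integers, the test above says: the byte distance must be a multiple of the element size, and the element-scaled distance must not be a multiple of the stride, i.e. the two access streams land in disjoint residue classes modulo the stride. A minimal sketch mirroring that logic:

#include <cstdint>

bool stridedIndependent(uint64_t Distance, uint64_t Stride,
                        uint64_t TypeByteSize) {
  if (Distance % TypeByteSize) // not even element-aligned: give up
    return false;
  // Disjoint residue classes modulo the stride -> never the same element.
  return (Distance / TypeByteSize) % Stride != 0;
}
// stridedIndependent(8, 4, 4)  -> true  (scaled distance 2, stride 4)
// stridedIndependent(16, 4, 4) -> false (scaled distance 4 is a multiple of 4)
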
1440
1441
MemoryDepChecker::Dependence::DepType
1442
MemoryDepChecker::isDependent(const MemAccessInfo &A, unsigned AIdx,
1443
                              const MemAccessInfo &B, unsigned BIdx,
1444
216k
                              const ValueToValueMap &Strides) {
1445
216k
  assert(AIdx < BIdx && "Must pass arguments in program order");
1446
216k
1447
216k
  Value *APtr = A.getPointer();
1448
216k
  Value *BPtr = B.getPointer();
1449
216k
  bool AIsWrite = A.getInt();
1450
216k
  bool BIsWrite = B.getInt();
1451
216k
1452
216k
  // Two reads are independent.
1453
216k
  if (!AIsWrite && !BIsWrite)
1454
43.0k
    return Dependence::NoDep;
1455
173k
1456
173k
  // We cannot check pointers in different address spaces.
1457
173k
  if (APtr->getType()->getPointerAddressSpace() !=
1458
173k
      BPtr->getType()->getPointerAddressSpace())
1459
0
    return Dependence::Unknown;
1460
173k
1461
173k
  int64_t StrideAPtr = getPtrStride(PSE, APtr, InnermostLoop, Strides, true);
1462
173k
  int64_t StrideBPtr = getPtrStride(PSE, BPtr, InnermostLoop, Strides, true);
1463
173k
1464
173k
  const SCEV *Src = PSE.getSCEV(APtr);
1465
173k
  const SCEV *Sink = PSE.getSCEV(BPtr);
1466
173k
1467
173k
  // If the induction step is negative we have to invert source and sink of the
1468
173k
  // dependence.
1469
173k
  if (StrideAPtr < 0) {
1470
7.50k
    std::swap(APtr, BPtr);
1471
7.50k
    std::swap(Src, Sink);
1472
7.50k
    std::swap(AIsWrite, BIsWrite);
1473
7.50k
    std::swap(AIdx, BIdx);
1474
7.50k
    std::swap(StrideAPtr, StrideBPtr);
1475
7.50k
  }
1476
173k
1477
173k
  const SCEV *Dist = PSE.getSE()->getMinusSCEV(Sink, Src);
1478
173k
1479
173k
  LLVM_DEBUG(dbgs() << "LAA: Src Scev: " << *Src << "Sink Scev: " << *Sink
1480
173k
                    << "(Induction step: " << StrideAPtr << ")\n");
1481
173k
  LLVM_DEBUG(dbgs() << "LAA: Distance for " << *InstMap[AIdx] << " to "
1482
173k
                    << *InstMap[BIdx] << ": " << *Dist << "\n");
1483
173k
1484
173k
  // Need accesses with constant stride. We don't want to vectorize
1485
173k
  // "A[B[i]] += ..." and similar code or pointer arithmetic that could wrap in
1486
173k
  // the address space.
1487
173k
  if (!StrideAPtr || !StrideBPtr || StrideAPtr != StrideBPtr) {
1488
14.5k
    LLVM_DEBUG(dbgs() << "Pointer access with non-constant stride\n");
1489
14.5k
    return Dependence::Unknown;
1490
14.5k
  }
1491
158k
1492
158k
  Type *ATy = APtr->getType()->getPointerElementType();
1493
158k
  Type *BTy = BPtr->getType()->getPointerElementType();
1494
158k
  auto &DL = InnermostLoop->getHeader()->getModule()->getDataLayout();
1495
158k
  uint64_t TypeByteSize = DL.getTypeAllocSize(ATy);
1496
158k
  uint64_t Stride = std::abs(StrideAPtr);
1497
158k
  const SCEVConstant *C = dyn_cast<SCEVConstant>(Dist);
1498
158k
  if (!C) {
1499
3.81k
    if (TypeByteSize == DL.getTypeAllocSize(BTy) &&
1500
3.81k
        isSafeDependenceDistance(DL, *(PSE.getSE()),
1501
3.75k
                                 *(PSE.getBackedgeTakenCount()), *Dist, Stride,
1502
3.75k
                                 TypeByteSize))
1503
36
      return Dependence::NoDep;
1504
3.77k
1505
3.77k
    LLVM_DEBUG(dbgs() << "LAA: Dependence because of non-constant distance\n");
1506
3.77k
    FoundNonConstantDistanceDependence = true;
1507
3.77k
    return Dependence::Unknown;
1508
3.77k
  }
1509
155k
1510
155k
  const APInt &Val = C->getAPInt();
1511
155k
  int64_t Distance = Val.getSExtValue();
1512
155k
1513
155k
  // Attempt to prove strided accesses independent.
1514
155k
  if (std::abs(Distance) > 0 && Stride > 1 && ATy == BTy &&
1515
155k
      areStridedAccessesIndependent(std::abs(Distance), Stride, TypeByteSize)) {
1516
139k
    LLVM_DEBUG(dbgs() << "LAA: Strided accesses are independent\n");
1517
139k
    return Dependence::NoDep;
1518
139k
  }
1519
15.7k
1520
15.7k
  // Negative distances are not plausible dependencies.
1521
15.7k
  if (Val.isNegative()) {
1522
4.31k
    bool IsTrueDataDependence = (AIsWrite && !BIsWrite);
1523
4.31k
    if (IsTrueDataDependence && EnableForwardingConflictDetection &&
1524
4.31k
        (couldPreventStoreLoadForward(Val.abs().getZExtValue(), TypeByteSize) ||
1525
583
         ATy != BTy)) {
1526
162
      LLVM_DEBUG(dbgs() << "LAA: Forward but may prevent st->ld forwarding\n");
1527
162
      return Dependence::ForwardButPreventsForwarding;
1528
162
    }
1529
4.15k
1530
4.15k
    LLVM_DEBUG(dbgs() << "LAA: Dependence is negative\n");
1531
4.15k
    return Dependence::Forward;
1532
4.15k
  }
1533
11.4k
1534
11.4k
  // Write to the same location with the same size.
1535
11.4k
  // Could be improved to assert type sizes are the same (i32 == float, etc).
1536
11.4k
  if (Val == 0) {
1537
4.01k
    if (ATy == BTy)
1538
3.86k
      return Dependence::Forward;
1539
146
    LLVM_DEBUG(
1540
146
        dbgs() << "LAA: Zero dependence difference but different types\n");
1541
146
    return Dependence::Unknown;
1542
146
  }
1543
7.40k
1544
7.40k
  assert(Val.isStrictlyPositive() && "Expect a positive value");
1545
7.40k
1546
7.40k
  if (ATy != BTy) {
1547
398
    LLVM_DEBUG(
1548
398
        dbgs()
1549
398
        << "LAA: ReadWrite-Write positive dependency with different types\n");
1550
398
    return Dependence::Unknown;
1551
398
  }
1552
7.00k
1553
7.00k
  // Bail out early if passed-in parameters make vectorization not feasible.
1554
7.00k
  unsigned ForcedFactor = (VectorizerParams::VectorizationFactor ?
1555
6.96k
                           VectorizerParams::VectorizationFactor : 1);
1556
7.00k
  unsigned ForcedUnroll = (VectorizerParams::VectorizationInterleave ?
1557
6.98k
                           VectorizerParams::VectorizationInterleave : 1);
1558
7.00k
  // The minimum number of iterations for a vectorized/unrolled version.
1559
7.00k
  unsigned MinNumIter = std::max(ForcedFactor * ForcedUnroll, 2U);
1560
7.00k
1561
7.00k
  // It's not vectorizable if the distance is smaller than the minimum distance
1562
7.00k
  // needed for a vectorized/unrolled version. Vectorizing one iteration in
1563
7.00k
  // front needs TypeByteSize * Stride. Vectorizing the last iteration needs
1564
7.00k
  // TypeByteSize (no need to add the last gap distance).
1565
7.00k
  //
1566
7.00k
  // E.g. Assume one char is 1 byte in memory and one int is 4 bytes.
1567
7.00k
  //      foo(int *A) {
1568
7.00k
  //        int *B = (int *)((char *)A + 14);
1569
7.00k
  //        for (i = 0 ; i < 1024 ; i += 2)
1570
7.00k
  //          B[i] = A[i] + 1;
1571
7.00k
  //      }
1572
7.00k
  //
1573
7.00k
  // Two accesses in memory (stride is 2):
1574
7.00k
  //     | A[0] |      | A[2] |      | A[4] |      | A[6] |      |
1575
7.00k
  //                              | B[0] |      | B[2] |      | B[4] |
1576
7.00k
  //
1577
7.00k
  // Distance needs for vectorizing iterations except the last iteration:
1578
7.00k
  // 4 * 2 * (MinNumIter - 1). Distance needs for the last iteration: 4.
1579
7.00k
  // So the minimum distance needed is: 4 * 2 * (MinNumIter - 1) + 4.
1580
7.00k
  //
1581
7.00k
  // If MinNumIter is 2, it is vectorizable as the minimum distance needed is
1582
7.00k
  // 12, which is less than distance.
1583
7.00k
  //
1584
7.00k
  // If MinNumIter is 4 (say, if a user forces the vectorization factor to be 4),
1585
7.00k
  // the minimum distance needed is 28, which is greater than distance. It is
1586
7.00k
  // not safe to do vectorization.
1587
7.00k
  uint64_t MinDistanceNeeded =
1588
7.00k
      TypeByteSize * Stride * (MinNumIter - 1) + TypeByteSize;
1589
7.00k
  if (MinDistanceNeeded > static_cast<uint64_t>(Distance)) {
1590
232
    LLVM_DEBUG(dbgs() << "LAA: Failure because of positive distance "
1591
232
                      << Distance << '\n');
1592
232
    return Dependence::Backward;
1593
232
  }
1594
6.77k
1595
6.77k
  // Unsafe if the minimum distance needed is greater than max safe distance.
1596
6.77k
  if (MinDistanceNeeded > MaxSafeDepDistBytes) {
1597
34
    LLVM_DEBUG(dbgs() << "LAA: Failure because it needs at least "
1598
34
                      << MinDistanceNeeded << " size in bytes");
1599
34
    return Dependence::Backward;
1600
34
  }
1601
6.73k
1602
6.73k
  // Positive distance bigger than max vectorization factor.
1603
6.73k
  // FIXME: Should use max factor instead of max distance in bytes, which
1604
6.73k
  // cannot handle different types.
1605
6.73k
  // E.g. Assume one char is 1 byte in memory and one int is 4 bytes.
1606
6.73k
  //      void foo (int *A, char *B) {
1607
6.73k
  //        for (unsigned i = 0; i < 1024; i++) {
1608
6.73k
  //          A[i+2] = A[i] + 1;
1609
6.73k
  //          B[i+2] = B[i] + 1;
1610
6.73k
  //        }
1611
6.73k
  //      }
1612
6.73k
  //
1613
6.73k
  // This case is currently unsafe according to the max safe distance. If we
1614
6.73k
  // analyze the two accesses on array B, the max safe dependence distance
1615
6.73k
  // is 2. Then we analyze the accesses on array A, the minimum distance needed
1616
6.73k
  // is 8, which is greater than 2, so vectorization is forbidden. But actually
1617
6.73k
  // both A and B could be vectorized with a factor of 2.
1618
6.73k
  MaxSafeDepDistBytes =
1619
6.73k
      std::min(static_cast<uint64_t>(Distance), MaxSafeDepDistBytes);
1620
6.73k
1621
6.73k
  bool IsTrueDataDependence = (!AIsWrite && BIsWrite);
1622
6.73k
  if (IsTrueDataDependence && EnableForwardingConflictDetection &&
1623
6.73k
      couldPreventStoreLoadForward(Distance, TypeByteSize))
1624
82
    return Dependence::BackwardVectorizableButPreventsForwarding;
1625
6.65k
1626
6.65k
  uint64_t MaxVF = MaxSafeDepDistBytes / (TypeByteSize * Stride);
1627
6.65k
  LLVM_DEBUG(dbgs() << "LAA: Positive distance " << Val.getSExtValue()
1628
6.65k
                    << " with max VF = " << MaxVF << '\n');
1629
6.65k
  uint64_t MaxVFInBits = MaxVF * TypeByteSize * 8;
1630
6.65k
  MaxSafeRegisterWidth = std::min(MaxSafeRegisterWidth, MaxVFInBits);
1631
6.65k
  return Dependence::BackwardVectorizable;
1632
6.65k
}
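
Tying the threshold above to the worked example in the comments: with TypeByteSize = 4, Stride = 2 and Distance = 14, MinNumIter = 2 gives MinDistanceNeeded = 4 * 2 * (2 - 1) + 4 = 12 <= 14, so the dependence is classified BackwardVectorizable; forcing MinNumIter = 4 gives 4 * 2 * 3 + 4 = 28 > 14 and it degrades to Backward. A one-function sketch of the formula:

#include <cstdint>

// Minimum positive dependence distance (bytes) that still lets MinNumIter
// iterations execute as one vector/unrolled body, per the derivation above.
uint64_t minDistanceNeeded(uint64_t TypeByteSize, uint64_t Stride,
                           unsigned MinNumIter) {
  return TypeByteSize * Stride * (MinNumIter - 1) + TypeByteSize;
}
// minDistanceNeeded(4, 2, 2) == 12 (<= 14: vectorizable)
// minDistanceNeeded(4, 2, 4) == 28 (>  14: Dependence::Backward)
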
1633
1634
bool MemoryDepChecker::areDepsSafe(DepCandidates &AccessSets,
1635
                                   MemAccessInfoList &CheckDeps,
1636
22.1k
                                   const ValueToValueMap &Strides) {
1637
22.1k
1638
22.1k
  MaxSafeDepDistBytes = -1;
1639
22.1k
  SmallPtrSet<MemAccessInfo, 8> Visited;
1640
42.9k
  for (MemAccessInfo CurAccess : CheckDeps) {
1641
42.9k
    if (Visited.count(CurAccess))
1642
15.8k
      continue;
1643
27.0k
1644
27.0k
    // Get the relevant memory access set.
1645
27.0k
    EquivalenceClasses<MemAccessInfo>::iterator I =
1646
27.0k
      AccessSets.findValue(AccessSets.getLeaderValue(CurAccess));
1647
27.0k
1648
27.0k
    // Check accesses within this set.
1649
27.0k
    EquivalenceClasses<MemAccessInfo>::member_iterator AI =
1650
27.0k
        AccessSets.member_begin(I);
1651
27.0k
    EquivalenceClasses<MemAccessInfo>::member_iterator AE =
1652
27.0k
        AccessSets.member_end();
1653
27.0k
1654
27.0k
    // Check every access pair.
1655
86.4k
    while (AI != AE) {
1656
59.4k
      Visited.insert(*AI);
1657
59.4k
      EquivalenceClasses<MemAccessInfo>::member_iterator OI = std::next(AI);
1658
264k
      while (OI != AE) {
1659
205k
        // Check every accessing instruction pair in program order.
1660
205k
        for (std::vector<unsigned>::iterator I1 = Accesses[*AI].begin(),
1661
414k
             I1E = Accesses[*AI].end(); I1 != I1E; ++I1)
1662
209k
          for (std::vector<unsigned>::iterator I2 = Accesses[*OI].begin(),
1663
425k
               I2E = Accesses[*OI].end(); I2 != I2E; ++I2) {
1664
216k
            auto A = std::make_pair(&*AI, *I1);
1665
216k
            auto B = std::make_pair(&*OI, *I2);
1666
216k
1667
216k
            assert(*I1 != *I2);
1668
216k
            if (*I1 > *I2)
1669
197k
              std::swap(A, B);
1670
216k
1671
216k
            Dependence::DepType Type =
1672
216k
                isDependent(*A.first, A.second, *B.first, B.second, Strides);
1673
216k
            mergeInStatus(Dependence::isSafeForVectorization(Type));
1674
216k
1675
216k
            // Gather dependences unless we accumulated MaxDependences
1676
216k
            // dependences.  In that case return as soon as we find the first
1677
216k
            // unsafe dependence.  This puts a limit on this quadratic
1678
216k
            // algorithm.
1679
216k
            if (RecordDependences) {
1680
211k
              if (Type != Dependence::NoDep)
1681
30.2k
                Dependences.push_back(Dependence(A.second, B.second, Type));
1682
211k
1683
211k
              if (Dependences.size() >= MaxDependences) {
1684
132
                RecordDependences = false;
1685
132
                Dependences.clear();
1686
132
                LLVM_DEBUG(dbgs()
1687
132
                           << "Too many dependences, stopped recording\n");
1688
132
              }
1689
211k
            }
1690
216k
            if (!RecordDependences && !isSafeForVectorization())
1691
122
              return false;
1692
216k
          }
1693
205k
        ++OI;
1694
205k
      }
1695
59.4k
      AI++;
1696
59.3k
    }
1697
27.0k
  }
1698
22.1k
1699
22.1k
  LLVM_DEBUG(dbgs() << "Total Dependences: " << Dependences.size() << "\n");
1700
22.0k
  return isSafeForVectorization();
1701
22.1k
}
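
The nest above enumerates every pair of accesses inside one alias set, which is quadratic; MaxDependences caps only the recording, after which the walk continues but bails out on the first unsafe result. A stripped-down sketch that loosely mirrors that capped pairwise pattern (hypothetical types; the real code also swaps pairs into program order and merges per-pair safety statuses):

#include <cstddef>
#include <vector>

template <typename T, typename CheckFn>
bool allPairsSafe(const std::vector<T> &Set, size_t Cap, CheckFn IsSafe) {
  size_t Checked = 0;
  bool AllSafe = true;
  for (size_t I = 0; I < Set.size(); ++I)
    for (size_t J = I + 1; J < Set.size(); ++J) {
      AllSafe &= IsSafe(Set[I], Set[J]);
      // Past the cap, details are no longer recorded, so the first unsafe
      // pair ends the walk early (mirroring RecordDependences above).
      if (++Checked >= Cap && !AllSafe)
        return false;
    }
  return AllSafe;
}
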
1702
1703
SmallVector<Instruction *, 4>
1704
220
MemoryDepChecker::getInstructionsForAccess(Value *Ptr, bool isWrite) const {
1705
220
  MemAccessInfo Access(Ptr, isWrite);
1706
220
  auto &IndexVector = Accesses.find(Access)->second;
1707
220
1708
220
  SmallVector<Instruction *, 4> Insts;
1709
220
  transform(IndexVector,
1710
220
                 std::back_inserter(Insts),
1711
220
                 [&](unsigned Idx) { return this->InstMap[Idx]; });
1712
220
  return Insts;
1713
220
}
1714
1715
const char *MemoryDepChecker::Dependence::DepName[] = {
1716
    "NoDep", "Unknown", "Forward", "ForwardButPreventsForwarding", "Backward",
1717
    "BackwardVectorizable", "BackwardVectorizableButPreventsForwarding"};
1718
1719
void MemoryDepChecker::Dependence::print(
1720
    raw_ostream &OS, unsigned Depth,
1721
85
    const SmallVectorImpl<Instruction *> &Instrs) const {
1722
85
  OS.indent(Depth) << DepName[Type] << ":\n";
1723
85
  OS.indent(Depth + 2) << *Instrs[Source] << " -> \n";
1724
85
  OS.indent(Depth + 2) << *Instrs[Destination] << "\n";
1725
85
}
1726
1727
191k
bool LoopAccessInfo::canAnalyzeLoop() {
1728
191k
  // We need to have a loop header.
1729
191k
  LLVM_DEBUG(dbgs() << "LAA: Found a loop in "
1730
191k
                    << TheLoop->getHeader()->getParent()->getName() << ": "
1731
191k
                    << TheLoop->getHeader()->getName() << '\n');
1732
191k
1733
191k
  // We can only analyze innermost loops.
1734
191k
  if (!TheLoop->empty()) {
1735
11
    LLVM_DEBUG(dbgs() << "LAA: loop is not the innermost loop\n");
1736
11
    recordAnalysis("NotInnerMostLoop") << "loop is not the innermost loop";
1737
11
    return false;
1738
11
  }
1739
191k
1740
191k
  // We must have a single backedge.
1741
191k
  if (TheLoop->getNumBackEdges() != 1) {
1742
0
    LLVM_DEBUG(
1743
0
        dbgs() << "LAA: loop control flow is not understood by analyzer\n");
1744
0
    recordAnalysis("CFGNotUnderstood")
1745
0
        << "loop control flow is not understood by analyzer";
1746
0
    return false;
1747
0
  }
1748
191k
1749
191k
  // We must have a single exiting block.
1750
191k
  if (!TheLoop->getExitingBlock()) {
1751
35.0k
    LLVM_DEBUG(
1752
35.0k
        dbgs() << "LAA: loop control flow is not understood by analyzer\n");
1753
35.0k
    recordAnalysis("CFGNotUnderstood")
1754
35.0k
        << "loop control flow is not understood by analyzer";
1755
35.0k
    return false;
1756
35.0k
  }
1757
156k
1758
156k
  // We only handle bottom-tested loops, i.e. loops in which the condition is
1759
156k
  // checked at the end of each iteration. With that we can assume that all
1760
156k
  // instructions in the loop are executed the same number of times.
1761
156k
  if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch()) {
1762
4.43k
    LLVM_DEBUG(
1763
4.43k
        dbgs() << "LAA: loop control flow is not understood by analyzer\n");
1764
4.43k
    recordAnalysis("CFGNotUnderstood")
1765
4.43k
        << "loop control flow is not understood by analyzer";
1766
4.43k
    return false;
1767
4.43k
  }
1768
151k
1769
151k
  // ScalarEvolution needs to be able to find the exit count.
1770
151k
  const SCEV *ExitCount = PSE->getBackedgeTakenCount();
1771
151k
  if (ExitCount == PSE->getSE()->getCouldNotCompute()) {
1772
70.7k
    recordAnalysis("CantComputeNumberOfIterations")
1773
70.7k
        << "could not determine number of loop iterations";
1774
70.7k
    LLVM_DEBUG(dbgs() << "LAA: SCEV could not compute the loop exit count.\n");
1775
70.7k
    return false;
1776
70.7k
  }
1777
81.1k
1778
81.1k
  return true;
1779
81.1k
}
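
In source terms, the shape checks above accept the canonical bottom-tested form and reject anything with extra exits; two illustrative C functions (not from the test suite):

// Analyzable: innermost, a single backedge, the single exiting block is the
// latch, and SCEV can compute the backedge-taken count.
void analyzable(int *a, int n) {
  for (int i = 0; i < n; ++i)
    a[i] = 0;
}

// Rejected: the break adds a second exiting block, so the exiting block is
// no longer unique and the exit count may be unknown to SCEV.
void rejected(int *a, int n) {
  for (int i = 0; i < n; ++i) {
    if (a[i] < 0)
      break;
    a[i] = 0;
  }
}
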
1780
1781
void LoopAccessInfo::analyzeLoop(AliasAnalysis *AA, LoopInfo *LI,
1782
                                 const TargetLibraryInfo *TLI,
1783
81.1k
                                 DominatorTree *DT) {
1784
81.1k
  typedef SmallPtrSet<Value*, 16> ValueSet;
1785
81.1k
1786
81.1k
  // Holds the Load and Store instructions.
1787
81.1k
  SmallVector<LoadInst *, 16> Loads;
1788
81.1k
  SmallVector<StoreInst *, 16> Stores;
1789
81.1k
1790
81.1k
  // Holds all the different accesses in the loop.
1791
81.1k
  unsigned NumReads = 0;
1792
81.1k
  unsigned NumReadWrites = 0;
1793
81.1k
1794
81.1k
  bool HasComplexMemInst = false;
1795
81.1k
1796
81.1k
  // A runtime check is only legal to insert if there are no convergent calls.
1797
81.1k
  HasConvergentOp = false;
1798
81.1k
1799
81.1k
  PtrRtChecking->Pointers.clear();
1800
81.1k
  PtrRtChecking->Need = false;
1801
81.1k
1802
81.1k
  const bool IsAnnotatedParallel = TheLoop->isAnnotatedParallel();
1803
81.1k
1804
81.1k
  // For each block.
1805
102k
  for (BasicBlock *BB : TheLoop->blocks()) {
1806
102k
    // Scan the BB and collect legal loads and stores. Also detect any
1807
102k
    // convergent instructions.
1808
1.42M
    for (Instruction &I : *BB) {
1809
1.42M
      if (auto *Call = dyn_cast<CallBase>(&I)) {
1810
27.1k
        if (Call->isConvergent())
1811
25
          HasConvergentOp = true;
1812
27.1k
      }
1813
1.42M
1814
1.42M
      // With both a non-vectorizable memory instruction and a convergent
1815
1.42M
      // operation found in this loop, there is no reason to continue the search.
1816
1.42M
      if (HasComplexMemInst && HasConvergentOp) {
1817
2
        CanVecMem = false;
1818
2
        return;
1819
2
      }
1820
1.42M
1821
1.42M
      // Avoid hitting recordAnalysis multiple times.
1822
1.42M
      if (HasComplexMemInst)
1823
119k
        continue;
1824
1.30M
1825
1.30M
      // If this is a load, save it. If this instruction can read from memory
1826
1.30M
      // but is not a load, then we quit. Notice that we don't handle function
1827
1.30M
      // calls that read or write.
1828
1.30M
      if (I.mayReadFromMemory()) {
1829
132k
        // Many math library functions read the rounding mode. We will only
1830
132k
        // vectorize a loop if it contains known function calls that don't set
1831
132k
        // the flag. Therefore, it is safe to ignore this read from memory.
1832
132k
        auto *Call = dyn_cast<CallInst>(&I);
1833
132k
        if (Call && getVectorIntrinsicIDForCall(Call, TLI))
1834
199
          continue;
1835
132k
1836
132k
        // If the function has an explicit vectorized counterpart, we can safely
1837
132k
        // assume that it can be vectorized.
1838
132k
        if (Call && !Call->isNoBuiltin() && Call->getCalledFunction() &&
1839
132k
            TLI->isFunctionVectorizable(Call->getCalledFunction()->getName()))
1840
56
          continue;
1841
132k
1842
132k
        auto *Ld = dyn_cast<LoadInst>(&I);
1843
132k
        if (!Ld) {
1844
15.7k
          recordAnalysis("CantVectorizeInstruction", Ld)
1845
15.7k
            << "instruction cannot be vectorized";
1846
15.7k
          HasComplexMemInst = true;
1847
15.7k
          continue;
1848
15.7k
        }
1849
116k
        if (!Ld->isSimple() && !IsAnnotatedParallel) {
1850
48
          recordAnalysis("NonSimpleLoad", Ld)
1851
48
              << "read with atomic ordering or volatile read";
1852
48
          LLVM_DEBUG(dbgs() << "LAA: Found a non-simple load.\n");
1853
48
          HasComplexMemInst = true;
1854
48
          continue;
1855
48
        }
1856
116k
        NumLoads++;
1857
116k
        Loads.push_back(Ld);
1858
116k
        DepChecker->addAccess(Ld);
1859
116k
        if (EnableMemAccessVersioning)
1860
116k
          collectStridedAccess(Ld);
1861
116k
        continue;
1862
116k
      }
1863
1.17M
1864
1.17M
      // Save 'store' instructions. Abort if other instructions write to memory.
1865
1.17M
      if (I.mayWriteToMemory()) {
1866
109k
        auto *St = dyn_cast<StoreInst>(&I);
1867
109k
        if (!St) {
1868
0
          recordAnalysis("CantVectorizeInstruction", St)
1869
0
              << "instruction cannot be vectorized";
1870
0
          HasComplexMemInst = true;
1871
0
          continue;
1872
0
        }
1873
109k
        if (!St->isSimple() && !IsAnnotatedParallel) {
1874
0
          recordAnalysis("NonSimpleStore", St)
1875
0
              << "write with atomic ordering or volatile write";
1876
0
          LLVM_DEBUG(dbgs() << "LAA: Found a non-simple store.\n");
1877
0
          HasComplexMemInst = true;
1878
0
          continue;
1879
0
        }
1880
109k
        NumStores++;
1881
109k
        Stores.push_back(St);
1882
109k
        DepChecker->addAccess(St);
1883
109k
        if (EnableMemAccessVersioning)
1884
109k
          collectStridedAccess(St);
1885
109k
      }
1886
1.17M
    } // Next instr.
1887
102k
  } // Next block.
1888
81.1k
1889
81.1k
  if (HasComplexMemInst) {
1890
15.7k
    CanVecMem = false;
1891
15.7k
    return;
1892
15.7k
  }
1893
65.3k
1894
65.3k
  // Now we have two lists that hold the loads and the stores.
1895
65.3k
  // Next, we find the pointers that they use.
1896
65.3k
1897
65.3k
  // Check if we see any stores. If there are no stores, then we don't
1898
65.3k
  // care if the pointers are *restrict*.
1899
65.3k
  if (!Stores.size()) {
1900
9.13k
    LLVM_DEBUG(dbgs() << "LAA: Found a read-only loop!\n");
1901
9.13k
    CanVecMem = true;
1902
9.13k
    return;
1903
9.13k
  }
1904
56.2k
1905
56.2k
  MemoryDepChecker::DepCandidates DependentAccesses;
1906
56.2k
  AccessAnalysis Accesses(TheLoop->getHeader()->getModule()->getDataLayout(),
1907
56.2k
                          TheLoop, AA, LI, DependentAccesses, *PSE);
1908
56.2k
1909
56.2k
  // Holds the analyzed pointers. We don't want to call GetUnderlyingObjects
1910
56.2k
  // multiple times on the same object. If the ptr is accessed twice, once
1911
56.2k
  // for read and once for write, it will only appear once (on the write
1912
56.2k
  // list). This is okay, since we are going to check for conflicts between
1913
56.2k
  // writes and between reads and writes, but not between reads and reads.
1914
56.2k
  ValueSet Seen;
1915
56.2k
1916
56.2k
  // Record uniform store addresses to identify if we have multiple stores
1917
56.2k
  // to the same address.
1918
56.2k
  ValueSet UniformStores;
1919
56.2k
1920
103k
  for (StoreInst *ST : Stores) {
1921
103k
    Value *Ptr = ST->getPointerOperand();
1922
103k
1923
103k
    if (isUniform(Ptr))
1924
1.81k
      HasDependenceInvolvingLoopInvariantAddress |=
1925
1.81k
          !UniformStores.insert(Ptr).second;
1926
103k
1927
103k
    // If we did *not* see this pointer before, insert it into the read-write
1928
103k
    // list. At this phase it is only a 'write' list.
1929
103k
    if (Seen.insert(Ptr).second) {
1930
103k
      ++NumReadWrites;
1931
103k
1932
103k
      MemoryLocation Loc = MemoryLocation::get(ST);
1933
103k
      // The TBAA metadata could have a control dependency on the predication
1934
103k
      // condition, so we cannot rely on it when determining whether or not we
1935
103k
      // need runtime pointer checks.
1936
103k
      if (blockNeedsPredication(ST->getParent(), TheLoop, DT))
1937
6.53k
        Loc.AATags.TBAA = nullptr;
1938
103k
1939
103k
      Accesses.addStore(Loc);
1940
103k
    }
1941
103k
  }
1942
56.2k
1943
56.2k
  if (IsAnnotatedParallel) {
1944
21
    LLVM_DEBUG(
1945
21
        dbgs() << "LAA: A loop annotated parallel, ignore memory dependency "
1946
21
               << "checks.\n");
1947
21
    CanVecMem = true;
1948
21
    return;
1949
21
  }
1950
56.1k
1951
74.8k
  for (LoadInst *LD : Loads) {
1952
74.8k
    Value *Ptr = LD->getPointerOperand();
1953
74.8k
    // If we did *not* see this pointer before, insert it into the
1954
74.8k
    // read list. If we *did* see it before, then it is already in
1955
74.8k
    // the read-write list. This allows us to vectorize expressions
1956
74.8k
    // such as A[i] += x;  Because the address of A[i] is a read-write
1957
74.8k
    // pointer. This only works if the index of A[i] is consecutive.
1958
74.8k
    // If the address of i is unknown (for example A[B[i]]) then we may
1959
74.8k
    // read a few words, modify, and write a few words, and some of the
1960
74.8k
    // words may be written to the same address.
1961
74.8k
    bool IsReadOnlyPtr = false;
1962
74.8k
    if (Seen.insert(Ptr).second ||
1963
74.8k
        !getPtrStride(*PSE, Ptr, TheLoop, SymbolicStrides)) {
1964
66.7k
      ++NumReads;
1965
66.7k
      IsReadOnlyPtr = true;
1966
66.7k
    }
1967
74.8k
1968
74.8k
    // See if there is an unsafe dependency between a load from a uniform
1970
74.8k
    // address and a store to the same uniform address.
1970
74.8k
    if (UniformStores.count(Ptr)) {
1971
983
      LLVM_DEBUG(dbgs() << "LAA: Found an unsafe dependency between a uniform "
1972
983
                           "load and uniform store to the same address!\n");
1973
983
      HasDependenceInvolvingLoopInvariantAddress = true;
1974
983
    }
1975
74.8k
1976
74.8k
    MemoryLocation Loc = MemoryLocation::get(LD);
1977
74.8k
    // The TBAA metadata could have a control dependency on the predication
1978
74.8k
    // condition, so we cannot rely on it when determining whether or not we
1979
74.8k
    // need runtime pointer checks.
1980
74.8k
    if (blockNeedsPredication(LD->getParent(), TheLoop, DT))
1981
11.5k
      Loc.AATags.TBAA = nullptr;
1982
74.8k
1983
74.8k
    Accesses.addLoad(Loc, IsReadOnlyPtr);
1984
74.8k
  }
1985
56.1k
1986
56.1k
  // If we write (or read-write) to a single destination and there are no
1987
56.1k
  // other reads in this loop, then it is safe to vectorize.
1988
56.1k
  if (NumReadWrites == 1 && NumReads == 0) {
1989
22.2k
    LLVM_DEBUG(dbgs() << "LAA: Found a write-only loop!\n");
1990
22.2k
    CanVecMem = true;
1991
22.2k
    return;
1992
22.2k
  }
1993
33.9k
1994
33.9k
  // Build dependence sets and check whether we need a runtime pointer bounds
1995
33.9k
  // check.
1996
33.9k
  Accesses.buildDependenceSets();
1997
33.9k
1998
33.9k
  // Find pointers with computable bounds. We are going to use this information
1999
33.9k
  // to place a runtime bound check.
2000
33.9k
  bool CanDoRTIfNeeded = Accesses.canCheckPtrAtRT(*PtrRtChecking, PSE->getSE(),
2001
33.9k
                                                  TheLoop, SymbolicStrides);
2002
33.9k
  if (!CanDoRTIfNeeded) {
2003
4.08k
    recordAnalysis("CantIdentifyArrayBounds") << "cannot identify array bounds";
2004
4.08k
    LLVM_DEBUG(dbgs() << "LAA: We can't vectorize because we can't find "
2005
4.08k
                      << "the array bounds.\n");
2006
4.08k
    CanVecMem = false;
2007
4.08k
    return;
2008
4.08k
  }
2009
29.8k
2010
29.8k
  LLVM_DEBUG(
2011
29.8k
    dbgs() << "LAA: May be able to perform a memory runtime check if needed.\n");
2012
29.8k
2013
29.8k
  CanVecMem = true;
2014
29.8k
  if (Accesses.isDependencyCheckNeeded()) {
2015
22.1k
    LLVM_DEBUG(dbgs() << "LAA: Checking memory dependencies\n");
2016
22.1k
    CanVecMem = DepChecker->areDepsSafe(
2017
22.1k
        DependentAccesses, Accesses.getDependenciesToCheck(), SymbolicStrides);
2018
22.1k
    MaxSafeDepDistBytes = DepChecker->getMaxSafeDepDistBytes();
2019
22.1k
2020
22.1k
    if (!CanVecMem && DepChecker->shouldRetryWithRuntimeCheck()) {
2021
585
      LLVM_DEBUG(dbgs() << "LAA: Retrying with memory checks\n");
2022
585
2023
585
      // Clear the dependency checks. We assume they are not needed.
2024
585
      Accesses.resetDepChecks(*DepChecker);
2025
585
2026
585
      PtrRtChecking->reset();
2027
585
      PtrRtChecking->Need = true;
2028
585
2029
585
      auto *SE = PSE->getSE();
2030
585
      CanDoRTIfNeeded = Accesses.canCheckPtrAtRT(*PtrRtChecking, SE, TheLoop,
2031
585
                                                 SymbolicStrides, true);
2032
585
2033
585
      // Check that we found the bounds for the pointer.
2034
585
      if (!CanDoRTIfNeeded) {
2035
2
        recordAnalysis("CantCheckMemDepsAtRunTime")
2036
2
            << "cannot check memory dependencies at runtime";
2037
2
        LLVM_DEBUG(dbgs() << "LAA: Can't vectorize with memory checks\n");
2038
2
        CanVecMem = false;
2039
2
        return;
2040
2
      }
2041
583
2042
583
      CanVecMem = true;
2043
583
    }
2044
22.1k
  }
2045
29.8k
2046
29.8k
  if (HasConvergentOp) {
2047
23
    recordAnalysis("CantInsertRuntimeCheckWithConvergent")
2048
23
      << "cannot add control dependency to convergent operation";
2049
23
    LLVM_DEBUG(dbgs() << "LAA: We can't vectorize because a runtime check "
2050
23
                         "would be needed with a convergent operation\n");
2051
23
    CanVecMem = false;
2052
23
    return;
2053
23
  }
2054
29.8k
2055
29.8k
  if (CanVecMem)
2056
29.8k
    LLVM_DEBUG(
2057
29.8k
        dbgs() << "LAA: No unsafe dependent memory operations in loop.  We"
2058
29.8k
               << (PtrRtChecking->Need ? "" : " don't")
2059
29.8k
               << " need runtime memory checks.\n");
2060
29.8k
  else {
2061
1.15k
    recordAnalysis("UnsafeMemDep")
2062
1.15k
        << "unsafe dependent memory operations in loop. Use "
2063
1.15k
           "#pragma loop distribute(enable) to allow loop distribution "
2064
1.15k
           "to attempt to isolate the offending operations into a separate "
2065
1.15k
           "loop";
2066
1.15k
    LLVM_DEBUG(dbgs() << "LAA: unsafe dependent memory operations in loop\n");
2067
1.15k
  }
2068
29.8k
}
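
For consumers, the outcome of analyzeLoop surfaces through the query interface; a hedged usage sketch (assuming the canVectorizeMemory() and getRuntimePointerChecking() accessors declared in LoopAccessAnalysis.h):

#include "llvm/Analysis/LoopAccessAnalysis.h"
using namespace llvm;

void consume(const LoopAccessInfo &LAI) {
  if (!LAI.canVectorizeMemory())
    return; // unsafe dependence, complex access, or unanalyzable loop shape
  if (LAI.getRuntimePointerChecking()->Need) {
    // Safety is conditional: the checks must be materialized (see
    // addRuntimeChecks below) and must guard entry to the vectorized loop.
  }
}
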
2069
2070
bool LoopAccessInfo::blockNeedsPredication(BasicBlock *BB, Loop *TheLoop,
2071
1.17M
                                           DominatorTree *DT)  {
2072
1.17M
  assert(TheLoop->contains(BB) && "Unknown block used");
2073
1.17M
2074
1.17M
  // Blocks that do not dominate the latch need predication.
2075
1.17M
  BasicBlock* Latch = TheLoop->getLoopLatch();
2076
1.17M
  return !DT->dominates(BB, Latch);
2077
1.17M
}
2078
2079
OptimizationRemarkAnalysis &LoopAccessInfo::recordAnalysis(StringRef RemarkName,
2080
131k
                                                           Instruction *I) {
2081
131k
  assert(!Report && "Multiple reports generated");
2082
131k
2083
131k
  Value *CodeRegion = TheLoop->getHeader();
2084
131k
  DebugLoc DL = TheLoop->getStartLoc();
2085
131k
2086
131k
  if (I) {
2087
48
    CodeRegion = I->getParent();
2088
48
    // If there is no debug location attached to the instruction, fall back to
2089
48
    // using the loop's.
2090
48
    if (I->getDebugLoc())
2091
38
      DL = I->getDebugLoc();
2092
48
  }
2093
131k
2094
131k
  Report = make_unique<OptimizationRemarkAnalysis>(DEBUG_TYPE, RemarkName, DL,
2095
131k
                                                   CodeRegion);
2096
131k
  return *Report;
2097
131k
}
2098
2099
261k
bool LoopAccessInfo::isUniform(Value *V) const {
2100
261k
  auto *SE = PSE->getSE();
2101
261k
  // Since we rely on SCEV for uniformity, if the type is not SCEVable, it is
2102
261k
  // never considered uniform.
2103
261k
  // TODO: Is this really what we want? Even without FP SCEV, we may want some
2104
261k
  // trivially loop-invariant FP values to be considered uniform.
2105
261k
  if (!SE->isSCEVable(V->getType()))
2106
62.9k
    return false;
2107
198k
  return (SE->isLoopInvariant(SE->getSCEV(V), TheLoop));
2108
198k
}
2109
2110
// FIXME: this function is currently a duplicate of the one in
2111
// LoopVectorize.cpp.
2112
static Instruction *getFirstInst(Instruction *FirstInst, Value *V,
2113
13.6k
                                 Instruction *Loc) {
2114
13.6k
  if (FirstInst)
2115
11.1k
    return FirstInst;
2116
2.53k
  if (Instruction *I = dyn_cast<Instruction>(V))
2117
2.52k
    return I->getParent() == Loc->getParent() ? I : nullptr;
2118
12
  return nullptr;
2119
12
}
2120
2121
namespace {
2122
2123
/// IR Values for the lower and upper bounds of a pointer evolution.  We
2124
/// need to use value-handles because SCEV expansion can invalidate previously
2125
/// expanded values.  Thus expansion of a pointer can invalidate the bounds for
2126
/// a previous one.
2127
struct PointerBounds {
2128
  TrackingVH<Value> Start;
2129
  TrackingVH<Value> End;
2130
};
2131
2132
} // end anonymous namespace
2133
2134
/// Expand code for the lower and upper bound of the pointer group \p CG
2135
/// in \p TheLoop.  \return the values for the bounds.
2136
static PointerBounds
2137
expandBounds(const RuntimePointerChecking::CheckingPtrGroup *CG, Loop *TheLoop,
2138
             Instruction *Loc, SCEVExpander &Exp, ScalarEvolution *SE,
2139
6.82k
             const RuntimePointerChecking &PtrRtChecking) {
2140
6.82k
  Value *Ptr = PtrRtChecking.Pointers[CG->Members[0]].PointerValue;
2141
6.82k
  const SCEV *Sc = SE->getSCEV(Ptr);
2142
6.82k
2143
6.82k
  unsigned AS = Ptr->getType()->getPointerAddressSpace();
2144
6.82k
  LLVMContext &Ctx = Loc->getContext();
2145
6.82k
2146
6.82k
  // Use this type for pointer arithmetic.
2147
6.82k
  Type *PtrArithTy = Type::getInt8PtrTy(Ctx, AS);
2148
6.82k
2149
6.82k
  if (SE->isLoopInvariant(Sc, TheLoop)) {
2150
220
    LLVM_DEBUG(dbgs() << "LAA: Adding RT check for a loop invariant ptr:"
2151
220
                      << *Ptr << "\n");
2152
220
    // Ptr could be in the loop body. If so, expand a new one at the correct
2153
220
    // location.
2154
220
    Instruction *Inst = dyn_cast<Instruction>(Ptr);
2155
220
    Value *NewPtr = (Inst && TheLoop->contains(Inst))
2156
220
                        ? Exp.expandCodeFor(Sc, PtrArithTy, Loc)
2157
220
                        : Ptr;
2158
220
    // We must return a half-open range, which means incrementing Sc.
2159
220
    const SCEV *ScPlusOne = SE->getAddExpr(Sc, SE->getOne(PtrArithTy));
2160
220
    Value *NewPtrPlusOne = Exp.expandCodeFor(ScPlusOne, PtrArithTy, Loc);
2161
220
    return {NewPtr, NewPtrPlusOne};
2162
6.60k
  } else {
2163
6.60k
    Value *Start = nullptr, *End = nullptr;
2164
6.60k
    LLVM_DEBUG(dbgs() << "LAA: Adding RT check for range:\n");
2165
6.60k
    Start = Exp.expandCodeFor(CG->Low, PtrArithTy, Loc);
2166
6.60k
    End = Exp.expandCodeFor(CG->High, PtrArithTy, Loc);
2167
6.60k
    LLVM_DEBUG(dbgs() << "Start: " << *CG->Low << " End: " << *CG->High
2168
6.60k
                      << "\n");
2169
6.60k
    return {Start, End};
2170
6.60k
  }
2171
6.82k
}
2172
2173
/// Turns a collection of checks into a collection of expanded upper and
2174
/// lower bounds for both pointers in the check.
2175
static SmallVector<std::pair<PointerBounds, PointerBounds>, 4> expandBounds(
2176
    const SmallVectorImpl<RuntimePointerChecking::PointerCheck> &PointerChecks,
2177
    Loop *L, Instruction *Loc, ScalarEvolution *SE, SCEVExpander &Exp,
2178
2.53k
    const RuntimePointerChecking &PtrRtChecking) {
2179
2.53k
  SmallVector<std::pair<PointerBounds, PointerBounds>, 4> ChecksWithBounds;
2180
2.53k
2181
2.53k
  // Here we're relying on the SCEV Expander's cache to only emit code for the
2182
2.53k
  // same bounds once.
2183
2.53k
  transform(
2184
2.53k
      PointerChecks, std::back_inserter(ChecksWithBounds),
2185
3.41k
      [&](const RuntimePointerChecking::PointerCheck &Check) {
2186
3.41k
        PointerBounds
2187
3.41k
          First = expandBounds(Check.first, L, Loc, Exp, SE, PtrRtChecking),
2188
3.41k
          Second = expandBounds(Check.second, L, Loc, Exp, SE, PtrRtChecking);
2189
3.41k
        return std::make_pair(First, Second);
2190
3.41k
      });
2191
2.53k
2192
2.53k
  return ChecksWithBounds;
2193
2.53k
}
2194
2195
std::pair<Instruction *, Instruction *> LoopAccessInfo::addRuntimeChecks(
2196
    Instruction *Loc,
2197
    const SmallVectorImpl<RuntimePointerChecking::PointerCheck> &PointerChecks)
2198
2.53k
    const {
2199
2.53k
  const DataLayout &DL = TheLoop->getHeader()->getModule()->getDataLayout();
2200
2.53k
  auto *SE = PSE->getSE();
2201
2.53k
  SCEVExpander Exp(*SE, DL, "induction");
2202
2.53k
  auto ExpandedChecks =
2203
2.53k
      expandBounds(PointerChecks, TheLoop, Loc, SE, Exp, *PtrRtChecking);
2204
2.53k
2205
2.53k
  LLVMContext &Ctx = Loc->getContext();
2206
2.53k
  Instruction *FirstInst = nullptr;
2207
2.53k
  IRBuilder<> ChkBuilder(Loc);
2208
2.53k
  // Our instructions might fold to a constant.
2209
2.53k
  Value *MemoryRuntimeCheck = nullptr;
2210
2.53k
2211
3.41k
  for (const auto &Check : ExpandedChecks) {
2212
3.41k
    const PointerBounds &A = Check.first, &B = Check.second;
2213
3.41k
    // Check if two pointers (A and B) conflict where conflict is computed as:
2214
3.41k
    // start(A) <= end(B) && start(B) <= end(A)
2215
3.41k
    unsigned AS0 = A.Start->getType()->getPointerAddressSpace();
2216
3.41k
    unsigned AS1 = B.Start->getType()->getPointerAddressSpace();
2217
3.41k
2218
3.41k
    assert((AS0 == B.End->getType()->getPointerAddressSpace()) &&
2219
3.41k
           (AS1 == A.End->getType()->getPointerAddressSpace()) &&
2220
3.41k
           "Trying to bounds check pointers with different address spaces");
2221
3.41k
2222
3.41k
    Type *PtrArithTy0 = Type::getInt8PtrTy(Ctx, AS0);
2223
3.41k
    Type *PtrArithTy1 = Type::getInt8PtrTy(Ctx, AS1);
2224
3.41k
2225
3.41k
    Value *Start0 = ChkBuilder.CreateBitCast(A.Start, PtrArithTy0, "bc");
2226
3.41k
    Value *Start1 = ChkBuilder.CreateBitCast(B.Start, PtrArithTy1, "bc");
2227
3.41k
    Value *End0 =   ChkBuilder.CreateBitCast(A.End,   PtrArithTy1, "bc");
2228
3.41k
    Value *End1 =   ChkBuilder.CreateBitCast(B.End,   PtrArithTy0, "bc");
2229
3.41k
2230
3.41k
    // [A|B].Start points to the first accessed byte under base [A|B].
2231
3.41k
    // [A|B].End points to the last accessed byte, plus one.
2232
3.41k
    // There is no conflict when the intervals are disjoint:
2233
3.41k
    // NoConflict = (B.Start >= A.End) || (A.Start >= B.End)
2234
3.41k
    //
2235
3.41k
    // bound0 = (B.Start < A.End)
2236
3.41k
    // bound1 = (A.Start < B.End)
2237
3.41k
    //  IsConflict = bound0 & bound1
2238
3.41k
    Value *Cmp0 = ChkBuilder.CreateICmpULT(Start0, End1, "bound0");
2239
3.41k
    FirstInst = getFirstInst(FirstInst, Cmp0, Loc);
2240
3.41k
    Value *Cmp1 = ChkBuilder.CreateICmpULT(Start1, End0, "bound1");
2241
3.41k
    FirstInst = getFirstInst(FirstInst, Cmp1, Loc);
2242
3.41k
    Value *IsConflict = ChkBuilder.CreateAnd(Cmp0, Cmp1, "found.conflict");
2243
3.41k
    FirstInst = getFirstInst(FirstInst, IsConflict, Loc);
2244
3.41k
    if (MemoryRuntimeCheck) {
2245
892
      IsConflict =
2246
892
          ChkBuilder.CreateOr(MemoryRuntimeCheck, IsConflict, "conflict.rdx");
2247
892
      FirstInst = getFirstInst(FirstInst, IsConflict, Loc);
2248
892
    }
2249
3.41k
    MemoryRuntimeCheck = IsConflict;
2250
3.41k
  }
2251
2.53k
2252
2.53k
  if (!MemoryRuntimeCheck)
2253
16
    return std::make_pair(nullptr, nullptr);
2254
2.52k
2255
2.52k
  // We have to do this trickery because the IRBuilder might fold the check to a
2256
2.52k
  // constant expression, in which case there is no Instruction anchored in
2257
2.52k
  // the block.
2258
2.52k
  Instruction *Check = BinaryOperator::CreateAnd(MemoryRuntimeCheck,
2259
2.52k
                                                 ConstantInt::getTrue(Ctx));
2260
2.52k
  ChkBuilder.Insert(Check, "memcheck.conflict");
2261
2.52k
  FirstInst = getFirstInst(FirstInst, Check, Loc);
2262
2.52k
  return std::make_pair(FirstInst, Check);
2263
2.52k
}
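
At the C level, each emitted check above reduces to an overlap test on half-open byte ranges; a sketch of the predicate the bound0/bound1/found.conflict values compute:

#include <cstdint>

// [AStart, AEnd) and [BStart, BEnd) overlap iff both "bound" compares hold.
bool rangesConflict(uintptr_t AStart, uintptr_t AEnd,
                    uintptr_t BStart, uintptr_t BEnd) {
  bool Bound0 = BStart < AEnd; // bound0
  bool Bound1 = AStart < BEnd; // bound1
  return Bound0 && Bound1;     // found.conflict
}
// Per-pair results are OR-ed together (conflict.rdx), and the final AND with
// true only forces an Instruction anchor, as explained above.
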
2264
2265
std::pair<Instruction *, Instruction *>
2266
17.0k
LoopAccessInfo::addRuntimeChecks(Instruction *Loc) const {
2267
17.0k
  if (!PtrRtChecking->Need)
2268
14.5k
    return std::make_pair(nullptr, nullptr);
2269
2.48k
2270
2.48k
  return addRuntimeChecks(Loc, PtrRtChecking->getChecks());
2271
2.48k
}
2272
2273
225k
void LoopAccessInfo::collectStridedAccess(Value *MemAccess) {
2274
225k
  Value *Ptr = nullptr;
2275
225k
  if (LoadInst *LI = dyn_cast<LoadInst>(MemAccess))
2276
116k
    Ptr = LI->getPointerOperand();
2277
109k
  else if (StoreInst *SI = dyn_cast<StoreInst>(MemAccess))
2278
109k
    Ptr = SI->getPointerOperand();
2279
0
  else
2280
0
    return;
2281
225k
2282
225k
  Value *Stride = getStrideFromPointer(Ptr, PSE->getSE(), TheLoop);
2283
225k
  if (!Stride)
2284
224k
    return;
2285
1.25k
2286
1.25k
  LLVM_DEBUG(dbgs() << "LAA: Found a strided access that is a candidate for "
2287
1.25k
                       "versioning:");
2288
1.25k
  LLVM_DEBUG(dbgs() << "  Ptr: " << *Ptr << " Stride: " << *Stride << "\n");
2289
1.25k
2290
1.25k
  // Avoid adding the "Stride == 1" predicate when we know that
2291
1.25k
  // Stride >= Trip-Count. Such a predicate will effectively optimize a single
2292
1.25k
  // or zero iteration loop, as Trip-Count <= Stride == 1.
2293
1.25k
  //
2294
1.25k
  // TODO: We are currently not making a very informed decision on when it is
2295
1.25k
  // beneficial to apply stride versioning. It might make more sense that the
2296
1.25k
  // users of this analysis (such as the vectorizer) will trigger it, based on
2297
1.25k
  // their specific cost considerations; For example, in cases where stride
2298
1.25k
  // versioning does not help resolve memory accesses/dependences, the
2299
1.25k
  // vectorizer should evaluate the cost of the runtime test, and the benefit
2300
1.25k
  // of various possible stride specializations, considering the alternatives
2301
1.25k
  // of using gather/scatters (if available).
2302
1.25k
2303
1.25k
  const SCEV *StrideExpr = PSE->getSCEV(Stride);
2304
1.25k
  const SCEV *BETakenCount = PSE->getBackedgeTakenCount();
2305
1.25k
2306
1.25k
  // Match the types so we can compare the stride and the BETakenCount.
2307
1.25k
  // The Stride can be positive/negative, so we sign extend Stride;
2308
1.25k
  // The backedgeTakenCount is non-negative, so we zero extend BETakenCount.
2309
1.25k
  const DataLayout &DL = TheLoop->getHeader()->getModule()->getDataLayout();
2310
1.25k
  uint64_t StrideTypeSize = DL.getTypeAllocSize(StrideExpr->getType());
2311
1.25k
  uint64_t BETypeSize = DL.getTypeAllocSize(BETakenCount->getType());
2312
1.25k
  const SCEV *CastedStride = StrideExpr;
2313
1.25k
  const SCEV *CastedBECount = BETakenCount;
2314
1.25k
  ScalarEvolution *SE = PSE->getSE();
2315
1.25k
  if (BETypeSize >= StrideTypeSize)
2316
406
    CastedStride = SE->getNoopOrSignExtend(StrideExpr, BETakenCount->getType());
2317
853
  else
2318
853
    CastedBECount = SE->getZeroExtendExpr(BETakenCount, StrideExpr->getType());
2319
1.25k
  const SCEV *StrideMinusBETaken = SE->getMinusSCEV(CastedStride, CastedBECount);
2320
1.25k
  // Since TripCount == BackEdgeTakenCount + 1, checking:
2321
1.25k
  // "Stride >= TripCount" is equivalent to checking:
2322
1.25k
  // Stride - BETakenCount > 0
2323
1.25k
  if (SE->isKnownPositive(StrideMinusBETaken)) {
2324
4
    LLVM_DEBUG(
2325
4
        dbgs() << "LAA: Stride>=TripCount; No point in versioning as the "
2326
4
                  "Stride==1 predicate will imply that the loop executes "
2327
4
                  "at most once.\n");
2328
4
    return;
2329
4
  }
2330
1.25k
  LLVM_DEBUG(dbgs() << "LAA: Found a strided access that we can version.");
2331
1.25k
2332
1.25k
  SymbolicStrides[Ptr] = Stride;
2333
1.25k
  StrideSet.insert(Stride);
2334
1.25k
}
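
A typical candidate for this versioning is a loop whose stride is a run-time value; an illustrative source-level example:

// With a symbolic Stride the dependence distance is unknown, but under the
// predicate "Stride == 1" the access becomes A[i] and is trivially
// analyzable, so the loop can be versioned on that predicate (unless
// Stride >= TripCount, which the guard above rejects as pointless).
void scale(float *A, int N, int Stride) {
  for (int i = 0; i < N; ++i)
    A[(long)i * Stride] *= 2.0f;
}
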
2335
2336
LoopAccessInfo::LoopAccessInfo(Loop *L, ScalarEvolution *SE,
2337
                               const TargetLibraryInfo *TLI, AliasAnalysis *AA,
2338
                               DominatorTree *DT, LoopInfo *LI)
2339
    : PSE(llvm::make_unique<PredicatedScalarEvolution>(*SE, *L)),
2340
      PtrRtChecking(llvm::make_unique<RuntimePointerChecking>(SE)),
2341
      DepChecker(llvm::make_unique<MemoryDepChecker>(*PSE, L)), TheLoop(L),
2342
      NumLoads(0), NumStores(0), MaxSafeDepDistBytes(-1), CanVecMem(false),
2343
      HasConvergentOp(false),
2344
191k
      HasDependenceInvolvingLoopInvariantAddress(false) {
2345
191k
  if (canAnalyzeLoop())
2346
81.1k
    analyzeLoop(AA, LI, TLI, DT);
2347
191k
}
2348
2349
115
void LoopAccessInfo::print(raw_ostream &OS, unsigned Depth) const {
2350
115
  if (CanVecMem) {
2351
67
    OS.indent(Depth) << "Memory dependences are safe";
2352
67
    if (MaxSafeDepDistBytes != -1ULL)
2353
12
      OS << " with a maximum dependence distance of " << MaxSafeDepDistBytes
2354
12
         << " bytes";
2355
67
    if (PtrRtChecking->Need)
2356
27
      OS << " with run-time checks";
2357
67
    OS << "\n";
2358
67
  }
2359
115
2360
115
  if (HasConvergentOp)
2361
5
    OS.indent(Depth) << "Has convergent operation in loop\n";
2362
115
2363
115
  if (Report)
2364
48
    OS.indent(Depth) << "Report: " << Report->getMsg() << "\n";
2365
115
2366
115
  if (auto *Dependences = DepChecker->getDependences()) {
2367
115
    OS.indent(Depth) << "Dependences:\n";
2368
115
    for (auto &Dep : *Dependences) {
2369
85
      Dep.print(OS, Depth + 2, DepChecker->getMemoryInstructions());
2370
85
      OS << "\n";
2371
85
    }
2372
115
  } else
2373
0
    OS.indent(Depth) << "Too many dependences, not recorded\n";
2374
115
2375
115
  // List the pairs of accesses that need run-time checks to prove independence.
2376
115
  PtrRtChecking->print(OS, Depth);
2377
115
  OS << "\n";
2378
115
2379
115
  OS.indent(Depth) << "Non vectorizable stores to invariant address were "
2380
115
                   << (HasDependenceInvolvingLoopInvariantAddress ? "" : "not ")
2381
115
                   << "found in loop.\n";
2382
115
2383
115
  OS.indent(Depth) << "SCEV assumptions:\n";
2384
115
  PSE->getUnionPredicate().print(OS, Depth);
2385
115
2386
115
  OS << "\n";
2387
115
2388
115
  OS.indent(Depth) << "Expressions re-written:\n";
2389
115
  PSE->print(OS, Depth);
2390
115
}
2391
2392
189k
const LoopAccessInfo &LoopAccessLegacyAnalysis::getInfo(Loop *L) {
2393
189k
  auto &LAI = LoopAccessInfoMap[L];
2394
189k
2395
189k
  if (!LAI)
2396
189k
    LAI = llvm::make_unique<LoopAccessInfo>(L, SE, TLI, AA, DT, LI);
2397
189k
2398
189k
  return *LAI.get();
2399
189k
}
2400
2401
54
void LoopAccessLegacyAnalysis::print(raw_ostream &OS, const Module *M) const {
2402
54
  LoopAccessLegacyAnalysis &LAA = *const_cast<LoopAccessLegacyAnalysis *>(this);
2403
54
2404
54
  for (Loop *TopLevelLoop : *LI)
2405
63
    for (Loop *L : depth_first(TopLevelLoop)) {
2406
63
      OS.indent(2) << L->getHeader()->getName() << ":\n";
2407
63
      auto &LAI = LAA.getInfo(L);
2408
63
      LAI.print(OS, 4);
2409
63
    }
2410
54
}
2411
2412
836k
bool LoopAccessLegacyAnalysis::runOnFunction(Function &F) {
2413
836k
  SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
2414
836k
  auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>();
2415
836k
  TLI = TLIP ? &TLIP->getTLI() : nullptr;
2416
836k
  AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
2417
836k
  DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
2418
836k
  LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
2419
836k
2420
836k
  return false;
2421
836k
}
2422
2423
39.7k
void LoopAccessLegacyAnalysis::getAnalysisUsage(AnalysisUsage &AU) const {
2424
39.7k
    AU.addRequired<ScalarEvolutionWrapperPass>();
2425
39.7k
    AU.addRequired<AAResultsWrapperPass>();
2426
39.7k
    AU.addRequired<DominatorTreeWrapperPass>();
2427
39.7k
    AU.addRequired<LoopInfoWrapperPass>();
2428
39.7k
2429
39.7k
    AU.setPreservesAll();
2430
39.7k
}
2431
2432
char LoopAccessLegacyAnalysis::ID = 0;
2433
static const char laa_name[] = "Loop Access Analysis";
2434
#define LAA_NAME "loop-accesses"
2435
2436
48.9k
INITIALIZE_PASS_BEGIN(LoopAccessLegacyAnalysis, LAA_NAME, laa_name, false, true)
2437
48.9k
INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
2438
48.9k
INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
2439
48.9k
INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
2440
48.9k
INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
2441
48.9k
INITIALIZE_PASS_END(LoopAccessLegacyAnalysis, LAA_NAME, laa_name, false, true)
2442
2443
AnalysisKey LoopAccessAnalysis::Key;
2444
2445
LoopAccessInfo LoopAccessAnalysis::run(Loop &L, LoopAnalysisManager &AM,
2446
128
                                       LoopStandardAnalysisResults &AR) {
2447
128
  return LoopAccessInfo(&L, &AR.SE, &AR.TLI, &AR.AA, &AR.DT, &AR.LI);
2448
128
}
2449
2450
namespace llvm {
2451
2452
0
  Pass *createLAAPass() {
2453
0
    return new LoopAccessLegacyAnalysis();
2454
0
  }
2455
2456
} // end namespace llvm