Coverage Report

Created: 2019-07-24 05:18

/Users/buildslave/jenkins/workspace/clang-stage2-coverage-R/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp
Line | Count | Source
1
//===- LoadStoreVectorizer.cpp - GPU Load & Store Vectorizer --------------===//
2
//
3
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4
// See https://llvm.org/LICENSE.txt for license information.
5
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6
//
7
//===----------------------------------------------------------------------===//
8
//
9
// This pass merges loads/stores to/from sequential memory addresses into vector
10
// loads/stores.  Although there's nothing GPU-specific in here, this pass is
11
// motivated by the microarchitectural quirks of NVIDIA and AMD GPUs.
12
//
13
// (For simplicity below we talk about loads only, but everything also applies
14
// to stores.)
15
//
16
// This pass is intended to be run late in the pipeline, after other
17
// vectorization opportunities have been exploited.  So the assumption here is
18
// that immediately following our new vector load we'll need to extract out the
19
// individual elements of the load, so we can operate on them individually.
20
//
21
// On CPUs this transformation is usually not beneficial, because extracting the
22
// elements of a vector register is expensive on most architectures.  It's
23
// usually better just to load each element individually into its own scalar
24
// register.
25
//
26
// However, NVIDIA and AMD GPUs don't have proper vector registers.  Instead, a
27
// "vector load" loads directly into a series of scalar registers.  In effect,
28
// extracting the elements of the vector is free.  It's therefore always
29
// beneficial to vectorize a sequence of loads on these architectures.
30
//
31
// Vectorizing (perhaps a better name might be "coalescing") loads can have
32
// large performance impacts on GPU kernels, and opportunities for vectorizing
33
// are common in GPU code.  This pass tries very hard to find such
34
// opportunities; its runtime is quadratic in the number of loads in a BB.
35
//
36
// Some CPU architectures, such as ARM, have instructions that load into
37
// multiple scalar registers, similar to a GPU vectorized load.  In theory ARM
38
// could use this pass (with some modifications), but currently it implements
39
// its own pass to do something similar to what we do here.
40
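As a rough illustration of the transformation described above (a hypothetical source fragment, not code from the LLVM tree): the four adjacent scalar loads below typically lower to four loads from consecutive addresses in IR, which this pass would coalesce into a single <4 x float> load followed by cheap element extracts on a GPU target.

// Hypothetical kernel-style input: in[i] .. in[i+3] produce four consecutive
// scalar loads that the LoadStoreVectorizer can merge into one vector load.
void sum4(const float *in, float *out, int i) {
  float a = in[i + 0];
  float b = in[i + 1];
  float c = in[i + 2];
  float d = in[i + 3];
  out[i] = a + b + c + d;
}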
41
#include "llvm/ADT/APInt.h"
42
#include "llvm/ADT/ArrayRef.h"
43
#include "llvm/ADT/MapVector.h"
44
#include "llvm/ADT/PostOrderIterator.h"
45
#include "llvm/ADT/STLExtras.h"
46
#include "llvm/ADT/SmallPtrSet.h"
47
#include "llvm/ADT/SmallVector.h"
48
#include "llvm/ADT/Statistic.h"
49
#include "llvm/ADT/iterator_range.h"
50
#include "llvm/Analysis/AliasAnalysis.h"
51
#include "llvm/Analysis/MemoryLocation.h"
52
#include "llvm/Analysis/OrderedBasicBlock.h"
53
#include "llvm/Analysis/ScalarEvolution.h"
54
#include "llvm/Analysis/TargetTransformInfo.h"
55
#include "llvm/Transforms/Utils/Local.h"
56
#include "llvm/Analysis/ValueTracking.h"
57
#include "llvm/Analysis/VectorUtils.h"
58
#include "llvm/IR/Attributes.h"
59
#include "llvm/IR/BasicBlock.h"
60
#include "llvm/IR/Constants.h"
61
#include "llvm/IR/DataLayout.h"
62
#include "llvm/IR/DerivedTypes.h"
63
#include "llvm/IR/Dominators.h"
64
#include "llvm/IR/Function.h"
65
#include "llvm/IR/IRBuilder.h"
66
#include "llvm/IR/InstrTypes.h"
67
#include "llvm/IR/Instruction.h"
68
#include "llvm/IR/Instructions.h"
69
#include "llvm/IR/IntrinsicInst.h"
70
#include "llvm/IR/Module.h"
71
#include "llvm/IR/Type.h"
72
#include "llvm/IR/User.h"
73
#include "llvm/IR/Value.h"
74
#include "llvm/Pass.h"
75
#include "llvm/Support/Casting.h"
76
#include "llvm/Support/Debug.h"
77
#include "llvm/Support/KnownBits.h"
78
#include "llvm/Support/MathExtras.h"
79
#include "llvm/Support/raw_ostream.h"
80
#include "llvm/Transforms/Vectorize.h"
81
#include "llvm/Transforms/Vectorize/LoadStoreVectorizer.h"
82
#include <algorithm>
83
#include <cassert>
84
#include <cstdlib>
85
#include <tuple>
86
#include <utility>
87
88
using namespace llvm;
89
90
#define DEBUG_TYPE "load-store-vectorizer"
91
92
STATISTIC(NumVectorInstructions, "Number of vector accesses generated");
93
STATISTIC(NumScalarsVectorized, "Number of scalar accesses vectorized");
94
95
// FIXME: Assuming stack alignment of 4 is always good enough
96
static const unsigned StackAdjustedAlignment = 4;
97
98
namespace {
99
100
/// ChainID is an arbitrary token that is allowed to be different only for the
101
/// accesses that are guaranteed to be considered non-consecutive by
102
/// Vectorizer::isConsecutiveAccess. It's used for grouping instructions
103
/// together and reducing the number of instructions the main search operates on
104
/// at a time, i.e. this exists purely to reduce compile time, as the main
105
/// search has O(n^2) time complexity. The underlying type of ChainID should not
106
/// be relied upon.
107
using ChainID = const Value *;
108
using InstrList = SmallVector<Instruction *, 8>;
109
using InstrListMap = MapVector<ChainID, InstrList>;
110
111
class Vectorizer {
112
  Function &F;
113
  AliasAnalysis &AA;
114
  DominatorTree &DT;
115
  ScalarEvolution &SE;
116
  TargetTransformInfo &TTI;
117
  const DataLayout &DL;
118
  IRBuilder<> Builder;
119
120
public:
121
  Vectorizer(Function &F, AliasAnalysis &AA, DominatorTree &DT,
122
             ScalarEvolution &SE, TargetTransformInfo &TTI)
123
      : F(F), AA(AA), DT(DT), SE(SE), TTI(TTI),
124
28.6k
        DL(F.getParent()->getDataLayout()), Builder(SE.getContext()) {}
125
126
  bool run();
127
128
private:
129
  unsigned getPointerAddressSpace(Value *I);
130
131
10.7k
  unsigned getAlignment(LoadInst *LI) const {
132
10.7k
    unsigned Align = LI->getAlignment();
133
10.7k
    if (Align != 0)
134
10.4k
      return Align;
135
268
136
268
    return DL.getABITypeAlignment(LI->getType());
137
268
  }
138
139
738
  unsigned getAlignment(StoreInst *SI) const {
140
738
    unsigned Align = SI->getAlignment();
141
738
    if (Align != 0)
142
593
      return Align;
143
145
144
145
    return DL.getABITypeAlignment(SI->getValueOperand()->getType());
145
145
  }
146
147
  static const unsigned MaxDepth = 3;
148
149
  bool isConsecutiveAccess(Value *A, Value *B);
150
  bool areConsecutivePointers(Value *PtrA, Value *PtrB, const APInt &PtrDelta,
151
                              unsigned Depth = 0) const;
152
  bool lookThroughComplexAddresses(Value *PtrA, Value *PtrB, APInt PtrDelta,
153
                                   unsigned Depth) const;
154
  bool lookThroughSelects(Value *PtrA, Value *PtrB, const APInt &PtrDelta,
155
                          unsigned Depth) const;
156
157
  /// After vectorization, reorder the instructions that I depends on
158
  /// (the instructions defining its operands), to ensure they dominate I.
159
  void reorder(Instruction *I);
160
161
  /// Returns the first and the last instructions in Chain.
162
  std::pair<BasicBlock::iterator, BasicBlock::iterator>
163
  getBoundaryInstrs(ArrayRef<Instruction *> Chain);
164
165
  /// Erases the original instructions after vectorizing.
166
  void eraseInstructions(ArrayRef<Instruction *> Chain);
167
168
  /// "Legalize" the vector type that would be produced by combining \p
169
  /// ElementSizeBits elements in \p Chain. Break into two pieces such that the
170
  /// total size of each piece is 1, 2 or a multiple of 4 bytes. \p Chain is
171
  /// expected to have more than 4 elements.
172
  std::pair<ArrayRef<Instruction *>, ArrayRef<Instruction *>>
173
  splitOddVectorElts(ArrayRef<Instruction *> Chain, unsigned ElementSizeBits);
174
175
  /// Finds the largest prefix of Chain that's vectorizable, checking for
176
  /// intervening instructions which may affect the memory accessed by the
177
  /// instructions within Chain.
178
  ///
179
  /// The elements of \p Chain must be all loads or all stores and must be in
180
  /// address order.
181
  ArrayRef<Instruction *> getVectorizablePrefix(ArrayRef<Instruction *> Chain);
182
183
  /// Collects load and store instructions to vectorize.
184
  std::pair<InstrListMap, InstrListMap> collectInstructions(BasicBlock *BB);
185
186
  /// Processes the collected instructions in \p Map. The values of \p Map
187
  /// should be all loads or all stores.
188
  bool vectorizeChains(InstrListMap &Map);
189
190
  /// Finds loads/stores to consecutive memory addresses and vectorizes them.
191
  bool vectorizeInstructions(ArrayRef<Instruction *> Instrs);
192
193
  /// Vectorizes the load instructions in Chain.
194
  bool
195
  vectorizeLoadChain(ArrayRef<Instruction *> Chain,
196
                     SmallPtrSet<Instruction *, 16> *InstructionsProcessed);
197
198
  /// Vectorizes the store instructions in Chain.
199
  bool
200
  vectorizeStoreChain(ArrayRef<Instruction *> Chain,
201
                      SmallPtrSet<Instruction *, 16> *InstructionsProcessed);
202
203
  /// Check if this load/store access is misaligned.
204
  bool accessIsMisaligned(unsigned SzInBytes, unsigned AddressSpace,
205
                          unsigned Alignment);
206
};
207
208
class LoadStoreVectorizerLegacyPass : public FunctionPass {
209
public:
210
  static char ID;
211
212
2.99k
  LoadStoreVectorizerLegacyPass() : FunctionPass(ID) {
213
2.99k
    initializeLoadStoreVectorizerLegacyPassPass(*PassRegistry::getPassRegistry());
214
2.99k
  }
215
216
  bool runOnFunction(Function &F) override;
217
218
28.5k
  StringRef getPassName() const override {
219
28.5k
    return "GPU Load and Store Vectorizer";
220
28.5k
  }
221
222
2.96k
  void getAnalysisUsage(AnalysisUsage &AU) const override {
223
2.96k
    AU.addRequired<AAResultsWrapperPass>();
224
2.96k
    AU.addRequired<ScalarEvolutionWrapperPass>();
225
2.96k
    AU.addRequired<DominatorTreeWrapperPass>();
226
2.96k
    AU.addRequired<TargetTransformInfoWrapperPass>();
227
2.96k
    AU.setPreservesCFG();
228
2.96k
  }
229
};
230
231
} // end anonymous namespace
232
233
char LoadStoreVectorizerLegacyPass::ID = 0;
234
235
36.0k
INITIALIZE_PASS_BEGIN(LoadStoreVectorizerLegacyPass, DEBUG_TYPE,
236
36.0k
                      "Vectorize load and Store instructions", false, false)
237
36.0k
INITIALIZE_PASS_DEPENDENCY(SCEVAAWrapperPass)
238
36.0k
INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
239
36.0k
INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
240
36.0k
INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass)
241
36.0k
INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
242
36.0k
INITIALIZE_PASS_END(LoadStoreVectorizerLegacyPass, DEBUG_TYPE,
243
                    "Vectorize load and store instructions", false, false)
244
245
2.94k
Pass *llvm::createLoadStoreVectorizerPass() {
246
2.94k
  return new LoadStoreVectorizerLegacyPass();
247
2.94k
}
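For reference, a minimal sketch (the helper name is hypothetical) of how a target or frontend would schedule this pass late in a legacy-pass-manager pipeline:

#include "llvm/IR/LegacyPassManager.h"
#include "llvm/Transforms/Vectorize.h"

// Hypothetical helper: schedule the GPU load/store vectorizer after the
// other vectorizers have already run.
static void addLateVectorizationPasses(llvm::legacy::PassManagerBase &PM) {
  PM.add(llvm::createLoadStoreVectorizerPass());
}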
248
249
28.5k
bool LoadStoreVectorizerLegacyPass::runOnFunction(Function &F) {
250
28.5k
  // Don't vectorize when the attribute NoImplicitFloat is used.
251
28.5k
  if (skipFunction(F) || 
F.hasFnAttribute(Attribute::NoImplicitFloat)28.5k
)
252
15
    return false;
253
28.5k
254
28.5k
  AliasAnalysis &AA = getAnalysis<AAResultsWrapperPass>().getAAResults();
255
28.5k
  DominatorTree &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
256
28.5k
  ScalarEvolution &SE = getAnalysis<ScalarEvolutionWrapperPass>().getSE();
257
28.5k
  TargetTransformInfo &TTI =
258
28.5k
      getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
259
28.5k
260
28.5k
  Vectorizer V(F, AA, DT, SE, TTI);
261
28.5k
  return V.run();
262
28.5k
}
263
264
61
PreservedAnalyses LoadStoreVectorizerPass::run(Function &F, FunctionAnalysisManager &AM) {
265
61
  // Don't vectorize when the attribute NoImplicitFloat is used.
266
61
  if (F.hasFnAttribute(Attribute::NoImplicitFloat))
267
0
    return PreservedAnalyses::all();
268
61
269
61
  AliasAnalysis &AA = AM.getResult<AAManager>(F);
270
61
  DominatorTree &DT = AM.getResult<DominatorTreeAnalysis>(F);
271
61
  ScalarEvolution &SE = AM.getResult<ScalarEvolutionAnalysis>(F);
272
61
  TargetTransformInfo &TTI = AM.getResult<TargetIRAnalysis>(F);
273
61
274
61
  Vectorizer V(F, AA, DT, SE, TTI);
275
61
  bool Changed = V.run();
276
61
  PreservedAnalyses PA;
277
61
  PA.preserveSet<CFGAnalyses>();
278
61
  return Changed ? 
PA47
:
PreservedAnalyses::all()14
;
279
61
}
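The equivalent sketch for the new pass manager (again the surrounding helper is hypothetical; LoadStoreVectorizerPass is the pass type declared in LoadStoreVectorizer.h):

#include "llvm/IR/PassManager.h"
#include "llvm/Transforms/Vectorize/LoadStoreVectorizer.h"

// Hypothetical helper: register the pass with a new-PM FunctionPassManager.
static void addLoadStoreVectorizer(llvm::FunctionPassManager &FPM) {
  FPM.addPass(llvm::LoadStoreVectorizerPass());
}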
280
281
// The real propagateMetadata expects a SmallVector<Value*>, but we deal in
282
// vectors of Instructions.
283
10.7k
static void propagateMetadata(Instruction *I, ArrayRef<Instruction *> IL) {
284
10.7k
  SmallVector<Value *, 8> VL(IL.begin(), IL.end());
285
10.7k
  propagateMetadata(I, VL);
286
10.7k
}
287
288
// Vectorizer Implementation
289
28.6k
bool Vectorizer::run() {
290
28.6k
  bool Changed = false;
291
28.6k
292
28.6k
  // Scan the blocks in the function in post order.
293
31.5k
  for (BasicBlock *BB : post_order(&F)) {
294
31.5k
    InstrListMap LoadRefs, StoreRefs;
295
31.5k
    std::tie(LoadRefs, StoreRefs) = collectInstructions(BB);
296
31.5k
    Changed |= vectorizeChains(LoadRefs);
297
31.5k
    Changed |= vectorizeChains(StoreRefs);
298
31.5k
  }
299
28.6k
300
28.6k
  return Changed;
301
28.6k
}
302
303
389k
unsigned Vectorizer::getPointerAddressSpace(Value *I) {
304
389k
  if (LoadInst *L = dyn_cast<LoadInst>(I))
305
327k
    return L->getPointerAddressSpace();
306
61.7k
  if (StoreInst *S = dyn_cast<StoreInst>(I))
307
61.7k
    return S->getPointerAddressSpace();
308
0
  return -1;
309
0
}
310
311
// FIXME: Merge with llvm::isConsecutiveAccess
312
194k
bool Vectorizer::isConsecutiveAccess(Value *A, Value *B) {
313
194k
  Value *PtrA = getLoadStorePointerOperand(A);
314
194k
  Value *PtrB = getLoadStorePointerOperand(B);
315
194k
  unsigned ASA = getPointerAddressSpace(A);
316
194k
  unsigned ASB = getPointerAddressSpace(B);
317
194k
318
194k
  // Check that the address spaces match and that the pointers are valid.
319
194k
  if (!PtrA || !PtrB || (ASA != ASB))
320
0
    return false;
321
194k
322
194k
  // Make sure that A and B are different pointers of the same size type.
323
194k
  Type *PtrATy = PtrA->getType()->getPointerElementType();
324
194k
  Type *PtrBTy = PtrB->getType()->getPointerElementType();
325
194k
  if (PtrA == PtrB ||
326
194k
      
PtrATy->isVectorTy() != PtrBTy->isVectorTy()194k
||
327
194k
      
DL.getTypeStoreSize(PtrATy) != DL.getTypeStoreSize(PtrBTy)194k
||
328
194k
      DL.getTypeStoreSize(PtrATy->getScalarType()) !=
329
170k
          DL.getTypeStoreSize(PtrBTy->getScalarType()))
330
23.6k
    return false;
331
170k
332
170k
  unsigned PtrBitWidth = DL.getPointerSizeInBits(ASA);
333
170k
  APInt Size(PtrBitWidth, DL.getTypeStoreSize(PtrATy));
334
170k
335
170k
  return areConsecutivePointers(PtrA, PtrB, Size);
336
170k
}
337
338
bool Vectorizer::areConsecutivePointers(Value *PtrA, Value *PtrB,
339
                                        const APInt &PtrDelta,
340
170k
                                        unsigned Depth) const {
341
170k
  unsigned PtrBitWidth = DL.getPointerTypeSizeInBits(PtrA->getType());
342
170k
  APInt OffsetA(PtrBitWidth, 0);
343
170k
  APInt OffsetB(PtrBitWidth, 0);
344
170k
  PtrA = PtrA->stripAndAccumulateInBoundsConstantOffsets(DL, OffsetA);
345
170k
  PtrB = PtrB->stripAndAccumulateInBoundsConstantOffsets(DL, OffsetB);
346
170k
347
170k
  APInt OffsetDelta = OffsetB - OffsetA;
348
170k
349
170k
  // Check if they are based on the same pointer. That makes the offsets
350
170k
  // sufficient.
351
170k
  if (PtrA == PtrB)
352
167k
    return OffsetDelta == PtrDelta;
353
3.42k
354
3.42k
  // Compute the base pointer delta needed for the final delta to equal the
355
3.42k
  // requested pointer delta.
356
3.42k
  APInt BaseDelta = PtrDelta - OffsetDelta;
357
3.42k
358
3.42k
  // Compute the distance with SCEV between the base pointers.
359
3.42k
  const SCEV *PtrSCEVA = SE.getSCEV(PtrA);
360
3.42k
  const SCEV *PtrSCEVB = SE.getSCEV(PtrB);
361
3.42k
  const SCEV *C = SE.getConstant(BaseDelta);
362
3.42k
  const SCEV *X = SE.getAddExpr(PtrSCEVA, C);
363
3.42k
  if (X == PtrSCEVB)
364
896
    return true;
365
2.52k
366
2.52k
  // The above check will not catch the cases where one of the pointers is
367
2.52k
  // factorized but the other one is not, such as (C + (S * (A + B))) vs
368
2.52k
  // (AS + BS). Get the minus SCEV. That will allow re-combining the expressions
369
2.52k
  // and getting the simplified difference.
370
2.52k
  const SCEV *Dist = SE.getMinusSCEV(PtrSCEVB, PtrSCEVA);
371
2.52k
  if (C == Dist)
372
4
    return true;
373
2.52k
374
2.52k
  // Sometimes even this doesn't work, because SCEV can't always see through
375
2.52k
  // patterns that look like (gep (ext (add (shl X, C1), C2))). Try checking
376
2.52k
  // things the hard way.
377
2.52k
  return lookThroughComplexAddresses(PtrA, PtrB, BaseDelta, Depth);
378
2.52k
}
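A worked example of the fast path above (hypothetical IR values): if PtrA is &p[1] and PtrB is &p[2] for an i32* base p, stripAndAccumulateInBoundsConstantOffsets reduces both to the same base with OffsetA = 4 and OffsetB = 8, so OffsetDelta = 4. The requested PtrDelta is the store size of i32, also 4, so the accesses are consecutive without ever querying SCEV; the SCEV-based checks only run when the stripped bases differ.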
379
380
bool Vectorizer::lookThroughComplexAddresses(Value *PtrA, Value *PtrB,
381
                                             APInt PtrDelta,
382
2.52k
                                             unsigned Depth) const {
383
2.52k
  auto *GEPA = dyn_cast<GetElementPtrInst>(PtrA);
384
2.52k
  auto *GEPB = dyn_cast<GetElementPtrInst>(PtrB);
385
2.52k
  if (!GEPA || 
!GEPB2.09k
)
386
1.10k
    return lookThroughSelects(PtrA, PtrB, PtrDelta, Depth);
387
1.41k
388
1.41k
  // Look through GEPs after checking they're the same except for the last
389
1.41k
  // index.
390
1.41k
  if (GEPA->getNumOperands() != GEPB->getNumOperands() ||
391
1.41k
      
GEPA->getPointerOperand() != GEPB->getPointerOperand()1.27k
)
392
219
    return false;
393
1.19k
  gep_type_iterator GTIA = gep_type_begin(GEPA);
394
1.19k
  gep_type_iterator GTIB = gep_type_begin(GEPB);
395
1.34k
  for (unsigned I = 0, E = GEPA->getNumIndices() - 1; I < E; 
++I144
) {
396
176
    if (GTIA.getOperand() != GTIB.getOperand())
397
32
      return false;
398
144
    ++GTIA;
399
144
    ++GTIB;
400
144
  }
401
1.19k
402
1.19k
  Instruction *OpA = dyn_cast<Instruction>(GTIA.getOperand());
403
1.16k
  Instruction *OpB = dyn_cast<Instruction>(GTIB.getOperand());
404
1.16k
  if (!OpA || 
!OpB318
||
OpA->getOpcode() != OpB->getOpcode()306
||
405
1.16k
      
OpA->getType() != OpB->getType()237
)
406
928
    return false;
407
237
408
237
  if (PtrDelta.isNegative()) {
409
2
    if (PtrDelta.isMinSignedValue())
410
0
      return false;
411
2
    PtrDelta.negate();
412
2
    std::swap(OpA, OpB);
413
2
  }
414
237
  uint64_t Stride = DL.getTypeAllocSize(GTIA.getIndexedType());
415
237
  if (PtrDelta.urem(Stride) != 0)
416
0
    return false;
417
237
  unsigned IdxBitWidth = OpA->getType()->getScalarSizeInBits();
418
237
  APInt IdxDiff = PtrDelta.udiv(Stride).zextOrSelf(IdxBitWidth);
419
237
420
237
  // Only look through a ZExt/SExt.
421
237
  if (!isa<SExtInst>(OpA) && 
!isa<ZExtInst>(OpA)188
)
422
85
    return false;
423
152
424
152
  bool Signed = isa<SExtInst>(OpA);
425
152
426
152
  // At this point A could be a function parameter, i.e. not an instruction
427
152
  Value *ValA = OpA->getOperand(0);
428
152
  OpB = dyn_cast<Instruction>(OpB->getOperand(0));
429
152
  if (!OpB || 
ValA->getType() != OpB->getType()147
)
430
5
    return false;
431
147
432
147
  // Now we need to prove that adding IdxDiff to ValA won't overflow.
433
147
  bool Safe = false;
434
147
  // First attempt: if OpB is an add with NSW/NUW, and OpB is IdxDiff added to
435
147
  // ValA, we're okay.
436
147
  if (OpB->getOpcode() == Instruction::Add &&
437
147
      
isa<ConstantInt>(OpB->getOperand(1))95
&&
438
147
      
IdxDiff.sle(cast<ConstantInt>(OpB->getOperand(1))->getSExtValue())84
) {
439
46
    if (Signed)
440
4
      Safe = cast<BinaryOperator>(OpB)->hasNoSignedWrap();
441
42
    else
442
42
      Safe = cast<BinaryOperator>(OpB)->hasNoUnsignedWrap();
443
46
  }
444
147
445
147
  unsigned BitWidth = ValA->getType()->getScalarSizeInBits();
446
147
447
147
  // Second attempt:
448
147
  // If all set bits of IdxDiff or any higher order bit other than the sign bit
449
147
  // are known to be zero in ValA, we can add Diff to it while guaranteeing no
450
147
  // overflow of any sort.
451
147
  if (!Safe) {
452
103
    OpA = dyn_cast<Instruction>(ValA);
453
103
    if (!OpA)
454
4
      return false;
455
99
    KnownBits Known(BitWidth);
456
99
    computeKnownBits(OpA, Known, DL, 0, nullptr, OpA, &DT);
457
99
    APInt BitsAllowedToBeSet = Known.Zero.zext(IdxDiff.getBitWidth());
458
99
    if (Signed)
459
42
      BitsAllowedToBeSet.clearBit(BitWidth - 1);
460
99
    if (BitsAllowedToBeSet.ult(IdxDiff))
461
64
      return false;
462
79
  }
463
79
464
79
  const SCEV *OffsetSCEVA = SE.getSCEV(ValA);
465
79
  const SCEV *OffsetSCEVB = SE.getSCEV(OpB);
466
79
  const SCEV *C = SE.getConstant(IdxDiff.trunc(BitWidth));
467
79
  const SCEV *X = SE.getAddExpr(OffsetSCEVA, C);
468
79
  return X == OffsetSCEVB;
469
79
}
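A worked example of the overflow reasoning (hypothetical IR): suppose the two GEPs differ only in their last indices, sext(%i) and sext(%i + 2), into an array of i32. Stride is then 4, so a requested PtrDelta of 8 gives IdxDiff = 8 / 4 = 2. The first attempt succeeds if %i + 2 is an add with the appropriate nsw/nuw flag and a constant operand (2) that is at least IdxDiff; failing that, known-bits analysis on %i must show that adding 2 cannot wrap, and the final SCEV check must agree that SCEV(%i) + 2 equals SCEV(%i + 2).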
470
471
bool Vectorizer::lookThroughSelects(Value *PtrA, Value *PtrB,
472
                                    const APInt &PtrDelta,
473
1.10k
                                    unsigned Depth) const {
474
1.10k
  if (Depth++ == MaxDepth)
475
0
    return false;
476
1.10k
477
1.10k
  if (auto *SelectA = dyn_cast<SelectInst>(PtrA)) {
478
11
    if (auto *SelectB = dyn_cast<SelectInst>(PtrB)) {
479
11
      return SelectA->getCondition() == SelectB->getCondition() &&
480
11
             areConsecutivePointers(SelectA->getTrueValue(),
481
11
                                    SelectB->getTrueValue(), PtrDelta, Depth) &&
482
11
             areConsecutivePointers(SelectA->getFalseValue(),
483
5
                                    SelectB->getFalseValue(), PtrDelta, Depth);
484
11
    }
485
1.09k
  }
486
1.09k
  return false;
487
1.09k
}
488
489
10.3k
void Vectorizer::reorder(Instruction *I) {
490
10.3k
  OrderedBasicBlock OBB(I->getParent());
491
10.3k
  SmallPtrSet<Instruction *, 16> InstructionsToMove;
492
10.3k
  SmallVector<Instruction *, 16> Worklist;
493
10.3k
494
10.3k
  Worklist.push_back(I);
495
20.8k
  while (!Worklist.empty()) {
496
10.4k
    Instruction *IW = Worklist.pop_back_val();
497
10.4k
    int NumOperands = IW->getNumOperands();
498
20.8k
    for (int i = 0; i < NumOperands; 
i++10.4k
) {
499
10.4k
      Instruction *IM = dyn_cast<Instruction>(IW->getOperand(i));
500
10.4k
      if (!IM || 
IM->getOpcode() == Instruction::PHI10.3k
)
501
120
        continue;
502
10.3k
503
10.3k
      // If IM is in another BB, no need to move it, because this pass only
504
10.3k
      // vectorizes instructions within one BB.
505
10.3k
      if (IM->getParent() != I->getParent())
506
3
        continue;
507
10.2k
508
10.2k
      if (!OBB.dominates(IM, I)) {
509
9
        InstructionsToMove.insert(IM);
510
9
        Worklist.push_back(IM);
511
9
      }
512
10.2k
    }
513
10.4k
  }
514
10.3k
515
10.3k
  // All instructions to move should follow I. Start from I, not from begin().
516
348k
  for (auto BBI = I->getIterator(), E = I->getParent()->end(); BBI != E;
517
338k
       ++BBI) {
518
338k
    if (!InstructionsToMove.count(&*BBI))
519
338k
      continue;
520
9
    Instruction *IM = &*BBI;
521
9
    --BBI;
522
9
    IM->removeFromParent();
523
9
    IM->insertBefore(I);
524
9
  }
525
10.3k
}
526
527
std::pair<BasicBlock::iterator, BasicBlock::iterator>
528
21.8k
Vectorizer::getBoundaryInstrs(ArrayRef<Instruction *> Chain) {
529
21.8k
  Instruction *C0 = Chain[0];
530
21.8k
  BasicBlock::iterator FirstInstr = C0->getIterator();
531
21.8k
  BasicBlock::iterator LastInstr = C0->getIterator();
532
21.8k
533
21.8k
  BasicBlock *BB = C0->getParent();
534
21.8k
  unsigned NumFound = 0;
535
529k
  for (Instruction &I : *BB) {
536
529k
    if (!is_contained(Chain, &I))
537
473k
      continue;
538
55.8k
539
55.8k
    ++NumFound;
540
55.8k
    if (NumFound == 1) {
541
21.8k
      FirstInstr = I.getIterator();
542
21.8k
    }
543
55.8k
    if (NumFound == Chain.size()) {
544
21.8k
      LastInstr = I.getIterator();
545
21.8k
      break;
546
21.8k
    }
547
55.8k
  }
548
21.8k
549
21.8k
  // Range is [first, last).
550
21.8k
  return std::make_pair(FirstInstr, ++LastInstr);
551
21.8k
}
552
553
10.7k
void Vectorizer::eraseInstructions(ArrayRef<Instruction *> Chain) {
554
10.7k
  SmallVector<Instruction *, 16> Instrs;
555
25.9k
  for (Instruction *I : Chain) {
556
25.9k
    Value *PtrOperand = getLoadStorePointerOperand(I);
557
25.9k
    assert(PtrOperand && "Instruction must have a pointer operand.");
558
25.9k
    Instrs.push_back(I);
559
25.9k
    if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(PtrOperand))
560
2.68k
      Instrs.push_back(GEP);
561
25.9k
  }
562
10.7k
563
10.7k
  // Erase instructions.
564
10.7k
  for (Instruction *I : Instrs)
565
28.6k
    if (I->use_empty())
566
27.8k
      I->eraseFromParent();
567
10.7k
}
568
569
std::pair<ArrayRef<Instruction *>, ArrayRef<Instruction *>>
570
Vectorizer::splitOddVectorElts(ArrayRef<Instruction *> Chain,
571
167
                               unsigned ElementSizeBits) {
572
167
  unsigned ElementSizeBytes = ElementSizeBits / 8;
573
167
  unsigned SizeBytes = ElementSizeBytes * Chain.size();
574
167
  unsigned NumLeft = (SizeBytes - (SizeBytes % 4)) / ElementSizeBytes;
575
167
  if (NumLeft == Chain.size()) {
576
95
    if ((NumLeft & 1) == 0)
577
93
      NumLeft /= 2; // Split even in half
578
2
    else
579
2
      --NumLeft;    // Split off last element
580
95
  } else 
if (72
NumLeft == 072
)
581
71
    NumLeft = 1;
582
167
  return std::make_pair(Chain.slice(0, NumLeft), Chain.slice(NumLeft));
583
167
}
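Two worked examples of the split (assuming the sizes shown): a chain of three 32-bit elements has SizeBytes = 12, so NumLeft = (12 - 12 % 4) / 4 = 3 == Chain.size(); three is odd, so the last element is peeled off and the result is ({e0, e1}, {e2}). A chain of two 8-bit elements has SizeBytes = 2, giving NumLeft = 0, which is bumped to 1 so the caller can still make progress on a one-element prefix.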
584
585
ArrayRef<Instruction *>
586
11.1k
Vectorizer::getVectorizablePrefix(ArrayRef<Instruction *> Chain) {
587
11.1k
  // These are in BB order, unlike Chain, which is in address order.
588
11.1k
  SmallVector<Instruction *, 16> MemoryInstrs;
589
11.1k
  SmallVector<Instruction *, 16> ChainInstrs;
590
11.1k
591
11.1k
  bool IsLoadChain = isa<LoadInst>(Chain[0]);
592
11.1k
  LLVM_DEBUG({
593
11.1k
    for (Instruction *I : Chain) {
594
11.1k
      if (IsLoadChain)
595
11.1k
        assert(isa<LoadInst>(I) &&
596
11.1k
               "All elements of Chain must be loads, or all must be stores.");
597
11.1k
      else
598
11.1k
        assert(isa<StoreInst>(I) &&
599
11.1k
               "All elements of Chain must be loads, or all must be stores.");
600
11.1k
    }
601
11.1k
  });
602
11.1k
603
61.7k
  for (Instruction &I : make_range(getBoundaryInstrs(Chain))) {
604
61.7k
    if (isa<LoadInst>(I) || 
isa<StoreInst>(I)36.4k
) {
605
30.1k
      if (!is_contained(Chain, &I))
606
237
        MemoryInstrs.push_back(&I);
607
29.9k
      else
608
29.9k
        ChainInstrs.push_back(&I);
609
31.6k
    } else if (isa<IntrinsicInst>(&I) &&
610
31.6k
               cast<IntrinsicInst>(&I)->getIntrinsicID() ==
611
20
                   Intrinsic::sideeffect) {
612
4
      // Ignore llvm.sideeffect calls.
613
31.6k
    } else if (IsLoadChain && 
(27.7k
I.mayWriteToMemory()27.7k
||
I.mayThrow()27.7k
)) {
614
7
      LLVM_DEBUG(dbgs() << "LSV: Found may-write/throw operation: " << I
615
7
                        << '\n');
616
7
      break;
617
31.6k
    } else if (!IsLoadChain && 
(3.87k
I.mayReadOrWriteMemory()3.87k
||
I.mayThrow()3.85k
)) {
618
21
      LLVM_DEBUG(dbgs() << "LSV: Found may-read/write/throw operation: " << I
619
21
                        << '\n');
620
21
      break;
621
21
    }
622
61.7k
  }
623
11.1k
624
11.1k
  OrderedBasicBlock OBB(Chain[0]->getParent());
625
11.1k
626
11.1k
  // Loop until we find an instruction in ChainInstrs that we can't vectorize.
627
11.1k
  unsigned ChainInstrIdx = 0;
628
11.1k
  Instruction *BarrierMemoryInstr = nullptr;
629
11.1k
630
40.9k
  for (unsigned E = ChainInstrs.size(); ChainInstrIdx < E; 
++ChainInstrIdx29.8k
) {
631
29.8k
    Instruction *ChainInstr = ChainInstrs[ChainInstrIdx];
632
29.8k
633
29.8k
    // If a barrier memory instruction was found, chain instructions that follow
634
29.8k
    // will not be added to the valid prefix.
635
29.8k
    if (BarrierMemoryInstr && 
OBB.dominates(BarrierMemoryInstr, ChainInstr)20
)
636
17
      break;
637
29.8k
638
29.8k
    // Check (in BB order) if any instruction prevents ChainInstr from being
639
29.8k
    // vectorized. Find and store the first such "conflicting" instruction.
640
29.8k
    for (Instruction *MemInstr : MemoryInstrs) {
641
704
      // If a barrier memory instruction was found, do not check past it.
642
704
      if (BarrierMemoryInstr && 
OBB.dominates(BarrierMemoryInstr, MemInstr)3
)
643
0
        break;
644
704
645
704
      auto *MemLoad = dyn_cast<LoadInst>(MemInstr);
646
704
      auto *ChainLoad = dyn_cast<LoadInst>(ChainInstr);
647
704
      if (MemLoad && 
ChainLoad246
)
648
167
        continue;
649
537
650
537
      // We can ignore the alias if we have a load/store pair and the load
651
537
      // is known to be invariant. The load cannot be clobbered by the store.
652
537
      auto IsInvariantLoad = [](const LoadInst *LI) -> bool {
653
533
        return LI->getMetadata(LLVMContext::MD_invariant_load);
654
533
      };
655
537
656
537
      // We can ignore the alias as long as the load comes before the store,
657
537
      // because that means we won't be moving the load past the store to
658
537
      // vectorize it (the vectorized load is inserted at the location of the
659
537
      // first load in the chain).
660
537
      if (isa<StoreInst>(MemInstr) && 
ChainLoad458
&&
661
537
          
(454
IsInvariantLoad(ChainLoad)454
||
OBB.dominates(ChainLoad, MemInstr)452
))
662
237
        continue;
663
300
664
300
      // Same case, but in reverse.
665
300
      if (MemLoad && 
isa<StoreInst>(ChainInstr)79
&&
666
300
          
(79
IsInvariantLoad(MemLoad)79
||
OBB.dominates(MemLoad, ChainInstr)79
))
667
21
        continue;
668
279
669
279
      if (!AA.isNoAlias(MemoryLocation::get(MemInstr),
670
279
                        MemoryLocation::get(ChainInstr))) {
671
49
        LLVM_DEBUG({
672
49
          dbgs() << "LSV: Found alias:\n"
673
49
                    "  Aliasing instruction and pointer:\n"
674
49
                 << "  " << *MemInstr << '\n'
675
49
                 << "  " << *getLoadStorePointerOperand(MemInstr) << '\n'
676
49
                 << "  Aliased instruction and pointer:\n"
677
49
                 << "  " << *ChainInstr << '\n'
678
49
                 << "  " << *getLoadStorePointerOperand(ChainInstr) << '\n';
679
49
        });
680
49
        // Save this aliasing memory instruction as a barrier, but allow other
681
49
        // instructions that precede the barrier to be vectorized with this one.
682
49
        BarrierMemoryInstr = MemInstr;
683
49
        break;
684
49
      }
685
279
    }
686
29.8k
    // Continue the search only for store chains, since vectorizing stores that
687
29.8k
    // precede an aliasing load is valid. Conversely, vectorizing loads is valid
688
29.8k
    // up to an aliasing store, but should not pull loads from further down in
689
29.8k
    // the basic block.
690
29.8k
    if (IsLoadChain && 
BarrierMemoryInstr25.2k
) {
691
20
      // The BarrierMemoryInstr is a store that precedes ChainInstr.
692
20
      assert(OBB.dominates(BarrierMemoryInstr, ChainInstr));
693
20
      break;
694
20
    }
695
29.8k
  }
696
11.1k
697
11.1k
  // Find the largest prefix of Chain whose elements are all in
698
11.1k
  // ChainInstrs[0, ChainInstrIdx).  This is the largest vectorizable prefix of
699
11.1k
  // Chain.  (Recall that Chain is in address order, but ChainInstrs is in BB
700
11.1k
  // order.)
701
11.1k
  SmallPtrSet<Instruction *, 8> VectorizableChainInstrs(
702
11.1k
      ChainInstrs.begin(), ChainInstrs.begin() + ChainInstrIdx);
703
11.1k
  unsigned ChainIdx = 0;
704
40.9k
  for (unsigned ChainLen = Chain.size(); ChainIdx < ChainLen; 
++ChainIdx29.8k
) {
705
29.8k
    if (!VectorizableChainInstrs.count(Chain[ChainIdx]))
706
65
      break;
707
29.8k
  }
708
11.1k
  return Chain.slice(0, ChainIdx);
709
11.1k
}
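A worked example of the prefix computation (hypothetical): for a load chain {A, B, C} in address order where, in BB order, a store S that may alias C sits between B and C, the scan over C's candidate conflicts records S as BarrierMemoryInstr; because this is a load chain the outer loop stops there, VectorizableChainInstrs holds only the chain members seen before the barrier, and the returned slice of Chain is cut at the first element (here C) that is not in that set, leaving {A, B}.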
710
711
61.7k
static ChainID getChainID(const Value *Ptr, const DataLayout &DL) {
712
61.7k
  const Value *ObjPtr = GetUnderlyingObject(Ptr, DL);
713
61.7k
  if (const auto *Sel = dyn_cast<SelectInst>(ObjPtr)) {
714
7
    // The selects themselves are distinct instructions even if they share the
715
7
    // same condition and evaluate to consecutive pointers for true and false
716
7
    // values of the condition. Therefore using the selects themselves for
717
7
    // grouping instructions would put consecutive accesses into different lists
718
7
    // and they won't even be checked for being consecutive, and won't be
719
7
    // vectorized.
720
7
    return Sel->getCondition();
721
7
  }
722
61.7k
  return ObjPtr;
723
61.7k
}
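For example (hypothetical IR), two stores whose addresses are select i1 %c, float* %p, float* %q and select i1 %c, float* %p1, float* %q1 are based on two distinct select instructions, so keying chains on the underlying objects would place them in different lists and isConsecutiveAccess would never see the pair. Keying both on the shared condition %c keeps them in one list, giving lookThroughSelects a chance to prove that %p1/%q1 are one element past %p/%q.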
724
725
std::pair<InstrListMap, InstrListMap>
726
31.5k
Vectorizer::collectInstructions(BasicBlock *BB) {
727
31.5k
  InstrListMap LoadRefs;
728
31.5k
  InstrListMap StoreRefs;
729
31.5k
730
309k
  for (Instruction &I : *BB) {
731
309k
    if (!I.mayReadOrWriteMemory())
732
217k
      continue;
733
91.7k
734
91.7k
    if (LoadInst *LI = dyn_cast<LoadInst>(&I)) {
735
57.0k
      if (!LI->isSimple())
736
4.84k
        continue;
737
52.1k
738
52.1k
      // Skip if it's not legal.
739
52.1k
      if (!TTI.isLegalToVectorizeLoad(LI))
740
0
        continue;
741
52.1k
742
52.1k
      Type *Ty = LI->getType();
743
52.1k
      if (!VectorType::isValidElementType(Ty->getScalarType()))
744
50
        continue;
745
52.1k
746
52.1k
      // Skip weird non-byte sizes. They probably aren't worth the effort of
747
52.1k
      // handling correctly.
748
52.1k
      unsigned TySize = DL.getTypeSizeInBits(Ty);
749
52.1k
      if ((TySize % 8) != 0)
750
427
        continue;
751
51.6k
752
51.6k
      // Skip vectors of pointers. The vectorizeLoadChain/vectorizeStoreChain
753
51.6k
      // functions are currently using an integer type for the vectorized
754
51.6k
      // load/store, and do not support casting between the integer type and a
755
51.6k
      // vector of pointers (e.g. i64 to <2 x i16*>)
756
51.6k
      if (Ty->isVectorTy() && 
Ty->isPtrOrPtrVectorTy()7.61k
)
757
18
        continue;
758
51.6k
759
51.6k
      Value *Ptr = LI->getPointerOperand();
760
51.6k
      unsigned AS = Ptr->getType()->getPointerAddressSpace();
761
51.6k
      unsigned VecRegSize = TTI.getLoadStoreVecRegBitWidth(AS);
762
51.6k
763
51.6k
      unsigned VF = VecRegSize / TySize;
764
51.6k
      VectorType *VecTy = dyn_cast<VectorType>(Ty);
765
51.6k
766
51.6k
      // No point in looking at these if they're too big to vectorize.
767
51.6k
      if (TySize > VecRegSize / 2 ||
768
51.6k
          
(49.9k
VecTy49.9k
&&
TTI.getLoadVectorFactor(VF, TySize, TySize / 8, VecTy) == 06.09k
))
769
1.77k
        continue;
770
49.8k
771
49.8k
      // Make sure all the users of a vector are constant-index extracts.
772
49.8k
      if (isa<VectorType>(Ty) && 
!llvm::all_of(LI->users(), [](const User *U) 5.97k
{
773
7.88k
            const ExtractElementInst *EEI = dyn_cast<ExtractElementInst>(U);
774
7.88k
            return EEI && 
isa<ConstantInt>(EEI->getOperand(1))2.76k
;
775
7.88k
          }))
776
5.15k
        continue;
777
44.7k
778
44.7k
      // Save the load locations.
779
44.7k
      const ChainID ID = getChainID(Ptr, DL);
780
44.7k
      LoadRefs[ID].push_back(LI);
781
44.7k
    } else 
if (StoreInst *34.7k
SI34.7k
= dyn_cast<StoreInst>(&I)) {
782
24.1k
      if (!SI->isSimple())
783
4.10k
        continue;
784
20.0k
785
20.0k
      // Skip if it's not legal.
786
20.0k
      if (!TTI.isLegalToVectorizeStore(SI))
787
0
        continue;
788
20.0k
789
20.0k
      Type *Ty = SI->getValueOperand()->getType();
790
20.0k
      if (!VectorType::isValidElementType(Ty->getScalarType()))
791
24
        continue;
792
20.0k
793
20.0k
      // Skip vectors of pointers. The vectorizeLoadChain/vectorizeStoreChain
794
20.0k
      // functions are currently using an integer type for the vectorized
795
20.0k
      // load/store, and do not support casting between the integer type and a
796
20.0k
      // vector of pointers (e.g. i64 to <2 x i16*>)
797
20.0k
      if (Ty->isVectorTy() && 
Ty->isPtrOrPtrVectorTy()5.86k
)
798
6
        continue;
799
20.0k
800
20.0k
      // Skip weird non-byte sizes. They probably aren't worth the effort of
801
20.0k
      // handling correctly.
802
20.0k
      unsigned TySize = DL.getTypeSizeInBits(Ty);
803
20.0k
      if ((TySize % 8) != 0)
804
397
        continue;
805
19.6k
806
19.6k
      Value *Ptr = SI->getPointerOperand();
807
19.6k
      unsigned AS = Ptr->getType()->getPointerAddressSpace();
808
19.6k
      unsigned VecRegSize = TTI.getLoadStoreVecRegBitWidth(AS);
809
19.6k
810
19.6k
      unsigned VF = VecRegSize / TySize;
811
19.6k
      VectorType *VecTy = dyn_cast<VectorType>(Ty);
812
19.6k
813
19.6k
      // No point in looking at these if they're too big to vectorize.
814
19.6k
      if (TySize > VecRegSize / 2 ||
815
19.6k
          
(17.4k
VecTy17.4k
&&
TTI.getStoreVectorFactor(VF, TySize, TySize / 8, VecTy) == 03.92k
))
816
2.59k
        continue;
817
17.0k
818
17.0k
      if (isa<VectorType>(Ty) && 
!llvm::all_of(SI->users(), [](const User *U) 3.52k
{
819
0
            const ExtractElementInst *EEI = dyn_cast<ExtractElementInst>(U);
820
0
            return EEI && isa<ConstantInt>(EEI->getOperand(1));
821
0
          }))
822
0
        continue;
823
17.0k
824
17.0k
      // Save store location.
825
17.0k
      const ChainID ID = getChainID(Ptr, DL);
826
17.0k
      StoreRefs[ID].push_back(SI);
827
17.0k
    }
828
91.7k
  }
829
31.5k
830
31.5k
  return {LoadRefs, StoreRefs};
831
31.5k
}
832
833
63.0k
bool Vectorizer::vectorizeChains(InstrListMap &Map) {
834
63.0k
  bool Changed = false;
835
63.0k
836
63.0k
  for (const std::pair<ChainID, InstrList> &Chain : Map) {
837
38.8k
    unsigned Size = Chain.second.size();
838
38.8k
    if (Size < 2)
839
25.2k
      continue;
840
13.6k
841
13.6k
    LLVM_DEBUG(dbgs() << "LSV: Analyzing a chain of length " << Size << ".\n");
842
13.6k
843
13.6k
    // Process the loads/stores in chunks of 64.
844
27.2k
    for (unsigned CI = 0, CE = Size; CI < CE; 
CI += 6413.6k
) {
845
13.6k
      unsigned Len = std::min<unsigned>(CE - CI, 64);
846
13.6k
      ArrayRef<Instruction *> Chunk(&Chain.second[CI], Len);
847
13.6k
      Changed |= vectorizeInstructions(Chunk);
848
13.6k
    }
849
13.6k
  }
850
63.0k
851
63.0k
  return Changed;
852
63.0k
}
853
854
13.6k
bool Vectorizer::vectorizeInstructions(ArrayRef<Instruction *> Instrs) {
855
13.6k
  LLVM_DEBUG(dbgs() << "LSV: Vectorizing " << Instrs.size()
856
13.6k
                    << " instructions.\n");
857
13.6k
  SmallVector<int, 16> Heads, Tails;
858
13.6k
  int ConsecutiveChain[64];
859
13.6k
860
13.6k
  // Do a quadratic search on all of the given loads/stores and find all of the
861
13.6k
  // pairs of loads/stores that follow each other.
862
50.1k
  for (int i = 0, e = Instrs.size(); i < e; 
++i36.5k
) {
863
36.5k
    ConsecutiveChain[i] = -1;
864
267k
    for (int j = e - 1; j >= 0; 
--j231k
) {
865
231k
      if (i == j)
866
36.5k
        continue;
867
194k
868
194k
      if (isConsecutiveAccess(Instrs[i], Instrs[j])) {
869
15.6k
        if (ConsecutiveChain[i] != -1) {
870
93
          int CurDistance = std::abs(ConsecutiveChain[i] - i);
871
93
          int NewDistance = std::abs(ConsecutiveChain[i] - j);
872
93
          if (j < i || 
NewDistance > CurDistance76
)
873
17
            continue; // Should not insert.
874
15.6k
        }
875
15.6k
876
15.6k
        Tails.push_back(j);
877
15.6k
        Heads.push_back(i);
878
15.6k
        ConsecutiveChain[i] = j;
879
15.6k
      }
880
194k
    }
881
36.5k
  }
882
13.6k
883
13.6k
  bool Changed = false;
884
13.6k
  SmallPtrSet<Instruction *, 16> InstructionsProcessed;
885
13.6k
886
15.6k
  for (int Head : Heads) {
887
15.6k
    if (InstructionsProcessed.count(Instrs[Head]))
888
4.77k
      continue;
889
10.8k
    bool LongerChainExists = false;
890
44.3k
    for (unsigned TIt = 0; TIt < Tails.size(); 
TIt++33.4k
)
891
33.4k
      if (Head == Tails[TIt] &&
892
33.4k
          
!InstructionsProcessed.count(Instrs[Heads[TIt]])65
) {
893
20
        LongerChainExists = true;
894
20
        break;
895
20
      }
896
10.8k
    if (LongerChainExists)
897
20
      continue;
898
10.8k
899
10.8k
    // We found an instr that starts a chain. Now follow the chain and try to
900
10.8k
    // vectorize it.
901
10.8k
    SmallVector<Instruction *, 16> Operands;
902
10.8k
    int I = Head;
903
37.2k
    while (I != -1 && 
(26.4k
is_contained(Tails, I)26.4k
||
is_contained(Heads, I)10.8k
)) {
904
26.4k
      if (InstructionsProcessed.count(Instrs[I]))
905
45
        break;
906
26.4k
907
26.4k
      Operands.push_back(Instrs[I]);
908
26.4k
      I = ConsecutiveChain[I];
909
26.4k
    }
910
10.8k
911
10.8k
    bool Vectorized = false;
912
10.8k
    if (isa<LoadInst>(*Operands.begin()))
913
10.5k
      Vectorized = vectorizeLoadChain(Operands, &InstructionsProcessed);
914
340
    else
915
340
      Vectorized = vectorizeStoreChain(Operands, &InstructionsProcessed);
916
10.8k
917
10.8k
    Changed |= Vectorized;
918
10.8k
  }
919
13.6k
920
13.6k
  return Changed;
921
13.6k
}
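A worked example of the chain-following logic (hypothetical): for Instrs = {L0, L1, L2} with L1 one element past L0 and L2 one past L1, the quadratic search sets ConsecutiveChain[0] = 1 and ConsecutiveChain[1] = 2, with Heads = {0, 1} and Tails = {1, 2}. Head 0 is not the tail of any unprocessed head, so the walk 0 -> 1 -> 2 collects Operands = {L0, L1, L2} and hands them to vectorizeLoadChain; head 1 is then skipped because L1 is already in InstructionsProcessed.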
922
923
bool Vectorizer::vectorizeStoreChain(
924
    ArrayRef<Instruction *> Chain,
925
738
    SmallPtrSet<Instruction *, 16> *InstructionsProcessed) {
926
738
  StoreInst *S0 = cast<StoreInst>(Chain[0]);
927
738
928
738
  // If the vector has an int element, default to int for the whole store.
929
738
  Type *StoreTy = nullptr;
930
4.27k
  for (Instruction *I : Chain) {
931
4.27k
    StoreTy = cast<StoreInst>(I)->getValueOperand()->getType();
932
4.27k
    if (StoreTy->isIntOrIntVectorTy())
933
481
      break;
934
3.79k
935
3.79k
    if (StoreTy->isPtrOrPtrVectorTy()) {
936
6
      StoreTy = Type::getIntNTy(F.getParent()->getContext(),
937
6
                                DL.getTypeSizeInBits(StoreTy));
938
6
      break;
939
6
    }
940
3.79k
  }
941
738
  assert(StoreTy && "Failed to find store type");
942
738
943
738
  unsigned Sz = DL.getTypeSizeInBits(StoreTy);
944
738
  unsigned AS = S0->getPointerAddressSpace();
945
738
  unsigned VecRegSize = TTI.getLoadStoreVecRegBitWidth(AS);
946
738
  unsigned VF = VecRegSize / Sz;
947
738
  unsigned ChainSize = Chain.size();
948
738
  unsigned Alignment = getAlignment(S0);
949
738
950
738
  if (!isPowerOf2_32(Sz) || VF < 2 || ChainSize < 2) {
951
148
    InstructionsProcessed->insert(Chain.begin(), Chain.end());
952
148
    return false;
953
148
  }
954
590
955
590
  ArrayRef<Instruction *> NewChain = getVectorizablePrefix(Chain);
956
590
  if (NewChain.empty()) {
957
17
    // No vectorization possible.
958
17
    InstructionsProcessed->insert(Chain.begin(), Chain.end());
959
17
    return false;
960
17
  }
961
573
  if (NewChain.size() == 1) {
962
18
    // Failed after the first instruction. Discard it and try the smaller chain.
963
18
    InstructionsProcessed->insert(NewChain.front());
964
18
    return false;
965
18
  }
966
555
967
555
  // Update Chain to the valid vectorizable subchain.
968
555
  Chain = NewChain;
969
555
  ChainSize = Chain.size();
970
555
971
555
  // Check if it's legal to vectorize this chain. If not, split the chain and
972
555
  // try again.
973
555
  unsigned EltSzInBytes = Sz / 8;
974
555
  unsigned SzInBytes = EltSzInBytes * ChainSize;
975
555
976
555
  VectorType *VecTy;
977
555
  VectorType *VecStoreTy = dyn_cast<VectorType>(StoreTy);
978
555
  if (VecStoreTy)
979
17
    VecTy = VectorType::get(StoreTy->getScalarType(),
980
17
                            Chain.size() * VecStoreTy->getNumElements());
981
538
  else
982
538
    VecTy = VectorType::get(StoreTy, Chain.size());
983
555
984
555
  // If it's more than the max vector size or the target has a better
985
555
  // vector factor, break it into two pieces.
986
555
  unsigned TargetVF = TTI.getStoreVectorFactor(VF, Sz, SzInBytes, VecTy);
987
555
  if (ChainSize > VF || 
(470
VF != TargetVF470
&&
TargetVF < ChainSize263
)) {
988
121
    LLVM_DEBUG(dbgs() << "LSV: Chain doesn't match with the vector factor."
989
121
                         " Creating two separate arrays.\n");
990
121
    return vectorizeStoreChain(Chain.slice(0, TargetVF),
991
121
                               InstructionsProcessed) |
992
121
           vectorizeStoreChain(Chain.slice(TargetVF), InstructionsProcessed);
993
121
  }
994
434
995
434
  LLVM_DEBUG({
996
434
    dbgs() << "LSV: Stores to vectorize:\n";
997
434
    for (Instruction *I : Chain)
998
434
      dbgs() << "  " << *I << "\n";
999
434
  });
1000
434
1001
434
  // We won't try again to vectorize the elements of the chain, regardless of
1002
434
  // whether we succeed below.
1003
434
  InstructionsProcessed->insert(Chain.begin(), Chain.end());
1004
434
1005
434
  // If the store is going to be misaligned, don't vectorize it.
1006
434
  if (accessIsMisaligned(SzInBytes, AS, Alignment)) {
1007
82
    if (S0->getPointerAddressSpace() != DL.getAllocaAddrSpace()) {
1008
33
      auto Chains = splitOddVectorElts(Chain, Sz);
1009
33
      return vectorizeStoreChain(Chains.first, InstructionsProcessed) |
1010
33
             vectorizeStoreChain(Chains.second, InstructionsProcessed);
1011
33
    }
1012
49
1013
49
    unsigned NewAlign = getOrEnforceKnownAlignment(S0->getPointerOperand(),
1014
49
                                                   StackAdjustedAlignment,
1015
49
                                                   DL, S0, nullptr, &DT);
1016
49
    if (NewAlign != 0)
1017
49
      Alignment = NewAlign;
1018
49
  }
1019
434
1020
434
  
if (401
!TTI.isLegalToVectorizeStoreChain(SzInBytes, Alignment, AS)401
) {
1021
45
    auto Chains = splitOddVectorElts(Chain, Sz);
1022
45
    return vectorizeStoreChain(Chains.first, InstructionsProcessed) |
1023
45
           vectorizeStoreChain(Chains.second, InstructionsProcessed);
1024
45
  }
1025
356
1026
356
  BasicBlock::iterator First, Last;
1027
356
  std::tie(First, Last) = getBoundaryInstrs(Chain);
1028
356
  Builder.SetInsertPoint(&*Last);
1029
356
1030
356
  Value *Vec = UndefValue::get(VecTy);
1031
356
1032
356
  if (VecStoreTy) {
1033
4
    unsigned VecWidth = VecStoreTy->getNumElements();
1034
12
    for (unsigned I = 0, E = Chain.size(); I != E; 
++I8
) {
1035
8
      StoreInst *Store = cast<StoreInst>(Chain[I]);
1036
22
      for (unsigned J = 0, NE = VecStoreTy->getNumElements(); J != NE; 
++J14
) {
1037
14
        unsigned NewIdx = J + I * VecWidth;
1038
14
        Value *Extract = Builder.CreateExtractElement(Store->getValueOperand(),
1039
14
                                                      Builder.getInt32(J));
1040
14
        if (Extract->getType() != StoreTy->getScalarType())
1041
0
          Extract = Builder.CreateBitCast(Extract, StoreTy->getScalarType());
1042
14
1043
14
        Value *Insert =
1044
14
            Builder.CreateInsertElement(Vec, Extract, Builder.getInt32(NewIdx));
1045
14
        Vec = Insert;
1046
14
      }
1047
8
    }
1048
352
  } else {
1049
1.38k
    for (unsigned I = 0, E = Chain.size(); I != E; 
++I1.03k
) {
1050
1.03k
      StoreInst *Store = cast<StoreInst>(Chain[I]);
1051
1.03k
      Value *Extract = Store->getValueOperand();
1052
1.03k
      if (Extract->getType() != StoreTy->getScalarType())
1053
18
        Extract =
1054
18
            Builder.CreateBitOrPointerCast(Extract, StoreTy->getScalarType());
1055
1.03k
1056
1.03k
      Value *Insert =
1057
1.03k
          Builder.CreateInsertElement(Vec, Extract, Builder.getInt32(I));
1058
1.03k
      Vec = Insert;
1059
1.03k
    }
1060
352
  }
1061
356
1062
356
  StoreInst *SI = Builder.CreateAlignedStore(
1063
356
    Vec,
1064
356
    Builder.CreateBitCast(S0->getPointerOperand(), VecTy->getPointerTo(AS)),
1065
356
    Alignment);
1066
356
  propagateMetadata(SI, Chain);
1067
356
1068
356
  eraseInstructions(Chain);
1069
356
  ++NumVectorInstructions;
1070
356
  NumScalarsVectorized += Chain.size();
1071
356
  return true;
1072
356
}
1073
1074
bool Vectorizer::vectorizeLoadChain(
1075
    ArrayRef<Instruction *> Chain,
1076
10.7k
    SmallPtrSet<Instruction *, 16> *InstructionsProcessed) {
1077
10.7k
  LoadInst *L0 = cast<LoadInst>(Chain[0]);
1078
10.7k
1079
10.7k
  // If the vector has an int element, default to int for the whole load.
1080
10.7k
  Type *LoadTy;
1081
12.0k
  for (const auto &V : Chain) {
1082
12.0k
    LoadTy = cast<LoadInst>(V)->getType();
1083
12.0k
    if (LoadTy->isIntOrIntVectorTy())
1084
1.94k
      break;
1085
10.0k
1086
10.0k
    if (LoadTy->isPtrOrPtrVectorTy()) {
1087
8.01k
      LoadTy = Type::getIntNTy(F.getParent()->getContext(),
1088
8.01k
                               DL.getTypeSizeInBits(LoadTy));
1089
8.01k
      break;
1090
8.01k
    }
1091
10.0k
  }
1092
10.7k
1093
10.7k
  unsigned Sz = DL.getTypeSizeInBits(LoadTy);
1094
10.7k
  unsigned AS = L0->getPointerAddressSpace();
1095
10.7k
  unsigned VecRegSize = TTI.getLoadStoreVecRegBitWidth(AS);
1096
10.7k
  unsigned VF = VecRegSize / Sz;
1097
10.7k
  unsigned ChainSize = Chain.size();
1098
10.7k
  unsigned Alignment = getAlignment(L0);
1099
10.7k
1100
10.7k
  if (!isPowerOf2_32(Sz) || VF < 2 || ChainSize < 2) {
1101
191
    InstructionsProcessed->insert(Chain.begin(), Chain.end());
1102
191
    return false;
1103
191
  }
1104
10.5k
1105
10.5k
  ArrayRef<Instruction *> NewChain = getVectorizablePrefix(Chain);
1106
10.5k
  if (NewChain.empty()) {
1107
0
    // No vectorization possible.
1108
0
    InstructionsProcessed->insert(Chain.begin(), Chain.end());
1109
0
    return false;
1110
0
  }
1111
10.5k
  if (NewChain.size() == 1) {
1112
21
    // Failed after the first instruction. Discard it and try the smaller chain.
1113
21
    InstructionsProcessed->insert(NewChain.front());
1114
21
    return false;
1115
21
  }
1116
10.5k
1117
10.5k
  // Update Chain to the valid vectorizable subchain.
1118
10.5k
  Chain = NewChain;
1119
10.5k
  ChainSize = Chain.size();
1120
10.5k
1121
10.5k
  // Check if it's legal to vectorize this chain. If not, split the chain and
1122
10.5k
  // try again.
1123
10.5k
  unsigned EltSzInBytes = Sz / 8;
1124
10.5k
  unsigned SzInBytes = EltSzInBytes * ChainSize;
1125
10.5k
  VectorType *VecTy;
1126
10.5k
  VectorType *VecLoadTy = dyn_cast<VectorType>(LoadTy);
1127
10.5k
  if (VecLoadTy)
1128
41
    VecTy = VectorType::get(LoadTy->getScalarType(),
1129
41
                            Chain.size() * VecLoadTy->getNumElements());
1130
10.4k
  else
1131
10.4k
    VecTy = VectorType::get(LoadTy, Chain.size());
1132
10.5k
1133
10.5k
  // If it's more than the max vector size or the target has a better
1134
10.5k
  // vector factor, break it into two pieces.
1135
10.5k
  unsigned TargetVF = TTI.getLoadVectorFactor(VF, Sz, SzInBytes, VecTy);
1136
10.5k
  if (ChainSize > VF || 
(10.5k
VF != TargetVF10.5k
&&
TargetVF < ChainSize62
)) {
1137
9
    LLVM_DEBUG(dbgs() << "LSV: Chain doesn't match with the vector factor."
1138
9
                         " Creating two separate arrays.\n");
1139
9
    return vectorizeLoadChain(Chain.slice(0, TargetVF), InstructionsProcessed) |
1140
9
           vectorizeLoadChain(Chain.slice(TargetVF), InstructionsProcessed);
1141
9
  }
1142
10.4k
1143
10.4k
  // We won't try again to vectorize the elements of the chain, regardless of
1144
10.4k
  // whether we succeed below.
1145
10.4k
  InstructionsProcessed->insert(Chain.begin(), Chain.end());
1146
10.4k
1147
10.4k
  // If the load is going to be misaligned, don't vectorize it.
1148
10.4k
  if (accessIsMisaligned(SzInBytes, AS, Alignment)) {
1149
95
    if (L0->getPointerAddressSpace() != DL.getAllocaAddrSpace()) {
1150
78
      auto Chains = splitOddVectorElts(Chain, Sz);
1151
78
      return vectorizeLoadChain(Chains.first, InstructionsProcessed) |
1152
78
             vectorizeLoadChain(Chains.second, InstructionsProcessed);
1153
78
    }
1154
17
1155
17
    Alignment = getOrEnforceKnownAlignment(
1156
17
        L0->getPointerOperand(), StackAdjustedAlignment, DL, L0, nullptr, &DT);
1157
17
  }
1158
10.4k
1159
10.4k
  
if (10.4k
!TTI.isLegalToVectorizeLoadChain(SzInBytes, Alignment, AS)10.4k
) {
1160
11
    auto Chains = splitOddVectorElts(Chain, Sz);
1161
11
    return vectorizeLoadChain(Chains.first, InstructionsProcessed) |
1162
11
           vectorizeLoadChain(Chains.second, InstructionsProcessed);
1163
11
  }
1164
10.4k
1165
10.4k
  LLVM_DEBUG({
1166
10.4k
    dbgs() << "LSV: Loads to vectorize:\n";
1167
10.4k
    for (Instruction *I : Chain)
1168
10.4k
      I->dump();
1169
10.4k
  });
1170
10.4k
1171
10.4k
  // getVectorizablePrefix already computed getBoundaryInstrs.  The value of
1172
10.4k
  // Last may have changed since then, but the value of First won't have.  If it
1173
10.4k
  // matters, we could compute getBoundaryInstrs only once and reuse it here.
1174
10.4k
  BasicBlock::iterator First, Last;
1175
10.4k
  std::tie(First, Last) = getBoundaryInstrs(Chain);
1176
10.4k
  Builder.SetInsertPoint(&*First);
1177
10.4k
1178
10.4k
  Value *Bitcast =
1179
10.4k
      Builder.CreateBitCast(L0->getPointerOperand(), VecTy->getPointerTo(AS));
1180
10.4k
  LoadInst *LI = Builder.CreateAlignedLoad(VecTy, Bitcast, Alignment);
1181
10.4k
  propagateMetadata(LI, Chain);
1182
10.4k
1183
10.4k
  if (VecLoadTy) {
1184
35
    SmallVector<Instruction *, 16> InstrsToErase;
1185
35
1186
35
    unsigned VecWidth = VecLoadTy->getNumElements();
1187
111
    for (unsigned I = 0, E = Chain.size(); I != E; 
++I76
) {
1188
182
      for (auto Use : Chain[I]->users()) {
1189
182
        // All users of vector loads are ExtractElement instructions with
1190
182
        // constant indices, otherwise we would have bailed before now.
1191
182
        Instruction *UI = cast<Instruction>(Use);
1192
182
        unsigned Idx = cast<ConstantInt>(UI->getOperand(1))->getZExtValue();
1193
182
        unsigned NewIdx = Idx + I * VecWidth;
1194
182
        Value *V = Builder.CreateExtractElement(LI, Builder.getInt32(NewIdx),
1195
182
                                                UI->getName());
1196
182
        if (V->getType() != UI->getType())
1197
0
          V = Builder.CreateBitCast(V, UI->getType());
1198
182
1199
182
        // Replace the old instruction.
1200
182
        UI->replaceAllUsesWith(V);
1201
182
        InstrsToErase.push_back(UI);
1202
182
      }
1203
76
    }
1204
35
1205
35
    // Bitcast might not be an Instruction if the pointer operand is a
1206
35
    // constant.  In that case, no need to reorder anything.
1207
35
    if (Instruction *BitcastInst = dyn_cast<Instruction>(Bitcast))
1208
33
      reorder(BitcastInst);
1209
35
1210
35
    for (auto I : InstrsToErase)
1211
182
      I->eraseFromParent();
1212
10.3k
  } else {
1213
35.2k
    for (unsigned I = 0, E = Chain.size(); I != E; 
++I24.8k
) {
1214
24.8k
      Value *CV = Chain[I];
1215
24.8k
      Value *V =
1216
24.8k
          Builder.CreateExtractElement(LI, Builder.getInt32(I), CV->getName());
1217
24.8k
      if (V->getType() != CV->getType()) {
1218
17.8k
        V = Builder.CreateBitOrPointerCast(V, CV->getType());
1219
17.8k
      }
1220
24.8k
1221
24.8k
      // Replace the old instruction.
1222
24.8k
      CV->replaceAllUsesWith(V);
1223
24.8k
    }
1224
10.3k
1225
10.3k
    if (Instruction *BitcastInst = dyn_cast<Instruction>(Bitcast))
1226
10.3k
      reorder(BitcastInst);
1227
10.3k
  }
1228
10.4k
1229
10.4k
  eraseInstructions(Chain);
1230
10.4k
1231
10.4k
  ++NumVectorInstructions;
1232
10.4k
  NumScalarsVectorized += Chain.size();
1233
10.4k
  return true;
1234
10.4k
}
1235
1236
bool Vectorizer::accessIsMisaligned(unsigned SzInBytes, unsigned AddressSpace,
1237
10.9k
                                    unsigned Alignment) {
1238
10.9k
  if (Alignment % SzInBytes == 0)
1239
1.50k
    return false;
1240
9.42k
1241
9.42k
  bool Fast = false;
1242
9.42k
  bool Allows = TTI.allowsMisalignedMemoryAccesses(F.getParent()->getContext(),
1243
9.42k
                                                   SzInBytes * 8, AddressSpace,
1244
9.42k
                                                   Alignment, &Fast);
1245
9.42k
  LLVM_DEBUG(dbgs() << "LSV: Target said misaligned is allowed? " << Allows
1246
9.42k
                    << " and fast? " << Fast << "\n";);
1247
9.42k
  return !Allows || 
!Fast9.25k
;
1248
9.42k
}