Coverage Report

Created: 2017-06-30 05:10

/Users/buildslave/jenkins/sharedspace/clang-stage2-coverage-R@2/llvm/lib/Transforms/Vectorize/BBVectorize.cpp
Line
Count
Source (jump to first uncovered line)
1
//===- BBVectorize.cpp - A Basic-Block Vectorizer -------------------------===//
2
//
3
//                     The LLVM Compiler Infrastructure
4
//
5
// This file is distributed under the University of Illinois Open Source
6
// License. See LICENSE.TXT for details.
7
//
8
//===----------------------------------------------------------------------===//
9
//
10
// This file implements a basic-block vectorization pass. The algorithm was
11
// inspired by that used by the Vienna MAP Vectorizor by Franchetti and Kral,
12
// et al. It works by looking for chains of pairable operations and then
13
// pairing them.
14
//
15
//===----------------------------------------------------------------------===//
16
17
#define BBV_NAME "bb-vectorize"
18
#include "llvm/ADT/DenseMap.h"
19
#include "llvm/ADT/DenseSet.h"
20
#include "llvm/ADT/STLExtras.h"
21
#include "llvm/ADT/SmallSet.h"
22
#include "llvm/ADT/SmallVector.h"
23
#include "llvm/ADT/Statistic.h"
24
#include "llvm/ADT/StringExtras.h"
25
#include "llvm/Analysis/AliasAnalysis.h"
26
#include "llvm/Analysis/AliasSetTracker.h"
27
#include "llvm/Analysis/GlobalsModRef.h"
28
#include "llvm/Analysis/ScalarEvolution.h"
29
#include "llvm/Analysis/ScalarEvolutionAliasAnalysis.h"
30
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
31
#include "llvm/Analysis/TargetLibraryInfo.h"
32
#include "llvm/Analysis/TargetTransformInfo.h"
33
#include "llvm/Analysis/ValueTracking.h"
34
#include "llvm/IR/Constants.h"
35
#include "llvm/IR/DataLayout.h"
36
#include "llvm/IR/DerivedTypes.h"
37
#include "llvm/IR/Dominators.h"
38
#include "llvm/IR/Function.h"
39
#include "llvm/IR/Instructions.h"
40
#include "llvm/IR/IntrinsicInst.h"
41
#include "llvm/IR/Intrinsics.h"
42
#include "llvm/IR/LLVMContext.h"
43
#include "llvm/IR/Metadata.h"
44
#include "llvm/IR/Module.h"
45
#include "llvm/IR/Type.h"
46
#include "llvm/IR/ValueHandle.h"
47
#include "llvm/Pass.h"
48
#include "llvm/Support/CommandLine.h"
49
#include "llvm/Support/Debug.h"
50
#include "llvm/Support/raw_ostream.h"
51
#include "llvm/Transforms/Utils/Local.h"
52
#include "llvm/Transforms/Vectorize.h"
53
#include <algorithm>
54
using namespace llvm;
55
56
#define DEBUG_TYPE BBV_NAME
57
58
static cl::opt<bool>
59
IgnoreTargetInfo("bb-vectorize-ignore-target-info",  cl::init(false),
60
  cl::Hidden, cl::desc("Ignore target information"));
61
62
static cl::opt<unsigned>
63
ReqChainDepth("bb-vectorize-req-chain-depth", cl::init(6), cl::Hidden,
64
  cl::desc("The required chain depth for vectorization"));
65
66
static cl::opt<bool>
67
UseChainDepthWithTI("bb-vectorize-use-chain-depth",  cl::init(false),
68
  cl::Hidden, cl::desc("Use the chain depth requirement with"
69
                       " target information"));
70
71
static cl::opt<unsigned>
72
SearchLimit("bb-vectorize-search-limit", cl::init(400), cl::Hidden,
73
  cl::desc("The maximum search distance for instruction pairs"));
74
75
static cl::opt<bool>
76
SplatBreaksChain("bb-vectorize-splat-breaks-chain", cl::init(false), cl::Hidden,
77
  cl::desc("Replicating one element to a pair breaks the chain"));
78
79
static cl::opt<unsigned>
80
VectorBits("bb-vectorize-vector-bits", cl::init(128), cl::Hidden,
81
  cl::desc("The size of the native vector registers"));
82
83
static cl::opt<unsigned>
84
MaxIter("bb-vectorize-max-iter", cl::init(0), cl::Hidden,
85
  cl::desc("The maximum number of pairing iterations"));
86
87
static cl::opt<bool>
88
Pow2LenOnly("bb-vectorize-pow2-len-only", cl::init(false), cl::Hidden,
89
  cl::desc("Don't try to form non-2^n-length vectors"));
90
91
static cl::opt<unsigned>
92
MaxInsts("bb-vectorize-max-instr-per-group", cl::init(500), cl::Hidden,
93
  cl::desc("The maximum number of pairable instructions per group"));
94
95
static cl::opt<unsigned>
96
MaxPairs("bb-vectorize-max-pairs-per-group", cl::init(3000), cl::Hidden,
97
  cl::desc("The maximum number of candidate instruction pairs per group"));
98
99
static cl::opt<unsigned>
100
MaxCandPairsForCycleCheck("bb-vectorize-max-cycle-check-pairs", cl::init(200),
101
  cl::Hidden, cl::desc("The maximum number of candidate pairs with which to use"
102
                       " a full cycle check"));
103
104
static cl::opt<bool>
105
NoBools("bb-vectorize-no-bools", cl::init(false), cl::Hidden,
106
  cl::desc("Don't try to vectorize boolean (i1) values"));
107
108
static cl::opt<bool>
109
NoInts("bb-vectorize-no-ints", cl::init(false), cl::Hidden,
110
  cl::desc("Don't try to vectorize integer values"));
111
112
static cl::opt<bool>
113
NoFloats("bb-vectorize-no-floats", cl::init(false), cl::Hidden,
114
  cl::desc("Don't try to vectorize floating-point values"));
115
116
// FIXME: This should default to false once pointer vector support works.
117
static cl::opt<bool>
118
NoPointers("bb-vectorize-no-pointers", cl::init(/*false*/ true), cl::Hidden,
119
  cl::desc("Don't try to vectorize pointer values"));
120
121
static cl::opt<bool>
122
NoCasts("bb-vectorize-no-casts", cl::init(false), cl::Hidden,
123
  cl::desc("Don't try to vectorize casting (conversion) operations"));
124
125
static cl::opt<bool>
126
NoMath("bb-vectorize-no-math", cl::init(false), cl::Hidden,
127
  cl::desc("Don't try to vectorize floating-point math intrinsics"));
128
129
static cl::opt<bool>
130
  NoBitManipulation("bb-vectorize-no-bitmanip", cl::init(false), cl::Hidden,
131
  cl::desc("Don't try to vectorize BitManipulation intrinsics"));
132
133
static cl::opt<bool>
134
NoFMA("bb-vectorize-no-fma", cl::init(false), cl::Hidden,
135
  cl::desc("Don't try to vectorize the fused-multiply-add intrinsic"));
136
137
static cl::opt<bool>
138
NoSelect("bb-vectorize-no-select", cl::init(false), cl::Hidden,
139
  cl::desc("Don't try to vectorize select instructions"));
140
141
static cl::opt<bool>
142
NoCmp("bb-vectorize-no-cmp", cl::init(false), cl::Hidden,
143
  cl::desc("Don't try to vectorize comparison instructions"));
144
145
static cl::opt<bool>
146
NoGEP("bb-vectorize-no-gep", cl::init(false), cl::Hidden,
147
  cl::desc("Don't try to vectorize getelementptr instructions"));
148
149
static cl::opt<bool>
150
NoMemOps("bb-vectorize-no-mem-ops", cl::init(false), cl::Hidden,
151
  cl::desc("Don't try to vectorize loads and stores"));
152
153
static cl::opt<bool>
154
AlignedOnly("bb-vectorize-aligned-only", cl::init(false), cl::Hidden,
155
  cl::desc("Only generate aligned loads and stores"));
156
157
static cl::opt<bool>
158
NoMemOpBoost("bb-vectorize-no-mem-op-boost",
159
  cl::init(false), cl::Hidden,
160
  cl::desc("Don't boost the chain-depth contribution of loads and stores"));
161
162
static cl::opt<bool>
163
FastDep("bb-vectorize-fast-dep", cl::init(false), cl::Hidden,
164
  cl::desc("Use a fast instruction dependency analysis"));
165
166
#ifndef NDEBUG
167
static cl::opt<bool>
168
DebugInstructionExamination("bb-vectorize-debug-instruction-examination",
169
  cl::init(false), cl::Hidden,
170
  cl::desc("When debugging is enabled, output information on the"
171
           " instruction-examination process"));
172
static cl::opt<bool>
173
DebugCandidateSelection("bb-vectorize-debug-candidate-selection",
174
  cl::init(false), cl::Hidden,
175
  cl::desc("When debugging is enabled, output information on the"
176
           " candidate-selection process"));
177
static cl::opt<bool>
178
DebugPairSelection("bb-vectorize-debug-pair-selection",
179
  cl::init(false), cl::Hidden,
180
  cl::desc("When debugging is enabled, output information on the"
181
           " pair-selection process"));
182
static cl::opt<bool>
183
DebugCycleCheck("bb-vectorize-debug-cycle-check",
184
  cl::init(false), cl::Hidden,
185
  cl::desc("When debugging is enabled, output information on the"
186
           " cycle-checking process"));
187
188
static cl::opt<bool>
189
PrintAfterEveryPair("bb-vectorize-debug-print-after-every-pair",
190
  cl::init(false), cl::Hidden,
191
  cl::desc("When debugging is enabled, dump the basic block after"
192
           " every pair is fused"));
193
#endif
194
195
STATISTIC(NumFusedOps, "Number of operations fused by bb-vectorize");
196
197
namespace {
198
  struct BBVectorize : public BasicBlockPass {
199
    static char ID; // Pass identification, replacement for typeid
200
201
    const VectorizeConfig Config;
202
203
    BBVectorize(const VectorizeConfig &C = VectorizeConfig())
204
36
      : BasicBlockPass(ID), Config(C) {
205
36
      initializeBBVectorizePass(*PassRegistry::getPassRegistry());
206
36
    }
207
208
    BBVectorize(Pass *P, Function &F, const VectorizeConfig &C)
209
0
      : BasicBlockPass(ID), Config(C) {
210
0
      AA = &P->getAnalysis<AAResultsWrapperPass>().getAAResults();
211
0
      DT = &P->getAnalysis<DominatorTreeWrapperPass>().getDomTree();
212
0
      SE = &P->getAnalysis<ScalarEvolutionWrapperPass>().getSE();
213
0
      TLI = &P->getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
214
0
      TTI = IgnoreTargetInfo
215
0
                ? nullptr
216
0
                : &P->getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
217
0
    }
218
219
    typedef std::pair<Value *, Value *> ValuePair;
220
    typedef std::pair<ValuePair, int> ValuePairWithCost;
221
    typedef std::pair<ValuePair, size_t> ValuePairWithDepth;
222
    typedef std::pair<ValuePair, ValuePair> VPPair; // A ValuePair pair
223
    typedef std::pair<VPPair, unsigned> VPPairWithType;
224
225
    AliasAnalysis *AA;
226
    DominatorTree *DT;
227
    ScalarEvolution *SE;
228
    const TargetLibraryInfo *TLI;
229
    const TargetTransformInfo *TTI;
230
231
    // FIXME: const correct?
232
233
    bool vectorizePairs(BasicBlock &BB, bool NonPow2Len = false);
234
235
    bool getCandidatePairs(BasicBlock &BB,
236
                       BasicBlock::iterator &Start,
237
                       DenseMap<Value *, std::vector<Value *> > &CandidatePairs,
238
                       DenseSet<ValuePair> &FixedOrderPairs,
239
                       DenseMap<ValuePair, int> &CandidatePairCostSavings,
240
                       std::vector<Value *> &PairableInsts, bool NonPow2Len);
241
242
    // FIXME: The current implementation does not account for pairs that
243
    // are connected in multiple ways. For example:
244
    //   C1 = A1 / A2; C2 = A2 / A1 (which may be both direct and a swap)
245
    enum PairConnectionType {
246
      PairConnectionDirect,
247
      PairConnectionSwap,
248
      PairConnectionSplat
249
    };
250
251
    void computeConnectedPairs(
252
             DenseMap<Value *, std::vector<Value *> > &CandidatePairs,
253
             DenseSet<ValuePair> &CandidatePairsSet,
254
             std::vector<Value *> &PairableInsts,
255
             DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairs,
256
             DenseMap<VPPair, unsigned> &PairConnectionTypes);
257
258
    void buildDepMap(BasicBlock &BB,
259
             DenseMap<Value *, std::vector<Value *> > &CandidatePairs,
260
             std::vector<Value *> &PairableInsts,
261
             DenseSet<ValuePair> &PairableInstUsers);
262
263
    void choosePairs(DenseMap<Value *, std::vector<Value *> > &CandidatePairs,
264
             DenseSet<ValuePair> &CandidatePairsSet,
265
             DenseMap<ValuePair, int> &CandidatePairCostSavings,
266
             std::vector<Value *> &PairableInsts,
267
             DenseSet<ValuePair> &FixedOrderPairs,
268
             DenseMap<VPPair, unsigned> &PairConnectionTypes,
269
             DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairs,
270
             DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairDeps,
271
             DenseSet<ValuePair> &PairableInstUsers,
272
             DenseMap<Value *, Value *>& ChosenPairs);
273
274
    void fuseChosenPairs(BasicBlock &BB,
275
             std::vector<Value *> &PairableInsts,
276
             DenseMap<Value *, Value *>& ChosenPairs,
277
             DenseSet<ValuePair> &FixedOrderPairs,
278
             DenseMap<VPPair, unsigned> &PairConnectionTypes,
279
             DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairs,
280
             DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairDeps);
281
282
283
    bool isInstVectorizable(Instruction *I, bool &IsSimpleLoadStore);
284
285
    bool areInstsCompatible(Instruction *I, Instruction *J,
286
                       bool IsSimpleLoadStore, bool NonPow2Len,
287
                       int &CostSavings, int &FixedOrder);
288
289
    bool trackUsesOfI(DenseSet<Value *> &Users,
290
                      AliasSetTracker &WriteSet, Instruction *I,
291
                      Instruction *J, bool UpdateUsers = true,
292
                      DenseSet<ValuePair> *LoadMoveSetPairs = nullptr);
293
294
  void computePairsConnectedTo(
295
             DenseMap<Value *, std::vector<Value *> > &CandidatePairs,
296
             DenseSet<ValuePair> &CandidatePairsSet,
297
             std::vector<Value *> &PairableInsts,
298
             DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairs,
299
             DenseMap<VPPair, unsigned> &PairConnectionTypes,
300
             ValuePair P);
301
302
    bool pairsConflict(ValuePair P, ValuePair Q,
303
             DenseSet<ValuePair> &PairableInstUsers,
304
             DenseMap<ValuePair, std::vector<ValuePair> >
305
               *PairableInstUserMap = nullptr,
306
             DenseSet<VPPair> *PairableInstUserPairSet = nullptr);
307
308
    bool pairWillFormCycle(ValuePair P,
309
             DenseMap<ValuePair, std::vector<ValuePair> > &PairableInstUsers,
310
             DenseSet<ValuePair> &CurrentPairs);
311
312
    void pruneDAGFor(
313
             DenseMap<Value *, std::vector<Value *> > &CandidatePairs,
314
             std::vector<Value *> &PairableInsts,
315
             DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairs,
316
             DenseSet<ValuePair> &PairableInstUsers,
317
             DenseMap<ValuePair, std::vector<ValuePair> > &PairableInstUserMap,
318
             DenseSet<VPPair> &PairableInstUserPairSet,
319
             DenseMap<Value *, Value *> &ChosenPairs,
320
             DenseMap<ValuePair, size_t> &DAG,
321
             DenseSet<ValuePair> &PrunedDAG, ValuePair J,
322
             bool UseCycleCheck);
323
324
    void buildInitialDAGFor(
325
             DenseMap<Value *, std::vector<Value *> > &CandidatePairs,
326
             DenseSet<ValuePair> &CandidatePairsSet,
327
             std::vector<Value *> &PairableInsts,
328
             DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairs,
329
             DenseSet<ValuePair> &PairableInstUsers,
330
             DenseMap<Value *, Value *> &ChosenPairs,
331
             DenseMap<ValuePair, size_t> &DAG, ValuePair J);
332
333
    void findBestDAGFor(
334
             DenseMap<Value *, std::vector<Value *> > &CandidatePairs,
335
             DenseSet<ValuePair> &CandidatePairsSet,
336
             DenseMap<ValuePair, int> &CandidatePairCostSavings,
337
             std::vector<Value *> &PairableInsts,
338
             DenseSet<ValuePair> &FixedOrderPairs,
339
             DenseMap<VPPair, unsigned> &PairConnectionTypes,
340
             DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairs,
341
             DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairDeps,
342
             DenseSet<ValuePair> &PairableInstUsers,
343
             DenseMap<ValuePair, std::vector<ValuePair> > &PairableInstUserMap,
344
             DenseSet<VPPair> &PairableInstUserPairSet,
345
             DenseMap<Value *, Value *> &ChosenPairs,
346
             DenseSet<ValuePair> &BestDAG, size_t &BestMaxDepth,
347
             int &BestEffSize, Value *II, std::vector<Value *>&JJ,
348
             bool UseCycleCheck);
349
350
    Value *getReplacementPointerInput(LLVMContext& Context, Instruction *I,
351
                     Instruction *J, unsigned o);
352
353
    void fillNewShuffleMask(LLVMContext& Context, Instruction *J,
354
                     unsigned MaskOffset, unsigned NumInElem,
355
                     unsigned NumInElem1, unsigned IdxOffset,
356
                     std::vector<Constant*> &Mask);
357
358
    Value *getReplacementShuffleMask(LLVMContext& Context, Instruction *I,
359
                     Instruction *J);
360
361
    bool expandIEChain(LLVMContext& Context, Instruction *I, Instruction *J,
362
                       unsigned o, Value *&LOp, unsigned numElemL,
363
                       Type *ArgTypeL, Type *ArgTypeR, bool IBeforeJ,
364
                       unsigned IdxOff = 0);
365
366
    Value *getReplacementInput(LLVMContext& Context, Instruction *I,
367
                     Instruction *J, unsigned o, bool IBeforeJ);
368
369
    void getReplacementInputsForPair(LLVMContext& Context, Instruction *I,
370
                     Instruction *J, SmallVectorImpl<Value *> &ReplacedOperands,
371
                     bool IBeforeJ);
372
373
    void replaceOutputsOfPair(LLVMContext& Context, Instruction *I,
374
                     Instruction *J, Instruction *K,
375
                     Instruction *&InsertionPt, Instruction *&K1,
376
                     Instruction *&K2);
377
378
    void collectPairLoadMoveSet(BasicBlock &BB,
379
                     DenseMap<Value *, Value *> &ChosenPairs,
380
                     DenseMap<Value *, std::vector<Value *> > &LoadMoveSet,
381
                     DenseSet<ValuePair> &LoadMoveSetPairs,
382
                     Instruction *I);
383
384
    void collectLoadMoveSet(BasicBlock &BB,
385
                     std::vector<Value *> &PairableInsts,
386
                     DenseMap<Value *, Value *> &ChosenPairs,
387
                     DenseMap<Value *, std::vector<Value *> > &LoadMoveSet,
388
                     DenseSet<ValuePair> &LoadMoveSetPairs);
389
390
    bool canMoveUsesOfIAfterJ(BasicBlock &BB,
391
                     DenseSet<ValuePair> &LoadMoveSetPairs,
392
                     Instruction *I, Instruction *J);
393
394
    void moveUsesOfIAfterJ(BasicBlock &BB,
395
                     DenseSet<ValuePair> &LoadMoveSetPairs,
396
                     Instruction *&InsertionPt,
397
                     Instruction *I, Instruction *J);
398
399
107
    bool vectorizeBB(BasicBlock &BB) {
400
107
      if (skipBasicBlock(BB))
401
0
        return false;
402
107
      
if (107
!DT->isReachableFromEntry(&BB)107
)
{4
403
4
        DEBUG(dbgs() << "BBV: skipping unreachable " << BB.getName() <<
404
4
              " in " << BB.getParent()->getName() << "\n");
405
4
        return false;
406
4
      }
407
107
408
103
      
DEBUG103
(if (TTI) dbgs() << "BBV: using target information\n");103
409
103
410
103
      bool changed = false;
411
103
      // Iterate a sufficient number of times to merge types of size 1 bit,
412
103
      // then 2 bits, then 4, etc. up to half of the target vector width of the
413
103
      // target vector register.
414
103
      unsigned n = 1;
415
103
      for (unsigned v = 2;
416
172
           
(TTI || 172
v <= Config.VectorBits100
) &&
417
172
           
(!Config.MaxIter || 172
n <= Config.MaxIter0
);
418
172
           
v *= 2, ++n69
)
{172
419
172
        DEBUG(dbgs() << "BBV: fusing loop #" << n <<
420
172
              " for " << BB.getName() << " in " <<
421
172
              BB.getParent()->getName() << "...\n");
422
172
        if (vectorizePairs(BB))
423
69
          changed = true;
424
172
        else
425
103
          break;
426
172
      }
427
103
428
103
      if (
changed && 103
!Pow2LenOnly60
)
{60
429
60
        ++n;
430
61
        for (; 
!Config.MaxIter || 61
n <= Config.MaxIter0
;
++n1
)
{61
431
61
          DEBUG(dbgs() << "BBV: fusing for non-2^n-length vectors loop #: " <<
432
61
                n << " for " << BB.getName() << " in " <<
433
61
                BB.getParent()->getName() << "...\n");
434
61
          if (
!vectorizePairs(BB, true)61
)
break60
;
435
61
        }
436
60
      }
437
103
438
103
      DEBUG(dbgs() << "BBV: done!\n");
439
103
      return changed;
440
107
    }
441
442
107
    bool runOnBasicBlock(BasicBlock &BB) override {
443
107
      // OptimizeNone check deferred to vectorizeBB().
444
107
445
107
      AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
446
107
      DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
447
107
      SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
448
107
      TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
449
107
      TTI = IgnoreTargetInfo
450
64
                ? nullptr
451
43
                : &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(
452
43
                      *BB.getParent());
453
107
454
107
      return vectorizeBB(BB);
455
107
    }
456
457
36
    void getAnalysisUsage(AnalysisUsage &AU) const override {
458
36
      BasicBlockPass::getAnalysisUsage(AU);
459
36
      AU.addRequired<AAResultsWrapperPass>();
460
36
      AU.addRequired<DominatorTreeWrapperPass>();
461
36
      AU.addRequired<ScalarEvolutionWrapperPass>();
462
36
      AU.addRequired<TargetLibraryInfoWrapperPass>();
463
36
      AU.addRequired<TargetTransformInfoWrapperPass>();
464
36
      AU.addPreserved<DominatorTreeWrapperPass>();
465
36
      AU.addPreserved<GlobalsAAWrapperPass>();
466
36
      AU.addPreserved<ScalarEvolutionWrapperPass>();
467
36
      AU.addPreserved<SCEVAAWrapperPass>();
468
36
      AU.setPreservesCFG();
469
36
    }
470
471
34.6k
    static inline VectorType *getVecTypeForPair(Type *ElemTy, Type *Elem2Ty) {
472
34.6k
      assert(ElemTy->getScalarType() == Elem2Ty->getScalarType() &&
473
34.6k
             "Cannot form vector from incompatible scalar types");
474
34.6k
      Type *STy = ElemTy->getScalarType();
475
34.6k
476
34.6k
      unsigned numElem;
477
34.6k
      if (VectorType *
VTy34.6k
= dyn_cast<VectorType>(ElemTy))
{16.5k
478
16.5k
        numElem = VTy->getNumElements();
479
18.0k
      } else {
480
18.0k
        numElem = 1;
481
18.0k
      }
482
34.6k
483
34.6k
      if (VectorType *
VTy34.6k
= dyn_cast<VectorType>(Elem2Ty))
{16.5k
484
16.5k
        numElem += VTy->getNumElements();
485
18.1k
      } else {
486
18.1k
        numElem += 1;
487
18.1k
      }
488
34.6k
489
34.6k
      return VectorType::get(STy, numElem);
490
34.6k
    }
491
492
    static inline void getInstructionTypes(Instruction *I,
493
29.5k
                                           Type *&T1, Type *&T2) {
494
29.5k
      if (StoreInst *
SI29.5k
= dyn_cast<StoreInst>(I))
{1.77k
495
1.77k
        // For stores, it is the value type, not the pointer type that matters
496
1.77k
        // because the value is what will come from a vector register.
497
1.77k
498
1.77k
        Value *IVal = SI->getValueOperand();
499
1.77k
        T1 = IVal->getType();
500
27.8k
      } else {
501
27.8k
        T1 = I->getType();
502
27.8k
      }
503
29.5k
504
29.5k
      if (CastInst *CI = dyn_cast<CastInst>(I))
505
1.54k
        T2 = CI->getSrcTy();
506
29.5k
      else
507
28.0k
        T2 = T1;
508
29.5k
509
29.5k
      if (SelectInst *
SI29.5k
= dyn_cast<SelectInst>(I))
{10
510
10
        T2 = SI->getCondition()->getType();
511
29.5k
      } else 
if (ShuffleVectorInst *29.5k
SI29.5k
= dyn_cast<ShuffleVectorInst>(I))
{499
512
499
        T2 = SI->getOperand(0)->getType();
513
29.0k
      } else 
if (CmpInst *29.0k
CI29.0k
= dyn_cast<CmpInst>(I))
{33
514
33
        T2 = CI->getOperand(0)->getType();
515
33
      }
516
29.5k
    }
517
518
    // Returns the weight associated with the provided value. A chain of
519
    // candidate pairs has a length given by the sum of the weights of its
520
    // members (one weight per pair; the weight of each member of the pair
521
    // is assumed to be the same). This length is then compared to the
522
    // chain-length threshold to determine if a given chain is significant
523
    // enough to be vectorized. The length is also used in comparing
524
    // candidate chains where longer chains are considered to be better.
525
    // Note: when this function returns 0, the resulting instructions are
526
    // not actually fused.
527
21.0k
    inline size_t getDepthFactor(Value *V) {
528
21.0k
      // InsertElement and ExtractElement have a depth factor of zero. This is
529
21.0k
      // for two reasons: First, they cannot be usefully fused. Second, because
530
21.0k
      // the pass generates a lot of these, they can confuse the simple metric
531
21.0k
      // used to compare the dags in the next iteration. Thus, giving them a
532
21.0k
      // weight of zero allows the pass to essentially ignore them in
533
21.0k
      // subsequent iterations when looking for vectorization opportunities
534
21.0k
      // while still tracking dependency chains that flow through those
535
21.0k
      // instructions.
536
21.0k
      if (
isa<InsertElementInst>(V) || 21.0k
isa<ExtractElementInst>(V)14.3k
)
537
7.59k
        return 0;
538
21.0k
539
21.0k
      // Give a load or store half of the required depth so that load/store
540
21.0k
      // pairs will vectorize.
541
13.4k
      
if (13.4k
!Config.NoMemOpBoost && 13.4k
(isa<LoadInst>(V) || 13.4k
isa<StoreInst>(V)13.2k
))
542
1.17k
        return Config.ReqChainDepth/2;
543
13.4k
544
12.2k
      return 1;
545
13.4k
    }
546
547
    // Returns the cost of the provided instruction using TTI.
548
    // This does not handle loads and stores.
549
    unsigned getInstrCost(unsigned Opcode, Type *T1, Type *T2,
550
                          TargetTransformInfo::OperandValueKind Op1VK =
551
                              TargetTransformInfo::OK_AnyValue,
552
                          TargetTransformInfo::OperandValueKind Op2VK =
553
                              TargetTransformInfo::OK_AnyValue,
554
33.9k
                          const Instruction *I = nullptr) {
555
33.9k
      switch (Opcode) {
556
6.18k
      default: break;
557
0
      case Instruction::GetElementPtr:
558
0
        // We mark this instruction as zero-cost because scalar GEPs are usually
559
0
        // lowered to the instruction addressing mode. At the moment we don't
560
0
        // generate vector GEPs.
561
0
        return 0;
562
0
      case Instruction::Br:
563
0
        return TTI->getCFInstrCost(Opcode);
564
0
      case Instruction::PHI:
565
0
        return 0;
566
19.9k
      case Instruction::Add:
567
19.9k
      case Instruction::FAdd:
568
19.9k
      case Instruction::Sub:
569
19.9k
      case Instruction::FSub:
570
19.9k
      case Instruction::Mul:
571
19.9k
      case Instruction::FMul:
572
19.9k
      case Instruction::UDiv:
573
19.9k
      case Instruction::SDiv:
574
19.9k
      case Instruction::FDiv:
575
19.9k
      case Instruction::URem:
576
19.9k
      case Instruction::SRem:
577
19.9k
      case Instruction::FRem:
578
19.9k
      case Instruction::Shl:
579
19.9k
      case Instruction::LShr:
580
19.9k
      case Instruction::AShr:
581
19.9k
      case Instruction::And:
582
19.9k
      case Instruction::Or:
583
19.9k
      case Instruction::Xor:
584
19.9k
        return TTI->getArithmeticInstrCost(Opcode, T1, Op1VK, Op2VK);
585
3
      case Instruction::Select:
586
3
      case Instruction::ICmp:
587
3
      case Instruction::FCmp:
588
3
        return TTI->getCmpSelInstrCost(Opcode, T1, T2, I);
589
7.89k
      case Instruction::ZExt:
590
7.89k
      case Instruction::SExt:
591
7.89k
      case Instruction::FPToUI:
592
7.89k
      case Instruction::FPToSI:
593
7.89k
      case Instruction::FPExt:
594
7.89k
      case Instruction::PtrToInt:
595
7.89k
      case Instruction::IntToPtr:
596
7.89k
      case Instruction::SIToFP:
597
7.89k
      case Instruction::UIToFP:
598
7.89k
      case Instruction::Trunc:
599
7.89k
      case Instruction::FPTrunc:
600
7.89k
      case Instruction::BitCast:
601
7.89k
      case Instruction::ShuffleVector:
602
7.89k
        return TTI->getCastInstrCost(Opcode, T1, T2, I);
603
33.9k
      }
604
33.9k
605
6.18k
      return 1;
606
33.9k
    }
607
608
    // This determines the relative offset of two loads or stores, returning
609
    // true if the offset could be determined to be some constant value.
610
    // For example, if OffsetInElmts == 1, then J accesses the memory directly
611
    // after I; if OffsetInElmts == -1 then I accesses the memory
612
    // directly after J.
613
    bool getPairPtrInfo(Instruction *I, Instruction *J,
614
        Value *&IPtr, Value *&JPtr, unsigned &IAlignment, unsigned &JAlignment,
615
        unsigned &IAddressSpace, unsigned &JAddressSpace,
616
1.77k
        int64_t &OffsetInElmts, bool ComputeOffset = true) {
617
1.77k
      OffsetInElmts = 0;
618
1.77k
      if (LoadInst *
LI1.77k
= dyn_cast<LoadInst>(I))
{960
619
960
        LoadInst *LJ = cast<LoadInst>(J);
620
960
        IPtr = LI->getPointerOperand();
621
960
        JPtr = LJ->getPointerOperand();
622
960
        IAlignment = LI->getAlignment();
623
960
        JAlignment = LJ->getAlignment();
624
960
        IAddressSpace = LI->getPointerAddressSpace();
625
960
        JAddressSpace = LJ->getPointerAddressSpace();
626
818
      } else {
627
818
        StoreInst *SI = cast<StoreInst>(I), *SJ = cast<StoreInst>(J);
628
818
        IPtr = SI->getPointerOperand();
629
818
        JPtr = SJ->getPointerOperand();
630
818
        IAlignment = SI->getAlignment();
631
818
        JAlignment = SJ->getAlignment();
632
818
        IAddressSpace = SI->getPointerAddressSpace();
633
818
        JAddressSpace = SJ->getPointerAddressSpace();
634
818
      }
635
1.77k
636
1.77k
      if (!ComputeOffset)
637
104
        return true;
638
1.77k
639
1.67k
      const SCEV *IPtrSCEV = SE->getSCEV(IPtr);
640
1.67k
      const SCEV *JPtrSCEV = SE->getSCEV(JPtr);
641
1.67k
642
1.67k
      // If this is a trivial offset, then we'll get something like
643
1.67k
      // 1*sizeof(type). With target data, which we need anyway, this will get
644
1.67k
      // constant folded into a number.
645
1.67k
      const SCEV *OffsetSCEV = SE->getMinusSCEV(JPtrSCEV, IPtrSCEV);
646
1.67k
      if (const SCEVConstant *ConstOffSCEV =
647
1.15k
            dyn_cast<SCEVConstant>(OffsetSCEV)) {
648
1.15k
        ConstantInt *IntOff = ConstOffSCEV->getValue();
649
1.15k
        int64_t Offset = IntOff->getSExtValue();
650
1.15k
        const DataLayout &DL = I->getModule()->getDataLayout();
651
1.15k
        Type *VTy = IPtr->getType()->getPointerElementType();
652
1.15k
        int64_t VTyTSS = (int64_t)DL.getTypeStoreSize(VTy);
653
1.15k
654
1.15k
        Type *VTy2 = JPtr->getType()->getPointerElementType();
655
1.15k
        if (
VTy != VTy2 && 1.15k
Offset < 00
)
{0
656
0
          int64_t VTy2TSS = (int64_t)DL.getTypeStoreSize(VTy2);
657
0
          OffsetInElmts = Offset/VTy2TSS;
658
0
          return (std::abs(Offset) % VTy2TSS) == 0;
659
0
        }
660
1.15k
661
1.15k
        OffsetInElmts = Offset/VTyTSS;
662
1.15k
        return (std::abs(Offset) % VTyTSS) == 0;
663
1.15k
      }
664
1.67k
665
516
      return false;
666
1.67k
    }
667
668
    // Returns true if the provided CallInst represents an intrinsic that can
669
    // be vectorized.
670
154
    bool isVectorizableIntrinsic(CallInst* I) {
671
154
      Function *F = I->getCalledFunction();
672
154
      if (
!F154
)
return false14
;
673
154
674
140
      Intrinsic::ID IID = F->getIntrinsicID();
675
140
      if (
!IID140
)
return false31
;
676
140
677
109
      switch(IID) {
678
27
      default:
679
27
        return false;
680
48
      case Intrinsic::sqrt:
681
48
      case Intrinsic::powi:
682
48
      case Intrinsic::sin:
683
48
      case Intrinsic::cos:
684
48
      case Intrinsic::log:
685
48
      case Intrinsic::log2:
686
48
      case Intrinsic::log10:
687
48
      case Intrinsic::exp:
688
48
      case Intrinsic::exp2:
689
48
      case Intrinsic::pow:
690
48
      case Intrinsic::round:
691
48
      case Intrinsic::copysign:
692
48
      case Intrinsic::ceil:
693
48
      case Intrinsic::nearbyint:
694
48
      case Intrinsic::rint:
695
48
      case Intrinsic::trunc:
696
48
      case Intrinsic::floor:
697
48
      case Intrinsic::fabs:
698
48
      case Intrinsic::minnum:
699
48
      case Intrinsic::maxnum:
700
48
        return Config.VectorizeMath;
701
20
      case Intrinsic::bswap:
702
20
      case Intrinsic::ctpop:
703
20
      case Intrinsic::ctlz:
704
20
      case Intrinsic::cttz:
705
20
        return Config.VectorizeBitManipulations;
706
14
      case Intrinsic::fma:
707
14
      case Intrinsic::fmuladd:
708
14
        return Config.VectorizeFMA;
709
109
      }
710
109
    }
711
712
2.03k
    bool isPureIEChain(InsertElementInst *IE) {
713
2.03k
      InsertElementInst *IENext = IE;
714
3.28k
      do {
715
3.28k
        if (!isa<UndefValue>(IENext->getOperand(0)) &&
716
1.33k
            
!isa<InsertElementInst>(IENext->getOperand(0))1.33k
)
{80
717
80
          return false;
718
80
        }
719
3.20k
      } while ((IENext =
720
3.20k
                 dyn_cast<InsertElementInst>(IENext->getOperand(0))));
721
2.03k
722
1.95k
      return true;
723
2.03k
    }
724
  };
725
726
  // This function implements one vectorization iteration on the provided
727
  // basic block. It returns true if the block is changed.
728
233
  bool BBVectorize::vectorizePairs(BasicBlock &BB, bool NonPow2Len) {
729
233
    bool ShouldContinue;
730
233
    BasicBlock::iterator Start = BB.getFirstInsertionPt();
731
233
732
233
    std::vector<Value *> AllPairableInsts;
733
233
    DenseMap<Value *, Value *> AllChosenPairs;
734
233
    DenseSet<ValuePair> AllFixedOrderPairs;
735
233
    DenseMap<VPPair, unsigned> AllPairConnectionTypes;
736
233
    DenseMap<ValuePair, std::vector<ValuePair> > AllConnectedPairs,
737
233
                                                 AllConnectedPairDeps;
738
233
739
234
    do {
740
234
      std::vector<Value *> PairableInsts;
741
234
      DenseMap<Value *, std::vector<Value *> > CandidatePairs;
742
234
      DenseSet<ValuePair> FixedOrderPairs;
743
234
      DenseMap<ValuePair, int> CandidatePairCostSavings;
744
234
      ShouldContinue = getCandidatePairs(BB, Start, CandidatePairs,
745
234
                                         FixedOrderPairs,
746
234
                                         CandidatePairCostSavings,
747
234
                                         PairableInsts, NonPow2Len);
748
234
      if (
PairableInsts.empty()234
)
continue59
;
749
234
750
234
      // Build the candidate pair set for faster lookups.
751
175
      DenseSet<ValuePair> CandidatePairsSet;
752
175
      for (DenseMap<Value *, std::vector<Value *> >::iterator I =
753
1.52k
           CandidatePairs.begin(), E = CandidatePairs.end(); 
I != E1.52k
;
++I1.35k
)
754
1.35k
        for (std::vector<Value *>::iterator J = I->second.begin(),
755
9.37k
             JE = I->second.end(); 
J != JE9.37k
;
++J8.02k
)
756
8.02k
          CandidatePairsSet.insert(ValuePair(I->first, *J));
757
175
758
175
      // Now we have a map of all of the pairable instructions and we need to
759
175
      // select the best possible pairing. A good pairing is one such that the
760
175
      // users of the pair are also paired. This defines a (directed) forest
761
175
      // over the pairs such that two pairs are connected iff the second pair
762
175
      // uses the first.
763
175
764
175
      // Note that it only matters that both members of the second pair use some
765
175
      // element of the first pair (to allow for splatting).
766
175
767
175
      DenseMap<ValuePair, std::vector<ValuePair> > ConnectedPairs,
768
175
                                                   ConnectedPairDeps;
769
175
      DenseMap<VPPair, unsigned> PairConnectionTypes;
770
175
      computeConnectedPairs(CandidatePairs, CandidatePairsSet,
771
175
                            PairableInsts, ConnectedPairs, PairConnectionTypes);
772
175
      if (
ConnectedPairs.empty()175
)
continue88
;
773
175
774
87
      for (DenseMap<ValuePair, std::vector<ValuePair> >::iterator
775
87
           I = ConnectedPairs.begin(), IE = ConnectedPairs.end();
776
3.58k
           
I != IE3.58k
;
++I3.49k
)
777
3.49k
        for (std::vector<ValuePair>::iterator J = I->second.begin(),
778
8.57k
             JE = I->second.end(); 
J != JE8.57k
;
++J5.07k
)
779
5.07k
          ConnectedPairDeps[*J].push_back(I->first);
780
87
781
87
      // Build the pairable-instruction dependency map
782
87
      DenseSet<ValuePair> PairableInstUsers;
783
87
      buildDepMap(BB, CandidatePairs, PairableInsts, PairableInstUsers);
784
87
785
87
      // There is now a graph of the connected pairs. For each variable, pick
786
87
      // the pairing with the largest dag meeting the depth requirement on at
787
87
      // least one branch. Then select all pairings that are part of that dag
788
87
      // and remove them from the list of available pairings and pairable
789
87
      // variables.
790
87
791
87
      DenseMap<Value *, Value *> ChosenPairs;
792
87
      choosePairs(CandidatePairs, CandidatePairsSet,
793
87
        CandidatePairCostSavings,
794
87
        PairableInsts, FixedOrderPairs, PairConnectionTypes,
795
87
        ConnectedPairs, ConnectedPairDeps,
796
87
        PairableInstUsers, ChosenPairs);
797
87
798
87
      if (
ChosenPairs.empty()87
)
continue17
;
799
70
      AllPairableInsts.insert(AllPairableInsts.end(), PairableInsts.begin(),
800
70
                              PairableInsts.end());
801
70
      AllChosenPairs.insert(ChosenPairs.begin(), ChosenPairs.end());
802
70
803
70
      // Only for the chosen pairs, propagate information on fixed-order pairs,
804
70
      // pair connections, and their types to the data structures used by the
805
70
      // pair fusion procedures.
806
70
      for (DenseMap<Value *, Value *>::iterator I = ChosenPairs.begin(),
807
457
           IE = ChosenPairs.end(); 
I != IE457
;
++I387
)
{387
808
387
        if (FixedOrderPairs.count(*I))
809
96
          AllFixedOrderPairs.insert(*I);
810
291
        else 
if (291
FixedOrderPairs.count(ValuePair(I->second, I->first))291
)
811
8
          AllFixedOrderPairs.insert(ValuePair(I->second, I->first));
812
387
813
387
        for (DenseMap<Value *, Value *>::iterator J = ChosenPairs.begin();
814
6.10k
             
J != IE6.10k
;
++J5.71k
)
{5.71k
815
5.71k
          DenseMap<VPPair, unsigned>::iterator K =
816
5.71k
            PairConnectionTypes.find(VPPair(*I, *J));
817
5.71k
          if (
K != PairConnectionTypes.end()5.71k
)
{316
818
316
            AllPairConnectionTypes.insert(*K);
819
5.39k
          } else {
820
5.39k
            K = PairConnectionTypes.find(VPPair(*J, *I));
821
5.39k
            if (K != PairConnectionTypes.end())
822
316
              AllPairConnectionTypes.insert(*K);
823
5.39k
          }
824
5.71k
        }
825
387
      }
826
70
827
70
      for (DenseMap<ValuePair, std::vector<ValuePair> >::iterator
828
70
           I = ConnectedPairs.begin(), IE = ConnectedPairs.end();
829
3.01k
           
I != IE3.01k
;
++I2.94k
)
830
2.94k
        for (std::vector<ValuePair>::iterator J = I->second.begin(),
831
7.42k
          JE = I->second.end(); 
J != JE7.42k
;
++J4.47k
)
832
4.47k
          
if (4.47k
AllPairConnectionTypes.count(VPPair(I->first, *J))4.47k
)
{391
833
391
            AllConnectedPairs[I->first].push_back(*J);
834
391
            AllConnectedPairDeps[*J].push_back(I->first);
835
391
          }
836
234
    } while (ShouldContinue);
837
233
838
233
    if (
AllChosenPairs.empty()233
)
return false163
;
839
70
    NumFusedOps += AllChosenPairs.size();
840
70
841
70
    // A set of pairs has now been selected. It is now necessary to replace the
842
70
    // paired instructions with vector instructions. For this procedure each
843
70
    // operand must be replaced with a vector operand. This vector is formed
844
70
    // by using build_vector on the old operands. The replaced values are then
845
70
    // replaced with a vector_extract on the result.  Subsequent optimization
846
70
    // passes should coalesce the build/extract combinations.
847
70
848
70
    fuseChosenPairs(BB, AllPairableInsts, AllChosenPairs, AllFixedOrderPairs,
849
70
                    AllPairConnectionTypes,
850
70
                    AllConnectedPairs, AllConnectedPairDeps);
851
70
852
70
    // It is important to cleanup here so that future iterations of this
853
70
    // function have less work to do.
854
70
    (void)SimplifyInstructionsInBlock(&BB, TLI);
855
70
    return true;
856
233
  }
857
858
  // This function returns true if the provided instruction is capable of being
859
  // fused into a vector instruction. This determination is based only on the
860
  // type and other attributes of the instruction.
861
  bool BBVectorize::isInstVectorizable(Instruction *I,
862
5.33k
                                         bool &IsSimpleLoadStore) {
863
5.33k
    IsSimpleLoadStore = false;
864
5.33k
865
5.33k
    if (CallInst *
C5.33k
= dyn_cast<CallInst>(I))
{154
866
154
      if (!isVectorizableIntrinsic(C))
867
72
        return false;
868
5.18k
    } else 
if (LoadInst *5.18k
L5.18k
= dyn_cast<LoadInst>(I))
{436
869
436
      // Vectorize simple loads if possbile:
870
436
      IsSimpleLoadStore = L->isSimple();
871
436
      if (
!IsSimpleLoadStore || 436
!Config.VectorizeMemOps436
)
872
0
        return false;
873
4.74k
    } else 
if (StoreInst *4.74k
S4.74k
= dyn_cast<StoreInst>(I))
{282
874
282
      // Vectorize simple stores if possbile:
875
282
      IsSimpleLoadStore = S->isSimple();
876
282
      if (
!IsSimpleLoadStore || 282
!Config.VectorizeMemOps282
)
877
0
        return false;
878
4.46k
    } else 
if (CastInst *4.46k
C4.46k
= dyn_cast<CastInst>(I))
{544
879
544
      // We can vectorize casts, but not casts of pointer types, etc.
880
544
      if (!Config.VectorizeCasts)
881
0
        return false;
882
544
883
544
      Type *SrcTy = C->getSrcTy();
884
544
      if (!SrcTy->isSingleValueType())
885
0
        return false;
886
544
887
544
      Type *DestTy = C->getDestTy();
888
544
      if (!DestTy->isSingleValueType())
889
0
        return false;
890
3.92k
    } else 
if (SelectInst *3.92k
SI3.92k
= dyn_cast<SelectInst>(I))
{26
891
26
      if (!Config.VectorizeSelect)
892
0
        return false;
893
26
      // We can vectorize a select if either all operands are scalars,
894
26
      // or all operands are vectors. Trying to "widen" a select between
895
26
      // vectors that has a scalar condition results in a malformed select.
896
26
      // FIXME: We could probably be smarter about this by rewriting the select
897
26
      // with different types instead.
898
26
      return (SI->getCondition()->getType()->isVectorTy() ==
899
26
              SI->getTrueValue()->getType()->isVectorTy());
900
3.89k
    } else 
if (3.89k
isa<CmpInst>(I)3.89k
)
{31
901
31
      if (!Config.VectorizeCmp)
902
0
        return false;
903
3.86k
    } else 
if (GetElementPtrInst *3.86k
G3.86k
= dyn_cast<GetElementPtrInst>(I))
{503
904
503
      if (!Config.VectorizeGEP)
905
0
        return false;
906
503
907
503
      // Currently, vector GEPs exist only with one index.
908
503
      
if (503
G->getNumIndices() != 1503
)
909
242
        return false;
910
3.36k
    } else 
if (3.36k
!(I->isBinaryOp() || 3.36k
isa<ShuffleVectorInst>(I)1.57k
||
911
1.42k
        
isa<ExtractElementInst>(I)1.42k
||
isa<InsertElementInst>(I)1.17k
))
{268
912
268
      return false;
913
268
    }
914
5.33k
915
4.72k
    Type *T1, *T2;
916
4.72k
    getInstructionTypes(I, T1, T2);
917
4.72k
918
4.72k
    // Not every type can be vectorized...
919
4.72k
    if (
!(VectorType::isValidElementType(T1) || 4.72k
T1->isVectorTy()1.83k
) ||
920
4.72k
        
!(VectorType::isValidElementType(T2) || 4.72k
T2->isVectorTy()1.83k
))
921
0
      return false;
922
4.72k
923
4.72k
    
if (4.72k
T1->getScalarSizeInBits() == 14.72k
)
{48
924
48
      if (!Config.VectorizeBools)
925
14
        return false;
926
4.68k
    } else {
927
4.68k
      if (
!Config.VectorizeInts && 4.68k
T1->isIntOrIntVectorTy()0
)
928
0
        return false;
929
4.68k
    }
930
4.72k
931
4.71k
    
if (4.71k
T2->getScalarSizeInBits() == 14.71k
)
{12
932
12
      if (!Config.VectorizeBools)
933
0
        return false;
934
4.70k
    } else {
935
4.70k
      if (
!Config.VectorizeInts && 4.70k
T2->isIntOrIntVectorTy()0
)
936
0
        return false;
937
4.70k
    }
938
4.71k
939
4.71k
    
if (4.71k
!Config.VectorizeFloats4.71k
940
0
        && 
(T1->isFPOrFPVectorTy() || 0
T2->isFPOrFPVectorTy()0
))
941
0
      return false;
942
4.71k
943
4.71k
    // Don't vectorize target-specific types.
944
4.71k
    
if (4.71k
T1->isX86_FP80Ty() || 4.71k
T1->isPPC_FP128Ty()4.71k
||
T1->isX86_MMXTy()4.70k
)
945
7
      return false;
946
4.70k
    
if (4.70k
T2->isX86_FP80Ty() || 4.70k
T2->isPPC_FP128Ty()4.70k
||
T2->isX86_MMXTy()4.70k
)
947
0
      return false;
948
4.70k
949
4.70k
    
if (4.70k
!Config.VectorizePointers && 4.70k
(T1->getScalarType()->isPointerTy() ||4.70k
950
4.13k
                                      T2->getScalarType()->isPointerTy()))
951
574
      return false;
952
4.70k
953
4.13k
    
if (4.13k
!TTI && 4.13k
(T1->getPrimitiveSizeInBits() >= Config.VectorBits ||1.82k
954
931
                 T2->getPrimitiveSizeInBits() >= Config.VectorBits))
955
908
      return false;
956
4.13k
957
3.22k
    return true;
958
4.13k
  }
959
960
  // This function returns true if the two provided instructions are compatible
961
  // (meaning that they can be fused into a vector instruction). This assumes
962
  // that I has already been determined to be vectorizable and that J is not
963
  // in the use dag of I.
964
  bool BBVectorize::areInstsCompatible(Instruction *I, Instruction *J,
965
                       bool IsSimpleLoadStore, bool NonPow2Len,
966
93.9k
                       int &CostSavings, int &FixedOrder) {
967
93.9k
    DEBUG(if (DebugInstructionExamination) dbgs() << "BBV: looking at " << *I <<
968
93.9k
                     " <-> " << *J << "\n");
969
93.9k
970
93.9k
    CostSavings = 0;
971
93.9k
    FixedOrder = 0;
972
93.9k
973
93.9k
    // Loads and stores can be merged if they have different alignments,
974
93.9k
    // but are otherwise the same.
975
93.9k
    if (!J->isSameOperationAs(I, Instruction::CompareIgnoringAlignment |
976
76.9k
                      (NonPow2Len ? 
Instruction::CompareUsingScalarTypes17.0k
:
076.9k
)))
977
81.5k
      return false;
978
93.9k
979
12.4k
    Type *IT1, *IT2, *JT1, *JT2;
980
12.4k
    getInstructionTypes(I, IT1, IT2);
981
12.4k
    getInstructionTypes(J, JT1, JT2);
982
12.4k
    unsigned MaxTypeBits = std::max(
983
12.4k
      IT1->getPrimitiveSizeInBits() + JT1->getPrimitiveSizeInBits(),
984
12.4k
      IT2->getPrimitiveSizeInBits() + JT2->getPrimitiveSizeInBits());
985
12.4k
    if (
!TTI && 12.4k
MaxTypeBits > Config.VectorBits1.50k
)
986
83
      return false;
987
12.4k
988
12.4k
    // FIXME: handle addsub-type operations!
989
12.4k
990
12.3k
    
if (12.3k
IsSimpleLoadStore12.3k
)
{1.67k
991
1.67k
      Value *IPtr, *JPtr;
992
1.67k
      unsigned IAlignment, JAlignment, IAddressSpace, JAddressSpace;
993
1.67k
      int64_t OffsetInElmts = 0;
994
1.67k
      if (getPairPtrInfo(I, J, IPtr, JPtr, IAlignment, JAlignment,
995
1.67k
                         IAddressSpace, JAddressSpace, OffsetInElmts) &&
996
1.15k
          
std::abs(OffsetInElmts) == 11.15k
)
{239
997
239
        FixedOrder = (int) OffsetInElmts;
998
239
        unsigned BottomAlignment = IAlignment;
999
239
        if (
OffsetInElmts < 0239
)
BottomAlignment = JAlignment19
;
1000
239
1001
239
        Type *aTypeI = isa<StoreInst>(I) ?
1002
141
          
cast<StoreInst>(I)->getValueOperand()->getType()141
:
I->getType()98
;
1003
239
        Type *aTypeJ = isa<StoreInst>(J) ?
1004
141
          
cast<StoreInst>(J)->getValueOperand()->getType()141
:
J->getType()98
;
1005
239
        Type *VType = getVecTypeForPair(aTypeI, aTypeJ);
1006
239
1007
239
        if (
Config.AlignedOnly239
)
{16
1008
16
          // An aligned load or store is possible only if the instruction
1009
16
          // with the lower offset has an alignment suitable for the
1010
16
          // vector type.
1011
16
          const DataLayout &DL = I->getModule()->getDataLayout();
1012
16
          unsigned VecAlignment = DL.getPrefTypeAlignment(VType);
1013
16
          if (BottomAlignment < VecAlignment)
1014
15
            return false;
1015
16
        }
1016
239
1017
224
        
if (224
TTI224
)
{205
1018
205
          unsigned ICost = TTI->getMemoryOpCost(I->getOpcode(), aTypeI,
1019
205
                                                IAlignment, IAddressSpace);
1020
205
          unsigned JCost = TTI->getMemoryOpCost(J->getOpcode(), aTypeJ,
1021
205
                                                JAlignment, JAddressSpace);
1022
205
          unsigned VCost = TTI->getMemoryOpCost(I->getOpcode(), VType,
1023
205
                                                BottomAlignment,
1024
205
                                                IAddressSpace);
1025
205
1026
205
          ICost += TTI->getAddressComputationCost(aTypeI);
1027
205
          JCost += TTI->getAddressComputationCost(aTypeJ);
1028
205
          VCost += TTI->getAddressComputationCost(VType);
1029
205
1030
205
          if (VCost > ICost + JCost)
1031
0
            return false;
1032
205
1033
205
          // We don't want to fuse to a type that will be split, even
1034
205
          // if the two input types will also be split and there is no other
1035
205
          // associated cost.
1036
205
          unsigned VParts = TTI->getNumberOfParts(VType);
1037
205
          if (VParts > 1)
1038
38
            return false;
1039
167
          else 
if (167
!VParts && 167
VCost == ICost + JCost15
)
1040
0
            return false;
1041
205
1042
167
          CostSavings = ICost + JCost - VCost;
1043
167
        }
1044
1.43k
      } else {
1045
1.43k
        return false;
1046
1.43k
      }
1047
10.6k
    } else 
if (10.6k
TTI10.6k
)
{9.34k
1048
9.34k
      TargetTransformInfo::OperandValueKind Op1VK =
1049
9.34k
          TargetTransformInfo::OK_AnyValue;
1050
9.34k
      TargetTransformInfo::OperandValueKind Op2VK =
1051
9.34k
          TargetTransformInfo::OK_AnyValue;
1052
9.34k
      unsigned ICost = getInstrCost(I->getOpcode(), IT1, IT2, Op1VK, Op2VK, I);
1053
9.34k
      unsigned JCost = getInstrCost(J->getOpcode(), JT1, JT2, Op1VK, Op2VK, J);
1054
9.34k
      Type *VT1 = getVecTypeForPair(IT1, JT1),
1055
9.34k
           *VT2 = getVecTypeForPair(IT2, JT2);
1056
9.34k
1057
9.34k
      // On some targets (example X86) the cost of a vector shift may vary
1058
9.34k
      // depending on whether the second operand is a Uniform or
1059
9.34k
      // NonUniform Constant.
1060
9.34k
      switch (I->getOpcode()) {
1061
8.99k
      default : break;
1062
350
      case Instruction::Shl:
1063
350
      case Instruction::LShr:
1064
350
      case Instruction::AShr:
1065
350
1066
350
        // If both I and J are scalar shifts by constant, then the
1067
350
        // merged vector shift count would be either a constant splat value
1068
350
        // or a non-uniform vector of constants.
1069
350
        if (ConstantInt *
CII350
= dyn_cast<ConstantInt>(I->getOperand(1)))
{345
1070
345
          if (ConstantInt *CIJ = dyn_cast<ConstantInt>(J->getOperand(1)))
1071
343
            
Op2VK = CII == CIJ ? 343
TargetTransformInfo::OK_UniformConstantValue41
:
1072
302
                               TargetTransformInfo::OK_NonUniformConstantValue;
1073
5
        } else {
1074
5
          // Check for a splat of a constant or for a non uniform vector
1075
5
          // of constants.
1076
5
          Value *IOp = I->getOperand(1);
1077
5
          Value *JOp = J->getOperand(1);
1078
5
          if (
(isa<ConstantVector>(IOp) || 5
isa<ConstantDataVector>(IOp)5
) &&
1079
5
              
(isa<ConstantVector>(JOp) || 5
isa<ConstantDataVector>(JOp)5
))
{0
1080
0
            Op2VK = TargetTransformInfo::OK_NonUniformConstantValue;
1081
0
            Constant *SplatValue = cast<Constant>(IOp)->getSplatValue();
1082
0
            if (SplatValue != nullptr &&
1083
0
                SplatValue == cast<Constant>(JOp)->getSplatValue())
1084
0
              Op2VK = TargetTransformInfo::OK_UniformConstantValue;
1085
0
          }
1086
5
        }
1087
9.34k
      }
1088
9.34k
1089
9.34k
      // Note that this procedure is incorrect for insert and extract element
1090
9.34k
      // instructions (because combining these often results in a shuffle),
1091
9.34k
      // but this cost is ignored (because insert and extract element
1092
9.34k
      // instructions are assigned a zero depth factor and are not really
1093
9.34k
      // fused in general).
1094
9.34k
      unsigned VCost = getInstrCost(I->getOpcode(), VT1, VT2, Op1VK, Op2VK, I);
1095
9.34k
1096
9.34k
      if (VCost > ICost + JCost)
1097
390
        return false;
1098
9.34k
1099
9.34k
      // We don't want to fuse to a type that will be split, even
1100
9.34k
      // if the two input types will also be split and there is no other
1101
9.34k
      // associated cost.
1102
8.95k
      unsigned VParts1 = TTI->getNumberOfParts(VT1),
1103
8.95k
               VParts2 = TTI->getNumberOfParts(VT2);
1104
8.95k
      if (
VParts1 > 1 || 8.95k
VParts2 > 16.51k
)
1105
2.43k
        return false;
1106
6.51k
      else 
if (6.51k
(!VParts1 || 6.51k
!VParts26.50k
) &&
VCost == ICost + JCost11
)
1107
0
        return false;
1108
8.95k
1109
6.51k
      CostSavings = ICost + JCost - VCost;
1110
6.51k
    }
1111
12.3k
1112
12.3k
    // The powi,ctlz,cttz intrinsics are special because only the first
1113
12.3k
    // argument is vectorized, the second arguments must be equal.
1114
8.03k
    CallInst *CI = dyn_cast<CallInst>(I);
1115
8.03k
    Function *FI;
1116
8.03k
    if (
CI && 8.03k
(FI = CI->getCalledFunction())24
)
{24
1117
24
      Intrinsic::ID IID = FI->getIntrinsicID();
1118
24
      if (
IID == Intrinsic::powi || 24
IID == Intrinsic::ctlz20
||
1119
18
          
IID == Intrinsic::cttz18
)
{8
1120
8
        Value *A1I = CI->getArgOperand(1),
1121
8
              *A1J = cast<CallInst>(J)->getArgOperand(1);
1122
8
        const SCEV *A1ISCEV = SE->getSCEV(A1I),
1123
8
                   *A1JSCEV = SE->getSCEV(A1J);
1124
8
        return (A1ISCEV == A1JSCEV);
1125
8
      }
1126
24
1127
16
      
if (16
IID && 16
TTI16
)
{3
1128
3
        FastMathFlags FMFCI;
1129
3
        if (auto *FPMOCI = dyn_cast<FPMathOperator>(CI))
1130
3
          FMFCI = FPMOCI->getFastMathFlags();
1131
3
        SmallVector<Value *, 4> IArgs(CI->arg_operands());
1132
3
        unsigned ICost = TTI->getIntrinsicInstrCost(IID, IT1, IArgs, FMFCI);
1133
3
1134
3
        CallInst *CJ = cast<CallInst>(J);
1135
3
1136
3
        FastMathFlags FMFCJ;
1137
3
        if (auto *FPMOCJ = dyn_cast<FPMathOperator>(CJ))
1138
3
          FMFCJ = FPMOCJ->getFastMathFlags();
1139
3
1140
3
        SmallVector<Value *, 4> JArgs(CJ->arg_operands());
1141
3
        unsigned JCost = TTI->getIntrinsicInstrCost(IID, JT1, JArgs, FMFCJ);
1142
3
1143
3
        assert(CI->getNumArgOperands() == CJ->getNumArgOperands() &&
1144
3
               "Intrinsic argument counts differ");
1145
3
        SmallVector<Type*, 4> Tys;
1146
3
        SmallVector<Value *, 4> VecArgs;
1147
10
        for (unsigned i = 0, ie = CI->getNumArgOperands(); 
i != ie10
;
++i7
)
{7
1148
7
          if (
(IID == Intrinsic::powi || 7
IID == Intrinsic::ctlz7
||
1149
7
               
IID == Intrinsic::cttz7
) &&
i == 10
)
{0
1150
0
            Tys.push_back(CI->getArgOperand(i)->getType());
1151
0
            VecArgs.push_back(CI->getArgOperand(i));
1152
0
          }
1153
7
          else {
1154
7
            Tys.push_back(getVecTypeForPair(CI->getArgOperand(i)->getType(),
1155
7
                                            CJ->getArgOperand(i)->getType()));
1156
7
            // Add both operands, and then count their scalarization overhead
1157
7
            // with VF 1.
1158
7
            VecArgs.push_back(CI->getArgOperand(i));
1159
7
            VecArgs.push_back(CJ->getArgOperand(i));
1160
7
          }
1161
7
        }
1162
3
1163
3
        // Compute the scalarization cost here with the original operands (to
1164
3
        // check for uniqueness etc), and then call getIntrinsicInstrCost()
1165
3
        // with the constructed vector types.
1166
3
        Type *RetTy = getVecTypeForPair(IT1, JT1);
1167
3
        unsigned ScalarizationCost = 0;
1168
3
        if (!RetTy->isVoidTy())
1169
3
          ScalarizationCost += TTI->getScalarizationOverhead(RetTy, true, false);
1170
3
        ScalarizationCost += TTI->getOperandsScalarizationOverhead(VecArgs, 1);
1171
3
1172
3
        FastMathFlags FMFV = FMFCI;
1173
3
        FMFV &= FMFCJ;
1174
3
        unsigned VCost = TTI->getIntrinsicInstrCost(IID, RetTy, Tys, FMFV,
1175
3
                                                    ScalarizationCost);
1176
3
1177
3
        if (VCost > ICost + JCost)
1178
2
          return false;
1179
3
1180
3
        // We don't want to fuse to a type that will be split, even
1181
3
        // if the two input types will also be split and there is no other
1182
3
        // associated cost.
1183
1
        unsigned RetParts = TTI->getNumberOfParts(RetTy);
1184
1
        if (RetParts > 1)
1185
0
          return false;
1186
1
        else 
if (1
!RetParts && 1
VCost == ICost + JCost0
)
1187
0
          return false;
1188
1
1189
4
        
for (unsigned i = 0, ie = CI->getNumArgOperands(); 1
i != ie4
;
++i3
)
{3
1190
3
          if (!Tys[i]->isVectorTy())
1191
0
            continue;
1192
3
1193
3
          unsigned NumParts = TTI->getNumberOfParts(Tys[i]);
1194
3
          if (NumParts > 1)
1195
0
            return false;
1196
3
          else 
if (3
!NumParts && 3
VCost == ICost + JCost0
)
1197
0
            return false;
1198
3
        }
1199
1
1200
1
        CostSavings = ICost + JCost - VCost;
1201
1
      }
1202
16
    }
1203
8.03k
1204
8.02k
    return true;
1205
8.03k
  }
1206
1207
  // Figure out whether or not J uses I and update the users and write-set
1208
  // structures associated with I. Specifically, Users represents the set of
1209
  // instructions that depend on I. WriteSet represents the set
1210
  // of memory locations that are dependent on I. If UpdateUsers is true,
1211
  // and J uses I, then Users is updated to contain J and WriteSet is updated
1212
  // to contain any memory locations to which J writes. The function returns
1213
  // true if J uses I. By default, alias analysis is used to determine
1214
  // whether J reads from memory that overlaps with a location in WriteSet.
1215
  // If LoadMoveSet is not null, then it is a previously-computed map
1216
  // where the key is the memory-based user instruction and the value is
1217
  // the instruction to be compared with I. So, if LoadMoveSet is provided,
1218
  // then the alias analysis is not used. This is necessary because this
1219
  // function is called during the process of moving instructions during
1220
  // vectorization and the results of the alias analysis are not stable during
1221
  // that process.
1222
  bool BBVectorize::trackUsesOfI(DenseSet<Value *> &Users,
1223
                       AliasSetTracker &WriteSet, Instruction *I,
1224
                       Instruction *J, bool UpdateUsers,
1225
221k
                       DenseSet<ValuePair> *LoadMoveSetPairs) {
1226
221k
    bool UsesI = false;
1227
221k
1228
221k
    // This instruction may already be marked as a user due, for example, to
1229
221k
    // being a member of a selected pair.
1230
221k
    if (Users.count(J))
1231
0
      UsesI = true;
1232
221k
1233
221k
    if (!UsesI)
1234
221k
      for (User::op_iterator JU = J->op_begin(), JE = J->op_end();
1235
583k
           
JU != JE583k
;
++JU362k
)
{403k
1236
403k
        Value *V = *JU;
1237
403k
        if (
I == V || 403k
Users.count(V)397k
)
{40.8k
1238
40.8k
          UsesI = true;
1239
40.8k
          break;
1240
40.8k
        }
1241
403k
      }
1242
221k
    if (
!UsesI && 221k
J->mayReadFromMemory()180k
)
{23.3k
1243
23.3k
      if (
LoadMoveSetPairs23.3k
)
{232
1244
232
        UsesI = LoadMoveSetPairs->count(ValuePair(J, I));
1245
23.1k
      } else {
1246
23.1k
        for (AliasSetTracker::iterator W = WriteSet.begin(),
1247
35.6k
             WE = WriteSet.end(); 
W != WE35.6k
;
++W12.4k
)
{20.2k
1248
20.2k
          if (
W->aliasesUnknownInst(J, *AA)20.2k
)
{7.71k
1249
7.71k
            UsesI = true;
1250
7.71k
            break;
1251
7.71k
          }
1252
20.2k
        }
1253
23.1k
      }
1254
23.3k
    }
1255
221k
1256
221k
    if (
UsesI && 221k
UpdateUsers48.5k
)
{48.5k
1257
48.5k
      if (
J->mayWriteToMemory()48.5k
)
WriteSet.add(J)7.89k
;
1258
48.5k
      Users.insert(J);
1259
48.5k
    }
1260
221k
1261
221k
    return UsesI;
1262
221k
  }
1263
1264
  // This function iterates over all instruction pairs in the provided
1265
  // basic block and collects all candidate pairs for vectorization.
1266
  bool BBVectorize::getCandidatePairs(BasicBlock &BB,
1267
                       BasicBlock::iterator &Start,
1268
                       DenseMap<Value *, std::vector<Value *> > &CandidatePairs,
1269
                       DenseSet<ValuePair> &FixedOrderPairs,
1270
                       DenseMap<ValuePair, int> &CandidatePairCostSavings,
1271
234
                       std::vector<Value *> &PairableInsts, bool NonPow2Len) {
1272
234
    size_t TotalPairs = 0;
1273
234
    BasicBlock::iterator E = BB.end();
1274
234
    if (
Start == E234
)
return false0
;
1275
234
1276
234
    bool ShouldContinue = false, IAfterStart = false;
1277
5.57k
    for (BasicBlock::iterator I = Start++; 
I != E5.57k
;
++I5.33k
)
{5.33k
1278
5.33k
      if (
I == Start5.33k
)
IAfterStart = true446
;
1279
5.33k
1280
5.33k
      bool IsSimpleLoadStore;
1281
5.33k
      if (!isInstVectorizable(&*I, IsSimpleLoadStore))
1282
2.09k
        continue;
1283
5.33k
1284
5.33k
      // Look for an instruction with which to pair instruction *I...
1285
3.24k
      DenseSet<Value *> Users;
1286
3.24k
      AliasSetTracker WriteSet(*AA);
1287
3.24k
      if (I->mayWriteToMemory())
1288
268
        WriteSet.add(&*I);
1289
3.24k
1290
3.24k
      bool JAfterStart = IAfterStart;
1291
3.24k
      BasicBlock::iterator J = std::next(I);
1292
132k
      for (unsigned ss = 0; 
J != E && 132k
ss <= Config.SearchLimit129k
;
++J, ++ss129k
)
{129k
1293
129k
        if (J == Start)
1294
2.68k
          JAfterStart = true;
1295
129k
1296
129k
        // Determine if J uses I, if so, exit the loop.
1297
129k
        bool UsesI = trackUsesOfI(Users, WriteSet, &*I, &*J, !Config.FastDep);
1298
129k
        if (
Config.FastDep129k
)
{0
1299
0
          // Note: For this heuristic to be effective, independent operations
1300
0
          // must tend to be intermixed. This is likely to be true from some
1301
0
          // kinds of grouped loop unrolling (but not the generic LLVM pass),
1302
0
          // but otherwise may require some kind of reordering pass.
1303
0
1304
0
          // When using fast dependency analysis,
1305
0
          // stop searching after first use:
1306
0
          if (
UsesI0
)
break0
;
1307
129k
        } else {
1308
129k
          if (
UsesI129k
)
continue35.1k
;
1309
129k
        }
1310
129k
1311
129k
        // J does not use I, and comes before the first use of I, so it can be
1312
129k
        // merged with I if the instructions are compatible.
1313
93.9k
        int CostSavings, FixedOrder;
1314
93.9k
        if (!areInstsCompatible(&*I, &*J, IsSimpleLoadStore, NonPow2Len,
1315
93.9k
                                CostSavings, FixedOrder))
1316
85.9k
          continue;
1317
93.9k
1318
93.9k
        // J is a candidate for merging with I.
1319
8.02k
        
if (8.02k
PairableInsts.empty() ||8.02k
1320
7.85k
            
PairableInsts[PairableInsts.size() - 1] != &*I7.85k
)
{1.35k
1321
1.35k
          PairableInsts.push_back(&*I);
1322
1.35k
        }
1323
8.02k
1324
8.02k
        CandidatePairs[&*I].push_back(&*J);
1325
8.02k
        ++TotalPairs;
1326
8.02k
        if (TTI)
1327
6.67k
          CandidatePairCostSavings.insert(
1328
6.67k
              ValuePairWithCost(ValuePair(&*I, &*J), CostSavings));
1329
8.02k
1330
8.02k
        if (FixedOrder == 1)
1331
168
          FixedOrderPairs.insert(ValuePair(&*I, &*J));
1332
7.85k
        else 
if (7.85k
FixedOrder == -17.85k
)
1333
18
          FixedOrderPairs.insert(ValuePair(&*J, &*I));
1334
8.02k
1335
8.02k
        // The next call to this function must start after the last instruction
1336
8.02k
        // selected during this invocation.
1337
8.02k
        if (
JAfterStart8.02k
)
{641
1338
641
          Start = std::next(J);
1339
641
          IAfterStart = JAfterStart = false;
1340
641
        }
1341
8.02k
1342
8.02k
        DEBUG(if (DebugCandidateSelection) dbgs() << "BBV: candidate pair "
1343
8.02k
                     << *I << " <-> " << *J << " (cost savings: " <<
1344
8.02k
                     CostSavings << ")\n");
1345
8.02k
1346
8.02k
        // If we have already found too many pairs, break here and this function
1347
8.02k
        // will be called again starting after the last instruction selected
1348
8.02k
        // during this invocation.
1349
8.02k
        if (PairableInsts.size() >= Config.MaxInsts ||
1350
8.02k
            
TotalPairs >= Config.MaxPairs8.02k
)
{1
1351
1
          ShouldContinue = true;
1352
1
          break;
1353
1
        }
1354
8.02k
      }
1355
3.24k
1356
3.24k
      if (ShouldContinue)
1357
1
        break;
1358
3.24k
    }
1359
234
1360
234
    DEBUG(dbgs() << "BBV: found " << PairableInsts.size()
1361
234
           << " instructions with candidate pairs\n");
1362
234
1363
234
    return ShouldContinue;
1364
234
  }
1365
1366
  // Finds candidate pairs connected to the pair P = <PI, PJ>. This means that
1367
  // it looks for pairs such that both members have an input which is an
1368
  // output of PI or PJ.
1369
  void BBVectorize::computePairsConnectedTo(
1370
                  DenseMap<Value *, std::vector<Value *> > &CandidatePairs,
1371
                  DenseSet<ValuePair> &CandidatePairsSet,
1372
                  std::vector<Value *> &PairableInsts,
1373
                  DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairs,
1374
                  DenseMap<VPPair, unsigned> &PairConnectionTypes,
1375
8.02k
                  ValuePair P) {
1376
8.02k
    StoreInst *SI, *SJ;
1377
8.02k
1378
8.02k
    // For each possible pairing for this variable, look at the uses of
1379
8.02k
    // the first value...
1380
8.02k
    for (Value::user_iterator I = P.first->user_begin(),
1381
8.02k
                              E = P.first->user_end();
1382
16.5k
         
I != E16.5k
;
++I8.55k
)
{8.55k
1383
8.55k
      User *UI = *I;
1384
8.55k
      if (
isa<LoadInst>(UI)8.55k
)
{0
1385
0
        // A pair cannot be connected to a load because the load only takes one
1386
0
        // operand (the address) and it is a scalar even after vectorization.
1387
0
        continue;
1388
8.55k
      } else 
if (8.55k
(SI = dyn_cast<StoreInst>(UI)) &&8.55k
1389
638
                 
P.first == SI->getPointerOperand()638
)
{0
1390
0
        // Similarly, a pair cannot be connected to a store through its
1391
0
        // pointer operand.
1392
0
        continue;
1393
0
      }
1394
8.55k
1395
8.55k
      // For each use of the first variable, look for uses of the second
1396
8.55k
      // variable...
1397
9.65k
      
for (User *UJ : P.second->users()) 8.55k
{9.65k
1398
9.65k
        if ((SJ = dyn_cast<StoreInst>(UJ)) &&
1399
693
            P.second == SJ->getPointerOperand())
1400
0
          continue;
1401
9.65k
1402
9.65k
        // Look for <I, J>:
1403
9.65k
        
if (9.65k
CandidatePairsSet.count(ValuePair(UI, UJ))9.65k
)
{3.91k
1404
3.91k
          VPPair VP(P, ValuePair(UI, UJ));
1405
3.91k
          ConnectedPairs[VP.first].push_back(VP.second);
1406
3.91k
          PairConnectionTypes.insert(VPPairWithType(VP, PairConnectionDirect));
1407
3.91k
        }
1408
9.65k
1409
9.65k
        // Look for <J, I>:
1410
9.65k
        if (
CandidatePairsSet.count(ValuePair(UJ, UI))9.65k
)
{203
1411
203
          VPPair VP(P, ValuePair(UJ, UI));
1412
203
          ConnectedPairs[VP.first].push_back(VP.second);
1413
203
          PairConnectionTypes.insert(VPPairWithType(VP, PairConnectionSwap));
1414
203
        }
1415
9.65k
      }
1416
8.55k
1417
8.55k
      if (
Config.SplatBreaksChain8.55k
)
continue0
;
1418
8.55k
      // Look for cases where just the first value in the pair is used by
1419
8.55k
      // both members of another pair (splatting).
1420
20.1k
      
for (Value::user_iterator J = P.first->user_begin(); 8.55k
J != E20.1k
;
++J11.6k
)
{11.6k
1421
11.6k
        User *UJ = *J;
1422
11.6k
        if ((SJ = dyn_cast<StoreInst>(UJ)) &&
1423
673
            P.first == SJ->getPointerOperand())
1424
0
          continue;
1425
11.6k
1426
11.6k
        
if (11.6k
CandidatePairsSet.count(ValuePair(UI, UJ))11.6k
)
{659
1427
659
          VPPair VP(P, ValuePair(UI, UJ));
1428
659
          ConnectedPairs[VP.first].push_back(VP.second);
1429
659
          PairConnectionTypes.insert(VPPairWithType(VP, PairConnectionSplat));
1430
659
        }
1431
11.6k
      }
1432
8.55k
    }
1433
8.02k
1434
8.02k
    if (
Config.SplatBreaksChain8.02k
)
return0
;
1435
8.02k
    // Look for cases where just the second value in the pair is used by
1436
8.02k
    // both members of another pair (splatting).
1437
8.02k
    for (Value::user_iterator I = P.second->user_begin(),
1438
8.02k
                              E = P.second->user_end();
1439
16.2k
         
I != E16.2k
;
++I8.19k
)
{8.19k
1440
8.19k
      User *UI = *I;
1441
8.19k
      if (isa<LoadInst>(UI))
1442
0
        continue;
1443
8.19k
      else 
if (8.19k
(SI = dyn_cast<StoreInst>(UI)) &&8.19k
1444
682
               P.second == SI->getPointerOperand())
1445
0
        continue;
1446
8.19k
1447
18.0k
      
for (Value::user_iterator J = P.second->user_begin(); 8.19k
J != E18.0k
;
++J9.88k
)
{9.88k
1448
9.88k
        User *UJ = *J;
1449
9.88k
        if ((SJ = dyn_cast<StoreInst>(UJ)) &&
1450
691
            P.second == SJ->getPointerOperand())
1451
0
          continue;
1452
9.88k
1453
9.88k
        
if (9.88k
CandidatePairsSet.count(ValuePair(UI, UJ))9.88k
)
{299
1454
299
          VPPair VP(P, ValuePair(UI, UJ));
1455
299
          ConnectedPairs[VP.first].push_back(VP.second);
1456
299
          PairConnectionTypes.insert(VPPairWithType(VP, PairConnectionSplat));
1457
299
        }
1458
9.88k
      }
1459
8.19k
    }
1460
8.02k
  }
1461
1462
  // This function figures out which pairs are connected.  Two pairs are
1463
  // connected if some output of the first pair forms an input to both members
1464
  // of the second pair.
1465
  void BBVectorize::computeConnectedPairs(
1466
                  DenseMap<Value *, std::vector<Value *> > &CandidatePairs,
1467
                  DenseSet<ValuePair> &CandidatePairsSet,
1468
                  std::vector<Value *> &PairableInsts,
1469
                  DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairs,
1470
175
                  DenseMap<VPPair, unsigned> &PairConnectionTypes) {
1471
175
    for (std::vector<Value *>::iterator PI = PairableInsts.begin(),
1472
1.52k
         PE = PairableInsts.end(); 
PI != PE1.52k
;
++PI1.35k
)
{1.35k
1473
1.35k
      DenseMap<Value *, std::vector<Value *> >::iterator PP =
1474
1.35k
        CandidatePairs.find(*PI);
1475
1.35k
      if (PP == CandidatePairs.end())
1476
0
        continue;
1477
1.35k
1478
1.35k
      for (std::vector<Value *>::iterator P = PP->second.begin(),
1479
9.37k
           E = PP->second.end(); 
P != E9.37k
;
++P8.02k
)
1480
8.02k
        computePairsConnectedTo(CandidatePairs, CandidatePairsSet,
1481
8.02k
                                PairableInsts, ConnectedPairs,
1482
8.02k
                                PairConnectionTypes, ValuePair(*PI, *P));
1483
1.35k
    }
1484
175
1485
175
    DEBUG(size_t TotalPairs = 0;
1486
175
          for (DenseMap<ValuePair, std::vector<ValuePair> >::iterator I =
1487
175
               ConnectedPairs.begin(), IE = ConnectedPairs.end(); I != IE; ++I)
1488
175
            TotalPairs += I->second.size();
1489
175
          dbgs() << "BBV: found " << TotalPairs
1490
175
                 << " pair connections.\n");
1491
175
  }
1492
1493
  // This function builds a set of use tuples such that <A, B> is in the set
1494
  // if B is in the use dag of A. If B is in the use dag of A, then B
1495
  // depends on the output of A.
1496
  void BBVectorize::buildDepMap(
1497
                      BasicBlock &BB,
1498
                      DenseMap<Value *, std::vector<Value *> > &CandidatePairs,
1499
                      std::vector<Value *> &PairableInsts,
1500
87
                      DenseSet<ValuePair> &PairableInstUsers) {
1501
87
    DenseSet<Value *> IsInPair;
1502
87
    for (DenseMap<Value *, std::vector<Value *> >::iterator C =
1503
1.30k
         CandidatePairs.begin(), E = CandidatePairs.end(); 
C != E1.30k
;
++C1.22k
)
{1.22k
1504
1.22k
      IsInPair.insert(C->first);
1505
1.22k
      IsInPair.insert(C->second.begin(), C->second.end());
1506
1.22k
    }
1507
87
1508
87
    // Iterate through the basic block, recording all users of each
1509
87
    // pairable instruction.
1510
87
1511
87
    BasicBlock::iterator E = BB.end(), EL =
1512
87
      BasicBlock::iterator(cast<Instruction>(PairableInsts.back()));
1513
2.63k
    for (BasicBlock::iterator I = BB.getFirstInsertionPt(); 
I != E2.63k
;
++I2.54k
)
{2.63k
1514
2.63k
      if (IsInPair.find(&*I) == IsInPair.end())
1515
1.12k
        continue;
1516
2.63k
1517
1.50k
      DenseSet<Value *> Users;
1518
1.50k
      AliasSetTracker WriteSet(*AA);
1519
1.50k
      if (I->mayWriteToMemory())
1520
120
        WriteSet.add(&*I);
1521
1.50k
1522
61.3k
      for (BasicBlock::iterator J = std::next(I); 
J != E61.3k
;
++J59.8k
)
{61.2k
1523
61.2k
        (void)trackUsesOfI(Users, WriteSet, &*I, &*J);
1524
61.2k
1525
61.2k
        if (J == EL)
1526
1.41k
          break;
1527
61.2k
      }
1528
1.50k
1529
1.50k
      for (DenseSet<Value *>::iterator U = Users.begin(), E = Users.end();
1530
11.0k
           
U != E11.0k
;
++U9.55k
)
{9.55k
1531
9.55k
        if (
IsInPair.find(*U) == IsInPair.end()9.55k
)
continue3.93k
;
1532
5.62k
        PairableInstUsers.insert(ValuePair(&*I, *U));
1533
5.62k
      }
1534
1.50k
1535
1.50k
      if (I == EL)
1536
87
        break;
1537
1.50k
    }
1538
87
  }
1539
1540
  // Returns true if an input to pair P is an output of pair Q and also an
1541
  // input of pair Q is an output of pair P. If this is the case, then these
1542
  // two pairs cannot be simultaneously fused.
1543
  bool BBVectorize::pairsConflict(ValuePair P, ValuePair Q,
1544
             DenseSet<ValuePair> &PairableInstUsers,
1545
             DenseMap<ValuePair, std::vector<ValuePair> > *PairableInstUserMap,
1546
30.7k
             DenseSet<VPPair> *PairableInstUserPairSet) {
1547
30.7k
    // Two pairs are in conflict if they are mutual Users of eachother.
1548
30.7k
    bool QUsesP = PairableInstUsers.count(ValuePair(P.first,  Q.first))  ||
1549
19.1k
                  PairableInstUsers.count(ValuePair(P.first,  Q.second)) ||
1550
18.3k
                  PairableInstUsers.count(ValuePair(P.second, Q.first))  ||
1551
17.8k
                  PairableInstUsers.count(ValuePair(P.second, Q.second));
1552
30.7k
    bool PUsesQ = PairableInstUsers.count(ValuePair(Q.first,  P.first))  ||
1553
28.8k
                  PairableInstUsers.count(ValuePair(Q.first,  P.second)) ||
1554
27.6k
                  PairableInstUsers.count(ValuePair(Q.second, P.first))  ||
1555
27.4k
                  PairableInstUsers.count(ValuePair(Q.second, P.second));
1556
30.7k
    if (
PairableInstUserMap30.7k
)
{5.00k
1557
5.00k
      // FIXME: The expensive part of the cycle check is not so much the cycle
1558
5.00k
      // check itself but this edge insertion procedure. This needs some
1559
5.00k
      // profiling and probably a different data structure.
1560
5.00k
      if (
PUsesQ5.00k
)
{589
1561
589
        if (PairableInstUserPairSet->insert(VPPair(Q, P)).second)
1562
248
          (*PairableInstUserMap)[Q].push_back(P);
1563
589
      }
1564
5.00k
      if (
QUsesP5.00k
)
{2.35k
1565
2.35k
        if (PairableInstUserPairSet->insert(VPPair(P, Q)).second)
1566
733
          (*PairableInstUserMap)[P].push_back(Q);
1567
2.35k
      }
1568
5.00k
    }
1569
30.7k
1570
13.1k
    return (QUsesP && PUsesQ);
1571
30.7k
  }
1572
1573
  // This function walks the use graph of current pairs to see if, starting
1574
  // from P, the walk returns to P.
1575
  bool BBVectorize::pairWillFormCycle(ValuePair P,
1576
             DenseMap<ValuePair, std::vector<ValuePair> > &PairableInstUserMap,
1577
1.98k
             DenseSet<ValuePair> &CurrentPairs) {
1578
1.98k
    DEBUG(if (DebugCycleCheck)
1579
1.98k
            dbgs() << "BBV: starting cycle check for : " << *P.first << " <-> "
1580
1.98k
                   << *P.second << "\n");
1581
1.98k
    // A lookup table of visisted pairs is kept because the PairableInstUserMap
1582
1.98k
    // contains non-direct associations.
1583
1.98k
    DenseSet<ValuePair> Visited;
1584
1.98k
    SmallVector<ValuePair, 32> Q;
1585
1.98k
    // General depth-first post-order traversal:
1586
1.98k
    Q.push_back(P);
1587
2.63k
    do {
1588
2.63k
      ValuePair QTop = Q.pop_back_val();
1589
2.63k
      Visited.insert(QTop);
1590
2.63k
1591
2.63k
      DEBUG(if (DebugCycleCheck)
1592
2.63k
              dbgs() << "BBV: cycle check visiting: " << *QTop.first << " <-> "
1593
2.63k
                     << *QTop.second << "\n");
1594
2.63k
      DenseMap<ValuePair, std::vector<ValuePair> >::iterator QQ =
1595
2.63k
        PairableInstUserMap.find(QTop);
1596
2.63k
      if (QQ == PairableInstUserMap.end())
1597
1.91k
        continue;
1598
2.63k
1599
718
      for (std::vector<ValuePair>::iterator C = QQ->second.begin(),
1600
1.88k
           CE = QQ->second.end(); 
C != CE1.88k
;
++C1.16k
)
{1.16k
1601
1.16k
        if (
*C == P1.16k
)
{0
1602
0
          DEBUG(dbgs()
1603
0
                 << "BBV: rejected to prevent non-trivial cycle formation: "
1604
0
                 << QTop.first << " <-> " << C->second << "\n");
1605
0
          return true;
1606
0
        }
1607
1.16k
1608
1.16k
        
if (1.16k
CurrentPairs.count(*C) && 1.16k
!Visited.count(*C)884
)
1609
645
          Q.push_back(*C);
1610
1.16k
      }
1611
2.63k
    } while (!Q.empty());
1612
1.98k
1613
1.98k
    return false;
1614
1.98k
  }
1615
1616
  // This function builds the initial dag of connected pairs with the
1617
  // pair J at the root.
1618
  void BBVectorize::buildInitialDAGFor(
1619
                  DenseMap<Value *, std::vector<Value *> > &CandidatePairs,
1620
                  DenseSet<ValuePair> &CandidatePairsSet,
1621
                  std::vector<Value *> &PairableInsts,
1622
                  DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairs,
1623
                  DenseSet<ValuePair> &PairableInstUsers,
1624
                  DenseMap<Value *, Value *> &ChosenPairs,
1625
3.20k
                  DenseMap<ValuePair, size_t> &DAG, ValuePair J) {
1626
3.20k
    // Each of these pairs is viewed as the root node of a DAG. The DAG
1627
3.20k
    // is then walked (depth-first). As this happens, we keep track of
1628
3.20k
    // the pairs that compose the DAG and the maximum depth of the DAG.
1629
3.20k
    SmallVector<ValuePairWithDepth, 32> Q;
1630
3.20k
    // General depth-first post-order traversal:
1631
3.20k
    Q.push_back(ValuePairWithDepth(J, getDepthFactor(J.first)));
1632
11.1k
    do {
1633
11.1k
      ValuePairWithDepth QTop = Q.back();
1634
11.1k
1635
11.1k
      // Push each child onto the queue:
1636
11.1k
      bool MoreChildren = false;
1637
11.1k
      size_t MaxChildDepth = QTop.second;
1638
11.1k
      DenseMap<ValuePair, std::vector<ValuePair> >::iterator QQ =
1639
11.1k
        ConnectedPairs.find(QTop.first);
1640
11.1k
      if (QQ != ConnectedPairs.end())
1641
7.41k
        for (std::vector<ValuePair>::iterator k = QQ->second.begin(),
1642
18.3k
             ke = QQ->second.end(); 
k != ke18.3k
;
++k10.9k
)
{10.9k
1643
10.9k
          // Make sure that this child pair is still a candidate:
1644
10.9k
          if (
CandidatePairsSet.count(*k)10.9k
)
{10.3k
1645
10.3k
            DenseMap<ValuePair, size_t>::iterator C = DAG.find(*k);
1646
10.3k
            if (
C == DAG.end()10.3k
)
{4.59k
1647
4.59k
              size_t d = getDepthFactor(k->first);
1648
4.59k
              Q.push_back(ValuePairWithDepth(*k, QTop.second+d));
1649
4.59k
              MoreChildren = true;
1650
5.78k
            } else {
1651
5.78k
              MaxChildDepth = std::max(MaxChildDepth, C->second);
1652
5.78k
            }
1653
10.3k
          }
1654
10.9k
        }
1655
11.1k
1656
11.1k
      if (
!MoreChildren11.1k
)
{7.80k
1657
7.80k
        // Record the current pair as part of the DAG:
1658
7.80k
        DAG.insert(ValuePairWithDepth(QTop.first, MaxChildDepth));
1659
7.80k
        Q.pop_back();
1660
7.80k
      }
1661
11.1k
    } while (!Q.empty());
1662
3.20k
  }
1663
1664
  // Given some initial dag, prune it by removing conflicting pairs (pairs
1665
  // that cannot be simultaneously chosen for vectorization).
1666
  void BBVectorize::pruneDAGFor(
1667
              DenseMap<Value *, std::vector<Value *> > &CandidatePairs,
1668
              std::vector<Value *> &PairableInsts,
1669
              DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairs,
1670
              DenseSet<ValuePair> &PairableInstUsers,
1671
              DenseMap<ValuePair, std::vector<ValuePair> > &PairableInstUserMap,
1672
              DenseSet<VPPair> &PairableInstUserPairSet,
1673
              DenseMap<Value *, Value *> &ChosenPairs,
1674
              DenseMap<ValuePair, size_t> &DAG,
1675
              DenseSet<ValuePair> &PrunedDAG, ValuePair J,
1676
3.20k
              bool UseCycleCheck) {
1677
3.20k
    SmallVector<ValuePairWithDepth, 32> Q;
1678
3.20k
    // General depth-first post-order traversal:
1679
3.20k
    Q.push_back(ValuePairWithDepth(J, getDepthFactor(J.first)));
1680
6.43k
    do {
1681
6.43k
      ValuePairWithDepth QTop = Q.pop_back_val();
1682
6.43k
      PrunedDAG.insert(QTop.first);
1683
6.43k
1684
6.43k
      // Visit each child, pruning as necessary...
1685
6.43k
      SmallVector<ValuePairWithDepth, 8> BestChildren;
1686
6.43k
      DenseMap<ValuePair, std::vector<ValuePair> >::iterator QQ =
1687
6.43k
        ConnectedPairs.find(QTop.first);
1688
6.43k
      if (QQ == ConnectedPairs.end())
1689
3.12k
        continue;
1690
6.43k
1691
3.30k
      for (std::vector<ValuePair>::iterator K = QQ->second.begin(),
1692
7.85k
           KE = QQ->second.end(); 
K != KE7.85k
;
++K4.54k
)
{4.54k
1693
4.54k
        DenseMap<ValuePair, size_t>::iterator C = DAG.find(*K);
1694
4.54k
        if (
C == DAG.end()4.54k
)
continue308
;
1695
4.54k
1696
4.54k
        // This child is in the DAG, now we need to make sure it is the
1697
4.54k
        // best of any conflicting children. There could be multiple
1698
4.54k
        // conflicting children, so first, determine if we're keeping
1699
4.54k
        // this child, then delete conflicting children as necessary.
1700
4.54k
1701
4.54k
        // It is also necessary to guard against pairing-induced
1702
4.54k
        // dependencies. Consider instructions a .. x .. y .. b
1703
4.54k
        // such that (a,b) are to be fused and (x,y) are to be fused
1704
4.54k
        // but a is an input to x and b is an output from y. This
1705
4.54k
        // means that y cannot be moved after b but x must be moved
1706
4.54k
        // after b for (a,b) to be fused. In other words, after
1707
4.54k
        // fusing (a,b) we have y .. a/b .. x where y is an input
1708
4.54k
        // to a/b and x is an output to a/b: x and y can no longer
1709
4.54k
        // be legally fused. To prevent this condition, we must
1710
4.54k
        // make sure that a child pair added to the DAG is not
1711
4.54k
        // both an input and output of an already-selected pair.
1712
4.54k
1713
4.54k
        // Pairing-induced dependencies can also form from more complicated
1714
4.54k
        // cycles. The pair vs. pair conflicts are easy to check, and so
1715
4.54k
        // that is done explicitly for "fast rejection", and because for
1716
4.54k
        // child vs. child conflicts, we may prefer to keep the current
1717
4.54k
        // pair in preference to the already-selected child.
1718
4.23k
        DenseSet<ValuePair> CurrentPairs;
1719
4.23k
1720
4.23k
        bool CanAdd = true;
1721
4.23k
        for (SmallVectorImpl<ValuePairWithDepth>::iterator C2
1722
4.23k
              = BestChildren.begin(), E2 = BestChildren.end();
1723
5.14k
             
C2 != E25.14k
;
++C2911
)
{1.71k
1724
1.71k
          if (C2->first.first == C->first.first ||
1725
1.44k
              C2->first.first == C->first.second ||
1726
1.22k
              C2->first.second == C->first.first ||
1727
1.20k
              C2->first.second == C->first.second ||
1728
863
              pairsConflict(C2->first, C->first, PairableInstUsers,
1729
744
                            UseCycleCheck ? 
&PairableInstUserMap119
:
nullptr744
,
1730
119
                            UseCycleCheck ? &PairableInstUserPairSet
1731
855
                                          : 
nullptr744
))
{855
1732
855
            if (
C2->second >= C->second855
)
{807
1733
807
              CanAdd = false;
1734
807
              break;
1735
807
            }
1736
855
1737
48
            CurrentPairs.insert(C2->first);
1738
48
          }
1739
1.71k
        }
1740
4.23k
        if (
!CanAdd4.23k
)
continue807
;
1741
4.23k
1742
4.23k
        // Even worse, this child could conflict with another node already
1743
4.23k
        // selected for the DAG. If that is the case, ignore this child.
1744
3.42k
        for (DenseSet<ValuePair>::iterator T = PrunedDAG.begin(),
1745
11.1k
             E2 = PrunedDAG.end(); 
T != E211.1k
;
++T7.70k
)
{7.83k
1746
7.83k
          if (T->first == C->first.first ||
1747
7.72k
              T->first == C->first.second ||
1748
7.72k
              T->second == C->first.first ||
1749
7.71k
              T->second == C->first.second ||
1750
7.70k
              pairsConflict(*T, C->first, PairableInstUsers,
1751
6.43k
                            UseCycleCheck ? 
&PairableInstUserMap1.26k
:
nullptr6.43k
,
1752
1.26k
                            UseCycleCheck ? &PairableInstUserPairSet
1753
6.43k
                                          : 
nullptr6.43k
))
{132
1754
132
            CanAdd = false;
1755
132
            break;
1756
132
          }
1757
7.83k
1758
7.70k
          CurrentPairs.insert(*T);
1759
7.70k
        }
1760
3.42k
        if (
!CanAdd3.42k
)
continue132
;
1761
3.42k
1762
3.42k
        // And check the queue too...
1763
3.29k
        for (SmallVectorImpl<ValuePairWithDepth>::iterator C2 = Q.begin(),
1764
3.83k
             E2 = Q.end(); 
C2 != E23.83k
;
++C2535
)
{557
1765
557
          if (C2->first.first == C->first.first ||
1766
535
              C2->first.first == C->first.second ||
1767
535
              C2->first.second == C->first.first ||
1768
535
              C2->first.second == C->first.second ||
1769
535
              pairsConflict(C2->first, C->first, PairableInstUsers,
1770
490
                            UseCycleCheck ? 
&PairableInstUserMap45
:
nullptr490
,
1771
45
                            UseCycleCheck ? &PairableInstUserPairSet
1772
490
                                          : 
nullptr490
))
{22
1773
22
            CanAdd = false;
1774
22
            break;
1775
22
          }
1776
557
1777
535
          CurrentPairs.insert(C2->first);
1778
535
        }
1779
3.29k
        if (
!CanAdd3.29k
)
continue22
;
1780
3.29k
1781
3.29k
        // Last but not least, check for a conflict with any of the
1782
3.29k
        // already-chosen pairs.
1783
3.27k
        for (DenseMap<Value *, Value *>::iterator C2 =
1784
3.27k
              ChosenPairs.begin(), E2 = ChosenPairs.end();
1785
16.0k
             
C2 != E216.0k
;
++C212.7k
)
{12.7k
1786
12.7k
          if (pairsConflict(*C2, C->first, PairableInstUsers,
1787
11.0k
                            UseCycleCheck ? 
&PairableInstUserMap1.72k
:
nullptr11.0k
,
1788
1.72k
                            UseCycleCheck ? &PairableInstUserPairSet
1789
11.0k
                                          : 
nullptr11.0k
))
{0
1790
0
            CanAdd = false;
1791
0
            break;
1792
0
          }
1793
12.7k
1794
12.7k
          CurrentPairs.insert(*C2);
1795
12.7k
        }
1796
3.27k
        if (
!CanAdd3.27k
)
continue0
;
1797
3.27k
1798
3.27k
        // To check for non-trivial cycles formed by the addition of the
1799
3.27k
        // current pair we've formed a list of all relevant pairs, now use a
1800
3.27k
        // graph walk to check for a cycle. We start from the current pair and
1801
3.27k
        // walk the use dag to see if we again reach the current pair. If we
1802
3.27k
        // do, then the current pair is rejected.
1803
3.27k
1804
3.27k
        // FIXME: It may be more efficient to use a topological-ordering
1805
3.27k
        // algorithm to improve the cycle check. This should be investigated.
1806
3.27k
        
if (3.27k
UseCycleCheck &&3.27k
1807
716
            pairWillFormCycle(C->first, PairableInstUserMap, CurrentPairs))
1808
0
          continue;
1809
3.27k
1810
3.27k
        // This child can be added, but we may have chosen it in preference
1811
3.27k
        // to an already-selected child. Check for this here, and if a
1812
3.27k
        // conflict is found, then remove the previously-selected child
1813
3.27k
        // before adding this one in its place.
1814
3.27k
        for (SmallVectorImpl<ValuePairWithDepth>::iterator C2
1815
3.58k
              = BestChildren.begin(); 
C2 != BestChildren.end()3.58k
;)
{314
1816
314
          if (C2->first.first == C->first.first ||
1817
292
              C2->first.first == C->first.second ||
1818
284
              C2->first.second == C->first.first ||
1819
284
              C2->first.second == C->first.second ||
1820
271
              pairsConflict(C2->first, C->first, PairableInstUsers))
1821
43
            C2 = BestChildren.erase(C2);
1822
314
          else
1823
271
            ++C2;
1824
314
        }
1825
3.27k
1826
3.27k
        BestChildren.push_back(ValuePairWithDepth(C->first, C->second));
1827
3.27k
      }
1828
3.30k
1829
3.30k
      for (SmallVectorImpl<ValuePairWithDepth>::iterator C
1830
3.30k
            = BestChildren.begin(), E2 = BestChildren.end();
1831
6.53k
           
C != E26.53k
;
++C3.23k
)
{3.23k
1832
3.23k
        size_t DepthF = getDepthFactor(C->first.first);
1833
3.23k
        Q.push_back(ValuePairWithDepth(C->first, QTop.second+DepthF));
1834
3.23k
      }
1835
6.43k
    } while (!Q.empty());
1836
3.20k
  }
1837
1838
  // This function finds the best dag of mututally-compatible connected
1839
  // pairs, given the choice of root pairs as an iterator range.
1840
  void BBVectorize::findBestDAGFor(
1841
              DenseMap<Value *, std::vector<Value *> > &CandidatePairs,
1842
              DenseSet<ValuePair> &CandidatePairsSet,
1843
              DenseMap<ValuePair, int> &CandidatePairCostSavings,
1844
              std::vector<Value *> &PairableInsts,
1845
              DenseSet<ValuePair> &FixedOrderPairs,
1846
              DenseMap<VPPair, unsigned> &PairConnectionTypes,
1847
              DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairs,
1848
              DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairDeps,
1849
              DenseSet<ValuePair> &PairableInstUsers,
1850
              DenseMap<ValuePair, std::vector<ValuePair> > &PairableInstUserMap,
1851
              DenseSet<VPPair> &PairableInstUserPairSet,
1852
              DenseMap<Value *, Value *> &ChosenPairs,
1853
              DenseSet<ValuePair> &BestDAG, size_t &BestMaxDepth,
1854
              int &BestEffSize, Value *II, std::vector<Value *>&JJ,
1855
1.22k
              bool UseCycleCheck) {
1856
1.22k
    for (std::vector<Value *>::iterator J = JJ.begin(), JE = JJ.end();
1857
9.07k
         
J != JE9.07k
;
++J7.85k
)
{7.85k
1858
7.85k
      ValuePair IJ(II, *J);
1859
7.85k
      if (!CandidatePairsSet.count(IJ))
1860
4.64k
        continue;
1861
7.85k
1862
7.85k
      // Before going any further, make sure that this pair does not
1863
7.85k
      // conflict with any already-selected pairs (see comment below
1864
7.85k
      // near the DAG pruning for more details).
1865
3.21k
      DenseSet<ValuePair> ChosenPairSet;
1866
3.21k
      bool DoesConflict = false;
1867
3.21k
      for (DenseMap<Value *, Value *>::iterator C = ChosenPairs.begin(),
1868
11.7k
           E = ChosenPairs.end(); 
C != E11.7k
;
++C8.56k
)
{8.57k
1869
8.57k
        if (pairsConflict(*C, IJ, PairableInstUsers,
1870
6.73k
                          UseCycleCheck ? 
&PairableInstUserMap1.84k
:
nullptr6.73k
,
1871
6.73k
                          UseCycleCheck ? 
&PairableInstUserPairSet1.84k
:
nullptr6.73k
))
{10
1872
10
          DoesConflict = true;
1873
10
          break;
1874
10
        }
1875
8.57k
1876
8.56k
        ChosenPairSet.insert(*C);
1877
8.56k
      }
1878
3.21k
      if (
DoesConflict3.21k
)
continue10
;
1879
3.21k
1880
3.20k
      
if (3.20k
UseCycleCheck &&3.20k
1881
1.27k
          pairWillFormCycle(IJ, PairableInstUserMap, ChosenPairSet))
1882
0
        continue;
1883
3.20k
1884
3.20k
      DenseMap<ValuePair, size_t> DAG;
1885
3.20k
      buildInitialDAGFor(CandidatePairs, CandidatePairsSet,
1886
3.20k
                          PairableInsts, ConnectedPairs,
1887
3.20k
                          PairableInstUsers, ChosenPairs, DAG, IJ);
1888
3.20k
1889
3.20k
      // Because we'll keep the child with the largest depth, the largest
1890
3.20k
      // depth is still the same in the unpruned DAG.
1891
3.20k
      size_t MaxDepth = DAG.lookup(IJ);
1892
3.20k
1893
3.20k
      DEBUG(if (DebugPairSelection) dbgs() << "BBV: found DAG for pair {"
1894
3.20k
                   << *IJ.first << " <-> " << *IJ.second << "} of depth " <<
1895
3.20k
                   MaxDepth << " and size " << DAG.size() << "\n");
1896
3.20k
1897
3.20k
      // At this point the DAG has been constructed, but, may contain
1898
3.20k
      // contradictory children (meaning that different children of
1899
3.20k
      // some dag node may be attempting to fuse the same instruction).
1900
3.20k
      // So now we walk the dag again, in the case of a conflict,
1901
3.20k
      // keep only the child with the largest depth. To break a tie,
1902
3.20k
      // favor the first child.
1903
3.20k
1904
3.20k
      DenseSet<ValuePair> PrunedDAG;
1905
3.20k
      pruneDAGFor(CandidatePairs, PairableInsts, ConnectedPairs,
1906
3.20k
                   PairableInstUsers, PairableInstUserMap,
1907
3.20k
                   PairableInstUserPairSet,
1908
3.20k
                   ChosenPairs, DAG, PrunedDAG, IJ, UseCycleCheck);
1909
3.20k
1910
3.20k
      int EffSize = 0;
1911
3.20k
      if (
TTI3.20k
)
{2.67k
1912
2.67k
        DenseSet<Value *> PrunedDAGInstrs;
1913
2.67k
        for (DenseSet<ValuePair>::iterator S = PrunedDAG.begin(),
1914
8.19k
             E = PrunedDAG.end(); 
S != E8.19k
;
++S5.51k
)
{5.51k
1915
5.51k
          PrunedDAGInstrs.insert(S->first);
1916
5.51k
          PrunedDAGInstrs.insert(S->second);
1917
5.51k
        }
1918
2.67k
1919
2.67k
        // The set of pairs that have already contributed to the total cost.
1920
2.67k
        DenseSet<ValuePair> IncomingPairs;
1921
2.67k
1922
2.67k
        // If the cost model were perfect, this might not be necessary; but we
1923
2.67k
        // need to make sure that we don't get stuck vectorizing our own
1924
2.67k
        // shuffle chains.
1925
2.67k
        bool HasNontrivialInsts = false;
1926
2.67k
1927
2.67k
        // The node weights represent the cost savings associated with
1928
2.67k
        // fusing the pair of instructions.
1929
2.67k
        for (DenseSet<ValuePair>::iterator S = PrunedDAG.begin(),
1930
8.19k
             E = PrunedDAG.end(); 
S != E8.19k
;
++S5.51k
)
{5.51k
1931
5.51k
          if (!isa<ShuffleVectorInst>(S->first) &&
1932
4.26k
              !isa<InsertElementInst>(S->first) &&
1933
2.04k
              !isa<ExtractElementInst>(S->first))
1934
2.00k
            HasNontrivialInsts = true;
1935
5.51k
1936
5.51k
          bool FlipOrder = false;
1937
5.51k
1938
5.51k
          if (
getDepthFactor(S->first)5.51k
)
{3.25k
1939
3.25k
            int ESContrib = CandidatePairCostSavings.find(*S)->second;
1940
3.25k
            DEBUG(if (DebugPairSelection) dbgs() << "\tweight {"
1941
3.25k
                   << *S->first << " <-> " << *S->second << "} = " <<
1942
3.25k
                   ESContrib << "\n");
1943
3.25k
            EffSize += ESContrib;
1944
3.25k
          }
1945
5.51k
1946
5.51k
          // The edge weights contribute in a negative sense: they represent
1947
5.51k
          // the cost of shuffles.
1948
5.51k
          DenseMap<ValuePair, std::vector<ValuePair> >::iterator SS =
1949
5.51k
            ConnectedPairDeps.find(*S);
1950
5.51k
          if (
SS != ConnectedPairDeps.end()5.51k
)
{3.76k
1951
3.76k
            unsigned NumDepsDirect = 0, NumDepsSwap = 0;
1952
3.76k
            for (std::vector<ValuePair>::iterator T = SS->second.begin(),
1953
12.7k
                 TE = SS->second.end(); 
T != TE12.7k
;
++T9.00k
)
{9.00k
1954
9.00k
              VPPair Q(*S, *T);
1955
9.00k
              if (!PrunedDAG.count(Q.second))
1956
5.98k
                continue;
1957
3.02k
              DenseMap<VPPair, unsigned>::iterator R =
1958
3.02k
                PairConnectionTypes.find(VPPair(Q.second, Q.first));
1959
3.02k
              assert(R != PairConnectionTypes.end() &&
1960
3.02k
                     "Cannot find pair connection type");
1961
3.02k
              if (R->second == PairConnectionDirect)
1962
2.69k
                ++NumDepsDirect;
1963
330
              else 
if (330
R->second == PairConnectionSwap330
)
1964
126
                ++NumDepsSwap;
1965
3.02k
            }
1966
3.76k
1967
3.76k
            // If there are more swaps than direct connections, then
1968
3.76k
            // the pair order will be flipped during fusion. So the real
1969
3.76k
            // number of swaps is the minimum number.
1970
3.76k
            FlipOrder = !FixedOrderPairs.count(*S) &&
1971
3.56k
              ((NumDepsSwap > NumDepsDirect) ||
1972
3.43k
                FixedOrderPairs.count(ValuePair(S->second, S->first)));
1973
3.76k
1974
3.76k
            for (std::vector<ValuePair>::iterator T = SS->second.begin(),
1975
12.7k
                 TE = SS->second.end(); 
T != TE12.7k
;
++T9.00k
)
{9.00k
1976
9.00k
              VPPair Q(*S, *T);
1977
9.00k
              if (!PrunedDAG.count(Q.second))
1978
5.98k
                continue;
1979
3.02k
              DenseMap<VPPair, unsigned>::iterator R =
1980
3.02k
                PairConnectionTypes.find(VPPair(Q.second, Q.first));
1981
3.02k
              assert(R != PairConnectionTypes.end() &&
1982
3.02k
                     "Cannot find pair connection type");
1983
3.02k
              Type *Ty1 = Q.second.first->getType(),
1984
3.02k
                   *Ty2 = Q.second.second->getType();
1985
3.02k
              Type *VTy = getVecTypeForPair(Ty1, Ty2);
1986
3.02k
              if (
(R->second == PairConnectionDirect && 3.02k
FlipOrder2.69k
) ||
1987
3.00k
                  
(R->second == PairConnectionSwap && 3.00k
!FlipOrder126
) ||
1988
3.00k
                  
R->second == PairConnectionSplat3.00k
)
{223
1989
223
                int ESContrib = (int) getInstrCost(Instruction::ShuffleVector,
1990
223
                                                   VTy, VTy);
1991
223
1992
223
                if (
VTy->getVectorNumElements() == 2223
)
{40
1993
40
                  if (R->second == PairConnectionSplat)
1994
37
                    ESContrib = std::min(ESContrib, (int) TTI->getShuffleCost(
1995
37
                      TargetTransformInfo::SK_Broadcast, VTy));
1996
40
                  else
1997
3
                    ESContrib = std::min(ESContrib, (int) TTI->getShuffleCost(
1998
3
                      TargetTransformInfo::SK_Reverse, VTy));
1999
40
                }
2000
223
2001
223
                DEBUG(if (DebugPairSelection) dbgs() << "\tcost {" <<
2002
223
                  *Q.second.first << " <-> " << *Q.second.second <<
2003
223
                    "} -> {" <<
2004
223
                  *S->first << " <-> " << *S->second << "} = " <<
2005
223
                   ESContrib << "\n");
2006
223
                EffSize -= ESContrib;
2007
223
              }
2008
3.02k
            }
2009
3.76k
          }
2010
5.51k
2011
5.51k
          // Compute the cost of outgoing edges. We assume that edges outgoing
2012
5.51k
          // to shuffles, inserts or extracts can be merged, and so contribute
2013
5.51k
          // no additional cost.
2014
5.51k
          if (
!S->first->getType()->isVoidTy()5.51k
)
{5.25k
2015
5.25k
            Type *Ty1 = S->first->getType(),
2016
5.25k
                 *Ty2 = S->second->getType();
2017
5.25k
            Type *VTy = getVecTypeForPair(Ty1, Ty2);
2018
5.25k
2019
5.25k
            bool NeedsExtraction = false;
2020
5.59k
            for (User *U : S->first->users()) {
2021
5.59k
              if (ShuffleVectorInst *
SI5.59k
= dyn_cast<ShuffleVectorInst>(U))
{2.21k
2022
2.21k
                // Shuffle can be folded if it has no other input
2023
2.21k
                if (isa<UndefValue>(SI->getOperand(1)))
2024
458
                  continue;
2025
2.21k
              }
2026
5.14k
              
if (5.14k
isa<ExtractElementInst>(U)5.14k
)
2027
32
                continue;
2028
5.10k
              
if (5.10k
PrunedDAGInstrs.count(U)5.10k
)
2029
2.67k
                continue;
2030
2.42k
              NeedsExtraction = true;
2031
2.42k
              break;
2032
5.10k
            }
2033
5.25k
2034
5.25k
            if (
NeedsExtraction5.25k
)
{2.42k
2035
2.42k
              int ESContrib;
2036
2.42k
              if (
Ty1->isVectorTy()2.42k
)
{1.85k
2037
1.85k
                ESContrib = (int) getInstrCost(Instruction::ShuffleVector,
2038
1.85k
                                               Ty1, VTy);
2039
1.85k
                ESContrib = std::min(ESContrib, (int) TTI->getShuffleCost(
2040
1.85k
                  TargetTransformInfo::SK_ExtractSubvector, VTy, 0, Ty1));
2041
1.85k
              } else
2042
577
                ESContrib = (int) TTI->getVectorInstrCost(
2043
577
                                    Instruction::ExtractElement, VTy, 0);
2044
2.42k
2045
2.42k
              DEBUG(if (DebugPairSelection) dbgs() << "\tcost {" <<
2046
2.42k
                *S->first << "} = " << ESContrib << "\n");
2047
2.42k
              EffSize -= ESContrib;
2048
2.42k
            }
2049
5.25k
2050
5.25k
            NeedsExtraction = false;
2051
5.36k
            for (User *U : S->second->users()) {
2052
5.36k
              if (ShuffleVectorInst *
SI5.36k
= dyn_cast<ShuffleVectorInst>(U))
{2.02k
2053
2.02k
                // Shuffle can be folded if it has no other input
2054
2.02k
                if (isa<UndefValue>(SI->getOperand(1)))
2055
150
                  continue;
2056
2.02k
              }
2057
5.21k
              
if (5.21k
isa<ExtractElementInst>(U)5.21k
)
2058
14
                continue;
2059
5.20k
              
if (5.20k
PrunedDAGInstrs.count(U)5.20k
)
2060
2.63k
                continue;
2061
2.56k
              NeedsExtraction = true;
2062
2.56k
              break;
2063
5.20k
            }
2064
5.25k
2065
5.25k
            if (
NeedsExtraction5.25k
)
{2.56k
2066
2.56k
              int ESContrib;
2067
2.56k
              if (
Ty2->isVectorTy()2.56k
)
{1.99k
2068
1.99k
                ESContrib = (int) getInstrCost(Instruction::ShuffleVector,
2069
1.99k
                                               Ty2, VTy);
2070
1.99k
                ESContrib = std::min(ESContrib, (int) TTI->getShuffleCost(
2071
1.99k
                  TargetTransformInfo::SK_ExtractSubvector, VTy,
2072
1.99k
                  Ty1->isVectorTy() ? 
Ty1->getVectorNumElements()1.99k
:
12
, Ty2));
2073
1.99k
              } else
2074
569
                ESContrib = (int) TTI->getVectorInstrCost(
2075
569
                                    Instruction::ExtractElement, VTy, 1);
2076
2.56k
              DEBUG(if (DebugPairSelection) dbgs() << "\tcost {" <<
2077
2.56k
                *S->second << "} = " << ESContrib << "\n");
2078
2.56k
              EffSize -= ESContrib;
2079
2.56k
            }
2080
5.25k
          }
2081
5.51k
2082
5.51k
          // Compute the cost of incoming edges.
2083
5.51k
          if (
!isa<LoadInst>(S->first) && 5.51k
!isa<StoreInst>(S->first)5.47k
)
{5.21k
2084
5.21k
            Instruction *S1 = cast<Instruction>(S->first),
2085
5.21k
                        *S2 = cast<Instruction>(S->second);
2086
18.5k
            for (unsigned o = 0; 
o < S1->getNumOperands()18.5k
;
++o13.3k
)
{13.3k
2087
13.3k
              Value *O1 = S1->getOperand(o), *O2 = S2->getOperand(o);
2088
13.3k
2089
13.3k
              // Combining constants into vector constants (or small vector
2090
13.3k
              // constants into larger ones are assumed free).
2091
13.3k
              if (
isa<Constant>(O1) && 13.3k
isa<Constant>(O2)6.06k
)
2092
5.07k
                continue;
2093
13.3k
2094
8.22k
              
if (8.22k
FlipOrder8.22k
)
2095
245
                std::swap(O1, O2);
2096
8.22k
2097
8.22k
              ValuePair VP  = ValuePair(O1, O2);
2098
8.22k
              ValuePair VPR = ValuePair(O2, O1);
2099
8.22k
2100
8.22k
              // Internal edges are not handled here.
2101
8.22k
              if (
PrunedDAG.count(VP) || 8.22k
PrunedDAG.count(VPR)6.05k
)
2102
2.17k
                continue;
2103
8.22k
2104
6.05k
              Type *Ty1 = O1->getType(),
2105
6.05k
                   *Ty2 = O2->getType();
2106
6.05k
              Type *VTy = getVecTypeForPair(Ty1, Ty2);
2107
6.05k
2108
6.05k
              // Combining vector operations of the same type is also assumed
2109
6.05k
              // folded with other operations.
2110
6.05k
              if (
Ty1 == Ty26.05k
)
{5.91k
2111
5.91k
                // If both are insert elements, then both can be widened.
2112
5.91k
                InsertElementInst *IEO1 = dyn_cast<InsertElementInst>(O1),
2113
5.91k
                                  *IEO2 = dyn_cast<InsertElementInst>(O2);
2114
5.91k
                if (
IEO1 && 5.91k
IEO21.44k
&&
isPureIEChain(IEO1)1.02k
&&
isPureIEChain(IEO2)1.00k
)
2115
947
                  continue;
2116
5.91k
                // If both are extract elements, and both have the same input
2117
5.91k
                // type, then they can be replaced with a shuffle
2118
4.96k
                ExtractElementInst *EIO1 = dyn_cast<ExtractElementInst>(O1),
2119
4.96k
                                   *EIO2 = dyn_cast<ExtractElementInst>(O2);
2120
4.96k
                if (
EIO1 && 4.96k
EIO2237
&&
2121
20
                    EIO1->getOperand(0)->getType() ==
2122
20
                      EIO2->getOperand(0)->getType())
2123
20
                  continue;
2124
4.96k
                // If both are a shuffle with equal operand types and only two
2125
4.96k
                // unqiue operands, then they can be replaced with a single
2126
4.96k
                // shuffle
2127
4.94k
                ShuffleVectorInst *SIO1 = dyn_cast<ShuffleVectorInst>(O1),
2128
4.94k
                                  *SIO2 = dyn_cast<ShuffleVectorInst>(O2);
2129
4.94k
                if (
SIO1 && 4.94k
SIO2650
&&
2130
650
                    SIO1->getOperand(0)->getType() ==
2131
610
                      SIO2->getOperand(0)->getType()) {
2132
610
                  SmallSet<Value *, 4> SIOps;
2133
610
                  SIOps.insert(SIO1->getOperand(0));
2134
610
                  SIOps.insert(SIO1->getOperand(1));
2135
610
                  SIOps.insert(SIO2->getOperand(0));
2136
610
                  SIOps.insert(SIO2->getOperand(1));
2137
610
                  if (SIOps.size() <= 2)
2138
154
                    continue;
2139
610
                }
2140
4.94k
              }
2141
6.05k
2142
4.93k
              int ESContrib;
2143
4.93k
              // This pair has already been formed.
2144
4.93k
              if (
IncomingPairs.count(VP)4.93k
)
{71
2145
71
                continue;
2146
4.86k
              } else 
if (4.86k
IncomingPairs.count(VPR)4.86k
)
{6
2147
6
                ESContrib = (int) getInstrCost(Instruction::ShuffleVector,
2148
6
                                               VTy, VTy);
2149
6
2150
6
                if (VTy->getVectorNumElements() == 2)
2151
6
                  ESContrib = std::min(ESContrib, (int) TTI->getShuffleCost(
2152
6
                    TargetTransformInfo::SK_Reverse, VTy));
2153
4.85k
              } else 
if (4.85k
!Ty1->isVectorTy() && 4.85k
!Ty2->isVectorTy()3.10k
)
{3.10k
2154
3.10k
                ESContrib = (int) TTI->getVectorInstrCost(
2155
3.10k
                                    Instruction::InsertElement, VTy, 0);
2156
3.10k
                ESContrib += (int) TTI->getVectorInstrCost(
2157
3.10k
                                     Instruction::InsertElement, VTy, 1);
2158
1.75k
              } else 
if (1.75k
!Ty1->isVectorTy()1.75k
)
{6
2159
6
                // O1 needs to be inserted into a vector of size O2, and then
2160
6
                // both need to be shuffled together.
2161
6
                ESContrib = (int) TTI->getVectorInstrCost(
2162
6
                                    Instruction::InsertElement, Ty2, 0);
2163
6
                ESContrib += (int) getInstrCost(Instruction::ShuffleVector,
2164
6
                                                VTy, Ty2);
2165
1.75k
              } else 
if (1.75k
!Ty2->isVectorTy()1.75k
)
{17
2166
17
                // O2 needs to be inserted into a vector of size O1, and then
2167
17
                // both need to be shuffled together.
2168
17
                ESContrib = (int) TTI->getVectorInstrCost(
2169
17
                                    Instruction::InsertElement, Ty1, 0);
2170
17
                ESContrib += (int) getInstrCost(Instruction::ShuffleVector,
2171
17
                                                VTy, Ty1);
2172
1.73k
              } else {
2173
1.73k
                Type *TyBig = Ty1, *TySmall = Ty2;
2174
1.73k
                if (Ty2->getVectorNumElements() > Ty1->getVectorNumElements())
2175
40
                  std::swap(TyBig, TySmall);
2176
1.73k
2177
1.73k
                ESContrib = (int) getInstrCost(Instruction::ShuffleVector,
2178
1.73k
                                               VTy, TyBig);
2179
1.73k
                if (TyBig != TySmall)
2180
122
                  ESContrib += (int) getInstrCost(Instruction::ShuffleVector,
2181
122
                                                  TyBig, TySmall);
2182
1.73k
              }
2183
4.93k
2184
4.86k
              
DEBUG4.86k
(if (DebugPairSelection) dbgs() << "\tcost {"4.86k
2185
4.86k
                     << *O1 << " <-> " << *O2 << "} = " <<
2186
4.86k
                     ESContrib << "\n");
2187
4.86k
              EffSize -= ESContrib;
2188
4.86k
              IncomingPairs.insert(VP);
2189
4.86k
            }
2190
5.21k
          }
2191
5.51k
        }
2192
2.67k
2193
2.67k
        if (
!HasNontrivialInsts2.67k
)
{1.52k
2194
1.52k
          DEBUG(if (DebugPairSelection) dbgs() <<
2195
1.52k
                "\tNo non-trivial instructions in DAG;"
2196
1.52k
                " override to zero effective size\n");
2197
1.52k
          EffSize = 0;
2198
1.52k
        }
2199
526
      } else {
2200
526
        for (DenseSet<ValuePair>::iterator S = PrunedDAG.begin(),
2201
1.44k
             E = PrunedDAG.end(); 
S != E1.44k
;
++S914
)
2202
914
          EffSize += (int) getDepthFactor(S->first);
2203
526
      }
2204
3.20k
2205
3.20k
      DEBUG(if (DebugPairSelection)
2206
3.20k
             dbgs() << "BBV: found pruned DAG for pair {"
2207
3.20k
             << *IJ.first << " <-> " << *IJ.second << "} of depth " <<
2208
3.20k
             MaxDepth << " and size " << PrunedDAG.size() <<
2209
3.20k
            " (effective size: " << EffSize << ")\n");
2210
3.20k
      if (
((TTI && 3.20k
!UseChainDepthWithTI2.67k
) ||
2211
526
            MaxDepth >= Config.ReqChainDepth) &&
2212
2.76k
          
EffSize > 02.76k
&&
EffSize > BestEffSize291
)
{267
2213
267
        BestMaxDepth = MaxDepth;
2214
267
        BestEffSize = EffSize;
2215
267
        BestDAG = PrunedDAG;
2216
267
      }
2217
3.20k
    }
2218
1.22k
  }
2219
2220
  // Given the list of candidate pairs, this function selects those
2221
  // that will be fused into vector instructions.
2222
  void BBVectorize::choosePairs(
2223
                DenseMap<Value *, std::vector<Value *> > &CandidatePairs,
2224
                DenseSet<ValuePair> &CandidatePairsSet,
2225
                DenseMap<ValuePair, int> &CandidatePairCostSavings,
2226
                std::vector<Value *> &PairableInsts,
2227
                DenseSet<ValuePair> &FixedOrderPairs,
2228
                DenseMap<VPPair, unsigned> &PairConnectionTypes,
2229
                DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairs,
2230
                DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairDeps,
2231
                DenseSet<ValuePair> &PairableInstUsers,
2232
87
                DenseMap<Value *, Value *>& ChosenPairs) {
2233
87
    bool UseCycleCheck =
2234
87
     CandidatePairsSet.size() <= Config.MaxCandPairsForCycleCheck;
2235
87
2236
87
    DenseMap<Value *, std::vector<Value *> > CandidatePairs2;
2237
87
    for (DenseSet<ValuePair>::iterator I = CandidatePairsSet.begin(),
2238
7.94k
         E = CandidatePairsSet.end(); 
I != E7.94k
;
++I7.85k
)
{7.85k
2239
7.85k
      std::vector<Value *> &JJ = CandidatePairs2[I->second];
2240
7.85k
      if (
JJ.empty()7.85k
)
JJ.reserve(32)1.26k
;
2241
7.85k
      JJ.push_back(I->first);
2242
7.85k
    }
2243
87
2244
87
    DenseMap<ValuePair, std::vector<ValuePair> > PairableInstUserMap;
2245
87
    DenseSet<VPPair> PairableInstUserPairSet;
2246
87
    for (std::vector<Value *>::iterator I = PairableInsts.begin(),
2247
1.30k
         E = PairableInsts.end(); 
I != E1.30k
;
++I1.22k
)
{1.22k
2248
1.22k
      // The number of possible pairings for this variable:
2249
1.22k
      size_t NumChoices = CandidatePairs.lookup(*I).size();
2250
1.22k
      if (
!NumChoices1.22k
)
continue0
;
2251
1.22k
2252
1.22k
      std::vector<Value *> &JJ = CandidatePairs[*I];
2253
1.22k
2254
1.22k
      // The best pair to choose and its dag:
2255
1.22k
      size_t BestMaxDepth = 0;
2256
1.22k
      int BestEffSize = 0;
2257
1.22k
      DenseSet<ValuePair> BestDAG;
2258
1.22k
      findBestDAGFor(CandidatePairs, CandidatePairsSet,
2259
1.22k
                      CandidatePairCostSavings,
2260
1.22k
                      PairableInsts, FixedOrderPairs, PairConnectionTypes,
2261
1.22k
                      ConnectedPairs, ConnectedPairDeps,
2262
1.22k
                      PairableInstUsers, PairableInstUserMap,
2263
1.22k
                      PairableInstUserPairSet, ChosenPairs,
2264
1.22k
                      BestDAG, BestMaxDepth, BestEffSize, *I, JJ,
2265
1.22k
                      UseCycleCheck);
2266
1.22k
2267
1.22k
      if (BestDAG.empty())
2268
956
        continue;
2269
1.22k
2270
1.22k
      // A dag has been chosen (or not) at this point. If no dag was
2271
1.22k
      // chosen, then this instruction, I, cannot be paired (and is no longer
2272
1.22k
      // considered).
2273
1.22k
2274
266
      
DEBUG266
(dbgs() << "BBV: selected pairs in the best DAG for: "266
2275
266
                   << *cast<Instruction>(*I) << "\n");
2276
266
2277
266
      for (DenseSet<ValuePair>::iterator S = BestDAG.begin(),
2278
1.18k
           SE2 = BestDAG.end(); 
S != SE21.18k
;
++S922
)
{922
2279
922
        // Insert the members of this dag into the list of chosen pairs.
2280
922
        ChosenPairs.insert(ValuePair(S->first, S->second));
2281
922
        DEBUG(dbgs() << "BBV: selected pair: " << *S->first << " <-> " <<
2282
922
               *S->second << "\n");
2283
922
2284
922
        // Remove all candidate pairs that have values in the chosen dag.
2285
922
        std::vector<Value *> &KK = CandidatePairs[S->first];
2286
922
        for (std::vector<Value *>::iterator K = KK.begin(), KE = KK.end();
2287
12.5k
             
K != KE12.5k
;
++K11.6k
)
{11.6k
2288
11.6k
          if (*K == S->second)
2289
922
            continue;
2290
11.6k
2291
10.6k
          CandidatePairsSet.erase(ValuePair(S->first, *K));
2292
10.6k
        }
2293
922
2294
922
        std::vector<Value *> &LL = CandidatePairs2[S->second];
2295
922
        for (std::vector<Value *>::iterator L = LL.begin(), LE = LL.end();
2296
8.59k
             
L != LE8.59k
;
++L7.67k
)
{7.67k
2297
7.67k
          if (*L == S->first)
2298
922
            continue;
2299
7.67k
2300
6.75k
          CandidatePairsSet.erase(ValuePair(*L, S->second));
2301
6.75k
        }
2302
922
2303
922
        std::vector<Value *> &MM = CandidatePairs[S->second];
2304
922
        for (std::vector<Value *>::iterator M = MM.begin(), ME = MM.end();
2305
9.16k
             
M != ME9.16k
;
++M8.24k
)
{8.24k
2306
8.24k
          assert(*M != S->first && "Flipped pair in candidate list?");
2307
8.24k
          CandidatePairsSet.erase(ValuePair(S->second, *M));
2308
8.24k
        }
2309
922
2310
922
        std::vector<Value *> &NN = CandidatePairs2[S->first];
2311
922
        for (std::vector<Value *>::iterator N = NN.begin(), NE = NN.end();
2312
5.46k
             
N != NE5.46k
;
++N4.54k
)
{4.54k
2313
4.54k
          assert(*N != S->second && "Flipped pair in candidate list?");
2314
4.54k
          CandidatePairsSet.erase(ValuePair(*N, S->first));
2315
4.54k
        }
2316
922
      }
2317
266
    }
2318
87
2319
87
    DEBUG(dbgs() << "BBV: selected " << ChosenPairs.size() << " pairs.\n");
2320
87
  }
2321
2322
  std::string getReplacementName(Instruction *I, bool IsInput, unsigned o,
2323
1.27k
                     unsigned n = 0) {
2324
1.27k
    if (!I->hasName())
2325
294
      return "";
2326
1.27k
2327
976
    
return (I->getName() + (IsInput ? 976
".v.i"426
:
".v.r"550
) + utostr(o) +
2328
600
             (n > 0 ? 
"." + utostr(n)376
:
""600
)).str();
2329
1.27k
  }
2330
2331
  // Returns the value that is to be used as the pointer input to the vector
2332
  // instruction that fuses I with J.
2333
  Value *BBVectorize::getReplacementPointerInput(LLVMContext& Context,
2334
104
                     Instruction *I, Instruction *J, unsigned o) {
2335
104
    Value *IPtr, *JPtr;
2336
104
    unsigned IAlignment, JAlignment, IAddressSpace, JAddressSpace;
2337
104
    int64_t OffsetInElmts;
2338
104
2339
104
    // Note: the analysis might fail here, that is why the pair order has
2340
104
    // been precomputed (OffsetInElmts must be unused here).
2341
104
    (void) getPairPtrInfo(I, J, IPtr, JPtr, IAlignment, JAlignment,
2342
104
                          IAddressSpace, JAddressSpace,
2343
104
                          OffsetInElmts, false);
2344
104
2345
104
    // The pointer value is taken to be the one with the lowest offset.
2346
104
    Value *VPtr = IPtr;
2347
104
2348
104
    Type *ArgTypeI = IPtr->getType()->getPointerElementType();
2349
104
    Type *ArgTypeJ = JPtr->getType()->getPointerElementType();
2350
104
    Type *VArgType = getVecTypeForPair(ArgTypeI, ArgTypeJ);
2351
104
    Type *VArgPtrType
2352
104
      = PointerType::get(VArgType,
2353
104
                         IPtr->getType()->getPointerAddressSpace());
2354
104
    return new BitCastInst(VPtr, VArgPtrType, getReplacementName(I, true, o),
2355
104
                        /* insert before */ I);
2356
104
  }
2357
2358
  void BBVectorize::fillNewShuffleMask(LLVMContext& Context, Instruction *J,
2359
                     unsigned MaskOffset, unsigned NumInElem,
2360
                     unsigned NumInElem1, unsigned IdxOffset,
2361
16
                     std::vector<Constant*> &Mask) {
2362
16
    unsigned NumElem1 = J->getType()->getVectorNumElements();
2363
96
    for (unsigned v = 0; 
v < NumElem196
;
++v80
)
{80
2364
80
      int m = cast<ShuffleVectorInst>(J)->getMaskValue(v);
2365
80
      if (
m < 080
)
{0
2366
0
        Mask[v+MaskOffset] = UndefValue::get(Type::getInt32Ty(Context));
2367
80
      } else {
2368
80
        unsigned mm = m + (int) IdxOffset;
2369
80
        if (m >= (int) NumInElem1)
2370
36
          mm += (int) NumInElem;
2371
80
2372
80
        Mask[v+MaskOffset] =
2373
80
          ConstantInt::get(Type::getInt32Ty(Context), mm);
2374
80
      }
2375
80
    }
2376
16
  }
2377
2378
  // Returns the value that is to be used as the vector-shuffle mask to the
2379
  // vector instruction that fuses I with J.
2380
  Value *BBVectorize::getReplacementShuffleMask(LLVMContext& Context,
2381
8
                     Instruction *I, Instruction *J) {
2382
8
    // This is the shuffle mask. We need to append the second
2383
8
    // mask to the first, and the numbers need to be adjusted.
2384
8
2385
8
    Type *ArgTypeI = I->getType();
2386
8
    Type *ArgTypeJ = J->getType();
2387
8
    Type *VArgType = getVecTypeForPair(ArgTypeI, ArgTypeJ);
2388
8
2389
8
    unsigned NumElemI = ArgTypeI->getVectorNumElements();
2390
8
2391
8
    // Get the total number of elements in the fused vector type.
2392
8
    // By definition, this must equal the number of elements in
2393
8
    // the final mask.
2394
8
    unsigned NumElem = VArgType->getVectorNumElements();
2395
8
    std::vector<Constant*> Mask(NumElem);
2396
8
2397
8
    Type *OpTypeI = I->getOperand(0)->getType();
2398
8
    unsigned NumInElemI = OpTypeI->getVectorNumElements();
2399
8
    Type *OpTypeJ = J->getOperand(0)->getType();
2400
8
    unsigned NumInElemJ = OpTypeJ->getVectorNumElements();
2401
8
2402
8
    // The fused vector will be:
2403
8
    // -----------------------------------------------------
2404
8
    // | NumInElemI | NumInElemJ | NumInElemI | NumInElemJ |
2405
8
    // -----------------------------------------------------
2406
8
    // from which we'll extract NumElem total elements (where the first NumElemI
2407
8
    // of them come from the mask in I and the remainder come from the mask
2408
8
    // in J.
2409
8
2410
8
    // For the mask from the first pair...
2411
8
    fillNewShuffleMask(Context, I, 0,        NumInElemJ, NumInElemI,
2412
8
                       0,          Mask);
2413
8
2414
8
    // For the mask from the second pair...
2415
8
    fillNewShuffleMask(Context, J, NumElemI, NumInElemI, NumInElemJ,
2416
8
                       NumInElemI, Mask);
2417
8
2418
8
    return ConstantVector::get(Mask);
2419
8
  }
2420
2421
  bool BBVectorize::expandIEChain(LLVMContext& Context, Instruction *I,
2422
                                  Instruction *J, unsigned o, Value *&LOp,
2423
                                  unsigned numElemL,
2424
                                  Type *ArgTypeL, Type *ArgTypeH,
2425
4
                                  bool IBeforeJ, unsigned IdxOff) {
2426
4
    bool ExpandedIEChain = false;
2427
4
    if (InsertElementInst *
LIE4
= dyn_cast<InsertElementInst>(LOp))
{4
2428
4
      // If we have a pure insertelement chain, then this can be rewritten
2429
4
      // into a chain that directly builds the larger type.
2430
4
      if (
isPureIEChain(LIE)4
)
{4
2431
4
        SmallVector<Value *, 8> VectElemts(numElemL,
2432
4
          UndefValue::get(ArgTypeL->getScalarType()));
2433
4
        InsertElementInst *LIENext = LIE;
2434
8
        do {
2435
8
          unsigned Idx =
2436
8
            cast<ConstantInt>(LIENext->getOperand(2))->getSExtValue();
2437
8
          VectElemts[Idx] = LIENext->getOperand(1);
2438
8
        } while ((LIENext =
2439
8
                   dyn_cast<InsertElementInst>(LIENext->getOperand(0))));
2440
4
2441
4
        LIENext = nullptr;
2442
4
        Value *LIEPrev = UndefValue::get(ArgTypeH);
2443
12
        for (unsigned i = 0; 
i < numElemL12
;
++i8
)
{8
2444
8
          if (
isa<UndefValue>(VectElemts[i])8
)
continue0
;
2445
8
          LIENext = InsertElementInst::Create(LIEPrev, VectElemts[i],
2446
8
                             ConstantInt::get(Type::getInt32Ty(Context),
2447
8
                                              i + IdxOff),
2448
8
                             getReplacementName(IBeforeJ ? 
I8
:
J0
,
2449
8
                                                true, o, i+1));
2450
8
          LIENext->insertBefore(IBeforeJ ? 
J8
:
I0
);
2451
8
          LIEPrev = LIENext;
2452
8
        }
2453
4
2454
4
        LOp = LIENext ? 
(Value*) LIENext4
:
UndefValue::get(ArgTypeH)0
;
2455
4
        ExpandedIEChain = true;
2456
4
      }
2457
4
    }
2458
4
2459
4
    return ExpandedIEChain;
2460
4
  }
2461
2462
1.84k
  static unsigned getNumScalarElements(Type *Ty) {
2463
1.84k
    if (VectorType *VecTy = dyn_cast<VectorType>(Ty))
2464
147
      return VecTy->getNumElements();
2465
1.69k
    return 1;
2466
1.84k
  }
2467
2468
  // Returns the value to be used as the specified operand of the vector
2469
  // instruction that fuses I with J.
2470
  Value *BBVectorize::getReplacementInput(LLVMContext& Context, Instruction *I,
2471
612
                     Instruction *J, unsigned o, bool IBeforeJ) {
2472
612
    Value *CV0 = ConstantInt::get(Type::getInt32Ty(Context), 0);
2473
612
    Value *CV1 = ConstantInt::get(Type::getInt32Ty(Context), 1);
2474
612
2475
612
    // Compute the fused vector type for this operand
2476
612
    Type *ArgTypeI = I->getOperand(o)->getType();
2477
612
    Type *ArgTypeJ = J->getOperand(o)->getType();
2478
612
    VectorType *VArgType = getVecTypeForPair(ArgTypeI, ArgTypeJ);
2479
612
2480
612
    Instruction *L = I, *H = J;
2481
612
    Type *ArgTypeL = ArgTypeI, *ArgTypeH = ArgTypeJ;
2482
612
2483
612
    unsigned numElemL = getNumScalarElements(ArgTypeL);
2484
612
    unsigned numElemH = getNumScalarElements(ArgTypeH);
2485
612
2486
612
    Value *LOp = L->getOperand(o);
2487
612
    Value *HOp = H->getOperand(o);
2488
612
    unsigned numElem = VArgType->getNumElements();
2489
612
2490
612
    // First, we check if we can reuse the "original" vector outputs (if these
2491
612
    // exist). We might need a shuffle.
2492
612
    ExtractElementInst *LEE = dyn_cast<ExtractElementInst>(LOp);
2493
612
    ExtractElementInst *HEE = dyn_cast<ExtractElementInst>(HOp);
2494
612
    ShuffleVectorInst *LSV = dyn_cast<ShuffleVectorInst>(LOp);
2495
612
    ShuffleVectorInst *HSV = dyn_cast<ShuffleVectorInst>(HOp);
2496
612
2497
612
    // FIXME: If we're fusing shuffle instructions, then we can't apply this
2498
612
    // optimization. The input vectors to the shuffle might be a different
2499
612
    // length from the shuffle outputs. Unfortunately, the replacement
2500
612
    // shuffle mask has already been formed, and the mask entries are sensitive
2501
612
    // to the sizes of the inputs.
2502
612
    bool IsSizeChangeShuffle =
2503
612
      isa<ShuffleVectorInst>(L) &&
2504
16
        
(LOp->getType() != L->getType() || 16
HOp->getType() != H->getType()2
);
2505
612
2506
612
    if (
(LEE || 612
LSV282
) &&
(HEE || 349
HSV30
) &&
!IsSizeChangeShuffle336
)
{334
2507
334
      // We can have at most two unique vector inputs.
2508
334
      bool CanUseInputs = true;
2509
334
      Value *I1, *I2 = nullptr;
2510
334
      if (
LEE334
)
{317
2511
317
        I1 = LEE->getOperand(0);
2512
17
      } else {
2513
17
        I1 = LSV->getOperand(0);
2514
17
        I2 = LSV->getOperand(1);
2515
17
        if (
I2 == I1 || 17
isa<UndefValue>(I2)17
)
2516
16
          I2 = nullptr;
2517
17
      }
2518
334
2519
334
      if (
HEE334
)
{319
2520
319
        Value *I3 = HEE->getOperand(0);
2521
319
        if (
!I2 && 319
I3 != I1319
)
2522
10
          I2 = I3;
2523
309
        else 
if (309
I3 != I1 && 309
I3 != I20
)
2524
0
          CanUseInputs = false;
2525
15
      } else {
2526
15
        Value *I3 = HSV->getOperand(0);
2527
15
        if (
!I2 && 15
I3 != I114
)
2528
0
          I2 = I3;
2529
15
        else 
if (15
I3 != I1 && 15
I3 != I21
)
2530
1
          CanUseInputs = false;
2531
15
2532
15
        if (
CanUseInputs15
)
{14
2533
14
          Value *I4 = HSV->getOperand(1);
2534
14
          if (
!isa<UndefValue>(I4)14
)
{0
2535
0
            if (
!I2 && 0
I4 != I10
)
2536
0
              I2 = I4;
2537
0
            else 
if (0
I4 != I1 && 0
I4 != I20
)
2538
0
              CanUseInputs = false;
2539
0
          }
2540
14
        }
2541
15
      }
2542
334
2543
334
      if (
CanUseInputs334
)
{333
2544
333
        unsigned LOpElem =
2545
333
          cast<Instruction>(LOp)->getOperand(0)->getType()
2546
333
            ->getVectorNumElements();
2547
333
2548
333
        unsigned HOpElem =
2549
333
          cast<Instruction>(HOp)->getOperand(0)->getType()
2550
333
            ->getVectorNumElements();
2551
333
2552
333
        // We have one or two input vectors. We need to map each index of the
2553
333
        // operands to the index of the original vector.
2554
333
        SmallVector<std::pair<int, int>, 8>  II(numElem);
2555
724
        for (unsigned i = 0; 
i < numElemL724
;
++i391
)
{391
2556
391
          int Idx, INum;
2557
391
          if (
LEE391
)
{317
2558
317
            Idx =
2559
317
              cast<ConstantInt>(LEE->getOperand(1))->getSExtValue();
2560
317
            INum = LEE->getOperand(0) == I1 ? 
0317
:
10
;
2561
74
          } else {
2562
74
            Idx = LSV->getMaskValue(i);
2563
74
            if (
Idx < (int) LOpElem74
)
{74
2564
74
              INum = LSV->getOperand(0) == I1 ? 
074
:
10
;
2565
0
            } else {
2566
0
              Idx -= LOpElem;
2567
0
              INum = LSV->getOperand(1) == I1 ? 
00
:
10
;
2568
0
            }
2569
74
          }
2570
391
2571
391
          II[i] = std::pair<int, int>(Idx, INum);
2572
391
        }
2573
722
        for (unsigned i = 0; 
i < numElemH722
;
++i389
)
{389
2574
389
          int Idx, INum;
2575
389
          if (
HEE389
)
{319
2576
319
            Idx =
2577
319
              cast<ConstantInt>(HEE->getOperand(1))->getSExtValue();
2578
309
            INum = HEE->getOperand(0) == I1 ? 
0309
:
110
;
2579
70
          } else {
2580
70
            Idx = HSV->getMaskValue(i);
2581
70
            if (
Idx < (int) HOpElem70
)
{70
2582
70
              INum = HSV->getOperand(0) == I1 ? 
070
:
10
;
2583
0
            } else {
2584
0
              Idx -= HOpElem;
2585
0
              INum = HSV->getOperand(1) == I1 ? 
00
:
10
;
2586
0
            }
2587
70
          }
2588
389
2589
389
          II[i + numElemL] = std::pair<int, int>(Idx, INum);
2590
389
        }
2591
333
2592
333
        // We now have an array which tells us from which index of which
2593
333
        // input vector each element of the operand comes.
2594
333
        VectorType *I1T = cast<VectorType>(I1->getType());
2595
333
        unsigned I1Elem = I1T->getNumElements();
2596
333
2597
333
        if (
!I2333
)
{323
2598
323
          // In this case there is only one underlying vector input. Check for
2599
323
          // the trivial case where we can use the input directly.
2600
323
          if (
I1Elem == numElem323
)
{323
2601
323
            bool ElemInOrder = true;
2602
1.05k
            for (unsigned i = 0; 
i < numElem1.05k
;
++i729
)
{739
2603
739
              if (
II[i].first != (int) i && 739
II[i].first != -110
)
{10
2604
10
                ElemInOrder = false;
2605
10
                break;
2606
10
              }
2607
739
            }
2608
323
2609
323
            if (ElemInOrder)
2610
313
              return I1;
2611
323
          }
2612
323
2613
323
          // A shuffle is needed.
2614
10
          std::vector<Constant *> Mask(numElem);
2615
44
          for (unsigned i = 0; 
i < numElem44
;
++i34
)
{34
2616
34
            int Idx = II[i].first;
2617
34
            if (Idx == -1)
2618
0
              Mask[i] = UndefValue::get(Type::getInt32Ty(Context));
2619
34
            else
2620
34
              Mask[i] = ConstantInt::get(Type::getInt32Ty(Context), Idx);
2621
34
          }
2622
10
2623
10
          Instruction *S =
2624
10
            new ShuffleVectorInst(I1, UndefValue::get(I1T),
2625
10
                                  ConstantVector::get(Mask),
2626
10
                                  getReplacementName(IBeforeJ ? 
I10
:
J0
,
2627
10
                                                     true, o));
2628
10
          S->insertBefore(IBeforeJ ? 
J10
:
I0
);
2629
10
          return S;
2630
323
        }
2631
333
2632
10
        VectorType *I2T = cast<VectorType>(I2->getType());
2633
10
        unsigned I2Elem = I2T->getNumElements();
2634
10
2635
10
        // This input comes from two distinct vectors. The first step is to
2636
10
        // make sure that both vectors are the same length. If not, the
2637
10
        // smaller one will need to grow before they can be shuffled together.
2638
10
        if (
I1Elem < I2Elem10
)
{0
2639
0
          std::vector<Constant *> Mask(I2Elem);
2640
0
          unsigned v = 0;
2641
0
          for (; 
v < I1Elem0
;
++v0
)
2642
0
            Mask[v] = ConstantInt::get(Type::getInt32Ty(Context), v);
2643
0
          for (; 
v < I2Elem0
;
++v0
)
2644
0
            Mask[v] = UndefValue::get(Type::getInt32Ty(Context));
2645
0
2646
0
          Instruction *NewI1 =
2647
0
            new ShuffleVectorInst(I1, UndefValue::get(I1T),
2648
0
                                  ConstantVector::get(Mask),
2649
0
                                  getReplacementName(IBeforeJ ? 
I0
:
J0
,
2650
0
                                                     true, o, 1));
2651
0
          NewI1->insertBefore(IBeforeJ ? 
J0
:
I0
);
2652
0
          I1 = NewI1;
2653
0
          I1Elem = I2Elem;
2654
10
        } else 
if (10
I1Elem > I2Elem10
)
{0
2655
0
          std::vector<Constant *> Mask(I1Elem);
2656
0
          unsigned v = 0;
2657
0
          for (; 
v < I2Elem0
;
++v0
)
2658
0
            Mask[v] = ConstantInt::get(Type::getInt32Ty(Context), v);
2659
0
          for (; 
v < I1Elem0
;
++v0
)
2660
0
            Mask[v] = UndefValue::get(Type::getInt32Ty(Context));
2661
0
2662
0
          Instruction *NewI2 =
2663
0
            new ShuffleVectorInst(I2, UndefValue::get(I2T),
2664
0
                                  ConstantVector::get(Mask),
2665
0
                                  getReplacementName(IBeforeJ ? 
I0
:
J0
,
2666
0
                                                     true, o, 1));
2667
0
          NewI2->insertBefore(IBeforeJ ? 
J0
:
I0
);
2668
0
          I2 = NewI2;
2669
0
        }
2670
10
2671
10
        // Now that both I1 and I2 are the same length we can shuffle them
2672
10
        // together (and use the result).
2673
10
        std::vector<Constant *> Mask(numElem);
2674
30
        for (unsigned v = 0; 
v < numElem30
;
++v20
)
{20
2675
20
          if (
II[v].first == -120
)
{0
2676
0
            Mask[v] = UndefValue::get(Type::getInt32Ty(Context));
2677
20
          } else {
2678
20
            int Idx = II[v].first + II[v].second * I1Elem;
2679
20
            Mask[v] = ConstantInt::get(Type::getInt32Ty(Context), Idx);
2680
20
          }
2681
20
        }
2682
10
2683
10
        Instruction *NewOp =
2684
10
          new ShuffleVectorInst(I1, I2, ConstantVector::get(Mask),
2685
10
                                getReplacementName(IBeforeJ ? 
I10
:
J0
, true, o));
2686
10
        NewOp->insertBefore(IBeforeJ ? 
J10
:
I0
);
2687
10
        return NewOp;
2688
333
      }
2689
334
    }
2690
612
2691
279
    Type *ArgType = ArgTypeL;
2692
279
    if (
numElemL < numElemH279
)
{0
2693
0
      if (
numElemL == 1 && 0
expandIEChain(Context, I, J, o, HOp, numElemH,0
2694
0
                                         ArgTypeL, VArgType, IBeforeJ, 1)) {
2695
0
        // This is another short-circuit case: we're combining a scalar into
2696
0
        // a vector that is formed by an IE chain. We've just expanded the IE
2697
0
        // chain, now insert the scalar and we're done.
2698
0
2699
0
        Instruction *S = InsertElementInst::Create(HOp, LOp, CV0,
2700
0
                           getReplacementName(IBeforeJ ? 
I0
:
J0
, true, o));
2701
0
        S->insertBefore(IBeforeJ ? 
J0
:
I0
);
2702
0
        return S;
2703
0
      } else 
if (0
!expandIEChain(Context, I, J, o, LOp, numElemL, ArgTypeL,0
2704
0
                                ArgTypeH, IBeforeJ)) {
2705
0
        // The two vector inputs to the shuffle must be the same length,
2706
0
        // so extend the smaller vector to be the same length as the larger one.
2707
0
        Instruction *NLOp;
2708
0
        if (
numElemL > 10
)
{0
2709
0
2710
0
          std::vector<Constant *> Mask(numElemH);
2711
0
          unsigned v = 0;
2712
0
          for (; 
v < numElemL0
;
++v0
)
2713
0
            Mask[v] = ConstantInt::get(Type::getInt32Ty(Context), v);
2714
0
          for (; 
v < numElemH0
;
++v0
)
2715
0
            Mask[v] = UndefValue::get(Type::getInt32Ty(Context));
2716
0
2717
0
          NLOp = new ShuffleVectorInst(LOp, UndefValue::get(ArgTypeL),
2718
0
                                       ConstantVector::get(Mask),
2719
0
                                       getReplacementName(IBeforeJ ? 
I0
:
J0
,
2720
0
                                                          true, o, 1));
2721
0
        } else {
2722
0
          NLOp = InsertElementInst::Create(UndefValue::get(ArgTypeH), LOp, CV0,
2723
0
                                           getReplacementName(IBeforeJ ? 
I0
:
J0
,
2724
0
                                                              true, o, 1));
2725
0
        }
2726
0
2727
0
        NLOp->insertBefore(IBeforeJ ? 
J0
:
I0
);
2728
0
        LOp = NLOp;
2729
0
      }
2730
0
2731
0
      ArgType = ArgTypeH;
2732
279
    } else 
if (279
numElemL > numElemH279
)
{4
2733
4
      if (
numElemH == 1 && 4
expandIEChain(Context, I, J, o, LOp, numElemL,4
2734
4
                                         ArgTypeH, VArgType, IBeforeJ)) {
2735
4
        Instruction *S =
2736
4
          InsertElementInst::Create(LOp, HOp,
2737
4
                                    ConstantInt::get(Type::getInt32Ty(Context),
2738
4
                                                     numElemL),
2739
4
                                    getReplacementName(IBeforeJ ? 
I4
:
J0
,
2740
4
                                                       true, o));
2741
4
        S->insertBefore(IBeforeJ ? 
J4
:
I0
);
2742
4
        return S;
2743
0
      } else 
if (0
!expandIEChain(Context, I, J, o, HOp, numElemH, ArgTypeH,0
2744
0
                                ArgTypeL, IBeforeJ)) {
2745
0
        Instruction *NHOp;
2746
0
        if (
numElemH > 10
)
{0
2747
0
          std::vector<Constant *> Mask(numElemL);
2748
0
          unsigned v = 0;
2749
0
          for (; 
v < numElemH0
;
++v0
)
2750
0
            Mask[v] = ConstantInt::get(Type::getInt32Ty(Context), v);
2751
0
          for (; 
v < numElemL0
;
++v0
)
2752
0
            Mask[v] = UndefValue::get(Type::getInt32Ty(Context));
2753
0
2754
0
          NHOp = new ShuffleVectorInst(HOp, UndefValue::get(ArgTypeH),
2755
0
                                       ConstantVector::get(Mask),
2756
0
                                       getReplacementName(IBeforeJ ? 
I0
:
J0
,
2757
0
                                                          true, o, 1));
2758
0
        } else {
2759
0
          NHOp = InsertElementInst::Create(UndefValue::get(ArgTypeL), HOp, CV0,
2760
0
                                           getReplacementName(IBeforeJ ? 
I0
:
J0
,
2761
0
                                                              true, o, 1));
2762
0
        }
2763
0
2764
0
        NHOp->insertBefore(IBeforeJ ? 
J0
:
I0
);
2765
0
        HOp = NHOp;
2766
0
      }
2767
4
    }
2768
279
2769
275
    
if (275
ArgType->isVectorTy()275
)
{38
2770
38
      unsigned numElem = VArgType->getVectorNumElements();
2771
38
      std::vector<Constant*> Mask(numElem);
2772
266
      for (unsigned v = 0; 
v < numElem266
;
++v228
)
{228
2773
228
        unsigned Idx = v;
2774
228
        // If the low vector was expanded, we need to skip the extra
2775
228
        // undefined entries.
2776
228
        if (
v >= numElemL && 228
numElemH > numElemL114
)
2777
0
          Idx += (numElemH - numElemL);
2778
228
        Mask[v] = ConstantInt::get(Type::getInt32Ty(Context), Idx);
2779
228
      }
2780
38
2781
38
      Instruction *BV = new ShuffleVectorInst(LOp, HOp,
2782
38
                          ConstantVector::get(Mask),
2783
35
                          getReplacementName(IBeforeJ ? 
I35
:
J3
, true, o));
2784
35
      BV->insertBefore(IBeforeJ ? 
J35
:
I3
);
2785
38
      return BV;
2786
38
    }
2787
275
2788
237
    Instruction *BV1 = InsertElementInst::Create(
2789
237
                                          UndefValue::get(VArgType), LOp, CV0,
2790
224
                                          getReplacementName(IBeforeJ ? 
I224
:
J13
,
2791
237
                                                             true, o, 1));
2792
224
    BV1->insertBefore(IBeforeJ ? 
J224
:
I13
);
2793
237
    Instruction *BV2 = InsertElementInst::Create(BV1, HOp, CV1,
2794
224
                                          getReplacementName(IBeforeJ ? 
I224
:
J13
,
2795
237
                                                             true, o, 2));
2796
224
    BV2->insertBefore(IBeforeJ ? 
J224
:
I13
);
2797
237
    return BV2;
2798
275
  }
2799
2800
  // This function creates an array of values that will be used as the inputs
2801
  // to the vector instruction that fuses I with J.
2802
  void BBVectorize::getReplacementInputsForPair(LLVMContext& Context,
2803
                     Instruction *I, Instruction *J,
2804
                     SmallVectorImpl<Value *> &ReplacedOperands,
2805
381
                     bool IBeforeJ) {
2806
381
    unsigned NumOperands = I->getNumOperands();
2807
381
2808
1.12k
    for (unsigned p = 0, o = NumOperands-1; 
p < NumOperands1.12k
;
++p, --o744
)
{744
2809
744
      // Iterate backward so that we look at the store pointer
2810
744
      // first and know whether or not we need to flip the inputs.
2811
744
2812
744
      if (
isa<LoadInst>(I) || 744
(o == 1 && 710
isa<StoreInst>(I)340
))
{104
2813
104
        // This is the pointer for a load/store instruction.
2814
104
        ReplacedOperands[o] = getReplacementPointerInput(Context, I, J, o);
2815
104
        continue;
2816
640
      } else 
if (640
isa<CallInst>(I)640
)
{44
2817
44
        Function *F = cast<CallInst>(I)->getCalledFunction();
2818
44
        Intrinsic::ID IID = F->getIntrinsicID();
2819
44
        if (
o == NumOperands-144
)
{17
2820
17
          BasicBlock &BB = *I->getParent();
2821
17
2822
17
          Module *M = BB.getParent()->getParent();
2823
17
          Type *ArgTypeI = I->getType();
2824
17
          Type *ArgTypeJ = J->getType();
2825
17
          Type *VArgType = getVecTypeForPair(ArgTypeI, ArgTypeJ);
2826
17
2827
17
          ReplacedOperands[o] = Intrinsic::getDeclaration(M, IID, VArgType);
2828
17
          continue;
2829
27
        } else 
if (27
(IID == Intrinsic::powi || 27
IID == Intrinsic::ctlz25
||
2830
23
                    
IID == Intrinsic::cttz23
) &&
o == 16
)
{3
2831
3
          // The second argument of powi/ctlz/cttz is a single integer/constant
2832
3
          // and we've already checked that both arguments are equal.
2833
3
          // As a result, we just keep I's second argument.
2834
3
          ReplacedOperands[o] = I->getOperand(o);
2835
3
          continue;
2836
3
        }
2837
596
      } else 
if (596
isa<ShuffleVectorInst>(I) && 596
o == NumOperands-124
)
{8
2838
8
        ReplacedOperands[o] = getReplacementShuffleMask(Context, I, J);
2839
8
        continue;
2840
8
      }
2841
744
2842
612
      ReplacedOperands[o] = getReplacementInput(Context, I, J, o, IBeforeJ);
2843
612
    }
2844
381
  }
2845
2846
  // This function creates two values that represent the outputs of the
2847
  // original I and J instructions. These are generally vector shuffles
2848
  // or extracts. In many cases, these will end up being unused and, thus,
2849
  // eliminated by later passes.
2850
  void BBVectorize::replaceOutputsOfPair(LLVMContext& Context, Instruction *I,
2851
                     Instruction *J, Instruction *K,
2852
                     Instruction *&InsertionPt,
2853
381
                     Instruction *&K1, Instruction *&K2) {
2854
381
    if (isa<StoreInst>(I))
2855
70
      return;
2856
381
2857
311
    Type *IType = I->getType();
2858
311
    Type *JType = J->getType();
2859
311
2860
311
    VectorType *VType = getVecTypeForPair(IType, JType);
2861
311
    unsigned numElem = VType->getNumElements();
2862
311
2863
311
    unsigned numElemI = getNumScalarElements(IType);
2864
311
    unsigned numElemJ = getNumScalarElements(JType);
2865
311
2866
311
    if (
IType->isVectorTy()311
)
{20
2867
20
      std::vector<Constant *> Mask1(numElemI), Mask2(numElemI);
2868
106
      for (unsigned v = 0; 
v < numElemI106
;
++v86
)
{86
2869
86
        Mask1[v] = ConstantInt::get(Type::getInt32Ty(Context), v);
2870
86
        Mask2[v] = ConstantInt::get(Type::getInt32Ty(Context), numElemJ + v);
2871
86
      }
2872
20
2873
20
      K1 = new ShuffleVectorInst(K, UndefValue::get(VType),
2874
20
                                 ConstantVector::get(Mask1),
2875
20
                                 getReplacementName(K, false, 1));
2876
291
    } else {
2877
291
      Value *CV0 = ConstantInt::get(Type::getInt32Ty(Context), 0);
2878
291
      K1 = ExtractElementInst::Create(K, CV0, getReplacementName(K, false, 1));
2879
291
    }
2880
311
2881
311
    if (
JType->isVectorTy()311
)
{17
2882
17
      std::vector<Constant *> Mask1(numElemJ), Mask2(numElemJ);
2883
97
      for (unsigned v = 0; 
v < numElemJ97
;
++v80
)
{80
2884
80
        Mask1[v] = ConstantInt::get(Type::getInt32Ty(Context), v);
2885
80
        Mask2[v] = ConstantInt::get(Type::getInt32Ty(Context), numElemI + v);
2886
80
      }
2887
17
2888
17
      K2 = new ShuffleVectorInst(K, UndefValue::get(VType),
2889
17
                                 ConstantVector::get(Mask2),
2890
17
                                 getReplacementName(K, false, 2));
2891
294
    } else {
2892
294
      Value *CV1 = ConstantInt::get(Type::getInt32Ty(Context), numElem - 1);
2893
294
      K2 = ExtractElementInst::Create(K, CV1, getReplacementName(K, false, 2));
2894
294
    }
2895
311
2896
311
    K1->insertAfter(K);
2897
311
    K2->insertAfter(K1);
2898
311
    InsertionPt = K2;
2899
311
  }
2900
2901
  // Move all uses of the function I (including pairing-induced uses) after J.
2902
  bool BBVectorize::canMoveUsesOfIAfterJ(BasicBlock &BB,
2903
                     DenseSet<ValuePair> &LoadMoveSetPairs,
2904
381
                     Instruction *I, Instruction *J) {
2905
381
    // Skip to the first instruction past I.
2906
381
    BasicBlock::iterator L = std::next(BasicBlock::iterator(I));
2907
381
2908
381
    DenseSet<Value *> Users;
2909
381
    AliasSetTracker WriteSet(*AA);
2910
381
    if (
I->mayWriteToMemory()381
)
WriteSet.add(I)70
;
2911
381
2912
2.36k
    for (; 
cast<Instruction>(L) != J2.36k
;
++L1.98k
)
2913
1.98k
      (void)trackUsesOfI(Users, WriteSet, I, &*L, true, &LoadMoveSetPairs);
2914
381
2915
381
    assert(cast<Instruction>(L) == J &&
2916
381
      "Tracking has not proceeded far enough to check for dependencies");
2917
381
    // If J is now in the use set of I, then trackUsesOfI will return true
2918
381
    // and we have a dependency cycle (and the fusing operation must abort).
2919
381
    return !trackUsesOfI(Users, WriteSet, I, J, true, &LoadMoveSetPairs);
2920
381
  }
2921
2922
  // Move all uses of the function I (including pairing-induced uses) after J.
2923
  void BBVectorize::moveUsesOfIAfterJ(BasicBlock &BB,
2924
                     DenseSet<ValuePair> &LoadMoveSetPairs,
2925
                     Instruction *&InsertionPt,
2926
381
                     Instruction *I, Instruction *J) {
2927
381
    // Skip to the first instruction past I.
2928
381
    BasicBlock::iterator L = std::next(BasicBlock::iterator(I));
2929
381
2930
381
    DenseSet<Value *> Users;
2931
381
    AliasSetTracker WriteSet(*AA);
2932
381
    if (
I->mayWriteToMemory()381
)
WriteSet.add(I)70
;
2933
381
2934
2.91k
    for (; 
cast<Instruction>(L) != J2.91k
;)
{2.53k
2935
2.53k
      if (
trackUsesOfI(Users, WriteSet, I, &*L, true, &LoadMoveSetPairs)2.53k
)
{421
2936
421
        // Move this instruction
2937
421
        Instruction *InstToMove = &*L++;
2938
421
2939
421
        DEBUG(dbgs() << "BBV: moving: " << *InstToMove <<
2940
421
                        " to after " << *InsertionPt << "\n");
2941
421
        InstToMove->removeFromParent();
2942
421
        InstToMove->insertAfter(InsertionPt);
2943
421
        InsertionPt = InstToMove;
2944
2.11k
      } else {
2945
2.11k
        ++L;
2946
2.11k
      }
2947
2.53k
    }
2948
381
  }
2949
2950
  // Collect all load instruction that are in the move set of a given first
2951
  // pair member.  These loads depend on the first instruction, I, and so need
2952
  // to be moved after J (the second instruction) when the pair is fused.
2953
  void BBVectorize::collectPairLoadMoveSet(BasicBlock &BB,
2954
                     DenseMap<Value *, Value *> &ChosenPairs,
2955
                     DenseMap<Value *, std::vector<Value *> > &LoadMoveSet,
2956
                     DenseSet<ValuePair> &LoadMoveSetPairs,
2957
558
                     Instruction *I) {
2958
558
    // Skip to the first instruction past I.
2959
558
    BasicBlock::iterator L = std::next(BasicBlock::iterator(I));
2960
558
2961
558
    DenseSet<Value *> Users;
2962
558
    AliasSetTracker WriteSet(*AA);
2963
558
    if (
I->mayWriteToMemory()558
)
WriteSet.add(I)108
;
2964
558
2965
558
    // Note: We cannot end the loop when we reach J because J could be moved
2966
558
    // farther down the use chain by another instruction pairing. Also, J
2967
558
    // could be before I if this is an inverted input.
2968
26.6k
    for (BasicBlock::iterator E = BB.end(); 
L != E26.6k
;
++L26.0k
)
{26.0k
2969
26.0k
      if (
trackUsesOfI(Users, WriteSet, I, &*L)26.0k
)
{3.04k
2970
3.04k
        if (
L->mayReadFromMemory()3.04k
)
{265
2971
265
          LoadMoveSet[&*L].push_back(I);
2972
265
          LoadMoveSetPairs.insert(ValuePair(&*L, I));
2973
265
        }
2974
3.04k
      }
2975
26.0k
    }
2976
558
  }
2977
2978
  // In cases where both load/stores and the computation of their pointers
2979
  // are chosen for vectorization, we can end up in a situation where the
2980
  // aliasing analysis starts returning different query results as the
2981
  // process of fusing instruction pairs continues. Because the algorithm
2982
  // relies on finding the same use dags here as were found earlier, we'll
2983
  // need to precompute the necessary aliasing information here and then
2984
  // manually update it during the fusion process.
2985
  void BBVectorize::collectLoadMoveSet(BasicBlock &BB,
2986
                     std::vector<Value *> &PairableInsts,
2987
                     DenseMap<Value *, Value *> &ChosenPairs,
2988
                     DenseMap<Value *, std::vector<Value *> > &LoadMoveSet,
2989
70
                     DenseSet<ValuePair> &LoadMoveSetPairs) {
2990
70
    for (std::vector<Value *>::iterator PI = PairableInsts.begin(),
2991
960
         PIE = PairableInsts.end(); 
PI != PIE960
;
++PI890
)
{890
2992
890
      DenseMap<Value *, Value *>::iterator P = ChosenPairs.find(*PI);
2993
890
      if (
P == ChosenPairs.end()890
)
continue332
;
2994
890
2995
558
      Instruction *I = cast<Instruction>(P->first);
2996
558
      collectPairLoadMoveSet(BB, ChosenPairs, LoadMoveSet,
2997
558
                             LoadMoveSetPairs, I);
2998
558
    }
2999
70
  }
3000
3001
  // This function fuses the chosen instruction pairs into vector instructions,
3002
  // taking care preserve any needed scalar outputs and, then, it reorders the
3003
  // remaining instructions as needed (users of the first member of the pair
3004
  // need to be moved to after the location of the second member of the pair
3005
  // because the vector instruction is inserted in the location of the pair's
3006
  // second member).
3007
  void BBVectorize::fuseChosenPairs(BasicBlock &BB,
3008
             std::vector<Value *> &PairableInsts,
3009
             DenseMap<Value *, Value *> &ChosenPairs,
3010
             DenseSet<ValuePair> &FixedOrderPairs,
3011
             DenseMap<VPPair, unsigned> &PairConnectionTypes,
3012
             DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairs,
3013
70
             DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairDeps) {
3014
70
    LLVMContext& Context = BB.getContext();
3015
70
3016
70
    // During the vectorization process, the order of the pairs to be fused
3017
70
    // could be flipped. So we'll add each pair, flipped, into the ChosenPairs
3018
70
    // list. After a pair is fused, the flipped pair is removed from the list.
3019
70
    DenseSet<ValuePair> FlippedPairs;
3020
70
    for (DenseMap<Value *, Value *>::iterator P = ChosenPairs.begin(),
3021
457
         E = ChosenPairs.end(); 
P != E457
;
++P387
)
3022
387
      FlippedPairs.insert(ValuePair(P->second, P->first));
3023
70
    for (DenseSet<ValuePair>::iterator P = FlippedPairs.begin(),
3024
457
         E = FlippedPairs.end(); 
P != E457
;
++P387
)
3025
387
      ChosenPairs.insert(*P);
3026
70
3027
70
    DenseMap<Value *, std::vector<Value *> > LoadMoveSet;
3028
70
    DenseSet<ValuePair> LoadMoveSetPairs;
3029
70
    collectLoadMoveSet(BB, PairableInsts, ChosenPairs,
3030
70
                       LoadMoveSet, LoadMoveSetPairs);
3031
70
3032
70
    DEBUG(dbgs() << "BBV: initial: \n" << BB << "\n");
3033
70
3034
3.38k
    for (BasicBlock::iterator PI = BB.getFirstInsertionPt(); 
PI != BB.end()3.38k
;)
{3.31k
3035
3.31k
      DenseMap<Value *, Value *>::iterator P = ChosenPairs.find(&*PI);
3036
3.31k
      if (
P == ChosenPairs.end()3.31k
)
{2.92k
3037
2.92k
        ++PI;
3038
2.92k
        continue;
3039
2.92k
      }
3040
3.31k
3041
393
      
if (393
getDepthFactor(P->first) == 0393
)
{12
3042
12
        // These instructions are not really fused, but are tracked as though
3043
12
        // they are. Any case in which it would be interesting to fuse them
3044
12
        // will be taken care of by InstCombine.
3045
12
        --NumFusedOps;
3046
12
        ++PI;
3047
12
        continue;
3048
12
      }
3049
393
3050
381
      Instruction *I = cast<Instruction>(P->first),
3051
381
        *J = cast<Instruction>(P->second);
3052
381
3053
381
      DEBUG(dbgs() << "BBV: fusing: " << *I <<
3054
381
             " <-> " << *J << "\n");
3055
381
3056
381
      // Remove the pair and flipped pair from the list.
3057
381
      DenseMap<Value *, Value *>::iterator FP = ChosenPairs.find(P->second);
3058
381
      assert(FP != ChosenPairs.end() && "Flipped pair not found in list");
3059
381
      ChosenPairs.erase(FP);
3060
381
      ChosenPairs.erase(P);
3061
381
3062
381
      if (
!canMoveUsesOfIAfterJ(BB, LoadMoveSetPairs, I, J)381
)
{0
3063
0
        DEBUG(dbgs() << "BBV: fusion of: " << *I <<
3064
0
               " <-> " << *J <<
3065
0
               " aborted because of non-trivial dependency cycle\n");
3066
0
        --NumFusedOps;
3067
0
        ++PI;
3068
0
        continue;
3069
0
      }
3070
381
3071
381
      // If the pair must have the other order, then flip it.
3072
381
      bool FlipPairOrder = FixedOrderPairs.count(ValuePair(J, I));
3073
381
      if (
!FlipPairOrder && 381
!FixedOrderPairs.count(ValuePair(I, J))372
)
{277
3074
277
        // This pair does not have a fixed order, and so we might want to
3075
277
        // flip it if that will yield fewer shuffles. We count the number
3076
277
        // of dependencies connected via swaps, and those directly connected,
3077
277
        // and flip the order if the number of swaps is greater.
3078
277
        bool OrigOrder = true;
3079
277
        DenseMap<ValuePair, std::vector<ValuePair> >::iterator IJ =
3080
277
          ConnectedPairDeps.find(ValuePair(I, J));
3081
277
        if (
IJ == ConnectedPairDeps.end()277
)
{57
3082
57
          IJ = ConnectedPairDeps.find(ValuePair(J, I));
3083
57
          OrigOrder = false;
3084
57
        }
3085
277
3086
277
        if (
IJ != ConnectedPairDeps.end()277
)
{221
3087
221
          unsigned NumDepsDirect = 0, NumDepsSwap = 0;
3088
221
          for (std::vector<ValuePair>::iterator T = IJ->second.begin(),
3089
584
               TE = IJ->second.end(); 
T != TE584
;
++T363
)
{363
3090
363
            VPPair Q(IJ->first, *T);
3091
363
            DenseMap<VPPair, unsigned>::iterator R =
3092
363
              PairConnectionTypes.find(VPPair(Q.second, Q.first));
3093
363
            assert(R != PairConnectionTypes.end() &&
3094
363
                   "Cannot find pair connection type");
3095
363
            if (R->second == PairConnectionDirect)
3096
341
              ++NumDepsDirect;
3097
22
            else 
if (22
R->second == PairConnectionSwap22
)
3098
8
              ++NumDepsSwap;
3099
363
          }
3100
221
3101
221
          if (!OrigOrder)
3102
1
            std::swap(NumDepsDirect, NumDepsSwap);
3103
221
3104
221
          if (
NumDepsSwap > NumDepsDirect221
)
{8
3105
8
            FlipPairOrder = true;
3106
8
            DEBUG(dbgs() << "BBV: reordering pair: " << *I <<
3107
8
                            " <-> " << *J << "\n");
3108
8
          }
3109
221
        }
3110
277
      }
3111
381
3112
381
      Instruction *L = I, *H = J;
3113
381
      if (FlipPairOrder)
3114
17
        std::swap(H, L);
3115
381
3116
381
      // If the pair being fused uses the opposite order from that in the pair
3117
381
      // connection map, then we need to flip the types.
3118
381
      DenseMap<ValuePair, std::vector<ValuePair> >::iterator HL =
3119
381
        ConnectedPairs.find(ValuePair(H, L));
3120
381
      if (HL != ConnectedPairs.end())
3121
2
        for (std::vector<ValuePair>::iterator T = HL->second.begin(),
3122
4
             TE = HL->second.end(); 
T != TE4
;
++T2
)
{2
3123
2
          VPPair Q(HL->first, *T);
3124
2
          DenseMap<VPPair, unsigned>::iterator R = PairConnectionTypes.find(Q);
3125
2
          assert(R != PairConnectionTypes.end() &&
3126
2
                 "Cannot find pair connection type");
3127
2
          if (R->second == PairConnectionDirect)
3128
1
            R->second = PairConnectionSwap;
3129
1
          else 
if (1
R->second == PairConnectionSwap1
)
3130
0
            R->second = PairConnectionDirect;
3131
2
        }
3132
381
3133
381
      bool LBeforeH = !FlipPairOrder;
3134
381
      unsigned NumOperands = I->getNumOperands();
3135
381
      SmallVector<Value *, 3> ReplacedOperands(NumOperands);
3136
381
      getReplacementInputsForPair(Context, L, H, ReplacedOperands,
3137
381
                                  LBeforeH);
3138
381
3139
381
      // Make a copy of the original operation, change its type to the vector
3140
381
      // type and replace its operands with the vector operands.
3141
381
      Instruction *K = L->clone();
3142
381
      if (L->hasName())
3143
275
        K->takeName(L);
3144
106
      else 
if (106
H->hasName()106
)
3145
0
        K->takeName(H);
3146
381
3147
381
      if (auto 
CS381
= CallSite(K))
{17
3148
17
        SmallVector<Type *, 3> Tys;
3149
17
        FunctionType *Old = CS.getFunctionType();
3150
17
        unsigned NumOld = Old->getNumParams();
3151
17
        assert(NumOld <= ReplacedOperands.size());
3152
44
        for (unsigned i = 0; 
i != NumOld44
;
++i27
)
3153
27
          Tys.push_back(ReplacedOperands[i]->getType());
3154
17
        CS.mutateFunctionType(
3155
17
            FunctionType::get(getVecTypeForPair(L->getType(), H->getType()),
3156
17
                              Tys, Old->isVarArg()));
3157
364
      } else 
if (364
!isa<StoreInst>(K)364
)
3158
294
        K->mutateType(getVecTypeForPair(L->getType(), H->getType()));
3159
381
3160
381
      unsigned KnownIDs[] = {LLVMContext::MD_tbaa, LLVMContext::MD_alias_scope,
3161
381
                             LLVMContext::MD_noalias, LLVMContext::MD_fpmath,
3162
381
                             LLVMContext::MD_invariant_group};
3163
381
      combineMetadata(K, H, KnownIDs);
3164
381
      K->andIRFlags(H);
3165
381
3166
1.12k
      for (unsigned o = 0; 
o < NumOperands1.12k
;
++o744
)
3167
744
        K->setOperand(o, ReplacedOperands[o]);
3168
381
3169
381
      K->insertAfter(J);
3170
381
3171
381
      // Instruction insertion point:
3172
381
      Instruction *InsertionPt = K;
3173
381
      Instruction *K1 = nullptr, *K2 = nullptr;
3174
381
      replaceOutputsOfPair(Context, L, H, K, InsertionPt, K1, K2);
3175
381
3176
381
      // The use dag of the first original instruction must be moved to after
3177
381
      // the location of the second instruction. The entire use dag of the
3178
381
      // first instruction is disjoint from the input dag of the second
3179
381
      // (by definition), and so commutes with it.
3180
381
3181
381
      moveUsesOfIAfterJ(BB, LoadMoveSetPairs, InsertionPt, I, J);
3182
381
3183
381
      if (
!isa<StoreInst>(I)381
)
{311
3184
311
        L->replaceAllUsesWith(K1);
3185
311
        H->replaceAllUsesWith(K2);
3186
311
      }
3187
381
3188
381
      // Instructions that may read from memory may be in the load move set.
3189
381
      // Once an instruction is fused, we no longer need its move set, and so
3190
381
      // the values of the map never need to be updated. However, when a load
3191
381
      // is fused, we need to merge the entries from both instructions in the
3192
381
      // pair in case those instructions were in the move set of some other
3193
381
      // yet-to-be-fused pair. The loads in question are the keys of the map.
3194
381
      if (
I->mayReadFromMemory()381
)
{34
3195
34
        std::vector<ValuePair> NewSetMembers;
3196
34
        DenseMap<Value *, std::vector<Value *> >::iterator II =
3197
34
          LoadMoveSet.find(I);
3198
34
        if (II != LoadMoveSet.end())
3199
0
          for (std::vector<Value *>::iterator N = II->second.begin(),
3200
0
               NE = II->second.end(); 
N != NE0
;
++N0
)
3201
0
            NewSetMembers.push_back(ValuePair(K, *N));
3202
34
        DenseMap<Value *, std::vector<Value *> >::iterator JJ =
3203
34
          LoadMoveSet.find(J);
3204
34
        if (JJ != LoadMoveSet.end())
3205
0
          for (std::vector<Value *>::iterator N = JJ->second.begin(),
3206
0
               NE = JJ->second.end(); 
N != NE0
;
++N0
)
3207
0
            NewSetMembers.push_back(ValuePair(K, *N));
3208
34
        for (std::vector<ValuePair>::iterator A = NewSetMembers.begin(),
3209
34
             AE = NewSetMembers.end(); 
A != AE34
;
++A0
)
{0
3210
0
          LoadMoveSet[A->first].push_back(A->second);
3211
0
          LoadMoveSetPairs.insert(*A);
3212
0
        }
3213
34
      }
3214
381
3215
381
      // Before removing I, set the iterator to the next instruction.
3216
381
      PI = std::next(BasicBlock::iterator(I));
3217
381
      if (cast<Instruction>(PI) == J)
3218
89
        ++PI;
3219
381
3220
381
      SE->forgetValue(I);
3221
381
      SE->forgetValue(J);
3222
381
      I->eraseFromParent();
3223
381
      J->eraseFromParent();
3224
381
3225
381
      DEBUG(if (PrintAfterEveryPair) dbgs() << "BBV: block is now: \n" <<
3226
381
                                               BB << "\n");
3227
381
    }
3228
70
3229
70
    DEBUG(dbgs() << "BBV: final: \n" << BB << "\n");
3230
70
  }
3231
}
3232
3233
char BBVectorize::ID = 0;
3234
static const char bb_vectorize_name[] = "Basic-Block Vectorization";
3235
23.2k
INITIALIZE_PASS_BEGIN23.2k
(BBVectorize, BBV_NAME, bb_vectorize_name, false, false)23.2k
3236
23.2k
INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
3237
23.2k
INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
3238
23.2k
INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
3239
23.2k
INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
3240
23.2k
INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
3241
23.2k
INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
3242
23.2k
INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass)
3243
23.2k
INITIALIZE_PASS_DEPENDENCY(SCEVAAWrapperPass)
3244
23.2k
INITIALIZE_PASS_END(BBVectorize, BBV_NAME, bb_vectorize_name, false, false)
3245
3246
0
BasicBlockPass *llvm::createBBVectorizePass(const VectorizeConfig &C) {
3247
0
  return new BBVectorize(C);
3248
0
}
3249
3250
bool
3251
0
llvm::vectorizeBasicBlock(Pass *P, BasicBlock &BB, const VectorizeConfig &C) {
3252
0
  BBVectorize BBVectorizer(P, *BB.getParent(), C);
3253
0
  return BBVectorizer.vectorizeBB(BB);
3254
0
}
3255
3256
//===----------------------------------------------------------------------===//
3257
36
VectorizeConfig::VectorizeConfig() {
3258
36
  VectorBits = ::VectorBits;
3259
36
  VectorizeBools = !::NoBools;
3260
36
  VectorizeInts = !::NoInts;
3261
36
  VectorizeFloats = !::NoFloats;
3262
36
  VectorizePointers = !::NoPointers;
3263
36
  VectorizeCasts = !::NoCasts;
3264
36
  VectorizeMath = !::NoMath;
3265
36
  VectorizeBitManipulations = !::NoBitManipulation;
3266
36
  VectorizeFMA = !::NoFMA;
3267
36
  VectorizeSelect = !::NoSelect;
3268
36
  VectorizeCmp = !::NoCmp;
3269
36
  VectorizeGEP = !::NoGEP;
3270
36
  VectorizeMemOps = !::NoMemOps;
3271
36
  AlignedOnly = ::AlignedOnly;
3272
36
  ReqChainDepth= ::ReqChainDepth;
3273
36
  SearchLimit = ::SearchLimit;
3274
36
  MaxCandPairsForCycleCheck = ::MaxCandPairsForCycleCheck;
3275
36
  SplatBreaksChain = ::SplatBreaksChain;
3276
36
  MaxInsts = ::MaxInsts;
3277
36
  MaxPairs = ::MaxPairs;
3278
36
  MaxIter = ::MaxIter;
3279
36
  Pow2LenOnly = ::Pow2LenOnly;
3280
36
  NoMemOpBoost = ::NoMemOpBoost;
3281
36
  FastDep = ::FastDep;
3282
36
}