/Users/buildslave/jenkins/sharedspace/clang-stage2-coverage-R@2/llvm/lib/Transforms/Vectorize/BBVectorize.cpp
Line | Count | Source (jump to first uncovered line) |
1 | | //===- BBVectorize.cpp - A Basic-Block Vectorizer -------------------------===// |
2 | | // |
3 | | // The LLVM Compiler Infrastructure |
4 | | // |
5 | | // This file is distributed under the University of Illinois Open Source |
6 | | // License. See LICENSE.TXT for details. |
7 | | // |
8 | | //===----------------------------------------------------------------------===// |
9 | | // |
10 | | // This file implements a basic-block vectorization pass. The algorithm was |
11 | | // inspired by that used by the Vienna MAP Vectorizor by Franchetti and Kral, |
12 | | // et al. It works by looking for chains of pairable operations and then |
13 | | // pairing them. |
14 | | // |
15 | | //===----------------------------------------------------------------------===// |
16 | | |
17 | | #define BBV_NAME "bb-vectorize" |
18 | | #include "llvm/ADT/DenseMap.h" |
19 | | #include "llvm/ADT/DenseSet.h" |
20 | | #include "llvm/ADT/STLExtras.h" |
21 | | #include "llvm/ADT/SmallSet.h" |
22 | | #include "llvm/ADT/SmallVector.h" |
23 | | #include "llvm/ADT/Statistic.h" |
24 | | #include "llvm/ADT/StringExtras.h" |
25 | | #include "llvm/Analysis/AliasAnalysis.h" |
26 | | #include "llvm/Analysis/AliasSetTracker.h" |
27 | | #include "llvm/Analysis/GlobalsModRef.h" |
28 | | #include "llvm/Analysis/ScalarEvolution.h" |
29 | | #include "llvm/Analysis/ScalarEvolutionAliasAnalysis.h" |
30 | | #include "llvm/Analysis/ScalarEvolutionExpressions.h" |
31 | | #include "llvm/Analysis/TargetLibraryInfo.h" |
32 | | #include "llvm/Analysis/TargetTransformInfo.h" |
33 | | #include "llvm/Analysis/ValueTracking.h" |
34 | | #include "llvm/IR/Constants.h" |
35 | | #include "llvm/IR/DataLayout.h" |
36 | | #include "llvm/IR/DerivedTypes.h" |
37 | | #include "llvm/IR/Dominators.h" |
38 | | #include "llvm/IR/Function.h" |
39 | | #include "llvm/IR/Instructions.h" |
40 | | #include "llvm/IR/IntrinsicInst.h" |
41 | | #include "llvm/IR/Intrinsics.h" |
42 | | #include "llvm/IR/LLVMContext.h" |
43 | | #include "llvm/IR/Metadata.h" |
44 | | #include "llvm/IR/Module.h" |
45 | | #include "llvm/IR/Type.h" |
46 | | #include "llvm/IR/ValueHandle.h" |
47 | | #include "llvm/Pass.h" |
48 | | #include "llvm/Support/CommandLine.h" |
49 | | #include "llvm/Support/Debug.h" |
50 | | #include "llvm/Support/raw_ostream.h" |
51 | | #include "llvm/Transforms/Utils/Local.h" |
52 | | #include "llvm/Transforms/Vectorize.h" |
53 | | #include <algorithm> |
54 | | using namespace llvm; |
55 | | |
56 | | #define DEBUG_TYPE BBV_NAME |
57 | | |
58 | | static cl::opt<bool> |
59 | | IgnoreTargetInfo("bb-vectorize-ignore-target-info", cl::init(false), |
60 | | cl::Hidden, cl::desc("Ignore target information")); |
61 | | |
62 | | static cl::opt<unsigned> |
63 | | ReqChainDepth("bb-vectorize-req-chain-depth", cl::init(6), cl::Hidden, |
64 | | cl::desc("The required chain depth for vectorization")); |
65 | | |
66 | | static cl::opt<bool> |
67 | | UseChainDepthWithTI("bb-vectorize-use-chain-depth", cl::init(false), |
68 | | cl::Hidden, cl::desc("Use the chain depth requirement with" |
69 | | " target information")); |
70 | | |
71 | | static cl::opt<unsigned> |
72 | | SearchLimit("bb-vectorize-search-limit", cl::init(400), cl::Hidden, |
73 | | cl::desc("The maximum search distance for instruction pairs")); |
74 | | |
75 | | static cl::opt<bool> |
76 | | SplatBreaksChain("bb-vectorize-splat-breaks-chain", cl::init(false), cl::Hidden, |
77 | | cl::desc("Replicating one element to a pair breaks the chain")); |
78 | | |
79 | | static cl::opt<unsigned> |
80 | | VectorBits("bb-vectorize-vector-bits", cl::init(128), cl::Hidden, |
81 | | cl::desc("The size of the native vector registers")); |
82 | | |
83 | | static cl::opt<unsigned> |
84 | | MaxIter("bb-vectorize-max-iter", cl::init(0), cl::Hidden, |
85 | | cl::desc("The maximum number of pairing iterations")); |
86 | | |
87 | | static cl::opt<bool> |
88 | | Pow2LenOnly("bb-vectorize-pow2-len-only", cl::init(false), cl::Hidden, |
89 | | cl::desc("Don't try to form non-2^n-length vectors")); |
90 | | |
91 | | static cl::opt<unsigned> |
92 | | MaxInsts("bb-vectorize-max-instr-per-group", cl::init(500), cl::Hidden, |
93 | | cl::desc("The maximum number of pairable instructions per group")); |
94 | | |
95 | | static cl::opt<unsigned> |
96 | | MaxPairs("bb-vectorize-max-pairs-per-group", cl::init(3000), cl::Hidden, |
97 | | cl::desc("The maximum number of candidate instruction pairs per group")); |
98 | | |
99 | | static cl::opt<unsigned> |
100 | | MaxCandPairsForCycleCheck("bb-vectorize-max-cycle-check-pairs", cl::init(200), |
101 | | cl::Hidden, cl::desc("The maximum number of candidate pairs with which to use" |
102 | | " a full cycle check")); |
103 | | |
104 | | static cl::opt<bool> |
105 | | NoBools("bb-vectorize-no-bools", cl::init(false), cl::Hidden, |
106 | | cl::desc("Don't try to vectorize boolean (i1) values")); |
107 | | |
108 | | static cl::opt<bool> |
109 | | NoInts("bb-vectorize-no-ints", cl::init(false), cl::Hidden, |
110 | | cl::desc("Don't try to vectorize integer values")); |
111 | | |
112 | | static cl::opt<bool> |
113 | | NoFloats("bb-vectorize-no-floats", cl::init(false), cl::Hidden, |
114 | | cl::desc("Don't try to vectorize floating-point values")); |
115 | | |
116 | | // FIXME: This should default to false once pointer vector support works. |
117 | | static cl::opt<bool> |
118 | | NoPointers("bb-vectorize-no-pointers", cl::init(/*false*/ true), cl::Hidden, |
119 | | cl::desc("Don't try to vectorize pointer values")); |
120 | | |
121 | | static cl::opt<bool> |
122 | | NoCasts("bb-vectorize-no-casts", cl::init(false), cl::Hidden, |
123 | | cl::desc("Don't try to vectorize casting (conversion) operations")); |
124 | | |
125 | | static cl::opt<bool> |
126 | | NoMath("bb-vectorize-no-math", cl::init(false), cl::Hidden, |
127 | | cl::desc("Don't try to vectorize floating-point math intrinsics")); |
128 | | |
129 | | static cl::opt<bool> |
130 | | NoBitManipulation("bb-vectorize-no-bitmanip", cl::init(false), cl::Hidden, |
131 | | cl::desc("Don't try to vectorize BitManipulation intrinsics")); |
132 | | |
133 | | static cl::opt<bool> |
134 | | NoFMA("bb-vectorize-no-fma", cl::init(false), cl::Hidden, |
135 | | cl::desc("Don't try to vectorize the fused-multiply-add intrinsic")); |
136 | | |
137 | | static cl::opt<bool> |
138 | | NoSelect("bb-vectorize-no-select", cl::init(false), cl::Hidden, |
139 | | cl::desc("Don't try to vectorize select instructions")); |
140 | | |
141 | | static cl::opt<bool> |
142 | | NoCmp("bb-vectorize-no-cmp", cl::init(false), cl::Hidden, |
143 | | cl::desc("Don't try to vectorize comparison instructions")); |
144 | | |
145 | | static cl::opt<bool> |
146 | | NoGEP("bb-vectorize-no-gep", cl::init(false), cl::Hidden, |
147 | | cl::desc("Don't try to vectorize getelementptr instructions")); |
148 | | |
149 | | static cl::opt<bool> |
150 | | NoMemOps("bb-vectorize-no-mem-ops", cl::init(false), cl::Hidden, |
151 | | cl::desc("Don't try to vectorize loads and stores")); |
152 | | |
153 | | static cl::opt<bool> |
154 | | AlignedOnly("bb-vectorize-aligned-only", cl::init(false), cl::Hidden, |
155 | | cl::desc("Only generate aligned loads and stores")); |
156 | | |
157 | | static cl::opt<bool> |
158 | | NoMemOpBoost("bb-vectorize-no-mem-op-boost", |
159 | | cl::init(false), cl::Hidden, |
160 | | cl::desc("Don't boost the chain-depth contribution of loads and stores")); |
161 | | |
162 | | static cl::opt<bool> |
163 | | FastDep("bb-vectorize-fast-dep", cl::init(false), cl::Hidden, |
164 | | cl::desc("Use a fast instruction dependency analysis")); |
165 | | |
166 | | #ifndef NDEBUG |
167 | | static cl::opt<bool> |
168 | | DebugInstructionExamination("bb-vectorize-debug-instruction-examination", |
169 | | cl::init(false), cl::Hidden, |
170 | | cl::desc("When debugging is enabled, output information on the" |
171 | | " instruction-examination process")); |
172 | | static cl::opt<bool> |
173 | | DebugCandidateSelection("bb-vectorize-debug-candidate-selection", |
174 | | cl::init(false), cl::Hidden, |
175 | | cl::desc("When debugging is enabled, output information on the" |
176 | | " candidate-selection process")); |
177 | | static cl::opt<bool> |
178 | | DebugPairSelection("bb-vectorize-debug-pair-selection", |
179 | | cl::init(false), cl::Hidden, |
180 | | cl::desc("When debugging is enabled, output information on the" |
181 | | " pair-selection process")); |
182 | | static cl::opt<bool> |
183 | | DebugCycleCheck("bb-vectorize-debug-cycle-check", |
184 | | cl::init(false), cl::Hidden, |
185 | | cl::desc("When debugging is enabled, output information on the" |
186 | | " cycle-checking process")); |
187 | | |
188 | | static cl::opt<bool> |
189 | | PrintAfterEveryPair("bb-vectorize-debug-print-after-every-pair", |
190 | | cl::init(false), cl::Hidden, |
191 | | cl::desc("When debugging is enabled, dump the basic block after" |
192 | | " every pair is fused")); |
193 | | #endif |
194 | | |
195 | | STATISTIC(NumFusedOps, "Number of operations fused by bb-vectorize"); |
196 | | |
197 | | namespace { |
198 | | struct BBVectorize : public BasicBlockPass { |
199 | | static char ID; // Pass identification, replacement for typeid |
200 | | |
201 | | const VectorizeConfig Config; |
202 | | |
203 | | BBVectorize(const VectorizeConfig &C = VectorizeConfig()) |
204 | 36 | : BasicBlockPass(ID), Config(C) { |
205 | 36 | initializeBBVectorizePass(*PassRegistry::getPassRegistry()); |
206 | 36 | } |
207 | | |
208 | | BBVectorize(Pass *P, Function &F, const VectorizeConfig &C) |
209 | 0 | : BasicBlockPass(ID), Config(C) { |
210 | 0 | AA = &P->getAnalysis<AAResultsWrapperPass>().getAAResults(); |
211 | 0 | DT = &P->getAnalysis<DominatorTreeWrapperPass>().getDomTree(); |
212 | 0 | SE = &P->getAnalysis<ScalarEvolutionWrapperPass>().getSE(); |
213 | 0 | TLI = &P->getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(); |
214 | 0 | TTI = IgnoreTargetInfo |
215 | 0 | ? nullptr |
216 | 0 | : &P->getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F); |
217 | 0 | } |
218 | | |
219 | | typedef std::pair<Value *, Value *> ValuePair; |
220 | | typedef std::pair<ValuePair, int> ValuePairWithCost; |
221 | | typedef std::pair<ValuePair, size_t> ValuePairWithDepth; |
222 | | typedef std::pair<ValuePair, ValuePair> VPPair; // A ValuePair pair |
223 | | typedef std::pair<VPPair, unsigned> VPPairWithType; |
224 | | |
225 | | AliasAnalysis *AA; |
226 | | DominatorTree *DT; |
227 | | ScalarEvolution *SE; |
228 | | const TargetLibraryInfo *TLI; |
229 | | const TargetTransformInfo *TTI; |
230 | | |
231 | | // FIXME: const correct? |
232 | | |
233 | | bool vectorizePairs(BasicBlock &BB, bool NonPow2Len = false); |
234 | | |
235 | | bool getCandidatePairs(BasicBlock &BB, |
236 | | BasicBlock::iterator &Start, |
237 | | DenseMap<Value *, std::vector<Value *> > &CandidatePairs, |
238 | | DenseSet<ValuePair> &FixedOrderPairs, |
239 | | DenseMap<ValuePair, int> &CandidatePairCostSavings, |
240 | | std::vector<Value *> &PairableInsts, bool NonPow2Len); |
241 | | |
242 | | // FIXME: The current implementation does not account for pairs that |
243 | | // are connected in multiple ways. For example: |
244 | | // C1 = A1 / A2; C2 = A2 / A1 (which may be both direct and a swap) |
245 | | enum PairConnectionType { |
246 | | PairConnectionDirect, |
247 | | PairConnectionSwap, |
248 | | PairConnectionSplat |
249 | | }; |
250 | | |
251 | | void computeConnectedPairs( |
252 | | DenseMap<Value *, std::vector<Value *> > &CandidatePairs, |
253 | | DenseSet<ValuePair> &CandidatePairsSet, |
254 | | std::vector<Value *> &PairableInsts, |
255 | | DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairs, |
256 | | DenseMap<VPPair, unsigned> &PairConnectionTypes); |
257 | | |
258 | | void buildDepMap(BasicBlock &BB, |
259 | | DenseMap<Value *, std::vector<Value *> > &CandidatePairs, |
260 | | std::vector<Value *> &PairableInsts, |
261 | | DenseSet<ValuePair> &PairableInstUsers); |
262 | | |
263 | | void choosePairs(DenseMap<Value *, std::vector<Value *> > &CandidatePairs, |
264 | | DenseSet<ValuePair> &CandidatePairsSet, |
265 | | DenseMap<ValuePair, int> &CandidatePairCostSavings, |
266 | | std::vector<Value *> &PairableInsts, |
267 | | DenseSet<ValuePair> &FixedOrderPairs, |
268 | | DenseMap<VPPair, unsigned> &PairConnectionTypes, |
269 | | DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairs, |
270 | | DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairDeps, |
271 | | DenseSet<ValuePair> &PairableInstUsers, |
272 | | DenseMap<Value *, Value *>& ChosenPairs); |
273 | | |
274 | | void fuseChosenPairs(BasicBlock &BB, |
275 | | std::vector<Value *> &PairableInsts, |
276 | | DenseMap<Value *, Value *>& ChosenPairs, |
277 | | DenseSet<ValuePair> &FixedOrderPairs, |
278 | | DenseMap<VPPair, unsigned> &PairConnectionTypes, |
279 | | DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairs, |
280 | | DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairDeps); |
281 | | |
282 | | |
283 | | bool isInstVectorizable(Instruction *I, bool &IsSimpleLoadStore); |
284 | | |
285 | | bool areInstsCompatible(Instruction *I, Instruction *J, |
286 | | bool IsSimpleLoadStore, bool NonPow2Len, |
287 | | int &CostSavings, int &FixedOrder); |
288 | | |
289 | | bool trackUsesOfI(DenseSet<Value *> &Users, |
290 | | AliasSetTracker &WriteSet, Instruction *I, |
291 | | Instruction *J, bool UpdateUsers = true, |
292 | | DenseSet<ValuePair> *LoadMoveSetPairs = nullptr); |
293 | | |
294 | | void computePairsConnectedTo( |
295 | | DenseMap<Value *, std::vector<Value *> > &CandidatePairs, |
296 | | DenseSet<ValuePair> &CandidatePairsSet, |
297 | | std::vector<Value *> &PairableInsts, |
298 | | DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairs, |
299 | | DenseMap<VPPair, unsigned> &PairConnectionTypes, |
300 | | ValuePair P); |
301 | | |
302 | | bool pairsConflict(ValuePair P, ValuePair Q, |
303 | | DenseSet<ValuePair> &PairableInstUsers, |
304 | | DenseMap<ValuePair, std::vector<ValuePair> > |
305 | | *PairableInstUserMap = nullptr, |
306 | | DenseSet<VPPair> *PairableInstUserPairSet = nullptr); |
307 | | |
308 | | bool pairWillFormCycle(ValuePair P, |
309 | | DenseMap<ValuePair, std::vector<ValuePair> > &PairableInstUsers, |
310 | | DenseSet<ValuePair> &CurrentPairs); |
311 | | |
312 | | void pruneDAGFor( |
313 | | DenseMap<Value *, std::vector<Value *> > &CandidatePairs, |
314 | | std::vector<Value *> &PairableInsts, |
315 | | DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairs, |
316 | | DenseSet<ValuePair> &PairableInstUsers, |
317 | | DenseMap<ValuePair, std::vector<ValuePair> > &PairableInstUserMap, |
318 | | DenseSet<VPPair> &PairableInstUserPairSet, |
319 | | DenseMap<Value *, Value *> &ChosenPairs, |
320 | | DenseMap<ValuePair, size_t> &DAG, |
321 | | DenseSet<ValuePair> &PrunedDAG, ValuePair J, |
322 | | bool UseCycleCheck); |
323 | | |
324 | | void buildInitialDAGFor( |
325 | | DenseMap<Value *, std::vector<Value *> > &CandidatePairs, |
326 | | DenseSet<ValuePair> &CandidatePairsSet, |
327 | | std::vector<Value *> &PairableInsts, |
328 | | DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairs, |
329 | | DenseSet<ValuePair> &PairableInstUsers, |
330 | | DenseMap<Value *, Value *> &ChosenPairs, |
331 | | DenseMap<ValuePair, size_t> &DAG, ValuePair J); |
332 | | |
333 | | void findBestDAGFor( |
334 | | DenseMap<Value *, std::vector<Value *> > &CandidatePairs, |
335 | | DenseSet<ValuePair> &CandidatePairsSet, |
336 | | DenseMap<ValuePair, int> &CandidatePairCostSavings, |
337 | | std::vector<Value *> &PairableInsts, |
338 | | DenseSet<ValuePair> &FixedOrderPairs, |
339 | | DenseMap<VPPair, unsigned> &PairConnectionTypes, |
340 | | DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairs, |
341 | | DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairDeps, |
342 | | DenseSet<ValuePair> &PairableInstUsers, |
343 | | DenseMap<ValuePair, std::vector<ValuePair> > &PairableInstUserMap, |
344 | | DenseSet<VPPair> &PairableInstUserPairSet, |
345 | | DenseMap<Value *, Value *> &ChosenPairs, |
346 | | DenseSet<ValuePair> &BestDAG, size_t &BestMaxDepth, |
347 | | int &BestEffSize, Value *II, std::vector<Value *>&JJ, |
348 | | bool UseCycleCheck); |
349 | | |
350 | | Value *getReplacementPointerInput(LLVMContext& Context, Instruction *I, |
351 | | Instruction *J, unsigned o); |
352 | | |
353 | | void fillNewShuffleMask(LLVMContext& Context, Instruction *J, |
354 | | unsigned MaskOffset, unsigned NumInElem, |
355 | | unsigned NumInElem1, unsigned IdxOffset, |
356 | | std::vector<Constant*> &Mask); |
357 | | |
358 | | Value *getReplacementShuffleMask(LLVMContext& Context, Instruction *I, |
359 | | Instruction *J); |
360 | | |
361 | | bool expandIEChain(LLVMContext& Context, Instruction *I, Instruction *J, |
362 | | unsigned o, Value *&LOp, unsigned numElemL, |
363 | | Type *ArgTypeL, Type *ArgTypeR, bool IBeforeJ, |
364 | | unsigned IdxOff = 0); |
365 | | |
366 | | Value *getReplacementInput(LLVMContext& Context, Instruction *I, |
367 | | Instruction *J, unsigned o, bool IBeforeJ); |
368 | | |
369 | | void getReplacementInputsForPair(LLVMContext& Context, Instruction *I, |
370 | | Instruction *J, SmallVectorImpl<Value *> &ReplacedOperands, |
371 | | bool IBeforeJ); |
372 | | |
373 | | void replaceOutputsOfPair(LLVMContext& Context, Instruction *I, |
374 | | Instruction *J, Instruction *K, |
375 | | Instruction *&InsertionPt, Instruction *&K1, |
376 | | Instruction *&K2); |
377 | | |
378 | | void collectPairLoadMoveSet(BasicBlock &BB, |
379 | | DenseMap<Value *, Value *> &ChosenPairs, |
380 | | DenseMap<Value *, std::vector<Value *> > &LoadMoveSet, |
381 | | DenseSet<ValuePair> &LoadMoveSetPairs, |
382 | | Instruction *I); |
383 | | |
384 | | void collectLoadMoveSet(BasicBlock &BB, |
385 | | std::vector<Value *> &PairableInsts, |
386 | | DenseMap<Value *, Value *> &ChosenPairs, |
387 | | DenseMap<Value *, std::vector<Value *> > &LoadMoveSet, |
388 | | DenseSet<ValuePair> &LoadMoveSetPairs); |
389 | | |
390 | | bool canMoveUsesOfIAfterJ(BasicBlock &BB, |
391 | | DenseSet<ValuePair> &LoadMoveSetPairs, |
392 | | Instruction *I, Instruction *J); |
393 | | |
394 | | void moveUsesOfIAfterJ(BasicBlock &BB, |
395 | | DenseSet<ValuePair> &LoadMoveSetPairs, |
396 | | Instruction *&InsertionPt, |
397 | | Instruction *I, Instruction *J); |
398 | | |
399 | 107 | bool vectorizeBB(BasicBlock &BB) { |
400 | 107 | if (skipBasicBlock(BB)) |
401 | 0 | return false; |
402 | 107 | if (107 !DT->isReachableFromEntry(&BB)107 ) {4 |
403 | 4 | DEBUG(dbgs() << "BBV: skipping unreachable " << BB.getName() << |
404 | 4 | " in " << BB.getParent()->getName() << "\n"); |
405 | 4 | return false; |
406 | 4 | } |
407 | 107 | |
408 | 103 | DEBUG103 (if (TTI) dbgs() << "BBV: using target information\n");103 |
409 | 103 | |
410 | 103 | bool changed = false; |
411 | 103 | // Iterate a sufficient number of times to merge types of size 1 bit, |
412 | 103 | // then 2 bits, then 4, etc. up to half of the target vector width of the |
413 | 103 | // target vector register. |
414 | 103 | unsigned n = 1; |
415 | 103 | for (unsigned v = 2; |
416 | 172 | (TTI || 172 v <= Config.VectorBits100 ) && |
417 | 172 | (!Config.MaxIter || 172 n <= Config.MaxIter0 ); |
418 | 172 | v *= 2, ++n69 ) {172 |
419 | 172 | DEBUG(dbgs() << "BBV: fusing loop #" << n << |
420 | 172 | " for " << BB.getName() << " in " << |
421 | 172 | BB.getParent()->getName() << "...\n"); |
422 | 172 | if (vectorizePairs(BB)) |
423 | 69 | changed = true; |
424 | 172 | else |
425 | 103 | break; |
426 | 172 | } |
427 | 103 | |
428 | 103 | if (changed && 103 !Pow2LenOnly60 ) {60 |
429 | 60 | ++n; |
430 | 61 | for (; !Config.MaxIter || 61 n <= Config.MaxIter0 ; ++n1 ) {61 |
431 | 61 | DEBUG(dbgs() << "BBV: fusing for non-2^n-length vectors loop #: " << |
432 | 61 | n << " for " << BB.getName() << " in " << |
433 | 61 | BB.getParent()->getName() << "...\n"); |
434 | 61 | if (!vectorizePairs(BB, true)61 ) break60 ; |
435 | 61 | } |
436 | 60 | } |
437 | 103 | |
438 | 103 | DEBUG(dbgs() << "BBV: done!\n"); |
439 | 103 | return changed; |
440 | 107 | } |
441 | | |
442 | 107 | bool runOnBasicBlock(BasicBlock &BB) override { |
443 | 107 | // OptimizeNone check deferred to vectorizeBB(). |
444 | 107 | |
445 | 107 | AA = &getAnalysis<AAResultsWrapperPass>().getAAResults(); |
446 | 107 | DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); |
447 | 107 | SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE(); |
448 | 107 | TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(); |
449 | 107 | TTI = IgnoreTargetInfo |
450 | 64 | ? nullptr |
451 | 43 | : &getAnalysis<TargetTransformInfoWrapperPass>().getTTI( |
452 | 43 | *BB.getParent()); |
453 | 107 | |
454 | 107 | return vectorizeBB(BB); |
455 | 107 | } |
456 | | |
457 | 36 | void getAnalysisUsage(AnalysisUsage &AU) const override { |
458 | 36 | BasicBlockPass::getAnalysisUsage(AU); |
459 | 36 | AU.addRequired<AAResultsWrapperPass>(); |
460 | 36 | AU.addRequired<DominatorTreeWrapperPass>(); |
461 | 36 | AU.addRequired<ScalarEvolutionWrapperPass>(); |
462 | 36 | AU.addRequired<TargetLibraryInfoWrapperPass>(); |
463 | 36 | AU.addRequired<TargetTransformInfoWrapperPass>(); |
464 | 36 | AU.addPreserved<DominatorTreeWrapperPass>(); |
465 | 36 | AU.addPreserved<GlobalsAAWrapperPass>(); |
466 | 36 | AU.addPreserved<ScalarEvolutionWrapperPass>(); |
467 | 36 | AU.addPreserved<SCEVAAWrapperPass>(); |
468 | 36 | AU.setPreservesCFG(); |
469 | 36 | } |
470 | | |
471 | 34.6k | static inline VectorType *getVecTypeForPair(Type *ElemTy, Type *Elem2Ty) { |
472 | 34.6k | assert(ElemTy->getScalarType() == Elem2Ty->getScalarType() && |
473 | 34.6k | "Cannot form vector from incompatible scalar types"); |
474 | 34.6k | Type *STy = ElemTy->getScalarType(); |
475 | 34.6k | |
476 | 34.6k | unsigned numElem; |
477 | 34.6k | if (VectorType *VTy34.6k = dyn_cast<VectorType>(ElemTy)) {16.5k |
478 | 16.5k | numElem = VTy->getNumElements(); |
479 | 18.0k | } else { |
480 | 18.0k | numElem = 1; |
481 | 18.0k | } |
482 | 34.6k | |
483 | 34.6k | if (VectorType *VTy34.6k = dyn_cast<VectorType>(Elem2Ty)) {16.5k |
484 | 16.5k | numElem += VTy->getNumElements(); |
485 | 18.1k | } else { |
486 | 18.1k | numElem += 1; |
487 | 18.1k | } |
488 | 34.6k | |
489 | 34.6k | return VectorType::get(STy, numElem); |
490 | 34.6k | } |
491 | | |
492 | | static inline void getInstructionTypes(Instruction *I, |
493 | 29.5k | Type *&T1, Type *&T2) { |
494 | 29.5k | if (StoreInst *SI29.5k = dyn_cast<StoreInst>(I)) {1.77k |
495 | 1.77k | // For stores, it is the value type, not the pointer type that matters |
496 | 1.77k | // because the value is what will come from a vector register. |
497 | 1.77k | |
498 | 1.77k | Value *IVal = SI->getValueOperand(); |
499 | 1.77k | T1 = IVal->getType(); |
500 | 27.8k | } else { |
501 | 27.8k | T1 = I->getType(); |
502 | 27.8k | } |
503 | 29.5k | |
504 | 29.5k | if (CastInst *CI = dyn_cast<CastInst>(I)) |
505 | 1.54k | T2 = CI->getSrcTy(); |
506 | 29.5k | else |
507 | 28.0k | T2 = T1; |
508 | 29.5k | |
509 | 29.5k | if (SelectInst *SI29.5k = dyn_cast<SelectInst>(I)) {10 |
510 | 10 | T2 = SI->getCondition()->getType(); |
511 | 29.5k | } else if (ShuffleVectorInst *29.5k SI29.5k = dyn_cast<ShuffleVectorInst>(I)) {499 |
512 | 499 | T2 = SI->getOperand(0)->getType(); |
513 | 29.0k | } else if (CmpInst *29.0k CI29.0k = dyn_cast<CmpInst>(I)) {33 |
514 | 33 | T2 = CI->getOperand(0)->getType(); |
515 | 33 | } |
516 | 29.5k | } |
517 | | |
518 | | // Returns the weight associated with the provided value. A chain of |
519 | | // candidate pairs has a length given by the sum of the weights of its |
520 | | // members (one weight per pair; the weight of each member of the pair |
521 | | // is assumed to be the same). This length is then compared to the |
522 | | // chain-length threshold to determine if a given chain is significant |
523 | | // enough to be vectorized. The length is also used in comparing |
524 | | // candidate chains where longer chains are considered to be better. |
525 | | // Note: when this function returns 0, the resulting instructions are |
526 | | // not actually fused. |
527 | 21.0k | inline size_t getDepthFactor(Value *V) { |
528 | 21.0k | // InsertElement and ExtractElement have a depth factor of zero. This is |
529 | 21.0k | // for two reasons: First, they cannot be usefully fused. Second, because |
530 | 21.0k | // the pass generates a lot of these, they can confuse the simple metric |
531 | 21.0k | // used to compare the dags in the next iteration. Thus, giving them a |
532 | 21.0k | // weight of zero allows the pass to essentially ignore them in |
533 | 21.0k | // subsequent iterations when looking for vectorization opportunities |
534 | 21.0k | // while still tracking dependency chains that flow through those |
535 | 21.0k | // instructions. |
536 | 21.0k | if (isa<InsertElementInst>(V) || 21.0k isa<ExtractElementInst>(V)14.3k ) |
537 | 7.59k | return 0; |
538 | 21.0k | |
539 | 21.0k | // Give a load or store half of the required depth so that load/store |
540 | 21.0k | // pairs will vectorize. |
541 | 13.4k | if (13.4k !Config.NoMemOpBoost && 13.4k (isa<LoadInst>(V) || 13.4k isa<StoreInst>(V)13.2k )) |
542 | 1.17k | return Config.ReqChainDepth/2; |
543 | 13.4k | |
544 | 12.2k | return 1; |
545 | 13.4k | } |
546 | | |
547 | | // Returns the cost of the provided instruction using TTI. |
548 | | // This does not handle loads and stores. |
549 | | unsigned getInstrCost(unsigned Opcode, Type *T1, Type *T2, |
550 | | TargetTransformInfo::OperandValueKind Op1VK = |
551 | | TargetTransformInfo::OK_AnyValue, |
552 | | TargetTransformInfo::OperandValueKind Op2VK = |
553 | | TargetTransformInfo::OK_AnyValue, |
554 | 33.9k | const Instruction *I = nullptr) { |
555 | 33.9k | switch (Opcode) { |
556 | 6.18k | default: break; |
557 | 0 | case Instruction::GetElementPtr: |
558 | 0 | // We mark this instruction as zero-cost because scalar GEPs are usually |
559 | 0 | // lowered to the instruction addressing mode. At the moment we don't |
560 | 0 | // generate vector GEPs. |
561 | 0 | return 0; |
562 | 0 | case Instruction::Br: |
563 | 0 | return TTI->getCFInstrCost(Opcode); |
564 | 0 | case Instruction::PHI: |
565 | 0 | return 0; |
566 | 19.9k | case Instruction::Add: |
567 | 19.9k | case Instruction::FAdd: |
568 | 19.9k | case Instruction::Sub: |
569 | 19.9k | case Instruction::FSub: |
570 | 19.9k | case Instruction::Mul: |
571 | 19.9k | case Instruction::FMul: |
572 | 19.9k | case Instruction::UDiv: |
573 | 19.9k | case Instruction::SDiv: |
574 | 19.9k | case Instruction::FDiv: |
575 | 19.9k | case Instruction::URem: |
576 | 19.9k | case Instruction::SRem: |
577 | 19.9k | case Instruction::FRem: |
578 | 19.9k | case Instruction::Shl: |
579 | 19.9k | case Instruction::LShr: |
580 | 19.9k | case Instruction::AShr: |
581 | 19.9k | case Instruction::And: |
582 | 19.9k | case Instruction::Or: |
583 | 19.9k | case Instruction::Xor: |
584 | 19.9k | return TTI->getArithmeticInstrCost(Opcode, T1, Op1VK, Op2VK); |
585 | 3 | case Instruction::Select: |
586 | 3 | case Instruction::ICmp: |
587 | 3 | case Instruction::FCmp: |
588 | 3 | return TTI->getCmpSelInstrCost(Opcode, T1, T2, I); |
589 | 7.89k | case Instruction::ZExt: |
590 | 7.89k | case Instruction::SExt: |
591 | 7.89k | case Instruction::FPToUI: |
592 | 7.89k | case Instruction::FPToSI: |
593 | 7.89k | case Instruction::FPExt: |
594 | 7.89k | case Instruction::PtrToInt: |
595 | 7.89k | case Instruction::IntToPtr: |
596 | 7.89k | case Instruction::SIToFP: |
597 | 7.89k | case Instruction::UIToFP: |
598 | 7.89k | case Instruction::Trunc: |
599 | 7.89k | case Instruction::FPTrunc: |
600 | 7.89k | case Instruction::BitCast: |
601 | 7.89k | case Instruction::ShuffleVector: |
602 | 7.89k | return TTI->getCastInstrCost(Opcode, T1, T2, I); |
603 | 33.9k | } |
604 | 33.9k | |
605 | 6.18k | return 1; |
606 | 33.9k | } |
607 | | |
608 | | // This determines the relative offset of two loads or stores, returning |
609 | | // true if the offset could be determined to be some constant value. |
610 | | // For example, if OffsetInElmts == 1, then J accesses the memory directly |
611 | | // after I; if OffsetInElmts == -1 then I accesses the memory |
612 | | // directly after J. |
613 | | bool getPairPtrInfo(Instruction *I, Instruction *J, |
614 | | Value *&IPtr, Value *&JPtr, unsigned &IAlignment, unsigned &JAlignment, |
615 | | unsigned &IAddressSpace, unsigned &JAddressSpace, |
616 | 1.77k | int64_t &OffsetInElmts, bool ComputeOffset = true) { |
617 | 1.77k | OffsetInElmts = 0; |
618 | 1.77k | if (LoadInst *LI1.77k = dyn_cast<LoadInst>(I)) {960 |
619 | 960 | LoadInst *LJ = cast<LoadInst>(J); |
620 | 960 | IPtr = LI->getPointerOperand(); |
621 | 960 | JPtr = LJ->getPointerOperand(); |
622 | 960 | IAlignment = LI->getAlignment(); |
623 | 960 | JAlignment = LJ->getAlignment(); |
624 | 960 | IAddressSpace = LI->getPointerAddressSpace(); |
625 | 960 | JAddressSpace = LJ->getPointerAddressSpace(); |
626 | 818 | } else { |
627 | 818 | StoreInst *SI = cast<StoreInst>(I), *SJ = cast<StoreInst>(J); |
628 | 818 | IPtr = SI->getPointerOperand(); |
629 | 818 | JPtr = SJ->getPointerOperand(); |
630 | 818 | IAlignment = SI->getAlignment(); |
631 | 818 | JAlignment = SJ->getAlignment(); |
632 | 818 | IAddressSpace = SI->getPointerAddressSpace(); |
633 | 818 | JAddressSpace = SJ->getPointerAddressSpace(); |
634 | 818 | } |
635 | 1.77k | |
636 | 1.77k | if (!ComputeOffset) |
637 | 104 | return true; |
638 | 1.77k | |
639 | 1.67k | const SCEV *IPtrSCEV = SE->getSCEV(IPtr); |
640 | 1.67k | const SCEV *JPtrSCEV = SE->getSCEV(JPtr); |
641 | 1.67k | |
642 | 1.67k | // If this is a trivial offset, then we'll get something like |
643 | 1.67k | // 1*sizeof(type). With target data, which we need anyway, this will get |
644 | 1.67k | // constant folded into a number. |
645 | 1.67k | const SCEV *OffsetSCEV = SE->getMinusSCEV(JPtrSCEV, IPtrSCEV); |
646 | 1.67k | if (const SCEVConstant *ConstOffSCEV = |
647 | 1.15k | dyn_cast<SCEVConstant>(OffsetSCEV)) { |
648 | 1.15k | ConstantInt *IntOff = ConstOffSCEV->getValue(); |
649 | 1.15k | int64_t Offset = IntOff->getSExtValue(); |
650 | 1.15k | const DataLayout &DL = I->getModule()->getDataLayout(); |
651 | 1.15k | Type *VTy = IPtr->getType()->getPointerElementType(); |
652 | 1.15k | int64_t VTyTSS = (int64_t)DL.getTypeStoreSize(VTy); |
653 | 1.15k | |
654 | 1.15k | Type *VTy2 = JPtr->getType()->getPointerElementType(); |
655 | 1.15k | if (VTy != VTy2 && 1.15k Offset < 00 ) {0 |
656 | 0 | int64_t VTy2TSS = (int64_t)DL.getTypeStoreSize(VTy2); |
657 | 0 | OffsetInElmts = Offset/VTy2TSS; |
658 | 0 | return (std::abs(Offset) % VTy2TSS) == 0; |
659 | 0 | } |
660 | 1.15k | |
661 | 1.15k | OffsetInElmts = Offset/VTyTSS; |
662 | 1.15k | return (std::abs(Offset) % VTyTSS) == 0; |
663 | 1.15k | } |
664 | 1.67k | |
665 | 516 | return false; |
666 | 1.67k | } |
667 | | |
668 | | // Returns true if the provided CallInst represents an intrinsic that can |
669 | | // be vectorized. |
670 | 154 | bool isVectorizableIntrinsic(CallInst* I) { |
671 | 154 | Function *F = I->getCalledFunction(); |
672 | 154 | if (!F154 ) return false14 ; |
673 | 154 | |
674 | 140 | Intrinsic::ID IID = F->getIntrinsicID(); |
675 | 140 | if (!IID140 ) return false31 ; |
676 | 140 | |
677 | 109 | switch(IID) { |
678 | 27 | default: |
679 | 27 | return false; |
680 | 48 | case Intrinsic::sqrt: |
681 | 48 | case Intrinsic::powi: |
682 | 48 | case Intrinsic::sin: |
683 | 48 | case Intrinsic::cos: |
684 | 48 | case Intrinsic::log: |
685 | 48 | case Intrinsic::log2: |
686 | 48 | case Intrinsic::log10: |
687 | 48 | case Intrinsic::exp: |
688 | 48 | case Intrinsic::exp2: |
689 | 48 | case Intrinsic::pow: |
690 | 48 | case Intrinsic::round: |
691 | 48 | case Intrinsic::copysign: |
692 | 48 | case Intrinsic::ceil: |
693 | 48 | case Intrinsic::nearbyint: |
694 | 48 | case Intrinsic::rint: |
695 | 48 | case Intrinsic::trunc: |
696 | 48 | case Intrinsic::floor: |
697 | 48 | case Intrinsic::fabs: |
698 | 48 | case Intrinsic::minnum: |
699 | 48 | case Intrinsic::maxnum: |
700 | 48 | return Config.VectorizeMath; |
701 | 20 | case Intrinsic::bswap: |
702 | 20 | case Intrinsic::ctpop: |
703 | 20 | case Intrinsic::ctlz: |
704 | 20 | case Intrinsic::cttz: |
705 | 20 | return Config.VectorizeBitManipulations; |
706 | 14 | case Intrinsic::fma: |
707 | 14 | case Intrinsic::fmuladd: |
708 | 14 | return Config.VectorizeFMA; |
709 | 109 | } |
710 | 109 | } |
711 | | |
712 | 2.03k | bool isPureIEChain(InsertElementInst *IE) { |
713 | 2.03k | InsertElementInst *IENext = IE; |
714 | 3.28k | do { |
715 | 3.28k | if (!isa<UndefValue>(IENext->getOperand(0)) && |
716 | 1.33k | !isa<InsertElementInst>(IENext->getOperand(0))1.33k ) {80 |
717 | 80 | return false; |
718 | 80 | } |
719 | 3.20k | } while ((IENext = |
720 | 3.20k | dyn_cast<InsertElementInst>(IENext->getOperand(0)))); |
721 | 2.03k | |
722 | 1.95k | return true; |
723 | 2.03k | } |
724 | | }; |
725 | | |
726 | | // This function implements one vectorization iteration on the provided |
727 | | // basic block. It returns true if the block is changed. |
728 | 233 | bool BBVectorize::vectorizePairs(BasicBlock &BB, bool NonPow2Len) { |
729 | 233 | bool ShouldContinue; |
730 | 233 | BasicBlock::iterator Start = BB.getFirstInsertionPt(); |
731 | 233 | |
732 | 233 | std::vector<Value *> AllPairableInsts; |
733 | 233 | DenseMap<Value *, Value *> AllChosenPairs; |
734 | 233 | DenseSet<ValuePair> AllFixedOrderPairs; |
735 | 233 | DenseMap<VPPair, unsigned> AllPairConnectionTypes; |
736 | 233 | DenseMap<ValuePair, std::vector<ValuePair> > AllConnectedPairs, |
737 | 233 | AllConnectedPairDeps; |
738 | 233 | |
739 | 234 | do { |
740 | 234 | std::vector<Value *> PairableInsts; |
741 | 234 | DenseMap<Value *, std::vector<Value *> > CandidatePairs; |
742 | 234 | DenseSet<ValuePair> FixedOrderPairs; |
743 | 234 | DenseMap<ValuePair, int> CandidatePairCostSavings; |
744 | 234 | ShouldContinue = getCandidatePairs(BB, Start, CandidatePairs, |
745 | 234 | FixedOrderPairs, |
746 | 234 | CandidatePairCostSavings, |
747 | 234 | PairableInsts, NonPow2Len); |
748 | 234 | if (PairableInsts.empty()234 ) continue59 ; |
749 | 234 | |
750 | 234 | // Build the candidate pair set for faster lookups. |
751 | 175 | DenseSet<ValuePair> CandidatePairsSet; |
752 | 175 | for (DenseMap<Value *, std::vector<Value *> >::iterator I = |
753 | 1.52k | CandidatePairs.begin(), E = CandidatePairs.end(); I != E1.52k ; ++I1.35k ) |
754 | 1.35k | for (std::vector<Value *>::iterator J = I->second.begin(), |
755 | 9.37k | JE = I->second.end(); J != JE9.37k ; ++J8.02k ) |
756 | 8.02k | CandidatePairsSet.insert(ValuePair(I->first, *J)); |
757 | 175 | |
758 | 175 | // Now we have a map of all of the pairable instructions and we need to |
759 | 175 | // select the best possible pairing. A good pairing is one such that the |
760 | 175 | // users of the pair are also paired. This defines a (directed) forest |
761 | 175 | // over the pairs such that two pairs are connected iff the second pair |
762 | 175 | // uses the first. |
763 | 175 | |
764 | 175 | // Note that it only matters that both members of the second pair use some |
765 | 175 | // element of the first pair (to allow for splatting). |
766 | 175 | |
767 | 175 | DenseMap<ValuePair, std::vector<ValuePair> > ConnectedPairs, |
768 | 175 | ConnectedPairDeps; |
769 | 175 | DenseMap<VPPair, unsigned> PairConnectionTypes; |
770 | 175 | computeConnectedPairs(CandidatePairs, CandidatePairsSet, |
771 | 175 | PairableInsts, ConnectedPairs, PairConnectionTypes); |
772 | 175 | if (ConnectedPairs.empty()175 ) continue88 ; |
773 | 175 | |
774 | 87 | for (DenseMap<ValuePair, std::vector<ValuePair> >::iterator |
775 | 87 | I = ConnectedPairs.begin(), IE = ConnectedPairs.end(); |
776 | 3.58k | I != IE3.58k ; ++I3.49k ) |
777 | 3.49k | for (std::vector<ValuePair>::iterator J = I->second.begin(), |
778 | 8.57k | JE = I->second.end(); J != JE8.57k ; ++J5.07k ) |
779 | 5.07k | ConnectedPairDeps[*J].push_back(I->first); |
780 | 87 | |
781 | 87 | // Build the pairable-instruction dependency map |
782 | 87 | DenseSet<ValuePair> PairableInstUsers; |
783 | 87 | buildDepMap(BB, CandidatePairs, PairableInsts, PairableInstUsers); |
784 | 87 | |
785 | 87 | // There is now a graph of the connected pairs. For each variable, pick |
786 | 87 | // the pairing with the largest dag meeting the depth requirement on at |
787 | 87 | // least one branch. Then select all pairings that are part of that dag |
788 | 87 | // and remove them from the list of available pairings and pairable |
789 | 87 | // variables. |
790 | 87 | |
791 | 87 | DenseMap<Value *, Value *> ChosenPairs; |
792 | 87 | choosePairs(CandidatePairs, CandidatePairsSet, |
793 | 87 | CandidatePairCostSavings, |
794 | 87 | PairableInsts, FixedOrderPairs, PairConnectionTypes, |
795 | 87 | ConnectedPairs, ConnectedPairDeps, |
796 | 87 | PairableInstUsers, ChosenPairs); |
797 | 87 | |
798 | 87 | if (ChosenPairs.empty()87 ) continue17 ; |
799 | 70 | AllPairableInsts.insert(AllPairableInsts.end(), PairableInsts.begin(), |
800 | 70 | PairableInsts.end()); |
801 | 70 | AllChosenPairs.insert(ChosenPairs.begin(), ChosenPairs.end()); |
802 | 70 | |
803 | 70 | // Only for the chosen pairs, propagate information on fixed-order pairs, |
804 | 70 | // pair connections, and their types to the data structures used by the |
805 | 70 | // pair fusion procedures. |
806 | 70 | for (DenseMap<Value *, Value *>::iterator I = ChosenPairs.begin(), |
807 | 457 | IE = ChosenPairs.end(); I != IE457 ; ++I387 ) {387 |
808 | 387 | if (FixedOrderPairs.count(*I)) |
809 | 96 | AllFixedOrderPairs.insert(*I); |
810 | 291 | else if (291 FixedOrderPairs.count(ValuePair(I->second, I->first))291 ) |
811 | 8 | AllFixedOrderPairs.insert(ValuePair(I->second, I->first)); |
812 | 387 | |
813 | 387 | for (DenseMap<Value *, Value *>::iterator J = ChosenPairs.begin(); |
814 | 6.10k | J != IE6.10k ; ++J5.71k ) {5.71k |
815 | 5.71k | DenseMap<VPPair, unsigned>::iterator K = |
816 | 5.71k | PairConnectionTypes.find(VPPair(*I, *J)); |
817 | 5.71k | if (K != PairConnectionTypes.end()5.71k ) {316 |
818 | 316 | AllPairConnectionTypes.insert(*K); |
819 | 5.39k | } else { |
820 | 5.39k | K = PairConnectionTypes.find(VPPair(*J, *I)); |
821 | 5.39k | if (K != PairConnectionTypes.end()) |
822 | 316 | AllPairConnectionTypes.insert(*K); |
823 | 5.39k | } |
824 | 5.71k | } |
825 | 387 | } |
826 | 70 | |
827 | 70 | for (DenseMap<ValuePair, std::vector<ValuePair> >::iterator |
828 | 70 | I = ConnectedPairs.begin(), IE = ConnectedPairs.end(); |
829 | 3.01k | I != IE3.01k ; ++I2.94k ) |
830 | 2.94k | for (std::vector<ValuePair>::iterator J = I->second.begin(), |
831 | 7.42k | JE = I->second.end(); J != JE7.42k ; ++J4.47k ) |
832 | 4.47k | if (4.47k AllPairConnectionTypes.count(VPPair(I->first, *J))4.47k ) {391 |
833 | 391 | AllConnectedPairs[I->first].push_back(*J); |
834 | 391 | AllConnectedPairDeps[*J].push_back(I->first); |
835 | 391 | } |
836 | 234 | } while (ShouldContinue); |
837 | 233 | |
838 | 233 | if (AllChosenPairs.empty()233 ) return false163 ; |
839 | 70 | NumFusedOps += AllChosenPairs.size(); |
840 | 70 | |
841 | 70 | // A set of pairs has now been selected. It is now necessary to replace the |
842 | 70 | // paired instructions with vector instructions. For this procedure each |
843 | 70 | // operand must be replaced with a vector operand. This vector is formed |
844 | 70 | // by using build_vector on the old operands. The replaced values are then |
845 | 70 | // replaced with a vector_extract on the result. Subsequent optimization |
846 | 70 | // passes should coalesce the build/extract combinations. |
847 | 70 | |
848 | 70 | fuseChosenPairs(BB, AllPairableInsts, AllChosenPairs, AllFixedOrderPairs, |
849 | 70 | AllPairConnectionTypes, |
850 | 70 | AllConnectedPairs, AllConnectedPairDeps); |
851 | 70 | |
852 | 70 | // It is important to cleanup here so that future iterations of this |
853 | 70 | // function have less work to do. |
854 | 70 | (void)SimplifyInstructionsInBlock(&BB, TLI); |
855 | 70 | return true; |
856 | 233 | } |
857 | | |
858 | | // This function returns true if the provided instruction is capable of being |
859 | | // fused into a vector instruction. This determination is based only on the |
860 | | // type and other attributes of the instruction. |
861 | | bool BBVectorize::isInstVectorizable(Instruction *I, |
862 | 5.33k | bool &IsSimpleLoadStore) { |
863 | 5.33k | IsSimpleLoadStore = false; |
864 | 5.33k | |
865 | 5.33k | if (CallInst *C5.33k = dyn_cast<CallInst>(I)) {154 |
866 | 154 | if (!isVectorizableIntrinsic(C)) |
867 | 72 | return false; |
868 | 5.18k | } else if (LoadInst *5.18k L5.18k = dyn_cast<LoadInst>(I)) {436 |
869 | 436 | // Vectorize simple loads if possbile: |
870 | 436 | IsSimpleLoadStore = L->isSimple(); |
871 | 436 | if (!IsSimpleLoadStore || 436 !Config.VectorizeMemOps436 ) |
872 | 0 | return false; |
873 | 4.74k | } else if (StoreInst *4.74k S4.74k = dyn_cast<StoreInst>(I)) {282 |
874 | 282 | // Vectorize simple stores if possbile: |
875 | 282 | IsSimpleLoadStore = S->isSimple(); |
876 | 282 | if (!IsSimpleLoadStore || 282 !Config.VectorizeMemOps282 ) |
877 | 0 | return false; |
878 | 4.46k | } else if (CastInst *4.46k C4.46k = dyn_cast<CastInst>(I)) {544 |
879 | 544 | // We can vectorize casts, but not casts of pointer types, etc. |
880 | 544 | if (!Config.VectorizeCasts) |
881 | 0 | return false; |
882 | 544 | |
883 | 544 | Type *SrcTy = C->getSrcTy(); |
884 | 544 | if (!SrcTy->isSingleValueType()) |
885 | 0 | return false; |
886 | 544 | |
887 | 544 | Type *DestTy = C->getDestTy(); |
888 | 544 | if (!DestTy->isSingleValueType()) |
889 | 0 | return false; |
890 | 3.92k | } else if (SelectInst *3.92k SI3.92k = dyn_cast<SelectInst>(I)) {26 |
891 | 26 | if (!Config.VectorizeSelect) |
892 | 0 | return false; |
893 | 26 | // We can vectorize a select if either all operands are scalars, |
894 | 26 | // or all operands are vectors. Trying to "widen" a select between |
895 | 26 | // vectors that has a scalar condition results in a malformed select. |
896 | 26 | // FIXME: We could probably be smarter about this by rewriting the select |
897 | 26 | // with different types instead. |
898 | 26 | return (SI->getCondition()->getType()->isVectorTy() == |
899 | 26 | SI->getTrueValue()->getType()->isVectorTy()); |
900 | 3.89k | } else if (3.89k isa<CmpInst>(I)3.89k ) {31 |
901 | 31 | if (!Config.VectorizeCmp) |
902 | 0 | return false; |
903 | 3.86k | } else if (GetElementPtrInst *3.86k G3.86k = dyn_cast<GetElementPtrInst>(I)) {503 |
904 | 503 | if (!Config.VectorizeGEP) |
905 | 0 | return false; |
906 | 503 | |
907 | 503 | // Currently, vector GEPs exist only with one index. |
908 | 503 | if (503 G->getNumIndices() != 1503 ) |
909 | 242 | return false; |
910 | 3.36k | } else if (3.36k !(I->isBinaryOp() || 3.36k isa<ShuffleVectorInst>(I)1.57k || |
911 | 1.42k | isa<ExtractElementInst>(I)1.42k || isa<InsertElementInst>(I)1.17k )) {268 |
912 | 268 | return false; |
913 | 268 | } |
914 | 5.33k | |
915 | 4.72k | Type *T1, *T2; |
916 | 4.72k | getInstructionTypes(I, T1, T2); |
917 | 4.72k | |
918 | 4.72k | // Not every type can be vectorized... |
919 | 4.72k | if (!(VectorType::isValidElementType(T1) || 4.72k T1->isVectorTy()1.83k ) || |
920 | 4.72k | !(VectorType::isValidElementType(T2) || 4.72k T2->isVectorTy()1.83k )) |
921 | 0 | return false; |
922 | 4.72k | |
923 | 4.72k | if (4.72k T1->getScalarSizeInBits() == 14.72k ) {48 |
924 | 48 | if (!Config.VectorizeBools) |
925 | 14 | return false; |
926 | 4.68k | } else { |
927 | 4.68k | if (!Config.VectorizeInts && 4.68k T1->isIntOrIntVectorTy()0 ) |
928 | 0 | return false; |
929 | 4.68k | } |
930 | 4.72k | |
931 | 4.71k | if (4.71k T2->getScalarSizeInBits() == 14.71k ) {12 |
932 | 12 | if (!Config.VectorizeBools) |
933 | 0 | return false; |
934 | 4.70k | } else { |
935 | 4.70k | if (!Config.VectorizeInts && 4.70k T2->isIntOrIntVectorTy()0 ) |
936 | 0 | return false; |
937 | 4.70k | } |
938 | 4.71k | |
939 | 4.71k | if (4.71k !Config.VectorizeFloats4.71k |
940 | 0 | && (T1->isFPOrFPVectorTy() || 0 T2->isFPOrFPVectorTy()0 )) |
941 | 0 | return false; |
942 | 4.71k | |
943 | 4.71k | // Don't vectorize target-specific types. |
944 | 4.71k | if (4.71k T1->isX86_FP80Ty() || 4.71k T1->isPPC_FP128Ty()4.71k || T1->isX86_MMXTy()4.70k ) |
945 | 7 | return false; |
946 | 4.70k | if (4.70k T2->isX86_FP80Ty() || 4.70k T2->isPPC_FP128Ty()4.70k || T2->isX86_MMXTy()4.70k ) |
947 | 0 | return false; |
948 | 4.70k | |
949 | 4.70k | if (4.70k !Config.VectorizePointers && 4.70k (T1->getScalarType()->isPointerTy() ||4.70k |
950 | 4.13k | T2->getScalarType()->isPointerTy())) |
951 | 574 | return false; |
952 | 4.70k | |
953 | 4.13k | if (4.13k !TTI && 4.13k (T1->getPrimitiveSizeInBits() >= Config.VectorBits ||1.82k |
954 | 931 | T2->getPrimitiveSizeInBits() >= Config.VectorBits)) |
955 | 908 | return false; |
956 | 4.13k | |
957 | 3.22k | return true; |
958 | 4.13k | } |
959 | | |
960 | | // This function returns true if the two provided instructions are compatible |
961 | | // (meaning that they can be fused into a vector instruction). This assumes |
962 | | // that I has already been determined to be vectorizable and that J is not |
963 | | // in the use dag of I. |
964 | | bool BBVectorize::areInstsCompatible(Instruction *I, Instruction *J, |
965 | | bool IsSimpleLoadStore, bool NonPow2Len, |
966 | 93.9k | int &CostSavings, int &FixedOrder) { |
967 | 93.9k | DEBUG(if (DebugInstructionExamination) dbgs() << "BBV: looking at " << *I << |
968 | 93.9k | " <-> " << *J << "\n"); |
969 | 93.9k | |
970 | 93.9k | CostSavings = 0; |
971 | 93.9k | FixedOrder = 0; |
972 | 93.9k | |
973 | 93.9k | // Loads and stores can be merged if they have different alignments, |
974 | 93.9k | // but are otherwise the same. |
975 | 93.9k | if (!J->isSameOperationAs(I, Instruction::CompareIgnoringAlignment | |
976 | 76.9k | (NonPow2Len ? Instruction::CompareUsingScalarTypes17.0k : 076.9k ))) |
977 | 81.5k | return false; |
978 | 93.9k | |
979 | 12.4k | Type *IT1, *IT2, *JT1, *JT2; |
980 | 12.4k | getInstructionTypes(I, IT1, IT2); |
981 | 12.4k | getInstructionTypes(J, JT1, JT2); |
982 | 12.4k | unsigned MaxTypeBits = std::max( |
983 | 12.4k | IT1->getPrimitiveSizeInBits() + JT1->getPrimitiveSizeInBits(), |
984 | 12.4k | IT2->getPrimitiveSizeInBits() + JT2->getPrimitiveSizeInBits()); |
985 | 12.4k | if (!TTI && 12.4k MaxTypeBits > Config.VectorBits1.50k ) |
986 | 83 | return false; |
987 | 12.4k | |
988 | 12.4k | // FIXME: handle addsub-type operations! |
989 | 12.4k | |
990 | 12.3k | if (12.3k IsSimpleLoadStore12.3k ) {1.67k |
991 | 1.67k | Value *IPtr, *JPtr; |
992 | 1.67k | unsigned IAlignment, JAlignment, IAddressSpace, JAddressSpace; |
993 | 1.67k | int64_t OffsetInElmts = 0; |
994 | 1.67k | if (getPairPtrInfo(I, J, IPtr, JPtr, IAlignment, JAlignment, |
995 | 1.67k | IAddressSpace, JAddressSpace, OffsetInElmts) && |
996 | 1.15k | std::abs(OffsetInElmts) == 11.15k ) {239 |
997 | 239 | FixedOrder = (int) OffsetInElmts; |
998 | 239 | unsigned BottomAlignment = IAlignment; |
999 | 239 | if (OffsetInElmts < 0239 ) BottomAlignment = JAlignment19 ; |
1000 | 239 | |
1001 | 239 | Type *aTypeI = isa<StoreInst>(I) ? |
1002 | 141 | cast<StoreInst>(I)->getValueOperand()->getType()141 : I->getType()98 ; |
1003 | 239 | Type *aTypeJ = isa<StoreInst>(J) ? |
1004 | 141 | cast<StoreInst>(J)->getValueOperand()->getType()141 : J->getType()98 ; |
1005 | 239 | Type *VType = getVecTypeForPair(aTypeI, aTypeJ); |
1006 | 239 | |
1007 | 239 | if (Config.AlignedOnly239 ) {16 |
1008 | 16 | // An aligned load or store is possible only if the instruction |
1009 | 16 | // with the lower offset has an alignment suitable for the |
1010 | 16 | // vector type. |
1011 | 16 | const DataLayout &DL = I->getModule()->getDataLayout(); |
1012 | 16 | unsigned VecAlignment = DL.getPrefTypeAlignment(VType); |
1013 | 16 | if (BottomAlignment < VecAlignment) |
1014 | 15 | return false; |
1015 | 16 | } |
1016 | 239 | |
1017 | 224 | if (224 TTI224 ) {205 |
1018 | 205 | unsigned ICost = TTI->getMemoryOpCost(I->getOpcode(), aTypeI, |
1019 | 205 | IAlignment, IAddressSpace); |
1020 | 205 | unsigned JCost = TTI->getMemoryOpCost(J->getOpcode(), aTypeJ, |
1021 | 205 | JAlignment, JAddressSpace); |
1022 | 205 | unsigned VCost = TTI->getMemoryOpCost(I->getOpcode(), VType, |
1023 | 205 | BottomAlignment, |
1024 | 205 | IAddressSpace); |
1025 | 205 | |
1026 | 205 | ICost += TTI->getAddressComputationCost(aTypeI); |
1027 | 205 | JCost += TTI->getAddressComputationCost(aTypeJ); |
1028 | 205 | VCost += TTI->getAddressComputationCost(VType); |
1029 | 205 | |
1030 | 205 | if (VCost > ICost + JCost) |
1031 | 0 | return false; |
1032 | 205 | |
1033 | 205 | // We don't want to fuse to a type that will be split, even |
1034 | 205 | // if the two input types will also be split and there is no other |
1035 | 205 | // associated cost. |
1036 | 205 | unsigned VParts = TTI->getNumberOfParts(VType); |
1037 | 205 | if (VParts > 1) |
1038 | 38 | return false; |
1039 | 167 | else if (167 !VParts && 167 VCost == ICost + JCost15 ) |
1040 | 0 | return false; |
1041 | 205 | |
1042 | 167 | CostSavings = ICost + JCost - VCost; |
1043 | 167 | } |
1044 | 1.43k | } else { |
1045 | 1.43k | return false; |
1046 | 1.43k | } |
1047 | 10.6k | } else if (10.6k TTI10.6k ) {9.34k |
1048 | 9.34k | TargetTransformInfo::OperandValueKind Op1VK = |
1049 | 9.34k | TargetTransformInfo::OK_AnyValue; |
1050 | 9.34k | TargetTransformInfo::OperandValueKind Op2VK = |
1051 | 9.34k | TargetTransformInfo::OK_AnyValue; |
1052 | 9.34k | unsigned ICost = getInstrCost(I->getOpcode(), IT1, IT2, Op1VK, Op2VK, I); |
1053 | 9.34k | unsigned JCost = getInstrCost(J->getOpcode(), JT1, JT2, Op1VK, Op2VK, J); |
1054 | 9.34k | Type *VT1 = getVecTypeForPair(IT1, JT1), |
1055 | 9.34k | *VT2 = getVecTypeForPair(IT2, JT2); |
1056 | 9.34k | |
1057 | 9.34k | // On some targets (example X86) the cost of a vector shift may vary |
1058 | 9.34k | // depending on whether the second operand is a Uniform or |
1059 | 9.34k | // NonUniform Constant. |
1060 | 9.34k | switch (I->getOpcode()) { |
1061 | 8.99k | default : break; |
1062 | 350 | case Instruction::Shl: |
1063 | 350 | case Instruction::LShr: |
1064 | 350 | case Instruction::AShr: |
1065 | 350 | |
1066 | 350 | // If both I and J are scalar shifts by constant, then the |
1067 | 350 | // merged vector shift count would be either a constant splat value |
1068 | 350 | // or a non-uniform vector of constants. |
1069 | 350 | if (ConstantInt *CII350 = dyn_cast<ConstantInt>(I->getOperand(1))) {345 |
1070 | 345 | if (ConstantInt *CIJ = dyn_cast<ConstantInt>(J->getOperand(1))) |
1071 | 343 | Op2VK = CII == CIJ ? 343 TargetTransformInfo::OK_UniformConstantValue41 : |
1072 | 302 | TargetTransformInfo::OK_NonUniformConstantValue; |
1073 | 5 | } else { |
1074 | 5 | // Check for a splat of a constant or for a non uniform vector |
1075 | 5 | // of constants. |
1076 | 5 | Value *IOp = I->getOperand(1); |
1077 | 5 | Value *JOp = J->getOperand(1); |
1078 | 5 | if ((isa<ConstantVector>(IOp) || 5 isa<ConstantDataVector>(IOp)5 ) && |
1079 | 5 | (isa<ConstantVector>(JOp) || 5 isa<ConstantDataVector>(JOp)5 )) {0 |
1080 | 0 | Op2VK = TargetTransformInfo::OK_NonUniformConstantValue; |
1081 | 0 | Constant *SplatValue = cast<Constant>(IOp)->getSplatValue(); |
1082 | 0 | if (SplatValue != nullptr && |
1083 | 0 | SplatValue == cast<Constant>(JOp)->getSplatValue()) |
1084 | 0 | Op2VK = TargetTransformInfo::OK_UniformConstantValue; |
1085 | 0 | } |
1086 | 5 | } |
1087 | 9.34k | } |
1088 | 9.34k | |
1089 | 9.34k | // Note that this procedure is incorrect for insert and extract element |
1090 | 9.34k | // instructions (because combining these often results in a shuffle), |
1091 | 9.34k | // but this cost is ignored (because insert and extract element |
1092 | 9.34k | // instructions are assigned a zero depth factor and are not really |
1093 | 9.34k | // fused in general). |
1094 | 9.34k | unsigned VCost = getInstrCost(I->getOpcode(), VT1, VT2, Op1VK, Op2VK, I); |
1095 | 9.34k | |
1096 | 9.34k | if (VCost > ICost + JCost) |
1097 | 390 | return false; |
1098 | 9.34k | |
1099 | 9.34k | // We don't want to fuse to a type that will be split, even |
1100 | 9.34k | // if the two input types will also be split and there is no other |
1101 | 9.34k | // associated cost. |
1102 | 8.95k | unsigned VParts1 = TTI->getNumberOfParts(VT1), |
1103 | 8.95k | VParts2 = TTI->getNumberOfParts(VT2); |
1104 | 8.95k | if (VParts1 > 1 || 8.95k VParts2 > 16.51k ) |
1105 | 2.43k | return false; |
1106 | 6.51k | else if (6.51k (!VParts1 || 6.51k !VParts26.50k ) && VCost == ICost + JCost11 ) |
1107 | 0 | return false; |
1108 | 8.95k | |
1109 | 6.51k | CostSavings = ICost + JCost - VCost; |
1110 | 6.51k | } |
1111 | 12.3k | |
1112 | 12.3k | // The powi,ctlz,cttz intrinsics are special because only the first |
1113 | 12.3k | // argument is vectorized, the second arguments must be equal. |
1114 | 8.03k | CallInst *CI = dyn_cast<CallInst>(I); |
1115 | 8.03k | Function *FI; |
1116 | 8.03k | if (CI && 8.03k (FI = CI->getCalledFunction())24 ) {24 |
1117 | 24 | Intrinsic::ID IID = FI->getIntrinsicID(); |
1118 | 24 | if (IID == Intrinsic::powi || 24 IID == Intrinsic::ctlz20 || |
1119 | 18 | IID == Intrinsic::cttz18 ) {8 |
1120 | 8 | Value *A1I = CI->getArgOperand(1), |
1121 | 8 | *A1J = cast<CallInst>(J)->getArgOperand(1); |
1122 | 8 | const SCEV *A1ISCEV = SE->getSCEV(A1I), |
1123 | 8 | *A1JSCEV = SE->getSCEV(A1J); |
1124 | 8 | return (A1ISCEV == A1JSCEV); |
1125 | 8 | } |
1126 | 24 | |
1127 | 16 | if (16 IID && 16 TTI16 ) {3 |
1128 | 3 | FastMathFlags FMFCI; |
1129 | 3 | if (auto *FPMOCI = dyn_cast<FPMathOperator>(CI)) |
1130 | 3 | FMFCI = FPMOCI->getFastMathFlags(); |
1131 | 3 | SmallVector<Value *, 4> IArgs(CI->arg_operands()); |
1132 | 3 | unsigned ICost = TTI->getIntrinsicInstrCost(IID, IT1, IArgs, FMFCI); |
1133 | 3 | |
1134 | 3 | CallInst *CJ = cast<CallInst>(J); |
1135 | 3 | |
1136 | 3 | FastMathFlags FMFCJ; |
1137 | 3 | if (auto *FPMOCJ = dyn_cast<FPMathOperator>(CJ)) |
1138 | 3 | FMFCJ = FPMOCJ->getFastMathFlags(); |
1139 | 3 | |
1140 | 3 | SmallVector<Value *, 4> JArgs(CJ->arg_operands()); |
1141 | 3 | unsigned JCost = TTI->getIntrinsicInstrCost(IID, JT1, JArgs, FMFCJ); |
1142 | 3 | |
1143 | 3 | assert(CI->getNumArgOperands() == CJ->getNumArgOperands() && |
1144 | 3 | "Intrinsic argument counts differ"); |
1145 | 3 | SmallVector<Type*, 4> Tys; |
1146 | 3 | SmallVector<Value *, 4> VecArgs; |
1147 | 10 | for (unsigned i = 0, ie = CI->getNumArgOperands(); i != ie10 ; ++i7 ) {7 |
1148 | 7 | if ((IID == Intrinsic::powi || 7 IID == Intrinsic::ctlz7 || |
1149 | 7 | IID == Intrinsic::cttz7 ) && i == 10 ) {0 |
1150 | 0 | Tys.push_back(CI->getArgOperand(i)->getType()); |
1151 | 0 | VecArgs.push_back(CI->getArgOperand(i)); |
1152 | 0 | } |
1153 | 7 | else { |
1154 | 7 | Tys.push_back(getVecTypeForPair(CI->getArgOperand(i)->getType(), |
1155 | 7 | CJ->getArgOperand(i)->getType())); |
1156 | 7 | // Add both operands, and then count their scalarization overhead |
1157 | 7 | // with VF 1. |
1158 | 7 | VecArgs.push_back(CI->getArgOperand(i)); |
1159 | 7 | VecArgs.push_back(CJ->getArgOperand(i)); |
1160 | 7 | } |
1161 | 7 | } |
1162 | 3 | |
1163 | 3 | // Compute the scalarization cost here with the original operands (to |
1164 | 3 | // check for uniqueness etc), and then call getIntrinsicInstrCost() |
1165 | 3 | // with the constructed vector types. |
1166 | 3 | Type *RetTy = getVecTypeForPair(IT1, JT1); |
1167 | 3 | unsigned ScalarizationCost = 0; |
1168 | 3 | if (!RetTy->isVoidTy()) |
1169 | 3 | ScalarizationCost += TTI->getScalarizationOverhead(RetTy, true, false); |
1170 | 3 | ScalarizationCost += TTI->getOperandsScalarizationOverhead(VecArgs, 1); |
1171 | 3 | |
1172 | 3 | FastMathFlags FMFV = FMFCI; |
1173 | 3 | FMFV &= FMFCJ; |
1174 | 3 | unsigned VCost = TTI->getIntrinsicInstrCost(IID, RetTy, Tys, FMFV, |
1175 | 3 | ScalarizationCost); |
1176 | 3 | |
1177 | 3 | if (VCost > ICost + JCost) |
1178 | 2 | return false; |
1179 | 3 | |
1180 | 3 | // We don't want to fuse to a type that will be split, even |
1181 | 3 | // if the two input types will also be split and there is no other |
1182 | 3 | // associated cost. |
1183 | 1 | unsigned RetParts = TTI->getNumberOfParts(RetTy); |
1184 | 1 | if (RetParts > 1) |
1185 | 0 | return false; |
1186 | 1 | else if (1 !RetParts && 1 VCost == ICost + JCost0 ) |
1187 | 0 | return false; |
1188 | 1 | |
1189 | 4 | for (unsigned i = 0, ie = CI->getNumArgOperands(); 1 i != ie4 ; ++i3 ) {3 |
1190 | 3 | if (!Tys[i]->isVectorTy()) |
1191 | 0 | continue; |
1192 | 3 | |
1193 | 3 | unsigned NumParts = TTI->getNumberOfParts(Tys[i]); |
1194 | 3 | if (NumParts > 1) |
1195 | 0 | return false; |
1196 | 3 | else if (3 !NumParts && 3 VCost == ICost + JCost0 ) |
1197 | 0 | return false; |
1198 | 3 | } |
1199 | 1 | |
1200 | 1 | CostSavings = ICost + JCost - VCost; |
1201 | 1 | } |
1202 | 16 | } |
1203 | 8.03k | |
1204 | 8.02k | return true; |
1205 | 8.03k | } |
1206 | | |
1207 | | // Figure out whether or not J uses I and update the users and write-set |
1208 | | // structures associated with I. Specifically, Users represents the set of |
1209 | | // instructions that depend on I. WriteSet represents the set |
1210 | | // of memory locations that are dependent on I. If UpdateUsers is true, |
1211 | | // and J uses I, then Users is updated to contain J and WriteSet is updated |
1212 | | // to contain any memory locations to which J writes. The function returns |
1213 | | // true if J uses I. By default, alias analysis is used to determine |
1214 | | // whether J reads from memory that overlaps with a location in WriteSet. |
1215 | | // If LoadMoveSet is not null, then it is a previously-computed map |
1216 | | // where the key is the memory-based user instruction and the value is |
1217 | | // the instruction to be compared with I. So, if LoadMoveSet is provided, |
1218 | | // then the alias analysis is not used. This is necessary because this |
1219 | | // function is called during the process of moving instructions during |
1220 | | // vectorization and the results of the alias analysis are not stable during |
1221 | | // that process. |
1222 | | bool BBVectorize::trackUsesOfI(DenseSet<Value *> &Users, |
1223 | | AliasSetTracker &WriteSet, Instruction *I, |
1224 | | Instruction *J, bool UpdateUsers, |
1225 | 221k | DenseSet<ValuePair> *LoadMoveSetPairs) { |
1226 | 221k | bool UsesI = false; |
1227 | 221k | |
1228 | 221k | // This instruction may already be marked as a user due, for example, to |
1229 | 221k | // being a member of a selected pair. |
1230 | 221k | if (Users.count(J)) |
1231 | 0 | UsesI = true; |
1232 | 221k | |
1233 | 221k | if (!UsesI) |
1234 | 221k | for (User::op_iterator JU = J->op_begin(), JE = J->op_end(); |
1235 | 583k | JU != JE583k ; ++JU362k ) {403k |
1236 | 403k | Value *V = *JU; |
1237 | 403k | if (I == V || 403k Users.count(V)397k ) {40.8k |
1238 | 40.8k | UsesI = true; |
1239 | 40.8k | break; |
1240 | 40.8k | } |
1241 | 403k | } |
1242 | 221k | if (!UsesI && 221k J->mayReadFromMemory()180k ) {23.3k |
1243 | 23.3k | if (LoadMoveSetPairs23.3k ) {232 |
1244 | 232 | UsesI = LoadMoveSetPairs->count(ValuePair(J, I)); |
1245 | 23.1k | } else { |
1246 | 23.1k | for (AliasSetTracker::iterator W = WriteSet.begin(), |
1247 | 35.6k | WE = WriteSet.end(); W != WE35.6k ; ++W12.4k ) {20.2k |
1248 | 20.2k | if (W->aliasesUnknownInst(J, *AA)20.2k ) {7.71k |
1249 | 7.71k | UsesI = true; |
1250 | 7.71k | break; |
1251 | 7.71k | } |
1252 | 20.2k | } |
1253 | 23.1k | } |
1254 | 23.3k | } |
1255 | 221k | |
1256 | 221k | if (UsesI && 221k UpdateUsers48.5k ) {48.5k |
1257 | 48.5k | if (J->mayWriteToMemory()48.5k ) WriteSet.add(J)7.89k ; |
1258 | 48.5k | Users.insert(J); |
1259 | 48.5k | } |
1260 | 221k | |
1261 | 221k | return UsesI; |
1262 | 221k | } |
1263 | | |
1264 | | // This function iterates over all instruction pairs in the provided |
1265 | | // basic block and collects all candidate pairs for vectorization. |
1266 | | bool BBVectorize::getCandidatePairs(BasicBlock &BB, |
1267 | | BasicBlock::iterator &Start, |
1268 | | DenseMap<Value *, std::vector<Value *> > &CandidatePairs, |
1269 | | DenseSet<ValuePair> &FixedOrderPairs, |
1270 | | DenseMap<ValuePair, int> &CandidatePairCostSavings, |
1271 | 234 | std::vector<Value *> &PairableInsts, bool NonPow2Len) { |
1272 | 234 | size_t TotalPairs = 0; |
1273 | 234 | BasicBlock::iterator E = BB.end(); |
1274 | 234 | if (Start == E234 ) return false0 ; |
1275 | 234 | |
1276 | 234 | bool ShouldContinue = false, IAfterStart = false; |
1277 | 5.57k | for (BasicBlock::iterator I = Start++; I != E5.57k ; ++I5.33k ) {5.33k |
1278 | 5.33k | if (I == Start5.33k ) IAfterStart = true446 ; |
1279 | 5.33k | |
1280 | 5.33k | bool IsSimpleLoadStore; |
1281 | 5.33k | if (!isInstVectorizable(&*I, IsSimpleLoadStore)) |
1282 | 2.09k | continue; |
1283 | 5.33k | |
1284 | 5.33k | // Look for an instruction with which to pair instruction *I... |
1285 | 3.24k | DenseSet<Value *> Users; |
1286 | 3.24k | AliasSetTracker WriteSet(*AA); |
1287 | 3.24k | if (I->mayWriteToMemory()) |
1288 | 268 | WriteSet.add(&*I); |
1289 | 3.24k | |
1290 | 3.24k | bool JAfterStart = IAfterStart; |
1291 | 3.24k | BasicBlock::iterator J = std::next(I); |
1292 | 132k | for (unsigned ss = 0; J != E && 132k ss <= Config.SearchLimit129k ; ++J, ++ss129k ) {129k |
1293 | 129k | if (J == Start) |
1294 | 2.68k | JAfterStart = true; |
1295 | 129k | |
1296 | 129k | // Determine if J uses I, if so, exit the loop. |
1297 | 129k | bool UsesI = trackUsesOfI(Users, WriteSet, &*I, &*J, !Config.FastDep); |
1298 | 129k | if (Config.FastDep129k ) {0 |
1299 | 0 | // Note: For this heuristic to be effective, independent operations |
1300 | 0 | // must tend to be intermixed. This is likely to be true from some |
1301 | 0 | // kinds of grouped loop unrolling (but not the generic LLVM pass), |
1302 | 0 | // but otherwise may require some kind of reordering pass. |
1303 | 0 |
|
1304 | 0 | // When using fast dependency analysis, |
1305 | 0 | // stop searching after first use: |
1306 | 0 | if (UsesI0 ) break0 ; |
1307 | 129k | } else { |
1308 | 129k | if (UsesI129k ) continue35.1k ; |
1309 | 129k | } |
1310 | 129k | |
1311 | 129k | // J does not use I, and comes before the first use of I, so it can be |
1312 | 129k | // merged with I if the instructions are compatible. |
1313 | 93.9k | int CostSavings, FixedOrder; |
1314 | 93.9k | if (!areInstsCompatible(&*I, &*J, IsSimpleLoadStore, NonPow2Len, |
1315 | 93.9k | CostSavings, FixedOrder)) |
1316 | 85.9k | continue; |
1317 | 93.9k | |
1318 | 93.9k | // J is a candidate for merging with I. |
1319 | 8.02k | if (8.02k PairableInsts.empty() ||8.02k |
1320 | 7.85k | PairableInsts[PairableInsts.size() - 1] != &*I7.85k ) {1.35k |
1321 | 1.35k | PairableInsts.push_back(&*I); |
1322 | 1.35k | } |
1323 | 8.02k | |
1324 | 8.02k | CandidatePairs[&*I].push_back(&*J); |
1325 | 8.02k | ++TotalPairs; |
1326 | 8.02k | if (TTI) |
1327 | 6.67k | CandidatePairCostSavings.insert( |
1328 | 6.67k | ValuePairWithCost(ValuePair(&*I, &*J), CostSavings)); |
1329 | 8.02k | |
1330 | 8.02k | if (FixedOrder == 1) |
1331 | 168 | FixedOrderPairs.insert(ValuePair(&*I, &*J)); |
1332 | 7.85k | else if (7.85k FixedOrder == -17.85k ) |
1333 | 18 | FixedOrderPairs.insert(ValuePair(&*J, &*I)); |
1334 | 8.02k | |
1335 | 8.02k | // The next call to this function must start after the last instruction |
1336 | 8.02k | // selected during this invocation. |
1337 | 8.02k | if (JAfterStart8.02k ) {641 |
1338 | 641 | Start = std::next(J); |
1339 | 641 | IAfterStart = JAfterStart = false; |
1340 | 641 | } |
1341 | 8.02k | |
1342 | 8.02k | DEBUG(if (DebugCandidateSelection) dbgs() << "BBV: candidate pair " |
1343 | 8.02k | << *I << " <-> " << *J << " (cost savings: " << |
1344 | 8.02k | CostSavings << ")\n"); |
1345 | 8.02k | |
1346 | 8.02k | // If we have already found too many pairs, break here and this function |
1347 | 8.02k | // will be called again starting after the last instruction selected |
1348 | 8.02k | // during this invocation. |
1349 | 8.02k | if (PairableInsts.size() >= Config.MaxInsts || |
1350 | 8.02k | TotalPairs >= Config.MaxPairs8.02k ) {1 |
1351 | 1 | ShouldContinue = true; |
1352 | 1 | break; |
1353 | 1 | } |
1354 | 8.02k | } |
1355 | 3.24k | |
1356 | 3.24k | if (ShouldContinue) |
1357 | 1 | break; |
1358 | 3.24k | } |
1359 | 234 | |
1360 | 234 | DEBUG(dbgs() << "BBV: found " << PairableInsts.size() |
1361 | 234 | << " instructions with candidate pairs\n"); |
1362 | 234 | |
1363 | 234 | return ShouldContinue; |
1364 | 234 | } |
1365 | | |
1366 | | // Finds candidate pairs connected to the pair P = <PI, PJ>. This means that |
1367 | | // it looks for pairs such that both members have an input which is an |
1368 | | // output of PI or PJ. |
1369 | | void BBVectorize::computePairsConnectedTo( |
1370 | | DenseMap<Value *, std::vector<Value *> > &CandidatePairs, |
1371 | | DenseSet<ValuePair> &CandidatePairsSet, |
1372 | | std::vector<Value *> &PairableInsts, |
1373 | | DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairs, |
1374 | | DenseMap<VPPair, unsigned> &PairConnectionTypes, |
1375 | 8.02k | ValuePair P) { |
1376 | 8.02k | StoreInst *SI, *SJ; |
1377 | 8.02k | |
1378 | 8.02k | // For each possible pairing for this variable, look at the uses of |
1379 | 8.02k | // the first value... |
1380 | 8.02k | for (Value::user_iterator I = P.first->user_begin(), |
1381 | 8.02k | E = P.first->user_end(); |
1382 | 16.5k | I != E16.5k ; ++I8.55k ) {8.55k |
1383 | 8.55k | User *UI = *I; |
1384 | 8.55k | if (isa<LoadInst>(UI)8.55k ) {0 |
1385 | 0 | // A pair cannot be connected to a load because the load only takes one |
1386 | 0 | // operand (the address) and it is a scalar even after vectorization. |
1387 | 0 | continue; |
1388 | 8.55k | } else if (8.55k (SI = dyn_cast<StoreInst>(UI)) &&8.55k |
1389 | 638 | P.first == SI->getPointerOperand()638 ) {0 |
1390 | 0 | // Similarly, a pair cannot be connected to a store through its |
1391 | 0 | // pointer operand. |
1392 | 0 | continue; |
1393 | 0 | } |
1394 | 8.55k | |
1395 | 8.55k | // For each use of the first variable, look for uses of the second |
1396 | 8.55k | // variable... |
1397 | 9.65k | for (User *UJ : P.second->users()) 8.55k {9.65k |
1398 | 9.65k | if ((SJ = dyn_cast<StoreInst>(UJ)) && |
1399 | 693 | P.second == SJ->getPointerOperand()) |
1400 | 0 | continue; |
1401 | 9.65k | |
1402 | 9.65k | // Look for <I, J>: |
1403 | 9.65k | if (9.65k CandidatePairsSet.count(ValuePair(UI, UJ))9.65k ) {3.91k |
1404 | 3.91k | VPPair VP(P, ValuePair(UI, UJ)); |
1405 | 3.91k | ConnectedPairs[VP.first].push_back(VP.second); |
1406 | 3.91k | PairConnectionTypes.insert(VPPairWithType(VP, PairConnectionDirect)); |
1407 | 3.91k | } |
1408 | 9.65k | |
1409 | 9.65k | // Look for <J, I>: |
1410 | 9.65k | if (CandidatePairsSet.count(ValuePair(UJ, UI))9.65k ) {203 |
1411 | 203 | VPPair VP(P, ValuePair(UJ, UI)); |
1412 | 203 | ConnectedPairs[VP.first].push_back(VP.second); |
1413 | 203 | PairConnectionTypes.insert(VPPairWithType(VP, PairConnectionSwap)); |
1414 | 203 | } |
1415 | 9.65k | } |
1416 | 8.55k | |
1417 | 8.55k | if (Config.SplatBreaksChain8.55k ) continue0 ; |
1418 | 8.55k | // Look for cases where just the first value in the pair is used by |
1419 | 8.55k | // both members of another pair (splatting). |
1420 | 20.1k | for (Value::user_iterator J = P.first->user_begin(); 8.55k J != E20.1k ; ++J11.6k ) {11.6k |
1421 | 11.6k | User *UJ = *J; |
1422 | 11.6k | if ((SJ = dyn_cast<StoreInst>(UJ)) && |
1423 | 673 | P.first == SJ->getPointerOperand()) |
1424 | 0 | continue; |
1425 | 11.6k | |
1426 | 11.6k | if (11.6k CandidatePairsSet.count(ValuePair(UI, UJ))11.6k ) {659 |
1427 | 659 | VPPair VP(P, ValuePair(UI, UJ)); |
1428 | 659 | ConnectedPairs[VP.first].push_back(VP.second); |
1429 | 659 | PairConnectionTypes.insert(VPPairWithType(VP, PairConnectionSplat)); |
1430 | 659 | } |
1431 | 11.6k | } |
1432 | 8.55k | } |
1433 | 8.02k | |
1434 | 8.02k | if (Config.SplatBreaksChain8.02k ) return0 ; |
1435 | 8.02k | // Look for cases where just the second value in the pair is used by |
1436 | 8.02k | // both members of another pair (splatting). |
1437 | 8.02k | for (Value::user_iterator I = P.second->user_begin(), |
1438 | 8.02k | E = P.second->user_end(); |
1439 | 16.2k | I != E16.2k ; ++I8.19k ) {8.19k |
1440 | 8.19k | User *UI = *I; |
1441 | 8.19k | if (isa<LoadInst>(UI)) |
1442 | 0 | continue; |
1443 | 8.19k | else if (8.19k (SI = dyn_cast<StoreInst>(UI)) &&8.19k |
1444 | 682 | P.second == SI->getPointerOperand()) |
1445 | 0 | continue; |
1446 | 8.19k | |
1447 | 18.0k | for (Value::user_iterator J = P.second->user_begin(); 8.19k J != E18.0k ; ++J9.88k ) {9.88k |
1448 | 9.88k | User *UJ = *J; |
1449 | 9.88k | if ((SJ = dyn_cast<StoreInst>(UJ)) && |
1450 | 691 | P.second == SJ->getPointerOperand()) |
1451 | 0 | continue; |
1452 | 9.88k | |
1453 | 9.88k | if (9.88k CandidatePairsSet.count(ValuePair(UI, UJ))9.88k ) {299 |
1454 | 299 | VPPair VP(P, ValuePair(UI, UJ)); |
1455 | 299 | ConnectedPairs[VP.first].push_back(VP.second); |
1456 | 299 | PairConnectionTypes.insert(VPPairWithType(VP, PairConnectionSplat)); |
1457 | 299 | } |
1458 | 9.88k | } |
1459 | 8.19k | } |
1460 | 8.02k | } |
1461 | | |
1462 | | // This function figures out which pairs are connected. Two pairs are |
1463 | | // connected if some output of the first pair forms an input to both members |
1464 | | // of the second pair. |
1465 | | void BBVectorize::computeConnectedPairs( |
1466 | | DenseMap<Value *, std::vector<Value *> > &CandidatePairs, |
1467 | | DenseSet<ValuePair> &CandidatePairsSet, |
1468 | | std::vector<Value *> &PairableInsts, |
1469 | | DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairs, |
1470 | 175 | DenseMap<VPPair, unsigned> &PairConnectionTypes) { |
1471 | 175 | for (std::vector<Value *>::iterator PI = PairableInsts.begin(), |
1472 | 1.52k | PE = PairableInsts.end(); PI != PE1.52k ; ++PI1.35k ) {1.35k |
1473 | 1.35k | DenseMap<Value *, std::vector<Value *> >::iterator PP = |
1474 | 1.35k | CandidatePairs.find(*PI); |
1475 | 1.35k | if (PP == CandidatePairs.end()) |
1476 | 0 | continue; |
1477 | 1.35k | |
1478 | 1.35k | for (std::vector<Value *>::iterator P = PP->second.begin(), |
1479 | 9.37k | E = PP->second.end(); P != E9.37k ; ++P8.02k ) |
1480 | 8.02k | computePairsConnectedTo(CandidatePairs, CandidatePairsSet, |
1481 | 8.02k | PairableInsts, ConnectedPairs, |
1482 | 8.02k | PairConnectionTypes, ValuePair(*PI, *P)); |
1483 | 1.35k | } |
1484 | 175 | |
1485 | 175 | DEBUG(size_t TotalPairs = 0; |
1486 | 175 | for (DenseMap<ValuePair, std::vector<ValuePair> >::iterator I = |
1487 | 175 | ConnectedPairs.begin(), IE = ConnectedPairs.end(); I != IE; ++I) |
1488 | 175 | TotalPairs += I->second.size(); |
1489 | 175 | dbgs() << "BBV: found " << TotalPairs |
1490 | 175 | << " pair connections.\n"); |
1491 | 175 | } |
1492 | | |
1493 | | // This function builds a set of use tuples such that <A, B> is in the set |
1494 | | // if B is in the use dag of A. If B is in the use dag of A, then B |
1495 | | // depends on the output of A. |
1496 | | void BBVectorize::buildDepMap( |
1497 | | BasicBlock &BB, |
1498 | | DenseMap<Value *, std::vector<Value *> > &CandidatePairs, |
1499 | | std::vector<Value *> &PairableInsts, |
1500 | 87 | DenseSet<ValuePair> &PairableInstUsers) { |
1501 | 87 | DenseSet<Value *> IsInPair; |
1502 | 87 | for (DenseMap<Value *, std::vector<Value *> >::iterator C = |
1503 | 1.30k | CandidatePairs.begin(), E = CandidatePairs.end(); C != E1.30k ; ++C1.22k ) {1.22k |
1504 | 1.22k | IsInPair.insert(C->first); |
1505 | 1.22k | IsInPair.insert(C->second.begin(), C->second.end()); |
1506 | 1.22k | } |
1507 | 87 | |
1508 | 87 | // Iterate through the basic block, recording all users of each |
1509 | 87 | // pairable instruction. |
1510 | 87 | |
1511 | 87 | BasicBlock::iterator E = BB.end(), EL = |
1512 | 87 | BasicBlock::iterator(cast<Instruction>(PairableInsts.back())); |
1513 | 2.63k | for (BasicBlock::iterator I = BB.getFirstInsertionPt(); I != E2.63k ; ++I2.54k ) {2.63k |
1514 | 2.63k | if (IsInPair.find(&*I) == IsInPair.end()) |
1515 | 1.12k | continue; |
1516 | 2.63k | |
1517 | 1.50k | DenseSet<Value *> Users; |
1518 | 1.50k | AliasSetTracker WriteSet(*AA); |
1519 | 1.50k | if (I->mayWriteToMemory()) |
1520 | 120 | WriteSet.add(&*I); |
1521 | 1.50k | |
1522 | 61.3k | for (BasicBlock::iterator J = std::next(I); J != E61.3k ; ++J59.8k ) {61.2k |
1523 | 61.2k | (void)trackUsesOfI(Users, WriteSet, &*I, &*J); |
1524 | 61.2k | |
1525 | 61.2k | if (J == EL) |
1526 | 1.41k | break; |
1527 | 61.2k | } |
1528 | 1.50k | |
1529 | 1.50k | for (DenseSet<Value *>::iterator U = Users.begin(), E = Users.end(); |
1530 | 11.0k | U != E11.0k ; ++U9.55k ) {9.55k |
1531 | 9.55k | if (IsInPair.find(*U) == IsInPair.end()9.55k ) continue3.93k ; |
1532 | 5.62k | PairableInstUsers.insert(ValuePair(&*I, *U)); |
1533 | 5.62k | } |
1534 | 1.50k | |
1535 | 1.50k | if (I == EL) |
1536 | 87 | break; |
1537 | 1.50k | } |
1538 | 87 | } |
1539 | | |
1540 | | // Returns true if an input to pair P is an output of pair Q and also an |
1541 | | // input of pair Q is an output of pair P. If this is the case, then these |
1542 | | // two pairs cannot be simultaneously fused. |
1543 | | bool BBVectorize::pairsConflict(ValuePair P, ValuePair Q, |
1544 | | DenseSet<ValuePair> &PairableInstUsers, |
1545 | | DenseMap<ValuePair, std::vector<ValuePair> > *PairableInstUserMap, |
1546 | 30.7k | DenseSet<VPPair> *PairableInstUserPairSet) { |
1547 | 30.7k | // Two pairs are in conflict if they are mutual Users of eachother. |
1548 | 30.7k | bool QUsesP = PairableInstUsers.count(ValuePair(P.first, Q.first)) || |
1549 | 19.1k | PairableInstUsers.count(ValuePair(P.first, Q.second)) || |
1550 | 18.3k | PairableInstUsers.count(ValuePair(P.second, Q.first)) || |
1551 | 17.8k | PairableInstUsers.count(ValuePair(P.second, Q.second)); |
1552 | 30.7k | bool PUsesQ = PairableInstUsers.count(ValuePair(Q.first, P.first)) || |
1553 | 28.8k | PairableInstUsers.count(ValuePair(Q.first, P.second)) || |
1554 | 27.6k | PairableInstUsers.count(ValuePair(Q.second, P.first)) || |
1555 | 27.4k | PairableInstUsers.count(ValuePair(Q.second, P.second)); |
1556 | 30.7k | if (PairableInstUserMap30.7k ) {5.00k |
1557 | 5.00k | // FIXME: The expensive part of the cycle check is not so much the cycle |
1558 | 5.00k | // check itself but this edge insertion procedure. This needs some |
1559 | 5.00k | // profiling and probably a different data structure. |
1560 | 5.00k | if (PUsesQ5.00k ) {589 |
1561 | 589 | if (PairableInstUserPairSet->insert(VPPair(Q, P)).second) |
1562 | 248 | (*PairableInstUserMap)[Q].push_back(P); |
1563 | 589 | } |
1564 | 5.00k | if (QUsesP5.00k ) {2.35k |
1565 | 2.35k | if (PairableInstUserPairSet->insert(VPPair(P, Q)).second) |
1566 | 733 | (*PairableInstUserMap)[P].push_back(Q); |
1567 | 2.35k | } |
1568 | 5.00k | } |
1569 | 30.7k | |
1570 | 13.1k | return (QUsesP && PUsesQ); |
1571 | 30.7k | } |
1572 | | |
1573 | | // This function walks the use graph of current pairs to see if, starting |
1574 | | // from P, the walk returns to P. |
1575 | | bool BBVectorize::pairWillFormCycle(ValuePair P, |
1576 | | DenseMap<ValuePair, std::vector<ValuePair> > &PairableInstUserMap, |
1577 | 1.98k | DenseSet<ValuePair> &CurrentPairs) { |
1578 | 1.98k | DEBUG(if (DebugCycleCheck) |
1579 | 1.98k | dbgs() << "BBV: starting cycle check for : " << *P.first << " <-> " |
1580 | 1.98k | << *P.second << "\n"); |
1581 | 1.98k | // A lookup table of visisted pairs is kept because the PairableInstUserMap |
1582 | 1.98k | // contains non-direct associations. |
1583 | 1.98k | DenseSet<ValuePair> Visited; |
1584 | 1.98k | SmallVector<ValuePair, 32> Q; |
1585 | 1.98k | // General depth-first post-order traversal: |
1586 | 1.98k | Q.push_back(P); |
1587 | 2.63k | do { |
1588 | 2.63k | ValuePair QTop = Q.pop_back_val(); |
1589 | 2.63k | Visited.insert(QTop); |
1590 | 2.63k | |
1591 | 2.63k | DEBUG(if (DebugCycleCheck) |
1592 | 2.63k | dbgs() << "BBV: cycle check visiting: " << *QTop.first << " <-> " |
1593 | 2.63k | << *QTop.second << "\n"); |
1594 | 2.63k | DenseMap<ValuePair, std::vector<ValuePair> >::iterator QQ = |
1595 | 2.63k | PairableInstUserMap.find(QTop); |
1596 | 2.63k | if (QQ == PairableInstUserMap.end()) |
1597 | 1.91k | continue; |
1598 | 2.63k | |
1599 | 718 | for (std::vector<ValuePair>::iterator C = QQ->second.begin(), |
1600 | 1.88k | CE = QQ->second.end(); C != CE1.88k ; ++C1.16k ) {1.16k |
1601 | 1.16k | if (*C == P1.16k ) {0 |
1602 | 0 | DEBUG(dbgs() |
1603 | 0 | << "BBV: rejected to prevent non-trivial cycle formation: " |
1604 | 0 | << QTop.first << " <-> " << C->second << "\n"); |
1605 | 0 | return true; |
1606 | 0 | } |
1607 | 1.16k | |
1608 | 1.16k | if (1.16k CurrentPairs.count(*C) && 1.16k !Visited.count(*C)884 ) |
1609 | 645 | Q.push_back(*C); |
1610 | 1.16k | } |
1611 | 2.63k | } while (!Q.empty()); |
1612 | 1.98k | |
1613 | 1.98k | return false; |
1614 | 1.98k | } |
1615 | | |
1616 | | // This function builds the initial dag of connected pairs with the |
1617 | | // pair J at the root. |
1618 | | void BBVectorize::buildInitialDAGFor( |
1619 | | DenseMap<Value *, std::vector<Value *> > &CandidatePairs, |
1620 | | DenseSet<ValuePair> &CandidatePairsSet, |
1621 | | std::vector<Value *> &PairableInsts, |
1622 | | DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairs, |
1623 | | DenseSet<ValuePair> &PairableInstUsers, |
1624 | | DenseMap<Value *, Value *> &ChosenPairs, |
1625 | 3.20k | DenseMap<ValuePair, size_t> &DAG, ValuePair J) { |
1626 | 3.20k | // Each of these pairs is viewed as the root node of a DAG. The DAG |
1627 | 3.20k | // is then walked (depth-first). As this happens, we keep track of |
1628 | 3.20k | // the pairs that compose the DAG and the maximum depth of the DAG. |
1629 | 3.20k | SmallVector<ValuePairWithDepth, 32> Q; |
1630 | 3.20k | // General depth-first post-order traversal: |
1631 | 3.20k | Q.push_back(ValuePairWithDepth(J, getDepthFactor(J.first))); |
1632 | 11.1k | do { |
1633 | 11.1k | ValuePairWithDepth QTop = Q.back(); |
1634 | 11.1k | |
1635 | 11.1k | // Push each child onto the queue: |
1636 | 11.1k | bool MoreChildren = false; |
1637 | 11.1k | size_t MaxChildDepth = QTop.second; |
1638 | 11.1k | DenseMap<ValuePair, std::vector<ValuePair> >::iterator QQ = |
1639 | 11.1k | ConnectedPairs.find(QTop.first); |
1640 | 11.1k | if (QQ != ConnectedPairs.end()) |
1641 | 7.41k | for (std::vector<ValuePair>::iterator k = QQ->second.begin(), |
1642 | 18.3k | ke = QQ->second.end(); k != ke18.3k ; ++k10.9k ) {10.9k |
1643 | 10.9k | // Make sure that this child pair is still a candidate: |
1644 | 10.9k | if (CandidatePairsSet.count(*k)10.9k ) {10.3k |
1645 | 10.3k | DenseMap<ValuePair, size_t>::iterator C = DAG.find(*k); |
1646 | 10.3k | if (C == DAG.end()10.3k ) {4.59k |
1647 | 4.59k | size_t d = getDepthFactor(k->first); |
1648 | 4.59k | Q.push_back(ValuePairWithDepth(*k, QTop.second+d)); |
1649 | 4.59k | MoreChildren = true; |
1650 | 5.78k | } else { |
1651 | 5.78k | MaxChildDepth = std::max(MaxChildDepth, C->second); |
1652 | 5.78k | } |
1653 | 10.3k | } |
1654 | 10.9k | } |
1655 | 11.1k | |
1656 | 11.1k | if (!MoreChildren11.1k ) {7.80k |
1657 | 7.80k | // Record the current pair as part of the DAG: |
1658 | 7.80k | DAG.insert(ValuePairWithDepth(QTop.first, MaxChildDepth)); |
1659 | 7.80k | Q.pop_back(); |
1660 | 7.80k | } |
1661 | 11.1k | } while (!Q.empty()); |
1662 | 3.20k | } |
1663 | | |
1664 | | // Given some initial dag, prune it by removing conflicting pairs (pairs |
1665 | | // that cannot be simultaneously chosen for vectorization). |
1666 | | void BBVectorize::pruneDAGFor( |
1667 | | DenseMap<Value *, std::vector<Value *> > &CandidatePairs, |
1668 | | std::vector<Value *> &PairableInsts, |
1669 | | DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairs, |
1670 | | DenseSet<ValuePair> &PairableInstUsers, |
1671 | | DenseMap<ValuePair, std::vector<ValuePair> > &PairableInstUserMap, |
1672 | | DenseSet<VPPair> &PairableInstUserPairSet, |
1673 | | DenseMap<Value *, Value *> &ChosenPairs, |
1674 | | DenseMap<ValuePair, size_t> &DAG, |
1675 | | DenseSet<ValuePair> &PrunedDAG, ValuePair J, |
1676 | 3.20k | bool UseCycleCheck) { |
1677 | 3.20k | SmallVector<ValuePairWithDepth, 32> Q; |
1678 | 3.20k | // General depth-first post-order traversal: |
1679 | 3.20k | Q.push_back(ValuePairWithDepth(J, getDepthFactor(J.first))); |
1680 | 6.43k | do { |
1681 | 6.43k | ValuePairWithDepth QTop = Q.pop_back_val(); |
1682 | 6.43k | PrunedDAG.insert(QTop.first); |
1683 | 6.43k | |
1684 | 6.43k | // Visit each child, pruning as necessary... |
1685 | 6.43k | SmallVector<ValuePairWithDepth, 8> BestChildren; |
1686 | 6.43k | DenseMap<ValuePair, std::vector<ValuePair> >::iterator QQ = |
1687 | 6.43k | ConnectedPairs.find(QTop.first); |
1688 | 6.43k | if (QQ == ConnectedPairs.end()) |
1689 | 3.12k | continue; |
1690 | 6.43k | |
1691 | 3.30k | for (std::vector<ValuePair>::iterator K = QQ->second.begin(), |
1692 | 7.85k | KE = QQ->second.end(); K != KE7.85k ; ++K4.54k ) {4.54k |
1693 | 4.54k | DenseMap<ValuePair, size_t>::iterator C = DAG.find(*K); |
1694 | 4.54k | if (C == DAG.end()4.54k ) continue308 ; |
1695 | 4.54k | |
1696 | 4.54k | // This child is in the DAG, now we need to make sure it is the |
1697 | 4.54k | // best of any conflicting children. There could be multiple |
1698 | 4.54k | // conflicting children, so first, determine if we're keeping |
1699 | 4.54k | // this child, then delete conflicting children as necessary. |
1700 | 4.54k | |
1701 | 4.54k | // It is also necessary to guard against pairing-induced |
1702 | 4.54k | // dependencies. Consider instructions a .. x .. y .. b |
1703 | 4.54k | // such that (a,b) are to be fused and (x,y) are to be fused |
1704 | 4.54k | // but a is an input to x and b is an output from y. This |
1705 | 4.54k | // means that y cannot be moved after b but x must be moved |
1706 | 4.54k | // after b for (a,b) to be fused. In other words, after |
1707 | 4.54k | // fusing (a,b) we have y .. a/b .. x where y is an input |
1708 | 4.54k | // to a/b and x is an output to a/b: x and y can no longer |
1709 | 4.54k | // be legally fused. To prevent this condition, we must |
1710 | 4.54k | // make sure that a child pair added to the DAG is not |
1711 | 4.54k | // both an input and output of an already-selected pair. |
1712 | 4.54k | |
1713 | 4.54k | // Pairing-induced dependencies can also form from more complicated |
1714 | 4.54k | // cycles. The pair vs. pair conflicts are easy to check, and so |
1715 | 4.54k | // that is done explicitly for "fast rejection", and because for |
1716 | 4.54k | // child vs. child conflicts, we may prefer to keep the current |
1717 | 4.54k | // pair in preference to the already-selected child. |
1718 | 4.23k | DenseSet<ValuePair> CurrentPairs; |
1719 | 4.23k | |
1720 | 4.23k | bool CanAdd = true; |
1721 | 4.23k | for (SmallVectorImpl<ValuePairWithDepth>::iterator C2 |
1722 | 4.23k | = BestChildren.begin(), E2 = BestChildren.end(); |
1723 | 5.14k | C2 != E25.14k ; ++C2911 ) {1.71k |
1724 | 1.71k | if (C2->first.first == C->first.first || |
1725 | 1.44k | C2->first.first == C->first.second || |
1726 | 1.22k | C2->first.second == C->first.first || |
1727 | 1.20k | C2->first.second == C->first.second || |
1728 | 863 | pairsConflict(C2->first, C->first, PairableInstUsers, |
1729 | 744 | UseCycleCheck ? &PairableInstUserMap119 : nullptr744 , |
1730 | 119 | UseCycleCheck ? &PairableInstUserPairSet |
1731 | 855 | : nullptr744 )) {855 |
1732 | 855 | if (C2->second >= C->second855 ) {807 |
1733 | 807 | CanAdd = false; |
1734 | 807 | break; |
1735 | 807 | } |
1736 | 855 | |
1737 | 48 | CurrentPairs.insert(C2->first); |
1738 | 48 | } |
1739 | 1.71k | } |
1740 | 4.23k | if (!CanAdd4.23k ) continue807 ; |
1741 | 4.23k | |
1742 | 4.23k | // Even worse, this child could conflict with another node already |
1743 | 4.23k | // selected for the DAG. If that is the case, ignore this child. |
1744 | 3.42k | for (DenseSet<ValuePair>::iterator T = PrunedDAG.begin(), |
1745 | 11.1k | E2 = PrunedDAG.end(); T != E211.1k ; ++T7.70k ) {7.83k |
1746 | 7.83k | if (T->first == C->first.first || |
1747 | 7.72k | T->first == C->first.second || |
1748 | 7.72k | T->second == C->first.first || |
1749 | 7.71k | T->second == C->first.second || |
1750 | 7.70k | pairsConflict(*T, C->first, PairableInstUsers, |
1751 | 6.43k | UseCycleCheck ? &PairableInstUserMap1.26k : nullptr6.43k , |
1752 | 1.26k | UseCycleCheck ? &PairableInstUserPairSet |
1753 | 6.43k | : nullptr6.43k )) {132 |
1754 | 132 | CanAdd = false; |
1755 | 132 | break; |
1756 | 132 | } |
1757 | 7.83k | |
1758 | 7.70k | CurrentPairs.insert(*T); |
1759 | 7.70k | } |
1760 | 3.42k | if (!CanAdd3.42k ) continue132 ; |
1761 | 3.42k | |
1762 | 3.42k | // And check the queue too... |
1763 | 3.29k | for (SmallVectorImpl<ValuePairWithDepth>::iterator C2 = Q.begin(), |
1764 | 3.83k | E2 = Q.end(); C2 != E23.83k ; ++C2535 ) {557 |
1765 | 557 | if (C2->first.first == C->first.first || |
1766 | 535 | C2->first.first == C->first.second || |
1767 | 535 | C2->first.second == C->first.first || |
1768 | 535 | C2->first.second == C->first.second || |
1769 | 535 | pairsConflict(C2->first, C->first, PairableInstUsers, |
1770 | 490 | UseCycleCheck ? &PairableInstUserMap45 : nullptr490 , |
1771 | 45 | UseCycleCheck ? &PairableInstUserPairSet |
1772 | 490 | : nullptr490 )) {22 |
1773 | 22 | CanAdd = false; |
1774 | 22 | break; |
1775 | 22 | } |
1776 | 557 | |
1777 | 535 | CurrentPairs.insert(C2->first); |
1778 | 535 | } |
1779 | 3.29k | if (!CanAdd3.29k ) continue22 ; |
1780 | 3.29k | |
1781 | 3.29k | // Last but not least, check for a conflict with any of the |
1782 | 3.29k | // already-chosen pairs. |
1783 | 3.27k | for (DenseMap<Value *, Value *>::iterator C2 = |
1784 | 3.27k | ChosenPairs.begin(), E2 = ChosenPairs.end(); |
1785 | 16.0k | C2 != E216.0k ; ++C212.7k ) {12.7k |
1786 | 12.7k | if (pairsConflict(*C2, C->first, PairableInstUsers, |
1787 | 11.0k | UseCycleCheck ? &PairableInstUserMap1.72k : nullptr11.0k , |
1788 | 1.72k | UseCycleCheck ? &PairableInstUserPairSet |
1789 | 11.0k | : nullptr11.0k )) {0 |
1790 | 0 | CanAdd = false; |
1791 | 0 | break; |
1792 | 0 | } |
1793 | 12.7k | |
1794 | 12.7k | CurrentPairs.insert(*C2); |
1795 | 12.7k | } |
1796 | 3.27k | if (!CanAdd3.27k ) continue0 ; |
1797 | 3.27k | |
1798 | 3.27k | // To check for non-trivial cycles formed by the addition of the |
1799 | 3.27k | // current pair we've formed a list of all relevant pairs, now use a |
1800 | 3.27k | // graph walk to check for a cycle. We start from the current pair and |
1801 | 3.27k | // walk the use dag to see if we again reach the current pair. If we |
1802 | 3.27k | // do, then the current pair is rejected. |
1803 | 3.27k | |
1804 | 3.27k | // FIXME: It may be more efficient to use a topological-ordering |
1805 | 3.27k | // algorithm to improve the cycle check. This should be investigated. |
1806 | 3.27k | if (3.27k UseCycleCheck &&3.27k |
1807 | 716 | pairWillFormCycle(C->first, PairableInstUserMap, CurrentPairs)) |
1808 | 0 | continue; |
1809 | 3.27k | |
1810 | 3.27k | // This child can be added, but we may have chosen it in preference |
1811 | 3.27k | // to an already-selected child. Check for this here, and if a |
1812 | 3.27k | // conflict is found, then remove the previously-selected child |
1813 | 3.27k | // before adding this one in its place. |
1814 | 3.27k | for (SmallVectorImpl<ValuePairWithDepth>::iterator C2 |
1815 | 3.58k | = BestChildren.begin(); C2 != BestChildren.end()3.58k ;) {314 |
1816 | 314 | if (C2->first.first == C->first.first || |
1817 | 292 | C2->first.first == C->first.second || |
1818 | 284 | C2->first.second == C->first.first || |
1819 | 284 | C2->first.second == C->first.second || |
1820 | 271 | pairsConflict(C2->first, C->first, PairableInstUsers)) |
1821 | 43 | C2 = BestChildren.erase(C2); |
1822 | 314 | else |
1823 | 271 | ++C2; |
1824 | 314 | } |
1825 | 3.27k | |
1826 | 3.27k | BestChildren.push_back(ValuePairWithDepth(C->first, C->second)); |
1827 | 3.27k | } |
1828 | 3.30k | |
1829 | 3.30k | for (SmallVectorImpl<ValuePairWithDepth>::iterator C |
1830 | 3.30k | = BestChildren.begin(), E2 = BestChildren.end(); |
1831 | 6.53k | C != E26.53k ; ++C3.23k ) {3.23k |
1832 | 3.23k | size_t DepthF = getDepthFactor(C->first.first); |
1833 | 3.23k | Q.push_back(ValuePairWithDepth(C->first, QTop.second+DepthF)); |
1834 | 3.23k | } |
1835 | 6.43k | } while (!Q.empty()); |
1836 | 3.20k | } |
1837 | | |
1838 | | // This function finds the best dag of mututally-compatible connected |
1839 | | // pairs, given the choice of root pairs as an iterator range. |
1840 | | void BBVectorize::findBestDAGFor( |
1841 | | DenseMap<Value *, std::vector<Value *> > &CandidatePairs, |
1842 | | DenseSet<ValuePair> &CandidatePairsSet, |
1843 | | DenseMap<ValuePair, int> &CandidatePairCostSavings, |
1844 | | std::vector<Value *> &PairableInsts, |
1845 | | DenseSet<ValuePair> &FixedOrderPairs, |
1846 | | DenseMap<VPPair, unsigned> &PairConnectionTypes, |
1847 | | DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairs, |
1848 | | DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairDeps, |
1849 | | DenseSet<ValuePair> &PairableInstUsers, |
1850 | | DenseMap<ValuePair, std::vector<ValuePair> > &PairableInstUserMap, |
1851 | | DenseSet<VPPair> &PairableInstUserPairSet, |
1852 | | DenseMap<Value *, Value *> &ChosenPairs, |
1853 | | DenseSet<ValuePair> &BestDAG, size_t &BestMaxDepth, |
1854 | | int &BestEffSize, Value *II, std::vector<Value *>&JJ, |
1855 | 1.22k | bool UseCycleCheck) { |
1856 | 1.22k | for (std::vector<Value *>::iterator J = JJ.begin(), JE = JJ.end(); |
1857 | 9.07k | J != JE9.07k ; ++J7.85k ) {7.85k |
1858 | 7.85k | ValuePair IJ(II, *J); |
1859 | 7.85k | if (!CandidatePairsSet.count(IJ)) |
1860 | 4.64k | continue; |
1861 | 7.85k | |
1862 | 7.85k | // Before going any further, make sure that this pair does not |
1863 | 7.85k | // conflict with any already-selected pairs (see comment below |
1864 | 7.85k | // near the DAG pruning for more details). |
1865 | 3.21k | DenseSet<ValuePair> ChosenPairSet; |
1866 | 3.21k | bool DoesConflict = false; |
1867 | 3.21k | for (DenseMap<Value *, Value *>::iterator C = ChosenPairs.begin(), |
1868 | 11.7k | E = ChosenPairs.end(); C != E11.7k ; ++C8.56k ) {8.57k |
1869 | 8.57k | if (pairsConflict(*C, IJ, PairableInstUsers, |
1870 | 6.73k | UseCycleCheck ? &PairableInstUserMap1.84k : nullptr6.73k , |
1871 | 6.73k | UseCycleCheck ? &PairableInstUserPairSet1.84k : nullptr6.73k )) {10 |
1872 | 10 | DoesConflict = true; |
1873 | 10 | break; |
1874 | 10 | } |
1875 | 8.57k | |
1876 | 8.56k | ChosenPairSet.insert(*C); |
1877 | 8.56k | } |
1878 | 3.21k | if (DoesConflict3.21k ) continue10 ; |
1879 | 3.21k | |
1880 | 3.20k | if (3.20k UseCycleCheck &&3.20k |
1881 | 1.27k | pairWillFormCycle(IJ, PairableInstUserMap, ChosenPairSet)) |
1882 | 0 | continue; |
1883 | 3.20k | |
1884 | 3.20k | DenseMap<ValuePair, size_t> DAG; |
1885 | 3.20k | buildInitialDAGFor(CandidatePairs, CandidatePairsSet, |
1886 | 3.20k | PairableInsts, ConnectedPairs, |
1887 | 3.20k | PairableInstUsers, ChosenPairs, DAG, IJ); |
1888 | 3.20k | |
1889 | 3.20k | // Because we'll keep the child with the largest depth, the largest |
1890 | 3.20k | // depth is still the same in the unpruned DAG. |
1891 | 3.20k | size_t MaxDepth = DAG.lookup(IJ); |
1892 | 3.20k | |
1893 | 3.20k | DEBUG(if (DebugPairSelection) dbgs() << "BBV: found DAG for pair {" |
1894 | 3.20k | << *IJ.first << " <-> " << *IJ.second << "} of depth " << |
1895 | 3.20k | MaxDepth << " and size " << DAG.size() << "\n"); |
1896 | 3.20k | |
1897 | 3.20k | // At this point the DAG has been constructed, but, may contain |
1898 | 3.20k | // contradictory children (meaning that different children of |
1899 | 3.20k | // some dag node may be attempting to fuse the same instruction). |
1900 | 3.20k | // So now we walk the dag again, in the case of a conflict, |
1901 | 3.20k | // keep only the child with the largest depth. To break a tie, |
1902 | 3.20k | // favor the first child. |
1903 | 3.20k | |
1904 | 3.20k | DenseSet<ValuePair> PrunedDAG; |
1905 | 3.20k | pruneDAGFor(CandidatePairs, PairableInsts, ConnectedPairs, |
1906 | 3.20k | PairableInstUsers, PairableInstUserMap, |
1907 | 3.20k | PairableInstUserPairSet, |
1908 | 3.20k | ChosenPairs, DAG, PrunedDAG, IJ, UseCycleCheck); |
1909 | 3.20k | |
1910 | 3.20k | int EffSize = 0; |
1911 | 3.20k | if (TTI3.20k ) {2.67k |
1912 | 2.67k | DenseSet<Value *> PrunedDAGInstrs; |
1913 | 2.67k | for (DenseSet<ValuePair>::iterator S = PrunedDAG.begin(), |
1914 | 8.19k | E = PrunedDAG.end(); S != E8.19k ; ++S5.51k ) {5.51k |
1915 | 5.51k | PrunedDAGInstrs.insert(S->first); |
1916 | 5.51k | PrunedDAGInstrs.insert(S->second); |
1917 | 5.51k | } |
1918 | 2.67k | |
1919 | 2.67k | // The set of pairs that have already contributed to the total cost. |
1920 | 2.67k | DenseSet<ValuePair> IncomingPairs; |
1921 | 2.67k | |
1922 | 2.67k | // If the cost model were perfect, this might not be necessary; but we |
1923 | 2.67k | // need to make sure that we don't get stuck vectorizing our own |
1924 | 2.67k | // shuffle chains. |
1925 | 2.67k | bool HasNontrivialInsts = false; |
1926 | 2.67k | |
1927 | 2.67k | // The node weights represent the cost savings associated with |
1928 | 2.67k | // fusing the pair of instructions. |
1929 | 2.67k | for (DenseSet<ValuePair>::iterator S = PrunedDAG.begin(), |
1930 | 8.19k | E = PrunedDAG.end(); S != E8.19k ; ++S5.51k ) {5.51k |
1931 | 5.51k | if (!isa<ShuffleVectorInst>(S->first) && |
1932 | 4.26k | !isa<InsertElementInst>(S->first) && |
1933 | 2.04k | !isa<ExtractElementInst>(S->first)) |
1934 | 2.00k | HasNontrivialInsts = true; |
1935 | 5.51k | |
1936 | 5.51k | bool FlipOrder = false; |
1937 | 5.51k | |
1938 | 5.51k | if (getDepthFactor(S->first)5.51k ) {3.25k |
1939 | 3.25k | int ESContrib = CandidatePairCostSavings.find(*S)->second; |
1940 | 3.25k | DEBUG(if (DebugPairSelection) dbgs() << "\tweight {" |
1941 | 3.25k | << *S->first << " <-> " << *S->second << "} = " << |
1942 | 3.25k | ESContrib << "\n"); |
1943 | 3.25k | EffSize += ESContrib; |
1944 | 3.25k | } |
1945 | 5.51k | |
1946 | 5.51k | // The edge weights contribute in a negative sense: they represent |
1947 | 5.51k | // the cost of shuffles. |
1948 | 5.51k | DenseMap<ValuePair, std::vector<ValuePair> >::iterator SS = |
1949 | 5.51k | ConnectedPairDeps.find(*S); |
1950 | 5.51k | if (SS != ConnectedPairDeps.end()5.51k ) {3.76k |
1951 | 3.76k | unsigned NumDepsDirect = 0, NumDepsSwap = 0; |
1952 | 3.76k | for (std::vector<ValuePair>::iterator T = SS->second.begin(), |
1953 | 12.7k | TE = SS->second.end(); T != TE12.7k ; ++T9.00k ) {9.00k |
1954 | 9.00k | VPPair Q(*S, *T); |
1955 | 9.00k | if (!PrunedDAG.count(Q.second)) |
1956 | 5.98k | continue; |
1957 | 3.02k | DenseMap<VPPair, unsigned>::iterator R = |
1958 | 3.02k | PairConnectionTypes.find(VPPair(Q.second, Q.first)); |
1959 | 3.02k | assert(R != PairConnectionTypes.end() && |
1960 | 3.02k | "Cannot find pair connection type"); |
1961 | 3.02k | if (R->second == PairConnectionDirect) |
1962 | 2.69k | ++NumDepsDirect; |
1963 | 330 | else if (330 R->second == PairConnectionSwap330 ) |
1964 | 126 | ++NumDepsSwap; |
1965 | 3.02k | } |
1966 | 3.76k | |
1967 | 3.76k | // If there are more swaps than direct connections, then |
1968 | 3.76k | // the pair order will be flipped during fusion. So the real |
1969 | 3.76k | // number of swaps is the minimum number. |
1970 | 3.76k | FlipOrder = !FixedOrderPairs.count(*S) && |
1971 | 3.56k | ((NumDepsSwap > NumDepsDirect) || |
1972 | 3.43k | FixedOrderPairs.count(ValuePair(S->second, S->first))); |
1973 | 3.76k | |
1974 | 3.76k | for (std::vector<ValuePair>::iterator T = SS->second.begin(), |
1975 | 12.7k | TE = SS->second.end(); T != TE12.7k ; ++T9.00k ) {9.00k |
1976 | 9.00k | VPPair Q(*S, *T); |
1977 | 9.00k | if (!PrunedDAG.count(Q.second)) |
1978 | 5.98k | continue; |
1979 | 3.02k | DenseMap<VPPair, unsigned>::iterator R = |
1980 | 3.02k | PairConnectionTypes.find(VPPair(Q.second, Q.first)); |
1981 | 3.02k | assert(R != PairConnectionTypes.end() && |
1982 | 3.02k | "Cannot find pair connection type"); |
1983 | 3.02k | Type *Ty1 = Q.second.first->getType(), |
1984 | 3.02k | *Ty2 = Q.second.second->getType(); |
1985 | 3.02k | Type *VTy = getVecTypeForPair(Ty1, Ty2); |
1986 | 3.02k | if ((R->second == PairConnectionDirect && 3.02k FlipOrder2.69k ) || |
1987 | 3.00k | (R->second == PairConnectionSwap && 3.00k !FlipOrder126 ) || |
1988 | 3.00k | R->second == PairConnectionSplat3.00k ) {223 |
1989 | 223 | int ESContrib = (int) getInstrCost(Instruction::ShuffleVector, |
1990 | 223 | VTy, VTy); |
1991 | 223 | |
1992 | 223 | if (VTy->getVectorNumElements() == 2223 ) {40 |
1993 | 40 | if (R->second == PairConnectionSplat) |
1994 | 37 | ESContrib = std::min(ESContrib, (int) TTI->getShuffleCost( |
1995 | 37 | TargetTransformInfo::SK_Broadcast, VTy)); |
1996 | 40 | else |
1997 | 3 | ESContrib = std::min(ESContrib, (int) TTI->getShuffleCost( |
1998 | 3 | TargetTransformInfo::SK_Reverse, VTy)); |
1999 | 40 | } |
2000 | 223 | |
2001 | 223 | DEBUG(if (DebugPairSelection) dbgs() << "\tcost {" << |
2002 | 223 | *Q.second.first << " <-> " << *Q.second.second << |
2003 | 223 | "} -> {" << |
2004 | 223 | *S->first << " <-> " << *S->second << "} = " << |
2005 | 223 | ESContrib << "\n"); |
2006 | 223 | EffSize -= ESContrib; |
2007 | 223 | } |
2008 | 3.02k | } |
2009 | 3.76k | } |
2010 | 5.51k | |
2011 | 5.51k | // Compute the cost of outgoing edges. We assume that edges outgoing |
2012 | 5.51k | // to shuffles, inserts or extracts can be merged, and so contribute |
2013 | 5.51k | // no additional cost. |
2014 | 5.51k | if (!S->first->getType()->isVoidTy()5.51k ) {5.25k |
2015 | 5.25k | Type *Ty1 = S->first->getType(), |
2016 | 5.25k | *Ty2 = S->second->getType(); |
2017 | 5.25k | Type *VTy = getVecTypeForPair(Ty1, Ty2); |
2018 | 5.25k | |
2019 | 5.25k | bool NeedsExtraction = false; |
2020 | 5.59k | for (User *U : S->first->users()) { |
2021 | 5.59k | if (ShuffleVectorInst *SI5.59k = dyn_cast<ShuffleVectorInst>(U)) {2.21k |
2022 | 2.21k | // Shuffle can be folded if it has no other input |
2023 | 2.21k | if (isa<UndefValue>(SI->getOperand(1))) |
2024 | 458 | continue; |
2025 | 2.21k | } |
2026 | 5.14k | if (5.14k isa<ExtractElementInst>(U)5.14k ) |
2027 | 32 | continue; |
2028 | 5.10k | if (5.10k PrunedDAGInstrs.count(U)5.10k ) |
2029 | 2.67k | continue; |
2030 | 2.42k | NeedsExtraction = true; |
2031 | 2.42k | break; |
2032 | 5.10k | } |
2033 | 5.25k | |
2034 | 5.25k | if (NeedsExtraction5.25k ) {2.42k |
2035 | 2.42k | int ESContrib; |
2036 | 2.42k | if (Ty1->isVectorTy()2.42k ) {1.85k |
2037 | 1.85k | ESContrib = (int) getInstrCost(Instruction::ShuffleVector, |
2038 | 1.85k | Ty1, VTy); |
2039 | 1.85k | ESContrib = std::min(ESContrib, (int) TTI->getShuffleCost( |
2040 | 1.85k | TargetTransformInfo::SK_ExtractSubvector, VTy, 0, Ty1)); |
2041 | 1.85k | } else |
2042 | 577 | ESContrib = (int) TTI->getVectorInstrCost( |
2043 | 577 | Instruction::ExtractElement, VTy, 0); |
2044 | 2.42k | |
2045 | 2.42k | DEBUG(if (DebugPairSelection) dbgs() << "\tcost {" << |
2046 | 2.42k | *S->first << "} = " << ESContrib << "\n"); |
2047 | 2.42k | EffSize -= ESContrib; |
2048 | 2.42k | } |
2049 | 5.25k | |
2050 | 5.25k | NeedsExtraction = false; |
2051 | 5.36k | for (User *U : S->second->users()) { |
2052 | 5.36k | if (ShuffleVectorInst *SI5.36k = dyn_cast<ShuffleVectorInst>(U)) {2.02k |
2053 | 2.02k | // Shuffle can be folded if it has no other input |
2054 | 2.02k | if (isa<UndefValue>(SI->getOperand(1))) |
2055 | 150 | continue; |
2056 | 2.02k | } |
2057 | 5.21k | if (5.21k isa<ExtractElementInst>(U)5.21k ) |
2058 | 14 | continue; |
2059 | 5.20k | if (5.20k PrunedDAGInstrs.count(U)5.20k ) |
2060 | 2.63k | continue; |
2061 | 2.56k | NeedsExtraction = true; |
2062 | 2.56k | break; |
2063 | 5.20k | } |
2064 | 5.25k | |
2065 | 5.25k | if (NeedsExtraction5.25k ) {2.56k |
2066 | 2.56k | int ESContrib; |
2067 | 2.56k | if (Ty2->isVectorTy()2.56k ) {1.99k |
2068 | 1.99k | ESContrib = (int) getInstrCost(Instruction::ShuffleVector, |
2069 | 1.99k | Ty2, VTy); |
2070 | 1.99k | ESContrib = std::min(ESContrib, (int) TTI->getShuffleCost( |
2071 | 1.99k | TargetTransformInfo::SK_ExtractSubvector, VTy, |
2072 | 1.99k | Ty1->isVectorTy() ? Ty1->getVectorNumElements()1.99k : 12 , Ty2)); |
2073 | 1.99k | } else |
2074 | 569 | ESContrib = (int) TTI->getVectorInstrCost( |
2075 | 569 | Instruction::ExtractElement, VTy, 1); |
2076 | 2.56k | DEBUG(if (DebugPairSelection) dbgs() << "\tcost {" << |
2077 | 2.56k | *S->second << "} = " << ESContrib << "\n"); |
2078 | 2.56k | EffSize -= ESContrib; |
2079 | 2.56k | } |
2080 | 5.25k | } |
2081 | 5.51k | |
2082 | 5.51k | // Compute the cost of incoming edges. |
2083 | 5.51k | if (!isa<LoadInst>(S->first) && 5.51k !isa<StoreInst>(S->first)5.47k ) {5.21k |
2084 | 5.21k | Instruction *S1 = cast<Instruction>(S->first), |
2085 | 5.21k | *S2 = cast<Instruction>(S->second); |
2086 | 18.5k | for (unsigned o = 0; o < S1->getNumOperands()18.5k ; ++o13.3k ) {13.3k |
2087 | 13.3k | Value *O1 = S1->getOperand(o), *O2 = S2->getOperand(o); |
2088 | 13.3k | |
2089 | 13.3k | // Combining constants into vector constants (or small vector |
2090 | 13.3k | // constants into larger ones are assumed free). |
2091 | 13.3k | if (isa<Constant>(O1) && 13.3k isa<Constant>(O2)6.06k ) |
2092 | 5.07k | continue; |
2093 | 13.3k | |
2094 | 8.22k | if (8.22k FlipOrder8.22k ) |
2095 | 245 | std::swap(O1, O2); |
2096 | 8.22k | |
2097 | 8.22k | ValuePair VP = ValuePair(O1, O2); |
2098 | 8.22k | ValuePair VPR = ValuePair(O2, O1); |
2099 | 8.22k | |
2100 | 8.22k | // Internal edges are not handled here. |
2101 | 8.22k | if (PrunedDAG.count(VP) || 8.22k PrunedDAG.count(VPR)6.05k ) |
2102 | 2.17k | continue; |
2103 | 8.22k | |
2104 | 6.05k | Type *Ty1 = O1->getType(), |
2105 | 6.05k | *Ty2 = O2->getType(); |
2106 | 6.05k | Type *VTy = getVecTypeForPair(Ty1, Ty2); |
2107 | 6.05k | |
2108 | 6.05k | // Combining vector operations of the same type is also assumed |
2109 | 6.05k | // folded with other operations. |
2110 | 6.05k | if (Ty1 == Ty26.05k ) {5.91k |
2111 | 5.91k | // If both are insert elements, then both can be widened. |
2112 | 5.91k | InsertElementInst *IEO1 = dyn_cast<InsertElementInst>(O1), |
2113 | 5.91k | *IEO2 = dyn_cast<InsertElementInst>(O2); |
2114 | 5.91k | if (IEO1 && 5.91k IEO21.44k && isPureIEChain(IEO1)1.02k && isPureIEChain(IEO2)1.00k ) |
2115 | 947 | continue; |
2116 | 5.91k | // If both are extract elements, and both have the same input |
2117 | 5.91k | // type, then they can be replaced with a shuffle |
2118 | 4.96k | ExtractElementInst *EIO1 = dyn_cast<ExtractElementInst>(O1), |
2119 | 4.96k | *EIO2 = dyn_cast<ExtractElementInst>(O2); |
2120 | 4.96k | if (EIO1 && 4.96k EIO2237 && |
2121 | 20 | EIO1->getOperand(0)->getType() == |
2122 | 20 | EIO2->getOperand(0)->getType()) |
2123 | 20 | continue; |
2124 | 4.96k | // If both are a shuffle with equal operand types and only two |
2125 | 4.96k | // unqiue operands, then they can be replaced with a single |
2126 | 4.96k | // shuffle |
2127 | 4.94k | ShuffleVectorInst *SIO1 = dyn_cast<ShuffleVectorInst>(O1), |
2128 | 4.94k | *SIO2 = dyn_cast<ShuffleVectorInst>(O2); |
2129 | 4.94k | if (SIO1 && 4.94k SIO2650 && |
2130 | 650 | SIO1->getOperand(0)->getType() == |
2131 | 610 | SIO2->getOperand(0)->getType()) { |
2132 | 610 | SmallSet<Value *, 4> SIOps; |
2133 | 610 | SIOps.insert(SIO1->getOperand(0)); |
2134 | 610 | SIOps.insert(SIO1->getOperand(1)); |
2135 | 610 | SIOps.insert(SIO2->getOperand(0)); |
2136 | 610 | SIOps.insert(SIO2->getOperand(1)); |
2137 | 610 | if (SIOps.size() <= 2) |
2138 | 154 | continue; |
2139 | 610 | } |
2140 | 4.94k | } |
2141 | 6.05k | |
2142 | 4.93k | int ESContrib; |
2143 | 4.93k | // This pair has already been formed. |
2144 | 4.93k | if (IncomingPairs.count(VP)4.93k ) {71 |
2145 | 71 | continue; |
2146 | 4.86k | } else if (4.86k IncomingPairs.count(VPR)4.86k ) {6 |
2147 | 6 | ESContrib = (int) getInstrCost(Instruction::ShuffleVector, |
2148 | 6 | VTy, VTy); |
2149 | 6 | |
2150 | 6 | if (VTy->getVectorNumElements() == 2) |
2151 | 6 | ESContrib = std::min(ESContrib, (int) TTI->getShuffleCost( |
2152 | 6 | TargetTransformInfo::SK_Reverse, VTy)); |
2153 | 4.85k | } else if (4.85k !Ty1->isVectorTy() && 4.85k !Ty2->isVectorTy()3.10k ) {3.10k |
2154 | 3.10k | ESContrib = (int) TTI->getVectorInstrCost( |
2155 | 3.10k | Instruction::InsertElement, VTy, 0); |
2156 | 3.10k | ESContrib += (int) TTI->getVectorInstrCost( |
2157 | 3.10k | Instruction::InsertElement, VTy, 1); |
2158 | 1.75k | } else if (1.75k !Ty1->isVectorTy()1.75k ) {6 |
2159 | 6 | // O1 needs to be inserted into a vector of size O2, and then |
2160 | 6 | // both need to be shuffled together. |
2161 | 6 | ESContrib = (int) TTI->getVectorInstrCost( |
2162 | 6 | Instruction::InsertElement, Ty2, 0); |
2163 | 6 | ESContrib += (int) getInstrCost(Instruction::ShuffleVector, |
2164 | 6 | VTy, Ty2); |
2165 | 1.75k | } else if (1.75k !Ty2->isVectorTy()1.75k ) {17 |
2166 | 17 | // O2 needs to be inserted into a vector of size O1, and then |
2167 | 17 | // both need to be shuffled together. |
2168 | 17 | ESContrib = (int) TTI->getVectorInstrCost( |
2169 | 17 | Instruction::InsertElement, Ty1, 0); |
2170 | 17 | ESContrib += (int) getInstrCost(Instruction::ShuffleVector, |
2171 | 17 | VTy, Ty1); |
2172 | 1.73k | } else { |
2173 | 1.73k | Type *TyBig = Ty1, *TySmall = Ty2; |
2174 | 1.73k | if (Ty2->getVectorNumElements() > Ty1->getVectorNumElements()) |
2175 | 40 | std::swap(TyBig, TySmall); |
2176 | 1.73k | |
2177 | 1.73k | ESContrib = (int) getInstrCost(Instruction::ShuffleVector, |
2178 | 1.73k | VTy, TyBig); |
2179 | 1.73k | if (TyBig != TySmall) |
2180 | 122 | ESContrib += (int) getInstrCost(Instruction::ShuffleVector, |
2181 | 122 | TyBig, TySmall); |
2182 | 1.73k | } |
2183 | 4.93k | |
2184 | 4.86k | DEBUG4.86k (if (DebugPairSelection) dbgs() << "\tcost {"4.86k |
2185 | 4.86k | << *O1 << " <-> " << *O2 << "} = " << |
2186 | 4.86k | ESContrib << "\n"); |
2187 | 4.86k | EffSize -= ESContrib; |
2188 | 4.86k | IncomingPairs.insert(VP); |
2189 | 4.86k | } |
2190 | 5.21k | } |
2191 | 5.51k | } |
2192 | 2.67k | |
2193 | 2.67k | if (!HasNontrivialInsts2.67k ) {1.52k |
2194 | 1.52k | DEBUG(if (DebugPairSelection) dbgs() << |
2195 | 1.52k | "\tNo non-trivial instructions in DAG;" |
2196 | 1.52k | " override to zero effective size\n"); |
2197 | 1.52k | EffSize = 0; |
2198 | 1.52k | } |
2199 | 526 | } else { |
2200 | 526 | for (DenseSet<ValuePair>::iterator S = PrunedDAG.begin(), |
2201 | 1.44k | E = PrunedDAG.end(); S != E1.44k ; ++S914 ) |
2202 | 914 | EffSize += (int) getDepthFactor(S->first); |
2203 | 526 | } |
2204 | 3.20k | |
2205 | 3.20k | DEBUG(if (DebugPairSelection) |
2206 | 3.20k | dbgs() << "BBV: found pruned DAG for pair {" |
2207 | 3.20k | << *IJ.first << " <-> " << *IJ.second << "} of depth " << |
2208 | 3.20k | MaxDepth << " and size " << PrunedDAG.size() << |
2209 | 3.20k | " (effective size: " << EffSize << ")\n"); |
2210 | 3.20k | if (((TTI && 3.20k !UseChainDepthWithTI2.67k ) || |
2211 | 526 | MaxDepth >= Config.ReqChainDepth) && |
2212 | 2.76k | EffSize > 02.76k && EffSize > BestEffSize291 ) {267 |
2213 | 267 | BestMaxDepth = MaxDepth; |
2214 | 267 | BestEffSize = EffSize; |
2215 | 267 | BestDAG = PrunedDAG; |
2216 | 267 | } |
2217 | 3.20k | } |
2218 | 1.22k | } |
2219 | | |
2220 | | // Given the list of candidate pairs, this function selects those |
2221 | | // that will be fused into vector instructions. |
2222 | | void BBVectorize::choosePairs( |
2223 | | DenseMap<Value *, std::vector<Value *> > &CandidatePairs, |
2224 | | DenseSet<ValuePair> &CandidatePairsSet, |
2225 | | DenseMap<ValuePair, int> &CandidatePairCostSavings, |
2226 | | std::vector<Value *> &PairableInsts, |
2227 | | DenseSet<ValuePair> &FixedOrderPairs, |
2228 | | DenseMap<VPPair, unsigned> &PairConnectionTypes, |
2229 | | DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairs, |
2230 | | DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairDeps, |
2231 | | DenseSet<ValuePair> &PairableInstUsers, |
2232 | 87 | DenseMap<Value *, Value *>& ChosenPairs) { |
2233 | 87 | bool UseCycleCheck = |
2234 | 87 | CandidatePairsSet.size() <= Config.MaxCandPairsForCycleCheck; |
2235 | 87 | |
2236 | 87 | DenseMap<Value *, std::vector<Value *> > CandidatePairs2; |
2237 | 87 | for (DenseSet<ValuePair>::iterator I = CandidatePairsSet.begin(), |
2238 | 7.94k | E = CandidatePairsSet.end(); I != E7.94k ; ++I7.85k ) {7.85k |
2239 | 7.85k | std::vector<Value *> &JJ = CandidatePairs2[I->second]; |
2240 | 7.85k | if (JJ.empty()7.85k ) JJ.reserve(32)1.26k ; |
2241 | 7.85k | JJ.push_back(I->first); |
2242 | 7.85k | } |
2243 | 87 | |
2244 | 87 | DenseMap<ValuePair, std::vector<ValuePair> > PairableInstUserMap; |
2245 | 87 | DenseSet<VPPair> PairableInstUserPairSet; |
2246 | 87 | for (std::vector<Value *>::iterator I = PairableInsts.begin(), |
2247 | 1.30k | E = PairableInsts.end(); I != E1.30k ; ++I1.22k ) {1.22k |
2248 | 1.22k | // The number of possible pairings for this variable: |
2249 | 1.22k | size_t NumChoices = CandidatePairs.lookup(*I).size(); |
2250 | 1.22k | if (!NumChoices1.22k ) continue0 ; |
2251 | 1.22k | |
2252 | 1.22k | std::vector<Value *> &JJ = CandidatePairs[*I]; |
2253 | 1.22k | |
2254 | 1.22k | // The best pair to choose and its dag: |
2255 | 1.22k | size_t BestMaxDepth = 0; |
2256 | 1.22k | int BestEffSize = 0; |
2257 | 1.22k | DenseSet<ValuePair> BestDAG; |
2258 | 1.22k | findBestDAGFor(CandidatePairs, CandidatePairsSet, |
2259 | 1.22k | CandidatePairCostSavings, |
2260 | 1.22k | PairableInsts, FixedOrderPairs, PairConnectionTypes, |
2261 | 1.22k | ConnectedPairs, ConnectedPairDeps, |
2262 | 1.22k | PairableInstUsers, PairableInstUserMap, |
2263 | 1.22k | PairableInstUserPairSet, ChosenPairs, |
2264 | 1.22k | BestDAG, BestMaxDepth, BestEffSize, *I, JJ, |
2265 | 1.22k | UseCycleCheck); |
2266 | 1.22k | |
2267 | 1.22k | if (BestDAG.empty()) |
2268 | 956 | continue; |
2269 | 1.22k | |
2270 | 1.22k | // A dag has been chosen (or not) at this point. If no dag was |
2271 | 1.22k | // chosen, then this instruction, I, cannot be paired (and is no longer |
2272 | 1.22k | // considered). |
2273 | 1.22k | |
2274 | 266 | DEBUG266 (dbgs() << "BBV: selected pairs in the best DAG for: "266 |
2275 | 266 | << *cast<Instruction>(*I) << "\n"); |
2276 | 266 | |
2277 | 266 | for (DenseSet<ValuePair>::iterator S = BestDAG.begin(), |
2278 | 1.18k | SE2 = BestDAG.end(); S != SE21.18k ; ++S922 ) {922 |
2279 | 922 | // Insert the members of this dag into the list of chosen pairs. |
2280 | 922 | ChosenPairs.insert(ValuePair(S->first, S->second)); |
2281 | 922 | DEBUG(dbgs() << "BBV: selected pair: " << *S->first << " <-> " << |
2282 | 922 | *S->second << "\n"); |
2283 | 922 | |
2284 | 922 | // Remove all candidate pairs that have values in the chosen dag. |
2285 | 922 | std::vector<Value *> &KK = CandidatePairs[S->first]; |
2286 | 922 | for (std::vector<Value *>::iterator K = KK.begin(), KE = KK.end(); |
2287 | 12.5k | K != KE12.5k ; ++K11.6k ) {11.6k |
2288 | 11.6k | if (*K == S->second) |
2289 | 922 | continue; |
2290 | 11.6k | |
2291 | 10.6k | CandidatePairsSet.erase(ValuePair(S->first, *K)); |
2292 | 10.6k | } |
2293 | 922 | |
2294 | 922 | std::vector<Value *> &LL = CandidatePairs2[S->second]; |
2295 | 922 | for (std::vector<Value *>::iterator L = LL.begin(), LE = LL.end(); |
2296 | 8.59k | L != LE8.59k ; ++L7.67k ) {7.67k |
2297 | 7.67k | if (*L == S->first) |
2298 | 922 | continue; |
2299 | 7.67k | |
2300 | 6.75k | CandidatePairsSet.erase(ValuePair(*L, S->second)); |
2301 | 6.75k | } |
2302 | 922 | |
2303 | 922 | std::vector<Value *> &MM = CandidatePairs[S->second]; |
2304 | 922 | for (std::vector<Value *>::iterator M = MM.begin(), ME = MM.end(); |
2305 | 9.16k | M != ME9.16k ; ++M8.24k ) {8.24k |
2306 | 8.24k | assert(*M != S->first && "Flipped pair in candidate list?"); |
2307 | 8.24k | CandidatePairsSet.erase(ValuePair(S->second, *M)); |
2308 | 8.24k | } |
2309 | 922 | |
2310 | 922 | std::vector<Value *> &NN = CandidatePairs2[S->first]; |
2311 | 922 | for (std::vector<Value *>::iterator N = NN.begin(), NE = NN.end(); |
2312 | 5.46k | N != NE5.46k ; ++N4.54k ) {4.54k |
2313 | 4.54k | assert(*N != S->second && "Flipped pair in candidate list?"); |
2314 | 4.54k | CandidatePairsSet.erase(ValuePair(*N, S->first)); |
2315 | 4.54k | } |
2316 | 922 | } |
2317 | 266 | } |
2318 | 87 | |
2319 | 87 | DEBUG(dbgs() << "BBV: selected " << ChosenPairs.size() << " pairs.\n"); |
2320 | 87 | } |
2321 | | |
2322 | | std::string getReplacementName(Instruction *I, bool IsInput, unsigned o, |
2323 | 1.27k | unsigned n = 0) { |
2324 | 1.27k | if (!I->hasName()) |
2325 | 294 | return ""; |
2326 | 1.27k | |
2327 | 976 | return (I->getName() + (IsInput ? 976 ".v.i"426 : ".v.r"550 ) + utostr(o) + |
2328 | 600 | (n > 0 ? "." + utostr(n)376 : ""600 )).str(); |
2329 | 1.27k | } |
2330 | | |
2331 | | // Returns the value that is to be used as the pointer input to the vector |
2332 | | // instruction that fuses I with J. |
2333 | | Value *BBVectorize::getReplacementPointerInput(LLVMContext& Context, |
2334 | 104 | Instruction *I, Instruction *J, unsigned o) { |
2335 | 104 | Value *IPtr, *JPtr; |
2336 | 104 | unsigned IAlignment, JAlignment, IAddressSpace, JAddressSpace; |
2337 | 104 | int64_t OffsetInElmts; |
2338 | 104 | |
2339 | 104 | // Note: the analysis might fail here, that is why the pair order has |
2340 | 104 | // been precomputed (OffsetInElmts must be unused here). |
2341 | 104 | (void) getPairPtrInfo(I, J, IPtr, JPtr, IAlignment, JAlignment, |
2342 | 104 | IAddressSpace, JAddressSpace, |
2343 | 104 | OffsetInElmts, false); |
2344 | 104 | |
2345 | 104 | // The pointer value is taken to be the one with the lowest offset. |
2346 | 104 | Value *VPtr = IPtr; |
2347 | 104 | |
2348 | 104 | Type *ArgTypeI = IPtr->getType()->getPointerElementType(); |
2349 | 104 | Type *ArgTypeJ = JPtr->getType()->getPointerElementType(); |
2350 | 104 | Type *VArgType = getVecTypeForPair(ArgTypeI, ArgTypeJ); |
2351 | 104 | Type *VArgPtrType |
2352 | 104 | = PointerType::get(VArgType, |
2353 | 104 | IPtr->getType()->getPointerAddressSpace()); |
2354 | 104 | return new BitCastInst(VPtr, VArgPtrType, getReplacementName(I, true, o), |
2355 | 104 | /* insert before */ I); |
2356 | 104 | } |
2357 | | |
2358 | | void BBVectorize::fillNewShuffleMask(LLVMContext& Context, Instruction *J, |
2359 | | unsigned MaskOffset, unsigned NumInElem, |
2360 | | unsigned NumInElem1, unsigned IdxOffset, |
2361 | 16 | std::vector<Constant*> &Mask) { |
2362 | 16 | unsigned NumElem1 = J->getType()->getVectorNumElements(); |
2363 | 96 | for (unsigned v = 0; v < NumElem196 ; ++v80 ) {80 |
2364 | 80 | int m = cast<ShuffleVectorInst>(J)->getMaskValue(v); |
2365 | 80 | if (m < 080 ) {0 |
2366 | 0 | Mask[v+MaskOffset] = UndefValue::get(Type::getInt32Ty(Context)); |
2367 | 80 | } else { |
2368 | 80 | unsigned mm = m + (int) IdxOffset; |
2369 | 80 | if (m >= (int) NumInElem1) |
2370 | 36 | mm += (int) NumInElem; |
2371 | 80 | |
2372 | 80 | Mask[v+MaskOffset] = |
2373 | 80 | ConstantInt::get(Type::getInt32Ty(Context), mm); |
2374 | 80 | } |
2375 | 80 | } |
2376 | 16 | } |
2377 | | |
2378 | | // Returns the value that is to be used as the vector-shuffle mask to the |
2379 | | // vector instruction that fuses I with J. |
2380 | | Value *BBVectorize::getReplacementShuffleMask(LLVMContext& Context, |
2381 | 8 | Instruction *I, Instruction *J) { |
2382 | 8 | // This is the shuffle mask. We need to append the second |
2383 | 8 | // mask to the first, and the numbers need to be adjusted. |
2384 | 8 | |
2385 | 8 | Type *ArgTypeI = I->getType(); |
2386 | 8 | Type *ArgTypeJ = J->getType(); |
2387 | 8 | Type *VArgType = getVecTypeForPair(ArgTypeI, ArgTypeJ); |
2388 | 8 | |
2389 | 8 | unsigned NumElemI = ArgTypeI->getVectorNumElements(); |
2390 | 8 | |
2391 | 8 | // Get the total number of elements in the fused vector type. |
2392 | 8 | // By definition, this must equal the number of elements in |
2393 | 8 | // the final mask. |
2394 | 8 | unsigned NumElem = VArgType->getVectorNumElements(); |
2395 | 8 | std::vector<Constant*> Mask(NumElem); |
2396 | 8 | |
2397 | 8 | Type *OpTypeI = I->getOperand(0)->getType(); |
2398 | 8 | unsigned NumInElemI = OpTypeI->getVectorNumElements(); |
2399 | 8 | Type *OpTypeJ = J->getOperand(0)->getType(); |
2400 | 8 | unsigned NumInElemJ = OpTypeJ->getVectorNumElements(); |
2401 | 8 | |
2402 | 8 | // The fused vector will be: |
2403 | 8 | // ----------------------------------------------------- |
2404 | 8 | // | NumInElemI | NumInElemJ | NumInElemI | NumInElemJ | |
2405 | 8 | // ----------------------------------------------------- |
2406 | 8 | // from which we'll extract NumElem total elements (where the first NumElemI |
2407 | 8 | // of them come from the mask in I and the remainder come from the mask |
2408 | 8 | // in J. |
2409 | 8 | |
2410 | 8 | // For the mask from the first pair... |
2411 | 8 | fillNewShuffleMask(Context, I, 0, NumInElemJ, NumInElemI, |
2412 | 8 | 0, Mask); |
2413 | 8 | |
2414 | 8 | // For the mask from the second pair... |
2415 | 8 | fillNewShuffleMask(Context, J, NumElemI, NumInElemI, NumInElemJ, |
2416 | 8 | NumInElemI, Mask); |
2417 | 8 | |
2418 | 8 | return ConstantVector::get(Mask); |
2419 | 8 | } |
2420 | | |
2421 | | bool BBVectorize::expandIEChain(LLVMContext& Context, Instruction *I, |
2422 | | Instruction *J, unsigned o, Value *&LOp, |
2423 | | unsigned numElemL, |
2424 | | Type *ArgTypeL, Type *ArgTypeH, |
2425 | 4 | bool IBeforeJ, unsigned IdxOff) { |
2426 | 4 | bool ExpandedIEChain = false; |
2427 | 4 | if (InsertElementInst *LIE4 = dyn_cast<InsertElementInst>(LOp)) {4 |
2428 | 4 | // If we have a pure insertelement chain, then this can be rewritten |
2429 | 4 | // into a chain that directly builds the larger type. |
2430 | 4 | if (isPureIEChain(LIE)4 ) {4 |
2431 | 4 | SmallVector<Value *, 8> VectElemts(numElemL, |
2432 | 4 | UndefValue::get(ArgTypeL->getScalarType())); |
2433 | 4 | InsertElementInst *LIENext = LIE; |
2434 | 8 | do { |
2435 | 8 | unsigned Idx = |
2436 | 8 | cast<ConstantInt>(LIENext->getOperand(2))->getSExtValue(); |
2437 | 8 | VectElemts[Idx] = LIENext->getOperand(1); |
2438 | 8 | } while ((LIENext = |
2439 | 8 | dyn_cast<InsertElementInst>(LIENext->getOperand(0)))); |
2440 | 4 | |
2441 | 4 | LIENext = nullptr; |
2442 | 4 | Value *LIEPrev = UndefValue::get(ArgTypeH); |
2443 | 12 | for (unsigned i = 0; i < numElemL12 ; ++i8 ) {8 |
2444 | 8 | if (isa<UndefValue>(VectElemts[i])8 ) continue0 ; |
2445 | 8 | LIENext = InsertElementInst::Create(LIEPrev, VectElemts[i], |
2446 | 8 | ConstantInt::get(Type::getInt32Ty(Context), |
2447 | 8 | i + IdxOff), |
2448 | 8 | getReplacementName(IBeforeJ ? I8 : J0 , |
2449 | 8 | true, o, i+1)); |
2450 | 8 | LIENext->insertBefore(IBeforeJ ? J8 : I0 ); |
2451 | 8 | LIEPrev = LIENext; |
2452 | 8 | } |
2453 | 4 | |
2454 | 4 | LOp = LIENext ? (Value*) LIENext4 : UndefValue::get(ArgTypeH)0 ; |
2455 | 4 | ExpandedIEChain = true; |
2456 | 4 | } |
2457 | 4 | } |
2458 | 4 | |
2459 | 4 | return ExpandedIEChain; |
2460 | 4 | } |
2461 | | |
2462 | 1.84k | static unsigned getNumScalarElements(Type *Ty) { |
2463 | 1.84k | if (VectorType *VecTy = dyn_cast<VectorType>(Ty)) |
2464 | 147 | return VecTy->getNumElements(); |
2465 | 1.69k | return 1; |
2466 | 1.84k | } |
2467 | | |
2468 | | // Returns the value to be used as the specified operand of the vector |
2469 | | // instruction that fuses I with J. |
2470 | | Value *BBVectorize::getReplacementInput(LLVMContext& Context, Instruction *I, |
2471 | 612 | Instruction *J, unsigned o, bool IBeforeJ) { |
2472 | 612 | Value *CV0 = ConstantInt::get(Type::getInt32Ty(Context), 0); |
2473 | 612 | Value *CV1 = ConstantInt::get(Type::getInt32Ty(Context), 1); |
2474 | 612 | |
2475 | 612 | // Compute the fused vector type for this operand |
2476 | 612 | Type *ArgTypeI = I->getOperand(o)->getType(); |
2477 | 612 | Type *ArgTypeJ = J->getOperand(o)->getType(); |
2478 | 612 | VectorType *VArgType = getVecTypeForPair(ArgTypeI, ArgTypeJ); |
2479 | 612 | |
2480 | 612 | Instruction *L = I, *H = J; |
2481 | 612 | Type *ArgTypeL = ArgTypeI, *ArgTypeH = ArgTypeJ; |
2482 | 612 | |
2483 | 612 | unsigned numElemL = getNumScalarElements(ArgTypeL); |
2484 | 612 | unsigned numElemH = getNumScalarElements(ArgTypeH); |
2485 | 612 | |
2486 | 612 | Value *LOp = L->getOperand(o); |
2487 | 612 | Value *HOp = H->getOperand(o); |
2488 | 612 | unsigned numElem = VArgType->getNumElements(); |
2489 | 612 | |
2490 | 612 | // First, we check if we can reuse the "original" vector outputs (if these |
2491 | 612 | // exist). We might need a shuffle. |
2492 | 612 | ExtractElementInst *LEE = dyn_cast<ExtractElementInst>(LOp); |
2493 | 612 | ExtractElementInst *HEE = dyn_cast<ExtractElementInst>(HOp); |
2494 | 612 | ShuffleVectorInst *LSV = dyn_cast<ShuffleVectorInst>(LOp); |
2495 | 612 | ShuffleVectorInst *HSV = dyn_cast<ShuffleVectorInst>(HOp); |
2496 | 612 | |
2497 | 612 | // FIXME: If we're fusing shuffle instructions, then we can't apply this |
2498 | 612 | // optimization. The input vectors to the shuffle might be a different |
2499 | 612 | // length from the shuffle outputs. Unfortunately, the replacement |
2500 | 612 | // shuffle mask has already been formed, and the mask entries are sensitive |
2501 | 612 | // to the sizes of the inputs. |
2502 | 612 | bool IsSizeChangeShuffle = |
2503 | 612 | isa<ShuffleVectorInst>(L) && |
2504 | 16 | (LOp->getType() != L->getType() || 16 HOp->getType() != H->getType()2 ); |
2505 | 612 | |
2506 | 612 | if ((LEE || 612 LSV282 ) && (HEE || 349 HSV30 ) && !IsSizeChangeShuffle336 ) {334 |
2507 | 334 | // We can have at most two unique vector inputs. |
2508 | 334 | bool CanUseInputs = true; |
2509 | 334 | Value *I1, *I2 = nullptr; |
2510 | 334 | if (LEE334 ) {317 |
2511 | 317 | I1 = LEE->getOperand(0); |
2512 | 17 | } else { |
2513 | 17 | I1 = LSV->getOperand(0); |
2514 | 17 | I2 = LSV->getOperand(1); |
2515 | 17 | if (I2 == I1 || 17 isa<UndefValue>(I2)17 ) |
2516 | 16 | I2 = nullptr; |
2517 | 17 | } |
2518 | 334 | |
2519 | 334 | if (HEE334 ) {319 |
2520 | 319 | Value *I3 = HEE->getOperand(0); |
2521 | 319 | if (!I2 && 319 I3 != I1319 ) |
2522 | 10 | I2 = I3; |
2523 | 309 | else if (309 I3 != I1 && 309 I3 != I20 ) |
2524 | 0 | CanUseInputs = false; |
2525 | 15 | } else { |
2526 | 15 | Value *I3 = HSV->getOperand(0); |
2527 | 15 | if (!I2 && 15 I3 != I114 ) |
2528 | 0 | I2 = I3; |
2529 | 15 | else if (15 I3 != I1 && 15 I3 != I21 ) |
2530 | 1 | CanUseInputs = false; |
2531 | 15 | |
2532 | 15 | if (CanUseInputs15 ) {14 |
2533 | 14 | Value *I4 = HSV->getOperand(1); |
2534 | 14 | if (!isa<UndefValue>(I4)14 ) {0 |
2535 | 0 | if (!I2 && 0 I4 != I10 ) |
2536 | 0 | I2 = I4; |
2537 | 0 | else if (0 I4 != I1 && 0 I4 != I20 ) |
2538 | 0 | CanUseInputs = false; |
2539 | 0 | } |
2540 | 14 | } |
2541 | 15 | } |
2542 | 334 | |
2543 | 334 | if (CanUseInputs334 ) {333 |
2544 | 333 | unsigned LOpElem = |
2545 | 333 | cast<Instruction>(LOp)->getOperand(0)->getType() |
2546 | 333 | ->getVectorNumElements(); |
2547 | 333 | |
2548 | 333 | unsigned HOpElem = |
2549 | 333 | cast<Instruction>(HOp)->getOperand(0)->getType() |
2550 | 333 | ->getVectorNumElements(); |
2551 | 333 | |
2552 | 333 | // We have one or two input vectors. We need to map each index of the |
2553 | 333 | // operands to the index of the original vector. |
2554 | 333 | SmallVector<std::pair<int, int>, 8> II(numElem); |
2555 | 724 | for (unsigned i = 0; i < numElemL724 ; ++i391 ) {391 |
2556 | 391 | int Idx, INum; |
2557 | 391 | if (LEE391 ) {317 |
2558 | 317 | Idx = |
2559 | 317 | cast<ConstantInt>(LEE->getOperand(1))->getSExtValue(); |
2560 | 317 | INum = LEE->getOperand(0) == I1 ? 0317 : 10 ; |
2561 | 74 | } else { |
2562 | 74 | Idx = LSV->getMaskValue(i); |
2563 | 74 | if (Idx < (int) LOpElem74 ) {74 |
2564 | 74 | INum = LSV->getOperand(0) == I1 ? 074 : 10 ; |
2565 | 0 | } else { |
2566 | 0 | Idx -= LOpElem; |
2567 | 0 | INum = LSV->getOperand(1) == I1 ? 00 : 10 ; |
2568 | 0 | } |
2569 | 74 | } |
2570 | 391 | |
2571 | 391 | II[i] = std::pair<int, int>(Idx, INum); |
2572 | 391 | } |
2573 | 722 | for (unsigned i = 0; i < numElemH722 ; ++i389 ) {389 |
2574 | 389 | int Idx, INum; |
2575 | 389 | if (HEE389 ) {319 |
2576 | 319 | Idx = |
2577 | 319 | cast<ConstantInt>(HEE->getOperand(1))->getSExtValue(); |
2578 | 309 | INum = HEE->getOperand(0) == I1 ? 0309 : 110 ; |
2579 | 70 | } else { |
2580 | 70 | Idx = HSV->getMaskValue(i); |
2581 | 70 | if (Idx < (int) HOpElem70 ) {70 |
2582 | 70 | INum = HSV->getOperand(0) == I1 ? 070 : 10 ; |
2583 | 0 | } else { |
2584 | 0 | Idx -= HOpElem; |
2585 | 0 | INum = HSV->getOperand(1) == I1 ? 00 : 10 ; |
2586 | 0 | } |
2587 | 70 | } |
2588 | 389 | |
2589 | 389 | II[i + numElemL] = std::pair<int, int>(Idx, INum); |
2590 | 389 | } |
2591 | 333 | |
2592 | 333 | // We now have an array which tells us from which index of which |
2593 | 333 | // input vector each element of the operand comes. |
2594 | 333 | VectorType *I1T = cast<VectorType>(I1->getType()); |
2595 | 333 | unsigned I1Elem = I1T->getNumElements(); |
2596 | 333 | |
2597 | 333 | if (!I2333 ) {323 |
2598 | 323 | // In this case there is only one underlying vector input. Check for |
2599 | 323 | // the trivial case where we can use the input directly. |
2600 | 323 | if (I1Elem == numElem323 ) {323 |
2601 | 323 | bool ElemInOrder = true; |
2602 | 1.05k | for (unsigned i = 0; i < numElem1.05k ; ++i729 ) {739 |
2603 | 739 | if (II[i].first != (int) i && 739 II[i].first != -110 ) {10 |
2604 | 10 | ElemInOrder = false; |
2605 | 10 | break; |
2606 | 10 | } |
2607 | 739 | } |
2608 | 323 | |
2609 | 323 | if (ElemInOrder) |
2610 | 313 | return I1; |
2611 | 323 | } |
2612 | 323 | |
2613 | 323 | // A shuffle is needed. |
2614 | 10 | std::vector<Constant *> Mask(numElem); |
2615 | 44 | for (unsigned i = 0; i < numElem44 ; ++i34 ) {34 |
2616 | 34 | int Idx = II[i].first; |
2617 | 34 | if (Idx == -1) |
2618 | 0 | Mask[i] = UndefValue::get(Type::getInt32Ty(Context)); |
2619 | 34 | else |
2620 | 34 | Mask[i] = ConstantInt::get(Type::getInt32Ty(Context), Idx); |
2621 | 34 | } |
2622 | 10 | |
2623 | 10 | Instruction *S = |
2624 | 10 | new ShuffleVectorInst(I1, UndefValue::get(I1T), |
2625 | 10 | ConstantVector::get(Mask), |
2626 | 10 | getReplacementName(IBeforeJ ? I10 : J0 , |
2627 | 10 | true, o)); |
2628 | 10 | S->insertBefore(IBeforeJ ? J10 : I0 ); |
2629 | 10 | return S; |
2630 | 323 | } |
2631 | 333 | |
2632 | 10 | VectorType *I2T = cast<VectorType>(I2->getType()); |
2633 | 10 | unsigned I2Elem = I2T->getNumElements(); |
2634 | 10 | |
2635 | 10 | // This input comes from two distinct vectors. The first step is to |
2636 | 10 | // make sure that both vectors are the same length. If not, the |
2637 | 10 | // smaller one will need to grow before they can be shuffled together. |
2638 | 10 | if (I1Elem < I2Elem10 ) {0 |
2639 | 0 | std::vector<Constant *> Mask(I2Elem); |
2640 | 0 | unsigned v = 0; |
2641 | 0 | for (; v < I1Elem0 ; ++v0 ) |
2642 | 0 | Mask[v] = ConstantInt::get(Type::getInt32Ty(Context), v); |
2643 | 0 | for (; v < I2Elem0 ; ++v0 ) |
2644 | 0 | Mask[v] = UndefValue::get(Type::getInt32Ty(Context)); |
2645 | 0 |
|
2646 | 0 | Instruction *NewI1 = |
2647 | 0 | new ShuffleVectorInst(I1, UndefValue::get(I1T), |
2648 | 0 | ConstantVector::get(Mask), |
2649 | 0 | getReplacementName(IBeforeJ ? I0 : J0 , |
2650 | 0 | true, o, 1)); |
2651 | 0 | NewI1->insertBefore(IBeforeJ ? J0 : I0 ); |
2652 | 0 | I1 = NewI1; |
2653 | 0 | I1Elem = I2Elem; |
2654 | 10 | } else if (10 I1Elem > I2Elem10 ) {0 |
2655 | 0 | std::vector<Constant *> Mask(I1Elem); |
2656 | 0 | unsigned v = 0; |
2657 | 0 | for (; v < I2Elem0 ; ++v0 ) |
2658 | 0 | Mask[v] = ConstantInt::get(Type::getInt32Ty(Context), v); |
2659 | 0 | for (; v < I1Elem0 ; ++v0 ) |
2660 | 0 | Mask[v] = UndefValue::get(Type::getInt32Ty(Context)); |
2661 | 0 |
|
2662 | 0 | Instruction *NewI2 = |
2663 | 0 | new ShuffleVectorInst(I2, UndefValue::get(I2T), |
2664 | 0 | ConstantVector::get(Mask), |
2665 | 0 | getReplacementName(IBeforeJ ? I0 : J0 , |
2666 | 0 | true, o, 1)); |
2667 | 0 | NewI2->insertBefore(IBeforeJ ? J0 : I0 ); |
2668 | 0 | I2 = NewI2; |
2669 | 0 | } |
2670 | 10 | |
2671 | 10 | // Now that both I1 and I2 are the same length we can shuffle them |
2672 | 10 | // together (and use the result). |
2673 | 10 | std::vector<Constant *> Mask(numElem); |
2674 | 30 | for (unsigned v = 0; v < numElem30 ; ++v20 ) {20 |
2675 | 20 | if (II[v].first == -120 ) {0 |
2676 | 0 | Mask[v] = UndefValue::get(Type::getInt32Ty(Context)); |
2677 | 20 | } else { |
2678 | 20 | int Idx = II[v].first + II[v].second * I1Elem; |
2679 | 20 | Mask[v] = ConstantInt::get(Type::getInt32Ty(Context), Idx); |
2680 | 20 | } |
2681 | 20 | } |
2682 | 10 | |
2683 | 10 | Instruction *NewOp = |
2684 | 10 | new ShuffleVectorInst(I1, I2, ConstantVector::get(Mask), |
2685 | 10 | getReplacementName(IBeforeJ ? I10 : J0 , true, o)); |
2686 | 10 | NewOp->insertBefore(IBeforeJ ? J10 : I0 ); |
2687 | 10 | return NewOp; |
2688 | 333 | } |
2689 | 334 | } |
2690 | 612 | |
2691 | 279 | Type *ArgType = ArgTypeL; |
2692 | 279 | if (numElemL < numElemH279 ) {0 |
2693 | 0 | if (numElemL == 1 && 0 expandIEChain(Context, I, J, o, HOp, numElemH,0 |
2694 | 0 | ArgTypeL, VArgType, IBeforeJ, 1)) { |
2695 | 0 | // This is another short-circuit case: we're combining a scalar into |
2696 | 0 | // a vector that is formed by an IE chain. We've just expanded the IE |
2697 | 0 | // chain, now insert the scalar and we're done. |
2698 | 0 |
|
2699 | 0 | Instruction *S = InsertElementInst::Create(HOp, LOp, CV0, |
2700 | 0 | getReplacementName(IBeforeJ ? I0 : J0 , true, o)); |
2701 | 0 | S->insertBefore(IBeforeJ ? J0 : I0 ); |
2702 | 0 | return S; |
2703 | 0 | } else if (0 !expandIEChain(Context, I, J, o, LOp, numElemL, ArgTypeL,0 |
2704 | 0 | ArgTypeH, IBeforeJ)) { |
2705 | 0 | // The two vector inputs to the shuffle must be the same length, |
2706 | 0 | // so extend the smaller vector to be the same length as the larger one. |
2707 | 0 | Instruction *NLOp; |
2708 | 0 | if (numElemL > 10 ) {0 |
2709 | 0 |
|
2710 | 0 | std::vector<Constant *> Mask(numElemH); |
2711 | 0 | unsigned v = 0; |
2712 | 0 | for (; v < numElemL0 ; ++v0 ) |
2713 | 0 | Mask[v] = ConstantInt::get(Type::getInt32Ty(Context), v); |
2714 | 0 | for (; v < numElemH0 ; ++v0 ) |
2715 | 0 | Mask[v] = UndefValue::get(Type::getInt32Ty(Context)); |
2716 | 0 |
|
2717 | 0 | NLOp = new ShuffleVectorInst(LOp, UndefValue::get(ArgTypeL), |
2718 | 0 | ConstantVector::get(Mask), |
2719 | 0 | getReplacementName(IBeforeJ ? I0 : J0 , |
2720 | 0 | true, o, 1)); |
2721 | 0 | } else { |
2722 | 0 | NLOp = InsertElementInst::Create(UndefValue::get(ArgTypeH), LOp, CV0, |
2723 | 0 | getReplacementName(IBeforeJ ? I0 : J0 , |
2724 | 0 | true, o, 1)); |
2725 | 0 | } |
2726 | 0 |
|
2727 | 0 | NLOp->insertBefore(IBeforeJ ? J0 : I0 ); |
2728 | 0 | LOp = NLOp; |
2729 | 0 | } |
2730 | 0 |
|
2731 | 0 | ArgType = ArgTypeH; |
2732 | 279 | } else if (279 numElemL > numElemH279 ) {4 |
2733 | 4 | if (numElemH == 1 && 4 expandIEChain(Context, I, J, o, LOp, numElemL,4 |
2734 | 4 | ArgTypeH, VArgType, IBeforeJ)) { |
2735 | 4 | Instruction *S = |
2736 | 4 | InsertElementInst::Create(LOp, HOp, |
2737 | 4 | ConstantInt::get(Type::getInt32Ty(Context), |
2738 | 4 | numElemL), |
2739 | 4 | getReplacementName(IBeforeJ ? I4 : J0 , |
2740 | 4 | true, o)); |
2741 | 4 | S->insertBefore(IBeforeJ ? J4 : I0 ); |
2742 | 4 | return S; |
2743 | 0 | } else if (0 !expandIEChain(Context, I, J, o, HOp, numElemH, ArgTypeH,0 |
2744 | 0 | ArgTypeL, IBeforeJ)) { |
2745 | 0 | Instruction *NHOp; |
2746 | 0 | if (numElemH > 10 ) {0 |
2747 | 0 | std::vector<Constant *> Mask(numElemL); |
2748 | 0 | unsigned v = 0; |
2749 | 0 | for (; v < numElemH0 ; ++v0 ) |
2750 | 0 | Mask[v] = ConstantInt::get(Type::getInt32Ty(Context), v); |
2751 | 0 | for (; v < numElemL0 ; ++v0 ) |
2752 | 0 | Mask[v] = UndefValue::get(Type::getInt32Ty(Context)); |
2753 | 0 |
|
2754 | 0 | NHOp = new ShuffleVectorInst(HOp, UndefValue::get(ArgTypeH), |
2755 | 0 | ConstantVector::get(Mask), |
2756 | 0 | getReplacementName(IBeforeJ ? I0 : J0 , |
2757 | 0 | true, o, 1)); |
2758 | 0 | } else { |
2759 | 0 | NHOp = InsertElementInst::Create(UndefValue::get(ArgTypeL), HOp, CV0, |
2760 | 0 | getReplacementName(IBeforeJ ? I0 : J0 , |
2761 | 0 | true, o, 1)); |
2762 | 0 | } |
2763 | 0 |
|
2764 | 0 | NHOp->insertBefore(IBeforeJ ? J0 : I0 ); |
2765 | 0 | HOp = NHOp; |
2766 | 0 | } |
2767 | 4 | } |
2768 | 279 | |
2769 | 275 | if (275 ArgType->isVectorTy()275 ) {38 |
2770 | 38 | unsigned numElem = VArgType->getVectorNumElements(); |
2771 | 38 | std::vector<Constant*> Mask(numElem); |
2772 | 266 | for (unsigned v = 0; v < numElem266 ; ++v228 ) {228 |
2773 | 228 | unsigned Idx = v; |
2774 | 228 | // If the low vector was expanded, we need to skip the extra |
2775 | 228 | // undefined entries. |
2776 | 228 | if (v >= numElemL && 228 numElemH > numElemL114 ) |
2777 | 0 | Idx += (numElemH - numElemL); |
2778 | 228 | Mask[v] = ConstantInt::get(Type::getInt32Ty(Context), Idx); |
2779 | 228 | } |
2780 | 38 | |
2781 | 38 | Instruction *BV = new ShuffleVectorInst(LOp, HOp, |
2782 | 38 | ConstantVector::get(Mask), |
2783 | 35 | getReplacementName(IBeforeJ ? I35 : J3 , true, o)); |
2784 | 35 | BV->insertBefore(IBeforeJ ? J35 : I3 ); |
2785 | 38 | return BV; |
2786 | 38 | } |
2787 | 275 | |
2788 | 237 | Instruction *BV1 = InsertElementInst::Create( |
2789 | 237 | UndefValue::get(VArgType), LOp, CV0, |
2790 | 224 | getReplacementName(IBeforeJ ? I224 : J13 , |
2791 | 237 | true, o, 1)); |
2792 | 224 | BV1->insertBefore(IBeforeJ ? J224 : I13 ); |
2793 | 237 | Instruction *BV2 = InsertElementInst::Create(BV1, HOp, CV1, |
2794 | 224 | getReplacementName(IBeforeJ ? I224 : J13 , |
2795 | 237 | true, o, 2)); |
2796 | 224 | BV2->insertBefore(IBeforeJ ? J224 : I13 ); |
2797 | 237 | return BV2; |
2798 | 275 | } |
2799 | | |
2800 | | // This function creates an array of values that will be used as the inputs |
2801 | | // to the vector instruction that fuses I with J. |
2802 | | void BBVectorize::getReplacementInputsForPair(LLVMContext& Context, |
2803 | | Instruction *I, Instruction *J, |
2804 | | SmallVectorImpl<Value *> &ReplacedOperands, |
2805 | 381 | bool IBeforeJ) { |
2806 | 381 | unsigned NumOperands = I->getNumOperands(); |
2807 | 381 | |
2808 | 1.12k | for (unsigned p = 0, o = NumOperands-1; p < NumOperands1.12k ; ++p, --o744 ) {744 |
2809 | 744 | // Iterate backward so that we look at the store pointer |
2810 | 744 | // first and know whether or not we need to flip the inputs. |
2811 | 744 | |
2812 | 744 | if (isa<LoadInst>(I) || 744 (o == 1 && 710 isa<StoreInst>(I)340 )) {104 |
2813 | 104 | // This is the pointer for a load/store instruction. |
2814 | 104 | ReplacedOperands[o] = getReplacementPointerInput(Context, I, J, o); |
2815 | 104 | continue; |
2816 | 640 | } else if (640 isa<CallInst>(I)640 ) {44 |
2817 | 44 | Function *F = cast<CallInst>(I)->getCalledFunction(); |
2818 | 44 | Intrinsic::ID IID = F->getIntrinsicID(); |
2819 | 44 | if (o == NumOperands-144 ) {17 |
2820 | 17 | BasicBlock &BB = *I->getParent(); |
2821 | 17 | |
2822 | 17 | Module *M = BB.getParent()->getParent(); |
2823 | 17 | Type *ArgTypeI = I->getType(); |
2824 | 17 | Type *ArgTypeJ = J->getType(); |
2825 | 17 | Type *VArgType = getVecTypeForPair(ArgTypeI, ArgTypeJ); |
2826 | 17 | |
2827 | 17 | ReplacedOperands[o] = Intrinsic::getDeclaration(M, IID, VArgType); |
2828 | 17 | continue; |
2829 | 27 | } else if (27 (IID == Intrinsic::powi || 27 IID == Intrinsic::ctlz25 || |
2830 | 23 | IID == Intrinsic::cttz23 ) && o == 16 ) {3 |
2831 | 3 | // The second argument of powi/ctlz/cttz is a single integer/constant |
2832 | 3 | // and we've already checked that both arguments are equal. |
2833 | 3 | // As a result, we just keep I's second argument. |
2834 | 3 | ReplacedOperands[o] = I->getOperand(o); |
2835 | 3 | continue; |
2836 | 3 | } |
2837 | 596 | } else if (596 isa<ShuffleVectorInst>(I) && 596 o == NumOperands-124 ) {8 |
2838 | 8 | ReplacedOperands[o] = getReplacementShuffleMask(Context, I, J); |
2839 | 8 | continue; |
2840 | 8 | } |
2841 | 744 | |
2842 | 612 | ReplacedOperands[o] = getReplacementInput(Context, I, J, o, IBeforeJ); |
2843 | 612 | } |
2844 | 381 | } |
2845 | | |
2846 | | // This function creates two values that represent the outputs of the |
2847 | | // original I and J instructions. These are generally vector shuffles |
2848 | | // or extracts. In many cases, these will end up being unused and, thus, |
2849 | | // eliminated by later passes. |
2850 | | void BBVectorize::replaceOutputsOfPair(LLVMContext& Context, Instruction *I, |
2851 | | Instruction *J, Instruction *K, |
2852 | | Instruction *&InsertionPt, |
2853 | 381 | Instruction *&K1, Instruction *&K2) { |
2854 | 381 | if (isa<StoreInst>(I)) |
2855 | 70 | return; |
2856 | 381 | |
2857 | 311 | Type *IType = I->getType(); |
2858 | 311 | Type *JType = J->getType(); |
2859 | 311 | |
2860 | 311 | VectorType *VType = getVecTypeForPair(IType, JType); |
2861 | 311 | unsigned numElem = VType->getNumElements(); |
2862 | 311 | |
2863 | 311 | unsigned numElemI = getNumScalarElements(IType); |
2864 | 311 | unsigned numElemJ = getNumScalarElements(JType); |
2865 | 311 | |
2866 | 311 | if (IType->isVectorTy()311 ) {20 |
2867 | 20 | std::vector<Constant *> Mask1(numElemI), Mask2(numElemI); |
2868 | 106 | for (unsigned v = 0; v < numElemI106 ; ++v86 ) {86 |
2869 | 86 | Mask1[v] = ConstantInt::get(Type::getInt32Ty(Context), v); |
2870 | 86 | Mask2[v] = ConstantInt::get(Type::getInt32Ty(Context), numElemJ + v); |
2871 | 86 | } |
2872 | 20 | |
2873 | 20 | K1 = new ShuffleVectorInst(K, UndefValue::get(VType), |
2874 | 20 | ConstantVector::get(Mask1), |
2875 | 20 | getReplacementName(K, false, 1)); |
2876 | 291 | } else { |
2877 | 291 | Value *CV0 = ConstantInt::get(Type::getInt32Ty(Context), 0); |
2878 | 291 | K1 = ExtractElementInst::Create(K, CV0, getReplacementName(K, false, 1)); |
2879 | 291 | } |
2880 | 311 | |
2881 | 311 | if (JType->isVectorTy()311 ) {17 |
2882 | 17 | std::vector<Constant *> Mask1(numElemJ), Mask2(numElemJ); |
2883 | 97 | for (unsigned v = 0; v < numElemJ97 ; ++v80 ) {80 |
2884 | 80 | Mask1[v] = ConstantInt::get(Type::getInt32Ty(Context), v); |
2885 | 80 | Mask2[v] = ConstantInt::get(Type::getInt32Ty(Context), numElemI + v); |
2886 | 80 | } |
2887 | 17 | |
2888 | 17 | K2 = new ShuffleVectorInst(K, UndefValue::get(VType), |
2889 | 17 | ConstantVector::get(Mask2), |
2890 | 17 | getReplacementName(K, false, 2)); |
2891 | 294 | } else { |
2892 | 294 | Value *CV1 = ConstantInt::get(Type::getInt32Ty(Context), numElem - 1); |
2893 | 294 | K2 = ExtractElementInst::Create(K, CV1, getReplacementName(K, false, 2)); |
2894 | 294 | } |
2895 | 311 | |
2896 | 311 | K1->insertAfter(K); |
2897 | 311 | K2->insertAfter(K1); |
2898 | 311 | InsertionPt = K2; |
2899 | 311 | } |
2900 | | |
2901 | | // Move all uses of the function I (including pairing-induced uses) after J. |
2902 | | bool BBVectorize::canMoveUsesOfIAfterJ(BasicBlock &BB, |
2903 | | DenseSet<ValuePair> &LoadMoveSetPairs, |
2904 | 381 | Instruction *I, Instruction *J) { |
2905 | 381 | // Skip to the first instruction past I. |
2906 | 381 | BasicBlock::iterator L = std::next(BasicBlock::iterator(I)); |
2907 | 381 | |
2908 | 381 | DenseSet<Value *> Users; |
2909 | 381 | AliasSetTracker WriteSet(*AA); |
2910 | 381 | if (I->mayWriteToMemory()381 ) WriteSet.add(I)70 ; |
2911 | 381 | |
2912 | 2.36k | for (; cast<Instruction>(L) != J2.36k ; ++L1.98k ) |
2913 | 1.98k | (void)trackUsesOfI(Users, WriteSet, I, &*L, true, &LoadMoveSetPairs); |
2914 | 381 | |
2915 | 381 | assert(cast<Instruction>(L) == J && |
2916 | 381 | "Tracking has not proceeded far enough to check for dependencies"); |
2917 | 381 | // If J is now in the use set of I, then trackUsesOfI will return true |
2918 | 381 | // and we have a dependency cycle (and the fusing operation must abort). |
2919 | 381 | return !trackUsesOfI(Users, WriteSet, I, J, true, &LoadMoveSetPairs); |
2920 | 381 | } |
2921 | | |
2922 | | // Move all uses of the function I (including pairing-induced uses) after J. |
2923 | | void BBVectorize::moveUsesOfIAfterJ(BasicBlock &BB, |
2924 | | DenseSet<ValuePair> &LoadMoveSetPairs, |
2925 | | Instruction *&InsertionPt, |
2926 | 381 | Instruction *I, Instruction *J) { |
2927 | 381 | // Skip to the first instruction past I. |
2928 | 381 | BasicBlock::iterator L = std::next(BasicBlock::iterator(I)); |
2929 | 381 | |
2930 | 381 | DenseSet<Value *> Users; |
2931 | 381 | AliasSetTracker WriteSet(*AA); |
2932 | 381 | if (I->mayWriteToMemory()381 ) WriteSet.add(I)70 ; |
2933 | 381 | |
2934 | 2.91k | for (; cast<Instruction>(L) != J2.91k ;) {2.53k |
2935 | 2.53k | if (trackUsesOfI(Users, WriteSet, I, &*L, true, &LoadMoveSetPairs)2.53k ) {421 |
2936 | 421 | // Move this instruction |
2937 | 421 | Instruction *InstToMove = &*L++; |
2938 | 421 | |
2939 | 421 | DEBUG(dbgs() << "BBV: moving: " << *InstToMove << |
2940 | 421 | " to after " << *InsertionPt << "\n"); |
2941 | 421 | InstToMove->removeFromParent(); |
2942 | 421 | InstToMove->insertAfter(InsertionPt); |
2943 | 421 | InsertionPt = InstToMove; |
2944 | 2.11k | } else { |
2945 | 2.11k | ++L; |
2946 | 2.11k | } |
2947 | 2.53k | } |
2948 | 381 | } |
2949 | | |
2950 | | // Collect all load instruction that are in the move set of a given first |
2951 | | // pair member. These loads depend on the first instruction, I, and so need |
2952 | | // to be moved after J (the second instruction) when the pair is fused. |
2953 | | void BBVectorize::collectPairLoadMoveSet(BasicBlock &BB, |
2954 | | DenseMap<Value *, Value *> &ChosenPairs, |
2955 | | DenseMap<Value *, std::vector<Value *> > &LoadMoveSet, |
2956 | | DenseSet<ValuePair> &LoadMoveSetPairs, |
2957 | 558 | Instruction *I) { |
2958 | 558 | // Skip to the first instruction past I. |
2959 | 558 | BasicBlock::iterator L = std::next(BasicBlock::iterator(I)); |
2960 | 558 | |
2961 | 558 | DenseSet<Value *> Users; |
2962 | 558 | AliasSetTracker WriteSet(*AA); |
2963 | 558 | if (I->mayWriteToMemory()558 ) WriteSet.add(I)108 ; |
2964 | 558 | |
2965 | 558 | // Note: We cannot end the loop when we reach J because J could be moved |
2966 | 558 | // farther down the use chain by another instruction pairing. Also, J |
2967 | 558 | // could be before I if this is an inverted input. |
2968 | 26.6k | for (BasicBlock::iterator E = BB.end(); L != E26.6k ; ++L26.0k ) {26.0k |
2969 | 26.0k | if (trackUsesOfI(Users, WriteSet, I, &*L)26.0k ) {3.04k |
2970 | 3.04k | if (L->mayReadFromMemory()3.04k ) {265 |
2971 | 265 | LoadMoveSet[&*L].push_back(I); |
2972 | 265 | LoadMoveSetPairs.insert(ValuePair(&*L, I)); |
2973 | 265 | } |
2974 | 3.04k | } |
2975 | 26.0k | } |
2976 | 558 | } |
2977 | | |
2978 | | // In cases where both load/stores and the computation of their pointers |
2979 | | // are chosen for vectorization, we can end up in a situation where the |
2980 | | // aliasing analysis starts returning different query results as the |
2981 | | // process of fusing instruction pairs continues. Because the algorithm |
2982 | | // relies on finding the same use dags here as were found earlier, we'll |
2983 | | // need to precompute the necessary aliasing information here and then |
2984 | | // manually update it during the fusion process. |
2985 | | void BBVectorize::collectLoadMoveSet(BasicBlock &BB, |
2986 | | std::vector<Value *> &PairableInsts, |
2987 | | DenseMap<Value *, Value *> &ChosenPairs, |
2988 | | DenseMap<Value *, std::vector<Value *> > &LoadMoveSet, |
2989 | 70 | DenseSet<ValuePair> &LoadMoveSetPairs) { |
2990 | 70 | for (std::vector<Value *>::iterator PI = PairableInsts.begin(), |
2991 | 960 | PIE = PairableInsts.end(); PI != PIE960 ; ++PI890 ) {890 |
2992 | 890 | DenseMap<Value *, Value *>::iterator P = ChosenPairs.find(*PI); |
2993 | 890 | if (P == ChosenPairs.end()890 ) continue332 ; |
2994 | 890 | |
2995 | 558 | Instruction *I = cast<Instruction>(P->first); |
2996 | 558 | collectPairLoadMoveSet(BB, ChosenPairs, LoadMoveSet, |
2997 | 558 | LoadMoveSetPairs, I); |
2998 | 558 | } |
2999 | 70 | } |
3000 | | |
3001 | | // This function fuses the chosen instruction pairs into vector instructions, |
3002 | | // taking care preserve any needed scalar outputs and, then, it reorders the |
3003 | | // remaining instructions as needed (users of the first member of the pair |
3004 | | // need to be moved to after the location of the second member of the pair |
3005 | | // because the vector instruction is inserted in the location of the pair's |
3006 | | // second member). |
3007 | | void BBVectorize::fuseChosenPairs(BasicBlock &BB, |
3008 | | std::vector<Value *> &PairableInsts, |
3009 | | DenseMap<Value *, Value *> &ChosenPairs, |
3010 | | DenseSet<ValuePair> &FixedOrderPairs, |
3011 | | DenseMap<VPPair, unsigned> &PairConnectionTypes, |
3012 | | DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairs, |
3013 | 70 | DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairDeps) { |
3014 | 70 | LLVMContext& Context = BB.getContext(); |
3015 | 70 | |
3016 | 70 | // During the vectorization process, the order of the pairs to be fused |
3017 | 70 | // could be flipped. So we'll add each pair, flipped, into the ChosenPairs |
3018 | 70 | // list. After a pair is fused, the flipped pair is removed from the list. |
3019 | 70 | DenseSet<ValuePair> FlippedPairs; |
3020 | 70 | for (DenseMap<Value *, Value *>::iterator P = ChosenPairs.begin(), |
3021 | 457 | E = ChosenPairs.end(); P != E457 ; ++P387 ) |
3022 | 387 | FlippedPairs.insert(ValuePair(P->second, P->first)); |
3023 | 70 | for (DenseSet<ValuePair>::iterator P = FlippedPairs.begin(), |
3024 | 457 | E = FlippedPairs.end(); P != E457 ; ++P387 ) |
3025 | 387 | ChosenPairs.insert(*P); |
3026 | 70 | |
3027 | 70 | DenseMap<Value *, std::vector<Value *> > LoadMoveSet; |
3028 | 70 | DenseSet<ValuePair> LoadMoveSetPairs; |
3029 | 70 | collectLoadMoveSet(BB, PairableInsts, ChosenPairs, |
3030 | 70 | LoadMoveSet, LoadMoveSetPairs); |
3031 | 70 | |
3032 | 70 | DEBUG(dbgs() << "BBV: initial: \n" << BB << "\n"); |
3033 | 70 | |
3034 | 3.38k | for (BasicBlock::iterator PI = BB.getFirstInsertionPt(); PI != BB.end()3.38k ;) {3.31k |
3035 | 3.31k | DenseMap<Value *, Value *>::iterator P = ChosenPairs.find(&*PI); |
3036 | 3.31k | if (P == ChosenPairs.end()3.31k ) {2.92k |
3037 | 2.92k | ++PI; |
3038 | 2.92k | continue; |
3039 | 2.92k | } |
3040 | 3.31k | |
3041 | 393 | if (393 getDepthFactor(P->first) == 0393 ) {12 |
3042 | 12 | // These instructions are not really fused, but are tracked as though |
3043 | 12 | // they are. Any case in which it would be interesting to fuse them |
3044 | 12 | // will be taken care of by InstCombine. |
3045 | 12 | --NumFusedOps; |
3046 | 12 | ++PI; |
3047 | 12 | continue; |
3048 | 12 | } |
3049 | 393 | |
3050 | 381 | Instruction *I = cast<Instruction>(P->first), |
3051 | 381 | *J = cast<Instruction>(P->second); |
3052 | 381 | |
3053 | 381 | DEBUG(dbgs() << "BBV: fusing: " << *I << |
3054 | 381 | " <-> " << *J << "\n"); |
3055 | 381 | |
3056 | 381 | // Remove the pair and flipped pair from the list. |
3057 | 381 | DenseMap<Value *, Value *>::iterator FP = ChosenPairs.find(P->second); |
3058 | 381 | assert(FP != ChosenPairs.end() && "Flipped pair not found in list"); |
3059 | 381 | ChosenPairs.erase(FP); |
3060 | 381 | ChosenPairs.erase(P); |
3061 | 381 | |
3062 | 381 | if (!canMoveUsesOfIAfterJ(BB, LoadMoveSetPairs, I, J)381 ) {0 |
3063 | 0 | DEBUG(dbgs() << "BBV: fusion of: " << *I << |
3064 | 0 | " <-> " << *J << |
3065 | 0 | " aborted because of non-trivial dependency cycle\n"); |
3066 | 0 | --NumFusedOps; |
3067 | 0 | ++PI; |
3068 | 0 | continue; |
3069 | 0 | } |
3070 | 381 | |
3071 | 381 | // If the pair must have the other order, then flip it. |
3072 | 381 | bool FlipPairOrder = FixedOrderPairs.count(ValuePair(J, I)); |
3073 | 381 | if (!FlipPairOrder && 381 !FixedOrderPairs.count(ValuePair(I, J))372 ) {277 |
3074 | 277 | // This pair does not have a fixed order, and so we might want to |
3075 | 277 | // flip it if that will yield fewer shuffles. We count the number |
3076 | 277 | // of dependencies connected via swaps, and those directly connected, |
3077 | 277 | // and flip the order if the number of swaps is greater. |
3078 | 277 | bool OrigOrder = true; |
3079 | 277 | DenseMap<ValuePair, std::vector<ValuePair> >::iterator IJ = |
3080 | 277 | ConnectedPairDeps.find(ValuePair(I, J)); |
3081 | 277 | if (IJ == ConnectedPairDeps.end()277 ) {57 |
3082 | 57 | IJ = ConnectedPairDeps.find(ValuePair(J, I)); |
3083 | 57 | OrigOrder = false; |
3084 | 57 | } |
3085 | 277 | |
3086 | 277 | if (IJ != ConnectedPairDeps.end()277 ) {221 |
3087 | 221 | unsigned NumDepsDirect = 0, NumDepsSwap = 0; |
3088 | 221 | for (std::vector<ValuePair>::iterator T = IJ->second.begin(), |
3089 | 584 | TE = IJ->second.end(); T != TE584 ; ++T363 ) {363 |
3090 | 363 | VPPair Q(IJ->first, *T); |
3091 | 363 | DenseMap<VPPair, unsigned>::iterator R = |
3092 | 363 | PairConnectionTypes.find(VPPair(Q.second, Q.first)); |
3093 | 363 | assert(R != PairConnectionTypes.end() && |
3094 | 363 | "Cannot find pair connection type"); |
3095 | 363 | if (R->second == PairConnectionDirect) |
3096 | 341 | ++NumDepsDirect; |
3097 | 22 | else if (22 R->second == PairConnectionSwap22 ) |
3098 | 8 | ++NumDepsSwap; |
3099 | 363 | } |
3100 | 221 | |
3101 | 221 | if (!OrigOrder) |
3102 | 1 | std::swap(NumDepsDirect, NumDepsSwap); |
3103 | 221 | |
3104 | 221 | if (NumDepsSwap > NumDepsDirect221 ) {8 |
3105 | 8 | FlipPairOrder = true; |
3106 | 8 | DEBUG(dbgs() << "BBV: reordering pair: " << *I << |
3107 | 8 | " <-> " << *J << "\n"); |
3108 | 8 | } |
3109 | 221 | } |
3110 | 277 | } |
3111 | 381 | |
3112 | 381 | Instruction *L = I, *H = J; |
3113 | 381 | if (FlipPairOrder) |
3114 | 17 | std::swap(H, L); |
3115 | 381 | |
3116 | 381 | // If the pair being fused uses the opposite order from that in the pair |
3117 | 381 | // connection map, then we need to flip the types. |
3118 | 381 | DenseMap<ValuePair, std::vector<ValuePair> >::iterator HL = |
3119 | 381 | ConnectedPairs.find(ValuePair(H, L)); |
3120 | 381 | if (HL != ConnectedPairs.end()) |
3121 | 2 | for (std::vector<ValuePair>::iterator T = HL->second.begin(), |
3122 | 4 | TE = HL->second.end(); T != TE4 ; ++T2 ) {2 |
3123 | 2 | VPPair Q(HL->first, *T); |
3124 | 2 | DenseMap<VPPair, unsigned>::iterator R = PairConnectionTypes.find(Q); |
3125 | 2 | assert(R != PairConnectionTypes.end() && |
3126 | 2 | "Cannot find pair connection type"); |
3127 | 2 | if (R->second == PairConnectionDirect) |
3128 | 1 | R->second = PairConnectionSwap; |
3129 | 1 | else if (1 R->second == PairConnectionSwap1 ) |
3130 | 0 | R->second = PairConnectionDirect; |
3131 | 2 | } |
3132 | 381 | |
3133 | 381 | bool LBeforeH = !FlipPairOrder; |
3134 | 381 | unsigned NumOperands = I->getNumOperands(); |
3135 | 381 | SmallVector<Value *, 3> ReplacedOperands(NumOperands); |
3136 | 381 | getReplacementInputsForPair(Context, L, H, ReplacedOperands, |
3137 | 381 | LBeforeH); |
3138 | 381 | |
3139 | 381 | // Make a copy of the original operation, change its type to the vector |
3140 | 381 | // type and replace its operands with the vector operands. |
3141 | 381 | Instruction *K = L->clone(); |
3142 | 381 | if (L->hasName()) |
3143 | 275 | K->takeName(L); |
3144 | 106 | else if (106 H->hasName()106 ) |
3145 | 0 | K->takeName(H); |
3146 | 381 | |
3147 | 381 | if (auto CS381 = CallSite(K)) {17 |
3148 | 17 | SmallVector<Type *, 3> Tys; |
3149 | 17 | FunctionType *Old = CS.getFunctionType(); |
3150 | 17 | unsigned NumOld = Old->getNumParams(); |
3151 | 17 | assert(NumOld <= ReplacedOperands.size()); |
3152 | 44 | for (unsigned i = 0; i != NumOld44 ; ++i27 ) |
3153 | 27 | Tys.push_back(ReplacedOperands[i]->getType()); |
3154 | 17 | CS.mutateFunctionType( |
3155 | 17 | FunctionType::get(getVecTypeForPair(L->getType(), H->getType()), |
3156 | 17 | Tys, Old->isVarArg())); |
3157 | 364 | } else if (364 !isa<StoreInst>(K)364 ) |
3158 | 294 | K->mutateType(getVecTypeForPair(L->getType(), H->getType())); |
3159 | 381 | |
3160 | 381 | unsigned KnownIDs[] = {LLVMContext::MD_tbaa, LLVMContext::MD_alias_scope, |
3161 | 381 | LLVMContext::MD_noalias, LLVMContext::MD_fpmath, |
3162 | 381 | LLVMContext::MD_invariant_group}; |
3163 | 381 | combineMetadata(K, H, KnownIDs); |
3164 | 381 | K->andIRFlags(H); |
3165 | 381 | |
3166 | 1.12k | for (unsigned o = 0; o < NumOperands1.12k ; ++o744 ) |
3167 | 744 | K->setOperand(o, ReplacedOperands[o]); |
3168 | 381 | |
3169 | 381 | K->insertAfter(J); |
3170 | 381 | |
3171 | 381 | // Instruction insertion point: |
3172 | 381 | Instruction *InsertionPt = K; |
3173 | 381 | Instruction *K1 = nullptr, *K2 = nullptr; |
3174 | 381 | replaceOutputsOfPair(Context, L, H, K, InsertionPt, K1, K2); |
3175 | 381 | |
3176 | 381 | // The use dag of the first original instruction must be moved to after |
3177 | 381 | // the location of the second instruction. The entire use dag of the |
3178 | 381 | // first instruction is disjoint from the input dag of the second |
3179 | 381 | // (by definition), and so commutes with it. |
3180 | 381 | |
3181 | 381 | moveUsesOfIAfterJ(BB, LoadMoveSetPairs, InsertionPt, I, J); |
3182 | 381 | |
3183 | 381 | if (!isa<StoreInst>(I)381 ) {311 |
3184 | 311 | L->replaceAllUsesWith(K1); |
3185 | 311 | H->replaceAllUsesWith(K2); |
3186 | 311 | } |
3187 | 381 | |
3188 | 381 | // Instructions that may read from memory may be in the load move set. |
3189 | 381 | // Once an instruction is fused, we no longer need its move set, and so |
3190 | 381 | // the values of the map never need to be updated. However, when a load |
3191 | 381 | // is fused, we need to merge the entries from both instructions in the |
3192 | 381 | // pair in case those instructions were in the move set of some other |
3193 | 381 | // yet-to-be-fused pair. The loads in question are the keys of the map. |
3194 | 381 | if (I->mayReadFromMemory()381 ) {34 |
3195 | 34 | std::vector<ValuePair> NewSetMembers; |
3196 | 34 | DenseMap<Value *, std::vector<Value *> >::iterator II = |
3197 | 34 | LoadMoveSet.find(I); |
3198 | 34 | if (II != LoadMoveSet.end()) |
3199 | 0 | for (std::vector<Value *>::iterator N = II->second.begin(), |
3200 | 0 | NE = II->second.end(); N != NE0 ; ++N0 ) |
3201 | 0 | NewSetMembers.push_back(ValuePair(K, *N)); |
3202 | 34 | DenseMap<Value *, std::vector<Value *> >::iterator JJ = |
3203 | 34 | LoadMoveSet.find(J); |
3204 | 34 | if (JJ != LoadMoveSet.end()) |
3205 | 0 | for (std::vector<Value *>::iterator N = JJ->second.begin(), |
3206 | 0 | NE = JJ->second.end(); N != NE0 ; ++N0 ) |
3207 | 0 | NewSetMembers.push_back(ValuePair(K, *N)); |
3208 | 34 | for (std::vector<ValuePair>::iterator A = NewSetMembers.begin(), |
3209 | 34 | AE = NewSetMembers.end(); A != AE34 ; ++A0 ) {0 |
3210 | 0 | LoadMoveSet[A->first].push_back(A->second); |
3211 | 0 | LoadMoveSetPairs.insert(*A); |
3212 | 0 | } |
3213 | 34 | } |
3214 | 381 | |
3215 | 381 | // Before removing I, set the iterator to the next instruction. |
3216 | 381 | PI = std::next(BasicBlock::iterator(I)); |
3217 | 381 | if (cast<Instruction>(PI) == J) |
3218 | 89 | ++PI; |
3219 | 381 | |
3220 | 381 | SE->forgetValue(I); |
3221 | 381 | SE->forgetValue(J); |
3222 | 381 | I->eraseFromParent(); |
3223 | 381 | J->eraseFromParent(); |
3224 | 381 | |
3225 | 381 | DEBUG(if (PrintAfterEveryPair) dbgs() << "BBV: block is now: \n" << |
3226 | 381 | BB << "\n"); |
3227 | 381 | } |
3228 | 70 | |
3229 | 70 | DEBUG(dbgs() << "BBV: final: \n" << BB << "\n"); |
3230 | 70 | } |
3231 | | } |
3232 | | |
3233 | | char BBVectorize::ID = 0; |
3234 | | static const char bb_vectorize_name[] = "Basic-Block Vectorization"; |
3235 | 23.2k | INITIALIZE_PASS_BEGIN23.2k (BBVectorize, BBV_NAME, bb_vectorize_name, false, false)23.2k
|
3236 | 23.2k | INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) |
3237 | 23.2k | INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) |
3238 | 23.2k | INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass) |
3239 | 23.2k | INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) |
3240 | 23.2k | INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) |
3241 | 23.2k | INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass) |
3242 | 23.2k | INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass) |
3243 | 23.2k | INITIALIZE_PASS_DEPENDENCY(SCEVAAWrapperPass) |
3244 | 23.2k | INITIALIZE_PASS_END(BBVectorize, BBV_NAME, bb_vectorize_name, false, false) |
3245 | | |
3246 | 0 | BasicBlockPass *llvm::createBBVectorizePass(const VectorizeConfig &C) { |
3247 | 0 | return new BBVectorize(C); |
3248 | 0 | } |
3249 | | |
3250 | | bool |
3251 | 0 | llvm::vectorizeBasicBlock(Pass *P, BasicBlock &BB, const VectorizeConfig &C) { |
3252 | 0 | BBVectorize BBVectorizer(P, *BB.getParent(), C); |
3253 | 0 | return BBVectorizer.vectorizeBB(BB); |
3254 | 0 | } |
3255 | | |
3256 | | //===----------------------------------------------------------------------===// |
3257 | 36 | VectorizeConfig::VectorizeConfig() { |
3258 | 36 | VectorBits = ::VectorBits; |
3259 | 36 | VectorizeBools = !::NoBools; |
3260 | 36 | VectorizeInts = !::NoInts; |
3261 | 36 | VectorizeFloats = !::NoFloats; |
3262 | 36 | VectorizePointers = !::NoPointers; |
3263 | 36 | VectorizeCasts = !::NoCasts; |
3264 | 36 | VectorizeMath = !::NoMath; |
3265 | 36 | VectorizeBitManipulations = !::NoBitManipulation; |
3266 | 36 | VectorizeFMA = !::NoFMA; |
3267 | 36 | VectorizeSelect = !::NoSelect; |
3268 | 36 | VectorizeCmp = !::NoCmp; |
3269 | 36 | VectorizeGEP = !::NoGEP; |
3270 | 36 | VectorizeMemOps = !::NoMemOps; |
3271 | 36 | AlignedOnly = ::AlignedOnly; |
3272 | 36 | ReqChainDepth= ::ReqChainDepth; |
3273 | 36 | SearchLimit = ::SearchLimit; |
3274 | 36 | MaxCandPairsForCycleCheck = ::MaxCandPairsForCycleCheck; |
3275 | 36 | SplatBreaksChain = ::SplatBreaksChain; |
3276 | 36 | MaxInsts = ::MaxInsts; |
3277 | 36 | MaxPairs = ::MaxPairs; |
3278 | 36 | MaxIter = ::MaxIter; |
3279 | 36 | Pow2LenOnly = ::Pow2LenOnly; |
3280 | 36 | NoMemOpBoost = ::NoMemOpBoost; |
3281 | 36 | FastDep = ::FastDep; |
3282 | 36 | } |