/Users/buildslave/jenkins/sharedspace/clang-stage2-coverage-R@2/llvm/lib/Target/Hexagon/HexagonLoopIdiomRecognition.cpp
Line | Count | Source (jump to first uncovered line) |
1 | | //===- HexagonLoopIdiomRecognition.cpp ------------------------------------===// |
2 | | // |
3 | | // The LLVM Compiler Infrastructure |
4 | | // |
5 | | // This file is distributed under the University of Illinois Open Source |
6 | | // License. See LICENSE.TXT for details. |
7 | | // |
8 | | //===----------------------------------------------------------------------===// |
9 | | |
10 | | #define DEBUG_TYPE "hexagon-lir" |
11 | | |
12 | | #include "llvm/ADT/APInt.h" |
13 | | #include "llvm/ADT/DenseMap.h" |
14 | | #include "llvm/ADT/SetVector.h" |
15 | | #include "llvm/ADT/SmallPtrSet.h" |
16 | | #include "llvm/ADT/SmallSet.h" |
17 | | #include "llvm/ADT/SmallVector.h" |
18 | | #include "llvm/ADT/StringRef.h" |
19 | | #include "llvm/ADT/Triple.h" |
20 | | #include "llvm/Analysis/AliasAnalysis.h" |
21 | | #include "llvm/Analysis/InstructionSimplify.h" |
22 | | #include "llvm/Analysis/LoopInfo.h" |
23 | | #include "llvm/Analysis/LoopPass.h" |
24 | | #include "llvm/Analysis/MemoryLocation.h" |
25 | | #include "llvm/Analysis/ScalarEvolution.h" |
26 | | #include "llvm/Analysis/ScalarEvolutionExpander.h" |
27 | | #include "llvm/Analysis/ScalarEvolutionExpressions.h" |
28 | | #include "llvm/Analysis/TargetLibraryInfo.h" |
29 | | #include "llvm/Analysis/ValueTracking.h" |
30 | | #include "llvm/IR/Attributes.h" |
31 | | #include "llvm/IR/BasicBlock.h" |
32 | | #include "llvm/IR/Constant.h" |
33 | | #include "llvm/IR/Constants.h" |
34 | | #include "llvm/IR/DataLayout.h" |
35 | | #include "llvm/IR/DebugLoc.h" |
36 | | #include "llvm/IR/DerivedTypes.h" |
37 | | #include "llvm/IR/Dominators.h" |
38 | | #include "llvm/IR/Function.h" |
39 | | #include "llvm/IR/IRBuilder.h" |
40 | | #include "llvm/IR/InstrTypes.h" |
41 | | #include "llvm/IR/Instruction.h" |
42 | | #include "llvm/IR/Instructions.h" |
43 | | #include "llvm/IR/IntrinsicInst.h" |
44 | | #include "llvm/IR/Intrinsics.h" |
45 | | #include "llvm/IR/Module.h" |
46 | | #include "llvm/IR/PatternMatch.h" |
47 | | #include "llvm/IR/Type.h" |
48 | | #include "llvm/IR/User.h" |
49 | | #include "llvm/IR/Value.h" |
50 | | #include "llvm/Pass.h" |
51 | | #include "llvm/Support/Casting.h" |
52 | | #include "llvm/Support/CommandLine.h" |
53 | | #include "llvm/Support/Compiler.h" |
54 | | #include "llvm/Support/Debug.h" |
55 | | #include "llvm/Support/ErrorHandling.h" |
56 | | #include "llvm/Support/KnownBits.h" |
57 | | #include "llvm/Support/raw_ostream.h" |
58 | | #include "llvm/Transforms/Scalar.h" |
59 | | #include "llvm/Transforms/Utils/Local.h" |
60 | | #include <algorithm> |
61 | | #include <array> |
62 | | #include <cassert> |
63 | | #include <cstdint> |
64 | | #include <cstdlib> |
65 | | #include <deque> |
66 | | #include <functional> |
67 | | #include <iterator> |
68 | | #include <map> |
69 | | #include <set> |
70 | | #include <utility> |
71 | | #include <vector> |
72 | | |
73 | | using namespace llvm; |
74 | | |
75 | | static cl::opt<bool> DisableMemcpyIdiom("disable-memcpy-idiom", |
76 | | cl::Hidden, cl::init(false), |
77 | | cl::desc("Disable generation of memcpy in loop idiom recognition")); |
78 | | |
79 | | static cl::opt<bool> DisableMemmoveIdiom("disable-memmove-idiom", |
80 | | cl::Hidden, cl::init(false), |
81 | | cl::desc("Disable generation of memmove in loop idiom recognition")); |
82 | | |
83 | | static cl::opt<unsigned> RuntimeMemSizeThreshold("runtime-mem-idiom-threshold", |
84 | | cl::Hidden, cl::init(0), cl::desc("Threshold (in bytes) for the runtime " |
85 | | "check guarding the memmove.")); |
86 | | |
87 | | static cl::opt<unsigned> CompileTimeMemSizeThreshold( |
88 | | "compile-time-mem-idiom-threshold", cl::Hidden, cl::init(64), |
89 | | cl::desc("Threshold (in bytes) to perform the transformation, if the " |
90 | | "runtime loop count (mem transfer size) is known at compile-time.")); |
91 | | |
92 | | static cl::opt<bool> OnlyNonNestedMemmove("only-nonnested-memmove-idiom", |
93 | | cl::Hidden, cl::init(true), |
94 | | cl::desc("Only enable generating memmove in non-nested loops")); |
95 | | |
96 | | cl::opt<bool> HexagonVolatileMemcpy("disable-hexagon-volatile-memcpy", |
97 | | cl::Hidden, cl::init(false), |
98 | | cl::desc("Enable Hexagon-specific memcpy for volatile destination.")); |
99 | | |
100 | | static cl::opt<unsigned> SimplifyLimit("hlir-simplify-limit", cl::init(10000), |
101 | | cl::Hidden, cl::desc("Maximum number of simplification steps in HLIR")); |
102 | | |
103 | | static const char *HexagonVolatileMemcpyName |
104 | | = "hexagon_memcpy_forward_vp4cp4n2"; |
105 | | |
106 | | |
107 | | namespace llvm { |
108 | | |
109 | | void initializeHexagonLoopIdiomRecognizePass(PassRegistry&); |
110 | | Pass *createHexagonLoopIdiomPass(); |
111 | | |
112 | | } // end namespace llvm |
113 | | |
114 | | namespace { |
115 | | |
116 | | class HexagonLoopIdiomRecognize : public LoopPass { |
117 | | public: |
118 | | static char ID; |
119 | | |
120 | 14 | explicit HexagonLoopIdiomRecognize() : LoopPass(ID) { |
121 | 14 | initializeHexagonLoopIdiomRecognizePass(*PassRegistry::getPassRegistry()); |
122 | 14 | } |
123 | | |
124 | 0 | StringRef getPassName() const override { |
125 | 0 | return "Recognize Hexagon-specific loop idioms"; |
126 | 0 | } |
127 | | |
128 | 14 | void getAnalysisUsage(AnalysisUsage &AU) const override { |
129 | 14 | AU.addRequired<LoopInfoWrapperPass>(); |
130 | 14 | AU.addRequiredID(LoopSimplifyID); |
131 | 14 | AU.addRequiredID(LCSSAID); |
132 | 14 | AU.addRequired<AAResultsWrapperPass>(); |
133 | 14 | AU.addPreserved<AAResultsWrapperPass>(); |
134 | 14 | AU.addRequired<ScalarEvolutionWrapperPass>(); |
135 | 14 | AU.addRequired<DominatorTreeWrapperPass>(); |
136 | 14 | AU.addRequired<TargetLibraryInfoWrapperPass>(); |
137 | 14 | AU.addPreserved<TargetLibraryInfoWrapperPass>(); |
138 | 14 | } |
139 | | |
140 | | bool runOnLoop(Loop *L, LPPassManager &LPM) override; |
141 | | |
142 | | private: |
143 | | unsigned getStoreSizeInBytes(StoreInst *SI); |
144 | | int getSCEVStride(const SCEVAddRecExpr *StoreEv); |
145 | | bool isLegalStore(Loop *CurLoop, StoreInst *SI); |
146 | | void collectStores(Loop *CurLoop, BasicBlock *BB, |
147 | | SmallVectorImpl<StoreInst*> &Stores); |
148 | | bool processCopyingStore(Loop *CurLoop, StoreInst *SI, const SCEV *BECount); |
149 | | bool coverLoop(Loop *L, SmallVectorImpl<Instruction*> &Insts) const; |
150 | | bool runOnLoopBlock(Loop *CurLoop, BasicBlock *BB, const SCEV *BECount, |
151 | | SmallVectorImpl<BasicBlock*> &ExitBlocks); |
152 | | bool runOnCountableLoop(Loop *L); |
153 | | |
154 | | AliasAnalysis *AA; |
155 | | const DataLayout *DL; |
156 | | DominatorTree *DT; |
157 | | LoopInfo *LF; |
158 | | const TargetLibraryInfo *TLI; |
159 | | ScalarEvolution *SE; |
160 | | bool HasMemcpy, HasMemmove; |
161 | | }; |
162 | | |
163 | | struct Simplifier { |
164 | | using Rule = std::function<Value * (Instruction *, LLVMContext &)>; |
165 | | |
166 | 42 | void addRule(const Rule &R) { Rules.push_back(R); } |
167 | | |
168 | | private: |
169 | | struct WorkListType { |
170 | 5.22k | WorkListType() = default; |
171 | | |
172 | 2.23M | void push_back(Value* V) { |
173 | 2.23M | // Do not push back duplicates. |
174 | 2.23M | if (!S.count(V)2.23M ) { Q.push_back(V); S.insert(V); }1.17M |
175 | 2.23M | } |
176 | | |
177 | 1.11M | Value *pop_front_val() { |
178 | 1.11M | Value *V = Q.front(); Q.pop_front(); S.erase(V); |
179 | 1.11M | return V; |
180 | 1.11M | } |
181 | | |
182 | 1.11M | bool empty() const { return Q.empty(); } |
183 | | |
184 | | private: |
185 | | std::deque<Value*> Q; |
186 | | std::set<Value*> S; |
187 | | }; |
188 | | |
189 | | using ValueSetType = std::set<Value *>; |
190 | | |
191 | | std::vector<Rule> Rules; |
192 | | |
193 | | public: |
194 | | struct Context { |
195 | | using ValueMapType = DenseMap<Value *, Value *>; |
196 | | |
197 | | Value *Root; |
198 | | ValueSetType Used; // The set of all cloned values used by Root. |
199 | | ValueSetType Clones; // The set of all cloned values. |
200 | | LLVMContext &Ctx; |
201 | | |
202 | | Context(Instruction *Exp) |
203 | 19 | : Ctx(Exp->getParent()->getParent()->getContext()) { |
204 | 19 | initialize(Exp); |
205 | 19 | } |
206 | | |
207 | 19 | ~Context() { cleanup(); } |
208 | | |
209 | | void print(raw_ostream &OS, const Value *V) const; |
210 | | Value *materialize(BasicBlock *B, BasicBlock::iterator At); |
211 | | |
212 | | private: |
213 | | friend struct Simplifier; |
214 | | |
215 | | void initialize(Instruction *Exp); |
216 | | void cleanup(); |
217 | | |
218 | | template <typename FuncT> void traverse(Value *V, FuncT F); |
219 | | void record(Value *V); |
220 | | void use(Value *V); |
221 | | void unuse(Value *V); |
222 | | |
223 | | bool equal(const Instruction *I, const Instruction *J) const; |
224 | | Value *find(Value *Tree, Value *Sub) const; |
225 | | Value *subst(Value *Tree, Value *OldV, Value *NewV); |
226 | | void replace(Value *OldV, Value *NewV); |
227 | | void link(Instruction *I, BasicBlock *B, BasicBlock::iterator At); |
228 | | }; |
229 | | |
230 | | Value *simplify(Context &C); |
231 | | }; |
232 | | |
233 | | struct PE { |
234 | 0 | PE(const Simplifier::Context &c, Value *v = nullptr) : C(c), V(v) {} |
235 | | |
236 | | const Simplifier::Context &C; |
237 | | const Value *V; |
238 | | }; |
239 | | |
240 | | raw_ostream &operator<< (raw_ostream &OS, const PE &P) LLVM_ATTRIBUTE_USED; |
241 | 0 | raw_ostream &operator<< (raw_ostream &OS, const PE &P) { |
242 | 0 | P.C.print(OS, P.V ? P.V0 : P.C.Root0 ); |
243 | 0 | return OS; |
244 | 0 | } |
245 | | |
246 | | } // end anonymous namespace |
247 | | |
248 | | char HexagonLoopIdiomRecognize::ID = 0; |
249 | | |
250 | 90.0k | INITIALIZE_PASS_BEGIN90.0k (HexagonLoopIdiomRecognize, "hexagon-loop-idiom",
|
251 | 90.0k | "Recognize Hexagon-specific loop idioms", false, false) |
252 | 90.0k | INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) |
253 | 90.0k | INITIALIZE_PASS_DEPENDENCY(LoopSimplify) |
254 | 90.0k | INITIALIZE_PASS_DEPENDENCY(LCSSAWrapperPass) |
255 | 90.0k | INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass) |
256 | 90.0k | INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) |
257 | 90.0k | INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass) |
258 | 90.0k | INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) |
259 | 90.0k | INITIALIZE_PASS_END(HexagonLoopIdiomRecognize, "hexagon-loop-idiom", |
260 | | "Recognize Hexagon-specific loop idioms", false, false) |
261 | | |
262 | | template <typename FuncT> |
263 | 1.78k | void Simplifier::Context::traverse(Value *V, FuncT F) { |
264 | 1.78k | WorkListType Q; |
265 | 1.78k | Q.push_back(V); |
266 | 1.78k | |
267 | 207k | while (!Q.empty()207k ) { |
268 | 205k | Instruction *U = dyn_cast<Instruction>(Q.pop_front_val()); |
269 | 205k | if (!U || 205k U->getParent()176k ) |
270 | 31.6k | continue; |
271 | 174k | if (174k !F(U)174k ) |
272 | 1.02k | continue; |
273 | 173k | for (Value *Op : U->operands()) |
274 | 390k | Q.push_back(Op); |
275 | 205k | } |
276 | 1.78k | } HexagonLoopIdiomRecognition.cpp:void (anonymous namespace)::Simplifier::Context::traverse<(anonymous namespace)::Simplifier::Context::record(llvm::Value*)::$_0>(llvm::Value*, (anonymous namespace)::Simplifier::Context::record(llvm::Value*)::$_0) Line | Count | Source | 263 | 470 | void Simplifier::Context::traverse(Value *V, FuncT F) { | 264 | 470 | WorkListType Q; | 265 | 470 | Q.push_back(V); | 266 | 470 | | 267 | 16.7k | while (!Q.empty()16.7k ) { | 268 | 16.2k | Instruction *U = dyn_cast<Instruction>(Q.pop_front_val()); | 269 | 16.2k | if (!U || 16.2k U->getParent()11.3k ) | 270 | 5.55k | continue; | 271 | 10.7k | if (10.7k !F(U)10.7k ) | 272 | 0 | continue; | 273 | 10.7k | for (Value *Op : U->operands()) | 274 | 21.0k | Q.push_back(Op); | 275 | 16.2k | } | 276 | 470 | } |
HexagonLoopIdiomRecognition.cpp:void (anonymous namespace)::Simplifier::Context::traverse<(anonymous namespace)::Simplifier::Context::use(llvm::Value*)::$_1>(llvm::Value*, (anonymous namespace)::Simplifier::Context::use(llvm::Value*)::$_1) Line | Count | Source | 263 | 470 | void Simplifier::Context::traverse(Value *V, FuncT F) { | 264 | 470 | WorkListType Q; | 265 | 470 | Q.push_back(V); | 266 | 470 | | 267 | 188k | while (!Q.empty()188k ) { | 268 | 187k | Instruction *U = dyn_cast<Instruction>(Q.pop_front_val()); | 269 | 187k | if (!U || 187k U->getParent()164k ) | 270 | 25.8k | continue; | 271 | 162k | if (162k !F(U)162k ) | 272 | 0 | continue; | 273 | 162k | for (Value *Op : U->operands()) | 274 | 368k | Q.push_back(Op); | 275 | 187k | } | 276 | 470 | } |
HexagonLoopIdiomRecognition.cpp:void (anonymous namespace)::Simplifier::Context::traverse<(anonymous namespace)::Simplifier::Context::unuse(llvm::Value*)::$_2>(llvm::Value*, (anonymous namespace)::Simplifier::Context::unuse(llvm::Value*)::$_2) Line | Count | Source | 263 | 845 | void Simplifier::Context::traverse(Value *V, FuncT F) { | 264 | 845 | WorkListType Q; | 265 | 845 | Q.push_back(V); | 266 | 845 | | 267 | 2.54k | while (!Q.empty()2.54k ) { | 268 | 1.69k | Instruction *U = dyn_cast<Instruction>(Q.pop_front_val()); | 269 | 1.69k | if (!U || 1.69k U->getParent()1.46k ) | 270 | 252 | continue; | 271 | 1.44k | if (1.44k !F(U)1.44k ) | 272 | 1.02k | continue; | 273 | 419 | for (Value *Op : U->operands()) | 274 | 852 | Q.push_back(Op); | 275 | 1.69k | } | 276 | 845 | } |
|
277 | | |
278 | 0 | void Simplifier::Context::print(raw_ostream &OS, const Value *V) const { |
279 | 0 | const auto *U = dyn_cast<const Instruction>(V); |
280 | 0 | if (!U0 ) { |
281 | 0 | OS << V << '(' << *V << ')'; |
282 | 0 | return; |
283 | 0 | } |
284 | 0 |
|
285 | 0 | if (0 U->getParent()0 ) { |
286 | 0 | OS << U << '('; |
287 | 0 | U->printAsOperand(OS, true); |
288 | 0 | OS << ')'; |
289 | 0 | return; |
290 | 0 | } |
291 | 0 |
|
292 | 0 | unsigned N = U->getNumOperands(); |
293 | 0 | if (N != 0) |
294 | 0 | OS << U << '('; |
295 | 0 | OS << U->getOpcodeName(); |
296 | 0 | for (const Value *Op : U->operands()) { |
297 | 0 | OS << ' '; |
298 | 0 | print(OS, Op); |
299 | 0 | } |
300 | 0 | if (N != 0) |
301 | 0 | OS << ')'; |
302 | 0 | } |
303 | | |
304 | 19 | void Simplifier::Context::initialize(Instruction *Exp) { |
305 | 19 | // Perform a deep clone of the expression, set Root to the root |
306 | 19 | // of the clone, and build a map from the cloned values to the |
307 | 19 | // original ones. |
308 | 19 | ValueMapType M; |
309 | 19 | BasicBlock *Block = Exp->getParent(); |
310 | 19 | WorkListType Q; |
311 | 19 | Q.push_back(Exp); |
312 | 19 | |
313 | 307 | while (!Q.empty()307 ) { |
314 | 288 | Value *V = Q.pop_front_val(); |
315 | 288 | if (M.find(V) != M.end()) |
316 | 5 | continue; |
317 | 283 | if (Instruction *283 U283 = dyn_cast<Instruction>(V)) { |
318 | 201 | if (isa<PHINode>(U) || 201 U->getParent() != Block182 ) |
319 | 32 | continue; |
320 | 169 | for (Value *Op : U->operands()) |
321 | 339 | Q.push_back(Op); |
322 | 201 | M.insert({U, U->clone()}); |
323 | 201 | } |
324 | 288 | } |
325 | 19 | |
326 | 169 | for (std::pair<Value*,Value*> P : M) { |
327 | 169 | Instruction *U = cast<Instruction>(P.second); |
328 | 508 | for (unsigned i = 0, n = U->getNumOperands(); i != n508 ; ++i339 ) { |
329 | 339 | auto F = M.find(U->getOperand(i)); |
330 | 339 | if (F != M.end()) |
331 | 186 | U->setOperand(i, F->second); |
332 | 339 | } |
333 | 169 | } |
334 | 19 | |
335 | 19 | auto R = M.find(Exp); |
336 | 19 | assert(R != M.end()); |
337 | 19 | Root = R->second; |
338 | 19 | |
339 | 19 | record(Root); |
340 | 19 | use(Root); |
341 | 19 | } |
342 | | |
343 | 470 | void Simplifier::Context::record(Value *V) { |
344 | 10.7k | auto Record = [this](Instruction *U) -> bool { |
345 | 10.7k | Clones.insert(U); |
346 | 10.7k | return true; |
347 | 10.7k | }; |
348 | 470 | traverse(V, Record); |
349 | 470 | } |
350 | | |
351 | 470 | void Simplifier::Context::use(Value *V) { |
352 | 162k | auto Use = [this](Instruction *U) -> bool { |
353 | 162k | Used.insert(U); |
354 | 162k | return true; |
355 | 162k | }; |
356 | 470 | traverse(V, Use); |
357 | 470 | } |
358 | | |
359 | 845 | void Simplifier::Context::unuse(Value *V) { |
360 | 845 | if (!isa<Instruction>(V) || 845 cast<Instruction>(V)->getParent() != nullptr845 ) |
361 | 0 | return; |
362 | 845 | |
363 | 845 | auto Unuse = [this](Instruction *U) -> bool 845 { |
364 | 1.44k | if (!U->use_empty()) |
365 | 1.02k | return false; |
366 | 419 | Used.erase(U); |
367 | 419 | return true; |
368 | 419 | }; |
369 | 845 | traverse(V, Unuse); |
370 | 845 | } |
371 | | |
372 | 545 | Value *Simplifier::Context::subst(Value *Tree, Value *OldV, Value *NewV) { |
373 | 545 | if (Tree == OldV) |
374 | 1 | return NewV; |
375 | 544 | if (544 OldV == NewV544 ) |
376 | 0 | return Tree; |
377 | 544 | |
378 | 544 | WorkListType Q; |
379 | 544 | Q.push_back(Tree); |
380 | 187k | while (!Q.empty()187k ) { |
381 | 186k | Instruction *U = dyn_cast<Instruction>(Q.pop_front_val()); |
382 | 186k | // If U is not an instruction, or it's not a clone, skip it. |
383 | 186k | if (!U || 186k U->getParent()162k ) |
384 | 26.4k | continue; |
385 | 524k | for (unsigned i = 0, n = U->getNumOperands(); 160k i != n524k ; ++i364k ) { |
386 | 364k | Value *Op = U->getOperand(i); |
387 | 364k | if (Op == OldV364k ) { |
388 | 845 | U->setOperand(i, NewV); |
389 | 845 | unuse(OldV); |
390 | 364k | } else { |
391 | 363k | Q.push_back(Op); |
392 | 363k | } |
393 | 364k | } |
394 | 186k | } |
395 | 545 | return Tree; |
396 | 545 | } |
397 | | |
398 | 451 | void Simplifier::Context::replace(Value *OldV, Value *NewV) { |
399 | 451 | if (Root == OldV451 ) { |
400 | 2 | Root = NewV; |
401 | 2 | use(Root); |
402 | 2 | return; |
403 | 2 | } |
404 | 449 | |
405 | 449 | // NewV may be a complex tree that has just been created by one of the |
406 | 449 | // transformation rules. We need to make sure that it is commoned with |
407 | 449 | // the existing Root to the maximum extent possible. |
408 | 449 | // Identify all subtrees of NewV (including NewV itself) that have |
409 | 449 | // equivalent counterparts in Root, and replace those subtrees with |
410 | 449 | // these counterparts. |
411 | 449 | WorkListType Q; |
412 | 449 | Q.push_back(NewV); |
413 | 3.38k | while (!Q.empty()3.38k ) { |
414 | 2.93k | Value *V = Q.pop_front_val(); |
415 | 2.93k | Instruction *U = dyn_cast<Instruction>(V); |
416 | 2.93k | if (!U || 2.93k U->getParent()2.41k ) |
417 | 531 | continue; |
418 | 2.40k | if (Value *2.40k DupV2.40k = find(Root, V)) { |
419 | 1.21k | if (DupV != V) |
420 | 96 | NewV = subst(NewV, V, DupV); |
421 | 2.40k | } else { |
422 | 1.18k | for (Value *Op : U->operands()) |
423 | 2.81k | Q.push_back(Op); |
424 | 1.18k | } |
425 | 2.93k | } |
426 | 451 | |
427 | 451 | // Now, simply replace OldV with NewV in Root. |
428 | 451 | Root = subst(Root, OldV, NewV); |
429 | 451 | use(Root); |
430 | 451 | } |
431 | | |
432 | 19 | void Simplifier::Context::cleanup() { |
433 | 1.44k | for (Value *V : Clones) { |
434 | 1.44k | Instruction *U = cast<Instruction>(V); |
435 | 1.44k | if (!U->getParent()) |
436 | 1.43k | U->dropAllReferences(); |
437 | 1.44k | } |
438 | 19 | |
439 | 1.44k | for (Value *V : Clones) { |
440 | 1.44k | Instruction *U = cast<Instruction>(V); |
441 | 1.44k | if (!U->getParent()) |
442 | 1.43k | U->deleteValue(); |
443 | 1.44k | } |
444 | 19 | } |
445 | | |
446 | | bool Simplifier::Context::equal(const Instruction *I, |
447 | 1.26M | const Instruction *J) const { |
448 | 1.26M | if (I == J) |
449 | 0 | return true; |
450 | 1.26M | if (1.26M !I->isSameOperationAs(J)1.26M ) |
451 | 497k | return false; |
452 | 769k | if (769k isa<PHINode>(I)769k ) |
453 | 593 | return I->isIdenticalTo(J); |
454 | 769k | |
455 | 843k | for (unsigned i = 0, n = I->getNumOperands(); 769k i != n843k ; ++i74.8k ) { |
456 | 843k | Value *OpI = I->getOperand(i), *OpJ = J->getOperand(i); |
457 | 843k | if (OpI == OpJ) |
458 | 74.4k | continue; |
459 | 769k | auto *InI = dyn_cast<const Instruction>(OpI); |
460 | 769k | auto *InJ = dyn_cast<const Instruction>(OpJ); |
461 | 769k | if (InI && 769k InJ712k ) { |
462 | 649k | if (!equal(InI, InJ)) |
463 | 649k | return false; |
464 | 119k | } else if (119k InI != InJ || 119k !InI23.3k ) |
465 | 119k | return false; |
466 | 843k | } |
467 | 475 | return true; |
468 | 1.26M | } |
469 | | |
470 | 2.40k | Value *Simplifier::Context::find(Value *Tree, Value *Sub) const { |
471 | 2.40k | Instruction *SubI = dyn_cast<Instruction>(Sub); |
472 | 2.40k | WorkListType Q; |
473 | 2.40k | Q.push_back(Tree); |
474 | 2.40k | |
475 | 707k | while (!Q.empty()707k ) { |
476 | 705k | Value *V = Q.pop_front_val(); |
477 | 705k | if (V == Sub) |
478 | 1.12k | return V; |
479 | 704k | Instruction *U = dyn_cast<Instruction>(V); |
480 | 704k | if (!U || 704k U->getParent()624k ) |
481 | 86.9k | continue; |
482 | 617k | if (617k SubI && 617k equal(SubI, U)617k ) |
483 | 96 | return U; |
484 | 617k | assert(!isa<PHINode>(U)); |
485 | 617k | for (Value *Op : U->operands()) |
486 | 1.44M | Q.push_back(Op); |
487 | 705k | } |
488 | 1.18k | return nullptr; |
489 | 2.40k | } |
490 | | |
491 | | void Simplifier::Context::link(Instruction *I, BasicBlock *B, |
492 | 24 | BasicBlock::iterator At) { |
493 | 24 | if (I->getParent()) |
494 | 9 | return; |
495 | 15 | |
496 | 15 | for (Value *Op : I->operands()) 15 { |
497 | 29 | if (Instruction *OpI = dyn_cast<Instruction>(Op)) |
498 | 22 | link(OpI, B, At); |
499 | 29 | } |
500 | 24 | |
501 | 24 | B->getInstList().insert(At, I); |
502 | 24 | } |
503 | | |
504 | | Value *Simplifier::Context::materialize(BasicBlock *B, |
505 | 2 | BasicBlock::iterator At) { |
506 | 2 | if (Instruction *RootI = dyn_cast<Instruction>(Root)) |
507 | 2 | link(RootI, B, At); |
508 | 2 | return Root; |
509 | 2 | } |
510 | | |
511 | 19 | Value *Simplifier::simplify(Context &C) { |
512 | 19 | WorkListType Q; |
513 | 19 | Q.push_back(C.Root); |
514 | 19 | unsigned Count = 0; |
515 | 19 | const unsigned Limit = SimplifyLimit; |
516 | 19 | |
517 | 11.5k | while (!Q.empty()11.5k ) { |
518 | 11.5k | if (Count++ >= Limit) |
519 | 1 | break; |
520 | 11.5k | Instruction *U = dyn_cast<Instruction>(Q.pop_front_val()); |
521 | 11.5k | if (!U || 11.5k U->getParent()11.0k || !C.Used.count(U)10.8k ) |
522 | 664 | continue; |
523 | 10.8k | bool Changed = false; |
524 | 74.4k | for (Rule &R : Rules) { |
525 | 74.4k | Value *W = R(U, C.Ctx); |
526 | 74.4k | if (!W) |
527 | 74.0k | continue; |
528 | 451 | Changed = true; |
529 | 451 | C.record(W); |
530 | 451 | C.replace(U, W); |
531 | 451 | Q.push_back(C.Root); |
532 | 451 | break; |
533 | 451 | } |
534 | 10.8k | if (!Changed10.8k ) { |
535 | 10.4k | for (Value *Op : U->operands()) |
536 | 25.8k | Q.push_back(Op); |
537 | 10.4k | } |
538 | 11.5k | } |
539 | 19 | return Count < Limit ? C.Root18 : nullptr1 ; |
540 | 19 | } |
541 | | |
542 | | //===----------------------------------------------------------------------===// |
543 | | // |
544 | | // Implementation of PolynomialMultiplyRecognize |
545 | | // |
546 | | //===----------------------------------------------------------------------===// |
547 | | |
548 | | namespace { |
549 | | |
550 | | class PolynomialMultiplyRecognize { |
551 | | public: |
552 | | explicit PolynomialMultiplyRecognize(Loop *loop, const DataLayout &dl, |
553 | | const DominatorTree &dt, const TargetLibraryInfo &tli, |
554 | | ScalarEvolution &se) |
555 | 7 | : CurLoop(loop), DL(dl), DT(dt), TLI(tli), SE(se) {} |
556 | | |
557 | | bool recognize(); |
558 | | |
559 | | private: |
560 | | using ValueSeq = SetVector<Value *>; |
561 | | |
562 | 1 | IntegerType *getPmpyType() const { |
563 | 1 | LLVMContext &Ctx = CurLoop->getHeader()->getParent()->getContext(); |
564 | 1 | return IntegerType::get(Ctx, 32); |
565 | 1 | } |
566 | | |
567 | | bool isPromotableTo(Value *V, IntegerType *Ty); |
568 | | void promoteTo(Instruction *In, IntegerType *DestTy, BasicBlock *LoopB); |
569 | | bool promoteTypes(BasicBlock *LoopB, BasicBlock *ExitB); |
570 | | |
571 | | Value *getCountIV(BasicBlock *BB); |
572 | | bool findCycle(Value *Out, Value *In, ValueSeq &Cycle); |
573 | | void classifyCycle(Instruction *DivI, ValueSeq &Cycle, ValueSeq &Early, |
574 | | ValueSeq &Late); |
575 | | bool classifyInst(Instruction *UseI, ValueSeq &Early, ValueSeq &Late); |
576 | | bool commutesWithShift(Instruction *I); |
577 | | bool highBitsAreZero(Value *V, unsigned IterCount); |
578 | | bool keepsHighBitsZero(Value *V, unsigned IterCount); |
579 | | bool isOperandShifted(Instruction *I, Value *Op); |
580 | | bool convertShiftsToLeft(BasicBlock *LoopB, BasicBlock *ExitB, |
581 | | unsigned IterCount); |
582 | | void cleanupLoopBody(BasicBlock *LoopB); |
583 | | |
584 | | struct ParsedValues { |
585 | 6 | ParsedValues() = default; |
586 | | |
587 | | Value *M = nullptr; |
588 | | Value *P = nullptr; |
589 | | Value *Q = nullptr; |
590 | | Value *R = nullptr; |
591 | | Value *X = nullptr; |
592 | | Instruction *Res = nullptr; |
593 | | unsigned IterCount = 0; |
594 | | bool Left = false; |
595 | | bool Inv = false; |
596 | | }; |
597 | | |
598 | | bool matchLeftShift(SelectInst *SelI, Value *CIV, ParsedValues &PV); |
599 | | bool matchRightShift(SelectInst *SelI, ParsedValues &PV); |
600 | | bool scanSelect(SelectInst *SI, BasicBlock *LoopB, BasicBlock *PrehB, |
601 | | Value *CIV, ParsedValues &PV, bool PreScan); |
602 | | unsigned getInverseMxN(unsigned QP); |
603 | | Value *generate(BasicBlock::iterator At, ParsedValues &PV); |
604 | | |
605 | | void setupSimplifier(); |
606 | | |
607 | | Simplifier Simp; |
608 | | Loop *CurLoop; |
609 | | const DataLayout &DL; |
610 | | const DominatorTree &DT; |
611 | | const TargetLibraryInfo &TLI; |
612 | | ScalarEvolution &SE; |
613 | | }; |
614 | | |
615 | | } // end anonymous namespace |
616 | | |
617 | 7 | Value *PolynomialMultiplyRecognize::getCountIV(BasicBlock *BB) { |
618 | 7 | pred_iterator PI = pred_begin(BB), PE = pred_end(BB); |
619 | 7 | if (std::distance(PI, PE) != 2) |
620 | 0 | return nullptr; |
621 | 7 | BasicBlock *PB = (*PI == BB) ? 7 *std::next(PI)5 : *PI2 ; |
622 | 7 | |
623 | 13 | for (auto I = BB->begin(), E = BB->end(); I != E && 13 isa<PHINode>(I)13 ; ++I6 ) { |
624 | 11 | auto *PN = cast<PHINode>(I); |
625 | 11 | Value *InitV = PN->getIncomingValueForBlock(PB); |
626 | 11 | if (!isa<ConstantInt>(InitV) || 11 !cast<ConstantInt>(InitV)->isZero()6 ) |
627 | 5 | continue; |
628 | 6 | Value *IterV = PN->getIncomingValueForBlock(BB); |
629 | 6 | if (!isa<BinaryOperator>(IterV)) |
630 | 1 | continue; |
631 | 5 | auto *BO = dyn_cast<BinaryOperator>(IterV); |
632 | 5 | if (BO->getOpcode() != Instruction::Add) |
633 | 0 | continue; |
634 | 5 | Value *IncV = nullptr; |
635 | 5 | if (BO->getOperand(0) == PN) |
636 | 5 | IncV = BO->getOperand(1); |
637 | 0 | else if (0 BO->getOperand(1) == PN0 ) |
638 | 0 | IncV = BO->getOperand(0); |
639 | 5 | if (IncV == nullptr) |
640 | 0 | continue; |
641 | 5 | |
642 | 5 | if (auto *5 T5 = dyn_cast<ConstantInt>(IncV)) |
643 | 5 | if (5 T->getZExtValue() == 15 ) |
644 | 5 | return PN; |
645 | 11 | } |
646 | 2 | return nullptr; |
647 | 7 | } |
648 | | |
649 | 2 | static void replaceAllUsesOfWithIn(Value *I, Value *J, BasicBlock *BB) { |
650 | 5 | for (auto UI = I->user_begin(), UE = I->user_end(); UI != UE5 ;) { |
651 | 3 | Use &TheUse = UI.getUse(); |
652 | 3 | ++UI; |
653 | 3 | if (auto *II = dyn_cast<Instruction>(TheUse.getUser())) |
654 | 3 | if (3 BB == II->getParent()3 ) |
655 | 3 | II->replaceUsesOfWith(I, J); |
656 | 3 | } |
657 | 2 | } |
658 | | |
659 | | bool PolynomialMultiplyRecognize::matchLeftShift(SelectInst *SelI, |
660 | 21 | Value *CIV, ParsedValues &PV) { |
661 | 21 | // Match the following: |
662 | 21 | // select (X & (1 << i)) != 0 ? R ^ (Q << i) : R |
663 | 21 | // select (X & (1 << i)) == 0 ? R : R ^ (Q << i) |
664 | 21 | // The condition may also check for equality with the masked value, i.e |
665 | 21 | // select (X & (1 << i)) == (1 << i) ? R ^ (Q << i) : R |
666 | 21 | // select (X & (1 << i)) != (1 << i) ? R : R ^ (Q << i); |
667 | 21 | |
668 | 21 | Value *CondV = SelI->getCondition(); |
669 | 21 | Value *TrueV = SelI->getTrueValue(); |
670 | 21 | Value *FalseV = SelI->getFalseValue(); |
671 | 21 | |
672 | 21 | using namespace PatternMatch; |
673 | 21 | |
674 | 21 | CmpInst::Predicate P; |
675 | 21 | Value *A = nullptr, *B = nullptr, *C = nullptr; |
676 | 21 | |
677 | 21 | if (!match(CondV, m_ICmp(P, m_And(m_Value(A), m_Value(B)), m_Value(C))) && |
678 | 16 | !match(CondV, m_ICmp(P, m_Value(C), m_And(m_Value(A), m_Value(B))))) |
679 | 16 | return false; |
680 | 5 | if (5 P != CmpInst::ICMP_EQ && 5 P != CmpInst::ICMP_NE0 ) |
681 | 0 | return false; |
682 | 5 | // Matched: select (A & B) == C ? ... : ... |
683 | 5 | // select (A & B) != C ? ... : ... |
684 | 5 | |
685 | 5 | Value *X = nullptr, *Sh1 = nullptr; |
686 | 5 | // Check (A & B) for (X & (1 << i)): |
687 | 5 | if (match(A, m_Shl(m_One(), m_Specific(CIV)))5 ) { |
688 | 2 | Sh1 = A; |
689 | 2 | X = B; |
690 | 5 | } else if (3 match(B, m_Shl(m_One(), m_Specific(CIV)))3 ) { |
691 | 1 | Sh1 = B; |
692 | 1 | X = A; |
693 | 3 | } else { |
694 | 2 | // TODO: Could also check for an induction variable containing single |
695 | 2 | // bit shifted left by 1 in each iteration. |
696 | 2 | return false; |
697 | 2 | } |
698 | 3 | |
699 | 3 | bool TrueIfZero; |
700 | 3 | |
701 | 3 | // Check C against the possible values for comparison: 0 and (1 << i): |
702 | 3 | if (match(C, m_Zero())) |
703 | 2 | TrueIfZero = (P == CmpInst::ICMP_EQ); |
704 | 1 | else if (1 C == Sh11 ) |
705 | 1 | TrueIfZero = (P == CmpInst::ICMP_NE); |
706 | 1 | else |
707 | 0 | return false; |
708 | 3 | |
709 | 3 | // So far, matched: |
710 | 3 | // select (X & (1 << i)) ? ... : ... |
711 | 3 | // including variations of the check against zero/non-zero value. |
712 | 3 | |
713 | 3 | Value *ShouldSameV = nullptr, *ShouldXoredV = nullptr; |
714 | 3 | if (TrueIfZero3 ) { |
715 | 2 | ShouldSameV = TrueV; |
716 | 2 | ShouldXoredV = FalseV; |
717 | 3 | } else { |
718 | 1 | ShouldSameV = FalseV; |
719 | 1 | ShouldXoredV = TrueV; |
720 | 1 | } |
721 | 3 | |
722 | 3 | Value *Q = nullptr, *R = nullptr, *Y = nullptr, *Z = nullptr; |
723 | 3 | Value *T = nullptr; |
724 | 3 | if (match(ShouldXoredV, m_Xor(m_Value(Y), m_Value(Z)))3 ) { |
725 | 3 | // Matched: select +++ ? ... : Y ^ Z |
726 | 3 | // select +++ ? Y ^ Z : ... |
727 | 3 | // where +++ denotes previously checked matches. |
728 | 3 | if (ShouldSameV == Y) |
729 | 1 | T = Z; |
730 | 2 | else if (2 ShouldSameV == Z2 ) |
731 | 2 | T = Y; |
732 | 2 | else |
733 | 0 | return false; |
734 | 3 | R = ShouldSameV; |
735 | 3 | // Matched: select +++ ? R : R ^ T |
736 | 3 | // select +++ ? R ^ T : R |
737 | 3 | // depending on TrueIfZero. |
738 | 3 | |
739 | 3 | } else if (0 match(ShouldSameV, m_Zero())0 ) { |
740 | 0 | // Matched: select +++ ? 0 : ... |
741 | 0 | // select +++ ? ... : 0 |
742 | 0 | if (!SelI->hasOneUse()) |
743 | 0 | return false; |
744 | 0 | T = ShouldXoredV; |
745 | 0 | // Matched: select +++ ? 0 : T |
746 | 0 | // select +++ ? T : 0 |
747 | 0 |
|
748 | 0 | Value *U = *SelI->user_begin(); |
749 | 0 | if (!match(U, m_Xor(m_Specific(SelI), m_Value(R))) && |
750 | 0 | !match(U, m_Xor(m_Value(R), m_Specific(SelI)))) |
751 | 0 | return false; |
752 | 0 | // Matched: xor (select +++ ? 0 : T), R |
753 | 0 | // xor (select +++ ? T : 0), R |
754 | 0 | } else |
755 | 0 | return false; |
756 | 3 | |
757 | 3 | // The xor input value T is isolated into its own match so that it could |
758 | 3 | // be checked against an induction variable containing a shifted bit |
759 | 3 | // (todo). |
760 | 3 | // For now, check against (Q << i). |
761 | 3 | if (3 !match(T, m_Shl(m_Value(Q), m_Specific(CIV))) && |
762 | 2 | !match(T, m_Shl(m_ZExt(m_Value(Q)), m_ZExt(m_Specific(CIV))))) |
763 | 0 | return false; |
764 | 3 | // Matched: select +++ ? R : R ^ (Q << i) |
765 | 3 | // select +++ ? R ^ (Q << i) : R |
766 | 3 | |
767 | 3 | PV.X = X; |
768 | 3 | PV.Q = Q; |
769 | 3 | PV.R = R; |
770 | 3 | PV.Left = true; |
771 | 3 | return true; |
772 | 3 | } |
773 | | |
// Match the right-shifting polynomial-multiply select idiom. On success,
// fill in PV.X (the value whose low bit is tested), PV.Q (the xor-ed
// polynomial), PV.R (the shifted accumulator), and set PV.Left = false.
// Returns false if SelI does not have the expected shape.
bool PolynomialMultiplyRecognize::matchRightShift(SelectInst *SelI,
      ParsedValues &PV) {
  // Match the following:
  //   select (X & 1) != 0 ? (R >> 1) ^ Q : (R >> 1)
  //   select (X & 1) == 0 ? (R >> 1) : (R >> 1) ^ Q
  // The condition may also check for equality with the masked value, i.e
  //   select (X & 1) == 1 ? (R >> 1) ^ Q : (R >> 1)
  //   select (X & 1) != 1 ? (R >> 1) : (R >> 1) ^ Q

  Value *CondV = SelI->getCondition();
  Value *TrueV = SelI->getTrueValue();
  Value *FalseV = SelI->getFalseValue();

  using namespace PatternMatch;

  Value *C = nullptr;
  CmpInst::Predicate P;
  bool TrueIfZero;

  // First classify the comparison: is the "true" arm taken when the
  // tested bit is zero? Both "cmp C, 0" and "cmp C, 1" forms (with
  // either operand order) are accepted, EQ/NE only.
  if (match(CondV, m_ICmp(P, m_Value(C), m_Zero())) ||
      match(CondV, m_ICmp(P, m_Zero(), m_Value(C)))) {
    if (P != CmpInst::ICMP_EQ && P != CmpInst::ICMP_NE)
      return false;
    // Matched: select C == 0 ? ... : ...
    //          select C != 0 ? ... : ...
    TrueIfZero = (P == CmpInst::ICMP_EQ);
  } else if (match(CondV, m_ICmp(P, m_Value(C), m_One())) ||
             match(CondV, m_ICmp(P, m_One(), m_Value(C)))) {
    if (P != CmpInst::ICMP_EQ && P != CmpInst::ICMP_NE)
      return false;
    // Matched: select C == 1 ? ... : ...
    //          select C != 1 ? ... : ...
    TrueIfZero = (P == CmpInst::ICMP_NE);
  } else
    return false;

  // The compared value itself must be a low-bit test: X & 1.
  Value *X = nullptr;
  if (!match(C, m_And(m_Value(X), m_One())) &&
      !match(C, m_And(m_One(), m_Value(X))))
    return false;
  // Matched: select (X & 1) == +++ ? ... : ...
  //          select (X & 1) != +++ ? ... : ...

  Value *R = nullptr, *Q = nullptr;
  if (TrueIfZero) {
    // The select's condition is true if the tested bit is 0.
    // TrueV must be the shift, FalseV must be the xor.
    if (!match(TrueV, m_LShr(m_Value(R), m_One())))
      return false;
    // Matched: select +++ ? (R >> 1) : ...
    if (!match(FalseV, m_Xor(m_Specific(TrueV), m_Value(Q))) &&
        !match(FalseV, m_Xor(m_Value(Q), m_Specific(TrueV))))
      return false;
    // Matched: select +++ ? (R >> 1) : (R >> 1) ^ Q
    // with commuting ^.
  } else {
    // The select's condition is true if the tested bit is 1.
    // TrueV must be the xor, FalseV must be the shift.
    if (!match(FalseV, m_LShr(m_Value(R), m_One())))
      return false;
    // Matched: select +++ ? ... : (R >> 1)
    if (!match(TrueV, m_Xor(m_Specific(FalseV), m_Value(Q))) &&
        !match(TrueV, m_Xor(m_Value(Q), m_Specific(FalseV))))
      return false;
    // Matched: select +++ ? (R >> 1) ^ Q : (R >> 1)
    // with commuting ^.
  }

  PV.X = X;
  PV.Q = Q;
  PV.R = R;
  PV.Left = false;
  return true;
}
848 | | |
// Scan a select instruction for either the left- or right-shifting pmpy
// idiom. In pre-scan mode a successful shape match is enough; otherwise
// the surrounding recurrence is validated and ParsedValues is completed
// (Res, P, M, Inv). Only the left-shift form is ever fully accepted.
bool PolynomialMultiplyRecognize::scanSelect(SelectInst *SelI,
      BasicBlock *LoopB, BasicBlock *PrehB, Value *CIV, ParsedValues &PV,
      bool PreScan) {
  using namespace PatternMatch;

  // The basic pattern for R = P.Q is:
  //   for i = 0..31
  //     R = phi (0, R')
  //     if (P & (1 << i))        ; test-bit(P, i)
  //       R' = R ^ (Q << i)
  //
  // Similarly, the basic pattern for R = (P/Q).Q - P
  //   for i = 0..31
  //     R = phi(P, R')
  //     if (R & (1 << i))
  //       R' = R ^ (Q << i)

  // There exist idioms, where instead of Q being shifted left, P is shifted
  // right. This produces a result that is shifted right by 32 bits (the
  // non-shifted result is 64-bit).
  //
  // For R = P.Q, this would be:
  //   for i = 0..31
  //     R = phi (0, R')
  //     if ((P >> i) & 1)
  //       R' = (R >> 1) ^ Q      ; R is cycled through the loop, so it must
  //     else                     ; be shifted by 1, not i.
  //       R' = R >> 1
  //
  // And for the inverse:
  //   for i = 0..31
  //     R = phi (P, R')
  //     if (R & 1)
  //       R' = (R >> 1) ^ Q
  //     else
  //       R' = R >> 1

  // The left-shifting idioms share the same pattern:
  //   select (X & (1 << i)) ? R ^ (Q << i) : R
  // Similarly for right-shifting idioms:
  //   select (X & 1) ? (R >> 1) ^ Q

  if (matchLeftShift(SelI, CIV, PV)) {
    // If this is a pre-scan, getting this far is sufficient.
    if (PreScan)
      return true;

    // Need to make sure that the SelI goes back into R.
    auto *RPhi = dyn_cast<PHINode>(PV.R);
    if (!RPhi)
      return false;
    if (SelI != RPhi->getIncomingValueForBlock(LoopB))
      return false;
    PV.Res = SelI;

    // If X is loop invariant, it must be the input polynomial, and the
    // idiom is the basic polynomial multiply.
    if (CurLoop->isLoopInvariant(PV.X)) {
      PV.P = PV.X;
      PV.Inv = false;
    } else {
      // X is not loop invariant. If X == R, this is the inverse pmpy.
      // Otherwise, check for an xor with an invariant value. If the
      // variable argument to the xor is R, then this is still a valid
      // inverse pmpy.
      PV.Inv = true;
      if (PV.X != PV.R) {
        Value *Var = nullptr, *Inv = nullptr, *X1 = nullptr, *X2 = nullptr;
        if (!match(PV.X, m_Xor(m_Value(X1), m_Value(X2))))
          return false;
        // The invariant operand is the one that is not computed inside
        // the loop body; the other must be R.
        auto *I1 = dyn_cast<Instruction>(X1);
        auto *I2 = dyn_cast<Instruction>(X2);
        if (!I1 || I1->getParent() != LoopB) {
          Var = X2;
          Inv = X1;
        } else if (!I2 || I2->getParent() != LoopB) {
          Var = X1;
          Inv = X2;
        } else
          return false;
        if (Var != PV.R)
          return false;
        PV.M = Inv;
      }
      // The input polynomial P still needs to be determined. It will be
      // the entry value of R.
      Value *EntryP = RPhi->getIncomingValueForBlock(PrehB);
      PV.P = EntryP;
    }

    return true;
  }

  if (matchRightShift(SelI, PV)) {
    // If this is an inverse pattern, the Q polynomial must be known at
    // compile time.
    if (PV.Inv && !isa<ConstantInt>(PV.Q))
      return false;
    if (PreScan)
      return true;
    // There is no exact matching of right-shift pmpy.
    return false;
  }

  return false;
}
955 | | |
956 | | bool PolynomialMultiplyRecognize::isPromotableTo(Value *Val, |
957 | 14 | IntegerType *DestTy) { |
958 | 14 | IntegerType *T = dyn_cast<IntegerType>(Val->getType()); |
959 | 14 | if (!T || 14 T->getBitWidth() > DestTy->getBitWidth()14 ) |
960 | 0 | return false; |
961 | 14 | if (14 T->getBitWidth() == DestTy->getBitWidth()14 ) |
962 | 4 | return true; |
963 | 10 | // Non-instructions are promotable. The reason why an instruction may not |
964 | 10 | // be promotable is that it may produce a different result if its operands |
965 | 10 | // and the result are promoted, for example, it may produce more non-zero |
966 | 10 | // bits. While it would still be possible to represent the proper result |
967 | 10 | // in a wider type, it may require adding additional instructions (which |
968 | 10 | // we don't want to do). |
969 | 10 | Instruction *In = dyn_cast<Instruction>(Val); |
970 | 10 | if (!In) |
971 | 0 | return true; |
972 | 10 | // The bitwidth of the source type is smaller than the destination. |
973 | 10 | // Check if the individual operation can be promoted. |
974 | 10 | switch (In->getOpcode()) { |
975 | 7 | case Instruction::PHI: |
976 | 7 | case Instruction::ZExt: |
977 | 7 | case Instruction::And: |
978 | 7 | case Instruction::Or: |
979 | 7 | case Instruction::Xor: |
980 | 7 | case Instruction::LShr: // Shift right is ok. |
981 | 7 | case Instruction::Select: |
982 | 7 | return true; |
983 | 2 | case Instruction::ICmp: |
984 | 2 | if (CmpInst *CI = cast<CmpInst>(In)) |
985 | 2 | return CI->isEquality() || 2 CI->isUnsigned()1 ; |
986 | 0 | llvm_unreachable0 ("Cast failed unexpectedly"); |
987 | 1 | case Instruction::Add: |
988 | 1 | return In->hasNoSignedWrap() && In->hasNoUnsignedWrap(); |
989 | 0 | } |
990 | 0 | return false; |
991 | 0 | } |
992 | | |
// Widen the instruction In to DestTy in place. PHI incoming values from
// outside the loop are zero-extended; zext instructions whose operand now
// has the destination type become redundant and are removed; narrow
// constant operands are rebuilt at the destination width.
void PolynomialMultiplyRecognize::promoteTo(Instruction *In,
      IntegerType *DestTy, BasicBlock *LoopB) {
  // Leave boolean values alone.
  if (!In->getType()->isIntegerTy(1))
    In->mutateType(DestTy);
  unsigned DestBW = DestTy->getBitWidth();

  // Handle PHIs.
  if (PHINode *P = dyn_cast<PHINode>(In)) {
    unsigned N = P->getNumIncomingValues();
    for (unsigned i = 0; i != N; ++i) {
      // Values flowing around the loop back-edge are promoted as part of
      // this pass; only entry values need an explicit zext.
      BasicBlock *InB = P->getIncomingBlock(i);
      if (InB == LoopB)
        continue;
      Value *InV = P->getIncomingValue(i);
      IntegerType *Ty = cast<IntegerType>(InV->getType());
      // Do not promote values in PHI nodes of type i1.
      if (Ty != P->getType()) {
        // If the value type does not match the PHI type, the PHI type
        // must have been promoted.
        assert(Ty->getBitWidth() < DestBW);
        InV = IRBuilder<>(InB->getTerminator()).CreateZExt(InV, DestTy);
        P->setIncomingValue(i, InV);
      }
    }
  } else if (ZExtInst *Z = dyn_cast<ZExtInst>(In)) {
    // A zext whose operand was promoted to the same type is a no-op:
    // forward its uses to the operand and delete it.
    Value *Op = Z->getOperand(0);
    if (Op->getType() == Z->getType())
      Z->replaceAllUsesWith(Op);
    Z->eraseFromParent();
    return;
  }

  // Promote immediates.
  for (unsigned i = 0, n = In->getNumOperands(); i != n; ++i) {
    if (ConstantInt *CI = dyn_cast<ConstantInt>(In->getOperand(i)))
      if (CI->getType()->getBitWidth() < DestBW)
        In->setOperand(i, ConstantInt::get(DestTy, CI->getZExtValue()));
  }
}
1033 | | |
// Promote every value in the loop body to the pmpy type, provided each
// instruction is individually promotable. Exit-block PHIs are fixed up
// with truncs back to their original types. Returns true on success.
bool PolynomialMultiplyRecognize::promoteTypes(BasicBlock *LoopB,
      BasicBlock *ExitB) {
  assert(LoopB);
  // Skip loops where the exit block has more than one predecessor. The values
  // coming from the loop block will be promoted to another type, and so the
  // values coming into the exit block from other predecessors would also have
  // to be promoted.
  if (!ExitB || (ExitB->getSinglePredecessor() != LoopB))
    return false;
  IntegerType *DestTy = getPmpyType();
  // Check if the exit values have types that are no wider than the type
  // that we want to promote to.
  unsigned DestBW = DestTy->getBitWidth();
  for (Instruction &In : *ExitB) {
    PHINode *P = dyn_cast<PHINode>(&In);
    if (!P)
      break;
    if (P->getNumIncomingValues() != 1)
      return false;
    assert(P->getIncomingBlock(0) == LoopB);
    IntegerType *T = dyn_cast<IntegerType>(P->getType());
    if (!T || T->getBitWidth() > DestBW)
      return false;
  }

  // Check all instructions in the loop.
  for (Instruction &In : *LoopB)
    if (!In.isTerminator() && !isPromotableTo(&In, DestTy))
      return false;

  // Perform the promotion.
  // Snapshot the instruction list first: promoteTo may erase instructions
  // (redundant zexts), which would invalidate a live iterator.
  std::vector<Instruction*> LoopIns;
  std::transform(LoopB->begin(), LoopB->end(), std::back_inserter(LoopIns),
                 [](Instruction &In) { return &In; });
  for (Instruction *In : LoopIns)
    promoteTo(In, DestTy, LoopB);

  // Fix up the PHI nodes in the exit block.
  Instruction *EndI = ExitB->getFirstNonPHI();
  BasicBlock::iterator End = EndI ? EndI->getIterator() : ExitB->end();
  for (auto I = ExitB->begin(); I != End; ++I) {
    PHINode *P = dyn_cast<PHINode>(I);
    if (!P)
      break;
    Type *Ty0 = P->getIncomingValue(0)->getType();
    Type *PTy = P->getType();
    if (PTy != Ty0) {
      assert(Ty0 == DestTy);
      // In order to create the trunc, P must have the promoted type.
      P->mutateType(Ty0);
      Value *T = IRBuilder<>(ExitB, End).CreateTrunc(P, PTy);
      // In order for the RAUW to work, the types of P and T must match.
      P->mutateType(PTy);
      P->replaceAllUsesWith(T);
      // Final update of the P's type.
      P->mutateType(Ty0);
      cast<Instruction>(T)->setOperand(0, P);
    }
  }

  return true;
}
1096 | | |
// Find a use-def cycle from Out back to In within a single basic block,
// collecting the visited instructions in Cycle (backtracking on failure).
// Only single-phi cycles are accepted, so the cycle spans exactly one
// loop iteration. Returns true if a cycle was found.
bool PolynomialMultiplyRecognize::findCycle(Value *Out, Value *In,
      ValueSeq &Cycle) {
  // Out = ..., In, ...
  if (Out == In)
    return true;

  auto *BB = cast<Instruction>(Out)->getParent();
  bool HadPhi = false;

  for (auto U : Out->users()) {
    auto *I = dyn_cast<Instruction>(&*U);
    if (I == nullptr || I->getParent() != BB)
      continue;
    // Make sure that there are no multi-iteration cycles, e.g.
    //   p1 = phi(p2)
    //   p2 = phi(p1)
    // The cycle p1->p2->p1 would span two loop iterations.
    // Check that there is only one phi in the cycle.
    bool IsPhi = isa<PHINode>(I);
    if (IsPhi && HadPhi)
      return false;
    HadPhi |= IsPhi;
    if (Cycle.count(I))
      return false;
    Cycle.insert(I);
    if (findCycle(I, In, Cycle))
      break;
    // This user did not lead back to In; undo and try the next one.
    Cycle.remove(I);
  }
  return !Cycle.empty();
}
1128 | | |
// Partition the values of a shift cycle into "early" and "late" sets
// relative to the dividing instruction DivI. Cycle is traversed in order;
// the phi node and DivI are the two possible pivots.
void PolynomialMultiplyRecognize::classifyCycle(Instruction *DivI,
      ValueSeq &Cycle, ValueSeq &Early, ValueSeq &Late) {
  // All the values in the cycle that are between the phi node and the
  // divider instruction will be classified as "early", all other values
  // will be "late".

  bool IsE = true;
  unsigned I, N = Cycle.size();
  for (I = 0; I < N; ++I) {
    Value *V = Cycle[I];
    if (DivI == V)
      IsE = false;
    else if (!isa<PHINode>(V))
      continue;
    // Stop if found either.
    break;
  }
  // "I" is the index of either DivI or the phi node, whichever was first.
  // "E" is "false" or "true" respectively.
  ValueSeq &First = !IsE ? Early : Late;
  for (unsigned J = 0; J < I; ++J)
    First.insert(Cycle[J]);

  // The middle segment (pivot up to the next pivot) goes into the other set.
  ValueSeq &Second = IsE ? Early : Late;
  Second.insert(Cycle[I]);
  for (++I; I < N; ++I) {
    Value *V = Cycle[I];
    if (DivI == V || isa<PHINode>(V))
      break;
    Second.insert(V);
  }

  // Whatever remains after the second pivot belongs with the first segment.
  for (; I < N; ++I)
    First.insert(Cycle[I]);
}
1164 | | |
// Classify UseI as "early" or "late" based on its operands' existing
// classification. Returns false if the operands are inconsistently mixed.
// Select is special-cased: only its true/false values (not the condition)
// must agree.
bool PolynomialMultiplyRecognize::classifyInst(Instruction *UseI,
      ValueSeq &Early, ValueSeq &Late) {
  // Select is an exception, since the condition value does not have to be
  // classified in the same way as the true/false values. The true/false
  // values do have to be both early or both late.
  if (UseI->getOpcode() == Instruction::Select) {
    Value *TV = UseI->getOperand(1), *FV = UseI->getOperand(2);
    if (Early.count(TV) || Early.count(FV)) {
      if (Late.count(TV) || Late.count(FV))
        return false;
      Early.insert(UseI);
    } else if (Late.count(TV) || Late.count(FV)) {
      if (Early.count(TV) || Early.count(FV))
        return false;
      Late.insert(UseI);
    }
    return true;
  }

  // Not sure what would be the example of this, but the code below relies
  // on having at least one operand.
  if (UseI->getNumOperands() == 0)
    return true;

  // AE/AL: "could be all early" / "could be all late". An operand found
  // in one set rules out the other classification.
  bool AE = true, AL = true;
  for (auto &I : UseI->operands()) {
    if (Early.count(&*I))
      AL = false;
    else if (Late.count(&*I))
      AE = false;
  }
  // If the operands appear "all early" and "all late" at the same time,
  // then it means that none of them are actually classified as either.
  // This is harmless.
  if (AE && AL)
    return true;
  // Conversely, if they are neither "all early" nor "all late", then
  // we have a mixture of early and late operands that is not a known
  // exception.
  if (!AE && !AL)
    return false;

  // Check that we have covered the two special cases.
  assert(AE != AL);

  if (AE)
    Early.insert(UseI);
  else
    Late.insert(UseI);
  return true;
}
1216 | | |
1217 | 9 | bool PolynomialMultiplyRecognize::commutesWithShift(Instruction *I) { |
1218 | 9 | switch (I->getOpcode()) { |
1219 | 9 | case Instruction::And: |
1220 | 9 | case Instruction::Or: |
1221 | 9 | case Instruction::Xor: |
1222 | 9 | case Instruction::LShr: |
1223 | 9 | case Instruction::Shl: |
1224 | 9 | case Instruction::Select: |
1225 | 9 | case Instruction::ICmp: |
1226 | 9 | case Instruction::PHI: |
1227 | 9 | break; |
1228 | 0 | default: |
1229 | 0 | return false; |
1230 | 9 | } |
1231 | 9 | return true; |
1232 | 9 | } |
1233 | | |
1234 | | bool PolynomialMultiplyRecognize::highBitsAreZero(Value *V, |
1235 | 2 | unsigned IterCount) { |
1236 | 2 | auto *T = dyn_cast<IntegerType>(V->getType()); |
1237 | 2 | if (!T) |
1238 | 0 | return false; |
1239 | 2 | |
1240 | 2 | KnownBits Known(T->getBitWidth()); |
1241 | 2 | computeKnownBits(V, Known, DL); |
1242 | 2 | return Known.countMinLeadingZeros() >= IterCount; |
1243 | 2 | } |
1244 | | |
1245 | | bool PolynomialMultiplyRecognize::keepsHighBitsZero(Value *V, |
1246 | 11 | unsigned IterCount) { |
1247 | 11 | // Assume that all inputs to the value have the high bits zero. |
1248 | 11 | // Check if the value itself preserves the zeros in the high bits. |
1249 | 11 | if (auto *C = dyn_cast<ConstantInt>(V)) |
1250 | 2 | return C->getValue().countLeadingZeros() >= IterCount; |
1251 | 9 | |
1252 | 9 | if (auto *9 I9 = dyn_cast<Instruction>(V)) { |
1253 | 9 | switch (I->getOpcode()) { |
1254 | 9 | case Instruction::And: |
1255 | 9 | case Instruction::Or: |
1256 | 9 | case Instruction::Xor: |
1257 | 9 | case Instruction::LShr: |
1258 | 9 | case Instruction::Select: |
1259 | 9 | case Instruction::ICmp: |
1260 | 9 | case Instruction::PHI: |
1261 | 9 | case Instruction::ZExt: |
1262 | 9 | return true; |
1263 | 0 | } |
1264 | 0 | } |
1265 | 0 |
|
1266 | 0 | return false; |
1267 | 0 | } |
1268 | | |
1269 | 11 | bool PolynomialMultiplyRecognize::isOperandShifted(Instruction *I, Value *Op) { |
1270 | 11 | unsigned Opc = I->getOpcode(); |
1271 | 11 | if (Opc == Instruction::Shl || 11 Opc == Instruction::LShr11 ) |
1272 | 0 | return Op != I->getOperand(1); |
1273 | 11 | return true; |
1274 | 11 | } |
1275 | | |
// Rewrite right-shifting recurrences in LoopB into left-shifting form:
// remove the "lshr x, 1" steps, pre-shift external inputs left by the
// induction variable, and compensate the loop-exit values with a single
// right shift by IterCount. Returns false if the loop cannot be unshifted.
bool PolynomialMultiplyRecognize::convertShiftsToLeft(BasicBlock *LoopB,
      BasicBlock *ExitB, unsigned IterCount) {
  Value *CIV = getCountIV(LoopB);
  if (CIV == nullptr)
    return false;
  auto *CIVTy = dyn_cast<IntegerType>(CIV->getType());
  if (CIVTy == nullptr)
    return false;

  ValueSeq RShifts;
  ValueSeq Early, Late, Cycled;

  // Find all value cycles that contain logical right shifts by 1.
  for (Instruction &I : *LoopB) {
    using namespace PatternMatch;

    Value *V = nullptr;
    if (!match(&I, m_LShr(m_Value(V), m_One())))
      continue;
    ValueSeq C;
    if (!findCycle(&I, V, C))
      continue;

    // Found a cycle.
    C.insert(&I);
    classifyCycle(&I, C, Early, Late);
    Cycled.insert(C.begin(), C.end());
    RShifts.insert(&I);
  }

  // Find the set of all values affected by the shift cycles, i.e. all
  // cycled values, and (recursively) all their users.
  // Note: Users grows while we iterate, which is intentional — it acts
  // as a worklist.
  ValueSeq Users(Cycled.begin(), Cycled.end());
  for (unsigned i = 0; i < Users.size(); ++i) {
    Value *V = Users[i];
    if (!isa<IntegerType>(V->getType()))
      return false;
    auto *R = cast<Instruction>(V);
    // If the instruction does not commute with shifts, the loop cannot
    // be unshifted.
    if (!commutesWithShift(R))
      return false;
    for (auto I = R->user_begin(), E = R->user_end(); I != E; ++I) {
      auto *T = cast<Instruction>(*I);
      // Skip users from outside of the loop. They will be handled later.
      // Also, skip the right-shifts and phi nodes, since they mix early
      // and late values.
      if (T->getParent() != LoopB || RShifts.count(T) || isa<PHINode>(T))
        continue;

      Users.insert(T);
      if (!classifyInst(T, Early, Late))
        return false;
    }
  }

  if (Users.empty())
    return false;

  // Verify that high bits remain zero.
  ValueSeq Internal(Users.begin(), Users.end());
  ValueSeq Inputs;
  for (unsigned i = 0; i < Internal.size(); ++i) {
    auto *R = dyn_cast<Instruction>(Internal[i]);
    if (!R)
      continue;
    for (Value *Op : R->operands()) {
      auto *T = dyn_cast<Instruction>(Op);
      if (T && T->getParent() != LoopB)
        Inputs.insert(Op);
      else
        Internal.insert(Op);
    }
  }
  for (Value *V : Inputs)
    if (!highBitsAreZero(V, IterCount))
      return false;
  for (Value *V : Internal)
    if (!keepsHighBitsZero(V, IterCount))
      return false;

  // Finally, the work can be done. Unshift each user.
  IRBuilder<> IRB(LoopB);
  std::map<Value*,Value*> ShiftMap;

  using CastMapType = std::map<std::pair<Value *, Type *>, Value *>;

  CastMapType CastMap;

  // Memoized cast: widen V to Ty (zero-extended), reusing a previously
  // generated cast if one exists for the same (value, type) pair.
  auto upcast = [] (CastMapType &CM, IRBuilder<> &IRB, Value *V,
        IntegerType *Ty) -> Value* {
    auto H = CM.find(std::make_pair(V, Ty));
    if (H != CM.end())
      return H->second;
    Value *CV = IRB.CreateIntCast(V, Ty, false);
    CM.insert(std::make_pair(std::make_pair(V, Ty), CV));
    return CV;
  };

  for (auto I = LoopB->begin(), E = LoopB->end(); I != E; ++I) {
    using namespace PatternMatch;

    if (isa<PHINode>(I) || !Users.count(&*I))
      continue;

    // Match lshr x, 1.
    Value *V = nullptr;
    if (match(&*I, m_LShr(m_Value(V), m_One()))) {
      replaceAllUsesOfWithIn(&*I, V, LoopB);
      continue;
    }
    // For each non-cycled operand, replace it with the corresponding
    // value shifted left.
    for (auto &J : I->operands()) {
      Value *Op = J.get();
      if (!isOperandShifted(&*I, Op))
        continue;
      if (Users.count(Op))
        continue;
      // Skip shifting zeros.
      if (isa<ConstantInt>(Op) && cast<ConstantInt>(Op)->isZero())
        continue;
      // Check if we have already generated a shift for this value.
      auto F = ShiftMap.find(Op);
      Value *W = (F != ShiftMap.end()) ? F->second : nullptr;
      if (W == nullptr) {
        IRB.SetInsertPoint(&*I);
        // First, the shift amount will be CIV or CIV+1, depending on
        // whether the value is early or late. Instead of creating CIV+1,
        // do a single shift of the value.
        Value *ShAmt = CIV, *ShVal = Op;
        auto *VTy = cast<IntegerType>(ShVal->getType());
        auto *ATy = cast<IntegerType>(ShAmt->getType());
        if (Late.count(&*I))
          ShVal = IRB.CreateShl(Op, ConstantInt::get(VTy, 1));
        // Second, the types of the shifted value and the shift amount
        // must match.
        if (VTy != ATy) {
          if (VTy->getBitWidth() < ATy->getBitWidth())
            ShVal = upcast(CastMap, IRB, ShVal, ATy);
          else
            ShAmt = upcast(CastMap, IRB, ShAmt, VTy);
        }
        // Ready to generate the shift and memoize it.
        W = IRB.CreateShl(ShVal, ShAmt);
        ShiftMap.insert(std::make_pair(Op, W));
      }
      I->replaceUsesOfWith(Op, W);
    }
  }

  // Update the users outside of the loop to account for having left
  // shifts. They would normally be shifted right in the loop, so shift
  // them right after the loop exit.
  // Take advantage of the loop-closed SSA form, which has all the post-
  // loop values in phi nodes.
  IRB.SetInsertPoint(ExitB, ExitB->getFirstInsertionPt());
  for (auto P = ExitB->begin(), Q = ExitB->end(); P != Q; ++P) {
    if (!isa<PHINode>(P))
      break;
    auto *PN = cast<PHINode>(P);
    Value *U = PN->getIncomingValueForBlock(LoopB);
    if (!Users.count(U))
      continue;
    Value *S = IRB.CreateLShr(PN, ConstantInt::get(PN->getType(), IterCount));
    PN->replaceAllUsesWith(S);
    // The above RAUW will create
    //   S = lshr S, IterCount
    // so we need to fix it back into
    //   S = lshr PN, IterCount
    cast<User>(S)->replaceUsesOfWith(S, PN);
  }

  return true;
}
1451 | | |
// Simplify the loop body after idiom replacement: fold each instruction
// that InstSimplify can prove redundant, then delete everything that
// became trivially dead.
void PolynomialMultiplyRecognize::cleanupLoopBody(BasicBlock *LoopB) {
  for (auto &I : *LoopB)
    if (Value *SV = SimplifyInstruction(&I, {DL, &TLI, &DT}))
      I.replaceAllUsesWith(SV);

  // Advance the iterator before deleting: RecursivelyDelete... may erase
  // the current instruction (and others), invalidating I.
  for (auto I = LoopB->begin(), N = I; I != LoopB->end(); I = N) {
    N = std::next(I);
    RecursivelyDeleteTriviallyDeadInstructions(&*I, &TLI);
  }
}
1462 | | |
1463 | 1 | unsigned PolynomialMultiplyRecognize::getInverseMxN(unsigned QP) { |
1464 | 1 | // Arrays of coefficients of Q and the inverse, C. |
1465 | 1 | // Q[i] = coefficient at x^i. |
1466 | 1 | std::array<char,32> Q, C; |
1467 | 1 | |
1468 | 33 | for (unsigned i = 0; i < 3233 ; ++i32 ) { |
1469 | 32 | Q[i] = QP & 1; |
1470 | 32 | QP >>= 1; |
1471 | 32 | } |
1472 | 1 | assert(Q[0] == 1); |
1473 | 1 | |
1474 | 1 | // Find C, such that |
1475 | 1 | // (Q[n]*x^n + ... + Q[1]*x + Q[0]) * (C[n]*x^n + ... + C[1]*x + C[0]) = 1 |
1476 | 1 | // |
1477 | 1 | // For it to have a solution, Q[0] must be 1. Since this is Z2[x], the |
1478 | 1 | // operations * and + are & and ^ respectively. |
1479 | 1 | // |
1480 | 1 | // Find C[i] recursively, by comparing i-th coefficient in the product |
1481 | 1 | // with 0 (or 1 for i=0). |
1482 | 1 | // |
1483 | 1 | // C[0] = 1, since C[0] = Q[0], and Q[0] = 1. |
1484 | 1 | C[0] = 1; |
1485 | 32 | for (unsigned i = 1; i < 3232 ; ++i31 ) { |
1486 | 31 | // Solve for C[i] in: |
1487 | 31 | // C[0]Q[i] ^ C[1]Q[i-1] ^ ... ^ C[i-1]Q[1] ^ C[i]Q[0] = 0 |
1488 | 31 | // This is equivalent to |
1489 | 31 | // C[0]Q[i] ^ C[1]Q[i-1] ^ ... ^ C[i-1]Q[1] ^ C[i] = 0 |
1490 | 31 | // which is |
1491 | 31 | // C[0]Q[i] ^ C[1]Q[i-1] ^ ... ^ C[i-1]Q[1] = C[i] |
1492 | 31 | unsigned T = 0; |
1493 | 527 | for (unsigned j = 0; j < i527 ; ++j496 ) |
1494 | 496 | T = T ^ (C[j] & Q[i-j]); |
1495 | 31 | C[i] = T; |
1496 | 31 | } |
1497 | 1 | |
1498 | 1 | unsigned QV = 0; |
1499 | 33 | for (unsigned i = 0; i < 3233 ; ++i32 ) |
1500 | 32 | if (32 C[i]32 ) |
1501 | 32 | QV |= (1 << i); |
1502 | 1 | |
1503 | 1 | return QV; |
1504 | 1 | } |
1505 | | |
// Emit the replacement code for the recognized idiom at position At,
// using the Hexagon M4_pmpyw intrinsic. Handles the masked (PV.M) and
// inverse (PV.Inv) variants. Returns the final result value.
Value *PolynomialMultiplyRecognize::generate(BasicBlock::iterator At,
      ParsedValues &PV) {
  IRBuilder<> B(&*At);
  Module *M = At->getParent()->getParent()->getParent();
  Value *PMF = Intrinsic::getDeclaration(M, Intrinsic::hexagon_M4_pmpyw);

  Value *P = PV.P, *Q = PV.Q, *P0 = P;
  unsigned IC = PV.IterCount;

  if (PV.M != nullptr)
    P0 = P = B.CreateXor(P, PV.M);

  // Create a bit mask to clear the high bits beyond IterCount.
  auto *BMI = ConstantInt::get(P->getType(), APInt::getLowBitsSet(32, IC));

  if (PV.IterCount != 32)
    P = B.CreateAnd(P, BMI);

  if (PV.Inv) {
    // For the inverse pmpy, Q must be a compile-time constant (checked
    // during matching), so its inverse can be computed here.
    auto *QI = dyn_cast<ConstantInt>(PV.Q);
    assert(QI && QI->getBitWidth() <= 32);

    // Again, clearing bits beyond IterCount.
    unsigned M = (1 << PV.IterCount) - 1;
    unsigned Tmp = (QI->getZExtValue() | 1) & M;
    unsigned QV = getInverseMxN(Tmp) & M;
    auto *QVI = ConstantInt::get(QI->getType(), QV);
    P = B.CreateCall(PMF, {P, QVI});
    P = B.CreateTrunc(P, QI->getType());
    if (IC != 32)
      P = B.CreateAnd(P, BMI);
  }

  Value *R = B.CreateCall(PMF, {P, Q});

  if (PV.M != nullptr)
    R = B.CreateXor(R, B.CreateIntCast(P0, R->getType(), false));

  return R;
}
1546 | | |
1547 | 6 | void PolynomialMultiplyRecognize::setupSimplifier() { |
1548 | 6 | Simp.addRule( |
1549 | 6 | // Sink zext past bitwise operations. |
1550 | 10.8k | [](Instruction *I, LLVMContext &Ctx) -> Value* { |
1551 | 10.8k | if (I->getOpcode() != Instruction::ZExt) |
1552 | 10.8k | return nullptr; |
1553 | 13 | Instruction *T = dyn_cast<Instruction>(I->getOperand(0)); |
1554 | 13 | if (!T) |
1555 | 0 | return nullptr; |
1556 | 13 | switch (T->getOpcode()) { |
1557 | 4 | case Instruction::And: |
1558 | 4 | case Instruction::Or: |
1559 | 4 | case Instruction::Xor: |
1560 | 4 | break; |
1561 | 9 | default: |
1562 | 9 | return nullptr; |
1563 | 4 | } |
1564 | 4 | IRBuilder<> B(Ctx); |
1565 | 4 | return B.CreateBinOp(cast<BinaryOperator>(T)->getOpcode(), |
1566 | 4 | B.CreateZExt(T->getOperand(0), I->getType()), |
1567 | 4 | B.CreateZExt(T->getOperand(1), I->getType())); |
1568 | 4 | }); |
1569 | 6 | Simp.addRule( |
1570 | 6 | // (xor (and x a) (and y a)) -> (and (xor x y) a) |
1571 | 10.8k | [](Instruction *I, LLVMContext &Ctx) -> Value* { |
1572 | 10.8k | if (I->getOpcode() != Instruction::Xor) |
1573 | 10.8k | return nullptr; |
1574 | 16 | Instruction *And0 = dyn_cast<Instruction>(I->getOperand(0)); |
1575 | 16 | Instruction *And1 = dyn_cast<Instruction>(I->getOperand(1)); |
1576 | 16 | if (!And0 || 16 !And116 ) |
1577 | 7 | return nullptr; |
1578 | 9 | if (9 And0->getOpcode() != Instruction::And || |
1579 | 2 | And1->getOpcode() != Instruction::And) |
1580 | 7 | return nullptr; |
1581 | 2 | if (2 And0->getOperand(1) != And1->getOperand(1)2 ) |
1582 | 0 | return nullptr; |
1583 | 2 | IRBuilder<> B(Ctx); |
1584 | 2 | return B.CreateAnd(B.CreateXor(And0->getOperand(0), And1->getOperand(0)), |
1585 | 2 | And0->getOperand(1)); |
1586 | 2 | }); |
1587 | 6 | Simp.addRule( |
1588 | 6 | // (Op (select c x y) z) -> (select c (Op x z) (Op y z)) |
1589 | 6 | // (Op x (select c y z)) -> (select c (Op x y) (Op x z)) |
1590 | 10.8k | [](Instruction *I, LLVMContext &Ctx) -> Value* { |
1591 | 10.8k | BinaryOperator *BO = dyn_cast<BinaryOperator>(I); |
1592 | 10.8k | if (!BO) |
1593 | 5.82k | return nullptr; |
1594 | 5.06k | Instruction::BinaryOps Op = BO->getOpcode(); |
1595 | 5.06k | if (SelectInst *Sel5.06k = dyn_cast<SelectInst>(BO->getOperand(0))) { |
1596 | 377 | IRBuilder<> B(Ctx); |
1597 | 377 | Value *X = Sel->getTrueValue(), *Y = Sel->getFalseValue(); |
1598 | 377 | Value *Z = BO->getOperand(1); |
1599 | 377 | return B.CreateSelect(Sel->getCondition(), |
1600 | 377 | B.CreateBinOp(Op, X, Z), |
1601 | 377 | B.CreateBinOp(Op, Y, Z)); |
1602 | 377 | } |
1603 | 4.69k | if (SelectInst *4.69k Sel4.69k = dyn_cast<SelectInst>(BO->getOperand(1))) { |
1604 | 39 | IRBuilder<> B(Ctx); |
1605 | 39 | Value *X = BO->getOperand(0); |
1606 | 39 | Value *Y = Sel->getTrueValue(), *Z = Sel->getFalseValue(); |
1607 | 39 | return B.CreateSelect(Sel->getCondition(), |
1608 | 39 | B.CreateBinOp(Op, X, Y), |
1609 | 39 | B.CreateBinOp(Op, X, Z)); |
1610 | 39 | } |
1611 | 4.65k | return nullptr; |
1612 | 4.65k | }); |
1613 | 6 | Simp.addRule( |
1614 | 6 | // (select c (select c x y) z) -> (select c x z) |
1615 | 6 | // (select c x (select c y z)) -> (select c x z) |
1616 | 10.4k | [](Instruction *I, LLVMContext &Ctx) -> Value* { |
1617 | 10.4k | SelectInst *Sel = dyn_cast<SelectInst>(I); |
1618 | 10.4k | if (!Sel) |
1619 | 5.19k | return nullptr; |
1620 | 5.27k | IRBuilder<> B(Ctx); |
1621 | 5.27k | Value *C = Sel->getCondition(); |
1622 | 5.27k | if (SelectInst *Sel05.27k = dyn_cast<SelectInst>(Sel->getTrueValue())) { |
1623 | 2.64k | if (Sel0->getCondition() == C) |
1624 | 13 | return B.CreateSelect(C, Sel0->getTrueValue(), Sel->getFalseValue()); |
1625 | 5.26k | } |
1626 | 5.26k | if (SelectInst *5.26k Sel15.26k = dyn_cast<SelectInst>(Sel->getFalseValue())) { |
1627 | 2.70k | if (Sel1->getCondition() == C) |
1628 | 13 | return B.CreateSelect(C, Sel->getTrueValue(), Sel1->getFalseValue()); |
1629 | 5.24k | } |
1630 | 5.24k | return nullptr; |
1631 | 5.24k | }); |
1632 | 6 | Simp.addRule( |
1633 | 6 | // (or (lshr x 1) 0x800.0) -> (xor (lshr x 1) 0x800.0) |
1634 | 10.4k | [](Instruction *I, LLVMContext &Ctx) -> Value* { |
1635 | 10.4k | if (I->getOpcode() != Instruction::Or) |
1636 | 10.4k | return nullptr; |
1637 | 1 | Instruction *LShr = dyn_cast<Instruction>(I->getOperand(0)); |
1638 | 1 | if (!LShr || 1 LShr->getOpcode() != Instruction::LShr1 ) |
1639 | 0 | return nullptr; |
1640 | 1 | ConstantInt *One = dyn_cast<ConstantInt>(LShr->getOperand(1)); |
1641 | 1 | if (!One || 1 One->getZExtValue() != 11 ) |
1642 | 0 | return nullptr; |
1643 | 1 | ConstantInt *Msb = dyn_cast<ConstantInt>(I->getOperand(1)); |
1644 | 1 | if (!Msb || 1 Msb->getZExtValue() != Msb->getType()->getSignBit()1 ) |
1645 | 0 | return nullptr; |
1646 | 1 | return IRBuilder<>(Ctx).CreateXor(LShr, Msb); |
1647 | 1 | }); |
1648 | 6 | Simp.addRule( |
1649 | 6 | // (lshr (BitOp x y) c) -> (BitOp (lshr x c) (lshr y c)) |
1650 | 10.4k | [](Instruction *I, LLVMContext &Ctx) -> Value* { |
1651 | 10.4k | if (I->getOpcode() != Instruction::LShr) |
1652 | 10.4k | return nullptr; |
1653 | 5 | BinaryOperator *BitOp = dyn_cast<BinaryOperator>(I->getOperand(0)); |
1654 | 5 | if (!BitOp) |
1655 | 4 | return nullptr; |
1656 | 1 | switch (BitOp->getOpcode()) { |
1657 | 1 | case Instruction::And: |
1658 | 1 | case Instruction::Or: |
1659 | 1 | case Instruction::Xor: |
1660 | 1 | break; |
1661 | 0 | default: |
1662 | 0 | return nullptr; |
1663 | 1 | } |
1664 | 1 | IRBuilder<> B(Ctx); |
1665 | 1 | Value *S = I->getOperand(1); |
1666 | 1 | return B.CreateBinOp(BitOp->getOpcode(), |
1667 | 1 | B.CreateLShr(BitOp->getOperand(0), S), |
1668 | 1 | B.CreateLShr(BitOp->getOperand(1), S)); |
1669 | 1 | }); |
1670 | 6 | Simp.addRule( |
1671 | 6 | // (BitOp1 (BitOp2 x a) b) -> (BitOp2 x (BitOp1 a b)) |
1672 | 10.4k | [](Instruction *I, LLVMContext &Ctx) -> Value* { |
1673 | 4.65k | auto IsBitOp = [](unsigned Op) -> bool { |
1674 | 4.65k | switch (Op) { |
1675 | 23 | case Instruction::And: |
1676 | 23 | case Instruction::Or: |
1677 | 23 | case Instruction::Xor: |
1678 | 23 | return true; |
1679 | 4.63k | } |
1680 | 4.63k | return false; |
1681 | 4.63k | }; |
1682 | 10.4k | BinaryOperator *BitOp1 = dyn_cast<BinaryOperator>(I); |
1683 | 10.4k | if (!BitOp1 || 10.4k !IsBitOp(BitOp1->getOpcode())4.64k ) |
1684 | 10.4k | return nullptr; |
1685 | 18 | BinaryOperator *BitOp2 = dyn_cast<BinaryOperator>(BitOp1->getOperand(0)); |
1686 | 18 | if (!BitOp2 || 18 !IsBitOp(BitOp2->getOpcode())9 ) |
1687 | 13 | return nullptr; |
1688 | 5 | ConstantInt *CA = dyn_cast<ConstantInt>(BitOp2->getOperand(1)); |
1689 | 5 | ConstantInt *CB = dyn_cast<ConstantInt>(BitOp1->getOperand(1)); |
1690 | 5 | if (!CA || 5 !CB1 ) |
1691 | 4 | return nullptr; |
1692 | 1 | IRBuilder<> B(Ctx); |
1693 | 1 | Value *X = BitOp2->getOperand(0); |
1694 | 1 | return B.CreateBinOp(BitOp2->getOpcode(), X, |
1695 | 1 | B.CreateBinOp(BitOp1->getOpcode(), CA, CB)); |
1696 | 1 | }); |
1697 | 6 | } |
1698 | | |
1699 | 7 | bool PolynomialMultiplyRecognize::recognize() { |
1700 | 7 | DEBUG(dbgs() << "Starting PolynomialMultiplyRecognize on loop\n" |
1701 | 7 | << *CurLoop << '\n'); |
1702 | 7 | // Restrictions: |
1703 | 7 | // - The loop must consist of a single block. |
1704 | 7 | // - The iteration count must be known at compile-time. |
1705 | 7 | // - The loop must have an induction variable starting from 0, and |
1706 | 7 | // incremented in each iteration of the loop. |
1707 | 7 | BasicBlock *LoopB = CurLoop->getHeader(); |
1708 | 7 | DEBUG(dbgs() << "Loop header:\n" << *LoopB); |
1709 | 7 | |
1710 | 7 | if (LoopB != CurLoop->getLoopLatch()) |
1711 | 1 | return false; |
1712 | 6 | BasicBlock *ExitB = CurLoop->getExitBlock(); |
1713 | 6 | if (ExitB == nullptr) |
1714 | 0 | return false; |
1715 | 6 | BasicBlock *EntryB = CurLoop->getLoopPreheader(); |
1716 | 6 | if (EntryB == nullptr) |
1717 | 0 | return false; |
1718 | 6 | |
1719 | 6 | unsigned IterCount = 0; |
1720 | 6 | const SCEV *CT = SE.getBackedgeTakenCount(CurLoop); |
1721 | 6 | if (isa<SCEVCouldNotCompute>(CT)) |
1722 | 0 | return false; |
1723 | 6 | if (auto *6 CV6 = dyn_cast<SCEVConstant>(CT)) |
1724 | 4 | IterCount = CV->getValue()->getZExtValue() + 1; |
1725 | 6 | |
1726 | 6 | Value *CIV = getCountIV(LoopB); |
1727 | 6 | ParsedValues PV; |
1728 | 6 | PV.IterCount = IterCount; |
1729 | 6 | DEBUG(dbgs() << "Loop IV: " << *CIV << "\nIterCount: " << IterCount << '\n'); |
1730 | 6 | |
1731 | 6 | setupSimplifier(); |
1732 | 6 | |
1733 | 6 | // Perform a preliminary scan of select instructions to see if any of them |
1734 | 6 | // looks like a generator of the polynomial multiply steps. Assume that a |
1735 | 6 | // loop can only contain a single transformable operation, so stop the |
1736 | 6 | // traversal after the first reasonable candidate was found. |
1737 | 6 | // XXX: Currently this approach can modify the loop before being 100% sure |
1738 | 6 | // that the transformation can be carried out. |
1739 | 6 | bool FoundPreScan = false; |
1740 | 118 | for (Instruction &In : *LoopB) { |
1741 | 118 | SelectInst *SI = dyn_cast<SelectInst>(&In); |
1742 | 118 | if (!SI) |
1743 | 99 | continue; |
1744 | 19 | |
1745 | 19 | Simplifier::Context C(SI); |
1746 | 19 | Value *T = Simp.simplify(C); |
1747 | 19 | SelectInst *SelI = (T && isa<SelectInst>(T)18 ) ? cast<SelectInst>(T)18 : SI1 ; |
1748 | 19 | DEBUG(dbgs() << "scanSelect(pre-scan): " << PE(C, SelI) << '\n'); |
1749 | 19 | if (scanSelect(SelI, LoopB, EntryB, CIV, PV, true)19 ) { |
1750 | 2 | FoundPreScan = true; |
1751 | 2 | if (SelI != SI2 ) { |
1752 | 2 | Value *NewSel = C.materialize(LoopB, SI->getIterator()); |
1753 | 2 | SI->replaceAllUsesWith(NewSel); |
1754 | 2 | RecursivelyDeleteTriviallyDeadInstructions(SI, &TLI); |
1755 | 2 | } |
1756 | 2 | break; |
1757 | 2 | } |
1758 | 6 | } |
1759 | 6 | |
1760 | 6 | if (!FoundPreScan6 ) { |
1761 | 4 | DEBUG(dbgs() << "Have not found candidates for pmpy\n"); |
1762 | 4 | return false; |
1763 | 4 | } |
1764 | 2 | |
1765 | 2 | if (2 !PV.Left2 ) { |
1766 | 1 | // The right shift version actually only returns the higher bits of |
1767 | 1 | // the result (each iteration discards the LSB). If we want to convert it |
1768 | 1 | // to a left-shifting loop, the working data type must be at least as |
1769 | 1 | // wide as the target's pmpy instruction. |
1770 | 1 | if (!promoteTypes(LoopB, ExitB)) |
1771 | 0 | return false; |
1772 | 1 | if (1 !convertShiftsToLeft(LoopB, ExitB, IterCount)1 ) |
1773 | 0 | return false; |
1774 | 1 | cleanupLoopBody(LoopB); |
1775 | 1 | } |
1776 | 2 | |
1777 | 2 | // Scan the loop again, find the generating select instruction. |
1778 | 2 | bool FoundScan = false; |
1779 | 18 | for (Instruction &In : *LoopB) { |
1780 | 18 | SelectInst *SelI = dyn_cast<SelectInst>(&In); |
1781 | 18 | if (!SelI) |
1782 | 16 | continue; |
1783 | 2 | DEBUG2 (dbgs() << "scanSelect: " << *SelI << '\n'); |
1784 | 2 | FoundScan = scanSelect(SelI, LoopB, EntryB, CIV, PV, false); |
1785 | 2 | if (FoundScan) |
1786 | 2 | break; |
1787 | 2 | } |
1788 | 2 | assert(FoundScan); |
1789 | 2 | |
1790 | 2 | DEBUG({ |
1791 | 2 | StringRef PP = (PV.M ? "(P+M)" : "P"); |
1792 | 2 | if (!PV.Inv) |
1793 | 2 | dbgs() << "Found pmpy idiom: R = " << PP << ".Q\n"; |
1794 | 2 | else |
1795 | 2 | dbgs() << "Found inverse pmpy idiom: R = (" << PP << "/Q).Q) + " |
1796 | 2 | << PP << "\n"; |
1797 | 2 | dbgs() << " Res:" << *PV.Res << "\n P:" << *PV.P << "\n"; |
1798 | 2 | if (PV.M) |
1799 | 2 | dbgs() << " M:" << *PV.M << "\n"; |
1800 | 2 | dbgs() << " Q:" << *PV.Q << "\n"; |
1801 | 2 | dbgs() << " Iteration count:" << PV.IterCount << "\n"; |
1802 | 2 | }); |
1803 | 2 | |
1804 | 2 | BasicBlock::iterator At(EntryB->getTerminator()); |
1805 | 2 | Value *PM = generate(At, PV); |
1806 | 2 | if (PM == nullptr) |
1807 | 0 | return false; |
1808 | 2 | |
1809 | 2 | if (2 PM->getType() != PV.Res->getType()2 ) |
1810 | 1 | PM = IRBuilder<>(&*At).CreateIntCast(PM, PV.Res->getType(), false); |
1811 | 7 | |
1812 | 7 | PV.Res->replaceAllUsesWith(PM); |
1813 | 7 | PV.Res->eraseFromParent(); |
1814 | 7 | return true; |
1815 | 7 | } |
1816 | | |
1817 | 6 | unsigned HexagonLoopIdiomRecognize::getStoreSizeInBytes(StoreInst *SI) { |
1818 | 6 | uint64_t SizeInBits = DL->getTypeSizeInBits(SI->getValueOperand()->getType()); |
1819 | 6 | assert(((SizeInBits & 7) || (SizeInBits >> 32) == 0) && |
1820 | 6 | "Don't overflow unsigned."); |
1821 | 6 | return (unsigned)SizeInBits >> 3; |
1822 | 6 | } |
1823 | | |
1824 | 9 | int HexagonLoopIdiomRecognize::getSCEVStride(const SCEVAddRecExpr *S) { |
1825 | 9 | if (const SCEVConstant *SC = dyn_cast<SCEVConstant>(S->getOperand(1))) |
1826 | 9 | return SC->getAPInt().getSExtValue(); |
1827 | 0 | return 0; |
1828 | 0 | } |
1829 | | |
1830 | 3 | bool HexagonLoopIdiomRecognize::isLegalStore(Loop *CurLoop, StoreInst *SI) { |
1831 | 3 | // Allow volatile stores if HexagonVolatileMemcpy is enabled. |
1832 | 3 | if (!(SI->isVolatile() && 3 HexagonVolatileMemcpy0 ) && !SI->isSimple()3 ) |
1833 | 0 | return false; |
1834 | 3 | |
1835 | 3 | Value *StoredVal = SI->getValueOperand(); |
1836 | 3 | Value *StorePtr = SI->getPointerOperand(); |
1837 | 3 | |
1838 | 3 | // Reject stores that are so large that they overflow an unsigned. |
1839 | 3 | uint64_t SizeInBits = DL->getTypeSizeInBits(StoredVal->getType()); |
1840 | 3 | if ((SizeInBits & 7) || 3 (SizeInBits >> 32) != 03 ) |
1841 | 0 | return false; |
1842 | 3 | |
1843 | 3 | // See if the pointer expression is an AddRec like {base,+,1} on the current |
1844 | 3 | // loop, which indicates a strided store. If we have something else, it's a |
1845 | 3 | // random store we can't handle. |
1846 | 3 | auto *StoreEv = dyn_cast<SCEVAddRecExpr>(SE->getSCEV(StorePtr)); |
1847 | 3 | if (!StoreEv || 3 StoreEv->getLoop() != CurLoop3 || !StoreEv->isAffine()3 ) |
1848 | 0 | return false; |
1849 | 3 | |
1850 | 3 | // Check to see if the stride matches the size of the store. If so, then we |
1851 | 3 | // know that every byte is touched in the loop. |
1852 | 3 | int Stride = getSCEVStride(StoreEv); |
1853 | 3 | if (Stride == 0) |
1854 | 0 | return false; |
1855 | 3 | unsigned StoreSize = getStoreSizeInBytes(SI); |
1856 | 3 | if (StoreSize != unsigned(std::abs(Stride))) |
1857 | 0 | return false; |
1858 | 3 | |
1859 | 3 | // The store must be feeding a non-volatile load. |
1860 | 3 | LoadInst *LI = dyn_cast<LoadInst>(SI->getValueOperand()); |
1861 | 3 | if (!LI || 3 !LI->isSimple()3 ) |
1862 | 0 | return false; |
1863 | 3 | |
1864 | 3 | // See if the pointer expression is an AddRec like {base,+,1} on the current |
1865 | 3 | // loop, which indicates a strided load. If we have something else, it's a |
1866 | 3 | // random load we can't handle. |
1867 | 3 | Value *LoadPtr = LI->getPointerOperand(); |
1868 | 3 | auto *LoadEv = dyn_cast<SCEVAddRecExpr>(SE->getSCEV(LoadPtr)); |
1869 | 3 | if (!LoadEv || 3 LoadEv->getLoop() != CurLoop3 || !LoadEv->isAffine()3 ) |
1870 | 0 | return false; |
1871 | 3 | |
1872 | 3 | // The store and load must share the same stride. |
1873 | 3 | if (3 StoreEv->getOperand(1) != LoadEv->getOperand(1)3 ) |
1874 | 0 | return false; |
1875 | 3 | |
1876 | 3 | // Success. This store can be converted into a memcpy. |
1877 | 3 | return true; |
1878 | 3 | } |
1879 | | |
1880 | | /// mayLoopAccessLocation - Return true if the specified loop might access the |
1881 | | /// specified pointer location, which is a loop-strided access. The 'Access' |
1882 | | /// argument specifies what the verboten forms of access are (read or write). |
1883 | | static bool |
1884 | | mayLoopAccessLocation(Value *Ptr, ModRefInfo Access, Loop *L, |
1885 | | const SCEV *BECount, unsigned StoreSize, |
1886 | | AliasAnalysis &AA, |
1887 | 9 | SmallPtrSetImpl<Instruction *> &Ignored) { |
1888 | 9 | // Get the location that may be stored across the loop. Since the access |
1889 | 9 | // is strided positively through memory, we say that the modified location |
1890 | 9 | // starts at the pointer and has infinite size. |
1891 | 9 | uint64_t AccessSize = MemoryLocation::UnknownSize; |
1892 | 9 | |
1893 | 9 | // If the loop iterates a fixed number of times, we can refine the access |
1894 | 9 | // size to be exactly the size of the memset, which is (BECount+1)*StoreSize |
1895 | 9 | if (const SCEVConstant *BECst = dyn_cast<SCEVConstant>(BECount)) |
1896 | 0 | AccessSize = (BECst->getValue()->getZExtValue() + 1) * StoreSize; |
1897 | 9 | |
1898 | 9 | // TODO: For this to be really effective, we have to dive into the pointer |
1899 | 9 | // operand in the store. Store to &A[i] of 100 will always return may alias |
1900 | 9 | // with store of &A[100], we need to StoreLoc to be "A" with size of 100, |
1901 | 9 | // which will then no-alias a store to &A[100]. |
1902 | 9 | MemoryLocation StoreLoc(Ptr, AccessSize); |
1903 | 9 | |
1904 | 9 | for (auto *B : L->blocks()) |
1905 | 11 | for (auto &I : *B) |
1906 | 64 | if (64 Ignored.count(&I) == 0 && 64 (AA.getModRefInfo(&I, StoreLoc) & Access)55 ) |
1907 | 3 | return true; |
1908 | 6 | |
1909 | 6 | return false; |
1910 | 6 | } |
1911 | | |
1912 | | void HexagonLoopIdiomRecognize::collectStores(Loop *CurLoop, BasicBlock *BB, |
1913 | 5 | SmallVectorImpl<StoreInst*> &Stores) { |
1914 | 5 | Stores.clear(); |
1915 | 5 | for (Instruction &I : *BB) |
1916 | 100 | if (StoreInst *100 SI100 = dyn_cast<StoreInst>(&I)) |
1917 | 3 | if (3 isLegalStore(CurLoop, SI)3 ) |
1918 | 3 | Stores.push_back(SI); |
1919 | 5 | } |
1920 | | |
1921 | | bool HexagonLoopIdiomRecognize::processCopyingStore(Loop *CurLoop, |
1922 | 3 | StoreInst *SI, const SCEV *BECount) { |
1923 | 3 | assert((SI->isSimple() || (SI->isVolatile() && HexagonVolatileMemcpy)) && |
1924 | 3 | "Expected only non-volatile stores, or Hexagon-specific memcpy" |
1925 | 3 | "to volatile destination."); |
1926 | 3 | |
1927 | 3 | Value *StorePtr = SI->getPointerOperand(); |
1928 | 3 | auto *StoreEv = cast<SCEVAddRecExpr>(SE->getSCEV(StorePtr)); |
1929 | 3 | unsigned Stride = getSCEVStride(StoreEv); |
1930 | 3 | unsigned StoreSize = getStoreSizeInBytes(SI); |
1931 | 3 | if (Stride != StoreSize) |
1932 | 0 | return false; |
1933 | 3 | |
1934 | 3 | // See if the pointer expression is an AddRec like {base,+,1} on the current |
1935 | 3 | // loop, which indicates a strided load. If we have something else, it's a |
1936 | 3 | // random load we can't handle. |
1937 | 3 | LoadInst *LI = dyn_cast<LoadInst>(SI->getValueOperand()); |
1938 | 3 | auto *LoadEv = cast<SCEVAddRecExpr>(SE->getSCEV(LI->getPointerOperand())); |
1939 | 3 | |
1940 | 3 | // The trip count of the loop and the base pointer of the addrec SCEV is |
1941 | 3 | // guaranteed to be loop invariant, which means that it should dominate the |
1942 | 3 | // header. This allows us to insert code for it in the preheader. |
1943 | 3 | BasicBlock *Preheader = CurLoop->getLoopPreheader(); |
1944 | 3 | Instruction *ExpPt = Preheader->getTerminator(); |
1945 | 3 | IRBuilder<> Builder(ExpPt); |
1946 | 3 | SCEVExpander Expander(*SE, *DL, "hexagon-loop-idiom"); |
1947 | 3 | |
1948 | 3 | Type *IntPtrTy = Builder.getIntPtrTy(*DL, SI->getPointerAddressSpace()); |
1949 | 3 | |
1950 | 3 | // Okay, we have a strided store "p[i]" of a loaded value. We can turn |
1951 | 3 | // this into a memcpy/memmove in the loop preheader now if we want. However, |
1952 | 3 | // this would be unsafe to do if there is anything else in the loop that may |
1953 | 3 | // read or write the memory region we're storing to. For memcpy, this |
1954 | 3 | // includes the load that feeds the stores. Check for an alias by generating |
1955 | 3 | // the base address and checking everything. |
1956 | 3 | Value *StoreBasePtr = Expander.expandCodeFor(StoreEv->getStart(), |
1957 | 3 | Builder.getInt8PtrTy(SI->getPointerAddressSpace()), ExpPt); |
1958 | 3 | Value *LoadBasePtr = nullptr; |
1959 | 3 | |
1960 | 3 | bool Overlap = false; |
1961 | 3 | bool DestVolatile = SI->isVolatile(); |
1962 | 3 | Type *BECountTy = BECount->getType(); |
1963 | 3 | |
1964 | 3 | if (DestVolatile3 ) { |
1965 | 0 | // The trip count must fit in i32, since it is the type of the "num_words" |
1966 | 0 | // argument to hexagon_memcpy_forward_vp4cp4n2. |
1967 | 0 | if (StoreSize != 4 || 0 DL->getTypeSizeInBits(BECountTy) > 320 ) { |
1968 | 0 | CleanupAndExit: |
1969 | 0 | // If we generated new code for the base pointer, clean up. |
1970 | 0 | Expander.clear(); |
1971 | 0 | if (StoreBasePtr && 0 (LoadBasePtr != StoreBasePtr)0 ) { |
1972 | 0 | RecursivelyDeleteTriviallyDeadInstructions(StoreBasePtr, TLI); |
1973 | 0 | StoreBasePtr = nullptr; |
1974 | 0 | } |
1975 | 0 | if (LoadBasePtr0 ) { |
1976 | 0 | RecursivelyDeleteTriviallyDeadInstructions(LoadBasePtr, TLI); |
1977 | 0 | LoadBasePtr = nullptr; |
1978 | 0 | } |
1979 | 0 | return false; |
1980 | 3 | } |
1981 | 0 | } |
1982 | 3 | |
1983 | 3 | SmallPtrSet<Instruction*, 2> Ignore1; |
1984 | 3 | Ignore1.insert(SI); |
1985 | 3 | if (mayLoopAccessLocation(StoreBasePtr, MRI_ModRef, CurLoop, BECount, |
1986 | 3 | StoreSize, *AA, Ignore1)) { |
1987 | 3 | // Check if the load is the offending instruction. |
1988 | 3 | Ignore1.insert(LI); |
1989 | 3 | if (mayLoopAccessLocation(StoreBasePtr, MRI_ModRef, CurLoop, BECount, |
1990 | 3 | StoreSize, *AA, Ignore1)) { |
1991 | 0 | // Still bad. Nothing we can do. |
1992 | 0 | goto CleanupAndExit; |
1993 | 0 | } |
1994 | 3 | // It worked with the load ignored. |
1995 | 3 | Overlap = true; |
1996 | 3 | } |
1997 | 3 | |
1998 | 3 | if (3 !Overlap3 ) { |
1999 | 0 | if (DisableMemcpyIdiom || 0 !HasMemcpy0 ) |
2000 | 0 | goto CleanupAndExit; |
2001 | 3 | } else { |
2002 | 3 | // Don't generate memmove if this function will be inlined. This is |
2003 | 3 | // because the caller will undergo this transformation after inlining. |
2004 | 3 | Function *Func = CurLoop->getHeader()->getParent(); |
2005 | 3 | if (Func->hasFnAttribute(Attribute::AlwaysInline)) |
2006 | 0 | goto CleanupAndExit; |
2007 | 3 | |
2008 | 3 | // In case of a memmove, the call to memmove will be executed instead |
2009 | 3 | // of the loop, so we need to make sure that there is nothing else in |
2010 | 3 | // the loop than the load, store and instructions that these two depend |
2011 | 3 | // on. |
2012 | 3 | SmallVector<Instruction*,2> Insts; |
2013 | 3 | Insts.push_back(SI); |
2014 | 3 | Insts.push_back(LI); |
2015 | 3 | if (!coverLoop(CurLoop, Insts)) |
2016 | 0 | goto CleanupAndExit; |
2017 | 3 | |
2018 | 3 | if (3 DisableMemmoveIdiom || 3 !HasMemmove3 ) |
2019 | 0 | goto CleanupAndExit; |
2020 | 3 | bool IsNested = CurLoop->getParentLoop() != nullptr; |
2021 | 3 | if (IsNested && 3 OnlyNonNestedMemmove0 ) |
2022 | 0 | goto CleanupAndExit; |
2023 | 3 | } |
2024 | 3 | |
2025 | 3 | // For a memcpy, we have to make sure that the input array is not being |
2026 | 3 | // mutated by the loop. |
2027 | 3 | LoadBasePtr = Expander.expandCodeFor(LoadEv->getStart(), |
2028 | 3 | Builder.getInt8PtrTy(LI->getPointerAddressSpace()), ExpPt); |
2029 | 3 | |
2030 | 3 | SmallPtrSet<Instruction*, 2> Ignore2; |
2031 | 3 | Ignore2.insert(SI); |
2032 | 3 | if (mayLoopAccessLocation(LoadBasePtr, MRI_Mod, CurLoop, BECount, StoreSize, |
2033 | 3 | *AA, Ignore2)) |
2034 | 0 | goto CleanupAndExit; |
2035 | 3 | |
2036 | 3 | // Check the stride. |
2037 | 3 | bool StridePos = getSCEVStride(LoadEv) >= 0; |
2038 | 3 | |
2039 | 3 | // Currently, the volatile memcpy only emulates traversing memory forward. |
2040 | 3 | if (!StridePos && 3 DestVolatile0 ) |
2041 | 0 | goto CleanupAndExit; |
2042 | 3 | |
2043 | 3 | bool RuntimeCheck = (Overlap || 3 DestVolatile0 ); |
2044 | 3 | |
2045 | 3 | BasicBlock *ExitB; |
2046 | 3 | if (RuntimeCheck3 ) { |
2047 | 3 | // The runtime check needs a single exit block. |
2048 | 3 | SmallVector<BasicBlock*, 8> ExitBlocks; |
2049 | 3 | CurLoop->getUniqueExitBlocks(ExitBlocks); |
2050 | 3 | if (ExitBlocks.size() != 1) |
2051 | 0 | goto CleanupAndExit; |
2052 | 3 | ExitB = ExitBlocks[0]; |
2053 | 3 | } |
2054 | 3 | |
2055 | 3 | // The # stored bytes is (BECount+1)*Size. Expand the trip count out to |
2056 | 3 | // pointer size if it isn't already. |
2057 | 3 | LLVMContext &Ctx = SI->getContext(); |
2058 | 3 | BECount = SE->getTruncateOrZeroExtend(BECount, IntPtrTy); |
2059 | 3 | unsigned Alignment = std::min(SI->getAlignment(), LI->getAlignment()); |
2060 | 3 | DebugLoc DLoc = SI->getDebugLoc(); |
2061 | 3 | |
2062 | 3 | const SCEV *NumBytesS = |
2063 | 3 | SE->getAddExpr(BECount, SE->getOne(IntPtrTy), SCEV::FlagNUW); |
2064 | 3 | if (StoreSize != 1) |
2065 | 3 | NumBytesS = SE->getMulExpr(NumBytesS, SE->getConstant(IntPtrTy, StoreSize), |
2066 | 3 | SCEV::FlagNUW); |
2067 | 3 | Value *NumBytes = Expander.expandCodeFor(NumBytesS, IntPtrTy, ExpPt); |
2068 | 3 | if (Instruction *In = dyn_cast<Instruction>(NumBytes)) |
2069 | 3 | if (Value *3 Simp3 = SimplifyInstruction(In, {*DL, TLI, DT})) |
2070 | 0 | NumBytes = Simp; |
2071 | 3 | |
2072 | 3 | CallInst *NewCall; |
2073 | 3 | |
2074 | 3 | if (RuntimeCheck3 ) { |
2075 | 3 | unsigned Threshold = RuntimeMemSizeThreshold; |
2076 | 3 | if (ConstantInt *CI3 = dyn_cast<ConstantInt>(NumBytes)) { |
2077 | 0 | uint64_t C = CI->getZExtValue(); |
2078 | 0 | if (Threshold != 0 && 0 C < Threshold0 ) |
2079 | 0 | goto CleanupAndExit; |
2080 | 0 | if (0 C < CompileTimeMemSizeThreshold0 ) |
2081 | 0 | goto CleanupAndExit; |
2082 | 3 | } |
2083 | 3 | |
2084 | 3 | BasicBlock *Header = CurLoop->getHeader(); |
2085 | 3 | Function *Func = Header->getParent(); |
2086 | 3 | Loop *ParentL = LF->getLoopFor(Preheader); |
2087 | 3 | StringRef HeaderName = Header->getName(); |
2088 | 3 | |
2089 | 3 | // Create a new (empty) preheader, and update the PHI nodes in the |
2090 | 3 | // header to use the new preheader. |
2091 | 3 | BasicBlock *NewPreheader = BasicBlock::Create(Ctx, HeaderName+".rtli.ph", |
2092 | 3 | Func, Header); |
2093 | 3 | if (ParentL) |
2094 | 0 | ParentL->addBasicBlockToLoop(NewPreheader, *LF); |
2095 | 3 | IRBuilder<>(NewPreheader).CreateBr(Header); |
2096 | 8 | for (auto &In : *Header) { |
2097 | 8 | PHINode *PN = dyn_cast<PHINode>(&In); |
2098 | 8 | if (!PN) |
2099 | 3 | break; |
2100 | 5 | int bx = PN->getBasicBlockIndex(Preheader); |
2101 | 5 | if (bx >= 0) |
2102 | 5 | PN->setIncomingBlock(bx, NewPreheader); |
2103 | 8 | } |
2104 | 3 | DT->addNewBlock(NewPreheader, Preheader); |
2105 | 3 | DT->changeImmediateDominator(Header, NewPreheader); |
2106 | 3 | |
2107 | 3 | // Check for safe conditions to execute memmove. |
2108 | 3 | // If stride is positive, copying things from higher to lower addresses |
2109 | 3 | // is equivalent to memmove. For negative stride, it's the other way |
2110 | 3 | // around. Copying forward in memory with positive stride may not be |
2111 | 3 | // same as memmove since we may be copying values that we just stored |
2112 | 3 | // in some previous iteration. |
2113 | 3 | Value *LA = Builder.CreatePtrToInt(LoadBasePtr, IntPtrTy); |
2114 | 3 | Value *SA = Builder.CreatePtrToInt(StoreBasePtr, IntPtrTy); |
2115 | 3 | Value *LowA = StridePos ? SA3 : LA0 ; |
2116 | 3 | Value *HighA = StridePos ? LA3 : SA0 ; |
2117 | 3 | Value *CmpA = Builder.CreateICmpULT(LowA, HighA); |
2118 | 3 | Value *Cond = CmpA; |
2119 | 3 | |
2120 | 3 | // Check for distance between pointers. Since the case LowA < HighA |
2121 | 3 | // is checked for above, assume LowA >= HighA. |
2122 | 3 | Value *Dist = Builder.CreateSub(LowA, HighA); |
2123 | 3 | Value *CmpD = Builder.CreateICmpSLE(NumBytes, Dist); |
2124 | 3 | Value *CmpEither = Builder.CreateOr(Cond, CmpD); |
2125 | 3 | Cond = CmpEither; |
2126 | 3 | |
2127 | 3 | if (Threshold != 03 ) { |
2128 | 0 | Type *Ty = NumBytes->getType(); |
2129 | 0 | Value *Thr = ConstantInt::get(Ty, Threshold); |
2130 | 0 | Value *CmpB = Builder.CreateICmpULT(Thr, NumBytes); |
2131 | 0 | Value *CmpBoth = Builder.CreateAnd(Cond, CmpB); |
2132 | 0 | Cond = CmpBoth; |
2133 | 0 | } |
2134 | 3 | BasicBlock *MemmoveB = BasicBlock::Create(Ctx, Header->getName()+".rtli", |
2135 | 3 | Func, NewPreheader); |
2136 | 3 | if (ParentL) |
2137 | 0 | ParentL->addBasicBlockToLoop(MemmoveB, *LF); |
2138 | 3 | Instruction *OldT = Preheader->getTerminator(); |
2139 | 3 | Builder.CreateCondBr(Cond, MemmoveB, NewPreheader); |
2140 | 3 | OldT->eraseFromParent(); |
2141 | 3 | Preheader->setName(Preheader->getName()+".old"); |
2142 | 3 | DT->addNewBlock(MemmoveB, Preheader); |
2143 | 3 | // Find the new immediate dominator of the exit block. |
2144 | 3 | BasicBlock *ExitD = Preheader; |
2145 | 6 | for (auto PI = pred_begin(ExitB), PE = pred_end(ExitB); PI != PE6 ; ++PI3 ) { |
2146 | 3 | BasicBlock *PB = *PI; |
2147 | 3 | ExitD = DT->findNearestCommonDominator(ExitD, PB); |
2148 | 3 | if (!ExitD) |
2149 | 0 | break; |
2150 | 3 | } |
2151 | 3 | // If the prior immediate dominator of ExitB was dominated by the |
2152 | 3 | // old preheader, then the old preheader becomes the new immediate |
2153 | 3 | // dominator. Otherwise don't change anything (because the newly |
2154 | 3 | // added blocks are dominated by the old preheader). |
2155 | 3 | if (ExitD && 3 DT->dominates(Preheader, ExitD)3 ) { |
2156 | 3 | DomTreeNode *BN = DT->getNode(ExitB); |
2157 | 3 | DomTreeNode *DN = DT->getNode(ExitD); |
2158 | 3 | BN->setIDom(DN); |
2159 | 3 | } |
2160 | 3 | |
2161 | 3 | // Add a call to memmove to the conditional block. |
2162 | 3 | IRBuilder<> CondBuilder(MemmoveB); |
2163 | 3 | CondBuilder.CreateBr(ExitB); |
2164 | 3 | CondBuilder.SetInsertPoint(MemmoveB->getTerminator()); |
2165 | 3 | |
2166 | 3 | if (DestVolatile3 ) { |
2167 | 0 | Type *Int32Ty = Type::getInt32Ty(Ctx); |
2168 | 0 | Type *Int32PtrTy = Type::getInt32PtrTy(Ctx); |
2169 | 0 | Type *VoidTy = Type::getVoidTy(Ctx); |
2170 | 0 | Module *M = Func->getParent(); |
2171 | 0 | Constant *CF = M->getOrInsertFunction(HexagonVolatileMemcpyName, VoidTy, |
2172 | 0 | Int32PtrTy, Int32PtrTy, Int32Ty); |
2173 | 0 | Function *Fn = cast<Function>(CF); |
2174 | 0 | Fn->setLinkage(Function::ExternalLinkage); |
2175 | 0 |
|
2176 | 0 | const SCEV *OneS = SE->getConstant(Int32Ty, 1); |
2177 | 0 | const SCEV *BECount32 = SE->getTruncateOrZeroExtend(BECount, Int32Ty); |
2178 | 0 | const SCEV *NumWordsS = SE->getAddExpr(BECount32, OneS, SCEV::FlagNUW); |
2179 | 0 | Value *NumWords = Expander.expandCodeFor(NumWordsS, Int32Ty, |
2180 | 0 | MemmoveB->getTerminator()); |
2181 | 0 | if (Instruction *In = dyn_cast<Instruction>(NumWords)) |
2182 | 0 | if (Value *0 Simp0 = SimplifyInstruction(In, {*DL, TLI, DT})) |
2183 | 0 | NumWords = Simp; |
2184 | 0 |
|
2185 | 0 | Value *Op0 = (StoreBasePtr->getType() == Int32PtrTy) |
2186 | 0 | ? StoreBasePtr |
2187 | 0 | : CondBuilder.CreateBitCast(StoreBasePtr, Int32PtrTy); |
2188 | 0 | Value *Op1 = (LoadBasePtr->getType() == Int32PtrTy) |
2189 | 0 | ? LoadBasePtr |
2190 | 0 | : CondBuilder.CreateBitCast(LoadBasePtr, Int32PtrTy); |
2191 | 0 | NewCall = CondBuilder.CreateCall(Fn, {Op0, Op1, NumWords}); |
2192 | 3 | } else { |
2193 | 3 | NewCall = CondBuilder.CreateMemMove(StoreBasePtr, LoadBasePtr, |
2194 | 3 | NumBytes, Alignment); |
2195 | 3 | } |
2196 | 0 | } else { |
2197 | 0 | NewCall = Builder.CreateMemCpy(StoreBasePtr, LoadBasePtr, |
2198 | 0 | NumBytes, Alignment); |
2199 | 0 | // Okay, the memcpy has been formed. Zap the original store and |
2200 | 0 | // anything that feeds into it. |
2201 | 0 | RecursivelyDeleteTriviallyDeadInstructions(SI, TLI); |
2202 | 0 | } |
2203 | 3 | |
2204 | 3 | NewCall->setDebugLoc(DLoc); |
2205 | 3 | |
2206 | 3 | DEBUG(dbgs() << " Formed " << (Overlap ? "memmove: " : "memcpy: ") |
2207 | 3 | << *NewCall << "\n" |
2208 | 3 | << " from load ptr=" << *LoadEv << " at: " << *LI << "\n" |
2209 | 3 | << " from store ptr=" << *StoreEv << " at: " << *SI << "\n"); |
2210 | 3 | |
2211 | 3 | return true; |
2212 | 3 | } |
2213 | | |
2214 | | // \brief Check if the instructions in Insts, together with their dependencies |
2215 | | // cover the loop in the sense that the loop could be safely eliminated once |
2216 | | // the instructions in Insts are removed. |
2217 | | bool HexagonLoopIdiomRecognize::coverLoop(Loop *L, |
2218 | 3 | SmallVectorImpl<Instruction*> &Insts) const { |
2219 | 3 | SmallSet<BasicBlock*,8> LoopBlocks; |
2220 | 3 | for (auto *B : L->blocks()) |
2221 | 4 | LoopBlocks.insert(B); |
2222 | 3 | |
2223 | 3 | SetVector<Instruction*> Worklist(Insts.begin(), Insts.end()); |
2224 | 3 | |
2225 | 3 | // Collect all instructions from the loop that the instructions in Insts |
2226 | 3 | // depend on (plus their dependencies, etc.). These instructions will |
2227 | 3 | // constitute the expression trees that feed those in Insts, but the trees |
2228 | 3 | // will be limited only to instructions contained in the loop. |
2229 | 21 | for (unsigned i = 0; i < Worklist.size()21 ; ++i18 ) { |
2230 | 18 | Instruction *In = Worklist[i]; |
2231 | 51 | for (auto I = In->op_begin(), E = In->op_end(); I != E51 ; ++I33 ) { |
2232 | 33 | Instruction *OpI = dyn_cast<Instruction>(I); |
2233 | 33 | if (!OpI) |
2234 | 8 | continue; |
2235 | 25 | BasicBlock *PB = OpI->getParent(); |
2236 | 25 | if (!LoopBlocks.count(PB)) |
2237 | 4 | continue; |
2238 | 21 | Worklist.insert(OpI); |
2239 | 21 | } |
2240 | 18 | } |
2241 | 3 | |
2242 | 3 | // Scan all instructions in the loop, if any of them have a user outside |
2243 | 3 | // of the loop, or outside of the expressions collected above, then either |
2244 | 3 | // the loop has a side-effect visible outside of it, or there are |
2245 | 3 | // instructions in it that are not involved in the original set Insts. |
2246 | 4 | for (auto *B : L->blocks()) { |
2247 | 27 | for (auto &In : *B) { |
2248 | 27 | if (isa<BranchInst>(In) || 27 isa<DbgInfoIntrinsic>(In)23 ) |
2249 | 4 | continue; |
2250 | 23 | if (23 !Worklist.count(&In) && 23 In.mayHaveSideEffects()5 ) |
2251 | 0 | return false; |
2252 | 23 | for (const auto &K : In.users()) 23 { |
2253 | 29 | Instruction *UseI = dyn_cast<Instruction>(K); |
2254 | 29 | if (!UseI) |
2255 | 0 | continue; |
2256 | 29 | BasicBlock *UseB = UseI->getParent(); |
2257 | 29 | if (LF->getLoopFor(UseB) != L) |
2258 | 0 | return false; |
2259 | 3 | } |
2260 | 27 | } |
2261 | 4 | } |
2262 | 3 | |
2263 | 3 | return true; |
2264 | 3 | } |
2265 | | |
2266 | | /// runOnLoopBlock - Process the specified block, which lives in a counted loop |
2267 | | /// with the specified backedge count. This block is known to be in the current |
2268 | | /// loop and not in any subloops. |
2269 | | bool HexagonLoopIdiomRecognize::runOnLoopBlock(Loop *CurLoop, BasicBlock *BB, |
2270 | 6 | const SCEV *BECount, SmallVectorImpl<BasicBlock*> &ExitBlocks) { |
2271 | 6 | // We can only promote stores in this block if they are unconditionally |
2272 | 6 | // executed in the loop. For a block to be unconditionally executed, it has |
2273 | 6 | // to dominate all the exit blocks of the loop. Verify this now. |
2274 | 6 | auto DominatedByBB = [this,BB] (BasicBlock *EB) -> bool { |
2275 | 6 | return DT->dominates(BB, EB); |
2276 | 6 | }; |
2277 | 6 | if (!std::all_of(ExitBlocks.begin(), ExitBlocks.end(), DominatedByBB)) |
2278 | 1 | return false; |
2279 | 5 | |
2280 | 5 | bool MadeChange = false; |
2281 | 5 | // Look for store instructions, which may be optimized to memset/memcpy. |
2282 | 5 | SmallVector<StoreInst*,8> Stores; |
2283 | 5 | collectStores(CurLoop, BB, Stores); |
2284 | 5 | |
2285 | 5 | // Optimize the store into a memcpy, if it feeds an similarly strided load. |
2286 | 5 | for (auto &SI : Stores) |
2287 | 3 | MadeChange |= processCopyingStore(CurLoop, SI, BECount); |
2288 | 6 | |
2289 | 6 | return MadeChange; |
2290 | 6 | } |
2291 | | |
2292 | 7 | bool HexagonLoopIdiomRecognize::runOnCountableLoop(Loop *L) { |
2293 | 7 | PolynomialMultiplyRecognize PMR(L, *DL, *DT, *TLI, *SE); |
2294 | 7 | if (PMR.recognize()) |
2295 | 2 | return true; |
2296 | 5 | |
2297 | 5 | if (5 !HasMemcpy && 5 !HasMemmove0 ) |
2298 | 0 | return false; |
2299 | 5 | |
2300 | 5 | const SCEV *BECount = SE->getBackedgeTakenCount(L); |
2301 | 5 | assert(!isa<SCEVCouldNotCompute>(BECount) && |
2302 | 5 | "runOnCountableLoop() called on a loop without a predictable" |
2303 | 5 | "backedge-taken count"); |
2304 | 5 | |
2305 | 5 | SmallVector<BasicBlock *, 8> ExitBlocks; |
2306 | 5 | L->getUniqueExitBlocks(ExitBlocks); |
2307 | 5 | |
2308 | 5 | bool Changed = false; |
2309 | 5 | |
2310 | 5 | // Scan all the blocks in the loop that are not in subloops. |
2311 | 6 | for (auto *BB : L->getBlocks()) { |
2312 | 6 | // Ignore blocks in subloops. |
2313 | 6 | if (LF->getLoopFor(BB) != L) |
2314 | 0 | continue; |
2315 | 6 | Changed |= runOnLoopBlock(L, BB, BECount, ExitBlocks); |
2316 | 6 | } |
2317 | 7 | |
2318 | 7 | return Changed; |
2319 | 7 | } |
2320 | | |
2321 | 8 | bool HexagonLoopIdiomRecognize::runOnLoop(Loop *L, LPPassManager &LPM) { |
2322 | 8 | const Module &M = *L->getHeader()->getParent()->getParent(); |
2323 | 8 | if (Triple(M.getTargetTriple()).getArch() != Triple::hexagon) |
2324 | 0 | return false; |
2325 | 8 | |
2326 | 8 | if (8 skipLoop(L)8 ) |
2327 | 0 | return false; |
2328 | 8 | |
2329 | 8 | // If the loop could not be converted to canonical form, it must have an |
2330 | 8 | // indirectbr in it, just give up. |
2331 | 8 | if (8 !L->getLoopPreheader()8 ) |
2332 | 0 | return false; |
2333 | 8 | |
2334 | 8 | // Disable loop idiom recognition if the function's name is a common idiom. |
2335 | 8 | StringRef Name = L->getHeader()->getParent()->getName(); |
2336 | 8 | if (Name == "memset" || 8 Name == "memcpy"8 || Name == "memmove"8 ) |
2337 | 0 | return false; |
2338 | 8 | |
2339 | 8 | AA = &getAnalysis<AAResultsWrapperPass>().getAAResults(); |
2340 | 8 | DL = &L->getHeader()->getModule()->getDataLayout(); |
2341 | 8 | DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); |
2342 | 8 | LF = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); |
2343 | 8 | TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(); |
2344 | 8 | SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE(); |
2345 | 8 | |
2346 | 8 | HasMemcpy = TLI->has(LibFunc_memcpy); |
2347 | 8 | HasMemmove = TLI->has(LibFunc_memmove); |
2348 | 8 | |
2349 | 8 | if (SE->hasLoopInvariantBackedgeTakenCount(L)) |
2350 | 7 | return runOnCountableLoop(L); |
2351 | 1 | return false; |
2352 | 1 | } |
2353 | | |
2354 | 8 | Pass *llvm::createHexagonLoopIdiomPass() { |
2355 | 8 | return new HexagonLoopIdiomRecognize(); |
2356 | 8 | } |