Coverage Report

Created: 2017-10-03 07:32

/Users/buildslave/jenkins/sharedspace/clang-stage2-coverage-R@2/llvm/lib/Target/Hexagon/HexagonLoopIdiomRecognition.cpp
Line
Count
Source (jump to first uncovered line)
1
//===- HexagonLoopIdiomRecognition.cpp ------------------------------------===//
2
//
3
//                     The LLVM Compiler Infrastructure
4
//
5
// This file is distributed under the University of Illinois Open Source
6
// License. See LICENSE.TXT for details.
7
//
8
//===----------------------------------------------------------------------===//
9
10
#define DEBUG_TYPE "hexagon-lir"
11
12
#include "llvm/ADT/APInt.h"
13
#include "llvm/ADT/DenseMap.h"
14
#include "llvm/ADT/SetVector.h"
15
#include "llvm/ADT/SmallPtrSet.h"
16
#include "llvm/ADT/SmallSet.h"
17
#include "llvm/ADT/SmallVector.h"
18
#include "llvm/ADT/StringRef.h"
19
#include "llvm/ADT/Triple.h"
20
#include "llvm/Analysis/AliasAnalysis.h"
21
#include "llvm/Analysis/InstructionSimplify.h"
22
#include "llvm/Analysis/LoopInfo.h"
23
#include "llvm/Analysis/LoopPass.h"
24
#include "llvm/Analysis/MemoryLocation.h"
25
#include "llvm/Analysis/ScalarEvolution.h"
26
#include "llvm/Analysis/ScalarEvolutionExpander.h"
27
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
28
#include "llvm/Analysis/TargetLibraryInfo.h"
29
#include "llvm/Analysis/ValueTracking.h"
30
#include "llvm/IR/Attributes.h"
31
#include "llvm/IR/BasicBlock.h"
32
#include "llvm/IR/Constant.h"
33
#include "llvm/IR/Constants.h"
34
#include "llvm/IR/DataLayout.h"
35
#include "llvm/IR/DebugLoc.h"
36
#include "llvm/IR/DerivedTypes.h"
37
#include "llvm/IR/Dominators.h"
38
#include "llvm/IR/Function.h"
39
#include "llvm/IR/IRBuilder.h"
40
#include "llvm/IR/InstrTypes.h"
41
#include "llvm/IR/Instruction.h"
42
#include "llvm/IR/Instructions.h"
43
#include "llvm/IR/IntrinsicInst.h"
44
#include "llvm/IR/Intrinsics.h"
45
#include "llvm/IR/Module.h"
46
#include "llvm/IR/PatternMatch.h"
47
#include "llvm/IR/Type.h"
48
#include "llvm/IR/User.h"
49
#include "llvm/IR/Value.h"
50
#include "llvm/Pass.h"
51
#include "llvm/Support/Casting.h"
52
#include "llvm/Support/CommandLine.h"
53
#include "llvm/Support/Compiler.h"
54
#include "llvm/Support/Debug.h"
55
#include "llvm/Support/ErrorHandling.h"
56
#include "llvm/Support/KnownBits.h"
57
#include "llvm/Support/raw_ostream.h"
58
#include "llvm/Transforms/Scalar.h"
59
#include "llvm/Transforms/Utils/Local.h"
60
#include <algorithm>
61
#include <array>
62
#include <cassert>
63
#include <cstdint>
64
#include <cstdlib>
65
#include <deque>
66
#include <functional>
67
#include <iterator>
68
#include <map>
69
#include <set>
70
#include <utility>
71
#include <vector>
72
73
using namespace llvm;
74
75
static cl::opt<bool> DisableMemcpyIdiom("disable-memcpy-idiom",
76
  cl::Hidden, cl::init(false),
77
  cl::desc("Disable generation of memcpy in loop idiom recognition"));
78
79
static cl::opt<bool> DisableMemmoveIdiom("disable-memmove-idiom",
80
  cl::Hidden, cl::init(false),
81
  cl::desc("Disable generation of memmove in loop idiom recognition"));
82
83
static cl::opt<unsigned> RuntimeMemSizeThreshold("runtime-mem-idiom-threshold",
84
  cl::Hidden, cl::init(0), cl::desc("Threshold (in bytes) for the runtime "
85
  "check guarding the memmove."));
86
87
static cl::opt<unsigned> CompileTimeMemSizeThreshold(
88
  "compile-time-mem-idiom-threshold", cl::Hidden, cl::init(64),
89
  cl::desc("Threshold (in bytes) to perform the transformation, if the "
90
    "runtime loop count (mem transfer size) is known at compile-time."));
91
92
static cl::opt<bool> OnlyNonNestedMemmove("only-nonnested-memmove-idiom",
93
  cl::Hidden, cl::init(true),
94
  cl::desc("Only enable generating memmove in non-nested loops"));
95
96
cl::opt<bool> HexagonVolatileMemcpy("disable-hexagon-volatile-memcpy",
97
  cl::Hidden, cl::init(false),
98
  cl::desc("Enable Hexagon-specific memcpy for volatile destination."));
99
100
static cl::opt<unsigned> SimplifyLimit("hlir-simplify-limit", cl::init(10000),
101
  cl::Hidden, cl::desc("Maximum number of simplification steps in HLIR"));
102
103
static const char *HexagonVolatileMemcpyName
104
  = "hexagon_memcpy_forward_vp4cp4n2";
105
106
107
namespace llvm {
108
109
  void initializeHexagonLoopIdiomRecognizePass(PassRegistry&);
110
  Pass *createHexagonLoopIdiomPass();
111
112
} // end namespace llvm
113
114
namespace {
115
116
  class HexagonLoopIdiomRecognize : public LoopPass {
117
  public:
118
    static char ID;
119
120
14
    explicit HexagonLoopIdiomRecognize() : LoopPass(ID) {
121
14
      initializeHexagonLoopIdiomRecognizePass(*PassRegistry::getPassRegistry());
122
14
    }
123
124
0
    StringRef getPassName() const override {
125
0
      return "Recognize Hexagon-specific loop idioms";
126
0
    }
127
128
14
   void getAnalysisUsage(AnalysisUsage &AU) const override {
129
14
      AU.addRequired<LoopInfoWrapperPass>();
130
14
      AU.addRequiredID(LoopSimplifyID);
131
14
      AU.addRequiredID(LCSSAID);
132
14
      AU.addRequired<AAResultsWrapperPass>();
133
14
      AU.addPreserved<AAResultsWrapperPass>();
134
14
      AU.addRequired<ScalarEvolutionWrapperPass>();
135
14
      AU.addRequired<DominatorTreeWrapperPass>();
136
14
      AU.addRequired<TargetLibraryInfoWrapperPass>();
137
14
      AU.addPreserved<TargetLibraryInfoWrapperPass>();
138
14
    }
139
140
    bool runOnLoop(Loop *L, LPPassManager &LPM) override;
141
142
  private:
143
    unsigned getStoreSizeInBytes(StoreInst *SI);
144
    int getSCEVStride(const SCEVAddRecExpr *StoreEv);
145
    bool isLegalStore(Loop *CurLoop, StoreInst *SI);
146
    void collectStores(Loop *CurLoop, BasicBlock *BB,
147
        SmallVectorImpl<StoreInst*> &Stores);
148
    bool processCopyingStore(Loop *CurLoop, StoreInst *SI, const SCEV *BECount);
149
    bool coverLoop(Loop *L, SmallVectorImpl<Instruction*> &Insts) const;
150
    bool runOnLoopBlock(Loop *CurLoop, BasicBlock *BB, const SCEV *BECount,
151
        SmallVectorImpl<BasicBlock*> &ExitBlocks);
152
    bool runOnCountableLoop(Loop *L);
153
154
    AliasAnalysis *AA;
155
    const DataLayout *DL;
156
    DominatorTree *DT;
157
    LoopInfo *LF;
158
    const TargetLibraryInfo *TLI;
159
    ScalarEvolution *SE;
160
    bool HasMemcpy, HasMemmove;
161
  };
162
163
  struct Simplifier {
164
    using Rule = std::function<Value * (Instruction *, LLVMContext &)>;
165
166
42
    void addRule(const Rule &R) { Rules.push_back(R); }
167
168
  private:
169
    struct WorkListType {
170
5.22k
      WorkListType() = default;
171
172
2.23M
      void push_back(Value* V) {
173
2.23M
        // Do not push back duplicates.
174
2.23M
        if (
!S.count(V)2.23M
)
{ Q.push_back(V); S.insert(V); }1.17M
175
2.23M
      }
176
177
1.11M
      Value *pop_front_val() {
178
1.11M
        Value *V = Q.front(); Q.pop_front(); S.erase(V);
179
1.11M
        return V;
180
1.11M
      }
181
182
1.11M
      bool empty() const { return Q.empty(); }
183
184
    private:
185
      std::deque<Value*> Q;
186
      std::set<Value*> S;
187
    };
188
189
    using ValueSetType = std::set<Value *>;
190
191
    std::vector<Rule> Rules;
192
193
  public:
194
    struct Context {
195
      using ValueMapType = DenseMap<Value *, Value *>;
196
197
      Value *Root;
198
      ValueSetType Used;    // The set of all cloned values used by Root.
199
      ValueSetType Clones;  // The set of all cloned values.
200
      LLVMContext &Ctx;
201
202
      Context(Instruction *Exp)
203
19
        : Ctx(Exp->getParent()->getParent()->getContext()) {
204
19
        initialize(Exp);
205
19
      }
206
207
19
      ~Context() { cleanup(); }
208
209
      void print(raw_ostream &OS, const Value *V) const;
210
      Value *materialize(BasicBlock *B, BasicBlock::iterator At);
211
212
    private:
213
      friend struct Simplifier;
214
215
      void initialize(Instruction *Exp);
216
      void cleanup();
217
218
      template <typename FuncT> void traverse(Value *V, FuncT F);
219
      void record(Value *V);
220
      void use(Value *V);
221
      void unuse(Value *V);
222
223
      bool equal(const Instruction *I, const Instruction *J) const;
224
      Value *find(Value *Tree, Value *Sub) const;
225
      Value *subst(Value *Tree, Value *OldV, Value *NewV);
226
      void replace(Value *OldV, Value *NewV);
227
      void link(Instruction *I, BasicBlock *B, BasicBlock::iterator At);
228
    };
229
230
    Value *simplify(Context &C);
231
  };
232
233
  struct PE {
234
0
    PE(const Simplifier::Context &c, Value *v = nullptr) : C(c), V(v) {}
235
236
    const Simplifier::Context &C;
237
    const Value *V;
238
  };
239
240
  raw_ostream &operator<< (raw_ostream &OS, const PE &P) LLVM_ATTRIBUTE_USED;
241
0
  raw_ostream &operator<< (raw_ostream &OS, const PE &P) {
242
0
    P.C.print(OS, P.V ? 
P.V0
:
P.C.Root0
);
243
0
    return OS;
244
0
  }
245
246
} // end anonymous namespace
247
248
char HexagonLoopIdiomRecognize::ID = 0;
249
250
90.0k
INITIALIZE_PASS_BEGIN90.0k
(HexagonLoopIdiomRecognize, "hexagon-loop-idiom",
251
90.0k
    "Recognize Hexagon-specific loop idioms", false, false)
252
90.0k
INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
253
90.0k
INITIALIZE_PASS_DEPENDENCY(LoopSimplify)
254
90.0k
INITIALIZE_PASS_DEPENDENCY(LCSSAWrapperPass)
255
90.0k
INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
256
90.0k
INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
257
90.0k
INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
258
90.0k
INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
259
90.0k
INITIALIZE_PASS_END(HexagonLoopIdiomRecognize, "hexagon-loop-idiom",
260
    "Recognize Hexagon-specific loop idioms", false, false)
261
262
template <typename FuncT>
263
1.78k
void Simplifier::Context::traverse(Value *V, FuncT F) {
264
1.78k
  WorkListType Q;
265
1.78k
  Q.push_back(V);
266
1.78k
267
207k
  while (
!Q.empty()207k
) {
268
205k
    Instruction *U = dyn_cast<Instruction>(Q.pop_front_val());
269
205k
    if (
!U || 205k
U->getParent()176k
)
270
31.6k
      continue;
271
174k
    
if (174k
!F(U)174k
)
272
1.02k
      continue;
273
173k
    for (Value *Op : U->operands())
274
390k
      Q.push_back(Op);
275
205k
  }
276
1.78k
}
HexagonLoopIdiomRecognition.cpp:void (anonymous namespace)::Simplifier::Context::traverse<(anonymous namespace)::Simplifier::Context::record(llvm::Value*)::$_0>(llvm::Value*, (anonymous namespace)::Simplifier::Context::record(llvm::Value*)::$_0)
Line
Count
Source
263
470
void Simplifier::Context::traverse(Value *V, FuncT F) {
264
470
  WorkListType Q;
265
470
  Q.push_back(V);
266
470
267
16.7k
  while (
!Q.empty()16.7k
) {
268
16.2k
    Instruction *U = dyn_cast<Instruction>(Q.pop_front_val());
269
16.2k
    if (
!U || 16.2k
U->getParent()11.3k
)
270
5.55k
      continue;
271
10.7k
    
if (10.7k
!F(U)10.7k
)
272
0
      continue;
273
10.7k
    for (Value *Op : U->operands())
274
21.0k
      Q.push_back(Op);
275
16.2k
  }
276
470
}
HexagonLoopIdiomRecognition.cpp:void (anonymous namespace)::Simplifier::Context::traverse<(anonymous namespace)::Simplifier::Context::use(llvm::Value*)::$_1>(llvm::Value*, (anonymous namespace)::Simplifier::Context::use(llvm::Value*)::$_1)
Line
Count
Source
263
470
void Simplifier::Context::traverse(Value *V, FuncT F) {
264
470
  WorkListType Q;
265
470
  Q.push_back(V);
266
470
267
188k
  while (
!Q.empty()188k
) {
268
187k
    Instruction *U = dyn_cast<Instruction>(Q.pop_front_val());
269
187k
    if (
!U || 187k
U->getParent()164k
)
270
25.8k
      continue;
271
162k
    
if (162k
!F(U)162k
)
272
0
      continue;
273
162k
    for (Value *Op : U->operands())
274
368k
      Q.push_back(Op);
275
187k
  }
276
470
}
HexagonLoopIdiomRecognition.cpp:void (anonymous namespace)::Simplifier::Context::traverse<(anonymous namespace)::Simplifier::Context::unuse(llvm::Value*)::$_2>(llvm::Value*, (anonymous namespace)::Simplifier::Context::unuse(llvm::Value*)::$_2)
Line
Count
Source
263
845
void Simplifier::Context::traverse(Value *V, FuncT F) {
264
845
  WorkListType Q;
265
845
  Q.push_back(V);
266
845
267
2.54k
  while (
!Q.empty()2.54k
) {
268
1.69k
    Instruction *U = dyn_cast<Instruction>(Q.pop_front_val());
269
1.69k
    if (
!U || 1.69k
U->getParent()1.46k
)
270
252
      continue;
271
1.44k
    
if (1.44k
!F(U)1.44k
)
272
1.02k
      continue;
273
419
    for (Value *Op : U->operands())
274
852
      Q.push_back(Op);
275
1.69k
  }
276
845
}
277
278
0
void Simplifier::Context::print(raw_ostream &OS, const Value *V) const {
279
0
  const auto *U = dyn_cast<const Instruction>(V);
280
0
  if (
!U0
) {
281
0
    OS << V << '(' << *V << ')';
282
0
    return;
283
0
  }
284
0
285
0
  
if (0
U->getParent()0
) {
286
0
    OS << U << '(';
287
0
    U->printAsOperand(OS, true);
288
0
    OS << ')';
289
0
    return;
290
0
  }
291
0
292
0
  unsigned N = U->getNumOperands();
293
0
  if (N != 0)
294
0
    OS << U << '(';
295
0
  OS << U->getOpcodeName();
296
0
  for (const Value *Op : U->operands()) {
297
0
    OS << ' ';
298
0
    print(OS, Op);
299
0
  }
300
0
  if (N != 0)
301
0
    OS << ')';
302
0
}
303
304
19
void Simplifier::Context::initialize(Instruction *Exp) {
305
19
  // Perform a deep clone of the expression, set Root to the root
306
19
  // of the clone, and build a map from the cloned values to the
307
19
  // original ones.
308
19
  ValueMapType M;
309
19
  BasicBlock *Block = Exp->getParent();
310
19
  WorkListType Q;
311
19
  Q.push_back(Exp);
312
19
313
307
  while (
!Q.empty()307
) {
314
288
    Value *V = Q.pop_front_val();
315
288
    if (M.find(V) != M.end())
316
5
      continue;
317
283
    
if (Instruction *283
U283
= dyn_cast<Instruction>(V)) {
318
201
      if (
isa<PHINode>(U) || 201
U->getParent() != Block182
)
319
32
        continue;
320
169
      for (Value *Op : U->operands())
321
339
        Q.push_back(Op);
322
201
      M.insert({U, U->clone()});
323
201
    }
324
288
  }
325
19
326
169
  for (std::pair<Value*,Value*> P : M) {
327
169
    Instruction *U = cast<Instruction>(P.second);
328
508
    for (unsigned i = 0, n = U->getNumOperands(); 
i != n508
;
++i339
) {
329
339
      auto F = M.find(U->getOperand(i));
330
339
      if (F != M.end())
331
186
        U->setOperand(i, F->second);
332
339
    }
333
169
  }
334
19
335
19
  auto R = M.find(Exp);
336
19
  assert(R != M.end());
337
19
  Root = R->second;
338
19
339
19
  record(Root);
340
19
  use(Root);
341
19
}
342
343
470
void Simplifier::Context::record(Value *V) {
344
10.7k
  auto Record = [this](Instruction *U) -> bool {
345
10.7k
    Clones.insert(U);
346
10.7k
    return true;
347
10.7k
  };
348
470
  traverse(V, Record);
349
470
}
350
351
470
void Simplifier::Context::use(Value *V) {
352
162k
  auto Use = [this](Instruction *U) -> bool {
353
162k
    Used.insert(U);
354
162k
    return true;
355
162k
  };
356
470
  traverse(V, Use);
357
470
}
358
359
845
void Simplifier::Context::unuse(Value *V) {
360
845
  if (
!isa<Instruction>(V) || 845
cast<Instruction>(V)->getParent() != nullptr845
)
361
0
    return;
362
845
363
845
  
auto Unuse = [this](Instruction *U) -> bool 845
{
364
1.44k
    if (!U->use_empty())
365
1.02k
      return false;
366
419
    Used.erase(U);
367
419
    return true;
368
419
  };
369
845
  traverse(V, Unuse);
370
845
}
371
372
545
Value *Simplifier::Context::subst(Value *Tree, Value *OldV, Value *NewV) {
373
545
  if (Tree == OldV)
374
1
    return NewV;
375
544
  
if (544
OldV == NewV544
)
376
0
    return Tree;
377
544
378
544
  WorkListType Q;
379
544
  Q.push_back(Tree);
380
187k
  while (
!Q.empty()187k
) {
381
186k
    Instruction *U = dyn_cast<Instruction>(Q.pop_front_val());
382
186k
    // If U is not an instruction, or it's not a clone, skip it.
383
186k
    if (
!U || 186k
U->getParent()162k
)
384
26.4k
      continue;
385
524k
    
for (unsigned i = 0, n = U->getNumOperands(); 160k
i != n524k
;
++i364k
) {
386
364k
      Value *Op = U->getOperand(i);
387
364k
      if (
Op == OldV364k
) {
388
845
        U->setOperand(i, NewV);
389
845
        unuse(OldV);
390
364k
      } else {
391
363k
        Q.push_back(Op);
392
363k
      }
393
364k
    }
394
186k
  }
395
545
  return Tree;
396
545
}
397
398
451
void Simplifier::Context::replace(Value *OldV, Value *NewV) {
399
451
  if (
Root == OldV451
) {
400
2
    Root = NewV;
401
2
    use(Root);
402
2
    return;
403
2
  }
404
449
405
449
  // NewV may be a complex tree that has just been created by one of the
406
449
  // transformation rules. We need to make sure that it is commoned with
407
449
  // the existing Root to the maximum extent possible.
408
449
  // Identify all subtrees of NewV (including NewV itself) that have
409
449
  // equivalent counterparts in Root, and replace those subtrees with
410
449
  // these counterparts.
411
449
  WorkListType Q;
412
449
  Q.push_back(NewV);
413
3.38k
  while (
!Q.empty()3.38k
) {
414
2.93k
    Value *V = Q.pop_front_val();
415
2.93k
    Instruction *U = dyn_cast<Instruction>(V);
416
2.93k
    if (
!U || 2.93k
U->getParent()2.41k
)
417
531
      continue;
418
2.40k
    
if (Value *2.40k
DupV2.40k
= find(Root, V)) {
419
1.21k
      if (DupV != V)
420
96
        NewV = subst(NewV, V, DupV);
421
2.40k
    } else {
422
1.18k
      for (Value *Op : U->operands())
423
2.81k
        Q.push_back(Op);
424
1.18k
    }
425
2.93k
  }
426
451
427
451
  // Now, simply replace OldV with NewV in Root.
428
451
  Root = subst(Root, OldV, NewV);
429
451
  use(Root);
430
451
}
431
432
19
void Simplifier::Context::cleanup() {
433
1.44k
  for (Value *V : Clones) {
434
1.44k
    Instruction *U = cast<Instruction>(V);
435
1.44k
    if (!U->getParent())
436
1.43k
      U->dropAllReferences();
437
1.44k
  }
438
19
439
1.44k
  for (Value *V : Clones) {
440
1.44k
    Instruction *U = cast<Instruction>(V);
441
1.44k
    if (!U->getParent())
442
1.43k
      U->deleteValue();
443
1.44k
  }
444
19
}
445
446
bool Simplifier::Context::equal(const Instruction *I,
447
1.26M
                                const Instruction *J) const {
448
1.26M
  if (I == J)
449
0
    return true;
450
1.26M
  
if (1.26M
!I->isSameOperationAs(J)1.26M
)
451
497k
    return false;
452
769k
  
if (769k
isa<PHINode>(I)769k
)
453
593
    return I->isIdenticalTo(J);
454
769k
455
843k
  
for (unsigned i = 0, n = I->getNumOperands(); 769k
i != n843k
;
++i74.8k
) {
456
843k
    Value *OpI = I->getOperand(i), *OpJ = J->getOperand(i);
457
843k
    if (OpI == OpJ)
458
74.4k
      continue;
459
769k
    auto *InI = dyn_cast<const Instruction>(OpI);
460
769k
    auto *InJ = dyn_cast<const Instruction>(OpJ);
461
769k
    if (
InI && 769k
InJ712k
) {
462
649k
      if (!equal(InI, InJ))
463
649k
        return false;
464
119k
    } else 
if (119k
InI != InJ || 119k
!InI23.3k
)
465
119k
      return false;
466
843k
  }
467
475
  return true;
468
1.26M
}
469
470
2.40k
Value *Simplifier::Context::find(Value *Tree, Value *Sub) const {
471
2.40k
  Instruction *SubI = dyn_cast<Instruction>(Sub);
472
2.40k
  WorkListType Q;
473
2.40k
  Q.push_back(Tree);
474
2.40k
475
707k
  while (
!Q.empty()707k
) {
476
705k
    Value *V = Q.pop_front_val();
477
705k
    if (V == Sub)
478
1.12k
      return V;
479
704k
    Instruction *U = dyn_cast<Instruction>(V);
480
704k
    if (
!U || 704k
U->getParent()624k
)
481
86.9k
      continue;
482
617k
    
if (617k
SubI && 617k
equal(SubI, U)617k
)
483
96
      return U;
484
617k
    assert(!isa<PHINode>(U));
485
617k
    for (Value *Op : U->operands())
486
1.44M
      Q.push_back(Op);
487
705k
  }
488
1.18k
  return nullptr;
489
2.40k
}
490
491
void Simplifier::Context::link(Instruction *I, BasicBlock *B,
492
24
      BasicBlock::iterator At) {
493
24
  if (I->getParent())
494
9
    return;
495
15
496
15
  
for (Value *Op : I->operands()) 15
{
497
29
    if (Instruction *OpI = dyn_cast<Instruction>(Op))
498
22
      link(OpI, B, At);
499
29
  }
500
24
501
24
  B->getInstList().insert(At, I);
502
24
}
503
504
Value *Simplifier::Context::materialize(BasicBlock *B,
505
2
      BasicBlock::iterator At) {
506
2
  if (Instruction *RootI = dyn_cast<Instruction>(Root))
507
2
    link(RootI, B, At);
508
2
  return Root;
509
2
}
510
511
19
Value *Simplifier::simplify(Context &C) {
512
19
  WorkListType Q;
513
19
  Q.push_back(C.Root);
514
19
  unsigned Count = 0;
515
19
  const unsigned Limit = SimplifyLimit;
516
19
517
11.5k
  while (
!Q.empty()11.5k
) {
518
11.5k
    if (Count++ >= Limit)
519
1
      break;
520
11.5k
    Instruction *U = dyn_cast<Instruction>(Q.pop_front_val());
521
11.5k
    if (
!U || 11.5k
U->getParent()11.0k
||
!C.Used.count(U)10.8k
)
522
664
      continue;
523
10.8k
    bool Changed = false;
524
74.4k
    for (Rule &R : Rules) {
525
74.4k
      Value *W = R(U, C.Ctx);
526
74.4k
      if (!W)
527
74.0k
        continue;
528
451
      Changed = true;
529
451
      C.record(W);
530
451
      C.replace(U, W);
531
451
      Q.push_back(C.Root);
532
451
      break;
533
451
    }
534
10.8k
    if (
!Changed10.8k
) {
535
10.4k
      for (Value *Op : U->operands())
536
25.8k
        Q.push_back(Op);
537
10.4k
    }
538
11.5k
  }
539
19
  return Count < Limit ? 
C.Root18
:
nullptr1
;
540
19
}
541
542
//===----------------------------------------------------------------------===//
543
//
544
//          Implementation of PolynomialMultiplyRecognize
545
//
546
//===----------------------------------------------------------------------===//
547
548
namespace {
549
550
  class PolynomialMultiplyRecognize {
551
  public:
552
    explicit PolynomialMultiplyRecognize(Loop *loop, const DataLayout &dl,
553
        const DominatorTree &dt, const TargetLibraryInfo &tli,
554
        ScalarEvolution &se)
555
7
      : CurLoop(loop), DL(dl), DT(dt), TLI(tli), SE(se) {}
556
557
    bool recognize();
558
559
  private:
560
    using ValueSeq = SetVector<Value *>;
561
562
1
    IntegerType *getPmpyType() const {
563
1
      LLVMContext &Ctx = CurLoop->getHeader()->getParent()->getContext();
564
1
      return IntegerType::get(Ctx, 32);
565
1
    }
566
567
    bool isPromotableTo(Value *V, IntegerType *Ty);
568
    void promoteTo(Instruction *In, IntegerType *DestTy, BasicBlock *LoopB);
569
    bool promoteTypes(BasicBlock *LoopB, BasicBlock *ExitB);
570
571
    Value *getCountIV(BasicBlock *BB);
572
    bool findCycle(Value *Out, Value *In, ValueSeq &Cycle);
573
    void classifyCycle(Instruction *DivI, ValueSeq &Cycle, ValueSeq &Early,
574
          ValueSeq &Late);
575
    bool classifyInst(Instruction *UseI, ValueSeq &Early, ValueSeq &Late);
576
    bool commutesWithShift(Instruction *I);
577
    bool highBitsAreZero(Value *V, unsigned IterCount);
578
    bool keepsHighBitsZero(Value *V, unsigned IterCount);
579
    bool isOperandShifted(Instruction *I, Value *Op);
580
    bool convertShiftsToLeft(BasicBlock *LoopB, BasicBlock *ExitB,
581
          unsigned IterCount);
582
    void cleanupLoopBody(BasicBlock *LoopB);
583
584
    struct ParsedValues {
585
6
      ParsedValues() = default;
586
587
      Value *M = nullptr;
588
      Value *P = nullptr;
589
      Value *Q = nullptr;
590
      Value *R = nullptr;
591
      Value *X = nullptr;
592
      Instruction *Res = nullptr;
593
      unsigned IterCount = 0;
594
      bool Left = false;
595
      bool Inv = false;
596
    };
597
598
    bool matchLeftShift(SelectInst *SelI, Value *CIV, ParsedValues &PV);
599
    bool matchRightShift(SelectInst *SelI, ParsedValues &PV);
600
    bool scanSelect(SelectInst *SI, BasicBlock *LoopB, BasicBlock *PrehB,
601
          Value *CIV, ParsedValues &PV, bool PreScan);
602
    unsigned getInverseMxN(unsigned QP);
603
    Value *generate(BasicBlock::iterator At, ParsedValues &PV);
604
605
    void setupSimplifier();
606
607
    Simplifier Simp;
608
    Loop *CurLoop;
609
    const DataLayout &DL;
610
    const DominatorTree &DT;
611
    const TargetLibraryInfo &TLI;
612
    ScalarEvolution &SE;
613
  };
614
615
} // end anonymous namespace
616
617
7
Value *PolynomialMultiplyRecognize::getCountIV(BasicBlock *BB) {
618
7
  pred_iterator PI = pred_begin(BB), PE = pred_end(BB);
619
7
  if (std::distance(PI, PE) != 2)
620
0
    return nullptr;
621
7
  
BasicBlock *PB = (*PI == BB) ? 7
*std::next(PI)5
:
*PI2
;
622
7
623
13
  for (auto I = BB->begin(), E = BB->end(); 
I != E && 13
isa<PHINode>(I)13
;
++I6
) {
624
11
    auto *PN = cast<PHINode>(I);
625
11
    Value *InitV = PN->getIncomingValueForBlock(PB);
626
11
    if (
!isa<ConstantInt>(InitV) || 11
!cast<ConstantInt>(InitV)->isZero()6
)
627
5
      continue;
628
6
    Value *IterV = PN->getIncomingValueForBlock(BB);
629
6
    if (!isa<BinaryOperator>(IterV))
630
1
      continue;
631
5
    auto *BO = dyn_cast<BinaryOperator>(IterV);
632
5
    if (BO->getOpcode() != Instruction::Add)
633
0
      continue;
634
5
    Value *IncV = nullptr;
635
5
    if (BO->getOperand(0) == PN)
636
5
      IncV = BO->getOperand(1);
637
0
    else 
if (0
BO->getOperand(1) == PN0
)
638
0
      IncV = BO->getOperand(0);
639
5
    if (IncV == nullptr)
640
0
      continue;
641
5
642
5
    
if (auto *5
T5
= dyn_cast<ConstantInt>(IncV))
643
5
      
if (5
T->getZExtValue() == 15
)
644
5
        return PN;
645
11
  }
646
2
  return nullptr;
647
7
}
648
649
2
static void replaceAllUsesOfWithIn(Value *I, Value *J, BasicBlock *BB) {
650
5
  for (auto UI = I->user_begin(), UE = I->user_end(); 
UI != UE5
;) {
651
3
    Use &TheUse = UI.getUse();
652
3
    ++UI;
653
3
    if (auto *II = dyn_cast<Instruction>(TheUse.getUser()))
654
3
      
if (3
BB == II->getParent()3
)
655
3
        II->replaceUsesOfWith(I, J);
656
3
  }
657
2
}
658
659
bool PolynomialMultiplyRecognize::matchLeftShift(SelectInst *SelI,
660
21
      Value *CIV, ParsedValues &PV) {
661
21
  // Match the following:
662
21
  //   select (X & (1 << i)) != 0 ? R ^ (Q << i) : R
663
21
  //   select (X & (1 << i)) == 0 ? R : R ^ (Q << i)
664
21
  // The condition may also check for equality with the masked value, i.e
665
21
  //   select (X & (1 << i)) == (1 << i) ? R ^ (Q << i) : R
666
21
  //   select (X & (1 << i)) != (1 << i) ? R : R ^ (Q << i);
667
21
668
21
  Value *CondV = SelI->getCondition();
669
21
  Value *TrueV = SelI->getTrueValue();
670
21
  Value *FalseV = SelI->getFalseValue();
671
21
672
21
  using namespace PatternMatch;
673
21
674
21
  CmpInst::Predicate P;
675
21
  Value *A = nullptr, *B = nullptr, *C = nullptr;
676
21
677
21
  if (!match(CondV, m_ICmp(P, m_And(m_Value(A), m_Value(B)), m_Value(C))) &&
678
16
      !match(CondV, m_ICmp(P, m_Value(C), m_And(m_Value(A), m_Value(B)))))
679
16
    return false;
680
5
  
if (5
P != CmpInst::ICMP_EQ && 5
P != CmpInst::ICMP_NE0
)
681
0
    return false;
682
5
  // Matched: select (A & B) == C ? ... : ...
683
5
  //          select (A & B) != C ? ... : ...
684
5
685
5
  Value *X = nullptr, *Sh1 = nullptr;
686
5
  // Check (A & B) for (X & (1 << i)):
687
5
  if (
match(A, m_Shl(m_One(), m_Specific(CIV)))5
) {
688
2
    Sh1 = A;
689
2
    X = B;
690
5
  } else 
if (3
match(B, m_Shl(m_One(), m_Specific(CIV)))3
) {
691
1
    Sh1 = B;
692
1
    X = A;
693
3
  } else {
694
2
    // TODO: Could also check for an induction variable containing single
695
2
    // bit shifted left by 1 in each iteration.
696
2
    return false;
697
2
  }
698
3
699
3
  bool TrueIfZero;
700
3
701
3
  // Check C against the possible values for comparison: 0 and (1 << i):
702
3
  if (match(C, m_Zero()))
703
2
    TrueIfZero = (P == CmpInst::ICMP_EQ);
704
1
  else 
if (1
C == Sh11
)
705
1
    TrueIfZero = (P == CmpInst::ICMP_NE);
706
1
  else
707
0
    return false;
708
3
709
3
  // So far, matched:
710
3
  //   select (X & (1 << i)) ? ... : ...
711
3
  // including variations of the check against zero/non-zero value.
712
3
713
3
  Value *ShouldSameV = nullptr, *ShouldXoredV = nullptr;
714
3
  if (
TrueIfZero3
) {
715
2
    ShouldSameV = TrueV;
716
2
    ShouldXoredV = FalseV;
717
3
  } else {
718
1
    ShouldSameV = FalseV;
719
1
    ShouldXoredV = TrueV;
720
1
  }
721
3
722
3
  Value *Q = nullptr, *R = nullptr, *Y = nullptr, *Z = nullptr;
723
3
  Value *T = nullptr;
724
3
  if (
match(ShouldXoredV, m_Xor(m_Value(Y), m_Value(Z)))3
) {
725
3
    // Matched: select +++ ? ... : Y ^ Z
726
3
    //          select +++ ? Y ^ Z : ...
727
3
    // where +++ denotes previously checked matches.
728
3
    if (ShouldSameV == Y)
729
1
      T = Z;
730
2
    else 
if (2
ShouldSameV == Z2
)
731
2
      T = Y;
732
2
    else
733
0
      return false;
734
3
    R = ShouldSameV;
735
3
    // Matched: select +++ ? R : R ^ T
736
3
    //          select +++ ? R ^ T : R
737
3
    // depending on TrueIfZero.
738
3
739
3
  } else 
if (0
match(ShouldSameV, m_Zero())0
) {
740
0
    // Matched: select +++ ? 0 : ...
741
0
    //          select +++ ? ... : 0
742
0
    if (!SelI->hasOneUse())
743
0
      return false;
744
0
    T = ShouldXoredV;
745
0
    // Matched: select +++ ? 0 : T
746
0
    //          select +++ ? T : 0
747
0
748
0
    Value *U = *SelI->user_begin();
749
0
    if (!match(U, m_Xor(m_Specific(SelI), m_Value(R))) &&
750
0
        !match(U, m_Xor(m_Value(R), m_Specific(SelI))))
751
0
      return false;
752
0
    // Matched: xor (select +++ ? 0 : T), R
753
0
    //          xor (select +++ ? T : 0), R
754
0
  } else
755
0
    return false;
756
3
757
3
  // The xor input value T is isolated into its own match so that it could
758
3
  // be checked against an induction variable containing a shifted bit
759
3
  // (todo).
760
3
  // For now, check against (Q << i).
761
3
  
if (3
!match(T, m_Shl(m_Value(Q), m_Specific(CIV))) &&
762
2
      !match(T, m_Shl(m_ZExt(m_Value(Q)), m_ZExt(m_Specific(CIV)))))
763
0
    return false;
764
3
  // Matched: select +++ ? R : R ^ (Q << i)
765
3
  //          select +++ ? R ^ (Q << i) : R
766
3
767
3
  PV.X = X;
768
3
  PV.Q = Q;
769
3
  PV.R = R;
770
3
  PV.Left = true;
771
3
  return true;
772
3
}
773
774
bool PolynomialMultiplyRecognize::matchRightShift(SelectInst *SelI,
775
18
      ParsedValues &PV) {
776
18
  // Match the following:
777
18
  //   select (X & 1) != 0 ? (R >> 1) ^ Q : (R >> 1)
778
18
  //   select (X & 1) == 0 ? (R >> 1) : (R >> 1) ^ Q
779
18
  // The condition may also check for equality with the masked value, i.e
780
18
  //   select (X & 1) == 1 ? (R >> 1) ^ Q : (R >> 1)
781
18
  //   select (X & 1) != 1 ? (R >> 1) : (R >> 1) ^ Q
782
18
783
18
  Value *CondV = SelI->getCondition();
784
18
  Value *TrueV = SelI->getTrueValue();
785
18
  Value *FalseV = SelI->getFalseValue();
786
18
787
18
  using namespace PatternMatch;
788
18
789
18
  Value *C = nullptr;
790
18
  CmpInst::Predicate P;
791
18
  bool TrueIfZero;
792
18
793
18
  if (match(CondV, m_ICmp(P, m_Value(C), m_Zero())) ||
794
18
      
match(CondV, m_ICmp(P, m_Zero(), m_Value(C)))17
) {
795
1
    if (
P != CmpInst::ICMP_EQ && 1
P != CmpInst::ICMP_NE1
)
796
1
      return false;
797
0
    // Matched: select C == 0 ? ... : ...
798
0
    //          select C != 0 ? ... : ...
799
0
    TrueIfZero = (P == CmpInst::ICMP_EQ);
800
18
  } else 
if (17
match(CondV, m_ICmp(P, m_Value(C), m_One())) ||
801
17
             
match(CondV, m_ICmp(P, m_One(), m_Value(C)))15
) {
802
2
    if (
P != CmpInst::ICMP_EQ && 2
P != CmpInst::ICMP_NE0
)
803
0
      return false;
804
2
    // Matched: select C == 1 ? ... : ...
805
2
    //          select C != 1 ? ... : ...
806
2
    TrueIfZero = (P == CmpInst::ICMP_NE);
807
2
  } else
808
15
    return false;
809
2
810
2
  Value *X = nullptr;
811
2
  if (!match(C, m_And(m_Value(X), m_One())) &&
812
0
      !match(C, m_And(m_One(), m_Value(X))))
813
0
    return false;
814
2
  // Matched: select (X & 1) == +++ ? ... : ...
815
2
  //          select (X & 1) != +++ ? ... : ...
816
2
817
2
  Value *R = nullptr, *Q = nullptr;
818
2
  if (
TrueIfZero2
) {
819
0
    // The select's condition is true if the tested bit is 0.
820
0
    // TrueV must be the shift, FalseV must be the xor.
821
0
    if (!match(TrueV, m_LShr(m_Value(R), m_One())))
822
0
      return false;
823
0
    // Matched: select +++ ? (R >> 1) : ...
824
0
    
if (0
!match(FalseV, m_Xor(m_Specific(TrueV), m_Value(Q))) &&
825
0
        !match(FalseV, m_Xor(m_Value(Q), m_Specific(TrueV))))
826
0
      return false;
827
2
    // Matched: select +++ ? (R >> 1) : (R >> 1) ^ Q
828
2
    // with commuting ^.
829
2
  } else {
830
2
    // The select's condition is true if the tested bit is 1.
831
2
    // TrueV must be the xor, FalseV must be the shift.
832
2
    if (!match(FalseV, m_LShr(m_Value(R), m_One())))
833
1
      return false;
834
1
    // Matched: select +++ ? ... : (R >> 1)
835
1
    
if (1
!match(TrueV, m_Xor(m_Specific(FalseV), m_Value(Q))) &&
836
0
        !match(TrueV, m_Xor(m_Value(Q), m_Specific(FalseV))))
837
0
      return false;
838
1
    // Matched: select +++ ? (R >> 1) ^ Q : (R >> 1)
839
1
    // with commuting ^.
840
1
  }
841
1
842
1
  PV.X = X;
843
1
  PV.Q = Q;
844
1
  PV.R = R;
845
1
  PV.Left = false;
846
1
  return true;
847
1
}
848
849
bool PolynomialMultiplyRecognize::scanSelect(SelectInst *SelI,
850
      BasicBlock *LoopB, BasicBlock *PrehB, Value *CIV, ParsedValues &PV,
851
21
      bool PreScan) {
852
21
  using namespace PatternMatch;
853
21
854
21
  // The basic pattern for R = P.Q is:
855
21
  // for i = 0..31
856
21
  //   R = phi (0, R')
857
21
  //   if (P & (1 << i))        ; test-bit(P, i)
858
21
  //     R' = R ^ (Q << i)
859
21
  //
860
21
  // Similarly, the basic pattern for R = (P/Q).Q - P
861
21
  // for i = 0..31
862
21
  //   R = phi(P, R')
863
21
  //   if (R & (1 << i))
864
21
  //     R' = R ^ (Q << i)
865
21
866
21
  // There exist idioms, where instead of Q being shifted left, P is shifted
867
21
  // right. This produces a result that is shifted right by 32 bits (the
868
21
  // non-shifted result is 64-bit).
869
21
  //
870
21
  // For R = P.Q, this would be:
871
21
  // for i = 0..31
872
21
  //   R = phi (0, R')
873
21
  //   if ((P >> i) & 1)
874
21
  //     R' = (R >> 1) ^ Q      ; R is cycled through the loop, so it must
875
21
  //   else                     ; be shifted by 1, not i.
876
21
  //     R' = R >> 1
877
21
  //
878
21
  // And for the inverse:
879
21
  // for i = 0..31
880
21
  //   R = phi (P, R')
881
21
  //   if (R & 1)
882
21
  //     R' = (R >> 1) ^ Q
883
21
  //   else
884
21
  //     R' = R >> 1
885
21
886
21
  // The left-shifting idioms share the same pattern:
887
21
  //   select (X & (1 << i)) ? R ^ (Q << i) : R
888
21
  // Similarly for right-shifting idioms:
889
21
  //   select (X & 1) ? (R >> 1) ^ Q
890
21
891
21
  if (
matchLeftShift(SelI, CIV, PV)21
) {
892
3
    // If this is a pre-scan, getting this far is sufficient.
893
3
    if (PreScan)
894
1
      return true;
895
2
896
2
    // Need to make sure that the SelI goes back into R.
897
2
    auto *RPhi = dyn_cast<PHINode>(PV.R);
898
2
    if (!RPhi)
899
0
      return false;
900
2
    
if (2
SelI != RPhi->getIncomingValueForBlock(LoopB)2
)
901
0
      return false;
902
2
    PV.Res = SelI;
903
2
904
2
    // If X is loop invariant, it must be the input polynomial, and the
905
2
    // idiom is the basic polynomial multiply.
906
2
    if (
CurLoop->isLoopInvariant(PV.X)2
) {
907
1
      PV.P = PV.X;
908
1
      PV.Inv = false;
909
2
    } else {
910
1
      // X is not loop invariant. If X == R, this is the inverse pmpy.
911
1
      // Otherwise, check for an xor with an invariant value. If the
912
1
      // variable argument to the xor is R, then this is still a valid
913
1
      // inverse pmpy.
914
1
      PV.Inv = true;
915
1
      if (
PV.X != PV.R1
) {
916
1
        Value *Var = nullptr, *Inv = nullptr, *X1 = nullptr, *X2 = nullptr;
917
1
        if (!match(PV.X, m_Xor(m_Value(X1), m_Value(X2))))
918
0
          return false;
919
1
        auto *I1 = dyn_cast<Instruction>(X1);
920
1
        auto *I2 = dyn_cast<Instruction>(X2);
921
1
        if (
!I1 || 1
I1->getParent() != LoopB1
) {
922
0
          Var = X2;
923
0
          Inv = X1;
924
1
        } else 
if (1
!I2 || 1
I2->getParent() != LoopB1
) {
925
1
          Var = X1;
926
1
          Inv = X2;
927
1
        } else
928
0
          return false;
929
1
        
if (1
Var != PV.R1
)
930
0
          return false;
931
1
        PV.M = Inv;
932
1
      }
933
1
      // The input polynomial P still needs to be determined. It will be
934
1
      // the entry value of R.
935
1
      Value *EntryP = RPhi->getIncomingValueForBlock(PrehB);
936
1
      PV.P = EntryP;
937
1
    }
938
2
939
2
    return true;
940
18
  }
941
18
942
18
  
if (18
matchRightShift(SelI, PV)18
) {
943
1
    // If this is an inverse pattern, the Q polynomial must be known at
944
1
    // compile time.
945
1
    if (
PV.Inv && 1
!isa<ConstantInt>(PV.Q)0
)
946
0
      return false;
947
1
    
if (1
PreScan1
)
948
1
      return true;
949
0
    // There is no exact matching of right-shift pmpy.
950
0
    return false;
951
0
  }
952
17
953
17
  return false;
954
17
}
955
956
bool PolynomialMultiplyRecognize::isPromotableTo(Value *Val,
957
14
      IntegerType *DestTy) {
958
14
  IntegerType *T = dyn_cast<IntegerType>(Val->getType());
959
14
  if (
!T || 14
T->getBitWidth() > DestTy->getBitWidth()14
)
960
0
    return false;
961
14
  
if (14
T->getBitWidth() == DestTy->getBitWidth()14
)
962
4
    return true;
963
10
  // Non-instructions are promotable. The reason why an instruction may not
964
10
  // be promotable is that it may produce a different result if its operands
965
10
  // and the result are promoted, for example, it may produce more non-zero
966
10
  // bits. While it would still be possible to represent the proper result
967
10
  // in a wider type, it may require adding additional instructions (which
968
10
  // we don't want to do).
969
10
  Instruction *In = dyn_cast<Instruction>(Val);
970
10
  if (!In)
971
0
    return true;
972
10
  // The bitwidth of the source type is smaller than the destination.
973
10
  // Check if the individual operation can be promoted.
974
10
  switch (In->getOpcode()) {
975
7
    case Instruction::PHI:
976
7
    case Instruction::ZExt:
977
7
    case Instruction::And:
978
7
    case Instruction::Or:
979
7
    case Instruction::Xor:
980
7
    case Instruction::LShr: // Shift right is ok.
981
7
    case Instruction::Select:
982
7
      return true;
983
2
    case Instruction::ICmp:
984
2
      if (CmpInst *CI = cast<CmpInst>(In))
985
2
        
return CI->isEquality() || 2
CI->isUnsigned()1
;
986
0
      
llvm_unreachable0
("Cast failed unexpectedly");
987
1
    case Instruction::Add:
988
1
      return In->hasNoSignedWrap() && In->hasNoUnsignedWrap();
989
0
  }
990
0
  return false;
991
0
}
992
993
void PolynomialMultiplyRecognize::promoteTo(Instruction *In,
994
15
      IntegerType *DestTy, BasicBlock *LoopB) {
995
15
  // Leave boolean values alone.
996
15
  if (!In->getType()->isIntegerTy(1))
997
13
    In->mutateType(DestTy);
998
15
  unsigned DestBW = DestTy->getBitWidth();
999
15
1000
15
  // Handle PHIs.
1001
15
  if (PHINode *
P15
= dyn_cast<PHINode>(In)) {
1002
3
    unsigned N = P->getNumIncomingValues();
1003
9
    for (unsigned i = 0; 
i != N9
;
++i6
) {
1004
6
      BasicBlock *InB = P->getIncomingBlock(i);
1005
6
      if (InB == LoopB)
1006
3
        continue;
1007
3
      Value *InV = P->getIncomingValue(i);
1008
3
      IntegerType *Ty = cast<IntegerType>(InV->getType());
1009
3
      // Do not promote values in PHI nodes of type i1.
1010
3
      if (
Ty != P->getType()3
) {
1011
3
        // If the value type does not match the PHI type, the PHI type
1012
3
        // must have been promoted.
1013
3
        assert(Ty->getBitWidth() < DestBW);
1014
3
        InV = IRBuilder<>(InB->getTerminator()).CreateZExt(InV, DestTy);
1015
3
        P->setIncomingValue(i, InV);
1016
3
      }
1017
6
    }
1018
15
  } else 
if (ZExtInst *12
Z12
= dyn_cast<ZExtInst>(In)) {
1019
2
    Value *Op = Z->getOperand(0);
1020
2
    if (Op->getType() == Z->getType())
1021
2
      Z->replaceAllUsesWith(Op);
1022
12
    Z->eraseFromParent();
1023
12
    return;
1024
12
  }
1025
13
1026
13
  // Promote immediates.
1027
41
  
for (unsigned i = 0, n = In->getNumOperands(); 13
i != n41
;
++i28
) {
1028
28
    if (ConstantInt *CI = dyn_cast<ConstantInt>(In->getOperand(i)))
1029
8
      
if (8
CI->getType()->getBitWidth() < DestBW8
)
1030
5
        In->setOperand(i, ConstantInt::get(DestTy, CI->getZExtValue()));
1031
28
  }
1032
15
}
1033
1034
bool PolynomialMultiplyRecognize::promoteTypes(BasicBlock *LoopB,
1035
1
      BasicBlock *ExitB) {
1036
1
  assert(LoopB);
1037
1
  // Skip loops where the exit block has more than one predecessor. The values
1038
1
  // coming from the loop block will be promoted to another type, and so the
1039
1
  // values coming into the exit block from other predecessors would also have
1040
1
  // to be promoted.
1041
1
  if (
!ExitB || 1
(ExitB->getSinglePredecessor() != LoopB)1
)
1042
0
    return false;
1043
1
  IntegerType *DestTy = getPmpyType();
1044
1
  // Check if the exit values have types that are no wider than the type
1045
1
  // that we want to promote to.
1046
1
  unsigned DestBW = DestTy->getBitWidth();
1047
2
  for (Instruction &In : *ExitB) {
1048
2
    PHINode *P = dyn_cast<PHINode>(&In);
1049
2
    if (!P)
1050
1
      break;
1051
1
    
if (1
P->getNumIncomingValues() != 11
)
1052
0
      return false;
1053
1
    assert(P->getIncomingBlock(0) == LoopB);
1054
1
    IntegerType *T = dyn_cast<IntegerType>(P->getType());
1055
1
    if (
!T || 1
T->getBitWidth() > DestBW1
)
1056
0
      return false;
1057
1
  }
1058
1
1059
1
  // Check all instructions in the loop.
1060
1
  for (Instruction &In : *LoopB)
1061
15
    
if (15
!In.isTerminator() && 15
!isPromotableTo(&In, DestTy)14
)
1062
0
      return false;
1063
1
1064
1
  // Perform the promotion.
1065
1
  std::vector<Instruction*> LoopIns;
1066
1
  std::transform(LoopB->begin(), LoopB->end(), std::back_inserter(LoopIns),
1067
15
                 [](Instruction &In) { return &In; });
1068
1
  for (Instruction *In : LoopIns)
1069
15
    promoteTo(In, DestTy, LoopB);
1070
1
1071
1
  // Fix up the PHI nodes in the exit block.
1072
1
  Instruction *EndI = ExitB->getFirstNonPHI();
1073
1
  BasicBlock::iterator End = EndI ? 
EndI->getIterator()1
:
ExitB->end()0
;
1074
2
  for (auto I = ExitB->begin(); 
I != End2
;
++I1
) {
1075
2
    PHINode *P = dyn_cast<PHINode>(I);
1076
2
    if (!P)
1077
1
      break;
1078
1
    Type *Ty0 = P->getIncomingValue(0)->getType();
1079
1
    Type *PTy = P->getType();
1080
1
    if (
PTy != Ty01
) {
1081
1
      assert(Ty0 == DestTy);
1082
1
      // In order to create the trunc, P must have the promoted type.
1083
1
      P->mutateType(Ty0);
1084
1
      Value *T = IRBuilder<>(ExitB, End).CreateTrunc(P, PTy);
1085
1
      // In order for the RAUW to work, the types of P and T must match.
1086
1
      P->mutateType(PTy);
1087
1
      P->replaceAllUsesWith(T);
1088
1
      // Final update of the P's type.
1089
1
      P->mutateType(Ty0);
1090
1
      cast<Instruction>(T)->setOperand(0, P);
1091
1
    }
1092
2
  }
1093
1
1094
1
  return true;
1095
1
}
1096
1097
bool PolynomialMultiplyRecognize::findCycle(Value *Out, Value *In,
1098
6
      ValueSeq &Cycle) {
1099
6
  // Out = ..., In, ...
1100
6
  if (Out == In)
1101
2
    return true;
1102
4
1103
4
  auto *BB = cast<Instruction>(Out)->getParent();
1104
4
  bool HadPhi = false;
1105
4
1106
4
  for (auto U : Out->users()) {
1107
4
    auto *I = dyn_cast<Instruction>(&*U);
1108
4
    if (
I == nullptr || 4
I->getParent() != BB4
)
1109
0
      continue;
1110
4
    // Make sure that there are no multi-iteration cycles, e.g.
1111
4
    //   p1 = phi(p2)
1112
4
    //   p2 = phi(p1)
1113
4
    // The cycle p1->p2->p1 would span two loop iterations.
1114
4
    // Check that there is only one phi in the cycle.
1115
4
    bool IsPhi = isa<PHINode>(I);
1116
4
    if (
IsPhi && 4
HadPhi2
)
1117
0
      return false;
1118
4
    HadPhi |= IsPhi;
1119
4
    if (Cycle.count(I))
1120
0
      return false;
1121
4
    Cycle.insert(I);
1122
4
    if (findCycle(I, In, Cycle))
1123
4
      break;
1124
0
    Cycle.remove(I);
1125
0
  }
1126
4
  return !Cycle.empty();
1127
6
}
1128
1129
void PolynomialMultiplyRecognize::classifyCycle(Instruction *DivI,
1130
2
      ValueSeq &Cycle, ValueSeq &Early, ValueSeq &Late) {
1131
2
  // All the values in the cycle that are between the phi node and the
1132
2
  // divider instruction will be classified as "early", all other values
1133
2
  // will be "late".
1134
2
1135
2
  bool IsE = true;
1136
2
  unsigned I, N = Cycle.size();
1137
4
  for (I = 0; 
I < N4
;
++I2
) {
1138
4
    Value *V = Cycle[I];
1139
4
    if (DivI == V)
1140
0
      IsE = false;
1141
4
    else 
if (4
!isa<PHINode>(V)4
)
1142
2
      continue;
1143
2
    // Stop if found either.
1144
2
    break;
1145
2
  }
1146
2
  // "I" is the index of either DivI or the phi node, whichever was first.
1147
2
  // "E" is "false" or "true" respectively.
1148
2
  ValueSeq &First = !IsE ? 
Early0
:
Late2
;
1149
4
  for (unsigned J = 0; 
J < I4
;
++J2
)
1150
2
    First.insert(Cycle[J]);
1151
2
1152
2
  ValueSeq &Second = IsE ? 
Early2
:
Late0
;
1153
2
  Second.insert(Cycle[I]);
1154
2
  for (++I; 
I < N2
;
++I0
) {
1155
2
    Value *V = Cycle[I];
1156
2
    if (
DivI == V || 2
isa<PHINode>(V)0
)
1157
2
      break;
1158
0
    Second.insert(V);
1159
0
  }
1160
2
1161
4
  for (; 
I < N4
;
++I2
)
1162
2
    First.insert(Cycle[I]);
1163
2
}
1164
1165
bool PolynomialMultiplyRecognize::classifyInst(Instruction *UseI,
1166
8
      ValueSeq &Early, ValueSeq &Late) {
1167
8
  // Select is an exception, since the condition value does not have to be
1168
8
  // classified in the same way as the true/false values. The true/false
1169
8
  // values do have to be both early or both late.
1170
8
  if (
UseI->getOpcode() == Instruction::Select8
) {
1171
3
    Value *TV = UseI->getOperand(1), *FV = UseI->getOperand(2);
1172
3
    if (
Early.count(TV) || 3
Early.count(FV)3
) {
1173
0
      if (
Late.count(TV) || 0
Late.count(FV)0
)
1174
0
        return false;
1175
0
      Early.insert(UseI);
1176
3
    } else 
if (3
Late.count(TV) || 3
Late.count(FV)0
) {
1177
3
      if (
Early.count(TV) || 3
Early.count(FV)3
)
1178
0
        return false;
1179
3
      Late.insert(UseI);
1180
3
    }
1181
3
    return true;
1182
5
  }
1183
5
1184
5
  // Not sure what would be the example of this, but the code below relies
1185
5
  // on having at least one operand.
1186
5
  
if (5
UseI->getNumOperands() == 05
)
1187
0
    return true;
1188
5
1189
5
  bool AE = true, AL = true;
1190
10
  for (auto &I : UseI->operands()) {
1191
10
    if (Early.count(&*I))
1192
6
      AL = false;
1193
4
    else 
if (4
Late.count(&*I)4
)
1194
1
      AE = false;
1195
10
  }
1196
5
  // If the operands appear "all early" and "all late" at the same time,
1197
5
  // then it means that none of them are actually classified as either.
1198
5
  // This is harmless.
1199
5
  if (
AE && 5
AL4
)
1200
0
    return true;
1201
5
  // Conversely, if they are neither "all early" nor "all late", then
1202
5
  // we have a mixture of early and late operands that is not a known
1203
5
  // exception.
1204
5
  
if (5
!AE && 5
!AL1
)
1205
0
    return false;
1206
5
1207
5
  // Check that we have covered the two special cases.
1208
5
  assert(AE != AL);
1209
5
1210
5
  if (AE)
1211
4
    Early.insert(UseI);
1212
5
  else
1213
1
    Late.insert(UseI);
1214
8
  return true;
1215
8
}
1216
1217
9
bool PolynomialMultiplyRecognize::commutesWithShift(Instruction *I) {
1218
9
  switch (I->getOpcode()) {
1219
9
    case Instruction::And:
1220
9
    case Instruction::Or:
1221
9
    case Instruction::Xor:
1222
9
    case Instruction::LShr:
1223
9
    case Instruction::Shl:
1224
9
    case Instruction::Select:
1225
9
    case Instruction::ICmp:
1226
9
    case Instruction::PHI:
1227
9
      break;
1228
0
    default:
1229
0
      return false;
1230
9
  }
1231
9
  return true;
1232
9
}
1233
1234
bool PolynomialMultiplyRecognize::highBitsAreZero(Value *V,
1235
2
      unsigned IterCount) {
1236
2
  auto *T = dyn_cast<IntegerType>(V->getType());
1237
2
  if (!T)
1238
0
    return false;
1239
2
1240
2
  KnownBits Known(T->getBitWidth());
1241
2
  computeKnownBits(V, Known, DL);
1242
2
  return Known.countMinLeadingZeros() >= IterCount;
1243
2
}
1244
1245
bool PolynomialMultiplyRecognize::keepsHighBitsZero(Value *V,
1246
11
      unsigned IterCount) {
1247
11
  // Assume that all inputs to the value have the high bits zero.
1248
11
  // Check if the value itself preserves the zeros in the high bits.
1249
11
  if (auto *C = dyn_cast<ConstantInt>(V))
1250
2
    return C->getValue().countLeadingZeros() >= IterCount;
1251
9
1252
9
  
if (auto *9
I9
= dyn_cast<Instruction>(V)) {
1253
9
    switch (I->getOpcode()) {
1254
9
      case Instruction::And:
1255
9
      case Instruction::Or:
1256
9
      case Instruction::Xor:
1257
9
      case Instruction::LShr:
1258
9
      case Instruction::Select:
1259
9
      case Instruction::ICmp:
1260
9
      case Instruction::PHI:
1261
9
      case Instruction::ZExt:
1262
9
        return true;
1263
0
    }
1264
0
  }
1265
0
1266
0
  return false;
1267
0
}
1268
1269
11
bool PolynomialMultiplyRecognize::isOperandShifted(Instruction *I, Value *Op) {
1270
11
  unsigned Opc = I->getOpcode();
1271
11
  if (
Opc == Instruction::Shl || 11
Opc == Instruction::LShr11
)
1272
0
    return Op != I->getOperand(1);
1273
11
  return true;
1274
11
}
1275
1276
bool PolynomialMultiplyRecognize::convertShiftsToLeft(BasicBlock *LoopB,
1277
1
      BasicBlock *ExitB, unsigned IterCount) {
1278
1
  Value *CIV = getCountIV(LoopB);
1279
1
  if (CIV == nullptr)
1280
0
    return false;
1281
1
  auto *CIVTy = dyn_cast<IntegerType>(CIV->getType());
1282
1
  if (CIVTy == nullptr)
1283
0
    return false;
1284
1
1285
1
  ValueSeq RShifts;
1286
1
  ValueSeq Early, Late, Cycled;
1287
1
1288
1
  // Find all value cycles that contain logical right shifts by 1.
1289
13
  for (Instruction &I : *LoopB) {
1290
13
    using namespace PatternMatch;
1291
13
1292
13
    Value *V = nullptr;
1293
13
    if (!match(&I, m_LShr(m_Value(V), m_One())))
1294
11
      continue;
1295
2
    ValueSeq C;
1296
2
    if (!findCycle(&I, V, C))
1297
0
      continue;
1298
2
1299
2
    // Found a cycle.
1300
2
    C.insert(&I);
1301
2
    classifyCycle(&I, C, Early, Late);
1302
2
    Cycled.insert(C.begin(), C.end());
1303
2
    RShifts.insert(&I);
1304
2
  }
1305
1
1306
1
  // Find the set of all values affected by the shift cycles, i.e. all
1307
1
  // cycled values, and (recursively) all their users.
1308
1
  ValueSeq Users(Cycled.begin(), Cycled.end());
1309
10
  for (unsigned i = 0; 
i < Users.size()10
;
++i9
) {
1310
9
    Value *V = Users[i];
1311
9
    if (!isa<IntegerType>(V->getType()))
1312
0
      return false;
1313
9
    auto *R = cast<Instruction>(V);
1314
9
    // If the instruction does not commute with shifts, the loop cannot
1315
9
    // be unshifted.
1316
9
    if (!commutesWithShift(R))
1317
0
      return false;
1318
22
    
for (auto I = R->user_begin(), E = R->user_end(); 9
I != E22
;
++I13
) {
1319
13
      auto *T = cast<Instruction>(*I);
1320
13
      // Skip users from outside of the loop. They will be handled later.
1321
13
      // Also, skip the right-shifts and phi nodes, since they mix early
1322
13
      // and late values.
1323
13
      if (
T->getParent() != LoopB || 13
RShifts.count(T)12
||
isa<PHINode>(T)10
)
1324
5
        continue;
1325
8
1326
8
      Users.insert(T);
1327
8
      if (!classifyInst(T, Early, Late))
1328
0
        return false;
1329
13
    }
1330
9
  }
1331
1
1332
1
  
if (1
Users.empty()1
)
1333
0
    return false;
1334
1
1335
1
  // Verify that high bits remain zero.
1336
1
  ValueSeq Internal(Users.begin(), Users.end());
1337
1
  ValueSeq Inputs;
1338
12
  for (unsigned i = 0; 
i < Internal.size()12
;
++i11
) {
1339
11
    auto *R = dyn_cast<Instruction>(Internal[i]);
1340
11
    if (!R)
1341
2
      continue;
1342
9
    
for (Value *Op : R->operands()) 9
{
1343
19
      auto *T = dyn_cast<Instruction>(Op);
1344
19
      if (
T && 19
T->getParent() != LoopB14
)
1345
2
        Inputs.insert(Op);
1346
19
      else
1347
17
        Internal.insert(Op);
1348
19
    }
1349
11
  }
1350
1
  for (Value *V : Inputs)
1351
2
    
if (2
!highBitsAreZero(V, IterCount)2
)
1352
0
      return false;
1353
1
  for (Value *V : Internal)
1354
11
    
if (11
!keepsHighBitsZero(V, IterCount)11
)
1355
0
      return false;
1356
1
1357
1
  // Finally, the work can be done. Unshift each user.
1358
1
  IRBuilder<> IRB(LoopB);
1359
1
  std::map<Value*,Value*> ShiftMap;
1360
1
1361
1
  using CastMapType = std::map<std::pair<Value *, Type *>, Value *>;
1362
1
1363
1
  CastMapType CastMap;
1364
1
1365
1
  auto upcast = [] (CastMapType &CM, IRBuilder<> &IRB, Value *V,
1366
0
        IntegerType *Ty) -> Value* {
1367
0
    auto H = CM.find(std::make_pair(V, Ty));
1368
0
    if (H != CM.end())
1369
0
      return H->second;
1370
0
    Value *CV = IRB.CreateIntCast(V, Ty, false);
1371
0
    CM.insert(std::make_pair(std::make_pair(V, Ty), CV));
1372
0
    return CV;
1373
0
  };
1374
1
1375
14
  for (auto I = LoopB->begin(), E = LoopB->end(); 
I != E14
;
++I13
) {
1376
13
    using namespace PatternMatch;
1377
13
1378
13
    if (
isa<PHINode>(I) || 13
!Users.count(&*I)10
)
1379
6
      continue;
1380
7
1381
7
    // Match lshr x, 1.
1382
7
    Value *V = nullptr;
1383
7
    if (
match(&*I, m_LShr(m_Value(V), m_One()))7
) {
1384
2
      replaceAllUsesOfWithIn(&*I, V, LoopB);
1385
2
      continue;
1386
2
    }
1387
5
    // For each non-cycled operand, replace it with the corresponding
1388
5
    // value shifted left.
1389
5
    
for (auto &J : I->operands()) 5
{
1390
11
      Value *Op = J.get();
1391
11
      if (!isOperandShifted(&*I, Op))
1392
0
        continue;
1393
11
      
if (11
Users.count(Op)11
)
1394
8
        continue;
1395
3
      // Skip shifting zeros.
1396
3
      
if (3
isa<ConstantInt>(Op) && 3
cast<ConstantInt>(Op)->isZero()3
)
1397
0
        continue;
1398
3
      // Check if we have already generated a shift for this value.
1399
3
      auto F = ShiftMap.find(Op);
1400
3
      Value *W = (F != ShiftMap.end()) ? 
F->second1
:
nullptr2
;
1401
3
      if (
W == nullptr3
) {
1402
2
        IRB.SetInsertPoint(&*I);
1403
2
        // First, the shift amount will be CIV or CIV+1, depending on
1404
2
        // whether the value is early or late. Instead of creating CIV+1,
1405
2
        // do a single shift of the value.
1406
2
        Value *ShAmt = CIV, *ShVal = Op;
1407
2
        auto *VTy = cast<IntegerType>(ShVal->getType());
1408
2
        auto *ATy = cast<IntegerType>(ShAmt->getType());
1409
2
        if (Late.count(&*I))
1410
1
          ShVal = IRB.CreateShl(Op, ConstantInt::get(VTy, 1));
1411
2
        // Second, the types of the shifted value and the shift amount
1412
2
        // must match.
1413
2
        if (
VTy != ATy2
) {
1414
0
          if (VTy->getBitWidth() < ATy->getBitWidth())
1415
0
            ShVal = upcast(CastMap, IRB, ShVal, ATy);
1416
0
          else
1417
0
            ShAmt = upcast(CastMap, IRB, ShAmt, VTy);
1418
0
        }
1419
2
        // Ready to generate the shift and memoize it.
1420
2
        W = IRB.CreateShl(ShVal, ShAmt);
1421
2
        ShiftMap.insert(std::make_pair(Op, W));
1422
2
      }
1423
11
      I->replaceUsesOfWith(Op, W);
1424
11
    }
1425
13
  }
1426
1
1427
1
  // Update the users outside of the loop to account for having left
1428
1
  // shifts. They would normally be shifted right in the loop, so shift
1429
1
  // them right after the loop exit.
1430
1
  // Take advantage of the loop-closed SSA form, which has all the post-
1431
1
  // loop values in phi nodes.
1432
1
  IRB.SetInsertPoint(ExitB, ExitB->getFirstInsertionPt());
1433
2
  for (auto P = ExitB->begin(), Q = ExitB->end(); 
P != Q2
;
++P1
) {
1434
2
    if (!isa<PHINode>(P))
1435
1
      break;
1436
1
    auto *PN = cast<PHINode>(P);
1437
1
    Value *U = PN->getIncomingValueForBlock(LoopB);
1438
1
    if (!Users.count(U))
1439
0
      continue;
1440
1
    Value *S = IRB.CreateLShr(PN, ConstantInt::get(PN->getType(), IterCount));
1441
1
    PN->replaceAllUsesWith(S);
1442
1
    // The above RAUW will create
1443
1
    //   S = lshr S, IterCount
1444
1
    // so we need to fix it back into
1445
1
    //   S = lshr PN, IterCount
1446
1
    cast<User>(S)->replaceUsesOfWith(S, PN);
1447
1
  }
1448
1
1449
1
  return true;
1450
1
}
1451
1452
1
void PolynomialMultiplyRecognize::cleanupLoopBody(BasicBlock *LoopB) {
1453
1
  for (auto &I : *LoopB)
1454
15
    
if (Value *15
SV15
= SimplifyInstruction(&I, {DL, &TLI, &DT}))
1455
1
      I.replaceAllUsesWith(SV);
1456
1
1457
16
  for (auto I = LoopB->begin(), N = I; 
I != LoopB->end()16
;
I = N15
) {
1458
15
    N = std::next(I);
1459
15
    RecursivelyDeleteTriviallyDeadInstructions(&*I, &TLI);
1460
15
  }
1461
1
}
1462
1463
1
unsigned PolynomialMultiplyRecognize::getInverseMxN(unsigned QP) {
1464
1
  // Arrays of coefficients of Q and the inverse, C.
1465
1
  // Q[i] = coefficient at x^i.
1466
1
  std::array<char,32> Q, C;
1467
1
1468
33
  for (unsigned i = 0; 
i < 3233
;
++i32
) {
1469
32
    Q[i] = QP & 1;
1470
32
    QP >>= 1;
1471
32
  }
1472
1
  assert(Q[0] == 1);
1473
1
1474
1
  // Find C, such that
1475
1
  // (Q[n]*x^n + ... + Q[1]*x + Q[0]) * (C[n]*x^n + ... + C[1]*x + C[0]) = 1
1476
1
  //
1477
1
  // For it to have a solution, Q[0] must be 1. Since this is Z2[x], the
1478
1
  // operations * and + are & and ^ respectively.
1479
1
  //
1480
1
  // Find C[i] recursively, by comparing i-th coefficient in the product
1481
1
  // with 0 (or 1 for i=0).
1482
1
  //
1483
1
  // C[0] = 1, since C[0] = Q[0], and Q[0] = 1.
1484
1
  C[0] = 1;
1485
32
  for (unsigned i = 1; 
i < 3232
;
++i31
) {
1486
31
    // Solve for C[i] in:
1487
31
    //   C[0]Q[i] ^ C[1]Q[i-1] ^ ... ^ C[i-1]Q[1] ^ C[i]Q[0] = 0
1488
31
    // This is equivalent to
1489
31
    //   C[0]Q[i] ^ C[1]Q[i-1] ^ ... ^ C[i-1]Q[1] ^ C[i] = 0
1490
31
    // which is
1491
31
    //   C[0]Q[i] ^ C[1]Q[i-1] ^ ... ^ C[i-1]Q[1] = C[i]
1492
31
    unsigned T = 0;
1493
527
    for (unsigned j = 0; 
j < i527
;
++j496
)
1494
496
      T = T ^ (C[j] & Q[i-j]);
1495
31
    C[i] = T;
1496
31
  }
1497
1
1498
1
  unsigned QV = 0;
1499
33
  for (unsigned i = 0; 
i < 3233
;
++i32
)
1500
32
    
if (32
C[i]32
)
1501
32
      QV |= (1 << i);
1502
1
1503
1
  return QV;
1504
1
}
1505
1506
Value *PolynomialMultiplyRecognize::generate(BasicBlock::iterator At,
1507
2
      ParsedValues &PV) {
1508
2
  IRBuilder<> B(&*At);
1509
2
  Module *M = At->getParent()->getParent()->getParent();
1510
2
  Value *PMF = Intrinsic::getDeclaration(M, Intrinsic::hexagon_M4_pmpyw);
1511
2
1512
2
  Value *P = PV.P, *Q = PV.Q, *P0 = P;
1513
2
  unsigned IC = PV.IterCount;
1514
2
1515
2
  if (PV.M != nullptr)
1516
1
    P0 = P = B.CreateXor(P, PV.M);
1517
2
1518
2
  // Create a bit mask to clear the high bits beyond IterCount.
1519
2
  auto *BMI = ConstantInt::get(P->getType(), APInt::getLowBitsSet(32, IC));
1520
2
1521
2
  if (PV.IterCount != 32)
1522
1
    P = B.CreateAnd(P, BMI);
1523
2
1524
2
  if (
PV.Inv2
) {
1525
1
    auto *QI = dyn_cast<ConstantInt>(PV.Q);
1526
1
    assert(QI && QI->getBitWidth() <= 32);
1527
1
1528
1
    // Again, clearing bits beyond IterCount.
1529
1
    unsigned M = (1 << PV.IterCount) - 1;
1530
1
    unsigned Tmp = (QI->getZExtValue() | 1) & M;
1531
1
    unsigned QV = getInverseMxN(Tmp) & M;
1532
1
    auto *QVI = ConstantInt::get(QI->getType(), QV);
1533
1
    P = B.CreateCall(PMF, {P, QVI});
1534
1
    P = B.CreateTrunc(P, QI->getType());
1535
1
    if (IC != 32)
1536
1
      P = B.CreateAnd(P, BMI);
1537
1
  }
1538
2
1539
2
  Value *R = B.CreateCall(PMF, {P, Q});
1540
2
1541
2
  if (PV.M != nullptr)
1542
1
    R = B.CreateXor(R, B.CreateIntCast(P0, R->getType(), false));
1543
2
1544
2
  return R;
1545
2
}
1546
1547
6
void PolynomialMultiplyRecognize::setupSimplifier() {
1548
6
  Simp.addRule(
1549
6
    // Sink zext past bitwise operations.
1550
10.8k
    [](Instruction *I, LLVMContext &Ctx) -> Value* {
1551
10.8k
      if (I->getOpcode() != Instruction::ZExt)
1552
10.8k
        return nullptr;
1553
13
      Instruction *T = dyn_cast<Instruction>(I->getOperand(0));
1554
13
      if (!T)
1555
0
        return nullptr;
1556
13
      switch (T->getOpcode()) {
1557
4
        case Instruction::And:
1558
4
        case Instruction::Or:
1559
4
        case Instruction::Xor:
1560
4
          break;
1561
9
        default:
1562
9
          return nullptr;
1563
4
      }
1564
4
      IRBuilder<> B(Ctx);
1565
4
      return B.CreateBinOp(cast<BinaryOperator>(T)->getOpcode(),
1566
4
                           B.CreateZExt(T->getOperand(0), I->getType()),
1567
4
                           B.CreateZExt(T->getOperand(1), I->getType()));
1568
4
    });
1569
6
  Simp.addRule(
1570
6
    // (xor (and x a) (and y a)) -> (and (xor x y) a)
1571
10.8k
    [](Instruction *I, LLVMContext &Ctx) -> Value* {
1572
10.8k
      if (I->getOpcode() != Instruction::Xor)
1573
10.8k
        return nullptr;
1574
16
      Instruction *And0 = dyn_cast<Instruction>(I->getOperand(0));
1575
16
      Instruction *And1 = dyn_cast<Instruction>(I->getOperand(1));
1576
16
      if (
!And0 || 16
!And116
)
1577
7
        return nullptr;
1578
9
      
if (9
And0->getOpcode() != Instruction::And ||
1579
2
          And1->getOpcode() != Instruction::And)
1580
7
        return nullptr;
1581
2
      
if (2
And0->getOperand(1) != And1->getOperand(1)2
)
1582
0
        return nullptr;
1583
2
      IRBuilder<> B(Ctx);
1584
2
      return B.CreateAnd(B.CreateXor(And0->getOperand(0), And1->getOperand(0)),
1585
2
                         And0->getOperand(1));
1586
2
    });
1587
6
  Simp.addRule(
1588
6
    // (Op (select c x y) z) -> (select c (Op x z) (Op y z))
1589
6
    // (Op x (select c y z)) -> (select c (Op x y) (Op x z))
1590
10.8k
    [](Instruction *I, LLVMContext &Ctx) -> Value* {
1591
10.8k
      BinaryOperator *BO = dyn_cast<BinaryOperator>(I);
1592
10.8k
      if (!BO)
1593
5.82k
        return nullptr;
1594
5.06k
      Instruction::BinaryOps Op = BO->getOpcode();
1595
5.06k
      if (SelectInst *
Sel5.06k
= dyn_cast<SelectInst>(BO->getOperand(0))) {
1596
377
        IRBuilder<> B(Ctx);
1597
377
        Value *X = Sel->getTrueValue(), *Y = Sel->getFalseValue();
1598
377
        Value *Z = BO->getOperand(1);
1599
377
        return B.CreateSelect(Sel->getCondition(),
1600
377
                              B.CreateBinOp(Op, X, Z),
1601
377
                              B.CreateBinOp(Op, Y, Z));
1602
377
      }
1603
4.69k
      
if (SelectInst *4.69k
Sel4.69k
= dyn_cast<SelectInst>(BO->getOperand(1))) {
1604
39
        IRBuilder<> B(Ctx);
1605
39
        Value *X = BO->getOperand(0);
1606
39
        Value *Y = Sel->getTrueValue(), *Z = Sel->getFalseValue();
1607
39
        return B.CreateSelect(Sel->getCondition(),
1608
39
                              B.CreateBinOp(Op, X, Y),
1609
39
                              B.CreateBinOp(Op, X, Z));
1610
39
      }
1611
4.65k
      return nullptr;
1612
4.65k
    });
1613
6
  Simp.addRule(
1614
6
    // (select c (select c x y) z) -> (select c x z)
1615
6
    // (select c x (select c y z)) -> (select c x z)
1616
10.4k
    [](Instruction *I, LLVMContext &Ctx) -> Value* {
1617
10.4k
      SelectInst *Sel = dyn_cast<SelectInst>(I);
1618
10.4k
      if (!Sel)
1619
5.19k
        return nullptr;
1620
5.27k
      IRBuilder<> B(Ctx);
1621
5.27k
      Value *C = Sel->getCondition();
1622
5.27k
      if (SelectInst *
Sel05.27k
= dyn_cast<SelectInst>(Sel->getTrueValue())) {
1623
2.64k
        if (Sel0->getCondition() == C)
1624
13
          return B.CreateSelect(C, Sel0->getTrueValue(), Sel->getFalseValue());
1625
5.26k
      }
1626
5.26k
      
if (SelectInst *5.26k
Sel15.26k
= dyn_cast<SelectInst>(Sel->getFalseValue())) {
1627
2.70k
        if (Sel1->getCondition() == C)
1628
13
          return B.CreateSelect(C, Sel->getTrueValue(), Sel1->getFalseValue());
1629
5.24k
      }
1630
5.24k
      return nullptr;
1631
5.24k
    });
1632
6
  Simp.addRule(
1633
6
    // (or (lshr x 1) 0x800.0) -> (xor (lshr x 1) 0x800.0)
1634
10.4k
    [](Instruction *I, LLVMContext &Ctx) -> Value* {
1635
10.4k
      if (I->getOpcode() != Instruction::Or)
1636
10.4k
        return nullptr;
1637
1
      Instruction *LShr = dyn_cast<Instruction>(I->getOperand(0));
1638
1
      if (
!LShr || 1
LShr->getOpcode() != Instruction::LShr1
)
1639
0
        return nullptr;
1640
1
      ConstantInt *One = dyn_cast<ConstantInt>(LShr->getOperand(1));
1641
1
      if (
!One || 1
One->getZExtValue() != 11
)
1642
0
        return nullptr;
1643
1
      ConstantInt *Msb = dyn_cast<ConstantInt>(I->getOperand(1));
1644
1
      if (
!Msb || 1
Msb->getZExtValue() != Msb->getType()->getSignBit()1
)
1645
0
        return nullptr;
1646
1
      return IRBuilder<>(Ctx).CreateXor(LShr, Msb);
1647
1
    });
1648
6
  Simp.addRule(
1649
6
    // (lshr (BitOp x y) c) -> (BitOp (lshr x c) (lshr y c))
1650
10.4k
    [](Instruction *I, LLVMContext &Ctx) -> Value* {
1651
10.4k
      if (I->getOpcode() != Instruction::LShr)
1652
10.4k
        return nullptr;
1653
5
      BinaryOperator *BitOp = dyn_cast<BinaryOperator>(I->getOperand(0));
1654
5
      if (!BitOp)
1655
4
        return nullptr;
1656
1
      switch (BitOp->getOpcode()) {
1657
1
        case Instruction::And:
1658
1
        case Instruction::Or:
1659
1
        case Instruction::Xor:
1660
1
          break;
1661
0
        default:
1662
0
          return nullptr;
1663
1
      }
1664
1
      IRBuilder<> B(Ctx);
1665
1
      Value *S = I->getOperand(1);
1666
1
      return B.CreateBinOp(BitOp->getOpcode(),
1667
1
                B.CreateLShr(BitOp->getOperand(0), S),
1668
1
                B.CreateLShr(BitOp->getOperand(1), S));
1669
1
    });
1670
6
  Simp.addRule(
1671
6
    // (BitOp1 (BitOp2 x a) b) -> (BitOp2 x (BitOp1 a b))
1672
10.4k
    [](Instruction *I, LLVMContext &Ctx) -> Value* {
1673
4.65k
      auto IsBitOp = [](unsigned Op) -> bool {
1674
4.65k
        switch (Op) {
1675
23
          case Instruction::And:
1676
23
          case Instruction::Or:
1677
23
          case Instruction::Xor:
1678
23
            return true;
1679
4.63k
        }
1680
4.63k
        return false;
1681
4.63k
      };
1682
10.4k
      BinaryOperator *BitOp1 = dyn_cast<BinaryOperator>(I);
1683
10.4k
      if (
!BitOp1 || 10.4k
!IsBitOp(BitOp1->getOpcode())4.64k
)
1684
10.4k
        return nullptr;
1685
18
      BinaryOperator *BitOp2 = dyn_cast<BinaryOperator>(BitOp1->getOperand(0));
1686
18
      if (
!BitOp2 || 18
!IsBitOp(BitOp2->getOpcode())9
)
1687
13
        return nullptr;
1688
5
      ConstantInt *CA = dyn_cast<ConstantInt>(BitOp2->getOperand(1));
1689
5
      ConstantInt *CB = dyn_cast<ConstantInt>(BitOp1->getOperand(1));
1690
5
      if (
!CA || 5
!CB1
)
1691
4
        return nullptr;
1692
1
      IRBuilder<> B(Ctx);
1693
1
      Value *X = BitOp2->getOperand(0);
1694
1
      return B.CreateBinOp(BitOp2->getOpcode(), X,
1695
1
                B.CreateBinOp(BitOp1->getOpcode(), CA, CB));
1696
1
    });
1697
6
}
1698
1699
7
bool PolynomialMultiplyRecognize::recognize() {
1700
7
  DEBUG(dbgs() << "Starting PolynomialMultiplyRecognize on loop\n"
1701
7
               << *CurLoop << '\n');
1702
7
  // Restrictions:
1703
7
  // - The loop must consist of a single block.
1704
7
  // - The iteration count must be known at compile-time.
1705
7
  // - The loop must have an induction variable starting from 0, and
1706
7
  //   incremented in each iteration of the loop.
1707
7
  BasicBlock *LoopB = CurLoop->getHeader();
1708
7
  DEBUG(dbgs() << "Loop header:\n" << *LoopB);
1709
7
1710
7
  if (LoopB != CurLoop->getLoopLatch())
1711
1
    return false;
1712
6
  BasicBlock *ExitB = CurLoop->getExitBlock();
1713
6
  if (ExitB == nullptr)
1714
0
    return false;
1715
6
  BasicBlock *EntryB = CurLoop->getLoopPreheader();
1716
6
  if (EntryB == nullptr)
1717
0
    return false;
1718
6
1719
6
  unsigned IterCount = 0;
1720
6
  const SCEV *CT = SE.getBackedgeTakenCount(CurLoop);
1721
6
  if (isa<SCEVCouldNotCompute>(CT))
1722
0
    return false;
1723
6
  
if (auto *6
CV6
= dyn_cast<SCEVConstant>(CT))
1724
4
    IterCount = CV->getValue()->getZExtValue() + 1;
1725
6
1726
6
  Value *CIV = getCountIV(LoopB);
1727
6
  ParsedValues PV;
1728
6
  PV.IterCount = IterCount;
1729
6
  DEBUG(dbgs() << "Loop IV: " << *CIV << "\nIterCount: " << IterCount << '\n');
1730
6
1731
6
  setupSimplifier();
1732
6
1733
6
  // Perform a preliminary scan of select instructions to see if any of them
1734
6
  // looks like a generator of the polynomial multiply steps. Assume that a
1735
6
  // loop can only contain a single transformable operation, so stop the
1736
6
  // traversal after the first reasonable candidate was found.
1737
6
  // XXX: Currently this approach can modify the loop before being 100% sure
1738
6
  // that the transformation can be carried out.
1739
6
  bool FoundPreScan = false;
1740
118
  for (Instruction &In : *LoopB) {
1741
118
    SelectInst *SI = dyn_cast<SelectInst>(&In);
1742
118
    if (!SI)
1743
99
      continue;
1744
19
1745
19
    Simplifier::Context C(SI);
1746
19
    Value *T = Simp.simplify(C);
1747
19
    SelectInst *SelI = (T && 
isa<SelectInst>(T)18
) ?
cast<SelectInst>(T)18
:
SI1
;
1748
19
    DEBUG(dbgs() << "scanSelect(pre-scan): " << PE(C, SelI) << '\n');
1749
19
    if (
scanSelect(SelI, LoopB, EntryB, CIV, PV, true)19
) {
1750
2
      FoundPreScan = true;
1751
2
      if (
SelI != SI2
) {
1752
2
        Value *NewSel = C.materialize(LoopB, SI->getIterator());
1753
2
        SI->replaceAllUsesWith(NewSel);
1754
2
        RecursivelyDeleteTriviallyDeadInstructions(SI, &TLI);
1755
2
      }
1756
2
      break;
1757
2
    }
1758
6
  }
1759
6
1760
6
  if (
!FoundPreScan6
) {
1761
4
    DEBUG(dbgs() << "Have not found candidates for pmpy\n");
1762
4
    return false;
1763
4
  }
1764
2
1765
2
  
if (2
!PV.Left2
) {
1766
1
    // The right shift version actually only returns the higher bits of
1767
1
    // the result (each iteration discards the LSB). If we want to convert it
1768
1
    // to a left-shifting loop, the working data type must be at least as
1769
1
    // wide as the target's pmpy instruction.
1770
1
    if (!promoteTypes(LoopB, ExitB))
1771
0
      return false;
1772
1
    
if (1
!convertShiftsToLeft(LoopB, ExitB, IterCount)1
)
1773
0
      return false;
1774
1
    cleanupLoopBody(LoopB);
1775
1
  }
1776
2
1777
2
  // Scan the loop again, find the generating select instruction.
1778
2
  bool FoundScan = false;
1779
18
  for (Instruction &In : *LoopB) {
1780
18
    SelectInst *SelI = dyn_cast<SelectInst>(&In);
1781
18
    if (!SelI)
1782
16
      continue;
1783
2
    
DEBUG2
(dbgs() << "scanSelect: " << *SelI << '\n');
1784
2
    FoundScan = scanSelect(SelI, LoopB, EntryB, CIV, PV, false);
1785
2
    if (FoundScan)
1786
2
      break;
1787
2
  }
1788
2
  assert(FoundScan);
1789
2
1790
2
  DEBUG({
1791
2
    StringRef PP = (PV.M ? "(P+M)" : "P");
1792
2
    if (!PV.Inv)
1793
2
      dbgs() << "Found pmpy idiom: R = " << PP << ".Q\n";
1794
2
    else
1795
2
      dbgs() << "Found inverse pmpy idiom: R = (" << PP << "/Q).Q) + "
1796
2
             << PP << "\n";
1797
2
    dbgs() << "  Res:" << *PV.Res << "\n  P:" << *PV.P << "\n";
1798
2
    if (PV.M)
1799
2
      dbgs() << "  M:" << *PV.M << "\n";
1800
2
    dbgs() << "  Q:" << *PV.Q << "\n";
1801
2
    dbgs() << "  Iteration count:" << PV.IterCount << "\n";
1802
2
  });
1803
2
1804
2
  BasicBlock::iterator At(EntryB->getTerminator());
1805
2
  Value *PM = generate(At, PV);
1806
2
  if (PM == nullptr)
1807
0
    return false;
1808
2
1809
2
  
if (2
PM->getType() != PV.Res->getType()2
)
1810
1
    PM = IRBuilder<>(&*At).CreateIntCast(PM, PV.Res->getType(), false);
1811
7
1812
7
  PV.Res->replaceAllUsesWith(PM);
1813
7
  PV.Res->eraseFromParent();
1814
7
  return true;
1815
7
}
1816
1817
6
unsigned HexagonLoopIdiomRecognize::getStoreSizeInBytes(StoreInst *SI) {
1818
6
  uint64_t SizeInBits = DL->getTypeSizeInBits(SI->getValueOperand()->getType());
1819
6
  assert(((SizeInBits & 7) || (SizeInBits >> 32) == 0) &&
1820
6
         "Don't overflow unsigned.");
1821
6
  return (unsigned)SizeInBits >> 3;
1822
6
}
1823
1824
9
int HexagonLoopIdiomRecognize::getSCEVStride(const SCEVAddRecExpr *S) {
1825
9
  if (const SCEVConstant *SC = dyn_cast<SCEVConstant>(S->getOperand(1)))
1826
9
    return SC->getAPInt().getSExtValue();
1827
0
  return 0;
1828
0
}
1829
1830
3
bool HexagonLoopIdiomRecognize::isLegalStore(Loop *CurLoop, StoreInst *SI) {
1831
3
  // Allow volatile stores if HexagonVolatileMemcpy is enabled.
1832
3
  if (
!(SI->isVolatile() && 3
HexagonVolatileMemcpy0
) &&
!SI->isSimple()3
)
1833
0
    return false;
1834
3
1835
3
  Value *StoredVal = SI->getValueOperand();
1836
3
  Value *StorePtr = SI->getPointerOperand();
1837
3
1838
3
  // Reject stores that are so large that they overflow an unsigned.
1839
3
  uint64_t SizeInBits = DL->getTypeSizeInBits(StoredVal->getType());
1840
3
  if (
(SizeInBits & 7) || 3
(SizeInBits >> 32) != 03
)
1841
0
    return false;
1842
3
1843
3
  // See if the pointer expression is an AddRec like {base,+,1} on the current
1844
3
  // loop, which indicates a strided store.  If we have something else, it's a
1845
3
  // random store we can't handle.
1846
3
  auto *StoreEv = dyn_cast<SCEVAddRecExpr>(SE->getSCEV(StorePtr));
1847
3
  if (
!StoreEv || 3
StoreEv->getLoop() != CurLoop3
||
!StoreEv->isAffine()3
)
1848
0
    return false;
1849
3
1850
3
  // Check to see if the stride matches the size of the store.  If so, then we
1851
3
  // know that every byte is touched in the loop.
1852
3
  int Stride = getSCEVStride(StoreEv);
1853
3
  if (Stride == 0)
1854
0
    return false;
1855
3
  unsigned StoreSize = getStoreSizeInBytes(SI);
1856
3
  if (StoreSize != unsigned(std::abs(Stride)))
1857
0
    return false;
1858
3
1859
3
  // The store must be feeding a non-volatile load.
1860
3
  LoadInst *LI = dyn_cast<LoadInst>(SI->getValueOperand());
1861
3
  if (
!LI || 3
!LI->isSimple()3
)
1862
0
    return false;
1863
3
1864
3
  // See if the pointer expression is an AddRec like {base,+,1} on the current
1865
3
  // loop, which indicates a strided load.  If we have something else, it's a
1866
3
  // random load we can't handle.
1867
3
  Value *LoadPtr = LI->getPointerOperand();
1868
3
  auto *LoadEv = dyn_cast<SCEVAddRecExpr>(SE->getSCEV(LoadPtr));
1869
3
  if (
!LoadEv || 3
LoadEv->getLoop() != CurLoop3
||
!LoadEv->isAffine()3
)
1870
0
    return false;
1871
3
1872
3
  // The store and load must share the same stride.
1873
3
  
if (3
StoreEv->getOperand(1) != LoadEv->getOperand(1)3
)
1874
0
    return false;
1875
3
1876
3
  // Success.  This store can be converted into a memcpy.
1877
3
  return true;
1878
3
}
1879
1880
/// mayLoopAccessLocation - Return true if the specified loop might access the
1881
/// specified pointer location, which is a loop-strided access.  The 'Access'
1882
/// argument specifies what the verboten forms of access are (read or write).
1883
static bool
1884
mayLoopAccessLocation(Value *Ptr, ModRefInfo Access, Loop *L,
1885
                      const SCEV *BECount, unsigned StoreSize,
1886
                      AliasAnalysis &AA,
1887
9
                      SmallPtrSetImpl<Instruction *> &Ignored) {
1888
9
  // Get the location that may be stored across the loop.  Since the access
1889
9
  // is strided positively through memory, we say that the modified location
1890
9
  // starts at the pointer and has infinite size.
1891
9
  uint64_t AccessSize = MemoryLocation::UnknownSize;
1892
9
1893
9
  // If the loop iterates a fixed number of times, we can refine the access
1894
9
  // size to be exactly the size of the memset, which is (BECount+1)*StoreSize
1895
9
  if (const SCEVConstant *BECst = dyn_cast<SCEVConstant>(BECount))
1896
0
    AccessSize = (BECst->getValue()->getZExtValue() + 1) * StoreSize;
1897
9
1898
9
  // TODO: For this to be really effective, we have to dive into the pointer
1899
9
  // operand in the store.  Store to &A[i] of 100 will always return may alias
1900
9
  // with store of &A[100], we need to StoreLoc to be "A" with size of 100,
1901
9
  // which will then no-alias a store to &A[100].
1902
9
  MemoryLocation StoreLoc(Ptr, AccessSize);
1903
9
1904
9
  for (auto *B : L->blocks())
1905
11
    for (auto &I : *B)
1906
64
      
if (64
Ignored.count(&I) == 0 && 64
(AA.getModRefInfo(&I, StoreLoc) & Access)55
)
1907
3
        return true;
1908
6
1909
6
  return false;
1910
6
}
1911
1912
void HexagonLoopIdiomRecognize::collectStores(Loop *CurLoop, BasicBlock *BB,
1913
5
      SmallVectorImpl<StoreInst*> &Stores) {
1914
5
  Stores.clear();
1915
5
  for (Instruction &I : *BB)
1916
100
    
if (StoreInst *100
SI100
= dyn_cast<StoreInst>(&I))
1917
3
      
if (3
isLegalStore(CurLoop, SI)3
)
1918
3
        Stores.push_back(SI);
1919
5
}
1920
1921
bool HexagonLoopIdiomRecognize::processCopyingStore(Loop *CurLoop,
1922
3
      StoreInst *SI, const SCEV *BECount) {
1923
3
  assert((SI->isSimple() || (SI->isVolatile() && HexagonVolatileMemcpy)) &&
1924
3
         "Expected only non-volatile stores, or Hexagon-specific memcpy"
1925
3
         "to volatile destination.");
1926
3
1927
3
  Value *StorePtr = SI->getPointerOperand();
1928
3
  auto *StoreEv = cast<SCEVAddRecExpr>(SE->getSCEV(StorePtr));
1929
3
  unsigned Stride = getSCEVStride(StoreEv);
1930
3
  unsigned StoreSize = getStoreSizeInBytes(SI);
1931
3
  if (Stride != StoreSize)
1932
0
    return false;
1933
3
1934
3
  // See if the pointer expression is an AddRec like {base,+,1} on the current
1935
3
  // loop, which indicates a strided load.  If we have something else, it's a
1936
3
  // random load we can't handle.
1937
3
  LoadInst *LI = dyn_cast<LoadInst>(SI->getValueOperand());
1938
3
  auto *LoadEv = cast<SCEVAddRecExpr>(SE->getSCEV(LI->getPointerOperand()));
1939
3
1940
3
  // The trip count of the loop and the base pointer of the addrec SCEV is
1941
3
  // guaranteed to be loop invariant, which means that it should dominate the
1942
3
  // header.  This allows us to insert code for it in the preheader.
1943
3
  BasicBlock *Preheader = CurLoop->getLoopPreheader();
1944
3
  Instruction *ExpPt = Preheader->getTerminator();
1945
3
  IRBuilder<> Builder(ExpPt);
1946
3
  SCEVExpander Expander(*SE, *DL, "hexagon-loop-idiom");
1947
3
1948
3
  Type *IntPtrTy = Builder.getIntPtrTy(*DL, SI->getPointerAddressSpace());
1949
3
1950
3
  // Okay, we have a strided store "p[i]" of a loaded value.  We can turn
1951
3
  // this into a memcpy/memmove in the loop preheader now if we want.  However,
1952
3
  // this would be unsafe to do if there is anything else in the loop that may
1953
3
  // read or write the memory region we're storing to.  For memcpy, this
1954
3
  // includes the load that feeds the stores.  Check for an alias by generating
1955
3
  // the base address and checking everything.
1956
3
  Value *StoreBasePtr = Expander.expandCodeFor(StoreEv->getStart(),
1957
3
      Builder.getInt8PtrTy(SI->getPointerAddressSpace()), ExpPt);
1958
3
  Value *LoadBasePtr = nullptr;
1959
3
1960
3
  bool Overlap = false;
1961
3
  bool DestVolatile = SI->isVolatile();
1962
3
  Type *BECountTy = BECount->getType();
1963
3
1964
3
  if (
DestVolatile3
) {
1965
0
    // The trip count must fit in i32, since it is the type of the "num_words"
1966
0
    // argument to hexagon_memcpy_forward_vp4cp4n2.
1967
0
    if (
StoreSize != 4 || 0
DL->getTypeSizeInBits(BECountTy) > 320
) {
1968
0
CleanupAndExit:
1969
0
      // If we generated new code for the base pointer, clean up.
1970
0
      Expander.clear();
1971
0
      if (
StoreBasePtr && 0
(LoadBasePtr != StoreBasePtr)0
) {
1972
0
        RecursivelyDeleteTriviallyDeadInstructions(StoreBasePtr, TLI);
1973
0
        StoreBasePtr = nullptr;
1974
0
      }
1975
0
      if (
LoadBasePtr0
) {
1976
0
        RecursivelyDeleteTriviallyDeadInstructions(LoadBasePtr, TLI);
1977
0
        LoadBasePtr = nullptr;
1978
0
      }
1979
0
      return false;
1980
3
    }
1981
0
  }
1982
3
1983
3
  SmallPtrSet<Instruction*, 2> Ignore1;
1984
3
  Ignore1.insert(SI);
1985
3
  if (mayLoopAccessLocation(StoreBasePtr, MRI_ModRef, CurLoop, BECount,
1986
3
                            StoreSize, *AA, Ignore1)) {
1987
3
    // Check if the load is the offending instruction.
1988
3
    Ignore1.insert(LI);
1989
3
    if (mayLoopAccessLocation(StoreBasePtr, MRI_ModRef, CurLoop, BECount,
1990
3
                              StoreSize, *AA, Ignore1)) {
1991
0
      // Still bad. Nothing we can do.
1992
0
      goto CleanupAndExit;
1993
0
    }
1994
3
    // It worked with the load ignored.
1995
3
    Overlap = true;
1996
3
  }
1997
3
1998
3
  
if (3
!Overlap3
) {
1999
0
    if (
DisableMemcpyIdiom || 0
!HasMemcpy0
)
2000
0
      goto CleanupAndExit;
2001
3
  } else {
2002
3
    // Don't generate memmove if this function will be inlined. This is
2003
3
    // because the caller will undergo this transformation after inlining.
2004
3
    Function *Func = CurLoop->getHeader()->getParent();
2005
3
    if (Func->hasFnAttribute(Attribute::AlwaysInline))
2006
0
      goto CleanupAndExit;
2007
3
2008
3
    // In case of a memmove, the call to memmove will be executed instead
2009
3
    // of the loop, so we need to make sure that there is nothing else in
2010
3
    // the loop than the load, store and instructions that these two depend
2011
3
    // on.
2012
3
    SmallVector<Instruction*,2> Insts;
2013
3
    Insts.push_back(SI);
2014
3
    Insts.push_back(LI);
2015
3
    if (!coverLoop(CurLoop, Insts))
2016
0
      goto CleanupAndExit;
2017
3
2018
3
    
if (3
DisableMemmoveIdiom || 3
!HasMemmove3
)
2019
0
      goto CleanupAndExit;
2020
3
    bool IsNested = CurLoop->getParentLoop() != nullptr;
2021
3
    if (
IsNested && 3
OnlyNonNestedMemmove0
)
2022
0
      goto CleanupAndExit;
2023
3
  }
2024
3
2025
3
  // For a memcpy, we have to make sure that the input array is not being
2026
3
  // mutated by the loop.
2027
3
  LoadBasePtr = Expander.expandCodeFor(LoadEv->getStart(),
2028
3
      Builder.getInt8PtrTy(LI->getPointerAddressSpace()), ExpPt);
2029
3
2030
3
  SmallPtrSet<Instruction*, 2> Ignore2;
2031
3
  Ignore2.insert(SI);
2032
3
  if (mayLoopAccessLocation(LoadBasePtr, MRI_Mod, CurLoop, BECount, StoreSize,
2033
3
                            *AA, Ignore2))
2034
0
    goto CleanupAndExit;
2035
3
2036
3
  // Check the stride.
2037
3
  bool StridePos = getSCEVStride(LoadEv) >= 0;
2038
3
2039
3
  // Currently, the volatile memcpy only emulates traversing memory forward.
2040
3
  if (
!StridePos && 3
DestVolatile0
)
2041
0
    goto CleanupAndExit;
2042
3
2043
3
  
bool RuntimeCheck = (Overlap || 3
DestVolatile0
);
2044
3
2045
3
  BasicBlock *ExitB;
2046
3
  if (
RuntimeCheck3
) {
2047
3
    // The runtime check needs a single exit block.
2048
3
    SmallVector<BasicBlock*, 8> ExitBlocks;
2049
3
    CurLoop->getUniqueExitBlocks(ExitBlocks);
2050
3
    if (ExitBlocks.size() != 1)
2051
0
      goto CleanupAndExit;
2052
3
    ExitB = ExitBlocks[0];
2053
3
  }
2054
3
2055
3
  // The # stored bytes is (BECount+1)*Size.  Expand the trip count out to
2056
3
  // pointer size if it isn't already.
2057
3
  LLVMContext &Ctx = SI->getContext();
2058
3
  BECount = SE->getTruncateOrZeroExtend(BECount, IntPtrTy);
2059
3
  unsigned Alignment = std::min(SI->getAlignment(), LI->getAlignment());
2060
3
  DebugLoc DLoc = SI->getDebugLoc();
2061
3
2062
3
  const SCEV *NumBytesS =
2063
3
      SE->getAddExpr(BECount, SE->getOne(IntPtrTy), SCEV::FlagNUW);
2064
3
  if (StoreSize != 1)
2065
3
    NumBytesS = SE->getMulExpr(NumBytesS, SE->getConstant(IntPtrTy, StoreSize),
2066
3
                               SCEV::FlagNUW);
2067
3
  Value *NumBytes = Expander.expandCodeFor(NumBytesS, IntPtrTy, ExpPt);
2068
3
  if (Instruction *In = dyn_cast<Instruction>(NumBytes))
2069
3
    
if (Value *3
Simp3
= SimplifyInstruction(In, {*DL, TLI, DT}))
2070
0
      NumBytes = Simp;
2071
3
2072
3
  CallInst *NewCall;
2073
3
2074
3
  if (
RuntimeCheck3
) {
2075
3
    unsigned Threshold = RuntimeMemSizeThreshold;
2076
3
    if (ConstantInt *
CI3
= dyn_cast<ConstantInt>(NumBytes)) {
2077
0
      uint64_t C = CI->getZExtValue();
2078
0
      if (
Threshold != 0 && 0
C < Threshold0
)
2079
0
        goto CleanupAndExit;
2080
0
      
if (0
C < CompileTimeMemSizeThreshold0
)
2081
0
        goto CleanupAndExit;
2082
3
    }
2083
3
2084
3
    BasicBlock *Header = CurLoop->getHeader();
2085
3
    Function *Func = Header->getParent();
2086
3
    Loop *ParentL = LF->getLoopFor(Preheader);
2087
3
    StringRef HeaderName = Header->getName();
2088
3
2089
3
    // Create a new (empty) preheader, and update the PHI nodes in the
2090
3
    // header to use the new preheader.
2091
3
    BasicBlock *NewPreheader = BasicBlock::Create(Ctx, HeaderName+".rtli.ph",
2092
3
                                                  Func, Header);
2093
3
    if (ParentL)
2094
0
      ParentL->addBasicBlockToLoop(NewPreheader, *LF);
2095
3
    IRBuilder<>(NewPreheader).CreateBr(Header);
2096
8
    for (auto &In : *Header) {
2097
8
      PHINode *PN = dyn_cast<PHINode>(&In);
2098
8
      if (!PN)
2099
3
        break;
2100
5
      int bx = PN->getBasicBlockIndex(Preheader);
2101
5
      if (bx >= 0)
2102
5
        PN->setIncomingBlock(bx, NewPreheader);
2103
8
    }
2104
3
    DT->addNewBlock(NewPreheader, Preheader);
2105
3
    DT->changeImmediateDominator(Header, NewPreheader);
2106
3
2107
3
    // Check for safe conditions to execute memmove.
2108
3
    // If stride is positive, copying things from higher to lower addresses
2109
3
    // is equivalent to memmove.  For negative stride, it's the other way
2110
3
    // around.  Copying forward in memory with positive stride may not be
2111
3
    // same as memmove since we may be copying values that we just stored
2112
3
    // in some previous iteration.
2113
3
    Value *LA = Builder.CreatePtrToInt(LoadBasePtr, IntPtrTy);
2114
3
    Value *SA = Builder.CreatePtrToInt(StoreBasePtr, IntPtrTy);
2115
3
    Value *LowA = StridePos ? 
SA3
:
LA0
;
2116
3
    Value *HighA = StridePos ? 
LA3
:
SA0
;
2117
3
    Value *CmpA = Builder.CreateICmpULT(LowA, HighA);
2118
3
    Value *Cond = CmpA;
2119
3
2120
3
    // Check for distance between pointers. Since the case LowA < HighA
2121
3
    // is checked for above, assume LowA >= HighA.
2122
3
    Value *Dist = Builder.CreateSub(LowA, HighA);
2123
3
    Value *CmpD = Builder.CreateICmpSLE(NumBytes, Dist);
2124
3
    Value *CmpEither = Builder.CreateOr(Cond, CmpD);
2125
3
    Cond = CmpEither;
2126
3
2127
3
    if (
Threshold != 03
) {
2128
0
      Type *Ty = NumBytes->getType();
2129
0
      Value *Thr = ConstantInt::get(Ty, Threshold);
2130
0
      Value *CmpB = Builder.CreateICmpULT(Thr, NumBytes);
2131
0
      Value *CmpBoth = Builder.CreateAnd(Cond, CmpB);
2132
0
      Cond = CmpBoth;
2133
0
    }
2134
3
    BasicBlock *MemmoveB = BasicBlock::Create(Ctx, Header->getName()+".rtli",
2135
3
                                              Func, NewPreheader);
2136
3
    if (ParentL)
2137
0
      ParentL->addBasicBlockToLoop(MemmoveB, *LF);
2138
3
    Instruction *OldT = Preheader->getTerminator();
2139
3
    Builder.CreateCondBr(Cond, MemmoveB, NewPreheader);
2140
3
    OldT->eraseFromParent();
2141
3
    Preheader->setName(Preheader->getName()+".old");
2142
3
    DT->addNewBlock(MemmoveB, Preheader);
2143
3
    // Find the new immediate dominator of the exit block.
2144
3
    BasicBlock *ExitD = Preheader;
2145
6
    for (auto PI = pred_begin(ExitB), PE = pred_end(ExitB); 
PI != PE6
;
++PI3
) {
2146
3
      BasicBlock *PB = *PI;
2147
3
      ExitD = DT->findNearestCommonDominator(ExitD, PB);
2148
3
      if (!ExitD)
2149
0
        break;
2150
3
    }
2151
3
    // If the prior immediate dominator of ExitB was dominated by the
2152
3
    // old preheader, then the old preheader becomes the new immediate
2153
3
    // dominator.  Otherwise don't change anything (because the newly
2154
3
    // added blocks are dominated by the old preheader).
2155
3
    if (
ExitD && 3
DT->dominates(Preheader, ExitD)3
) {
2156
3
      DomTreeNode *BN = DT->getNode(ExitB);
2157
3
      DomTreeNode *DN = DT->getNode(ExitD);
2158
3
      BN->setIDom(DN);
2159
3
    }
2160
3
2161
3
    // Add a call to memmove to the conditional block.
2162
3
    IRBuilder<> CondBuilder(MemmoveB);
2163
3
    CondBuilder.CreateBr(ExitB);
2164
3
    CondBuilder.SetInsertPoint(MemmoveB->getTerminator());
2165
3
2166
3
    if (
DestVolatile3
) {
2167
0
      Type *Int32Ty = Type::getInt32Ty(Ctx);
2168
0
      Type *Int32PtrTy = Type::getInt32PtrTy(Ctx);
2169
0
      Type *VoidTy = Type::getVoidTy(Ctx);
2170
0
      Module *M = Func->getParent();
2171
0
      Constant *CF = M->getOrInsertFunction(HexagonVolatileMemcpyName, VoidTy,
2172
0
                                            Int32PtrTy, Int32PtrTy, Int32Ty);
2173
0
      Function *Fn = cast<Function>(CF);
2174
0
      Fn->setLinkage(Function::ExternalLinkage);
2175
0
2176
0
      const SCEV *OneS = SE->getConstant(Int32Ty, 1);
2177
0
      const SCEV *BECount32 = SE->getTruncateOrZeroExtend(BECount, Int32Ty);
2178
0
      const SCEV *NumWordsS = SE->getAddExpr(BECount32, OneS, SCEV::FlagNUW);
2179
0
      Value *NumWords = Expander.expandCodeFor(NumWordsS, Int32Ty,
2180
0
                                               MemmoveB->getTerminator());
2181
0
      if (Instruction *In = dyn_cast<Instruction>(NumWords))
2182
0
        
if (Value *0
Simp0
= SimplifyInstruction(In, {*DL, TLI, DT}))
2183
0
          NumWords = Simp;
2184
0
2185
0
      Value *Op0 = (StoreBasePtr->getType() == Int32PtrTy)
2186
0
                      ? StoreBasePtr
2187
0
                      : CondBuilder.CreateBitCast(StoreBasePtr, Int32PtrTy);
2188
0
      Value *Op1 = (LoadBasePtr->getType() == Int32PtrTy)
2189
0
                      ? LoadBasePtr
2190
0
                      : CondBuilder.CreateBitCast(LoadBasePtr, Int32PtrTy);
2191
0
      NewCall = CondBuilder.CreateCall(Fn, {Op0, Op1, NumWords});
2192
3
    } else {
2193
3
      NewCall = CondBuilder.CreateMemMove(StoreBasePtr, LoadBasePtr,
2194
3
                                          NumBytes, Alignment);
2195
3
    }
2196
0
  } else {
2197
0
    NewCall = Builder.CreateMemCpy(StoreBasePtr, LoadBasePtr,
2198
0
                                   NumBytes, Alignment);
2199
0
    // Okay, the memcpy has been formed.  Zap the original store and
2200
0
    // anything that feeds into it.
2201
0
    RecursivelyDeleteTriviallyDeadInstructions(SI, TLI);
2202
0
  }
2203
3
2204
3
  NewCall->setDebugLoc(DLoc);
2205
3
2206
3
  DEBUG(dbgs() << "  Formed " << (Overlap ? "memmove: " : "memcpy: ")
2207
3
               << *NewCall << "\n"
2208
3
               << "    from load ptr=" << *LoadEv << " at: " << *LI << "\n"
2209
3
               << "    from store ptr=" << *StoreEv << " at: " << *SI << "\n");
2210
3
2211
3
  return true;
2212
3
}
2213
2214
// \brief Check if the instructions in Insts, together with their dependencies
2215
// cover the loop in the sense that the loop could be safely eliminated once
2216
// the instructions in Insts are removed.
2217
bool HexagonLoopIdiomRecognize::coverLoop(Loop *L,
2218
3
      SmallVectorImpl<Instruction*> &Insts) const {
2219
3
  SmallSet<BasicBlock*,8> LoopBlocks;
2220
3
  for (auto *B : L->blocks())
2221
4
    LoopBlocks.insert(B);
2222
3
2223
3
  SetVector<Instruction*> Worklist(Insts.begin(), Insts.end());
2224
3
2225
3
  // Collect all instructions from the loop that the instructions in Insts
2226
3
  // depend on (plus their dependencies, etc.).  These instructions will
2227
3
  // constitute the expression trees that feed those in Insts, but the trees
2228
3
  // will be limited only to instructions contained in the loop.
2229
21
  for (unsigned i = 0; 
i < Worklist.size()21
;
++i18
) {
2230
18
    Instruction *In = Worklist[i];
2231
51
    for (auto I = In->op_begin(), E = In->op_end(); 
I != E51
;
++I33
) {
2232
33
      Instruction *OpI = dyn_cast<Instruction>(I);
2233
33
      if (!OpI)
2234
8
        continue;
2235
25
      BasicBlock *PB = OpI->getParent();
2236
25
      if (!LoopBlocks.count(PB))
2237
4
        continue;
2238
21
      Worklist.insert(OpI);
2239
21
    }
2240
18
  }
2241
3
2242
3
  // Scan all instructions in the loop, if any of them have a user outside
2243
3
  // of the loop, or outside of the expressions collected above, then either
2244
3
  // the loop has a side-effect visible outside of it, or there are
2245
3
  // instructions in it that are not involved in the original set Insts.
2246
4
  for (auto *B : L->blocks()) {
2247
27
    for (auto &In : *B) {
2248
27
      if (
isa<BranchInst>(In) || 27
isa<DbgInfoIntrinsic>(In)23
)
2249
4
        continue;
2250
23
      
if (23
!Worklist.count(&In) && 23
In.mayHaveSideEffects()5
)
2251
0
        return false;
2252
23
      
for (const auto &K : In.users()) 23
{
2253
29
        Instruction *UseI = dyn_cast<Instruction>(K);
2254
29
        if (!UseI)
2255
0
          continue;
2256
29
        BasicBlock *UseB = UseI->getParent();
2257
29
        if (LF->getLoopFor(UseB) != L)
2258
0
          return false;
2259
3
      }
2260
27
    }
2261
4
  }
2262
3
2263
3
  return true;
2264
3
}
2265
2266
/// runOnLoopBlock - Process the specified block, which lives in a counted loop
2267
/// with the specified backedge count.  This block is known to be in the current
2268
/// loop and not in any subloops.
2269
bool HexagonLoopIdiomRecognize::runOnLoopBlock(Loop *CurLoop, BasicBlock *BB,
2270
6
      const SCEV *BECount, SmallVectorImpl<BasicBlock*> &ExitBlocks) {
2271
6
  // We can only promote stores in this block if they are unconditionally
2272
6
  // executed in the loop.  For a block to be unconditionally executed, it has
2273
6
  // to dominate all the exit blocks of the loop.  Verify this now.
2274
6
  auto DominatedByBB = [this,BB] (BasicBlock *EB) -> bool {
2275
6
    return DT->dominates(BB, EB);
2276
6
  };
2277
6
  if (!std::all_of(ExitBlocks.begin(), ExitBlocks.end(), DominatedByBB))
2278
1
    return false;
2279
5
2280
5
  bool MadeChange = false;
2281
5
  // Look for store instructions, which may be optimized to memset/memcpy.
2282
5
  SmallVector<StoreInst*,8> Stores;
2283
5
  collectStores(CurLoop, BB, Stores);
2284
5
2285
5
  // Optimize the store into a memcpy, if it feeds an similarly strided load.
2286
5
  for (auto &SI : Stores)
2287
3
    MadeChange |= processCopyingStore(CurLoop, SI, BECount);
2288
6
2289
6
  return MadeChange;
2290
6
}
2291
2292
7
bool HexagonLoopIdiomRecognize::runOnCountableLoop(Loop *L) {
2293
7
  PolynomialMultiplyRecognize PMR(L, *DL, *DT, *TLI, *SE);
2294
7
  if (PMR.recognize())
2295
2
    return true;
2296
5
2297
5
  
if (5
!HasMemcpy && 5
!HasMemmove0
)
2298
0
    return false;
2299
5
2300
5
  const SCEV *BECount = SE->getBackedgeTakenCount(L);
2301
5
  assert(!isa<SCEVCouldNotCompute>(BECount) &&
2302
5
         "runOnCountableLoop() called on a loop without a predictable"
2303
5
         "backedge-taken count");
2304
5
2305
5
  SmallVector<BasicBlock *, 8> ExitBlocks;
2306
5
  L->getUniqueExitBlocks(ExitBlocks);
2307
5
2308
5
  bool Changed = false;
2309
5
2310
5
  // Scan all the blocks in the loop that are not in subloops.
2311
6
  for (auto *BB : L->getBlocks()) {
2312
6
    // Ignore blocks in subloops.
2313
6
    if (LF->getLoopFor(BB) != L)
2314
0
      continue;
2315
6
    Changed |= runOnLoopBlock(L, BB, BECount, ExitBlocks);
2316
6
  }
2317
7
2318
7
  return Changed;
2319
7
}
2320
2321
8
bool HexagonLoopIdiomRecognize::runOnLoop(Loop *L, LPPassManager &LPM) {
2322
8
  const Module &M = *L->getHeader()->getParent()->getParent();
2323
8
  if (Triple(M.getTargetTriple()).getArch() != Triple::hexagon)
2324
0
    return false;
2325
8
2326
8
  
if (8
skipLoop(L)8
)
2327
0
    return false;
2328
8
2329
8
  // If the loop could not be converted to canonical form, it must have an
2330
8
  // indirectbr in it, just give up.
2331
8
  
if (8
!L->getLoopPreheader()8
)
2332
0
    return false;
2333
8
2334
8
  // Disable loop idiom recognition if the function's name is a common idiom.
2335
8
  StringRef Name = L->getHeader()->getParent()->getName();
2336
8
  if (
Name == "memset" || 8
Name == "memcpy"8
||
Name == "memmove"8
)
2337
0
    return false;
2338
8
2339
8
  AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
2340
8
  DL = &L->getHeader()->getModule()->getDataLayout();
2341
8
  DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
2342
8
  LF = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
2343
8
  TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
2344
8
  SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
2345
8
2346
8
  HasMemcpy = TLI->has(LibFunc_memcpy);
2347
8
  HasMemmove = TLI->has(LibFunc_memmove);
2348
8
2349
8
  if (SE->hasLoopInvariantBackedgeTakenCount(L))
2350
7
    return runOnCountableLoop(L);
2351
1
  return false;
2352
1
}
2353
2354
8
Pass *llvm::createHexagonLoopIdiomPass() {
2355
8
  return new HexagonLoopIdiomRecognize();
2356
8
}