Coverage Report

Created: 2019-07-24 05:18

/Users/buildslave/jenkins/workspace/clang-stage2-coverage-R/llvm/lib/Transforms/Scalar/MergeICmps.cpp
Line
Count
Source (jump to first uncovered line)
1
//===- MergeICmps.cpp - Optimize chains of integer comparisons ------------===//
2
//
3
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4
// See https://llvm.org/LICENSE.txt for license information.
5
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6
//
7
//===----------------------------------------------------------------------===//
8
//
9
// This pass turns chains of integer comparisons into memcmp (the memcmp is
10
// later typically inlined as a chain of efficient hardware comparisons). This
11
// typically benefits c++ member or nonmember operator==().
12
//
13
// The basic idea is to replace a longer chain of integer comparisons loaded
14
// from contiguous memory locations into a shorter chain of larger integer
15
// comparisons. Benefits are double:
16
//  - There are less jumps, and therefore less opportunities for mispredictions
17
//    and I-cache misses.
18
//  - Code size is smaller, both because jumps are removed and because the
19
//    encoding of a 2*n byte compare is smaller than that of two n-byte
20
//    compares.
21
//
22
// Example:
23
//
24
//  struct S {
25
//    int a;
26
//    char b;
27
//    char c;
28
//    uint16_t d;
29
//    bool operator==(const S& o) const {
30
//      return a == o.a && b == o.b && c == o.c && d == o.d;
31
//    }
32
//  };
33
//
34
//  Is optimized as :
35
//
36
//    bool S::operator==(const S& o) const {
37
//      return memcmp(this, &o, 8) == 0;
38
//    }
39
//
40
//  Which will later be expanded (ExpandMemCmp) as a single 8-bytes icmp.
41
//
42
//===----------------------------------------------------------------------===//
43
44
#include "llvm/Transforms/Scalar/MergeICmps.h"
45
#include "llvm/Analysis/DomTreeUpdater.h"
46
#include "llvm/Analysis/GlobalsModRef.h"
47
#include "llvm/Analysis/Loads.h"
48
#include "llvm/Analysis/TargetLibraryInfo.h"
49
#include "llvm/Analysis/TargetTransformInfo.h"
50
#include "llvm/IR/Dominators.h"
51
#include "llvm/IR/Function.h"
52
#include "llvm/IR/IRBuilder.h"
53
#include "llvm/Pass.h"
54
#include "llvm/Transforms/Scalar.h"
55
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
56
#include "llvm/Transforms/Utils/BuildLibCalls.h"
57
#include <algorithm>
58
#include <numeric>
59
#include <utility>
60
#include <vector>
61
62
using namespace llvm;
63
64
namespace {
65
66
#define DEBUG_TYPE "mergeicmps"
67
68
// Returns true if the instruction is a simple load or a simple store
69
3
static bool isSimpleLoadOrStore(const Instruction *I) {
70
3
  if (const LoadInst *LI = dyn_cast<LoadInst>(I))
71
0
    return LI->isSimple();
72
3
  if (const StoreInst *SI = dyn_cast<StoreInst>(I))
73
1
    return SI->isSimple();
74
2
  return false;
75
2
}
76
77
// A BCE atom "Binary Compare Expression Atom" represents an integer load
78
// that is a constant offset from a base value, e.g. `a` or `o.c` in the example
79
// at the top.
80
struct BCEAtom {
81
45
  BCEAtom() = default;
82
  BCEAtom(GetElementPtrInst *GEP, LoadInst *LoadI, int BaseId, APInt Offset)
83
82
      : GEP(GEP), LoadI(LoadI), BaseId(BaseId), Offset(Offset) {}
84
85
  BCEAtom(const BCEAtom &) = delete;
86
  BCEAtom &operator=(const BCEAtom &) = delete;
87
88
400
  BCEAtom(BCEAtom &&that) = default;
89
20
  BCEAtom &operator=(BCEAtom &&that) {
90
20
    if (this == &that)
91
0
      return *this;
92
20
    GEP = that.GEP;
93
20
    LoadI = that.LoadI;
94
20
    BaseId = that.BaseId;
95
20
    Offset = std::move(that.Offset);
96
20
    return *this;
97
20
  }
98
99
  // We want to order BCEAtoms by (Base, Offset). However we cannot use
100
  // the pointer values for Base because these are non-deterministic.
101
  // To make sure that the sort order is stable, we first assign to each atom
102
  // base value an index based on its order of appearance in the chain of
103
  // comparisons. We call this index `BaseOrdering`. For example, for:
104
  //    b[3] == c[2] && a[1] == d[1] && b[4] == c[3]
105
  //    |  block 1 |    |  block 2 |    |  block 3 |
106
  // b gets assigned index 0 and a index 1, because b appears as LHS in block 1,
107
  // which is before block 2.
108
  // We then sort by (BaseOrdering[LHS.Base()], LHS.Offset), which is stable.
109
88
  bool operator<(const BCEAtom &O) const {
110
88
    return BaseId != O.BaseId ? 
BaseId < O.BaseId41
:
Offset.slt(O.Offset)47
;
111
88
  }
112
113
  GetElementPtrInst *GEP = nullptr;
114
  LoadInst *LoadI = nullptr;
115
  unsigned BaseId = 0;
116
  APInt Offset;
117
};
118
119
// A class that assigns increasing ids to values in the order in which they are
120
// seen. See comment in `BCEAtom::operator<()``.
121
class BaseIdentifier {
122
public:
123
  // Returns the id for value `Base`, after assigning one if `Base` has not been
124
  // seen before.
125
82
  int getBaseId(const Value *Base) {
126
82
    assert(Base && "invalid base");
127
82
    const auto Insertion = BaseToIndex.try_emplace(Base, Order);
128
82
    if (Insertion.second)
129
32
      ++Order;
130
82
    return Insertion.first->second;
131
82
  }
132
133
private:
134
  unsigned Order = 1;
135
  DenseMap<const Value*, int> BaseToIndex;
136
};
137
138
// If this value is a load from a constant offset w.r.t. a base address, and
139
// there are no other users of the load or address, returns the base address and
140
// the offset.
141
91
BCEAtom visitICmpLoadOperand(Value *const Val, BaseIdentifier &BaseId) {
142
91
  auto *const LoadI = dyn_cast<LoadInst>(Val);
143
91
  if (!LoadI)
144
5
    return {};
145
86
  LLVM_DEBUG(dbgs() << "load\n");
146
86
  if (LoadI->isUsedOutsideOfBlock(LoadI->getParent())) {
147
0
    LLVM_DEBUG(dbgs() << "used outside of block\n");
148
0
    return {};
149
0
  }
150
86
  // Do not optimize atomic loads to non-atomic memcmp
151
86
  if (!LoadI->isSimple()) {
152
2
    LLVM_DEBUG(dbgs() << "volatile or atomic\n");
153
2
    return {};
154
2
  }
155
84
  Value *const Addr = LoadI->getOperand(0);
156
84
  auto *const GEP = dyn_cast<GetElementPtrInst>(Addr);
157
84
  if (!GEP)
158
0
    return {};
159
84
  LLVM_DEBUG(dbgs() << "GEP\n");
160
84
  if (GEP->isUsedOutsideOfBlock(LoadI->getParent())) {
161
2
    LLVM_DEBUG(dbgs() << "used outside of block\n");
162
2
    return {};
163
2
  }
164
82
  const auto &DL = GEP->getModule()->getDataLayout();
165
82
  if (!isDereferenceablePointer(GEP, LoadI->getType(), DL)) {
166
0
    LLVM_DEBUG(dbgs() << "not dereferenceable\n");
167
0
    // We need to make sure that we can do comparison in any order, so we
168
0
    // require memory to be unconditionnally dereferencable.
169
0
    return {};
170
0
  }
171
82
  APInt Offset = APInt(DL.getPointerTypeSizeInBits(GEP->getType()), 0);
172
82
  if (!GEP->accumulateConstantOffset(DL, Offset))
173
0
    return {};
174
82
  return BCEAtom(GEP, LoadI, BaseId.getBaseId(GEP->getPointerOperand()),
175
82
                 Offset);
176
82
}
177
178
// A basic block with a comparison between two BCE atoms, e.g. `a == o.a` in the
179
// example at the top.
180
// The block might do extra work besides the atom comparison, in which case
181
// doesOtherWork() returns true. Under some conditions, the block can be
182
// split into the atom comparison part and the "other work" part
183
// (see canSplit()).
184
// Note: the terminology is misleading: the comparison is symmetric, so there
185
// is no real {l/r}hs. What we want though is to have the same base on the
186
// left (resp. right), so that we can detect consecutive loads. To ensure this
187
// we put the smallest atom on the left.
188
class BCECmpBlock {
189
 public:
190
18
  BCECmpBlock() {}
191
192
  BCECmpBlock(BCEAtom L, BCEAtom R, int SizeBits)
193
41
      : Lhs_(std::move(L)), Rhs_(std::move(R)), SizeBits_(SizeBits) {
194
41
    if (Rhs_ < Lhs_) 
std::swap(Rhs_, Lhs_)0
;
195
41
  }
196
197
100
  bool IsValid() const { return Lhs_.BaseId != 0 && 
Rhs_.BaseId != 082
; }
198
199
  // Assert the block is consistent: If valid, it should also have
200
  // non-null members besides Lhs_ and Rhs_.
201
41
  void AssertConsistent() const {
202
41
    if (IsValid()) {
203
41
      assert(BB);
204
41
      assert(CmpI);
205
41
      assert(BranchI);
206
41
    }
207
41
  }
208
209
185
  const BCEAtom &Lhs() const { return Lhs_; }
210
177
  const BCEAtom &Rhs() const { return Rhs_; }
211
84
  int SizeBits() const { return SizeBits_; }
212
213
  // Returns true if the block does other works besides comparison.
214
  bool doesOtherWork() const;
215
216
  // Returns true if the non-BCE-cmp instructions can be separated from BCE-cmp
217
  // instructions in the block.
218
  bool canSplit(AliasAnalysis &AA) const;
219
220
  // Return true if this all the relevant instructions in the BCE-cmp-block can
221
  // be sunk below this instruction. By doing this, we know we can separate the
222
  // BCE-cmp-block instructions from the non-BCE-cmp-block instructions in the
223
  // block.
224
  bool canSinkBCECmpInst(const Instruction *, DenseSet<Instruction *> &,
225
                         AliasAnalysis &AA) const;
226
227
  // We can separate the BCE-cmp-block instructions and the non-BCE-cmp-block
228
  // instructions. Split the old block and move all non-BCE-cmp-insts into the
229
  // new parent block.
230
  void split(BasicBlock *NewParent, AliasAnalysis &AA) const;
231
232
  // The basic block where this comparison happens.
233
  BasicBlock *BB = nullptr;
234
  // The ICMP for this comparison.
235
  ICmpInst *CmpI = nullptr;
236
  // The terminating branch.
237
  BranchInst *BranchI = nullptr;
238
  // The block requires splitting.
239
  bool RequireSplit = false;
240
241
private:
242
  BCEAtom Lhs_;
243
  BCEAtom Rhs_;
244
  int SizeBits_ = 0;
245
};
246
247
bool BCECmpBlock::canSinkBCECmpInst(const Instruction *Inst,
248
                                    DenseSet<Instruction *> &BlockInsts,
249
10
                                    AliasAnalysis &AA) const {
250
10
  // If this instruction has side effects and its in middle of the BCE cmp block
251
10
  // instructions, then bail for now.
252
10
  if (Inst->mayHaveSideEffects()) {
253
3
    // Bail if this is not a simple load or store
254
3
    if (!isSimpleLoadOrStore(Inst))
255
2
      return false;
256
1
    // Disallow stores that might alias the BCE operands
257
1
    MemoryLocation LLoc = MemoryLocation::get(Lhs_.LoadI);
258
1
    MemoryLocation RLoc = MemoryLocation::get(Rhs_.LoadI);
259
1
    if (isModSet(AA.getModRefInfo(Inst, LLoc)) ||
260
1
        isModSet(AA.getModRefInfo(Inst, RLoc)))
261
0
      return false;
262
8
  }
263
8
  // Make sure this instruction does not use any of the BCE cmp block
264
8
  // instructions as operand.
265
39
  
for (auto BI : BlockInsts)8
{
266
39
    if (is_contained(Inst->operands(), BI))
267
2
      return false;
268
39
  }
269
8
  
return true6
;
270
8
}
271
272
2
void BCECmpBlock::split(BasicBlock *NewParent, AliasAnalysis &AA) const {
273
2
  DenseSet<Instruction *> BlockInsts(
274
2
      {Lhs_.GEP, Rhs_.GEP, Lhs_.LoadI, Rhs_.LoadI, CmpI, BranchI});
275
2
  llvm::SmallVector<Instruction *, 4> OtherInsts;
276
15
  for (Instruction &Inst : *BB) {
277
15
    if (BlockInsts.count(&Inst))
278
12
      continue;
279
3
      assert(canSinkBCECmpInst(&Inst, BlockInsts, AA) &&
280
3
             "Split unsplittable block");
281
3
    // This is a non-BCE-cmp-block instruction. And it can be separated
282
3
    // from the BCE-cmp-block instruction.
283
3
    OtherInsts.push_back(&Inst);
284
3
  }
285
2
286
2
  // Do the actual spliting.
287
3
  for (Instruction *Inst : reverse(OtherInsts)) {
288
3
    Inst->moveBefore(&*NewParent->begin());
289
3
  }
290
2
}
291
292
8
bool BCECmpBlock::canSplit(AliasAnalysis &AA) const {
293
8
  DenseSet<Instruction *> BlockInsts(
294
8
      {Lhs_.GEP, Rhs_.GEP, Lhs_.LoadI, Rhs_.LoadI, CmpI, BranchI});
295
50
  for (Instruction &Inst : *BB) {
296
50
    if (!BlockInsts.count(&Inst)) {
297
10
      if (!canSinkBCECmpInst(&Inst, BlockInsts, AA))
298
4
        return false;
299
10
    }
300
50
  }
301
8
  
return true4
;
302
8
}
303
304
41
bool BCECmpBlock::doesOtherWork() const {
305
41
  AssertConsistent();
306
41
  // All the instructions we care about in the BCE cmp block.
307
41
  DenseSet<Instruction *> BlockInsts(
308
41
      {Lhs_.GEP, Rhs_.GEP, Lhs_.LoadI, Rhs_.LoadI, CmpI, BranchI});
309
41
  // TODO(courbet): Can we allow some other things ? This is very conservative.
310
41
  // We might be able to get away with anything does not have any side
311
41
  // effects outside of the basic block.
312
41
  // Note: The GEPs and/or loads are not necessarily in the same block.
313
230
  for (const Instruction &Inst : *BB) {
314
230
    if (!BlockInsts.count(&Inst))
315
8
      return true;
316
230
  }
317
41
  
return false33
;
318
41
}
319
320
// Visit the given comparison. If this is a comparison between two valid
321
// BCE atoms, returns the comparison.
322
BCECmpBlock visitICmp(const ICmpInst *const CmpI,
323
                      const ICmpInst::Predicate ExpectedPredicate,
324
53
                      BaseIdentifier &BaseId) {
325
53
  // The comparison can only be used once:
326
53
  //  - For intermediate blocks, as a branch condition.
327
53
  //  - For the final block, as an incoming value for the Phi.
328
53
  // If there are any other uses of the comparison, we cannot merge it with
329
53
  // other comparisons as we would create an orphan use of the value.
330
53
  if (!CmpI->hasOneUse()) {
331
2
    LLVM_DEBUG(dbgs() << "cmp has several uses\n");
332
2
    return {};
333
2
  }
334
51
  if (CmpI->getPredicate() != ExpectedPredicate)
335
1
    return {};
336
50
  LLVM_DEBUG(dbgs() << "cmp "
337
50
                    << (ExpectedPredicate == ICmpInst::ICMP_EQ ? "eq" : "ne")
338
50
                    << "\n");
339
50
  auto Lhs = visitICmpLoadOperand(CmpI->getOperand(0), BaseId);
340
50
  if (!Lhs.BaseId)
341
9
    return {};
342
41
  auto Rhs = visitICmpLoadOperand(CmpI->getOperand(1), BaseId);
343
41
  if (!Rhs.BaseId)
344
0
    return {};
345
41
  const auto &DL = CmpI->getModule()->getDataLayout();
346
41
  return BCECmpBlock(std::move(Lhs), std::move(Rhs),
347
41
                     DL.getTypeSizeInBits(CmpI->getOperand(0)->getType()));
348
41
}
349
350
// Visit the given comparison block. If this is a comparison between two valid
351
// BCE atoms, returns the comparison.
352
BCECmpBlock visitCmpBlock(Value *const Val, BasicBlock *const Block,
353
                          const BasicBlock *const PhiBlock,
354
59
                          BaseIdentifier &BaseId) {
355
59
  if (Block->empty()) 
return {}0
;
356
59
  auto *const BranchI = dyn_cast<BranchInst>(Block->getTerminator());
357
59
  if (!BranchI) 
return {}0
;
358
59
  LLVM_DEBUG(dbgs() << "branch\n");
359
59
  if (BranchI->isUnconditional()) {
360
16
    // In this case, we expect an incoming value which is the result of the
361
16
    // comparison. This is the last link in the chain of comparisons (note
362
16
    // that this does not mean that this is the last incoming value, blocks
363
16
    // can be reordered).
364
16
    auto *const CmpI = dyn_cast<ICmpInst>(Val);
365
16
    if (!CmpI) 
return {}0
;
366
16
    LLVM_DEBUG(dbgs() << "icmp\n");
367
16
    auto Result = visitICmp(CmpI, ICmpInst::ICMP_EQ, BaseId);
368
16
    Result.CmpI = CmpI;
369
16
    Result.BranchI = BranchI;
370
16
    return Result;
371
43
  } else {
372
43
    // In this case, we expect a constant incoming value (the comparison is
373
43
    // chained).
374
43
    const auto *const Const = dyn_cast<ConstantInt>(Val);
375
43
    LLVM_DEBUG(dbgs() << "const\n");
376
43
    if (!Const->isZero()) 
return {}6
;
377
37
    LLVM_DEBUG(dbgs() << "false\n");
378
37
    auto *const CmpI = dyn_cast<ICmpInst>(BranchI->getCondition());
379
37
    if (!CmpI) 
return {}0
;
380
37
    LLVM_DEBUG(dbgs() << "icmp\n");
381
37
    assert(BranchI->getNumSuccessors() == 2 && "expecting a cond branch");
382
37
    BasicBlock *const FalseBlock = BranchI->getSuccessor(1);
383
37
    auto Result = visitICmp(
384
37
        CmpI, FalseBlock == PhiBlock ? 
ICmpInst::ICMP_EQ36
:
ICmpInst::ICMP_NE1
,
385
37
        BaseId);
386
37
    Result.CmpI = CmpI;
387
37
    Result.BranchI = BranchI;
388
37
    return Result;
389
37
  }
390
0
  return {};
391
0
}
392
393
static inline void enqueueBlock(std::vector<BCECmpBlock> &Comparisons,
394
37
                                BCECmpBlock &&Comparison) {
395
37
  LLVM_DEBUG(dbgs() << "Block '" << Comparison.BB->getName()
396
37
                    << "': Found cmp of " << Comparison.SizeBits()
397
37
                    << " bits between " << Comparison.Lhs().BaseId << " + "
398
37
                    << Comparison.Lhs().Offset << " and "
399
37
                    << Comparison.Rhs().BaseId << " + "
400
37
                    << Comparison.Rhs().Offset << "\n");
401
37
  LLVM_DEBUG(dbgs() << "\n");
402
37
  Comparisons.push_back(std::move(Comparison));
403
37
}
404
405
// A chain of comparisons.
406
class BCECmpChain {
407
 public:
408
   BCECmpChain(const std::vector<BasicBlock *> &Blocks, PHINode &Phi,
409
               AliasAnalysis &AA);
410
411
31
   int size() const { return Comparisons_.size(); }
412
413
#ifdef MERGEICMPS_DOT_ON
414
  void dump() const;
415
#endif  // MERGEICMPS_DOT_ON
416
417
  bool simplify(const TargetLibraryInfo &TLI, AliasAnalysis &AA,
418
                DomTreeUpdater &DTU);
419
420
private:
421
  static bool IsContiguous(const BCECmpBlock &First,
422
31
                           const BCECmpBlock &Second) {
423
31
    return First.Lhs().BaseId == Second.Lhs().BaseId &&
424
31
           First.Rhs().BaseId == Second.Rhs().BaseId &&
425
31
           First.Lhs().Offset + First.SizeBits() / 8 == Second.Lhs().Offset &&
426
31
           
First.Rhs().Offset + First.SizeBits() / 8 == Second.Rhs().Offset27
;
427
31
  }
428
429
  PHINode &Phi_;
430
  std::vector<BCECmpBlock> Comparisons_;
431
  // The original entry block (before sorting);
432
  BasicBlock *EntryBlock_;
433
};
434
435
BCECmpChain::BCECmpChain(const std::vector<BasicBlock *> &Blocks, PHINode &Phi,
436
                         AliasAnalysis &AA)
437
31
    : Phi_(Phi) {
438
31
  assert(!Blocks.empty() && "a chain should have at least one block");
439
31
  // Now look inside blocks to check for BCE comparisons.
440
31
  std::vector<BCECmpBlock> Comparisons;
441
31
  BaseIdentifier BaseId;
442
72
  for (size_t BlockIdx = 0; BlockIdx < Blocks.size(); 
++BlockIdx41
) {
443
59
    BasicBlock *const Block = Blocks[BlockIdx];
444
59
    assert(Block && "invalid block");
445
59
    BCECmpBlock Comparison = visitCmpBlock(Phi.getIncomingValueForBlock(Block),
446
59
                                           Block, Phi.getParent(), BaseId);
447
59
    Comparison.BB = Block;
448
59
    if (!Comparison.IsValid()) {
449
18
      LLVM_DEBUG(dbgs() << "chain with invalid BCECmpBlock, no merge.\n");
450
18
      return;
451
18
    }
452
41
    if (Comparison.doesOtherWork()) {
453
8
      LLVM_DEBUG(dbgs() << "block '" << Comparison.BB->getName()
454
8
                        << "' does extra work besides compare\n");
455
8
      if (Comparisons.empty()) {
456
8
        // This is the initial block in the chain, in case this block does other
457
8
        // work, we can try to split the block and move the irrelevant
458
8
        // instructions to the predecessor.
459
8
        //
460
8
        // If this is not the initial block in the chain, splitting it wont
461
8
        // work.
462
8
        //
463
8
        // As once split, there will still be instructions before the BCE cmp
464
8
        // instructions that do other work in program order, i.e. within the
465
8
        // chain before sorting. Unless we can abort the chain at this point
466
8
        // and start anew.
467
8
        //
468
8
        // NOTE: we only handle blocks a with single predecessor for now.
469
8
        if (Comparison.canSplit(AA)) {
470
4
          LLVM_DEBUG(dbgs()
471
4
                     << "Split initial block '" << Comparison.BB->getName()
472
4
                     << "' that does extra work besides compare\n");
473
4
          Comparison.RequireSplit = true;
474
4
          enqueueBlock(Comparisons, std::move(Comparison));
475
4
        } else {
476
4
          LLVM_DEBUG(dbgs()
477
4
                     << "ignoring initial block '" << Comparison.BB->getName()
478
4
                     << "' that does extra work besides compare\n");
479
4
        }
480
8
        continue;
481
8
      }
482
0
      // TODO(courbet): Right now we abort the whole chain. We could be
483
0
      // merging only the blocks that don't do other work and resume the
484
0
      // chain from there. For example:
485
0
      //  if (a[0] == b[0]) {  // bb1
486
0
      //    if (a[1] == b[1]) {  // bb2
487
0
      //      some_value = 3; //bb3
488
0
      //      if (a[2] == b[2]) { //bb3
489
0
      //        do a ton of stuff  //bb4
490
0
      //      }
491
0
      //    }
492
0
      //  }
493
0
      //
494
0
      // This is:
495
0
      //
496
0
      // bb1 --eq--> bb2 --eq--> bb3* -eq--> bb4 --+
497
0
      //  \            \           \               \
498
0
      //   ne           ne          ne              \
499
0
      //    \            \           \               v
500
0
      //     +------------+-----------+----------> bb_phi
501
0
      //
502
0
      // We can only merge the first two comparisons, because bb3* does
503
0
      // "other work" (setting some_value to 3).
504
0
      // We could still merge bb1 and bb2 though.
505
0
      return;
506
0
    }
507
33
    enqueueBlock(Comparisons, std::move(Comparison));
508
33
  }
509
31
510
31
  // It is possible we have no suitable comparison to merge.
511
31
  
if (13
Comparisons.empty()13
) {
512
1
    LLVM_DEBUG(dbgs() << "chain with no BCE basic blocks, no merge\n");
513
1
    return;
514
1
  }
515
12
  EntryBlock_ = Comparisons[0].BB;
516
12
  Comparisons_ = std::move(Comparisons);
517
#ifdef MERGEICMPS_DOT_ON
518
  errs() << "BEFORE REORDERING:\n\n";
519
  dump();
520
#endif  // MERGEICMPS_DOT_ON
521
  // Reorder blocks by LHS. We can do that without changing the
522
12
  // semantics because we are only accessing dereferencable memory.
523
12
  llvm::sort(Comparisons_,
524
23
             [](const BCECmpBlock &LhsBlock, const BCECmpBlock &RhsBlock) {
525
23
               return std::tie(LhsBlock.Lhs(), LhsBlock.Rhs()) <
526
23
                      std::tie(RhsBlock.Lhs(), RhsBlock.Rhs());
527
23
             });
528
#ifdef MERGEICMPS_DOT_ON
529
  errs() << "AFTER REORDERING:\n\n";
530
  dump();
531
#endif  // MERGEICMPS_DOT_ON
532
}
533
534
#ifdef MERGEICMPS_DOT_ON
535
void BCECmpChain::dump() const {
536
  errs() << "digraph dag {\n";
537
  errs() << " graph [bgcolor=transparent];\n";
538
  errs() << " node [color=black,style=filled,fillcolor=lightyellow];\n";
539
  errs() << " edge [color=black];\n";
540
  for (size_t I = 0; I < Comparisons_.size(); ++I) {
541
    const auto &Comparison = Comparisons_[I];
542
    errs() << " \"" << I << "\" [label=\"%"
543
           << Comparison.Lhs().Base()->getName() << " + "
544
           << Comparison.Lhs().Offset << " == %"
545
           << Comparison.Rhs().Base()->getName() << " + "
546
           << Comparison.Rhs().Offset << " (" << (Comparison.SizeBits() / 8)
547
           << " bytes)\"];\n";
548
    const Value *const Val = Phi_.getIncomingValueForBlock(Comparison.BB);
549
    if (I > 0) errs() << " \"" << (I - 1) << "\" -> \"" << I << "\";\n";
550
    errs() << " \"" << I << "\" -> \"Phi\" [label=\"" << *Val << "\"];\n";
551
  }
552
  errs() << " \"Phi\" [label=\"Phi\"];\n";
553
  errs() << "}\n\n";
554
}
555
#endif  // MERGEICMPS_DOT_ON
556
557
namespace {
558
559
// A class to compute the name of a set of merged basic blocks.
560
// This is optimized for the common case of no block names.
561
class MergedBlockName {
562
  // Storage for the uncommon case of several named blocks.
563
  SmallString<16> Scratch;
564
565
public:
566
  explicit MergedBlockName(ArrayRef<BCECmpBlock> Comparisons)
567
13
      : Name(makeName(Comparisons)) {}
568
  const StringRef Name;
569
570
private:
571
13
  StringRef makeName(ArrayRef<BCECmpBlock> Comparisons) {
572
13
    assert(!Comparisons.empty() && "no basic block");
573
13
    // Fast path: only one block, or no names at all.
574
13
    if (Comparisons.size() == 1)
575
2
      return Comparisons[0].BB->getName();
576
11
    const int size = std::accumulate(Comparisons.begin(), Comparisons.end(), 0,
577
26
                                     [](int i, const BCECmpBlock &Cmp) {
578
26
                                       return i + Cmp.BB->getName().size();
579
26
                                     });
580
11
    if (size == 0)
581
0
      return StringRef("", 0);
582
11
583
11
    // Slow path: at least two blocks, at least one block with a name.
584
11
    Scratch.clear();
585
11
    // We'll have `size` bytes for name and `Comparisons.size() - 1` bytes for
586
11
    // separators.
587
11
    Scratch.reserve(size + Comparisons.size() - 1);
588
41
    const auto append = [this](StringRef str) {
589
41
      Scratch.append(str.begin(), str.end());
590
41
    };
591
11
    append(Comparisons[0].BB->getName());
592
26
    for (int I = 1, E = Comparisons.size(); I < E; 
++I15
) {
593
15
      const BasicBlock *const BB = Comparisons[I].BB;
594
15
      if (!BB->getName().empty()) {
595
15
        append("+");
596
15
        append(BB->getName());
597
15
      }
598
15
    }
599
11
    return StringRef(Scratch);
600
11
  }
601
};
602
} // namespace
603
604
// Merges the given contiguous comparison blocks into one memcmp block.
605
static BasicBlock *mergeComparisons(ArrayRef<BCECmpBlock> Comparisons,
606
                                    BasicBlock *const InsertBefore,
607
                                    BasicBlock *const NextCmpBlock,
608
                                    PHINode &Phi, const TargetLibraryInfo &TLI,
609
13
                                    AliasAnalysis &AA, DomTreeUpdater &DTU) {
610
13
  assert(!Comparisons.empty() && "merging zero comparisons");
611
13
  LLVMContext &Context = NextCmpBlock->getContext();
612
13
  const BCECmpBlock &FirstCmp = Comparisons[0];
613
13
614
13
  // Create a new cmp block before next cmp block.
615
13
  BasicBlock *const BB =
616
13
      BasicBlock::Create(Context, MergedBlockName(Comparisons).Name,
617
13
                         NextCmpBlock->getParent(), InsertBefore);
618
13
  IRBuilder<> Builder(BB);
619
13
  // Add the GEPs from the first BCECmpBlock.
620
13
  Value *const Lhs = Builder.Insert(FirstCmp.Lhs().GEP->clone());
621
13
  Value *const Rhs = Builder.Insert(FirstCmp.Rhs().GEP->clone());
622
13
623
13
  Value *IsEqual = nullptr;
624
13
  LLVM_DEBUG(dbgs() << "Merging " << Comparisons.size() << " comparisons -> "
625
13
                    << BB->getName() << "\n");
626
13
  if (Comparisons.size() == 1) {
627
2
    LLVM_DEBUG(dbgs() << "Only one comparison, updating branches\n");
628
2
    Value *const LhsLoad =
629
2
        Builder.CreateLoad(FirstCmp.Lhs().LoadI->getType(), Lhs);
630
2
    Value *const RhsLoad =
631
2
        Builder.CreateLoad(FirstCmp.Rhs().LoadI->getType(), Rhs);
632
2
    // There are no blocks to merge, just do the comparison.
633
2
    IsEqual = Builder.CreateICmpEQ(LhsLoad, RhsLoad);
634
11
  } else {
635
11
    // If there is one block that requires splitting, we do it now, i.e.
636
11
    // just before we know we will collapse the chain. The instructions
637
11
    // can be executed before any of the instructions in the chain.
638
11
    const auto ToSplit =
639
11
        std::find_if(Comparisons.begin(), Comparisons.end(),
640
20
                     [](const BCECmpBlock &B) { return B.RequireSplit; });
641
11
    if (ToSplit != Comparisons.end()) {
642
2
      LLVM_DEBUG(dbgs() << "Splitting non_BCE work to header\n");
643
2
      ToSplit->split(BB, AA);
644
2
    }
645
11
646
11
    const unsigned TotalSizeBits = std::accumulate(
647
11
        Comparisons.begin(), Comparisons.end(), 0u,
648
26
        [](int Size, const BCECmpBlock &C) { return Size + C.SizeBits(); });
649
11
650
11
    // Create memcmp() == 0.
651
11
    const auto &DL = Phi.getModule()->getDataLayout();
652
11
    Value *const MemCmpCall = emitMemCmp(
653
11
        Lhs, Rhs,
654
11
        ConstantInt::get(DL.getIntPtrType(Context), TotalSizeBits / 8), Builder,
655
11
        DL, &TLI);
656
11
    IsEqual = Builder.CreateICmpEQ(
657
11
        MemCmpCall, ConstantInt::get(Type::getInt32Ty(Context), 0));
658
11
  }
659
13
660
13
  BasicBlock *const PhiBB = Phi.getParent();
661
13
  // Add a branch to the next basic block in the chain.
662
13
  if (NextCmpBlock == PhiBB) {
663
11
    // Continue to phi, passing it the comparison result.
664
11
    Builder.CreateBr(PhiBB);
665
11
    Phi.addIncoming(IsEqual, BB);
666
11
    DTU.applyUpdates({{DominatorTree::Insert, BB, PhiBB}});
667
11
  } else {
668
2
    // Continue to next block if equal, exit to phi else.
669
2
    Builder.CreateCondBr(IsEqual, NextCmpBlock, PhiBB);
670
2
    Phi.addIncoming(ConstantInt::getFalse(Context), BB);
671
2
    DTU.applyUpdates({{DominatorTree::Insert, BB, NextCmpBlock},
672
2
                      {DominatorTree::Insert, BB, PhiBB}});
673
2
  }
674
13
  return BB;
675
13
}
676
677
bool BCECmpChain::simplify(const TargetLibraryInfo &TLI, AliasAnalysis &AA,
678
12
                           DomTreeUpdater &DTU) {
679
12
  assert(Comparisons_.size() >= 2 && "simplifying trivial BCECmpChain");
680
12
  // First pass to check if there is at least one merge. If not, we don't do
681
12
  // anything and we keep analysis passes intact.
682
12
  const auto AtLeastOneMerged = [this]() {
683
15
    for (size_t I = 1; I < Comparisons_.size(); 
++I3
) {
684
14
      if (IsContiguous(Comparisons_[I - 1], Comparisons_[I]))
685
11
        return true;
686
14
    }
687
12
    
return false1
;
688
12
  };
689
12
  if (!AtLeastOneMerged())
690
1
    return false;
691
11
692
11
  LLVM_DEBUG(dbgs() << "Simplifying comparison chain starting at block "
693
11
                    << EntryBlock_->getName() << "\n");
694
11
695
11
  // Effectively merge blocks. We go in the reverse direction from the phi block
696
11
  // so that the next block is always available to branch to.
697
11
  const auto mergeRange = [this, &TLI, &AA, &DTU](int I, int Num,
698
11
                                                  BasicBlock *InsertBefore,
699
13
                                                  BasicBlock *Next) {
700
13
    return mergeComparisons(makeArrayRef(Comparisons_).slice(I, Num),
701
13
                            InsertBefore, Next, Phi_, TLI, AA, DTU);
702
13
  };
703
11
  int NumMerged = 1;
704
11
  BasicBlock *NextCmpBlock = Phi_.getParent();
705
28
  for (int I = static_cast<int>(Comparisons_.size()) - 2; I >= 0; 
--I17
) {
706
17
    if (IsContiguous(Comparisons_[I], Comparisons_[I + 1])) {
707
15
      LLVM_DEBUG(dbgs() << "Merging block " << Comparisons_[I].BB->getName()
708
15
                        << " into " << Comparisons_[I + 1].BB->getName()
709
15
                        << "\n");
710
15
      ++NumMerged;
711
15
    } else {
712
2
      NextCmpBlock = mergeRange(I + 1, NumMerged, NextCmpBlock, NextCmpBlock);
713
2
      NumMerged = 1;
714
2
    }
715
17
  }
716
11
  // Insert the entry block for the new chain before the old entry block.
717
11
  // If the old entry block was the function entry, this ensures that the new
718
11
  // entry can become the function entry.
719
11
  NextCmpBlock = mergeRange(0, NumMerged, EntryBlock_, NextCmpBlock);
720
11
721
11
  // Replace the original cmp chain with the new cmp chain by pointing all
722
11
  // predecessors of EntryBlock_ to NextCmpBlock instead. This makes all cmp
723
11
  // blocks in the old chain unreachable.
724
14
  while (!pred_empty(EntryBlock_)) {
725
3
    BasicBlock* const Pred = *pred_begin(EntryBlock_);
726
3
    LLVM_DEBUG(dbgs() << "Updating jump into old chain from " << Pred->getName()
727
3
                      << "\n");
728
3
    Pred->getTerminator()->replaceUsesOfWith(EntryBlock_, NextCmpBlock);
729
3
    DTU.applyUpdates({{DominatorTree::Delete, Pred, EntryBlock_},
730
3
                      {DominatorTree::Insert, Pred, NextCmpBlock}});
731
3
  }
732
11
733
11
  // If the old cmp chain was the function entry, we need to update the function
734
11
  // entry.
735
11
  const bool ChainEntryIsFnEntry =
736
11
      (EntryBlock_ == &EntryBlock_->getParent()->getEntryBlock());
737
11
  if (ChainEntryIsFnEntry && 
DTU.hasDomTree()0
) {
738
0
    LLVM_DEBUG(dbgs() << "Changing function entry from "
739
0
                      << EntryBlock_->getName() << " to "
740
0
                      << NextCmpBlock->getName() << "\n");
741
0
    DTU.getDomTree().setNewRoot(NextCmpBlock);
742
0
    DTU.applyUpdates({{DominatorTree::Delete, NextCmpBlock, EntryBlock_}});
743
0
  }
744
11
  EntryBlock_ = nullptr;
745
11
746
11
  // Delete merged blocks. This also removes incoming values in phi.
747
11
  SmallVector<BasicBlock *, 16> DeadBlocks;
748
28
  for (auto &Cmp : Comparisons_) {
749
28
    LLVM_DEBUG(dbgs() << "Deleting merged block " << Cmp.BB->getName() << "\n");
750
28
    DeadBlocks.push_back(Cmp.BB);
751
28
  }
752
11
  DeleteDeadBlocks(DeadBlocks, &DTU);
753
11
754
11
  Comparisons_.clear();
755
11
  return true;
756
11
}
757
758
std::vector<BasicBlock *> getOrderedBlocks(PHINode &Phi,
759
                                           BasicBlock *const LastBlock,
760
32
                                           int NumBlocks) {
761
32
  // Walk up from the last block to find other blocks.
762
32
  std::vector<BasicBlock *> Blocks(NumBlocks);
763
32
  assert(LastBlock && "invalid last block");
764
32
  BasicBlock *CurBlock = LastBlock;
765
81
  for (int BlockIndex = NumBlocks - 1; BlockIndex > 0; 
--BlockIndex49
) {
766
50
    if (CurBlock->hasAddressTaken()) {
767
0
      // Somebody is jumping to the block through an address, all bets are
768
0
      // off.
769
0
      LLVM_DEBUG(dbgs() << "skip: block " << BlockIndex
770
0
                        << " has its address taken\n");
771
0
      return {};
772
0
    }
773
50
    Blocks[BlockIndex] = CurBlock;
774
50
    auto *SinglePredecessor = CurBlock->getSinglePredecessor();
775
50
    if (!SinglePredecessor) {
776
0
      // The block has two or more predecessors.
777
0
      LLVM_DEBUG(dbgs() << "skip: block " << BlockIndex
778
0
                        << " has two or more predecessors\n");
779
0
      return {};
780
0
    }
781
50
    if (Phi.getBasicBlockIndex(SinglePredecessor) < 0) {
782
1
      // The block does not link back to the phi.
783
1
      LLVM_DEBUG(dbgs() << "skip: block " << BlockIndex
784
1
                        << " does not link back to the phi\n");
785
1
      return {};
786
1
    }
787
49
    CurBlock = SinglePredecessor;
788
49
  }
789
32
  Blocks[0] = CurBlock;
790
31
  return Blocks;
791
32
}
792
793
bool processPhi(PHINode &Phi, const TargetLibraryInfo &TLI, AliasAnalysis &AA,
794
4.16k
                DomTreeUpdater &DTU) {
795
4.16k
  LLVM_DEBUG(dbgs() << "processPhi()\n");
796
4.16k
  if (Phi.getNumIncomingValues() <= 1) {
797
187
    LLVM_DEBUG(dbgs() << "skip: only one incoming value in phi\n");
798
187
    return false;
799
187
  }
800
3.97k
  // We are looking for something that has the following structure:
801
3.97k
  //   bb1 --eq--> bb2 --eq--> bb3 --eq--> bb4 --+
802
3.97k
  //     \            \           \               \
803
3.97k
  //      ne           ne          ne              \
804
3.97k
  //       \            \           \               v
805
3.97k
  //        +------------+-----------+----------> bb_phi
806
3.97k
  //
807
3.97k
  //  - The last basic block (bb4 here) must branch unconditionally to bb_phi.
808
3.97k
  //    It's the only block that contributes a non-constant value to the Phi.
809
3.97k
  //  - All other blocks (b1, b2, b3) must have exactly two successors, one of
810
3.97k
  //    them being the phi block.
811
3.97k
  //  - All intermediate blocks (bb2, bb3) must have only one predecessor.
812
3.97k
  //  - Blocks cannot do other work besides the comparison, see doesOtherWork()
813
3.97k
814
3.97k
  // The blocks are not necessarily ordered in the phi, so we start from the
815
3.97k
  // last block and reconstruct the order.
816
3.97k
  BasicBlock *LastBlock = nullptr;
817
5.28k
  for (unsigned I = 0; I < Phi.getNumIncomingValues(); 
++I1.30k
) {
818
5.10k
    if (isa<ConstantInt>(Phi.getIncomingValue(I))) 
continue1.27k
;
819
3.83k
    if (LastBlock) {
820
5
      // There are several non-constant values.
821
5
      LLVM_DEBUG(dbgs() << "skip: several non-constant values\n");
822
5
      return false;
823
5
    }
824
3.83k
    if (!isa<ICmpInst>(Phi.getIncomingValue(I)) ||
825
3.83k
        cast<ICmpInst>(Phi.getIncomingValue(I))->getParent() !=
826
3.79k
            Phi.getIncomingBlock(I)) {
827
3.79k
      // Non-constant incoming value is not from a cmp instruction or not
828
3.79k
      // produced by the last block. We could end up processing the value
829
3.79k
      // producing block more than once.
830
3.79k
      //
831
3.79k
      // This is an uncommon case, so we bail.
832
3.79k
      LLVM_DEBUG(
833
3.79k
          dbgs()
834
3.79k
          << "skip: non-constant value not from cmp or not from last block.\n");
835
3.79k
      return false;
836
3.79k
    }
837
37
    LastBlock = Phi.getIncomingBlock(I);
838
37
  }
839
3.97k
  
if (181
!LastBlock181
) {
840
149
    // There is no non-constant block.
841
149
    LLVM_DEBUG(dbgs() << "skip: no non-constant block\n");
842
149
    return false;
843
149
  }
844
32
  if (LastBlock->getSingleSuccessor() != Phi.getParent()) {
845
0
    LLVM_DEBUG(dbgs() << "skip: last block non-phi successor\n");
846
0
    return false;
847
0
  }
848
32
849
32
  const auto Blocks =
850
32
      getOrderedBlocks(Phi, LastBlock, Phi.getNumIncomingValues());
851
32
  if (Blocks.empty()) 
return false1
;
852
31
  BCECmpChain CmpChain(Blocks, Phi, AA);
853
31
854
31
  if (CmpChain.size() < 2) {
855
19
    LLVM_DEBUG(dbgs() << "skip: only one compare block\n");
856
19
    return false;
857
19
  }
858
12
859
12
  return CmpChain.simplify(TLI, AA, DTU);
860
12
}
861
862
static bool runImpl(Function &F, const TargetLibraryInfo &TLI,
863
                    const TargetTransformInfo &TTI, AliasAnalysis &AA,
864
489k
                    DominatorTree *DT) {
865
489k
  LLVM_DEBUG(dbgs() << "MergeICmpsLegacyPass: " << F.getName() << "\n");
866
489k
867
489k
  // We only try merging comparisons if the target wants to expand memcmp later.
868
489k
  // The rationale is to avoid turning small chains into memcmp calls.
869
489k
  if (!TTI.enableMemCmpExpansion(F.hasOptSize(), true))
870
344k
    return false;
871
145k
872
145k
  // If we don't have memcmp avaiable we can't emit calls to it.
873
145k
  if (!TLI.has(LibFunc_memcmp))
874
21.4k
    return false;
875
124k
876
124k
  DomTreeUpdater DTU(DT, /*PostDominatorTree*/ nullptr,
877
124k
                     DomTreeUpdater::UpdateStrategy::Eager);
878
124k
879
124k
  bool MadeChange = false;
880
124k
881
147k
  for (auto BBIt = ++F.begin(); BBIt != F.end(); 
++BBIt23.7k
) {
882
23.7k
    // A Phi operation is always first in a basic block.
883
23.7k
    if (auto *const Phi = dyn_cast<PHINode>(&*BBIt->begin()))
884
4.16k
      MadeChange |= processPhi(*Phi, TLI, AA, DTU);
885
23.7k
  }
886
124k
887
124k
  return MadeChange;
888
124k
}
889
890
class MergeICmpsLegacyPass : public FunctionPass {
891
public:
892
  static char ID;
893
894
34.5k
  MergeICmpsLegacyPass() : FunctionPass(ID) {
895
34.5k
    initializeMergeICmpsLegacyPassPass(*PassRegistry::getPassRegistry());
896
34.5k
  }
897
898
490k
  bool runOnFunction(Function &F) override {
899
490k
    if (skipFunction(F)) 
return false272
;
900
489k
    const auto &TLI = getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
901
489k
    const auto &TTI = getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
902
489k
    // MergeICmps does not need the DominatorTree, but we update it if it's
903
489k
    // already available.
904
489k
    auto *DTWP = getAnalysisIfAvailable<DominatorTreeWrapperPass>();
905
489k
    auto &AA = getAnalysis<AAResultsWrapperPass>().getAAResults();
906
489k
    return runImpl(F, TLI, TTI, AA, DTWP ? 
&DTWP->getDomTree()489k
:
nullptr4
);
907
489k
  }
908
909
 private:
910
34.3k
  void getAnalysisUsage(AnalysisUsage &AU) const override {
911
34.3k
    AU.addRequired<TargetLibraryInfoWrapperPass>();
912
34.3k
    AU.addRequired<TargetTransformInfoWrapperPass>();
913
34.3k
    AU.addRequired<AAResultsWrapperPass>();
914
34.3k
    AU.addPreserved<GlobalsAAWrapperPass>();
915
34.3k
    AU.addPreserved<DominatorTreeWrapperPass>();
916
34.3k
  }
917
};
918
919
} // namespace
920
921
char MergeICmpsLegacyPass::ID = 0;
922
48.6k
INITIALIZE_PASS_BEGIN(MergeICmpsLegacyPass, "mergeicmps",
923
48.6k
                      "Merge contiguous icmps into a memcmp", false, false)
924
48.6k
INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
925
48.6k
INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
926
48.6k
INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
927
48.6k
INITIALIZE_PASS_END(MergeICmpsLegacyPass, "mergeicmps",
928
                    "Merge contiguous icmps into a memcmp", false, false)
929
930
34.5k
Pass *llvm::createMergeICmpsLegacyPass() { return new MergeICmpsLegacyPass(); }
931
932
PreservedAnalyses MergeICmpsPass::run(Function &F,
933
2
                                      FunctionAnalysisManager &AM) {
934
2
  auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
935
2
  auto &TTI = AM.getResult<TargetIRAnalysis>(F);
936
2
  auto &AA = AM.getResult<AAManager>(F);
937
2
  auto *DT = AM.getCachedResult<DominatorTreeAnalysis>(F);
938
2
  const bool MadeChanges = runImpl(F, TLI, TTI, AA, DT);
939
2
  if (!MadeChanges)
940
0
    return PreservedAnalyses::all();
941
2
  PreservedAnalyses PA;
942
2
  PA.preserve<GlobalsAA>();
943
2
  PA.preserve<DominatorTreeAnalysis>();
944
2
  return PA;
945
2
}