Coverage Report

Created: 2019-07-24 05:18

/Users/buildslave/jenkins/workspace/clang-stage2-coverage-R/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp
Line
Count
Source (jump to first uncovered line)
1
//===- SeparateConstOffsetFromGEP.cpp -------------------------------------===//
2
//
3
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4
// See https://llvm.org/LICENSE.txt for license information.
5
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6
//
7
//===----------------------------------------------------------------------===//
8
//
9
// Loop unrolling may create many similar GEPs for array accesses.
10
// e.g., a 2-level loop
11
//
12
// float a[32][32]; // global variable
13
//
14
// for (int i = 0; i < 2; ++i) {
15
//   for (int j = 0; j < 2; ++j) {
16
//     ...
17
//     ... = a[x + i][y + j];
18
//     ...
19
//   }
20
// }
21
//
22
// will probably be unrolled to:
23
//
24
// gep %a, 0, %x, %y; load
25
// gep %a, 0, %x, %y + 1; load
26
// gep %a, 0, %x + 1, %y; load
27
// gep %a, 0, %x + 1, %y + 1; load
28
//
29
// LLVM's GVN does not use partial redundancy elimination yet, and is thus
30
// unable to reuse (gep %a, 0, %x, %y). As a result, this misoptimization incurs
31
// significant slowdown in targets with limited addressing modes. For instance,
32
// because the PTX target does not support the reg+reg addressing mode, the
33
// NVPTX backend emits PTX code that literally computes the pointer address of
34
// each GEP, wasting tons of registers. It emits the following PTX for the
35
// first load and similar PTX for other loads.
36
//
37
// mov.u32         %r1, %x;
38
// mov.u32         %r2, %y;
39
// mul.wide.u32    %rl2, %r1, 128;
40
// mov.u64         %rl3, a;
41
// add.s64         %rl4, %rl3, %rl2;
42
// mul.wide.u32    %rl5, %r2, 4;
43
// add.s64         %rl6, %rl4, %rl5;
44
// ld.global.f32   %f1, [%rl6];
45
//
46
// To reduce the register pressure, the optimization implemented in this file
47
// merges the common part of a group of GEPs, so we can compute each pointer
48
// address by adding a simple offset to the common part, saving many registers.
49
//
50
// It works by splitting each GEP into a variadic base and a constant offset.
51
// The variadic base can be computed once and reused by multiple GEPs, and the
52
// constant offsets can be nicely folded into the reg+immediate addressing mode
53
// (supported by most targets) without using any extra register.
54
//
55
// For instance, we transform the four GEPs and four loads in the above example
56
// into:
57
//
58
// base = gep a, 0, x, y
59
// load base
60
// laod base + 1  * sizeof(float)
61
// load base + 32 * sizeof(float)
62
// load base + 33 * sizeof(float)
63
//
64
// Given the transformed IR, a backend that supports the reg+immediate
65
// addressing mode can easily fold the pointer arithmetics into the loads. For
66
// example, the NVPTX backend can easily fold the pointer arithmetics into the
67
// ld.global.f32 instructions, and the resultant PTX uses much fewer registers.
68
//
69
// mov.u32         %r1, %tid.x;
70
// mov.u32         %r2, %tid.y;
71
// mul.wide.u32    %rl2, %r1, 128;
72
// mov.u64         %rl3, a;
73
// add.s64         %rl4, %rl3, %rl2;
74
// mul.wide.u32    %rl5, %r2, 4;
75
// add.s64         %rl6, %rl4, %rl5;
76
// ld.global.f32   %f1, [%rl6]; // so far the same as unoptimized PTX
77
// ld.global.f32   %f2, [%rl6+4]; // much better
78
// ld.global.f32   %f3, [%rl6+128]; // much better
79
// ld.global.f32   %f4, [%rl6+132]; // much better
80
//
81
// Another improvement enabled by the LowerGEP flag is to lower a GEP with
82
// multiple indices to either multiple GEPs with a single index or arithmetic
83
// operations (depending on whether the target uses alias analysis in codegen).
84
// Such transformation can have following benefits:
85
// (1) It can always extract constants in the indices of structure type.
86
// (2) After such Lowering, there are more optimization opportunities such as
87
//     CSE, LICM and CGP.
88
//
89
// E.g. The following GEPs have multiple indices:
90
//  BB1:
91
//    %p = getelementptr [10 x %struct]* %ptr, i64 %i, i64 %j1, i32 3
92
//    load %p
93
//    ...
94
//  BB2:
95
//    %p2 = getelementptr [10 x %struct]* %ptr, i64 %i, i64 %j1, i32 2
96
//    load %p2
97
//    ...
98
//
99
// We can not do CSE to the common part related to index "i64 %i". Lowering
100
// GEPs can achieve such goals.
101
// If the target does not use alias analysis in codegen, this pass will
102
// lower a GEP with multiple indices into arithmetic operations:
103
//  BB1:
104
//    %1 = ptrtoint [10 x %struct]* %ptr to i64    ; CSE opportunity
105
//    %2 = mul i64 %i, length_of_10xstruct         ; CSE opportunity
106
//    %3 = add i64 %1, %2                          ; CSE opportunity
107
//    %4 = mul i64 %j1, length_of_struct
108
//    %5 = add i64 %3, %4
109
//    %6 = add i64 %3, struct_field_3              ; Constant offset
110
//    %p = inttoptr i64 %6 to i32*
111
//    load %p
112
//    ...
113
//  BB2:
114
//    %7 = ptrtoint [10 x %struct]* %ptr to i64    ; CSE opportunity
115
//    %8 = mul i64 %i, length_of_10xstruct         ; CSE opportunity
116
//    %9 = add i64 %7, %8                          ; CSE opportunity
117
//    %10 = mul i64 %j2, length_of_struct
118
//    %11 = add i64 %9, %10
119
//    %12 = add i64 %11, struct_field_2            ; Constant offset
120
//    %p = inttoptr i64 %12 to i32*
121
//    load %p2
122
//    ...
123
//
124
// If the target uses alias analysis in codegen, this pass will lower a GEP
125
// with multiple indices into multiple GEPs with a single index:
126
//  BB1:
127
//    %1 = bitcast [10 x %struct]* %ptr to i8*     ; CSE opportunity
128
//    %2 = mul i64 %i, length_of_10xstruct         ; CSE opportunity
129
//    %3 = getelementptr i8* %1, i64 %2            ; CSE opportunity
130
//    %4 = mul i64 %j1, length_of_struct
131
//    %5 = getelementptr i8* %3, i64 %4
132
//    %6 = getelementptr i8* %5, struct_field_3    ; Constant offset
133
//    %p = bitcast i8* %6 to i32*
134
//    load %p
135
//    ...
136
//  BB2:
137
//    %7 = bitcast [10 x %struct]* %ptr to i8*     ; CSE opportunity
138
//    %8 = mul i64 %i, length_of_10xstruct         ; CSE opportunity
139
//    %9 = getelementptr i8* %7, i64 %8            ; CSE opportunity
140
//    %10 = mul i64 %j2, length_of_struct
141
//    %11 = getelementptr i8* %9, i64 %10
142
//    %12 = getelementptr i8* %11, struct_field_2  ; Constant offset
143
//    %p2 = bitcast i8* %12 to i32*
144
//    load %p2
145
//    ...
146
//
147
// Lowering GEPs can also benefit other passes such as LICM and CGP.
148
// LICM (Loop Invariant Code Motion) can not hoist/sink a GEP of multiple
149
// indices if one of the index is variant. If we lower such GEP into invariant
150
// parts and variant parts, LICM can hoist/sink those invariant parts.
151
// CGP (CodeGen Prepare) tries to sink address calculations that match the
152
// target's addressing modes. A GEP with multiple indices may not match and will
153
// not be sunk. If we lower such GEP into smaller parts, CGP may sink some of
154
// them. So we end up with a better addressing mode.
155
//
156
//===----------------------------------------------------------------------===//
157
158
#include "llvm/ADT/APInt.h"
159
#include "llvm/ADT/DenseMap.h"
160
#include "llvm/ADT/DepthFirstIterator.h"
161
#include "llvm/ADT/SmallVector.h"
162
#include "llvm/Analysis/LoopInfo.h"
163
#include "llvm/Analysis/MemoryBuiltins.h"
164
#include "llvm/Analysis/ScalarEvolution.h"
165
#include "llvm/Analysis/TargetLibraryInfo.h"
166
#include "llvm/Analysis/TargetTransformInfo.h"
167
#include "llvm/Transforms/Utils/Local.h"
168
#include "llvm/Analysis/ValueTracking.h"
169
#include "llvm/IR/BasicBlock.h"
170
#include "llvm/IR/Constant.h"
171
#include "llvm/IR/Constants.h"
172
#include "llvm/IR/DataLayout.h"
173
#include "llvm/IR/DerivedTypes.h"
174
#include "llvm/IR/Dominators.h"
175
#include "llvm/IR/Function.h"
176
#include "llvm/IR/GetElementPtrTypeIterator.h"
177
#include "llvm/IR/IRBuilder.h"
178
#include "llvm/IR/Instruction.h"
179
#include "llvm/IR/Instructions.h"
180
#include "llvm/IR/Module.h"
181
#include "llvm/IR/PatternMatch.h"
182
#include "llvm/IR/Type.h"
183
#include "llvm/IR/User.h"
184
#include "llvm/IR/Value.h"
185
#include "llvm/Pass.h"
186
#include "llvm/Support/Casting.h"
187
#include "llvm/Support/CommandLine.h"
188
#include "llvm/Support/ErrorHandling.h"
189
#include "llvm/Support/raw_ostream.h"
190
#include "llvm/Target/TargetMachine.h"
191
#include "llvm/Transforms/Scalar.h"
192
#include <cassert>
193
#include <cstdint>
194
#include <string>
195
196
using namespace llvm;
197
using namespace llvm::PatternMatch;
198
199
static cl::opt<bool> DisableSeparateConstOffsetFromGEP(
200
    "disable-separate-const-offset-from-gep", cl::init(false),
201
    cl::desc("Do not separate the constant offset from a GEP instruction"),
202
    cl::Hidden);
203
204
// Setting this flag may emit false positives when the input module already
205
// contains dead instructions. Therefore, we set it only in unit tests that are
206
// free of dead code.
207
static cl::opt<bool>
208
    VerifyNoDeadCode("reassociate-geps-verify-no-dead-code", cl::init(false),
209
                     cl::desc("Verify this pass produces no dead code"),
210
                     cl::Hidden);
211
212
namespace {
213
214
/// A helper class for separating a constant offset from a GEP index.
215
///
216
/// In real programs, a GEP index may be more complicated than a simple addition
217
/// of something and a constant integer which can be trivially splitted. For
218
/// example, to split ((a << 3) | 5) + b, we need to search deeper for the
219
/// constant offset, so that we can separate the index to (a << 3) + b and 5.
220
///
221
/// Therefore, this class looks into the expression that computes a given GEP
222
/// index, and tries to find a constant integer that can be hoisted to the
223
/// outermost level of the expression as an addition. Not every constant in an
224
/// expression can jump out. e.g., we cannot transform (b * (a + 5)) to (b * a +
225
/// 5); nor can we transform (3 * (a + 5)) to (3 * a + 5), however in this case,
226
/// -instcombine probably already optimized (3 * (a + 5)) to (3 * a + 15).
227
class ConstantOffsetExtractor {
228
public:
229
  /// Extracts a constant offset from the given GEP index. It returns the
230
  /// new index representing the remainder (equal to the original index minus
231
  /// the constant offset), or nullptr if we cannot extract a constant offset.
232
  /// \p Idx The given GEP index
233
  /// \p GEP The given GEP
234
  /// \p UserChainTail Outputs the tail of UserChain so that we can
235
  ///                  garbage-collect unused instructions in UserChain.
236
  static Value *Extract(Value *Idx, GetElementPtrInst *GEP,
237
                        User *&UserChainTail, const DominatorTree *DT);
238
239
  /// Looks for a constant offset from the given GEP index without extracting
240
  /// it. It returns the numeric value of the extracted constant offset (0 if
241
  /// failed). The meaning of the arguments are the same as Extract.
242
  static int64_t Find(Value *Idx, GetElementPtrInst *GEP,
243
                      const DominatorTree *DT);
244
245
private:
246
  ConstantOffsetExtractor(Instruction *InsertionPt, const DominatorTree *DT)
247
12.7k
      : IP(InsertionPt), DL(InsertionPt->getModule()->getDataLayout()), DT(DT) {
248
12.7k
  }
249
250
  /// Searches the expression that computes V for a non-zero constant C s.t.
251
  /// V can be reassociated into the form V' + C. If the searching is
252
  /// successful, returns C and update UserChain as a def-use chain from C to V;
253
  /// otherwise, UserChain is empty.
254
  ///
255
  /// \p V            The given expression
256
  /// \p SignExtended Whether V will be sign-extended in the computation of the
257
  ///                 GEP index
258
  /// \p ZeroExtended Whether V will be zero-extended in the computation of the
259
  ///                 GEP index
260
  /// \p NonNegative  Whether V is guaranteed to be non-negative. For example,
261
  ///                 an index of an inbounds GEP is guaranteed to be
262
  ///                 non-negative. Levaraging this, we can better split
263
  ///                 inbounds GEPs.
264
  APInt find(Value *V, bool SignExtended, bool ZeroExtended, bool NonNegative);
265
266
  /// A helper function to look into both operands of a binary operator.
267
  APInt findInEitherOperand(BinaryOperator *BO, bool SignExtended,
268
                            bool ZeroExtended);
269
270
  /// After finding the constant offset C from the GEP index I, we build a new
271
  /// index I' s.t. I' + C = I. This function builds and returns the new
272
  /// index I' according to UserChain produced by function "find".
273
  ///
274
  /// The building conceptually takes two steps:
275
  /// 1) iteratively distribute s/zext towards the leaves of the expression tree
276
  /// that computes I
277
  /// 2) reassociate the expression tree to the form I' + C.
278
  ///
279
  /// For example, to extract the 5 from sext(a + (b + 5)), we first distribute
280
  /// sext to a, b and 5 so that we have
281
  ///   sext(a) + (sext(b) + 5).
282
  /// Then, we reassociate it to
283
  ///   (sext(a) + sext(b)) + 5.
284
  /// Given this form, we know I' is sext(a) + sext(b).
285
  Value *rebuildWithoutConstOffset();
286
287
  /// After the first step of rebuilding the GEP index without the constant
288
  /// offset, distribute s/zext to the operands of all operators in UserChain.
289
  /// e.g., zext(sext(a + (b + 5)) (assuming no overflow) =>
290
  /// zext(sext(a)) + (zext(sext(b)) + zext(sext(5))).
291
  ///
292
  /// The function also updates UserChain to point to new subexpressions after
293
  /// distributing s/zext. e.g., the old UserChain of the above example is
294
  /// 5 -> b + 5 -> a + (b + 5) -> sext(...) -> zext(sext(...)),
295
  /// and the new UserChain is
296
  /// zext(sext(5)) -> zext(sext(b)) + zext(sext(5)) ->
297
  ///   zext(sext(a)) + (zext(sext(b)) + zext(sext(5))
298
  ///
299
  /// \p ChainIndex The index to UserChain. ChainIndex is initially
300
  ///               UserChain.size() - 1, and is decremented during
301
  ///               the recursion.
302
  Value *distributeExtsAndCloneChain(unsigned ChainIndex);
303
304
  /// Reassociates the GEP index to the form I' + C and returns I'.
305
  Value *removeConstOffset(unsigned ChainIndex);
306
307
  /// A helper function to apply ExtInsts, a list of s/zext, to value V.
308
  /// e.g., if ExtInsts = [sext i32 to i64, zext i16 to i32], this function
309
  /// returns "sext i32 (zext i16 V to i32) to i64".
310
  Value *applyExts(Value *V);
311
312
  /// A helper function that returns whether we can trace into the operands
313
  /// of binary operator BO for a constant offset.
314
  ///
315
  /// \p SignExtended Whether BO is surrounded by sext
316
  /// \p ZeroExtended Whether BO is surrounded by zext
317
  /// \p NonNegative Whether BO is known to be non-negative, e.g., an in-bound
318
  ///                array index.
319
  bool CanTraceInto(bool SignExtended, bool ZeroExtended, BinaryOperator *BO,
320
                    bool NonNegative);
321
322
  /// The path from the constant offset to the old GEP index. e.g., if the GEP
323
  /// index is "a * b + (c + 5)". After running function find, UserChain[0] will
324
  /// be the constant 5, UserChain[1] will be the subexpression "c + 5", and
325
  /// UserChain[2] will be the entire expression "a * b + (c + 5)".
326
  ///
327
  /// This path helps to rebuild the new GEP index.
328
  SmallVector<User *, 8> UserChain;
329
330
  /// A data structure used in rebuildWithoutConstOffset. Contains all
331
  /// sext/zext instructions along UserChain.
332
  SmallVector<CastInst *, 16> ExtInsts;
333
334
  /// Insertion position of cloned instructions.
335
  Instruction *IP;
336
337
  const DataLayout &DL;
338
  const DominatorTree *DT;
339
};
340
341
/// A pass that tries to split every GEP in the function into a variadic
342
/// base and a constant offset. It is a FunctionPass because searching for the
343
/// constant offset may inspect other basic blocks.
344
class SeparateConstOffsetFromGEP : public FunctionPass {
345
public:
346
  static char ID;
347
348
  SeparateConstOffsetFromGEP(bool LowerGEP = false)
349
4.54k
      : FunctionPass(ID), LowerGEP(LowerGEP) {
350
4.54k
    initializeSeparateConstOffsetFromGEPPass(*PassRegistry::getPassRegistry());
351
4.54k
  }
352
353
4.50k
  void getAnalysisUsage(AnalysisUsage &AU) const override {
354
4.50k
    AU.addRequired<DominatorTreeWrapperPass>();
355
4.50k
    AU.addRequired<ScalarEvolutionWrapperPass>();
356
4.50k
    AU.addRequired<TargetTransformInfoWrapperPass>();
357
4.50k
    AU.addRequired<LoopInfoWrapperPass>();
358
4.50k
    AU.setPreservesCFG();
359
4.50k
    AU.addRequired<TargetLibraryInfoWrapperPass>();
360
4.50k
  }
361
362
4.50k
  bool doInitialization(Module &M) override {
363
4.50k
    DL = &M.getDataLayout();
364
4.50k
    return false;
365
4.50k
  }
366
367
  bool runOnFunction(Function &F) override;
368
369
private:
370
  /// Tries to split the given GEP into a variadic base and a constant offset,
371
  /// and returns true if the splitting succeeds.
372
  bool splitGEP(GetElementPtrInst *GEP);
373
374
  /// Lower a GEP with multiple indices into multiple GEPs with a single index.
375
  /// Function splitGEP already split the original GEP into a variadic part and
376
  /// a constant offset (i.e., AccumulativeByteOffset). This function lowers the
377
  /// variadic part into a set of GEPs with a single index and applies
378
  /// AccumulativeByteOffset to it.
379
  /// \p Variadic                  The variadic part of the original GEP.
380
  /// \p AccumulativeByteOffset    The constant offset.
381
  void lowerToSingleIndexGEPs(GetElementPtrInst *Variadic,
382
                              int64_t AccumulativeByteOffset);
383
384
  /// Lower a GEP with multiple indices into ptrtoint+arithmetics+inttoptr form.
385
  /// Function splitGEP already split the original GEP into a variadic part and
386
  /// a constant offset (i.e., AccumulativeByteOffset). This function lowers the
387
  /// variadic part into a set of arithmetic operations and applies
388
  /// AccumulativeByteOffset to it.
389
  /// \p Variadic                  The variadic part of the original GEP.
390
  /// \p AccumulativeByteOffset    The constant offset.
391
  void lowerToArithmetics(GetElementPtrInst *Variadic,
392
                          int64_t AccumulativeByteOffset);
393
394
  /// Finds the constant offset within each index and accumulates them. If
395
  /// LowerGEP is true, it finds in indices of both sequential and structure
396
  /// types, otherwise it only finds in sequential indices. The output
397
  /// NeedsExtraction indicates whether we successfully find a non-zero constant
398
  /// offset.
399
  int64_t accumulateByteOffset(GetElementPtrInst *GEP, bool &NeedsExtraction);
400
401
  /// Canonicalize array indices to pointer-size integers. This helps to
402
  /// simplify the logic of splitting a GEP. For example, if a + b is a
403
  /// pointer-size integer, we have
404
  ///   gep base, a + b = gep (gep base, a), b
405
  /// However, this equality may not hold if the size of a + b is smaller than
406
  /// the pointer size, because LLVM conceptually sign-extends GEP indices to
407
  /// pointer size before computing the address
408
  /// (http://llvm.org/docs/LangRef.html#id181).
409
  ///
410
  /// This canonicalization is very likely already done in clang and
411
  /// instcombine. Therefore, the program will probably remain the same.
412
  ///
413
  /// Returns true if the module changes.
414
  ///
415
  /// Verified in @i32_add in split-gep.ll
416
  bool canonicalizeArrayIndicesToPointerSize(GetElementPtrInst *GEP);
417
418
  /// Optimize sext(a)+sext(b) to sext(a+b) when a+b can't sign overflow.
419
  /// SeparateConstOffsetFromGEP distributes a sext to leaves before extracting
420
  /// the constant offset. After extraction, it becomes desirable to reunion the
421
  /// distributed sexts. For example,
422
  ///
423
  ///                              &a[sext(i +nsw (j +nsw 5)]
424
  ///   => distribute              &a[sext(i) +nsw (sext(j) +nsw 5)]
425
  ///   => constant extraction     &a[sext(i) + sext(j)] + 5
426
  ///   => reunion                 &a[sext(i +nsw j)] + 5
427
  bool reuniteExts(Function &F);
428
429
  /// A helper that reunites sexts in an instruction.
430
  bool reuniteExts(Instruction *I);
431
432
  /// Find the closest dominator of <Dominatee> that is equivalent to <Key>.
433
  Instruction *findClosestMatchingDominator(const SCEV *Key,
434
                                            Instruction *Dominatee);
435
  /// Verify F is free of dead code.
436
  void verifyNoDeadCode(Function &F);
437
438
  bool hasMoreThanOneUseInLoop(Value *v, Loop *L);
439
440
  // Swap the index operand of two GEP.
441
  void swapGEPOperand(GetElementPtrInst *First, GetElementPtrInst *Second);
442
443
  // Check if it is safe to swap operand of two GEP.
444
  bool isLegalToSwapOperand(GetElementPtrInst *First, GetElementPtrInst *Second,
445
                            Loop *CurLoop);
446
447
  const DataLayout *DL = nullptr;
448
  DominatorTree *DT = nullptr;
449
  ScalarEvolution *SE;
450
451
  LoopInfo *LI;
452
  TargetLibraryInfo *TLI;
453
454
  /// Whether to lower a GEP with multiple indices into arithmetic operations or
455
  /// multiple GEPs with a single index.
456
  bool LowerGEP;
457
458
  DenseMap<const SCEV *, SmallVector<Instruction *, 2>> DominatingExprs;
459
};
460
461
} // end anonymous namespace
462
463
char SeparateConstOffsetFromGEP::ID = 0;
464
465
36.0k
INITIALIZE_PASS_BEGIN(
466
36.0k
    SeparateConstOffsetFromGEP, "separate-const-offset-from-gep",
467
36.0k
    "Split GEPs to a variadic base and a constant offset for better CSE", false,
468
36.0k
    false)
469
36.0k
INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
470
36.0k
INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
471
36.0k
INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
472
36.0k
INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
473
36.0k
INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
474
36.0k
INITIALIZE_PASS_END(
475
    SeparateConstOffsetFromGEP, "separate-const-offset-from-gep",
476
    "Split GEPs to a variadic base and a constant offset for better CSE", false,
477
    false)
478
479
4.54k
FunctionPass *llvm::createSeparateConstOffsetFromGEPPass(bool LowerGEP) {
480
4.54k
  return new SeparateConstOffsetFromGEP(LowerGEP);
481
4.54k
}
482
483
bool ConstantOffsetExtractor::CanTraceInto(bool SignExtended,
484
                                            bool ZeroExtended,
485
                                            BinaryOperator *BO,
486
3.95k
                                            bool NonNegative) {
487
3.95k
  // We only consider ADD, SUB and OR, because a non-zero constant found in
488
3.95k
  // expressions composed of these operations can be easily hoisted as a
489
3.95k
  // constant offset by reassociation.
490
3.95k
  if (BO->getOpcode() != Instruction::Add &&
491
3.95k
      
BO->getOpcode() != Instruction::Sub844
&&
492
3.95k
      
BO->getOpcode() != Instruction::Or746
) {
493
462
    return false;
494
462
  }
495
3.49k
496
3.49k
  Value *LHS = BO->getOperand(0), *RHS = BO->getOperand(1);
497
3.49k
  // Do not trace into "or" unless it is equivalent to "add". If LHS and RHS
498
3.49k
  // don't have common bits, (LHS | RHS) is equivalent to (LHS + RHS).
499
3.49k
  // FIXME: this does not appear to be covered by any tests
500
3.49k
  //        (with x86/aarch64 backends at least)
501
3.49k
  if (BO->getOpcode() == Instruction::Or &&
502
3.49k
      
!haveNoCommonBitsSet(LHS, RHS, DL, nullptr, BO, DT)284
)
503
5
    return false;
504
3.49k
505
3.49k
  // In addition, tracing into BO requires that its surrounding s/zext (if
506
3.49k
  // any) is distributable to both operands.
507
3.49k
  //
508
3.49k
  // Suppose BO = A op B.
509
3.49k
  //  SignExtended | ZeroExtended | Distributable?
510
3.49k
  // --------------+--------------+----------------------------------
511
3.49k
  //       0       |      0       | true because no s/zext exists
512
3.49k
  //       0       |      1       | zext(BO) == zext(A) op zext(B)
513
3.49k
  //       1       |      0       | sext(BO) == sext(A) op sext(B)
514
3.49k
  //       1       |      1       | zext(sext(BO)) ==
515
3.49k
  //               |              |     zext(sext(A)) op zext(sext(B))
516
3.49k
  if (BO->getOpcode() == Instruction::Add && 
!ZeroExtended3.11k
&&
NonNegative3.06k
) {
517
1.76k
    // If a + b >= 0 and (a >= 0 or b >= 0), then
518
1.76k
    //   sext(a + b) = sext(a) + sext(b)
519
1.76k
    // even if the addition is not marked nsw.
520
1.76k
    //
521
1.76k
    // Leveraging this invarient, we can trace into an sext'ed inbound GEP
522
1.76k
    // index if the constant offset is non-negative.
523
1.76k
    //
524
1.76k
    // Verified in @sext_add in split-gep.ll.
525
1.76k
    if (ConstantInt *ConstLHS = dyn_cast<ConstantInt>(LHS)) {
526
0
      if (!ConstLHS->isNegative())
527
0
        return true;
528
1.76k
    }
529
1.76k
    if (ConstantInt *ConstRHS = dyn_cast<ConstantInt>(RHS)) {
530
1.62k
      if (!ConstRHS->isNegative())
531
1.40k
        return true;
532
2.08k
    }
533
1.76k
  }
534
2.08k
535
2.08k
  // sext (add/sub nsw A, B) == add/sub nsw (sext A), (sext B)
536
2.08k
  // zext (add/sub nuw A, B) == add/sub nuw (zext A), (zext B)
537
2.08k
  if (BO->getOpcode() == Instruction::Add ||
538
2.08k
      
BO->getOpcode() == Instruction::Sub377
) {
539
1.80k
    if (SignExtended && 
!BO->hasNoSignedWrap()306
)
540
20
      return false;
541
1.78k
    if (ZeroExtended && 
!BO->hasNoUnsignedWrap()52
)
542
17
      return false;
543
2.05k
  }
544
2.05k
545
2.05k
  return true;
546
2.05k
}
547
548
APInt ConstantOffsetExtractor::findInEitherOperand(BinaryOperator *BO,
549
                                                   bool SignExtended,
550
3.45k
                                                   bool ZeroExtended) {
551
3.45k
  // BO being non-negative does not shed light on whether its operands are
552
3.45k
  // non-negative. Clear the NonNegative flag here.
553
3.45k
  APInt ConstantOffset = find(BO->getOperand(0), SignExtended, ZeroExtended,
554
3.45k
                              /* NonNegative */ false);
555
3.45k
  // If we found a constant offset in the left operand, stop and return that.
556
3.45k
  // This shortcut might cause us to miss opportunities of combining the
557
3.45k
  // constant offsets in both operands, e.g., (a + 4) + (b + 5) => (a + b) + 9.
558
3.45k
  // However, such cases are probably already handled by -instcombine,
559
3.45k
  // given this pass runs after the standard optimizations.
560
3.45k
  if (ConstantOffset != 0) 
return ConstantOffset564
;
561
2.89k
  ConstantOffset = find(BO->getOperand(1), SignExtended, ZeroExtended,
562
2.89k
                        /* NonNegative */ false);
563
2.89k
  // If U is a sub operator, negate the constant offset found in the right
564
2.89k
  // operand.
565
2.89k
  if (BO->getOpcode() == Instruction::Sub)
566
73
    ConstantOffset = -ConstantOffset;
567
2.89k
  return ConstantOffset;
568
2.89k
}
569
570
APInt ConstantOffsetExtractor::find(Value *V, bool SignExtended,
571
26.9k
                                    bool ZeroExtended, bool NonNegative) {
572
26.9k
  // TODO(jingyue): We could trace into integer/pointer casts, such as
573
26.9k
  // inttoptr, ptrtoint, bitcast, and addrspacecast. We choose to handle only
574
26.9k
  // integers because it gives good enough results for our benchmarks.
575
26.9k
  unsigned BitWidth = cast<IntegerType>(V->getType())->getBitWidth();
576
26.9k
577
26.9k
  // We cannot do much with Values that are not a User, such as an Argument.
578
26.9k
  User *U = dyn_cast<User>(V);
579
26.9k
  if (U == nullptr) 
return APInt(BitWidth, 0)2.09k
;
580
24.8k
581
24.8k
  APInt ConstantOffset(BitWidth, 0);
582
24.8k
  if (ConstantInt *CI = dyn_cast<ConstantInt>(V)) {
583
4.37k
    // Hooray, we found it!
584
4.37k
    ConstantOffset = CI->getValue();
585
20.4k
  } else if (BinaryOperator *BO = dyn_cast<BinaryOperator>(V)) {
586
3.95k
    // Trace into subexpressions for more hoisting opportunities.
587
3.95k
    if (CanTraceInto(SignExtended, ZeroExtended, BO, NonNegative))
588
3.45k
      ConstantOffset = findInEitherOperand(BO, SignExtended, ZeroExtended);
589
16.5k
  } else if (isa<TruncInst>(V)) {
590
231
    ConstantOffset =
591
231
        find(U->getOperand(0), SignExtended, ZeroExtended, NonNegative)
592
231
            .trunc(BitWidth);
593
16.2k
  } else if (isa<SExtInst>(V)) {
594
7.29k
    ConstantOffset = find(U->getOperand(0), /* SignExtended */ true,
595
7.29k
                          ZeroExtended, NonNegative).sext(BitWidth);
596
8.97k
  } else if (isa<ZExtInst>(V)) {
597
303
    // As an optimization, we can clear the SignExtended flag because
598
303
    // sext(zext(a)) = zext(a). Verified in @sext_zext in split-gep.ll.
599
303
    //
600
303
    // Clear the NonNegative flag, because zext(a) >= 0 does not imply a >= 0.
601
303
    ConstantOffset =
602
303
        find(U->getOperand(0), /* SignExtended */ false,
603
303
             /* ZeroExtended */ true, /* NonNegative */ false).zext(BitWidth);
604
303
  }
605
24.8k
606
24.8k
  // If we found a non-zero constant offset, add it to the path for
607
24.8k
  // rebuildWithoutConstOffset. Zero is a valid constant offset, but doesn't
608
24.8k
  // help this optimization.
609
24.8k
  if (ConstantOffset != 0)
610
6.61k
    UserChain.push_back(U);
611
24.8k
  return ConstantOffset;
612
24.8k
}
613
614
2.79k
Value *ConstantOffsetExtractor::applyExts(Value *V) {
615
2.79k
  Value *Current = V;
616
2.79k
  // ExtInsts is built in the use-def order. Therefore, we apply them to V
617
2.79k
  // in the reversed order.
618
3.53k
  for (auto I = ExtInsts.rbegin(), E = ExtInsts.rend(); I != E; 
++I739
) {
619
739
    if (Constant *C = dyn_cast<Constant>(Current)) {
620
388
      // If Current is a constant, apply s/zext using ConstantExpr::getCast.
621
388
      // ConstantExpr::getCast emits a ConstantInt if C is a ConstantInt.
622
388
      Current = ConstantExpr::getCast((*I)->getOpcode(), C, (*I)->getType());
623
388
    } else {
624
351
      Instruction *Ext = (*I)->clone();
625
351
      Ext->setOperand(0, Current);
626
351
      Ext->insertBefore(IP);
627
351
      Current = Ext;
628
351
    }
629
739
  }
630
2.79k
  return Current;
631
2.79k
}
632
633
1.28k
Value *ConstantOffsetExtractor::rebuildWithoutConstOffset() {
634
1.28k
  distributeExtsAndCloneChain(UserChain.size() - 1);
635
1.28k
  // Remove all nullptrs (used to be s/zext) from UserChain.
636
1.28k
  unsigned NewSize = 0;
637
3.17k
  for (User *I : UserChain) {
638
3.17k
    if (I != nullptr) {
639
2.79k
      UserChain[NewSize] = I;
640
2.79k
      NewSize++;
641
2.79k
    }
642
3.17k
  }
643
1.28k
  UserChain.resize(NewSize);
644
1.28k
  return removeConstOffset(UserChain.size() - 1);
645
1.28k
}
646
647
// Clones the computation chain recorded in UserChain (which bottoms out in a
// ConstantInt at index 0) while distributing the encountered sext/zext/trunc
// casts down toward the leaves via ExtInsts/applyExts. Processes
// UserChain[ChainIndex] and recurses toward index 0; returns (and stores
// back into UserChain) the cloned value for this chain position.
Value *
ConstantOffsetExtractor::distributeExtsAndCloneChain(unsigned ChainIndex) {
  User *U = UserChain[ChainIndex];
  if (ChainIndex == 0) {
    // Bottom of the chain: the extracted constant itself.
    assert(isa<ConstantInt>(U));
    // If U is a ConstantInt, applyExts will return a ConstantInt as well.
    return UserChain[ChainIndex] = cast<ConstantInt>(applyExts(U));
  }

  if (CastInst *Cast = dyn_cast<CastInst>(U)) {
    assert(
        (isa<SExtInst>(Cast) || isa<ZExtInst>(Cast) || isa<TruncInst>(Cast)) &&
        "Only following instructions can be traced: sext, zext & trunc");
    // Record the cast so applyExts can replay it on the leaves, and null
    // this slot; rebuildWithoutConstOffset compacts the nulls away later.
    ExtInsts.push_back(Cast);
    UserChain[ChainIndex] = nullptr;
    return distributeExtsAndCloneChain(ChainIndex - 1);
  }

  // Function find only trace into BinaryOperator and CastInst.
  BinaryOperator *BO = cast<BinaryOperator>(U);
  // OpNo = which operand of BO is UserChain[ChainIndex - 1]
  unsigned OpNo = (BO->getOperand(0) == UserChain[ChainIndex - 1] ? 0 : 1);
  // The off-chain operand gets the pending casts applied to it directly;
  // the on-chain operand is cloned by the recursive call.
  Value *TheOther = applyExts(BO->getOperand(1 - OpNo));
  Value *NextInChain = distributeExtsAndCloneChain(ChainIndex - 1);

  // Recreate BO keeping the chain operand in its original position, so
  // non-commutative opcodes (e.g. sub) retain their semantics.
  BinaryOperator *NewBO = nullptr;
  if (OpNo == 0) {
    NewBO = BinaryOperator::Create(BO->getOpcode(), NextInChain, TheOther,
                                   BO->getName(), IP);
  } else {
    NewBO = BinaryOperator::Create(BO->getOpcode(), TheOther, NextInChain,
                                   BO->getName(), IP);
  }
  return UserChain[ChainIndex] = NewBO;
}
682
683
2.79k
Value *ConstantOffsetExtractor::removeConstOffset(unsigned ChainIndex) {
684
2.79k
  if (ChainIndex == 0) {
685
1.28k
    assert(isa<ConstantInt>(UserChain[ChainIndex]));
686
1.28k
    return ConstantInt::getNullValue(UserChain[ChainIndex]->getType());
687
1.28k
  }
688
1.51k
689
1.51k
  BinaryOperator *BO = cast<BinaryOperator>(UserChain[ChainIndex]);
690
1.51k
  assert(BO->getNumUses() <= 1 &&
691
1.51k
         "distributeExtsAndCloneChain clones each BinaryOperator in "
692
1.51k
         "UserChain, so no one should be used more than "
693
1.51k
         "once");
694
1.51k
695
1.51k
  unsigned OpNo = (BO->getOperand(0) == UserChain[ChainIndex - 1] ? 
0282
:
11.23k
);
696
1.51k
  assert(BO->getOperand(OpNo) == UserChain[ChainIndex - 1]);
697
1.51k
  Value *NextInChain = removeConstOffset(ChainIndex - 1);
698
1.51k
  Value *TheOther = BO->getOperand(1 - OpNo);
699
1.51k
700
1.51k
  // If NextInChain is 0 and not the LHS of a sub, we can simplify the
701
1.51k
  // sub-expression to be just TheOther.
702
1.51k
  if (ConstantInt *CI = dyn_cast<ConstantInt>(NextInChain)) {
703
1.23k
    if (CI->isZero() && !(BO->getOpcode() == Instruction::Sub && 
OpNo == 09
))
704
1.22k
      return TheOther;
705
285
  }
706
285
707
285
  BinaryOperator::BinaryOps NewOp = BO->getOpcode();
708
285
  if (BO->getOpcode() == Instruction::Or) {
709
1
    // Rebuild "or" as "add", because "or" may be invalid for the new
710
1
    // expression.
711
1
    //
712
1
    // For instance, given
713
1
    //   a | (b + 5) where a and b + 5 have no common bits,
714
1
    // we can extract 5 as the constant offset.
715
1
    //
716
1
    // However, reusing the "or" in the new index would give us
717
1
    //   (a | b) + 5
718
1
    // which does not equal a | (b + 5).
719
1
    //
720
1
    // Replacing the "or" with "add" is fine, because
721
1
    //   a | (b + 5) = a + (b + 5) = (a + b) + 5
722
1
    NewOp = Instruction::Add;
723
1
  }
724
285
725
285
  BinaryOperator *NewBO;
726
285
  if (OpNo == 0) {
727
275
    NewBO = BinaryOperator::Create(NewOp, NextInChain, TheOther, "", IP);
728
275
  } else {
729
10
    NewBO = BinaryOperator::Create(NewOp, TheOther, NextInChain, "", IP);
730
10
  }
731
285
  NewBO->takeName(BO);
732
285
  return NewBO;
733
285
}
734
735
Value *ConstantOffsetExtractor::Extract(Value *Idx, GetElementPtrInst *GEP,
                                        User *&UserChainTail,
                                        const DominatorTree *DT) {
  // Try to separate a constant offset out of the GEP index Idx. On success,
  // returns the rewritten index (with the constant removed) and sets
  // UserChainTail to the tail of the rebuilt user chain; otherwise returns
  // nullptr and clears UserChainTail.
  ConstantOffsetExtractor Extractor(GEP, DT);
  // Look for a non-zero constant offset buried inside Idx.
  APInt ConstantOffset =
      Extractor.find(Idx, /* SignExtended */ false, /* ZeroExtended */ false,
                     GEP->isInBounds());
  if (ConstantOffset != 0) {
    // Rebuild the index expression with the constant part stripped out.
    Value *NewIdx = Extractor.rebuildWithoutConstOffset();
    UserChainTail = Extractor.UserChain.back();
    return NewIdx;
  }
  UserChainTail = nullptr;
  return nullptr;
}
752
753
int64_t ConstantOffsetExtractor::Find(Value *Idx, GetElementPtrInst *GEP,
                                      const DominatorTree *DT) {
  // Returns the constant offset hidden in Idx without rewriting anything.
  // If Idx is an index of an inbound GEP, Idx is guaranteed to be
  // non-negative, which find() exploits when tracing through casts.
  ConstantOffsetExtractor Extractor(GEP, DT);
  APInt Offset = Extractor.find(Idx, /* SignExtended */ false,
                                /* ZeroExtended */ false, GEP->isInBounds());
  return Offset.getSExtValue();
}
761
762
bool SeparateConstOffsetFromGEP::canonicalizeArrayIndicesToPointerSize(
763
9.53k
    GetElementPtrInst *GEP) {
764
9.53k
  bool Changed = false;
765
9.53k
  Type *IntPtrTy = DL->getIntPtrType(GEP->getType());
766
9.53k
  gep_type_iterator GTI = gep_type_begin(*GEP);
767
9.53k
  for (User::op_iterator I = GEP->op_begin() + 1, E = GEP->op_end();
768
20.5k
       I != E; 
++I, ++GTI11.0k
) {
769
11.0k
    // Skip struct member indices which must be i32.
770
11.0k
    if (GTI.isSequential()) {
771
10.8k
      if ((*I)->getType() != IntPtrTy) {
772
4.57k
        *I = CastInst::CreateIntegerCast(*I, IntPtrTy, true, "idxprom", GEP);
773
4.57k
        Changed = true;
774
4.57k
      }
775
10.8k
    }
776
11.0k
  }
777
9.53k
  return Changed;
778
9.53k
}
779
780
int64_t
SeparateConstOffsetFromGEP::accumulateByteOffset(GetElementPtrInst *GEP,
                                                 bool &NeedsExtraction) {
  // Walk every index of GEP and sum, in bytes, all constant offsets that
  // could be split out of it. NeedsExtraction reports whether any non-zero
  // constant contribution was found.
  NeedsExtraction = false;
  int64_t ByteOffset = 0;
  gep_type_iterator GTI = gep_type_begin(*GEP);
  for (unsigned OpIdx = 1, NumOps = GEP->getNumOperands(); OpIdx != NumOps;
       ++OpIdx, ++GTI) {
    if (GTI.isSequential()) {
      // Array-style index: probe it for an embedded constant offset.
      int64_t ConstOff =
          ConstantOffsetExtractor::Find(GEP->getOperand(OpIdx), GEP, DT);
      if (ConstOff == 0)
        continue;
      NeedsExtraction = true;
      // Scale the extracted index constant by the element size to get a
      // byte offset; multiple indices accumulate into one total.
      ByteOffset += ConstOff * DL->getTypeAllocSize(GTI.getIndexedType());
    } else if (LowerGEP) {
      // Struct field index: its byte offset is fixed by the struct layout.
      // Only accumulated when lowering GEPs; otherwise the struct index
      // stays in the rewritten GEP. Field 0 contributes nothing.
      uint64_t FieldNo =
          cast<ConstantInt>(GEP->getOperand(OpIdx))->getZExtValue();
      if (FieldNo == 0)
        continue;
      NeedsExtraction = true;
      ByteOffset +=
          DL->getStructLayout(GTI.getStructType())->getElementOffset(FieldNo);
    }
  }
  return ByteOffset;
}
812
813
// Lowers a multi-index GEP (whose constant offsets have already been
// extracted into AccumulativeByteOffset) into a chain of i8* GEPs, each with
// a single scaled index, followed by one GEP with the constant byte offset.
// When the base pointer is loop invariant, the first and last GEPs may be
// swapped so LICM can hoist the constant-offset GEP out of the loop.
void SeparateConstOffsetFromGEP::lowerToSingleIndexGEPs(
    GetElementPtrInst *Variadic, int64_t AccumulativeByteOffset) {
  IRBuilder<> Builder(Variadic);
  Type *IntPtrTy = DL->getIntPtrType(Variadic->getType());

  Type *I8PtrTy =
      Builder.getInt8PtrTy(Variadic->getType()->getPointerAddressSpace());
  Value *ResultPtr = Variadic->getOperand(0);
  Loop *L = LI->getLoopFor(Variadic->getParent());
  // Check if the base is not loop invariant or used more than once.
  bool isSwapCandidate =
      L && L->isLoopInvariant(ResultPtr) &&
      !hasMoreThanOneUseInLoop(ResultPtr, L);
  Value *FirstResult = nullptr;

  if (ResultPtr->getType() != I8PtrTy)
    ResultPtr = Builder.CreateBitCast(ResultPtr, I8PtrTy);

  gep_type_iterator GTI = gep_type_begin(*Variadic);
  // Create an ugly GEP for each sequential index. We don't create GEPs for
  // structure indices, as they are accumulated in the constant offset index.
  for (unsigned I = 1, E = Variadic->getNumOperands(); I != E; ++I, ++GTI) {
    if (GTI.isSequential()) {
      Value *Idx = Variadic->getOperand(I);
      // Skip zero indices.
      if (ConstantInt *CI = dyn_cast<ConstantInt>(Idx))
        if (CI->isZero())
          continue;

      APInt ElementSize = APInt(IntPtrTy->getIntegerBitWidth(),
                                DL->getTypeAllocSize(GTI.getIndexedType()));
      // Scale the index by element size.
      if (ElementSize != 1) {
        if (ElementSize.isPowerOf2()) {
          // Use a shift for power-of-two element sizes.
          Idx = Builder.CreateShl(
              Idx, ConstantInt::get(IntPtrTy, ElementSize.logBase2()));
        } else {
          Idx = Builder.CreateMul(Idx, ConstantInt::get(IntPtrTy, ElementSize));
        }
      }
      // Create an ugly GEP with a single index for each index.
      ResultPtr =
          Builder.CreateGEP(Builder.getInt8Ty(), ResultPtr, Idx, "uglygep");
      // Remember the first variable-index GEP: it is the swap candidate.
      if (FirstResult == nullptr)
        FirstResult = ResultPtr;
    }
  }

  // Create a GEP with the constant offset index.
  if (AccumulativeByteOffset != 0) {
    Value *Offset = ConstantInt::get(IntPtrTy, AccumulativeByteOffset);
    ResultPtr =
        Builder.CreateGEP(Builder.getInt8Ty(), ResultPtr, Offset, "uglygep");
  } else
    isSwapCandidate = false;

  // If we created a GEP with constant index, and the base is loop invariant,
  // then we swap the first one with it, so LICM can move constant GEP out
  // later.
  GetElementPtrInst *FirstGEP = dyn_cast_or_null<GetElementPtrInst>(FirstResult);
  GetElementPtrInst *SecondGEP = dyn_cast_or_null<GetElementPtrInst>(ResultPtr);
  if (isSwapCandidate && isLegalToSwapOperand(FirstGEP, SecondGEP, L))
    swapGEPOperand(FirstGEP, SecondGEP);

  if (ResultPtr->getType() != Variadic->getType())
    ResultPtr = Builder.CreateBitCast(ResultPtr, Variadic->getType());

  // Replace the original variadic GEP with the lowered chain and delete it.
  Variadic->replaceAllUsesWith(ResultPtr);
  Variadic->eraseFromParent();
}
883
884
void
885
SeparateConstOffsetFromGEP::lowerToArithmetics(GetElementPtrInst *Variadic,
886
21
                                               int64_t AccumulativeByteOffset) {
887
21
  IRBuilder<> Builder(Variadic);
888
21
  Type *IntPtrTy = DL->getIntPtrType(Variadic->getType());
889
21
890
21
  Value *ResultPtr = Builder.CreatePtrToInt(Variadic->getOperand(0), IntPtrTy);
891
21
  gep_type_iterator GTI = gep_type_begin(*Variadic);
892
21
  // Create ADD/SHL/MUL arithmetic operations for each sequential indices. We
893
21
  // don't create arithmetics for structure indices, as they are accumulated
894
21
  // in the constant offset index.
895
90
  for (unsigned I = 1, E = Variadic->getNumOperands(); I != E; 
++I, ++GTI69
) {
896
69
    if (GTI.isSequential()) {
897
42
      Value *Idx = Variadic->getOperand(I);
898
42
      // Skip zero indices.
899
42
      if (ConstantInt *CI = dyn_cast<ConstantInt>(Idx))
900
24
        if (CI->isZero())
901
21
          continue;
902
21
903
21
      APInt ElementSize = APInt(IntPtrTy->getIntegerBitWidth(),
904
21
                                DL->getTypeAllocSize(GTI.getIndexedType()));
905
21
      // Scale the index by element size.
906
21
      if (ElementSize != 1) {
907
21
        if (ElementSize.isPowerOf2()) {
908
12
          Idx = Builder.CreateShl(
909
12
              Idx, ConstantInt::get(IntPtrTy, ElementSize.logBase2()));
910
12
        } else {
911
9
          Idx = Builder.CreateMul(Idx, ConstantInt::get(IntPtrTy, ElementSize));
912
9
        }
913
21
      }
914
21
      // Create an ADD for each index.
915
21
      ResultPtr = Builder.CreateAdd(ResultPtr, Idx);
916
21
    }
917
69
  }
918
21
919
21
  // Create an ADD for the constant offset index.
920
21
  if (AccumulativeByteOffset != 0) {
921
21
    ResultPtr = Builder.CreateAdd(
922
21
        ResultPtr, ConstantInt::get(IntPtrTy, AccumulativeByteOffset));
923
21
  }
924
21
925
21
  ResultPtr = Builder.CreateIntToPtr(ResultPtr, Variadic->getType());
926
21
  Variadic->replaceAllUsesWith(ResultPtr);
927
21
  Variadic->eraseFromParent();
928
21
}
929
930
// Main per-GEP driver: canonicalizes indices, accumulates all extractable
// constant offsets, then either (a) lowers the GEP entirely (LowerGEP mode)
// or (b) splits it into a variadic-base GEP plus a trailing constant-offset
// GEP. Returns true if the IR was changed.
bool SeparateConstOffsetFromGEP::splitGEP(GetElementPtrInst *GEP) {
  // Skip vector GEPs.
  if (GEP->getType()->isVectorTy())
    return false;

  // The backend can already nicely handle the case where all indices are
  // constant.
  if (GEP->hasAllConstantIndices())
    return false;

  bool Changed = canonicalizeArrayIndicesToPointerSize(GEP);

  bool NeedsExtraction;
  int64_t AccumulativeByteOffset = accumulateByteOffset(GEP, NeedsExtraction);

  if (!NeedsExtraction)
    return Changed;

  TargetTransformInfo &TTI =
      getAnalysis<TargetTransformInfoWrapperPass>().getTTI(*GEP->getFunction());

  // If LowerGEP is disabled, before really splitting the GEP, check whether the
  // backend supports the addressing mode we are about to produce. If no, this
  // splitting probably won't be beneficial.
  // If LowerGEP is enabled, even the extracted constant offset can not match
  // the addressing mode, we can still do optimizations to other lowered parts
  // of variable indices. Therefore, we don't check for addressing modes in that
  // case.
  if (!LowerGEP) {
    unsigned AddrSpace = GEP->getPointerAddressSpace();
    if (!TTI.isLegalAddressingMode(GEP->getResultElementType(),
                                   /*BaseGV=*/nullptr, AccumulativeByteOffset,
                                   /*HasBaseReg=*/true, /*Scale=*/0,
                                   AddrSpace)) {
      return Changed;
    }
  }

  // Remove the constant offset in each sequential index. The resultant GEP
  // computes the variadic base.
  // Notice that we don't remove struct field indices here. If LowerGEP is
  // disabled, a structure index is not accumulated and we still use the old
  // one. If LowerGEP is enabled, a structure index is accumulated in the
  // constant offset. LowerToSingleIndexGEPs or lowerToArithmetics will later
  // handle the constant offset and won't need a new structure index.
  gep_type_iterator GTI = gep_type_begin(*GEP);
  for (unsigned I = 1, E = GEP->getNumOperands(); I != E; ++I, ++GTI) {
    if (GTI.isSequential()) {
      // Splits this GEP index into a variadic part and a constant offset, and
      // uses the variadic part as the new index.
      Value *OldIdx = GEP->getOperand(I);
      User *UserChainTail;
      Value *NewIdx =
          ConstantOffsetExtractor::Extract(OldIdx, GEP, UserChainTail, DT);
      if (NewIdx != nullptr) {
        // Switches to the index with the constant offset removed.
        GEP->setOperand(I, NewIdx);
        // After switching to the new index, we can garbage-collect UserChain
        // and the old index if they are not used.
        RecursivelyDeleteTriviallyDeadInstructions(UserChainTail);
        RecursivelyDeleteTriviallyDeadInstructions(OldIdx);
      }
    }
  }

  // Clear the inbounds attribute because the new index may be off-bound.
  // e.g.,
  //
  //   b     = add i64 a, 5
  //   addr  = gep inbounds float, float* p, i64 b
  //
  // is transformed to:
  //
  //   addr2 = gep float, float* p, i64 a ; inbounds removed
  //   addr  = gep inbounds float, float* addr2, i64 5
  //
  // If a is -4, although the old index b is in bounds, the new index a is
  // off-bound. http://llvm.org/docs/LangRef.html#id181 says "if the
  // inbounds keyword is not present, the offsets are added to the base
  // address with silently-wrapping two's complement arithmetic".
  // Therefore, the final code will be a semantically equivalent.
  //
  // TODO(jingyue): do some range analysis to keep as many inbounds as
  // possible. GEPs with inbounds are more friendly to alias analysis.
  bool GEPWasInBounds = GEP->isInBounds();
  GEP->setIsInBounds(false);

  // Lowers a GEP to either GEPs with a single index or arithmetic operations.
  if (LowerGEP) {
    // As currently BasicAA does not analyze ptrtoint/inttoptr, do not lower to
    // arithmetic operations if the target uses alias analysis in codegen.
    if (TTI.useAA())
      lowerToSingleIndexGEPs(GEP, AccumulativeByteOffset);
    else
      lowerToArithmetics(GEP, AccumulativeByteOffset);
    return true;
  }

  // No need to create another GEP if the accumulative byte offset is 0.
  if (AccumulativeByteOffset == 0)
    return true;

  // Offsets the base with the accumulative byte offset.
  //
  //   %gep                        ; the base
  //   ... %gep ...
  //
  // => add the offset
  //
  //   %gep2                       ; clone of %gep
  //   %new.gep = gep %gep2, <offset / sizeof(*%gep)>
  //   %gep                        ; will be removed
  //   ... %gep ...
  //
  // => replace all uses of %gep with %new.gep and remove %gep
  //
  //   %gep2                       ; clone of %gep
  //   %new.gep = gep %gep2, <offset / sizeof(*%gep)>
  //   ... %new.gep ...
  //
  // If AccumulativeByteOffset is not a multiple of sizeof(*%gep), we emit an
  // uglygep (http://llvm.org/docs/GetElementPtr.html#what-s-an-uglygep):
  // bitcast %gep2 to i8*, add the offset, and bitcast the result back to the
  // type of %gep.
  //
  //   %gep2                       ; clone of %gep
  //   %0       = bitcast %gep2 to i8*
  //   %uglygep = gep %0, <offset>
  //   %new.gep = bitcast %uglygep to <type of %gep>
  //   ... %new.gep ...
  Instruction *NewGEP = GEP->clone();
  NewGEP->insertBefore(GEP);

  // Per ANSI C standard, signed / unsigned = unsigned and signed % unsigned =
  // unsigned.. Therefore, we cast ElementTypeSizeOfGEP to signed because it is
  // used with unsigned integers later.
  int64_t ElementTypeSizeOfGEP = static_cast<int64_t>(
      DL->getTypeAllocSize(GEP->getResultElementType()));
  Type *IntPtrTy = DL->getIntPtrType(GEP->getType());
  if (AccumulativeByteOffset % ElementTypeSizeOfGEP == 0) {
    // Very likely. As long as %gep is naturally aligned, the byte offset we
    // extracted should be a multiple of sizeof(*%gep).
    int64_t Index = AccumulativeByteOffset / ElementTypeSizeOfGEP;
    NewGEP = GetElementPtrInst::Create(GEP->getResultElementType(), NewGEP,
                                       ConstantInt::get(IntPtrTy, Index, true),
                                       GEP->getName(), GEP);
    NewGEP->copyMetadata(*GEP);
    // Inherit the inbounds attribute of the original GEP.
    cast<GetElementPtrInst>(NewGEP)->setIsInBounds(GEPWasInBounds);
  } else {
    // Unlikely but possible. For example,
    // #pragma pack(1)
    // struct S {
    //   int a[3];
    //   int64 b[8];
    // };
    // #pragma pack()
    //
    // Suppose the gep before extraction is &s[i + 1].b[j + 3]. After
    // extraction, it becomes &s[i].b[j] and AccumulativeByteOffset is
    // sizeof(S) + 3 * sizeof(int64) = 100, which is not a multiple of
    // sizeof(int64).
    //
    // Emit an uglygep in this case.
    Type *I8PtrTy = Type::getInt8PtrTy(GEP->getContext(),
                                       GEP->getPointerAddressSpace());
    NewGEP = new BitCastInst(NewGEP, I8PtrTy, "", GEP);
    NewGEP = GetElementPtrInst::Create(
        Type::getInt8Ty(GEP->getContext()), NewGEP,
        ConstantInt::get(IntPtrTy, AccumulativeByteOffset, true), "uglygep",
        GEP);
    NewGEP->copyMetadata(*GEP);
    // Inherit the inbounds attribute of the original GEP.
    cast<GetElementPtrInst>(NewGEP)->setIsInBounds(GEPWasInBounds);
    if (GEP->getType() != I8PtrTy)
      NewGEP = new BitCastInst(NewGEP, GEP->getType(), GEP->getName(), GEP);
  }

  GEP->replaceAllUsesWith(NewGEP);
  GEP->eraseFromParent();

  return true;
}
1113
1114
// Legacy-pass entry point: splits every GEP in F, then runs the ext-reuniting
// cleanup. Returns true if the function was modified.
bool SeparateConstOffsetFromGEP::runOnFunction(Function &F) {
  if (skipFunction(F))
    return false;

  if (DisableSeparateConstOffsetFromGEP)
    return false;

  // Cache the analyses used by the per-GEP and cleanup phases.
  DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
  SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
  LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
  TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
  bool Changed = false;
  for (BasicBlock &B : F) {
    // Advance the iterator before visiting: splitGEP may erase the GEP.
    for (BasicBlock::iterator I = B.begin(), IE = B.end(); I != IE;)
      if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(I++))
        Changed |= splitGEP(GEP);
    // No need to split GEP ConstantExprs because all its indices are constant
    // already.
  }

  Changed |= reuniteExts(F);

  if (VerifyNoDeadCode)
    verifyNoDeadCode(F);

  return Changed;
}
1141
1142
Instruction *SeparateConstOffsetFromGEP::findClosestMatchingDominator(
    const SCEV *Key, Instruction *Dominatee) {
  // Look up the stack of instructions previously recorded under Key and
  // return the most recent one that dominates Dominatee, or nullptr.
  auto It = DominatingExprs.find(Key);
  if (It == DominatingExprs.end())
    return nullptr;

  auto &Candidates = It->second;
  // Blocks are visited in dominator-tree pre-order, so a candidate that does
  // not dominate the current instruction cannot dominate any later one
  // either; pop it permanently. This keeps the overall scan O(n).
  for (; !Candidates.empty(); Candidates.pop_back()) {
    Instruction *Candidate = Candidates.back();
    if (DT->dominates(Candidate, Dominatee))
      return Candidate;
  }
  return nullptr;
}
1161
1162
// Attempts to rewrite I when it matches sext(LHS) op sext(RHS) and a
// dominating no-signed-wrap "LHS op RHS" was recorded earlier: I becomes
// sext(Dom), reuniting the split extensions. Also records I itself as a
// candidate dominator when it is an nsw add/sub that cannot produce poison.
// Returns true if I was replaced.
bool SeparateConstOffsetFromGEP::reuniteExts(Instruction *I) {
  if (!SE->isSCEVable(I->getType()))
    return false;

  //   Dom: LHS+RHS
  //   I: sext(LHS)+sext(RHS)
  // If Dom can't sign overflow and Dom dominates I, optimize I to sext(Dom).
  // TODO: handle zext
  Value *LHS = nullptr, *RHS = nullptr;
  if (match(I, m_Add(m_SExt(m_Value(LHS)), m_SExt(m_Value(RHS)))) ||
      match(I, m_Sub(m_SExt(m_Value(LHS)), m_SExt(m_Value(RHS))))) {
    if (LHS->getType() == RHS->getType()) {
      // NOTE(review): the lookup key is an add-expr for both the add and the
      // sub pattern — presumably matching how candidates are keyed below;
      // confirm this cannot conflate a sub with a swapped-operand add.
      const SCEV *Key =
          SE->getAddExpr(SE->getUnknown(LHS), SE->getUnknown(RHS));
      if (auto *Dom = findClosestMatchingDominator(Key, I)) {
        Instruction *NewSExt = new SExtInst(Dom, I->getType(), "", I);
        NewSExt->takeName(I);
        I->replaceAllUsesWith(NewSExt);
        // I is now dead; clean it (and any newly dead operands) up.
        RecursivelyDeleteTriviallyDeadInstructions(I);
        return true;
      }
    }
  }

  // Add I to DominatingExprs if it's an add/sub that can't sign overflow.
  if (match(I, m_NSWAdd(m_Value(LHS), m_Value(RHS))) ||
      match(I, m_NSWSub(m_Value(LHS), m_Value(RHS)))) {
    if (programUndefinedIfFullPoison(I)) {
      const SCEV *Key =
          SE->getAddExpr(SE->getUnknown(LHS), SE->getUnknown(RHS));
      DominatingExprs[Key].push_back(I);
    }
  }
  return false;
}
1197
1198
38.4k
bool SeparateConstOffsetFromGEP::reuniteExts(Function &F) {
1199
38.4k
  bool Changed = false;
1200
38.4k
  DominatingExprs.clear();
1201
45.1k
  for (const auto Node : depth_first(DT)) {
1202
45.1k
    BasicBlock *BB = Node->getBlock();
1203
289k
    for (auto I = BB->begin(); I != BB->end(); ) {
1204
244k
      Instruction *Cur = &*I++;
1205
244k
      Changed |= reuniteExts(Cur);
1206
244k
    }
1207
45.1k
  }
1208
38.4k
  return Changed;
1209
38.4k
}
1210
1211
26
void SeparateConstOffsetFromGEP::verifyNoDeadCode(Function &F) {
1212
26
  for (BasicBlock &B : F) {
1213
302
    for (Instruction &I : B) {
1214
302
      if (isInstructionTriviallyDead(&I)) {
1215
0
        std::string ErrMessage;
1216
0
        raw_string_ostream RSO(ErrMessage);
1217
0
        RSO << "Dead instruction detected!\n" << I << "\n";
1218
0
        llvm_unreachable(RSO.str().c_str());
1219
0
      }
1220
302
    }
1221
26
  }
1222
26
}
1223
1224
// Heuristic legality/profitability check for swapping the index operands of
// two chained single-index GEPs (variable-index first, constant-index
// second) so LICM can hoist the constant one. Returns false on any condition
// under which the swap would be illegal or unprofitable.
bool SeparateConstOffsetFromGEP::isLegalToSwapOperand(
    GetElementPtrInst *FirstGEP, GetElementPtrInst *SecondGEP, Loop *CurLoop) {
  // FirstGEP must feed only SecondGEP, or the swap would change other users.
  if (!FirstGEP || !FirstGEP->hasOneUse())
    return false;

  if (!SecondGEP || FirstGEP->getParent() != SecondGEP->getParent())
    return false;

  if (FirstGEP == SecondGEP)
    return false;

  unsigned FirstNum = FirstGEP->getNumOperands();
  unsigned SecondNum = SecondGEP->getNumOperands();
  // Give up if the number of operands are not 2.
  if (FirstNum != SecondNum || FirstNum != 2)
    return false;

  Value *FirstBase = FirstGEP->getOperand(0);
  Value *SecondBase = SecondGEP->getOperand(0);
  Value *FirstOffset = FirstGEP->getOperand(1);
  // Give up if the index of the first GEP is loop invariant.
  if (CurLoop->isLoopInvariant(FirstOffset))
    return false;

  // Give up if base doesn't have same type.
  if (FirstBase->getType() != SecondBase->getType())
    return false;

  Instruction *FirstOffsetDef = dyn_cast<Instruction>(FirstOffset);

  // Check if the second operand of first GEP has constant coefficient.
  // For an example, for the following code,  we won't gain anything by
  // hoisting the second GEP out because the second GEP can be folded away.
  //   %scevgep.sum.ur159 = add i64 %idxprom48.ur, 256
  //   %67 = shl i64 %scevgep.sum.ur159, 2
  //   %uglygep160 = getelementptr i8* %65, i64 %67
  //   %uglygep161 = getelementptr i8* %uglygep160, i64 -1024

  // Skip constant shift instruction which may be generated by Splitting GEPs.
  if (FirstOffsetDef && FirstOffsetDef->isShift() &&
      isa<ConstantInt>(FirstOffsetDef->getOperand(1)))
    FirstOffsetDef = dyn_cast<Instruction>(FirstOffsetDef->getOperand(0));

  // Give up if FirstOffsetDef is an Add or Sub with constant.
  // Because it may not profitable at all due to constant folding.
  if (FirstOffsetDef)
    if (BinaryOperator *BO = dyn_cast<BinaryOperator>(FirstOffsetDef)) {
      unsigned opc = BO->getOpcode();
      if ((opc == Instruction::Add || opc == Instruction::Sub) &&
          (isa<ConstantInt>(BO->getOperand(0)) ||
           isa<ConstantInt>(BO->getOperand(1))))
        return false;
    }
  return true;
}
1279
1280
355
bool SeparateConstOffsetFromGEP::hasMoreThanOneUseInLoop(Value *V, Loop *L) {
1281
355
  int UsesInLoop = 0;
1282
977
  for (User *U : V->users()) {
1283
977
    if (Instruction *User = dyn_cast<Instruction>(U))
1284
791
      if (L->contains(User))
1285
709
        if (++UsesInLoop > 1)
1286
354
          return true;
1287
977
  }
1288
355
  
return false1
;
1289
355
}
1290
1291
// Swaps the single index operands of two chained GEPs (p+o+c -> p+c+o) and
// then recomputes whether the inbounds attribute can be kept: the
// intermediate pointer p+c is only provably in bounds if the accumulated
// constant offset stays within the base object's size.
void SeparateConstOffsetFromGEP::swapGEPOperand(GetElementPtrInst *First,
                                                GetElementPtrInst *Second) {
  Value *Offset1 = First->getOperand(1);
  Value *Offset2 = Second->getOperand(1);
  First->setOperand(1, Offset2);
  Second->setOperand(1, Offset1);

  // We changed p+o+c to p+c+o, p+c may not be inbound anymore.
  const DataLayout &DAL = First->getModule()->getDataLayout();
  APInt Offset(DAL.getIndexSizeInBits(
                   cast<PointerType>(First->getType())->getAddressSpace()),
               0);
  // Strip constant offsets back to the underlying base and accumulate them
  // into Offset so we can compare against the object's size.
  Value *NewBase =
      First->stripAndAccumulateInBoundsConstantOffsets(DAL, Offset);
  uint64_t ObjectSize;
  if (!getObjectSize(NewBase, ObjectSize, DAL, TLI) ||
     Offset.ugt(ObjectSize)) {
    // Unknown object size or offset past the end: conservatively drop
    // inbounds on both GEPs.
    First->setIsInBounds(false);
    Second->setIsInBounds(false);
  } else
    First->setIsInBounds(true);
}