/Users/buildslave/jenkins/sharedspace/clang-stage2-coverage-R@2/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp
Line | Count | Source (jump to first uncovered line) |
1 | | //===- InstCombineSimplifyDemanded.cpp ------------------------------------===// |
2 | | // |
3 | | // The LLVM Compiler Infrastructure |
4 | | // |
5 | | // This file is distributed under the University of Illinois Open Source |
6 | | // License. See LICENSE.TXT for details. |
7 | | // |
8 | | //===----------------------------------------------------------------------===// |
9 | | // |
10 | | // This file contains logic for simplifying instructions based on information |
11 | | // about how they are used. |
12 | | // |
13 | | //===----------------------------------------------------------------------===// |
14 | | |
15 | | #include "InstCombineInternal.h" |
16 | | #include "llvm/Analysis/ValueTracking.h" |
17 | | #include "llvm/IR/IntrinsicInst.h" |
18 | | #include "llvm/IR/PatternMatch.h" |
19 | | #include "llvm/Support/KnownBits.h" |
20 | | |
21 | | using namespace llvm; |
22 | | using namespace llvm::PatternMatch; |
23 | | |
24 | | #define DEBUG_TYPE "instcombine" |
25 | | |
26 | | /// Check to see if the specified operand of the specified instruction is a |
27 | | /// constant integer. If so, check to see if there are any bits set in the |
28 | | /// constant that are not demanded. If so, shrink the constant and return true. |
29 | | static bool ShrinkDemandedConstant(Instruction *I, unsigned OpNo, |
30 | 15.8M | const APInt &Demanded) { |
31 | 15.8M | assert(I && "No instruction?"); |
32 | 15.8M | assert(OpNo < I->getNumOperands() && "Operand index too large"); |
33 | 15.8M | |
34 | 15.8M | // The operand must be a constant integer or splat integer. |
35 | 15.8M | Value *Op = I->getOperand(OpNo); |
36 | 15.8M | const APInt *C; |
37 | 15.8M | if (!match(Op, m_APInt(C))) |
38 | 9.48M | return false; |
39 | 6.39M | |
40 | 6.39M | // If there are no bits set that aren't demanded, nothing to do. |
41 | 6.39M | if (6.39M C->isSubsetOf(Demanded)6.39M ) |
42 | 6.38M | return false; |
43 | 7.60k | |
44 | 7.60k | // This instruction is producing bits that are not demanded. Shrink the RHS. |
45 | 7.60k | I->setOperand(OpNo, ConstantInt::get(Op->getType(), *C & Demanded)); |
46 | 7.60k | |
47 | 7.60k | return true; |
48 | 7.60k | } |
49 | | |
50 | | |
51 | | |
52 | | /// Inst is an integer instruction that SimplifyDemandedBits knows about. See if |
53 | | /// the instruction has any properties that allow us to simplify its operands. |
54 | 8.99M | bool InstCombiner::SimplifyDemandedInstructionBits(Instruction &Inst) { |
55 | 8.99M | unsigned BitWidth = Inst.getType()->getScalarSizeInBits(); |
56 | 8.99M | KnownBits Known(BitWidth); |
57 | 8.99M | APInt DemandedMask(APInt::getAllOnesValue(BitWidth)); |
58 | 8.99M | |
59 | 8.99M | Value *V = SimplifyDemandedUseBits(&Inst, DemandedMask, Known, |
60 | 8.99M | 0, &Inst); |
61 | 8.99M | if (!V8.99M ) return false8.97M ; |
62 | 28.2k | if (28.2k V == &Inst28.2k ) return true21.3k ; |
63 | 6.89k | replaceInstUsesWith(Inst, V); |
64 | 6.89k | return true; |
65 | 6.89k | } |
66 | | |
67 | | /// This form of SimplifyDemandedBits simplifies the specified instruction |
68 | | /// operand if possible, updating it in place. It returns true if it made any |
69 | | /// change and false otherwise. |
70 | | bool InstCombiner::SimplifyDemandedBits(Instruction *I, unsigned OpNo, |
71 | | const APInt &DemandedMask, |
72 | | KnownBits &Known, |
73 | 75.6M | unsigned Depth) { |
74 | 75.6M | Use &U = I->getOperandUse(OpNo); |
75 | 75.6M | Value *NewVal = SimplifyDemandedUseBits(U.get(), DemandedMask, Known, |
76 | 75.6M | Depth, I); |
77 | 75.6M | if (!NewVal75.6M ) return false75.6M ; |
78 | 30.3k | U = NewVal; |
79 | 30.3k | return true; |
80 | 30.3k | } |
81 | | |
82 | | |
83 | | /// This function attempts to replace V with a simpler value based on the |
84 | | /// demanded bits. When this function is called, it is known that only the bits |
85 | | /// set in DemandedMask of the result of V are ever used downstream. |
86 | | /// Consequently, depending on the mask and V, it may be possible to replace V |
87 | | /// with a constant or one of its operands. In such cases, this function does |
88 | | /// the replacement and returns true. In all other cases, it returns false after |
89 | | /// analyzing the expression and setting KnownOne and known to be one in the |
90 | | /// expression. Known.Zero contains all the bits that are known to be zero in |
91 | | /// the expression. These are provided to potentially allow the caller (which |
92 | | /// might recursively be SimplifyDemandedBits itself) to simplify the |
93 | | /// expression. |
94 | | /// Known.One and Known.Zero always follow the invariant that: |
95 | | /// Known.One & Known.Zero == 0. |
96 | | /// That is, a bit can't be both 1 and 0. Note that the bits in Known.One and |
97 | | /// Known.Zero may only be accurate for those bits set in DemandedMask. Note |
98 | | /// also that the bitwidth of V, DemandedMask, Known.Zero and Known.One must all |
99 | | /// be the same. |
100 | | /// |
101 | | /// This returns null if it did not change anything and it permits no |
102 | | /// simplification. This returns V itself if it did some simplification of V's |
103 | | /// operands based on the information about what bits are demanded. This returns |
104 | | /// some other non-null value if it found out that V is equal to another value |
105 | | /// in the context where the specified bits are demanded, but not for all users. |
106 | | Value *InstCombiner::SimplifyDemandedUseBits(Value *V, APInt DemandedMask, |
107 | | KnownBits &Known, unsigned Depth, |
108 | 84.6M | Instruction *CxtI) { |
109 | 84.6M | assert(V != nullptr && "Null pointer of Value???"); |
110 | 84.6M | assert(Depth <= 6 && "Limit Search Depth"); |
111 | 84.6M | uint32_t BitWidth = DemandedMask.getBitWidth(); |
112 | 84.6M | Type *VTy = V->getType(); |
113 | 84.6M | assert( |
114 | 84.6M | (!VTy->isIntOrIntVectorTy() || VTy->getScalarSizeInBits() == BitWidth) && |
115 | 84.6M | Known.getBitWidth() == BitWidth && |
116 | 84.6M | "Value *V, DemandedMask and Known must have same BitWidth"); |
117 | 84.6M | |
118 | 84.6M | if (isa<Constant>(V)84.6M ) { |
119 | 22.4M | computeKnownBits(V, Known, Depth, CxtI); |
120 | 22.4M | return nullptr; |
121 | 22.4M | } |
122 | 62.2M | |
123 | 62.2M | Known.resetAll(); |
124 | 62.2M | if (DemandedMask.isNullValue()) // Not demanding any bits from V. |
125 | 299 | return UndefValue::get(VTy); |
126 | 62.2M | |
127 | 62.2M | if (62.2M Depth == 662.2M ) // Limit search depth. |
128 | 84.6k | return nullptr; |
129 | 62.1M | |
130 | 62.1M | Instruction *I = dyn_cast<Instruction>(V); |
131 | 62.1M | if (!I62.1M ) { |
132 | 4.55M | computeKnownBits(V, Known, Depth, CxtI); |
133 | 4.55M | return nullptr; // Only analyze instructions. |
134 | 4.55M | } |
135 | 57.6M | |
136 | 57.6M | // If there are multiple uses of this value and we aren't at the root, then |
137 | 57.6M | // we can't do any simplifications of the operands, because DemandedMask |
138 | 57.6M | // only reflects the bits demanded by *one* of the users. |
139 | 57.6M | if (57.6M Depth != 0 && 57.6M !I->hasOneUse()24.6M ) |
140 | 12.4M | return SimplifyMultipleUseDemandedBits(I, DemandedMask, Known, Depth, CxtI); |
141 | 45.1M | |
142 | 45.1M | KnownBits LHSKnown(BitWidth), RHSKnown(BitWidth); |
143 | 45.1M | |
144 | 45.1M | // If this is the root being simplified, allow it to have multiple uses, |
145 | 45.1M | // just set the DemandedMask to all bits so that we can try to simplify the |
146 | 45.1M | // operands. This allows visitTruncInst (for example) to simplify the |
147 | 45.1M | // operand of a trunc without duplicating all the logic below. |
148 | 45.1M | if (Depth == 0 && 45.1M !V->hasOneUse()32.9M ) |
149 | 16.8M | DemandedMask.setAllBits(); |
150 | 45.1M | |
151 | 45.1M | switch (I->getOpcode()) { |
152 | 20.0M | default: |
153 | 20.0M | computeKnownBits(I, Known, Depth, CxtI); |
154 | 20.0M | break; |
155 | 3.92M | case Instruction::And: { |
156 | 3.92M | // If either the LHS or the RHS are Zero, the result is zero. |
157 | 3.92M | if (SimplifyDemandedBits(I, 1, DemandedMask, RHSKnown, Depth + 1) || |
158 | 3.92M | SimplifyDemandedBits(I, 0, DemandedMask & ~RHSKnown.Zero, LHSKnown, |
159 | 3.92M | Depth + 1)) |
160 | 5.99k | return I; |
161 | 3.92M | assert(!RHSKnown.hasConflict() && "Bits known to be one AND zero?"); |
162 | 3.91M | assert(!LHSKnown.hasConflict() && "Bits known to be one AND zero?"); |
163 | 3.91M | |
164 | 3.91M | // Output known-0 are known to be clear if zero in either the LHS | RHS. |
165 | 3.91M | APInt IKnownZero = RHSKnown.Zero | LHSKnown.Zero; |
166 | 3.91M | // Output known-1 bits are only known if set in both the LHS & RHS. |
167 | 3.91M | APInt IKnownOne = RHSKnown.One & LHSKnown.One; |
168 | 3.91M | |
169 | 3.91M | // If the client is only demanding bits that we know, return the known |
170 | 3.91M | // constant. |
171 | 3.91M | if (DemandedMask.isSubsetOf(IKnownZero|IKnownOne)) |
172 | 42 | return Constant::getIntegerValue(VTy, IKnownOne); |
173 | 3.91M | |
174 | 3.91M | // If all of the demanded bits are known 1 on one side, return the other. |
175 | 3.91M | // These bits cannot contribute to the result of the 'and'. |
176 | 3.91M | if (3.91M DemandedMask.isSubsetOf(LHSKnown.Zero | RHSKnown.One)3.91M ) |
177 | 9.79k | return I->getOperand(0); |
178 | 3.90M | if (3.90M DemandedMask.isSubsetOf(RHSKnown.Zero | LHSKnown.One)3.90M ) |
179 | 815 | return I->getOperand(1); |
180 | 3.90M | |
181 | 3.90M | // If the RHS is a constant, see if we can simplify it. |
182 | 3.90M | if (3.90M ShrinkDemandedConstant(I, 1, DemandedMask & ~LHSKnown.Zero)3.90M ) |
183 | 5.02k | return I; |
184 | 3.89M | |
185 | 3.89M | Known.Zero = std::move(IKnownZero); |
186 | 3.89M | Known.One = std::move(IKnownOne); |
187 | 3.89M | break; |
188 | 3.89M | } |
189 | 1.48M | case Instruction::Or: { |
190 | 1.48M | // If either the LHS or the RHS are One, the result is One. |
191 | 1.48M | if (SimplifyDemandedBits(I, 1, DemandedMask, RHSKnown, Depth + 1) || |
192 | 1.48M | SimplifyDemandedBits(I, 0, DemandedMask & ~RHSKnown.One, LHSKnown, |
193 | 1.48M | Depth + 1)) |
194 | 3.15k | return I; |
195 | 1.48M | assert(!RHSKnown.hasConflict() && "Bits known to be one AND zero?"); |
196 | 1.48M | assert(!LHSKnown.hasConflict() && "Bits known to be one AND zero?"); |
197 | 1.48M | |
198 | 1.48M | // Output known-0 bits are only known if clear in both the LHS & RHS. |
199 | 1.48M | APInt IKnownZero = RHSKnown.Zero & LHSKnown.Zero; |
200 | 1.48M | // Output known-1 are known. to be set if s.et in either the LHS | RHS. |
201 | 1.48M | APInt IKnownOne = RHSKnown.One | LHSKnown.One; |
202 | 1.48M | |
203 | 1.48M | // If the client is only demanding bits that we know, return the known |
204 | 1.48M | // constant. |
205 | 1.48M | if (DemandedMask.isSubsetOf(IKnownZero|IKnownOne)) |
206 | 9 | return Constant::getIntegerValue(VTy, IKnownOne); |
207 | 1.48M | |
208 | 1.48M | // If all of the demanded bits are known zero on one side, return the other. |
209 | 1.48M | // These bits cannot contribute to the result of the 'or'. |
210 | 1.48M | if (1.48M DemandedMask.isSubsetOf(LHSKnown.One | RHSKnown.Zero)1.48M ) |
211 | 254 | return I->getOperand(0); |
212 | 1.48M | if (1.48M DemandedMask.isSubsetOf(RHSKnown.One | LHSKnown.Zero)1.48M ) |
213 | 75 | return I->getOperand(1); |
214 | 1.48M | |
215 | 1.48M | // If the RHS is a constant, see if we can simplify it. |
216 | 1.48M | if (1.48M ShrinkDemandedConstant(I, 1, DemandedMask)1.48M ) |
217 | 7 | return I; |
218 | 1.48M | |
219 | 1.48M | Known.Zero = std::move(IKnownZero); |
220 | 1.48M | Known.One = std::move(IKnownOne); |
221 | 1.48M | break; |
222 | 1.48M | } |
223 | 533k | case Instruction::Xor: { |
224 | 533k | if (SimplifyDemandedBits(I, 1, DemandedMask, RHSKnown, Depth + 1) || |
225 | 533k | SimplifyDemandedBits(I, 0, DemandedMask, LHSKnown, Depth + 1)) |
226 | 331 | return I; |
227 | 533k | assert(!RHSKnown.hasConflict() && "Bits known to be one AND zero?"); |
228 | 533k | assert(!LHSKnown.hasConflict() && "Bits known to be one AND zero?"); |
229 | 533k | |
230 | 533k | // Output known-0 bits are known if clear or set in both the LHS & RHS. |
231 | 533k | APInt IKnownZero = (RHSKnown.Zero & LHSKnown.Zero) | |
232 | 533k | (RHSKnown.One & LHSKnown.One); |
233 | 533k | // Output known-1 are known to be set if set in only one of the LHS, RHS. |
234 | 533k | APInt IKnownOne = (RHSKnown.Zero & LHSKnown.One) | |
235 | 533k | (RHSKnown.One & LHSKnown.Zero); |
236 | 533k | |
237 | 533k | // If the client is only demanding bits that we know, return the known |
238 | 533k | // constant. |
239 | 533k | if (DemandedMask.isSubsetOf(IKnownZero|IKnownOne)) |
240 | 6 | return Constant::getIntegerValue(VTy, IKnownOne); |
241 | 533k | |
242 | 533k | // If all of the demanded bits are known zero on one side, return the other. |
243 | 533k | // These bits cannot contribute to the result of the 'xor'. |
244 | 533k | if (533k DemandedMask.isSubsetOf(RHSKnown.Zero)533k ) |
245 | 4 | return I->getOperand(0); |
246 | 533k | if (533k DemandedMask.isSubsetOf(LHSKnown.Zero)533k ) |
247 | 0 | return I->getOperand(1); |
248 | 533k | |
249 | 533k | // If all of the demanded bits are known to be zero on one side or the |
250 | 533k | // other, turn this into an *inclusive* or. |
251 | 533k | // e.g. (A & C1)^(B & C2) -> (A & C1)|(B & C2) iff C1&C2 == 0 |
252 | 533k | if (533k DemandedMask.isSubsetOf(RHSKnown.Zero | LHSKnown.Zero)533k ) { |
253 | 50 | Instruction *Or = |
254 | 50 | BinaryOperator::CreateOr(I->getOperand(0), I->getOperand(1), |
255 | 50 | I->getName()); |
256 | 50 | return InsertNewInstWith(Or, *I); |
257 | 50 | } |
258 | 533k | |
259 | 533k | // If all of the demanded bits on one side are known, and all of the set |
260 | 533k | // bits on that side are also known to be set on the other side, turn this |
261 | 533k | // into an AND, as we know the bits will be cleared. |
262 | 533k | // e.g. (X | C1) ^ C2 --> (X | C1) & ~C2 iff (C1&C2) == C2 |
263 | 533k | if (533k DemandedMask.isSubsetOf(RHSKnown.Zero|RHSKnown.One) && |
264 | 533k | RHSKnown.One.isSubsetOf(LHSKnown.One)301k ) { |
265 | 8 | Constant *AndC = Constant::getIntegerValue(VTy, |
266 | 8 | ~RHSKnown.One & DemandedMask); |
267 | 8 | Instruction *And = BinaryOperator::CreateAnd(I->getOperand(0), AndC); |
268 | 8 | return InsertNewInstWith(And, *I); |
269 | 8 | } |
270 | 533k | |
271 | 533k | // If the RHS is a constant, see if we can simplify it. |
272 | 533k | // FIXME: for XOR, we prefer to force bits to 1 if they will make a -1. |
273 | 533k | if (533k ShrinkDemandedConstant(I, 1, DemandedMask)533k ) |
274 | 549 | return I; |
275 | 532k | |
276 | 532k | // If our LHS is an 'and' and if it has one use, and if any of the bits we |
277 | 532k | // are flipping are known to be set, then the xor is just resetting those |
278 | 532k | // bits to zero. We can just knock out bits from the 'and' and the 'xor', |
279 | 532k | // simplifying both of them. |
280 | 532k | if (Instruction *532k LHSInst532k = dyn_cast<Instruction>(I->getOperand(0))) |
281 | 523k | if (523k LHSInst->getOpcode() == Instruction::And && 523k LHSInst->hasOneUse()42.9k && |
282 | 38.9k | isa<ConstantInt>(I->getOperand(1)) && |
283 | 27.0k | isa<ConstantInt>(LHSInst->getOperand(1)) && |
284 | 523k | (LHSKnown.One & RHSKnown.One & DemandedMask) != 015.3k ) { |
285 | 0 | ConstantInt *AndRHS = cast<ConstantInt>(LHSInst->getOperand(1)); |
286 | 0 | ConstantInt *XorRHS = cast<ConstantInt>(I->getOperand(1)); |
287 | 0 | APInt NewMask = ~(LHSKnown.One & RHSKnown.One & DemandedMask); |
288 | 0 |
|
289 | 0 | Constant *AndC = |
290 | 0 | ConstantInt::get(I->getType(), NewMask & AndRHS->getValue()); |
291 | 0 | Instruction *NewAnd = BinaryOperator::CreateAnd(I->getOperand(0), AndC); |
292 | 0 | InsertNewInstWith(NewAnd, *I); |
293 | 0 |
|
294 | 0 | Constant *XorC = |
295 | 0 | ConstantInt::get(I->getType(), NewMask & XorRHS->getValue()); |
296 | 0 | Instruction *NewXor = BinaryOperator::CreateXor(NewAnd, XorC); |
297 | 0 | return InsertNewInstWith(NewXor, *I); |
298 | 0 | } |
299 | 532k | |
300 | 532k | // Output known-0 bits are known if clear or set in both the LHS & RHS. |
301 | 532k | Known.Zero = std::move(IKnownZero); |
302 | 532k | // Output known-1 are known to be set if set in only one of the LHS, RHS. |
303 | 532k | Known.One = std::move(IKnownOne); |
304 | 532k | break; |
305 | 532k | } |
306 | 934k | case Instruction::Select: |
307 | 934k | // If this is a select as part of a min/max pattern, don't simplify any |
308 | 934k | // further in case we break the structure. |
309 | 934k | Value *LHS, *RHS; |
310 | 934k | if (matchSelectPattern(I, LHS, RHS).Flavor != SPF_UNKNOWN) |
311 | 538k | return nullptr; |
312 | 395k | |
313 | 395k | if (395k SimplifyDemandedBits(I, 2, DemandedMask, RHSKnown, Depth + 1) || |
314 | 395k | SimplifyDemandedBits(I, 1, DemandedMask, LHSKnown, Depth + 1)) |
315 | 383 | return I; |
316 | 395k | assert(!RHSKnown.hasConflict() && "Bits known to be one AND zero?"); |
317 | 394k | assert(!LHSKnown.hasConflict() && "Bits known to be one AND zero?"); |
318 | 394k | |
319 | 394k | // If the operands are constants, see if we can simplify them. |
320 | 394k | if (ShrinkDemandedConstant(I, 1, DemandedMask) || |
321 | 394k | ShrinkDemandedConstant(I, 2, DemandedMask)) |
322 | 314 | return I; |
323 | 394k | |
324 | 394k | // Only known if known in both the LHS and RHS. |
325 | 394k | Known.One = RHSKnown.One & LHSKnown.One; |
326 | 394k | Known.Zero = RHSKnown.Zero & LHSKnown.Zero; |
327 | 394k | break; |
328 | 4.47M | case Instruction::ZExt: |
329 | 4.47M | case Instruction::Trunc: { |
330 | 4.47M | unsigned SrcBitWidth = I->getOperand(0)->getType()->getScalarSizeInBits(); |
331 | 4.47M | |
332 | 4.47M | APInt InputDemandedMask = DemandedMask.zextOrTrunc(SrcBitWidth); |
333 | 4.47M | KnownBits InputKnown(SrcBitWidth); |
334 | 4.47M | if (SimplifyDemandedBits(I, 0, InputDemandedMask, InputKnown, Depth + 1)) |
335 | 9.08k | return I; |
336 | 4.46M | Known = Known.zextOrTrunc(BitWidth); |
337 | 4.46M | // Any top bits are known to be zero. |
338 | 4.46M | if (BitWidth > SrcBitWidth) |
339 | 1.21M | Known.Zero.setBitsFrom(SrcBitWidth); |
340 | 4.46M | assert(!Known.hasConflict() && "Bits known to be one AND zero?"); |
341 | 4.46M | break; |
342 | 4.46M | } |
343 | 46.9k | case Instruction::BitCast: |
344 | 46.9k | if (!I->getOperand(0)->getType()->isIntOrIntVectorTy()) |
345 | 46.7k | return nullptr; // vector->int or fp->int? |
346 | 181 | |
347 | 181 | if (VectorType *181 DstVTy181 = dyn_cast<VectorType>(I->getType())) { |
348 | 109 | if (VectorType *SrcVTy = |
349 | 86 | dyn_cast<VectorType>(I->getOperand(0)->getType())) { |
350 | 86 | if (DstVTy->getNumElements() != SrcVTy->getNumElements()) |
351 | 86 | // Don't touch a bitcast between vectors of different element counts. |
352 | 86 | return nullptr; |
353 | 109 | } else |
354 | 109 | // Don't touch a scalar-to-vector bitcast. |
355 | 23 | return nullptr; |
356 | 72 | } else if (72 I->getOperand(0)->getType()->isVectorTy()72 ) |
357 | 72 | // Don't touch a vector-to-scalar bitcast. |
358 | 72 | return nullptr; |
359 | 0 |
|
360 | 0 | if (0 SimplifyDemandedBits(I, 0, DemandedMask, Known, Depth + 1)0 ) |
361 | 0 | return I; |
362 | 0 | assert(!Known.hasConflict() && "Bits known to be one AND zero?"); |
363 | 0 | break; |
364 | 902k | case Instruction::SExt: { |
365 | 902k | // Compute the bits in the result that are not present in the input. |
366 | 902k | unsigned SrcBitWidth = I->getOperand(0)->getType()->getScalarSizeInBits(); |
367 | 902k | |
368 | 902k | APInt InputDemandedBits = DemandedMask.trunc(SrcBitWidth); |
369 | 902k | |
370 | 902k | // If any of the sign extended bits are demanded, we know that the sign |
371 | 902k | // bit is demanded. |
372 | 902k | if (DemandedMask.getActiveBits() > SrcBitWidth) |
373 | 899k | InputDemandedBits.setBit(SrcBitWidth-1); |
374 | 902k | |
375 | 902k | KnownBits InputKnown(SrcBitWidth); |
376 | 902k | if (SimplifyDemandedBits(I, 0, InputDemandedBits, InputKnown, Depth + 1)) |
377 | 20 | return I; |
378 | 902k | |
379 | 902k | // If the input sign bit is known zero, or if the NewBits are not demanded |
380 | 902k | // convert this into a zero extension. |
381 | 902k | if (902k InputKnown.isNonNegative() || |
382 | 902k | DemandedMask.getActiveBits() <= SrcBitWidth902k ) { |
383 | 2.31k | // Convert to ZExt cast. |
384 | 2.31k | CastInst *NewCast = new ZExtInst(I->getOperand(0), VTy, I->getName()); |
385 | 2.31k | return InsertNewInstWith(NewCast, *I); |
386 | 2.31k | } |
387 | 899k | |
388 | 899k | // If the sign bit of the input is known set or clear, then we know the |
389 | 899k | // top bits of the result. |
390 | 899k | Known = InputKnown.sext(BitWidth); |
391 | 899k | assert(!Known.hasConflict() && "Bits known to be one AND zero?"); |
392 | 899k | break; |
393 | 899k | } |
394 | 4.58M | case Instruction::Add: |
395 | 4.58M | case Instruction::Sub: { |
396 | 4.58M | /// If the high-bits of an ADD/SUB are not demanded, then we do not care |
397 | 4.58M | /// about the high bits of the operands. |
398 | 4.58M | unsigned NLZ = DemandedMask.countLeadingZeros(); |
399 | 4.58M | // Right fill the mask of bits for this ADD/SUB to demand the most |
400 | 4.58M | // significant bit and all those below it. |
401 | 4.58M | APInt DemandedFromOps(APInt::getLowBitsSet(BitWidth, BitWidth-NLZ)); |
402 | 4.58M | if (ShrinkDemandedConstant(I, 0, DemandedFromOps) || |
403 | 4.58M | SimplifyDemandedBits(I, 0, DemandedFromOps, LHSKnown, Depth + 1) || |
404 | 4.58M | ShrinkDemandedConstant(I, 1, DemandedFromOps) || |
405 | 4.58M | SimplifyDemandedBits(I, 1, DemandedFromOps, RHSKnown, Depth + 1)4.58M ) { |
406 | 4.19k | if (NLZ > 04.19k ) { |
407 | 4.01k | // Disable the nsw and nuw flags here: We can no longer guarantee that |
408 | 4.01k | // we won't wrap after simplification. Removing the nsw/nuw flags is |
409 | 4.01k | // legal here because the top bit is not demanded. |
410 | 4.01k | BinaryOperator &BinOP = *cast<BinaryOperator>(I); |
411 | 4.01k | BinOP.setHasNoSignedWrap(false); |
412 | 4.01k | BinOP.setHasNoUnsignedWrap(false); |
413 | 4.01k | } |
414 | 4.19k | return I; |
415 | 4.19k | } |
416 | 4.58M | |
417 | 4.58M | // If we are known to be adding/subtracting zeros to every bit below |
418 | 4.58M | // the highest demanded bit, we just return the other side. |
419 | 4.58M | if (4.58M DemandedFromOps.isSubsetOf(RHSKnown.Zero)4.58M ) |
420 | 395 | return I->getOperand(0); |
421 | 4.58M | // We can't do this with the LHS for subtraction, unless we are only |
422 | 4.58M | // demanding the LSB. |
423 | 4.58M | if (4.58M (I->getOpcode() == Instruction::Add || |
424 | 920k | DemandedFromOps.isOneValue()) && |
425 | 3.66M | DemandedFromOps.isSubsetOf(LHSKnown.Zero)) |
426 | 257 | return I->getOperand(1); |
427 | 4.58M | |
428 | 4.58M | // Otherwise just compute the known bits of the result. |
429 | 4.58M | bool NSW = cast<OverflowingBinaryOperator>(I)->hasNoSignedWrap(); |
430 | 4.58M | Known = KnownBits::computeForAddSub(I->getOpcode() == Instruction::Add, |
431 | 4.58M | NSW, LHSKnown, RHSKnown); |
432 | 4.58M | break; |
433 | 4.58M | } |
434 | 2.64M | case Instruction::Shl: { |
435 | 2.64M | const APInt *SA; |
436 | 2.64M | if (match(I->getOperand(1), m_APInt(SA))2.64M ) { |
437 | 2.25M | const APInt *ShrAmt; |
438 | 2.25M | if (match(I->getOperand(0), m_Shr(m_Value(), m_APInt(ShrAmt)))2.25M ) { |
439 | 35.7k | Instruction *Shr = cast<Instruction>(I->getOperand(0)); |
440 | 35.7k | if (Value *R = simplifyShrShlDemandedBits( |
441 | 35.7k | Shr, *ShrAmt, I, *SA, DemandedMask, Known)) |
442 | 414 | return R; |
443 | 2.25M | } |
444 | 2.25M | |
445 | 2.25M | uint64_t ShiftAmt = SA->getLimitedValue(BitWidth-1); |
446 | 2.25M | APInt DemandedMaskIn(DemandedMask.lshr(ShiftAmt)); |
447 | 2.25M | |
448 | 2.25M | // If the shift is NUW/NSW, then it does demand the high bits. |
449 | 2.25M | ShlOperator *IOp = cast<ShlOperator>(I); |
450 | 2.25M | if (IOp->hasNoSignedWrap()) |
451 | 994k | DemandedMaskIn.setHighBits(ShiftAmt+1); |
452 | 1.25M | else if (1.25M IOp->hasNoUnsignedWrap()1.25M ) |
453 | 184k | DemandedMaskIn.setHighBits(ShiftAmt); |
454 | 2.25M | |
455 | 2.25M | if (SimplifyDemandedBits(I, 0, DemandedMaskIn, Known, Depth + 1)) |
456 | 1.68k | return I; |
457 | 2.25M | assert(!Known.hasConflict() && "Bits known to be one AND zero?"); |
458 | 2.25M | Known.Zero <<= ShiftAmt; |
459 | 2.25M | Known.One <<= ShiftAmt; |
460 | 2.25M | // low bits known zero. |
461 | 2.25M | if (ShiftAmt) |
462 | 2.25M | Known.Zero.setLowBits(ShiftAmt); |
463 | 2.25M | } |
464 | 2.64M | break; |
465 | 2.64M | } |
466 | 2.26M | case Instruction::LShr: { |
467 | 2.26M | const APInt *SA; |
468 | 2.26M | if (match(I->getOperand(1), m_APInt(SA))2.26M ) { |
469 | 2.13M | uint64_t ShiftAmt = SA->getLimitedValue(BitWidth-1); |
470 | 2.13M | |
471 | 2.13M | // Unsigned shift right. |
472 | 2.13M | APInt DemandedMaskIn(DemandedMask.shl(ShiftAmt)); |
473 | 2.13M | |
474 | 2.13M | // If the shift is exact, then it does demand the low bits (and knows that |
475 | 2.13M | // they are zero). |
476 | 2.13M | if (cast<LShrOperator>(I)->isExact()) |
477 | 23.7k | DemandedMaskIn.setLowBits(ShiftAmt); |
478 | 2.13M | |
479 | 2.13M | if (SimplifyDemandedBits(I, 0, DemandedMaskIn, Known, Depth + 1)) |
480 | 2.47k | return I; |
481 | 2.13M | assert(!Known.hasConflict() && "Bits known to be one AND zero?"); |
482 | 2.12M | Known.Zero.lshrInPlace(ShiftAmt); |
483 | 2.12M | Known.One.lshrInPlace(ShiftAmt); |
484 | 2.12M | if (ShiftAmt) |
485 | 2.12M | Known.Zero.setHighBits(ShiftAmt); // high bits known zero. |
486 | 2.13M | } |
487 | 2.26M | break; |
488 | 2.26M | } |
489 | 483k | case Instruction::AShr: { |
490 | 483k | // If this is an arithmetic shift right and only the low-bit is set, we can |
491 | 483k | // always convert this into a logical shr, even if the shift amount is |
492 | 483k | // variable. The low bit of the shift cannot be an input sign bit unless |
493 | 483k | // the shift amount is >= the size of the datatype, which is undefined. |
494 | 483k | if (DemandedMask.isOneValue()483k ) { |
495 | 211 | // Perform the logical shift right. |
496 | 211 | Instruction *NewVal = BinaryOperator::CreateLShr( |
497 | 211 | I->getOperand(0), I->getOperand(1), I->getName()); |
498 | 211 | return InsertNewInstWith(NewVal, *I); |
499 | 211 | } |
500 | 482k | |
501 | 482k | // If the sign bit is the only bit demanded by this ashr, then there is no |
502 | 482k | // need to do it, the shift doesn't change the high bit. |
503 | 482k | if (482k DemandedMask.isSignMask()482k ) |
504 | 47 | return I->getOperand(0); |
505 | 482k | |
506 | 482k | const APInt *SA; |
507 | 482k | if (match(I->getOperand(1), m_APInt(SA))482k ) { |
508 | 427k | uint32_t ShiftAmt = SA->getLimitedValue(BitWidth-1); |
509 | 427k | |
510 | 427k | // Signed shift right. |
511 | 427k | APInt DemandedMaskIn(DemandedMask.shl(ShiftAmt)); |
512 | 427k | // If any of the high bits are demanded, we should set the sign bit as |
513 | 427k | // demanded. |
514 | 427k | if (DemandedMask.countLeadingZeros() <= ShiftAmt) |
515 | 422k | DemandedMaskIn.setSignBit(); |
516 | 427k | |
517 | 427k | // If the shift is exact, then it does demand the low bits (and knows that |
518 | 427k | // they are zero). |
519 | 427k | if (cast<AShrOperator>(I)->isExact()) |
520 | 223k | DemandedMaskIn.setLowBits(ShiftAmt); |
521 | 427k | |
522 | 427k | if (SimplifyDemandedBits(I, 0, DemandedMaskIn, Known, Depth + 1)) |
523 | 1.09k | return I; |
524 | 426k | |
525 | 426k | unsigned SignBits = ComputeNumSignBits(I->getOperand(0), Depth + 1, CxtI); |
526 | 426k | |
527 | 426k | assert(!Known.hasConflict() && "Bits known to be one AND zero?"); |
528 | 426k | // Compute the new bits that are at the top now plus sign bits. |
529 | 426k | APInt HighBits(APInt::getHighBitsSet( |
530 | 426k | BitWidth, std::min(SignBits + ShiftAmt - 1, BitWidth))); |
531 | 426k | Known.Zero.lshrInPlace(ShiftAmt); |
532 | 426k | Known.One.lshrInPlace(ShiftAmt); |
533 | 426k | |
534 | 426k | // If the input sign bit is known to be zero, or if none of the top bits |
535 | 426k | // are demanded, turn this into an unsigned shift right. |
536 | 426k | assert(BitWidth > ShiftAmt && "Shift amount not saturated?"); |
537 | 426k | if (Known.Zero[BitWidth-ShiftAmt-1] || |
538 | 426k | !DemandedMask.intersects(HighBits)423k ) { |
539 | 5.66k | BinaryOperator *LShr = BinaryOperator::CreateLShr(I->getOperand(0), |
540 | 5.66k | I->getOperand(1)); |
541 | 5.66k | LShr->setIsExact(cast<BinaryOperator>(I)->isExact()); |
542 | 5.66k | return InsertNewInstWith(LShr, *I); |
543 | 420k | } else if (420k Known.One[BitWidth-ShiftAmt-1]420k ) { // New bits are known one. |
544 | 10 | Known.One |= HighBits; |
545 | 10 | } |
546 | 427k | } |
547 | 476k | break; |
548 | 482k | } |
549 | 60.5k | case Instruction::SRem: |
550 | 60.5k | if (ConstantInt *Rem60.5k = dyn_cast<ConstantInt>(I->getOperand(1))) { |
551 | 17.0k | // X % -1 demands all the bits because we don't want to introduce |
552 | 17.0k | // INT_MIN % -1 (== undef) by accident. |
553 | 17.0k | if (Rem->isMinusOne()) |
554 | 0 | break; |
555 | 17.0k | APInt RA = Rem->getValue().abs(); |
556 | 17.0k | if (RA.isPowerOf2()17.0k ) { |
557 | 6.26k | if (DemandedMask.ult(RA)) // srem won't affect demanded bits |
558 | 1 | return I->getOperand(0); |
559 | 6.26k | |
560 | 6.26k | APInt LowBits = RA - 1; |
561 | 6.26k | APInt Mask2 = LowBits | APInt::getSignMask(BitWidth); |
562 | 6.26k | if (SimplifyDemandedBits(I, 0, Mask2, LHSKnown, Depth + 1)) |
563 | 2 | return I; |
564 | 6.26k | |
565 | 6.26k | // The low bits of LHS are unchanged by the srem. |
566 | 6.26k | Known.Zero = LHSKnown.Zero & LowBits; |
567 | 6.26k | Known.One = LHSKnown.One & LowBits; |
568 | 6.26k | |
569 | 6.26k | // If LHS is non-negative or has all low bits zero, then the upper bits |
570 | 6.26k | // are all zero. |
571 | 6.26k | if (LHSKnown.isNonNegative() || 6.26k LowBits.isSubsetOf(LHSKnown.Zero)6.24k ) |
572 | 18 | Known.Zero |= ~LowBits; |
573 | 6.26k | |
574 | 6.26k | // If LHS is negative and not all low bits are zero, then the upper bits |
575 | 6.26k | // are all one. |
576 | 6.26k | if (LHSKnown.isNegative() && 6.26k LowBits.intersects(LHSKnown.One)0 ) |
577 | 0 | Known.One |= ~LowBits; |
578 | 6.26k | |
579 | 6.26k | assert(!Known.hasConflict() && "Bits known to be one AND zero?"); |
580 | 6.26k | break; |
581 | 6.26k | } |
582 | 17.0k | } |
583 | 54.3k | |
584 | 54.3k | // The sign bit is the LHS's sign bit, except when the result of the |
585 | 54.3k | // remainder is zero. |
586 | 54.3k | if (54.3k DemandedMask.isSignBitSet()54.3k ) { |
587 | 51.5k | computeKnownBits(I->getOperand(0), LHSKnown, Depth + 1, CxtI); |
588 | 51.5k | // If it's known zero, our sign bit is also zero. |
589 | 51.5k | if (LHSKnown.isNonNegative()) |
590 | 76 | Known.makeNonNegative(); |
591 | 51.5k | } |
592 | 54.3k | break; |
593 | 28.7k | case Instruction::URem: { |
594 | 28.7k | KnownBits Known2(BitWidth); |
595 | 28.7k | APInt AllOnes = APInt::getAllOnesValue(BitWidth); |
596 | 28.7k | if (SimplifyDemandedBits(I, 0, AllOnes, Known2, Depth + 1) || |
597 | 28.7k | SimplifyDemandedBits(I, 1, AllOnes, Known2, Depth + 1)) |
598 | 0 | return I; |
599 | 28.7k | |
600 | 28.7k | unsigned Leaders = Known2.countMinLeadingZeros(); |
601 | 28.7k | Known.Zero = APInt::getHighBitsSet(BitWidth, Leaders) & DemandedMask; |
602 | 28.7k | break; |
603 | 28.7k | } |
604 | 2.65M | case Instruction::Call: |
605 | 2.65M | if (IntrinsicInst *II2.65M = dyn_cast<IntrinsicInst>(I)) { |
606 | 53.3k | switch (II->getIntrinsicID()) { |
607 | 51.9k | default: break; |
608 | 1.44k | case Intrinsic::bswap: { |
609 | 1.44k | // If the only bits demanded come from one byte of the bswap result, |
610 | 1.44k | // just shift the input byte into position to eliminate the bswap. |
611 | 1.44k | unsigned NLZ = DemandedMask.countLeadingZeros(); |
612 | 1.44k | unsigned NTZ = DemandedMask.countTrailingZeros(); |
613 | 1.44k | |
614 | 1.44k | // Round NTZ down to the next byte. If we have 11 trailing zeros, then |
615 | 1.44k | // we need all the bits down to bit 8. Likewise, round NLZ. If we |
616 | 1.44k | // have 14 leading zeros, round to 8. |
617 | 1.44k | NLZ &= ~7; |
618 | 1.44k | NTZ &= ~7; |
619 | 1.44k | // If we need exactly one byte, we can do this transformation. |
620 | 1.44k | if (BitWidth-NLZ-NTZ == 81.44k ) { |
621 | 10 | unsigned ResultBit = NTZ; |
622 | 10 | unsigned InputBit = BitWidth-NTZ-8; |
623 | 10 | |
624 | 10 | // Replace this with either a left or right shift to get the byte into |
625 | 10 | // the right place. |
626 | 10 | Instruction *NewVal; |
627 | 10 | if (InputBit > ResultBit) |
628 | 7 | NewVal = BinaryOperator::CreateLShr(II->getArgOperand(0), |
629 | 7 | ConstantInt::get(I->getType(), InputBit-ResultBit)); |
630 | 10 | else |
631 | 3 | NewVal = BinaryOperator::CreateShl(II->getArgOperand(0), |
632 | 3 | ConstantInt::get(I->getType(), ResultBit-InputBit)); |
633 | 10 | NewVal->takeName(I); |
634 | 10 | return InsertNewInstWith(NewVal, *I); |
635 | 10 | } |
636 | 1.43k | |
637 | 1.43k | // TODO: Could compute known zero/one bits based on the input. |
638 | 1.43k | break; |
639 | 1.43k | } |
640 | 14 | case Intrinsic::x86_mmx_pmovmskb: |
641 | 14 | case Intrinsic::x86_sse_movmsk_ps: |
642 | 14 | case Intrinsic::x86_sse2_movmsk_pd: |
643 | 14 | case Intrinsic::x86_sse2_pmovmskb_128: |
644 | 14 | case Intrinsic::x86_avx_movmsk_ps_256: |
645 | 14 | case Intrinsic::x86_avx_movmsk_pd_256: |
646 | 14 | case Intrinsic::x86_avx2_pmovmskb: { |
647 | 14 | // MOVMSK copies the vector elements' sign bits to the low bits |
648 | 14 | // and zeros the high bits. |
649 | 14 | unsigned ArgWidth; |
650 | 14 | if (II->getIntrinsicID() == Intrinsic::x86_mmx_pmovmskb14 ) { |
651 | 2 | ArgWidth = 8; // Arg is x86_mmx, but treated as <8 x i8>. |
652 | 14 | } else { |
653 | 12 | auto Arg = II->getArgOperand(0); |
654 | 12 | auto ArgType = cast<VectorType>(Arg->getType()); |
655 | 12 | ArgWidth = ArgType->getNumElements(); |
656 | 12 | } |
657 | 14 | |
658 | 14 | // If we don't need any of low bits then return zero, |
659 | 14 | // we know that DemandedMask is non-zero already. |
660 | 14 | APInt DemandedElts = DemandedMask.zextOrTrunc(ArgWidth); |
661 | 14 | if (DemandedElts.isNullValue()) |
662 | 6 | return ConstantInt::getNullValue(VTy); |
663 | 8 | |
664 | 8 | // We know that the upper bits are set to zero. |
665 | 8 | Known.Zero.setBitsFrom(ArgWidth); |
666 | 8 | return nullptr; |
667 | 8 | } |
668 | 1 | case Intrinsic::x86_sse42_crc32_64_64: |
669 | 1 | Known.Zero.setBitsFrom(32); |
670 | 1 | return nullptr; |
671 | 2.65M | } |
672 | 2.65M | } |
673 | 2.65M | computeKnownBits(V, Known, Depth, CxtI); |
674 | 2.65M | break; |
675 | 44.4M | } |
676 | 44.4M | |
677 | 44.4M | // If the client is only demanding bits that we know, return the known |
678 | 44.4M | // constant. |
679 | 44.4M | if (44.4M DemandedMask.isSubsetOf(Known.Zero|Known.One)44.4M ) |
680 | 317 | return Constant::getIntegerValue(VTy, Known.One); |
681 | 44.4M | return nullptr; |
682 | 44.4M | } |
683 | | |
684 | | /// Helper routine of SimplifyDemandedUseBits. It computes Known |
685 | | /// bits. It also tries to handle simplifications that can be done based on |
686 | | /// DemandedMask, but without modifying the Instruction. |
687 | | Value *InstCombiner::SimplifyMultipleUseDemandedBits(Instruction *I, |
688 | | const APInt &DemandedMask, |
689 | | KnownBits &Known, |
690 | | unsigned Depth, |
691 | 12.4M | Instruction *CxtI) { |
692 | 12.4M | unsigned BitWidth = DemandedMask.getBitWidth(); |
693 | 12.4M | Type *ITy = I->getType(); |
694 | 12.4M | |
695 | 12.4M | KnownBits LHSKnown(BitWidth); |
696 | 12.4M | KnownBits RHSKnown(BitWidth); |
697 | 12.4M | |
698 | 12.4M | // Despite the fact that we can't simplify this instruction in all User's |
699 | 12.4M | // context, we can at least compute the known bits, and we can |
700 | 12.4M | // do simplifications that apply to *just* the one user if we know that |
701 | 12.4M | // this instruction has a simpler value in that context. |
702 | 12.4M | switch (I->getOpcode()) { |
703 | 376k | case Instruction::And: { |
704 | 376k | // If either the LHS or the RHS are Zero, the result is zero. |
705 | 376k | computeKnownBits(I->getOperand(1), RHSKnown, Depth + 1, CxtI); |
706 | 376k | computeKnownBits(I->getOperand(0), LHSKnown, Depth + 1, |
707 | 376k | CxtI); |
708 | 376k | |
709 | 376k | // Output known-0 are known to be clear if zero in either the LHS | RHS. |
710 | 376k | APInt IKnownZero = RHSKnown.Zero | LHSKnown.Zero; |
711 | 376k | // Output known-1 bits are only known if set in both the LHS & RHS. |
712 | 376k | APInt IKnownOne = RHSKnown.One & LHSKnown.One; |
713 | 376k | |
714 | 376k | // If the client is only demanding bits that we know, return the known |
715 | 376k | // constant. |
716 | 376k | if (DemandedMask.isSubsetOf(IKnownZero|IKnownOne)) |
717 | 5 | return Constant::getIntegerValue(ITy, IKnownOne); |
718 | 376k | |
719 | 376k | // If all of the demanded bits are known 1 on one side, return the other. |
720 | 376k | // These bits cannot contribute to the result of the 'and' in this |
721 | 376k | // context. |
722 | 376k | if (376k DemandedMask.isSubsetOf(LHSKnown.Zero | RHSKnown.One)376k ) |
723 | 1.00k | return I->getOperand(0); |
724 | 375k | if (375k DemandedMask.isSubsetOf(RHSKnown.Zero | LHSKnown.One)375k ) |
725 | 0 | return I->getOperand(1); |
726 | 375k | |
727 | 375k | Known.Zero = std::move(IKnownZero); |
728 | 375k | Known.One = std::move(IKnownOne); |
729 | 375k | break; |
730 | 375k | } |
731 | 302k | case Instruction::Or: { |
732 | 302k | // We can simplify (X|Y) -> X or Y in the user's context if we know that |
733 | 302k | // only bits from X or Y are demanded. |
734 | 302k | |
735 | 302k | // If either the LHS or the RHS are One, the result is One. |
736 | 302k | computeKnownBits(I->getOperand(1), RHSKnown, Depth + 1, CxtI); |
737 | 302k | computeKnownBits(I->getOperand(0), LHSKnown, Depth + 1, |
738 | 302k | CxtI); |
739 | 302k | |
740 | 302k | // Output known-0 bits are only known if clear in both the LHS & RHS. |
741 | 302k | APInt IKnownZero = RHSKnown.Zero & LHSKnown.Zero; |
742 | 302k | // Output known-1 are known to be set if set in either the LHS | RHS. |
743 | 302k | APInt IKnownOne = RHSKnown.One | LHSKnown.One; |
744 | 302k | |
745 | 302k | // If the client is only demanding bits that we know, return the known |
746 | 302k | // constant. |
747 | 302k | if (DemandedMask.isSubsetOf(IKnownZero|IKnownOne)) |
748 | 5 | return Constant::getIntegerValue(ITy, IKnownOne); |
749 | 302k | |
750 | 302k | // If all of the demanded bits are known zero on one side, return the |
751 | 302k | // other. These bits cannot contribute to the result of the 'or' in this |
752 | 302k | // context. |
753 | 302k | if (302k DemandedMask.isSubsetOf(LHSKnown.One | RHSKnown.Zero)302k ) |
754 | 1.26k | return I->getOperand(0); |
755 | 301k | if (301k DemandedMask.isSubsetOf(RHSKnown.One | LHSKnown.Zero)301k ) |
756 | 677 | return I->getOperand(1); |
757 | 300k | |
758 | 300k | Known.Zero = std::move(IKnownZero); |
759 | 300k | Known.One = std::move(IKnownOne); |
760 | 300k | break; |
761 | 300k | } |
762 | 132k | case Instruction::Xor: { |
763 | 132k | // We can simplify (X^Y) -> X or Y in the user's context if we know that |
764 | 132k | // only bits from X or Y are demanded. |
765 | 132k | |
766 | 132k | computeKnownBits(I->getOperand(1), RHSKnown, Depth + 1, CxtI); |
767 | 132k | computeKnownBits(I->getOperand(0), LHSKnown, Depth + 1, |
768 | 132k | CxtI); |
769 | 132k | |
770 | 132k | // Output known-0 bits are known if clear or set in both the LHS & RHS. |
771 | 132k | APInt IKnownZero = (RHSKnown.Zero & LHSKnown.Zero) | |
772 | 132k | (RHSKnown.One & LHSKnown.One); |
773 | 132k | // Output known-1 are known to be set if set in only one of the LHS, RHS. |
774 | 132k | APInt IKnownOne = (RHSKnown.Zero & LHSKnown.One) | |
775 | 132k | (RHSKnown.One & LHSKnown.Zero); |
776 | 132k | |
777 | 132k | // If the client is only demanding bits that we know, return the known |
778 | 132k | // constant. |
779 | 132k | if (DemandedMask.isSubsetOf(IKnownZero|IKnownOne)) |
780 | 1 | return Constant::getIntegerValue(ITy, IKnownOne); |
781 | 132k | |
782 | 132k | // If all of the demanded bits are known zero on one side, return the |
783 | 132k | // other. |
784 | 132k | if (132k DemandedMask.isSubsetOf(RHSKnown.Zero)132k ) |
785 | 47 | return I->getOperand(0); |
786 | 132k | if (132k DemandedMask.isSubsetOf(LHSKnown.Zero)132k ) |
787 | 97 | return I->getOperand(1); |
788 | 132k | |
789 | 132k | // Output known-0 bits are known if clear or set in both the LHS & RHS. |
790 | 132k | Known.Zero = std::move(IKnownZero); |
791 | 132k | // Output known-1 are known to be set if set in only one of the LHS, RHS. |
792 | 132k | Known.One = std::move(IKnownOne); |
793 | 132k | break; |
794 | 132k | } |
795 | 11.6M | default: |
796 | 11.6M | // Compute the Known bits to simplify things downstream. |
797 | 11.6M | computeKnownBits(I, Known, Depth, CxtI); |
798 | 11.6M | |
799 | 11.6M | // If this user is only demanding bits that we know, return the known |
800 | 11.6M | // constant. |
801 | 11.6M | if (DemandedMask.isSubsetOf(Known.Zero|Known.One)) |
802 | 126 | return Constant::getIntegerValue(ITy, Known.One); |
803 | 11.6M | |
804 | 11.6M | break; |
805 | 12.4M | } |
806 | 12.4M | |
807 | 12.4M | return nullptr; |
808 | 12.4M | } |
809 | | |
810 | | |
811 | | /// Helper routine of SimplifyDemandedUseBits. It tries to simplify |
812 | | /// "E1 = (X lsr C1) << C2", where the C1 and C2 are constant, into |
813 | | /// "E2 = X << (C2 - C1)" or "E2 = X >> (C1 - C2)", depending on the sign |
814 | | /// of "C2-C1". |
815 | | /// |
816 | | /// Suppose E1 and E2 are generally different in bits S={bm, bm+1, |
817 | | /// ..., bn}, without considering the specific value X is holding. |
818 | | /// This transformation is legal iff one of following conditions is hold: |
819 | | /// 1) All the bit in S are 0, in this case E1 == E2. |
820 | | /// 2) We don't care those bits in S, per the input DemandedMask. |
821 | | /// 3) Combination of 1) and 2). Some bits in S are 0, and we don't care the |
822 | | /// rest bits. |
823 | | /// |
824 | | /// Currently we only test condition 2). |
825 | | /// |
826 | | /// As with SimplifyDemandedUseBits, it returns NULL if the simplification was |
827 | | /// not successful. |
828 | | Value * |
829 | | InstCombiner::simplifyShrShlDemandedBits(Instruction *Shr, const APInt &ShrOp1, |
830 | | Instruction *Shl, const APInt &ShlOp1, |
831 | | const APInt &DemandedMask, |
832 | 35.7k | KnownBits &Known) { |
833 | 35.7k | if (!ShlOp1 || 35.7k !ShrOp135.7k ) |
834 | 0 | return nullptr; // No-op. |
835 | 35.7k | |
836 | 35.7k | Value *VarX = Shr->getOperand(0); |
837 | 35.7k | Type *Ty = VarX->getType(); |
838 | 35.7k | unsigned BitWidth = Ty->getScalarSizeInBits(); |
839 | 35.7k | if (ShlOp1.uge(BitWidth) || 35.7k ShrOp1.uge(BitWidth)35.7k ) |
840 | 0 | return nullptr; // Undef. |
841 | 35.7k | |
842 | 35.7k | unsigned ShlAmt = ShlOp1.getZExtValue(); |
843 | 35.7k | unsigned ShrAmt = ShrOp1.getZExtValue(); |
844 | 35.7k | |
845 | 35.7k | Known.One.clearAllBits(); |
846 | 35.7k | Known.Zero.setLowBits(ShlAmt - 1); |
847 | 35.7k | Known.Zero &= DemandedMask; |
848 | 35.7k | |
849 | 35.7k | APInt BitMask1(APInt::getAllOnesValue(BitWidth)); |
850 | 35.7k | APInt BitMask2(APInt::getAllOnesValue(BitWidth)); |
851 | 35.7k | |
852 | 35.7k | bool isLshr = (Shr->getOpcode() == Instruction::LShr); |
853 | 32.5k | BitMask1 = isLshr ? (BitMask1.lshr(ShrAmt) << ShlAmt) : |
854 | 3.13k | (BitMask1.ashr(ShrAmt) << ShlAmt); |
855 | 35.7k | |
856 | 35.7k | if (ShrAmt <= ShlAmt35.7k ) { |
857 | 24.7k | BitMask2 <<= (ShlAmt - ShrAmt); |
858 | 35.7k | } else { |
859 | 8.60k | BitMask2 = isLshr ? BitMask2.lshr(ShrAmt - ShlAmt): |
860 | 2.35k | BitMask2.ashr(ShrAmt - ShlAmt); |
861 | 10.9k | } |
862 | 35.7k | |
863 | 35.7k | // Check if condition-2 (see the comment to this function) is satified. |
864 | 35.7k | if ((BitMask1 & DemandedMask) == (BitMask2 & DemandedMask)35.7k ) { |
865 | 883 | if (ShrAmt == ShlAmt) |
866 | 114 | return VarX; |
867 | 769 | |
868 | 769 | if (769 !Shr->hasOneUse()769 ) |
869 | 469 | return nullptr; |
870 | 300 | |
871 | 300 | BinaryOperator *New; |
872 | 300 | if (ShrAmt < ShlAmt300 ) { |
873 | 232 | Constant *Amt = ConstantInt::get(VarX->getType(), ShlAmt - ShrAmt); |
874 | 232 | New = BinaryOperator::CreateShl(VarX, Amt); |
875 | 232 | BinaryOperator *Orig = cast<BinaryOperator>(Shl); |
876 | 232 | New->setHasNoSignedWrap(Orig->hasNoSignedWrap()); |
877 | 232 | New->setHasNoUnsignedWrap(Orig->hasNoUnsignedWrap()); |
878 | 300 | } else { |
879 | 68 | Constant *Amt = ConstantInt::get(VarX->getType(), ShrAmt - ShlAmt); |
880 | 66 | New = isLshr ? BinaryOperator::CreateLShr(VarX, Amt) : |
881 | 2 | BinaryOperator::CreateAShr(VarX, Amt); |
882 | 68 | if (cast<BinaryOperator>(Shr)->isExact()) |
883 | 2 | New->setIsExact(true); |
884 | 68 | } |
885 | 883 | |
886 | 883 | return InsertNewInstWith(New, *Shl); |
887 | 883 | } |
888 | 34.8k | |
889 | 34.8k | return nullptr; |
890 | 34.8k | } |
891 | | |
892 | | /// The specified value produces a vector with any number of elements. |
893 | | /// DemandedElts contains the set of elements that are actually used by the |
894 | | /// caller. This method analyzes which elements of the operand are undef and |
895 | | /// returns that information in UndefElts. |
896 | | /// |
897 | | /// If the information about demanded elements can be used to simplify the |
898 | | /// operation, the operation is simplified, then the resultant value is |
899 | | /// returned. This returns null if no change was made. |
900 | | Value *InstCombiner::SimplifyDemandedVectorElts(Value *V, APInt DemandedElts, |
901 | | APInt &UndefElts, |
902 | 1.41M | unsigned Depth) { |
903 | 1.41M | unsigned VWidth = V->getType()->getVectorNumElements(); |
904 | 1.41M | APInt EltMask(APInt::getAllOnesValue(VWidth)); |
905 | 1.41M | assert((DemandedElts & ~EltMask) == 0 && "Invalid DemandedElts!"); |
906 | 1.41M | |
907 | 1.41M | if (isa<UndefValue>(V)1.41M ) { |
908 | 405k | // If the entire vector is undefined, just return this info. |
909 | 405k | UndefElts = EltMask; |
910 | 405k | return nullptr; |
911 | 405k | } |
912 | 1.00M | |
913 | 1.00M | if (1.00M DemandedElts.isNullValue()1.00M ) { // If nothing is demanded, provide undef. |
914 | 3.88k | UndefElts = EltMask; |
915 | 3.88k | return UndefValue::get(V->getType()); |
916 | 3.88k | } |
917 | 1.00M | |
918 | 1.00M | UndefElts = 0; |
919 | 1.00M | |
920 | 1.00M | // Handle ConstantAggregateZero, ConstantVector, ConstantDataSequential. |
921 | 1.00M | if (Constant *C1.00M = dyn_cast<Constant>(V)) { |
922 | 14.4k | // Check if this is identity. If so, return 0 since we are not simplifying |
923 | 14.4k | // anything. |
924 | 14.4k | if (DemandedElts.isAllOnesValue()) |
925 | 8.76k | return nullptr; |
926 | 5.68k | |
927 | 5.68k | Type *EltTy = cast<VectorType>(V->getType())->getElementType(); |
928 | 5.68k | Constant *Undef = UndefValue::get(EltTy); |
929 | 5.68k | |
930 | 5.68k | SmallVector<Constant*, 16> Elts; |
931 | 29.9k | for (unsigned i = 0; i != VWidth29.9k ; ++i24.2k ) { |
932 | 24.2k | if (!DemandedElts[i]24.2k ) { // If not demanded, set to undef. |
933 | 10.0k | Elts.push_back(Undef); |
934 | 10.0k | UndefElts.setBit(i); |
935 | 10.0k | continue; |
936 | 10.0k | } |
937 | 14.2k | |
938 | 14.2k | Constant *Elt = C->getAggregateElement(i); |
939 | 14.2k | if (!Elt14.2k ) return nullptr0 ; |
940 | 14.2k | |
941 | 14.2k | if (14.2k isa<UndefValue>(Elt)14.2k ) { // Already undef. |
942 | 814 | Elts.push_back(Undef); |
943 | 814 | UndefElts.setBit(i); |
944 | 14.2k | } else { // Otherwise, defined. |
945 | 13.4k | Elts.push_back(Elt); |
946 | 13.4k | } |
947 | 24.2k | } |
948 | 5.68k | |
949 | 5.68k | // If we changed the constant, return it. |
950 | 5.68k | Constant *NewCV = ConstantVector::get(Elts); |
951 | 5.68k | return NewCV != C ? NewCV662 : nullptr5.02k ; |
952 | 991k | } |
953 | 991k | |
954 | 991k | // Limit search depth. |
955 | 991k | if (991k Depth == 10991k ) |
956 | 332 | return nullptr; |
957 | 991k | |
958 | 991k | // If multiple users are using the root value, proceed with |
959 | 991k | // simplification conservatively assuming that all elements |
960 | 991k | // are needed. |
961 | 991k | if (991k !V->hasOneUse()991k ) { |
962 | 272k | // Quit if we find multiple users of a non-root value though. |
963 | 272k | // They'll be handled when it's their turn to be visited by |
964 | 272k | // the main instcombine process. |
965 | 272k | if (Depth != 0) |
966 | 272k | // TODO: Just compute the UndefElts information recursively. |
967 | 249k | return nullptr; |
968 | 22.8k | |
969 | 22.8k | // Conservatively assume that all elements are needed. |
970 | 22.8k | DemandedElts = EltMask; |
971 | 22.8k | } |
972 | 991k | |
973 | 741k | Instruction *I = dyn_cast<Instruction>(V); |
974 | 741k | if (!I741k ) return nullptr9.82k ; // Only analyze instructions. |
975 | 732k | |
976 | 732k | bool MadeChange = false; |
977 | 732k | APInt UndefElts2(VWidth, 0); |
978 | 732k | APInt UndefElts3(VWidth, 0); |
979 | 732k | Value *TmpV; |
980 | 732k | switch (I->getOpcode()) { |
981 | 83.5k | default: break; |
982 | 732k | |
983 | 277k | case Instruction::InsertElement: { |
984 | 277k | // If this is a variable index, we don't know which element it overwrites. |
985 | 277k | // demand exactly the same input as we produce. |
986 | 277k | ConstantInt *Idx = dyn_cast<ConstantInt>(I->getOperand(2)); |
987 | 277k | if (!Idx277k ) { |
988 | 444 | // Note that we can't propagate undef elt info, because we don't know |
989 | 444 | // which elt is getting updated. |
990 | 444 | TmpV = SimplifyDemandedVectorElts(I->getOperand(0), DemandedElts, |
991 | 444 | UndefElts2, Depth + 1); |
992 | 444 | if (TmpV444 ) { I->setOperand(0, TmpV); MadeChange = true; }0 |
993 | 444 | break; |
994 | 444 | } |
995 | 277k | |
996 | 277k | // The element inserted overwrites whatever was there, so the input demanded |
997 | 277k | // set is simpler than the output set. |
998 | 277k | unsigned IdxNo = Idx->getZExtValue(); |
999 | 277k | APInt PreInsertDemandedElts = DemandedElts; |
1000 | 277k | if (IdxNo < VWidth) |
1001 | 277k | PreInsertDemandedElts.clearBit(IdxNo); |
1002 | 277k | TmpV = SimplifyDemandedVectorElts(I->getOperand(0), PreInsertDemandedElts, |
1003 | 277k | UndefElts, Depth + 1); |
1004 | 277k | if (TmpV277k ) { I->setOperand(0, TmpV); MadeChange = true; }8.85k |
1005 | 277k | |
1006 | 277k | // If this is inserting an element that isn't demanded, remove this |
1007 | 277k | // insertelement. |
1008 | 277k | if (IdxNo >= VWidth || 277k !DemandedElts[IdxNo]277k ) { |
1009 | 142 | Worklist.Add(I); |
1010 | 142 | return I->getOperand(0); |
1011 | 142 | } |
1012 | 276k | |
1013 | 276k | // The inserted element is defined. |
1014 | 276k | UndefElts.clearBit(IdxNo); |
1015 | 276k | break; |
1016 | 276k | } |
1017 | 328k | case Instruction::ShuffleVector: { |
1018 | 328k | ShuffleVectorInst *Shuffle = cast<ShuffleVectorInst>(I); |
1019 | 328k | unsigned LHSVWidth = |
1020 | 328k | Shuffle->getOperand(0)->getType()->getVectorNumElements(); |
1021 | 328k | APInt LeftDemanded(LHSVWidth, 0), RightDemanded(LHSVWidth, 0); |
1022 | 2.02M | for (unsigned i = 0; i < VWidth2.02M ; i++1.69M ) { |
1023 | 1.69M | if (DemandedElts[i]1.69M ) { |
1024 | 1.62M | unsigned MaskVal = Shuffle->getMaskValue(i); |
1025 | 1.62M | if (MaskVal != -1u1.62M ) { |
1026 | 1.55M | assert(MaskVal < LHSVWidth * 2 && |
1027 | 1.55M | "shufflevector mask index out of range!"); |
1028 | 1.55M | if (MaskVal < LHSVWidth) |
1029 | 1.22M | LeftDemanded.setBit(MaskVal); |
1030 | 1.55M | else |
1031 | 325k | RightDemanded.setBit(MaskVal - LHSVWidth); |
1032 | 1.55M | } |
1033 | 1.62M | } |
1034 | 1.69M | } |
1035 | 328k | |
1036 | 328k | APInt LHSUndefElts(LHSVWidth, 0); |
1037 | 328k | TmpV = SimplifyDemandedVectorElts(I->getOperand(0), LeftDemanded, |
1038 | 328k | LHSUndefElts, Depth + 1); |
1039 | 328k | if (TmpV328k ) { I->setOperand(0, TmpV); MadeChange = true; }1.36k |
1040 | 328k | |
1041 | 328k | APInt RHSUndefElts(LHSVWidth, 0); |
1042 | 328k | TmpV = SimplifyDemandedVectorElts(I->getOperand(1), RightDemanded, |
1043 | 328k | RHSUndefElts, Depth + 1); |
1044 | 328k | if (TmpV328k ) { I->setOperand(1, TmpV); MadeChange = true; }1.32k |
1045 | 328k | |
1046 | 328k | bool NewUndefElts = false; |
1047 | 328k | unsigned LHSIdx = -1u, LHSValIdx = -1u; |
1048 | 328k | unsigned RHSIdx = -1u, RHSValIdx = -1u; |
1049 | 328k | bool LHSUniform = true; |
1050 | 328k | bool RHSUniform = true; |
1051 | 2.02M | for (unsigned i = 0; i < VWidth2.02M ; i++1.69M ) { |
1052 | 1.69M | unsigned MaskVal = Shuffle->getMaskValue(i); |
1053 | 1.69M | if (MaskVal == -1u1.69M ) { |
1054 | 133k | UndefElts.setBit(i); |
1055 | 1.69M | } else if (1.56M !DemandedElts[i]1.56M ) { |
1056 | 11.2k | NewUndefElts = true; |
1057 | 11.2k | UndefElts.setBit(i); |
1058 | 1.56M | } else if (1.55M MaskVal < LHSVWidth1.55M ) { |
1059 | 1.22M | if (LHSUndefElts[MaskVal]1.22M ) { |
1060 | 253 | NewUndefElts = true; |
1061 | 253 | UndefElts.setBit(i); |
1062 | 1.22M | } else { |
1063 | 1.22M | LHSIdx = LHSIdx == -1u ? i326k : LHSVWidth899k ; |
1064 | 1.22M | LHSValIdx = LHSValIdx == -1u ? MaskVal326k : LHSVWidth899k ; |
1065 | 813k | LHSUniform = LHSUniform && (MaskVal == i); |
1066 | 1.22M | } |
1067 | 1.55M | } else { |
1068 | 325k | if (RHSUndefElts[MaskVal - LHSVWidth]325k ) { |
1069 | 7 | NewUndefElts = true; |
1070 | 7 | UndefElts.setBit(i); |
1071 | 325k | } else { |
1072 | 325k | RHSIdx = RHSIdx == -1u ? i55.0k : LHSVWidth270k ; |
1073 | 325k | RHSValIdx = RHSValIdx == -1u ? MaskVal - LHSVWidth55.0k : LHSVWidth270k ; |
1074 | 58.7k | RHSUniform = RHSUniform && (MaskVal - LHSVWidth == i); |
1075 | 325k | } |
1076 | 1.56M | } |
1077 | 1.69M | } |
1078 | 328k | |
1079 | 328k | // Try to transform shuffle with constant vector and single element from |
1080 | 328k | // this constant vector to single insertelement instruction. |
1081 | 328k | // shufflevector V, C, <v1, v2, .., ci, .., vm> -> |
1082 | 328k | // insertelement V, C[ci], ci-n |
1083 | 328k | if (LHSVWidth == Shuffle->getType()->getNumElements()328k ) { |
1084 | 47.5k | Value *Op = nullptr; |
1085 | 47.5k | Constant *Value = nullptr; |
1086 | 47.5k | unsigned Idx = -1u; |
1087 | 47.5k | |
1088 | 47.5k | // Find constant vector with the single element in shuffle (LHS or RHS). |
1089 | 47.5k | if (LHSIdx < LHSVWidth && 47.5k RHSUniform4.75k ) { |
1090 | 3.94k | if (auto *CV3.94k = dyn_cast<ConstantVector>(Shuffle->getOperand(0))) { |
1091 | 1 | Op = Shuffle->getOperand(1); |
1092 | 1 | Value = CV->getOperand(LHSValIdx); |
1093 | 1 | Idx = LHSIdx; |
1094 | 1 | } |
1095 | 3.94k | } |
1096 | 47.5k | if (RHSIdx < LHSVWidth && 47.5k LHSUniform1.56k ) { |
1097 | 961 | if (auto *CV961 = dyn_cast<ConstantVector>(Shuffle->getOperand(1))) { |
1098 | 14 | Op = Shuffle->getOperand(0); |
1099 | 14 | Value = CV->getOperand(RHSValIdx); |
1100 | 14 | Idx = RHSIdx; |
1101 | 14 | } |
1102 | 961 | } |
1103 | 47.5k | // Found constant vector with single element - convert to insertelement. |
1104 | 47.5k | if (Op && 47.5k Value15 ) { |
1105 | 15 | Instruction *New = InsertElementInst::Create( |
1106 | 15 | Op, Value, ConstantInt::get(Type::getInt32Ty(I->getContext()), Idx), |
1107 | 15 | Shuffle->getName()); |
1108 | 15 | InsertNewInstWith(New, *Shuffle); |
1109 | 15 | return New; |
1110 | 15 | } |
1111 | 328k | } |
1112 | 328k | if (328k NewUndefElts328k ) { |
1113 | 1.14k | // Add additional discovered undefs. |
1114 | 1.14k | SmallVector<Constant*, 16> Elts; |
1115 | 18.1k | for (unsigned i = 0; i < VWidth18.1k ; ++i17.0k ) { |
1116 | 17.0k | if (UndefElts[i]) |
1117 | 11.6k | Elts.push_back(UndefValue::get(Type::getInt32Ty(I->getContext()))); |
1118 | 17.0k | else |
1119 | 5.41k | Elts.push_back(ConstantInt::get(Type::getInt32Ty(I->getContext()), |
1120 | 5.41k | Shuffle->getMaskValue(i))); |
1121 | 17.0k | } |
1122 | 1.14k | I->setOperand(2, ConstantVector::get(Elts)); |
1123 | 1.14k | MadeChange = true; |
1124 | 1.14k | } |
1125 | 328k | break; |
1126 | 328k | } |
1127 | 7.37k | case Instruction::Select: { |
1128 | 7.37k | APInt LeftDemanded(DemandedElts), RightDemanded(DemandedElts); |
1129 | 7.37k | if (ConstantVector* CV7.37k = dyn_cast<ConstantVector>(I->getOperand(0))) { |
1130 | 10 | for (unsigned i = 0; i < VWidth10 ; i++8 ) { |
1131 | 8 | Constant *CElt = CV->getAggregateElement(i); |
1132 | 8 | // Method isNullValue always returns false when called on a |
1133 | 8 | // ConstantExpr. If CElt is a ConstantExpr then skip it in order to |
1134 | 8 | // to avoid propagating incorrect information. |
1135 | 8 | if (isa<ConstantExpr>(CElt)) |
1136 | 1 | continue; |
1137 | 7 | if (7 CElt->isNullValue()7 ) |
1138 | 2 | LeftDemanded.clearBit(i); |
1139 | 7 | else |
1140 | 5 | RightDemanded.clearBit(i); |
1141 | 8 | } |
1142 | 2 | } |
1143 | 7.37k | |
1144 | 7.37k | TmpV = SimplifyDemandedVectorElts(I->getOperand(1), LeftDemanded, UndefElts, |
1145 | 7.37k | Depth + 1); |
1146 | 7.37k | if (TmpV7.37k ) { I->setOperand(1, TmpV); MadeChange = true; }1 |
1147 | 7.37k | |
1148 | 7.37k | TmpV = SimplifyDemandedVectorElts(I->getOperand(2), RightDemanded, |
1149 | 7.37k | UndefElts2, Depth + 1); |
1150 | 7.37k | if (TmpV7.37k ) { I->setOperand(2, TmpV); MadeChange = true; }0 |
1151 | 7.37k | |
1152 | 7.37k | // Output elements are undefined if both are undefined. |
1153 | 7.37k | UndefElts &= UndefElts2; |
1154 | 7.37k | break; |
1155 | 328k | } |
1156 | 1.90k | case Instruction::BitCast: { |
1157 | 1.90k | // Vector->vector casts only. |
1158 | 1.90k | VectorType *VTy = dyn_cast<VectorType>(I->getOperand(0)->getType()); |
1159 | 1.90k | if (!VTy1.90k ) break745 ; |
1160 | 1.15k | unsigned InVWidth = VTy->getNumElements(); |
1161 | 1.15k | APInt InputDemandedElts(InVWidth, 0); |
1162 | 1.15k | UndefElts2 = APInt(InVWidth, 0); |
1163 | 1.15k | unsigned Ratio; |
1164 | 1.15k | |
1165 | 1.15k | if (VWidth == InVWidth1.15k ) { |
1166 | 70 | // If we are converting from <4 x i32> -> <4 x f32>, we demand the same |
1167 | 70 | // elements as are demanded of us. |
1168 | 70 | Ratio = 1; |
1169 | 70 | InputDemandedElts = DemandedElts; |
1170 | 1.15k | } else if (1.08k (VWidth % InVWidth) == 01.08k ) { |
1171 | 610 | // If the number of elements in the output is a multiple of the number of |
1172 | 610 | // elements in the input then an input element is live if any of the |
1173 | 610 | // corresponding output elements are live. |
1174 | 610 | Ratio = VWidth / InVWidth; |
1175 | 6.38k | for (unsigned OutIdx = 0; OutIdx != VWidth6.38k ; ++OutIdx5.77k ) |
1176 | 5.77k | if (5.77k DemandedElts[OutIdx]5.77k ) |
1177 | 3.62k | InputDemandedElts.setBit(OutIdx / Ratio); |
1178 | 1.08k | } else if (478 (InVWidth % VWidth) == 0478 ) { |
1179 | 478 | // If the number of elements in the input is a multiple of the number of |
1180 | 478 | // elements in the output then an input element is live if the |
1181 | 478 | // corresponding output element is live. |
1182 | 478 | Ratio = InVWidth / VWidth; |
1183 | 6.65k | for (unsigned InIdx = 0; InIdx != InVWidth6.65k ; ++InIdx6.17k ) |
1184 | 6.17k | if (6.17k DemandedElts[InIdx / Ratio]6.17k ) |
1185 | 3.04k | InputDemandedElts.setBit(InIdx); |
1186 | 0 | } else { |
1187 | 0 | // Unsupported so far. |
1188 | 0 | break; |
1189 | 0 | } |
1190 | 1.15k | |
1191 | 1.15k | // div/rem demand all inputs, because they don't want divide by zero. |
1192 | 1.15k | TmpV = SimplifyDemandedVectorElts(I->getOperand(0), InputDemandedElts, |
1193 | 1.15k | UndefElts2, Depth + 1); |
1194 | 1.15k | if (TmpV1.15k ) { |
1195 | 11 | I->setOperand(0, TmpV); |
1196 | 11 | MadeChange = true; |
1197 | 11 | } |
1198 | 1.15k | |
1199 | 1.15k | if (VWidth == InVWidth1.15k ) { |
1200 | 70 | UndefElts = UndefElts2; |
1201 | 1.15k | } else if (1.08k (VWidth % InVWidth) == 01.08k ) { |
1202 | 610 | // If the number of elements in the output is a multiple of the number of |
1203 | 610 | // elements in the input then an output element is undef if the |
1204 | 610 | // corresponding input element is undef. |
1205 | 6.38k | for (unsigned OutIdx = 0; OutIdx != VWidth6.38k ; ++OutIdx5.77k ) |
1206 | 5.77k | if (5.77k UndefElts2[OutIdx / Ratio]5.77k ) |
1207 | 28 | UndefElts.setBit(OutIdx); |
1208 | 1.08k | } else if (478 (InVWidth % VWidth) == 0478 ) { |
1209 | 478 | // If the number of elements in the input is a multiple of the number of |
1210 | 478 | // elements in the output then an output element is undef if all of the |
1211 | 478 | // corresponding input elements are undef. |
1212 | 1.54k | for (unsigned OutIdx = 0; OutIdx != VWidth1.54k ; ++OutIdx1.06k ) { |
1213 | 1.06k | APInt SubUndef = UndefElts2.lshr(OutIdx * Ratio).zextOrTrunc(Ratio); |
1214 | 1.06k | if (SubUndef.countPopulation() == Ratio) |
1215 | 29 | UndefElts.setBit(OutIdx); |
1216 | 1.06k | } |
1217 | 0 | } else { |
1218 | 0 | llvm_unreachable("Unimp"); |
1219 | 1.08k | } |
1220 | 1.15k | break; |
1221 | 1.15k | } |
1222 | 3.43k | case Instruction::And: |
1223 | 3.43k | case Instruction::Or: |
1224 | 3.43k | case Instruction::Xor: |
1225 | 3.43k | case Instruction::Add: |
1226 | 3.43k | case Instruction::Sub: |
1227 | 3.43k | case Instruction::Mul: |
1228 | 3.43k | // div/rem demand all inputs, because they don't want divide by zero. |
1229 | 3.43k | TmpV = SimplifyDemandedVectorElts(I->getOperand(0), DemandedElts, UndefElts, |
1230 | 3.43k | Depth + 1); |
1231 | 3.43k | if (TmpV3.43k ) { I->setOperand(0, TmpV); MadeChange = true; }5 |
1232 | 3.43k | TmpV = SimplifyDemandedVectorElts(I->getOperand(1), DemandedElts, |
1233 | 3.43k | UndefElts2, Depth + 1); |
1234 | 3.43k | if (TmpV3.43k ) { I->setOperand(1, TmpV); MadeChange = true; }8 |
1235 | 3.43k | |
1236 | 3.43k | // Output elements are undefined if both are undefined. Consider things |
1237 | 3.43k | // like undef&0. The result is known zero, not undef. |
1238 | 3.43k | UndefElts &= UndefElts2; |
1239 | 3.43k | break; |
1240 | 130 | case Instruction::FPTrunc: |
1241 | 130 | case Instruction::FPExt: |
1242 | 130 | TmpV = SimplifyDemandedVectorElts(I->getOperand(0), DemandedElts, UndefElts, |
1243 | 130 | Depth + 1); |
1244 | 130 | if (TmpV130 ) { I->setOperand(0, TmpV); MadeChange = true; }2 |
1245 | 130 | break; |
1246 | 130 | |
1247 | 29.9k | case Instruction::Call: { |
1248 | 29.9k | IntrinsicInst *II = dyn_cast<IntrinsicInst>(I); |
1249 | 29.9k | if (!II29.9k ) break12.9k ; |
1250 | 16.9k | switch (II->getIntrinsicID()) { |
1251 | 16.0k | default: break; |
1252 | 16.9k | |
1253 | 23 | case Intrinsic::x86_xop_vfrcz_ss: |
1254 | 23 | case Intrinsic::x86_xop_vfrcz_sd: |
1255 | 23 | // The instructions for these intrinsics are speced to zero upper bits not |
1256 | 23 | // pass them through like other scalar intrinsics. So we shouldn't just |
1257 | 23 | // use Arg0 if DemandedElts[0] is clear like we do for other intrinsics. |
1258 | 23 | // Instead we should return a zero vector. |
1259 | 23 | if (!DemandedElts[0]23 ) { |
1260 | 2 | Worklist.Add(II); |
1261 | 2 | return ConstantAggregateZero::get(II->getType()); |
1262 | 2 | } |
1263 | 21 | |
1264 | 21 | // Only the lower element is used. |
1265 | 21 | DemandedElts = 1; |
1266 | 21 | TmpV = SimplifyDemandedVectorElts(II->getArgOperand(0), DemandedElts, |
1267 | 21 | UndefElts, Depth + 1); |
1268 | 21 | if (TmpV21 ) { II->setArgOperand(0, TmpV); MadeChange = true; }6 |
1269 | 21 | |
1270 | 21 | // Only the lower element is undefined. The high elements are zero. |
1271 | 21 | UndefElts = UndefElts[0]; |
1272 | 21 | break; |
1273 | 21 | |
1274 | 21 | // Unary scalar-as-vector operations that work column-wise. |
1275 | 16 | case Intrinsic::x86_sse_rcp_ss: |
1276 | 16 | case Intrinsic::x86_sse_rsqrt_ss: |
1277 | 16 | case Intrinsic::x86_sse_sqrt_ss: |
1278 | 16 | case Intrinsic::x86_sse2_sqrt_sd: |
1279 | 16 | TmpV = SimplifyDemandedVectorElts(II->getArgOperand(0), DemandedElts, |
1280 | 16 | UndefElts, Depth + 1); |
1281 | 16 | if (TmpV16 ) { II->setArgOperand(0, TmpV); MadeChange = true; }8 |
1282 | 16 | |
1283 | 16 | // If lowest element of a scalar op isn't used then use Arg0. |
1284 | 16 | if (!DemandedElts[0]16 ) { |
1285 | 4 | Worklist.Add(II); |
1286 | 4 | return II->getArgOperand(0); |
1287 | 4 | } |
1288 | 12 | // TODO: If only low elt lower SQRT to FSQRT (with rounding/exceptions |
1289 | 12 | // checks). |
1290 | 12 | break; |
1291 | 12 | |
1292 | 12 | // Binary scalar-as-vector operations that work column-wise. The high |
1293 | 12 | // elements come from operand 0. The low element is a function of both |
1294 | 12 | // operands. |
1295 | 311 | case Intrinsic::x86_sse_min_ss: |
1296 | 311 | case Intrinsic::x86_sse_max_ss: |
1297 | 311 | case Intrinsic::x86_sse_cmp_ss: |
1298 | 311 | case Intrinsic::x86_sse2_min_sd: |
1299 | 311 | case Intrinsic::x86_sse2_max_sd: |
1300 | 311 | case Intrinsic::x86_sse2_cmp_sd: { |
1301 | 311 | TmpV = SimplifyDemandedVectorElts(II->getArgOperand(0), DemandedElts, |
1302 | 311 | UndefElts, Depth + 1); |
1303 | 311 | if (TmpV311 ) { II->setArgOperand(0, TmpV); MadeChange = true; }14 |
1304 | 311 | |
1305 | 311 | // If lowest element of a scalar op isn't used then use Arg0. |
1306 | 311 | if (!DemandedElts[0]311 ) { |
1307 | 6 | Worklist.Add(II); |
1308 | 6 | return II->getArgOperand(0); |
1309 | 6 | } |
1310 | 305 | |
1311 | 305 | // Only lower element is used for operand 1. |
1312 | 305 | DemandedElts = 1; |
1313 | 305 | TmpV = SimplifyDemandedVectorElts(II->getArgOperand(1), DemandedElts, |
1314 | 305 | UndefElts2, Depth + 1); |
1315 | 305 | if (TmpV305 ) { II->setArgOperand(1, TmpV); MadeChange = true; }17 |
1316 | 305 | |
1317 | 305 | // Lower element is undefined if both lower elements are undefined. |
1318 | 305 | // Consider things like undef&0. The result is known zero, not undef. |
1319 | 305 | if (!UndefElts2[0]) |
1320 | 305 | UndefElts.clearBit(0); |
1321 | 305 | |
1322 | 305 | break; |
1323 | 305 | } |
1324 | 305 | |
1325 | 305 | // Binary scalar-as-vector operations that work column-wise. The high |
1326 | 305 | // elements come from operand 0 and the low element comes from operand 1. |
1327 | 25 | case Intrinsic::x86_sse41_round_ss: |
1328 | 25 | case Intrinsic::x86_sse41_round_sd: { |
1329 | 25 | // Don't use the low element of operand 0. |
1330 | 25 | APInt DemandedElts2 = DemandedElts; |
1331 | 25 | DemandedElts2.clearBit(0); |
1332 | 25 | TmpV = SimplifyDemandedVectorElts(II->getArgOperand(0), DemandedElts2, |
1333 | 25 | UndefElts, Depth + 1); |
1334 | 25 | if (TmpV25 ) { II->setArgOperand(0, TmpV); MadeChange = true; }9 |
1335 | 25 | |
1336 | 25 | // If lowest element of a scalar op isn't used then use Arg0. |
1337 | 25 | if (!DemandedElts[0]25 ) { |
1338 | 2 | Worklist.Add(II); |
1339 | 2 | return II->getArgOperand(0); |
1340 | 2 | } |
1341 | 23 | |
1342 | 23 | // Only lower element is used for operand 1. |
1343 | 23 | DemandedElts = 1; |
1344 | 23 | TmpV = SimplifyDemandedVectorElts(II->getArgOperand(1), DemandedElts, |
1345 | 23 | UndefElts2, Depth + 1); |
1346 | 23 | if (TmpV23 ) { II->setArgOperand(1, TmpV); MadeChange = true; }6 |
1347 | 23 | |
1348 | 23 | // Take the high undef elements from operand 0 and take the lower element |
1349 | 23 | // from operand 1. |
1350 | 23 | UndefElts.clearBit(0); |
1351 | 23 | UndefElts |= UndefElts2[0]; |
1352 | 23 | break; |
1353 | 23 | } |
1354 | 23 | |
1355 | 23 | // Three input scalar-as-vector operations that work column-wise. The high |
1356 | 23 | // elements come from operand 0 and the low element is a function of all |
1357 | 23 | // three inputs. |
1358 | 258 | case Intrinsic::x86_avx512_mask_add_ss_round: |
1359 | 258 | case Intrinsic::x86_avx512_mask_div_ss_round: |
1360 | 258 | case Intrinsic::x86_avx512_mask_mul_ss_round: |
1361 | 258 | case Intrinsic::x86_avx512_mask_sub_ss_round: |
1362 | 258 | case Intrinsic::x86_avx512_mask_max_ss_round: |
1363 | 258 | case Intrinsic::x86_avx512_mask_min_ss_round: |
1364 | 258 | case Intrinsic::x86_avx512_mask_add_sd_round: |
1365 | 258 | case Intrinsic::x86_avx512_mask_div_sd_round: |
1366 | 258 | case Intrinsic::x86_avx512_mask_mul_sd_round: |
1367 | 258 | case Intrinsic::x86_avx512_mask_sub_sd_round: |
1368 | 258 | case Intrinsic::x86_avx512_mask_max_sd_round: |
1369 | 258 | case Intrinsic::x86_avx512_mask_min_sd_round: |
1370 | 258 | case Intrinsic::x86_fma_vfmadd_ss: |
1371 | 258 | case Intrinsic::x86_fma_vfmsub_ss: |
1372 | 258 | case Intrinsic::x86_fma_vfnmadd_ss: |
1373 | 258 | case Intrinsic::x86_fma_vfnmsub_ss: |
1374 | 258 | case Intrinsic::x86_fma_vfmadd_sd: |
1375 | 258 | case Intrinsic::x86_fma_vfmsub_sd: |
1376 | 258 | case Intrinsic::x86_fma_vfnmadd_sd: |
1377 | 258 | case Intrinsic::x86_fma_vfnmsub_sd: |
1378 | 258 | case Intrinsic::x86_avx512_mask_vfmadd_ss: |
1379 | 258 | case Intrinsic::x86_avx512_mask_vfmadd_sd: |
1380 | 258 | case Intrinsic::x86_avx512_maskz_vfmadd_ss: |
1381 | 258 | case Intrinsic::x86_avx512_maskz_vfmadd_sd: |
1382 | 258 | TmpV = SimplifyDemandedVectorElts(II->getArgOperand(0), DemandedElts, |
1383 | 258 | UndefElts, Depth + 1); |
1384 | 258 | if (TmpV258 ) { II->setArgOperand(0, TmpV); MadeChange = true; }36 |
1385 | 258 | |
1386 | 258 | // If lowest element of a scalar op isn't used then use Arg0. |
1387 | 258 | if (!DemandedElts[0]258 ) { |
1388 | 24 | Worklist.Add(II); |
1389 | 24 | return II->getArgOperand(0); |
1390 | 24 | } |
1391 | 234 | |
1392 | 234 | // Only lower element is used for operand 1 and 2. |
1393 | 234 | DemandedElts = 1; |
1394 | 234 | TmpV = SimplifyDemandedVectorElts(II->getArgOperand(1), DemandedElts, |
1395 | 234 | UndefElts2, Depth + 1); |
1396 | 234 | if (TmpV234 ) { II->setArgOperand(1, TmpV); MadeChange = true; }36 |
1397 | 234 | TmpV = SimplifyDemandedVectorElts(II->getArgOperand(2), DemandedElts, |
1398 | 234 | UndefElts3, Depth + 1); |
1399 | 234 | if (TmpV234 ) { II->setArgOperand(2, TmpV); MadeChange = true; }24 |
1400 | 234 | |
1401 | 234 | // Lower element is undefined if all three lower elements are undefined. |
1402 | 234 | // Consider things like undef&0. The result is known zero, not undef. |
1403 | 234 | if (!UndefElts2[0] || 234 !UndefElts3[0]0 ) |
1404 | 234 | UndefElts.clearBit(0); |
1405 | 234 | |
1406 | 234 | break; |
1407 | 234 | |
1408 | 69 | case Intrinsic::x86_avx512_mask3_vfmadd_ss: |
1409 | 69 | case Intrinsic::x86_avx512_mask3_vfmadd_sd: |
1410 | 69 | case Intrinsic::x86_avx512_mask3_vfmsub_ss: |
1411 | 69 | case Intrinsic::x86_avx512_mask3_vfmsub_sd: |
1412 | 69 | case Intrinsic::x86_avx512_mask3_vfnmsub_ss: |
1413 | 69 | case Intrinsic::x86_avx512_mask3_vfnmsub_sd: |
1414 | 69 | // These intrinsics get the passthru bits from operand 2. |
1415 | 69 | TmpV = SimplifyDemandedVectorElts(II->getArgOperand(2), DemandedElts, |
1416 | 69 | UndefElts, Depth + 1); |
1417 | 69 | if (TmpV69 ) { II->setArgOperand(2, TmpV); MadeChange = true; }12 |
1418 | 69 | |
1419 | 69 | // If lowest element of a scalar op isn't used then use Arg2. |
1420 | 69 | if (!DemandedElts[0]69 ) { |
1421 | 6 | Worklist.Add(II); |
1422 | 6 | return II->getArgOperand(2); |
1423 | 6 | } |
1424 | 63 | |
1425 | 63 | // Only lower element is used for operand 0 and 1. |
1426 | 63 | DemandedElts = 1; |
1427 | 63 | TmpV = SimplifyDemandedVectorElts(II->getArgOperand(0), DemandedElts, |
1428 | 63 | UndefElts2, Depth + 1); |
1429 | 63 | if (TmpV63 ) { II->setArgOperand(0, TmpV); MadeChange = true; }6 |
1430 | 63 | TmpV = SimplifyDemandedVectorElts(II->getArgOperand(1), DemandedElts, |
1431 | 63 | UndefElts3, Depth + 1); |
1432 | 63 | if (TmpV63 ) { II->setArgOperand(1, TmpV); MadeChange = true; }6 |
1433 | 63 | |
1434 | 63 | // Lower element is undefined if all three lower elements are undefined. |
1435 | 63 | // Consider things like undef&0. The result is known zero, not undef. |
1436 | 63 | if (!UndefElts2[0] || 63 !UndefElts3[0]0 ) |
1437 | 63 | UndefElts.clearBit(0); |
1438 | 63 | |
1439 | 63 | break; |
1440 | 63 | |
1441 | 36 | case Intrinsic::x86_sse2_pmulu_dq: |
1442 | 36 | case Intrinsic::x86_sse41_pmuldq: |
1443 | 36 | case Intrinsic::x86_avx2_pmul_dq: |
1444 | 36 | case Intrinsic::x86_avx2_pmulu_dq: |
1445 | 36 | case Intrinsic::x86_avx512_pmul_dq_512: |
1446 | 36 | case Intrinsic::x86_avx512_pmulu_dq_512: { |
1447 | 36 | Value *Op0 = II->getArgOperand(0); |
1448 | 36 | Value *Op1 = II->getArgOperand(1); |
1449 | 36 | unsigned InnerVWidth = Op0->getType()->getVectorNumElements(); |
1450 | 36 | assert((VWidth * 2) == InnerVWidth && "Unexpected input size"); |
1451 | 36 | |
1452 | 36 | APInt InnerDemandedElts(InnerVWidth, 0); |
1453 | 204 | for (unsigned i = 0; i != VWidth204 ; ++i168 ) |
1454 | 168 | if (168 DemandedElts[i]168 ) |
1455 | 140 | InnerDemandedElts.setBit(i * 2); |
1456 | 36 | |
1457 | 36 | UndefElts2 = APInt(InnerVWidth, 0); |
1458 | 36 | TmpV = SimplifyDemandedVectorElts(Op0, InnerDemandedElts, UndefElts2, |
1459 | 36 | Depth + 1); |
1460 | 36 | if (TmpV36 ) { II->setArgOperand(0, TmpV); MadeChange = true; }9 |
1461 | 36 | |
1462 | 36 | UndefElts3 = APInt(InnerVWidth, 0); |
1463 | 36 | TmpV = SimplifyDemandedVectorElts(Op1, InnerDemandedElts, UndefElts3, |
1464 | 36 | Depth + 1); |
1465 | 36 | if (TmpV36 ) { II->setArgOperand(1, TmpV); MadeChange = true; }9 |
1466 | 36 | |
1467 | 36 | break; |
1468 | 36 | } |
1469 | 36 | |
1470 | 25 | case Intrinsic::x86_sse2_packssdw_128: |
1471 | 25 | case Intrinsic::x86_sse2_packsswb_128: |
1472 | 25 | case Intrinsic::x86_sse2_packuswb_128: |
1473 | 25 | case Intrinsic::x86_sse41_packusdw: |
1474 | 25 | case Intrinsic::x86_avx2_packssdw: |
1475 | 25 | case Intrinsic::x86_avx2_packsswb: |
1476 | 25 | case Intrinsic::x86_avx2_packusdw: |
1477 | 25 | case Intrinsic::x86_avx2_packuswb: |
1478 | 25 | case Intrinsic::x86_avx512_packssdw_512: |
1479 | 25 | case Intrinsic::x86_avx512_packsswb_512: |
1480 | 25 | case Intrinsic::x86_avx512_packusdw_512: |
1481 | 25 | case Intrinsic::x86_avx512_packuswb_512: { |
1482 | 25 | auto *Ty0 = II->getArgOperand(0)->getType(); |
1483 | 25 | unsigned InnerVWidth = Ty0->getVectorNumElements(); |
1484 | 25 | assert(VWidth == (InnerVWidth * 2) && "Unexpected input size"); |
1485 | 25 | |
1486 | 25 | unsigned NumLanes = Ty0->getPrimitiveSizeInBits() / 128; |
1487 | 25 | unsigned VWidthPerLane = VWidth / NumLanes; |
1488 | 25 | unsigned InnerVWidthPerLane = InnerVWidth / NumLanes; |
1489 | 25 | |
1490 | 25 | // Per lane, pack the elements of the first input and then the second. |
1491 | 25 | // e.g. |
1492 | 25 | // v8i16 PACK(v4i32 X, v4i32 Y) - (X[0..3],Y[0..3]) |
1493 | 25 | // v32i8 PACK(v16i16 X, v16i16 Y) - (X[0..7],Y[0..7]),(X[8..15],Y[8..15]) |
1494 | 75 | for (int OpNum = 0; OpNum != 275 ; ++OpNum50 ) { |
1495 | 50 | APInt OpDemandedElts(InnerVWidth, 0); |
1496 | 164 | for (unsigned Lane = 0; Lane != NumLanes164 ; ++Lane114 ) { |
1497 | 114 | unsigned LaneIdx = Lane * VWidthPerLane; |
1498 | 682 | for (unsigned Elt = 0; Elt != InnerVWidthPerLane682 ; ++Elt568 ) { |
1499 | 568 | unsigned Idx = LaneIdx + Elt + InnerVWidthPerLane * OpNum; |
1500 | 568 | if (DemandedElts[Idx]) |
1501 | 154 | OpDemandedElts.setBit((Lane * InnerVWidthPerLane) + Elt); |
1502 | 568 | } |
1503 | 114 | } |
1504 | 50 | |
1505 | 50 | // Demand elements from the operand. |
1506 | 50 | auto *Op = II->getArgOperand(OpNum); |
1507 | 50 | APInt OpUndefElts(InnerVWidth, 0); |
1508 | 50 | TmpV = SimplifyDemandedVectorElts(Op, OpDemandedElts, OpUndefElts, |
1509 | 50 | Depth + 1); |
1510 | 50 | if (TmpV50 ) { |
1511 | 19 | II->setArgOperand(OpNum, TmpV); |
1512 | 19 | MadeChange = true; |
1513 | 19 | } |
1514 | 50 | |
1515 | 50 | // Pack the operand's UNDEF elements, one lane at a time. |
1516 | 50 | OpUndefElts = OpUndefElts.zext(VWidth); |
1517 | 164 | for (unsigned Lane = 0; Lane != NumLanes164 ; ++Lane114 ) { |
1518 | 114 | APInt LaneElts = OpUndefElts.lshr(InnerVWidthPerLane * Lane); |
1519 | 114 | LaneElts = LaneElts.getLoBits(InnerVWidthPerLane); |
1520 | 114 | LaneElts <<= InnerVWidthPerLane * (2 * Lane + OpNum); |
1521 | 114 | UndefElts |= LaneElts; |
1522 | 114 | } |
1523 | 50 | } |
1524 | 25 | break; |
1525 | 25 | } |
1526 | 25 | |
1527 | 25 | // PSHUFB |
1528 | 26 | case Intrinsic::x86_ssse3_pshuf_b_128: |
1529 | 26 | case Intrinsic::x86_avx2_pshuf_b: |
1530 | 26 | case Intrinsic::x86_avx512_pshuf_b_512: |
1531 | 26 | // PERMILVAR |
1532 | 26 | case Intrinsic::x86_avx_vpermilvar_ps: |
1533 | 26 | case Intrinsic::x86_avx_vpermilvar_ps_256: |
1534 | 26 | case Intrinsic::x86_avx512_vpermilvar_ps_512: |
1535 | 26 | case Intrinsic::x86_avx_vpermilvar_pd: |
1536 | 26 | case Intrinsic::x86_avx_vpermilvar_pd_256: |
1537 | 26 | case Intrinsic::x86_avx512_vpermilvar_pd_512: |
1538 | 26 | // PERMV |
1539 | 26 | case Intrinsic::x86_avx2_permd: |
1540 | 26 | case Intrinsic::x86_avx2_permps: { |
1541 | 26 | Value *Op1 = II->getArgOperand(1); |
1542 | 26 | TmpV = SimplifyDemandedVectorElts(Op1, DemandedElts, UndefElts, |
1543 | 26 | Depth + 1); |
1544 | 26 | if (TmpV26 ) { II->setArgOperand(1, TmpV); MadeChange = true; }11 |
1545 | 26 | break; |
1546 | 26 | } |
1547 | 26 | |
1548 | 26 | // SSE4A instructions leave the upper 64-bits of the 128-bit result |
1549 | 26 | // in an undefined state. |
1550 | 4 | case Intrinsic::x86_sse4a_extrq: |
1551 | 4 | case Intrinsic::x86_sse4a_extrqi: |
1552 | 4 | case Intrinsic::x86_sse4a_insertq: |
1553 | 4 | case Intrinsic::x86_sse4a_insertqi: |
1554 | 4 | UndefElts.setHighBits(VWidth / 2); |
1555 | 4 | break; |
1556 | 97 | case Intrinsic::amdgcn_buffer_load: |
1557 | 97 | case Intrinsic::amdgcn_buffer_load_format: |
1558 | 97 | case Intrinsic::amdgcn_image_sample: |
1559 | 97 | case Intrinsic::amdgcn_image_sample_cl: |
1560 | 97 | case Intrinsic::amdgcn_image_sample_d: |
1561 | 97 | case Intrinsic::amdgcn_image_sample_d_cl: |
1562 | 97 | case Intrinsic::amdgcn_image_sample_l: |
1563 | 97 | case Intrinsic::amdgcn_image_sample_b: |
1564 | 97 | case Intrinsic::amdgcn_image_sample_b_cl: |
1565 | 97 | case Intrinsic::amdgcn_image_sample_lz: |
1566 | 97 | case Intrinsic::amdgcn_image_sample_cd: |
1567 | 97 | case Intrinsic::amdgcn_image_sample_cd_cl: |
1568 | 97 | |
1569 | 97 | case Intrinsic::amdgcn_image_sample_c: |
1570 | 97 | case Intrinsic::amdgcn_image_sample_c_cl: |
1571 | 97 | case Intrinsic::amdgcn_image_sample_c_d: |
1572 | 97 | case Intrinsic::amdgcn_image_sample_c_d_cl: |
1573 | 97 | case Intrinsic::amdgcn_image_sample_c_l: |
1574 | 97 | case Intrinsic::amdgcn_image_sample_c_b: |
1575 | 97 | case Intrinsic::amdgcn_image_sample_c_b_cl: |
1576 | 97 | case Intrinsic::amdgcn_image_sample_c_lz: |
1577 | 97 | case Intrinsic::amdgcn_image_sample_c_cd: |
1578 | 97 | case Intrinsic::amdgcn_image_sample_c_cd_cl: |
1579 | 97 | |
1580 | 97 | case Intrinsic::amdgcn_image_sample_o: |
1581 | 97 | case Intrinsic::amdgcn_image_sample_cl_o: |
1582 | 97 | case Intrinsic::amdgcn_image_sample_d_o: |
1583 | 97 | case Intrinsic::amdgcn_image_sample_d_cl_o: |
1584 | 97 | case Intrinsic::amdgcn_image_sample_l_o: |
1585 | 97 | case Intrinsic::amdgcn_image_sample_b_o: |
1586 | 97 | case Intrinsic::amdgcn_image_sample_b_cl_o: |
1587 | 97 | case Intrinsic::amdgcn_image_sample_lz_o: |
1588 | 97 | case Intrinsic::amdgcn_image_sample_cd_o: |
1589 | 97 | case Intrinsic::amdgcn_image_sample_cd_cl_o: |
1590 | 97 | |
1591 | 97 | case Intrinsic::amdgcn_image_sample_c_o: |
1592 | 97 | case Intrinsic::amdgcn_image_sample_c_cl_o: |
1593 | 97 | case Intrinsic::amdgcn_image_sample_c_d_o: |
1594 | 97 | case Intrinsic::amdgcn_image_sample_c_d_cl_o: |
1595 | 97 | case Intrinsic::amdgcn_image_sample_c_l_o: |
1596 | 97 | case Intrinsic::amdgcn_image_sample_c_b_o: |
1597 | 97 | case Intrinsic::amdgcn_image_sample_c_b_cl_o: |
1598 | 97 | case Intrinsic::amdgcn_image_sample_c_lz_o: |
1599 | 97 | case Intrinsic::amdgcn_image_sample_c_cd_o: |
1600 | 97 | case Intrinsic::amdgcn_image_sample_c_cd_cl_o: |
1601 | 97 | |
1602 | 97 | case Intrinsic::amdgcn_image_getlod: { |
1603 | 97 | if (VWidth == 1 || 97 !DemandedElts.isMask()97 ) |
1604 | 11 | return nullptr; |
1605 | 86 | |
1606 | 86 | // TODO: Handle 3 vectors when supported in code gen. |
1607 | 86 | unsigned NewNumElts = PowerOf2Ceil(DemandedElts.countTrailingOnes()); |
1608 | 86 | if (NewNumElts == VWidth) |
1609 | 10 | return nullptr; |
1610 | 76 | |
1611 | 76 | Module *M = II->getParent()->getParent()->getParent(); |
1612 | 76 | Type *EltTy = V->getType()->getVectorElementType(); |
1613 | 76 | |
1614 | 67 | Type *NewTy = (NewNumElts == 1) ? EltTy : |
1615 | 9 | VectorType::get(EltTy, NewNumElts); |
1616 | 76 | |
1617 | 76 | auto IID = II->getIntrinsicID(); |
1618 | 76 | |
1619 | 76 | bool IsBuffer = IID == Intrinsic::amdgcn_buffer_load || |
1620 | 70 | IID == Intrinsic::amdgcn_buffer_load_format; |
1621 | 76 | |
1622 | 76 | Function *NewIntrin = IsBuffer ? |
1623 | 12 | Intrinsic::getDeclaration(M, IID, NewTy) : |
1624 | 76 | // Samplers have 3 mangled types. |
1625 | 64 | Intrinsic::getDeclaration(M, IID, |
1626 | 64 | { NewTy, II->getArgOperand(0)->getType(), |
1627 | 64 | II->getArgOperand(1)->getType()}); |
1628 | 76 | |
1629 | 76 | SmallVector<Value *, 5> Args; |
1630 | 712 | for (unsigned I = 0, E = II->getNumArgOperands(); I != E712 ; ++I636 ) |
1631 | 636 | Args.push_back(II->getArgOperand(I)); |
1632 | 76 | |
1633 | 76 | IRBuilderBase::InsertPointGuard Guard(Builder); |
1634 | 76 | Builder.SetInsertPoint(II); |
1635 | 76 | |
1636 | 76 | CallInst *NewCall = Builder.CreateCall(NewIntrin, Args); |
1637 | 76 | NewCall->takeName(II); |
1638 | 76 | NewCall->copyMetadata(*II); |
1639 | 76 | |
1640 | 76 | if (!IsBuffer76 ) { |
1641 | 64 | ConstantInt *DMask = dyn_cast<ConstantInt>(NewCall->getArgOperand(3)); |
1642 | 64 | if (DMask64 ) { |
1643 | 63 | unsigned DMaskVal = DMask->getZExtValue() & 0xf; |
1644 | 63 | |
1645 | 63 | unsigned PopCnt = 0; |
1646 | 63 | unsigned NewDMask = 0; |
1647 | 153 | for (unsigned I = 0; I < 4153 ; ++I90 ) { |
1648 | 145 | const unsigned Bit = 1 << I; |
1649 | 145 | if (!!(DMaskVal & Bit)145 ) { |
1650 | 120 | if (++PopCnt > NewNumElts) |
1651 | 55 | break; |
1652 | 65 | |
1653 | 65 | NewDMask |= Bit; |
1654 | 65 | } |
1655 | 145 | } |
1656 | 63 | |
1657 | 63 | NewCall->setArgOperand(3, ConstantInt::get(DMask->getType(), NewDMask)); |
1658 | 63 | } |
1659 | 64 | } |
1660 | 76 | |
1661 | 76 | |
1662 | 76 | if (NewNumElts == 176 ) { |
1663 | 67 | return Builder.CreateInsertElement(UndefValue::get(V->getType()), |
1664 | 67 | NewCall, static_cast<uint64_t>(0)); |
1665 | 67 | } |
1666 | 9 | |
1667 | 9 | SmallVector<uint32_t, 8> EltMask; |
1668 | 43 | for (unsigned I = 0; I < VWidth43 ; ++I34 ) |
1669 | 34 | EltMask.push_back(I); |
1670 | 23 | |
1671 | 23 | Value *Shuffle = Builder.CreateShuffleVector( |
1672 | 23 | NewCall, UndefValue::get(NewTy), EltMask); |
1673 | 23 | |
1674 | 23 | MadeChange = true; |
1675 | 23 | return Shuffle; |
1676 | 23 | } |
1677 | 16.8k | } |
1678 | 16.8k | break; |
1679 | 16.8k | } |
1680 | 731k | } |
1681 | 731k | return MadeChange ? 731k I12.2k : nullptr719k ; |
1682 | 1.41M | } |