/Users/buildslave/jenkins/sharedspace/clang-stage2-coverage-R@2/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
Line | Count | Source (jump to first uncovered line) |
1 | | //===-- AMDGPUISelLowering.cpp - AMDGPU Common DAG lowering functions -----===// |
2 | | // |
3 | | // The LLVM Compiler Infrastructure |
4 | | // |
5 | | // This file is distributed under the University of Illinois Open Source |
6 | | // License. See LICENSE.TXT for details. |
7 | | // |
8 | | //===----------------------------------------------------------------------===// |
9 | | // |
10 | | /// \file |
11 | | /// \brief This is the parent TargetLowering class for hardware code gen |
12 | | /// targets. |
13 | | // |
14 | | //===----------------------------------------------------------------------===// |
15 | | |
16 | | #include "AMDGPUISelLowering.h" |
17 | | #include "AMDGPU.h" |
18 | | #include "AMDGPUCallLowering.h" |
19 | | #include "AMDGPUFrameLowering.h" |
20 | | #include "AMDGPUIntrinsicInfo.h" |
21 | | #include "AMDGPURegisterInfo.h" |
22 | | #include "AMDGPUSubtarget.h" |
23 | | #include "AMDGPUTargetMachine.h" |
24 | | #include "R600MachineFunctionInfo.h" |
25 | | #include "SIInstrInfo.h" |
26 | | #include "SIMachineFunctionInfo.h" |
27 | | #include "llvm/CodeGen/CallingConvLower.h" |
28 | | #include "llvm/CodeGen/MachineFunction.h" |
29 | | #include "llvm/CodeGen/MachineRegisterInfo.h" |
30 | | #include "llvm/CodeGen/SelectionDAG.h" |
31 | | #include "llvm/CodeGen/TargetLoweringObjectFileImpl.h" |
32 | | #include "llvm/IR/DataLayout.h" |
33 | | #include "llvm/IR/DiagnosticInfo.h" |
34 | | #include "llvm/Support/KnownBits.h" |
35 | | using namespace llvm; |
36 | | |
37 | | static bool allocateKernArg(unsigned ValNo, MVT ValVT, MVT LocVT, |
38 | | CCValAssign::LocInfo LocInfo, |
39 | 37.3k | ISD::ArgFlagsTy ArgFlags, CCState &State) { |
40 | 37.3k | MachineFunction &MF = State.getMachineFunction(); |
41 | 37.3k | AMDGPUMachineFunction *MFI = MF.getInfo<AMDGPUMachineFunction>(); |
42 | 37.3k | |
43 | 37.3k | uint64_t Offset = MFI->allocateKernArg(LocVT.getStoreSize(), |
44 | 37.3k | ArgFlags.getOrigAlign()); |
45 | 37.3k | State.addLoc(CCValAssign::getCustomMem(ValNo, ValVT, Offset, LocVT, LocInfo)); |
46 | 37.3k | return true; |
47 | 37.3k | } |
48 | | |
49 | | static bool allocateCCRegs(unsigned ValNo, MVT ValVT, MVT LocVT, |
50 | | CCValAssign::LocInfo LocInfo, |
51 | | ISD::ArgFlagsTy ArgFlags, CCState &State, |
52 | | const TargetRegisterClass *RC, |
53 | 1.04k | unsigned NumRegs) { |
54 | 1.04k | ArrayRef<MCPhysReg> RegList = makeArrayRef(RC->begin(), NumRegs); |
55 | 1.04k | unsigned RegResult = State.AllocateReg(RegList); |
56 | 1.04k | if (RegResult == AMDGPU::NoRegister) |
57 | 36 | return false; |
58 | 1.00k | |
59 | 1.00k | State.addLoc(CCValAssign::getReg(ValNo, ValVT, RegResult, LocVT, LocInfo)); |
60 | 1.00k | return true; |
61 | 1.00k | } |
62 | | |
63 | | static bool allocateSGPRTuple(unsigned ValNo, MVT ValVT, MVT LocVT, |
64 | | CCValAssign::LocInfo LocInfo, |
65 | 317 | ISD::ArgFlagsTy ArgFlags, CCState &State) { |
66 | 317 | switch (LocVT.SimpleTy) { |
67 | 317 | case MVT::i64: |
68 | 317 | case MVT::f64: |
69 | 317 | case MVT::v2i32: |
70 | 317 | case MVT::v2f32: { |
71 | 317 | // Up to SGPR0-SGPR39 |
72 | 317 | return allocateCCRegs(ValNo, ValVT, LocVT, LocInfo, ArgFlags, State, |
73 | 317 | &AMDGPU::SGPR_64RegClass, 20); |
74 | 317 | } |
75 | 0 | default: |
76 | 0 | return false; |
77 | 0 | } |
78 | 0 | } |
79 | | |
80 | | // Allocate up to VGPR31. |
81 | | // |
82 | | // TODO: Since there are no VGPR alignent requirements would it be better to |
83 | | // split into individual scalar registers? |
84 | | static bool allocateVGPRTuple(unsigned ValNo, MVT ValVT, MVT LocVT, |
85 | | CCValAssign::LocInfo LocInfo, |
86 | 723 | ISD::ArgFlagsTy ArgFlags, CCState &State) { |
87 | 723 | switch (LocVT.SimpleTy) { |
88 | 296 | case MVT::i64: |
89 | 296 | case MVT::f64: |
90 | 296 | case MVT::v2i32: |
91 | 296 | case MVT::v2f32: { |
92 | 296 | return allocateCCRegs(ValNo, ValVT, LocVT, LocInfo, ArgFlags, State, |
93 | 296 | &AMDGPU::VReg_64RegClass, 31); |
94 | 296 | } |
95 | 238 | case MVT::v4i32: |
96 | 238 | case MVT::v4f32: |
97 | 238 | case MVT::v2i64: |
98 | 238 | case MVT::v2f64: { |
99 | 238 | return allocateCCRegs(ValNo, ValVT, LocVT, LocInfo, ArgFlags, State, |
100 | 238 | &AMDGPU::VReg_128RegClass, 29); |
101 | 238 | } |
102 | 43 | case MVT::v8i32: |
103 | 43 | case MVT::v8f32: { |
104 | 43 | return allocateCCRegs(ValNo, ValVT, LocVT, LocInfo, ArgFlags, State, |
105 | 43 | &AMDGPU::VReg_256RegClass, 25); |
106 | 43 | |
107 | 43 | } |
108 | 146 | case MVT::v16i32: |
109 | 146 | case MVT::v16f32: { |
110 | 146 | return allocateCCRegs(ValNo, ValVT, LocVT, LocInfo, ArgFlags, State, |
111 | 146 | &AMDGPU::VReg_512RegClass, 17); |
112 | 146 | |
113 | 146 | } |
114 | 0 | default: |
115 | 0 | return false; |
116 | 0 | } |
117 | 0 | } |
118 | | |
119 | | #include "AMDGPUGenCallingConv.inc" |
120 | | |
121 | | // Find a larger type to do a load / store of a vector with. |
122 | 4.17k | EVT AMDGPUTargetLowering::getEquivalentMemType(LLVMContext &Ctx, EVT VT) { |
123 | 4.17k | unsigned StoreSize = VT.getStoreSizeInBits(); |
124 | 4.17k | if (StoreSize <= 32) |
125 | 1.77k | return EVT::getIntegerVT(Ctx, StoreSize); |
126 | 2.39k | |
127 | 4.17k | assert(StoreSize % 32 == 0 && "Store size not a multiple of 32"); |
128 | 2.39k | return EVT::getVectorVT(Ctx, MVT::i32, StoreSize / 32); |
129 | 2.39k | } |
130 | | |
131 | | bool AMDGPUTargetLowering::isOrEquivalentToAdd(SelectionDAG &DAG, SDValue Op) |
132 | 252 | { |
133 | 252 | assert(Op.getOpcode() == ISD::OR); |
134 | 252 | |
135 | 252 | SDValue N0 = Op->getOperand(0); |
136 | 252 | SDValue N1 = Op->getOperand(1); |
137 | 252 | EVT VT = N0.getValueType(); |
138 | 252 | |
139 | 252 | if (VT.isInteger() && 252 !VT.isVector()252 ) { |
140 | 252 | KnownBits LHSKnown, RHSKnown; |
141 | 252 | DAG.computeKnownBits(N0, LHSKnown); |
142 | 252 | |
143 | 252 | if (LHSKnown.Zero.getBoolValue()252 ) { |
144 | 250 | DAG.computeKnownBits(N1, RHSKnown); |
145 | 250 | |
146 | 250 | if (!(~RHSKnown.Zero & ~LHSKnown.Zero)) |
147 | 234 | return true; |
148 | 18 | } |
149 | 252 | } |
150 | 18 | |
151 | 18 | return false; |
152 | 18 | } |
153 | | |
154 | | AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM, |
155 | | const AMDGPUSubtarget &STI) |
156 | 2.06k | : TargetLowering(TM), Subtarget(&STI) { |
157 | 2.06k | AMDGPUASI = AMDGPU::getAMDGPUAS(TM); |
158 | 2.06k | // Lower floating point store/load to integer store/load to reduce the number |
159 | 2.06k | // of patterns in tablegen. |
160 | 2.06k | setOperationAction(ISD::LOAD, MVT::f32, Promote); |
161 | 2.06k | AddPromotedToType(ISD::LOAD, MVT::f32, MVT::i32); |
162 | 2.06k | |
163 | 2.06k | setOperationAction(ISD::LOAD, MVT::v2f32, Promote); |
164 | 2.06k | AddPromotedToType(ISD::LOAD, MVT::v2f32, MVT::v2i32); |
165 | 2.06k | |
166 | 2.06k | setOperationAction(ISD::LOAD, MVT::v4f32, Promote); |
167 | 2.06k | AddPromotedToType(ISD::LOAD, MVT::v4f32, MVT::v4i32); |
168 | 2.06k | |
169 | 2.06k | setOperationAction(ISD::LOAD, MVT::v8f32, Promote); |
170 | 2.06k | AddPromotedToType(ISD::LOAD, MVT::v8f32, MVT::v8i32); |
171 | 2.06k | |
172 | 2.06k | setOperationAction(ISD::LOAD, MVT::v16f32, Promote); |
173 | 2.06k | AddPromotedToType(ISD::LOAD, MVT::v16f32, MVT::v16i32); |
174 | 2.06k | |
175 | 2.06k | setOperationAction(ISD::LOAD, MVT::i64, Promote); |
176 | 2.06k | AddPromotedToType(ISD::LOAD, MVT::i64, MVT::v2i32); |
177 | 2.06k | |
178 | 2.06k | setOperationAction(ISD::LOAD, MVT::v2i64, Promote); |
179 | 2.06k | AddPromotedToType(ISD::LOAD, MVT::v2i64, MVT::v4i32); |
180 | 2.06k | |
181 | 2.06k | setOperationAction(ISD::LOAD, MVT::f64, Promote); |
182 | 2.06k | AddPromotedToType(ISD::LOAD, MVT::f64, MVT::v2i32); |
183 | 2.06k | |
184 | 2.06k | setOperationAction(ISD::LOAD, MVT::v2f64, Promote); |
185 | 2.06k | AddPromotedToType(ISD::LOAD, MVT::v2f64, MVT::v4i32); |
186 | 2.06k | |
187 | 2.06k | // There are no 64-bit extloads. These should be done as a 32-bit extload and |
188 | 2.06k | // an extension to 64-bit. |
189 | 12.4k | for (MVT VT : MVT::integer_valuetypes()) { |
190 | 12.4k | setLoadExtAction(ISD::EXTLOAD, MVT::i64, VT, Expand); |
191 | 12.4k | setLoadExtAction(ISD::SEXTLOAD, MVT::i64, VT, Expand); |
192 | 12.4k | setLoadExtAction(ISD::ZEXTLOAD, MVT::i64, VT, Expand); |
193 | 12.4k | } |
194 | 2.06k | |
195 | 12.4k | for (MVT VT : MVT::integer_valuetypes()) { |
196 | 12.4k | if (VT == MVT::i64) |
197 | 2.06k | continue; |
198 | 10.3k | |
199 | 10.3k | setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote); |
200 | 10.3k | setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i8, Legal); |
201 | 10.3k | setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i16, Legal); |
202 | 10.3k | setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i32, Expand); |
203 | 10.3k | |
204 | 10.3k | setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i1, Promote); |
205 | 10.3k | setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i8, Legal); |
206 | 10.3k | setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i16, Legal); |
207 | 10.3k | setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i32, Expand); |
208 | 10.3k | |
209 | 10.3k | setLoadExtAction(ISD::EXTLOAD, VT, MVT::i1, Promote); |
210 | 10.3k | setLoadExtAction(ISD::EXTLOAD, VT, MVT::i8, Legal); |
211 | 10.3k | setLoadExtAction(ISD::EXTLOAD, VT, MVT::i16, Legal); |
212 | 10.3k | setLoadExtAction(ISD::EXTLOAD, VT, MVT::i32, Expand); |
213 | 10.3k | } |
214 | 2.06k | |
215 | 144k | for (MVT VT : MVT::integer_vector_valuetypes()) { |
216 | 144k | setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i8, Expand); |
217 | 144k | setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i8, Expand); |
218 | 144k | setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::v2i8, Expand); |
219 | 144k | setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4i8, Expand); |
220 | 144k | setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v4i8, Expand); |
221 | 144k | setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::v4i8, Expand); |
222 | 144k | setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i16, Expand); |
223 | 144k | setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i16, Expand); |
224 | 144k | setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::v2i16, Expand); |
225 | 144k | setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4i16, Expand); |
226 | 144k | setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v4i16, Expand); |
227 | 144k | setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::v4i16, Expand); |
228 | 144k | } |
229 | 2.06k | |
230 | 2.06k | setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::f16, Expand); |
231 | 2.06k | setLoadExtAction(ISD::EXTLOAD, MVT::v2f32, MVT::v2f16, Expand); |
232 | 2.06k | setLoadExtAction(ISD::EXTLOAD, MVT::v4f32, MVT::v4f16, Expand); |
233 | 2.06k | setLoadExtAction(ISD::EXTLOAD, MVT::v8f32, MVT::v8f16, Expand); |
234 | 2.06k | |
235 | 2.06k | setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f32, Expand); |
236 | 2.06k | setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f32, Expand); |
237 | 2.06k | setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f32, Expand); |
238 | 2.06k | setLoadExtAction(ISD::EXTLOAD, MVT::v8f64, MVT::v8f32, Expand); |
239 | 2.06k | |
240 | 2.06k | setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f16, Expand); |
241 | 2.06k | setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f16, Expand); |
242 | 2.06k | setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f16, Expand); |
243 | 2.06k | setLoadExtAction(ISD::EXTLOAD, MVT::v8f64, MVT::v8f16, Expand); |
244 | 2.06k | |
245 | 2.06k | setOperationAction(ISD::STORE, MVT::f32, Promote); |
246 | 2.06k | AddPromotedToType(ISD::STORE, MVT::f32, MVT::i32); |
247 | 2.06k | |
248 | 2.06k | setOperationAction(ISD::STORE, MVT::v2f32, Promote); |
249 | 2.06k | AddPromotedToType(ISD::STORE, MVT::v2f32, MVT::v2i32); |
250 | 2.06k | |
251 | 2.06k | setOperationAction(ISD::STORE, MVT::v4f32, Promote); |
252 | 2.06k | AddPromotedToType(ISD::STORE, MVT::v4f32, MVT::v4i32); |
253 | 2.06k | |
254 | 2.06k | setOperationAction(ISD::STORE, MVT::v8f32, Promote); |
255 | 2.06k | AddPromotedToType(ISD::STORE, MVT::v8f32, MVT::v8i32); |
256 | 2.06k | |
257 | 2.06k | setOperationAction(ISD::STORE, MVT::v16f32, Promote); |
258 | 2.06k | AddPromotedToType(ISD::STORE, MVT::v16f32, MVT::v16i32); |
259 | 2.06k | |
260 | 2.06k | setOperationAction(ISD::STORE, MVT::i64, Promote); |
261 | 2.06k | AddPromotedToType(ISD::STORE, MVT::i64, MVT::v2i32); |
262 | 2.06k | |
263 | 2.06k | setOperationAction(ISD::STORE, MVT::v2i64, Promote); |
264 | 2.06k | AddPromotedToType(ISD::STORE, MVT::v2i64, MVT::v4i32); |
265 | 2.06k | |
266 | 2.06k | setOperationAction(ISD::STORE, MVT::f64, Promote); |
267 | 2.06k | AddPromotedToType(ISD::STORE, MVT::f64, MVT::v2i32); |
268 | 2.06k | |
269 | 2.06k | setOperationAction(ISD::STORE, MVT::v2f64, Promote); |
270 | 2.06k | AddPromotedToType(ISD::STORE, MVT::v2f64, MVT::v4i32); |
271 | 2.06k | |
272 | 2.06k | setTruncStoreAction(MVT::i64, MVT::i1, Expand); |
273 | 2.06k | setTruncStoreAction(MVT::i64, MVT::i8, Expand); |
274 | 2.06k | setTruncStoreAction(MVT::i64, MVT::i16, Expand); |
275 | 2.06k | setTruncStoreAction(MVT::i64, MVT::i32, Expand); |
276 | 2.06k | |
277 | 2.06k | setTruncStoreAction(MVT::v2i64, MVT::v2i1, Expand); |
278 | 2.06k | setTruncStoreAction(MVT::v2i64, MVT::v2i8, Expand); |
279 | 2.06k | setTruncStoreAction(MVT::v2i64, MVT::v2i16, Expand); |
280 | 2.06k | setTruncStoreAction(MVT::v2i64, MVT::v2i32, Expand); |
281 | 2.06k | |
282 | 2.06k | setTruncStoreAction(MVT::f32, MVT::f16, Expand); |
283 | 2.06k | setTruncStoreAction(MVT::v2f32, MVT::v2f16, Expand); |
284 | 2.06k | setTruncStoreAction(MVT::v4f32, MVT::v4f16, Expand); |
285 | 2.06k | setTruncStoreAction(MVT::v8f32, MVT::v8f16, Expand); |
286 | 2.06k | |
287 | 2.06k | setTruncStoreAction(MVT::f64, MVT::f16, Expand); |
288 | 2.06k | setTruncStoreAction(MVT::f64, MVT::f32, Expand); |
289 | 2.06k | |
290 | 2.06k | setTruncStoreAction(MVT::v2f64, MVT::v2f32, Expand); |
291 | 2.06k | setTruncStoreAction(MVT::v2f64, MVT::v2f16, Expand); |
292 | 2.06k | |
293 | 2.06k | setTruncStoreAction(MVT::v4f64, MVT::v4f32, Expand); |
294 | 2.06k | setTruncStoreAction(MVT::v4f64, MVT::v4f16, Expand); |
295 | 2.06k | |
296 | 2.06k | setTruncStoreAction(MVT::v8f64, MVT::v8f32, Expand); |
297 | 2.06k | setTruncStoreAction(MVT::v8f64, MVT::v8f16, Expand); |
298 | 2.06k | |
299 | 2.06k | |
300 | 2.06k | setOperationAction(ISD::Constant, MVT::i32, Legal); |
301 | 2.06k | setOperationAction(ISD::Constant, MVT::i64, Legal); |
302 | 2.06k | setOperationAction(ISD::ConstantFP, MVT::f32, Legal); |
303 | 2.06k | setOperationAction(ISD::ConstantFP, MVT::f64, Legal); |
304 | 2.06k | |
305 | 2.06k | setOperationAction(ISD::BR_JT, MVT::Other, Expand); |
306 | 2.06k | setOperationAction(ISD::BRIND, MVT::Other, Expand); |
307 | 2.06k | |
308 | 2.06k | // This is totally unsupported, just custom lower to produce an error. |
309 | 2.06k | setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Custom); |
310 | 2.06k | |
311 | 2.06k | // Library functions. These default to Expand, but we have instructions |
312 | 2.06k | // for them. |
313 | 2.06k | setOperationAction(ISD::FCEIL, MVT::f32, Legal); |
314 | 2.06k | setOperationAction(ISD::FEXP2, MVT::f32, Legal); |
315 | 2.06k | setOperationAction(ISD::FPOW, MVT::f32, Legal); |
316 | 2.06k | setOperationAction(ISD::FLOG2, MVT::f32, Legal); |
317 | 2.06k | setOperationAction(ISD::FABS, MVT::f32, Legal); |
318 | 2.06k | setOperationAction(ISD::FFLOOR, MVT::f32, Legal); |
319 | 2.06k | setOperationAction(ISD::FRINT, MVT::f32, Legal); |
320 | 2.06k | setOperationAction(ISD::FTRUNC, MVT::f32, Legal); |
321 | 2.06k | setOperationAction(ISD::FMINNUM, MVT::f32, Legal); |
322 | 2.06k | setOperationAction(ISD::FMAXNUM, MVT::f32, Legal); |
323 | 2.06k | |
324 | 2.06k | setOperationAction(ISD::FROUND, MVT::f32, Custom); |
325 | 2.06k | setOperationAction(ISD::FROUND, MVT::f64, Custom); |
326 | 2.06k | |
327 | 2.06k | setOperationAction(ISD::FNEARBYINT, MVT::f32, Custom); |
328 | 2.06k | setOperationAction(ISD::FNEARBYINT, MVT::f64, Custom); |
329 | 2.06k | |
330 | 2.06k | setOperationAction(ISD::FREM, MVT::f32, Custom); |
331 | 2.06k | setOperationAction(ISD::FREM, MVT::f64, Custom); |
332 | 2.06k | |
333 | 2.06k | // v_mad_f32 does not support denormals according to some sources. |
334 | 2.06k | if (!Subtarget->hasFP32Denormals()) |
335 | 2.02k | setOperationAction(ISD::FMAD, MVT::f32, Legal); |
336 | 2.06k | |
337 | 2.06k | // Expand to fneg + fadd. |
338 | 2.06k | setOperationAction(ISD::FSUB, MVT::f64, Expand); |
339 | 2.06k | |
340 | 2.06k | setOperationAction(ISD::CONCAT_VECTORS, MVT::v4i32, Custom); |
341 | 2.06k | setOperationAction(ISD::CONCAT_VECTORS, MVT::v4f32, Custom); |
342 | 2.06k | setOperationAction(ISD::CONCAT_VECTORS, MVT::v8i32, Custom); |
343 | 2.06k | setOperationAction(ISD::CONCAT_VECTORS, MVT::v8f32, Custom); |
344 | 2.06k | setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v2f32, Custom); |
345 | 2.06k | setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v2i32, Custom); |
346 | 2.06k | setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v4f32, Custom); |
347 | 2.06k | setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v4i32, Custom); |
348 | 2.06k | setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v8f32, Custom); |
349 | 2.06k | setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v8i32, Custom); |
350 | 2.06k | |
351 | 2.06k | if (Subtarget->getGeneration() < AMDGPUSubtarget::SEA_ISLANDS2.06k ) { |
352 | 1.08k | setOperationAction(ISD::FCEIL, MVT::f64, Custom); |
353 | 1.08k | setOperationAction(ISD::FTRUNC, MVT::f64, Custom); |
354 | 1.08k | setOperationAction(ISD::FRINT, MVT::f64, Custom); |
355 | 1.08k | setOperationAction(ISD::FFLOOR, MVT::f64, Custom); |
356 | 1.08k | } |
357 | 2.06k | |
358 | 2.06k | if (!Subtarget->hasBFI()2.06k ) { |
359 | 37 | // fcopysign can be done in a single instruction with BFI. |
360 | 37 | setOperationAction(ISD::FCOPYSIGN, MVT::f32, Expand); |
361 | 37 | setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand); |
362 | 37 | } |
363 | 2.06k | |
364 | 2.06k | setOperationAction(ISD::FP16_TO_FP, MVT::f64, Expand); |
365 | 2.06k | setOperationAction(ISD::FP_TO_FP16, MVT::f64, Custom); |
366 | 2.06k | setOperationAction(ISD::FP_TO_FP16, MVT::f32, Custom); |
367 | 2.06k | |
368 | 2.06k | const MVT ScalarIntVTs[] = { MVT::i32, MVT::i64 }; |
369 | 4.13k | for (MVT VT : ScalarIntVTs) { |
370 | 4.13k | // These should use [SU]DIVREM, so set them to expand |
371 | 4.13k | setOperationAction(ISD::SDIV, VT, Expand); |
372 | 4.13k | setOperationAction(ISD::UDIV, VT, Expand); |
373 | 4.13k | setOperationAction(ISD::SREM, VT, Expand); |
374 | 4.13k | setOperationAction(ISD::UREM, VT, Expand); |
375 | 4.13k | |
376 | 4.13k | // GPU does not have divrem function for signed or unsigned. |
377 | 4.13k | setOperationAction(ISD::SDIVREM, VT, Custom); |
378 | 4.13k | setOperationAction(ISD::UDIVREM, VT, Custom); |
379 | 4.13k | |
380 | 4.13k | // GPU does not have [S|U]MUL_LOHI functions as a single instruction. |
381 | 4.13k | setOperationAction(ISD::SMUL_LOHI, VT, Expand); |
382 | 4.13k | setOperationAction(ISD::UMUL_LOHI, VT, Expand); |
383 | 4.13k | |
384 | 4.13k | setOperationAction(ISD::BSWAP, VT, Expand); |
385 | 4.13k | setOperationAction(ISD::CTTZ, VT, Expand); |
386 | 4.13k | setOperationAction(ISD::CTLZ, VT, Expand); |
387 | 4.13k | } |
388 | 2.06k | |
389 | 2.06k | if (!Subtarget->hasBCNT(32)) |
390 | 37 | setOperationAction(ISD::CTPOP, MVT::i32, Expand); |
391 | 2.06k | |
392 | 2.06k | if (!Subtarget->hasBCNT(64)) |
393 | 271 | setOperationAction(ISD::CTPOP, MVT::i64, Expand); |
394 | 2.06k | |
395 | 2.06k | // The hardware supports 32-bit ROTR, but not ROTL. |
396 | 2.06k | setOperationAction(ISD::ROTL, MVT::i32, Expand); |
397 | 2.06k | setOperationAction(ISD::ROTL, MVT::i64, Expand); |
398 | 2.06k | setOperationAction(ISD::ROTR, MVT::i64, Expand); |
399 | 2.06k | |
400 | 2.06k | setOperationAction(ISD::MUL, MVT::i64, Expand); |
401 | 2.06k | setOperationAction(ISD::MULHU, MVT::i64, Expand); |
402 | 2.06k | setOperationAction(ISD::MULHS, MVT::i64, Expand); |
403 | 2.06k | setOperationAction(ISD::UDIV, MVT::i32, Expand); |
404 | 2.06k | setOperationAction(ISD::UREM, MVT::i32, Expand); |
405 | 2.06k | setOperationAction(ISD::UINT_TO_FP, MVT::i64, Custom); |
406 | 2.06k | setOperationAction(ISD::SINT_TO_FP, MVT::i64, Custom); |
407 | 2.06k | setOperationAction(ISD::FP_TO_SINT, MVT::i64, Custom); |
408 | 2.06k | setOperationAction(ISD::FP_TO_UINT, MVT::i64, Custom); |
409 | 2.06k | setOperationAction(ISD::SELECT_CC, MVT::i64, Expand); |
410 | 2.06k | |
411 | 2.06k | setOperationAction(ISD::SMIN, MVT::i32, Legal); |
412 | 2.06k | setOperationAction(ISD::UMIN, MVT::i32, Legal); |
413 | 2.06k | setOperationAction(ISD::SMAX, MVT::i32, Legal); |
414 | 2.06k | setOperationAction(ISD::UMAX, MVT::i32, Legal); |
415 | 2.06k | |
416 | 2.06k | if (Subtarget->hasFFBH()) |
417 | 2.03k | setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32, Custom); |
418 | 2.06k | |
419 | 2.06k | if (Subtarget->hasFFBL()) |
420 | 2.03k | setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32, Legal); |
421 | 2.06k | |
422 | 2.06k | setOperationAction(ISD::CTLZ, MVT::i64, Custom); |
423 | 2.06k | setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i64, Custom); |
424 | 2.06k | |
425 | 2.06k | // We only really have 32-bit BFE instructions (and 16-bit on VI). |
426 | 2.06k | // |
427 | 2.06k | // On SI+ there are 64-bit BFEs, but they are scalar only and there isn't any |
428 | 2.06k | // effort to match them now. We want this to be false for i64 cases when the |
429 | 2.06k | // extraction isn't restricted to the upper or lower half. Ideally we would |
430 | 2.06k | // have some pass reduce 64-bit extracts to 32-bit if possible. Extracts that |
431 | 2.06k | // span the midpoint are probably relatively rare, so don't worry about them |
432 | 2.06k | // for now. |
433 | 2.06k | if (Subtarget->hasBFE()) |
434 | 2.03k | setHasExtractBitsInsn(true); |
435 | 2.06k | |
436 | 2.06k | static const MVT::SimpleValueType VectorIntTypes[] = { |
437 | 2.06k | MVT::v2i32, MVT::v4i32 |
438 | 2.06k | }; |
439 | 2.06k | |
440 | 4.13k | for (MVT VT : VectorIntTypes) { |
441 | 4.13k | // Expand the following operations for the current type by default. |
442 | 4.13k | setOperationAction(ISD::ADD, VT, Expand); |
443 | 4.13k | setOperationAction(ISD::AND, VT, Expand); |
444 | 4.13k | setOperationAction(ISD::FP_TO_SINT, VT, Expand); |
445 | 4.13k | setOperationAction(ISD::FP_TO_UINT, VT, Expand); |
446 | 4.13k | setOperationAction(ISD::MUL, VT, Expand); |
447 | 4.13k | setOperationAction(ISD::MULHU, VT, Expand); |
448 | 4.13k | setOperationAction(ISD::MULHS, VT, Expand); |
449 | 4.13k | setOperationAction(ISD::OR, VT, Expand); |
450 | 4.13k | setOperationAction(ISD::SHL, VT, Expand); |
451 | 4.13k | setOperationAction(ISD::SRA, VT, Expand); |
452 | 4.13k | setOperationAction(ISD::SRL, VT, Expand); |
453 | 4.13k | setOperationAction(ISD::ROTL, VT, Expand); |
454 | 4.13k | setOperationAction(ISD::ROTR, VT, Expand); |
455 | 4.13k | setOperationAction(ISD::SUB, VT, Expand); |
456 | 4.13k | setOperationAction(ISD::SINT_TO_FP, VT, Expand); |
457 | 4.13k | setOperationAction(ISD::UINT_TO_FP, VT, Expand); |
458 | 4.13k | setOperationAction(ISD::SDIV, VT, Expand); |
459 | 4.13k | setOperationAction(ISD::UDIV, VT, Expand); |
460 | 4.13k | setOperationAction(ISD::SREM, VT, Expand); |
461 | 4.13k | setOperationAction(ISD::UREM, VT, Expand); |
462 | 4.13k | setOperationAction(ISD::SMUL_LOHI, VT, Expand); |
463 | 4.13k | setOperationAction(ISD::UMUL_LOHI, VT, Expand); |
464 | 4.13k | setOperationAction(ISD::SDIVREM, VT, Custom); |
465 | 4.13k | setOperationAction(ISD::UDIVREM, VT, Expand); |
466 | 4.13k | setOperationAction(ISD::ADDC, VT, Expand); |
467 | 4.13k | setOperationAction(ISD::SUBC, VT, Expand); |
468 | 4.13k | setOperationAction(ISD::ADDE, VT, Expand); |
469 | 4.13k | setOperationAction(ISD::SUBE, VT, Expand); |
470 | 4.13k | setOperationAction(ISD::SELECT, VT, Expand); |
471 | 4.13k | setOperationAction(ISD::VSELECT, VT, Expand); |
472 | 4.13k | setOperationAction(ISD::SELECT_CC, VT, Expand); |
473 | 4.13k | setOperationAction(ISD::XOR, VT, Expand); |
474 | 4.13k | setOperationAction(ISD::BSWAP, VT, Expand); |
475 | 4.13k | setOperationAction(ISD::CTPOP, VT, Expand); |
476 | 4.13k | setOperationAction(ISD::CTTZ, VT, Expand); |
477 | 4.13k | setOperationAction(ISD::CTLZ, VT, Expand); |
478 | 4.13k | setOperationAction(ISD::VECTOR_SHUFFLE, VT, Expand); |
479 | 4.13k | } |
480 | 2.06k | |
481 | 2.06k | static const MVT::SimpleValueType FloatVectorTypes[] = { |
482 | 2.06k | MVT::v2f32, MVT::v4f32 |
483 | 2.06k | }; |
484 | 2.06k | |
485 | 4.13k | for (MVT VT : FloatVectorTypes) { |
486 | 4.13k | setOperationAction(ISD::FABS, VT, Expand); |
487 | 4.13k | setOperationAction(ISD::FMINNUM, VT, Expand); |
488 | 4.13k | setOperationAction(ISD::FMAXNUM, VT, Expand); |
489 | 4.13k | setOperationAction(ISD::FADD, VT, Expand); |
490 | 4.13k | setOperationAction(ISD::FCEIL, VT, Expand); |
491 | 4.13k | setOperationAction(ISD::FCOS, VT, Expand); |
492 | 4.13k | setOperationAction(ISD::FDIV, VT, Expand); |
493 | 4.13k | setOperationAction(ISD::FEXP2, VT, Expand); |
494 | 4.13k | setOperationAction(ISD::FLOG2, VT, Expand); |
495 | 4.13k | setOperationAction(ISD::FREM, VT, Expand); |
496 | 4.13k | setOperationAction(ISD::FPOW, VT, Expand); |
497 | 4.13k | setOperationAction(ISD::FFLOOR, VT, Expand); |
498 | 4.13k | setOperationAction(ISD::FTRUNC, VT, Expand); |
499 | 4.13k | setOperationAction(ISD::FMUL, VT, Expand); |
500 | 4.13k | setOperationAction(ISD::FMA, VT, Expand); |
501 | 4.13k | setOperationAction(ISD::FRINT, VT, Expand); |
502 | 4.13k | setOperationAction(ISD::FNEARBYINT, VT, Expand); |
503 | 4.13k | setOperationAction(ISD::FSQRT, VT, Expand); |
504 | 4.13k | setOperationAction(ISD::FSIN, VT, Expand); |
505 | 4.13k | setOperationAction(ISD::FSUB, VT, Expand); |
506 | 4.13k | setOperationAction(ISD::FNEG, VT, Expand); |
507 | 4.13k | setOperationAction(ISD::VSELECT, VT, Expand); |
508 | 4.13k | setOperationAction(ISD::SELECT_CC, VT, Expand); |
509 | 4.13k | setOperationAction(ISD::FCOPYSIGN, VT, Expand); |
510 | 4.13k | setOperationAction(ISD::VECTOR_SHUFFLE, VT, Expand); |
511 | 4.13k | } |
512 | 2.06k | |
513 | 2.06k | // This causes using an unrolled select operation rather than expansion with |
514 | 2.06k | // bit operations. This is in general better, but the alternative using BFI |
515 | 2.06k | // instructions may be better if the select sources are SGPRs. |
516 | 2.06k | setOperationAction(ISD::SELECT, MVT::v2f32, Promote); |
517 | 2.06k | AddPromotedToType(ISD::SELECT, MVT::v2f32, MVT::v2i32); |
518 | 2.06k | |
519 | 2.06k | setOperationAction(ISD::SELECT, MVT::v4f32, Promote); |
520 | 2.06k | AddPromotedToType(ISD::SELECT, MVT::v4f32, MVT::v4i32); |
521 | 2.06k | |
522 | 2.06k | // There are no libcalls of any kind. |
523 | 884k | for (int I = 0; I < RTLIB::UNKNOWN_LIBCALL884k ; ++I882k ) |
524 | 882k | setLibcallName(static_cast<RTLIB::Libcall>(I), nullptr); |
525 | 2.06k | |
526 | 2.06k | setBooleanContents(ZeroOrNegativeOneBooleanContent); |
527 | 2.06k | setBooleanVectorContents(ZeroOrNegativeOneBooleanContent); |
528 | 2.06k | |
529 | 2.06k | setSchedulingPreference(Sched::RegPressure); |
530 | 2.06k | setJumpIsExpensive(true); |
531 | 2.06k | |
532 | 2.06k | // FIXME: This is only partially true. If we have to do vector compares, any |
533 | 2.06k | // SGPR pair can be a condition register. If we have a uniform condition, we |
534 | 2.06k | // are better off doing SALU operations, where there is only one SCC. For now, |
535 | 2.06k | // we don't have a way of knowing during instruction selection if a condition |
536 | 2.06k | // will be uniform and we always use vector compares. Assume we are using |
537 | 2.06k | // vector compares until that is fixed. |
538 | 2.06k | setHasMultipleConditionRegisters(true); |
539 | 2.06k | |
540 | 2.06k | // SI at least has hardware support for floating point exceptions, but no way |
541 | 2.06k | // of using or handling them is implemented. They are also optional in OpenCL |
542 | 2.06k | // (Section 7.3) |
543 | 2.06k | setHasFloatingPointExceptions(Subtarget->hasFPExceptions()); |
544 | 2.06k | |
545 | 2.06k | PredictableSelectIsExpensive = false; |
546 | 2.06k | |
547 | 2.06k | // We want to find all load dependencies for long chains of stores to enable |
548 | 2.06k | // merging into very wide vectors. The problem is with vectors with > 4 |
549 | 2.06k | // elements. MergeConsecutiveStores will attempt to merge these because x8/x16 |
550 | 2.06k | // vectors are a legal type, even though we have to split the loads |
551 | 2.06k | // usually. When we can more precisely specify load legality per address |
552 | 2.06k | // space, we should be able to make FindBetterChain/MergeConsecutiveStores |
553 | 2.06k | // smarter so that they can figure out what to do in 2 iterations without all |
554 | 2.06k | // N > 4 stores on the same chain. |
555 | 2.06k | GatherAllAliasesMaxDepth = 16; |
556 | 2.06k | |
557 | 2.06k | // memcpy/memmove/memset are expanded in the IR, so we shouldn't need to worry |
558 | 2.06k | // about these during lowering. |
559 | 2.06k | MaxStoresPerMemcpy = 0xffffffff; |
560 | 2.06k | MaxStoresPerMemmove = 0xffffffff; |
561 | 2.06k | MaxStoresPerMemset = 0xffffffff; |
562 | 2.06k | |
563 | 2.06k | setTargetDAGCombine(ISD::BITCAST); |
564 | 2.06k | setTargetDAGCombine(ISD::SHL); |
565 | 2.06k | setTargetDAGCombine(ISD::SRA); |
566 | 2.06k | setTargetDAGCombine(ISD::SRL); |
567 | 2.06k | setTargetDAGCombine(ISD::MUL); |
568 | 2.06k | setTargetDAGCombine(ISD::MULHU); |
569 | 2.06k | setTargetDAGCombine(ISD::MULHS); |
570 | 2.06k | setTargetDAGCombine(ISD::SELECT); |
571 | 2.06k | setTargetDAGCombine(ISD::SELECT_CC); |
572 | 2.06k | setTargetDAGCombine(ISD::STORE); |
573 | 2.06k | setTargetDAGCombine(ISD::FADD); |
574 | 2.06k | setTargetDAGCombine(ISD::FSUB); |
575 | 2.06k | setTargetDAGCombine(ISD::FNEG); |
576 | 2.06k | setTargetDAGCombine(ISD::FABS); |
577 | 2.06k | setTargetDAGCombine(ISD::AssertZext); |
578 | 2.06k | setTargetDAGCombine(ISD::AssertSext); |
579 | 2.06k | } |
580 | | |
581 | | //===----------------------------------------------------------------------===// |
582 | | // Target Information |
583 | | //===----------------------------------------------------------------------===// |
584 | | |
585 | | LLVM_READNONE |
586 | 728 | static bool fnegFoldsIntoOp(unsigned Opc) { |
587 | 728 | switch (Opc) { |
588 | 208 | case ISD::FADD: |
589 | 208 | case ISD::FSUB: |
590 | 208 | case ISD::FMUL: |
591 | 208 | case ISD::FMA: |
592 | 208 | case ISD::FMAD: |
593 | 208 | case ISD::FMINNUM: |
594 | 208 | case ISD::FMAXNUM: |
595 | 208 | case ISD::FSIN: |
596 | 208 | case ISD::FTRUNC: |
597 | 208 | case ISD::FRINT: |
598 | 208 | case ISD::FNEARBYINT: |
599 | 208 | case AMDGPUISD::RCP: |
600 | 208 | case AMDGPUISD::RCP_LEGACY: |
601 | 208 | case AMDGPUISD::SIN_HW: |
602 | 208 | case AMDGPUISD::FMUL_LEGACY: |
603 | 208 | case AMDGPUISD::FMIN_LEGACY: |
604 | 208 | case AMDGPUISD::FMAX_LEGACY: |
605 | 208 | return true; |
606 | 520 | default: |
607 | 520 | return false; |
608 | 0 | } |
609 | 0 | } |
610 | | |
611 | | /// \p returns true if the operation will definitely need to use a 64-bit |
612 | | /// encoding, and thus will use a VOP3 encoding regardless of the source |
613 | | /// modifiers. |
614 | | LLVM_READONLY |
615 | 1.87k | static bool opMustUseVOP3Encoding(const SDNode *N, MVT VT) { |
616 | 1.15k | return N->getNumOperands() > 2 || VT == MVT::f64; |
617 | 1.87k | } |
618 | | |
619 | | // Most FP instructions support source modifiers, but this could be refined |
620 | | // slightly. |
621 | | LLVM_READONLY |
622 | 2.47k | static bool hasSourceMods(const SDNode *N) { |
623 | 2.47k | if (isa<MemSDNode>(N)) |
624 | 255 | return false; |
625 | 2.22k | |
626 | 2.22k | switch (N->getOpcode()) { |
627 | 344 | case ISD::CopyToReg: |
628 | 344 | case ISD::SELECT: |
629 | 344 | case ISD::FDIV: |
630 | 344 | case ISD::FREM: |
631 | 344 | case ISD::INLINEASM: |
632 | 344 | case AMDGPUISD::INTERP_P1: |
633 | 344 | case AMDGPUISD::INTERP_P2: |
634 | 344 | case AMDGPUISD::DIV_SCALE: |
635 | 344 | |
636 | 344 | // TODO: Should really be looking at the users of the bitcast. These are |
637 | 344 | // problematic because bitcasts are used to legalize all stores to integer |
638 | 344 | // types. |
639 | 344 | case ISD::BITCAST: |
640 | 344 | return false; |
641 | 1.87k | default: |
642 | 1.87k | return true; |
643 | 0 | } |
644 | 0 | } |
645 | | |
646 | | bool AMDGPUTargetLowering::allUsesHaveSourceMods(const SDNode *N, |
647 | 2.38k | unsigned CostThreshold) { |
648 | 2.38k | // Some users (such as 3-operand FMA/MAD) must use a VOP3 encoding, and thus |
649 | 2.38k | // it is truly free to use a source modifier in all cases. If there are |
650 | 2.38k | // multiple users but for each one will necessitate using VOP3, there will be |
651 | 2.38k | // a code size increase. Try to avoid increasing code size unless we know it |
652 | 2.38k | // will save on the instruction count. |
653 | 2.38k | unsigned NumMayIncreaseSize = 0; |
654 | 2.38k | MVT VT = N->getValueType(0).getScalarType().getSimpleVT(); |
655 | 2.38k | |
656 | 2.38k | // XXX - Should this limit number of uses to check? |
657 | 2.47k | for (const SDNode *U : N->uses()) { |
658 | 2.47k | if (!hasSourceMods(U)) |
659 | 599 | return false; |
660 | 1.87k | |
661 | 1.87k | if (1.87k !opMustUseVOP3Encoding(U, VT)1.87k ) { |
662 | 946 | if (++NumMayIncreaseSize > CostThreshold) |
663 | 830 | return false; |
664 | 952 | } |
665 | 2.47k | } |
666 | 952 | |
667 | 952 | return true; |
668 | 952 | } |
669 | | |
670 | 84.3k | MVT AMDGPUTargetLowering::getVectorIdxTy(const DataLayout &) const { |
671 | 84.3k | return MVT::i32; |
672 | 84.3k | } |
673 | | |
674 | 1.48k | bool AMDGPUTargetLowering::isSelectSupported(SelectSupportKind SelType) const { |
675 | 1.48k | return true; |
676 | 1.48k | } |
677 | | |
678 | | // The backend supports 32 and 64 bit floating point immediates. |
679 | | // FIXME: Why are we reporting vectors of FP immediates as legal? |
680 | 0 | bool AMDGPUTargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const { |
681 | 0 | EVT ScalarVT = VT.getScalarType(); |
682 | 0 | return (ScalarVT == MVT::f32 || ScalarVT == MVT::f64 || |
683 | 0 | (ScalarVT == MVT::f16 && 0 Subtarget->has16BitInsts()0 )); |
684 | 0 | } |
685 | | |
686 | | // We don't want to shrink f64 / f32 constants. |
687 | 0 | bool AMDGPUTargetLowering::ShouldShrinkFPConstant(EVT VT) const { |
688 | 0 | EVT ScalarVT = VT.getScalarType(); |
689 | 0 | return (ScalarVT != MVT::f32 && ScalarVT != MVT::f64); |
690 | 0 | } |
691 | | |
692 | | bool AMDGPUTargetLowering::shouldReduceLoadWidth(SDNode *N, |
693 | | ISD::LoadExtType, |
694 | 1.47k | EVT NewVT) const { |
695 | 1.47k | |
696 | 1.47k | unsigned NewSize = NewVT.getStoreSizeInBits(); |
697 | 1.47k | |
698 | 1.47k | // If we are reducing to a 32-bit load, this is always better. |
699 | 1.47k | if (NewSize == 32) |
700 | 313 | return true; |
701 | 1.15k | |
702 | 1.15k | EVT OldVT = N->getValueType(0); |
703 | 1.15k | unsigned OldSize = OldVT.getStoreSizeInBits(); |
704 | 1.15k | |
705 | 1.15k | // Don't produce extloads from sub 32-bit types. SI doesn't have scalar |
706 | 1.15k | // extloads, so doing one requires using a buffer_load. In cases where we |
707 | 1.15k | // still couldn't use a scalar load, using the wider load shouldn't really |
708 | 1.15k | // hurt anything. |
709 | 1.15k | |
710 | 1.15k | // If the old size already had to be an extload, there's no harm in continuing |
711 | 1.15k | // to reduce the width. |
712 | 1.15k | return (OldSize < 32); |
713 | 1.15k | } |
714 | | |
715 | | bool AMDGPUTargetLowering::isLoadBitCastBeneficial(EVT LoadTy, |
716 | 12.1k | EVT CastTy) const { |
717 | 12.1k | |
718 | 12.1k | assert(LoadTy.getSizeInBits() == CastTy.getSizeInBits()); |
719 | 12.1k | |
720 | 12.1k | if (LoadTy.getScalarType() == MVT::i32) |
721 | 10.7k | return false; |
722 | 1.48k | |
723 | 1.48k | unsigned LScalarSize = LoadTy.getScalarSizeInBits(); |
724 | 1.48k | unsigned CastScalarSize = CastTy.getScalarSizeInBits(); |
725 | 1.48k | |
726 | 1.48k | return (LScalarSize < CastScalarSize) || |
727 | 1.24k | (CastScalarSize >= 32); |
728 | 12.1k | } |
729 | | |
730 | | // SI+ has instructions for cttz / ctlz for 32-bit values. This is probably also |
731 | | // profitable with the expansion for 64-bit since it's generally good to |
732 | | // speculate things. |
733 | | // FIXME: These should really have the size as a parameter. |
734 | 12 | bool AMDGPUTargetLowering::isCheapToSpeculateCttz() const { |
735 | 12 | return true; |
736 | 12 | } |
737 | | |
738 | 60 | bool AMDGPUTargetLowering::isCheapToSpeculateCtlz() const { |
739 | 60 | return true; |
740 | 60 | } |
741 | | |
742 | | //===---------------------------------------------------------------------===// |
743 | | // Target Properties |
744 | | //===---------------------------------------------------------------------===// |
745 | | |
746 | 1.91k | bool AMDGPUTargetLowering::isFAbsFree(EVT VT) const { |
747 | 1.91k | assert(VT.isFloatingPoint()); |
748 | 1.91k | |
749 | 1.91k | // Packed operations do not have a fabs modifier. |
750 | 685 | return VT == MVT::f32 || VT == MVT::f64 || |
751 | 450 | (Subtarget->has16BitInsts() && 450 VT == MVT::f16362 ); |
752 | 1.91k | } |
753 | | |
754 | 3.18k | bool AMDGPUTargetLowering::isFNegFree(EVT VT) const { |
755 | 3.18k | assert(VT.isFloatingPoint()); |
756 | 1.06k | return VT == MVT::f32 || VT == MVT::f64 || |
757 | 542 | (Subtarget->has16BitInsts() && 542 VT == MVT::f16453 ) || |
758 | 219 | (Subtarget->hasVOP3PInsts() && 219 VT == MVT::v2f1678 ); |
759 | 3.18k | } |
760 | | |
761 | | bool AMDGPUTargetLowering:: storeOfVectorConstantIsCheap(EVT MemVT, |
762 | | unsigned NumElem, |
763 | 3.96k | unsigned AS) const { |
764 | 3.96k | return true; |
765 | 3.96k | } |
766 | | |
767 | 23.5k | bool AMDGPUTargetLowering::aggressivelyPreferBuildVectorSources(EVT VecVT) const { |
768 | 23.5k | // There are few operations which truly have vector input operands. Any vector |
769 | 23.5k | // operation is going to involve operations on each component, and a |
770 | 23.5k | // build_vector will be a copy per element, so it always makes sense to use a |
771 | 23.5k | // build_vector input in place of the extracted element to avoid a copy into a |
772 | 23.5k | // super register. |
773 | 23.5k | // |
774 | 23.5k | // We should probably only do this if all users are extracts only, but this |
775 | 23.5k | // should be the common case. |
776 | 23.5k | return true; |
777 | 23.5k | } |
778 | | |
779 | 11.8k | bool AMDGPUTargetLowering::isTruncateFree(EVT Source, EVT Dest) const { |
780 | 11.8k | // Truncate is just accessing a subregister. |
781 | 11.8k | |
782 | 11.8k | unsigned SrcSize = Source.getSizeInBits(); |
783 | 11.8k | unsigned DestSize = Dest.getSizeInBits(); |
784 | 11.8k | |
785 | 11.8k | return DestSize < SrcSize && DestSize % 32 == 0 ; |
786 | 11.8k | } |
787 | | |
788 | 498 | bool AMDGPUTargetLowering::isTruncateFree(Type *Source, Type *Dest) const { |
789 | 498 | // Truncate is just accessing a subregister. |
790 | 498 | |
791 | 498 | unsigned SrcSize = Source->getScalarSizeInBits(); |
792 | 498 | unsigned DestSize = Dest->getScalarSizeInBits(); |
793 | 498 | |
794 | 498 | if (DestSize== 16 && 498 Subtarget->has16BitInsts()19 ) |
795 | 17 | return SrcSize >= 32; |
796 | 481 | |
797 | 481 | return DestSize < SrcSize && 481 DestSize % 32 == 0231 ; |
798 | 498 | } |
799 | | |
800 | 2 | bool AMDGPUTargetLowering::isZExtFree(Type *Src, Type *Dest) const { |
801 | 2 | unsigned SrcSize = Src->getScalarSizeInBits(); |
802 | 2 | unsigned DestSize = Dest->getScalarSizeInBits(); |
803 | 2 | |
804 | 2 | if (SrcSize == 16 && 2 Subtarget->has16BitInsts()0 ) |
805 | 0 | return DestSize >= 32; |
806 | 2 | |
807 | 2 | return SrcSize == 32 && 2 DestSize == 642 ; |
808 | 2 | } |
809 | | |
810 | 5.93k | bool AMDGPUTargetLowering::isZExtFree(EVT Src, EVT Dest) const { |
811 | 5.93k | // Any register load of a 64-bit value really requires 2 32-bit moves. For all |
812 | 5.93k | // practical purposes, the extra mov 0 to load a 64-bit is free. As used, |
813 | 5.93k | // this will enable reducing 64-bit operations the 32-bit, which is always |
814 | 5.93k | // good. |
815 | 5.93k | |
816 | 5.93k | if (Src == MVT::i16) |
817 | 9 | return Dest == MVT::i32 ||9 Dest == MVT::i643 ; |
818 | 5.93k | |
819 | 5.93k | return Src == MVT::i32 && 5.93k Dest == MVT::i641.89k ; |
820 | 5.93k | } |
821 | | |
822 | 5.81k | bool AMDGPUTargetLowering::isZExtFree(SDValue Val, EVT VT2) const { |
823 | 5.81k | return isZExtFree(Val.getValueType(), VT2); |
824 | 5.81k | } |
825 | | |
826 | 6.72k | bool AMDGPUTargetLowering::isNarrowingProfitable(EVT SrcVT, EVT DestVT) const { |
827 | 6.72k | // There aren't really 64-bit registers, but pairs of 32-bit ones and only a |
828 | 6.72k | // limited number of native 64-bit operations. Shrinking an operation to fit |
829 | 6.72k | // in a single 32-bit register should always be helpful. As currently used, |
830 | 6.72k | // this is much less general than the name suggests, and is only used in |
831 | 6.72k | // places trying to reduce the sizes of loads. Shrinking loads to < 32-bits is |
832 | 6.72k | // not profitable, and may actually be harmful. |
833 | 26 | return SrcVT.getSizeInBits() > 32 && DestVT.getSizeInBits() == 32; |
834 | 6.72k | } |
835 | | |
836 | | //===---------------------------------------------------------------------===// |
837 | | // TargetLowering Callbacks |
838 | | //===---------------------------------------------------------------------===// |
839 | | |
840 | | CCAssignFn *AMDGPUCallLowering::CCAssignFnForCall(CallingConv::ID CC, |
841 | 2.05k | bool IsVarArg) { |
842 | 2.05k | switch (CC) { |
843 | 26 | case CallingConv::AMDGPU_KERNEL: |
844 | 26 | case CallingConv::SPIR_KERNEL: |
845 | 26 | return CC_AMDGPU_Kernel; |
846 | 604 | case CallingConv::AMDGPU_VS: |
847 | 604 | case CallingConv::AMDGPU_GS: |
848 | 604 | case CallingConv::AMDGPU_PS: |
849 | 604 | case CallingConv::AMDGPU_CS: |
850 | 604 | case CallingConv::AMDGPU_HS: |
851 | 604 | return CC_AMDGPU; |
852 | 1.42k | case CallingConv::C: |
853 | 1.42k | case CallingConv::Fast: |
854 | 1.42k | case CallingConv::Cold: |
855 | 1.42k | return CC_AMDGPU_Func; |
856 | 0 | default: |
857 | 0 | report_fatal_error("Unsupported calling convention."); |
858 | 0 | } |
859 | 0 | } |
860 | | |
861 | | CCAssignFn *AMDGPUCallLowering::CCAssignFnForReturn(CallingConv::ID CC, |
862 | 3.12k | bool IsVarArg) { |
863 | 3.12k | switch (CC) { |
864 | 0 | case CallingConv::AMDGPU_KERNEL: |
865 | 0 | case CallingConv::SPIR_KERNEL: |
866 | 0 | return CC_AMDGPU_Kernel; |
867 | 549 | case CallingConv::AMDGPU_VS: |
868 | 549 | case CallingConv::AMDGPU_GS: |
869 | 549 | case CallingConv::AMDGPU_PS: |
870 | 549 | case CallingConv::AMDGPU_CS: |
871 | 549 | case CallingConv::AMDGPU_HS: |
872 | 549 | return RetCC_SI_Shader; |
873 | 2.58k | case CallingConv::C: |
874 | 2.58k | case CallingConv::Fast: |
875 | 2.58k | case CallingConv::Cold: |
876 | 2.58k | return RetCC_AMDGPU_Func; |
877 | 0 | default: |
878 | 0 | report_fatal_error("Unsupported calling convention."); |
879 | 0 | } |
880 | 0 | } |
881 | | |
882 | | /// The SelectionDAGBuilder will automatically promote function arguments |
883 | | /// with illegal types. However, this does not work for the AMDGPU targets |
884 | | /// since the function arguments are stored in memory as these illegal types. |
885 | | /// In order to handle this properly we need to get the original types sizes |
886 | | /// from the LLVM IR Function and fixup the ISD:InputArg values before |
887 | | /// passing them to AnalyzeFormalArguments() |
888 | | |
889 | | /// When the SelectionDAGBuilder computes the Ins, it takes care of splitting |
890 | | /// input values across multiple registers. Each item in the Ins array |
891 | | /// represents a single value that will be stored in registers. Ins[x].VT is |
892 | | /// the value type of the value that will be stored in the register, so |
893 | | /// whatever SDNode we lower the argument to needs to be this type. |
894 | | /// |
895 | | /// In order to correctly lower the arguments we need to know the size of each |
896 | | /// argument. Since Ins[x].VT gives us the size of the register that will |
897 | | /// hold the value, we need to look at Ins[x].ArgVT to see the 'real' type |
898 | | /// for the orignal function argument so that we can deduce the correct memory |
899 | | /// type to use for Ins[x]. In most cases the correct memory type will be |
900 | | /// Ins[x].ArgVT. However, this will not always be the case. If, for example, |
901 | | /// we have a kernel argument of type v8i8, this argument will be split into |
902 | | /// 8 parts and each part will be represented by its own item in the Ins array. |
903 | | /// For each part the Ins[x].ArgVT will be the v8i8, which is the full type of |
904 | | /// the argument before it was split. From this, we deduce that the memory type |
905 | | /// for each individual part is i8. We pass the memory type as LocVT to the |
906 | | /// calling convention analysis function and the register type (Ins[x].VT) as |
907 | | /// the ValVT. |
908 | | void AMDGPUTargetLowering::analyzeFormalArgumentsCompute(CCState &State, |
909 | 15.6k | const SmallVectorImpl<ISD::InputArg> &Ins) const { |
910 | 52.9k | for (unsigned i = 0, e = Ins.size(); i != e52.9k ; ++i37.2k ) { |
911 | 37.2k | const ISD::InputArg &In = Ins[i]; |
912 | 37.2k | EVT MemVT; |
913 | 37.2k | |
914 | 37.2k | unsigned NumRegs = getNumRegisters(State.getContext(), In.ArgVT); |
915 | 37.2k | |
916 | 37.2k | if (!Subtarget->isAmdHsaOS() && |
917 | 37.2k | (In.ArgVT == MVT::i16 || 33.9k In.ArgVT == MVT::i833.7k || In.ArgVT == MVT::f1633.2k )) { |
918 | 694 | // The ABI says the caller will extend these values to 32-bits. |
919 | 694 | MemVT = In.ArgVT.isInteger() ? MVT::i32660 : MVT::f3234 ; |
920 | 37.2k | } else if (36.5k NumRegs == 136.5k ) { |
921 | 33.4k | // This argument is not split, so the IR type is the memory type. |
922 | 33.4k | assert(!In.Flags.isSplit()); |
923 | 33.4k | if (In.ArgVT.isExtended()33.4k ) { |
924 | 78 | // We have an extended type, like i24, so we should just use the register type |
925 | 78 | MemVT = In.VT; |
926 | 33.4k | } else { |
927 | 33.3k | MemVT = In.ArgVT; |
928 | 33.3k | } |
929 | 36.5k | } else if (3.17k In.ArgVT.isVector() && 3.17k In.VT.isVector()2.64k && |
930 | 3.17k | In.ArgVT.getScalarType() == In.VT.getScalarType()702 ) { |
931 | 702 | assert(In.ArgVT.getVectorNumElements() > In.VT.getVectorNumElements()); |
932 | 702 | // We have a vector value which has been split into a vector with |
933 | 702 | // the same scalar type, but fewer elements. This should handle |
934 | 702 | // all the floating-point vector types. |
935 | 702 | MemVT = In.VT; |
936 | 3.17k | } else if (2.47k In.ArgVT.isVector() && |
937 | 2.47k | In.ArgVT.getVectorNumElements() == NumRegs1.94k ) { |
938 | 1.93k | // This arg has been split so that each element is stored in a separate |
939 | 1.93k | // register. |
940 | 1.93k | MemVT = In.ArgVT.getScalarType(); |
941 | 2.47k | } else if (542 In.ArgVT.isExtended()542 ) { |
942 | 20 | // We have an extended type, like i65. |
943 | 20 | MemVT = In.VT; |
944 | 542 | } else { |
945 | 522 | unsigned MemoryBits = In.ArgVT.getStoreSizeInBits() / NumRegs; |
946 | 522 | assert(In.ArgVT.getStoreSizeInBits() % NumRegs == 0); |
947 | 522 | if (In.VT.isInteger()522 ) { |
948 | 522 | MemVT = EVT::getIntegerVT(State.getContext(), MemoryBits); |
949 | 522 | } else if (0 In.VT.isVector()0 ) { |
950 | 0 | assert(!In.VT.getScalarType().isFloatingPoint()); |
951 | 0 | unsigned NumElements = In.VT.getVectorNumElements(); |
952 | 0 | assert(MemoryBits % NumElements == 0); |
953 | 0 | // This vector type has been split into another vector type with |
954 | 0 | // a different elements size. |
955 | 0 | EVT ScalarVT = EVT::getIntegerVT(State.getContext(), |
956 | 0 | MemoryBits / NumElements); |
957 | 0 | MemVT = EVT::getVectorVT(State.getContext(), ScalarVT, NumElements); |
958 | 0 | } else { |
959 | 0 | llvm_unreachable("cannot deduce memory type."); |
960 | 0 | } |
961 | 37.2k | } |
962 | 37.2k | |
963 | 37.2k | // Convert one element vectors to scalar. |
964 | 37.2k | if (37.2k MemVT.isVector() && 37.2k MemVT.getVectorNumElements() == 11.73k ) |
965 | 50 | MemVT = MemVT.getScalarType(); |
966 | 37.2k | |
967 | 37.2k | if (MemVT.isExtended()37.2k ) { |
968 | 0 | // This should really only happen if we have vec3 arguments |
969 | 0 | assert(MemVT.isVector() && MemVT.getVectorNumElements() == 3); |
970 | 0 | MemVT = MemVT.getPow2VectorType(State.getContext()); |
971 | 0 | } |
972 | 37.2k | |
973 | 37.2k | assert(MemVT.isSimple()); |
974 | 37.2k | allocateKernArg(i, In.VT, MemVT.getSimpleVT(), CCValAssign::Full, In.Flags, |
975 | 37.2k | State); |
976 | 37.2k | } |
977 | 15.6k | } |
978 | | |
979 | | SDValue AMDGPUTargetLowering::LowerReturn( |
980 | | SDValue Chain, CallingConv::ID CallConv, |
981 | | bool isVarArg, |
982 | | const SmallVectorImpl<ISD::OutputArg> &Outs, |
983 | | const SmallVectorImpl<SDValue> &OutVals, |
984 | 15.6k | const SDLoc &DL, SelectionDAG &DAG) const { |
985 | 15.6k | // FIXME: Fails for r600 tests |
986 | 15.6k | //assert(!isVarArg && Outs.empty() && OutVals.empty() && |
987 | 15.6k | // "wave terminate should not have return values"); |
988 | 15.6k | return DAG.getNode(AMDGPUISD::ENDPGM, DL, MVT::Other, Chain); |
989 | 15.6k | } |
990 | | |
991 | | //===---------------------------------------------------------------------===// |
992 | | // Target specific lowering |
993 | | //===---------------------------------------------------------------------===// |
994 | | |
995 | | /// Selects the correct CCAssignFn for a given CallingConvention value. |
996 | | CCAssignFn *AMDGPUTargetLowering::CCAssignFnForCall(CallingConv::ID CC, |
997 | 2.01k | bool IsVarArg) { |
998 | 2.01k | return AMDGPUCallLowering::CCAssignFnForCall(CC, IsVarArg); |
999 | 2.01k | } |
1000 | | |
1001 | | CCAssignFn *AMDGPUTargetLowering::CCAssignFnForReturn(CallingConv::ID CC, |
1002 | 3.12k | bool IsVarArg) { |
1003 | 3.12k | return AMDGPUCallLowering::CCAssignFnForReturn(CC, IsVarArg); |
1004 | 3.12k | } |
1005 | | |
1006 | | SDValue AMDGPUTargetLowering::addTokenForArgument(SDValue Chain, |
1007 | | SelectionDAG &DAG, |
1008 | | MachineFrameInfo &MFI, |
1009 | 27 | int ClobberedFI) const { |
1010 | 27 | SmallVector<SDValue, 8> ArgChains; |
1011 | 27 | int64_t FirstByte = MFI.getObjectOffset(ClobberedFI); |
1012 | 27 | int64_t LastByte = FirstByte + MFI.getObjectSize(ClobberedFI) - 1; |
1013 | 27 | |
1014 | 27 | // Include the original chain at the beginning of the list. When this is |
1015 | 27 | // used by target LowerCall hooks, this helps legalize find the |
1016 | 27 | // CALLSEQ_BEGIN node. |
1017 | 27 | ArgChains.push_back(Chain); |
1018 | 27 | |
1019 | 27 | // Add a chain value for each stack argument corresponding |
1020 | 27 | for (SDNode::use_iterator U = DAG.getEntryNode().getNode()->use_begin(), |
1021 | 27 | UE = DAG.getEntryNode().getNode()->use_end(); |
1022 | 918 | U != UE918 ; ++U891 ) { |
1023 | 891 | if (LoadSDNode *L891 = dyn_cast<LoadSDNode>(*U)) { |
1024 | 72 | if (FrameIndexSDNode *FI72 = dyn_cast<FrameIndexSDNode>(L->getBasePtr())) { |
1025 | 72 | if (FI->getIndex() < 072 ) { |
1026 | 72 | int64_t InFirstByte = MFI.getObjectOffset(FI->getIndex()); |
1027 | 72 | int64_t InLastByte = InFirstByte; |
1028 | 72 | InLastByte += MFI.getObjectSize(FI->getIndex()) - 1; |
1029 | 72 | |
1030 | 72 | if ((InFirstByte <= FirstByte && 72 FirstByte <= InLastByte12 ) || |
1031 | 60 | (FirstByte <= InFirstByte && 60 InFirstByte <= LastByte60 )) |
1032 | 12 | ArgChains.push_back(SDValue(L, 1)); |
1033 | 72 | } |
1034 | 72 | } |
1035 | 72 | } |
1036 | 891 | } |
1037 | 27 | |
1038 | 27 | // Build a tokenfactor for all the chains. |
1039 | 27 | return DAG.getNode(ISD::TokenFactor, SDLoc(Chain), MVT::Other, ArgChains); |
1040 | 27 | } |
1041 | | |
1042 | | SDValue AMDGPUTargetLowering::lowerUnhandledCall(CallLoweringInfo &CLI, |
1043 | | SmallVectorImpl<SDValue> &InVals, |
1044 | 20 | StringRef Reason) const { |
1045 | 20 | SDValue Callee = CLI.Callee; |
1046 | 20 | SelectionDAG &DAG = CLI.DAG; |
1047 | 20 | |
1048 | 20 | const Function &Fn = *DAG.getMachineFunction().getFunction(); |
1049 | 20 | |
1050 | 20 | StringRef FuncName("<unknown>"); |
1051 | 20 | |
1052 | 20 | if (const ExternalSymbolSDNode *G = dyn_cast<ExternalSymbolSDNode>(Callee)) |
1053 | 1 | FuncName = G->getSymbol(); |
1054 | 19 | else if (const GlobalAddressSDNode *19 G19 = dyn_cast<GlobalAddressSDNode>(Callee)) |
1055 | 17 | FuncName = G->getGlobal()->getName(); |
1056 | 20 | |
1057 | 20 | DiagnosticInfoUnsupported NoCalls( |
1058 | 20 | Fn, Reason + FuncName, CLI.DL.getDebugLoc()); |
1059 | 20 | DAG.getContext()->diagnose(NoCalls); |
1060 | 20 | |
1061 | 20 | if (!CLI.IsTailCall20 ) { |
1062 | 25 | for (unsigned I = 0, E = CLI.Ins.size(); I != E25 ; ++I10 ) |
1063 | 10 | InVals.push_back(DAG.getUNDEF(CLI.Ins[I].VT)); |
1064 | 15 | } |
1065 | 20 | |
1066 | 20 | return DAG.getEntryNode(); |
1067 | 20 | } |
1068 | | |
1069 | | SDValue AMDGPUTargetLowering::LowerCall(CallLoweringInfo &CLI, |
1070 | 14 | SmallVectorImpl<SDValue> &InVals) const { |
1071 | 14 | return lowerUnhandledCall(CLI, InVals, "unsupported call to function "); |
1072 | 14 | } |
1073 | | |
1074 | | SDValue AMDGPUTargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, |
1075 | 3 | SelectionDAG &DAG) const { |
1076 | 3 | const Function &Fn = *DAG.getMachineFunction().getFunction(); |
1077 | 3 | |
1078 | 3 | DiagnosticInfoUnsupported NoDynamicAlloca(Fn, "unsupported dynamic alloca", |
1079 | 3 | SDLoc(Op).getDebugLoc()); |
1080 | 3 | DAG.getContext()->diagnose(NoDynamicAlloca); |
1081 | 3 | auto Ops = {DAG.getConstant(0, SDLoc(), Op.getValueType()), Op.getOperand(0)}; |
1082 | 3 | return DAG.getMergeValues(Ops, SDLoc()); |
1083 | 3 | } |
1084 | | |
1085 | | SDValue AMDGPUTargetLowering::LowerOperation(SDValue Op, |
1086 | 17.2k | SelectionDAG &DAG) const { |
1087 | 17.2k | switch (Op.getOpcode()) { |
1088 | 0 | default: |
1089 | 0 | Op->print(errs(), &DAG); |
1090 | 0 | llvm_unreachable("Custom lowering code for this" |
1091 | 17.2k | "instruction is not implemented yet!"); |
1092 | 0 | break; |
1093 | 30 | case ISD::SIGN_EXTEND_INREG: return LowerSIGN_EXTEND_INREG(Op, DAG); |
1094 | 2.23k | case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, DAG); |
1095 | 12.9k | case ISD::EXTRACT_SUBVECTOR: return LowerEXTRACT_SUBVECTOR(Op, DAG); |
1096 | 344 | case ISD::UDIVREM: return LowerUDIVREM(Op, DAG); |
1097 | 147 | case ISD::SDIVREM: return LowerSDIVREM(Op, DAG); |
1098 | 36 | case ISD::FREM: return LowerFREM(Op, DAG); |
1099 | 31 | case ISD::FCEIL: return LowerFCEIL(Op, DAG); |
1100 | 75 | case ISD::FTRUNC: return LowerFTRUNC(Op, DAG); |
1101 | 14 | case ISD::FRINT: return LowerFRINT(Op, DAG); |
1102 | 44 | case ISD::FNEARBYINT: return LowerFNEARBYINT(Op, DAG); |
1103 | 90 | case ISD::FROUND: return LowerFROUND(Op, DAG); |
1104 | 0 | case ISD::FFLOOR: return LowerFFLOOR(Op, DAG); |
1105 | 42 | case ISD::SINT_TO_FP: return LowerSINT_TO_FP(Op, DAG); |
1106 | 51 | case ISD::UINT_TO_FP: return LowerUINT_TO_FP(Op, DAG); |
1107 | 698 | case ISD::FP_TO_FP16: return LowerFP_TO_FP16(Op, DAG); |
1108 | 65 | case ISD::FP_TO_SINT: return LowerFP_TO_SINT(Op, DAG); |
1109 | 37 | case ISD::FP_TO_UINT: return LowerFP_TO_UINT(Op, DAG); |
1110 | 342 | case ISD::CTLZ: |
1111 | 342 | case ISD::CTLZ_ZERO_UNDEF: |
1112 | 342 | return LowerCTLZ(Op, DAG); |
1113 | 3 | case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG); |
1114 | 0 | } |
1115 | 0 | return Op; |
1116 | 0 | } |
1117 | | |
1118 | | void AMDGPUTargetLowering::ReplaceNodeResults(SDNode *N, |
1119 | | SmallVectorImpl<SDValue> &Results, |
1120 | 51 | SelectionDAG &DAG) const { |
1121 | 51 | switch (N->getOpcode()) { |
1122 | 0 | case ISD::SIGN_EXTEND_INREG: |
1123 | 0 | // Different parts of legalization seem to interpret which type of |
1124 | 0 | // sign_extend_inreg is the one to check for custom lowering. The extended |
1125 | 0 | // from type is what really matters, but some places check for custom |
1126 | 0 | // lowering of the result type. This results in trying to use |
1127 | 0 | // ReplaceNodeResults to sext_in_reg to an illegal type, so we'll just do |
1128 | 0 | // nothing here and let the illegal result integer be handled normally. |
1129 | 0 | return; |
1130 | 51 | default: |
1131 | 51 | return; |
1132 | 0 | } |
1133 | 0 | } |
1134 | | |
1135 | 303 | static bool hasDefinedInitializer(const GlobalValue *GV) { |
1136 | 303 | const GlobalVariable *GVar = dyn_cast<GlobalVariable>(GV); |
1137 | 303 | if (!GVar || 303 !GVar->hasInitializer()303 ) |
1138 | 5 | return false; |
1139 | 298 | |
1140 | 298 | return !isa<UndefValue>(GVar->getInitializer()); |
1141 | 298 | } |
1142 | | |
1143 | | SDValue AMDGPUTargetLowering::LowerGlobalAddress(AMDGPUMachineFunction* MFI, |
1144 | | SDValue Op, |
1145 | 303 | SelectionDAG &DAG) const { |
1146 | 303 | |
1147 | 303 | const DataLayout &DL = DAG.getDataLayout(); |
1148 | 303 | GlobalAddressSDNode *G = cast<GlobalAddressSDNode>(Op); |
1149 | 303 | const GlobalValue *GV = G->getGlobal(); |
1150 | 303 | |
1151 | 303 | if (G->getAddressSpace() == AMDGPUASI.LOCAL_ADDRESS303 ) { |
1152 | 303 | // XXX: What does the value of G->getOffset() mean? |
1153 | 303 | assert(G->getOffset() == 0 && |
1154 | 303 | "Do not know what to do with an non-zero offset"); |
1155 | 303 | |
1156 | 303 | // TODO: We could emit code to handle the initialization somewhere. |
1157 | 303 | if (!hasDefinedInitializer(GV)303 ) { |
1158 | 295 | unsigned Offset = MFI->allocateLDSGlobal(DL, *GV); |
1159 | 295 | return DAG.getConstant(Offset, SDLoc(Op), Op.getValueType()); |
1160 | 295 | } |
1161 | 8 | } |
1162 | 8 | |
1163 | 8 | const Function &Fn = *DAG.getMachineFunction().getFunction(); |
1164 | 8 | DiagnosticInfoUnsupported BadInit( |
1165 | 8 | Fn, "unsupported initializer for address space", SDLoc(Op).getDebugLoc()); |
1166 | 8 | DAG.getContext()->diagnose(BadInit); |
1167 | 8 | return SDValue(); |
1168 | 8 | } |
1169 | | |
1170 | | SDValue AMDGPUTargetLowering::LowerCONCAT_VECTORS(SDValue Op, |
1171 | 2.23k | SelectionDAG &DAG) const { |
1172 | 2.23k | SmallVector<SDValue, 8> Args; |
1173 | 2.23k | |
1174 | 2.23k | for (const SDUse &U : Op->ops()) |
1175 | 4.46k | DAG.ExtractVectorElements(U.get(), Args); |
1176 | 2.23k | |
1177 | 2.23k | return DAG.getBuildVector(Op.getValueType(), SDLoc(Op), Args); |
1178 | 2.23k | } |
1179 | | |
1180 | | SDValue AMDGPUTargetLowering::LowerEXTRACT_SUBVECTOR(SDValue Op, |
1181 | 12.9k | SelectionDAG &DAG) const { |
1182 | 12.9k | |
1183 | 12.9k | SmallVector<SDValue, 8> Args; |
1184 | 12.9k | unsigned Start = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue(); |
1185 | 12.9k | EVT VT = Op.getValueType(); |
1186 | 12.9k | DAG.ExtractVectorElements(Op.getOperand(0), Args, Start, |
1187 | 12.9k | VT.getVectorNumElements()); |
1188 | 12.9k | |
1189 | 12.9k | return DAG.getBuildVector(Op.getValueType(), SDLoc(Op), Args); |
1190 | 12.9k | } |
1191 | | |
1192 | | /// \brief Generate Min/Max node |
1193 | | SDValue AMDGPUTargetLowering::combineFMinMaxLegacy(const SDLoc &DL, EVT VT, |
1194 | | SDValue LHS, SDValue RHS, |
1195 | | SDValue True, SDValue False, |
1196 | | SDValue CC, |
1197 | 687 | DAGCombinerInfo &DCI) const { |
1198 | 687 | if (!(LHS == True && 687 RHS == False89 ) && !(LHS == False && 602 RHS == True12 )) |
1199 | 592 | return SDValue(); |
1200 | 95 | |
1201 | 95 | SelectionDAG &DAG = DCI.DAG; |
1202 | 95 | ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get(); |
1203 | 95 | switch (CCOpcode) { |
1204 | 27 | case ISD::SETOEQ: |
1205 | 27 | case ISD::SETONE: |
1206 | 27 | case ISD::SETUNE: |
1207 | 27 | case ISD::SETNE: |
1208 | 27 | case ISD::SETUEQ: |
1209 | 27 | case ISD::SETEQ: |
1210 | 27 | case ISD::SETFALSE: |
1211 | 27 | case ISD::SETFALSE2: |
1212 | 27 | case ISD::SETTRUE: |
1213 | 27 | case ISD::SETTRUE2: |
1214 | 27 | case ISD::SETUO: |
1215 | 27 | case ISD::SETO: |
1216 | 27 | break; |
1217 | 14 | case ISD::SETULE: |
1218 | 14 | case ISD::SETULT: { |
1219 | 14 | if (LHS == True) |
1220 | 14 | return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, RHS, LHS); |
1221 | 0 | return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, LHS, RHS); |
1222 | 0 | } |
1223 | 4 | case ISD::SETOLE: |
1224 | 4 | case ISD::SETOLT: |
1225 | 4 | case ISD::SETLE: |
1226 | 4 | case ISD::SETLT: { |
1227 | 4 | // Ordered. Assume ordered for undefined. |
1228 | 4 | |
1229 | 4 | // Only do this after legalization to avoid interfering with other combines |
1230 | 4 | // which might occur. |
1231 | 4 | if (DCI.getDAGCombineLevel() < AfterLegalizeDAG && |
1232 | 2 | !DCI.isCalledByLegalizer()) |
1233 | 2 | return SDValue(); |
1234 | 2 | |
1235 | 2 | // We need to permute the operands to get the correct NaN behavior. The |
1236 | 2 | // selected operand is the second one based on the failing compare with NaN, |
1237 | 2 | // so permute it based on the compare type the hardware uses. |
1238 | 2 | if (2 LHS == True2 ) |
1239 | 2 | return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, LHS, RHS); |
1240 | 0 | return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, RHS, LHS); |
1241 | 0 | } |
1242 | 11 | case ISD::SETUGE: |
1243 | 11 | case ISD::SETUGT: { |
1244 | 11 | if (LHS == True) |
1245 | 10 | return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, RHS, LHS); |
1246 | 1 | return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, LHS, RHS); |
1247 | 1 | } |
1248 | 39 | case ISD::SETGT: |
1249 | 39 | case ISD::SETGE: |
1250 | 39 | case ISD::SETOGE: |
1251 | 39 | case ISD::SETOGT: { |
1252 | 39 | if (DCI.getDAGCombineLevel() < AfterLegalizeDAG && |
1253 | 33 | !DCI.isCalledByLegalizer()) |
1254 | 14 | return SDValue(); |
1255 | 25 | |
1256 | 25 | if (25 LHS == True25 ) |
1257 | 16 | return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, LHS, RHS); |
1258 | 9 | return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, RHS, LHS); |
1259 | 9 | } |
1260 | 0 | case ISD::SETCC_INVALID: |
1261 | 0 | llvm_unreachable("Invalid setcc condcode!"); |
1262 | 27 | } |
1263 | 27 | return SDValue(); |
1264 | 27 | } |
1265 | | |
1266 | | std::pair<SDValue, SDValue> |
1267 | 2.83k | AMDGPUTargetLowering::split64BitValue(SDValue Op, SelectionDAG &DAG) const { |
1268 | 2.83k | SDLoc SL(Op); |
1269 | 2.83k | |
1270 | 2.83k | SDValue Vec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Op); |
1271 | 2.83k | |
1272 | 2.83k | const SDValue Zero = DAG.getConstant(0, SL, MVT::i32); |
1273 | 2.83k | const SDValue One = DAG.getConstant(1, SL, MVT::i32); |
1274 | 2.83k | |
1275 | 2.83k | SDValue Lo = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, Zero); |
1276 | 2.83k | SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, One); |
1277 | 2.83k | |
1278 | 2.83k | return std::make_pair(Lo, Hi); |
1279 | 2.83k | } |
1280 | | |
1281 | 0 | SDValue AMDGPUTargetLowering::getLoHalf64(SDValue Op, SelectionDAG &DAG) const { |
1282 | 0 | SDLoc SL(Op); |
1283 | 0 |
|
1284 | 0 | SDValue Vec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Op); |
1285 | 0 | const SDValue Zero = DAG.getConstant(0, SL, MVT::i32); |
1286 | 0 | return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, Zero); |
1287 | 0 | } |
1288 | | |
1289 | 131 | SDValue AMDGPUTargetLowering::getHiHalf64(SDValue Op, SelectionDAG &DAG) const { |
1290 | 131 | SDLoc SL(Op); |
1291 | 131 | |
1292 | 131 | SDValue Vec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Op); |
1293 | 131 | const SDValue One = DAG.getConstant(1, SL, MVT::i32); |
1294 | 131 | return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, One); |
1295 | 131 | } |
1296 | | |
1297 | | SDValue AMDGPUTargetLowering::SplitVectorLoad(const SDValue Op, |
1298 | 1.83k | SelectionDAG &DAG) const { |
1299 | 1.83k | LoadSDNode *Load = cast<LoadSDNode>(Op); |
1300 | 1.83k | EVT VT = Op.getValueType(); |
1301 | 1.83k | |
1302 | 1.83k | |
1303 | 1.83k | // If this is a 2 element vector, we really want to scalarize and not create |
1304 | 1.83k | // weird 1 element vectors. |
1305 | 1.83k | if (VT.getVectorNumElements() == 2) |
1306 | 0 | return scalarizeVectorLoad(Load, DAG); |
1307 | 1.83k | |
1308 | 1.83k | SDValue BasePtr = Load->getBasePtr(); |
1309 | 1.83k | EVT PtrVT = BasePtr.getValueType(); |
1310 | 1.83k | EVT MemVT = Load->getMemoryVT(); |
1311 | 1.83k | SDLoc SL(Op); |
1312 | 1.83k | |
1313 | 1.83k | const MachinePointerInfo &SrcValue = Load->getMemOperand()->getPointerInfo(); |
1314 | 1.83k | |
1315 | 1.83k | EVT LoVT, HiVT; |
1316 | 1.83k | EVT LoMemVT, HiMemVT; |
1317 | 1.83k | SDValue Lo, Hi; |
1318 | 1.83k | |
1319 | 1.83k | std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT); |
1320 | 1.83k | std::tie(LoMemVT, HiMemVT) = DAG.GetSplitDestVTs(MemVT); |
1321 | 1.83k | std::tie(Lo, Hi) = DAG.SplitVector(Op, SL, LoVT, HiVT); |
1322 | 1.83k | |
1323 | 1.83k | unsigned Size = LoMemVT.getStoreSize(); |
1324 | 1.83k | unsigned BaseAlign = Load->getAlignment(); |
1325 | 1.83k | unsigned HiAlign = MinAlign(BaseAlign, Size); |
1326 | 1.83k | |
1327 | 1.83k | SDValue LoLoad = DAG.getExtLoad(Load->getExtensionType(), SL, LoVT, |
1328 | 1.83k | Load->getChain(), BasePtr, SrcValue, LoMemVT, |
1329 | 1.83k | BaseAlign, Load->getMemOperand()->getFlags()); |
1330 | 1.83k | SDValue HiPtr = DAG.getNode(ISD::ADD, SL, PtrVT, BasePtr, |
1331 | 1.83k | DAG.getConstant(Size, SL, PtrVT)); |
1332 | 1.83k | SDValue HiLoad = |
1333 | 1.83k | DAG.getExtLoad(Load->getExtensionType(), SL, HiVT, Load->getChain(), |
1334 | 1.83k | HiPtr, SrcValue.getWithOffset(LoMemVT.getStoreSize()), |
1335 | 1.83k | HiMemVT, HiAlign, Load->getMemOperand()->getFlags()); |
1336 | 1.83k | |
1337 | 1.83k | SDValue Ops[] = { |
1338 | 1.83k | DAG.getNode(ISD::CONCAT_VECTORS, SL, VT, LoLoad, HiLoad), |
1339 | 1.83k | DAG.getNode(ISD::TokenFactor, SL, MVT::Other, |
1340 | 1.83k | LoLoad.getValue(1), HiLoad.getValue(1)) |
1341 | 1.83k | }; |
1342 | 1.83k | |
1343 | 1.83k | return DAG.getMergeValues(Ops, SL); |
1344 | 1.83k | } |
1345 | | |
1346 | | SDValue AMDGPUTargetLowering::SplitVectorStore(SDValue Op, |
1347 | 5.92k | SelectionDAG &DAG) const { |
1348 | 5.92k | StoreSDNode *Store = cast<StoreSDNode>(Op); |
1349 | 5.92k | SDValue Val = Store->getValue(); |
1350 | 5.92k | EVT VT = Val.getValueType(); |
1351 | 5.92k | |
1352 | 5.92k | // If this is a 2 element vector, we really want to scalarize and not create |
1353 | 5.92k | // weird 1 element vectors. |
1354 | 5.92k | if (VT.getVectorNumElements() == 2) |
1355 | 0 | return scalarizeVectorStore(Store, DAG); |
1356 | 5.92k | |
1357 | 5.92k | EVT MemVT = Store->getMemoryVT(); |
1358 | 5.92k | SDValue Chain = Store->getChain(); |
1359 | 5.92k | SDValue BasePtr = Store->getBasePtr(); |
1360 | 5.92k | SDLoc SL(Op); |
1361 | 5.92k | |
1362 | 5.92k | EVT LoVT, HiVT; |
1363 | 5.92k | EVT LoMemVT, HiMemVT; |
1364 | 5.92k | SDValue Lo, Hi; |
1365 | 5.92k | |
1366 | 5.92k | std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT); |
1367 | 5.92k | std::tie(LoMemVT, HiMemVT) = DAG.GetSplitDestVTs(MemVT); |
1368 | 5.92k | std::tie(Lo, Hi) = DAG.SplitVector(Val, SL, LoVT, HiVT); |
1369 | 5.92k | |
1370 | 5.92k | EVT PtrVT = BasePtr.getValueType(); |
1371 | 5.92k | SDValue HiPtr = DAG.getNode(ISD::ADD, SL, PtrVT, BasePtr, |
1372 | 5.92k | DAG.getConstant(LoMemVT.getStoreSize(), SL, |
1373 | 5.92k | PtrVT)); |
1374 | 5.92k | |
1375 | 5.92k | const MachinePointerInfo &SrcValue = Store->getMemOperand()->getPointerInfo(); |
1376 | 5.92k | unsigned BaseAlign = Store->getAlignment(); |
1377 | 5.92k | unsigned Size = LoMemVT.getStoreSize(); |
1378 | 5.92k | unsigned HiAlign = MinAlign(BaseAlign, Size); |
1379 | 5.92k | |
1380 | 5.92k | SDValue LoStore = |
1381 | 5.92k | DAG.getTruncStore(Chain, SL, Lo, BasePtr, SrcValue, LoMemVT, BaseAlign, |
1382 | 5.92k | Store->getMemOperand()->getFlags()); |
1383 | 5.92k | SDValue HiStore = |
1384 | 5.92k | DAG.getTruncStore(Chain, SL, Hi, HiPtr, SrcValue.getWithOffset(Size), |
1385 | 5.92k | HiMemVT, HiAlign, Store->getMemOperand()->getFlags()); |
1386 | 5.92k | |
1387 | 5.92k | return DAG.getNode(ISD::TokenFactor, SL, MVT::Other, LoStore, HiStore); |
1388 | 5.92k | } |
1389 | | |
1390 | | // This is a shortcut for integer division because we have fast i32<->f32 |
1391 | | // conversions, and fast f32 reciprocal instructions. The fractional part of a |
1392 | | // float is enough to accurately represent up to a 24-bit signed integer. |
1393 | | SDValue AMDGPUTargetLowering::LowerDIVREM24(SDValue Op, SelectionDAG &DAG, |
1394 | 423 | bool Sign) const { |
1395 | 423 | SDLoc DL(Op); |
1396 | 423 | EVT VT = Op.getValueType(); |
1397 | 423 | SDValue LHS = Op.getOperand(0); |
1398 | 423 | SDValue RHS = Op.getOperand(1); |
1399 | 423 | MVT IntVT = MVT::i32; |
1400 | 423 | MVT FltVT = MVT::f32; |
1401 | 423 | |
1402 | 423 | unsigned LHSSignBits = DAG.ComputeNumSignBits(LHS); |
1403 | 423 | if (LHSSignBits < 9) |
1404 | 326 | return SDValue(); |
1405 | 97 | |
1406 | 97 | unsigned RHSSignBits = DAG.ComputeNumSignBits(RHS); |
1407 | 97 | if (RHSSignBits < 9) |
1408 | 13 | return SDValue(); |
1409 | 84 | |
1410 | 84 | unsigned BitSize = VT.getSizeInBits(); |
1411 | 84 | unsigned SignBits = std::min(LHSSignBits, RHSSignBits); |
1412 | 84 | unsigned DivBits = BitSize - SignBits; |
1413 | 84 | if (Sign) |
1414 | 45 | ++DivBits; |
1415 | 84 | |
1416 | 84 | ISD::NodeType ToFp = Sign ? ISD::SINT_TO_FP45 : ISD::UINT_TO_FP39 ; |
1417 | 84 | ISD::NodeType ToInt = Sign ? ISD::FP_TO_SINT45 : ISD::FP_TO_UINT39 ; |
1418 | 84 | |
1419 | 84 | SDValue jq = DAG.getConstant(1, DL, IntVT); |
1420 | 84 | |
1421 | 84 | if (Sign84 ) { |
1422 | 45 | // char|short jq = ia ^ ib; |
1423 | 45 | jq = DAG.getNode(ISD::XOR, DL, VT, LHS, RHS); |
1424 | 45 | |
1425 | 45 | // jq = jq >> (bitsize - 2) |
1426 | 45 | jq = DAG.getNode(ISD::SRA, DL, VT, jq, |
1427 | 45 | DAG.getConstant(BitSize - 2, DL, VT)); |
1428 | 45 | |
1429 | 45 | // jq = jq | 0x1 |
1430 | 45 | jq = DAG.getNode(ISD::OR, DL, VT, jq, DAG.getConstant(1, DL, VT)); |
1431 | 45 | } |
1432 | 84 | |
1433 | 84 | // int ia = (int)LHS; |
1434 | 84 | SDValue ia = LHS; |
1435 | 84 | |
1436 | 84 | // int ib, (int)RHS; |
1437 | 84 | SDValue ib = RHS; |
1438 | 84 | |
1439 | 84 | // float fa = (float)ia; |
1440 | 84 | SDValue fa = DAG.getNode(ToFp, DL, FltVT, ia); |
1441 | 84 | |
1442 | 84 | // float fb = (float)ib; |
1443 | 84 | SDValue fb = DAG.getNode(ToFp, DL, FltVT, ib); |
1444 | 84 | |
1445 | 84 | SDValue fq = DAG.getNode(ISD::FMUL, DL, FltVT, |
1446 | 84 | fa, DAG.getNode(AMDGPUISD::RCP, DL, FltVT, fb)); |
1447 | 84 | |
1448 | 84 | // fq = trunc(fq); |
1449 | 84 | fq = DAG.getNode(ISD::FTRUNC, DL, FltVT, fq); |
1450 | 84 | |
1451 | 84 | // float fqneg = -fq; |
1452 | 84 | SDValue fqneg = DAG.getNode(ISD::FNEG, DL, FltVT, fq); |
1453 | 84 | |
1454 | 84 | // float fr = mad(fqneg, fb, fa); |
1455 | 84 | unsigned OpCode = Subtarget->hasFP32Denormals() ? |
1456 | 4 | (unsigned)AMDGPUISD::FMAD_FTZ : |
1457 | 80 | (unsigned)ISD::FMAD; |
1458 | 84 | SDValue fr = DAG.getNode(OpCode, DL, FltVT, fqneg, fb, fa); |
1459 | 84 | |
1460 | 84 | // int iq = (int)fq; |
1461 | 84 | SDValue iq = DAG.getNode(ToInt, DL, IntVT, fq); |
1462 | 84 | |
1463 | 84 | // fr = fabs(fr); |
1464 | 84 | fr = DAG.getNode(ISD::FABS, DL, FltVT, fr); |
1465 | 84 | |
1466 | 84 | // fb = fabs(fb); |
1467 | 84 | fb = DAG.getNode(ISD::FABS, DL, FltVT, fb); |
1468 | 84 | |
1469 | 84 | EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT); |
1470 | 84 | |
1471 | 84 | // int cv = fr >= fb; |
1472 | 84 | SDValue cv = DAG.getSetCC(DL, SetCCVT, fr, fb, ISD::SETOGE); |
1473 | 84 | |
1474 | 84 | // jq = (cv ? jq : 0); |
1475 | 84 | jq = DAG.getNode(ISD::SELECT, DL, VT, cv, jq, DAG.getConstant(0, DL, VT)); |
1476 | 84 | |
1477 | 84 | // dst = iq + jq; |
1478 | 84 | SDValue Div = DAG.getNode(ISD::ADD, DL, VT, iq, jq); |
1479 | 84 | |
1480 | 84 | // Rem needs compensation, it's easier to recompute it |
1481 | 84 | SDValue Rem = DAG.getNode(ISD::MUL, DL, VT, Div, RHS); |
1482 | 84 | Rem = DAG.getNode(ISD::SUB, DL, VT, LHS, Rem); |
1483 | 84 | |
1484 | 84 | // Truncate to number of bits this divide really is. |
1485 | 84 | if (Sign84 ) { |
1486 | 45 | SDValue InRegSize |
1487 | 45 | = DAG.getValueType(EVT::getIntegerVT(*DAG.getContext(), DivBits)); |
1488 | 45 | Div = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, Div, InRegSize); |
1489 | 45 | Rem = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, Rem, InRegSize); |
1490 | 84 | } else { |
1491 | 39 | SDValue TruncMask = DAG.getConstant((UINT64_C(1) << DivBits) - 1, DL, VT); |
1492 | 39 | Div = DAG.getNode(ISD::AND, DL, VT, Div, TruncMask); |
1493 | 39 | Rem = DAG.getNode(ISD::AND, DL, VT, Rem, TruncMask); |
1494 | 39 | } |
1495 | 423 | |
1496 | 423 | return DAG.getMergeValues({ Div, Rem }, DL); |
1497 | 423 | } |
1498 | | |
1499 | | void AMDGPUTargetLowering::LowerUDIVREM64(SDValue Op, |
1500 | | SelectionDAG &DAG, |
1501 | 66 | SmallVectorImpl<SDValue> &Results) const { |
1502 | 66 | assert(Op.getValueType() == MVT::i64); |
1503 | 66 | |
1504 | 66 | SDLoc DL(Op); |
1505 | 66 | EVT VT = Op.getValueType(); |
1506 | 66 | EVT HalfVT = VT.getHalfSizedIntegerVT(*DAG.getContext()); |
1507 | 66 | |
1508 | 66 | SDValue one = DAG.getConstant(1, DL, HalfVT); |
1509 | 66 | SDValue zero = DAG.getConstant(0, DL, HalfVT); |
1510 | 66 | |
1511 | 66 | //HiLo split |
1512 | 66 | SDValue LHS = Op.getOperand(0); |
1513 | 66 | SDValue LHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, LHS, zero); |
1514 | 66 | SDValue LHS_Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, LHS, one); |
1515 | 66 | |
1516 | 66 | SDValue RHS = Op.getOperand(1); |
1517 | 66 | SDValue RHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, RHS, zero); |
1518 | 66 | SDValue RHS_Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, RHS, one); |
1519 | 66 | |
1520 | 66 | if (VT == MVT::i64 && |
1521 | 66 | DAG.MaskedValueIsZero(RHS, APInt::getHighBitsSet(64, 32)) && |
1522 | 66 | DAG.MaskedValueIsZero(LHS, APInt::getHighBitsSet(64, 32))12 ) { |
1523 | 12 | |
1524 | 12 | SDValue Res = DAG.getNode(ISD::UDIVREM, DL, DAG.getVTList(HalfVT, HalfVT), |
1525 | 12 | LHS_Lo, RHS_Lo); |
1526 | 12 | |
1527 | 12 | SDValue DIV = DAG.getBuildVector(MVT::v2i32, DL, {Res.getValue(0), zero}); |
1528 | 12 | SDValue REM = DAG.getBuildVector(MVT::v2i32, DL, {Res.getValue(1), zero}); |
1529 | 12 | |
1530 | 12 | Results.push_back(DAG.getNode(ISD::BITCAST, DL, MVT::i64, DIV)); |
1531 | 12 | Results.push_back(DAG.getNode(ISD::BITCAST, DL, MVT::i64, REM)); |
1532 | 12 | return; |
1533 | 12 | } |
1534 | 54 | |
1535 | 54 | // Get Speculative values |
1536 | 54 | SDValue DIV_Part = DAG.getNode(ISD::UDIV, DL, HalfVT, LHS_Hi, RHS_Lo); |
1537 | 54 | SDValue REM_Part = DAG.getNode(ISD::UREM, DL, HalfVT, LHS_Hi, RHS_Lo); |
1538 | 54 | |
1539 | 54 | SDValue REM_Lo = DAG.getSelectCC(DL, RHS_Hi, zero, REM_Part, LHS_Hi, ISD::SETEQ); |
1540 | 54 | SDValue REM = DAG.getBuildVector(MVT::v2i32, DL, {REM_Lo, zero}); |
1541 | 54 | REM = DAG.getNode(ISD::BITCAST, DL, MVT::i64, REM); |
1542 | 54 | |
1543 | 54 | SDValue DIV_Hi = DAG.getSelectCC(DL, RHS_Hi, zero, DIV_Part, zero, ISD::SETEQ); |
1544 | 54 | SDValue DIV_Lo = zero; |
1545 | 54 | |
1546 | 54 | const unsigned halfBitWidth = HalfVT.getSizeInBits(); |
1547 | 54 | |
1548 | 1.78k | for (unsigned i = 0; i < halfBitWidth1.78k ; ++i1.72k ) { |
1549 | 1.72k | const unsigned bitPos = halfBitWidth - i - 1; |
1550 | 1.72k | SDValue POS = DAG.getConstant(bitPos, DL, HalfVT); |
1551 | 1.72k | // Get value of high bit |
1552 | 1.72k | SDValue HBit = DAG.getNode(ISD::SRL, DL, HalfVT, LHS_Lo, POS); |
1553 | 1.72k | HBit = DAG.getNode(ISD::AND, DL, HalfVT, HBit, one); |
1554 | 1.72k | HBit = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, HBit); |
1555 | 1.72k | |
1556 | 1.72k | // Shift |
1557 | 1.72k | REM = DAG.getNode(ISD::SHL, DL, VT, REM, DAG.getConstant(1, DL, VT)); |
1558 | 1.72k | // Add LHS high bit |
1559 | 1.72k | REM = DAG.getNode(ISD::OR, DL, VT, REM, HBit); |
1560 | 1.72k | |
1561 | 1.72k | SDValue BIT = DAG.getConstant(1ULL << bitPos, DL, HalfVT); |
1562 | 1.72k | SDValue realBIT = DAG.getSelectCC(DL, REM, RHS, BIT, zero, ISD::SETUGE); |
1563 | 1.72k | |
1564 | 1.72k | DIV_Lo = DAG.getNode(ISD::OR, DL, HalfVT, DIV_Lo, realBIT); |
1565 | 1.72k | |
1566 | 1.72k | // Update REM |
1567 | 1.72k | SDValue REM_sub = DAG.getNode(ISD::SUB, DL, VT, REM, RHS); |
1568 | 1.72k | REM = DAG.getSelectCC(DL, REM, RHS, REM_sub, REM, ISD::SETUGE); |
1569 | 1.72k | } |
1570 | 66 | |
1571 | 66 | SDValue DIV = DAG.getBuildVector(MVT::v2i32, DL, {DIV_Lo, DIV_Hi}); |
1572 | 66 | DIV = DAG.getNode(ISD::BITCAST, DL, MVT::i64, DIV); |
1573 | 66 | Results.push_back(DIV); |
1574 | 66 | Results.push_back(REM); |
1575 | 66 | } |
1576 | | |
1577 | | SDValue AMDGPUTargetLowering::LowerUDIVREM(SDValue Op, |
1578 | 344 | SelectionDAG &DAG) const { |
1579 | 344 | SDLoc DL(Op); |
1580 | 344 | EVT VT = Op.getValueType(); |
1581 | 344 | |
1582 | 344 | if (VT == MVT::i64344 ) { |
1583 | 44 | SmallVector<SDValue, 2> Results; |
1584 | 44 | LowerUDIVREM64(Op, DAG, Results); |
1585 | 44 | return DAG.getMergeValues(Results, DL); |
1586 | 44 | } |
1587 | 300 | |
1588 | 300 | if (300 VT == MVT::i32300 ) { |
1589 | 300 | if (SDValue Res = LowerDIVREM24(Op, DAG, false)) |
1590 | 39 | return Res; |
1591 | 261 | } |
1592 | 261 | |
1593 | 261 | SDValue Num = Op.getOperand(0); |
1594 | 261 | SDValue Den = Op.getOperand(1); |
1595 | 261 | |
1596 | 261 | // RCP = URECIP(Den) = 2^32 / Den + e |
1597 | 261 | // e is rounding error. |
1598 | 261 | SDValue RCP = DAG.getNode(AMDGPUISD::URECIP, DL, VT, Den); |
1599 | 261 | |
1600 | 261 | // RCP_LO = mul(RCP, Den) */ |
1601 | 261 | SDValue RCP_LO = DAG.getNode(ISD::MUL, DL, VT, RCP, Den); |
1602 | 261 | |
1603 | 261 | // RCP_HI = mulhu (RCP, Den) */ |
1604 | 261 | SDValue RCP_HI = DAG.getNode(ISD::MULHU, DL, VT, RCP, Den); |
1605 | 261 | |
1606 | 261 | // NEG_RCP_LO = -RCP_LO |
1607 | 261 | SDValue NEG_RCP_LO = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), |
1608 | 261 | RCP_LO); |
1609 | 261 | |
1610 | 261 | // ABS_RCP_LO = (RCP_HI == 0 ? NEG_RCP_LO : RCP_LO) |
1611 | 261 | SDValue ABS_RCP_LO = DAG.getSelectCC(DL, RCP_HI, DAG.getConstant(0, DL, VT), |
1612 | 261 | NEG_RCP_LO, RCP_LO, |
1613 | 261 | ISD::SETEQ); |
1614 | 261 | // Calculate the rounding error from the URECIP instruction |
1615 | 261 | // E = mulhu(ABS_RCP_LO, RCP) |
1616 | 261 | SDValue E = DAG.getNode(ISD::MULHU, DL, VT, ABS_RCP_LO, RCP); |
1617 | 261 | |
1618 | 261 | // RCP_A_E = RCP + E |
1619 | 261 | SDValue RCP_A_E = DAG.getNode(ISD::ADD, DL, VT, RCP, E); |
1620 | 261 | |
1621 | 261 | // RCP_S_E = RCP - E |
1622 | 261 | SDValue RCP_S_E = DAG.getNode(ISD::SUB, DL, VT, RCP, E); |
1623 | 261 | |
1624 | 261 | // Tmp0 = (RCP_HI == 0 ? RCP_A_E : RCP_SUB_E) |
1625 | 261 | SDValue Tmp0 = DAG.getSelectCC(DL, RCP_HI, DAG.getConstant(0, DL, VT), |
1626 | 261 | RCP_A_E, RCP_S_E, |
1627 | 261 | ISD::SETEQ); |
1628 | 261 | // Quotient = mulhu(Tmp0, Num) |
1629 | 261 | SDValue Quotient = DAG.getNode(ISD::MULHU, DL, VT, Tmp0, Num); |
1630 | 261 | |
1631 | 261 | // Num_S_Remainder = Quotient * Den |
1632 | 261 | SDValue Num_S_Remainder = DAG.getNode(ISD::MUL, DL, VT, Quotient, Den); |
1633 | 261 | |
1634 | 261 | // Remainder = Num - Num_S_Remainder |
1635 | 261 | SDValue Remainder = DAG.getNode(ISD::SUB, DL, VT, Num, Num_S_Remainder); |
1636 | 261 | |
1637 | 261 | // Remainder_GE_Den = (Remainder >= Den ? -1 : 0) |
1638 | 261 | SDValue Remainder_GE_Den = DAG.getSelectCC(DL, Remainder, Den, |
1639 | 261 | DAG.getConstant(-1, DL, VT), |
1640 | 261 | DAG.getConstant(0, DL, VT), |
1641 | 261 | ISD::SETUGE); |
1642 | 261 | // Remainder_GE_Zero = (Num >= Num_S_Remainder ? -1 : 0) |
1643 | 261 | SDValue Remainder_GE_Zero = DAG.getSelectCC(DL, Num, |
1644 | 261 | Num_S_Remainder, |
1645 | 261 | DAG.getConstant(-1, DL, VT), |
1646 | 261 | DAG.getConstant(0, DL, VT), |
1647 | 261 | ISD::SETUGE); |
1648 | 261 | // Tmp1 = Remainder_GE_Den & Remainder_GE_Zero |
1649 | 261 | SDValue Tmp1 = DAG.getNode(ISD::AND, DL, VT, Remainder_GE_Den, |
1650 | 261 | Remainder_GE_Zero); |
1651 | 261 | |
1652 | 261 | // Calculate Division result: |
1653 | 261 | |
1654 | 261 | // Quotient_A_One = Quotient + 1 |
1655 | 261 | SDValue Quotient_A_One = DAG.getNode(ISD::ADD, DL, VT, Quotient, |
1656 | 261 | DAG.getConstant(1, DL, VT)); |
1657 | 261 | |
1658 | 261 | // Quotient_S_One = Quotient - 1 |
1659 | 261 | SDValue Quotient_S_One = DAG.getNode(ISD::SUB, DL, VT, Quotient, |
1660 | 261 | DAG.getConstant(1, DL, VT)); |
1661 | 261 | |
1662 | 261 | // Div = (Tmp1 == 0 ? Quotient : Quotient_A_One) |
1663 | 261 | SDValue Div = DAG.getSelectCC(DL, Tmp1, DAG.getConstant(0, DL, VT), |
1664 | 261 | Quotient, Quotient_A_One, ISD::SETEQ); |
1665 | 261 | |
1666 | 261 | // Div = (Remainder_GE_Zero == 0 ? Quotient_S_One : Div) |
1667 | 261 | Div = DAG.getSelectCC(DL, Remainder_GE_Zero, DAG.getConstant(0, DL, VT), |
1668 | 261 | Quotient_S_One, Div, ISD::SETEQ); |
1669 | 261 | |
1670 | 261 | // Calculate Rem result: |
1671 | 261 | |
1672 | 261 | // Remainder_S_Den = Remainder - Den |
1673 | 261 | SDValue Remainder_S_Den = DAG.getNode(ISD::SUB, DL, VT, Remainder, Den); |
1674 | 261 | |
1675 | 261 | // Remainder_A_Den = Remainder + Den |
1676 | 261 | SDValue Remainder_A_Den = DAG.getNode(ISD::ADD, DL, VT, Remainder, Den); |
1677 | 261 | |
1678 | 261 | // Rem = (Tmp1 == 0 ? Remainder : Remainder_S_Den) |
1679 | 261 | SDValue Rem = DAG.getSelectCC(DL, Tmp1, DAG.getConstant(0, DL, VT), |
1680 | 261 | Remainder, Remainder_S_Den, ISD::SETEQ); |
1681 | 261 | |
1682 | 261 | // Rem = (Remainder_GE_Zero == 0 ? Remainder_A_Den : Rem) |
1683 | 261 | Rem = DAG.getSelectCC(DL, Remainder_GE_Zero, DAG.getConstant(0, DL, VT), |
1684 | 261 | Remainder_A_Den, Rem, ISD::SETEQ); |
1685 | 261 | SDValue Ops[2] = { |
1686 | 261 | Div, |
1687 | 261 | Rem |
1688 | 261 | }; |
1689 | 261 | return DAG.getMergeValues(Ops, DL); |
1690 | 261 | } |
1691 | | |
1692 | | SDValue AMDGPUTargetLowering::LowerSDIVREM(SDValue Op, |
1693 | 159 | SelectionDAG &DAG) const { |
1694 | 159 | SDLoc DL(Op); |
1695 | 159 | EVT VT = Op.getValueType(); |
1696 | 159 | |
1697 | 159 | SDValue LHS = Op.getOperand(0); |
1698 | 159 | SDValue RHS = Op.getOperand(1); |
1699 | 159 | |
1700 | 159 | SDValue Zero = DAG.getConstant(0, DL, VT); |
1701 | 159 | SDValue NegOne = DAG.getConstant(-1, DL, VT); |
1702 | 159 | |
1703 | 159 | if (VT == MVT::i32159 ) { |
1704 | 123 | if (SDValue Res = LowerDIVREM24(Op, DAG, true)) |
1705 | 45 | return Res; |
1706 | 114 | } |
1707 | 114 | |
1708 | 114 | if (114 VT == MVT::i64 && |
1709 | 36 | DAG.ComputeNumSignBits(LHS) > 32 && |
1710 | 114 | DAG.ComputeNumSignBits(RHS) > 3212 ) { |
1711 | 12 | EVT HalfVT = VT.getHalfSizedIntegerVT(*DAG.getContext()); |
1712 | 12 | |
1713 | 12 | //HiLo split |
1714 | 12 | SDValue LHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, LHS, Zero); |
1715 | 12 | SDValue RHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, RHS, Zero); |
1716 | 12 | SDValue DIVREM = DAG.getNode(ISD::SDIVREM, DL, DAG.getVTList(HalfVT, HalfVT), |
1717 | 12 | LHS_Lo, RHS_Lo); |
1718 | 12 | SDValue Res[2] = { |
1719 | 12 | DAG.getNode(ISD::SIGN_EXTEND, DL, VT, DIVREM.getValue(0)), |
1720 | 12 | DAG.getNode(ISD::SIGN_EXTEND, DL, VT, DIVREM.getValue(1)) |
1721 | 12 | }; |
1722 | 12 | return DAG.getMergeValues(Res, DL); |
1723 | 12 | } |
1724 | 102 | |
1725 | 102 | SDValue LHSign = DAG.getSelectCC(DL, LHS, Zero, NegOne, Zero, ISD::SETLT); |
1726 | 102 | SDValue RHSign = DAG.getSelectCC(DL, RHS, Zero, NegOne, Zero, ISD::SETLT); |
1727 | 102 | SDValue DSign = DAG.getNode(ISD::XOR, DL, VT, LHSign, RHSign); |
1728 | 102 | SDValue RSign = LHSign; // Remainder sign is the same as LHS |
1729 | 102 | |
1730 | 102 | LHS = DAG.getNode(ISD::ADD, DL, VT, LHS, LHSign); |
1731 | 102 | RHS = DAG.getNode(ISD::ADD, DL, VT, RHS, RHSign); |
1732 | 102 | |
1733 | 102 | LHS = DAG.getNode(ISD::XOR, DL, VT, LHS, LHSign); |
1734 | 102 | RHS = DAG.getNode(ISD::XOR, DL, VT, RHS, RHSign); |
1735 | 102 | |
1736 | 102 | SDValue Div = DAG.getNode(ISD::UDIVREM, DL, DAG.getVTList(VT, VT), LHS, RHS); |
1737 | 102 | SDValue Rem = Div.getValue(1); |
1738 | 102 | |
1739 | 102 | Div = DAG.getNode(ISD::XOR, DL, VT, Div, DSign); |
1740 | 102 | Rem = DAG.getNode(ISD::XOR, DL, VT, Rem, RSign); |
1741 | 102 | |
1742 | 102 | Div = DAG.getNode(ISD::SUB, DL, VT, Div, DSign); |
1743 | 102 | Rem = DAG.getNode(ISD::SUB, DL, VT, Rem, RSign); |
1744 | 102 | |
1745 | 102 | SDValue Res[2] = { |
1746 | 102 | Div, |
1747 | 102 | Rem |
1748 | 102 | }; |
1749 | 102 | return DAG.getMergeValues(Res, DL); |
1750 | 102 | } |
1751 | | |
1752 | | // (frem x, y) -> (fsub x, (fmul (ftrunc (fdiv x, y)), y)) |
1753 | 36 | SDValue AMDGPUTargetLowering::LowerFREM(SDValue Op, SelectionDAG &DAG) const { |
1754 | 36 | SDLoc SL(Op); |
1755 | 36 | EVT VT = Op.getValueType(); |
1756 | 36 | SDValue X = Op.getOperand(0); |
1757 | 36 | SDValue Y = Op.getOperand(1); |
1758 | 36 | |
1759 | 36 | // TODO: Should this propagate fast-math-flags? |
1760 | 36 | |
1761 | 36 | SDValue Div = DAG.getNode(ISD::FDIV, SL, VT, X, Y); |
1762 | 36 | SDValue Floor = DAG.getNode(ISD::FTRUNC, SL, VT, Div); |
1763 | 36 | SDValue Mul = DAG.getNode(ISD::FMUL, SL, VT, Floor, Y); |
1764 | 36 | |
1765 | 36 | return DAG.getNode(ISD::FSUB, SL, VT, X, Mul); |
1766 | 36 | } |
1767 | | |
1768 | 31 | SDValue AMDGPUTargetLowering::LowerFCEIL(SDValue Op, SelectionDAG &DAG) const { |
1769 | 31 | SDLoc SL(Op); |
1770 | 31 | SDValue Src = Op.getOperand(0); |
1771 | 31 | |
1772 | 31 | // result = trunc(src) |
1773 | 31 | // if (src > 0.0 && src != result) |
1774 | 31 | // result += 1.0 |
1775 | 31 | |
1776 | 31 | SDValue Trunc = DAG.getNode(ISD::FTRUNC, SL, MVT::f64, Src); |
1777 | 31 | |
1778 | 31 | const SDValue Zero = DAG.getConstantFP(0.0, SL, MVT::f64); |
1779 | 31 | const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f64); |
1780 | 31 | |
1781 | 31 | EVT SetCCVT = |
1782 | 31 | getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f64); |
1783 | 31 | |
1784 | 31 | SDValue Lt0 = DAG.getSetCC(SL, SetCCVT, Src, Zero, ISD::SETOGT); |
1785 | 31 | SDValue NeTrunc = DAG.getSetCC(SL, SetCCVT, Src, Trunc, ISD::SETONE); |
1786 | 31 | SDValue And = DAG.getNode(ISD::AND, SL, SetCCVT, Lt0, NeTrunc); |
1787 | 31 | |
1788 | 31 | SDValue Add = DAG.getNode(ISD::SELECT, SL, MVT::f64, And, One, Zero); |
1789 | 31 | // TODO: Should this propagate fast-math-flags? |
1790 | 31 | return DAG.getNode(ISD::FADD, SL, MVT::f64, Trunc, Add); |
1791 | 31 | } |
1792 | | |
1793 | | static SDValue extractF64Exponent(SDValue Hi, const SDLoc &SL, |
1794 | 91 | SelectionDAG &DAG) { |
1795 | 91 | const unsigned FractBits = 52; |
1796 | 91 | const unsigned ExpBits = 11; |
1797 | 91 | |
1798 | 91 | SDValue ExpPart = DAG.getNode(AMDGPUISD::BFE_U32, SL, MVT::i32, |
1799 | 91 | Hi, |
1800 | 91 | DAG.getConstant(FractBits - 32, SL, MVT::i32), |
1801 | 91 | DAG.getConstant(ExpBits, SL, MVT::i32)); |
1802 | 91 | SDValue Exp = DAG.getNode(ISD::SUB, SL, MVT::i32, ExpPart, |
1803 | 91 | DAG.getConstant(1023, SL, MVT::i32)); |
1804 | 91 | |
1805 | 91 | return Exp; |
1806 | 91 | } |
1807 | | |
1808 | 75 | SDValue AMDGPUTargetLowering::LowerFTRUNC(SDValue Op, SelectionDAG &DAG) const { |
1809 | 75 | SDLoc SL(Op); |
1810 | 75 | SDValue Src = Op.getOperand(0); |
1811 | 75 | |
1812 | 75 | assert(Op.getValueType() == MVT::f64); |
1813 | 75 | |
1814 | 75 | const SDValue Zero = DAG.getConstant(0, SL, MVT::i32); |
1815 | 75 | const SDValue One = DAG.getConstant(1, SL, MVT::i32); |
1816 | 75 | |
1817 | 75 | SDValue VecSrc = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Src); |
1818 | 75 | |
1819 | 75 | // Extract the upper half, since this is where we will find the sign and |
1820 | 75 | // exponent. |
1821 | 75 | SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, VecSrc, One); |
1822 | 75 | |
1823 | 75 | SDValue Exp = extractF64Exponent(Hi, SL, DAG); |
1824 | 75 | |
1825 | 75 | const unsigned FractBits = 52; |
1826 | 75 | |
1827 | 75 | // Extract the sign bit. |
1828 | 75 | const SDValue SignBitMask = DAG.getConstant(UINT32_C(1) << 31, SL, MVT::i32); |
1829 | 75 | SDValue SignBit = DAG.getNode(ISD::AND, SL, MVT::i32, Hi, SignBitMask); |
1830 | 75 | |
1831 | 75 | // Extend back to to 64-bits. |
1832 | 75 | SDValue SignBit64 = DAG.getBuildVector(MVT::v2i32, SL, {Zero, SignBit}); |
1833 | 75 | SignBit64 = DAG.getNode(ISD::BITCAST, SL, MVT::i64, SignBit64); |
1834 | 75 | |
1835 | 75 | SDValue BcInt = DAG.getNode(ISD::BITCAST, SL, MVT::i64, Src); |
1836 | 75 | const SDValue FractMask |
1837 | 75 | = DAG.getConstant((UINT64_C(1) << FractBits) - 1, SL, MVT::i64); |
1838 | 75 | |
1839 | 75 | SDValue Shr = DAG.getNode(ISD::SRA, SL, MVT::i64, FractMask, Exp); |
1840 | 75 | SDValue Not = DAG.getNOT(SL, Shr, MVT::i64); |
1841 | 75 | SDValue Tmp0 = DAG.getNode(ISD::AND, SL, MVT::i64, BcInt, Not); |
1842 | 75 | |
1843 | 75 | EVT SetCCVT = |
1844 | 75 | getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::i32); |
1845 | 75 | |
1846 | 75 | const SDValue FiftyOne = DAG.getConstant(FractBits - 1, SL, MVT::i32); |
1847 | 75 | |
1848 | 75 | SDValue ExpLt0 = DAG.getSetCC(SL, SetCCVT, Exp, Zero, ISD::SETLT); |
1849 | 75 | SDValue ExpGt51 = DAG.getSetCC(SL, SetCCVT, Exp, FiftyOne, ISD::SETGT); |
1850 | 75 | |
1851 | 75 | SDValue Tmp1 = DAG.getNode(ISD::SELECT, SL, MVT::i64, ExpLt0, SignBit64, Tmp0); |
1852 | 75 | SDValue Tmp2 = DAG.getNode(ISD::SELECT, SL, MVT::i64, ExpGt51, BcInt, Tmp1); |
1853 | 75 | |
1854 | 75 | return DAG.getNode(ISD::BITCAST, SL, MVT::f64, Tmp2); |
1855 | 75 | } |
1856 | | |
1857 | 14 | SDValue AMDGPUTargetLowering::LowerFRINT(SDValue Op, SelectionDAG &DAG) const { |
1858 | 14 | SDLoc SL(Op); |
1859 | 14 | SDValue Src = Op.getOperand(0); |
1860 | 14 | |
1861 | 14 | assert(Op.getValueType() == MVT::f64); |
1862 | 14 | |
1863 | 14 | APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52"); |
1864 | 14 | SDValue C1 = DAG.getConstantFP(C1Val, SL, MVT::f64); |
1865 | 14 | SDValue CopySign = DAG.getNode(ISD::FCOPYSIGN, SL, MVT::f64, C1, Src); |
1866 | 14 | |
1867 | 14 | // TODO: Should this propagate fast-math-flags? |
1868 | 14 | |
1869 | 14 | SDValue Tmp1 = DAG.getNode(ISD::FADD, SL, MVT::f64, Src, CopySign); |
1870 | 14 | SDValue Tmp2 = DAG.getNode(ISD::FSUB, SL, MVT::f64, Tmp1, CopySign); |
1871 | 14 | |
1872 | 14 | SDValue Fabs = DAG.getNode(ISD::FABS, SL, MVT::f64, Src); |
1873 | 14 | |
1874 | 14 | APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51"); |
1875 | 14 | SDValue C2 = DAG.getConstantFP(C2Val, SL, MVT::f64); |
1876 | 14 | |
1877 | 14 | EVT SetCCVT = |
1878 | 14 | getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f64); |
1879 | 14 | SDValue Cond = DAG.getSetCC(SL, SetCCVT, Fabs, C2, ISD::SETOGT); |
1880 | 14 | |
1881 | 14 | return DAG.getSelect(SL, MVT::f64, Cond, Src, Tmp2); |
1882 | 14 | } |
1883 | | |
1884 | 44 | SDValue AMDGPUTargetLowering::LowerFNEARBYINT(SDValue Op, SelectionDAG &DAG) const { |
1885 | 44 | // FNEARBYINT and FRINT are the same, except in their handling of FP |
1886 | 44 | // exceptions. Those aren't really meaningful for us, and OpenCL only has |
1887 | 44 | // rint, so just treat them as equivalent. |
1888 | 44 | return DAG.getNode(ISD::FRINT, SDLoc(Op), Op.getValueType(), Op.getOperand(0)); |
1889 | 44 | } |
1890 | | |
1891 | | // XXX - May require not supporting f32 denormals? |
1892 | | |
1893 | | // Don't handle v2f16. The extra instructions to scalarize and repack around the |
1894 | | // compare and vselect end up producing worse code than scalarizing the whole |
1895 | | // operation. |
1896 | 74 | SDValue AMDGPUTargetLowering::LowerFROUND32_16(SDValue Op, SelectionDAG &DAG) const { |
1897 | 74 | SDLoc SL(Op); |
1898 | 74 | SDValue X = Op.getOperand(0); |
1899 | 74 | EVT VT = Op.getValueType(); |
1900 | 74 | |
1901 | 74 | SDValue T = DAG.getNode(ISD::FTRUNC, SL, VT, X); |
1902 | 74 | |
1903 | 74 | // TODO: Should this propagate fast-math-flags? |
1904 | 74 | |
1905 | 74 | SDValue Diff = DAG.getNode(ISD::FSUB, SL, VT, X, T); |
1906 | 74 | |
1907 | 74 | SDValue AbsDiff = DAG.getNode(ISD::FABS, SL, VT, Diff); |
1908 | 74 | |
1909 | 74 | const SDValue Zero = DAG.getConstantFP(0.0, SL, VT); |
1910 | 74 | const SDValue One = DAG.getConstantFP(1.0, SL, VT); |
1911 | 74 | const SDValue Half = DAG.getConstantFP(0.5, SL, VT); |
1912 | 74 | |
1913 | 74 | SDValue SignOne = DAG.getNode(ISD::FCOPYSIGN, SL, VT, One, X); |
1914 | 74 | |
1915 | 74 | EVT SetCCVT = |
1916 | 74 | getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT); |
1917 | 74 | |
1918 | 74 | SDValue Cmp = DAG.getSetCC(SL, SetCCVT, AbsDiff, Half, ISD::SETOGE); |
1919 | 74 | |
1920 | 74 | SDValue Sel = DAG.getNode(ISD::SELECT, SL, VT, Cmp, SignOne, Zero); |
1921 | 74 | |
1922 | 74 | return DAG.getNode(ISD::FADD, SL, VT, T, Sel); |
1923 | 74 | } |
1924 | | |
1925 | 16 | SDValue AMDGPUTargetLowering::LowerFROUND64(SDValue Op, SelectionDAG &DAG) const { |
1926 | 16 | SDLoc SL(Op); |
1927 | 16 | SDValue X = Op.getOperand(0); |
1928 | 16 | |
1929 | 16 | SDValue L = DAG.getNode(ISD::BITCAST, SL, MVT::i64, X); |
1930 | 16 | |
1931 | 16 | const SDValue Zero = DAG.getConstant(0, SL, MVT::i32); |
1932 | 16 | const SDValue One = DAG.getConstant(1, SL, MVT::i32); |
1933 | 16 | const SDValue NegOne = DAG.getConstant(-1, SL, MVT::i32); |
1934 | 16 | const SDValue FiftyOne = DAG.getConstant(51, SL, MVT::i32); |
1935 | 16 | EVT SetCCVT = |
1936 | 16 | getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::i32); |
1937 | 16 | |
1938 | 16 | SDValue BC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, X); |
1939 | 16 | |
1940 | 16 | SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, BC, One); |
1941 | 16 | |
1942 | 16 | SDValue Exp = extractF64Exponent(Hi, SL, DAG); |
1943 | 16 | |
1944 | 16 | const SDValue Mask = DAG.getConstant(INT64_C(0x000fffffffffffff), SL, |
1945 | 16 | MVT::i64); |
1946 | 16 | |
1947 | 16 | SDValue M = DAG.getNode(ISD::SRA, SL, MVT::i64, Mask, Exp); |
1948 | 16 | SDValue D = DAG.getNode(ISD::SRA, SL, MVT::i64, |
1949 | 16 | DAG.getConstant(INT64_C(0x0008000000000000), SL, |
1950 | 16 | MVT::i64), |
1951 | 16 | Exp); |
1952 | 16 | |
1953 | 16 | SDValue Tmp0 = DAG.getNode(ISD::AND, SL, MVT::i64, L, M); |
1954 | 16 | SDValue Tmp1 = DAG.getSetCC(SL, SetCCVT, |
1955 | 16 | DAG.getConstant(0, SL, MVT::i64), Tmp0, |
1956 | 16 | ISD::SETNE); |
1957 | 16 | |
1958 | 16 | SDValue Tmp2 = DAG.getNode(ISD::SELECT, SL, MVT::i64, Tmp1, |
1959 | 16 | D, DAG.getConstant(0, SL, MVT::i64)); |
1960 | 16 | SDValue K = DAG.getNode(ISD::ADD, SL, MVT::i64, L, Tmp2); |
1961 | 16 | |
1962 | 16 | K = DAG.getNode(ISD::AND, SL, MVT::i64, K, DAG.getNOT(SL, M, MVT::i64)); |
1963 | 16 | K = DAG.getNode(ISD::BITCAST, SL, MVT::f64, K); |
1964 | 16 | |
1965 | 16 | SDValue ExpLt0 = DAG.getSetCC(SL, SetCCVT, Exp, Zero, ISD::SETLT); |
1966 | 16 | SDValue ExpGt51 = DAG.getSetCC(SL, SetCCVT, Exp, FiftyOne, ISD::SETGT); |
1967 | 16 | SDValue ExpEqNegOne = DAG.getSetCC(SL, SetCCVT, NegOne, Exp, ISD::SETEQ); |
1968 | 16 | |
1969 | 16 | SDValue Mag = DAG.getNode(ISD::SELECT, SL, MVT::f64, |
1970 | 16 | ExpEqNegOne, |
1971 | 16 | DAG.getConstantFP(1.0, SL, MVT::f64), |
1972 | 16 | DAG.getConstantFP(0.0, SL, MVT::f64)); |
1973 | 16 | |
1974 | 16 | SDValue S = DAG.getNode(ISD::FCOPYSIGN, SL, MVT::f64, Mag, X); |
1975 | 16 | |
1976 | 16 | K = DAG.getNode(ISD::SELECT, SL, MVT::f64, ExpLt0, S, K); |
1977 | 16 | K = DAG.getNode(ISD::SELECT, SL, MVT::f64, ExpGt51, X, K); |
1978 | 16 | |
1979 | 16 | return K; |
1980 | 16 | } |
1981 | | |
1982 | 90 | SDValue AMDGPUTargetLowering::LowerFROUND(SDValue Op, SelectionDAG &DAG) const { |
1983 | 90 | EVT VT = Op.getValueType(); |
1984 | 90 | |
1985 | 90 | if (VT == MVT::f32 || 90 VT == MVT::f1622 ) |
1986 | 74 | return LowerFROUND32_16(Op, DAG); |
1987 | 16 | |
1988 | 16 | if (16 VT == MVT::f6416 ) |
1989 | 16 | return LowerFROUND64(Op, DAG); |
1990 | 0 |
|
1991 | 0 | llvm_unreachable0 ("unhandled type"); |
1992 | 0 | } |
1993 | | |
1994 | 0 | SDValue AMDGPUTargetLowering::LowerFFLOOR(SDValue Op, SelectionDAG &DAG) const { |
1995 | 0 | SDLoc SL(Op); |
1996 | 0 | SDValue Src = Op.getOperand(0); |
1997 | 0 |
|
1998 | 0 | // result = trunc(src); |
1999 | 0 | // if (src < 0.0 && src != result) |
2000 | 0 | // result += -1.0. |
2001 | 0 |
|
2002 | 0 | SDValue Trunc = DAG.getNode(ISD::FTRUNC, SL, MVT::f64, Src); |
2003 | 0 |
|
2004 | 0 | const SDValue Zero = DAG.getConstantFP(0.0, SL, MVT::f64); |
2005 | 0 | const SDValue NegOne = DAG.getConstantFP(-1.0, SL, MVT::f64); |
2006 | 0 |
|
2007 | 0 | EVT SetCCVT = |
2008 | 0 | getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f64); |
2009 | 0 |
|
2010 | 0 | SDValue Lt0 = DAG.getSetCC(SL, SetCCVT, Src, Zero, ISD::SETOLT); |
2011 | 0 | SDValue NeTrunc = DAG.getSetCC(SL, SetCCVT, Src, Trunc, ISD::SETONE); |
2012 | 0 | SDValue And = DAG.getNode(ISD::AND, SL, SetCCVT, Lt0, NeTrunc); |
2013 | 0 |
|
2014 | 0 | SDValue Add = DAG.getNode(ISD::SELECT, SL, MVT::f64, And, NegOne, Zero); |
2015 | 0 | // TODO: Should this propagate fast-math-flags? |
2016 | 0 | return DAG.getNode(ISD::FADD, SL, MVT::f64, Trunc, Add); |
2017 | 0 | } |
2018 | | |
2019 | 342 | SDValue AMDGPUTargetLowering::LowerCTLZ(SDValue Op, SelectionDAG &DAG) const { |
2020 | 342 | SDLoc SL(Op); |
2021 | 342 | SDValue Src = Op.getOperand(0); |
2022 | 342 | bool ZeroUndef = Op.getOpcode() == ISD::CTLZ_ZERO_UNDEF; |
2023 | 342 | |
2024 | 342 | if (ZeroUndef && 342 Src.getValueType() == MVT::i32334 ) |
2025 | 260 | return DAG.getNode(AMDGPUISD::FFBH_U32, SL, MVT::i32, Src); |
2026 | 82 | |
2027 | 82 | SDValue Vec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Src); |
2028 | 82 | |
2029 | 82 | const SDValue Zero = DAG.getConstant(0, SL, MVT::i32); |
2030 | 82 | const SDValue One = DAG.getConstant(1, SL, MVT::i32); |
2031 | 82 | |
2032 | 82 | SDValue Lo = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, Zero); |
2033 | 82 | SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, One); |
2034 | 82 | |
2035 | 82 | EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), |
2036 | 82 | *DAG.getContext(), MVT::i32); |
2037 | 82 | |
2038 | 82 | SDValue Hi0 = DAG.getSetCC(SL, SetCCVT, Hi, Zero, ISD::SETEQ); |
2039 | 82 | |
2040 | 82 | SDValue CtlzLo = DAG.getNode(ISD::CTLZ_ZERO_UNDEF, SL, MVT::i32, Lo); |
2041 | 82 | SDValue CtlzHi = DAG.getNode(ISD::CTLZ_ZERO_UNDEF, SL, MVT::i32, Hi); |
2042 | 82 | |
2043 | 82 | const SDValue Bits32 = DAG.getConstant(32, SL, MVT::i32); |
2044 | 82 | SDValue Add = DAG.getNode(ISD::ADD, SL, MVT::i32, CtlzLo, Bits32); |
2045 | 82 | |
2046 | 82 | // ctlz(x) = hi_32(x) == 0 ? ctlz(lo_32(x)) + 32 : ctlz(hi_32(x)) |
2047 | 82 | SDValue NewCtlz = DAG.getNode(ISD::SELECT, SL, MVT::i32, Hi0, Add, CtlzHi); |
2048 | 82 | |
2049 | 82 | if (!ZeroUndef82 ) { |
2050 | 8 | // Test if the full 64-bit input is zero. |
2051 | 8 | |
2052 | 8 | // FIXME: DAG combines turn what should be an s_and_b64 into a v_or_b32, |
2053 | 8 | // which we probably don't want. |
2054 | 8 | SDValue Lo0 = DAG.getSetCC(SL, SetCCVT, Lo, Zero, ISD::SETEQ); |
2055 | 8 | SDValue SrcIsZero = DAG.getNode(ISD::AND, SL, SetCCVT, Lo0, Hi0); |
2056 | 8 | |
2057 | 8 | // TODO: If i64 setcc is half rate, it can result in 1 fewer instruction |
2058 | 8 | // with the same cycles, otherwise it is slower. |
2059 | 8 | // SDValue SrcIsZero = DAG.getSetCC(SL, SetCCVT, Src, |
2060 | 8 | // DAG.getConstant(0, SL, MVT::i64), ISD::SETEQ); |
2061 | 8 | |
2062 | 8 | const SDValue Bits32 = DAG.getConstant(64, SL, MVT::i32); |
2063 | 8 | |
2064 | 8 | // The instruction returns -1 for 0 input, but the defined intrinsic |
2065 | 8 | // behavior is to return the number of bits. |
2066 | 8 | NewCtlz = DAG.getNode(ISD::SELECT, SL, MVT::i32, |
2067 | 8 | SrcIsZero, Bits32, NewCtlz); |
2068 | 8 | } |
2069 | 342 | |
2070 | 342 | return DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i64, NewCtlz); |
2071 | 342 | } |
2072 | | |
2073 | | SDValue AMDGPUTargetLowering::LowerINT_TO_FP32(SDValue Op, SelectionDAG &DAG, |
2074 | 67 | bool Signed) const { |
2075 | 67 | // Unsigned |
2076 | 67 | // cul2f(ulong u) |
2077 | 67 | //{ |
2078 | 67 | // uint lz = clz(u); |
2079 | 67 | // uint e = (u != 0) ? 127U + 63U - lz : 0; |
2080 | 67 | // u = (u << lz) & 0x7fffffffffffffffUL; |
2081 | 67 | // ulong t = u & 0xffffffffffUL; |
2082 | 67 | // uint v = (e << 23) | (uint)(u >> 40); |
2083 | 67 | // uint r = t > 0x8000000000UL ? 1U : (t == 0x8000000000UL ? v & 1U : 0U); |
2084 | 67 | // return as_float(v + r); |
2085 | 67 | //} |
2086 | 67 | // Signed |
2087 | 67 | // cl2f(long l) |
2088 | 67 | //{ |
2089 | 67 | // long s = l >> 63; |
2090 | 67 | // float r = cul2f((l + s) ^ s); |
2091 | 67 | // return s ? -r : r; |
2092 | 67 | //} |
2093 | 67 | |
2094 | 67 | SDLoc SL(Op); |
2095 | 67 | SDValue Src = Op.getOperand(0); |
2096 | 67 | SDValue L = Src; |
2097 | 67 | |
2098 | 67 | SDValue S; |
2099 | 67 | if (Signed67 ) { |
2100 | 32 | const SDValue SignBit = DAG.getConstant(63, SL, MVT::i64); |
2101 | 32 | S = DAG.getNode(ISD::SRA, SL, MVT::i64, L, SignBit); |
2102 | 32 | |
2103 | 32 | SDValue LPlusS = DAG.getNode(ISD::ADD, SL, MVT::i64, L, S); |
2104 | 32 | L = DAG.getNode(ISD::XOR, SL, MVT::i64, LPlusS, S); |
2105 | 32 | } |
2106 | 67 | |
2107 | 67 | EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), |
2108 | 67 | *DAG.getContext(), MVT::f32); |
2109 | 67 | |
2110 | 67 | |
2111 | 67 | SDValue ZeroI32 = DAG.getConstant(0, SL, MVT::i32); |
2112 | 67 | SDValue ZeroI64 = DAG.getConstant(0, SL, MVT::i64); |
2113 | 67 | SDValue LZ = DAG.getNode(ISD::CTLZ_ZERO_UNDEF, SL, MVT::i64, L); |
2114 | 67 | LZ = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, LZ); |
2115 | 67 | |
2116 | 67 | SDValue K = DAG.getConstant(127U + 63U, SL, MVT::i32); |
2117 | 67 | SDValue E = DAG.getSelect(SL, MVT::i32, |
2118 | 67 | DAG.getSetCC(SL, SetCCVT, L, ZeroI64, ISD::SETNE), |
2119 | 67 | DAG.getNode(ISD::SUB, SL, MVT::i32, K, LZ), |
2120 | 67 | ZeroI32); |
2121 | 67 | |
2122 | 67 | SDValue U = DAG.getNode(ISD::AND, SL, MVT::i64, |
2123 | 67 | DAG.getNode(ISD::SHL, SL, MVT::i64, L, LZ), |
2124 | 67 | DAG.getConstant((-1ULL) >> 1, SL, MVT::i64)); |
2125 | 67 | |
2126 | 67 | SDValue T = DAG.getNode(ISD::AND, SL, MVT::i64, U, |
2127 | 67 | DAG.getConstant(0xffffffffffULL, SL, MVT::i64)); |
2128 | 67 | |
2129 | 67 | SDValue UShl = DAG.getNode(ISD::SRL, SL, MVT::i64, |
2130 | 67 | U, DAG.getConstant(40, SL, MVT::i64)); |
2131 | 67 | |
2132 | 67 | SDValue V = DAG.getNode(ISD::OR, SL, MVT::i32, |
2133 | 67 | DAG.getNode(ISD::SHL, SL, MVT::i32, E, DAG.getConstant(23, SL, MVT::i32)), |
2134 | 67 | DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, UShl)); |
2135 | 67 | |
2136 | 67 | SDValue C = DAG.getConstant(0x8000000000ULL, SL, MVT::i64); |
2137 | 67 | SDValue RCmp = DAG.getSetCC(SL, SetCCVT, T, C, ISD::SETUGT); |
2138 | 67 | SDValue TCmp = DAG.getSetCC(SL, SetCCVT, T, C, ISD::SETEQ); |
2139 | 67 | |
2140 | 67 | SDValue One = DAG.getConstant(1, SL, MVT::i32); |
2141 | 67 | |
2142 | 67 | SDValue VTrunc1 = DAG.getNode(ISD::AND, SL, MVT::i32, V, One); |
2143 | 67 | |
2144 | 67 | SDValue R = DAG.getSelect(SL, MVT::i32, |
2145 | 67 | RCmp, |
2146 | 67 | One, |
2147 | 67 | DAG.getSelect(SL, MVT::i32, TCmp, VTrunc1, ZeroI32)); |
2148 | 67 | R = DAG.getNode(ISD::ADD, SL, MVT::i32, V, R); |
2149 | 67 | R = DAG.getNode(ISD::BITCAST, SL, MVT::f32, R); |
2150 | 67 | |
2151 | 67 | if (!Signed) |
2152 | 35 | return R; |
2153 | 32 | |
2154 | 32 | SDValue RNeg = DAG.getNode(ISD::FNEG, SL, MVT::f32, R); |
2155 | 32 | return DAG.getSelect(SL, MVT::f32, DAG.getSExtOrTrunc(S, SL, SetCCVT), RNeg, R); |
2156 | 32 | } |
2157 | | |
2158 | | SDValue AMDGPUTargetLowering::LowerINT_TO_FP64(SDValue Op, SelectionDAG &DAG, |
2159 | 10 | bool Signed) const { |
2160 | 10 | SDLoc SL(Op); |
2161 | 10 | SDValue Src = Op.getOperand(0); |
2162 | 10 | |
2163 | 10 | SDValue BC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Src); |
2164 | 10 | |
2165 | 10 | SDValue Lo = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, BC, |
2166 | 10 | DAG.getConstant(0, SL, MVT::i32)); |
2167 | 10 | SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, BC, |
2168 | 10 | DAG.getConstant(1, SL, MVT::i32)); |
2169 | 10 | |
2170 | 10 | SDValue CvtHi = DAG.getNode(Signed ? ISD::SINT_TO_FP2 : ISD::UINT_TO_FP8 , |
2171 | 10 | SL, MVT::f64, Hi); |
2172 | 10 | |
2173 | 10 | SDValue CvtLo = DAG.getNode(ISD::UINT_TO_FP, SL, MVT::f64, Lo); |
2174 | 10 | |
2175 | 10 | SDValue LdExp = DAG.getNode(AMDGPUISD::LDEXP, SL, MVT::f64, CvtHi, |
2176 | 10 | DAG.getConstant(32, SL, MVT::i32)); |
2177 | 10 | // TODO: Should this propagate fast-math-flags? |
2178 | 10 | return DAG.getNode(ISD::FADD, SL, MVT::f64, LdExp, CvtLo); |
2179 | 10 | } |
2180 | | |
2181 | | SDValue AMDGPUTargetLowering::LowerUINT_TO_FP(SDValue Op, |
2182 | 51 | SelectionDAG &DAG) const { |
2183 | 51 | assert(Op.getOperand(0).getValueType() == MVT::i64 && |
2184 | 51 | "operation should be legal"); |
2185 | 51 | |
2186 | 51 | // TODO: Factor out code common with LowerSINT_TO_FP. |
2187 | 51 | |
2188 | 51 | EVT DestVT = Op.getValueType(); |
2189 | 51 | if (Subtarget->has16BitInsts() && 51 DestVT == MVT::f1625 ) { |
2190 | 8 | SDLoc DL(Op); |
2191 | 8 | SDValue Src = Op.getOperand(0); |
2192 | 8 | |
2193 | 8 | SDValue IntToFp32 = DAG.getNode(Op.getOpcode(), DL, MVT::f32, Src); |
2194 | 8 | SDValue FPRoundFlag = DAG.getIntPtrConstant(0, SDLoc(Op)); |
2195 | 8 | SDValue FPRound = |
2196 | 8 | DAG.getNode(ISD::FP_ROUND, DL, MVT::f16, IntToFp32, FPRoundFlag); |
2197 | 8 | |
2198 | 8 | return FPRound; |
2199 | 8 | } |
2200 | 43 | |
2201 | 43 | if (43 DestVT == MVT::f3243 ) |
2202 | 35 | return LowerINT_TO_FP32(Op, DAG, false); |
2203 | 8 | |
2204 | 43 | assert(DestVT == MVT::f64); |
2205 | 8 | return LowerINT_TO_FP64(Op, DAG, false); |
2206 | 8 | } |
2207 | | |
2208 | | SDValue AMDGPUTargetLowering::LowerSINT_TO_FP(SDValue Op, |
2209 | 42 | SelectionDAG &DAG) const { |
2210 | 42 | assert(Op.getOperand(0).getValueType() == MVT::i64 && |
2211 | 42 | "operation should be legal"); |
2212 | 42 | |
2213 | 42 | // TODO: Factor out code common with LowerUINT_TO_FP. |
2214 | 42 | |
2215 | 42 | EVT DestVT = Op.getValueType(); |
2216 | 42 | if (Subtarget->has16BitInsts() && 42 DestVT == MVT::f1624 ) { |
2217 | 8 | SDLoc DL(Op); |
2218 | 8 | SDValue Src = Op.getOperand(0); |
2219 | 8 | |
2220 | 8 | SDValue IntToFp32 = DAG.getNode(Op.getOpcode(), DL, MVT::f32, Src); |
2221 | 8 | SDValue FPRoundFlag = DAG.getIntPtrConstant(0, SDLoc(Op)); |
2222 | 8 | SDValue FPRound = |
2223 | 8 | DAG.getNode(ISD::FP_ROUND, DL, MVT::f16, IntToFp32, FPRoundFlag); |
2224 | 8 | |
2225 | 8 | return FPRound; |
2226 | 8 | } |
2227 | 34 | |
2228 | 34 | if (34 DestVT == MVT::f3234 ) |
2229 | 32 | return LowerINT_TO_FP32(Op, DAG, true); |
2230 | 2 | |
2231 | 34 | assert(DestVT == MVT::f64); |
2232 | 2 | return LowerINT_TO_FP64(Op, DAG, true); |
2233 | 2 | } |
2234 | | |
2235 | | SDValue AMDGPUTargetLowering::LowerFP64_TO_INT(SDValue Op, SelectionDAG &DAG, |
2236 | 16 | bool Signed) const { |
2237 | 16 | SDLoc SL(Op); |
2238 | 16 | |
2239 | 16 | SDValue Src = Op.getOperand(0); |
2240 | 16 | |
2241 | 16 | SDValue Trunc = DAG.getNode(ISD::FTRUNC, SL, MVT::f64, Src); |
2242 | 16 | |
2243 | 16 | SDValue K0 = DAG.getConstantFP(BitsToDouble(UINT64_C(0x3df0000000000000)), SL, |
2244 | 16 | MVT::f64); |
2245 | 16 | SDValue K1 = DAG.getConstantFP(BitsToDouble(UINT64_C(0xc1f0000000000000)), SL, |
2246 | 16 | MVT::f64); |
2247 | 16 | // TODO: Should this propagate fast-math-flags? |
2248 | 16 | SDValue Mul = DAG.getNode(ISD::FMUL, SL, MVT::f64, Trunc, K0); |
2249 | 16 | |
2250 | 16 | SDValue FloorMul = DAG.getNode(ISD::FFLOOR, SL, MVT::f64, Mul); |
2251 | 16 | |
2252 | 16 | |
2253 | 16 | SDValue Fma = DAG.getNode(ISD::FMA, SL, MVT::f64, FloorMul, K1, Trunc); |
2254 | 16 | |
2255 | 16 | SDValue Hi = DAG.getNode(Signed ? ISD::FP_TO_SINT2 : ISD::FP_TO_UINT14 , SL, |
2256 | 16 | MVT::i32, FloorMul); |
2257 | 16 | SDValue Lo = DAG.getNode(ISD::FP_TO_UINT, SL, MVT::i32, Fma); |
2258 | 16 | |
2259 | 16 | SDValue Result = DAG.getBuildVector(MVT::v2i32, SL, {Lo, Hi}); |
2260 | 16 | |
2261 | 16 | return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Result); |
2262 | 16 | } |
2263 | | |
2264 | 698 | SDValue AMDGPUTargetLowering::LowerFP_TO_FP16(SDValue Op, SelectionDAG &DAG) const { |
2265 | 698 | SDLoc DL(Op); |
2266 | 698 | SDValue N0 = Op.getOperand(0); |
2267 | 698 | |
2268 | 698 | // Convert to target node to get known bits |
2269 | 698 | if (N0.getValueType() == MVT::f32) |
2270 | 649 | return DAG.getNode(AMDGPUISD::FP_TO_FP16, DL, Op.getValueType(), N0); |
2271 | 49 | |
2272 | 49 | if (49 getTargetMachine().Options.UnsafeFPMath49 ) { |
2273 | 10 | // There is a generic expand for FP_TO_FP16 with unsafe fast math. |
2274 | 10 | return SDValue(); |
2275 | 10 | } |
2276 | 39 | |
2277 | 49 | assert(N0.getSimpleValueType() == MVT::f64); |
2278 | 39 | |
2279 | 39 | // f64 -> f16 conversion using round-to-nearest-even rounding mode. |
2280 | 39 | const unsigned ExpMask = 0x7ff; |
2281 | 39 | const unsigned ExpBiasf64 = 1023; |
2282 | 39 | const unsigned ExpBiasf16 = 15; |
2283 | 39 | SDValue Zero = DAG.getConstant(0, DL, MVT::i32); |
2284 | 39 | SDValue One = DAG.getConstant(1, DL, MVT::i32); |
2285 | 39 | SDValue U = DAG.getNode(ISD::BITCAST, DL, MVT::i64, N0); |
2286 | 39 | SDValue UH = DAG.getNode(ISD::SRL, DL, MVT::i64, U, |
2287 | 39 | DAG.getConstant(32, DL, MVT::i64)); |
2288 | 39 | UH = DAG.getZExtOrTrunc(UH, DL, MVT::i32); |
2289 | 39 | U = DAG.getZExtOrTrunc(U, DL, MVT::i32); |
2290 | 39 | SDValue E = DAG.getNode(ISD::SRL, DL, MVT::i32, UH, |
2291 | 39 | DAG.getConstant(20, DL, MVT::i64)); |
2292 | 39 | E = DAG.getNode(ISD::AND, DL, MVT::i32, E, |
2293 | 39 | DAG.getConstant(ExpMask, DL, MVT::i32)); |
2294 | 39 | // Subtract the fp64 exponent bias (1023) to get the real exponent and |
2295 | 39 | // add the f16 bias (15) to get the biased exponent for the f16 format. |
2296 | 39 | E = DAG.getNode(ISD::ADD, DL, MVT::i32, E, |
2297 | 39 | DAG.getConstant(-ExpBiasf64 + ExpBiasf16, DL, MVT::i32)); |
2298 | 39 | |
2299 | 39 | SDValue M = DAG.getNode(ISD::SRL, DL, MVT::i32, UH, |
2300 | 39 | DAG.getConstant(8, DL, MVT::i32)); |
2301 | 39 | M = DAG.getNode(ISD::AND, DL, MVT::i32, M, |
2302 | 39 | DAG.getConstant(0xffe, DL, MVT::i32)); |
2303 | 39 | |
2304 | 39 | SDValue MaskedSig = DAG.getNode(ISD::AND, DL, MVT::i32, UH, |
2305 | 39 | DAG.getConstant(0x1ff, DL, MVT::i32)); |
2306 | 39 | MaskedSig = DAG.getNode(ISD::OR, DL, MVT::i32, MaskedSig, U); |
2307 | 39 | |
2308 | 39 | SDValue Lo40Set = DAG.getSelectCC(DL, MaskedSig, Zero, Zero, One, ISD::SETEQ); |
2309 | 39 | M = DAG.getNode(ISD::OR, DL, MVT::i32, M, Lo40Set); |
2310 | 39 | |
2311 | 39 | // (M != 0 ? 0x0200 : 0) | 0x7c00; |
2312 | 39 | SDValue I = DAG.getNode(ISD::OR, DL, MVT::i32, |
2313 | 39 | DAG.getSelectCC(DL, M, Zero, DAG.getConstant(0x0200, DL, MVT::i32), |
2314 | 39 | Zero, ISD::SETNE), DAG.getConstant(0x7c00, DL, MVT::i32)); |
2315 | 39 | |
2316 | 39 | // N = M | (E << 12); |
2317 | 39 | SDValue N = DAG.getNode(ISD::OR, DL, MVT::i32, M, |
2318 | 39 | DAG.getNode(ISD::SHL, DL, MVT::i32, E, |
2319 | 39 | DAG.getConstant(12, DL, MVT::i32))); |
2320 | 39 | |
2321 | 39 | // B = clamp(1-E, 0, 13); |
2322 | 39 | SDValue OneSubExp = DAG.getNode(ISD::SUB, DL, MVT::i32, |
2323 | 39 | One, E); |
2324 | 39 | SDValue B = DAG.getNode(ISD::SMAX, DL, MVT::i32, OneSubExp, Zero); |
2325 | 39 | B = DAG.getNode(ISD::SMIN, DL, MVT::i32, B, |
2326 | 39 | DAG.getConstant(13, DL, MVT::i32)); |
2327 | 39 | |
2328 | 39 | SDValue SigSetHigh = DAG.getNode(ISD::OR, DL, MVT::i32, M, |
2329 | 39 | DAG.getConstant(0x1000, DL, MVT::i32)); |
2330 | 39 | |
2331 | 39 | SDValue D = DAG.getNode(ISD::SRL, DL, MVT::i32, SigSetHigh, B); |
2332 | 39 | SDValue D0 = DAG.getNode(ISD::SHL, DL, MVT::i32, D, B); |
2333 | 39 | SDValue D1 = DAG.getSelectCC(DL, D0, SigSetHigh, One, Zero, ISD::SETNE); |
2334 | 39 | D = DAG.getNode(ISD::OR, DL, MVT::i32, D, D1); |
2335 | 39 | |
2336 | 39 | SDValue V = DAG.getSelectCC(DL, E, One, D, N, ISD::SETLT); |
2337 | 39 | SDValue VLow3 = DAG.getNode(ISD::AND, DL, MVT::i32, V, |
2338 | 39 | DAG.getConstant(0x7, DL, MVT::i32)); |
2339 | 39 | V = DAG.getNode(ISD::SRL, DL, MVT::i32, V, |
2340 | 39 | DAG.getConstant(2, DL, MVT::i32)); |
2341 | 39 | SDValue V0 = DAG.getSelectCC(DL, VLow3, DAG.getConstant(3, DL, MVT::i32), |
2342 | 39 | One, Zero, ISD::SETEQ); |
2343 | 39 | SDValue V1 = DAG.getSelectCC(DL, VLow3, DAG.getConstant(5, DL, MVT::i32), |
2344 | 39 | One, Zero, ISD::SETGT); |
2345 | 39 | V1 = DAG.getNode(ISD::OR, DL, MVT::i32, V0, V1); |
2346 | 39 | V = DAG.getNode(ISD::ADD, DL, MVT::i32, V, V1); |
2347 | 39 | |
2348 | 39 | V = DAG.getSelectCC(DL, E, DAG.getConstant(30, DL, MVT::i32), |
2349 | 39 | DAG.getConstant(0x7c00, DL, MVT::i32), V, ISD::SETGT); |
2350 | 39 | V = DAG.getSelectCC(DL, E, DAG.getConstant(1039, DL, MVT::i32), |
2351 | 39 | I, V, ISD::SETEQ); |
2352 | 39 | |
2353 | 39 | // Extract the sign bit. |
2354 | 39 | SDValue Sign = DAG.getNode(ISD::SRL, DL, MVT::i32, UH, |
2355 | 39 | DAG.getConstant(16, DL, MVT::i32)); |
2356 | 39 | Sign = DAG.getNode(ISD::AND, DL, MVT::i32, Sign, |
2357 | 39 | DAG.getConstant(0x8000, DL, MVT::i32)); |
2358 | 39 | |
2359 | 39 | V = DAG.getNode(ISD::OR, DL, MVT::i32, Sign, V); |
2360 | 39 | return DAG.getZExtOrTrunc(V, DL, Op.getValueType()); |
2361 | 39 | } |
2362 | | |
2363 | | SDValue AMDGPUTargetLowering::LowerFP_TO_SINT(SDValue Op, |
2364 | 65 | SelectionDAG &DAG) const { |
2365 | 65 | SDValue Src = Op.getOperand(0); |
2366 | 65 | |
2367 | 65 | // TODO: Factor out code common with LowerFP_TO_UINT. |
2368 | 65 | |
2369 | 65 | EVT SrcVT = Src.getValueType(); |
2370 | 65 | if (Subtarget->has16BitInsts() && 65 SrcVT == MVT::f1633 ) { |
2371 | 3 | SDLoc DL(Op); |
2372 | 3 | |
2373 | 3 | SDValue FPExtend = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, Src); |
2374 | 3 | SDValue FpToInt32 = |
2375 | 3 | DAG.getNode(Op.getOpcode(), DL, MVT::i64, FPExtend); |
2376 | 3 | |
2377 | 3 | return FpToInt32; |
2378 | 3 | } |
2379 | 62 | |
2380 | 62 | if (62 Op.getValueType() == MVT::i64 && 62 Src.getValueType() == MVT::f6462 ) |
2381 | 2 | return LowerFP64_TO_INT(Op, DAG, true); |
2382 | 60 | |
2383 | 60 | return SDValue(); |
2384 | 60 | } |
2385 | | |
2386 | | SDValue AMDGPUTargetLowering::LowerFP_TO_UINT(SDValue Op, |
2387 | 37 | SelectionDAG &DAG) const { |
2388 | 37 | SDValue Src = Op.getOperand(0); |
2389 | 37 | |
2390 | 37 | // TODO: Factor out code common with LowerFP_TO_SINT. |
2391 | 37 | |
2392 | 37 | EVT SrcVT = Src.getValueType(); |
2393 | 37 | if (Subtarget->has16BitInsts() && 37 SrcVT == MVT::f1613 ) { |
2394 | 3 | SDLoc DL(Op); |
2395 | 3 | |
2396 | 3 | SDValue FPExtend = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, Src); |
2397 | 3 | SDValue FpToInt32 = |
2398 | 3 | DAG.getNode(Op.getOpcode(), DL, MVT::i64, FPExtend); |
2399 | 3 | |
2400 | 3 | return FpToInt32; |
2401 | 3 | } |
2402 | 34 | |
2403 | 34 | if (34 Op.getValueType() == MVT::i64 && 34 Src.getValueType() == MVT::f6434 ) |
2404 | 14 | return LowerFP64_TO_INT(Op, DAG, false); |
2405 | 20 | |
2406 | 20 | return SDValue(); |
2407 | 20 | } |
2408 | | |
2409 | | SDValue AMDGPUTargetLowering::LowerSIGN_EXTEND_INREG(SDValue Op, |
2410 | 30 | SelectionDAG &DAG) const { |
2411 | 30 | EVT ExtraVT = cast<VTSDNode>(Op.getOperand(1))->getVT(); |
2412 | 30 | MVT VT = Op.getSimpleValueType(); |
2413 | 30 | MVT ScalarVT = VT.getScalarType(); |
2414 | 30 | |
2415 | 30 | assert(VT.isVector()); |
2416 | 30 | |
2417 | 30 | SDValue Src = Op.getOperand(0); |
2418 | 30 | SDLoc DL(Op); |
2419 | 30 | |
2420 | 30 | // TODO: Don't scalarize on Evergreen? |
2421 | 30 | unsigned NElts = VT.getVectorNumElements(); |
2422 | 30 | SmallVector<SDValue, 8> Args; |
2423 | 30 | DAG.ExtractVectorElements(Src, Args, 0, NElts); |
2424 | 30 | |
2425 | 30 | SDValue VTOp = DAG.getValueType(ExtraVT.getScalarType()); |
2426 | 122 | for (unsigned I = 0; I < NElts122 ; ++I92 ) |
2427 | 92 | Args[I] = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, ScalarVT, Args[I], VTOp); |
2428 | 30 | |
2429 | 30 | return DAG.getBuildVector(VT, DL, Args); |
2430 | 30 | } |
2431 | | |
2432 | | //===----------------------------------------------------------------------===// |
2433 | | // Custom DAG optimizations |
2434 | | //===----------------------------------------------------------------------===// |
2435 | | |
2436 | 3.14k | static bool isU24(SDValue Op, SelectionDAG &DAG) { |
2437 | 3.14k | KnownBits Known; |
2438 | 3.14k | EVT VT = Op.getValueType(); |
2439 | 3.14k | DAG.computeKnownBits(Op, Known); |
2440 | 3.14k | |
2441 | 3.14k | return (VT.getSizeInBits() - Known.countMinLeadingZeros()) <= 24; |
2442 | 3.14k | } |
2443 | | |
2444 | 1.18k | static bool isI24(SDValue Op, SelectionDAG &DAG) { |
2445 | 1.18k | EVT VT = Op.getValueType(); |
2446 | 1.18k | |
2447 | 1.18k | // In order for this to be a signed 24-bit value, bit 23, must |
2448 | 1.18k | // be a sign bit. |
2449 | 1.18k | return VT.getSizeInBits() >= 24 && // Types less than 24-bit should be treated |
2450 | 1.18k | // as unsigned 24-bit values. |
2451 | 1.18k | (VT.getSizeInBits() - DAG.ComputeNumSignBits(Op)) < 24; |
2452 | 1.18k | } |
2453 | | |
2454 | | static bool simplifyI24(SDNode *Node24, unsigned OpIdx, |
2455 | 2.59k | TargetLowering::DAGCombinerInfo &DCI) { |
2456 | 2.59k | |
2457 | 2.59k | SelectionDAG &DAG = DCI.DAG; |
2458 | 2.59k | SDValue Op = Node24->getOperand(OpIdx); |
2459 | 2.59k | const TargetLowering &TLI = DAG.getTargetLoweringInfo(); |
2460 | 2.59k | EVT VT = Op.getValueType(); |
2461 | 2.59k | |
2462 | 2.59k | APInt Demanded = APInt::getLowBitsSet(VT.getSizeInBits(), 24); |
2463 | 2.59k | APInt KnownZero, KnownOne; |
2464 | 2.59k | TargetLowering::TargetLoweringOpt TLO(DAG, true, true); |
2465 | 2.59k | if (TLI.SimplifyDemandedBits(Node24, OpIdx, Demanded, DCI, TLO)) |
2466 | 171 | return true; |
2467 | 2.42k | |
2468 | 2.42k | return false; |
2469 | 2.42k | } |
2470 | | |
2471 | | template <typename IntTy> |
2472 | | static SDValue constantFoldBFE(SelectionDAG &DAG, IntTy Src0, uint32_t Offset, |
2473 | 48 | uint32_t Width, const SDLoc &DL) { |
2474 | 48 | if (Width + Offset < 3248 ) { |
2475 | 20 | uint32_t Shl = static_cast<uint32_t>(Src0) << (32 - Offset - Width); |
2476 | 20 | IntTy Result = static_cast<IntTy>(Shl) >> (32 - Width); |
2477 | 20 | return DAG.getConstant(Result, DL, MVT::i32); |
2478 | 20 | } |
2479 | 28 | |
2480 | 28 | return DAG.getConstant(Src0 >> Offset, DL, MVT::i32); |
2481 | 28 | } AMDGPUISelLowering.cpp:llvm::SDValue constantFoldBFE<int>(llvm::SelectionDAG&, int, unsigned int, unsigned int, llvm::SDLoc const&) Line | Count | Source | 2473 | 24 | uint32_t Width, const SDLoc &DL) { | 2474 | 24 | if (Width + Offset < 3224 ) { | 2475 | 10 | uint32_t Shl = static_cast<uint32_t>(Src0) << (32 - Offset - Width); | 2476 | 10 | IntTy Result = static_cast<IntTy>(Shl) >> (32 - Width); | 2477 | 10 | return DAG.getConstant(Result, DL, MVT::i32); | 2478 | 10 | } | 2479 | 14 | | 2480 | 14 | return DAG.getConstant(Src0 >> Offset, DL, MVT::i32); | 2481 | 14 | } |
AMDGPUISelLowering.cpp:llvm::SDValue constantFoldBFE<unsigned int>(llvm::SelectionDAG&, unsigned int, unsigned int, unsigned int, llvm::SDLoc const&) Line | Count | Source | 2473 | 24 | uint32_t Width, const SDLoc &DL) { | 2474 | 24 | if (Width + Offset < 3224 ) { | 2475 | 10 | uint32_t Shl = static_cast<uint32_t>(Src0) << (32 - Offset - Width); | 2476 | 10 | IntTy Result = static_cast<IntTy>(Shl) >> (32 - Width); | 2477 | 10 | return DAG.getConstant(Result, DL, MVT::i32); | 2478 | 10 | } | 2479 | 14 | | 2480 | 14 | return DAG.getConstant(Src0 >> Offset, DL, MVT::i32); | 2481 | 14 | } |
|
2482 | | |
2483 | 81.4k | static bool hasVolatileUser(SDNode *Val) { |
2484 | 170k | for (SDNode *U : Val->uses()) { |
2485 | 170k | if (MemSDNode *M170k = dyn_cast<MemSDNode>(U)) { |
2486 | 43.3k | if (M->isVolatile()) |
2487 | 3.59k | return true; |
2488 | 77.9k | } |
2489 | 170k | } |
2490 | 77.9k | |
2491 | 77.9k | return false; |
2492 | 77.9k | } |
2493 | | |
2494 | 118k | bool AMDGPUTargetLowering::shouldCombineMemoryType(EVT VT) const { |
2495 | 118k | // i32 vectors are the canonical memory type. |
2496 | 118k | if (VT.getScalarType() == MVT::i32 || 118k isTypeLegal(VT)70.1k ) |
2497 | 106k | return false; |
2498 | 11.6k | |
2499 | 11.6k | if (11.6k !VT.isByteSized()11.6k ) |
2500 | 600 | return false; |
2501 | 11.0k | |
2502 | 11.0k | unsigned Size = VT.getStoreSize(); |
2503 | 11.0k | |
2504 | 11.0k | if ((Size == 1 || 11.0k Size == 28.26k || Size == 44.32k ) && !VT.isVector()8.11k ) |
2505 | 6.35k | return false; |
2506 | 4.69k | |
2507 | 4.69k | if (4.69k Size == 3 || 4.69k (Size > 4 && 4.46k (Size % 4 != 0)2.69k )) |
2508 | 558 | return false; |
2509 | 4.13k | |
2510 | 4.13k | return true; |
2511 | 4.13k | } |
2512 | | |
2513 | | // Replace load of an illegal type with a store of a bitcast to a friendlier |
2514 | | // type. |
2515 | | SDValue AMDGPUTargetLowering::performLoadCombine(SDNode *N, |
2516 | 130k | DAGCombinerInfo &DCI) const { |
2517 | 130k | if (!DCI.isBeforeLegalize()) |
2518 | 33.5k | return SDValue(); |
2519 | 97.2k | |
2520 | 97.2k | LoadSDNode *LN = cast<LoadSDNode>(N); |
2521 | 97.2k | if (LN->isVolatile() || 97.2k !ISD::isNormalLoad(LN)92.1k || hasVolatileUser(LN)81.4k ) |
2522 | 19.3k | return SDValue(); |
2523 | 77.9k | |
2524 | 77.9k | SDLoc SL(N); |
2525 | 77.9k | SelectionDAG &DAG = DCI.DAG; |
2526 | 77.9k | EVT VT = LN->getMemoryVT(); |
2527 | 77.9k | |
2528 | 77.9k | unsigned Size = VT.getStoreSize(); |
2529 | 77.9k | unsigned Align = LN->getAlignment(); |
2530 | 77.9k | if (Align < Size && 77.9k isTypeLegal(VT)4.32k ) { |
2531 | 4.05k | bool IsFast; |
2532 | 4.05k | unsigned AS = LN->getAddressSpace(); |
2533 | 4.05k | |
2534 | 4.05k | // Expand unaligned loads earlier than legalization. Due to visitation order |
2535 | 4.05k | // problems during legalization, the emitted instructions to pack and unpack |
2536 | 4.05k | // the bytes again are not eliminated in the case of an unaligned copy. |
2537 | 4.05k | if (!allowsMisalignedMemoryAccesses(VT, AS, Align, &IsFast)4.05k ) { |
2538 | 286 | if (VT.isVector()) |
2539 | 55 | return scalarizeVectorLoad(LN, DAG); |
2540 | 231 | |
2541 | 231 | SDValue Ops[2]; |
2542 | 231 | std::tie(Ops[0], Ops[1]) = expandUnalignedLoad(LN, DAG); |
2543 | 231 | return DAG.getMergeValues(Ops, SDLoc(N)); |
2544 | 231 | } |
2545 | 3.77k | |
2546 | 3.77k | if (3.77k !IsFast3.77k ) |
2547 | 20 | return SDValue(); |
2548 | 77.5k | } |
2549 | 77.5k | |
2550 | 77.5k | if (77.5k !shouldCombineMemoryType(VT)77.5k ) |
2551 | 75.7k | return SDValue(); |
2552 | 1.89k | |
2553 | 1.89k | EVT NewVT = getEquivalentMemType(*DAG.getContext(), VT); |
2554 | 1.89k | |
2555 | 1.89k | SDValue NewLoad |
2556 | 1.89k | = DAG.getLoad(NewVT, SL, LN->getChain(), |
2557 | 1.89k | LN->getBasePtr(), LN->getMemOperand()); |
2558 | 1.89k | |
2559 | 1.89k | SDValue BC = DAG.getNode(ISD::BITCAST, SL, VT, NewLoad); |
2560 | 1.89k | DCI.CombineTo(N, BC, NewLoad.getValue(1)); |
2561 | 1.89k | return SDValue(N, 0); |
2562 | 1.89k | } |
2563 | | |
2564 | | // Replace store of an illegal type with a store of a bitcast to a friendlier |
2565 | | // type. |
2566 | | SDValue AMDGPUTargetLowering::performStoreCombine(SDNode *N, |
2567 | 87.3k | DAGCombinerInfo &DCI) const { |
2568 | 87.3k | if (!DCI.isBeforeLegalize()) |
2569 | 33.6k | return SDValue(); |
2570 | 53.6k | |
2571 | 53.6k | StoreSDNode *SN = cast<StoreSDNode>(N); |
2572 | 53.6k | if (SN->isVolatile() || 53.6k !ISD::isNormalStore(SN)49.5k ) |
2573 | 12.8k | return SDValue(); |
2574 | 40.7k | |
2575 | 40.7k | EVT VT = SN->getMemoryVT(); |
2576 | 40.7k | unsigned Size = VT.getStoreSize(); |
2577 | 40.7k | |
2578 | 40.7k | SDLoc SL(N); |
2579 | 40.7k | SelectionDAG &DAG = DCI.DAG; |
2580 | 40.7k | unsigned Align = SN->getAlignment(); |
2581 | 40.7k | if (Align < Size && 40.7k isTypeLegal(VT)4.18k ) { |
2582 | 3.89k | bool IsFast; |
2583 | 3.89k | unsigned AS = SN->getAddressSpace(); |
2584 | 3.89k | |
2585 | 3.89k | // Expand unaligned stores earlier than legalization. Due to visitation |
2586 | 3.89k | // order problems during legalization, the emitted instructions to pack and |
2587 | 3.89k | // unpack the bytes again are not eliminated in the case of an unaligned |
2588 | 3.89k | // copy. |
2589 | 3.89k | if (!allowsMisalignedMemoryAccesses(VT, AS, Align, &IsFast)3.89k ) { |
2590 | 307 | if (VT.isVector()) |
2591 | 70 | return scalarizeVectorStore(SN, DAG); |
2592 | 237 | |
2593 | 237 | return expandUnalignedStore(SN, DAG); |
2594 | 237 | } |
2595 | 3.58k | |
2596 | 3.58k | if (3.58k !IsFast3.58k ) |
2597 | 0 | return SDValue(); |
2598 | 40.4k | } |
2599 | 40.4k | |
2600 | 40.4k | if (40.4k !shouldCombineMemoryType(VT)40.4k ) |
2601 | 38.2k | return SDValue(); |
2602 | 2.23k | |
2603 | 2.23k | EVT NewVT = getEquivalentMemType(*DAG.getContext(), VT); |
2604 | 2.23k | SDValue Val = SN->getValue(); |
2605 | 2.23k | |
2606 | 2.23k | //DCI.AddToWorklist(Val.getNode()); |
2607 | 2.23k | |
2608 | 2.23k | bool OtherUses = !Val.hasOneUse(); |
2609 | 2.23k | SDValue CastVal = DAG.getNode(ISD::BITCAST, SL, NewVT, Val); |
2610 | 2.23k | if (OtherUses2.23k ) { |
2611 | 31 | SDValue CastBack = DAG.getNode(ISD::BITCAST, SL, VT, CastVal); |
2612 | 31 | DAG.ReplaceAllUsesOfValueWith(Val, CastBack); |
2613 | 31 | } |
2614 | 87.3k | |
2615 | 87.3k | return DAG.getStore(SN->getChain(), SL, CastVal, |
2616 | 87.3k | SN->getBasePtr(), SN->getMemOperand()); |
2617 | 87.3k | } |
2618 | | |
2619 | | SDValue AMDGPUTargetLowering::performClampCombine(SDNode *N, |
2620 | 321 | DAGCombinerInfo &DCI) const { |
2621 | 321 | ConstantFPSDNode *CSrc = dyn_cast<ConstantFPSDNode>(N->getOperand(0)); |
2622 | 321 | if (!CSrc) |
2623 | 297 | return SDValue(); |
2624 | 24 | |
2625 | 24 | const APFloat &F = CSrc->getValueAPF(); |
2626 | 24 | APFloat Zero = APFloat::getZero(F.getSemantics()); |
2627 | 24 | APFloat::cmpResult Cmp0 = F.compare(Zero); |
2628 | 24 | if (Cmp0 == APFloat::cmpLessThan || |
2629 | 24 | (Cmp0 == APFloat::cmpUnordered && 21 Subtarget->enableDX10Clamp()12 )) { |
2630 | 9 | return DCI.DAG.getConstantFP(Zero, SDLoc(N), N->getValueType(0)); |
2631 | 9 | } |
2632 | 15 | |
2633 | 15 | APFloat One(F.getSemantics(), "1.0"); |
2634 | 15 | APFloat::cmpResult Cmp1 = F.compare(One); |
2635 | 15 | if (Cmp1 == APFloat::cmpGreaterThan) |
2636 | 3 | return DCI.DAG.getConstantFP(One, SDLoc(N), N->getValueType(0)); |
2637 | 12 | |
2638 | 12 | return SDValue(CSrc, 0); |
2639 | 12 | } |
2640 | | |
2641 | | // FIXME: This should go in generic DAG combiner with an isTruncateFree check, |
2642 | | // but isTruncateFree is inaccurate for i16 now because of SALU vs. VALU |
2643 | | // issues. |
2644 | | SDValue AMDGPUTargetLowering::performAssertSZExtCombine(SDNode *N, |
2645 | 9.56k | DAGCombinerInfo &DCI) const { |
2646 | 9.56k | SelectionDAG &DAG = DCI.DAG; |
2647 | 9.56k | SDValue N0 = N->getOperand(0); |
2648 | 9.56k | |
2649 | 9.56k | // (vt2 (assertzext (truncate vt0:x), vt1)) -> |
2650 | 9.56k | // (vt2 (truncate (assertzext vt0:x, vt1))) |
2651 | 9.56k | if (N0.getOpcode() == ISD::TRUNCATE9.56k ) { |
2652 | 27 | SDValue N1 = N->getOperand(1); |
2653 | 27 | EVT ExtVT = cast<VTSDNode>(N1)->getVT(); |
2654 | 27 | SDLoc SL(N); |
2655 | 27 | |
2656 | 27 | SDValue Src = N0.getOperand(0); |
2657 | 27 | EVT SrcVT = Src.getValueType(); |
2658 | 27 | if (SrcVT.bitsGE(ExtVT)27 ) { |
2659 | 27 | SDValue NewInReg = DAG.getNode(N->getOpcode(), SL, SrcVT, Src, N1); |
2660 | 27 | return DAG.getNode(ISD::TRUNCATE, SL, N->getValueType(0), NewInReg); |
2661 | 27 | } |
2662 | 9.53k | } |
2663 | 9.53k | |
2664 | 9.53k | return SDValue(); |
2665 | 9.53k | } |
2666 | | /// Split the 64-bit value \p LHS into two 32-bit components, and perform the |
2667 | | /// binary operation \p Opc to it with the corresponding constant operands. |
2668 | | SDValue AMDGPUTargetLowering::splitBinaryBitConstantOpImpl( |
2669 | | DAGCombinerInfo &DCI, const SDLoc &SL, |
2670 | | unsigned Opc, SDValue LHS, |
2671 | 1.13k | uint32_t ValLo, uint32_t ValHi) const { |
2672 | 1.13k | SelectionDAG &DAG = DCI.DAG; |
2673 | 1.13k | SDValue Lo, Hi; |
2674 | 1.13k | std::tie(Lo, Hi) = split64BitValue(LHS, DAG); |
2675 | 1.13k | |
2676 | 1.13k | SDValue LoRHS = DAG.getConstant(ValLo, SL, MVT::i32); |
2677 | 1.13k | SDValue HiRHS = DAG.getConstant(ValHi, SL, MVT::i32); |
2678 | 1.13k | |
2679 | 1.13k | SDValue LoAnd = DAG.getNode(Opc, SL, MVT::i32, Lo, LoRHS); |
2680 | 1.13k | SDValue HiAnd = DAG.getNode(Opc, SL, MVT::i32, Hi, HiRHS); |
2681 | 1.13k | |
2682 | 1.13k | // Re-visit the ands. It's possible we eliminated one of them and it could |
2683 | 1.13k | // simplify the vector. |
2684 | 1.13k | DCI.AddToWorklist(Lo.getNode()); |
2685 | 1.13k | DCI.AddToWorklist(Hi.getNode()); |
2686 | 1.13k | |
2687 | 1.13k | SDValue Vec = DAG.getBuildVector(MVT::v2i32, SL, {LoAnd, HiAnd}); |
2688 | 1.13k | return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Vec); |
2689 | 1.13k | } |
2690 | | |
2691 | | SDValue AMDGPUTargetLowering::performShlCombine(SDNode *N, |
2692 | 18.2k | DAGCombinerInfo &DCI) const { |
2693 | 18.2k | EVT VT = N->getValueType(0); |
2694 | 18.2k | |
2695 | 18.2k | ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N->getOperand(1)); |
2696 | 18.2k | if (!RHS) |
2697 | 3.54k | return SDValue(); |
2698 | 14.7k | |
2699 | 14.7k | SDValue LHS = N->getOperand(0); |
2700 | 14.7k | unsigned RHSVal = RHS->getZExtValue(); |
2701 | 14.7k | if (!RHSVal) |
2702 | 0 | return LHS; |
2703 | 14.7k | |
2704 | 14.7k | SDLoc SL(N); |
2705 | 14.7k | SelectionDAG &DAG = DCI.DAG; |
2706 | 14.7k | |
2707 | 14.7k | switch (LHS->getOpcode()) { |
2708 | 10.5k | default: |
2709 | 10.5k | break; |
2710 | 3.76k | case ISD::ZERO_EXTEND: |
2711 | 3.76k | case ISD::SIGN_EXTEND: |
2712 | 3.76k | case ISD::ANY_EXTEND: { |
2713 | 3.76k | SDValue X = LHS->getOperand(0); |
2714 | 3.76k | |
2715 | 3.76k | if (VT == MVT::i32 && 3.76k RHSVal == 16507 && X.getValueType() == MVT::i16446 && |
2716 | 3.76k | isTypeLegal(MVT::v2i16)446 ) { |
2717 | 11 | // Prefer build_vector as the canonical form if packed types are legal. |
2718 | 11 | // (shl ([asz]ext i16:x), 16 -> build_vector 0, x |
2719 | 11 | SDValue Vec = DAG.getBuildVector(MVT::v2i16, SL, |
2720 | 11 | { DAG.getConstant(0, SL, MVT::i16), LHS->getOperand(0) }); |
2721 | 11 | return DAG.getNode(ISD::BITCAST, SL, MVT::i32, Vec); |
2722 | 11 | } |
2723 | 3.75k | |
2724 | 3.75k | // shl (ext x) => zext (shl x), if shift does not overflow int |
2725 | 3.75k | if (3.75k VT != MVT::i643.75k ) |
2726 | 556 | break; |
2727 | 3.20k | KnownBits Known; |
2728 | 3.20k | DAG.computeKnownBits(X, Known); |
2729 | 3.20k | unsigned LZ = Known.countMinLeadingZeros(); |
2730 | 3.20k | if (LZ < RHSVal) |
2731 | 785 | break; |
2732 | 2.41k | EVT XVT = X.getValueType(); |
2733 | 2.41k | SDValue Shl = DAG.getNode(ISD::SHL, SL, XVT, X, SDValue(RHS, 0)); |
2734 | 2.41k | return DAG.getZExtOrTrunc(Shl, SL, VT); |
2735 | 2.41k | } |
2736 | 252 | case ISD::OR: |
2737 | 252 | if (!isOrEquivalentToAdd(DAG, LHS)) |
2738 | 18 | break; |
2739 | 234 | LLVM_FALLTHROUGH234 ; |
2740 | 387 | case ISD::ADD: { |
2741 | 387 | // shl (or|add x, c2), c1 => or|add (shl x, c1), (c2 << c1) |
2742 | 387 | if (ConstantSDNode *C2387 = dyn_cast<ConstantSDNode>(LHS->getOperand(1))) { |
2743 | 40 | SDValue Shl = DAG.getNode(ISD::SHL, SL, VT, LHS->getOperand(0), |
2744 | 40 | SDValue(RHS, 0)); |
2745 | 40 | SDValue C2V = DAG.getConstant(C2->getAPIntValue() << RHSVal, |
2746 | 40 | SDLoc(C2), VT); |
2747 | 40 | return DAG.getNode(LHS->getOpcode(), SL, VT, Shl, C2V); |
2748 | 40 | } |
2749 | 347 | break; |
2750 | 347 | } |
2751 | 12.2k | } |
2752 | 12.2k | |
2753 | 12.2k | if (12.2k VT != MVT::i6412.2k ) |
2754 | 9.55k | return SDValue(); |
2755 | 2.69k | |
2756 | 2.69k | // i64 (shl x, C) -> (build_pair 0, (shl x, C -32)) |
2757 | 2.69k | |
2758 | 2.69k | // On some subtargets, 64-bit shift is a quarter rate instruction. In the |
2759 | 2.69k | // common case, splitting this into a move and a 32-bit shift is faster and |
2760 | 2.69k | // the same code size. |
2761 | 2.69k | if (2.69k RHSVal < 322.69k ) |
2762 | 2.13k | return SDValue(); |
2763 | 558 | |
2764 | 558 | SDValue ShiftAmt = DAG.getConstant(RHSVal - 32, SL, MVT::i32); |
2765 | 558 | |
2766 | 558 | SDValue Lo = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, LHS); |
2767 | 558 | SDValue NewShift = DAG.getNode(ISD::SHL, SL, MVT::i32, Lo, ShiftAmt); |
2768 | 558 | |
2769 | 558 | const SDValue Zero = DAG.getConstant(0, SL, MVT::i32); |
2770 | 558 | |
2771 | 558 | SDValue Vec = DAG.getBuildVector(MVT::v2i32, SL, {Zero, NewShift}); |
2772 | 558 | return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Vec); |
2773 | 558 | } |
2774 | | |
2775 | | SDValue AMDGPUTargetLowering::performSraCombine(SDNode *N, |
2776 | 5.29k | DAGCombinerInfo &DCI) const { |
2777 | 5.29k | if (N->getValueType(0) != MVT::i64) |
2778 | 4.84k | return SDValue(); |
2779 | 446 | |
2780 | 446 | const ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N->getOperand(1)); |
2781 | 446 | if (!RHS) |
2782 | 20 | return SDValue(); |
2783 | 426 | |
2784 | 426 | SelectionDAG &DAG = DCI.DAG; |
2785 | 426 | SDLoc SL(N); |
2786 | 426 | unsigned RHSVal = RHS->getZExtValue(); |
2787 | 426 | |
2788 | 426 | // (sra i64:x, 32) -> build_pair x, (sra hi_32(x), 31) |
2789 | 426 | if (RHSVal == 32426 ) { |
2790 | 6 | SDValue Hi = getHiHalf64(N->getOperand(0), DAG); |
2791 | 6 | SDValue NewShift = DAG.getNode(ISD::SRA, SL, MVT::i32, Hi, |
2792 | 6 | DAG.getConstant(31, SL, MVT::i32)); |
2793 | 6 | |
2794 | 6 | SDValue BuildVec = DAG.getBuildVector(MVT::v2i32, SL, {Hi, NewShift}); |
2795 | 6 | return DAG.getNode(ISD::BITCAST, SL, MVT::i64, BuildVec); |
2796 | 6 | } |
2797 | 420 | |
2798 | 420 | // (sra i64:x, 63) -> build_pair (sra hi_32(x), 31), (sra hi_32(x), 31) |
2799 | 420 | if (420 RHSVal == 63420 ) { |
2800 | 125 | SDValue Hi = getHiHalf64(N->getOperand(0), DAG); |
2801 | 125 | SDValue NewShift = DAG.getNode(ISD::SRA, SL, MVT::i32, Hi, |
2802 | 125 | DAG.getConstant(31, SL, MVT::i32)); |
2803 | 125 | SDValue BuildVec = DAG.getBuildVector(MVT::v2i32, SL, {NewShift, NewShift}); |
2804 | 125 | return DAG.getNode(ISD::BITCAST, SL, MVT::i64, BuildVec); |
2805 | 125 | } |
2806 | 295 | |
2807 | 295 | return SDValue(); |
2808 | 295 | } |
2809 | | |
2810 | | SDValue AMDGPUTargetLowering::performSrlCombine(SDNode *N, |
2811 | 53.0k | DAGCombinerInfo &DCI) const { |
2812 | 53.0k | if (N->getValueType(0) != MVT::i64) |
2813 | 44.8k | return SDValue(); |
2814 | 8.20k | |
2815 | 8.20k | const ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N->getOperand(1)); |
2816 | 8.20k | if (!RHS) |
2817 | 372 | return SDValue(); |
2818 | 7.83k | |
2819 | 7.83k | unsigned ShiftAmt = RHS->getZExtValue(); |
2820 | 7.83k | if (ShiftAmt < 32) |
2821 | 155 | return SDValue(); |
2822 | 7.68k | |
2823 | 7.68k | // srl i64:x, C for C >= 32 |
2824 | 7.68k | // => |
2825 | 7.68k | // build_pair (srl hi_32(x), C - 32), 0 |
2826 | 7.68k | |
2827 | 7.68k | SelectionDAG &DAG = DCI.DAG; |
2828 | 7.68k | SDLoc SL(N); |
2829 | 7.68k | |
2830 | 7.68k | SDValue One = DAG.getConstant(1, SL, MVT::i32); |
2831 | 7.68k | SDValue Zero = DAG.getConstant(0, SL, MVT::i32); |
2832 | 7.68k | |
2833 | 7.68k | SDValue VecOp = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, N->getOperand(0)); |
2834 | 7.68k | SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, |
2835 | 7.68k | VecOp, One); |
2836 | 7.68k | |
2837 | 7.68k | SDValue NewConst = DAG.getConstant(ShiftAmt - 32, SL, MVT::i32); |
2838 | 7.68k | SDValue NewShift = DAG.getNode(ISD::SRL, SL, MVT::i32, Hi, NewConst); |
2839 | 7.68k | |
2840 | 7.68k | SDValue BuildPair = DAG.getBuildVector(MVT::v2i32, SL, {NewShift, Zero}); |
2841 | 7.68k | |
2842 | 7.68k | return DAG.getNode(ISD::BITCAST, SL, MVT::i64, BuildPair); |
2843 | 7.68k | } |
2844 | | |
2845 | | // We need to specifically handle i64 mul here to avoid unnecessary conversion |
2846 | | // instructions. If we only match on the legalized i64 mul expansion, |
2847 | | // SimplifyDemandedBits will be unable to remove them because there will be |
2848 | | // multiple uses due to the separate mul + mulh[su]. |
2849 | | static SDValue getMul24(SelectionDAG &DAG, const SDLoc &SL, |
2850 | 325 | SDValue N0, SDValue N1, unsigned Size, bool Signed) { |
2851 | 325 | if (Size <= 32325 ) { |
2852 | 292 | unsigned MulOpc = Signed ? AMDGPUISD::MUL_I2414 : AMDGPUISD::MUL_U24278 ; |
2853 | 292 | return DAG.getNode(MulOpc, SL, MVT::i32, N0, N1); |
2854 | 292 | } |
2855 | 33 | |
2856 | 33 | // Because we want to eliminate extension instructions before the |
2857 | 33 | // operation, we need to create a single user here (i.e. not the separate |
2858 | 33 | // mul_lo + mul_hi) so that SimplifyDemandedBits will deal with it. |
2859 | 33 | |
2860 | 33 | unsigned MulOpc = Signed ? 33 AMDGPUISD::MUL_LOHI_I2415 : AMDGPUISD::MUL_LOHI_U2418 ; |
2861 | 325 | |
2862 | 325 | SDValue Mul = DAG.getNode(MulOpc, SL, |
2863 | 325 | DAG.getVTList(MVT::i32, MVT::i32), N0, N1); |
2864 | 325 | |
2865 | 325 | return DAG.getNode(ISD::BUILD_PAIR, SL, MVT::i64, |
2866 | 325 | Mul.getValue(0), Mul.getValue(1)); |
2867 | 325 | } |
2868 | | |
2869 | | SDValue AMDGPUTargetLowering::performMulCombine(SDNode *N, |
2870 | 1.90k | DAGCombinerInfo &DCI) const { |
2871 | 1.90k | EVT VT = N->getValueType(0); |
2872 | 1.90k | |
2873 | 1.90k | unsigned Size = VT.getSizeInBits(); |
2874 | 1.90k | if (VT.isVector() || 1.90k Size > 641.79k ) |
2875 | 128 | return SDValue(); |
2876 | 1.77k | |
2877 | 1.77k | // There are i16 integer mul/mad. |
2878 | 1.77k | if (1.77k Subtarget->has16BitInsts() && 1.77k VT.getScalarType().bitsLE(MVT::i16)713 ) |
2879 | 51 | return SDValue(); |
2880 | 1.72k | |
2881 | 1.72k | SelectionDAG &DAG = DCI.DAG; |
2882 | 1.72k | SDLoc DL(N); |
2883 | 1.72k | |
2884 | 1.72k | SDValue N0 = N->getOperand(0); |
2885 | 1.72k | SDValue N1 = N->getOperand(1); |
2886 | 1.72k | SDValue Mul; |
2887 | 1.72k | |
2888 | 1.72k | if (Subtarget->hasMulU24() && 1.72k isU24(N0, DAG)1.72k && isU24(N1, DAG)339 ) { |
2889 | 296 | N0 = DAG.getZExtOrTrunc(N0, DL, MVT::i32); |
2890 | 296 | N1 = DAG.getZExtOrTrunc(N1, DL, MVT::i32); |
2891 | 296 | Mul = getMul24(DAG, DL, N0, N1, Size, false); |
2892 | 1.72k | } else if (1.43k Subtarget->hasMulI24() && 1.43k isI24(N0, DAG)1.06k && isI24(N1, DAG)76 ) { |
2893 | 29 | N0 = DAG.getSExtOrTrunc(N0, DL, MVT::i32); |
2894 | 29 | N1 = DAG.getSExtOrTrunc(N1, DL, MVT::i32); |
2895 | 29 | Mul = getMul24(DAG, DL, N0, N1, Size, true); |
2896 | 1.43k | } else { |
2897 | 1.40k | return SDValue(); |
2898 | 1.40k | } |
2899 | 325 | |
2900 | 325 | // We need to use sext even for MUL_U24, because MUL_U24 is used |
2901 | 325 | // for signed multiply of 8 and 16-bit types. |
2902 | 325 | return DAG.getSExtOrTrunc(Mul, DL, VT); |
2903 | 325 | } |
2904 | | |
2905 | | SDValue AMDGPUTargetLowering::performMulhsCombine(SDNode *N, |
2906 | 73 | DAGCombinerInfo &DCI) const { |
2907 | 73 | EVT VT = N->getValueType(0); |
2908 | 73 | |
2909 | 73 | if (!Subtarget->hasMulI24() || 73 VT.isVector()45 ) |
2910 | 28 | return SDValue(); |
2911 | 45 | |
2912 | 45 | SelectionDAG &DAG = DCI.DAG; |
2913 | 45 | SDLoc DL(N); |
2914 | 45 | |
2915 | 45 | SDValue N0 = N->getOperand(0); |
2916 | 45 | SDValue N1 = N->getOperand(1); |
2917 | 45 | |
2918 | 45 | if (!isI24(N0, DAG) || 45 !isI24(N1, DAG)0 ) |
2919 | 45 | return SDValue(); |
2920 | 0 |
|
2921 | 0 | N0 = DAG.getSExtOrTrunc(N0, DL, MVT::i32); |
2922 | 0 | N1 = DAG.getSExtOrTrunc(N1, DL, MVT::i32); |
2923 | 0 |
|
2924 | 0 | SDValue Mulhi = DAG.getNode(AMDGPUISD::MULHI_I24, DL, MVT::i32, N0, N1); |
2925 | 0 | DCI.AddToWorklist(Mulhi.getNode()); |
2926 | 0 | return DAG.getSExtOrTrunc(Mulhi, DL, VT); |
2927 | 0 | } |
2928 | | |
2929 | | SDValue AMDGPUTargetLowering::performMulhuCombine(SDNode *N, |
2930 | 1.07k | DAGCombinerInfo &DCI) const { |
2931 | 1.07k | EVT VT = N->getValueType(0); |
2932 | 1.07k | |
2933 | 1.07k | if (!Subtarget->hasMulU24() || 1.07k VT.isVector()1.07k || VT.getSizeInBits() > 321.07k ) |
2934 | 0 | return SDValue(); |
2935 | 1.07k | |
2936 | 1.07k | SelectionDAG &DAG = DCI.DAG; |
2937 | 1.07k | SDLoc DL(N); |
2938 | 1.07k | |
2939 | 1.07k | SDValue N0 = N->getOperand(0); |
2940 | 1.07k | SDValue N1 = N->getOperand(1); |
2941 | 1.07k | |
2942 | 1.07k | if (!isU24(N0, DAG) || 1.07k !isU24(N1, DAG)0 ) |
2943 | 1.07k | return SDValue(); |
2944 | 0 |
|
2945 | 0 | N0 = DAG.getZExtOrTrunc(N0, DL, MVT::i32); |
2946 | 0 | N1 = DAG.getZExtOrTrunc(N1, DL, MVT::i32); |
2947 | 0 |
|
2948 | 0 | SDValue Mulhi = DAG.getNode(AMDGPUISD::MULHI_U24, DL, MVT::i32, N0, N1); |
2949 | 0 | DCI.AddToWorklist(Mulhi.getNode()); |
2950 | 0 | return DAG.getZExtOrTrunc(Mulhi, DL, VT); |
2951 | 0 | } |
2952 | | |
2953 | | SDValue AMDGPUTargetLowering::performMulLoHi24Combine( |
2954 | 121 | SDNode *N, DAGCombinerInfo &DCI) const { |
2955 | 121 | SelectionDAG &DAG = DCI.DAG; |
2956 | 121 | |
2957 | 121 | // Simplify demanded bits before splitting into multiple users. |
2958 | 121 | if (simplifyI24(N, 0, DCI) || 121 simplifyI24(N, 1, DCI)73 ) |
2959 | 88 | return SDValue(); |
2960 | 33 | |
2961 | 33 | SDValue N0 = N->getOperand(0); |
2962 | 33 | SDValue N1 = N->getOperand(1); |
2963 | 33 | |
2964 | 33 | bool Signed = (N->getOpcode() == AMDGPUISD::MUL_LOHI_I24); |
2965 | 33 | |
2966 | 33 | unsigned MulLoOpc = Signed ? AMDGPUISD::MUL_I2415 : AMDGPUISD::MUL_U2418 ; |
2967 | 33 | unsigned MulHiOpc = Signed ? AMDGPUISD::MULHI_I2415 : AMDGPUISD::MULHI_U2418 ; |
2968 | 121 | |
2969 | 121 | SDLoc SL(N); |
2970 | 121 | |
2971 | 121 | SDValue MulLo = DAG.getNode(MulLoOpc, SL, MVT::i32, N0, N1); |
2972 | 121 | SDValue MulHi = DAG.getNode(MulHiOpc, SL, MVT::i32, N0, N1); |
2973 | 121 | return DAG.getMergeValues({ MulLo, MulHi }, SL); |
2974 | 121 | } |
2975 | | |
2976 | 25 | static bool isNegativeOne(SDValue Val) { |
2977 | 25 | if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Val)) |
2978 | 25 | return C->isAllOnesValue(); |
2979 | 0 | return false; |
2980 | 0 | } |
2981 | | |
2982 | 2.50k | static bool isCtlzOpc(unsigned Opc) { |
2983 | 2.49k | return Opc == ISD::CTLZ || Opc == ISD::CTLZ_ZERO_UNDEF; |
2984 | 2.50k | } |
2985 | | |
2986 | | SDValue AMDGPUTargetLowering::getFFBH_U32(SelectionDAG &DAG, |
2987 | | SDValue Op, |
2988 | 21 | const SDLoc &DL) const { |
2989 | 21 | EVT VT = Op.getValueType(); |
2990 | 21 | EVT LegalVT = getTypeToTransformTo(*DAG.getContext(), VT); |
2991 | 21 | if (LegalVT != MVT::i32 && 21 (Subtarget->has16BitInsts() && |
2992 | 3 | LegalVT != MVT::i16)) |
2993 | 0 | return SDValue(); |
2994 | 21 | |
2995 | 21 | if (21 VT != MVT::i3221 ) |
2996 | 11 | Op = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, Op); |
2997 | 21 | |
2998 | 21 | SDValue FFBH = DAG.getNode(AMDGPUISD::FFBH_U32, DL, MVT::i32, Op); |
2999 | 21 | if (VT != MVT::i32) |
3000 | 11 | FFBH = DAG.getNode(ISD::TRUNCATE, DL, VT, FFBH); |
3001 | 21 | |
3002 | 21 | return FFBH; |
3003 | 21 | } |
3004 | | |
3005 | | // The native instructions return -1 on 0 input. Optimize out a select that |
3006 | | // produces -1 on 0. |
3007 | | // |
3008 | | // TODO: If zero is not undef, we could also do this if the output is compared |
3009 | | // against the bitwidth. |
3010 | | // |
3011 | | // TODO: Should probably combine against FFBH_U32 instead of ctlz directly. |
3012 | | SDValue AMDGPUTargetLowering::performCtlzCombine(const SDLoc &SL, SDValue Cond, |
3013 | | SDValue LHS, SDValue RHS, |
3014 | 12.2k | DAGCombinerInfo &DCI) const { |
3015 | 12.2k | ConstantSDNode *CmpRhs = dyn_cast<ConstantSDNode>(Cond.getOperand(1)); |
3016 | 12.2k | if (!CmpRhs || 12.2k !CmpRhs->isNullValue()5.33k ) |
3017 | 8.35k | return SDValue(); |
3018 | 3.88k | |
3019 | 3.88k | SelectionDAG &DAG = DCI.DAG; |
3020 | 3.88k | ISD::CondCode CCOpcode = cast<CondCodeSDNode>(Cond.getOperand(2))->get(); |
3021 | 3.88k | SDValue CmpLHS = Cond.getOperand(0); |
3022 | 3.88k | |
3023 | 3.88k | // select (setcc x, 0, eq), -1, (ctlz_zero_undef x) -> ffbh_u32 x |
3024 | 3.88k | if (CCOpcode == ISD::SETEQ && |
3025 | 2.05k | isCtlzOpc(RHS.getOpcode()) && |
3026 | 2 | RHS.getOperand(0) == CmpLHS && |
3027 | 3.88k | isNegativeOne(LHS)2 ) { |
3028 | 2 | return getFFBH_U32(DAG, CmpLHS, SL); |
3029 | 2 | } |
3030 | 3.87k | |
3031 | 3.87k | // select (setcc x, 0, ne), (ctlz_zero_undef x), -1 -> ffbh_u32 x |
3032 | 3.87k | if (3.87k CCOpcode == ISD::SETNE && |
3033 | 455 | isCtlzOpc(LHS.getOpcode()) && |
3034 | 23 | LHS.getOperand(0) == CmpLHS && |
3035 | 3.87k | isNegativeOne(RHS)23 ) { |
3036 | 19 | return getFFBH_U32(DAG, CmpLHS, SL); |
3037 | 19 | } |
3038 | 3.86k | |
3039 | 3.86k | return SDValue(); |
3040 | 3.86k | } |
3041 | | |
3042 | | static SDValue distributeOpThroughSelect(TargetLowering::DAGCombinerInfo &DCI, |
3043 | | unsigned Op, |
3044 | | const SDLoc &SL, |
3045 | | SDValue Cond, |
3046 | | SDValue N1, |
3047 | 24 | SDValue N2) { |
3048 | 24 | SelectionDAG &DAG = DCI.DAG; |
3049 | 24 | EVT VT = N1.getValueType(); |
3050 | 24 | |
3051 | 24 | SDValue NewSelect = DAG.getNode(ISD::SELECT, SL, VT, Cond, |
3052 | 24 | N1.getOperand(0), N2.getOperand(0)); |
3053 | 24 | DCI.AddToWorklist(NewSelect.getNode()); |
3054 | 24 | return DAG.getNode(Op, SL, VT, NewSelect); |
3055 | 24 | } |
3056 | | |
3057 | | // Pull a free FP operation out of a select so it may fold into uses. |
3058 | | // |
3059 | | // select c, (fneg x), (fneg y) -> fneg (select c, x, y) |
3060 | | // select c, (fneg x), k -> fneg (select c, x, (fneg k)) |
3061 | | // |
3062 | | // select c, (fabs x), (fabs y) -> fabs (select c, x, y) |
3063 | | // select c, (fabs x), +k -> fabs (select c, x, k) |
3064 | | static SDValue foldFreeOpFromSelect(TargetLowering::DAGCombinerInfo &DCI, |
3065 | 13.6k | SDValue N) { |
3066 | 13.6k | SelectionDAG &DAG = DCI.DAG; |
3067 | 13.6k | SDValue Cond = N.getOperand(0); |
3068 | 13.6k | SDValue LHS = N.getOperand(1); |
3069 | 13.6k | SDValue RHS = N.getOperand(2); |
3070 | 13.6k | |
3071 | 13.6k | EVT VT = N.getValueType(); |
3072 | 13.6k | if ((LHS.getOpcode() == ISD::FABS && 13.6k RHS.getOpcode() == ISD::FABS48 ) || |
3073 | 13.6k | (LHS.getOpcode() == ISD::FNEG && 13.6k RHS.getOpcode() == ISD::FNEG104 )) { |
3074 | 24 | return distributeOpThroughSelect(DCI, LHS.getOpcode(), |
3075 | 24 | SDLoc(N), Cond, LHS, RHS); |
3076 | 24 | } |
3077 | 13.5k | |
3078 | 13.5k | bool Inv = false; |
3079 | 13.5k | if (RHS.getOpcode() == ISD::FABS || 13.5k RHS.getOpcode() == ISD::FNEG13.5k ) { |
3080 | 42 | std::swap(LHS, RHS); |
3081 | 42 | Inv = true; |
3082 | 42 | } |
3083 | 13.5k | |
3084 | 13.5k | // TODO: Support vector constants. |
3085 | 13.5k | ConstantFPSDNode *CRHS = dyn_cast<ConstantFPSDNode>(RHS); |
3086 | 13.5k | if ((LHS.getOpcode() == ISD::FNEG || 13.5k LHS.getOpcode() == ISD::FABS13.4k ) && CRHS154 ) { |
3087 | 78 | SDLoc SL(N); |
3088 | 78 | // If one side is an fneg/fabs and the other is a constant, we can push the |
3089 | 78 | // fneg/fabs down. If it's an fabs, the constant needs to be non-negative. |
3090 | 78 | SDValue NewLHS = LHS.getOperand(0); |
3091 | 78 | SDValue NewRHS = RHS; |
3092 | 78 | |
3093 | 78 | // Careful: if the neg can be folded up, don't try to pull it back down. |
3094 | 78 | bool ShouldFoldNeg = true; |
3095 | 78 | |
3096 | 78 | if (NewLHS.hasOneUse()78 ) { |
3097 | 68 | unsigned Opc = NewLHS.getOpcode(); |
3098 | 68 | if (LHS.getOpcode() == ISD::FNEG && 68 fnegFoldsIntoOp(Opc)34 ) |
3099 | 10 | ShouldFoldNeg = false; |
3100 | 68 | if (LHS.getOpcode() == ISD::FABS && 68 Opc == ISD::FMUL34 ) |
3101 | 0 | ShouldFoldNeg = false; |
3102 | 68 | } |
3103 | 78 | |
3104 | 78 | if (ShouldFoldNeg78 ) { |
3105 | 68 | if (LHS.getOpcode() == ISD::FNEG) |
3106 | 34 | NewRHS = DAG.getNode(ISD::FNEG, SL, VT, RHS); |
3107 | 34 | else if (34 CRHS->isNegative()34 ) |
3108 | 26 | return SDValue(); |
3109 | 42 | |
3110 | 42 | if (42 Inv42 ) |
3111 | 12 | std::swap(NewLHS, NewRHS); |
3112 | 68 | |
3113 | 68 | SDValue NewSelect = DAG.getNode(ISD::SELECT, SL, VT, |
3114 | 68 | Cond, NewLHS, NewRHS); |
3115 | 68 | DCI.AddToWorklist(NewSelect.getNode()); |
3116 | 68 | return DAG.getNode(LHS.getOpcode(), SL, VT, NewSelect); |
3117 | 68 | } |
3118 | 78 | } |
3119 | 13.5k | |
3120 | 13.5k | return SDValue(); |
3121 | 13.5k | } |
3122 | | |
3123 | | |
3124 | | SDValue AMDGPUTargetLowering::performSelectCombine(SDNode *N, |
3125 | 13.6k | DAGCombinerInfo &DCI) const { |
3126 | 13.6k | if (SDValue Folded = foldFreeOpFromSelect(DCI, SDValue(N, 0))) |
3127 | 66 | return Folded; |
3128 | 13.5k | |
3129 | 13.5k | SDValue Cond = N->getOperand(0); |
3130 | 13.5k | if (Cond.getOpcode() != ISD::SETCC) |
3131 | 775 | return SDValue(); |
3132 | 12.7k | |
3133 | 12.7k | EVT VT = N->getValueType(0); |
3134 | 12.7k | SDValue LHS = Cond.getOperand(0); |
3135 | 12.7k | SDValue RHS = Cond.getOperand(1); |
3136 | 12.7k | SDValue CC = Cond.getOperand(2); |
3137 | 12.7k | |
3138 | 12.7k | SDValue True = N->getOperand(1); |
3139 | 12.7k | SDValue False = N->getOperand(2); |
3140 | 12.7k | |
3141 | 12.7k | if (Cond.hasOneUse()12.7k ) { // TODO: Look for multiple select uses. |
3142 | 3.52k | SelectionDAG &DAG = DCI.DAG; |
3143 | 3.52k | if ((DAG.isConstantValueOfAnyType(True) || |
3144 | 2.37k | DAG.isConstantValueOfAnyType(True)) && |
3145 | 1.15k | (!DAG.isConstantValueOfAnyType(False) && |
3146 | 3.52k | !DAG.isConstantValueOfAnyType(False)228 )) { |
3147 | 228 | // Swap cmp + select pair to move constant to false input. |
3148 | 228 | // This will allow using VOPC cndmasks more often. |
3149 | 228 | // select (setcc x, y), k, x -> select (setcc y, x) x, x |
3150 | 228 | |
3151 | 228 | SDLoc SL(N); |
3152 | 228 | ISD::CondCode NewCC = getSetCCInverse(cast<CondCodeSDNode>(CC)->get(), |
3153 | 228 | LHS.getValueType().isInteger()); |
3154 | 228 | |
3155 | 228 | SDValue NewCond = DAG.getSetCC(SL, Cond.getValueType(), LHS, RHS, NewCC); |
3156 | 228 | return DAG.getNode(ISD::SELECT, SL, VT, NewCond, False, True); |
3157 | 228 | } |
3158 | 3.29k | |
3159 | 3.29k | if (3.29k VT == MVT::f32 && 3.29k Subtarget->hasFminFmaxLegacy()572 ) { |
3160 | 314 | SDValue MinMax |
3161 | 314 | = combineFMinMaxLegacy(SDLoc(N), VT, LHS, RHS, True, False, CC, DCI); |
3162 | 314 | // Revisit this node so we can catch min3/max3/med3 patterns. |
3163 | 314 | //DCI.AddToWorklist(MinMax.getNode()); |
3164 | 314 | return MinMax; |
3165 | 314 | } |
3166 | 12.2k | } |
3167 | 12.2k | |
3168 | 12.2k | // There's no reason to not do this if the condition has other uses. |
3169 | 12.2k | return performCtlzCombine(SDLoc(N), Cond, True, False, DCI); |
3170 | 12.2k | } |
3171 | | |
3172 | 63 | static bool isConstantFPZero(SDValue N) { |
3173 | 63 | if (const ConstantFPSDNode *C = isConstOrConstSplatFP(N)) |
3174 | 46 | return C->isZero() && 46 !C->isNegative()21 ; |
3175 | 17 | return false; |
3176 | 17 | } |
3177 | | |
3178 | 46 | static unsigned inverseMinMax(unsigned Opc) { |
3179 | 46 | switch (Opc) { |
3180 | 20 | case ISD::FMAXNUM: |
3181 | 20 | return ISD::FMINNUM; |
3182 | 21 | case ISD::FMINNUM: |
3183 | 21 | return ISD::FMAXNUM; |
3184 | 2 | case AMDGPUISD::FMAX_LEGACY: |
3185 | 2 | return AMDGPUISD::FMIN_LEGACY; |
3186 | 3 | case AMDGPUISD::FMIN_LEGACY: |
3187 | 3 | return AMDGPUISD::FMAX_LEGACY; |
3188 | 0 | default: |
3189 | 0 | llvm_unreachable("invalid min/max opcode"); |
3190 | 0 | } |
3191 | 0 | } |
3192 | | |
3193 | | SDValue AMDGPUTargetLowering::performFNegCombine(SDNode *N, |
3194 | 2.81k | DAGCombinerInfo &DCI) const { |
3195 | 2.81k | SelectionDAG &DAG = DCI.DAG; |
3196 | 2.81k | SDValue N0 = N->getOperand(0); |
3197 | 2.81k | EVT VT = N->getValueType(0); |
3198 | 2.81k | |
3199 | 2.81k | unsigned Opc = N0.getOpcode(); |
3200 | 2.81k | |
3201 | 2.81k | // If the input has multiple uses and we can either fold the negate down, or |
3202 | 2.81k | // the other uses cannot, give up. This both prevents unprofitable |
3203 | 2.81k | // transformations and infinite loops: we won't repeatedly try to fold around |
3204 | 2.81k | // a negate that has no 'good' form. |
3205 | 2.81k | if (N0.hasOneUse()2.81k ) { |
3206 | 2.11k | // This may be able to fold into the source, but at a code size cost. Don't |
3207 | 2.11k | // fold if the fold into the user is free. |
3208 | 2.11k | if (allUsesHaveSourceMods(N, 0)) |
3209 | 762 | return SDValue(); |
3210 | 694 | } else { |
3211 | 694 | if (fnegFoldsIntoOp(Opc) && |
3212 | 198 | (allUsesHaveSourceMods(N) || 198 !allUsesHaveSourceMods(N0.getNode())43 )) |
3213 | 177 | return SDValue(); |
3214 | 1.87k | } |
3215 | 1.87k | |
3216 | 1.87k | SDLoc SL(N); |
3217 | 1.87k | switch (Opc) { |
3218 | 100 | case ISD::FADD: { |
3219 | 100 | if (!mayIgnoreSignedZero(N0)) |
3220 | 86 | return SDValue(); |
3221 | 14 | |
3222 | 14 | // (fneg (fadd x, y)) -> (fadd (fneg x), (fneg y)) |
3223 | 14 | SDValue LHS = N0.getOperand(0); |
3224 | 14 | SDValue RHS = N0.getOperand(1); |
3225 | 14 | |
3226 | 14 | if (LHS.getOpcode() != ISD::FNEG) |
3227 | 10 | LHS = DAG.getNode(ISD::FNEG, SL, VT, LHS); |
3228 | 14 | else |
3229 | 4 | LHS = LHS.getOperand(0); |
3230 | 14 | |
3231 | 14 | if (RHS.getOpcode() != ISD::FNEG) |
3232 | 12 | RHS = DAG.getNode(ISD::FNEG, SL, VT, RHS); |
3233 | 14 | else |
3234 | 2 | RHS = RHS.getOperand(0); |
3235 | 14 | |
3236 | 14 | SDValue Res = DAG.getNode(ISD::FADD, SL, VT, LHS, RHS, N0->getFlags()); |
3237 | 14 | if (!N0.hasOneUse()) |
3238 | 2 | DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Res)); |
3239 | 14 | return Res; |
3240 | 14 | } |
3241 | 39 | case ISD::FMUL: |
3242 | 39 | case AMDGPUISD::FMUL_LEGACY: { |
3243 | 39 | // (fneg (fmul x, y)) -> (fmul x, (fneg y)) |
3244 | 39 | // (fneg (fmul_legacy x, y)) -> (fmul_legacy x, (fneg y)) |
3245 | 39 | SDValue LHS = N0.getOperand(0); |
3246 | 39 | SDValue RHS = N0.getOperand(1); |
3247 | 39 | |
3248 | 39 | if (LHS.getOpcode() == ISD::FNEG) |
3249 | 8 | LHS = LHS.getOperand(0); |
3250 | 31 | else if (31 RHS.getOpcode() == ISD::FNEG31 ) |
3251 | 2 | RHS = RHS.getOperand(0); |
3252 | 31 | else |
3253 | 29 | RHS = DAG.getNode(ISD::FNEG, SL, VT, RHS); |
3254 | 39 | |
3255 | 39 | SDValue Res = DAG.getNode(Opc, SL, VT, LHS, RHS, N0->getFlags()); |
3256 | 39 | if (!N0.hasOneUse()) |
3257 | 5 | DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Res)); |
3258 | 39 | return Res; |
3259 | 39 | } |
3260 | 47 | case ISD::FMA: |
3261 | 47 | case ISD::FMAD: { |
3262 | 47 | if (!mayIgnoreSignedZero(N0)) |
3263 | 35 | return SDValue(); |
3264 | 12 | |
3265 | 12 | // (fneg (fma x, y, z)) -> (fma x, (fneg y), (fneg z)) |
3266 | 12 | SDValue LHS = N0.getOperand(0); |
3267 | 12 | SDValue MHS = N0.getOperand(1); |
3268 | 12 | SDValue RHS = N0.getOperand(2); |
3269 | 12 | |
3270 | 12 | if (LHS.getOpcode() == ISD::FNEG) |
3271 | 5 | LHS = LHS.getOperand(0); |
3272 | 7 | else if (7 MHS.getOpcode() == ISD::FNEG7 ) |
3273 | 1 | MHS = MHS.getOperand(0); |
3274 | 7 | else |
3275 | 6 | MHS = DAG.getNode(ISD::FNEG, SL, VT, MHS); |
3276 | 12 | |
3277 | 12 | if (RHS.getOpcode() != ISD::FNEG) |
3278 | 10 | RHS = DAG.getNode(ISD::FNEG, SL, VT, RHS); |
3279 | 12 | else |
3280 | 2 | RHS = RHS.getOperand(0); |
3281 | 12 | |
3282 | 12 | SDValue Res = DAG.getNode(Opc, SL, VT, LHS, MHS, RHS); |
3283 | 12 | if (!N0.hasOneUse()) |
3284 | 1 | DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Res)); |
3285 | 12 | return Res; |
3286 | 12 | } |
3287 | 63 | case ISD::FMAXNUM: |
3288 | 63 | case ISD::FMINNUM: |
3289 | 63 | case AMDGPUISD::FMAX_LEGACY: |
3290 | 63 | case AMDGPUISD::FMIN_LEGACY: { |
3291 | 63 | // fneg (fmaxnum x, y) -> fminnum (fneg x), (fneg y) |
3292 | 63 | // fneg (fminnum x, y) -> fmaxnum (fneg x), (fneg y) |
3293 | 63 | // fneg (fmax_legacy x, y) -> fmin_legacy (fneg x), (fneg y) |
3294 | 63 | // fneg (fmin_legacy x, y) -> fmax_legacy (fneg x), (fneg y) |
3295 | 63 | |
3296 | 63 | SDValue LHS = N0.getOperand(0); |
3297 | 63 | SDValue RHS = N0.getOperand(1); |
3298 | 63 | |
3299 | 63 | // 0 doesn't have a negated inline immediate. |
3300 | 63 | // TODO: Shouldn't fold 1/2pi either, and should be generalized to other |
3301 | 63 | // operations. |
3302 | 63 | if (isConstantFPZero(RHS)) |
3303 | 17 | return SDValue(); |
3304 | 46 | |
3305 | 46 | SDValue NegLHS = DAG.getNode(ISD::FNEG, SL, VT, LHS); |
3306 | 46 | SDValue NegRHS = DAG.getNode(ISD::FNEG, SL, VT, RHS); |
3307 | 46 | unsigned Opposite = inverseMinMax(Opc); |
3308 | 46 | |
3309 | 46 | SDValue Res = DAG.getNode(Opposite, SL, VT, NegLHS, NegRHS, N0->getFlags()); |
3310 | 46 | if (!N0.hasOneUse()) |
3311 | 4 | DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Res)); |
3312 | 46 | return Res; |
3313 | 46 | } |
3314 | 38 | case ISD::FP_EXTEND: |
3315 | 38 | case ISD::FTRUNC: |
3316 | 38 | case ISD::FRINT: |
3317 | 38 | case ISD::FNEARBYINT: // XXX - Should fround be handled? |
3318 | 38 | case ISD::FSIN: |
3319 | 38 | case AMDGPUISD::RCP: |
3320 | 38 | case AMDGPUISD::RCP_LEGACY: |
3321 | 38 | case AMDGPUISD::SIN_HW: { |
3322 | 38 | SDValue CvtSrc = N0.getOperand(0); |
3323 | 38 | if (CvtSrc.getOpcode() == ISD::FNEG38 ) { |
3324 | 6 | // (fneg (fp_extend (fneg x))) -> (fp_extend x) |
3325 | 6 | // (fneg (rcp (fneg x))) -> (rcp x) |
3326 | 6 | return DAG.getNode(Opc, SL, VT, CvtSrc.getOperand(0)); |
3327 | 6 | } |
3328 | 32 | |
3329 | 32 | if (32 !N0.hasOneUse()32 ) |
3330 | 14 | return SDValue(); |
3331 | 18 | |
3332 | 18 | // (fneg (fp_extend x)) -> (fp_extend (fneg x)) |
3333 | 18 | // (fneg (rcp x)) -> (rcp (fneg x)) |
3334 | 18 | SDValue Neg = DAG.getNode(ISD::FNEG, SL, CvtSrc.getValueType(), CvtSrc); |
3335 | 18 | return DAG.getNode(Opc, SL, VT, Neg, N0->getFlags()); |
3336 | 18 | } |
3337 | 9 | case ISD::FP_ROUND: { |
3338 | 9 | SDValue CvtSrc = N0.getOperand(0); |
3339 | 9 | |
3340 | 9 | if (CvtSrc.getOpcode() == ISD::FNEG9 ) { |
3341 | 0 | // (fneg (fp_round (fneg x))) -> (fp_round x) |
3342 | 0 | return DAG.getNode(ISD::FP_ROUND, SL, VT, |
3343 | 0 | CvtSrc.getOperand(0), N0.getOperand(1)); |
3344 | 0 | } |
3345 | 9 | |
3346 | 9 | if (9 !N0.hasOneUse()9 ) |
3347 | 4 | return SDValue(); |
3348 | 5 | |
3349 | 5 | // (fneg (fp_round x)) -> (fp_round (fneg x)) |
3350 | 5 | SDValue Neg = DAG.getNode(ISD::FNEG, SL, CvtSrc.getValueType(), CvtSrc); |
3351 | 5 | return DAG.getNode(ISD::FP_ROUND, SL, VT, Neg, N0.getOperand(1)); |
3352 | 5 | } |
3353 | 36 | case ISD::FP16_TO_FP: { |
3354 | 36 | // v_cvt_f32_f16 supports source modifiers on pre-VI targets without legal |
3355 | 36 | // f16, but legalization of f16 fneg ends up pulling it out of the source. |
3356 | 36 | // Put the fneg back as a legal source operation that can be matched later. |
3357 | 36 | SDLoc SL(N); |
3358 | 36 | |
3359 | 36 | SDValue Src = N0.getOperand(0); |
3360 | 36 | EVT SrcVT = Src.getValueType(); |
3361 | 36 | |
3362 | 36 | // fneg (fp16_to_fp x) -> fp16_to_fp (xor x, 0x8000) |
3363 | 36 | SDValue IntFNeg = DAG.getNode(ISD::XOR, SL, SrcVT, Src, |
3364 | 36 | DAG.getConstant(0x8000, SL, SrcVT)); |
3365 | 36 | return DAG.getNode(ISD::FP16_TO_FP, SL, N->getValueType(0), IntFNeg); |
3366 | 5 | } |
3367 | 1.54k | default: |
3368 | 1.54k | return SDValue(); |
3369 | 0 | } |
3370 | 0 | } |
3371 | | |
3372 | | SDValue AMDGPUTargetLowering::performFAbsCombine(SDNode *N, |
3373 | 1.77k | DAGCombinerInfo &DCI) const { |
3374 | 1.77k | SelectionDAG &DAG = DCI.DAG; |
3375 | 1.77k | SDValue N0 = N->getOperand(0); |
3376 | 1.77k | |
3377 | 1.77k | if (!N0.hasOneUse()) |
3378 | 263 | return SDValue(); |
3379 | 1.51k | |
3380 | 1.51k | switch (N0.getOpcode()) { |
3381 | 32 | case ISD::FP16_TO_FP: { |
3382 | 32 | assert(!Subtarget->has16BitInsts() && "should only see if f16 is illegal"); |
3383 | 32 | SDLoc SL(N); |
3384 | 32 | SDValue Src = N0.getOperand(0); |
3385 | 32 | EVT SrcVT = Src.getValueType(); |
3386 | 32 | |
3387 | 32 | // fabs (fp16_to_fp x) -> fp16_to_fp (and x, 0x7fff) |
3388 | 32 | SDValue IntFAbs = DAG.getNode(ISD::AND, SL, SrcVT, Src, |
3389 | 32 | DAG.getConstant(0x7fff, SL, SrcVT)); |
3390 | 32 | return DAG.getNode(ISD::FP16_TO_FP, SL, N->getValueType(0), IntFAbs); |
3391 | 1.51k | } |
3392 | 1.47k | default: |
3393 | 1.47k | return SDValue(); |
3394 | 0 | } |
3395 | 0 | } |
3396 | | |
3397 | | SDValue AMDGPUTargetLowering::PerformDAGCombine(SDNode *N, |
3398 | 584k | DAGCombinerInfo &DCI) const { |
3399 | 584k | SelectionDAG &DAG = DCI.DAG; |
3400 | 584k | SDLoc DL(N); |
3401 | 584k | |
3402 | 584k | switch(N->getOpcode()) { |
3403 | 126k | default: |
3404 | 126k | break; |
3405 | 85.6k | case ISD::BITCAST: { |
3406 | 85.6k | EVT DestVT = N->getValueType(0); |
3407 | 85.6k | |
3408 | 85.6k | // Push casts through vector builds. This helps avoid emitting a large |
3409 | 85.6k | // number of copies when materializing floating point vector constants. |
3410 | 85.6k | // |
3411 | 85.6k | // vNt1 bitcast (vNt0 (build_vector t0:x, t0:y)) => |
3412 | 85.6k | // vnt1 = build_vector (t1 (bitcast t0:x)), (t1 (bitcast t0:y)) |
3413 | 85.6k | if (DestVT.isVector()85.6k ) { |
3414 | 24.6k | SDValue Src = N->getOperand(0); |
3415 | 24.6k | if (Src.getOpcode() == ISD::BUILD_VECTOR24.6k ) { |
3416 | 2.56k | EVT SrcVT = Src.getValueType(); |
3417 | 2.56k | unsigned NElts = DestVT.getVectorNumElements(); |
3418 | 2.56k | |
3419 | 2.56k | if (SrcVT.getVectorNumElements() == NElts2.56k ) { |
3420 | 1.67k | EVT DestEltVT = DestVT.getVectorElementType(); |
3421 | 1.67k | |
3422 | 1.67k | SmallVector<SDValue, 8> CastedElts; |
3423 | 1.67k | SDLoc SL(N); |
3424 | 10.9k | for (unsigned I = 0, E = SrcVT.getVectorNumElements(); I != E10.9k ; ++I9.24k ) { |
3425 | 9.24k | SDValue Elt = Src.getOperand(I); |
3426 | 9.24k | CastedElts.push_back(DAG.getNode(ISD::BITCAST, DL, DestEltVT, Elt)); |
3427 | 9.24k | } |
3428 | 1.67k | |
3429 | 1.67k | return DAG.getBuildVector(DestVT, SL, CastedElts); |
3430 | 1.67k | } |
3431 | 84.0k | } |
3432 | 24.6k | } |
3433 | 84.0k | |
3434 | 84.0k | if (84.0k DestVT.getSizeInBits() != 64 && 84.0k !DestVT.isVector()35.5k ) |
3435 | 27.2k | break; |
3436 | 56.7k | |
3437 | 56.7k | // Fold bitcasts of constants. |
3438 | 56.7k | // |
3439 | 56.7k | // v2i32 (bitcast i64:k) -> build_vector lo_32(k), hi_32(k) |
3440 | 56.7k | // TODO: Generalize and move to DAGCombiner |
3441 | 56.7k | SDValue Src = N->getOperand(0); |
3442 | 56.7k | if (ConstantSDNode *C56.7k = dyn_cast<ConstantSDNode>(Src)) { |
3443 | 313 | assert(Src.getValueType() == MVT::i64); |
3444 | 313 | SDLoc SL(N); |
3445 | 313 | uint64_t CVal = C->getZExtValue(); |
3446 | 313 | return DAG.getNode(ISD::BUILD_VECTOR, SL, DestVT, |
3447 | 313 | DAG.getConstant(Lo_32(CVal), SL, MVT::i32), |
3448 | 313 | DAG.getConstant(Hi_32(CVal), SL, MVT::i32)); |
3449 | 313 | } |
3450 | 56.4k | |
3451 | 56.4k | if (ConstantFPSDNode *56.4k C56.4k = dyn_cast<ConstantFPSDNode>(Src)) { |
3452 | 18 | const APInt &Val = C->getValueAPF().bitcastToAPInt(); |
3453 | 18 | SDLoc SL(N); |
3454 | 18 | uint64_t CVal = Val.getZExtValue(); |
3455 | 18 | SDValue Vec = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, |
3456 | 18 | DAG.getConstant(Lo_32(CVal), SL, MVT::i32), |
3457 | 18 | DAG.getConstant(Hi_32(CVal), SL, MVT::i32)); |
3458 | 18 | |
3459 | 18 | return DAG.getNode(ISD::BITCAST, SL, DestVT, Vec); |
3460 | 18 | } |
3461 | 56.4k | |
3462 | 56.4k | break; |
3463 | 56.4k | } |
3464 | 30.0k | case ISD::SHL: { |
3465 | 30.0k | if (DCI.getDAGCombineLevel() < AfterLegalizeDAG) |
3466 | 11.7k | break; |
3467 | 18.2k | |
3468 | 18.2k | return performShlCombine(N, DCI); |
3469 | 18.2k | } |
3470 | 79.8k | case ISD::SRL: { |
3471 | 79.8k | if (DCI.getDAGCombineLevel() < AfterLegalizeDAG) |
3472 | 26.7k | break; |
3473 | 53.0k | |
3474 | 53.0k | return performSrlCombine(N, DCI); |
3475 | 53.0k | } |
3476 | 11.7k | case ISD::SRA: { |
3477 | 11.7k | if (DCI.getDAGCombineLevel() < AfterLegalizeDAG) |
3478 | 6.43k | break; |
3479 | 5.29k | |
3480 | 5.29k | return performSraCombine(N, DCI); |
3481 | 5.29k | } |
3482 | 1.90k | case ISD::MUL: |
3483 | 1.90k | return performMulCombine(N, DCI); |
3484 | 73 | case ISD::MULHS: |
3485 | 73 | return performMulhsCombine(N, DCI); |
3486 | 1.07k | case ISD::MULHU: |
3487 | 1.07k | return performMulhuCombine(N, DCI); |
3488 | 1.22k | case AMDGPUISD::MUL_I24: |
3489 | 1.22k | case AMDGPUISD::MUL_U24: |
3490 | 1.22k | case AMDGPUISD::MULHI_I24: |
3491 | 1.22k | case AMDGPUISD::MULHI_U24: { |
3492 | 1.22k | // If the first call to simplify is successfull, then N may end up being |
3493 | 1.22k | // deleted, so we shouldn't call simplifyI24 again. |
3494 | 1.17k | simplifyI24(N, 0, DCI) || simplifyI24(N, 1, DCI); |
3495 | 1.22k | return SDValue(); |
3496 | 1.22k | } |
3497 | 121 | case AMDGPUISD::MUL_LOHI_I24: |
3498 | 121 | case AMDGPUISD::MUL_LOHI_U24: |
3499 | 121 | return performMulLoHi24Combine(N, DCI); |
3500 | 13.6k | case ISD::SELECT: |
3501 | 13.6k | return performSelectCombine(N, DCI); |
3502 | 2.81k | case ISD::FNEG: |
3503 | 2.81k | return performFNegCombine(N, DCI); |
3504 | 1.77k | case ISD::FABS: |
3505 | 1.77k | return performFAbsCombine(N, DCI); |
3506 | 416 | case AMDGPUISD::BFE_I32: |
3507 | 416 | case AMDGPUISD::BFE_U32: { |
3508 | 416 | assert(!N->getValueType(0).isVector() && |
3509 | 416 | "Vector handling of BFE not implemented"); |
3510 | 416 | ConstantSDNode *Width = dyn_cast<ConstantSDNode>(N->getOperand(2)); |
3511 | 416 | if (!Width) |
3512 | 12 | break; |
3513 | 404 | |
3514 | 404 | uint32_t WidthVal = Width->getZExtValue() & 0x1f; |
3515 | 404 | if (WidthVal == 0) |
3516 | 22 | return DAG.getConstant(0, DL, MVT::i32); |
3517 | 382 | |
3518 | 382 | ConstantSDNode *Offset = dyn_cast<ConstantSDNode>(N->getOperand(1)); |
3519 | 382 | if (!Offset) |
3520 | 4 | break; |
3521 | 378 | |
3522 | 378 | SDValue BitsFrom = N->getOperand(0); |
3523 | 378 | uint32_t OffsetVal = Offset->getZExtValue() & 0x1f; |
3524 | 378 | |
3525 | 378 | bool Signed = N->getOpcode() == AMDGPUISD::BFE_I32; |
3526 | 378 | |
3527 | 378 | if (OffsetVal == 0378 ) { |
3528 | 58 | // This is already sign / zero extended, so try to fold away extra BFEs. |
3529 | 58 | unsigned SignBits = Signed ? (32 - WidthVal + 1)32 : (32 - WidthVal)26 ; |
3530 | 58 | |
3531 | 58 | unsigned OpSignBits = DAG.ComputeNumSignBits(BitsFrom); |
3532 | 58 | if (OpSignBits >= SignBits) |
3533 | 34 | return BitsFrom; |
3534 | 24 | |
3535 | 24 | EVT SmallVT = EVT::getIntegerVT(*DAG.getContext(), WidthVal); |
3536 | 24 | if (Signed24 ) { |
3537 | 18 | // This is a sign_extend_inreg. Replace it to take advantage of existing |
3538 | 18 | // DAG Combines. If not eliminated, we will match back to BFE during |
3539 | 18 | // selection. |
3540 | 18 | |
3541 | 18 | // TODO: The sext_inreg of extended types ends, although we can could |
3542 | 18 | // handle them in a single BFE. |
3543 | 18 | return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, MVT::i32, BitsFrom, |
3544 | 18 | DAG.getValueType(SmallVT)); |
3545 | 18 | } |
3546 | 6 | |
3547 | 6 | return DAG.getZeroExtendInReg(BitsFrom, DL, SmallVT); |
3548 | 6 | } |
3549 | 320 | |
3550 | 320 | if (ConstantSDNode *320 CVal320 = dyn_cast<ConstantSDNode>(BitsFrom)) { |
3551 | 48 | if (Signed48 ) { |
3552 | 24 | return constantFoldBFE<int32_t>(DAG, |
3553 | 24 | CVal->getSExtValue(), |
3554 | 24 | OffsetVal, |
3555 | 24 | WidthVal, |
3556 | 24 | DL); |
3557 | 24 | } |
3558 | 24 | |
3559 | 24 | return constantFoldBFE<uint32_t>(DAG, |
3560 | 24 | CVal->getZExtValue(), |
3561 | 24 | OffsetVal, |
3562 | 24 | WidthVal, |
3563 | 24 | DL); |
3564 | 24 | } |
3565 | 272 | |
3566 | 272 | if (272 (OffsetVal + WidthVal) >= 32 && |
3567 | 272 | !(Subtarget->hasSDWA() && 84 OffsetVal == 1667 && WidthVal == 162 )) { |
3568 | 82 | SDValue ShiftVal = DAG.getConstant(OffsetVal, DL, MVT::i32); |
3569 | 82 | return DAG.getNode(Signed ? ISD::SRA16 : ISD::SRL66 , DL, MVT::i32, |
3570 | 82 | BitsFrom, ShiftVal); |
3571 | 82 | } |
3572 | 190 | |
3573 | 190 | if (190 BitsFrom.hasOneUse()190 ) { |
3574 | 48 | APInt Demanded = APInt::getBitsSet(32, |
3575 | 48 | OffsetVal, |
3576 | 48 | OffsetVal + WidthVal); |
3577 | 48 | |
3578 | 48 | KnownBits Known; |
3579 | 48 | TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(), |
3580 | 48 | !DCI.isBeforeLegalizeOps()); |
3581 | 48 | const TargetLowering &TLI = DAG.getTargetLoweringInfo(); |
3582 | 48 | if (TLI.ShrinkDemandedConstant(BitsFrom, Demanded, TLO) || |
3583 | 48 | TLI.SimplifyDemandedBits(BitsFrom, Demanded, Known, TLO)40 ) { |
3584 | 14 | DCI.CommitTargetLoweringOpt(TLO); |
3585 | 14 | } |
3586 | 48 | } |
3587 | 190 | |
3588 | 190 | break; |
3589 | 190 | } |
3590 | 130k | case ISD::LOAD: |
3591 | 130k | return performLoadCombine(N, DCI); |
3592 | 87.3k | case ISD::STORE: |
3593 | 87.3k | return performStoreCombine(N, DCI); |
3594 | 321 | case AMDGPUISD::CLAMP: |
3595 | 321 | return performClampCombine(N, DCI); |
3596 | 409 | case AMDGPUISD::RCP: { |
3597 | 409 | if (const auto *CFP409 = dyn_cast<ConstantFPSDNode>(N->getOperand(0))) { |
3598 | 17 | // XXX - Should this flush denormals? |
3599 | 17 | const APFloat &Val = CFP->getValueAPF(); |
3600 | 17 | APFloat One(Val.getSemantics(), "1.0"); |
3601 | 17 | return DAG.getConstantFP(One / Val, SDLoc(N), N->getValueType(0)); |
3602 | 17 | } |
3603 | 392 | |
3604 | 392 | break; |
3605 | 392 | } |
3606 | 9.56k | case ISD::AssertZext: |
3607 | 9.56k | case ISD::AssertSext: |
3608 | 9.56k | return performAssertSZExtCombine(N, DCI); |
3609 | 255k | } |
3610 | 255k | return SDValue(); |
3611 | 255k | } |
3612 | | |
3613 | | //===----------------------------------------------------------------------===// |
3614 | | // Helper functions |
3615 | | //===----------------------------------------------------------------------===// |
3616 | | |
3617 | | SDValue AMDGPUTargetLowering::CreateLiveInRegister(SelectionDAG &DAG, |
3618 | | const TargetRegisterClass *RC, |
3619 | | unsigned Reg, EVT VT, |
3620 | | const SDLoc &SL, |
3621 | 4.16k | bool RawReg) const { |
3622 | 4.16k | MachineFunction &MF = DAG.getMachineFunction(); |
3623 | 4.16k | MachineRegisterInfo &MRI = MF.getRegInfo(); |
3624 | 4.16k | unsigned VReg; |
3625 | 4.16k | |
3626 | 4.16k | if (!MRI.isLiveIn(Reg)4.16k ) { |
3627 | 1.02k | VReg = MRI.createVirtualRegister(RC); |
3628 | 1.02k | MRI.addLiveIn(Reg, VReg); |
3629 | 4.16k | } else { |
3630 | 3.13k | VReg = MRI.getLiveInVirtReg(Reg); |
3631 | 3.13k | } |
3632 | 4.16k | |
3633 | 4.16k | if (RawReg) |
3634 | 169 | return DAG.getRegister(VReg, VT); |
3635 | 3.99k | |
3636 | 3.99k | return DAG.getCopyFromReg(DAG.getEntryNode(), SL, VReg, VT); |
3637 | 3.99k | } |
3638 | | |
3639 | | SDValue AMDGPUTargetLowering::loadStackInputValue(SelectionDAG &DAG, |
3640 | | EVT VT, |
3641 | | const SDLoc &SL, |
3642 | 8 | int64_t Offset) const { |
3643 | 8 | MachineFunction &MF = DAG.getMachineFunction(); |
3644 | 8 | MachineFrameInfo &MFI = MF.getFrameInfo(); |
3645 | 8 | |
3646 | 8 | int FI = MFI.CreateFixedObject(VT.getStoreSize(), Offset, true); |
3647 | 8 | auto SrcPtrInfo = MachinePointerInfo::getStack(MF, Offset); |
3648 | 8 | SDValue Ptr = DAG.getFrameIndex(FI, MVT::i32); |
3649 | 8 | |
3650 | 8 | return DAG.getLoad(VT, SL, DAG.getEntryNode(), Ptr, SrcPtrInfo, 4, |
3651 | 8 | MachineMemOperand::MODereferenceable | |
3652 | 8 | MachineMemOperand::MOInvariant); |
3653 | 8 | } |
3654 | | |
3655 | | SDValue AMDGPUTargetLowering::storeStackInputValue(SelectionDAG &DAG, |
3656 | | const SDLoc &SL, |
3657 | | SDValue Chain, |
3658 | | SDValue StackPtr, |
3659 | | SDValue ArgVal, |
3660 | 10 | int64_t Offset) const { |
3661 | 10 | MachineFunction &MF = DAG.getMachineFunction(); |
3662 | 10 | MachinePointerInfo DstInfo = MachinePointerInfo::getStack(MF, Offset); |
3663 | 10 | SDValue PtrOffset = DAG.getConstant(Offset, SL, MVT::i32); |
3664 | 10 | SDValue Ptr = DAG.getNode(ISD::ADD, SL, MVT::i32, StackPtr, PtrOffset); |
3665 | 10 | |
3666 | 10 | SDValue Store = DAG.getStore(Chain, SL, ArgVal, Ptr, DstInfo, 4, |
3667 | 10 | MachineMemOperand::MODereferenceable); |
3668 | 10 | return Store; |
3669 | 10 | } |
3670 | | |
3671 | | SDValue AMDGPUTargetLowering::loadInputValue(SelectionDAG &DAG, |
3672 | | const TargetRegisterClass *RC, |
3673 | | EVT VT, const SDLoc &SL, |
3674 | 2.94k | const ArgDescriptor &Arg) const { |
3675 | 2.94k | assert(Arg && "Attempting to load missing argument"); |
3676 | 2.94k | |
3677 | 2.94k | if (Arg.isRegister()) |
3678 | 2.94k | return CreateLiveInRegister(DAG, RC, Arg.getRegister(), VT, SL); |
3679 | 8 | return loadStackInputValue(DAG, VT, SL, Arg.getStackOffset()); |
3680 | 8 | } |
3681 | | |
3682 | | uint32_t AMDGPUTargetLowering::getImplicitParameterOffset( |
3683 | 30 | const AMDGPUMachineFunction *MFI, const ImplicitParameter Param) const { |
3684 | 30 | unsigned Alignment = Subtarget->getAlignmentForImplicitArgPtr(); |
3685 | 30 | uint64_t ArgOffset = alignTo(MFI->getABIArgOffset(), Alignment); |
3686 | 30 | switch (Param) { |
3687 | 30 | case GRID_DIM: |
3688 | 30 | return ArgOffset; |
3689 | 0 | case GRID_OFFSET: |
3690 | 0 | return ArgOffset + 4; |
3691 | 0 | } |
3692 | 0 | llvm_unreachable0 ("unexpected implicit parameter type"); |
3693 | 0 | } |
3694 | | |
3695 | 0 | #define NODE_NAME_CASE(node) case AMDGPUISD::node: return #node; |
3696 | | |
3697 | 0 | const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const { |
3698 | 0 | switch ((AMDGPUISD::NodeType)Opcode) { |
3699 | 0 | case AMDGPUISD::FIRST_NUMBER: break; |
3700 | 0 | // AMDIL DAG nodes |
3701 | 0 | NODE_NAME_CASE0 (UMUL)0 ; |
3702 | 0 | NODE_NAME_CASE0 (BRANCH_COND)0 ; |
3703 | 0 |
|
3704 | 0 | // AMDGPU DAG nodes |
3705 | 0 | NODE_NAME_CASE0 (IF) |
3706 | 0 | NODE_NAME_CASE0 (ELSE) |
3707 | 0 | NODE_NAME_CASE0 (LOOP) |
3708 | 0 | NODE_NAME_CASE0 (CALL) |
3709 | 0 | NODE_NAME_CASE0 (TC_RETURN) |
3710 | 0 | NODE_NAME_CASE0 (TRAP) |
3711 | 0 | NODE_NAME_CASE0 (RET_FLAG) |
3712 | 0 | NODE_NAME_CASE0 (RETURN_TO_EPILOG) |
3713 | 0 | NODE_NAME_CASE0 (ENDPGM) |
3714 | 0 | NODE_NAME_CASE0 (DWORDADDR) |
3715 | 0 | NODE_NAME_CASE0 (FRACT) |
3716 | 0 | NODE_NAME_CASE0 (SETCC) |
3717 | 0 | NODE_NAME_CASE0 (SETREG) |
3718 | 0 | NODE_NAME_CASE0 (FMA_W_CHAIN) |
3719 | 0 | NODE_NAME_CASE0 (FMUL_W_CHAIN) |
3720 | 0 | NODE_NAME_CASE0 (CLAMP) |
3721 | 0 | NODE_NAME_CASE0 (COS_HW) |
3722 | 0 | NODE_NAME_CASE0 (SIN_HW) |
3723 | 0 | NODE_NAME_CASE0 (FMAX_LEGACY) |
3724 | 0 | NODE_NAME_CASE0 (FMIN_LEGACY) |
3725 | 0 | NODE_NAME_CASE0 (FMAX3) |
3726 | 0 | NODE_NAME_CASE0 (SMAX3) |
3727 | 0 | NODE_NAME_CASE0 (UMAX3) |
3728 | 0 | NODE_NAME_CASE0 (FMIN3) |
3729 | 0 | NODE_NAME_CASE0 (SMIN3) |
3730 | 0 | NODE_NAME_CASE0 (UMIN3) |
3731 | 0 | NODE_NAME_CASE0 (FMED3) |
3732 | 0 | NODE_NAME_CASE0 (SMED3) |
3733 | 0 | NODE_NAME_CASE0 (UMED3) |
3734 | 0 | NODE_NAME_CASE0 (URECIP) |
3735 | 0 | NODE_NAME_CASE0 (DIV_SCALE) |
3736 | 0 | NODE_NAME_CASE0 (DIV_FMAS) |
3737 | 0 | NODE_NAME_CASE0 (DIV_FIXUP) |
3738 | 0 | NODE_NAME_CASE0 (FMAD_FTZ) |
3739 | 0 | NODE_NAME_CASE0 (TRIG_PREOP) |
3740 | 0 | NODE_NAME_CASE0 (RCP) |
3741 | 0 | NODE_NAME_CASE0 (RSQ) |
3742 | 0 | NODE_NAME_CASE0 (RCP_LEGACY) |
3743 | 0 | NODE_NAME_CASE0 (RSQ_LEGACY) |
3744 | 0 | NODE_NAME_CASE0 (FMUL_LEGACY) |
3745 | 0 | NODE_NAME_CASE0 (RSQ_CLAMP) |
3746 | 0 | NODE_NAME_CASE0 (LDEXP) |
3747 | 0 | NODE_NAME_CASE0 (FP_CLASS) |
3748 | 0 | NODE_NAME_CASE0 (DOT4) |
3749 | 0 | NODE_NAME_CASE0 (CARRY) |
3750 | 0 | NODE_NAME_CASE0 (BORROW) |
3751 | 0 | NODE_NAME_CASE0 (BFE_U32) |
3752 | 0 | NODE_NAME_CASE0 (BFE_I32) |
3753 | 0 | NODE_NAME_CASE0 (BFI) |
3754 | 0 | NODE_NAME_CASE0 (BFM) |
3755 | 0 | NODE_NAME_CASE0 (FFBH_U32) |
3756 | 0 | NODE_NAME_CASE0 (FFBH_I32) |
3757 | 0 | NODE_NAME_CASE0 (MUL_U24) |
3758 | 0 | NODE_NAME_CASE0 (MUL_I24) |
3759 | 0 | NODE_NAME_CASE0 (MULHI_U24) |
3760 | 0 | NODE_NAME_CASE0 (MULHI_I24) |
3761 | 0 | NODE_NAME_CASE0 (MUL_LOHI_U24) |
3762 | 0 | NODE_NAME_CASE0 (MUL_LOHI_I24) |
3763 | 0 | NODE_NAME_CASE0 (MAD_U24) |
3764 | 0 | NODE_NAME_CASE0 (MAD_I24) |
3765 | 0 | NODE_NAME_CASE0 (TEXTURE_FETCH) |
3766 | 0 | NODE_NAME_CASE0 (EXPORT) |
3767 | 0 | NODE_NAME_CASE0 (EXPORT_DONE) |
3768 | 0 | NODE_NAME_CASE0 (R600_EXPORT) |
3769 | 0 | NODE_NAME_CASE0 (CONST_ADDRESS) |
3770 | 0 | NODE_NAME_CASE0 (REGISTER_LOAD) |
3771 | 0 | NODE_NAME_CASE0 (REGISTER_STORE) |
3772 | 0 | NODE_NAME_CASE0 (SAMPLE) |
3773 | 0 | NODE_NAME_CASE0 (SAMPLEB) |
3774 | 0 | NODE_NAME_CASE0 (SAMPLED) |
3775 | 0 | NODE_NAME_CASE0 (SAMPLEL) |
3776 | 0 | NODE_NAME_CASE0 (CVT_F32_UBYTE0) |
3777 | 0 | NODE_NAME_CASE0 (CVT_F32_UBYTE1) |
3778 | 0 | NODE_NAME_CASE0 (CVT_F32_UBYTE2) |
3779 | 0 | NODE_NAME_CASE0 (CVT_F32_UBYTE3) |
3780 | 0 | NODE_NAME_CASE0 (CVT_PKRTZ_F16_F32) |
3781 | 0 | NODE_NAME_CASE0 (FP_TO_FP16) |
3782 | 0 | NODE_NAME_CASE0 (FP16_ZEXT) |
3783 | 0 | NODE_NAME_CASE0 (BUILD_VERTICAL_VECTOR) |
3784 | 0 | NODE_NAME_CASE0 (CONST_DATA_PTR) |
3785 | 0 | NODE_NAME_CASE0 (PC_ADD_REL_OFFSET) |
3786 | 0 | NODE_NAME_CASE0 (KILL) |
3787 | 0 | NODE_NAME_CASE0 (DUMMY_CHAIN) |
3788 | 0 | case AMDGPUISD::FIRST_MEM_OPCODE_NUMBER: break; |
3789 | 0 | NODE_NAME_CASE0 (INIT_EXEC) |
3790 | 0 | NODE_NAME_CASE0 (INIT_EXEC_FROM_INPUT) |
3791 | 0 | NODE_NAME_CASE0 (SENDMSG) |
3792 | 0 | NODE_NAME_CASE0 (SENDMSGHALT) |
3793 | 0 | NODE_NAME_CASE0 (INTERP_MOV) |
3794 | 0 | NODE_NAME_CASE0 (INTERP_P1) |
3795 | 0 | NODE_NAME_CASE0 (INTERP_P2) |
3796 | 0 | NODE_NAME_CASE0 (STORE_MSKOR) |
3797 | 0 | NODE_NAME_CASE0 (LOAD_CONSTANT) |
3798 | 0 | NODE_NAME_CASE0 (TBUFFER_STORE_FORMAT) |
3799 | 0 | NODE_NAME_CASE0 (TBUFFER_STORE_FORMAT_X3) |
3800 | 0 | NODE_NAME_CASE0 (TBUFFER_LOAD_FORMAT) |
3801 | 0 | NODE_NAME_CASE0 (ATOMIC_CMP_SWAP) |
3802 | 0 | NODE_NAME_CASE0 (ATOMIC_INC) |
3803 | 0 | NODE_NAME_CASE0 (ATOMIC_DEC) |
3804 | 0 | NODE_NAME_CASE0 (BUFFER_LOAD) |
3805 | 0 | NODE_NAME_CASE0 (BUFFER_LOAD_FORMAT) |
3806 | 0 | case AMDGPUISD::LAST_AMDGPU_ISD_NUMBER: break; |
3807 | 0 | } |
3808 | 0 | return nullptr; |
3809 | 0 | } |
3810 | | |
3811 | | SDValue AMDGPUTargetLowering::getSqrtEstimate(SDValue Operand, |
3812 | | SelectionDAG &DAG, int Enabled, |
3813 | | int &RefinementSteps, |
3814 | | bool &UseOneConstNR, |
3815 | 8 | bool Reciprocal) const { |
3816 | 8 | EVT VT = Operand.getValueType(); |
3817 | 8 | |
3818 | 8 | if (VT == MVT::f328 ) { |
3819 | 5 | RefinementSteps = 0; |
3820 | 5 | return DAG.getNode(AMDGPUISD::RSQ, SDLoc(Operand), VT, Operand); |
3821 | 5 | } |
3822 | 3 | |
3823 | 3 | // TODO: There is also f64 rsq instruction, but the documentation is less |
3824 | 3 | // clear on its precision. |
3825 | 3 | |
3826 | 3 | return SDValue(); |
3827 | 3 | } |
3828 | | |
3829 | | SDValue AMDGPUTargetLowering::getRecipEstimate(SDValue Operand, |
3830 | | SelectionDAG &DAG, int Enabled, |
3831 | 12 | int &RefinementSteps) const { |
3832 | 12 | EVT VT = Operand.getValueType(); |
3833 | 12 | |
3834 | 12 | if (VT == MVT::f3212 ) { |
3835 | 7 | // Reciprocal, < 1 ulp error. |
3836 | 7 | // |
3837 | 7 | // This reciprocal approximation converges to < 0.5 ulp error with one |
3838 | 7 | // newton rhapson performed with two fused multiple adds (FMAs). |
3839 | 7 | |
3840 | 7 | RefinementSteps = 0; |
3841 | 7 | return DAG.getNode(AMDGPUISD::RCP, SDLoc(Operand), VT, Operand); |
3842 | 7 | } |
3843 | 5 | |
3844 | 5 | // TODO: There is also f64 rcp instruction, but the documentation is less |
3845 | 5 | // clear on its precision. |
3846 | 5 | |
3847 | 5 | return SDValue(); |
3848 | 5 | } |
3849 | | |
3850 | | void AMDGPUTargetLowering::computeKnownBitsForTargetNode( |
3851 | | const SDValue Op, KnownBits &Known, |
3852 | 115k | const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth) const { |
3853 | 115k | |
3854 | 115k | Known.resetAll(); // Don't know anything. |
3855 | 115k | |
3856 | 115k | unsigned Opc = Op.getOpcode(); |
3857 | 115k | |
3858 | 115k | switch (Opc) { |
3859 | 99.6k | default: |
3860 | 99.6k | break; |
3861 | 8.40k | case AMDGPUISD::CARRY: |
3862 | 8.40k | case AMDGPUISD::BORROW: { |
3863 | 8.40k | Known.Zero = APInt::getHighBitsSet(32, 31); |
3864 | 8.40k | break; |
3865 | 8.40k | } |
3866 | 8.40k | |
3867 | 527 | case AMDGPUISD::BFE_I32: |
3868 | 527 | case AMDGPUISD::BFE_U32: { |
3869 | 527 | ConstantSDNode *CWidth = dyn_cast<ConstantSDNode>(Op.getOperand(2)); |
3870 | 527 | if (!CWidth) |
3871 | 0 | return; |
3872 | 527 | |
3873 | 527 | uint32_t Width = CWidth->getZExtValue() & 0x1f; |
3874 | 527 | |
3875 | 527 | if (Opc == AMDGPUISD::BFE_U32) |
3876 | 495 | Known.Zero = APInt::getHighBitsSet(32, 32 - Width); |
3877 | 527 | |
3878 | 527 | break; |
3879 | 527 | } |
3880 | 1.36k | case AMDGPUISD::FP_TO_FP16: |
3881 | 1.36k | case AMDGPUISD::FP16_ZEXT: { |
3882 | 1.36k | unsigned BitWidth = Known.getBitWidth(); |
3883 | 1.36k | |
3884 | 1.36k | // High bits are zero. |
3885 | 1.36k | Known.Zero = APInt::getHighBitsSet(BitWidth, BitWidth - 16); |
3886 | 1.36k | break; |
3887 | 1.36k | } |
3888 | 5.57k | case AMDGPUISD::MUL_U24: |
3889 | 5.57k | case AMDGPUISD::MUL_I24: { |
3890 | 5.57k | KnownBits LHSKnown, RHSKnown; |
3891 | 5.57k | DAG.computeKnownBits(Op.getOperand(0), LHSKnown, Depth + 1); |
3892 | 5.57k | DAG.computeKnownBits(Op.getOperand(1), RHSKnown, Depth + 1); |
3893 | 5.57k | |
3894 | 5.57k | unsigned TrailZ = LHSKnown.countMinTrailingZeros() + |
3895 | 5.57k | RHSKnown.countMinTrailingZeros(); |
3896 | 5.57k | Known.Zero.setLowBits(std::min(TrailZ, 32u)); |
3897 | 5.57k | |
3898 | 5.57k | unsigned LHSValBits = 32 - std::max(LHSKnown.countMinSignBits(), 8u); |
3899 | 5.57k | unsigned RHSValBits = 32 - std::max(RHSKnown.countMinSignBits(), 8u); |
3900 | 5.57k | unsigned MaxValBits = std::min(LHSValBits + RHSValBits, 32u); |
3901 | 5.57k | if (MaxValBits >= 32) |
3902 | 2.70k | break; |
3903 | 2.86k | bool Negative = false; |
3904 | 2.86k | if (Opc == AMDGPUISD::MUL_I242.86k ) { |
3905 | 5 | bool LHSNegative = !!(LHSKnown.One & (1 << 23)); |
3906 | 5 | bool LHSPositive = !!(LHSKnown.Zero & (1 << 23)); |
3907 | 5 | bool RHSNegative = !!(RHSKnown.One & (1 << 23)); |
3908 | 5 | bool RHSPositive = !!(RHSKnown.Zero & (1 << 23)); |
3909 | 5 | if ((!LHSNegative && 5 !LHSPositive0 ) || (!RHSNegative && 5 !RHSPositive0 )) |
3910 | 0 | break; |
3911 | 5 | Negative = (LHSNegative && 5 RHSPositive5 ) || (LHSPositive && 5 RHSNegative0 ); |
3912 | 5 | } |
3913 | 2.86k | if (2.86k Negative2.86k ) |
3914 | 0 | Known.One.setHighBits(32 - MaxValBits); |
3915 | 2.86k | else |
3916 | 2.86k | Known.Zero.setHighBits(32 - MaxValBits); |
3917 | 2.86k | break; |
3918 | 115k | } |
3919 | 115k | } |
3920 | 115k | } |
3921 | | |
3922 | | unsigned AMDGPUTargetLowering::ComputeNumSignBitsForTargetNode( |
3923 | | SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, |
3924 | 1.59k | unsigned Depth) const { |
3925 | 1.59k | switch (Op.getOpcode()) { |
3926 | 2 | case AMDGPUISD::BFE_I32: { |
3927 | 2 | ConstantSDNode *Width = dyn_cast<ConstantSDNode>(Op.getOperand(2)); |
3928 | 2 | if (!Width) |
3929 | 0 | return 1; |
3930 | 2 | |
3931 | 2 | unsigned SignBits = 32 - Width->getZExtValue() + 1; |
3932 | 2 | if (!isNullConstant(Op.getOperand(1))) |
3933 | 2 | return SignBits; |
3934 | 0 |
|
3935 | 0 | // TODO: Could probably figure something out with non-0 offsets. |
3936 | 0 | unsigned Op0SignBits = DAG.ComputeNumSignBits(Op.getOperand(0), Depth + 1); |
3937 | 0 | return std::max(SignBits, Op0SignBits); |
3938 | 0 | } |
3939 | 0 |
|
3940 | 0 | case AMDGPUISD::BFE_U32: { |
3941 | 0 | ConstantSDNode *Width = dyn_cast<ConstantSDNode>(Op.getOperand(2)); |
3942 | 0 | return Width ? 32 - (Width->getZExtValue() & 0x1f)0 : 10 ; |
3943 | 0 | } |
3944 | 0 |
|
3945 | 694 | case AMDGPUISD::CARRY: |
3946 | 694 | case AMDGPUISD::BORROW: |
3947 | 694 | return 31; |
3948 | 1 | case AMDGPUISD::FP_TO_FP16: |
3949 | 1 | case AMDGPUISD::FP16_ZEXT: |
3950 | 1 | return 16; |
3951 | 900 | default: |
3952 | 900 | return 1; |
3953 | 0 | } |
3954 | 0 | } |