/Users/buildslave/jenkins/sharedspace/clang-stage2-coverage-R@2/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp
Line | Count | Source (jump to first uncovered line) |
//===- AArch64ExpandPseudoInsts.cpp - Expand pseudo instructions ----------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This file contains a pass that expands pseudo instructions into target
// instructions to allow proper scheduling and other late optimizations. This
// pass should be run after register allocation but before the post-regalloc
// scheduling pass.
//
//===----------------------------------------------------------------------===//
16 | | |
17 | | #include "AArch64InstrInfo.h" |
18 | | #include "AArch64Subtarget.h" |
19 | | #include "MCTargetDesc/AArch64AddressingModes.h" |
20 | | #include "Utils/AArch64BaseInfo.h" |
21 | | #include "llvm/ADT/DenseMap.h" |
22 | | #include "llvm/ADT/Triple.h" |
23 | | #include "llvm/CodeGen/LivePhysRegs.h" |
24 | | #include "llvm/CodeGen/MachineBasicBlock.h" |
25 | | #include "llvm/CodeGen/MachineFunction.h" |
26 | | #include "llvm/CodeGen/MachineFunctionPass.h" |
27 | | #include "llvm/CodeGen/MachineInstr.h" |
28 | | #include "llvm/CodeGen/MachineInstrBuilder.h" |
29 | | #include "llvm/CodeGen/MachineOperand.h" |
30 | | #include "llvm/IR/DebugLoc.h" |
31 | | #include "llvm/MC/MCInstrDesc.h" |
32 | | #include "llvm/Pass.h" |
33 | | #include "llvm/Support/CodeGen.h" |
34 | | #include "llvm/Support/MathExtras.h" |
35 | | #include "llvm/Target/TargetMachine.h" |
36 | | #include "llvm/Target/TargetSubtargetInfo.h" |
37 | | #include <cassert> |
38 | | #include <cstdint> |
39 | | #include <iterator> |
40 | | #include <limits> |
41 | | #include <utility> |
42 | | |
43 | | using namespace llvm; |
44 | | |
45 | 13.9k | #define AARCH64_EXPAND_PSEUDO_NAME "AArch64 pseudo instruction expansion pass" |
46 | | |
47 | | namespace { |
48 | | |
49 | | class AArch64ExpandPseudo : public MachineFunctionPass { |
50 | | public: |
51 | | const AArch64InstrInfo *TII; |
52 | | |
53 | | static char ID; |
54 | | |
55 | 13.9k | AArch64ExpandPseudo() : MachineFunctionPass(ID) { |
56 | 13.9k | initializeAArch64ExpandPseudoPass(*PassRegistry::getPassRegistry()); |
57 | 13.9k | } |
58 | | |
59 | | bool runOnMachineFunction(MachineFunction &Fn) override; |
60 | | |
61 | 13.9k | StringRef getPassName() const override { return 13.9k AARCH64_EXPAND_PSEUDO_NAME13.9k ; } |
62 | | |
63 | | private: |
64 | | bool expandMBB(MachineBasicBlock &MBB); |
65 | | bool expandMI(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, |
66 | | MachineBasicBlock::iterator &NextMBBI); |
67 | | bool expandMOVImm(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, |
68 | | unsigned BitSize); |
69 | | |
70 | | bool expandCMP_SWAP(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, |
71 | | unsigned LdarOp, unsigned StlrOp, unsigned CmpOp, |
72 | | unsigned ExtendImm, unsigned ZeroReg, |
73 | | MachineBasicBlock::iterator &NextMBBI); |
74 | | bool expandCMP_SWAP_128(MachineBasicBlock &MBB, |
75 | | MachineBasicBlock::iterator MBBI, |
76 | | MachineBasicBlock::iterator &NextMBBI); |
77 | | }; |
78 | | |
79 | | } // end anonymous namespace |
80 | | |
81 | | char AArch64ExpandPseudo::ID = 0; |
82 | | |
83 | | INITIALIZE_PASS(AArch64ExpandPseudo, "aarch64-expand-pseudo", |
84 | | AARCH64_EXPAND_PSEUDO_NAME, false, false) |
85 | | |
86 | | /// \brief Transfer implicit operands on the pseudo instruction to the |
87 | | /// instructions created from the expansion. |
88 | | static void transferImpOps(MachineInstr &OldMI, MachineInstrBuilder &UseMI, |
89 | 7.02M | MachineInstrBuilder &DefMI) { |
90 | 7.02M | const MCInstrDesc &Desc = OldMI.getDesc(); |
91 | 8.77M | for (unsigned i = Desc.getNumOperands(), e = OldMI.getNumOperands(); i != e; |
92 | 7.02M | ++i1.74M ) { |
93 | 1.74M | const MachineOperand &MO = OldMI.getOperand(i); |
94 | 1.74M | assert(MO.isReg() && MO.getReg()); |
95 | 1.74M | if (MO.isUse()) |
96 | 740k | UseMI.add(MO); |
97 | 1.74M | else |
98 | 1.00M | DefMI.add(MO); |
99 | 1.74M | } |
100 | 7.02M | } |
101 | | |
/// \brief Helper function which extracts the specified 16-bit chunk from a
/// 64-bit value.
///
/// \param Imm      The 64-bit value to extract from.
/// \param ChunkIdx Chunk index in [0, 3]; chunk 0 holds bits [15:0].
static uint64_t getChunk(uint64_t Imm, unsigned ChunkIdx) {
  assert(ChunkIdx < 4 && "Out of range chunk index specified!");

  return (Imm >> (ChunkIdx * 16)) & 0xFFFF;
}
109 | | |
110 | | /// \brief Helper function which replicates a 16-bit chunk within a 64-bit |
111 | | /// value. Indices correspond to element numbers in a v4i16. |
112 | 5.23k | static uint64_t replicateChunk(uint64_t Imm, unsigned FromIdx, unsigned ToIdx) { |
113 | 5.23k | assert((FromIdx < 4) && (ToIdx < 4) && "Out of range chunk index specified!"); |
114 | 5.23k | const unsigned ShiftAmt = ToIdx * 16; |
115 | 5.23k | |
116 | 5.23k | // Replicate the source chunk to the destination position. |
117 | 5.23k | const uint64_t Chunk = getChunk(Imm, FromIdx) << ShiftAmt; |
118 | 5.23k | // Clear the destination chunk. |
119 | 5.23k | Imm &= ~(0xFFFFLL << ShiftAmt); |
120 | 5.23k | // Insert the replicated chunk. |
121 | 5.23k | return Imm | Chunk; |
122 | 5.23k | } |
123 | | |
124 | | /// \brief Helper function which tries to materialize a 64-bit value with an |
125 | | /// ORR + MOVK instruction sequence. |
126 | | static bool tryOrrMovk(uint64_t UImm, uint64_t OrrImm, MachineInstr &MI, |
127 | | MachineBasicBlock &MBB, |
128 | | MachineBasicBlock::iterator &MBBI, |
129 | 5.23k | const AArch64InstrInfo *TII, unsigned ChunkIdx) { |
130 | 5.23k | assert(ChunkIdx < 4 && "Out of range chunk index specified!"); |
131 | 5.23k | const unsigned ShiftAmt = ChunkIdx * 16; |
132 | 5.23k | |
133 | 5.23k | uint64_t Encoding; |
134 | 5.23k | if (AArch64_AM::processLogicalImmediate(OrrImm, 64, Encoding)5.23k ) { |
135 | 3.93k | // Create the ORR-immediate instruction. |
136 | 3.93k | MachineInstrBuilder MIB = |
137 | 3.93k | BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(AArch64::ORRXri)) |
138 | 3.93k | .add(MI.getOperand(0)) |
139 | 3.93k | .addReg(AArch64::XZR) |
140 | 3.93k | .addImm(Encoding); |
141 | 3.93k | |
142 | 3.93k | // Create the MOVK instruction. |
143 | 3.93k | const unsigned Imm16 = getChunk(UImm, ChunkIdx); |
144 | 3.93k | const unsigned DstReg = MI.getOperand(0).getReg(); |
145 | 3.93k | const bool DstIsDead = MI.getOperand(0).isDead(); |
146 | 3.93k | MachineInstrBuilder MIB1 = |
147 | 3.93k | BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(AArch64::MOVKXi)) |
148 | 3.93k | .addReg(DstReg, RegState::Define | getDeadRegState(DstIsDead)) |
149 | 3.93k | .addReg(DstReg) |
150 | 3.93k | .addImm(Imm16) |
151 | 3.93k | .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, ShiftAmt)); |
152 | 3.93k | |
153 | 3.93k | transferImpOps(MI, MIB, MIB1); |
154 | 3.93k | MI.eraseFromParent(); |
155 | 3.93k | return true; |
156 | 3.93k | } |
157 | 1.30k | |
158 | 1.30k | return false; |
159 | 1.30k | } |
160 | | |
161 | | /// \brief Check whether the given 16-bit chunk replicated to full 64-bit width |
162 | | /// can be materialized with an ORR instruction. |
163 | 28.8k | static bool canUseOrr(uint64_t Chunk, uint64_t &Encoding) { |
164 | 28.8k | Chunk = (Chunk << 48) | (Chunk << 32) | (Chunk << 16) | Chunk; |
165 | 28.8k | |
166 | 28.8k | return AArch64_AM::processLogicalImmediate(Chunk, 64, Encoding); |
167 | 28.8k | } |
168 | | |
169 | | /// \brief Check for identical 16-bit chunks within the constant and if so |
170 | | /// materialize them with a single ORR instruction. The remaining one or two |
171 | | /// 16-bit chunks will be materialized with MOVK instructions. |
172 | | /// |
173 | | /// This allows us to materialize constants like |A|B|A|A| or |A|B|C|A| (order |
174 | | /// of the chunks doesn't matter), assuming |A|A|A|A| can be materialized with |
175 | | /// an ORR instruction. |
176 | | static bool tryToreplicateChunks(uint64_t UImm, MachineInstr &MI, |
177 | | MachineBasicBlock &MBB, |
178 | | MachineBasicBlock::iterator &MBBI, |
179 | 130k | const AArch64InstrInfo *TII) { |
180 | 130k | using CountMap = DenseMap<uint64_t, unsigned>; |
181 | 130k | |
182 | 130k | CountMap Counts; |
183 | 130k | |
184 | 130k | // Scan the constant and count how often every chunk occurs. |
185 | 650k | for (unsigned Idx = 0; Idx < 4650k ; ++Idx520k ) |
186 | 520k | ++Counts[getChunk(UImm, Idx)]; |
187 | 130k | |
188 | 130k | // Traverse the chunks to find one which occurs more than once. |
189 | 130k | for (CountMap::const_iterator Chunk = Counts.begin(), End = Counts.end(); |
190 | 412k | Chunk != End412k ; ++Chunk281k ) { |
191 | 282k | const uint64_t ChunkVal = Chunk->first; |
192 | 282k | const unsigned Count = Chunk->second; |
193 | 282k | |
194 | 282k | uint64_t Encoding = 0; |
195 | 282k | |
196 | 282k | // We are looking for chunks which have two or three instances and can be |
197 | 282k | // materialized with an ORR instruction. |
198 | 282k | if ((Count != 2 && 282k Count != 3273k ) || !canUseOrr(ChunkVal, Encoding)28.8k ) |
199 | 281k | continue; |
200 | 1.02k | |
201 | 1.02k | const bool CountThree = Count == 3; |
202 | 1.02k | // Create the ORR-immediate instruction. |
203 | 1.02k | MachineInstrBuilder MIB = |
204 | 1.02k | BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(AArch64::ORRXri)) |
205 | 1.02k | .add(MI.getOperand(0)) |
206 | 1.02k | .addReg(AArch64::XZR) |
207 | 1.02k | .addImm(Encoding); |
208 | 1.02k | |
209 | 1.02k | const unsigned DstReg = MI.getOperand(0).getReg(); |
210 | 1.02k | const bool DstIsDead = MI.getOperand(0).isDead(); |
211 | 1.02k | |
212 | 1.02k | unsigned ShiftAmt = 0; |
213 | 1.02k | uint64_t Imm16 = 0; |
214 | 1.02k | // Find the first chunk not materialized with the ORR instruction. |
215 | 1.73k | for (; ShiftAmt < 641.73k ; ShiftAmt += 16711 ) { |
216 | 1.73k | Imm16 = (UImm >> ShiftAmt) & 0xFFFF; |
217 | 1.73k | |
218 | 1.73k | if (Imm16 != ChunkVal) |
219 | 1.02k | break; |
220 | 1.73k | } |
221 | 1.02k | |
222 | 1.02k | // Create the first MOVK instruction. |
223 | 1.02k | MachineInstrBuilder MIB1 = |
224 | 1.02k | BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(AArch64::MOVKXi)) |
225 | 1.02k | .addReg(DstReg, |
226 | 0 | RegState::Define | getDeadRegState(DstIsDead && CountThree)) |
227 | 1.02k | .addReg(DstReg) |
228 | 1.02k | .addImm(Imm16) |
229 | 1.02k | .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, ShiftAmt)); |
230 | 1.02k | |
231 | 1.02k | // In case we have three instances the whole constant is now materialized |
232 | 1.02k | // and we can exit. |
233 | 1.02k | if (CountThree1.02k ) { |
234 | 0 | transferImpOps(MI, MIB, MIB1); |
235 | 0 | MI.eraseFromParent(); |
236 | 0 | return true; |
237 | 0 | } |
238 | 1.02k | |
239 | 1.02k | // Find the remaining chunk which needs to be materialized. |
240 | 1.61k | for (ShiftAmt += 16; 1.02k ShiftAmt < 641.61k ; ShiftAmt += 16597 ) { |
241 | 1.61k | Imm16 = (UImm >> ShiftAmt) & 0xFFFF; |
242 | 1.61k | |
243 | 1.61k | if (Imm16 != ChunkVal) |
244 | 1.02k | break; |
245 | 1.61k | } |
246 | 282k | |
247 | 282k | // Create the second MOVK instruction. |
248 | 282k | MachineInstrBuilder MIB2 = |
249 | 282k | BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(AArch64::MOVKXi)) |
250 | 282k | .addReg(DstReg, RegState::Define | getDeadRegState(DstIsDead)) |
251 | 282k | .addReg(DstReg) |
252 | 282k | .addImm(Imm16) |
253 | 282k | .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, ShiftAmt)); |
254 | 282k | |
255 | 282k | transferImpOps(MI, MIB, MIB2); |
256 | 282k | MI.eraseFromParent(); |
257 | 282k | return true; |
258 | 282k | } |
259 | 130k | |
260 | 129k | return false; |
261 | 130k | } |
262 | | |
263 | | /// \brief Check whether this chunk matches the pattern '1...0...'. This pattern |
264 | | /// starts a contiguous sequence of ones if we look at the bits from the LSB |
265 | | /// towards the MSB. |
266 | 516k | static bool isStartChunk(uint64_t Chunk) { |
267 | 516k | if (Chunk == 0 || 516k Chunk == std::numeric_limits<uint64_t>::max()182k ) |
268 | 357k | return false; |
269 | 158k | |
270 | 158k | return isMask_64(~Chunk); |
271 | 158k | } |
272 | | |
273 | | /// \brief Check whether this chunk matches the pattern '0...1...' This pattern |
274 | | /// ends a contiguous sequence of ones if we look at the bits from the LSB |
275 | | /// towards the MSB. |
276 | 511k | static bool isEndChunk(uint64_t Chunk) { |
277 | 511k | if (Chunk == 0 || 511k Chunk == std::numeric_limits<uint64_t>::max()177k ) |
278 | 357k | return false; |
279 | 153k | |
280 | 153k | return isMask_64(Chunk); |
281 | 153k | } |
282 | | |
/// \brief Clear or set all bits in the chunk at the given index.
///
/// \param Imm   The immediate to patch.
/// \param Idx   Chunk index in [0, 3].
/// \param Clear If true, zero the chunk; otherwise set all 16 of its bits.
static uint64_t updateImm(uint64_t Imm, unsigned Idx, bool Clear) {
  const uint64_t Mask = 0xFFFF;

  if (Clear)
    // Clear chunk in the immediate.
    Imm &= ~(Mask << (Idx * 16));
  else
    // Set all bits in the immediate for the particular chunk.
    Imm |= Mask << (Idx * 16);

  return Imm;
}
296 | | |
297 | | /// \brief Check whether the constant contains a sequence of contiguous ones, |
298 | | /// which might be interrupted by one or two chunks. If so, materialize the |
299 | | /// sequence of contiguous ones with an ORR instruction. |
300 | | /// Materialize the chunks which are either interrupting the sequence or outside |
301 | | /// of the sequence with a MOVK instruction. |
302 | | /// |
303 | | /// Assuming S is a chunk which starts the sequence (1...0...), E is a chunk |
304 | | /// which ends the sequence (0...1...). Then we are looking for constants which |
305 | | /// contain at least one S and E chunk. |
306 | | /// E.g. |E|A|B|S|, |A|E|B|S| or |A|B|E|S|. |
307 | | /// |
308 | | /// We are also looking for constants like |S|A|B|E| where the contiguous |
309 | | /// sequence of ones wraps around the MSB into the LSB. |
310 | | static bool trySequenceOfOnes(uint64_t UImm, MachineInstr &MI, |
311 | | MachineBasicBlock &MBB, |
312 | | MachineBasicBlock::iterator &MBBI, |
313 | 129k | const AArch64InstrInfo *TII) { |
314 | 129k | const int NotSet = -1; |
315 | 129k | const uint64_t Mask = 0xFFFF; |
316 | 129k | |
317 | 129k | int StartIdx = NotSet; |
318 | 129k | int EndIdx = NotSet; |
319 | 129k | // Try to find the chunks which start/end a contiguous sequence of ones. |
320 | 645k | for (int Idx = 0; Idx < 4645k ; ++Idx516k ) { |
321 | 516k | int64_t Chunk = getChunk(UImm, Idx); |
322 | 516k | // Sign extend the 16-bit chunk to 64-bit. |
323 | 516k | Chunk = (Chunk << 48) >> 48; |
324 | 516k | |
325 | 516k | if (isStartChunk(Chunk)) |
326 | 4.76k | StartIdx = Idx; |
327 | 511k | else if (511k isEndChunk(Chunk)511k ) |
328 | 46.5k | EndIdx = Idx; |
329 | 516k | } |
330 | 129k | |
331 | 129k | // Early exit in case we can't find a start/end chunk. |
332 | 129k | if (StartIdx == NotSet || 129k EndIdx == NotSet4.63k ) |
333 | 128k | return false; |
334 | 593 | |
335 | 593 | // Outside of the contiguous sequence of ones everything needs to be zero. |
336 | 593 | uint64_t Outside = 0; |
337 | 593 | // Chunks between the start and end chunk need to have all their bits set. |
338 | 593 | uint64_t Inside = Mask; |
339 | 593 | |
340 | 593 | // If our contiguous sequence of ones wraps around from the MSB into the LSB, |
341 | 593 | // just swap indices and pretend we are materializing a contiguous sequence |
342 | 593 | // of zeros surrounded by a contiguous sequence of ones. |
343 | 593 | if (StartIdx > EndIdx593 ) { |
344 | 294 | std::swap(StartIdx, EndIdx); |
345 | 294 | std::swap(Outside, Inside); |
346 | 294 | } |
347 | 593 | |
348 | 593 | uint64_t OrrImm = UImm; |
349 | 593 | int FirstMovkIdx = NotSet; |
350 | 593 | int SecondMovkIdx = NotSet; |
351 | 593 | |
352 | 593 | // Find out which chunks we need to patch up to obtain a contiguous sequence |
353 | 593 | // of ones. |
354 | 2.96k | for (int Idx = 0; Idx < 42.96k ; ++Idx2.37k ) { |
355 | 2.37k | const uint64_t Chunk = getChunk(UImm, Idx); |
356 | 2.37k | |
357 | 2.37k | // Check whether we are looking at a chunk which is not part of the |
358 | 2.37k | // contiguous sequence of ones. |
359 | 2.37k | if ((Idx < StartIdx || 2.37k EndIdx < Idx1.93k ) && Chunk != Outside808 ) { |
360 | 682 | OrrImm = updateImm(OrrImm, Idx, Outside == 0); |
361 | 682 | |
362 | 682 | // Remember the index we need to patch. |
363 | 682 | if (FirstMovkIdx == NotSet) |
364 | 428 | FirstMovkIdx = Idx; |
365 | 682 | else |
366 | 254 | SecondMovkIdx = Idx; |
367 | 682 | |
368 | 682 | // Check whether we are looking a chunk which is part of the contiguous |
369 | 682 | // sequence of ones. |
370 | 2.37k | } else if (1.69k Idx > StartIdx && 1.69k Idx < EndIdx1.04k && Chunk != Inside378 ) { |
371 | 308 | OrrImm = updateImm(OrrImm, Idx, Inside != Mask); |
372 | 308 | |
373 | 308 | // Remember the index we need to patch. |
374 | 308 | if (FirstMovkIdx == NotSet) |
375 | 165 | FirstMovkIdx = Idx; |
376 | 308 | else |
377 | 143 | SecondMovkIdx = Idx; |
378 | 1.69k | } |
379 | 2.37k | } |
380 | 593 | assert(FirstMovkIdx != NotSet && "Constant materializable with single ORR!"); |
381 | 593 | |
382 | 593 | // Create the ORR-immediate instruction. |
383 | 593 | uint64_t Encoding = 0; |
384 | 593 | AArch64_AM::processLogicalImmediate(OrrImm, 64, Encoding); |
385 | 593 | MachineInstrBuilder MIB = |
386 | 593 | BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(AArch64::ORRXri)) |
387 | 593 | .add(MI.getOperand(0)) |
388 | 593 | .addReg(AArch64::XZR) |
389 | 593 | .addImm(Encoding); |
390 | 593 | |
391 | 593 | const unsigned DstReg = MI.getOperand(0).getReg(); |
392 | 593 | const bool DstIsDead = MI.getOperand(0).isDead(); |
393 | 593 | |
394 | 593 | const bool SingleMovk = SecondMovkIdx == NotSet; |
395 | 593 | // Create the first MOVK instruction. |
396 | 593 | MachineInstrBuilder MIB1 = |
397 | 593 | BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(AArch64::MOVKXi)) |
398 | 593 | .addReg(DstReg, |
399 | 0 | RegState::Define | getDeadRegState(DstIsDead && SingleMovk)) |
400 | 593 | .addReg(DstReg) |
401 | 593 | .addImm(getChunk(UImm, FirstMovkIdx)) |
402 | 593 | .addImm( |
403 | 593 | AArch64_AM::getShifterImm(AArch64_AM::LSL, FirstMovkIdx * 16)); |
404 | 593 | |
405 | 593 | // Early exit in case we only need to emit a single MOVK instruction. |
406 | 593 | if (SingleMovk593 ) { |
407 | 196 | transferImpOps(MI, MIB, MIB1); |
408 | 196 | MI.eraseFromParent(); |
409 | 196 | return true; |
410 | 196 | } |
411 | 397 | |
412 | 397 | // Create the second MOVK instruction. |
413 | 397 | MachineInstrBuilder MIB2 = |
414 | 397 | BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(AArch64::MOVKXi)) |
415 | 397 | .addReg(DstReg, RegState::Define | getDeadRegState(DstIsDead)) |
416 | 397 | .addReg(DstReg) |
417 | 397 | .addImm(getChunk(UImm, SecondMovkIdx)) |
418 | 397 | .addImm( |
419 | 397 | AArch64_AM::getShifterImm(AArch64_AM::LSL, SecondMovkIdx * 16)); |
420 | 397 | |
421 | 397 | transferImpOps(MI, MIB, MIB2); |
422 | 397 | MI.eraseFromParent(); |
423 | 397 | return true; |
424 | 397 | } |
425 | | |
426 | | /// \brief Expand a MOVi32imm or MOVi64imm pseudo instruction to one or more |
427 | | /// real move-immediate instructions to synthesize the immediate. |
428 | | bool AArch64ExpandPseudo::expandMOVImm(MachineBasicBlock &MBB, |
429 | | MachineBasicBlock::iterator MBBI, |
430 | 1.37M | unsigned BitSize) { |
431 | 1.37M | MachineInstr &MI = *MBBI; |
432 | 1.37M | unsigned DstReg = MI.getOperand(0).getReg(); |
433 | 1.37M | uint64_t Imm = MI.getOperand(1).getImm(); |
434 | 1.37M | const unsigned Mask = 0xFFFF; |
435 | 1.37M | |
436 | 1.37M | if (DstReg == AArch64::XZR || 1.37M DstReg == AArch64::WZR1.37M ) { |
437 | 2 | // Useless def, and we don't want to risk creating an invalid ORR (which |
438 | 2 | // would really write to sp). |
439 | 2 | MI.eraseFromParent(); |
440 | 2 | return true; |
441 | 2 | } |
442 | 1.37M | |
443 | 1.37M | // Try a MOVI instruction (aka ORR-immediate with the zero register). |
444 | 1.37M | uint64_t UImm = Imm << (64 - BitSize) >> (64 - BitSize); |
445 | 1.37M | uint64_t Encoding; |
446 | 1.37M | if (AArch64_AM::processLogicalImmediate(UImm, BitSize, Encoding)1.37M ) { |
447 | 760k | unsigned Opc = (BitSize == 32 ? AArch64::ORRWri653k : AArch64::ORRXri107k ); |
448 | 760k | MachineInstrBuilder MIB = |
449 | 760k | BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(Opc)) |
450 | 760k | .add(MI.getOperand(0)) |
451 | 760k | .addReg(BitSize == 32 ? AArch64::WZR653k : AArch64::XZR107k ) |
452 | 760k | .addImm(Encoding); |
453 | 760k | transferImpOps(MI, MIB, MIB); |
454 | 760k | MI.eraseFromParent(); |
455 | 760k | return true; |
456 | 760k | } |
457 | 618k | |
458 | 618k | // Scan the immediate and count the number of 16-bit chunks which are either |
459 | 618k | // all ones or all zeros. |
460 | 618k | unsigned OneChunks = 0; |
461 | 618k | unsigned ZeroChunks = 0; |
462 | 2.12M | for (unsigned Shift = 0; Shift < BitSize2.12M ; Shift += 161.50M ) { |
463 | 1.50M | const unsigned Chunk = (Imm >> Shift) & Mask; |
464 | 1.50M | if (Chunk == Mask) |
465 | 164k | OneChunks++; |
466 | 1.34M | else if (1.34M Chunk == 01.34M ) |
467 | 733k | ZeroChunks++; |
468 | 1.50M | } |
469 | 618k | |
470 | 618k | // Since we can't materialize the constant with a single ORR instruction, |
471 | 618k | // let's see whether we can materialize 3/4 of the constant with an ORR |
472 | 618k | // instruction and use an additional MOVK instruction to materialize the |
473 | 618k | // remaining 1/4. |
474 | 618k | // |
475 | 618k | // We are looking for constants with a pattern like: |A|X|B|X| or |X|A|X|B|. |
476 | 618k | // |
477 | 618k | // E.g. assuming |A|X|A|X| is a pattern which can be materialized with ORR, |
478 | 618k | // we would create the following instruction sequence: |
479 | 618k | // |
480 | 618k | // ORR x0, xzr, |A|X|A|X| |
481 | 618k | // MOVK x0, |B|, LSL #16 |
482 | 618k | // |
483 | 618k | // Only look at 64-bit constants which can't be materialized with a single |
484 | 618k | // instruction e.g. which have less than either three all zero or all one |
485 | 618k | // chunks. |
486 | 618k | // |
487 | 618k | // Ignore 32-bit constants here, they always can be materialized with a |
488 | 618k | // MOVZ/MOVN + MOVK pair. Since the 32-bit constant can't be materialized |
489 | 618k | // with a single ORR, the best sequence we can achieve is a ORR + MOVK pair. |
490 | 618k | // Thus we fall back to the default code below which in the best case creates |
491 | 618k | // a single MOVZ/MOVN instruction (in case one chunk is all zero or all one). |
492 | 618k | // |
493 | 618k | if (BitSize == 64 && 618k OneChunks < 3134k && ZeroChunks < 3129k ) { |
494 | 51.8k | // If we interpret the 64-bit constant as a v4i16, are elements 0 and 2 |
495 | 51.8k | // identical? |
496 | 51.8k | if (getChunk(UImm, 0) == getChunk(UImm, 2)51.8k ) { |
497 | 741 | // See if we can come up with a constant which can be materialized with |
498 | 741 | // ORR-immediate by replicating element 3 into element 1. |
499 | 741 | uint64_t OrrImm = replicateChunk(UImm, 3, 1); |
500 | 741 | if (tryOrrMovk(UImm, OrrImm, MI, MBB, MBBI, TII, 1)) |
501 | 278 | return true; |
502 | 463 | |
503 | 463 | // See if we can come up with a constant which can be materialized with |
504 | 463 | // ORR-immediate by replicating element 1 into element 3. |
505 | 463 | OrrImm = replicateChunk(UImm, 1, 3); |
506 | 463 | if (tryOrrMovk(UImm, OrrImm, MI, MBB, MBBI, TII, 3)) |
507 | 147 | return true; |
508 | 51.8k | |
509 | 51.8k | // If we interpret the 64-bit constant as a v4i16, are elements 1 and 3 |
510 | 51.8k | // identical? |
511 | 51.1k | } else if (51.1k getChunk(UImm, 1) == getChunk(UImm, 3)51.1k ) { |
512 | 3.69k | // See if we can come up with a constant which can be materialized with |
513 | 3.69k | // ORR-immediate by replicating element 2 into element 0. |
514 | 3.69k | uint64_t OrrImm = replicateChunk(UImm, 2, 0); |
515 | 3.69k | if (tryOrrMovk(UImm, OrrImm, MI, MBB, MBBI, TII, 0)) |
516 | 3.35k | return true; |
517 | 338 | |
518 | 338 | // See if we can come up with a constant which can be materialized with |
519 | 338 | // ORR-immediate by replicating element 1 into element 3. |
520 | 338 | OrrImm = replicateChunk(UImm, 0, 2); |
521 | 338 | if (tryOrrMovk(UImm, OrrImm, MI, MBB, MBBI, TII, 2)) |
522 | 155 | return true; |
523 | 614k | } |
524 | 51.8k | } |
525 | 614k | |
526 | 614k | // Check for identical 16-bit chunks within the constant and if so materialize |
527 | 614k | // them with a single ORR instruction. The remaining one or two 16-bit chunks |
528 | 614k | // will be materialized with MOVK instructions. |
529 | 614k | if (614k BitSize == 64 && 614k tryToreplicateChunks(UImm, MI, MBB, MBBI, TII)130k ) |
530 | 1.02k | return true; |
531 | 613k | |
532 | 613k | // Check whether the constant contains a sequence of contiguous ones, which |
533 | 613k | // might be interrupted by one or two chunks. If so, materialize the sequence |
534 | 613k | // of contiguous ones with an ORR instruction. Materialize the chunks which |
535 | 613k | // are either interrupting the sequence or outside of the sequence with a |
536 | 613k | // MOVK instruction. |
537 | 613k | if (613k BitSize == 64 && 613k trySequenceOfOnes(UImm, MI, MBB, MBBI, TII)129k ) |
538 | 593 | return true; |
539 | 613k | |
540 | 613k | // Use a MOVZ or MOVN instruction to set the high bits, followed by one or |
541 | 613k | // more MOVK instructions to insert additional 16-bit portions into the |
542 | 613k | // lower bits. |
543 | 613k | bool isNeg = false; |
544 | 613k | |
545 | 613k | // Use MOVN to materialize the high bits if we have more all one chunks |
546 | 613k | // than all zero chunks. |
547 | 613k | if (OneChunks > ZeroChunks613k ) { |
548 | 105k | isNeg = true; |
549 | 105k | Imm = ~Imm; |
550 | 105k | } |
551 | 613k | |
552 | 613k | unsigned FirstOpc; |
553 | 613k | if (BitSize == 32613k ) { |
554 | 484k | Imm &= (1LL << 32) - 1; |
555 | 484k | FirstOpc = (isNeg ? AArch64::MOVNWi97.4k : AArch64::MOVZWi387k ); |
556 | 613k | } else { |
557 | 128k | FirstOpc = (isNeg ? AArch64::MOVNXi7.92k : AArch64::MOVZXi120k ); |
558 | 128k | } |
559 | 613k | unsigned Shift = 0; // LSL amount for high bits with MOVZ/MOVN |
560 | 613k | unsigned LastShift = 0; // LSL amount for last MOVK |
561 | 613k | if (Imm != 0613k ) { |
562 | 430k | unsigned LZ = countLeadingZeros(Imm); |
563 | 430k | unsigned TZ = countTrailingZeros(Imm); |
564 | 430k | Shift = (TZ / 16) * 16; |
565 | 430k | LastShift = ((63 - LZ) / 16) * 16; |
566 | 430k | } |
567 | 613k | unsigned Imm16 = (Imm >> Shift) & Mask; |
568 | 613k | bool DstIsDead = MI.getOperand(0).isDead(); |
569 | 613k | MachineInstrBuilder MIB1 = |
570 | 613k | BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(FirstOpc)) |
571 | 613k | .addReg(DstReg, RegState::Define | |
572 | 29.6k | getDeadRegState(DstIsDead && Shift == LastShift)) |
573 | 613k | .addImm(Imm16) |
574 | 613k | .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, Shift)); |
575 | 613k | |
576 | 613k | // If a MOVN was used for the high bits of a negative value, flip the rest |
577 | 613k | // of the bits back for use with MOVK. |
578 | 613k | if (isNeg) |
579 | 105k | Imm = ~Imm; |
580 | 613k | |
581 | 613k | if (Shift == LastShift613k ) { |
582 | 499k | transferImpOps(MI, MIB1, MIB1); |
583 | 499k | MI.eraseFromParent(); |
584 | 499k | return true; |
585 | 499k | } |
586 | 113k | |
587 | 113k | MachineInstrBuilder MIB2; |
588 | 113k | unsigned Opc = (BitSize == 32 ? AArch64::MOVKWi67.3k : AArch64::MOVKXi46.3k ); |
589 | 275k | while (Shift < LastShift275k ) { |
590 | 162k | Shift += 16; |
591 | 162k | Imm16 = (Imm >> Shift) & Mask; |
592 | 162k | if (Imm16 == (isNeg ? 162k Mask4.40k : 0157k )) |
593 | 3.97k | continue; // This 16-bit portion is already set correctly. |
594 | 158k | MIB2 = BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(Opc)) |
595 | 158k | .addReg(DstReg, |
596 | 158k | RegState::Define | |
597 | 10.9k | getDeadRegState(DstIsDead && Shift == LastShift)) |
598 | 162k | .addReg(DstReg) |
599 | 162k | .addImm(Imm16) |
600 | 162k | .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, Shift)); |
601 | 162k | } |
602 | 1.37M | |
603 | 1.37M | transferImpOps(MI, MIB1, MIB2); |
604 | 1.37M | MI.eraseFromParent(); |
605 | 1.37M | return true; |
606 | 1.37M | } |
607 | | |
608 | | bool AArch64ExpandPseudo::expandCMP_SWAP( |
609 | | MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, unsigned LdarOp, |
610 | | unsigned StlrOp, unsigned CmpOp, unsigned ExtendImm, unsigned ZeroReg, |
611 | 7 | MachineBasicBlock::iterator &NextMBBI) { |
612 | 7 | MachineInstr &MI = *MBBI; |
613 | 7 | DebugLoc DL = MI.getDebugLoc(); |
614 | 7 | const MachineOperand &Dest = MI.getOperand(0); |
615 | 7 | unsigned StatusReg = MI.getOperand(1).getReg(); |
616 | 7 | bool StatusDead = MI.getOperand(1).isDead(); |
617 | 7 | // Duplicating undef operands into 2 instructions does not guarantee the same |
618 | 7 | // value on both; However undef should be replaced by xzr anyway. |
619 | 7 | assert(!MI.getOperand(2).isUndef() && "cannot handle undef"); |
620 | 7 | unsigned AddrReg = MI.getOperand(2).getReg(); |
621 | 7 | unsigned DesiredReg = MI.getOperand(3).getReg(); |
622 | 7 | unsigned NewReg = MI.getOperand(4).getReg(); |
623 | 7 | |
624 | 7 | MachineFunction *MF = MBB.getParent(); |
625 | 7 | auto LoadCmpBB = MF->CreateMachineBasicBlock(MBB.getBasicBlock()); |
626 | 7 | auto StoreBB = MF->CreateMachineBasicBlock(MBB.getBasicBlock()); |
627 | 7 | auto DoneBB = MF->CreateMachineBasicBlock(MBB.getBasicBlock()); |
628 | 7 | |
629 | 7 | MF->insert(++MBB.getIterator(), LoadCmpBB); |
630 | 7 | MF->insert(++LoadCmpBB->getIterator(), StoreBB); |
631 | 7 | MF->insert(++StoreBB->getIterator(), DoneBB); |
632 | 7 | |
633 | 7 | // .Lloadcmp: |
634 | 7 | // mov wStatus, 0 |
635 | 7 | // ldaxr xDest, [xAddr] |
636 | 7 | // cmp xDest, xDesired |
637 | 7 | // b.ne .Ldone |
638 | 7 | if (!StatusDead) |
639 | 7 | BuildMI(LoadCmpBB, DL, TII->get(AArch64::MOVZWi), StatusReg) |
640 | 7 | .addImm(0).addImm(0); |
641 | 7 | BuildMI(LoadCmpBB, DL, TII->get(LdarOp), Dest.getReg()) |
642 | 7 | .addReg(AddrReg); |
643 | 7 | BuildMI(LoadCmpBB, DL, TII->get(CmpOp), ZeroReg) |
644 | 7 | .addReg(Dest.getReg(), getKillRegState(Dest.isDead())) |
645 | 7 | .addReg(DesiredReg) |
646 | 7 | .addImm(ExtendImm); |
647 | 7 | BuildMI(LoadCmpBB, DL, TII->get(AArch64::Bcc)) |
648 | 7 | .addImm(AArch64CC::NE) |
649 | 7 | .addMBB(DoneBB) |
650 | 7 | .addReg(AArch64::NZCV, RegState::Implicit | RegState::Kill); |
651 | 7 | LoadCmpBB->addSuccessor(DoneBB); |
652 | 7 | LoadCmpBB->addSuccessor(StoreBB); |
653 | 7 | |
654 | 7 | // .Lstore: |
655 | 7 | // stlxr wStatus, xNew, [xAddr] |
656 | 7 | // cbnz wStatus, .Lloadcmp |
657 | 7 | BuildMI(StoreBB, DL, TII->get(StlrOp), StatusReg) |
658 | 7 | .addReg(NewReg) |
659 | 7 | .addReg(AddrReg); |
660 | 7 | BuildMI(StoreBB, DL, TII->get(AArch64::CBNZW)) |
661 | 7 | .addReg(StatusReg, getKillRegState(StatusDead)) |
662 | 7 | .addMBB(LoadCmpBB); |
663 | 7 | StoreBB->addSuccessor(LoadCmpBB); |
664 | 7 | StoreBB->addSuccessor(DoneBB); |
665 | 7 | |
666 | 7 | DoneBB->splice(DoneBB->end(), &MBB, MI, MBB.end()); |
667 | 7 | DoneBB->transferSuccessors(&MBB); |
668 | 7 | |
669 | 7 | MBB.addSuccessor(LoadCmpBB); |
670 | 7 | |
671 | 7 | NextMBBI = MBB.end(); |
672 | 7 | MI.eraseFromParent(); |
673 | 7 | |
674 | 7 | // Recompute livein lists. |
675 | 7 | LivePhysRegs LiveRegs; |
676 | 7 | computeAndAddLiveIns(LiveRegs, *DoneBB); |
677 | 7 | computeAndAddLiveIns(LiveRegs, *StoreBB); |
678 | 7 | computeAndAddLiveIns(LiveRegs, *LoadCmpBB); |
679 | 7 | // Do an extra pass around the loop to get loop carried registers right. |
680 | 7 | StoreBB->clearLiveIns(); |
681 | 7 | computeAndAddLiveIns(LiveRegs, *StoreBB); |
682 | 7 | LoadCmpBB->clearLiveIns(); |
683 | 7 | computeAndAddLiveIns(LiveRegs, *LoadCmpBB); |
684 | 7 | |
685 | 7 | return true; |
686 | 7 | } |
687 | | |
/// Expand a 128-bit CMP_SWAP pseudo into an explicit load-exclusive /
/// store-exclusive loop using LDAXP/STLXP. The original basic block is split:
/// a LoadCmpBB performs the paired load and double compare, a StoreBB attempts
/// the paired store-release, and DoneBB receives the remainder of MBB.
/// Returns true (the pseudo is always expanded).
bool AArch64ExpandPseudo::expandCMP_SWAP_128(
    MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
    MachineBasicBlock::iterator &NextMBBI) {
  MachineInstr &MI = *MBBI;
  DebugLoc DL = MI.getDebugLoc();
  // Operands 0/1: low/high halves of the loaded (result) value.
  MachineOperand &DestLo = MI.getOperand(0);
  MachineOperand &DestHi = MI.getOperand(1);
  // Operand 2: scratch status register (w-reg); also reused to accumulate the
  // comparison result across both 64-bit halves.
  unsigned StatusReg = MI.getOperand(2).getReg();
  bool StatusDead = MI.getOperand(2).isDead();
  // Duplicating undef operands into 2 instructions does not guarantee the same
  // value on both; However undef should be replaced by xzr anyway.
  assert(!MI.getOperand(3).isUndef() && "cannot handle undef");
  unsigned AddrReg = MI.getOperand(3).getReg();
  unsigned DesiredLoReg = MI.getOperand(4).getReg();
  unsigned DesiredHiReg = MI.getOperand(5).getReg();
  unsigned NewLoReg = MI.getOperand(6).getReg();
  unsigned NewHiReg = MI.getOperand(7).getReg();

  MachineFunction *MF = MBB.getParent();
  auto LoadCmpBB = MF->CreateMachineBasicBlock(MBB.getBasicBlock());
  auto StoreBB = MF->CreateMachineBasicBlock(MBB.getBasicBlock());
  auto DoneBB = MF->CreateMachineBasicBlock(MBB.getBasicBlock());

  // Insert the new blocks in order right after MBB.
  MF->insert(++MBB.getIterator(), LoadCmpBB);
  MF->insert(++LoadCmpBB->getIterator(), StoreBB);
  MF->insert(++StoreBB->getIterator(), DoneBB);

  // .Lloadcmp:
  //     ldaxp xDestLo, xDestHi, [xAddr]
  //     cmp xDestLo, xDesiredLo
  //     csinc wStatus, wzr, wzr, eq      ; wStatus = (lo != desiredLo)
  //     cmp xDestHi, xDesiredHi
  //     csinc wStatus, wStatus, wStatus, eq ; bump wStatus if hi mismatches
  //     cbnz wStatus, .Ldone
  BuildMI(LoadCmpBB, DL, TII->get(AArch64::LDAXPX))
      .addReg(DestLo.getReg(), RegState::Define)
      .addReg(DestHi.getReg(), RegState::Define)
      .addReg(AddrReg);
  BuildMI(LoadCmpBB, DL, TII->get(AArch64::SUBSXrs), AArch64::XZR)
      .addReg(DestLo.getReg(), getKillRegState(DestLo.isDead()))
      .addReg(DesiredLoReg)
      .addImm(0);
  BuildMI(LoadCmpBB, DL, TII->get(AArch64::CSINCWr), StatusReg)
      .addUse(AArch64::WZR)
      .addUse(AArch64::WZR)
      .addImm(AArch64CC::EQ);
  BuildMI(LoadCmpBB, DL, TII->get(AArch64::SUBSXrs), AArch64::XZR)
      .addReg(DestHi.getReg(), getKillRegState(DestHi.isDead()))
      .addReg(DesiredHiReg)
      .addImm(0);
  BuildMI(LoadCmpBB, DL, TII->get(AArch64::CSINCWr), StatusReg)
      .addUse(StatusReg, RegState::Kill)
      .addUse(StatusReg, RegState::Kill)
      .addImm(AArch64CC::EQ);
  // Any mismatch (wStatus != 0) skips the store attempt.
  BuildMI(LoadCmpBB, DL, TII->get(AArch64::CBNZW))
      .addUse(StatusReg, getKillRegState(StatusDead))
      .addMBB(DoneBB);
  LoadCmpBB->addSuccessor(DoneBB);
  LoadCmpBB->addSuccessor(StoreBB);

  // .Lstore:
  //     stlxp wStatus, xNewLo, xNewHi, [xAddr]
  //     cbnz wStatus, .Lloadcmp
  BuildMI(StoreBB, DL, TII->get(AArch64::STLXPX), StatusReg)
      .addReg(NewLoReg)
      .addReg(NewHiReg)
      .addReg(AddrReg);
  // A non-zero store-exclusive status means we lost the reservation: retry.
  BuildMI(StoreBB, DL, TII->get(AArch64::CBNZW))
      .addReg(StatusReg, getKillRegState(StatusDead))
      .addMBB(LoadCmpBB);
  StoreBB->addSuccessor(LoadCmpBB);
  StoreBB->addSuccessor(DoneBB);

  // Move everything after the pseudo into DoneBB and reroute control flow.
  DoneBB->splice(DoneBB->end(), &MBB, MI, MBB.end());
  DoneBB->transferSuccessors(&MBB);

  MBB.addSuccessor(LoadCmpBB);

  NextMBBI = MBB.end();
  MI.eraseFromParent();

  // Recompute liveness bottom up.
  LivePhysRegs LiveRegs;
  computeAndAddLiveIns(LiveRegs, *DoneBB);
  computeAndAddLiveIns(LiveRegs, *StoreBB);
  computeAndAddLiveIns(LiveRegs, *LoadCmpBB);
  // Do an extra pass in the loop to get the loop carried dependencies right.
  StoreBB->clearLiveIns();
  computeAndAddLiveIns(LiveRegs, *StoreBB);
  LoadCmpBB->clearLiveIns();
  computeAndAddLiveIns(LiveRegs, *LoadCmpBB);

  return true;
}
780 | | |
/// \brief If MBBI references a pseudo instruction that should be expanded here,
/// do the expansion and return true. Otherwise return false.
///
/// NextMBBI is updated by expansions that restructure the CFG (the CMP_SWAP
/// family) so the caller's iteration resumes at the right place.
bool AArch64ExpandPseudo::expandMI(MachineBasicBlock &MBB,
                                   MachineBasicBlock::iterator MBBI,
                                   MachineBasicBlock::iterator &NextMBBI) {
  MachineInstr &MI = *MBBI;
  unsigned Opcode = MI.getOpcode();
  switch (Opcode) {
  default:
    break;

  // Plain register-register arithmetic/logical forms: each is rewritten to
  // the equivalent shifted-register ("rs") instruction with an LSL #0 shift,
  // which performs the identical operation.
  case AArch64::ADDWrr:
  case AArch64::SUBWrr:
  case AArch64::ADDXrr:
  case AArch64::SUBXrr:
  case AArch64::ADDSWrr:
  case AArch64::SUBSWrr:
  case AArch64::ADDSXrr:
  case AArch64::SUBSXrr:
  case AArch64::ANDWrr:
  case AArch64::ANDXrr:
  case AArch64::BICWrr:
  case AArch64::BICXrr:
  case AArch64::ANDSWrr:
  case AArch64::ANDSXrr:
  case AArch64::BICSWrr:
  case AArch64::BICSXrr:
  case AArch64::EONWrr:
  case AArch64::EONXrr:
  case AArch64::EORWrr:
  case AArch64::EORXrr:
  case AArch64::ORNWrr:
  case AArch64::ORNXrr:
  case AArch64::ORRWrr:
  case AArch64::ORRXrr: {
    // NOTE: this inner Opcode intentionally shadows the outer one; it holds
    // the replacement ("rs") opcode rather than the pseudo's opcode.
    unsigned Opcode;
    switch (MI.getOpcode()) {
    default:
      return false;
    case AArch64::ADDWrr: Opcode = AArch64::ADDWrs; break;
    case AArch64::SUBWrr: Opcode = AArch64::SUBWrs; break;
    case AArch64::ADDXrr: Opcode = AArch64::ADDXrs; break;
    case AArch64::SUBXrr: Opcode = AArch64::SUBXrs; break;
    case AArch64::ADDSWrr: Opcode = AArch64::ADDSWrs; break;
    case AArch64::SUBSWrr: Opcode = AArch64::SUBSWrs; break;
    case AArch64::ADDSXrr: Opcode = AArch64::ADDSXrs; break;
    case AArch64::SUBSXrr: Opcode = AArch64::SUBSXrs; break;
    case AArch64::ANDWrr: Opcode = AArch64::ANDWrs; break;
    case AArch64::ANDXrr: Opcode = AArch64::ANDXrs; break;
    case AArch64::BICWrr: Opcode = AArch64::BICWrs; break;
    case AArch64::BICXrr: Opcode = AArch64::BICXrs; break;
    case AArch64::ANDSWrr: Opcode = AArch64::ANDSWrs; break;
    case AArch64::ANDSXrr: Opcode = AArch64::ANDSXrs; break;
    case AArch64::BICSWrr: Opcode = AArch64::BICSWrs; break;
    case AArch64::BICSXrr: Opcode = AArch64::BICSXrs; break;
    case AArch64::EONWrr: Opcode = AArch64::EONWrs; break;
    case AArch64::EONXrr: Opcode = AArch64::EONXrs; break;
    case AArch64::EORWrr: Opcode = AArch64::EORWrs; break;
    case AArch64::EORXrr: Opcode = AArch64::EORXrs; break;
    case AArch64::ORNWrr: Opcode = AArch64::ORNWrs; break;
    case AArch64::ORNXrr: Opcode = AArch64::ORNXrs; break;
    case AArch64::ORRWrr: Opcode = AArch64::ORRWrs; break;
    case AArch64::ORRXrr: Opcode = AArch64::ORRXrs; break;
    }
    MachineInstrBuilder MIB1 =
        BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(Opcode),
                MI.getOperand(0).getReg())
            .add(MI.getOperand(1))
            .add(MI.getOperand(2))
            .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0));
    transferImpOps(MI, MIB1, MIB1);
    MI.eraseFromParent();
    return true;
  }

  case AArch64::LOADgot: {
    // Expand into ADRP + LDR.
    unsigned DstReg = MI.getOperand(0).getReg();
    const MachineOperand &MO1 = MI.getOperand(1);
    unsigned Flags = MO1.getTargetFlags();
    // ADRP materializes the 4KiB page of the GOT slot; the LDR loads from the
    // page offset. The address operand is re-added below with the matching
    // MO_PAGE / MO_PAGEOFF target flags.
    MachineInstrBuilder MIB1 =
        BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(AArch64::ADRP), DstReg);
    MachineInstrBuilder MIB2 =
        BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(AArch64::LDRXui))
            .add(MI.getOperand(0))
            .addReg(DstReg);

    if (MO1.isGlobal()) {
      MIB1.addGlobalAddress(MO1.getGlobal(), 0, Flags | AArch64II::MO_PAGE);
      MIB2.addGlobalAddress(MO1.getGlobal(), 0,
                            Flags | AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
    } else if (MO1.isSymbol()) {
      MIB1.addExternalSymbol(MO1.getSymbolName(), Flags | AArch64II::MO_PAGE);
      MIB2.addExternalSymbol(MO1.getSymbolName(),
                             Flags | AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
    } else {
      assert(MO1.isCPI() &&
             "Only expect globals, externalsymbols, or constant pools");
      MIB1.addConstantPoolIndex(MO1.getIndex(), MO1.getOffset(),
                                Flags | AArch64II::MO_PAGE);
      MIB2.addConstantPoolIndex(MO1.getIndex(), MO1.getOffset(),
                                Flags | AArch64II::MO_PAGEOFF |
                                    AArch64II::MO_NC);
    }

    transferImpOps(MI, MIB1, MIB2);
    MI.eraseFromParent();
    return true;
  }

  case AArch64::MOVaddr:
  case AArch64::MOVaddrJT:
  case AArch64::MOVaddrCP:
  case AArch64::MOVaddrBA:
  case AArch64::MOVaddrTLS:
  case AArch64::MOVaddrEXT: {
    // Expand into ADRP + ADD.
    unsigned DstReg = MI.getOperand(0).getReg();
    MachineInstrBuilder MIB1 =
        BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(AArch64::ADRP), DstReg)
            .add(MI.getOperand(1));

    MachineInstrBuilder MIB2 =
        BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(AArch64::ADDXri))
            .add(MI.getOperand(0))
            .addReg(DstReg)
            .add(MI.getOperand(2))
            .addImm(0);

    transferImpOps(MI, MIB1, MIB2);
    MI.eraseFromParent();
    return true;
  }
  case AArch64::MOVbaseTLS: {
    // Read the thread pointer with MRS. Fuchsia under the kernel code model
    // uses TPIDR_EL1; everything else uses the usual EL0 thread register.
    unsigned DstReg = MI.getOperand(0).getReg();
    auto SysReg = AArch64SysReg::TPIDR_EL0;
    MachineFunction *MF = MBB.getParent();
    if (MF->getTarget().getTargetTriple().isOSFuchsia() &&
        MF->getTarget().getCodeModel() == CodeModel::Kernel)
      SysReg = AArch64SysReg::TPIDR_EL1;
    BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(AArch64::MRS), DstReg)
        .addImm(SysReg);
    MI.eraseFromParent();
    return true;
  }

  // Immediate materialization is handled by the shared MOVZ/MOVN/MOVK
  // expansion helper, parameterized on bit width.
  case AArch64::MOVi32imm:
    return expandMOVImm(MBB, MBBI, 32);
  case AArch64::MOVi64imm:
    return expandMOVImm(MBB, MBBI, 64);
  case AArch64::RET_ReallyLR: {
    // Hiding the LR use with RET_ReallyLR may lead to extra kills in the
    // function and missing live-ins. We are fine in practice because callee
    // saved register handling ensures the register value is restored before
    // RET, but we need the undef flag here to appease the MachineVerifier
    // liveness checks.
    MachineInstrBuilder MIB =
        BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(AArch64::RET))
            .addReg(AArch64::LR, RegState::Undef);
    transferImpOps(MI, MIB, MIB);
    MI.eraseFromParent();
    return true;
  }
  // Atomic compare-and-swap pseudos: expanded into LL/SC loops. The 8/16-bit
  // variants compare with an extending SUBS (UXTB/UXTH) since the loaded
  // value is zero-extended; 32/64-bit use a plain shifted SUBS.
  case AArch64::CMP_SWAP_8:
    return expandCMP_SWAP(MBB, MBBI, AArch64::LDAXRB, AArch64::STLXRB,
                          AArch64::SUBSWrx,
                          AArch64_AM::getArithExtendImm(AArch64_AM::UXTB, 0),
                          AArch64::WZR, NextMBBI);
  case AArch64::CMP_SWAP_16:
    return expandCMP_SWAP(MBB, MBBI, AArch64::LDAXRH, AArch64::STLXRH,
                          AArch64::SUBSWrx,
                          AArch64_AM::getArithExtendImm(AArch64_AM::UXTH, 0),
                          AArch64::WZR, NextMBBI);
  case AArch64::CMP_SWAP_32:
    return expandCMP_SWAP(MBB, MBBI, AArch64::LDAXRW, AArch64::STLXRW,
                          AArch64::SUBSWrs,
                          AArch64_AM::getShifterImm(AArch64_AM::LSL, 0),
                          AArch64::WZR, NextMBBI);
  case AArch64::CMP_SWAP_64:
    return expandCMP_SWAP(MBB, MBBI,
                          AArch64::LDAXRX, AArch64::STLXRX, AArch64::SUBSXrs,
                          AArch64_AM::getShifterImm(AArch64_AM::LSL, 0),
                          AArch64::XZR, NextMBBI);
  case AArch64::CMP_SWAP_128:
    return expandCMP_SWAP_128(MBB, MBBI, NextMBBI);

  // Tied AES pseudos: lower to the real (untied) AESMC/AESIMC instructions,
  // preserving the operands as-is.
  case AArch64::AESMCrrTied:
  case AArch64::AESIMCrrTied: {
    MachineInstrBuilder MIB =
    BuildMI(MBB, MBBI, MI.getDebugLoc(),
            TII->get(Opcode == AArch64::AESMCrrTied ? AArch64::AESMCrr :
                                                      AArch64::AESIMCrr))
        .add(MI.getOperand(0))
        .add(MI.getOperand(1));
    transferImpOps(MI, MIB, MIB);
    MI.eraseFromParent();
    return true;
  }
  }
  return false;
}
982 | | |
983 | | /// \brief Iterate over the instructions in basic block MBB and expand any |
984 | | /// pseudo instructions. Return true if anything was modified. |
985 | 3.72M | bool AArch64ExpandPseudo::expandMBB(MachineBasicBlock &MBB) { |
986 | 3.72M | bool Modified = false; |
987 | 3.72M | |
988 | 3.72M | MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end(); |
989 | 26.0M | while (MBBI != E26.0M ) { |
990 | 22.2M | MachineBasicBlock::iterator NMBBI = std::next(MBBI); |
991 | 22.2M | Modified |= expandMI(MBB, MBBI, NMBBI); |
992 | 22.2M | MBBI = NMBBI; |
993 | 22.2M | } |
994 | 3.72M | |
995 | 3.72M | return Modified; |
996 | 3.72M | } |
997 | | |
998 | 456k | bool AArch64ExpandPseudo::runOnMachineFunction(MachineFunction &MF) { |
999 | 456k | TII = static_cast<const AArch64InstrInfo *>(MF.getSubtarget().getInstrInfo()); |
1000 | 456k | |
1001 | 456k | bool Modified = false; |
1002 | 456k | for (auto &MBB : MF) |
1003 | 3.72M | Modified |= expandMBB(MBB); |
1004 | 456k | return Modified; |
1005 | 456k | } |
1006 | | |
1007 | | /// \brief Returns an instance of the pseudo instruction expansion pass. |
1008 | 13.9k | FunctionPass *llvm::createAArch64ExpandPseudoPass() { |
1009 | 13.9k | return new AArch64ExpandPseudo(); |
1010 | 13.9k | } |