/Users/buildslave/jenkins/sharedspace/clang-stage2-coverage-R@2/llvm/lib/Target/ARM/A15SDOptimizer.cpp
Line | Count | Source (jump to first uncovered line) |
1 | | //=== A15SDOptimizerPass.cpp - Optimize DPR and SPR register accesses on A15==// |
2 | | // |
3 | | // The LLVM Compiler Infrastructure |
4 | | // |
5 | | // This file is distributed under the University of Illinois Open Source |
6 | | // License. See LICENSE.TXT for details. |
7 | | // |
8 | | //===----------------------------------------------------------------------===// |
9 | | // |
10 | | // The Cortex-A15 processor employs a tracking scheme in its register renaming |
11 | | // in order to process each instruction's micro-ops speculatively and |
12 | | // out-of-order with appropriate forwarding. The ARM architecture allows VFP |
13 | | // instructions to read and write 32-bit S-registers. Each S-register |
14 | | // corresponds to one half (upper or lower) of an overlaid 64-bit D-register. |
15 | | // |
16 | | // There are several instruction patterns which can be used to provide this |
17 | | // capability which can provide higher performance than other, potentially more |
18 | | // direct patterns, specifically around when one micro-op reads a D-register |
19 | | // operand that has recently been written as one or more S-register results. |
20 | | // |
21 | | // This file defines a pre-regalloc pass which looks for SPR producers which |
22 | | // are going to be used by DPR (or QPR) consumers and creates the more |
23 | | // optimized access pattern. |
24 | | // |
25 | | //===----------------------------------------------------------------------===// |
26 | | |
27 | | #include "ARM.h" |
28 | | #include "ARMBaseInstrInfo.h" |
29 | | #include "ARMBaseRegisterInfo.h" |
30 | | #include "ARMSubtarget.h" |
31 | | #include "llvm/ADT/Statistic.h" |
32 | | #include "llvm/CodeGen/MachineFunction.h" |
33 | | #include "llvm/CodeGen/MachineFunctionPass.h" |
34 | | #include "llvm/CodeGen/MachineInstr.h" |
35 | | #include "llvm/CodeGen/MachineInstrBuilder.h" |
36 | | #include "llvm/CodeGen/MachineRegisterInfo.h" |
37 | | #include "llvm/Support/Debug.h" |
38 | | #include "llvm/Support/raw_ostream.h" |
39 | | #include "llvm/Target/TargetRegisterInfo.h" |
40 | | #include "llvm/Target/TargetSubtargetInfo.h" |
41 | | #include <map> |
42 | | #include <set> |
43 | | |
44 | | using namespace llvm; |
45 | | |
46 | | #define DEBUG_TYPE "a15-sd-optimizer" |
47 | | |
48 | | namespace { |
// Machine-function pass that looks for SPR values written into DPR/QPR
// registers (through COPY, INSERT_SUBREG or REG_SEQUENCE) and rewrites them
// with VDUP/VEXT based sequences.
49 | | struct A15SDOptimizer : public MachineFunctionPass { |
50 | | static char ID; |
51 | 4.12k | A15SDOptimizer() : MachineFunctionPass(ID) {} |
52 | | |
53 | | bool runOnMachineFunction(MachineFunction &Fn) override; |
54 | | |
55 | 4.12k | StringRef getPassName() const override { return "ARM A15 S->D optimizer"; } |
56 | | |
57 | | private: |
// Target/register info handles. They are assigned in runOnMachineFunction
// once the subtarget check has passed, before any instruction is visited.
58 | | const ARMBaseInstrInfo *TII; |
59 | | const TargetRegisterInfo *TRI; |
60 | | MachineRegisterInfo *MRI; |
61 | | |
// Processes the D/Q register reads of a single instruction; returns true if
// any operand was rewritten.
62 | | bool runOnInstruction(MachineInstr *MI); |
63 | | |
64 | | // |
65 | | // Instruction builder helpers. Each one inserts a single instruction before |
66 | | // InsertBefore and returns the new virtual register it defines. |
67 | | unsigned createDupLane(MachineBasicBlock &MBB, |
68 | | MachineBasicBlock::iterator InsertBefore, |
69 | | const DebugLoc &DL, unsigned Reg, unsigned Lane, |
70 | | bool QPR = false); |
71 | | |
72 | | unsigned createExtractSubreg(MachineBasicBlock &MBB, |
73 | | MachineBasicBlock::iterator InsertBefore, |
74 | | const DebugLoc &DL, unsigned DReg, |
75 | | unsigned Lane, const TargetRegisterClass *TRC); |
76 | | |
77 | | unsigned createVExt(MachineBasicBlock &MBB, |
78 | | MachineBasicBlock::iterator InsertBefore, |
79 | | const DebugLoc &DL, unsigned Ssub0, unsigned Ssub1); |
80 | | |
81 | | unsigned createRegSequence(MachineBasicBlock &MBB, |
82 | | MachineBasicBlock::iterator InsertBefore, |
83 | | const DebugLoc &DL, unsigned Reg1, |
84 | | unsigned Reg2); |
85 | | |
86 | | unsigned createInsertSubreg(MachineBasicBlock &MBB, |
87 | | MachineBasicBlock::iterator InsertBefore, |
88 | | const DebugLoc &DL, unsigned DReg, |
89 | | unsigned Lane, unsigned ToInsert); |
90 | | |
91 | | unsigned createImplicitDef(MachineBasicBlock &MBB, |
92 | | MachineBasicBlock::iterator InsertBefore, |
93 | | const DebugLoc &DL); |
94 | | |
95 | | // |
96 | | // Various property checkers |
97 | | // |
98 | | bool usesRegClass(MachineOperand &MO, const TargetRegisterClass *TRC); |
99 | | bool hasPartialWrite(MachineInstr *MI); |
100 | | SmallVector<unsigned, 8> getReadDPRs(MachineInstr *MI); |
101 | | unsigned getDPRLaneFromSPR(unsigned SReg); |
102 | | |
103 | | // |
104 | | // Methods used for getting the definitions of partial registers |
105 | | // |
106 | | |
107 | | MachineInstr *elideCopies(MachineInstr *MI); |
108 | | void elideCopiesAndPHIs(MachineInstr *MI, |
109 | | SmallVectorImpl<MachineInstr*> &Outs); |
110 | | |
111 | | // |
112 | | // Pattern optimization methods |
113 | | // |
114 | | unsigned optimizeAllLanesPattern(MachineInstr *MI, unsigned Reg); |
115 | | unsigned optimizeSDPattern(MachineInstr *MI); |
116 | | unsigned getPrefSPRLane(unsigned SReg); |
117 | | |
118 | | // |
119 | | // Sanitizing method - used to make sure we don't leave dead code around. |
120 | | // |
121 | | void eraseInstrWithNoUses(MachineInstr *MI); |
122 | | |
123 | | // |
124 | | // Replacements maps each analyzed instruction to the register chosen by |
125 | | // optimizeSDPattern; DeadInstr holds instructions erased at the end of the run. |
126 | | std::map<MachineInstr*, unsigned> Replacements; |
127 | | std::set<MachineInstr *> DeadInstr; |
128 | | }; |
129 | | char A15SDOptimizer::ID = 0; |
130 | | } // end anonymous namespace |
131 | | |
132 | | // Returns true if MO is a register operand whose register belongs to TRC. |
// Virtual registers are tested with hasSuperClassEq(TRC) on their register
// class; physical registers are tested with TRC->contains(Reg).
// Non-register operands yield false.
133 | | bool A15SDOptimizer::usesRegClass(MachineOperand &MO, |
134 | 3.20k | const TargetRegisterClass *TRC) { |
135 | 3.20k | if (!MO.isReg()) |
136 | 0 | return false; |
137 | 3.20k | unsigned Reg = MO.getReg(); |
138 | 3.20k | |
139 | 3.20k | if (TargetRegisterInfo::isVirtualRegister(Reg)) |
140 | 1.33k | return MRI->getRegClass(Reg)->hasSuperClassEq(TRC); |
141 | 3.20k | else |
142 | 1.86k | return TRC->contains(Reg); |
143 | 0 | } |
144 | | |
// Returns the S sub-register index for SReg: ARM::ssub_1 if a DPR
// super-register containing SReg at ssub_1 exists, otherwise ARM::ssub_0.
145 | 1 | unsigned A15SDOptimizer::getDPRLaneFromSPR(unsigned SReg) { |
146 | 1 | unsigned DReg = TRI->getMatchingSuperReg(SReg, ARM::ssub_1, |
147 | 1 | &ARM::DPRRegClass); |
148 | 1 | if (DReg != ARM::NoRegister1 ) return ARM::ssub_10 ; |
149 | 1 | return ARM::ssub_0; |
150 | 1 | } |
151 | | |
152 | | // Get the subreg type that is most likely to be coalesced |
153 | | // for an SPR register that will be used in VDUP32d pseudo. |
// Returns ARM::ssub_0 or ARM::ssub_1. Physical registers are resolved with
// getDPRLaneFromSPR; virtual registers fall back to ssub_0 when no defining
// instruction or def operand is found.
154 | 5 | unsigned A15SDOptimizer::getPrefSPRLane(unsigned SReg) { |
155 | 5 | if (!TRI->isVirtualRegister(SReg)) |
156 | 0 | return getDPRLaneFromSPR(SReg); |
157 | 5 | |
158 | 5 | MachineInstr *MI = MRI->getVRegDef(SReg); |
159 | 5 | if (!MI5 ) return ARM::ssub_00 ; |
160 | 5 | MachineOperand *MO = MI->findRegisterDefOperand(SReg); |
161 | 5 | |
// NOTE(review): the assert below dereferences MO before the null check that
// follows it; the two statements look like they should be swapped.
162 | 5 | assert(MO->isReg() && "Non-register operand found!"); |
163 | 5 | if (!MO5 ) return ARM::ssub_00 ; |
164 | 5 | |
// If the def is a COPY from an SPR-class register, continue with the copy's
// source register instead.
165 | 5 | if (5 MI->isCopy() && 5 usesRegClass(MI->getOperand(1), |
166 | 5 | &ARM::SPRRegClass)) { |
167 | 1 | SReg = MI->getOperand(1).getReg(); |
168 | 1 | } |
169 | 5 | |
// For a virtual register the lane is taken from the sub-register index on the
// def operand MO (which still refers to the original def, not the copy source).
170 | 5 | if (TargetRegisterInfo::isVirtualRegister(SReg)5 ) { |
171 | 4 | if (MO->getSubReg() == ARM::ssub_14 ) return ARM::ssub_10 ; |
172 | 4 | return ARM::ssub_0; |
173 | 4 | } |
174 | 1 | return getDPRLaneFromSPR(SReg); |
175 | 1 | } |
176 | | |
177 | | // MI is known to be dead. Figure out what instructions |
178 | | // are also made dead by this and mark them for removal. |
// Nothing is erased here: instructions are only added to DeadInstr, which
// runOnMachineFunction erases after all instructions have been visited.
179 | 5 | void A15SDOptimizer::eraseInstrWithNoUses(MachineInstr *MI) { |
180 | 5 | SmallVector<MachineInstr *, 8> Front; |
181 | 5 | DeadInstr.insert(MI); |
182 | 5 | |
183 | 5 | DEBUG(dbgs() << "Deleting base instruction " << *MI << "\n"); |
184 | 5 | Front.push_back(MI); |
185 | 5 | |
186 | 10 | while (Front.size() != 010 ) { |
187 | 5 | MI = Front.back(); |
188 | 5 | Front.pop_back(); |
189 | 5 | |
190 | 5 | // MI is already known to be dead. We need to see |
191 | 5 | // if other instructions can also be removed. |
192 | 28 | for (MachineOperand &MO : MI->operands()) { |
193 | 28 | if ((!MO.isReg()) || 28 (!MO.isUse())17 ) |
194 | 16 | continue; |
195 | 12 | unsigned Reg = MO.getReg(); |
196 | 12 | if (!TRI->isVirtualRegister(Reg)) |
197 | 0 | continue; |
// NOTE(review): Reg is a register *used* by MI, yet the lookup below searches
// MI's own operands for a def of Reg. The counts in this report show the null
// check always taking the `continue`, so the code after it never ran here.
// Presumably the defining instruction (e.g. MRI->getVRegDef(Reg)) was
// intended - confirm before changing.
198 | 12 | MachineOperand *Op = MI->findRegisterDefOperand(Reg); |
199 | 12 | |
200 | 12 | if (!Op) |
201 | 12 | continue; |
202 | 0 |

203 | 0 | MachineInstr *Def = Op->getParent(); |
204 | 0 |

205 | 0 | // We don't need to do anything if we have already marked |
206 | 0 | // this instruction as being dead. |
207 | 0 | if (DeadInstr.find(Def) != DeadInstr.end()) |
208 | 0 | continue; |
209 | 0 |

210 | 0 | // Check if all the uses of this instruction are marked as |
211 | 0 | // dead. If so, we can also mark this instruction as being |
212 | 0 | // dead. |
213 | 0 | bool IsDead = true; |
214 | 0 | for (MachineOperand &MODef : Def->operands()) { |
215 | 0 | if ((!MODef.isReg()) || 0 (!MODef.isDef())0 ) |
216 | 0 | continue; |
217 | 0 | unsigned DefReg = MODef.getReg(); |
218 | 0 | if (!TRI->isVirtualRegister(DefReg)0 ) { |
219 | 0 | IsDead = false; |
220 | 0 | break; |
221 | 0 | } |
// NOTE(review): this walks the uses of Reg (the operand read by MI), not of
// DefReg computed just above - confirm which was intended. The break inside
// only leaves this inner loop, not the loop over Def's operands.
222 | 0 | for (MachineInstr &Use : MRI->use_instructions(Reg)) 0 { |
223 | 0 | // We don't care about self references. |
224 | 0 | if (&Use == Def) |
225 | 0 | continue; |
226 | 0 | if (0 DeadInstr.find(&Use) == DeadInstr.end()0 ) { |
227 | 0 | IsDead = false; |
228 | 0 | break; |
229 | 0 | } |
230 | 0 | } |
231 | 0 | } |
232 | 0 |

233 | 0 | if (!IsDead0 ) continue0 ; |
234 | 0 |

// NOTE(review): Def is marked dead but never pushed onto Front, so its own
// operands are not examined in turn.
235 | 0 | DEBUG0 (dbgs() << "Deleting instruction " << *Def << "\n"); |
236 | 0 | DeadInstr.insert(Def); |
237 | 0 | } |
238 | 5 | } |
239 | 5 | } |
240 | | |
241 | | // Creates the more optimized patterns and generally does all the code |
242 | | // transformations in this pass. |
// MI must be a COPY, INSERT_SUBREG, or a REG_SEQUENCE whose operand 1 is of
// SPR class; anything else reaches llvm_unreachable. Returns the register
// that should replace MI's result.
243 | 7 | unsigned A15SDOptimizer::optimizeSDPattern(MachineInstr *MI) { |
244 | 7 | if (MI->isCopy()7 ) { |
245 | 0 | return optimizeAllLanesPattern(MI, MI->getOperand(1).getReg()); |
246 | 0 | } |
247 | 7 | |
248 | 7 | if (7 MI->isInsertSubreg()7 ) { |
249 | 2 | unsigned DPRReg = MI->getOperand(1).getReg(); |
250 | 2 | unsigned SPRReg = MI->getOperand(2).getReg(); |
251 | 2 | |
252 | 2 | if (TRI->isVirtualRegister(DPRReg) && 2 TRI->isVirtualRegister(SPRReg)2 ) { |
253 | 2 | MachineInstr *DPRMI = MRI->getVRegDef(MI->getOperand(1).getReg()); |
254 | 2 | MachineInstr *SPRMI = MRI->getVRegDef(MI->getOperand(2).getReg()); |
255 | 2 | |
256 | 2 | if (DPRMI && 2 SPRMI2 ) { |
257 | 2 | // See if the first operand of this insert_subreg is IMPLICIT_DEF |
258 | 2 | MachineInstr *ECDef = elideCopies(DPRMI); |
259 | 2 | if (ECDef && 2 ECDef->isImplicitDef()1 ) { |
260 | 1 | // Another corner case - if we're inserting something that is purely |
261 | 1 | // a subreg copy of a DPR, just use that DPR. |
262 | 1 | |
263 | 1 | MachineInstr *EC = elideCopies(SPRMI); |
264 | 1 | // Is it a subreg copy of ssub_0? |
265 | 1 | if (EC && 1 EC->isCopy()1 && |
266 | 1 | EC->getOperand(1).getSubReg() == ARM::ssub_00 ) { |
267 | 0 | DEBUG(dbgs() << "Found a subreg copy: " << *SPRMI); |
268 | 0 |

269 | 0 | // Find the thing we're subreg copying out of - is it of the same |
270 | 0 | // regclass as DPRMI? (i.e. a DPR or QPR). |
// NOTE(review): the checks above inspect EC, but FullReg is read from SPRMI;
// the two differ when elideCopies looked through a full copy - confirm.
271 | 0 | unsigned FullReg = SPRMI->getOperand(1).getReg(); |
272 | 0 | const TargetRegisterClass *TRC = |
273 | 0 | MRI->getRegClass(MI->getOperand(1).getReg()); |
274 | 0 | if (TRC->hasSuperClassEq(MRI->getRegClass(FullReg))0 ) { |
275 | 0 | DEBUG(dbgs() << "Subreg copy is compatible - returning "); |
276 | 0 | DEBUG(dbgs() << PrintReg(FullReg) << "\n"); |
277 | 0 | eraseInstrWithNoUses(MI); |
278 | 0 | return FullReg; |
279 | 0 | } |
280 | 1 | } |
281 | 1 | |
282 | 1 | return optimizeAllLanesPattern(MI, MI->getOperand(2).getReg()); |
283 | 1 | } |
284 | 2 | } |
285 | 2 | } |
286 | 1 | return optimizeAllLanesPattern(MI, MI->getOperand(0).getReg()); |
287 | 1 | } |
288 | 5 | |
289 | 5 | if (5 MI->isRegSequence() && 5 usesRegClass(MI->getOperand(1), |
290 | 5 | &ARM::SPRRegClass)) { |
291 | 5 | // See if all bar one of the operands are IMPLICIT_DEF and insert the |
292 | 5 | // optimizer pattern accordingly. |
293 | 5 | unsigned NumImplicit = 0, NumTotal = 0; |
294 | 5 | unsigned NonImplicitReg = ~0U; |
295 | 5 | |
// The loop visits every explicit operand after the def and skips the
// non-register ones, so only register operands are counted in NumTotal.
// NOTE(review): on either `break` below NumTotal has already been incremented;
// if that happens on the first register operand, NumImplicit == NumTotal - 1
// holds with NonImplicitReg still ~0U. Neither break was hit in this report.
296 | 33 | for (unsigned I = 1; I < MI->getNumExplicitOperands()33 ; ++I28 ) { |
297 | 28 | if (!MI->getOperand(I).isReg()) |
298 | 14 | continue; |
299 | 14 | ++NumTotal; |
300 | 14 | unsigned OpReg = MI->getOperand(I).getReg(); |
301 | 14 | |
302 | 14 | if (!TRI->isVirtualRegister(OpReg)) |
303 | 0 | break; |
304 | 14 | |
305 | 14 | MachineInstr *Def = MRI->getVRegDef(OpReg); |
306 | 14 | if (!Def) |
307 | 0 | break; |
308 | 14 | if (14 Def->isImplicitDef()14 ) |
309 | 8 | ++NumImplicit; |
310 | 14 | else |
311 | 6 | NonImplicitReg = MI->getOperand(I).getReg(); |
312 | 28 | } |
313 | 5 | |
314 | 5 | if (NumImplicit == NumTotal - 1) |
315 | 4 | return optimizeAllLanesPattern(MI, NonImplicitReg); |
316 | 5 | else |
317 | 1 | return optimizeAllLanesPattern(MI, MI->getOperand(0).getReg()); |
318 | 0 | } |
319 | 0 |

320 | 0 | llvm_unreachable0 ("Unhandled update pattern!"); |
321 | 0 | } |
322 | | |
323 | | // Return true if this MachineInstr inserts a scalar (SPR) value into |
324 | | // a D or Q register. |
// The SPR-class test is applied to operand 1 for COPY and REG_SEQUENCE, and to
// operand 2 (the inserted value) for INSERT_SUBREG.
325 | 37 | bool A15SDOptimizer::hasPartialWrite(MachineInstr *MI) { |
326 | 37 | // The only way we can do a partial register update is through a COPY, |
327 | 37 | // INSERT_SUBREG or REG_SEQUENCE. |
328 | 37 | if (MI->isCopy() && 37 usesRegClass(MI->getOperand(1), &ARM::SPRRegClass)3 ) |
329 | 0 | return true; |
330 | 37 | |
331 | 37 | if (37 MI->isInsertSubreg() && 37 usesRegClass(MI->getOperand(2), |
332 | 2 | &ARM::SPRRegClass)) |
333 | 2 | return true; |
334 | 35 | |
335 | 35 | if (35 MI->isRegSequence() && 35 usesRegClass(MI->getOperand(1), &ARM::SPRRegClass)6 ) |
336 | 5 | return true; |
337 | 30 | |
338 | 30 | return false; |
339 | 30 | } |
340 | | |
341 | | // Looks through full copies to get the instruction that defines the input |
342 | | // operand for MI. |
// Returns MI itself when it is not a full copy. Returns nullptr when the copy
// chain reaches a non-virtual source register or a virtual register for which
// getVRegDef returns no instruction. Recurses once per copy in the chain.
343 | 3 | MachineInstr *A15SDOptimizer::elideCopies(MachineInstr *MI) { |
344 | 3 | if (!MI->isFullCopy()) |
345 | 2 | return MI; |
346 | 1 | if (1 !TRI->isVirtualRegister(MI->getOperand(1).getReg())1 ) |
347 | 1 | return nullptr; |
348 | 0 | MachineInstr *Def = MRI->getVRegDef(MI->getOperand(1).getReg()); |
349 | 0 | if (!Def) |
350 | 0 | return nullptr; |
351 | 0 | return elideCopies(Def); |
352 | 0 | } |
353 | | |
354 | | // Look through full copies and PHIs to get the set of non-copy MachineInstrs |
355 | | // that can produce MI. |
// Results are appended to Outs. Copy/PHI inputs that are not virtual registers,
// or that have no defining instruction, are dropped without adding anything.
356 | | void A15SDOptimizer::elideCopiesAndPHIs(MachineInstr *MI, |
357 | 52 | SmallVectorImpl<MachineInstr*> &Outs) { |
358 | 52 | // Looking through PHIs may create loops so we need to track what |
359 | 52 | // instructions we have visited before. |
360 | 52 | std::set<MachineInstr *> Reached; |
361 | 52 | SmallVector<MachineInstr *, 8> Front; |
362 | 52 | Front.push_back(MI); |
363 | 108 | while (Front.size() != 0108 ) { |
364 | 56 | MI = Front.back(); |
365 | 56 | Front.pop_back(); |
366 | 56 | |
367 | 56 | // If we have already explored this MachineInstr, ignore it. |
368 | 56 | if (Reached.find(MI) != Reached.end()) |
369 | 0 | continue; |
370 | 56 | Reached.insert(MI); |
371 | 56 | if (MI->isPHI()56 ) { |
// Operands are visited from index 1 in steps of two, i.e. only every other
// operand after the def is treated as an incoming register.
372 | 0 | for (unsigned I = 1, E = MI->getNumOperands(); I != E0 ; I += 20 ) { |
373 | 0 | unsigned Reg = MI->getOperand(I).getReg(); |
374 | 0 | if (!TRI->isVirtualRegister(Reg)0 ) { |
375 | 0 | continue; |
376 | 0 | } |
377 | 0 | MachineInstr *NewMI = MRI->getVRegDef(Reg); |
378 | 0 | if (!NewMI) |
379 | 0 | continue; |
380 | 0 | Front.push_back(NewMI); |
381 | 0 | } |
382 | 56 | } else if (56 MI->isFullCopy()56 ) { |
383 | 14 | if (!TRI->isVirtualRegister(MI->getOperand(1).getReg())) |
384 | 10 | continue; |
385 | 4 | MachineInstr *NewMI = MRI->getVRegDef(MI->getOperand(1).getReg()); |
386 | 4 | if (!NewMI) |
387 | 0 | continue; |
388 | 4 | Front.push_back(NewMI); |
389 | 56 | } else { |
// Every instruction that is neither a PHI nor a full copy ends up here,
// whatever its opcode; callers filter further with hasPartialWrite.
390 | 42 | DEBUG(dbgs() << "Found partial copy" << *MI <<"\n"); |
391 | 56 | Outs.push_back(MI); |
392 | 56 | } |
393 | 56 | } |
394 | 52 | } |
395 | | |
396 | | // Return the DPR, QPR or DPair class registers that are read by this machine |
397 | | // instruction (if any). No virtual-register filtering is done here. |
// Copy-like, INSERT_SUBREG, REG_SEQUENCE and KILL instructions always yield an
// empty list.
398 | 664 | SmallVector<unsigned, 8> A15SDOptimizer::getReadDPRs(MachineInstr *MI) { |
399 | 664 | if (MI->isCopyLike() || 664 MI->isInsertSubreg()541 || MI->isRegSequence()539 || |
400 | 534 | MI->isKill()) |
401 | 130 | return SmallVector<unsigned, 8>(); |
402 | 534 | |
403 | 534 | SmallVector<unsigned, 8> Defs; |
404 | 2.48k | for (MachineOperand &MO : MI->operands()) { |
405 | 2.48k | if (!MO.isReg() || 2.48k !MO.isUse()1.50k ) |
406 | 1.39k | continue; |
407 | 1.08k | if (1.08k !usesRegClass(MO, &ARM::DPRRegClass) && |
408 | 1.05k | !usesRegClass(MO, &ARM::QPRRegClass) && |
409 | 1.03k | !usesRegClass(MO, &ARM::DPairRegClass)) // Treat DPair as QPR |
410 | 1.02k | continue; |
411 | 57 | |
412 | 57 | Defs.push_back(MO.getReg()); |
413 | 57 | } |
414 | 664 | return Defs; |
415 | 664 | } |
416 | | |
417 | | // Duplicates lane Lane of Reg into a new DPR (or QPR when QPR is true) register. |
// Emits VDUPLN32d or VDUPLN32q with an always (AL) predicate and returns the
// newly created virtual register.
418 | | unsigned A15SDOptimizer::createDupLane(MachineBasicBlock &MBB, |
419 | | MachineBasicBlock::iterator InsertBefore, |
420 | | const DebugLoc &DL, unsigned Reg, |
421 | 13 | unsigned Lane, bool QPR) { |
422 | 2 | unsigned Out = MRI->createVirtualRegister(QPR ? &ARM::QPRRegClass : |
423 | 11 | &ARM::DPRRegClass); |
424 | 13 | BuildMI(MBB, InsertBefore, DL, |
425 | 13 | TII->get(QPR ? ARM::VDUPLN32q2 : ARM::VDUPLN32d11 ), Out) |
426 | 13 | .addReg(Reg) |
427 | 13 | .addImm(Lane) |
428 | 13 | .add(predOps(ARMCC::AL)); |
429 | 13 | |
430 | 13 | return Out; |
431 | 13 | } |
432 | | |
433 | | // Creates a register of class TRC by a COPY of sub-register Lane of DReg. |
// Lane is used as the sub-register index on the COPY's source operand; the
// new virtual register is returned.
434 | | unsigned A15SDOptimizer::createExtractSubreg( |
435 | | MachineBasicBlock &MBB, MachineBasicBlock::iterator InsertBefore, |
436 | | const DebugLoc &DL, unsigned DReg, unsigned Lane, |
437 | 4 | const TargetRegisterClass *TRC) { |
438 | 4 | unsigned Out = MRI->createVirtualRegister(TRC); |
439 | 4 | BuildMI(MBB, |
440 | 4 | InsertBefore, |
441 | 4 | DL, |
442 | 4 | TII->get(TargetOpcode::COPY), Out) |
443 | 4 | .addReg(DReg, 0, Lane); |
444 | 4 | |
445 | 4 | return Out; |
446 | 4 | } |
447 | | |
448 | | // Takes two registers and creates a QPR from them by using a REG_SEQUENCE. |
// Reg1 is placed at dsub_0 and Reg2 at dsub_1; the new QPR virtual register
// is returned.
449 | | unsigned A15SDOptimizer::createRegSequence( |
450 | | MachineBasicBlock &MBB, MachineBasicBlock::iterator InsertBefore, |
451 | 2 | const DebugLoc &DL, unsigned Reg1, unsigned Reg2) { |
452 | 2 | unsigned Out = MRI->createVirtualRegister(&ARM::QPRRegClass); |
453 | 2 | BuildMI(MBB, |
454 | 2 | InsertBefore, |
455 | 2 | DL, |
456 | 2 | TII->get(TargetOpcode::REG_SEQUENCE), Out) |
457 | 2 | .addReg(Reg1) |
458 | 2 | .addImm(ARM::dsub_0) |
459 | 2 | .addReg(Reg2) |
460 | 2 | .addImm(ARM::dsub_1); |
461 | 2 | return Out; |
462 | 2 | } |
463 | | |
464 | | // Takes two DPR registers that have previously been VDUPed (Ssub0 and Ssub1) |
465 | | // and merges them into one DPR register. |
// Emits VEXTd32 with an immediate of 1 and an always (AL) predicate; the new
// DPR virtual register is returned.
466 | | unsigned A15SDOptimizer::createVExt(MachineBasicBlock &MBB, |
467 | | MachineBasicBlock::iterator InsertBefore, |
468 | | const DebugLoc &DL, unsigned Ssub0, |
469 | 4 | unsigned Ssub1) { |
470 | 4 | unsigned Out = MRI->createVirtualRegister(&ARM::DPRRegClass); |
471 | 4 | BuildMI(MBB, InsertBefore, DL, TII->get(ARM::VEXTd32), Out) |
472 | 4 | .addReg(Ssub0) |
473 | 4 | .addReg(Ssub1) |
474 | 4 | .addImm(1) |
475 | 4 | .add(predOps(ARMCC::AL)); |
476 | 4 | return Out; |
477 | 4 | } |
478 | | |
// Creates a DPR_VFP2 register by an INSERT_SUBREG that inserts ToInsert into
// DReg at sub-register index Lane. Returns the new virtual register.
479 | | unsigned A15SDOptimizer::createInsertSubreg( |
480 | | MachineBasicBlock &MBB, MachineBasicBlock::iterator InsertBefore, |
481 | 5 | const DebugLoc &DL, unsigned DReg, unsigned Lane, unsigned ToInsert) { |
482 | 5 | unsigned Out = MRI->createVirtualRegister(&ARM::DPR_VFP2RegClass); |
483 | 5 | BuildMI(MBB, |
484 | 5 | InsertBefore, |
485 | 5 | DL, |
486 | 5 | TII->get(TargetOpcode::INSERT_SUBREG), Out) |
487 | 5 | .addReg(DReg) |
488 | 5 | .addReg(ToInsert) |
489 | 5 | .addImm(Lane); |
490 | 5 | |
491 | 5 | return Out; |
492 | 5 | } |
493 | | |
// Creates a new DPR virtual register defined by an IMPLICIT_DEF inserted
// before InsertBefore, and returns it.
494 | | unsigned |
495 | | A15SDOptimizer::createImplicitDef(MachineBasicBlock &MBB, |
496 | | MachineBasicBlock::iterator InsertBefore, |
497 | 5 | const DebugLoc &DL) { |
498 | 5 | unsigned Out = MRI->createVirtualRegister(&ARM::DPRRegClass); |
499 | 5 | BuildMI(MBB, |
500 | 5 | InsertBefore, |
501 | 5 | DL, |
502 | 5 | TII->get(TargetOpcode::IMPLICIT_DEF), Out); |
503 | 5 | return Out; |
504 | 5 | } |
505 | | |
506 | | // This function inserts instructions in order to optimize interactions between |
507 | | // SPR registers and DPR/QPR registers. It does so by performing VDUPs on all |
508 | | // lanes, and then using VEXT instructions to recompose the result. |
// The new instructions are inserted immediately after MI. Reg selects the
// pattern by its register class (QPR/DPair, DPR, otherwise SPR). Returns the
// virtual register holding the recomposed value.
509 | | unsigned |
510 | 7 | A15SDOptimizer::optimizeAllLanesPattern(MachineInstr *MI, unsigned Reg) { |
511 | 7 | MachineBasicBlock::iterator InsertPt(MI); |
512 | 7 | DebugLoc DL = MI->getDebugLoc(); |
513 | 7 | MachineBasicBlock &MBB = *MI->getParent(); |
514 | 7 | InsertPt++; |
515 | 7 | unsigned Out; |
516 | 7 | |
517 | 7 | // DPair has the same length as QPR and also has two DPRs as subreg. |
518 | 7 | // Treat DPair as QPR. |
519 | 7 | if (MRI->getRegClass(Reg)->hasSuperClassEq(&ARM::QPRRegClass) || |
520 | 7 | MRI->getRegClass(Reg)->hasSuperClassEq(&ARM::DPairRegClass)5 ) { |
// Split into the two D halves, rebuild each half from its two duplicated
// lanes, then glue the halves back together with a REG_SEQUENCE.
521 | 2 | unsigned DSub0 = createExtractSubreg(MBB, InsertPt, DL, Reg, |
522 | 2 | ARM::dsub_0, &ARM::DPRRegClass); |
523 | 2 | unsigned DSub1 = createExtractSubreg(MBB, InsertPt, DL, Reg, |
524 | 2 | ARM::dsub_1, &ARM::DPRRegClass); |
525 | 2 | |
526 | 2 | unsigned Out1 = createDupLane(MBB, InsertPt, DL, DSub0, 0); |
527 | 2 | unsigned Out2 = createDupLane(MBB, InsertPt, DL, DSub0, 1); |
528 | 2 | Out = createVExt(MBB, InsertPt, DL, Out1, Out2); |
529 | 2 | |
530 | 2 | unsigned Out3 = createDupLane(MBB, InsertPt, DL, DSub1, 0); |
531 | 2 | unsigned Out4 = createDupLane(MBB, InsertPt, DL, DSub1, 1); |
// Out2 is reused here to hold the rebuilt upper half.
532 | 2 | Out2 = createVExt(MBB, InsertPt, DL, Out3, Out4); |
533 | 2 | |
534 | 2 | Out = createRegSequence(MBB, InsertPt, DL, Out, Out2); |
535 | 2 | |
536 | 7 | } else if (5 MRI->getRegClass(Reg)->hasSuperClassEq(&ARM::DPRRegClass)5 ) { |
537 | 0 | unsigned Out1 = createDupLane(MBB, InsertPt, DL, Reg, 0); |
538 | 0 | unsigned Out2 = createDupLane(MBB, InsertPt, DL, Reg, 1); |
539 | 0 | Out = createVExt(MBB, InsertPt, DL, Out1, Out2); |
540 | 0 |

541 | 5 | } else { |
542 | 5 | assert(MRI->getRegClass(Reg)->hasSuperClassEq(&ARM::SPRRegClass) && |
543 | 5 | "Found unexpected regclass!"); |
544 | 5 | |
545 | 5 | unsigned PrefLane = getPrefSPRLane(Reg); |
546 | 5 | unsigned Lane; |
547 | 5 | switch (PrefLane) { |
548 | 5 | case ARM::ssub_0: Lane = 0; break; |
549 | 0 | case ARM::ssub_1: Lane = 1; break; |
550 | 0 | default: 0 llvm_unreachable0 ("Unknown preferred lane!"); |
551 | 5 | } |
552 | 5 | |
553 | 5 | // Treat DPair as QPR |
554 | 5 | bool UsesQPR = usesRegClass(MI->getOperand(0), &ARM::QPRRegClass) || |
555 | 3 | usesRegClass(MI->getOperand(0), &ARM::DPairRegClass); |
556 | 5 | |
// Insert the scalar into an undefined D register at the preferred lane, then
// duplicate that lane across the whole D (or Q) result.
557 | 5 | Out = createImplicitDef(MBB, InsertPt, DL); |
558 | 5 | Out = createInsertSubreg(MBB, InsertPt, DL, Out, PrefLane, Reg); |
559 | 5 | Out = createDupLane(MBB, InsertPt, DL, Out, Lane, UsesQPR); |
// Only this SPR path queues MI for deletion; the QPR/DPair and DPR paths above
// leave MI in place.
560 | 5 | eraseInstrWithNoUses(MI); |
561 | 5 | } |
562 | 7 | return Out; |
563 | 7 | } |
564 | | |
// Rewrites the D/Q register reads of MI whose values come from an SPR partial
// write. Returns true if any operand was replaced.
565 | 664 | bool A15SDOptimizer::runOnInstruction(MachineInstr *MI) { |
566 | 664 | // We look for instructions that write S registers that are then read as |
567 | 664 | // D/Q registers. These can only be caused by COPY, INSERT_SUBREG and |
568 | 664 | // REG_SEQUENCE pseudos that insert an SPR value into a DPR register or |
569 | 664 | // merge two SPR values to form a DPR register. In order to avoid false |
570 | 664 | // positives we make sure that there is an SPR producer so we look past |
571 | 664 | // COPY and PHI nodes to find it. |
572 | 664 | // |
573 | 664 | // The best code pattern for when an SPR producer is going to be used by a |
574 | 664 | // DPR or QPR consumer depends on whether the other lanes of the |
575 | 664 | // corresponding DPR/QPR are currently defined. |
576 | 664 | // |
577 | 664 | // We can handle these efficiently, depending on the type of |
578 | 664 | // pseudo-instruction that is producing the pattern |
579 | 664 | // |
580 | 664 | // * COPY: * VDUP all lanes and merge the results together |
581 | 664 | // using VEXTs. |
582 | 664 | // |
583 | 664 | // * INSERT_SUBREG: * If the SPR value was originally in another DPR/QPR |
584 | 664 | // lane, and the other lane(s) of the DPR/QPR register |
585 | 664 | // that we are inserting in are undefined, use the |
586 | 664 | // original DPR/QPR value. |
587 | 664 | // * Otherwise, fall back on the same strategy as COPY. |
588 | 664 | // |
589 | 664 | // * REG_SEQUENCE: * If all except one of the input operands are |
590 | 664 | // IMPLICIT_DEFs, insert the VDUP pattern for just the |
591 | 664 | // defined input operand |
592 | 664 | // * Otherwise, fall back on the same strategy as COPY. |
593 | 664 | // |
594 | 664 | |
595 | 664 | // First, get all the reads of D-registers done by this instruction. |
596 | 664 | SmallVector<unsigned, 8> Defs = getReadDPRs(MI); |
597 | 664 | bool Modified = false; |
598 | 664 | |
599 | 664 | for (SmallVectorImpl<unsigned>::iterator I = Defs.begin(), E = Defs.end(); |
600 | 721 | I != E721 ; ++I57 ) { |
601 | 57 | // Follow the def-use chain for this DPR through COPYs, and also through |
602 | 57 | // PHIs (which are essentially multi-way COPYs). It is because of PHIs that |
603 | 57 | // we can end up with multiple defs of this DPR. |
604 | 57 | |
605 | 57 | SmallVector<MachineInstr *, 8> DefSrcs; |
606 | 57 | if (!TRI->isVirtualRegister(*I)) |
607 | 5 | continue; |
608 | 52 | MachineInstr *Def = MRI->getVRegDef(*I); |
609 | 52 | if (!Def) |
610 | 0 | continue; |
611 | 52 | |
612 | 52 | elideCopiesAndPHIs(Def, DefSrcs); |
613 | 52 | |
// Note: the loop variable below shadows the MI parameter; from here on MI is
// the producing instruction, not the consumer passed to this function.
614 | 42 | for (MachineInstr *MI : DefSrcs) { |
615 | 42 | // If we've already analyzed and replaced this operand, don't do |
616 | 42 | // anything. |
617 | 42 | if (Replacements.find(MI) != Replacements.end()) |
618 | 5 | continue; |
619 | 37 | |
620 | 37 | // Now, work out if the instruction causes a SPR->DPR dependency. |
621 | 37 | if (37 !hasPartialWrite(MI)37 ) |
622 | 30 | continue; |
623 | 7 | |
624 | 7 | // Collect all the uses of this MI's DPR def for updating later. |
// They are gathered before optimizeSDPattern runs and rewritten afterwards
// from this saved list.
625 | 7 | SmallVector<MachineOperand*, 8> Uses; |
626 | 7 | unsigned DPRDefReg = MI->getOperand(0).getReg(); |
627 | 7 | for (MachineRegisterInfo::use_iterator I = MRI->use_begin(DPRDefReg), |
628 | 21 | E = MRI->use_end(); I != E21 ; ++I14 ) |
629 | 14 | Uses.push_back(&*I); |
630 | 7 | |
631 | 7 | // We can optimize this. |
632 | 7 | unsigned NewReg = optimizeSDPattern(MI); |
633 | 7 | |
634 | 7 | if (NewReg != 07 ) { |
635 | 7 | Modified = true; |
636 | 7 | for (SmallVectorImpl<MachineOperand *>::const_iterator I = Uses.begin(), |
637 | 21 | E = Uses.end(); I != E21 ; ++I14 ) { |
638 | 14 | // Make sure to constrain the register class of the new register to |
639 | 14 | // match what we're replacing. Otherwise we can optimize a DPR_VFP2 |
640 | 14 | // reference into a plain DPR, and that will end poorly. NewReg is |
641 | 14 | // always virtual here, so there will always be a matching subclass |
642 | 14 | // to find. |
643 | 14 | MRI->constrainRegClass(NewReg, MRI->getRegClass((*I)->getReg())); |
644 | 14 | |
645 | 14 | DEBUG(dbgs() << "Replacing operand " |
646 | 14 | << **I << " with " |
647 | 14 | << PrintReg(NewReg) << "\n"); |
648 | 14 | (*I)->substVirtReg(NewReg, 0, *TRI); |
649 | 14 | } |
650 | 7 | } |
// Record the result even when NewReg is 0 so this producer is skipped if it
// is reached again through another consumer.
651 | 42 | Replacements[MI] = NewReg; |
652 | 42 | } |
653 | 57 | } |
654 | 664 | return Modified; |
655 | 664 | } |
656 | | |
// Pass entry point. Does nothing (returns false) when the function is skipped
// or the subtarget is not a Cortex-A15 with NEON. Otherwise it runs
// runOnInstruction over every instruction, then erases everything collected in
// DeadInstr. Returns true if any operand was rewritten.
657 | 15.8k | bool A15SDOptimizer::runOnMachineFunction(MachineFunction &Fn) { |
658 | 15.8k | if (skipFunction(*Fn.getFunction())) |
659 | 8 | return false; |
660 | 15.8k | |
661 | 15.8k | const ARMSubtarget &STI = Fn.getSubtarget<ARMSubtarget>(); |
662 | 15.8k | // Since the A15SDOptimizer pass can insert VDUP instructions, it can only be |
663 | 15.8k | // enabled when NEON is available. |
664 | 15.8k | if (!(STI.isCortexA15() && 15.8k STI.hasNEON()48 )) |
665 | 15.8k | return false; |
666 | 46 | TII = STI.getInstrInfo(); |
667 | 46 | TRI = STI.getRegisterInfo(); |
668 | 46 | MRI = &Fn.getRegInfo(); |
669 | 46 | bool Modified = false; |
670 | 46 | |
671 | 46 | DEBUG(dbgs() << "Running on function " << Fn.getName()<< "\n"); |
672 | 46 | |
// Per-function state: both containers hold MachineInstr pointers, so they are
// reset before each function is processed.
673 | 46 | DeadInstr.clear(); |
674 | 46 | Replacements.clear(); |
675 | 46 | |
676 | 98 | for (MachineBasicBlock &MBB : Fn) { |
677 | 664 | for (MachineInstr &MI : MBB) { |
678 | 664 | Modified |= runOnInstruction(&MI); |
679 | 664 | } |
680 | 98 | } |
681 | 46 | |
// Erasure is deferred until after the walk above has finished.
682 | 5 | for (MachineInstr *MI : DeadInstr) { |
683 | 5 | MI->eraseFromParent(); |
684 | 5 | } |
685 | 15.8k | |
686 | 15.8k | return Modified; |
687 | 15.8k | } |
688 | | |
// Factory for the pass: returns a newly allocated A15SDOptimizer.
689 | 4.12k | FunctionPass *llvm::createA15SDOptimizerPass() { |
690 | 4.12k | return new A15SDOptimizer(); |
691 | 4.12k | } |