//=- AArch64VectorByElementOpt.cpp - AArch64 vector by element inst opt pass =//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This file contains a pass that performs optimization for vector by element
// SIMD instructions.
//
// Certain SIMD instructions with a vector element operand are not efficient.
// Rewrite them into SIMD instructions with vector operands. This rewrite
// is driven by the latency of the instructions.
//
// Example:
//    fmla v0.4s, v1.4s, v2.s[1]
// is rewritten into
//    dup v3.4s, v2.s[1]
//    fmla v0.4s, v1.4s, v3.4s
//
//===----------------------------------------------------------------------===//

#include "AArch64InstrInfo.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/TargetSchedule.h"
#include "llvm/MC/MCInstrDesc.h"
#include "llvm/MC/MCSchedule.h"
#include "llvm/Pass.h"
#include "llvm/Target/TargetInstrInfo.h"
#include "llvm/Target/TargetSubtargetInfo.h"
#include <map>

using namespace llvm;

#define DEBUG_TYPE "aarch64-vectorbyelement-opt"

STATISTIC(NumModifiedInstr,
          "Number of vector by element instructions modified");

#define AARCH64_VECTOR_BY_ELEMENT_OPT_NAME                                     \
  "AArch64 vector by element instruction optimization pass"

namespace {

struct AArch64VectorByElementOpt : public MachineFunctionPass {
  static char ID;

  const TargetInstrInfo *TII;
  MachineRegisterInfo *MRI;
  TargetSchedModel SchedModel;

  AArch64VectorByElementOpt() : MachineFunctionPass(ID) {
    initializeAArch64VectorByElementOptPass(*PassRegistry::getPassRegistry());
  }

  /// Based only on the latency of instructions, determine if it is cost
  /// efficient to replace the instruction InstDesc by the two instructions
  /// InstDescRep1 and InstDescRep2.
  /// Return true if replacement is recommended.
  bool
  shouldReplaceInstruction(MachineFunction *MF, const MCInstrDesc *InstDesc,
                           const MCInstrDesc *InstDescRep1,
                           const MCInstrDesc *InstDescRep2,
                           std::map<unsigned, bool> &VecInstElemTable) const;

  /// Determine if we need to exit the vector by element instruction
  /// optimization pass early. This makes sure that targets with no need
  /// for this optimization do not spend any compile time on this pass.
  /// This check is done by comparing the latency of an indexed FMLA
  /// instruction to the latency of the DUP plus the latency of a vector
  /// FMLA instruction. We do not check other related instructions such
  /// as FMLS because we assume that if the situation shows up for one
  /// instruction, then it is likely to show up for the related ones.
  /// Return true if early exit of the pass is recommended.
  bool earlyExitVectElement(MachineFunction *MF);

  /// Check whether an equivalent DUP instruction has already been created.
  /// Return true when the DUP instruction already exists. In this case,
  /// DestReg will point to the destination of the already created DUP.
  bool reuseDUP(MachineInstr &MI, unsigned DupOpcode, unsigned SrcReg,
                unsigned LaneNumber, unsigned *DestReg) const;

  /// Certain SIMD instructions with a vector element operand are not
  /// efficient. Rewrite them into SIMD instructions with vector operands.
  /// This rewrite is driven by the latency of the instructions.
  /// Return true if the SIMD instruction is modified.
  bool optimizeVectElement(MachineInstr &MI,
                           std::map<unsigned, bool> *VecInstElemTable) const;

  bool runOnMachineFunction(MachineFunction &Fn) override;

  StringRef getPassName() const override {
    return AARCH64_VECTOR_BY_ELEMENT_OPT_NAME;
  }
};

char AArch64VectorByElementOpt::ID = 0;

} // end anonymous namespace

INITIALIZE_PASS(AArch64VectorByElementOpt, "aarch64-vectorbyelement-opt",
                AARCH64_VECTOR_BY_ELEMENT_OPT_NAME, false, false)

/// Based only on the latency of instructions, determine if it is cost
/// efficient to replace the instruction InstDesc by the two instructions
/// InstDescRep1 and InstDescRep2. Note that it is assumed in this function
/// that an instruction of type InstDesc is always replaced by the same two
/// instructions, because results are cached here.
/// Return true if replacement is recommended.
bool AArch64VectorByElementOpt::shouldReplaceInstruction(
    MachineFunction *MF, const MCInstrDesc *InstDesc,
    const MCInstrDesc *InstDescRep1, const MCInstrDesc *InstDescRep2,
    std::map<unsigned, bool> &VecInstElemTable) const {
  // Check if the replacement decision is already available in the cached
  // table. If so, return it.
  if (!VecInstElemTable.empty() &&
      VecInstElemTable.find(InstDesc->getOpcode()) != VecInstElemTable.end())
    return VecInstElemTable[InstDesc->getOpcode()];

  unsigned SCIdx = InstDesc->getSchedClass();
  unsigned SCIdxRep1 = InstDescRep1->getSchedClass();
  unsigned SCIdxRep2 = InstDescRep2->getSchedClass();
  const MCSchedClassDesc *SCDesc =
      SchedModel.getMCSchedModel()->getSchedClassDesc(SCIdx);
  const MCSchedClassDesc *SCDescRep1 =
      SchedModel.getMCSchedModel()->getSchedClassDesc(SCIdxRep1);
  const MCSchedClassDesc *SCDescRep2 =
      SchedModel.getMCSchedModel()->getSchedClassDesc(SCIdxRep2);

  // If a subtarget does not define resources for any of the instructions
  // of interest, then return false for no replacement.
  if (!SCDesc->isValid() || SCDesc->isVariant() || !SCDescRep1->isValid() ||
      SCDescRep1->isVariant() || !SCDescRep2->isValid() ||
      SCDescRep2->isVariant()) {
    VecInstElemTable[InstDesc->getOpcode()] = false;
    return false;
  }

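  // Recommend the replacement only when the latency of the single indexed
  // instruction strictly exceeds the combined latency of the two replacement
  // instructions.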
  if (SchedModel.computeInstrLatency(InstDesc->getOpcode()) >
      SchedModel.computeInstrLatency(InstDescRep1->getOpcode()) +
          SchedModel.computeInstrLatency(InstDescRep2->getOpcode())) {
    VecInstElemTable[InstDesc->getOpcode()] = true;
    return true;
  }
  VecInstElemTable[InstDesc->getOpcode()] = false;
  return false;
}

/// Determine if we need to exit the vector by element instruction
/// optimization pass early. This makes sure that targets with no need
/// for this optimization do not spend any compile time on this pass.
/// This check is done by comparing the latency of an indexed FMLA
/// instruction to the latency of the DUP plus the latency of a vector
/// FMLA instruction. We do not check other related instructions such
/// as FMLS because we assume that if the situation shows up for one
/// instruction, then it is likely to show up for the related ones.
/// Return true if early exit of the pass is recommended.
bool AArch64VectorByElementOpt::earlyExitVectElement(MachineFunction *MF) {
  std::map<unsigned, bool> VecInstElemTable;
  const MCInstrDesc *IndexMulMCID = &TII->get(AArch64::FMLAv4i32_indexed);
  const MCInstrDesc *DupMCID = &TII->get(AArch64::DUPv4i32lane);
  const MCInstrDesc *MulMCID = &TII->get(AArch64::FMULv4f32);

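  // The indexed FMLA serves as a representative probe: if rewriting it is
  // not profitable on this subtarget, the related opcodes are assumed not
  // to be profitable either.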
  if (!shouldReplaceInstruction(MF, IndexMulMCID, DupMCID, MulMCID,
                                VecInstElemTable))
    return true;
  return false;
}

/// Check whether an equivalent DUP instruction has already been created.
/// Return true when the DUP instruction already exists. In this case,
/// DestReg will point to the destination of the already created DUP.
bool AArch64VectorByElementOpt::reuseDUP(MachineInstr &MI, unsigned DupOpcode,
                                         unsigned SrcReg, unsigned LaneNumber,
                                         unsigned *DestReg) const {
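  // Walk backwards from MI to the beginning of its basic block, looking for
  // an earlier DUP of the same source register and lane number.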
  for (MachineBasicBlock::iterator MII = MI, MIE = MI.getParent()->begin();
       MII != MIE;) {
    MII--;
    MachineInstr *CurrentMI = &*MII;

    if (CurrentMI->getOpcode() == DupOpcode &&
        CurrentMI->getNumOperands() == 3 &&
        CurrentMI->getOperand(1).getReg() == SrcReg &&
        CurrentMI->getOperand(2).getImm() == LaneNumber) {
      *DestReg = CurrentMI->getOperand(0).getReg();
      return true;
    }
  }

  return false;
}

/// Certain SIMD instructions with a vector element operand are not
/// efficient. Rewrite them into SIMD instructions with vector operands.
/// This rewrite is driven by the latency of the instructions.
/// The instructions of concern are, for the time being, fmla, fmls, fmul,
/// and fmulx, and hence they are hardcoded.
///
/// Example:
///    fmla v0.4s, v1.4s, v2.s[1]
/// is rewritten into
///    dup v3.4s, v2.s[1] // dup not necessary if redundant
///    fmla v0.4s, v1.4s, v3.4s
/// Return true if the SIMD instruction is modified.
bool AArch64VectorByElementOpt::optimizeVectElement(
    MachineInstr &MI, std::map<unsigned, bool> *VecInstElemTable) const {
  const MCInstrDesc *MulMCID, *DupMCID;
  const TargetRegisterClass *RC = &AArch64::FPR128RegClass;

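  // The 4x32 and 2x64 variants operate on full 128-bit vector registers;
  // the 2x32 cases below narrow RC to FPR64.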
  switch (MI.getOpcode()) {
  default:
    return false;

  // 4X32 instructions
  case AArch64::FMLAv4i32_indexed:
    DupMCID = &TII->get(AArch64::DUPv4i32lane);
    MulMCID = &TII->get(AArch64::FMLAv4f32);
    break;
  case AArch64::FMLSv4i32_indexed:
    DupMCID = &TII->get(AArch64::DUPv4i32lane);
    MulMCID = &TII->get(AArch64::FMLSv4f32);
    break;
  case AArch64::FMULXv4i32_indexed:
    DupMCID = &TII->get(AArch64::DUPv4i32lane);
    MulMCID = &TII->get(AArch64::FMULXv4f32);
    break;
  case AArch64::FMULv4i32_indexed:
    DupMCID = &TII->get(AArch64::DUPv4i32lane);
    MulMCID = &TII->get(AArch64::FMULv4f32);
    break;

  // 2X64 instructions
  case AArch64::FMLAv2i64_indexed:
    DupMCID = &TII->get(AArch64::DUPv2i64lane);
    MulMCID = &TII->get(AArch64::FMLAv2f64);
    break;
  case AArch64::FMLSv2i64_indexed:
    DupMCID = &TII->get(AArch64::DUPv2i64lane);
    MulMCID = &TII->get(AArch64::FMLSv2f64);
    break;
  case AArch64::FMULXv2i64_indexed:
    DupMCID = &TII->get(AArch64::DUPv2i64lane);
    MulMCID = &TII->get(AArch64::FMULXv2f64);
    break;
  case AArch64::FMULv2i64_indexed:
    DupMCID = &TII->get(AArch64::DUPv2i64lane);
    MulMCID = &TII->get(AArch64::FMULv2f64);
    break;

  // 2X32 instructions
  case AArch64::FMLAv2i32_indexed:
    RC = &AArch64::FPR64RegClass;
    DupMCID = &TII->get(AArch64::DUPv2i32lane);
    MulMCID = &TII->get(AArch64::FMLAv2f32);
    break;
  case AArch64::FMLSv2i32_indexed:
    RC = &AArch64::FPR64RegClass;
    DupMCID = &TII->get(AArch64::DUPv2i32lane);
    MulMCID = &TII->get(AArch64::FMLSv2f32);
    break;
  case AArch64::FMULXv2i32_indexed:
    RC = &AArch64::FPR64RegClass;
    DupMCID = &TII->get(AArch64::DUPv2i32lane);
    MulMCID = &TII->get(AArch64::FMULXv2f32);
    break;
  case AArch64::FMULv2i32_indexed:
    RC = &AArch64::FPR64RegClass;
    DupMCID = &TII->get(AArch64::DUPv2i32lane);
    MulMCID = &TII->get(AArch64::FMULv2f32);
    break;
  }

  if (!shouldReplaceInstruction(MI.getParent()->getParent(),
                                &TII->get(MI.getOpcode()), DupMCID, MulMCID,
                                *VecInstElemTable))
    return false;

  const DebugLoc &DL = MI.getDebugLoc();
  MachineBasicBlock &MBB = *MI.getParent();
  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();

  // Get the operands of the current SIMD arithmetic instruction.
  unsigned MulDest = MI.getOperand(0).getReg();
  unsigned SrcReg0 = MI.getOperand(1).getReg();
  unsigned Src0IsKill = getKillRegState(MI.getOperand(1).isKill());
  unsigned SrcReg1 = MI.getOperand(2).getReg();
  unsigned Src1IsKill = getKillRegState(MI.getOperand(2).isKill());
  unsigned DupDest;

  // Instructions of interest have either 4 or 5 operands.
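  // The 5-operand form corresponds to fmla/fmls, which carry an extra
  // accumulator source tied to the destination; the 4-operand form
  // corresponds to fmul/fmulx.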
  if (MI.getNumOperands() == 5) {
    unsigned SrcReg2 = MI.getOperand(3).getReg();
    unsigned Src2IsKill = getKillRegState(MI.getOperand(3).isKill());
    unsigned LaneNumber = MI.getOperand(4).getImm();

    // Create a new DUP instruction. Note that if an equivalent DUP
    // instruction has already been created before, then use that one
    // instead of creating a new one.
    if (!reuseDUP(MI, DupMCID->getOpcode(), SrcReg2, LaneNumber, &DupDest)) {
      DupDest = MRI.createVirtualRegister(RC);
      BuildMI(MBB, MI, DL, *DupMCID, DupDest)
          .addReg(SrcReg2, Src2IsKill)
          .addImm(LaneNumber);
    }
    BuildMI(MBB, MI, DL, *MulMCID, MulDest)
        .addReg(SrcReg0, Src0IsKill)
        .addReg(SrcReg1, Src1IsKill)
        .addReg(DupDest, Src2IsKill);
  } else if (MI.getNumOperands() == 4) {
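    // fmul/fmulx form: the element operand is the second source and the
    // lane number is operand 3, e.g.
    //    fmul v0.4s, v1.4s, v2.s[1]
    // becomes
    //    dup v3.4s, v2.s[1]
    //    fmul v0.4s, v1.4s, v3.4s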
    unsigned LaneNumber = MI.getOperand(3).getImm();
    if (!reuseDUP(MI, DupMCID->getOpcode(), SrcReg1, LaneNumber, &DupDest)) {
      DupDest = MRI.createVirtualRegister(RC);
      BuildMI(MBB, MI, DL, *DupMCID, DupDest)
          .addReg(SrcReg1, Src1IsKill)
          .addImm(LaneNumber);
    }
    BuildMI(MBB, MI, DL, *MulMCID, MulDest)
        .addReg(SrcReg0, Src0IsKill)
        .addReg(DupDest, Src1IsKill);
  } else {
    return false;
  }

  ++NumModifiedInstr;
  return true;
}

bool AArch64VectorByElementOpt::runOnMachineFunction(MachineFunction &MF) {
  if (skipFunction(*MF.getFunction()))
    return false;

  TII = MF.getSubtarget().getInstrInfo();
  MRI = &MF.getRegInfo();
  const TargetSubtargetInfo &ST = MF.getSubtarget();
  const AArch64InstrInfo *AAII =
      static_cast<const AArch64InstrInfo *>(ST.getInstrInfo());
  if (!AAII)
    return false;
  SchedModel.init(ST.getSchedModel(), &ST, AAII);
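  // The latency queries driving this pass need a detailed per-opcode
  // scheduling model; bail out when the subtarget does not provide one.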
  if (!SchedModel.hasInstrSchedModel())
    return false;

  // A simple check to exit this pass early for targets that do not need it.
  if (earlyExitVectElement(&MF))
    return false;

  bool Changed = false;
  std::map<unsigned, bool> VecInstElemTable;
  SmallVector<MachineInstr *, 8> RemoveMIs;

  for (MachineBasicBlock &MBB : MF) {
    for (MachineBasicBlock::iterator MII = MBB.begin(), MIE = MBB.end();
         MII != MIE;) {
      MachineInstr &MI = *MII;
      if (optimizeVectElement(MI, &VecInstElemTable)) {
        // Add MI to the list of instructions to be removed given that it
        // has been replaced.
        RemoveMIs.push_back(&MI);
        Changed = true;
      }
      ++MII;
    }
  }

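  // Defer erasing the replaced instructions until after the walk above so
  // that the block iterators are never invalidated mid-scan.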
  for (MachineInstr *MI : RemoveMIs)
    MI->eraseFromParent();

  return Changed;
}

/// createAArch64VectorByElementOptPass - returns an instance of the
/// vector by element optimization pass.
FunctionPass *llvm::createAArch64VectorByElementOptPass() {
  return new AArch64VectorByElementOpt();
}