//===-- Thumb2SizeReduction.cpp - Thumb2 code size reduction pass -*- C++ -*-=//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
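// This pass rewrites 32-bit Thumb2 instructions into equivalent 16-bit ones
// when register constraints, immediate ranges, and flag-setting behaviour
// permit. Illustrative examples (assuming CPSR is otherwise dead, so the
// narrow flag-setting forms are acceptable):
//
//   add.w r0, r0, r1   ->   adds r0, r0, r1
//   mov.w r0, #10      ->   movs r0, #10
//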
//===----------------------------------------------------------------------===//

#include "ARM.h"
#include "ARMBaseInstrInfo.h"
#include "ARMSubtarget.h"
#include "MCTargetDesc/ARMBaseInfo.h"
#include "Thumb2InstrInfo.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/PostOrderIterator.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/IR/Function.h"
#include "llvm/MC/MCInstrDesc.h"
#include "llvm/MC/MCRegisterInfo.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetInstrInfo.h"
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <functional>
#include <iterator>
#include <utility>

using namespace llvm;

#define DEBUG_TYPE "t2-reduce-size"

STATISTIC(NumNarrows, "Number of 32-bit instrs reduced to 16-bit ones");
STATISTIC(Num2Addrs, "Number of 32-bit instrs reduced to 2addr 16-bit ones");
STATISTIC(NumLdSts, "Number of 32-bit load / store reduced to 16-bit ones");

static cl::opt<int> ReduceLimit("t2-reduce-limit",
                                cl::init(-1), cl::Hidden);
static cl::opt<int> ReduceLimit2Addr("t2-reduce-limit2",
                                     cl::init(-1), cl::Hidden);
static cl::opt<int> ReduceLimitLdSt("t2-reduce-limit3",
                                    cl::init(-1), cl::Hidden);

namespace {

/// ReduceTable - A static table with information on mapping wide Thumb2
/// opcodes to their narrow 16-bit equivalents.
struct ReduceEntry {
  uint16_t WideOpc;      // Wide opcode
  uint16_t NarrowOpc1;   // Narrow opcode to transform to
  uint16_t NarrowOpc2;   // Narrow opcode when it's two-address
  uint8_t  Imm1Limit;    // Limit of immediate field (bits)
  uint8_t  Imm2Limit;    // Limit of immediate field when it's two-address
  unsigned LowRegs1 : 1; // Only possible if low-registers are used
  unsigned LowRegs2 : 1; // Only possible if low-registers are used (2addr)
  unsigned PredCC1  : 2; // 0 - If predicated, cc is on and vice versa.
                         // 1 - No cc field.
                         // 2 - Always set CPSR.
  unsigned PredCC2  : 2;
  unsigned PartFlag : 1; // 16-bit instruction does partial flag update
  unsigned Special  : 1; // Needs to be dealt with specially
  unsigned AvoidMovs: 1; // Avoid movs with shifter operand (for Swift)
};

static const ReduceEntry ReduceTable[] = {
  // Wide, Narrow1, Narrow2, imm1,imm2, lo1, lo2, P/C,PF,S,AM
  { ARM::t2ADCrr, 0, ARM::tADC, 0, 0, 0, 1, 0,0, 0,0,0 },
  { ARM::t2ADDri, ARM::tADDi3, ARM::tADDi8, 3, 8, 1, 1, 0,0, 0,1,0 },
  { ARM::t2ADDrr, ARM::tADDrr, ARM::tADDhirr, 0, 0, 1, 0, 0,1, 0,0,0 },
  { ARM::t2ADDSri,ARM::tADDi3, ARM::tADDi8, 3, 8, 1, 1, 2,2, 0,1,0 },
  { ARM::t2ADDSrr,ARM::tADDrr, 0, 0, 0, 1, 0, 2,0, 0,1,0 },
  { ARM::t2ANDrr, 0, ARM::tAND, 0, 0, 0, 1, 0,0, 1,0,0 },
  { ARM::t2ASRri, ARM::tASRri, 0, 5, 0, 1, 0, 0,0, 1,0,1 },
  { ARM::t2ASRrr, 0, ARM::tASRrr, 0, 0, 0, 1, 0,0, 1,0,1 },
  { ARM::t2BICrr, 0, ARM::tBIC, 0, 0, 0, 1, 0,0, 1,0,0 },
  // FIXME: Disable CMN, as CCodes are backwards from compare expectations
  //{ ARM::t2CMNrr, ARM::tCMN, 0, 0, 0, 1, 0, 2,0, 0,0,0 },
  { ARM::t2CMNzrr, ARM::tCMNz, 0, 0, 0, 1, 0, 2,0, 0,0,0 },
  { ARM::t2CMPri, ARM::tCMPi8, 0, 8, 0, 1, 0, 2,0, 0,0,0 },
  { ARM::t2CMPrr, ARM::tCMPhir, 0, 0, 0, 0, 0, 2,0, 0,1,0 },
  { ARM::t2EORrr, 0, ARM::tEOR, 0, 0, 0, 1, 0,0, 1,0,0 },
  // FIXME: adr.n immediate offset must be multiple of 4.
  //{ ARM::t2LEApcrelJT,ARM::tLEApcrelJT, 0, 0, 0, 1, 0, 1,0, 0,0,0 },
  { ARM::t2LSLri, ARM::tLSLri, 0, 5, 0, 1, 0, 0,0, 1,0,1 },
  { ARM::t2LSLrr, 0, ARM::tLSLrr, 0, 0, 0, 1, 0,0, 1,0,1 },
  { ARM::t2LSRri, ARM::tLSRri, 0, 5, 0, 1, 0, 0,0, 1,0,1 },
  { ARM::t2LSRrr, 0, ARM::tLSRrr, 0, 0, 0, 1, 0,0, 1,0,1 },
  { ARM::t2MOVi, ARM::tMOVi8, 0, 8, 0, 1, 0, 0,0, 1,0,0 },
  { ARM::t2MOVi16,ARM::tMOVi8, 0, 8, 0, 1, 0, 0,0, 1,1,0 },
  // FIXME: Do we need the 16-bit 'S' variant?
  { ARM::t2MOVr,ARM::tMOVr, 0, 0, 0, 0, 0, 1,0, 0,0,0 },
  { ARM::t2MUL, 0, ARM::tMUL, 0, 0, 0, 1, 0,0, 1,0,0 },
  { ARM::t2MVNr, ARM::tMVN, 0, 0, 0, 1, 0, 0,0, 0,0,0 },
  { ARM::t2ORRrr, 0, ARM::tORR, 0, 0, 0, 1, 0,0, 1,0,0 },
  { ARM::t2REV, ARM::tREV, 0, 0, 0, 1, 0, 1,0, 0,0,0 },
  { ARM::t2REV16, ARM::tREV16, 0, 0, 0, 1, 0, 1,0, 0,0,0 },
  { ARM::t2REVSH, ARM::tREVSH, 0, 0, 0, 1, 0, 1,0, 0,0,0 },
  { ARM::t2RORrr, 0, ARM::tROR, 0, 0, 0, 1, 0,0, 1,0,0 },
  { ARM::t2RSBri, ARM::tRSB, 0, 0, 0, 1, 0, 0,0, 0,1,0 },
  { ARM::t2RSBSri,ARM::tRSB, 0, 0, 0, 1, 0, 2,0, 0,1,0 },
  { ARM::t2SBCrr, 0, ARM::tSBC, 0, 0, 0, 1, 0,0, 0,0,0 },
  { ARM::t2SUBri, ARM::tSUBi3, ARM::tSUBi8, 3, 8, 1, 1, 0,0, 0,0,0 },
  { ARM::t2SUBrr, ARM::tSUBrr, 0, 0, 0, 1, 0, 0,0, 0,0,0 },
  { ARM::t2SUBSri,ARM::tSUBi3, ARM::tSUBi8, 3, 8, 1, 1, 2,2, 0,0,0 },
  { ARM::t2SUBSrr,ARM::tSUBrr, 0, 0, 0, 1, 0, 2,0, 0,0,0 },
  { ARM::t2SXTB, ARM::tSXTB, 0, 0, 0, 1, 0, 1,0, 0,1,0 },
  { ARM::t2SXTH, ARM::tSXTH, 0, 0, 0, 1, 0, 1,0, 0,1,0 },
  { ARM::t2TSTrr, ARM::tTST, 0, 0, 0, 1, 0, 2,0, 0,0,0 },
  { ARM::t2UXTB, ARM::tUXTB, 0, 0, 0, 1, 0, 1,0, 0,1,0 },
  { ARM::t2UXTH, ARM::tUXTH, 0, 0, 0, 1, 0, 1,0, 0,1,0 },

  // FIXME: Clean this up after splitting each Thumb load / store opcode
  // into multiple ones.
  { ARM::t2LDRi12,ARM::tLDRi, ARM::tLDRspi, 5, 8, 1, 0, 0,0, 0,1,0 },
  { ARM::t2LDRs, ARM::tLDRr, 0, 0, 0, 1, 0, 0,0, 0,1,0 },
  { ARM::t2LDRBi12,ARM::tLDRBi, 0, 5, 0, 1, 0, 0,0, 0,1,0 },
  { ARM::t2LDRBs, ARM::tLDRBr, 0, 0, 0, 1, 0, 0,0, 0,1,0 },
  { ARM::t2LDRHi12,ARM::tLDRHi, 0, 5, 0, 1, 0, 0,0, 0,1,0 },
  { ARM::t2LDRHs, ARM::tLDRHr, 0, 0, 0, 1, 0, 0,0, 0,1,0 },
  { ARM::t2LDRSBs,ARM::tLDRSB, 0, 0, 0, 1, 0, 0,0, 0,1,0 },
  { ARM::t2LDRSHs,ARM::tLDRSH, 0, 0, 0, 1, 0, 0,0, 0,1,0 },
  { ARM::t2LDR_POST,ARM::tLDMIA_UPD,0, 0, 0, 1, 0, 0,0, 0,1,0 },
  { ARM::t2STRi12,ARM::tSTRi, ARM::tSTRspi, 5, 8, 1, 0, 0,0, 0,1,0 },
  { ARM::t2STRs, ARM::tSTRr, 0, 0, 0, 1, 0, 0,0, 0,1,0 },
  { ARM::t2STRBi12,ARM::tSTRBi, 0, 5, 0, 1, 0, 0,0, 0,1,0 },
  { ARM::t2STRBs, ARM::tSTRBr, 0, 0, 0, 1, 0, 0,0, 0,1,0 },
  { ARM::t2STRHi12,ARM::tSTRHi, 0, 5, 0, 1, 0, 0,0, 0,1,0 },
  { ARM::t2STRHs, ARM::tSTRHr, 0, 0, 0, 1, 0, 0,0, 0,1,0 },
  { ARM::t2STR_POST,ARM::tSTMIA_UPD,0, 0, 0, 1, 0, 0,0, 0,1,0 },

  { ARM::t2LDMIA, ARM::tLDMIA, 0, 0, 0, 1, 1, 1,1, 0,1,0 },
  { ARM::t2LDMIA_RET,0, ARM::tPOP_RET, 0, 0, 1, 1, 1,1, 0,1,0 },
  { ARM::t2LDMIA_UPD,ARM::tLDMIA_UPD,ARM::tPOP,0, 0, 1, 1, 1,1, 0,1,0 },
  // ARM::t2STMIA (with no basereg writeback) has no Thumb1 equivalent.
  // tSTMIA_UPD is a change in semantics which can only be used if the base
  // register is killed. This difference is correctly handled elsewhere.
  { ARM::t2STMIA, ARM::tSTMIA_UPD, 0, 0, 0, 1, 1, 1,1, 0,1,0 },
  { ARM::t2STMIA_UPD,ARM::tSTMIA_UPD, 0, 0, 0, 1, 1, 1,1, 0,1,0 },
  { ARM::t2STMDB_UPD, 0, ARM::tPUSH, 0, 0, 1, 1, 1,1, 0,1,0 }
};
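
// Reading a ReduceTable row, using t2ADDri above as an example: it narrows to
// tADDi3 (3-bit immediate) in the general case or to tADDi8 (8-bit immediate)
// in two-address form; both require low registers (lo1/lo2 = 1); both PredCC
// fields are 0; and Special = 1 routes it through ReduceSpecial so the
// SP-relative case can become tADDrSPi instead.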

class Thumb2SizeReduce : public MachineFunctionPass {
public:
  static char ID;

  const Thumb2InstrInfo *TII;
  const ARMSubtarget *STI;

  Thumb2SizeReduce(std::function<bool(const Function &)> Ftor);

  bool runOnMachineFunction(MachineFunction &MF) override;

  MachineFunctionProperties getRequiredProperties() const override {
    return MachineFunctionProperties().set(
        MachineFunctionProperties::Property::NoVRegs);
  }

  StringRef getPassName() const override {
    return "Thumb2 instruction size reduction pass";
  }

private:
  /// ReduceOpcodeMap - Maps wide opcode to index of entry in ReduceTable.
  DenseMap<unsigned, unsigned> ReduceOpcodeMap;

  bool canAddPseudoFlagDep(MachineInstr *Use, bool IsSelfLoop);

  bool VerifyPredAndCC(MachineInstr *MI, const ReduceEntry &Entry,
                       bool is2Addr, ARMCC::CondCodes Pred,
                       bool LiveCPSR, bool &HasCC, bool &CCDead);

  bool ReduceLoadStore(MachineBasicBlock &MBB, MachineInstr *MI,
                       const ReduceEntry &Entry);

  bool ReduceSpecial(MachineBasicBlock &MBB, MachineInstr *MI,
                     const ReduceEntry &Entry, bool LiveCPSR, bool IsSelfLoop);

  /// ReduceTo2Addr - Reduce a 32-bit instruction to a 16-bit two-address
  /// instruction.
  bool ReduceTo2Addr(MachineBasicBlock &MBB, MachineInstr *MI,
                     const ReduceEntry &Entry, bool LiveCPSR,
                     bool IsSelfLoop);

  /// ReduceToNarrow - Reduce a 32-bit instruction to a 16-bit
  /// non-two-address instruction.
  bool ReduceToNarrow(MachineBasicBlock &MBB, MachineInstr *MI,
                      const ReduceEntry &Entry, bool LiveCPSR,
                      bool IsSelfLoop);

  /// ReduceMI - Attempt to reduce MI, return true on success.
  bool ReduceMI(MachineBasicBlock &MBB, MachineInstr *MI,
                bool LiveCPSR, bool IsSelfLoop);

  /// ReduceMBB - Reduce width of instructions in the specified basic block.
  bool ReduceMBB(MachineBasicBlock &MBB);

  bool OptimizeSize;
  bool MinimizeSize;

  // Last instruction to define CPSR in the current block.
  MachineInstr *CPSRDef;
  // Was CPSR last defined by a high latency instruction?
  // When CPSRDef is null, this refers to CPSR defs in predecessors.
  bool HighLatencyCPSR;

  struct MBBInfo {
    // The flags leaving this block have high latency.
    bool HighLatencyCPSR = false;
    // Has this block been visited yet?
    bool Visited = false;

    MBBInfo() = default;
  };

  SmallVector<MBBInfo, 8> BlockInfo;

  std::function<bool(const Function &)> PredicateFtor;
};

char Thumb2SizeReduce::ID = 0;

} // end anonymous namespace

Thumb2SizeReduce::Thumb2SizeReduce(std::function<bool(const Function &)> Ftor)
    : MachineFunctionPass(ID), PredicateFtor(std::move(Ftor)) {
  OptimizeSize = MinimizeSize = false;
  for (unsigned i = 0, e = array_lengthof(ReduceTable); i != e; ++i) {
    unsigned FromOpc = ReduceTable[i].WideOpc;
    if (!ReduceOpcodeMap.insert(std::make_pair(FromOpc, i)).second)
      llvm_unreachable("Duplicated entries?");
  }
}

static bool HasImplicitCPSRDef(const MCInstrDesc &MCID) {
  for (const MCPhysReg *Regs = MCID.getImplicitDefs(); *Regs; ++Regs)
    if (*Regs == ARM::CPSR)
      return true;
  return false;
}

// Check for a likely high-latency flag def.
static bool isHighLatencyCPSR(MachineInstr *Def) {
  switch(Def->getOpcode()) {
  case ARM::FMSTAT:
  case ARM::tMUL:
    return true;
  }
  return false;
}

/// canAddPseudoFlagDep - For A9 (and other out-of-order) implementations,
/// 16-bit instructions with the 's' bit set partially update CPSR. Abort the
/// transformation to avoid adding a false dependency on the last CPSR-setting
/// instruction, which hurts the out-of-order execution engine's ability to do
/// register renaming magic.
/// This function checks if there is a read-after-write dependency between the
/// last instruction that defines the CPSR and the current instruction. If
/// there is, then there is no harm done since the instruction cannot be
/// retired before the CPSR-setting instruction anyway.
/// Note, we are not doing full dependency analysis here for the sake of
/// compile time. We're not looking for cases like:
/// r0 = muls ...
/// r1 = add.w r0, ...
/// ...
///    = mul.w r1
/// In this case it would have been ok to narrow the mul.w to muls since there
/// is an indirect RAW dependency between the muls and the mul.w.
bool
Thumb2SizeReduce::canAddPseudoFlagDep(MachineInstr *Use, bool FirstInSelfLoop) {
  // Disable the check for -Oz (aka OptimizeForSizeHarder).
  if (MinimizeSize || !STI->avoidCPSRPartialUpdate())
    return false;

  if (!CPSRDef)
    // If this BB loops back to itself, conservatively avoid narrowing the
    // first instruction that does partial flag update.
    return HighLatencyCPSR || FirstInSelfLoop;

  SmallSet<unsigned, 2> Defs;
  for (const MachineOperand &MO : CPSRDef->operands()) {
    if (!MO.isReg() || MO.isUndef() || MO.isUse())
      continue;
    unsigned Reg = MO.getReg();
    if (Reg == 0 || Reg == ARM::CPSR)
      continue;
    Defs.insert(Reg);
  }

  for (const MachineOperand &MO : Use->operands()) {
    if (!MO.isReg() || MO.isUndef() || MO.isDef())
      continue;
    unsigned Reg = MO.getReg();
    if (Defs.count(Reg))
      return false;
  }

  // If the current CPSR has high latency, try to avoid the false dependency.
  if (HighLatencyCPSR)
    return true;

  // tMOVi8 usually doesn't start long dependency chains, and there are a lot
  // of them, so always shrink them when CPSR doesn't have high latency.
  if (Use->getOpcode() == ARM::t2MOVi ||
      Use->getOpcode() == ARM::t2MOVi16)
    return false;

  // No read-after-write dependency. The narrowing will add false dependency.
  return true;
}

bool
Thumb2SizeReduce::VerifyPredAndCC(MachineInstr *MI, const ReduceEntry &Entry,
                                  bool is2Addr, ARMCC::CondCodes Pred,
                                  bool LiveCPSR, bool &HasCC, bool &CCDead) {
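  // The architectural constraint behind these checks: outside an IT block a
  // 16-bit data-processing instruction generally must set the flags, while a
  // predicated one (inside an IT block) must not. The PredCC fields of the
  // table entry select which rule applies to the narrow opcode.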
  if ((is2Addr  && Entry.PredCC2 == 0) ||
      (!is2Addr && Entry.PredCC1 == 0)) {
    if (Pred == ARMCC::AL) {
      // Not predicated, must set CPSR.
      if (!HasCC) {
        // Original instruction was not setting CPSR, but CPSR is not
        // currently live anyway. It's ok to set it. The CPSR def is
        // dead though.
        if (!LiveCPSR) {
          HasCC = true;
          CCDead = true;
          return true;
        }
        return false;
      }
    } else {
      // Predicated, must not set CPSR.
      if (HasCC)
        return false;
    }
  } else if ((is2Addr  && Entry.PredCC2 == 2) ||
             (!is2Addr && Entry.PredCC1 == 2)) {
    // Old opcode has an optional def of CPSR.
    if (HasCC)
      return true;
    // If old opcode does not implicitly define CPSR, then it's not ok since
    // these new opcodes' CPSR def is not meant to be thrown away. e.g. CMP.
    if (!HasImplicitCPSRDef(MI->getDesc()))
      return false;
    HasCC = true;
  } else {
    // 16-bit instruction does not set CPSR.
    if (HasCC)
      return false;
  }

  return true;
}

static bool VerifyLowRegs(MachineInstr *MI) {
  unsigned Opc = MI->getOpcode();
  bool isPCOk = (Opc == ARM::t2LDMIA_RET || Opc == ARM::t2LDMIA_UPD);
  bool isLROk = (Opc == ARM::t2STMDB_UPD);
  bool isSPOk = isPCOk || isLROk;
  for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
    const MachineOperand &MO = MI->getOperand(i);
    if (!MO.isReg() || MO.isImplicit())
      continue;
    unsigned Reg = MO.getReg();
    if (Reg == 0 || Reg == ARM::CPSR)
      continue;
    if (isPCOk && Reg == ARM::PC)
      continue;
    if (isLROk && Reg == ARM::LR)
      continue;
    if (Reg == ARM::SP) {
      if (isSPOk)
        continue;
      if (i == 1 && (Opc == ARM::t2LDRi12 || Opc == ARM::t2STRi12))
        // Special case for these ldr / str with sp as base register.
        continue;
    }
    if (!isARMLowRegister(Reg))
      return false;
  }
  return true;
}

bool
Thumb2SizeReduce::ReduceLoadStore(MachineBasicBlock &MBB, MachineInstr *MI,
                                  const ReduceEntry &Entry) {
  if (ReduceLimitLdSt != -1 && ((int)NumLdSts >= ReduceLimitLdSt))
    return false;

  unsigned Scale = 1;
  bool HasImmOffset = false;
  bool HasShift = false;
  bool HasOffReg = true;
  bool isLdStMul = false;
  unsigned Opc = Entry.NarrowOpc1;
  unsigned OpNum = 3; // First 'rest' of operands.
  uint8_t ImmLimit = Entry.Imm1Limit;

  switch (Entry.WideOpc) {
  default:
    llvm_unreachable("Unexpected Thumb2 load / store opcode!");
  case ARM::t2LDRi12:
  case ARM::t2STRi12:
    if (MI->getOperand(1).getReg() == ARM::SP) {
      Opc = Entry.NarrowOpc2;
      ImmLimit = Entry.Imm2Limit;
    }

    Scale = 4;
    HasImmOffset = true;
    HasOffReg = false;
    break;
  case ARM::t2LDRBi12:
  case ARM::t2STRBi12:
    HasImmOffset = true;
    HasOffReg = false;
    break;
  case ARM::t2LDRHi12:
  case ARM::t2STRHi12:
    Scale = 2;
    HasImmOffset = true;
    HasOffReg = false;
    break;
  case ARM::t2LDRs:
  case ARM::t2LDRBs:
  case ARM::t2LDRHs:
  case ARM::t2LDRSBs:
  case ARM::t2LDRSHs:
  case ARM::t2STRs:
  case ARM::t2STRBs:
  case ARM::t2STRHs:
    HasShift = true;
    OpNum = 4;
    break;
  case ARM::t2LDR_POST:
  case ARM::t2STR_POST: {
    if (!MBB.getParent()->getFunction()->optForMinSize())
      return false;

    if (!MI->hasOneMemOperand() ||
        (*MI->memoperands_begin())->getAlignment() < 4)
      return false;

    // We're creating a completely different type of load/store - LDM from LDR.
    // For this reason we can't reuse the logic at the end of this function; we
    // have to implement the MI building here.
    bool IsStore = Entry.WideOpc == ARM::t2STR_POST;
    unsigned Rt = MI->getOperand(IsStore ? 1 : 0).getReg();
    unsigned Rn = MI->getOperand(IsStore ? 0 : 1).getReg();
    unsigned Offset = MI->getOperand(3).getImm();
    unsigned PredImm = MI->getOperand(4).getImm();
    unsigned PredReg = MI->getOperand(5).getReg();
    assert(isARMLowRegister(Rt));
    assert(isARMLowRegister(Rn));
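
    // With a single register in the list, tLDMIA_UPD / tSTMIA_UPD advance the
    // base register by exactly one word, which is why only a post-increment
    // of 4 can be expressed this way.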

    if (Offset != 4)
      return false;

    // Add the 16-bit load / store instruction.
    DebugLoc dl = MI->getDebugLoc();
    auto MIB = BuildMI(MBB, MI, dl, TII->get(Entry.NarrowOpc1))
                   .addReg(Rn, RegState::Define)
                   .addReg(Rn)
                   .addImm(PredImm)
                   .addReg(PredReg)
                   .addReg(Rt, IsStore ? 0 : RegState::Define);

    // Transfer memoperands.
    MIB->setMemRefs(MI->memoperands_begin(), MI->memoperands_end());

    // Transfer MI flags.
    MIB.setMIFlags(MI->getFlags());

    // Kill the old instruction.
    MI->eraseFromBundle();
    ++NumLdSts;
    return true;
  }
  case ARM::t2LDMIA: {
    unsigned BaseReg = MI->getOperand(0).getReg();
    assert(isARMLowRegister(BaseReg));

    // For the non-writeback version (this one), the base register must be
    // one of the registers being loaded.
    bool isOK = false;
    for (unsigned i = 3; i < MI->getNumOperands(); ++i) {
      if (MI->getOperand(i).getReg() == BaseReg) {
        isOK = true;
        break;
      }
    }

    if (!isOK)
      return false;

    OpNum = 0;
    isLdStMul = true;
    break;
  }
  case ARM::t2STMIA:
    // If the base register is killed, we don't care what its value is after
    // the instruction, so we can use an updating STMIA.
    if (!MI->getOperand(0).isKill())
      return false;

    break;
  case ARM::t2LDMIA_RET: {
    unsigned BaseReg = MI->getOperand(1).getReg();
    if (BaseReg != ARM::SP)
      return false;
    Opc = Entry.NarrowOpc2; // tPOP_RET
    OpNum = 2;
    isLdStMul = true;
    break;
  }
  case ARM::t2LDMIA_UPD:
  case ARM::t2STMIA_UPD:
  case ARM::t2STMDB_UPD: {
    OpNum = 0;

    unsigned BaseReg = MI->getOperand(1).getReg();
    if (BaseReg == ARM::SP &&
        (Entry.WideOpc == ARM::t2LDMIA_UPD ||
         Entry.WideOpc == ARM::t2STMDB_UPD)) {
      Opc = Entry.NarrowOpc2; // tPOP or tPUSH
      OpNum = 2;
    } else if (!isARMLowRegister(BaseReg) ||
               (Entry.WideOpc != ARM::t2LDMIA_UPD &&
                Entry.WideOpc != ARM::t2STMIA_UPD)) {
      return false;
    }

    isLdStMul = true;
    break;
  }
  }

  unsigned OffsetReg = 0;
  bool OffsetKill = false;
  bool OffsetInternal = false;
  if (HasShift) {
    OffsetReg  = MI->getOperand(2).getReg();
    OffsetKill = MI->getOperand(2).isKill();
    OffsetInternal = MI->getOperand(2).isInternalRead();

    if (MI->getOperand(3).getImm())
      // Thumb1 addressing mode doesn't support shift.
      return false;
  }

  unsigned OffsetImm = 0;
  if (HasImmOffset) {
    OffsetImm = MI->getOperand(2).getImm();
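    // Worked example from ReduceTable: t2LDRi12 narrows to tLDRi (5-bit
    // immediate, scale 4), so MaxOffset = 31 * 4 = 124; with an SP base it
    // narrows to tLDRspi (8-bit immediate), giving 255 * 4 = 1020.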
    unsigned MaxOffset = ((1 << ImmLimit) - 1) * Scale;

    if ((OffsetImm & (Scale - 1)) || OffsetImm > MaxOffset)
      // Make sure the immediate field fits.
      return false;
  }

  // Add the 16-bit load / store instruction.
  DebugLoc dl = MI->getDebugLoc();
  MachineInstrBuilder MIB = BuildMI(MBB, MI, dl, TII->get(Opc));

  // tSTMIA_UPD takes a defining register operand. We've already checked that
  // the register is killed, so mark it as dead here.
  if (Entry.WideOpc == ARM::t2STMIA)
    MIB.addReg(MI->getOperand(0).getReg(), RegState::Define | RegState::Dead);

  if (!isLdStMul) {
    MIB.add(MI->getOperand(0));
    MIB.add(MI->getOperand(1));

    if (HasImmOffset)
      MIB.addImm(OffsetImm / Scale);

    assert((!HasShift || OffsetReg) && "Invalid so_reg load / store address!");

    if (HasOffReg)
      MIB.addReg(OffsetReg, getKillRegState(OffsetKill) |
                            getInternalReadRegState(OffsetInternal));
  }

  // Transfer the rest of operands.
  for (unsigned e = MI->getNumOperands(); OpNum != e; ++OpNum)
    MIB.add(MI->getOperand(OpNum));

  // Transfer memoperands.
  MIB->setMemRefs(MI->memoperands_begin(), MI->memoperands_end());

  // Transfer MI flags.
  MIB.setMIFlags(MI->getFlags());

  DEBUG(errs() << "Converted 32-bit: " << *MI << " to 16-bit: " << *MIB);

  MBB.erase_instr(MI);
  ++NumLdSts;
  return true;
}

bool
Thumb2SizeReduce::ReduceSpecial(MachineBasicBlock &MBB, MachineInstr *MI,
                                const ReduceEntry &Entry,
                                bool LiveCPSR, bool IsSelfLoop) {
  unsigned Opc = MI->getOpcode();
  if (Opc == ARM::t2ADDri) {
    // If the source register is SP, try to reduce to tADDrSPi, otherwise
    // it's a normal reduce.
    if (MI->getOperand(1).getReg() != ARM::SP) {
      if (ReduceTo2Addr(MBB, MI, Entry, LiveCPSR, IsSelfLoop))
        return true;
      return ReduceToNarrow(MBB, MI, Entry, LiveCPSR, IsSelfLoop);
    }
    // Try to reduce to tADDrSPi.
    unsigned Imm = MI->getOperand(2).getImm();
    // The immediate must be in range, the destination register must be a low
    // reg, the predicate must be "always", and the condition flags must not
    // be set.
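    // (tADDrSPi encodes an 8-bit immediate scaled by 4, hence the
    // multiple-of-4 and 255 * 4 = 1020 limits checked below.)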
    if (Imm & 3 || Imm > 1020)
      return false;
    if (!isARMLowRegister(MI->getOperand(0).getReg()))
      return false;
    if (MI->getOperand(3).getImm() != ARMCC::AL)
      return false;
    const MCInstrDesc &MCID = MI->getDesc();
    if (MCID.hasOptionalDef() &&
        MI->getOperand(MCID.getNumOperands()-1).getReg() == ARM::CPSR)
      return false;

    MachineInstrBuilder MIB =
        BuildMI(MBB, MI, MI->getDebugLoc(),
                TII->get(ARM::tADDrSPi))
            .add(MI->getOperand(0))
            .add(MI->getOperand(1))
            .addImm(Imm / 4) // The tADDrSPi has an implied scale by four.
            .add(predOps(ARMCC::AL));

    // Transfer MI flags.
    MIB.setMIFlags(MI->getFlags());

    DEBUG(errs() << "Converted 32-bit: " << *MI << " to 16-bit: " << *MIB);

    MBB.erase_instr(MI);
    ++NumNarrows;
    return true;
  }

  if (Entry.LowRegs1 && !VerifyLowRegs(MI))
    return false;

  if (MI->mayLoadOrStore())
    return ReduceLoadStore(MBB, MI, Entry);

  switch (Opc) {
  default: break;
  case ARM::t2ADDSri:
  case ARM::t2ADDSrr: {
    unsigned PredReg = 0;
    if (getInstrPredicate(*MI, PredReg) == ARMCC::AL) {
      switch (Opc) {
      default: break;
      case ARM::t2ADDSri:
        if (ReduceTo2Addr(MBB, MI, Entry, LiveCPSR, IsSelfLoop))
          return true;
        LLVM_FALLTHROUGH;
      case ARM::t2ADDSrr:
        return ReduceToNarrow(MBB, MI, Entry, LiveCPSR, IsSelfLoop);
      }
    }
    break;
  }
  case ARM::t2RSBri:
  case ARM::t2RSBSri:
  case ARM::t2SXTB:
  case ARM::t2SXTH:
  case ARM::t2UXTB:
  case ARM::t2UXTH:
    if (MI->getOperand(2).getImm() == 0)
      return ReduceToNarrow(MBB, MI, Entry, LiveCPSR, IsSelfLoop);
    break;
  case ARM::t2MOVi16:
    // Can convert only 'pure' immediate operands, not immediates obtained as
    // globals' addresses.
    if (MI->getOperand(1).isImm())
      return ReduceToNarrow(MBB, MI, Entry, LiveCPSR, IsSelfLoop);
    break;
  case ARM::t2CMPrr: {
    // Try to reduce to the lo-reg only version first. Why there are two
    // versions of the instruction is a mystery.
    // It would be nice to just have two entries in the master table that
    // are prioritized, but the table assumes a unique entry for each
    // source insn opcode. So for now, we hack a local entry record to use.
    static const ReduceEntry NarrowEntry =
      { ARM::t2CMPrr,ARM::tCMPr, 0, 0, 0, 1, 1,2, 0, 0,1,0 };
    if (ReduceToNarrow(MBB, MI, NarrowEntry, LiveCPSR, IsSelfLoop))
      return true;
    return ReduceToNarrow(MBB, MI, Entry, LiveCPSR, IsSelfLoop);
  }
  }
  return false;
}

bool
Thumb2SizeReduce::ReduceTo2Addr(MachineBasicBlock &MBB, MachineInstr *MI,
                                const ReduceEntry &Entry,
                                bool LiveCPSR, bool IsSelfLoop) {
  if (ReduceLimit2Addr != -1 && ((int)Num2Addrs >= ReduceLimit2Addr))
    return false;

  if (!OptimizeSize && Entry.AvoidMovs && STI->avoidMOVsShifterOperand())
    // Don't issue movs with shifter operand for some CPUs unless we
    // are optimizing for size.
    return false;

  unsigned Reg0 = MI->getOperand(0).getReg();
  unsigned Reg1 = MI->getOperand(1).getReg();
  // t2MUL is "special". The tied source operand is second, not first.
  if (MI->getOpcode() == ARM::t2MUL) {
    unsigned Reg2 = MI->getOperand(2).getReg();
    // Early exit if the regs aren't all low regs.
    if (!isARMLowRegister(Reg0) || !isARMLowRegister(Reg1)
        || !isARMLowRegister(Reg2))
      return false;
    if (Reg0 != Reg2) {
      // If the other operand also isn't the same as the destination, we
      // can't reduce.
      if (Reg1 != Reg0)
        return false;
      // Try to commute the operands to make it a 2-address instruction.
      MachineInstr *CommutedMI = TII->commuteInstruction(*MI);
      if (!CommutedMI)
        return false;
    }
  } else if (Reg0 != Reg1) {
    // Try to commute the operands to make it a 2-address instruction.
    unsigned CommOpIdx1 = 1;
    unsigned CommOpIdx2 = TargetInstrInfo::CommuteAnyOperandIndex;
    if (!TII->findCommutedOpIndices(*MI, CommOpIdx1, CommOpIdx2) ||
        MI->getOperand(CommOpIdx2).getReg() != Reg0)
      return false;
    MachineInstr *CommutedMI =
        TII->commuteInstruction(*MI, false, CommOpIdx1, CommOpIdx2);
    if (!CommutedMI)
      return false;
  }
  if (Entry.LowRegs2 && !isARMLowRegister(Reg0))
    return false;
  if (Entry.Imm2Limit) {
    unsigned Imm = MI->getOperand(2).getImm();
    unsigned Limit = (1 << Entry.Imm2Limit) - 1;
    if (Imm > Limit)
      return false;
  } else {
    unsigned Reg2 = MI->getOperand(2).getReg();
    if (Entry.LowRegs2 && !isARMLowRegister(Reg2))
      return false;
  }

  // Check if it's possible / necessary to transfer the predicate.
  const MCInstrDesc &NewMCID = TII->get(Entry.NarrowOpc2);
  unsigned PredReg = 0;
  ARMCC::CondCodes Pred = getInstrPredicate(*MI, PredReg);
  bool SkipPred = false;
  if (Pred != ARMCC::AL) {
    if (!NewMCID.isPredicable())
      // Can't transfer predicate, fail.
      return false;
  } else {
    SkipPred = !NewMCID.isPredicable();
  }

  bool HasCC = false;
  bool CCDead = false;
  const MCInstrDesc &MCID = MI->getDesc();
  if (MCID.hasOptionalDef()) {
    unsigned NumOps = MCID.getNumOperands();
    HasCC = (MI->getOperand(NumOps-1).getReg() == ARM::CPSR);
    if (HasCC && MI->getOperand(NumOps-1).isDead())
      CCDead = true;
  }
  if (!VerifyPredAndCC(MI, Entry, true, Pred, LiveCPSR, HasCC, CCDead))
    return false;

  // Avoid adding a false dependency on partial flag update by some 16-bit
  // instructions which have the 's' bit set.
  if (Entry.PartFlag && NewMCID.hasOptionalDef() && HasCC &&
      canAddPseudoFlagDep(MI, IsSelfLoop))
    return false;

  // Add the 16-bit instruction.
  DebugLoc dl = MI->getDebugLoc();
  MachineInstrBuilder MIB = BuildMI(MBB, MI, dl, NewMCID);
  MIB.add(MI->getOperand(0));
  if (NewMCID.hasOptionalDef())
    MIB.add(HasCC ? t1CondCodeOp(CCDead) : condCodeOp());

  // Transfer the rest of operands.
  unsigned NumOps = MCID.getNumOperands();
  for (unsigned i = 1, e = MI->getNumOperands(); i != e; ++i) {
    if (i < NumOps && MCID.OpInfo[i].isOptionalDef())
      continue;
    if (SkipPred && MCID.OpInfo[i].isPredicate())
      continue;
    MIB.add(MI->getOperand(i));
  }

  // Transfer MI flags.
  MIB.setMIFlags(MI->getFlags());

  DEBUG(errs() << "Converted 32-bit: " << *MI << " to 16-bit: " << *MIB);

  MBB.erase_instr(MI);
  ++Num2Addrs;
  return true;
}

bool
Thumb2SizeReduce::ReduceToNarrow(MachineBasicBlock &MBB, MachineInstr *MI,
                                 const ReduceEntry &Entry,
                                 bool LiveCPSR, bool IsSelfLoop) {
  if (ReduceLimit != -1 && ((int)NumNarrows >= ReduceLimit))
    return false;

  if (!OptimizeSize && Entry.AvoidMovs && STI->avoidMOVsShifterOperand())
    // Don't issue movs with shifter operand for some CPUs unless we
    // are optimizing for size.
    return false;

  unsigned Limit = ~0U;
  if (Entry.Imm1Limit)
    Limit = (1 << Entry.Imm1Limit) - 1;

  const MCInstrDesc &MCID = MI->getDesc();
  for (unsigned i = 0, e = MCID.getNumOperands(); i != e; ++i) {
    if (MCID.OpInfo[i].isPredicate())
      continue;
    const MachineOperand &MO = MI->getOperand(i);
    if (MO.isReg()) {
      unsigned Reg = MO.getReg();
      if (!Reg || Reg == ARM::CPSR)
        continue;
      if (Entry.LowRegs1 && !isARMLowRegister(Reg))
        return false;
    } else if (MO.isImm() &&
               !MCID.OpInfo[i].isPredicate()) {
      if (((unsigned)MO.getImm()) > Limit)
        return false;
    }
  }

  // Check if it's possible / necessary to transfer the predicate.
  const MCInstrDesc &NewMCID = TII->get(Entry.NarrowOpc1);
  unsigned PredReg = 0;
  ARMCC::CondCodes Pred = getInstrPredicate(*MI, PredReg);
  bool SkipPred = false;
  if (Pred != ARMCC::AL) {
    if (!NewMCID.isPredicable())
      // Can't transfer predicate, fail.
      return false;
  } else {
    SkipPred = !NewMCID.isPredicable();
  }

  bool HasCC = false;
  bool CCDead = false;
  if (MCID.hasOptionalDef()) {
    unsigned NumOps = MCID.getNumOperands();
    HasCC = (MI->getOperand(NumOps-1).getReg() == ARM::CPSR);
    if (HasCC && MI->getOperand(NumOps-1).isDead())
      CCDead = true;
  }
  if (!VerifyPredAndCC(MI, Entry, false, Pred, LiveCPSR, HasCC, CCDead))
    return false;

  // Avoid adding a false dependency on partial flag update by some 16-bit
  // instructions which have the 's' bit set.
  if (Entry.PartFlag && NewMCID.hasOptionalDef() && HasCC &&
      canAddPseudoFlagDep(MI, IsSelfLoop))
    return false;

  // Add the 16-bit instruction.
  DebugLoc dl = MI->getDebugLoc();
  MachineInstrBuilder MIB = BuildMI(MBB, MI, dl, NewMCID);
  MIB.add(MI->getOperand(0));
  if (NewMCID.hasOptionalDef())
    MIB.add(HasCC ? t1CondCodeOp(CCDead) : condCodeOp());

  // Transfer the rest of operands.
  unsigned NumOps = MCID.getNumOperands();
  for (unsigned i = 1, e = MI->getNumOperands(); i != e; ++i) {
    if (i < NumOps && MCID.OpInfo[i].isOptionalDef())
      continue;
    if ((MCID.getOpcode() == ARM::t2RSBSri ||
         MCID.getOpcode() == ARM::t2RSBri ||
         MCID.getOpcode() == ARM::t2SXTB ||
         MCID.getOpcode() == ARM::t2SXTH ||
         MCID.getOpcode() == ARM::t2UXTB ||
         MCID.getOpcode() == ARM::t2UXTH) && i == 2)
      // Skip the zero immediate operand, it's now implicit.
      continue;
    bool isPred = (i < NumOps && MCID.OpInfo[i].isPredicate());
    if (SkipPred && isPred)
      continue;
    const MachineOperand &MO = MI->getOperand(i);
    if (MO.isReg() && MO.isImplicit() && MO.getReg() == ARM::CPSR)
      // Skip implicit def of CPSR. Either it's modeled as an optional
      // def now or it's already an implicit def on the new instruction.
      continue;
    MIB.add(MO);
  }
  if (!MCID.isPredicable() && NewMCID.isPredicable())
    MIB.add(predOps(ARMCC::AL));

  // Transfer MI flags.
  MIB.setMIFlags(MI->getFlags());

  DEBUG(errs() << "Converted 32-bit: " << *MI << " to 16-bit: " << *MIB);

  MBB.erase_instr(MI);
  ++NumNarrows;
  return true;
}
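
// Track a def of CPSR by MI: set DefCPSR if MI defines CPSR at all, and
// return whether CPSR is live after MI (a non-dead def here, or it was
// already live coming in).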
static bool UpdateCPSRDef(MachineInstr &MI, bool LiveCPSR, bool &DefCPSR) {
  bool HasDef = false;
  for (const MachineOperand &MO : MI.operands()) {
    if (!MO.isReg() || MO.isUndef() || MO.isUse())
      continue;
    if (MO.getReg() != ARM::CPSR)
      continue;

    DefCPSR = true;
    if (!MO.isDead())
      HasDef = true;
  }

  return HasDef || LiveCPSR;
}
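
// Update CPSR liveness across a use: a kill of CPSR by MI ends its live
// range within the block.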
static bool UpdateCPSRUse(MachineInstr &MI, bool LiveCPSR) {
  for (const MachineOperand &MO : MI.operands()) {
    if (!MO.isReg() || MO.isUndef() || MO.isDef())
      continue;
    if (MO.getReg() != ARM::CPSR)
      continue;
    assert(LiveCPSR && "CPSR liveness tracking is wrong!");
    if (MO.isKill()) {
      LiveCPSR = false;
      break;
    }
  }

  return LiveCPSR;
}

bool Thumb2SizeReduce::ReduceMI(MachineBasicBlock &MBB, MachineInstr *MI,
                                bool LiveCPSR, bool IsSelfLoop) {
  unsigned Opcode = MI->getOpcode();
  DenseMap<unsigned, unsigned>::iterator OPI = ReduceOpcodeMap.find(Opcode);
  if (OPI == ReduceOpcodeMap.end())
    return false;
  const ReduceEntry &Entry = ReduceTable[OPI->second];

  // Don't attempt normal reductions on "special" cases for now.
  if (Entry.Special)
    return ReduceSpecial(MBB, MI, Entry, LiveCPSR, IsSelfLoop);

  // Try to transform to a 16-bit two-address instruction.
  if (Entry.NarrowOpc2 &&
      ReduceTo2Addr(MBB, MI, Entry, LiveCPSR, IsSelfLoop))
    return true;

  // Try to transform to a 16-bit non-two-address instruction.
  if (Entry.NarrowOpc1 &&
      ReduceToNarrow(MBB, MI, Entry, LiveCPSR, IsSelfLoop))
    return true;

  return false;
}

bool Thumb2SizeReduce::ReduceMBB(MachineBasicBlock &MBB) {
  bool Modified = false;

  // Yes, CPSR could be livein.
  bool LiveCPSR = MBB.isLiveIn(ARM::CPSR);
  MachineInstr *BundleMI = nullptr;

  CPSRDef = nullptr;
  HighLatencyCPSR = false;

  // Check predecessors for the latest CPSRDef.
  for (auto *Pred : MBB.predecessors()) {
    const MBBInfo &PInfo = BlockInfo[Pred->getNumber()];
    if (!PInfo.Visited) {
      // Since blocks are visited in RPO, this must be a back-edge.
      continue;
    }
    if (PInfo.HighLatencyCPSR) {
      HighLatencyCPSR = true;
      break;
    }
  }

  // If this BB loops back to itself, conservatively avoid narrowing the
  // first instruction that does partial flag update.
  bool IsSelfLoop = MBB.isSuccessor(&MBB);
  MachineBasicBlock::instr_iterator MII = MBB.instr_begin(), E = MBB.instr_end();
  MachineBasicBlock::instr_iterator NextMII;
  for (; MII != E; MII = NextMII) {
    NextMII = std::next(MII);

    MachineInstr *MI = &*MII;
    if (MI->isBundle()) {
      BundleMI = MI;
      continue;
    }
    if (MI->isDebugValue())
      continue;

    LiveCPSR = UpdateCPSRUse(*MI, LiveCPSR);

    // Does NextMII belong to the same bundle as MI?
    bool NextInSameBundle = NextMII != E && NextMII->isBundledWithPred();

    if (ReduceMI(MBB, MI, LiveCPSR, IsSelfLoop)) {
      Modified = true;
      MachineBasicBlock::instr_iterator I = std::prev(NextMII);
      MI = &*I;
      // Removing and reinserting the first instruction in a bundle will break
      // up the bundle. Fix the bundling if it was broken.
      if (NextInSameBundle && !NextMII->isBundledWithPred())
        NextMII->bundleWithPred();
    }

    if (BundleMI && !NextInSameBundle && MI->isInsideBundle()) {
      // FIXME: Since post-ra scheduler operates on bundles, the CPSR kill
      // marker is only on the BUNDLE instruction. Process the BUNDLE
      // instruction as we finish with the bundled instruction to work around
      // the inconsistency.
      if (BundleMI->killsRegister(ARM::CPSR))
        LiveCPSR = false;
      MachineOperand *MO = BundleMI->findRegisterDefOperand(ARM::CPSR);
      if (MO && !MO->isDead())
        LiveCPSR = true;
      MO = BundleMI->findRegisterUseOperand(ARM::CPSR);
      if (MO && !MO->isKill())
        LiveCPSR = true;
    }

    bool DefCPSR = false;
    LiveCPSR = UpdateCPSRDef(*MI, LiveCPSR, DefCPSR);
    if (MI->isCall()) {
      // Calls don't really set CPSR.
      CPSRDef = nullptr;
      HighLatencyCPSR = false;
      IsSelfLoop = false;
    } else if (DefCPSR) {
      // This is the last CPSR defining instruction.
      CPSRDef = MI;
      HighLatencyCPSR = isHighLatencyCPSR(CPSRDef);
      IsSelfLoop = false;
    }
  }

  MBBInfo &Info = BlockInfo[MBB.getNumber()];
  Info.HighLatencyCPSR = HighLatencyCPSR;
  Info.Visited = true;
  return Modified;
}

bool Thumb2SizeReduce::runOnMachineFunction(MachineFunction &MF) {
  if (PredicateFtor && !PredicateFtor(*MF.getFunction()))
    return false;

  STI = &static_cast<const ARMSubtarget &>(MF.getSubtarget());
  if (STI->isThumb1Only() || STI->prefers32BitThumb())
    return false;

  TII = static_cast<const Thumb2InstrInfo *>(STI->getInstrInfo());

  // Optimizing / minimizing size? Minimizing size implies optimizing for size.
  OptimizeSize = MF.getFunction()->optForSize();
  MinimizeSize = MF.getFunction()->optForMinSize();

  BlockInfo.clear();
  BlockInfo.resize(MF.getNumBlockIDs());

  // Visit blocks in reverse post-order so the last CPSR def (and whether it
  // was high latency) is known for all visited predecessors of a block.
  ReversePostOrderTraversal<MachineFunction*> RPOT(&MF);
  bool Modified = false;
  for (ReversePostOrderTraversal<MachineFunction*>::rpo_iterator
       I = RPOT.begin(), E = RPOT.end(); I != E; ++I)
    Modified |= ReduceMBB(**I);
  return Modified;
}

/// createThumb2SizeReductionPass - Returns an instance of the Thumb2 size
/// reduction pass.
FunctionPass *llvm::createThumb2SizeReductionPass(
    std::function<bool(const Function &)> Ftor) {
  return new Thumb2SizeReduce(std::move(Ftor));
}