/Users/buildslave/jenkins/workspace/clang-stage2-coverage-R/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp
Line | Count | Source (jump to first uncovered line) |
1 | | //===- HexagonTargetTransformInfo.cpp - Hexagon specific TTI pass ---------===// |
2 | | // |
3 | | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
4 | | // See https://llvm.org/LICENSE.txt for license information. |
5 | | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
6 | | // |
7 | | /// \file |
8 | | /// This file implements a TargetTransformInfo analysis pass specific to the |
9 | | /// Hexagon target machine. It uses the target's detailed information to provide |
10 | | /// more precise answers to certain TTI queries, while letting the target |
11 | | /// independent and default TTI implementations handle the rest. |
12 | | /// |
13 | | //===----------------------------------------------------------------------===// |
14 | | |
15 | | #include "HexagonTargetTransformInfo.h" |
16 | | #include "HexagonSubtarget.h" |
17 | | #include "llvm/Analysis/TargetTransformInfo.h" |
18 | | #include "llvm/CodeGen/ValueTypes.h" |
19 | | #include "llvm/IR/InstrTypes.h" |
20 | | #include "llvm/IR/Instructions.h" |
21 | | #include "llvm/IR/User.h" |
22 | | #include "llvm/Support/Casting.h" |
23 | | #include "llvm/Support/CommandLine.h" |
24 | | #include "llvm/Transforms/Utils/UnrollLoop.h" |
25 | | |
26 | | using namespace llvm; |
27 | | |
28 | | #define DEBUG_TYPE "hexagontti" |
29 | | |
30 | | static cl::opt<bool> HexagonAutoHVX("hexagon-autohvx", cl::init(false), |
31 | | cl::Hidden, cl::desc("Enable loop vectorizer for HVX")); |
32 | | |
33 | | static cl::opt<bool> EmitLookupTables("hexagon-emit-lookup-tables", |
34 | | cl::init(true), cl::Hidden, |
35 | | cl::desc("Control lookup table emission on Hexagon target")); |
36 | | |
37 | | // Constant "cost factor" to make floating point operations more expensive |
38 | | // in terms of vectorization cost. This isn't the best way, but it should |
39 | | // do. Ultimately, the cost should use cycles. |
40 | | static const unsigned FloatFactor = 4; |
41 | | |
42 | 105 | bool HexagonTTIImpl::useHVX() const { |
43 | 105 | return ST.useHVXOps() && HexagonAutoHVX; |
44 | 105 | } |
45 | | |
46 | 30 | bool HexagonTTIImpl::isTypeForHVX(Type *VecTy) const { |
47 | 30 | assert(VecTy->isVectorTy()); |
48 | 30 | // Avoid types like <2 x i32*>. |
49 | 30 | if (!cast<VectorType>(VecTy)->getElementType()->isIntegerTy()) |
50 | 0 | return false; |
51 | 30 | EVT VecVT = EVT::getEVT(VecTy); |
52 | 30 | if (!VecVT.isSimple() || VecVT.getSizeInBits() <= 64) |
53 | 6 | return false; |
54 | 24 | if (ST.isHVXVectorType(VecVT.getSimpleVT())) |
55 | 10 | return true; |
56 | 14 | auto Action = TLI.getPreferredVectorAction(VecVT.getSimpleVT()); |
57 | 14 | return Action == TargetLoweringBase::TypeWidenVector; |
58 | 14 | } |
59 | | |
60 | 0 | unsigned HexagonTTIImpl::getTypeNumElements(Type *Ty) const { |
61 | 0 | if (Ty->isVectorTy()) |
62 | 0 | return Ty->getVectorNumElements(); |
63 | 0 | assert((Ty->isIntegerTy() || Ty->isFloatingPointTy()) && |
64 | 0 | "Expecting scalar type"); |
65 | 0 | return 1; |
66 | 0 | } |
67 | | |
68 | | TargetTransformInfo::PopcntSupportKind |
69 | 0 | HexagonTTIImpl::getPopcntSupport(unsigned IntTyWidthInBit) const { |
70 | 0 | // Return fast hardware support as every input < 64 bits will be promoted |
71 | 0 | // to 64 bits. |
72 | 0 | return TargetTransformInfo::PSK_FastHardware; |
73 | 0 | } |
74 | | |
75 | | // The Hexagon target can unroll loops with run-time trip counts. |
76 | | void HexagonTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE, |
77 | 3 | TTI::UnrollingPreferences &UP) { |
78 | 3 | UP.Runtime = UP.Partial = true; |
79 | 3 | // Only try to peel innermost loops with small runtime trip counts. |
80 | 3 | if (L && L->empty() && canPeel(L) && |
81 | 3 | SE.getSmallConstantTripCount(L) == 0 && |
82 | 3 | SE.getSmallConstantMaxTripCount(L) > 0 && |
83 | 3 | SE.getSmallConstantMaxTripCount(L) <= 5) { |
84 | 1 | UP.PeelCount = 2; |
85 | 1 | } |
86 | 3 | } |
87 | | |
88 | 11.4k | bool HexagonTTIImpl::shouldFavorPostInc() const { |
89 | 11.4k | return true; |
90 | 11.4k | } |
91 | | |
92 | | /// --- Vector TTI begin --- |
93 | | |
94 | 8.50k | unsigned HexagonTTIImpl::getNumberOfRegisters(bool Vector) const { |
95 | 8.50k | if (Vector) |
96 | 33 | return useHVX() ? 32 : 0; |
97 | 8.47k | return 32; |
98 | 8.47k | } |
99 | | |
100 | 20 | unsigned HexagonTTIImpl::getMaxInterleaveFactor(unsigned VF) { |
101 | 20 | return useHVX() ? 2 : 0; |
102 | 20 | } |
103 | | |
104 | 22 | unsigned HexagonTTIImpl::getRegisterBitWidth(bool Vector) const { |
105 | 22 | return Vector ? getMinVectorRegisterBitWidth() : 32; |
106 | 22 | } |
107 | | |
108 | 22 | unsigned HexagonTTIImpl::getMinVectorRegisterBitWidth() const { |
109 | 22 | return useHVX() ? ST.getVectorLength()*8 : 0; |
110 | 22 | } |
111 | | |
112 | 2 | unsigned HexagonTTIImpl::getMinimumVF(unsigned ElemWidth) const { |
113 | 2 | return (8 * ST.getVectorLength()) / ElemWidth; |
114 | 2 | } |
115 | | |
116 | | unsigned HexagonTTIImpl::getScalarizationOverhead(Type *Ty, bool Insert, |
117 | 10 | bool Extract) { |
118 | 10 | return BaseT::getScalarizationOverhead(Ty, Insert, Extract); |
119 | 10 | } |
120 | | |
121 | | unsigned HexagonTTIImpl::getOperandsScalarizationOverhead( |
122 | 5 | ArrayRef<const Value*> Args, unsigned VF) { |
123 | 5 | return BaseT::getOperandsScalarizationOverhead(Args, VF); |
124 | 5 | } |
125 | | |
126 | | unsigned HexagonTTIImpl::getCallInstrCost(Function *F, Type *RetTy, |
127 | 0 | ArrayRef<Type*> Tys) { |
128 | 0 | return BaseT::getCallInstrCost(F, RetTy, Tys); |
129 | 0 | } |
130 | | |
131 | | unsigned HexagonTTIImpl::getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy, |
132 | 0 | ArrayRef<Value*> Args, FastMathFlags FMF, unsigned VF) { |
133 | 0 | return BaseT::getIntrinsicInstrCost(ID, RetTy, Args, FMF, VF); |
134 | 0 | } |
135 | | |
136 | | unsigned HexagonTTIImpl::getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy, |
137 | | ArrayRef<Type*> Tys, FastMathFlags FMF, |
138 | 0 | unsigned ScalarizationCostPassed) { |
139 | 0 | if (ID == Intrinsic::bswap) { |
140 | 0 | std::pair<int, MVT> LT = TLI.getTypeLegalizationCost(DL, RetTy); |
141 | 0 | return LT.first + 2; |
142 | 0 | } |
143 | 0 | return BaseT::getIntrinsicInstrCost(ID, RetTy, Tys, FMF, |
144 | 0 | ScalarizationCostPassed); |
145 | 0 | } |
146 | | |
147 | | unsigned HexagonTTIImpl::getAddressComputationCost(Type *Tp, |
148 | 27 | ScalarEvolution *SE, const SCEV *S) { |
149 | 27 | return 0; |
150 | 27 | } |
151 | | |
152 | | unsigned HexagonTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src, |
153 | 72 | unsigned Alignment, unsigned AddressSpace, const Instruction *I) { |
154 | 72 | assert(Opcode == Instruction::Load || Opcode == Instruction::Store); |
155 | 72 | if (Opcode == Instruction::Store) |
156 | 24 | return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace, I); |
157 | 48 | |
158 | 48 | if (Src->isVectorTy()) { |
159 | 30 | VectorType *VecTy = cast<VectorType>(Src); |
160 | 30 | unsigned VecWidth = VecTy->getBitWidth(); |
161 | 30 | if (useHVX() && isTypeForHVX(VecTy)) { |
162 | 16 | unsigned RegWidth = getRegisterBitWidth(true); |
163 | 16 | assert(RegWidth && "Non-zero vector register width expected"); |
164 | 16 | // Cost of HVX loads. |
165 | 16 | if (VecWidth % RegWidth == 0) |
166 | 10 | return VecWidth / RegWidth; |
167 | 6 | // Cost of constructing HVX vector from scalar loads. |
168 | 6 | Alignment = std::min(Alignment, RegWidth / 8); |
169 | 6 | unsigned AlignWidth = 8 * std::max(1u, Alignment); |
170 | 6 | unsigned NumLoads = alignTo(VecWidth, AlignWidth) / AlignWidth; |
171 | 6 | return 3 * NumLoads; |
172 | 6 | } |
173 | 14 | |
174 | 14 | // Non-HVX vectors. |
175 | 14 | // Add extra cost for floating point types. |
176 | 14 | unsigned Cost = VecTy->getElementType()->isFloatingPointTy() ? FloatFactor |
177 | 14 | : 1; |
178 | 14 | Alignment = std::min(Alignment, 8u); |
179 | 14 | unsigned AlignWidth = 8 * std::max(1u, Alignment); |
180 | 14 | unsigned NumLoads = alignTo(VecWidth, AlignWidth) / AlignWidth; |
181 | 14 | if (Alignment == 4 || Alignment == 8) |
182 | 8 | return Cost * NumLoads; |
183 | 6 | // Loads of less than 32 bits will need extra inserts to compose a vector. |
184 | 6 | unsigned LogA = Log2_32(Alignment); |
185 | 6 | return (3 - LogA) * Cost * NumLoads; |
186 | 6 | } |
187 | 18 | |
188 | 18 | return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace, I); |
189 | 18 | } |
190 | | |
191 | | unsigned HexagonTTIImpl::getMaskedMemoryOpCost(unsigned Opcode, |
192 | 0 | Type *Src, unsigned Alignment, unsigned AddressSpace) { |
193 | 0 | return BaseT::getMaskedMemoryOpCost(Opcode, Src, Alignment, AddressSpace); |
194 | 0 | } |
195 | | |
196 | | unsigned HexagonTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, |
197 | 0 | int Index, Type *SubTp) { |
198 | 0 | return 1; |
199 | 0 | } |
200 | | |
201 | | unsigned HexagonTTIImpl::getGatherScatterOpCost(unsigned Opcode, Type *DataTy, |
202 | 0 | Value *Ptr, bool VariableMask, unsigned Alignment) { |
203 | 0 | return BaseT::getGatherScatterOpCost(Opcode, DataTy, Ptr, VariableMask, |
204 | 0 | Alignment); |
205 | 0 | } |
206 | | |
207 | | unsigned HexagonTTIImpl::getInterleavedMemoryOpCost(unsigned Opcode, |
208 | | Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices, |
209 | | unsigned Alignment, unsigned AddressSpace, bool UseMaskForCond, |
210 | 15 | bool UseMaskForGaps) { |
211 | 15 | if (Indices.size() != Factor || UseMaskForCond || UseMaskForGaps) |
212 | 5 | return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices, |
213 | 5 | Alignment, AddressSpace, |
214 | 5 | UseMaskForCond, UseMaskForGaps); |
215 | 10 | return getMemoryOpCost(Opcode, VecTy, Alignment, AddressSpace, nullptr); |
216 | 10 | } |
217 | | |
218 | | unsigned HexagonTTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, |
219 | 12 | Type *CondTy, const Instruction *I) { |
220 | 12 | if (ValTy->isVectorTy()) { |
221 | 0 | std::pair<int, MVT> LT = TLI.getTypeLegalizationCost(DL, ValTy); |
222 | 0 | if (Opcode == Instruction::FCmp) |
223 | 0 | return LT.first + FloatFactor * getTypeNumElements(ValTy); |
224 | 12 | } |
225 | 12 | return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, I); |
226 | 12 | } |
227 | | |
228 | | unsigned HexagonTTIImpl::getArithmeticInstrCost(unsigned Opcode, Type *Ty, |
229 | | TTI::OperandValueKind Opd1Info, TTI::OperandValueKind Opd2Info, |
230 | | TTI::OperandValueProperties Opd1PropInfo, |
231 | 42 | TTI::OperandValueProperties Opd2PropInfo, ArrayRef<const Value*> Args) { |
232 | 42 | if (Ty->isVectorTy()) { |
233 | 20 | std::pair<int, MVT> LT = TLI.getTypeLegalizationCost(DL, Ty); |
234 | 20 | if (LT.second.isFloatingPoint()) |
235 | 0 | return LT.first + FloatFactor * getTypeNumElements(Ty); |
236 | 42 | } |
237 | 42 | return BaseT::getArithmeticInstrCost(Opcode, Ty, Opd1Info, Opd2Info, |
238 | 42 | Opd1PropInfo, Opd2PropInfo, Args); |
239 | 42 | } |
240 | | |
241 | | unsigned HexagonTTIImpl::getCastInstrCost(unsigned Opcode, Type *DstTy, |
242 | 0 | Type *SrcTy, const Instruction *I) { |
243 | 0 | if (SrcTy->isFPOrFPVectorTy() || DstTy->isFPOrFPVectorTy()) { |
244 | 0 | unsigned SrcN = SrcTy->isFPOrFPVectorTy() ? getTypeNumElements(SrcTy) : 0; |
245 | 0 | unsigned DstN = DstTy->isFPOrFPVectorTy() ? getTypeNumElements(DstTy) : 0; |
246 | 0 | |
247 | 0 | std::pair<int, MVT> SrcLT = TLI.getTypeLegalizationCost(DL, SrcTy); |
248 | 0 | std::pair<int, MVT> DstLT = TLI.getTypeLegalizationCost(DL, DstTy); |
249 | 0 | return std::max(SrcLT.first, DstLT.first) + FloatFactor * (SrcN + DstN); |
250 | 0 | } |
251 | 0 | return 1; |
252 | 0 | } |
253 | | |
254 | | unsigned HexagonTTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val, |
255 | 474 | unsigned Index) { |
256 | 474 | Type *ElemTy = Val->isVectorTy() ? cast<VectorType>(Val)->getElementType() |
257 | 474 | : Val; |
258 | 474 | if (Opcode == Instruction::InsertElement) { |
259 | 248 | // Need two rotations for non-zero index. |
260 | 248 | unsigned Cost = (Index != 0) ? 2 : 0; |
261 | 248 | if (ElemTy->isIntegerTy(32)) |
262 | 248 | return Cost; |
263 | 0 | // If it's not a 32-bit value, there will need to be an extract. |
264 | 0 | return Cost + getVectorInstrCost(Instruction::ExtractElement, Val, Index); |
265 | 0 | } |
266 | 226 | |
267 | 226 | if (Opcode == Instruction::ExtractElement) |
268 | 226 | return 2; |
269 | 0 | |
270 | 0 | return 1; |
271 | 0 | } |
272 | | |
273 | | /// --- Vector TTI end --- |
274 | | |
275 | 2 | unsigned HexagonTTIImpl::getPrefetchDistance() const { |
276 | 2 | return ST.getL1PrefetchDistance(); |
277 | 2 | } |
278 | | |
279 | 0 | unsigned HexagonTTIImpl::getCacheLineSize() const { |
280 | 0 | return ST.getL1CacheLineSize(); |
281 | 0 | } |
282 | | |
283 | | int HexagonTTIImpl::getUserCost(const User *U, |
284 | 290 | ArrayRef<const Value *> Operands) { |
285 | 290 | auto isCastFoldedIntoLoad = [this](const CastInst *CI) -> bool { |
286 | 30 | if (!CI->isIntegerCast()) |
287 | 6 | return false; |
288 | 24 | // Only extensions from an integer type shorter than 32-bit to i32 |
289 | 24 | // can be folded into the load. |
290 | 24 | const DataLayout &DL = getDataLayout(); |
291 | 24 | unsigned SBW = DL.getTypeSizeInBits(CI->getSrcTy()); |
292 | 24 | unsigned DBW = DL.getTypeSizeInBits(CI->getDestTy()); |
293 | 24 | if (DBW != 32 || SBW >= DBW) |
294 | 12 | return false; |
295 | 12 | |
296 | 12 | const LoadInst *LI = dyn_cast<const LoadInst>(CI->getOperand(0)); |
297 | 12 | // Technically, this code could allow multiple uses of the load, and |
298 | 12 | // check if all the uses are the same extension operation, but this |
299 | 12 | // should be sufficient for most cases. |
300 | 12 | return LI && LI->hasOneUse(); |
301 | 12 | }; |
302 | 290 | |
303 | 290 | if (const CastInst *CI = dyn_cast<const CastInst>(U)) |
304 | 30 | if (isCastFoldedIntoLoad(CI)) |
305 | 0 | return TargetTransformInfo::TCC_Free; |
306 | 290 | return BaseT::getUserCost(U, Operands); |
307 | 290 | } |
308 | | |
309 | 39 | bool HexagonTTIImpl::shouldBuildLookupTables() const { |
310 | 39 | return EmitLookupTables; |
311 | 39 | } |