/Users/buildslave/jenkins/workspace/clang-stage2-coverage-R/llvm/tools/polly/lib/Transform/ScheduleOptimizer.cpp
Line | Count | Source (jump to first uncovered line) |
1 | | //===- Schedule.cpp - Calculate an optimized schedule ---------------------===// |
2 | | // |
3 | | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
4 | | // See https://llvm.org/LICENSE.txt for license information. |
5 | | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
6 | | // |
7 | | //===----------------------------------------------------------------------===// |
8 | | // |
9 | | // This pass generates an entirely new schedule tree from the data dependences |
10 | | // and iteration domains. The new schedule tree is computed in two steps: |
11 | | // |
12 | | // 1) The isl scheduling optimizer is run |
13 | | // |
14 | | // The isl scheduling optimizer creates a new schedule tree that maximizes |
15 | | // parallelism and tileability and minimizes data-dependence distances. The |
16 | | // algorithm used is a modified version of the ``Pluto'' algorithm: |
17 | | // |
18 | | // U. Bondhugula, A. Hartono, J. Ramanujam, and P. Sadayappan. |
19 | | // A Practical Automatic Polyhedral Parallelizer and Locality Optimizer. |
20 | | // In Proceedings of the 2008 ACM SIGPLAN Conference On Programming Language |
21 | | // Design and Implementation, PLDI ’08, pages 101–113. ACM, 2008. |
22 | | // |
23 | | // 2) A set of post-scheduling transformations is applied on the schedule tree. |
24 | | // |
25 | | // These optimizations include: |
26 | | // |
27 | | // - Tiling of the innermost tilable bands |
28 | | // - Prevectorization - The choice of a possible outer loop that is strip-mined |
29 | | // to the innermost level to enable inner-loop |
30 | | // vectorization. |
31 | | // - Some optimizations for spatial locality are also planned. |
32 | | // |
33 | | // For a detailed description of the schedule tree itself please see section 6 |
34 | | // of: |
35 | | // |
36 | | // Polyhedral AST generation is more than scanning polyhedra |
37 | | // Tobias Grosser, Sven Verdoolaege, Albert Cohen |
38 | | // ACM Transactions on Programming Languages and Systems (TOPLAS), |
39 | | // 37(4), July 2015 |
40 | | // http://www.grosser.es/#pub-polyhedral-AST-generation |
41 | | // |
42 | | // This publication also contains a detailed discussion of the different options |
43 | | // for polyhedral loop unrolling, full/partial tile separation and other uses |
44 | | // of the schedule tree. |
45 | | // |
46 | | //===----------------------------------------------------------------------===// |
47 | | |
48 | | #include "polly/ScheduleOptimizer.h" |
49 | | #include "polly/CodeGen/CodeGeneration.h" |
50 | | #include "polly/DependenceInfo.h" |
51 | | #include "polly/LinkAllPasses.h" |
52 | | #include "polly/Options.h" |
53 | | #include "polly/ScheduleTreeTransform.h" |
54 | | #include "polly/ScopInfo.h" |
55 | | #include "polly/ScopPass.h" |
56 | | #include "polly/Simplify.h" |
57 | | #include "polly/Support/ISLOStream.h" |
58 | | #include "llvm/ADT/Statistic.h" |
59 | | #include "llvm/Analysis/TargetTransformInfo.h" |
60 | | #include "llvm/IR/Function.h" |
61 | | #include "llvm/Support/CommandLine.h" |
62 | | #include "llvm/Support/Debug.h" |
63 | | #include "llvm/Support/raw_ostream.h" |
64 | | #include "isl/ctx.h" |
65 | | #include "isl/options.h" |
66 | | #include "isl/printer.h" |
67 | | #include "isl/schedule.h" |
68 | | #include "isl/schedule_node.h" |
69 | | #include "isl/union_map.h" |
70 | | #include "isl/union_set.h" |
71 | | #include <algorithm> |
72 | | #include <cassert> |
73 | | #include <cmath> |
74 | | #include <cstdint> |
75 | | #include <cstdlib> |
76 | | #include <string> |
77 | | #include <vector> |
78 | | |
79 | | using namespace llvm; |
80 | | using namespace polly; |
81 | | |
82 | | #define DEBUG_TYPE "polly-opt-isl" |
83 | | |
84 | | static cl::opt<std::string> |
85 | | OptimizeDeps("polly-opt-optimize-only", |
86 | | cl::desc("Only a certain kind of dependences (all/raw)"), |
87 | | cl::Hidden, cl::init("all"), cl::ZeroOrMore, |
88 | | cl::cat(PollyCategory)); |
89 | | |
90 | | static cl::opt<std::string> |
91 | | SimplifyDeps("polly-opt-simplify-deps", |
92 | | cl::desc("Dependences should be simplified (yes/no)"), |
93 | | cl::Hidden, cl::init("yes"), cl::ZeroOrMore, |
94 | | cl::cat(PollyCategory)); |
95 | | |
96 | | static cl::opt<int> MaxConstantTerm( |
97 | | "polly-opt-max-constant-term", |
98 | | cl::desc("The maximal constant term allowed (-1 is unlimited)"), cl::Hidden, |
99 | | cl::init(20), cl::ZeroOrMore, cl::cat(PollyCategory)); |
100 | | |
101 | | static cl::opt<int> MaxCoefficient( |
102 | | "polly-opt-max-coefficient", |
103 | | cl::desc("The maximal coefficient allowed (-1 is unlimited)"), cl::Hidden, |
104 | | cl::init(20), cl::ZeroOrMore, cl::cat(PollyCategory)); |
105 | | |
106 | | static cl::opt<std::string> FusionStrategy( |
107 | | "polly-opt-fusion", cl::desc("The fusion strategy to choose (min/max)"), |
108 | | cl::Hidden, cl::init("min"), cl::ZeroOrMore, cl::cat(PollyCategory)); |
109 | | |
110 | | static cl::opt<std::string> |
111 | | MaximizeBandDepth("polly-opt-maximize-bands", |
112 | | cl::desc("Maximize the band depth (yes/no)"), cl::Hidden, |
113 | | cl::init("yes"), cl::ZeroOrMore, cl::cat(PollyCategory)); |
114 | | |
115 | | static cl::opt<std::string> OuterCoincidence( |
116 | | "polly-opt-outer-coincidence", |
117 | | cl::desc("Try to construct schedules where the outer member of each band " |
118 | | "satisfies the coincidence constraints (yes/no)"), |
119 | | cl::Hidden, cl::init("no"), cl::ZeroOrMore, cl::cat(PollyCategory)); |
120 | | |
121 | | static cl::opt<int> PrevectorWidth( |
122 | | "polly-prevect-width", |
123 | | cl::desc( |
124 | | "The number of loop iterations to strip-mine for pre-vectorization"), |
125 | | cl::Hidden, cl::init(4), cl::ZeroOrMore, cl::cat(PollyCategory)); |
126 | | |
127 | | static cl::opt<bool> FirstLevelTiling("polly-tiling", |
128 | | cl::desc("Enable loop tiling"), |
129 | | cl::init(true), cl::ZeroOrMore, |
130 | | cl::cat(PollyCategory)); |
131 | | |
132 | | static cl::opt<int> LatencyVectorFma( |
133 | | "polly-target-latency-vector-fma", |
134 | | cl::desc("The minimal number of cycles between issuing two " |
135 | | "dependent consecutive vector fused multiply-add " |
136 | | "instructions."), |
137 | | cl::Hidden, cl::init(8), cl::ZeroOrMore, cl::cat(PollyCategory)); |
138 | | |
139 | | static cl::opt<int> ThroughputVectorFma( |
140 | | "polly-target-throughput-vector-fma", |
141 | | cl::desc("A throughput of the processor floating-point arithmetic units " |
142 | | "expressed in the number of vector fused multiply-add " |
143 | | "instructions per clock cycle."), |
144 | | cl::Hidden, cl::init(1), cl::ZeroOrMore, cl::cat(PollyCategory)); |
145 | | |
146 | | // This option, along with --polly-target-2nd-cache-level-associativity, |
147 | | // --polly-target-1st-cache-level-size, and --polly-target-2st-cache-level-size |
148 | | // represent the parameters of the target cache, which do not have typical |
149 | | // values that can be used by default. However, to apply the pattern matching |
150 | | // optimizations, we use the values of the parameters of Intel Core i7-3820 |
151 | | // SandyBridge in case the parameters are not specified or not provided by the |
152 | | // TargetTransformInfo. |
153 | | static cl::opt<int> FirstCacheLevelAssociativity( |
154 | | "polly-target-1st-cache-level-associativity", |
155 | | cl::desc("The associativity of the first cache level."), cl::Hidden, |
156 | | cl::init(-1), cl::ZeroOrMore, cl::cat(PollyCategory)); |
157 | | |
158 | | static cl::opt<int> FirstCacheLevelDefaultAssociativity( |
159 | | "polly-target-1st-cache-level-default-associativity", |
160 | | cl::desc("The default associativity of the first cache level" |
161 | | " (if not enough were provided by the TargetTransformInfo)."), |
162 | | cl::Hidden, cl::init(8), cl::ZeroOrMore, cl::cat(PollyCategory)); |
163 | | |
164 | | static cl::opt<int> SecondCacheLevelAssociativity( |
165 | | "polly-target-2nd-cache-level-associativity", |
166 | | cl::desc("The associativity of the second cache level."), cl::Hidden, |
167 | | cl::init(-1), cl::ZeroOrMore, cl::cat(PollyCategory)); |
168 | | |
169 | | static cl::opt<int> SecondCacheLevelDefaultAssociativity( |
170 | | "polly-target-2nd-cache-level-default-associativity", |
171 | | cl::desc("The default associativity of the second cache level" |
172 | | " (if not enough were provided by the TargetTransformInfo)."), |
173 | | cl::Hidden, cl::init(8), cl::ZeroOrMore, cl::cat(PollyCategory)); |
174 | | |
175 | | static cl::opt<int> FirstCacheLevelSize( |
176 | | "polly-target-1st-cache-level-size", |
177 | | cl::desc("The size of the first cache level specified in bytes."), |
178 | | cl::Hidden, cl::init(-1), cl::ZeroOrMore, cl::cat(PollyCategory)); |
179 | | |
180 | | static cl::opt<int> FirstCacheLevelDefaultSize( |
181 | | "polly-target-1st-cache-level-default-size", |
182 | | cl::desc("The default size of the first cache level specified in bytes" |
183 | | " (if not enough were provided by the TargetTransformInfo)."), |
184 | | cl::Hidden, cl::init(32768), cl::ZeroOrMore, cl::cat(PollyCategory)); |
185 | | |
186 | | static cl::opt<int> SecondCacheLevelSize( |
187 | | "polly-target-2nd-cache-level-size", |
188 | | cl::desc("The size of the second level specified in bytes."), cl::Hidden, |
189 | | cl::init(-1), cl::ZeroOrMore, cl::cat(PollyCategory)); |
190 | | |
191 | | static cl::opt<int> SecondCacheLevelDefaultSize( |
192 | | "polly-target-2nd-cache-level-default-size", |
193 | | cl::desc("The default size of the second cache level specified in bytes" |
194 | | " (if not enough were provided by the TargetTransformInfo)."), |
195 | | cl::Hidden, cl::init(262144), cl::ZeroOrMore, cl::cat(PollyCategory)); |
196 | | |
197 | | static cl::opt<int> VectorRegisterBitwidth( |
198 | | "polly-target-vector-register-bitwidth", |
199 | | cl::desc("The size in bits of a vector register (if not set, this " |
200 | | "information is taken from LLVM's target information."), |
201 | | cl::Hidden, cl::init(-1), cl::ZeroOrMore, cl::cat(PollyCategory)); |
202 | | |
203 | | static cl::opt<int> FirstLevelDefaultTileSize( |
204 | | "polly-default-tile-size", |
205 | | cl::desc("The default tile size (if not enough were provided by" |
206 | | " --polly-tile-sizes)"), |
207 | | cl::Hidden, cl::init(32), cl::ZeroOrMore, cl::cat(PollyCategory)); |
208 | | |
209 | | static cl::list<int> |
210 | | FirstLevelTileSizes("polly-tile-sizes", |
211 | | cl::desc("A tile size for each loop dimension, filled " |
212 | | "with --polly-default-tile-size"), |
213 | | cl::Hidden, cl::ZeroOrMore, cl::CommaSeparated, |
214 | | cl::cat(PollyCategory)); |
215 | | |
216 | | static cl::opt<bool> |
217 | | SecondLevelTiling("polly-2nd-level-tiling", |
218 | | cl::desc("Enable a 2nd level loop of loop tiling"), |
219 | | cl::init(false), cl::ZeroOrMore, cl::cat(PollyCategory)); |
220 | | |
221 | | static cl::opt<int> SecondLevelDefaultTileSize( |
222 | | "polly-2nd-level-default-tile-size", |
223 | | cl::desc("The default 2nd-level tile size (if not enough were provided by" |
224 | | " --polly-2nd-level-tile-sizes)"), |
225 | | cl::Hidden, cl::init(16), cl::ZeroOrMore, cl::cat(PollyCategory)); |
226 | | |
227 | | static cl::list<int> |
228 | | SecondLevelTileSizes("polly-2nd-level-tile-sizes", |
229 | | cl::desc("A tile size for each loop dimension, filled " |
230 | | "with --polly-default-tile-size"), |
231 | | cl::Hidden, cl::ZeroOrMore, cl::CommaSeparated, |
232 | | cl::cat(PollyCategory)); |
233 | | |
234 | | static cl::opt<bool> RegisterTiling("polly-register-tiling", |
235 | | cl::desc("Enable register tiling"), |
236 | | cl::init(false), cl::ZeroOrMore, |
237 | | cl::cat(PollyCategory)); |
238 | | |
239 | | static cl::opt<int> RegisterDefaultTileSize( |
240 | | "polly-register-tiling-default-tile-size", |
241 | | cl::desc("The default register tile size (if not enough were provided by" |
242 | | " --polly-register-tile-sizes)"), |
243 | | cl::Hidden, cl::init(2), cl::ZeroOrMore, cl::cat(PollyCategory)); |
244 | | |
245 | | static cl::opt<int> PollyPatternMatchingNcQuotient( |
246 | | "polly-pattern-matching-nc-quotient", |
247 | | cl::desc("Quotient that is obtained by dividing Nc, the parameter of the" |
248 | | "macro-kernel, by Nr, the parameter of the micro-kernel"), |
249 | | cl::Hidden, cl::init(256), cl::ZeroOrMore, cl::cat(PollyCategory)); |
250 | | |
251 | | static cl::list<int> |
252 | | RegisterTileSizes("polly-register-tile-sizes", |
253 | | cl::desc("A tile size for each loop dimension, filled " |
254 | | "with --polly-register-tile-size"), |
255 | | cl::Hidden, cl::ZeroOrMore, cl::CommaSeparated, |
256 | | cl::cat(PollyCategory)); |
257 | | |
258 | | static cl::opt<bool> |
259 | | PMBasedOpts("polly-pattern-matching-based-opts", |
260 | | cl::desc("Perform optimizations based on pattern matching"), |
261 | | cl::init(true), cl::ZeroOrMore, cl::cat(PollyCategory)); |
262 | | |
263 | | static cl::opt<bool> OptimizedScops( |
264 | | "polly-optimized-scops", |
265 | | cl::desc("Polly - Dump polyhedral description of Scops optimized with " |
266 | | "the isl scheduling optimizer and the set of post-scheduling " |
267 | | "transformations is applied on the schedule tree"), |
268 | | cl::init(false), cl::ZeroOrMore, cl::cat(PollyCategory)); |
269 | | |
270 | | STATISTIC(ScopsProcessed, "Number of scops processed"); |
271 | | STATISTIC(ScopsRescheduled, "Number of scops rescheduled"); |
272 | | STATISTIC(ScopsOptimized, "Number of scops optimized"); |
273 | | |
274 | | STATISTIC(NumAffineLoopsOptimized, "Number of affine loops optimized"); |
275 | | STATISTIC(NumBoxedLoopsOptimized, "Number of boxed loops optimized"); |
276 | | |
277 | | #define THREE_STATISTICS(VARNAME, DESC) \ |
278 | | static Statistic VARNAME[3] = { \ |
279 | | {DEBUG_TYPE, #VARNAME "0", DESC " (original)", {0}, {false}}, \ |
280 | | {DEBUG_TYPE, #VARNAME "1", DESC " (after scheduler)", {0}, {false}}, \ |
281 | | {DEBUG_TYPE, #VARNAME "2", DESC " (after optimizer)", {0}, {false}}} |
282 | | |
283 | | THREE_STATISTICS(NumBands, "Number of bands"); |
284 | | THREE_STATISTICS(NumBandMembers, "Number of band members"); |
285 | | THREE_STATISTICS(NumCoincident, "Number of coincident band members"); |
286 | | THREE_STATISTICS(NumPermutable, "Number of permutable bands"); |
287 | | THREE_STATISTICS(NumFilters, "Number of filter nodes"); |
288 | | THREE_STATISTICS(NumExtension, "Number of extension nodes"); |
289 | | |
290 | | STATISTIC(FirstLevelTileOpts, "Number of first level tiling applied"); |
291 | | STATISTIC(SecondLevelTileOpts, "Number of second level tiling applied"); |
292 | | STATISTIC(RegisterTileOpts, "Number of register tiling applied"); |
293 | | STATISTIC(PrevectOpts, "Number of strip-mining for prevectorization applied"); |
294 | | STATISTIC(MatMulOpts, |
295 | | "Number of matrix multiplication patterns detected and optimized"); |
296 | | |
297 | | /// Create an isl::union_set, which describes the isolate option based on |
298 | | /// IsolateDomain. |
299 | | /// |
300 | | /// @param IsolateDomain An isl::set whose @p OutDimsNum last dimensions should |
301 | | /// belong to the current band node. |
302 | | /// @param OutDimsNum A number of dimensions that should belong to |
303 | | /// the current band node. |
304 | | static isl::union_set getIsolateOptions(isl::set IsolateDomain, |
305 | 41 | unsigned OutDimsNum) { |
306 | 41 | unsigned Dims = IsolateDomain.dim(isl::dim::set); |
307 | 41 | assert(OutDimsNum <= Dims && |
308 | 41 | "The isl::set IsolateDomain is used to describe the range of schedule " |
309 | 41 | "dimensions values, which should be isolated. Consequently, the " |
310 | 41 | "number of its dimensions should be greater than or equal to the " |
311 | 41 | "number of the schedule dimensions."); |
312 | 41 | isl::map IsolateRelation = isl::map::from_domain(IsolateDomain); |
313 | 41 | IsolateRelation = IsolateRelation.move_dims(isl::dim::out, 0, isl::dim::in, |
314 | 41 | Dims - OutDimsNum, OutDimsNum); |
315 | 41 | isl::set IsolateOption = IsolateRelation.wrap(); |
316 | 41 | isl::id Id = isl::id::alloc(IsolateOption.get_ctx(), "isolate", nullptr); |
317 | 41 | IsolateOption = IsolateOption.set_tuple_id(Id); |
318 | 41 | return isl::union_set(IsolateOption); |
319 | 41 | } |
320 | | |
321 | | namespace { |
322 | | /// Create an isl::union_set, which describes the specified option for the |
323 | | /// dimension of the current node. |
324 | | /// |
325 | | /// @param Ctx An isl::ctx, which is used to create the isl::union_set. |
326 | | /// @param Option The name of the option. |
327 | 41 | isl::union_set getDimOptions(isl::ctx Ctx, const char *Option) { |
328 | 41 | isl::space Space(Ctx, 0, 1); |
329 | 41 | auto DimOption = isl::set::universe(Space); |
330 | 41 | auto Id = isl::id::alloc(Ctx, Option, nullptr); |
331 | 41 | DimOption = DimOption.set_tuple_id(Id); |
332 | 41 | return isl::union_set(DimOption); |
333 | 41 | } |
334 | | } // namespace |
335 | | |
336 | | /// Create an isl::union_set, which describes the option of the form |
337 | | /// [isolate[] -> unroll[x]]. |
338 | | /// |
339 | | /// @param Ctx An isl::ctx, which is used to create the isl::union_set. |
340 | 12 | static isl::union_set getUnrollIsolatedSetOptions(isl::ctx Ctx) { |
341 | 12 | isl::space Space = isl::space(Ctx, 0, 0, 1); |
342 | 12 | isl::map UnrollIsolatedSetOption = isl::map::universe(Space); |
343 | 12 | isl::id DimInId = isl::id::alloc(Ctx, "isolate", nullptr); |
344 | 12 | isl::id DimOutId = isl::id::alloc(Ctx, "unroll", nullptr); |
345 | 12 | UnrollIsolatedSetOption = |
346 | 12 | UnrollIsolatedSetOption.set_tuple_id(isl::dim::in, DimInId); |
347 | 12 | UnrollIsolatedSetOption = |
348 | 12 | UnrollIsolatedSetOption.set_tuple_id(isl::dim::out, DimOutId); |
349 | 12 | return UnrollIsolatedSetOption.wrap(); |
350 | 12 | } |
351 | | |
352 | | /// Make the last dimension of Set to take values from 0 to VectorWidth - 1. |
353 | | /// |
354 | | /// @param Set A set, which should be modified. |
355 | | /// @param VectorWidth A parameter, which determines the constraint. |
356 | 44 | static isl::set addExtentConstraints(isl::set Set, int VectorWidth) { |
357 | 44 | unsigned Dims = Set.dim(isl::dim::set); |
358 | 44 | isl::space Space = Set.get_space(); |
359 | 44 | isl::local_space LocalSpace = isl::local_space(Space); |
360 | 44 | isl::constraint ExtConstr = isl::constraint::alloc_inequality(LocalSpace); |
361 | 44 | ExtConstr = ExtConstr.set_constant_si(0); |
362 | 44 | ExtConstr = ExtConstr.set_coefficient_si(isl::dim::set, Dims - 1, 1); |
363 | 44 | Set = Set.add_constraint(ExtConstr); |
364 | 44 | ExtConstr = isl::constraint::alloc_inequality(LocalSpace); |
365 | 44 | ExtConstr = ExtConstr.set_constant_si(VectorWidth - 1); |
366 | 44 | ExtConstr = ExtConstr.set_coefficient_si(isl::dim::set, Dims - 1, -1); |
367 | 44 | return Set.add_constraint(ExtConstr); |
368 | 44 | } |
369 | | |
370 | 44 | isl::set getPartialTilePrefixes(isl::set ScheduleRange, int VectorWidth) { |
371 | 44 | unsigned Dims = ScheduleRange.dim(isl::dim::set); |
372 | 44 | isl::set LoopPrefixes = |
373 | 44 | ScheduleRange.drop_constraints_involving_dims(isl::dim::set, Dims - 1, 1); |
374 | 44 | auto ExtentPrefixes = addExtentConstraints(LoopPrefixes, VectorWidth); |
375 | 44 | isl::set BadPrefixes = ExtentPrefixes.subtract(ScheduleRange); |
376 | 44 | BadPrefixes = BadPrefixes.project_out(isl::dim::set, Dims - 1, 1); |
377 | 44 | LoopPrefixes = LoopPrefixes.project_out(isl::dim::set, Dims - 1, 1); |
378 | 44 | return LoopPrefixes.subtract(BadPrefixes); |
379 | 44 | } |
380 | | |
381 | | isl::schedule_node |
382 | | ScheduleTreeOptimizer::isolateFullPartialTiles(isl::schedule_node Node, |
383 | 17 | int VectorWidth) { |
384 | 17 | assert(isl_schedule_node_get_type(Node.get()) == isl_schedule_node_band); |
385 | 17 | Node = Node.child(0).child(0); |
386 | 17 | isl::union_map SchedRelUMap = Node.get_prefix_schedule_relation(); |
387 | 17 | isl::map ScheduleRelation = isl::map::from_union_map(SchedRelUMap); |
388 | 17 | isl::set ScheduleRange = ScheduleRelation.range(); |
389 | 17 | isl::set IsolateDomain = getPartialTilePrefixes(ScheduleRange, VectorWidth); |
390 | 17 | auto AtomicOption = getDimOptions(IsolateDomain.get_ctx(), "atomic"); |
391 | 17 | isl::union_set IsolateOption = getIsolateOptions(IsolateDomain, 1); |
392 | 17 | Node = Node.parent().parent(); |
393 | 17 | isl::union_set Options = IsolateOption.unite(AtomicOption); |
394 | 17 | Node = Node.band_set_ast_build_options(Options); |
395 | 17 | return Node; |
396 | 17 | } |
397 | | |
398 | | isl::schedule_node ScheduleTreeOptimizer::prevectSchedBand( |
399 | 17 | isl::schedule_node Node, unsigned DimToVectorize, int VectorWidth) { |
400 | 17 | assert(isl_schedule_node_get_type(Node.get()) == isl_schedule_node_band); |
401 | 17 | |
402 | 17 | auto Space = isl::manage(isl_schedule_node_band_get_space(Node.get())); |
403 | 17 | auto ScheduleDimensions = Space.dim(isl::dim::set); |
404 | 17 | assert(DimToVectorize < ScheduleDimensions); |
405 | 17 | |
406 | 17 | if (DimToVectorize > 0) { |
407 | 16 | Node = isl::manage( |
408 | 16 | isl_schedule_node_band_split(Node.release(), DimToVectorize)); |
409 | 16 | Node = Node.child(0); |
410 | 16 | } |
411 | 17 | if (DimToVectorize < ScheduleDimensions - 1) |
412 | 8 | Node = isl::manage(isl_schedule_node_band_split(Node.release(), 1)); |
413 | 17 | Space = isl::manage(isl_schedule_node_band_get_space(Node.get())); |
414 | 17 | auto Sizes = isl::multi_val::zero(Space); |
415 | 17 | Sizes = Sizes.set_val(0, isl::val(Node.get_ctx(), VectorWidth)); |
416 | 17 | Node = |
417 | 17 | isl::manage(isl_schedule_node_band_tile(Node.release(), Sizes.release())); |
418 | 17 | Node = isolateFullPartialTiles(Node, VectorWidth); |
419 | 17 | Node = Node.child(0); |
420 | 17 | // Make sure the "trivially vectorizable loop" is not unrolled. Otherwise, |
421 | 17 | // we will have troubles to match it in the backend. |
422 | 17 | Node = Node.band_set_ast_build_options( |
423 | 17 | isl::union_set(Node.get_ctx(), "{ unroll[x]: 1 = 0 }")); |
424 | 17 | Node = isl::manage(isl_schedule_node_band_sink(Node.release())); |
425 | 17 | Node = Node.child(0); |
426 | 17 | if (isl_schedule_node_get_type(Node.get()) == isl_schedule_node_leaf) |
427 | 9 | Node = Node.parent(); |
428 | 17 | auto LoopMarker = isl::id::alloc(Node.get_ctx(), "SIMD", nullptr); |
429 | 17 | PrevectOpts++; |
430 | 17 | return Node.insert_mark(LoopMarker); |
431 | 17 | } |
432 | | |
433 | | isl::schedule_node ScheduleTreeOptimizer::tileNode(isl::schedule_node Node, |
434 | | const char *Identifier, |
435 | | ArrayRef<int> TileSizes, |
436 | 59 | int DefaultTileSize) { |
437 | 59 | auto Space = isl::manage(isl_schedule_node_band_get_space(Node.get())); |
438 | 59 | auto Dims = Space.dim(isl::dim::set); |
439 | 59 | auto Sizes = isl::multi_val::zero(Space); |
440 | 59 | std::string IdentifierString(Identifier); |
441 | 210 | for (unsigned i = 0; i < Dims; i++151 ) { |
442 | 151 | auto tileSize = i < TileSizes.size() ? TileSizes[i]84 : DefaultTileSize67 ; |
443 | 151 | Sizes = Sizes.set_val(i, isl::val(Node.get_ctx(), tileSize)); |
444 | 151 | } |
445 | 59 | auto TileLoopMarkerStr = IdentifierString + " - Tiles"; |
446 | 59 | auto TileLoopMarker = |
447 | 59 | isl::id::alloc(Node.get_ctx(), TileLoopMarkerStr, nullptr); |
448 | 59 | Node = Node.insert_mark(TileLoopMarker); |
449 | 59 | Node = Node.child(0); |
450 | 59 | Node = |
451 | 59 | isl::manage(isl_schedule_node_band_tile(Node.release(), Sizes.release())); |
452 | 59 | Node = Node.child(0); |
453 | 59 | auto PointLoopMarkerStr = IdentifierString + " - Points"; |
454 | 59 | auto PointLoopMarker = |
455 | 59 | isl::id::alloc(Node.get_ctx(), PointLoopMarkerStr, nullptr); |
456 | 59 | Node = Node.insert_mark(PointLoopMarker); |
457 | 59 | return Node.child(0); |
458 | 59 | } |
459 | | |
460 | | isl::schedule_node ScheduleTreeOptimizer::applyRegisterTiling( |
461 | 16 | isl::schedule_node Node, ArrayRef<int> TileSizes, int DefaultTileSize) { |
462 | 16 | Node = tileNode(Node, "Register tiling", TileSizes, DefaultTileSize); |
463 | 16 | auto Ctx = Node.get_ctx(); |
464 | 16 | return Node.band_set_ast_build_options(isl::union_set(Ctx, "{unroll[x]}")); |
465 | 16 | } |
466 | | |
467 | 79 | static bool isSimpleInnermostBand(const isl::schedule_node &Node) { |
468 | 79 | assert(isl_schedule_node_get_type(Node.get()) == isl_schedule_node_band); |
469 | 79 | assert(isl_schedule_node_n_children(Node.get()) == 1); |
470 | 79 | |
471 | 79 | auto ChildType = isl_schedule_node_get_type(Node.child(0).get()); |
472 | 79 | |
473 | 79 | if (ChildType == isl_schedule_node_leaf) |
474 | 45 | return true; |
475 | 34 | |
476 | 34 | if (ChildType != isl_schedule_node_sequence) |
477 | 33 | return false; |
478 | 1 | |
479 | 1 | auto Sequence = Node.child(0); |
480 | 1 | |
481 | 3 | for (int c = 0, nc = isl_schedule_node_n_children(Sequence.get()); c < nc; |
482 | 2 | ++c) { |
483 | 2 | auto Child = Sequence.child(c); |
484 | 2 | if (isl_schedule_node_get_type(Child.get()) != isl_schedule_node_filter) |
485 | 0 | return false; |
486 | 2 | if (isl_schedule_node_get_type(Child.child(0).get()) != |
487 | 2 | isl_schedule_node_leaf) |
488 | 0 | return false; |
489 | 2 | } |
490 | 1 | return true; |
491 | 1 | } |
492 | | |
493 | 551 | bool ScheduleTreeOptimizer::isTileableBandNode(isl::schedule_node Node) { |
494 | 551 | if (isl_schedule_node_get_type(Node.get()) != isl_schedule_node_band) |
495 | 382 | return false; |
496 | 169 | |
497 | 169 | if (isl_schedule_node_n_children(Node.get()) != 1) |
498 | 0 | return false; |
499 | 169 | |
500 | 169 | if (!isl_schedule_node_band_get_permutable(Node.get())) |
501 | 40 | return false; |
502 | 129 | |
503 | 129 | auto Space = isl::manage(isl_schedule_node_band_get_space(Node.get())); |
504 | 129 | auto Dims = Space.dim(isl::dim::set); |
505 | 129 | |
506 | 129 | if (Dims <= 1) |
507 | 50 | return false; |
508 | 79 | |
509 | 79 | return isSimpleInnermostBand(Node); |
510 | 79 | } |
511 | | |
512 | | __isl_give isl::schedule_node |
513 | 32 | ScheduleTreeOptimizer::standardBandOpts(isl::schedule_node Node, void *User) { |
514 | 32 | if (FirstLevelTiling) { |
515 | 28 | Node = tileNode(Node, "1st level tiling", FirstLevelTileSizes, |
516 | 28 | FirstLevelDefaultTileSize); |
517 | 28 | FirstLevelTileOpts++; |
518 | 28 | } |
519 | 32 | |
520 | 32 | if (SecondLevelTiling) { |
521 | 3 | Node = tileNode(Node, "2nd level tiling", SecondLevelTileSizes, |
522 | 3 | SecondLevelDefaultTileSize); |
523 | 3 | SecondLevelTileOpts++; |
524 | 3 | } |
525 | 32 | |
526 | 32 | if (RegisterTiling) { |
527 | 2 | Node = |
528 | 2 | applyRegisterTiling(Node, RegisterTileSizes, RegisterDefaultTileSize); |
529 | 2 | RegisterTileOpts++; |
530 | 2 | } |
531 | 32 | |
532 | 32 | if (PollyVectorizerChoice == VECTORIZER_NONE) |
533 | 15 | return Node; |
534 | 17 | |
535 | 17 | auto Space = isl::manage(isl_schedule_node_band_get_space(Node.get())); |
536 | 17 | auto Dims = Space.dim(isl::dim::set); |
537 | 17 | |
538 | 25 | for (int i = Dims - 1; i >= 0; i--8 ) |
539 | 25 | if (Node.band_member_get_coincident(i)) { |
540 | 17 | Node = prevectSchedBand(Node, i, PrevectorWidth); |
541 | 17 | break; |
542 | 17 | } |
543 | 17 | |
544 | 17 | return Node; |
545 | 17 | } |
546 | | |
547 | | /// Permute the two dimensions of the isl map. |
548 | | /// |
549 | | /// Permute @p DstPos and @p SrcPos dimensions of the isl map @p Map that |
550 | | /// have type @p DimType. |
551 | | /// |
552 | | /// @param Map The isl map to be modified. |
553 | | /// @param DimType The type of the dimensions. |
554 | | /// @param DstPos The first dimension. |
555 | | /// @param SrcPos The second dimension. |
556 | | /// @return The modified map. |
557 | | isl::map permuteDimensions(isl::map Map, isl::dim DimType, unsigned DstPos, |
558 | 42 | unsigned SrcPos) { |
559 | 42 | assert(DstPos < Map.dim(DimType) && SrcPos < Map.dim(DimType)); |
560 | 42 | if (DstPos == SrcPos) |
561 | 14 | return Map; |
562 | 28 | isl::id DimId; |
563 | 28 | if (Map.has_tuple_id(DimType)) |
564 | 0 | DimId = Map.get_tuple_id(DimType); |
565 | 28 | auto FreeDim = DimType == isl::dim::in ? isl::dim::out0 : isl::dim::in; |
566 | 28 | isl::id FreeDimId; |
567 | 28 | if (Map.has_tuple_id(FreeDim)) |
568 | 28 | FreeDimId = Map.get_tuple_id(FreeDim); |
569 | 28 | auto MaxDim = std::max(DstPos, SrcPos); |
570 | 28 | auto MinDim = std::min(DstPos, SrcPos); |
571 | 28 | Map = Map.move_dims(FreeDim, 0, DimType, MaxDim, 1); |
572 | 28 | Map = Map.move_dims(FreeDim, 0, DimType, MinDim, 1); |
573 | 28 | Map = Map.move_dims(DimType, MinDim, FreeDim, 1, 1); |
574 | 28 | Map = Map.move_dims(DimType, MaxDim, FreeDim, 0, 1); |
575 | 28 | if (DimId) |
576 | 0 | Map = Map.set_tuple_id(DimType, DimId); |
577 | 28 | if (FreeDimId) |
578 | 28 | Map = Map.set_tuple_id(FreeDim, FreeDimId); |
579 | 28 | return Map; |
580 | 28 | } |
581 | | |
582 | | /// Check the form of the access relation. |
583 | | /// |
584 | | /// Check that the access relation @p AccMap has the form M[i][j], where i |
585 | | /// is a @p FirstPos and j is a @p SecondPos. |
586 | | /// |
587 | | /// @param AccMap The access relation to be checked. |
588 | | /// @param FirstPos The index of the input dimension that is mapped to |
589 | | /// the first output dimension. |
590 | | /// @param SecondPos The index of the input dimension that is mapped to the |
591 | | /// second output dimension. |
592 | | /// @return True in case @p AccMap has the expected form and false, |
593 | | /// otherwise. |
594 | | static bool isMatMulOperandAcc(isl::set Domain, isl::map AccMap, int &FirstPos, |
595 | 102 | int &SecondPos) { |
596 | 102 | isl::space Space = AccMap.get_space(); |
597 | 102 | isl::map Universe = isl::map::universe(Space); |
598 | 102 | |
599 | 102 | if (Space.dim(isl::dim::out) != 2) |
600 | 4 | return false; |
601 | 98 | |
602 | 98 | // MatMul has the form: |
603 | 98 | // for (i = 0; i < N; i++) |
604 | 98 | // for (j = 0; j < M; j++) |
605 | 98 | // for (k = 0; k < P; k++) |
606 | 98 | // C[i, j] += A[i, k] * B[k, j] |
607 | 98 | // |
608 | 98 | // Permutation of three outer loops: 3! = 6 possibilities. |
609 | 98 | int FirstDims[] = {0, 0, 1, 1, 2, 2}; |
610 | 98 | int SecondDims[] = {1, 2, 2, 0, 0, 1}; |
611 | 434 | for (int i = 0; i < 6; i += 1336 ) { |
612 | 392 | auto PossibleMatMul = |
613 | 392 | Universe.equate(isl::dim::in, FirstDims[i], isl::dim::out, 0) |
614 | 392 | .equate(isl::dim::in, SecondDims[i], isl::dim::out, 1); |
615 | 392 | |
616 | 392 | AccMap = AccMap.intersect_domain(Domain); |
617 | 392 | PossibleMatMul = PossibleMatMul.intersect_domain(Domain); |
618 | 392 | |
619 | 392 | // If AccMap spans entire domain (Non-partial write), |
620 | 392 | // compute FirstPos and SecondPos. |
621 | 392 | // If AccMap != PossibleMatMul here (the two maps have been gisted at |
622 | 392 | // this point), it means that the writes are not complete, or in other |
623 | 392 | // words, it is a Partial write and Partial writes must be rejected. |
624 | 392 | if (AccMap.is_equal(PossibleMatMul)) { |
625 | 98 | if (FirstPos != -1 && FirstPos != FirstDims[i]84 ) |
626 | 28 | continue; |
627 | 70 | FirstPos = FirstDims[i]; |
628 | 70 | if (SecondPos != -1 && SecondPos != SecondDims[i]56 ) |
629 | 14 | continue; |
630 | 56 | SecondPos = SecondDims[i]; |
631 | 56 | return true; |
632 | 56 | } |
633 | 392 | } |
634 | 98 | |
635 | 98 | return false42 ; |
636 | 98 | } |
637 | | |
638 | | /// Does the memory access represent a non-scalar operand of the matrix |
639 | | /// multiplication. |
640 | | /// |
641 | | /// Check that the memory access @p MemAccess is the read access to a non-scalar |
642 | | /// operand of the matrix multiplication or its result. |
643 | | /// |
644 | | /// @param MemAccess The memory access to be checked. |
645 | | /// @param MMI Parameters of the matrix multiplication operands. |
646 | | /// @return True in case the memory access represents the read access |
647 | | /// to a non-scalar operand of the matrix multiplication and |
648 | | /// false, otherwise. |
649 | | static bool isMatMulNonScalarReadAccess(MemoryAccess *MemAccess, |
650 | 43 | MatMulInfoTy &MMI) { |
651 | 43 | if (!MemAccess->isLatestArrayKind() || !MemAccess->isRead()) |
652 | 0 | return false; |
653 | 43 | auto AccMap = MemAccess->getLatestAccessRelation(); |
654 | 43 | isl::set StmtDomain = MemAccess->getStatement()->getDomain(); |
655 | 43 | if (isMatMulOperandAcc(StmtDomain, AccMap, MMI.i, MMI.j) && !MMI.ReadFromC14 ) { |
656 | 14 | MMI.ReadFromC = MemAccess; |
657 | 14 | return true; |
658 | 14 | } |
659 | 29 | if (isMatMulOperandAcc(StmtDomain, AccMap, MMI.i, MMI.k) && !MMI.A14 ) { |
660 | 14 | MMI.A = MemAccess; |
661 | 14 | return true; |
662 | 14 | } |
663 | 15 | if (isMatMulOperandAcc(StmtDomain, AccMap, MMI.k, MMI.j) && !MMI.B14 ) { |
664 | 14 | MMI.B = MemAccess; |
665 | 14 | return true; |
666 | 14 | } |
667 | 1 | return false; |
668 | 1 | } |
669 | | |
670 | | /// Check accesses to operands of the matrix multiplication. |
671 | | /// |
672 | | /// Check that accesses of the SCoP statement, which corresponds to |
673 | | /// the partial schedule @p PartialSchedule, are scalar in terms of loops |
674 | | /// containing the matrix multiplication, in case they do not represent |
675 | | /// accesses to the non-scalar operands of the matrix multiplication or |
676 | | /// its result. |
677 | | /// |
678 | | /// @param PartialSchedule The partial schedule of the SCoP statement. |
679 | | /// @param MMI Parameters of the matrix multiplication operands. |
680 | | /// @return True in case the corresponding SCoP statement |
681 | | /// represents matrix multiplication and false, |
682 | | /// otherwise. |
683 | | static bool containsOnlyMatrMultAcc(isl::map PartialSchedule, |
684 | 14 | MatMulInfoTy &MMI) { |
685 | 14 | auto InputDimId = PartialSchedule.get_tuple_id(isl::dim::in); |
686 | 14 | auto *Stmt = static_cast<ScopStmt *>(InputDimId.get_user()); |
687 | 14 | unsigned OutDimNum = PartialSchedule.dim(isl::dim::out); |
688 | 14 | assert(OutDimNum > 2 && "In case of the matrix multiplication the loop nest " |
689 | 14 | "and, consequently, the corresponding scheduling " |
690 | 14 | "functions have at least three dimensions."); |
691 | 14 | auto MapI = |
692 | 14 | permuteDimensions(PartialSchedule, isl::dim::out, MMI.i, OutDimNum - 1); |
693 | 14 | auto MapJ = |
694 | 14 | permuteDimensions(PartialSchedule, isl::dim::out, MMI.j, OutDimNum - 1); |
695 | 14 | auto MapK = |
696 | 14 | permuteDimensions(PartialSchedule, isl::dim::out, MMI.k, OutDimNum - 1); |
697 | 14 | |
698 | 14 | auto Accesses = getAccessesInOrder(*Stmt); |
699 | 62 | for (auto *MemA = Accesses.begin(); MemA != Accesses.end() - 1; MemA++48 ) { |
700 | 48 | auto *MemAccessPtr = *MemA; |
701 | 48 | if (MemAccessPtr->isLatestArrayKind() && MemAccessPtr != MMI.WriteToC43 && |
702 | 48 | !isMatMulNonScalarReadAccess(MemAccessPtr, MMI)43 && |
703 | 48 | !(MemAccessPtr->isStrideZero(MapI))1 && |
704 | 48 | MemAccessPtr->isStrideZero(MapJ)0 && MemAccessPtr->isStrideZero(MapK)0 ) |
705 | 0 | return false; |
706 | 48 | } |
707 | 14 | return true; |
708 | 14 | } |
709 | | |
710 | | /// Check for dependencies corresponding to the matrix multiplication. |
711 | | /// |
712 | | /// Check that there is only true dependence of the form |
713 | | /// S(..., k, ...) -> S(..., k + 1, …), where S is the SCoP statement |
714 | | /// represented by @p Schedule and k is @p Pos. Such a dependence corresponds |
715 | | /// to the dependency produced by the matrix multiplication. |
716 | | /// |
717 | | /// @param Schedule The schedule of the SCoP statement. |
718 | | /// @param D The SCoP dependencies. |
719 | | /// @param Pos The parameter to describe an acceptable true dependence. |
720 | | /// In case it has a negative value, try to determine its |
721 | | /// acceptable value. |
722 | | /// @return True in case dependencies correspond to the matrix multiplication |
723 | | /// and false, otherwise. |
724 | | static bool containsOnlyMatMulDep(isl::map Schedule, const Dependences *D, |
725 | 14 | int &Pos) { |
726 | 14 | isl::union_map Dep = D->getDependences(Dependences::TYPE_RAW); |
727 | 14 | isl::union_map Red = D->getDependences(Dependences::TYPE_RED); |
728 | 14 | if (Red) |
729 | 14 | Dep = Dep.unite(Red); |
730 | 14 | auto DomainSpace = Schedule.get_space().domain(); |
731 | 14 | auto Space = DomainSpace.map_from_domain_and_range(DomainSpace); |
732 | 14 | auto Deltas = Dep.extract_map(Space).deltas(); |
733 | 14 | int DeltasDimNum = Deltas.dim(isl::dim::set); |
734 | 56 | for (int i = 0; i < DeltasDimNum; i++42 ) { |
735 | 42 | auto Val = Deltas.plain_get_val_if_fixed(isl::dim::set, i); |
736 | 42 | Pos = Pos < 0 && Val.is_one() ? i14 : Pos28 ; |
737 | 42 | if (Val.is_nan() || !(Val.is_zero() || (14 i == Pos14 && Val.is_one()14 ))) |
738 | 0 | return false; |
739 | 42 | } |
740 | 14 | if (DeltasDimNum == 0 || Pos < 0) |
741 | 0 | return false; |
742 | 14 | return true; |
743 | 14 | } |
744 | | |
745 | | /// Check if the SCoP statement could probably be optimized with analytical |
746 | | /// modeling. |
747 | | /// |
748 | | /// containsMatrMult tries to determine whether the following conditions |
749 | | /// are true: |
750 | | /// 1. The last memory access modeling an array, MA1, represents writing to |
751 | | /// memory and has the form S(..., i1, ..., i2, ...) -> M(i1, i2) or |
752 | | /// S(..., i2, ..., i1, ...) -> M(i1, i2), where S is the SCoP statement |
753 | | /// under consideration. |
754 | | /// 2. There is only one loop-carried true dependency, and it has the |
755 | | /// form S(..., i3, ...) -> S(..., i3 + 1, ...), and there are no |
756 | | /// loop-carried or anti dependencies. |
757 | | /// 3. SCoP contains three access relations, MA2, MA3, and MA4 that represent |
758 | | /// reading from memory and have the form S(..., i3, ...) -> M(i1, i3), |
759 | | /// S(..., i3, ...) -> M(i3, i2), S(...) -> M(i1, i2), respectively, |
760 | | /// and all memory accesses of the SCoP that are different from MA1, MA2, |
761 | | /// MA3, and MA4 have stride 0, if the innermost loop is exchanged with any |
762 | | /// of loops i1, i2 and i3. |
763 | | /// |
764 | | /// @param PartialSchedule The PartialSchedule that contains a SCoP statement |
765 | | /// to check. |
/// @param D                The SCoP dependencies.
/// @param MMI              Parameters of the matrix multiplication operands.
768 | | static bool containsMatrMult(isl::map PartialSchedule, const Dependences *D, |
769 | 16 | MatMulInfoTy &MMI) { |
770 | 16 | auto InputDimsId = PartialSchedule.get_tuple_id(isl::dim::in); |
771 | 16 | auto *Stmt = static_cast<ScopStmt *>(InputDimsId.get_user()); |
772 | 16 | if (Stmt->size() <= 1) |
773 | 1 | return false; |
774 | 15 | |
775 | 15 | auto Accesses = getAccessesInOrder(*Stmt); |
776 | 15 | for (auto *MemA = Accesses.end() - 1; MemA != Accesses.begin(); MemA--0 ) { |
777 | 15 | auto *MemAccessPtr = *MemA; |
778 | 15 | if (!MemAccessPtr->isLatestArrayKind()) |
779 | 0 | continue; |
780 | 15 | if (!MemAccessPtr->isWrite()) |
781 | 0 | return false; |
782 | 15 | auto AccMap = MemAccessPtr->getLatestAccessRelation(); |
783 | 15 | if (!isMatMulOperandAcc(Stmt->getDomain(), AccMap, MMI.i, MMI.j)) |
784 | 1 | return false; |
785 | 14 | MMI.WriteToC = MemAccessPtr; |
786 | 14 | break; |
787 | 14 | } |
788 | 15 | |
789 | 15 | if (14 !containsOnlyMatMulDep(PartialSchedule, D, MMI.k)14 ) |
790 | 0 | return false; |
791 | 14 | |
792 | 14 | if (!MMI.WriteToC || !containsOnlyMatrMultAcc(PartialSchedule, MMI)) |
793 | 0 | return false; |
794 | 14 | |
795 | 14 | if (!MMI.A || !MMI.B || !MMI.ReadFromC) |
796 | 0 | return false; |
797 | 14 | return true; |
798 | 14 | } |
799 | | |
800 | | /// Permute two dimensions of the band node. |
801 | | /// |
802 | | /// Permute FirstDim and SecondDim dimensions of the Node. |
803 | | /// |
804 | | /// @param Node The band node to be modified. |
805 | | /// @param FirstDim The first dimension to be permuted. |
806 | | /// @param SecondDim The second dimension to be permuted. |
static isl::schedule_node permuteBandNodeDimensions(isl::schedule_node Node,
                                                    unsigned FirstDim,
                                                    unsigned SecondDim) {
  assert(isl_schedule_node_get_type(Node.get()) == isl_schedule_node_band &&
         isl_schedule_node_band_n_member(Node.get()) >
             std::max(FirstDim, SecondDim));
  // Fetch the band's partial schedule and swap the two requested
  // union_pw_aff members.
  auto PartialSchedule =
      isl::manage(isl_schedule_node_band_get_partial_schedule(Node.get()));
  auto PartialScheduleFirstDim = PartialSchedule.get_union_pw_aff(FirstDim);
  auto PartialScheduleSecondDim = PartialSchedule.get_union_pw_aff(SecondDim);
  PartialSchedule =
      PartialSchedule.set_union_pw_aff(SecondDim, PartialScheduleFirstDim);
  PartialSchedule =
      PartialSchedule.set_union_pw_aff(FirstDim, PartialScheduleSecondDim);
  // Delete the old band node and re-insert a band with the permuted partial
  // schedule at the same position in the tree.
  Node = isl::manage(isl_schedule_node_delete(Node.release()));
  return Node.insert_partial_schedule(PartialSchedule);
}
824 | | |
// Create the BLIS micro-kernel: register-tile the two innermost loops with
// sizes Mr x Nr, then swap the two dimensions of the resulting point band
// (permuteBandNodeDimensions(Node, 0, 1)) and descend back to the original
// depth in the schedule tree.
isl::schedule_node ScheduleTreeOptimizer::createMicroKernel(
    isl::schedule_node Node, MicroKernelParamsTy MicroKernelParams) {
  Node = applyRegisterTiling(Node, {MicroKernelParams.Mr, MicroKernelParams.Nr},
                             1);
  Node = Node.parent().parent();
  return permuteBandNodeDimensions(Node, 0, 1).child(0).child(0);
}
832 | | |
833 | | isl::schedule_node ScheduleTreeOptimizer::createMacroKernel( |
834 | 14 | isl::schedule_node Node, MacroKernelParamsTy MacroKernelParams) { |
835 | 14 | assert(isl_schedule_node_get_type(Node.get()) == isl_schedule_node_band); |
836 | 14 | if (MacroKernelParams.Mc == 1 && MacroKernelParams.Nc == 12 && |
837 | 14 | MacroKernelParams.Kc == 12 ) |
838 | 2 | return Node; |
839 | 12 | int DimOutNum = isl_schedule_node_band_n_member(Node.get()); |
840 | 12 | std::vector<int> TileSizes(DimOutNum, 1); |
841 | 12 | TileSizes[DimOutNum - 3] = MacroKernelParams.Mc; |
842 | 12 | TileSizes[DimOutNum - 2] = MacroKernelParams.Nc; |
843 | 12 | TileSizes[DimOutNum - 1] = MacroKernelParams.Kc; |
844 | 12 | Node = tileNode(Node, "1st level tiling", TileSizes, 1); |
845 | 12 | Node = Node.parent().parent(); |
846 | 12 | Node = permuteBandNodeDimensions(Node, DimOutNum - 2, DimOutNum - 1); |
847 | 12 | Node = permuteBandNodeDimensions(Node, DimOutNum - 3, DimOutNum - 1); |
848 | 12 | |
849 | 12 | // Mark the outermost loop as parallelizable. |
850 | 12 | Node = Node.band_member_set_coincident(0, true); |
851 | 12 | |
852 | 12 | return Node.child(0).child(0); |
853 | 12 | } |
854 | | |
855 | | /// Get the size of the widest type of the matrix multiplication operands |
856 | | /// in bytes, including alignment padding. |
857 | | /// |
858 | | /// @param MMI Parameters of the matrix multiplication operands. |
859 | | /// @return The size of the widest type of the matrix multiplication operands |
860 | | /// in bytes, including alignment padding. |
static uint64_t getMatMulAlignTypeSize(MatMulInfoTy MMI) {
  auto *S = MMI.A->getStatement()->getParent();
  auto &DL = S->getFunction().getParent()->getDataLayout();
  // getTypeAllocSize returns the in-memory size in bytes, including the
  // alignment padding the target requires between consecutive elements.
  auto ElementSizeA = DL.getTypeAllocSize(MMI.A->getElementType());
  auto ElementSizeB = DL.getTypeAllocSize(MMI.B->getElementType());
  auto ElementSizeC = DL.getTypeAllocSize(MMI.WriteToC->getElementType());
  return std::max({ElementSizeA, ElementSizeB, ElementSizeC});
}
869 | | |
870 | | /// Get the size of the widest type of the matrix multiplication operands |
871 | | /// in bits. |
872 | | /// |
873 | | /// @param MMI Parameters of the matrix multiplication operands. |
874 | | /// @return The size of the widest type of the matrix multiplication operands |
875 | | /// in bits. |
static uint64_t getMatMulTypeSize(MatMulInfoTy MMI) {
  auto *S = MMI.A->getStatement()->getParent();
  auto &DL = S->getFunction().getParent()->getDataLayout();
  // getTypeSizeInBits returns the raw bit width without alignment padding;
  // used to relate element width to the vector register width.
  auto ElementSizeA = DL.getTypeSizeInBits(MMI.A->getElementType());
  auto ElementSizeB = DL.getTypeSizeInBits(MMI.B->getElementType());
  auto ElementSizeC = DL.getTypeSizeInBits(MMI.WriteToC->getElementType());
  return std::max({ElementSizeA, ElementSizeB, ElementSizeC});
}
884 | | |
885 | | /// Get parameters of the BLIS micro kernel. |
886 | | /// |
887 | | /// We choose the Mr and Nr parameters of the micro kernel to be large enough |
888 | | /// such that no stalls caused by the combination of latencies and dependencies |
889 | | /// are introduced during the updates of the resulting matrix of the matrix |
890 | | /// multiplication. However, they should also be as small as possible to |
891 | | /// release more registers for entries of multiplied matrices. |
892 | | /// |
893 | | /// @param TTI Target Transform Info. |
894 | | /// @param MMI Parameters of the matrix multiplication operands. |
895 | | /// @return The structure of type MicroKernelParamsTy. |
896 | | /// @see MicroKernelParamsTy |
static struct MicroKernelParamsTy
getMicroKernelParams(const TargetTransformInfo *TTI, MatMulInfoTy MMI) {
  assert(TTI && "The target transform info should be provided.");

  // Nvec - Number of double-precision floating-point numbers that can be hold
  // by a vector register. Use 2 by default.
  long RegisterBitwidth = VectorRegisterBitwidth;

  // A value of -1 means the width was not overridden on the command line, so
  // query the target instead.
  if (RegisterBitwidth == -1)
    RegisterBitwidth = TTI->getRegisterBitWidth(true);
  auto ElementSize = getMatMulTypeSize(MMI);
  assert(ElementSize > 0 && "The element size of the matrix multiplication "
                            "operands should be greater than zero.");
  auto Nvec = RegisterBitwidth / ElementSize;
  if (Nvec == 0)
    Nvec = 2;
  // Choose Mr x Nr >= Nvec * LatencyVectorFma * ThroughputVectorFma so that
  // enough independent FMAs are in flight to hide the FMA latency; Nr is
  // rounded up to a multiple of the vector width Nvec.
  int Nr =
      ceil(sqrt(Nvec * LatencyVectorFma * ThroughputVectorFma) / Nvec) * Nvec;
  int Mr = ceil(Nvec * LatencyVectorFma * ThroughputVectorFma / Nr);
  return {Mr, Nr};
}
918 | | |
namespace {
/// Determine parameters of the target cache.
///
/// Fill in the size and associativity of the first two data-cache levels.
/// Each global option is only updated when it still has its sentinel value
/// -1 (i.e. it was not set on the command line); the value then comes from
/// TTI when available and otherwise from the compile-time defaults.
///
/// @param TTI Target Transform Info.
void getTargetCacheParameters(const llvm::TargetTransformInfo *TTI) {
  auto L1DCache = llvm::TargetTransformInfo::CacheLevel::L1D;
  auto L2DCache = llvm::TargetTransformInfo::CacheLevel::L2D;
  if (FirstCacheLevelSize == -1) {
    if (TTI->getCacheSize(L1DCache).hasValue())
      FirstCacheLevelSize = TTI->getCacheSize(L1DCache).getValue();
    else
      FirstCacheLevelSize = static_cast<int>(FirstCacheLevelDefaultSize);
  }
  if (SecondCacheLevelSize == -1) {
    if (TTI->getCacheSize(L2DCache).hasValue())
      SecondCacheLevelSize = TTI->getCacheSize(L2DCache).getValue();
    else
      SecondCacheLevelSize = static_cast<int>(SecondCacheLevelDefaultSize);
  }
  if (FirstCacheLevelAssociativity == -1) {
    if (TTI->getCacheAssociativity(L1DCache).hasValue())
      FirstCacheLevelAssociativity =
          TTI->getCacheAssociativity(L1DCache).getValue();
    else
      FirstCacheLevelAssociativity =
          static_cast<int>(FirstCacheLevelDefaultAssociativity);
  }
  if (SecondCacheLevelAssociativity == -1) {
    if (TTI->getCacheAssociativity(L2DCache).hasValue())
      SecondCacheLevelAssociativity =
          TTI->getCacheAssociativity(L2DCache).getValue();
    else
      SecondCacheLevelAssociativity =
          static_cast<int>(SecondCacheLevelDefaultAssociativity);
  }
}
} // namespace
956 | | |
957 | | /// Get parameters of the BLIS macro kernel. |
958 | | /// |
959 | | /// During the computation of matrix multiplication, blocks of partitioned |
960 | | /// matrices are mapped to different layers of the memory hierarchy. |
961 | | /// To optimize data reuse, blocks should be ideally kept in cache between |
962 | | /// iterations. Since parameters of the macro kernel determine sizes of these |
963 | | /// blocks, there are upper and lower bounds on these parameters. |
964 | | /// |
965 | | /// @param TTI Target Transform Info. |
966 | | /// @param MicroKernelParams Parameters of the micro-kernel |
967 | | /// to be taken into account. |
968 | | /// @param MMI Parameters of the matrix multiplication operands. |
969 | | /// @return The structure of type MacroKernelParamsTy. |
970 | | /// @see MacroKernelParamsTy |
971 | | /// @see MicroKernelParamsTy |
972 | | static struct MacroKernelParamsTy |
973 | | getMacroKernelParams(const llvm::TargetTransformInfo *TTI, |
974 | | const MicroKernelParamsTy &MicroKernelParams, |
975 | 14 | MatMulInfoTy MMI) { |
976 | 14 | getTargetCacheParameters(TTI); |
977 | 14 | // According to www.cs.utexas.edu/users/flame/pubs/TOMS-BLIS-Analytical.pdf, |
978 | 14 | // it requires information about the first two levels of a cache to determine |
979 | 14 | // all the parameters of a macro-kernel. It also checks that an associativity |
980 | 14 | // degree of a cache level is greater than two. Otherwise, another algorithm |
981 | 14 | // for determination of the parameters should be used. |
982 | 14 | if (!(MicroKernelParams.Mr > 0 && MicroKernelParams.Nr > 0 && |
983 | 14 | FirstCacheLevelSize > 0 && SecondCacheLevelSize > 013 && |
984 | 14 | FirstCacheLevelAssociativity > 213 && SecondCacheLevelAssociativity > 213 )) |
985 | 1 | return {1, 1, 1}; |
986 | 13 | // The quotient should be greater than zero. |
987 | 13 | if (PollyPatternMatchingNcQuotient <= 0) |
988 | 0 | return {1, 1, 1}; |
989 | 13 | int Car = floor( |
990 | 13 | (FirstCacheLevelAssociativity - 1) / |
991 | 13 | (1 + static_cast<double>(MicroKernelParams.Nr) / MicroKernelParams.Mr)); |
992 | 13 | |
993 | 13 | // Car can be computed to be zero since it is floor to int. |
994 | 13 | // On Mac OS, division by 0 does not raise a signal. This causes negative |
995 | 13 | // tile sizes to be computed. Prevent division by Cac==0 by early returning |
996 | 13 | // if this happens. |
997 | 13 | if (Car == 0) |
998 | 1 | return {1, 1, 1}; |
999 | 12 | |
1000 | 12 | auto ElementSize = getMatMulAlignTypeSize(MMI); |
1001 | 12 | assert(ElementSize > 0 && "The element size of the matrix multiplication " |
1002 | 12 | "operands should be greater than zero."); |
1003 | 12 | int Kc = (Car * FirstCacheLevelSize) / |
1004 | 12 | (MicroKernelParams.Mr * FirstCacheLevelAssociativity * ElementSize); |
1005 | 12 | double Cac = |
1006 | 12 | static_cast<double>(Kc * ElementSize * SecondCacheLevelAssociativity) / |
1007 | 12 | SecondCacheLevelSize; |
1008 | 12 | int Mc = floor((SecondCacheLevelAssociativity - 2) / Cac); |
1009 | 12 | int Nc = PollyPatternMatchingNcQuotient * MicroKernelParams.Nr; |
1010 | 12 | |
1011 | 12 | assert(Mc > 0 && Nc > 0 && Kc > 0 && |
1012 | 12 | "Matrix block sizes should be greater than zero"); |
1013 | 12 | return {Mc, Nc, Kc}; |
1014 | 12 | } |
1015 | | |
1016 | | /// Create an access relation that is specific to |
1017 | | /// the matrix multiplication pattern. |
1018 | | /// |
1019 | | /// Create an access relation of the following form: |
1020 | | /// [O0, O1, O2, O3, O4, O5, O6, O7, O8] -> [OI, O5, OJ] |
1021 | | /// where I is @p FirstDim, J is @p SecondDim. |
1022 | | /// |
1023 | | /// It can be used, for example, to create relations that helps to consequently |
1024 | | /// access elements of operands of a matrix multiplication after creation of |
1025 | | /// the BLIS micro and macro kernels. |
1026 | | /// |
1027 | | /// @see ScheduleTreeOptimizer::createMicroKernel |
1028 | | /// @see ScheduleTreeOptimizer::createMacroKernel |
1029 | | /// |
1030 | | /// Subsequently, the described access relation is applied to the range of |
1031 | | /// @p MapOldIndVar, that is used to map original induction variables to |
1032 | | /// the ones, which are produced by schedule transformations. It helps to |
1033 | | /// define relations using a new space and, at the same time, keep them |
1034 | | /// in the original one. |
1035 | | /// |
1036 | | /// @param MapOldIndVar The relation, which maps original induction variables |
1037 | | /// to the ones, which are produced by schedule |
1038 | | /// transformations. |
1039 | | /// @param FirstDim, SecondDim The input dimensions that are used to define |
1040 | | /// the specified access relation. |
1041 | | /// @return The specified access relation. |
isl::map getMatMulAccRel(isl::map MapOldIndVar, unsigned FirstDim,
                         unsigned SecondDim) {
  // Build [O0, ..., O8] -> [O<FirstDim>, O5, O<SecondDim>] over a 9-dim
  // input space ...
  auto AccessRelSpace = isl::space(MapOldIndVar.get_ctx(), 0, 9, 3);
  auto AccessRel = isl::map::universe(AccessRelSpace);
  AccessRel = AccessRel.equate(isl::dim::in, FirstDim, isl::dim::out, 0);
  AccessRel = AccessRel.equate(isl::dim::in, 5, isl::dim::out, 1);
  AccessRel = AccessRel.equate(isl::dim::in, SecondDim, isl::dim::out, 2);
  // ... and compose it with the range of MapOldIndVar so the relation is
  // expressed in terms of the original induction variables.
  return MapOldIndVar.apply_range(AccessRel);
}
1051 | | |
1052 | | isl::schedule_node createExtensionNode(isl::schedule_node Node, |
1053 | 24 | isl::map ExtensionMap) { |
1054 | 24 | auto Extension = isl::union_map(ExtensionMap); |
1055 | 24 | auto NewNode = isl::schedule_node::from_extension(Extension); |
1056 | 24 | return Node.graft_before(NewNode); |
1057 | 24 | } |
1058 | | |
1059 | | /// Apply the packing transformation. |
1060 | | /// |
1061 | | /// The packing transformation can be described as a data-layout |
1062 | | /// transformation that requires to introduce a new array, copy data |
1063 | | /// to the array, and change memory access locations to reference the array. |
1064 | | /// It can be used to ensure that elements of the new array are read in-stride |
1065 | | /// access, aligned to cache lines boundaries, and preloaded into certain cache |
1066 | | /// levels. |
1067 | | /// |
1068 | | /// As an example let us consider the packing of the array A that would help |
1069 | | /// to read its elements with in-stride access. An access to the array A |
1070 | | /// is represented by an access relation that has the form |
1071 | | /// S[i, j, k] -> A[i, k]. The scheduling function of the SCoP statement S has |
1072 | | /// the form S[i,j, k] -> [floor((j mod Nc) / Nr), floor((i mod Mc) / Mr), |
1073 | | /// k mod Kc, j mod Nr, i mod Mr]. |
1074 | | /// |
1075 | | /// To ensure that elements of the array A are read in-stride access, we add |
1076 | | /// a new array Packed_A[Mc/Mr][Kc][Mr] to the SCoP, using |
1077 | | /// Scop::createScopArrayInfo, change the access relation |
1078 | | /// S[i, j, k] -> A[i, k] to |
1079 | | /// S[i, j, k] -> Packed_A[floor((i mod Mc) / Mr), k mod Kc, i mod Mr], using |
1080 | | /// MemoryAccess::setNewAccessRelation, and copy the data to the array, using |
1081 | | /// the copy statement created by Scop::addScopStmt. |
1082 | | /// |
1083 | | /// @param Node The schedule node to be optimized. |
1084 | | /// @param MapOldIndVar The relation, which maps original induction variables |
1085 | | /// to the ones, which are produced by schedule |
1086 | | /// transformations. |
1087 | | /// @param MicroParams, MacroParams Parameters of the BLIS kernel |
1088 | | /// to be taken into account. |
1089 | | /// @param MMI Parameters of the matrix multiplication operands. |
1090 | | /// @return The optimized schedule node. |
static isl::schedule_node
optimizeDataLayoutMatrMulPattern(isl::schedule_node Node, isl::map MapOldIndVar,
                                 MicroKernelParamsTy MicroParams,
                                 MacroKernelParamsTy MacroParams,
                                 MatMulInfoTy &MMI) {
  auto InputDimsId = MapOldIndVar.get_tuple_id(isl::dim::in);
  auto *Stmt = static_cast<ScopStmt *>(InputDimsId.get_user());

  // Create a copy statement that corresponds to the memory access to the
  // matrix B, the second operand of the matrix multiplication.
  Node = Node.parent().parent().parent().parent().parent().parent();
  Node = isl::manage(isl_schedule_node_band_split(Node.release(), 2)).child(0);
  auto AccRel = getMatMulAccRel(MapOldIndVar, 3, 7);
  // Packed_B has the shape [Nc/Nr][Kc][Nr].
  unsigned FirstDimSize = MacroParams.Nc / MicroParams.Nr;
  unsigned SecondDimSize = MacroParams.Kc;
  unsigned ThirdDimSize = MicroParams.Nr;
  auto *SAI = Stmt->getParent()->createScopArrayInfo(
      MMI.B->getElementType(), "Packed_B",
      {FirstDimSize, SecondDimSize, ThirdDimSize});
  AccRel = AccRel.set_tuple_id(isl::dim::out, SAI->getBasePtrId());
  auto OldAcc = MMI.B->getLatestAccessRelation();
  MMI.B->setNewAccessRelation(AccRel);
  // The extension map schedules the copy; fixing dimension MMI.i to 0 makes
  // the copy of B run only once per block, not once per row of A.
  auto ExtMap = MapOldIndVar.project_out(isl::dim::out, 2,
                                         MapOldIndVar.dim(isl::dim::out) - 2);
  ExtMap = ExtMap.reverse();
  ExtMap = ExtMap.fix_si(isl::dim::out, MMI.i, 0);
  auto Domain = Stmt->getDomain();

  // Restrict the domains of the copy statements to only execute when also its
  // originating statement is executed.
  auto DomainId = Domain.get_tuple_id();
  auto *NewStmt = Stmt->getParent()->addScopStmt(
      OldAcc, MMI.B->getLatestAccessRelation(), Domain);
  ExtMap = ExtMap.set_tuple_id(isl::dim::out, DomainId);
  ExtMap = ExtMap.intersect_range(Domain);
  ExtMap = ExtMap.set_tuple_id(isl::dim::out, NewStmt->getDomainId());
  Node = createExtensionNode(Node, ExtMap);

  // Create a copy statement that corresponds to the memory access
  // to the matrix A, the first operand of the matrix multiplication.
  Node = Node.child(0);
  AccRel = getMatMulAccRel(MapOldIndVar, 4, 6);
  // Packed_A has the shape [Mc/Mr][Kc][Mr]; SecondDimSize (Kc) is reused.
  FirstDimSize = MacroParams.Mc / MicroParams.Mr;
  ThirdDimSize = MicroParams.Mr;
  SAI = Stmt->getParent()->createScopArrayInfo(
      MMI.A->getElementType(), "Packed_A",
      {FirstDimSize, SecondDimSize, ThirdDimSize});
  AccRel = AccRel.set_tuple_id(isl::dim::out, SAI->getBasePtrId());
  OldAcc = MMI.A->getLatestAccessRelation();
  MMI.A->setNewAccessRelation(AccRel);
  // Analogously, fixing dimension MMI.j to 0 avoids re-copying A per column.
  ExtMap = MapOldIndVar.project_out(isl::dim::out, 3,
                                    MapOldIndVar.dim(isl::dim::out) - 3);
  ExtMap = ExtMap.reverse();
  ExtMap = ExtMap.fix_si(isl::dim::out, MMI.j, 0);
  NewStmt = Stmt->getParent()->addScopStmt(
      OldAcc, MMI.A->getLatestAccessRelation(), Domain);

  // Restrict the domains of the copy statements to only execute when also its
  // originating statement is executed.
  ExtMap = ExtMap.set_tuple_id(isl::dim::out, DomainId);
  ExtMap = ExtMap.intersect_range(Domain);
  ExtMap = ExtMap.set_tuple_id(isl::dim::out, NewStmt->getDomainId());
  Node = createExtensionNode(Node, ExtMap);
  return Node.child(0).child(0).child(0).child(0).child(0);
}
1156 | | |
1157 | | /// Get a relation mapping induction variables produced by schedule |
1158 | | /// transformations to the original ones. |
1159 | | /// |
1160 | | /// @param Node The schedule node produced as the result of creation |
1161 | | /// of the BLIS kernels. |
1162 | | /// @param MicroKernelParams, MacroKernelParams Parameters of the BLIS kernel |
1163 | | /// to be taken into account. |
1164 | | /// @return The relation mapping original induction variables to the ones |
1165 | | /// produced by schedule transformation. |
1166 | | /// @see ScheduleTreeOptimizer::createMicroKernel |
1167 | | /// @see ScheduleTreeOptimizer::createMacroKernel |
1168 | | /// @see getMacroKernelParams |
isl::map
getInductionVariablesSubstitution(isl::schedule_node Node,
                                  MicroKernelParamsTy MicroKernelParams,
                                  MacroKernelParamsTy MacroKernelParams) {
  auto Child = Node.child(0);
  auto UnMapOldIndVar = Child.get_prefix_schedule_union_map();
  auto MapOldIndVar = isl::map::from_union_map(UnMapOldIndVar);
  // Keep only the innermost nine schedule dimensions, the ones produced by
  // the micro-/macro-kernel construction; project out anything outer.
  if (MapOldIndVar.dim(isl::dim::out) > 9)
    return MapOldIndVar.project_out(isl::dim::out, 0,
                                    MapOldIndVar.dim(isl::dim::out) - 9);
  return MapOldIndVar;
}
1181 | | |
1182 | | /// Isolate a set of partial tile prefixes and unroll the isolated part. |
1183 | | /// |
1184 | | /// The set should ensure that it contains only partial tile prefixes that have |
1185 | | /// exactly Mr x Nr iterations of the two innermost loops produced by |
1186 | | /// the optimization of the matrix multiplication. Mr and Nr are parameters of |
1187 | | /// the micro-kernel. |
1188 | | /// |
1189 | | /// In case of parametric bounds, this helps to auto-vectorize the unrolled |
1190 | | /// innermost loops, using the SLP vectorizer. |
1191 | | /// |
1192 | | /// @param Node The schedule node to be modified. |
1193 | | /// @param MicroKernelParams Parameters of the micro-kernel |
1194 | | /// to be taken into account. |
1195 | | /// @return The modified isl_schedule_node. |
static isl::schedule_node
isolateAndUnrollMatMulInnerLoops(isl::schedule_node Node,
                                 struct MicroKernelParamsTy MicroKernelParams) {
  isl::schedule_node Child = Node.get_child(0);
  isl::union_map UnMapOldIndVar = Child.get_prefix_schedule_relation();
  isl::set Prefix = isl::map::from_union_map(UnMapOldIndVar).range();
  unsigned Dims = Prefix.dim(isl::dim::set);
  // Compute the set of prefixes whose innermost tiles are full, i.e. contain
  // exactly Mr x Nr iterations.
  Prefix = Prefix.project_out(isl::dim::set, Dims - 1, 1);
  Prefix = getPartialTilePrefixes(Prefix, MicroKernelParams.Nr);
  Prefix = getPartialTilePrefixes(Prefix, MicroKernelParams.Mr);

  // On the inner band: isolate the full tiles and request unrolling of the
  // isolated part.
  isl::union_set IsolateOption =
      getIsolateOptions(Prefix.add_dims(isl::dim::set, 3), 3);
  isl::ctx Ctx = Node.get_ctx();
  auto Options = IsolateOption.unite(getDimOptions(Ctx, "unroll"));
  Options = Options.unite(getUnrollIsolatedSetOptions(Ctx));
  Node = Node.band_set_ast_build_options(Options);
  // On the band three levels up: separate full tiles from partial ones.
  Node = Node.parent().parent().parent();
  IsolateOption = getIsolateOptions(Prefix, 3);
  Options = IsolateOption.unite(getDimOptions(Ctx, "separate"));
  Node = Node.band_set_ast_build_options(Options);
  Node = Node.child(0).child(0).child(0);
  return Node;
}
1220 | | |
1221 | | /// Mark @p BasePtr with "Inter iteration alias-free" mark node. |
1222 | | /// |
1223 | | /// @param Node The child of the mark node to be inserted. |
1224 | | /// @param BasePtr The pointer to be marked. |
1225 | | /// @return The modified isl_schedule_node. |
1226 | | static isl::schedule_node markInterIterationAliasFree(isl::schedule_node Node, |
1227 | 14 | Value *BasePtr) { |
1228 | 14 | if (!BasePtr) |
1229 | 0 | return Node; |
1230 | 14 | |
1231 | 14 | auto Id = |
1232 | 14 | isl::id::alloc(Node.get_ctx(), "Inter iteration alias-free", BasePtr); |
1233 | 14 | return Node.insert_mark(Id).child(0); |
1234 | 14 | } |
1235 | | |
1236 | | /// Insert "Loop Vectorizer Disabled" mark node. |
1237 | | /// |
1238 | | /// @param Node The child of the mark node to be inserted. |
1239 | | /// @return The modified isl_schedule_node. |
1240 | 12 | static isl::schedule_node markLoopVectorizerDisabled(isl::schedule_node Node) { |
1241 | 12 | auto Id = isl::id::alloc(Node.get_ctx(), "Loop Vectorizer Disabled", nullptr); |
1242 | 12 | return Node.insert_mark(Id).child(0); |
1243 | 12 | } |
1244 | | |
1245 | | /// Restore the initial ordering of dimensions of the band node |
1246 | | /// |
1247 | | /// In case the band node represents all the dimensions of the iteration |
1248 | | /// domain, recreate the band node to restore the initial ordering of the |
1249 | | /// dimensions. |
1250 | | /// |
1251 | | /// @param Node The band node to be modified. |
1252 | | /// @return The modified schedule node. |
1253 | | static isl::schedule_node |
1254 | 14 | getBandNodeWithOriginDimOrder(isl::schedule_node Node) { |
1255 | 14 | assert(isl_schedule_node_get_type(Node.get()) == isl_schedule_node_band); |
1256 | 14 | if (isl_schedule_node_get_type(Node.child(0).get()) != isl_schedule_node_leaf) |
1257 | 0 | return Node; |
1258 | 14 | auto Domain = Node.get_universe_domain(); |
1259 | 14 | assert(isl_union_set_n_set(Domain.get()) == 1); |
1260 | 14 | if (Node.get_schedule_depth() != 0 || |
1261 | 14 | (isl::set(Domain).dim(isl::dim::set) != |
1262 | 14 | isl_schedule_node_band_n_member(Node.get()))) |
1263 | 0 | return Node; |
1264 | 14 | Node = isl::manage(isl_schedule_node_delete(Node.copy())); |
1265 | 14 | auto PartialSchedulePwAff = Domain.identity_union_pw_multi_aff(); |
1266 | 14 | auto PartialScheduleMultiPwAff = |
1267 | 14 | isl::multi_union_pw_aff(PartialSchedulePwAff); |
1268 | 14 | PartialScheduleMultiPwAff = |
1269 | 14 | PartialScheduleMultiPwAff.reset_tuple_id(isl::dim::set); |
1270 | 14 | return Node.insert_partial_schedule(PartialScheduleMultiPwAff); |
1271 | 14 | } |
1272 | | |
1273 | | isl::schedule_node |
1274 | | ScheduleTreeOptimizer::optimizeMatMulPattern(isl::schedule_node Node, |
1275 | | const TargetTransformInfo *TTI, |
1276 | 14 | MatMulInfoTy &MMI) { |
1277 | 14 | assert(TTI && "The target transform info should be provided."); |
1278 | 14 | Node = markInterIterationAliasFree( |
1279 | 14 | Node, MMI.WriteToC->getLatestScopArrayInfo()->getBasePtr()); |
1280 | 14 | int DimOutNum = isl_schedule_node_band_n_member(Node.get()); |
1281 | 14 | assert(DimOutNum > 2 && "In case of the matrix multiplication the loop nest " |
1282 | 14 | "and, consequently, the corresponding scheduling " |
1283 | 14 | "functions have at least three dimensions."); |
1284 | 14 | Node = getBandNodeWithOriginDimOrder(Node); |
1285 | 14 | Node = permuteBandNodeDimensions(Node, MMI.i, DimOutNum - 3); |
1286 | 14 | int NewJ = MMI.j == DimOutNum - 3 ? MMI.i0 : MMI.j; |
1287 | 14 | int NewK = MMI.k == DimOutNum - 3 ? MMI.i0 : MMI.k; |
1288 | 14 | Node = permuteBandNodeDimensions(Node, NewJ, DimOutNum - 2); |
1289 | 14 | NewK = NewK == DimOutNum - 2 ? NewJ0 : NewK; |
1290 | 14 | Node = permuteBandNodeDimensions(Node, NewK, DimOutNum - 1); |
1291 | 14 | auto MicroKernelParams = getMicroKernelParams(TTI, MMI); |
1292 | 14 | auto MacroKernelParams = getMacroKernelParams(TTI, MicroKernelParams, MMI); |
1293 | 14 | Node = createMacroKernel(Node, MacroKernelParams); |
1294 | 14 | Node = createMicroKernel(Node, MicroKernelParams); |
1295 | 14 | if (MacroKernelParams.Mc == 1 || MacroKernelParams.Nc == 112 || |
1296 | 14 | MacroKernelParams.Kc == 112 ) |
1297 | 2 | return Node; |
1298 | 12 | auto MapOldIndVar = getInductionVariablesSubstitution(Node, MicroKernelParams, |
1299 | 12 | MacroKernelParams); |
1300 | 12 | if (!MapOldIndVar) |
1301 | 0 | return Node; |
1302 | 12 | Node = markLoopVectorizerDisabled(Node.parent()).child(0); |
1303 | 12 | Node = isolateAndUnrollMatMulInnerLoops(Node, MicroKernelParams); |
1304 | 12 | return optimizeDataLayoutMatrMulPattern(Node, MapOldIndVar, MicroKernelParams, |
1305 | 12 | MacroKernelParams, MMI); |
1306 | 12 | } |
1307 | | |
1308 | | bool ScheduleTreeOptimizer::isMatrMultPattern(isl::schedule_node Node, |
1309 | | const Dependences *D, |
1310 | 37 | MatMulInfoTy &MMI) { |
1311 | 37 | auto PartialSchedule = isl::manage( |
1312 | 37 | isl_schedule_node_band_get_partial_schedule_union_map(Node.get())); |
1313 | 37 | Node = Node.child(0); |
1314 | 37 | auto LeafType = isl_schedule_node_get_type(Node.get()); |
1315 | 37 | Node = Node.parent(); |
1316 | 37 | if (LeafType != isl_schedule_node_leaf || |
1317 | 37 | isl_schedule_node_band_n_member(Node.get()) < 336 || |
1318 | 37 | Node.get_schedule_depth() != 016 || |
1319 | 37 | isl_union_map_n_map(PartialSchedule.get()) != 116 ) |
1320 | 21 | return false; |
1321 | 16 | auto NewPartialSchedule = isl::map::from_union_map(PartialSchedule); |
1322 | 16 | if (containsMatrMult(NewPartialSchedule, D, MMI)) |
1323 | 14 | return true; |
1324 | 2 | return false; |
1325 | 2 | } |
1326 | | |
1327 | | __isl_give isl_schedule_node * |
1328 | | ScheduleTreeOptimizer::optimizeBand(__isl_take isl_schedule_node *Node, |
1329 | 551 | void *User) { |
1330 | 551 | if (!isTileableBandNode(isl::manage_copy(Node))) |
1331 | 505 | return Node; |
1332 | 46 | |
1333 | 46 | const OptimizerAdditionalInfoTy *OAI = |
1334 | 46 | static_cast<const OptimizerAdditionalInfoTy *>(User); |
1335 | 46 | |
1336 | 46 | MatMulInfoTy MMI; |
1337 | 46 | if (PMBasedOpts && User37 && |
1338 | 46 | isMatrMultPattern(isl::manage_copy(Node), OAI->D, MMI)37 ) { |
1339 | 14 | LLVM_DEBUG(dbgs() << "The matrix multiplication pattern was detected\n"); |
1340 | 14 | MatMulOpts++; |
1341 | 14 | return optimizeMatMulPattern(isl::manage(Node), OAI->TTI, MMI).release(); |
1342 | 14 | } |
1343 | 32 | |
1344 | 32 | return standardBandOpts(isl::manage(Node), User).release(); |
1345 | 32 | } |
1346 | | |
1347 | | isl::schedule |
1348 | | ScheduleTreeOptimizer::optimizeSchedule(isl::schedule Schedule, |
1349 | 40 | const OptimizerAdditionalInfoTy *OAI) { |
1350 | 40 | auto Root = Schedule.get_root(); |
1351 | 40 | Root = optimizeScheduleNode(Root, OAI); |
1352 | 40 | return Root.get_schedule(); |
1353 | 40 | } |
1354 | | |
1355 | | isl::schedule_node ScheduleTreeOptimizer::optimizeScheduleNode( |
1356 | 40 | isl::schedule_node Node, const OptimizerAdditionalInfoTy *OAI) { |
1357 | 40 | Node = isl::manage(isl_schedule_node_map_descendant_bottom_up( |
1358 | 40 | Node.release(), optimizeBand, |
1359 | 40 | const_cast<void *>(static_cast<const void *>(OAI)))); |
1360 | 40 | return Node; |
1361 | 40 | } |
1362 | | |
1363 | | bool ScheduleTreeOptimizer::isProfitableSchedule(Scop &S, |
1364 | 40 | isl::schedule NewSchedule) { |
1365 | 40 | // To understand if the schedule has been optimized we check if the schedule |
1366 | 40 | // has changed at all. |
1367 | 40 | // TODO: We can improve this by tracking if any necessarily beneficial |
1368 | 40 | // transformations have been performed. This can e.g. be tiling, loop |
1369 | 40 | // interchange, or ...) We can track this either at the place where the |
1370 | 40 | // transformation has been performed or, in case of automatic ILP based |
1371 | 40 | // optimizations, by comparing (yet to be defined) performance metrics |
1372 | 40 | // before/after the scheduling optimizer |
1373 | 40 | // (e.g., #stride-one accesses) |
1374 | 40 | auto NewScheduleMap = NewSchedule.get_map(); |
1375 | 40 | auto OldSchedule = S.getSchedule(); |
1376 | 40 | assert(OldSchedule && "Only IslScheduleOptimizer can insert extension nodes " |
1377 | 40 | "that make Scop::getSchedule() return nullptr."); |
1378 | 40 | bool changed = !OldSchedule.is_equal(NewScheduleMap); |
1379 | 40 | return changed; |
1380 | 40 | } |
1381 | | |
namespace {

/// Legacy pass wrapper that runs the isl schedule optimizer on each SCoP.
class IslScheduleOptimizer : public ScopPass {
public:
  static char ID;

  explicit IslScheduleOptimizer() : ScopPass(ID) {}

  // Free the cached schedule on destruction; isl_schedule_free accepts null.
  ~IslScheduleOptimizer() override { isl_schedule_free(LastSchedule); }

  /// Optimize the schedule of the SCoP @p S.
  bool runOnScop(Scop &S) override;

  /// Print the new schedule for the SCoP @p S.
  void printScop(raw_ostream &OS, Scop &S) const override;

  /// Register all analyses and transformation required.
  void getAnalysisUsage(AnalysisUsage &AU) const override;

  /// Release the internal memory.
  void releaseMemory() override {
    isl_schedule_free(LastSchedule);
    LastSchedule = nullptr;
  }

private:
  // The most recently computed schedule, owned by this pass (may be null).
  isl_schedule *LastSchedule = nullptr;
};
} // namespace
1411 | | |
// Pass identification token; its address, not its value, identifies the pass.
char IslScheduleOptimizer::ID = 0;
1413 | | |
1414 | | /// Collect statistics for the schedule tree. |
1415 | | /// |
1416 | | /// @param Schedule The schedule tree to analyze. If not a schedule tree it is |
1417 | | /// ignored. |
1418 | | /// @param Version The version of the schedule tree that is analyzed. |
1419 | | /// 0 for the original schedule tree before any transformation. |
1420 | | /// 1 for the schedule tree after isl's rescheduling. |
1421 | | /// 2 for the schedule tree after optimizations are applied |
1422 | | /// (tiling, pattern matching) |
1423 | 120 | static void walkScheduleTreeForStatistics(isl::schedule Schedule, int Version) { |
1424 | 120 | auto Root = Schedule.get_root(); |
1425 | 120 | if (!Root) |
1426 | 0 | return; |
1427 | 120 | |
1428 | 120 | isl_schedule_node_foreach_descendant_top_down( |
1429 | 120 | Root.get(), |
1430 | 1.08k | [](__isl_keep isl_schedule_node *nodeptr, void *user) -> isl_bool { |
1431 | 1.08k | isl::schedule_node Node = isl::manage_copy(nodeptr); |
1432 | 1.08k | int Version = *static_cast<int *>(user); |
1433 | 1.08k | |
1434 | 1.08k | switch (isl_schedule_node_get_type(Node.get())) { |
1435 | 1.08k | case isl_schedule_node_band: { |
1436 | 329 | NumBands[Version]++; |
1437 | 329 | if (isl_schedule_node_band_get_permutable(Node.get()) == |
1438 | 329 | isl_bool_true) |
1439 | 170 | NumPermutable[Version]++; |
1440 | 329 | |
1441 | 329 | int CountMembers = isl_schedule_node_band_n_member(Node.get()); |
1442 | 329 | NumBandMembers[Version] += CountMembers; |
1443 | 853 | for (int i = 0; i < CountMembers; i += 1524 ) { |
1444 | 524 | if (Node.band_member_get_coincident(i)) |
1445 | 258 | NumCoincident[Version]++; |
1446 | 524 | } |
1447 | 329 | break; |
1448 | 1.08k | } |
1449 | 1.08k | |
1450 | 1.08k | case isl_schedule_node_filter: |
1451 | 175 | NumFilters[Version]++; |
1452 | 175 | break; |
1453 | 1.08k | |
1454 | 1.08k | case isl_schedule_node_extension: |
1455 | 0 | NumExtension[Version]++; |
1456 | 0 | break; |
1457 | 1.08k | |
1458 | 1.08k | default: |
1459 | 576 | break; |
1460 | 1.08k | } |
1461 | 1.08k | |
1462 | 1.08k | return isl_bool_true; |
1463 | 1.08k | }, |
1464 | 120 | &Version); |
1465 | 120 | } |
1466 | | |
1467 | 41 | bool IslScheduleOptimizer::runOnScop(Scop &S) { |
1468 | 41 | // Skip SCoPs in case they're already optimised by PPCGCodeGeneration |
1469 | 41 | if (S.isToBeSkipped()) |
1470 | 0 | return false; |
1471 | 41 | |
1472 | 41 | // Skip empty SCoPs but still allow code generation as it will delete the |
1473 | 41 | // loops present but not needed. |
1474 | 41 | if (S.getSize() == 0) { |
1475 | 0 | S.markAsOptimized(); |
1476 | 0 | return false; |
1477 | 0 | } |
1478 | 41 | |
1479 | 41 | const Dependences &D = |
1480 | 41 | getAnalysis<DependenceInfo>().getDependences(Dependences::AL_Statement); |
1481 | 41 | |
1482 | 41 | if (D.getSharedIslCtx() != S.getSharedIslCtx()) { |
1483 | 0 | LLVM_DEBUG(dbgs() << "DependenceInfo for another SCoP/isl_ctx\n"); |
1484 | 0 | return false; |
1485 | 0 | } |
1486 | 41 | |
1487 | 41 | if (!D.hasValidDependences()) |
1488 | 1 | return false; |
1489 | 40 | |
1490 | 40 | isl_schedule_free(LastSchedule); |
1491 | 40 | LastSchedule = nullptr; |
1492 | 40 | |
1493 | 40 | // Build input data. |
1494 | 40 | int ValidityKinds = |
1495 | 40 | Dependences::TYPE_RAW | Dependences::TYPE_WAR | Dependences::TYPE_WAW; |
1496 | 40 | int ProximityKinds; |
1497 | 40 | |
1498 | 40 | if (OptimizeDeps == "all") |
1499 | 40 | ProximityKinds = |
1500 | 40 | Dependences::TYPE_RAW | Dependences::TYPE_WAR | Dependences::TYPE_WAW; |
1501 | 0 | else if (OptimizeDeps == "raw") |
1502 | 0 | ProximityKinds = Dependences::TYPE_RAW; |
1503 | 0 | else { |
1504 | 0 | errs() << "Do not know how to optimize for '" << OptimizeDeps << "'" |
1505 | 0 | << " Falling back to optimizing all dependences.\n"; |
1506 | 0 | ProximityKinds = |
1507 | 0 | Dependences::TYPE_RAW | Dependences::TYPE_WAR | Dependences::TYPE_WAW; |
1508 | 0 | } |
1509 | 40 | |
1510 | 40 | isl::union_set Domain = S.getDomains(); |
1511 | 40 | |
1512 | 40 | if (!Domain) |
1513 | 0 | return false; |
1514 | 40 | |
1515 | 40 | ScopsProcessed++; |
1516 | 40 | walkScheduleTreeForStatistics(S.getScheduleTree(), 0); |
1517 | 40 | |
1518 | 40 | isl::union_map Validity = D.getDependences(ValidityKinds); |
1519 | 40 | isl::union_map Proximity = D.getDependences(ProximityKinds); |
1520 | 40 | |
1521 | 40 | // Simplify the dependences by removing the constraints introduced by the |
1522 | 40 | // domains. This can speed up the scheduling time significantly, as large |
1523 | 40 | // constant coefficients will be removed from the dependences. The |
1524 | 40 | // introduction of some additional dependences reduces the possible |
1525 | 40 | // transformations, but in most cases, such transformation do not seem to be |
1526 | 40 | // interesting anyway. In some cases this option may stop the scheduler to |
1527 | 40 | // find any schedule. |
1528 | 40 | if (SimplifyDeps == "yes") { |
1529 | 40 | Validity = Validity.gist_domain(Domain); |
1530 | 40 | Validity = Validity.gist_range(Domain); |
1531 | 40 | Proximity = Proximity.gist_domain(Domain); |
1532 | 40 | Proximity = Proximity.gist_range(Domain); |
1533 | 40 | } else if (0 SimplifyDeps != "no"0 ) { |
1534 | 0 | errs() << "warning: Option -polly-opt-simplify-deps should either be 'yes' " |
1535 | 0 | "or 'no'. Falling back to default: 'yes'\n"; |
1536 | 0 | } |
1537 | 40 | |
1538 | 40 | LLVM_DEBUG(dbgs() << "\n\nCompute schedule from: "); |
1539 | 40 | LLVM_DEBUG(dbgs() << "Domain := " << Domain << ";\n"); |
1540 | 40 | LLVM_DEBUG(dbgs() << "Proximity := " << Proximity << ";\n"); |
1541 | 40 | LLVM_DEBUG(dbgs() << "Validity := " << Validity << ";\n"); |
1542 | 40 | |
1543 | 40 | unsigned IslSerializeSCCs; |
1544 | 40 | |
1545 | 40 | if (FusionStrategy == "max") { |
1546 | 2 | IslSerializeSCCs = 0; |
1547 | 38 | } else if (FusionStrategy == "min") { |
1548 | 38 | IslSerializeSCCs = 1; |
1549 | 38 | } else { |
1550 | 0 | errs() << "warning: Unknown fusion strategy. Falling back to maximal " |
1551 | 0 | "fusion.\n"; |
1552 | 0 | IslSerializeSCCs = 0; |
1553 | 0 | } |
1554 | 40 | |
1555 | 40 | int IslMaximizeBands; |
1556 | 40 | |
1557 | 40 | if (MaximizeBandDepth == "yes") { |
1558 | 40 | IslMaximizeBands = 1; |
1559 | 40 | } else if (0 MaximizeBandDepth == "no"0 ) { |
1560 | 0 | IslMaximizeBands = 0; |
1561 | 0 | } else { |
1562 | 0 | errs() << "warning: Option -polly-opt-maximize-bands should either be 'yes'" |
1563 | 0 | " or 'no'. Falling back to default: 'yes'\n"; |
1564 | 0 | IslMaximizeBands = 1; |
1565 | 0 | } |
1566 | 40 | |
1567 | 40 | int IslOuterCoincidence; |
1568 | 40 | |
1569 | 40 | if (OuterCoincidence == "yes") { |
1570 | 1 | IslOuterCoincidence = 1; |
1571 | 39 | } else if (OuterCoincidence == "no") { |
1572 | 39 | IslOuterCoincidence = 0; |
1573 | 39 | } else { |
1574 | 0 | errs() << "warning: Option -polly-opt-outer-coincidence should either be " |
1575 | 0 | "'yes' or 'no'. Falling back to default: 'no'\n"; |
1576 | 0 | IslOuterCoincidence = 0; |
1577 | 0 | } |
1578 | 40 | |
1579 | 40 | isl_ctx *Ctx = S.getIslCtx().get(); |
1580 | 40 | |
1581 | 40 | isl_options_set_schedule_outer_coincidence(Ctx, IslOuterCoincidence); |
1582 | 40 | isl_options_set_schedule_serialize_sccs(Ctx, IslSerializeSCCs); |
1583 | 40 | isl_options_set_schedule_maximize_band_depth(Ctx, IslMaximizeBands); |
1584 | 40 | isl_options_set_schedule_max_constant_term(Ctx, MaxConstantTerm); |
1585 | 40 | isl_options_set_schedule_max_coefficient(Ctx, MaxCoefficient); |
1586 | 40 | isl_options_set_tile_scale_tile_loops(Ctx, 0); |
1587 | 40 | |
1588 | 40 | auto OnErrorStatus = isl_options_get_on_error(Ctx); |
1589 | 40 | isl_options_set_on_error(Ctx, ISL_ON_ERROR_CONTINUE); |
1590 | 40 | |
1591 | 40 | auto SC = isl::schedule_constraints::on_domain(Domain); |
1592 | 40 | SC = SC.set_proximity(Proximity); |
1593 | 40 | SC = SC.set_validity(Validity); |
1594 | 40 | SC = SC.set_coincidence(Validity); |
1595 | 40 | auto Schedule = SC.compute_schedule(); |
1596 | 40 | isl_options_set_on_error(Ctx, OnErrorStatus); |
1597 | 40 | |
1598 | 40 | walkScheduleTreeForStatistics(Schedule, 1); |
1599 | 40 | |
1600 | 40 | // In cases the scheduler is not able to optimize the code, we just do not |
1601 | 40 | // touch the schedule. |
1602 | 40 | if (!Schedule) |
1603 | 0 | return false; |
1604 | 40 | |
1605 | 40 | ScopsRescheduled++; |
1606 | 40 | |
1607 | 40 | LLVM_DEBUG({ |
1608 | 40 | auto *P = isl_printer_to_str(Ctx); |
1609 | 40 | P = isl_printer_set_yaml_style(P, ISL_YAML_STYLE_BLOCK); |
1610 | 40 | P = isl_printer_print_schedule(P, Schedule.get()); |
1611 | 40 | auto *str = isl_printer_get_str(P); |
1612 | 40 | dbgs() << "NewScheduleTree: \n" << str << "\n"; |
1613 | 40 | free(str); |
1614 | 40 | isl_printer_free(P); |
1615 | 40 | }); |
1616 | 40 | |
1617 | 40 | Function &F = S.getFunction(); |
1618 | 40 | auto *TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F); |
1619 | 40 | const OptimizerAdditionalInfoTy OAI = {TTI, const_cast<Dependences *>(&D)}; |
1620 | 40 | auto NewSchedule = ScheduleTreeOptimizer::optimizeSchedule(Schedule, &OAI); |
1621 | 40 | NewSchedule = hoistExtensionNodes(NewSchedule); |
1622 | 40 | walkScheduleTreeForStatistics(NewSchedule, 2); |
1623 | 40 | |
1624 | 40 | if (!ScheduleTreeOptimizer::isProfitableSchedule(S, NewSchedule)) |
1625 | 4 | return false; |
1626 | 36 | |
1627 | 36 | auto ScopStats = S.getStatistics(); |
1628 | 36 | ScopsOptimized++; |
1629 | 36 | NumAffineLoopsOptimized += ScopStats.NumAffineLoops; |
1630 | 36 | NumBoxedLoopsOptimized += ScopStats.NumBoxedLoops; |
1631 | 36 | |
1632 | 36 | S.setScheduleTree(NewSchedule); |
1633 | 36 | S.markAsOptimized(); |
1634 | 36 | |
1635 | 36 | if (OptimizedScops) |
1636 | 1 | errs() << S; |
1637 | 36 | |
1638 | 36 | return false; |
1639 | 36 | } |
1640 | | |
1641 | 30 | void IslScheduleOptimizer::printScop(raw_ostream &OS, Scop &) const { |
1642 | 30 | isl_printer *p; |
1643 | 30 | char *ScheduleStr; |
1644 | 30 | |
1645 | 30 | OS << "Calculated schedule:\n"; |
1646 | 30 | |
1647 | 30 | if (!LastSchedule) { |
1648 | 30 | OS << "n/a\n"; |
1649 | 30 | return; |
1650 | 30 | } |
1651 | 0 | |
1652 | 0 | p = isl_printer_to_str(isl_schedule_get_ctx(LastSchedule)); |
1653 | 0 | p = isl_printer_print_schedule(p, LastSchedule); |
1654 | 0 | ScheduleStr = isl_printer_get_str(p); |
1655 | 0 | isl_printer_free(p); |
1656 | 0 |
|
1657 | 0 | OS << ScheduleStr << "\n"; |
1658 | 0 | } |
1659 | | |
1660 | 42 | void IslScheduleOptimizer::getAnalysisUsage(AnalysisUsage &AU) const { |
1661 | 42 | ScopPass::getAnalysisUsage(AU); |
1662 | 42 | AU.addRequired<DependenceInfo>(); |
1663 | 42 | AU.addRequired<TargetTransformInfoWrapperPass>(); |
1664 | 42 | |
1665 | 42 | AU.addPreserved<DependenceInfo>(); |
1666 | 42 | } |
1667 | | |
1668 | 0 | Pass *polly::createIslScheduleOptimizerPass() { |
1669 | 0 | return new IslScheduleOptimizer(); |
1670 | 0 | } |
1671 | | |
// Register the pass with the legacy pass manager, declaring the analyses it
// depends on so the manager schedules them first.
INITIALIZE_PASS_BEGIN(IslScheduleOptimizer, "polly-opt-isl",
                      "Polly - Optimize schedule of SCoP", false, false);
INITIALIZE_PASS_DEPENDENCY(DependenceInfo);
INITIALIZE_PASS_DEPENDENCY(ScopInfoRegionPass);
INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass);
INITIALIZE_PASS_END(IslScheduleOptimizer, "polly-opt-isl",
                    "Polly - Optimize schedule of SCoP", false, false)