/Users/buildslave/jenkins/workspace/coverage/llvm-project/clang/include/clang/Tooling/Syntax/Tokens.h
Line | Count | Source
1 | | //===- Tokens.h - collect tokens from preprocessing --------------*- C++-*-===// |
2 | | // |
3 | | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
4 | | // See https://llvm.org/LICENSE.txt for license information. |
5 | | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
6 | | // |
7 | | //===----------------------------------------------------------------------===// |
8 | | // Record tokens that a preprocessor emits and define operations to map between |
9 | | // the tokens written in a file and tokens produced by the preprocessor. |
10 | | // |
11 | | // When running the compiler, there are two token streams we are interested in: |
12 | | // - "spelled" tokens directly correspond to a substring written in some |
13 | | // source file. |
14 | | // - "expanded" tokens represent the result of preprocessing; the parser
15 | | //   consumes this token stream to produce the AST.
16 | | // |
17 | | // Expanded tokens correspond directly to locations found in the AST, making it
18 | | // possible to find subranges of the token stream covered by various AST nodes.
19 | | // Spelled tokens correspond directly to the source code written by the user.
20 | | // |
21 | | // To allow composing these two use cases, we also define operations that map
22 | | // expanded tokens to the spelled tokens that produced them (macro calls,
23 | | // directives, etc.).
24 | | // |
25 | | //===----------------------------------------------------------------------===// |
26 | | |
27 | | #ifndef LLVM_CLANG_TOOLING_SYNTAX_TOKENS_H |
28 | | #define LLVM_CLANG_TOOLING_SYNTAX_TOKENS_H |
29 | | |
30 | | #include "clang/Basic/FileManager.h" |
31 | | #include "clang/Basic/LangOptions.h" |
32 | | #include "clang/Basic/SourceLocation.h" |
33 | | #include "clang/Basic/SourceManager.h" |
34 | | #include "clang/Basic/TokenKinds.h" |
35 | | #include "clang/Lex/Token.h" |
36 | | #include "llvm/ADT/ArrayRef.h" |
37 | | #include "llvm/ADT/Optional.h" |
38 | | #include "llvm/ADT/StringRef.h" |
39 | | #include "llvm/Support/Compiler.h" |
40 | | #include "llvm/Support/raw_ostream.h" |
41 | | #include <cstdint> |
42 | | #include <tuple> |
43 | | |
44 | | namespace clang { |
45 | | class Preprocessor; |
46 | | |
47 | | namespace syntax { |
48 | | |
49 | | /// A half-open character range inside a particular file; the start offset is
50 | | /// included and the end offset is excluded from the range.
51 | | struct FileRange { |
52 | | /// EXPECTS: File.isValid() && Begin <= End. |
53 | | FileRange(FileID File, unsigned BeginOffset, unsigned EndOffset); |
54 | | /// EXPECTS: BeginLoc.isValid() && BeginLoc.isFileID(). |
55 | | FileRange(const SourceManager &SM, SourceLocation BeginLoc, unsigned Length); |
56 | | /// EXPECTS: BeginLoc.isValid() && BeginLoc.isFileID(), Begin <= End and files |
57 | | /// are the same. |
58 | | FileRange(const SourceManager &SM, SourceLocation BeginLoc, |
59 | | SourceLocation EndLoc); |
60 | | |
61 | 207k | FileID file() const { return File; } |
62 | | /// Begin offset (inclusive) in the corresponding file.
63 | 97.7k | unsigned beginOffset() const { return Begin; } |
64 | | /// End offset (exclusive) in the corresponding file. |
65 | 136k | unsigned endOffset() const { return End; } |
66 | | |
67 | 14 | unsigned length() const { return End - Begin; } |
68 | | |
69 | | /// Check if \p Offset is inside the range. |
70 | 0 | bool contains(unsigned Offset) const { |
71 | 0 | return Begin <= Offset && Offset < End; |
72 | 0 | } |
73 | | /// Check if \p Offset is inside the range or equal to its endpoint.
74 | 0 | bool touches(unsigned Offset) const { |
75 | 0 | return Begin <= Offset && Offset <= End; |
76 | 0 | } |
77 | | |
78 | | /// Gets the substring that this FileRange refers to. |
79 | | llvm::StringRef text(const SourceManager &SM) const; |
80 | | |
81 | | /// Convert to the clang range. The returned range is always a char range, |
82 | | /// never a token range. |
83 | | CharSourceRange toCharRange(const SourceManager &SM) const; |
84 | | |
85 | 133k | friend bool operator==(const FileRange &L, const FileRange &R) { |
86 | 133k | return std::tie(L.File, L.Begin, L.End) == std::tie(R.File, R.Begin, R.End); |
87 | 133k | } |
88 | 0 | friend bool operator!=(const FileRange &L, const FileRange &R) { |
89 | 0 | return !(L == R); |
90 | 0 | } |
91 | | |
92 | | private: |
93 | | FileID File; |
94 | | unsigned Begin; |
95 | | unsigned End; |
96 | | }; |
97 | | |
98 | | /// For debugging purposes. |
99 | | llvm::raw_ostream &operator<<(llvm::raw_ostream &OS, const FileRange &R); |
100 | | |
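// --- Editorial example (sketch; not part of the original header) ------------
// FileRange is a plain (FileID, begin, end) triple over character offsets.
// The helper below is hypothetical and only illustrates how file(), contains()
// and the half-open [beginOffset(), endOffset()) interval compose.
inline bool coversOffset(const FileRange &R, FileID FID, unsigned Offset) {
  // Same file, and the half-open interval contains the offset.
  return R.file() == FID && R.contains(Offset);
}
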
101 | | /// A token coming directly from a file or from a macro invocation. Has just |
102 | | /// enough information to locate the token in the source code. |
103 | | /// Can represent both expanded and spelled tokens. |
104 | | class Token { |
105 | | public: |
106 | | Token(SourceLocation Location, unsigned Length, tok::TokenKind Kind); |
107 | | /// EXPECTS: clang::Token is not an annotation token. |
108 | | explicit Token(const clang::Token &T); |
109 | | |
110 | 98.2k | tok::TokenKind kind() const { return Kind; } |
111 | | /// Location of the first character of a token. |
112 | 863k | SourceLocation location() const { return Location; } |
113 | | /// Location right after the last character of a token. |
114 | 26 | SourceLocation endLocation() const { |
115 | 26 | return Location.getLocWithOffset(Length); |
116 | 26 | } |
117 | 155k | unsigned length() const { return Length; } |
118 | | |
119 | | /// Get the substring covered by the token. Note that this will include all
120 | | /// digraphs, newline continuations, etc. E.g. the tokens for 'int' and
121 | | ///    in\
122 | | ///    t
123 | | /// both have the same kind tok::kw_int, but the results of text() differ.
124 | | llvm::StringRef text(const SourceManager &SM) const; |
125 | | |
126 | | /// Gets a range of this token. |
127 | | /// EXPECTS: token comes from a file, not from a macro expansion. |
128 | | FileRange range(const SourceManager &SM) const; |
129 | | |
130 | | /// Given two tokens inside the same file, returns a file range that starts at |
131 | | /// \p First and ends at \p Last. |
132 | | /// EXPECTS: First and Last are file tokens from the same file, Last starts |
133 | | /// after First. |
134 | | static FileRange range(const SourceManager &SM, const syntax::Token &First, |
135 | | const syntax::Token &Last); |
136 | | |
137 | | std::string dumpForTests(const SourceManager &SM) const; |
138 | | /// For debugging purposes. |
139 | | std::string str() const; |
140 | | |
141 | | private: |
142 | | SourceLocation Location; |
143 | | unsigned Length; |
144 | | tok::TokenKind Kind; |
145 | | }; |
146 | | /// For debugging purposes. Equivalent to a call to Token::str(). |
147 | | llvm::raw_ostream &operator<<(llvm::raw_ostream &OS, const Token &T); |
148 | | |
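// --- Editorial example (sketch; not part of the original header) ------------
// Illustrates how Token and FileRange compose: for a token spelled in a file
// (not inside a macro expansion), range() yields a FileRange whose text() is
// exactly the characters the token covers. `spelledTextOf` is a hypothetical
// helper, used for illustration only.
inline llvm::StringRef spelledTextOf(const Token &Tok,
                                     const SourceManager &SM) {
  // range() expects a file token; its extent matches the token, i.e. the
  // returned range has length() == Tok.length().
  FileRange R = Tok.range(SM);
  return R.text(SM); // same result as Tok.text(SM) for file tokens
}
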
149 | | /// A list of tokens obtained by preprocessing a text buffer and operations to |
150 | | /// map between the expanded and spelled tokens, i.e. TokenBuffer has |
151 | | /// information about two token streams: |
152 | | /// 1. Expanded tokens: tokens produced by the preprocessor after all macro |
153 | | /// replacements, |
154 | | /// 2. Spelled tokens: corresponding directly to the source code of a file |
155 | | /// before any macro replacements occurred. |
156 | | /// Here's an example to illustrate the difference between the two:
157 | | /// #define FOO 10 |
158 | | /// int a = FOO; |
159 | | /// |
160 | | /// Spelled tokens are {'#','define','FOO','10','int','a','=','FOO',';'}. |
161 | | /// Expanded tokens are {'int','a','=','10',';','eof'}. |
162 | | /// |
163 | | /// Note that the expanded token stream has a tok::eof token at the end; the
164 | | /// spelled tokens never store an 'eof' token.
165 | | /// |
166 | | /// The full list of expanded tokens can be obtained with expandedTokens().
167 | | /// Spelled tokens for each file can be obtained via spelledTokens(FileID).
168 | | /// |
169 | | /// To map between the expanded and spelled tokens use findSpelledByExpanded(). |
170 | | /// |
171 | | /// To build a token buffer use the TokenCollector class. You can also compute |
172 | | /// the spelled tokens of a file using the tokenize() helper. |
173 | | /// |
174 | | /// FIXME: allow mappings into macro arguments. |
175 | | class TokenBuffer { |
176 | | public: |
177 | 2.24k | TokenBuffer(const SourceManager &SourceMgr) : SourceMgr(&SourceMgr) {} |
178 | | |
179 | 4.33k | TokenBuffer(TokenBuffer &&) = default; |
180 | | TokenBuffer(const TokenBuffer &) = delete; |
181 | | TokenBuffer &operator=(TokenBuffer &&) = default; |
182 | | TokenBuffer &operator=(const TokenBuffer &) = delete; |
183 | | |
184 | | /// All tokens produced by the preprocessor after all macro replacements, |
185 | | /// directives, etc. Source locations found in the clang AST will always |
186 | | /// point to one of these tokens. |
187 | | /// Tokens are in TU order (per SourceManager::isBeforeInTranslationUnit()). |
188 | | /// FIXME: figure out how to handle token splitting, e.g. '>>' can be split |
189 | | /// into two '>' tokens by the parser. However, TokenBuffer currently |
190 | | /// keeps it as a single '>>' token. |
191 | 10.9k | llvm::ArrayRef<syntax::Token> expandedTokens() const { |
192 | 10.9k | return ExpandedTokens; |
193 | 10.9k | } |
194 | | |
195 | | /// Returns the subrange of expandedTokens() corresponding to the closed |
196 | | /// token range R. |
197 | | llvm::ArrayRef<syntax::Token> expandedTokens(SourceRange R) const; |
198 | | |
199 | | /// Returns the subrange of spelled tokens corresponding to the AST node spanning
200 | | /// \p Expanded. This is the text that should be replaced if a refactoring |
201 | | /// were to rewrite the node. If \p Expanded is empty, the returned value is |
202 | | /// llvm::None. |
203 | | /// |
204 | | /// Will fail if the expanded tokens do not correspond to a sequence of |
205 | | /// spelled tokens. E.g. for the following example: |
206 | | /// |
207 | | /// #define FIRST f1 f2 f3 |
208 | | /// #define SECOND s1 s2 s3 |
209 | | /// #define ID2(X, Y) X Y |
210 | | /// |
211 | | /// a FIRST b SECOND c // expanded tokens are: a f1 f2 f3 b s1 s2 s3 c |
212 | | /// d ID2(e f g, h) i // expanded tokens are: d e f g h i |
213 | | /// |
214 | | /// the results would be: |
215 | | /// expanded => spelled |
216 | | /// ------------------------ |
217 | | /// a => a |
218 | | /// s1 s2 s3 => SECOND |
219 | | /// a f1 f2 f3 => a FIRST |
220 | | /// a f1 => can't map |
221 | | /// s1 s2 => can't map |
222 | | /// e f => e f |
223 | | /// g h => can't map |
224 | | /// |
225 | | /// EXPECTS: \p Expanded is a subrange of expandedTokens(). |
226 | | /// Complexity is logarithmic. |
227 | | llvm::Optional<llvm::ArrayRef<syntax::Token>> |
228 | | spelledForExpanded(llvm::ArrayRef<syntax::Token> Expanded) const; |
229 | | |
230 | | /// Find the subranges of expanded tokens corresponding to \p Spelled.
231 | | /// |
232 | | /// Some spelled tokens may not be present in the expanded token stream, so |
233 | | /// this function can return an empty vector, e.g. for tokens of macro |
234 | | /// directives or disabled preprocessor branches. |
235 | | /// |
236 | | /// Some spelled tokens can be duplicated in the expanded token stream |
237 | | /// multiple times and this function will return multiple results in those |
238 | | /// cases. This happens when \p Spelled is inside a macro argument. |
239 | | /// |
240 | | /// FIXME: return correct results on macro arguments. For now, we return an |
241 | | /// empty list. |
242 | | /// |
243 | | /// (!) will return an empty vector for tokens from a #define body:
244 | | /// E.g. for the following example: |
245 | | /// |
246 | | /// #define FIRST(A) f1 A = A f2 |
247 | | /// #define SECOND s |
248 | | /// |
249 | | /// a FIRST(arg) b SECOND c // expanded tokens are: a f1 arg = arg f2 b s |
250 | | /// The results would be |
251 | | /// spelled => expanded |
252 | | /// ------------------------ |
253 | | /// #define FIRST => {} |
254 | | /// a FIRST(arg) => {a f1 arg = arg f2} |
255 | | /// arg => {arg, arg} // arg #1 is before `=` and arg #2 is |
256 | | /// // after `=` in the expanded tokens. |
257 | | llvm::SmallVector<llvm::ArrayRef<syntax::Token>, 1> |
258 | | expandedForSpelled(llvm::ArrayRef<syntax::Token> Spelled) const; |
259 | | |
260 | | /// An expansion produced by the preprocessor. Includes macro expansions and
261 | | /// preprocessor directives. The preprocessor always maps a non-empty range of
262 | | /// spelled tokens to a (possibly empty) range of expanded tokens. Here are a
263 | | /// few examples of expansions:
264 | | /// #pragma once // Expands to an empty range. |
265 | | ///   #define FOO 1 2 3 // Expands to an empty range.
266 | | /// FOO // Expands to "1 2 3". |
267 | | /// FIXME(ibiryukov): implement this, currently #include expansions are empty. |
268 | | /// #include <vector> // Expands to tokens produced by the include. |
269 | | struct Expansion { |
270 | | llvm::ArrayRef<syntax::Token> Spelled; |
271 | | llvm::ArrayRef<syntax::Token> Expanded; |
272 | | }; |
273 | | /// If \p Spelled starts a mapping (e.g. if it's a macro name or '#' starting |
274 | | /// a preprocessor directive) return the subrange of expanded tokens that the |
275 | | /// macro expands to. |
276 | | llvm::Optional<Expansion> |
277 | | expansionStartingAt(const syntax::Token *Spelled) const; |
278 | | /// Returns all expansions (partially) expanded from the specified tokens. |
279 | | /// These are the expansions whose Spelled range intersects \p Spelled.
280 | | std::vector<Expansion> |
281 | | expansionsOverlapping(llvm::ArrayRef<syntax::Token> Spelled) const; |
282 | | |
283 | | /// Lexed tokens of a file before preprocessing. E.g. for the following input |
284 | | /// #define DECL(name) int name = 10 |
285 | | /// DECL(a); |
286 | | /// spelledTokens() returns |
287 | | /// {"#", "define", "DECL", "(", "name", ")", "int", "name", "=", "10", |
288 | | /// "DECL", "(", "a", ")", ";"} |
289 | | llvm::ArrayRef<syntax::Token> spelledTokens(FileID FID) const; |
290 | | |
291 | | /// Returns the spelled Token starting at Loc; returns nullptr if there is no
292 | | /// such token.
293 | | const syntax::Token *spelledTokenAt(SourceLocation Loc) const; |
294 | | |
295 | | /// Get all tokens that expand a macro in \p FID. For the following input |
296 | | /// #define FOO B |
297 | | /// #define FOO2(X) int X |
298 | | /// FOO2(XY) |
299 | | /// int B; |
300 | | /// FOO; |
301 | | /// macroExpansions() returns {"FOO2", "FOO"} (from lines 3 and 5
302 | | /// respectively).
303 | | std::vector<const syntax::Token *> macroExpansions(FileID FID) const; |
304 | | |
305 | 63 | const SourceManager &sourceManager() const { return *SourceMgr; } |
306 | | |
307 | | std::string dumpForTests() const; |
308 | | |
309 | | private: |
310 | | /// Describes a mapping between a contiguous subrange of spelled tokens and
311 | | /// expanded tokens. Represents macro expansions, preprocessor directives, |
312 | | /// conditionally disabled pp regions, etc. |
313 | | /// #define FOO 1+2 |
314 | | /// #define BAR(a) a + 1 |
315 | | /// FOO // invocation #1, tokens = {'1','+','2'}, macroTokens = {'FOO'}. |
316 | | ///   BAR(1) // invocation #2, tokens = {'1', '+', '1'},
317 | | /// macroTokens = {'BAR', '(', '1', ')'}. |
318 | | struct Mapping { |
319 | | // Positions in the corresponding spelled token stream. The corresponding |
320 | | // range is never empty. |
321 | | unsigned BeginSpelled = 0; |
322 | | unsigned EndSpelled = 0; |
323 | | // Positions in the expanded token stream. The corresponding range can be |
324 | | // empty. |
325 | | unsigned BeginExpanded = 0; |
326 | | unsigned EndExpanded = 0; |
327 | | |
328 | | /// For debugging purposes. |
329 | | std::string str() const; |
330 | | }; |
331 | | /// Spelled tokens of the file with information about the subranges. |
332 | | struct MarkedFile { |
333 | | /// Lexed, but not preprocessed, tokens of the file. These map directly to |
334 | | /// text in the corresponding files and include tokens of all preprocessor |
335 | | /// directives. |
336 | | /// FIXME: spelled tokens don't change across FileIDs that map to the same
337 | | /// FileEntry. We could consider deduplicating them to save memory. |
338 | | std::vector<syntax::Token> SpelledTokens; |
339 | | /// A sorted list to convert between the spelled and expanded token streams. |
340 | | std::vector<Mapping> Mappings; |
341 | | /// The first expanded token produced for this FileID. |
342 | | unsigned BeginExpanded = 0; |
343 | | unsigned EndExpanded = 0; |
344 | | }; |
345 | | |
346 | | friend class TokenCollector; |
347 | | |
348 | | /// Maps a single expanded token to its spelled counterpart or a mapping that |
349 | | /// produced it. |
350 | | std::pair<const syntax::Token *, const Mapping *> |
351 | | spelledForExpandedToken(const syntax::Token *Expanded) const; |
352 | | |
353 | | /// Returns a mapping starting before the \p Spelled token, or nullptr if no
354 | | /// such mapping exists. |
355 | | static const Mapping * |
356 | | mappingStartingBeforeSpelled(const MarkedFile &F, |
357 | | const syntax::Token *Spelled); |
358 | | |
359 | | /// Convert a private Mapping to a public Expansion. |
360 | | Expansion makeExpansion(const MarkedFile &, const Mapping &) const; |
361 | | /// Returns the file that the Spelled tokens are taken from. |
362 | | /// Asserts that they are non-empty, from a tracked file, and in-bounds. |
363 | | const MarkedFile &fileForSpelled(llvm::ArrayRef<syntax::Token> Spelled) const; |
364 | | |
365 | | /// Token stream produced after preprocessing; conceptually this captures the
366 | | /// same stream as 'clang -E' (excluding the preprocessor directives like |
367 | | /// #file, etc.). |
368 | | std::vector<syntax::Token> ExpandedTokens; |
369 | | llvm::DenseMap<FileID, MarkedFile> Files; |
370 | | // The value is never null; a pointer is used instead of a reference to avoid
371 | | // disabling the implicit assignment operator.
372 | | const SourceManager *SourceMgr; |
373 | | }; |
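
// --- Editorial example (sketch; not part of the original header) ------------
// A typical refactoring query: given the source range of an AST node (AST
// locations always point into the expanded stream), find the spelled tokens a
// rewrite would have to touch. Returns llvm::None when the node is partially
// produced by a macro body. `spelledRangeOf` is a hypothetical helper.
inline llvm::Optional<llvm::ArrayRef<Token>>
spelledRangeOf(const TokenBuffer &Tokens, SourceRange NodeRange) {
  // Closed token range -> subrange of expandedTokens().
  llvm::ArrayRef<Token> Expanded = Tokens.expandedTokens(NodeRange);
  // Map back to the spelled tokens, e.g. 'FOO' rather than its replacement.
  return Tokens.spelledForExpanded(Expanded);
}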
374 | | |
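// --- Editorial example (sketch; not part of the original header) ------------
// Walks all top-level macro invocations spelled in a file and prints what each
// one expands to, combining macroExpansions() with expansionStartingAt().
// `dumpMacroExpansions` is a hypothetical helper.
inline void dumpMacroExpansions(const TokenBuffer &Tokens, FileID FID,
                                llvm::raw_ostream &OS) {
  const SourceManager &SM = Tokens.sourceManager();
  for (const Token *MacroName : Tokens.macroExpansions(FID)) {
    OS << MacroName->text(SM) << " ->";
    // The macro name starts a mapping, so an expansion should be available.
    if (auto E = Tokens.expansionStartingAt(MacroName))
      for (const Token &T : E->Expanded)
        OS << " " << T.text(SM);
    OS << "\n";
  }
}
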
375 | | /// The spelled tokens that overlap or touch a spelling location Loc. |
376 | | /// This always returns 0-2 tokens. |
377 | | llvm::ArrayRef<syntax::Token> |
378 | | spelledTokensTouching(SourceLocation Loc, const syntax::TokenBuffer &Tokens); |
379 | | llvm::ArrayRef<syntax::Token> |
380 | | spelledTokensTouching(SourceLocation Loc, llvm::ArrayRef<syntax::Token> Tokens); |
381 | | |
382 | | /// The identifier token that overlaps or touches a spelling location Loc. |
383 | | /// If there is none, returns nullptr. |
384 | | const syntax::Token * |
385 | | spelledIdentifierTouching(SourceLocation Loc, |
386 | | llvm::ArrayRef<syntax::Token> Tokens); |
387 | | const syntax::Token * |
388 | | spelledIdentifierTouching(SourceLocation Loc, |
389 | | const syntax::TokenBuffer &Tokens); |
390 | | |
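// --- Editorial example (sketch; not part of the original header) ------------
// A cursor-based query in the style of "go to definition": return the spelling
// of the identifier touching a location, if any. `nameTouching` is a
// hypothetical helper.
inline llvm::Optional<llvm::StringRef>
nameTouching(SourceLocation Loc, const TokenBuffer &Tokens) {
  if (const Token *Id = spelledIdentifierTouching(Loc, Tokens))
    return Id->text(Tokens.sourceManager());
  return llvm::None;
}
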
391 | | /// Lex the text buffer corresponding to \p FID in raw mode and record the
392 | | /// resulting spelled tokens. Does minimal post-processing on raw identifiers,
393 | | /// setting the appropriate token kind (instead of the raw_identifier reported
394 | | /// by the lexer in raw mode). This is a very low-level function; most users
395 | | /// should prefer to use TokenCollector. Lexing in raw mode produces wildly
396 | | /// different results from what one might expect when running a C++ frontend,
397 | | /// e.g. the preprocessor does not run at all.
398 | | /// The result will *not* have an 'eof' token at the end.
399 | | std::vector<syntax::Token> tokenize(FileID FID, const SourceManager &SM, |
400 | | const LangOptions &LO); |
401 | | /// Similar to the overload above, but tokenizes only the part of the file
402 | | /// covered by \p FR. Note that the first token might be incomplete if
403 | | /// FR.beginOffset() is not at the beginning of a token, and the last token
404 | | /// returned will start before FR.endOffset() but might end after it.
405 | | std::vector<syntax::Token> |
406 | | tokenize(const FileRange &FR, const SourceManager &SM, const LangOptions &LO); |
407 | | |
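// --- Editorial example (sketch; not part of the original header) ------------
// Raw-mode lexing in action: dump the kind and text of every spelled token in
// a file. No preprocessing happens, so macro names and directive tokens appear
// verbatim. `dumpRawTokens` is a hypothetical helper.
inline void dumpRawTokens(FileID FID, const SourceManager &SM,
                          const LangOptions &LO, llvm::raw_ostream &OS) {
  for (const Token &T : tokenize(FID, SM, LO))
    OS << tok::getTokenName(T.kind()) << " '" << T.text(SM) << "'\n";
}
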
408 | | /// Collects tokens for the main file while running the frontend action. An |
409 | | /// instance of this object should be created on |
410 | | /// FrontendAction::BeginSourceFile() and the results should be consumed after |
411 | | /// FrontendAction::Execute() finishes. |
412 | | class TokenCollector { |
413 | | public: |
414 | | /// Adds the hooks to collect the tokens. Should be called before the |
415 | | /// preprocessing starts, i.e. as a part of BeginSourceFile() or |
416 | | /// CreateASTConsumer(). |
417 | | TokenCollector(Preprocessor &P); |
418 | | |
419 | | /// Finalizes token collection. Should be called after preprocessing is |
420 | | /// finished, i.e. after running Execute(). |
421 | | LLVM_NODISCARD TokenBuffer consume() &&; |
422 | | |
423 | | private: |
424 | | /// Maps the start spelling location of each transformation performed by the
425 | | /// preprocessor to its end location. These include:
426 | | /// 1. range from '#' to the last token in the line for PP directives, |
427 | | /// 2. macro name and arguments for macro expansions. |
428 | | /// Note that we record only top-level macro expansions; intermediate
429 | | /// expansions (e.g. inside macro arguments) are ignored. |
430 | | /// |
431 | | /// Used to find correct boundaries of macro calls and directives when |
432 | | /// building mappings from spelled to expanded tokens. |
433 | | /// |
434 | | /// Logically, at each point of the preprocessor execution there is a stack of |
435 | | /// macro expansions being processed and we could use it to recover the |
436 | | /// location information we need. However, the public preprocessor API only |
437 | | /// exposes the points when macro expansions start (when we push a macro onto |
438 | | /// the stack) and not when they end (when we pop a macro from the stack). |
439 | | /// To work around this limitation, we rely on the source location information
440 | | /// stored in this map. |
441 | | using PPExpansions = llvm::DenseMap<SourceLocation, SourceLocation>; |
442 | | class Builder; |
443 | | class CollectPPExpansions; |
444 | | |
445 | | std::vector<syntax::Token> Expanded; |
446 | | // FIXME: we only store macro expansions; also add directives (#pragma, etc.).
447 | | PPExpansions Expansions; |
448 | | Preprocessor &PP; |
449 | | CollectPPExpansions *Collector; |
450 | | }; |
451 | | |
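// --- Editorial example (sketch; not part of the original header) ------------
// The intended TokenCollector lifecycle: install the preprocessor callbacks
// before preprocessing starts, run the action, then finalize. `Run` stands for
// whatever drives parsing (e.g. FrontendAction::Execute()); the helper and its
// shape are illustrative assumptions, not part of this header.
template <typename RunFn>
TokenBuffer collectTokens(Preprocessor &PP, RunFn Run) {
  TokenCollector Collector(PP); // must exist before preprocessing starts
  Run();                        // drive parsing / preprocessing here
  return std::move(Collector).consume(); // finalize into a TokenBuffer
}
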
452 | | } // namespace syntax |
453 | | } // namespace clang |
454 | | |
455 | | #endif |