Coverage Report

Created: 2021-08-24 07:12

/Users/buildslave/jenkins/workspace/coverage/llvm-project/clang/include/clang/Tooling/Syntax/Tokens.h
Line
Count
Source (jump to first uncovered line)
1
//===- Tokens.h - collect tokens from preprocessing --------------*- C++-*-===//
2
//
3
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4
// See https://llvm.org/LICENSE.txt for license information.
5
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6
//
7
//===----------------------------------------------------------------------===//
8
// Record tokens that a preprocessor emits and define operations to map between
9
// the tokens written in a file and tokens produced by the preprocessor.
10
//
11
// When running the compiler, there are two token streams we are interested in:
12
//   - "spelled" tokens directly correspond to a substring written in some
13
//     source file.
14
//   - "expanded" tokens represent the result of preprocessing, parses consumes
15
//     this token stream to produce the AST.
16
//
17
// Expanded tokens correspond directly to locations found in the AST, allowing
18
// to find subranges of the token stream covered by various AST nodes. Spelled
19
// tokens correspond directly to the source code written by the user.
20
//
21
// To allow composing these two use-cases, we also define operations that map
22
// between expanded and spelled tokens that produced them (macro calls,
23
// directives, etc).
24
//
25
//===----------------------------------------------------------------------===//
26
27
#ifndef LLVM_CLANG_TOOLING_SYNTAX_TOKENS_H
28
#define LLVM_CLANG_TOOLING_SYNTAX_TOKENS_H
29
30
#include "clang/Basic/FileManager.h"
31
#include "clang/Basic/LangOptions.h"
32
#include "clang/Basic/SourceLocation.h"
33
#include "clang/Basic/SourceManager.h"
34
#include "clang/Basic/TokenKinds.h"
35
#include "clang/Lex/Token.h"
36
#include "llvm/ADT/ArrayRef.h"
37
#include "llvm/ADT/DenseMap.h"
38
#include "llvm/ADT/Optional.h"
39
#include "llvm/ADT/StringRef.h"
40
#include "llvm/Support/Compiler.h"
41
#include "llvm/Support/raw_ostream.h"
42
#include <cstdint>
43
#include <tuple>
44
45
namespace clang {
46
class Preprocessor;
47
48
namespace syntax {
49
50
/// A half-open character range inside a particular file, the start offset is
51
/// included and the end offset is excluded from the range.
52
struct FileRange {
53
  /// EXPECTS: File.isValid() && Begin <= End.
54
  FileRange(FileID File, unsigned BeginOffset, unsigned EndOffset);
55
  /// EXPECTS: BeginLoc.isValid() && BeginLoc.isFileID().
56
  FileRange(const SourceManager &SM, SourceLocation BeginLoc, unsigned Length);
57
  /// EXPECTS: BeginLoc.isValid() && BeginLoc.isFileID(), Begin <= End and files
58
  ///          are the same.
59
  FileRange(const SourceManager &SM, SourceLocation BeginLoc,
60
            SourceLocation EndLoc);
61
62
214k
  FileID file() const { return File; }
63
  /// Start is a start offset (inclusive) in the corresponding file.
64
101k
  unsigned beginOffset() const { return Begin; }
65
  /// End offset (exclusive) in the corresponding file.
66
140k
  unsigned endOffset() const { return End; }
67
68
14
  unsigned length() const { return End - Begin; }
69
70
  /// Check if \p Offset is inside the range.
71
0
  bool contains(unsigned Offset) const {
72
0
    return Begin <= Offset && Offset < End;
73
0
  }
74
  /// Check \p Offset is inside the range or equal to its endpoint.
75
0
  bool touches(unsigned Offset) const {
76
0
    return Begin <= Offset && Offset <= End;
77
0
  }
78
79
  /// Gets the substring that this FileRange refers to.
80
  llvm::StringRef text(const SourceManager &SM) const;
81
82
  /// Convert to the clang range. The returned range is always a char range,
83
  /// never a token range.
84
  CharSourceRange toCharRange(const SourceManager &SM) const;
85
86
137k
  friend bool operator==(const FileRange &L, const FileRange &R) {
87
137k
    return std::tie(L.File, L.Begin, L.End) == std::tie(R.File, R.Begin, R.End);
88
137k
  }
89
0
  friend bool operator!=(const FileRange &L, const FileRange &R) {
90
0
    return !(L == R);
91
0
  }
92
93
private:
94
  FileID File;
95
  unsigned Begin;
96
  unsigned End;
97
};
98
99
/// For debugging purposes.
100
llvm::raw_ostream &operator<<(llvm::raw_ostream &OS, const FileRange &R);
101
102
/// A token coming directly from a file or from a macro invocation. Has just
103
/// enough information to locate the token in the source code.
104
/// Can represent both expanded and spelled tokens.
105
class Token {
106
public:
107
  Token(SourceLocation Location, unsigned Length, tok::TokenKind Kind);
108
  /// EXPECTS: clang::Token is not an annotation token.
109
  explicit Token(const clang::Token &T);
110
111
99.2k
  tok::TokenKind kind() const { return Kind; }
112
  /// Location of the first character of a token.
113
882k
  SourceLocation location() const { return Location; }
114
  /// Location right after the last character of a token.
115
26
  SourceLocation endLocation() const {
116
26
    return Location.getLocWithOffset(Length);
117
26
  }
118
160k
  unsigned length() const { return Length; }
119
120
  /// Get the substring covered by the token. Note that will include all
121
  /// digraphs, newline continuations, etc. E.g. tokens for 'int' and
122
  ///    in\
123
  ///    t
124
  /// both have the same kind tok::kw_int, but results of text() are different.
125
  llvm::StringRef text(const SourceManager &SM) const;
126
127
  /// Gets a range of this token.
128
  /// EXPECTS: token comes from a file, not from a macro expansion.
129
  FileRange range(const SourceManager &SM) const;
130
131
  /// Given two tokens inside the same file, returns a file range that starts at
132
  /// \p First and ends at \p Last.
133
  /// EXPECTS: First and Last are file tokens from the same file, Last starts
134
  ///          after First.
135
  static FileRange range(const SourceManager &SM, const syntax::Token &First,
136
                         const syntax::Token &Last);
137
138
  std::string dumpForTests(const SourceManager &SM) const;
139
  /// For debugging purposes.
140
  std::string str() const;
141
142
private:
143
  SourceLocation Location;
144
  unsigned Length;
145
  tok::TokenKind Kind;
146
};
147
/// For debugging purposes. Equivalent to a call to Token::str().
148
llvm::raw_ostream &operator<<(llvm::raw_ostream &OS, const Token &T);
149
150
/// A list of tokens obtained by preprocessing a text buffer and operations to
151
/// map between the expanded and spelled tokens, i.e. TokenBuffer has
152
/// information about two token streams:
153
///    1. Expanded tokens: tokens produced by the preprocessor after all macro
154
///       replacements,
155
///    2. Spelled tokens: corresponding directly to the source code of a file
156
///       before any macro replacements occurred.
157
/// Here's an example to illustrate a difference between those two:
158
///     #define FOO 10
159
///     int a = FOO;
160
///
161
/// Spelled tokens are {'#','define','FOO','10','int','a','=','FOO',';'}.
162
/// Expanded tokens are {'int','a','=','10',';','eof'}.
163
///
164
/// Note that the expanded token stream has a tok::eof token at the end, the
165
/// spelled tokens never store a 'eof' token.
166
///
167
/// The full list expanded tokens can be obtained with expandedTokens(). Spelled
168
/// tokens for each of the files can be obtained via spelledTokens(FileID).
169
///
170
/// To map between the expanded and spelled tokens use findSpelledByExpanded().
171
///
172
/// To build a token buffer use the TokenCollector class. You can also compute
173
/// the spelled tokens of a file using the tokenize() helper.
174
///
175
/// FIXME: allow mappings into macro arguments.
176
class TokenBuffer {
177
public:
178
2.26k
  TokenBuffer(const SourceManager &SourceMgr) : SourceMgr(&SourceMgr) {}
179
180
4.37k
  TokenBuffer(TokenBuffer &&) = default;
181
  TokenBuffer(const TokenBuffer &) = delete;
182
  TokenBuffer &operator=(TokenBuffer &&) = default;
183
  TokenBuffer &operator=(const TokenBuffer &) = delete;
184
185
  /// All tokens produced by the preprocessor after all macro replacements,
186
  /// directives, etc. Source locations found in the clang AST will always
187
  /// point to one of these tokens.
188
  /// Tokens are in TU order (per SourceManager::isBeforeInTranslationUnit()).
189
  /// FIXME: figure out how to handle token splitting, e.g. '>>' can be split
190
  ///        into two '>' tokens by the parser. However, TokenBuffer currently
191
  ///        keeps it as a single '>>' token.
192
11.0k
  llvm::ArrayRef<syntax::Token> expandedTokens() const {
193
11.0k
    return ExpandedTokens;
194
11.0k
  }
195
196
  /// Builds a cache to make future calls to expandedToken(SourceRange) faster.
197
  /// Creates an index only once. Further calls to it will be no-op.
198
  void indexExpandedTokens();
199
200
  /// Returns the subrange of expandedTokens() corresponding to the closed
201
  /// token range R.
202
  /// Consider calling indexExpandedTokens() before for faster lookups.
203
  llvm::ArrayRef<syntax::Token> expandedTokens(SourceRange R) const;
204
205
  /// Returns the subrange of spelled tokens corresponding to AST node spanning
206
  /// \p Expanded. This is the text that should be replaced if a refactoring
207
  /// were to rewrite the node. If \p Expanded is empty, the returned value is
208
  /// llvm::None.
209
  ///
210
  /// Will fail if the expanded tokens do not correspond to a sequence of
211
  /// spelled tokens. E.g. for the following example:
212
  ///
213
  ///   #define FIRST f1 f2 f3
214
  ///   #define SECOND s1 s2 s3
215
  ///   #define ID2(X, Y) X Y
216
  ///
217
  ///   a FIRST b SECOND c // expanded tokens are: a f1 f2 f3 b s1 s2 s3 c
218
  ///   d ID2(e f g, h) i  // expanded tokens are: d e f g h i
219
  ///
220
  /// the results would be:
221
  ///   expanded   => spelled
222
  ///   ------------------------
223
  ///            a => a
224
  ///     s1 s2 s3 => SECOND
225
  ///   a f1 f2 f3 => a FIRST
226
  ///         a f1 => can't map
227
  ///        s1 s2 => can't map
228
  ///         e f  => e f
229
  ///         g h  => can't map
230
  ///
231
  /// EXPECTS: \p Expanded is a subrange of expandedTokens().
232
  /// Complexity is logarithmic.
233
  llvm::Optional<llvm::ArrayRef<syntax::Token>>
234
  spelledForExpanded(llvm::ArrayRef<syntax::Token> Expanded) const;
235
236
  /// Find the subranges of expanded tokens, corresponding to \p Spelled.
237
  ///
238
  /// Some spelled tokens may not be present in the expanded token stream, so
239
  /// this function can return an empty vector, e.g. for tokens of macro
240
  /// directives or disabled preprocessor branches.
241
  ///
242
  /// Some spelled tokens can be duplicated in the expanded token stream
243
  /// multiple times and this function will return multiple results in those
244
  /// cases. This happens when \p Spelled is inside a macro argument.
245
  ///
246
  /// FIXME: return correct results on macro arguments. For now, we return an
247
  ///        empty list.
248
  ///
249
  /// (!) will return empty vector on tokens from #define body:
250
  /// E.g. for the following example:
251
  ///
252
  ///   #define FIRST(A) f1 A = A f2
253
  ///   #define SECOND s
254
  ///
255
  ///   a FIRST(arg) b SECOND c // expanded tokens are: a f1 arg = arg f2 b s
256
  /// The results would be
257
  ///   spelled           => expanded
258
  ///   ------------------------
259
  ///   #define FIRST     => {}
260
  ///   a FIRST(arg)      => {a f1 arg = arg f2}
261
  ///   arg               => {arg, arg} // arg #1 is before `=` and arg #2 is
262
  ///                                   // after `=` in the expanded tokens.
263
  llvm::SmallVector<llvm::ArrayRef<syntax::Token>, 1>
264
  expandedForSpelled(llvm::ArrayRef<syntax::Token> Spelled) const;
265
266
  /// An expansion produced by the preprocessor, includes macro expansions and
267
  /// preprocessor directives. Preprocessor always maps a non-empty range of
268
  /// spelled tokens to a (possibly empty) range of expanded tokens. Here is a
269
  /// few examples of expansions:
270
  ///    #pragma once      // Expands to an empty range.
271
  ///    #define FOO 1 2 3 // Expands an empty range.
272
  ///    FOO               // Expands to "1 2 3".
273
  /// FIXME(ibiryukov): implement this, currently #include expansions are empty.
274
  ///    #include <vector> // Expands to tokens produced by the include.
275
  struct Expansion {
276
    llvm::ArrayRef<syntax::Token> Spelled;
277
    llvm::ArrayRef<syntax::Token> Expanded;
278
  };
279
  /// If \p Spelled starts a mapping (e.g. if it's a macro name or '#' starting
280
  /// a preprocessor directive) return the subrange of expanded tokens that the
281
  /// macro expands to.
282
  llvm::Optional<Expansion>
283
  expansionStartingAt(const syntax::Token *Spelled) const;
284
  /// Returns all expansions (partially) expanded from the specified tokens.
285
  /// This is the expansions whose Spelled range intersects \p Spelled.
286
  std::vector<Expansion>
287
  expansionsOverlapping(llvm::ArrayRef<syntax::Token> Spelled) const;
288
289
  /// Lexed tokens of a file before preprocessing. E.g. for the following input
290
  ///     #define DECL(name) int name = 10
291
  ///     DECL(a);
292
  /// spelledTokens() returns
293
  ///    {"#", "define", "DECL", "(", "name", ")", "int", "name", "=", "10",
294
  ///     "DECL", "(", "a", ")", ";"}
295
  llvm::ArrayRef<syntax::Token> spelledTokens(FileID FID) const;
296
297
  /// Returns the spelled Token starting at Loc, if there are no such tokens
298
  /// returns nullptr.
299
  const syntax::Token *spelledTokenAt(SourceLocation Loc) const;
300
301
  /// Get all tokens that expand a macro in \p FID. For the following input
302
  ///     #define FOO B
303
  ///     #define FOO2(X) int X
304
  ///     FOO2(XY)
305
  ///     int B;
306
  ///     FOO;
307
  /// macroExpansions() returns {"FOO2", "FOO"} (from line 3 and 5
308
  /// respecitvely).
309
  std::vector<const syntax::Token *> macroExpansions(FileID FID) const;
310
311
63
  const SourceManager &sourceManager() const { return *SourceMgr; }
312
313
  std::string dumpForTests() const;
314
315
private:
316
  /// Describes a mapping between a continuous subrange of spelled tokens and
317
  /// expanded tokens. Represents macro expansions, preprocessor directives,
318
  /// conditionally disabled pp regions, etc.
319
  ///   #define FOO 1+2
320
  ///   #define BAR(a) a + 1
321
  ///   FOO    // invocation #1, tokens = {'1','+','2'}, macroTokens = {'FOO'}.
322
  ///   BAR(1) // invocation #2, tokens = {'a', '+', '1'},
323
  ///                            macroTokens = {'BAR', '(', '1', ')'}.
324
  struct Mapping {
325
    // Positions in the corresponding spelled token stream. The corresponding
326
    // range is never empty.
327
    unsigned BeginSpelled = 0;
328
    unsigned EndSpelled = 0;
329
    // Positions in the expanded token stream. The corresponding range can be
330
    // empty.
331
    unsigned BeginExpanded = 0;
332
    unsigned EndExpanded = 0;
333
334
    /// For debugging purposes.
335
    std::string str() const;
336
  };
337
  /// Spelled tokens of the file with information about the subranges.
338
  struct MarkedFile {
339
    /// Lexed, but not preprocessed, tokens of the file. These map directly to
340
    /// text in the corresponding files and include tokens of all preprocessor
341
    /// directives.
342
    /// FIXME: spelled tokens don't change across FileID that map to the same
343
    ///        FileEntry. We could consider deduplicating them to save memory.
344
    std::vector<syntax::Token> SpelledTokens;
345
    /// A sorted list to convert between the spelled and expanded token streams.
346
    std::vector<Mapping> Mappings;
347
    /// The first expanded token produced for this FileID.
348
    unsigned BeginExpanded = 0;
349
    unsigned EndExpanded = 0;
350
  };
351
352
  friend class TokenCollector;
353
354
  /// Maps a single expanded token to its spelled counterpart or a mapping that
355
  /// produced it.
356
  std::pair<const syntax::Token *, const Mapping *>
357
  spelledForExpandedToken(const syntax::Token *Expanded) const;
358
359
  /// Returns a mapping starting before \p Spelled token, or nullptr if no
360
  /// such mapping exists.
361
  static const Mapping *
362
  mappingStartingBeforeSpelled(const MarkedFile &F,
363
                               const syntax::Token *Spelled);
364
365
  /// Convert a private Mapping to a public Expansion.
366
  Expansion makeExpansion(const MarkedFile &, const Mapping &) const;
367
  /// Returns the file that the Spelled tokens are taken from.
368
  /// Asserts that they are non-empty, from a tracked file, and in-bounds.
369
  const MarkedFile &fileForSpelled(llvm::ArrayRef<syntax::Token> Spelled) const;
370
371
  /// Token stream produced after preprocessing, conceputally this captures the
372
  /// same stream as 'clang -E' (excluding the preprocessor directives like
373
  /// #file, etc.).
374
  std::vector<syntax::Token> ExpandedTokens;
375
  // Index of ExpandedTokens for faster lookups by SourceLocation.
376
  llvm::DenseMap<SourceLocation, unsigned> ExpandedTokIndex;
377
  llvm::DenseMap<FileID, MarkedFile> Files;
378
  // The value is never null, pointer instead of reference to avoid disabling
379
  // implicit assignment operator.
380
  const SourceManager *SourceMgr;
381
};
382
383
/// The spelled tokens that overlap or touch a spelling location Loc.
384
/// This always returns 0-2 tokens.
385
llvm::ArrayRef<syntax::Token>
386
spelledTokensTouching(SourceLocation Loc, const syntax::TokenBuffer &Tokens);
387
llvm::ArrayRef<syntax::Token>
388
spelledTokensTouching(SourceLocation Loc, llvm::ArrayRef<syntax::Token> Tokens);
389
390
/// The identifier token that overlaps or touches a spelling location Loc.
391
/// If there is none, returns nullptr.
392
const syntax::Token *
393
spelledIdentifierTouching(SourceLocation Loc,
394
                          llvm::ArrayRef<syntax::Token> Tokens);
395
const syntax::Token *
396
spelledIdentifierTouching(SourceLocation Loc,
397
                          const syntax::TokenBuffer &Tokens);
398
399
/// Lex the text buffer, corresponding to \p FID, in raw mode and record the
400
/// resulting spelled tokens. Does minimal post-processing on raw identifiers,
401
/// setting the appropriate token kind (instead of the raw_identifier reported
402
/// by lexer in raw mode). This is a very low-level function, most users should
403
/// prefer to use TokenCollector. Lexing in raw mode produces wildly different
404
/// results from what one might expect when running a C++ frontend, e.g.
405
/// preprocessor does not run at all.
406
/// The result will *not* have a 'eof' token at the end.
407
std::vector<syntax::Token> tokenize(FileID FID, const SourceManager &SM,
408
                                    const LangOptions &LO);
409
/// Similar to one above, instead of whole file tokenizes a part of it. Note
410
/// that, the first token might be incomplete if FR.startOffset is not at the
411
/// beginning of a token, and the last token returned will start before the
412
/// FR.endOffset but might end after it.
413
std::vector<syntax::Token>
414
tokenize(const FileRange &FR, const SourceManager &SM, const LangOptions &LO);
415
416
/// Collects tokens for the main file while running the frontend action. An
417
/// instance of this object should be created on
418
/// FrontendAction::BeginSourceFile() and the results should be consumed after
419
/// FrontendAction::Execute() finishes.
420
class TokenCollector {
421
public:
422
  /// Adds the hooks to collect the tokens. Should be called before the
423
  /// preprocessing starts, i.e. as a part of BeginSourceFile() or
424
  /// CreateASTConsumer().
425
  TokenCollector(Preprocessor &P);
426
427
  /// Finalizes token collection. Should be called after preprocessing is
428
  /// finished, i.e. after running Execute().
429
  LLVM_NODISCARD TokenBuffer consume() &&;
430
431
private:
432
  /// Maps from a start to an end spelling location of transformations
433
  /// performed by the preprocessor. These include:
434
  ///   1. range from '#' to the last token in the line for PP directives,
435
  ///   2. macro name and arguments for macro expansions.
436
  /// Note that we record only top-level macro expansions, intermediate
437
  /// expansions (e.g. inside macro arguments) are ignored.
438
  ///
439
  /// Used to find correct boundaries of macro calls and directives when
440
  /// building mappings from spelled to expanded tokens.
441
  ///
442
  /// Logically, at each point of the preprocessor execution there is a stack of
443
  /// macro expansions being processed and we could use it to recover the
444
  /// location information we need. However, the public preprocessor API only
445
  /// exposes the points when macro expansions start (when we push a macro onto
446
  /// the stack) and not when they end (when we pop a macro from the stack).
447
  /// To workaround this limitation, we rely on source location information
448
  /// stored in this map.
449
  using PPExpansions = llvm::DenseMap<SourceLocation, SourceLocation>;
450
  class Builder;
451
  class CollectPPExpansions;
452
453
  std::vector<syntax::Token> Expanded;
454
  // FIXME: we only store macro expansions, also add directives(#pragma, etc.)
455
  PPExpansions Expansions;
456
  Preprocessor &PP;
457
  CollectPPExpansions *Collector;
458
};
459
460
} // namespace syntax
461
} // namespace clang
462
463
#endif