/Users/buildslave/jenkins/workspace/coverage/llvm-project/clang/include/clang/Lex/Lexer.h
Line | Count | Source (jump to first uncovered line) |
1 | | //===- Lexer.h - C Language Family Lexer ------------------------*- C++ -*-===// |
2 | | // |
3 | | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
4 | | // See https://llvm.org/LICENSE.txt for license information. |
5 | | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
6 | | // |
7 | | //===----------------------------------------------------------------------===// |
8 | | // |
9 | | // This file defines the Lexer interface. |
10 | | // |
11 | | //===----------------------------------------------------------------------===// |
12 | | |
13 | | #ifndef LLVM_CLANG_LEX_LEXER_H |
14 | | #define LLVM_CLANG_LEX_LEXER_H |
15 | | |
16 | | #include "clang/Basic/LangOptions.h" |
17 | | #include "clang/Basic/SourceLocation.h" |
18 | | #include "clang/Basic/TokenKinds.h" |
19 | | #include "clang/Lex/DependencyDirectivesScanner.h" |
20 | | #include "clang/Lex/PreprocessorLexer.h" |
21 | | #include "clang/Lex/Token.h" |
22 | | #include "llvm/ADT/Optional.h" |
23 | | #include "llvm/ADT/SmallVector.h" |
24 | | #include "llvm/ADT/StringRef.h" |
25 | | #include <cassert> |
26 | | #include <cstdint> |
27 | | #include <string> |
28 | | |
29 | | namespace llvm { |
30 | | |
31 | | class MemoryBufferRef; |
32 | | |
33 | | } // namespace llvm |
34 | | |
35 | | namespace clang { |
36 | | |
37 | | class DiagnosticBuilder; |
38 | | class Preprocessor; |
39 | | class SourceManager; |
40 | | class LangOptions; |
41 | | |
42 | | /// ConflictMarkerKind - Kinds of conflict marker which the lexer might be |
43 | | /// recovering from. |
44 | | enum ConflictMarkerKind { |
45 | | /// Not within a conflict marker. |
46 | | CMK_None, |
47 | | |
48 | | /// A normal or diff3 conflict marker, initiated by at least 7 "<"s, |
49 | | /// separated by at least 7 "="s or "|"s, and terminated by at least 7 ">"s. |
50 | | CMK_Normal, |
51 | | |
52 | | /// A Perforce-style conflict marker, initiated by 4 ">"s, |
53 | | /// separated by 4 "="s, and terminated by 4 "<"s. |
54 | | CMK_Perforce |
55 | | }; |
56 | | |
57 | | /// Describes the bounds (start, size) of the preamble and a flag required by |
58 | | /// PreprocessorOptions::PrecompiledPreambleBytes. |
59 | | /// The preamble includes the BOM, if any. |
60 | | struct PreambleBounds { |
61 | | /// Size of the preamble in bytes. |
62 | | unsigned Size; |
63 | | |
64 | | /// Whether the preamble ends at the start of a new line. |
65 | | /// |
66 | | /// Used to inform the lexer as to whether it's starting at the beginning of |
67 | | /// a line after skipping the preamble. |
68 | | bool PreambleEndsAtStartOfLine; |
69 | | |
70 | | PreambleBounds(unsigned Size, bool PreambleEndsAtStartOfLine) |
71 | 1.39k | : Size(Size), PreambleEndsAtStartOfLine(PreambleEndsAtStartOfLine) {} |
72 | | }; |
73 | | |
74 | | /// Lexer - This provides a simple interface that turns a text buffer into a |
75 | | /// stream of tokens. This provides no support for file reading or buffering, |
76 | | /// or buffering/seeking of tokens, only forward lexing is supported. It relies |
77 | | /// on the specified Preprocessor object to handle preprocessor directives, etc. |
78 | | class Lexer : public PreprocessorLexer { |
79 | | friend class Preprocessor; |
80 | | |
81 | | void anchor() override; |
82 | | |
83 | | //===--------------------------------------------------------------------===// |
84 | | // Constant configuration values for this lexer. |
85 | | |
86 | | // Start of the buffer. |
87 | | const char *BufferStart; |
88 | | |
89 | | // End of the buffer. |
90 | | const char *BufferEnd; |
91 | | |
92 | | // Location for start of file. |
93 | | SourceLocation FileLoc; |
94 | | |
95 | | // LangOpts enabled by this language. |
96 | | // Storing LangOptions as reference here is important from performance point |
97 | | // of view. Lack of reference means that LangOptions copy constructor would be |
98 | | // called by Lexer(..., const LangOptions &LangOpts,...). Given that local |
99 | | // Lexer objects are created thousands times (in Lexer::getRawToken, |
100 | | // Preprocessor::EnterSourceFile and other places) during single module |
101 | | // processing in frontend it would make std::vector<std::string> copy |
102 | | // constructors surprisingly hot. |
103 | | const LangOptions &LangOpts; |
104 | | |
105 | | // True if '//' line comments are enabled. |
106 | | bool LineComment; |
107 | | |
108 | | // True if lexer for _Pragma handling. |
109 | | bool Is_PragmaLexer; |
110 | | |
111 | | //===--------------------------------------------------------------------===// |
112 | | // Context-specific lexing flags set by the preprocessor. |
113 | | // |
114 | | |
115 | | /// ExtendedTokenMode - The lexer can optionally keep comments and whitespace |
116 | | /// and return them as tokens. This is used for -C and -CC modes, and |
117 | | /// whitespace preservation can be useful for some clients that want to lex |
118 | | /// the file in raw mode and get every character from the file. |
119 | | /// |
120 | | /// When this is set to 2 it returns comments and whitespace. When set to 1 |
121 | | /// it returns comments, when it is set to 0 it returns normal tokens only. |
122 | | unsigned char ExtendedTokenMode; |
123 | | |
124 | | //===--------------------------------------------------------------------===// |
125 | | // Context that changes as the file is lexed. |
126 | | // NOTE: any state that mutates when in raw mode must have save/restore code |
127 | | // in Lexer::isNextPPTokenLParen. |
128 | | |
129 | | // BufferPtr - Current pointer into the buffer. This is the next character |
130 | | // to be lexed. |
131 | | const char *BufferPtr; |
132 | | |
133 | | // IsAtStartOfLine - True if the next lexed token should get the "start of |
134 | | // line" flag set on it. |
135 | | bool IsAtStartOfLine; |
136 | | |
137 | | bool IsAtPhysicalStartOfLine; |
138 | | |
139 | | bool HasLeadingSpace; |
140 | | |
141 | | bool HasLeadingEmptyMacro; |
142 | | |
143 | | /// True if this is the first time we're lexing the input file. |
144 | | bool IsFirstTimeLexingFile; |
145 | | |
146 | | // NewLinePtr - A pointer to new line character '\n' being lexed. For '\r\n', |
147 | | // it also points to '\n.' |
148 | | const char *NewLinePtr; |
149 | | |
150 | | // CurrentConflictMarkerState - The kind of conflict marker we are handling. |
151 | | ConflictMarkerKind CurrentConflictMarkerState; |
152 | | |
153 | | /// Non-empty if this \p Lexer is \p isDependencyDirectivesLexer(). |
154 | | ArrayRef<dependency_directives_scan::Directive> DepDirectives; |
155 | | |
156 | | /// If this \p Lexer is \p isDependencyDirectivesLexer(), it represents the |
157 | | /// next token to use from the current dependency directive. |
158 | | unsigned NextDepDirectiveTokenIndex = 0; |
159 | | |
160 | | void InitLexer(const char *BufStart, const char *BufPtr, const char *BufEnd); |
161 | | |
162 | | public: |
163 | | /// Lexer constructor - Create a new lexer object for the specified buffer |
164 | | /// with the specified preprocessor managing the lexing process. This lexer |
165 | | /// assumes that the associated file buffer and Preprocessor objects will |
166 | | /// outlive it, so it doesn't take ownership of either of them. |
167 | | Lexer(FileID FID, const llvm::MemoryBufferRef &InputFile, Preprocessor &PP, |
168 | | bool IsFirstIncludeOfFile = true); |
169 | | |
170 | | /// Lexer constructor - Create a new raw lexer object. This object is only |
171 | | /// suitable for calls to 'LexFromRawLexer'. This lexer assumes that the |
172 | | /// text range will outlive it, so it doesn't take ownership of it. |
173 | | Lexer(SourceLocation FileLoc, const LangOptions &LangOpts, |
174 | | const char *BufStart, const char *BufPtr, const char *BufEnd, |
175 | | bool IsFirstIncludeOfFile = true); |
176 | | |
177 | | /// Lexer constructor - Create a new raw lexer object. This object is only |
178 | | /// suitable for calls to 'LexFromRawLexer'. This lexer assumes that the |
179 | | /// text range will outlive it, so it doesn't take ownership of it. |
180 | | Lexer(FileID FID, const llvm::MemoryBufferRef &FromFile, |
181 | | const SourceManager &SM, const LangOptions &LangOpts, |
182 | | bool IsFirstIncludeOfFile = true); |
183 | | |
184 | | Lexer(const Lexer &) = delete; |
185 | | Lexer &operator=(const Lexer &) = delete; |
186 | | |
187 | | /// Create_PragmaLexer: Lexer constructor - Create a new lexer object for |
188 | | /// _Pragma expansion. This has a variety of magic semantics that this method |
189 | | /// sets up. It returns a new'd Lexer that must be delete'd when done. |
190 | | static Lexer *Create_PragmaLexer(SourceLocation SpellingLoc, |
191 | | SourceLocation ExpansionLocStart, |
192 | | SourceLocation ExpansionLocEnd, |
193 | | unsigned TokLen, Preprocessor &PP); |
194 | | |
195 | | /// getFileLoc - Return the File Location for the file we are lexing out of. |
196 | | /// The physical location encodes the location where the characters come from, |
197 | | /// the virtual location encodes where we should *claim* the characters came |
198 | | /// from. Currently this is only used by _Pragma handling. |
199 | 1.81M | SourceLocation getFileLoc() const { return FileLoc; } |
200 | | |
201 | | private: |
202 | | /// Lex - Return the next token in the file. If this is the end of file, it |
203 | | /// return the tok::eof token. This implicitly involves the preprocessor. |
204 | | bool Lex(Token &Result); |
205 | | |
206 | | /// Called when the preprocessor is in 'dependency scanning lexing mode'. |
207 | | bool LexDependencyDirectiveToken(Token &Result); |
208 | | |
209 | | /// Called when the preprocessor is in 'dependency scanning lexing mode' and |
210 | | /// is skipping a conditional block. |
211 | | bool LexDependencyDirectiveTokenWhileSkipping(Token &Result); |
212 | | |
213 | | /// True when the preprocessor is in 'dependency scanning lexing mode' and |
214 | | /// created this \p Lexer for lexing a set of dependency directive tokens. |
215 | 1.58G | bool isDependencyDirectivesLexer() const { return !DepDirectives.empty(); } |
216 | | |
217 | | /// Initializes \p Result with data from \p DDTok and advances \p BufferPtr to |
218 | | /// the position just after the token. |
219 | | /// \returns the buffer pointer at the beginning of the token. |
220 | | const char *convertDependencyDirectiveToken( |
221 | | const dependency_directives_scan::Token &DDTok, Token &Result); |
222 | | |
223 | | public: |
224 | | /// isPragmaLexer - Returns true if this Lexer is being used to lex a pragma. |
225 | 4.44M | bool isPragmaLexer() const { return Is_PragmaLexer; } |
226 | | |
227 | | private: |
228 | | /// IndirectLex - An indirect call to 'Lex' that can be invoked via |
229 | | /// the PreprocessorLexer interface. |
230 | 13.0k | void IndirectLex(Token &Result) override { Lex(Result); } |
231 | | |
232 | | public: |
233 | | /// LexFromRawLexer - Lex a token from a designated raw lexer (one with no |
234 | | /// associated preprocessor object. Return true if the 'next character to |
235 | | /// read' pointer points at the end of the lexer buffer, false otherwise. |
236 | 70.6M | bool LexFromRawLexer(Token &Result) { |
237 | 70.6M | assert(LexingRawMode && "Not already in raw mode!"); |
238 | 0 | Lex(Result); |
239 | | // Note that lexing to the end of the buffer doesn't implicitly delete the |
240 | | // lexer when in raw mode. |
241 | 70.6M | return BufferPtr == BufferEnd; |
242 | 70.6M | } |
243 | | |
244 | | /// isKeepWhitespaceMode - Return true if the lexer should return tokens for |
245 | | /// every character in the file, including whitespace and comments. This |
246 | | /// should only be used in raw mode, as the preprocessor is not prepared to |
247 | | /// deal with the excess tokens. |
248 | 792M | bool isKeepWhitespaceMode() const { |
249 | 792M | return ExtendedTokenMode > 1; |
250 | 792M | } |
251 | | |
252 | | /// SetKeepWhitespaceMode - This method lets clients enable or disable |
253 | | /// whitespace retention mode. |
254 | 78.7M | void SetKeepWhitespaceMode(bool Val) { |
255 | 78.7M | assert((!Val || LexingRawMode || LangOpts.TraditionalCPP) && |
256 | 78.7M | "Can only retain whitespace in raw mode or -traditional-cpp"); |
257 | 78.7M | ExtendedTokenMode = Val ? 280.8k : 078.6M ; |
258 | 78.7M | } |
259 | | |
260 | | /// inKeepCommentMode - Return true if the lexer should return comments as |
261 | | /// tokens. |
262 | 91.6M | bool inKeepCommentMode() const { |
263 | 91.6M | return ExtendedTokenMode > 0; |
264 | 91.6M | } |
265 | | |
266 | | /// SetCommentRetentionMode - Change the comment retention mode of the lexer |
267 | | /// to the specified mode. This is really only useful when lexing in raw |
268 | | /// mode, because otherwise the lexer needs to manage this. |
269 | 180M | void SetCommentRetentionState(bool Mode) { |
270 | 180M | assert(!isKeepWhitespaceMode() && |
271 | 180M | "Can't play with comment retention state when retaining whitespace"); |
272 | 180M | ExtendedTokenMode = Mode ? 150.8M : 0129M ; |
273 | 180M | } |
274 | | |
275 | | /// Sets the extended token mode back to its initial value, according to the |
276 | | /// language options and preprocessor. This controls whether the lexer |
277 | | /// produces comment and whitespace tokens. |
278 | | /// |
279 | | /// This requires the lexer to have an associated preprocessor. A standalone |
280 | | /// lexer has nothing to reset to. |
281 | | void resetExtendedTokenMode(); |
282 | | |
283 | | /// Gets source code buffer. |
284 | 6.70k | StringRef getBuffer() const { |
285 | 6.70k | return StringRef(BufferStart, BufferEnd - BufferStart); |
286 | 6.70k | } |
287 | | |
288 | | /// ReadToEndOfLine - Read the rest of the current preprocessor line as an |
289 | | /// uninterpreted string. This switches the lexer out of directive mode. |
290 | | void ReadToEndOfLine(SmallVectorImpl<char> *Result = nullptr); |
291 | | |
292 | | |
293 | | /// Diag - Forwarding function for diagnostics. This translate a source |
294 | | /// position in the current buffer into a SourceLocation object for rendering. |
295 | | DiagnosticBuilder Diag(const char *Loc, unsigned DiagID) const; |
296 | | |
297 | | /// getSourceLocation - Return a source location identifier for the specified |
298 | | /// offset in the current file. |
299 | | SourceLocation getSourceLocation(const char *Loc, unsigned TokLen = 1) const; |
300 | | |
301 | | /// getSourceLocation - Return a source location for the next character in |
302 | | /// the current file. |
303 | 1.97M | SourceLocation getSourceLocation() override { |
304 | 1.97M | return getSourceLocation(BufferPtr); |
305 | 1.97M | } |
306 | | |
307 | | /// Return the current location in the buffer. |
308 | 16.7M | const char *getBufferLocation() const { return BufferPtr; } |
309 | | |
310 | | /// Returns the current lexing offset. |
311 | 3.93M | unsigned getCurrentBufferOffset() { |
312 | 3.93M | assert(BufferPtr >= BufferStart && "Invalid buffer state"); |
313 | 0 | return BufferPtr - BufferStart; |
314 | 3.93M | } |
315 | | |
316 | | /// Set the lexer's buffer pointer to \p Offset. |
317 | | void seek(unsigned Offset, bool IsAtStartOfLine); |
318 | | |
319 | | /// Stringify - Convert the specified string into a C string by i) escaping |
320 | | /// '\\' and " characters and ii) replacing newline character(s) with "\\n". |
321 | | /// If Charify is true, this escapes the ' character instead of ". |
322 | | static std::string Stringify(StringRef Str, bool Charify = false); |
323 | | |
324 | | /// Stringify - Convert the specified string into a C string by i) escaping |
325 | | /// '\\' and " characters and ii) replacing newline character(s) with "\\n". |
326 | | static void Stringify(SmallVectorImpl<char> &Str); |
327 | | |
328 | | /// getSpelling - This method is used to get the spelling of a token into a |
329 | | /// preallocated buffer, instead of as an std::string. The caller is required |
330 | | /// to allocate enough space for the token, which is guaranteed to be at least |
331 | | /// Tok.getLength() bytes long. The length of the actual result is returned. |
332 | | /// |
333 | | /// Note that this method may do two possible things: it may either fill in |
334 | | /// the buffer specified with characters, or it may *change the input pointer* |
335 | | /// to point to a constant buffer with the data already in it (avoiding a |
336 | | /// copy). The caller is not allowed to modify the returned buffer pointer |
337 | | /// if an internal buffer is returned. |
338 | | static unsigned getSpelling(const Token &Tok, const char *&Buffer, |
339 | | const SourceManager &SourceMgr, |
340 | | const LangOptions &LangOpts, |
341 | | bool *Invalid = nullptr); |
342 | | |
343 | | /// getSpelling() - Return the 'spelling' of the Tok token. The spelling of a |
344 | | /// token is the characters used to represent the token in the source file |
345 | | /// after trigraph expansion and escaped-newline folding. In particular, this |
346 | | /// wants to get the true, uncanonicalized, spelling of things like digraphs |
347 | | /// UCNs, etc. |
348 | | static std::string getSpelling(const Token &Tok, |
349 | | const SourceManager &SourceMgr, |
350 | | const LangOptions &LangOpts, |
351 | | bool *Invalid = nullptr); |
352 | | |
353 | | /// getSpelling - This method is used to get the spelling of the |
354 | | /// token at the given source location. If, as is usually true, it |
355 | | /// is not necessary to copy any data, then the returned string may |
356 | | /// not point into the provided buffer. |
357 | | /// |
358 | | /// This method lexes at the expansion depth of the given |
359 | | /// location and does not jump to the expansion or spelling |
360 | | /// location. |
361 | | static StringRef getSpelling(SourceLocation loc, |
362 | | SmallVectorImpl<char> &buffer, |
363 | | const SourceManager &SM, |
364 | | const LangOptions &options, |
365 | | bool *invalid = nullptr); |
366 | | |
367 | | /// MeasureTokenLength - Relex the token at the specified location and return |
368 | | /// its length in bytes in the input file. If the token needs cleaning (e.g. |
369 | | /// includes a trigraph or an escaped newline) then this count includes bytes |
370 | | /// that are part of that. |
371 | | static unsigned MeasureTokenLength(SourceLocation Loc, |
372 | | const SourceManager &SM, |
373 | | const LangOptions &LangOpts); |
374 | | |
375 | | /// Relex the token at the specified location. |
376 | | /// \returns true if there was a failure, false on success. |
377 | | static bool getRawToken(SourceLocation Loc, Token &Result, |
378 | | const SourceManager &SM, |
379 | | const LangOptions &LangOpts, |
380 | | bool IgnoreWhiteSpace = false); |
381 | | |
382 | | /// Given a location any where in a source buffer, find the location |
383 | | /// that corresponds to the beginning of the token in which the original |
384 | | /// source location lands. |
385 | | static SourceLocation GetBeginningOfToken(SourceLocation Loc, |
386 | | const SourceManager &SM, |
387 | | const LangOptions &LangOpts); |
388 | | |
389 | | /// Get the physical length (including trigraphs and escaped newlines) of the |
390 | | /// first \p Characters characters of the token starting at TokStart. |
391 | | static unsigned getTokenPrefixLength(SourceLocation TokStart, |
392 | | unsigned CharNo, |
393 | | const SourceManager &SM, |
394 | | const LangOptions &LangOpts); |
395 | | |
396 | | /// AdvanceToTokenCharacter - If the current SourceLocation specifies a |
397 | | /// location at the start of a token, return a new location that specifies a |
398 | | /// character within the token. This handles trigraphs and escaped newlines. |
399 | | static SourceLocation AdvanceToTokenCharacter(SourceLocation TokStart, |
400 | | unsigned Characters, |
401 | | const SourceManager &SM, |
402 | 60.8k | const LangOptions &LangOpts) { |
403 | 60.8k | return TokStart.getLocWithOffset( |
404 | 60.8k | getTokenPrefixLength(TokStart, Characters, SM, LangOpts)); |
405 | 60.8k | } |
406 | | |
407 | | /// Computes the source location just past the end of the |
408 | | /// token at this source location. |
409 | | /// |
410 | | /// This routine can be used to produce a source location that |
411 | | /// points just past the end of the token referenced by \p Loc, and |
412 | | /// is generally used when a diagnostic needs to point just after a |
413 | | /// token where it expected something different that it received. If |
414 | | /// the returned source location would not be meaningful (e.g., if |
415 | | /// it points into a macro), this routine returns an invalid |
416 | | /// source location. |
417 | | /// |
418 | | /// \param Offset an offset from the end of the token, where the source |
419 | | /// location should refer to. The default offset (0) produces a source |
420 | | /// location pointing just past the end of the token; an offset of 1 produces |
421 | | /// a source location pointing to the last character in the token, etc. |
422 | | static SourceLocation getLocForEndOfToken(SourceLocation Loc, unsigned Offset, |
423 | | const SourceManager &SM, |
424 | | const LangOptions &LangOpts); |
425 | | |
426 | | /// Given a token range, produce a corresponding CharSourceRange that |
427 | | /// is not a token range. This allows the source range to be used by |
428 | | /// components that don't have access to the lexer and thus can't find the |
429 | | /// end of the range for themselves. |
430 | | static CharSourceRange getAsCharRange(SourceRange Range, |
431 | | const SourceManager &SM, |
432 | 6.48k | const LangOptions &LangOpts) { |
433 | 6.48k | SourceLocation End = getLocForEndOfToken(Range.getEnd(), 0, SM, LangOpts); |
434 | 6.48k | return End.isInvalid() ? CharSourceRange()1 |
435 | 6.48k | : CharSourceRange::getCharRange( |
436 | 6.48k | Range.getBegin(), End); |
437 | 6.48k | } |
438 | | static CharSourceRange getAsCharRange(CharSourceRange Range, |
439 | | const SourceManager &SM, |
440 | 2.07k | const LangOptions &LangOpts) { |
441 | 2.07k | return Range.isTokenRange() |
442 | 2.07k | ? getAsCharRange(Range.getAsRange(), SM, LangOpts)2.03k |
443 | 2.07k | : Range41 ; |
444 | 2.07k | } |
445 | | |
446 | | /// Returns true if the given MacroID location points at the first |
447 | | /// token of the macro expansion. |
448 | | /// |
449 | | /// \param MacroBegin If non-null and function returns true, it is set to |
450 | | /// begin location of the macro. |
451 | | static bool isAtStartOfMacroExpansion(SourceLocation loc, |
452 | | const SourceManager &SM, |
453 | | const LangOptions &LangOpts, |
454 | | SourceLocation *MacroBegin = nullptr); |
455 | | |
456 | | /// Returns true if the given MacroID location points at the last |
457 | | /// token of the macro expansion. |
458 | | /// |
459 | | /// \param MacroEnd If non-null and function returns true, it is set to |
460 | | /// end location of the macro. |
461 | | static bool isAtEndOfMacroExpansion(SourceLocation loc, |
462 | | const SourceManager &SM, |
463 | | const LangOptions &LangOpts, |
464 | | SourceLocation *MacroEnd = nullptr); |
465 | | |
466 | | /// Accepts a range and returns a character range with file locations. |
467 | | /// |
468 | | /// Returns a null range if a part of the range resides inside a macro |
469 | | /// expansion or the range does not reside on the same FileID. |
470 | | /// |
471 | | /// This function is trying to deal with macros and return a range based on |
472 | | /// file locations. The cases where it can successfully handle macros are: |
473 | | /// |
474 | | /// -begin or end range lies at the start or end of a macro expansion, in |
475 | | /// which case the location will be set to the expansion point, e.g: |
476 | | /// \#define M 1 2 |
477 | | /// a M |
478 | | /// If you have a range [a, 2] (where 2 came from the macro), the function |
479 | | /// will return a range for "a M" |
480 | | /// if you have range [a, 1], the function will fail because the range |
481 | | /// overlaps with only a part of the macro |
482 | | /// |
483 | | /// -The macro is a function macro and the range can be mapped to the macro |
484 | | /// arguments, e.g: |
485 | | /// \#define M 1 2 |
486 | | /// \#define FM(x) x |
487 | | /// FM(a b M) |
488 | | /// if you have range [b, 2], the function will return the file range "b M" |
489 | | /// inside the macro arguments. |
490 | | /// if you have range [a, 2], the function will return the file range |
491 | | /// "FM(a b M)" since the range includes all of the macro expansion. |
492 | | static CharSourceRange makeFileCharRange(CharSourceRange Range, |
493 | | const SourceManager &SM, |
494 | | const LangOptions &LangOpts); |
495 | | |
496 | | /// Returns a string for the source that the range encompasses. |
497 | | static StringRef getSourceText(CharSourceRange Range, |
498 | | const SourceManager &SM, |
499 | | const LangOptions &LangOpts, |
500 | | bool *Invalid = nullptr); |
501 | | |
502 | | /// Retrieve the name of the immediate macro expansion. |
503 | | /// |
504 | | /// This routine starts from a source location, and finds the name of the macro |
505 | | /// responsible for its immediate expansion. It looks through any intervening |
506 | | /// macro argument expansions to compute this. It returns a StringRef which |
507 | | /// refers to the SourceManager-owned buffer of the source where that macro |
508 | | /// name is spelled. Thus, the result shouldn't out-live that SourceManager. |
509 | | static StringRef getImmediateMacroName(SourceLocation Loc, |
510 | | const SourceManager &SM, |
511 | | const LangOptions &LangOpts); |
512 | | |
513 | | /// Retrieve the name of the immediate macro expansion. |
514 | | /// |
515 | | /// This routine starts from a source location, and finds the name of the |
516 | | /// macro responsible for its immediate expansion. It looks through any |
517 | | /// intervening macro argument expansions to compute this. It returns a |
518 | | /// StringRef which refers to the SourceManager-owned buffer of the source |
519 | | /// where that macro name is spelled. Thus, the result shouldn't out-live |
520 | | /// that SourceManager. |
521 | | /// |
522 | | /// This differs from Lexer::getImmediateMacroName in that any macro argument |
523 | | /// location will result in the topmost function macro that accepted it. |
524 | | /// e.g. |
525 | | /// \code |
526 | | /// MAC1( MAC2(foo) ) |
527 | | /// \endcode |
528 | | /// for location of 'foo' token, this function will return "MAC1" while |
529 | | /// Lexer::getImmediateMacroName will return "MAC2". |
530 | | static StringRef getImmediateMacroNameForDiagnostics( |
531 | | SourceLocation Loc, const SourceManager &SM, const LangOptions &LangOpts); |
532 | | |
533 | | /// Compute the preamble of the given file. |
534 | | /// |
535 | | /// The preamble of a file contains the initial comments, include directives, |
536 | | /// and other preprocessor directives that occur before the code in this |
537 | | /// particular file actually begins. The preamble of the main source file is |
538 | | /// a potential prefix header. |
539 | | /// |
540 | | /// \param Buffer The memory buffer containing the file's contents. |
541 | | /// |
542 | | /// \param MaxLines If non-zero, restrict the length of the preamble |
543 | | /// to fewer than this number of lines. |
544 | | /// |
545 | | /// \returns The offset into the file where the preamble ends and the rest |
546 | | /// of the file begins along with a boolean value indicating whether |
547 | | /// the preamble ends at the beginning of a new line. |
548 | | static PreambleBounds ComputePreamble(StringRef Buffer, |
549 | | const LangOptions &LangOpts, |
550 | | unsigned MaxLines = 0); |
551 | | |
552 | | /// Finds the token that comes right after the given location. |
553 | | /// |
554 | | /// Returns the next token, or none if the location is inside a macro. |
555 | | static Optional<Token> findNextToken(SourceLocation Loc, |
556 | | const SourceManager &SM, |
557 | | const LangOptions &LangOpts); |
558 | | |
559 | | /// Checks that the given token is the first token that occurs after |
560 | | /// the given location (this excludes comments and whitespace). Returns the |
561 | | /// location immediately after the specified token. If the token is not found |
562 | | /// or the location is inside a macro, the returned source location will be |
563 | | /// invalid. |
564 | | static SourceLocation findLocationAfterToken(SourceLocation loc, |
565 | | tok::TokenKind TKind, |
566 | | const SourceManager &SM, |
567 | | const LangOptions &LangOpts, |
568 | | bool SkipTrailingWhitespaceAndNewLine); |
569 | | |
570 | | /// Returns true if the given character could appear in an identifier. |
571 | | static bool isAsciiIdentifierContinueChar(char c, |
572 | | const LangOptions &LangOpts); |
573 | | |
574 | | /// Checks whether new line pointed by Str is preceded by escape |
575 | | /// sequence. |
576 | | static bool isNewLineEscaped(const char *BufferStart, const char *Str); |
577 | | |
578 | | /// getCharAndSizeNoWarn - Like the getCharAndSize method, but does not ever |
579 | | /// emit a warning. |
580 | | static inline char getCharAndSizeNoWarn(const char *Ptr, unsigned &Size, |
581 | 315k | const LangOptions &LangOpts) { |
582 | | // If this is not a trigraph and not a UCN or escaped newline, return |
583 | | // quickly. |
584 | 315k | if (isObviouslySimpleCharacter(Ptr[0])) { |
585 | 304k | Size = 1; |
586 | 304k | return *Ptr; |
587 | 304k | } |
588 | | |
589 | 11.4k | Size = 0; |
590 | 11.4k | return getCharAndSizeSlowNoWarn(Ptr, Size, LangOpts); |
591 | 315k | } |
592 | | |
593 | | /// Returns the leading whitespace for line that corresponds to the given |
594 | | /// location \p Loc. |
595 | | static StringRef getIndentationForLine(SourceLocation Loc, |
596 | | const SourceManager &SM); |
597 | | |
598 | | /// Check if this is the first time we're lexing the input file. |
599 | 14 | bool isFirstTimeLexingFile() const { return IsFirstTimeLexingFile; } |
600 | | |
601 | | private: |
602 | | //===--------------------------------------------------------------------===// |
603 | | // Internal implementation interfaces. |
604 | | |
605 | | /// LexTokenInternal - Internal interface to lex a preprocessing token. Called |
606 | | /// by Lex. |
607 | | /// |
608 | | bool LexTokenInternal(Token &Result, bool TokAtPhysicalStartOfLine); |
609 | | |
610 | | bool CheckUnicodeWhitespace(Token &Result, uint32_t C, const char *CurPtr); |
611 | | |
612 | | bool LexUnicodeIdentifierStart(Token &Result, uint32_t C, const char *CurPtr); |
613 | | |
614 | | /// FormTokenWithChars - When we lex a token, we have identified a span |
615 | | /// starting at BufferPtr, going to TokEnd that forms the token. This method |
616 | | /// takes that range and assigns it to the token as its location and size. In |
617 | | /// addition, since tokens cannot overlap, this also updates BufferPtr to be |
618 | | /// TokEnd. |
619 | | void FormTokenWithChars(Token &Result, const char *TokEnd, |
620 | 1.56G | tok::TokenKind Kind) { |
621 | 1.56G | unsigned TokLen = TokEnd-BufferPtr; |
622 | 1.56G | Result.setLength(TokLen); |
623 | 1.56G | Result.setLocation(getSourceLocation(BufferPtr, TokLen)); |
624 | 1.56G | Result.setKind(Kind); |
625 | 1.56G | BufferPtr = TokEnd; |
626 | 1.56G | } |
627 | | |
628 | | /// isNextPPTokenLParen - Return 1 if the next unexpanded token will return a |
629 | | /// tok::l_paren token, 0 if it is something else and 2 if there are no more |
630 | | /// tokens in the buffer controlled by this lexer. |
631 | | unsigned isNextPPTokenLParen(); |
632 | | |
633 | | //===--------------------------------------------------------------------===// |
634 | | // Lexer character reading interfaces. |
635 | | |
636 | | // This lexer is built on two interfaces for reading characters, both of which |
637 | | // automatically provide phase 1/2 translation. getAndAdvanceChar is used |
638 | | // when we know that we will be reading a character from the input buffer and |
639 | | // that this character will be part of the result token. This occurs in (f.e.) |
640 | | // string processing, because we know we need to read until we find the |
641 | | // closing '"' character. |
642 | | // |
643 | | // The second interface is the combination of getCharAndSize with |
644 | | // ConsumeChar. getCharAndSize reads a phase 1/2 translated character, |
645 | | // returning it and its size. If the lexer decides that this character is |
646 | | // part of the current token, it calls ConsumeChar on it. This two stage |
647 | | // approach allows us to emit diagnostics for characters (e.g. warnings about |
648 | | // trigraphs), knowing that they only are emitted if the character is |
649 | | // consumed. |
650 | | |
651 | | /// isObviouslySimpleCharacter - Return true if the specified character is |
652 | | /// obviously the same in translation phase 1 and translation phase 3. This |
653 | | /// can return false for characters that end up being the same, but it will |
654 | | /// never return true for something that needs to be mapped. |
655 | 3.08G | static bool isObviouslySimpleCharacter(char C) { |
656 | 3.08G | return C != '?' && C != '\\'3.08G ; |
657 | 3.08G | } |
658 | | |
  /// getAndAdvanceChar - Read a single 'character' from the specified buffer,
  /// advance over it, and return it.  This is tricky in several cases.  Here we
  /// just handle the trivial case and fall-back to the non-inlined
  /// getCharAndSizeSlow method to handle the hard case.
  inline char getAndAdvanceChar(const char *&Ptr, Token &Tok) {
    // If this is not a trigraph and not a UCN or escaped newline, return
    // quickly.
    if (isObviouslySimpleCharacter(Ptr[0])) return *Ptr++;

    // Slow path: the byte may begin a trigraph, escaped newline, or UCN.
    // getCharAndSizeSlow decodes it, reports how many bytes it spanned, and
    // is given &Tok so diagnostics/flags can be attached to the token.
    unsigned Size = 0;
    char C = getCharAndSizeSlow(Ptr, Size, &Tok);
    Ptr += Size;
    return C;
  }
673 | | |
  /// ConsumeChar - When a character (identified by getCharAndSize) is consumed
  /// and added to a given token, check to see if there are diagnostics that
  /// need to be emitted or flags that need to be set on the token.  If so, do
  /// it.
  /// \return Pointer just past the consumed character.
  const char *ConsumeChar(const char *Ptr, unsigned Size, Token &Tok) {
    // Normal case, we consumed exactly one token.  Just return it.
    if (Size == 1)
      return Ptr+Size;

    // Otherwise, re-lex the character with a current token, allowing
    // diagnostics to be emitted and flags to be set.
    Size = 0;
    getCharAndSizeSlow(Ptr, Size, &Tok);
    return Ptr+Size;
  }
689 | | |
  /// getCharAndSize - Peek a single 'character' from the specified buffer,
  /// get its size, and return it.  This is tricky in several cases.  Here we
  /// just handle the trivial case and fall-back to the non-inlined
  /// getCharAndSizeSlow method to handle the hard case.
  inline char getCharAndSize(const char *Ptr, unsigned &Size) {
    // If this is not a trigraph and not a UCN or escaped newline, return
    // quickly.
    if (isObviouslySimpleCharacter(Ptr[0])) {
      Size = 1;
      return *Ptr;
    }

    // Slow path: decode the trigraph/escaped-newline/UCN without consuming
    // it; no Token is passed, so no diagnostics are emitted here.
    Size = 0;
    return getCharAndSizeSlow(Ptr, Size);
  }
705 | | |
706 | | /// getCharAndSizeSlow - Handle the slow/uncommon case of the getCharAndSize |
707 | | /// method. |
708 | | char getCharAndSizeSlow(const char *Ptr, unsigned &Size, |
709 | | Token *Tok = nullptr); |
710 | | |
711 | | /// getEscapedNewLineSize - Return the size of the specified escaped newline, |
712 | | /// or 0 if it is not an escaped newline. P[-1] is known to be a "\" on entry |
713 | | /// to this function. |
714 | | static unsigned getEscapedNewLineSize(const char *P); |
715 | | |
716 | | /// SkipEscapedNewLines - If P points to an escaped newline (or a series of |
717 | | /// them), skip over them and return the first non-escaped-newline found, |
718 | | /// otherwise return P. |
719 | | static const char *SkipEscapedNewLines(const char *P); |
720 | | |
721 | | /// getCharAndSizeSlowNoWarn - Same as getCharAndSizeSlow, but never emits a |
722 | | /// diagnostic. |
723 | | static char getCharAndSizeSlowNoWarn(const char *Ptr, unsigned &Size, |
724 | | const LangOptions &LangOpts); |
725 | | |
726 | | //===--------------------------------------------------------------------===// |
727 | | // Other lexer functions. |
728 | | |
729 | | void SetByteOffset(unsigned Offset, bool StartOfLine); |
730 | | |
731 | | void PropagateLineStartLeadingSpaceInfo(Token &Result); |
732 | | |
733 | | const char *LexUDSuffix(Token &Result, const char *CurPtr, |
734 | | bool IsStringLiteral); |
735 | | |
736 | | // Helper functions to lex the remainder of a token of the specific type. |
737 | | |
738 | | // This function handles both ASCII and Unicode identifiers after |
739 | | // the first codepoint of the identifyier has been parsed. |
740 | | bool LexIdentifierContinue(Token &Result, const char *CurPtr); |
741 | | |
742 | | bool LexNumericConstant (Token &Result, const char *CurPtr); |
743 | | bool LexStringLiteral (Token &Result, const char *CurPtr, |
744 | | tok::TokenKind Kind); |
745 | | bool LexRawStringLiteral (Token &Result, const char *CurPtr, |
746 | | tok::TokenKind Kind); |
747 | | bool LexAngledStringLiteral(Token &Result, const char *CurPtr); |
748 | | bool LexCharConstant (Token &Result, const char *CurPtr, |
749 | | tok::TokenKind Kind); |
750 | | bool LexEndOfFile (Token &Result, const char *CurPtr); |
751 | | bool SkipWhitespace (Token &Result, const char *CurPtr, |
752 | | bool &TokAtPhysicalStartOfLine); |
753 | | bool SkipLineComment (Token &Result, const char *CurPtr, |
754 | | bool &TokAtPhysicalStartOfLine); |
755 | | bool SkipBlockComment (Token &Result, const char *CurPtr, |
756 | | bool &TokAtPhysicalStartOfLine); |
757 | | bool SaveLineComment (Token &Result, const char *CurPtr); |
758 | | |
759 | | bool IsStartOfConflictMarker(const char *CurPtr); |
760 | | bool HandleEndOfConflictMarker(const char *CurPtr); |
761 | | |
762 | | bool lexEditorPlaceholder(Token &Result, const char *CurPtr); |
763 | | |
764 | | bool isCodeCompletionPoint(const char *CurPtr) const; |
765 | 66 | void cutOffLexing() { BufferPtr = BufferEnd; } |
766 | | |
767 | | bool isHexaLiteral(const char *Start, const LangOptions &LangOpts); |
768 | | |
769 | | void codeCompleteIncludedFile(const char *PathStart, |
770 | | const char *CompletionPoint, bool IsAngled); |
771 | | |
772 | | llvm::Optional<uint32_t> |
773 | | tryReadNumericUCN(const char *&StartPtr, const char *SlashLoc, Token *Result); |
774 | | llvm::Optional<uint32_t> tryReadNamedUCN(const char *&StartPtr, |
775 | | Token *Result); |
776 | | |
777 | | /// Read a universal character name. |
778 | | /// |
779 | | /// \param StartPtr The position in the source buffer after the initial '\'. |
780 | | /// If the UCN is syntactically well-formed (but not |
781 | | /// necessarily valid), this parameter will be updated to |
782 | | /// point to the character after the UCN. |
783 | | /// \param SlashLoc The position in the source buffer of the '\'. |
784 | | /// \param Result The token being formed. Pass \c nullptr to suppress |
785 | | /// diagnostics and handle token formation in the caller. |
786 | | /// |
787 | | /// \return The Unicode codepoint specified by the UCN, or 0 if the UCN is |
788 | | /// invalid. |
789 | | uint32_t tryReadUCN(const char *&StartPtr, const char *SlashLoc, Token *Result); |
790 | | |
791 | | /// Try to consume a UCN as part of an identifier at the current |
792 | | /// location. |
793 | | /// \param CurPtr Initially points to the range of characters in the source |
794 | | /// buffer containing the '\'. Updated to point past the end of |
795 | | /// the UCN on success. |
796 | | /// \param Size The number of characters occupied by the '\' (including |
797 | | /// trigraphs and escaped newlines). |
798 | | /// \param Result The token being produced. Marked as containing a UCN on |
799 | | /// success. |
800 | | /// \return \c true if a UCN was lexed and it produced an acceptable |
801 | | /// identifier character, \c false otherwise. |
802 | | bool tryConsumeIdentifierUCN(const char *&CurPtr, unsigned Size, |
803 | | Token &Result); |
804 | | |
805 | | /// Try to consume an identifier character encoded in UTF-8. |
806 | | /// \param CurPtr Points to the start of the (potential) UTF-8 code unit |
807 | | /// sequence. On success, updated to point past the end of it. |
808 | | /// \return \c true if a UTF-8 sequence mapping to an acceptable identifier |
809 | | /// character was lexed, \c false otherwise. |
810 | | bool tryConsumeIdentifierUTF8Char(const char *&CurPtr); |
811 | | }; |
812 | | |
813 | | } // namespace clang |
814 | | |
815 | | #endif // LLVM_CLANG_LEX_LEXER_H |