/Users/buildslave/jenkins/workspace/coverage/llvm-project/clang/include/clang/Lex/Lexer.h
Line | Count | Source (jump to first uncovered line) |
1 | | //===- Lexer.h - C Language Family Lexer ------------------------*- C++ -*-===// |
2 | | // |
3 | | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
4 | | // See https://llvm.org/LICENSE.txt for license information. |
5 | | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
6 | | // |
7 | | //===----------------------------------------------------------------------===// |
8 | | // |
9 | | // This file defines the Lexer interface. |
10 | | // |
11 | | //===----------------------------------------------------------------------===// |
12 | | |
13 | | #ifndef LLVM_CLANG_LEX_LEXER_H |
14 | | #define LLVM_CLANG_LEX_LEXER_H |
15 | | |
16 | | #include "clang/Basic/LangOptions.h" |
17 | | #include "clang/Basic/SourceLocation.h" |
18 | | #include "clang/Basic/TokenKinds.h" |
19 | | #include "clang/Lex/PreprocessorLexer.h" |
20 | | #include "clang/Lex/Token.h" |
21 | | #include "llvm/ADT/Optional.h" |
22 | | #include "llvm/ADT/SmallVector.h" |
23 | | #include "llvm/ADT/StringRef.h" |
24 | | #include <cassert> |
25 | | #include <cstdint> |
26 | | #include <string> |
27 | | |
28 | | namespace llvm { |
29 | | |
30 | | class MemoryBufferRef; |
31 | | |
32 | | } // namespace llvm |
33 | | |
34 | | namespace clang { |
35 | | |
36 | | class DiagnosticBuilder; |
37 | | class Preprocessor; |
38 | | class SourceManager; |
39 | | class LangOptions; |
40 | | |
41 | | /// ConflictMarkerKind - Kinds of conflict marker which the lexer might be |
42 | | /// recovering from. |
43 | | enum ConflictMarkerKind { |
44 | | /// Not within a conflict marker. |
45 | | CMK_None, |
46 | | |
47 | | /// A normal or diff3 conflict marker, initiated by at least 7 "<"s, |
48 | | /// separated by at least 7 "="s or "|"s, and terminated by at least 7 ">"s. |
49 | | CMK_Normal, |
50 | | |
51 | | /// A Perforce-style conflict marker, initiated by 4 ">"s, |
52 | | /// separated by 4 "="s, and terminated by 4 "<"s. |
53 | | CMK_Perforce |
54 | | }; |
55 | | |
56 | | /// Describes the bounds (start, size) of the preamble and a flag required by |
57 | | /// PreprocessorOptions::PrecompiledPreambleBytes. |
58 | | /// The preamble includes the BOM, if any. |
59 | | struct PreambleBounds { |
60 | | /// Size of the preamble in bytes. |
61 | | unsigned Size; |
62 | | |
63 | | /// Whether the preamble ends at the start of a new line. |
64 | | /// |
65 | | /// Used to inform the lexer as to whether it's starting at the beginning of |
66 | | /// a line after skipping the preamble. |
67 | | bool PreambleEndsAtStartOfLine; |
68 | | |
69 | | PreambleBounds(unsigned Size, bool PreambleEndsAtStartOfLine) |
70 | 1.39k | : Size(Size), PreambleEndsAtStartOfLine(PreambleEndsAtStartOfLine) {} |
71 | | }; |
72 | | |
73 | | /// Lexer - This provides a simple interface that turns a text buffer into a |
74 | | /// stream of tokens. This provides no support for file reading or buffering, |
75 | | /// or buffering/seeking of tokens, only forward lexing is supported. It relies |
76 | | /// on the specified Preprocessor object to handle preprocessor directives, etc. |
77 | | class Lexer : public PreprocessorLexer { |
78 | | friend class Preprocessor; |
79 | | |
80 | | void anchor() override; |
81 | | |
82 | | //===--------------------------------------------------------------------===// |
83 | | // Constant configuration values for this lexer. |
84 | | |
85 | | // Start of the buffer. |
86 | | const char *BufferStart; |
87 | | |
88 | | // End of the buffer. |
89 | | const char *BufferEnd; |
90 | | |
91 | | // Location for start of file. |
92 | | SourceLocation FileLoc; |
93 | | |
94 | | // LangOpts enabled by this language. |
95 | | // Storing LangOptions as reference here is important from performance point |
96 | | // of view. Lack of reference means that LangOptions copy constructor would be |
97 | | // called by Lexer(..., const LangOptions &LangOpts,...). Given that local |
98 | | // Lexer objects are created thousands times (in Lexer::getRawToken, |
99 | | // Preprocessor::EnterSourceFile and other places) during single module |
100 | | // processing in frontend it would make std::vector<std::string> copy |
101 | | // constructors surprisingly hot. |
102 | | const LangOptions &LangOpts; |
103 | | |
104 | | // True if '//' line comments are enabled. |
105 | | bool LineComment; |
106 | | |
107 | | // True if lexer for _Pragma handling. |
108 | | bool Is_PragmaLexer; |
109 | | |
110 | | //===--------------------------------------------------------------------===// |
111 | | // Context-specific lexing flags set by the preprocessor. |
112 | | // |
113 | | |
114 | | /// ExtendedTokenMode - The lexer can optionally keep comments and whitespace |
115 | | /// and return them as tokens. This is used for -C and -CC modes, and |
116 | | /// whitespace preservation can be useful for some clients that want to lex |
117 | | /// the file in raw mode and get every character from the file. |
118 | | /// |
119 | | /// When this is set to 2 it returns comments and whitespace. When set to 1 |
120 | | /// it returns comments, when it is set to 0 it returns normal tokens only. |
121 | | unsigned char ExtendedTokenMode; |
122 | | |
123 | | //===--------------------------------------------------------------------===// |
124 | | // Context that changes as the file is lexed. |
125 | | // NOTE: any state that mutates when in raw mode must have save/restore code |
126 | | // in Lexer::isNextPPTokenLParen. |
127 | | |
128 | | // BufferPtr - Current pointer into the buffer. This is the next character |
129 | | // to be lexed. |
130 | | const char *BufferPtr; |
131 | | |
132 | | // IsAtStartOfLine - True if the next lexed token should get the "start of |
133 | | // line" flag set on it. |
134 | | bool IsAtStartOfLine; |
135 | | |
136 | | bool IsAtPhysicalStartOfLine; |
137 | | |
138 | | bool HasLeadingSpace; |
139 | | |
140 | | bool HasLeadingEmptyMacro; |
141 | | |
142 | | /// True if this is the first time we're lexing the input file. |
143 | | bool IsFirstTimeLexingFile; |
144 | | |
145 | | // NewLinePtr - A pointer to new line character '\n' being lexed. For '\r\n', |
146 | | // it also points to '\n.' |
147 | | const char *NewLinePtr; |
148 | | |
149 | | // CurrentConflictMarkerState - The kind of conflict marker we are handling. |
150 | | ConflictMarkerKind CurrentConflictMarkerState; |
151 | | |
152 | | void InitLexer(const char *BufStart, const char *BufPtr, const char *BufEnd); |
153 | | |
154 | | public: |
155 | | /// Lexer constructor - Create a new lexer object for the specified buffer |
156 | | /// with the specified preprocessor managing the lexing process. This lexer |
157 | | /// assumes that the associated file buffer and Preprocessor objects will |
158 | | /// outlive it, so it doesn't take ownership of either of them. |
159 | | Lexer(FileID FID, const llvm::MemoryBufferRef &InputFile, Preprocessor &PP, |
160 | | bool IsFirstIncludeOfFile = true); |
161 | | |
162 | | /// Lexer constructor - Create a new raw lexer object. This object is only |
163 | | /// suitable for calls to 'LexFromRawLexer'. This lexer assumes that the |
164 | | /// text range will outlive it, so it doesn't take ownership of it. |
165 | | Lexer(SourceLocation FileLoc, const LangOptions &LangOpts, |
166 | | const char *BufStart, const char *BufPtr, const char *BufEnd, |
167 | | bool IsFirstIncludeOfFile = true); |
168 | | |
169 | | /// Lexer constructor - Create a new raw lexer object. This object is only |
170 | | /// suitable for calls to 'LexFromRawLexer'. This lexer assumes that the |
171 | | /// text range will outlive it, so it doesn't take ownership of it. |
172 | | Lexer(FileID FID, const llvm::MemoryBufferRef &FromFile, |
173 | | const SourceManager &SM, const LangOptions &LangOpts, |
174 | | bool IsFirstIncludeOfFile = true); |
175 | | |
176 | | Lexer(const Lexer &) = delete; |
177 | | Lexer &operator=(const Lexer &) = delete; |
178 | | |
179 | | /// Create_PragmaLexer: Lexer constructor - Create a new lexer object for |
180 | | /// _Pragma expansion. This has a variety of magic semantics that this method |
181 | | /// sets up. It returns a new'd Lexer that must be delete'd when done. |
182 | | static Lexer *Create_PragmaLexer(SourceLocation SpellingLoc, |
183 | | SourceLocation ExpansionLocStart, |
184 | | SourceLocation ExpansionLocEnd, |
185 | | unsigned TokLen, Preprocessor &PP); |
186 | | |
187 | | /// getFileLoc - Return the File Location for the file we are lexing out of. |
188 | | /// The physical location encodes the location where the characters come from, |
189 | | /// the virtual location encodes where we should *claim* the characters came |
190 | | /// from. Currently this is only used by _Pragma handling. |
191 | 1.50M | SourceLocation getFileLoc() const { return FileLoc; } |
192 | | |
193 | | private: |
194 | | /// Lex - Return the next token in the file. If this is the end of file, it |
195 | | /// return the tok::eof token. This implicitly involves the preprocessor. |
196 | | bool Lex(Token &Result); |
197 | | |
198 | | public: |
199 | | /// isPragmaLexer - Returns true if this Lexer is being used to lex a pragma. |
200 | 3.09M | bool isPragmaLexer() const { return Is_PragmaLexer; } |
201 | | |
202 | | private: |
203 | | /// IndirectLex - An indirect call to 'Lex' that can be invoked via |
204 | | /// the PreprocessorLexer interface. |
205 | 0 | void IndirectLex(Token &Result) override { Lex(Result); } |
206 | | |
207 | | public: |
208 | | /// LexFromRawLexer - Lex a token from a designated raw lexer (one with no |
209 | | /// associated preprocessor object. Return true if the 'next character to |
210 | | /// read' pointer points at the end of the lexer buffer, false otherwise. |
211 | 61.9M | bool LexFromRawLexer(Token &Result) { |
212 | 61.9M | assert(LexingRawMode && "Not already in raw mode!"); |
213 | 0 | Lex(Result); |
214 | | // Note that lexing to the end of the buffer doesn't implicitly delete the |
215 | | // lexer when in raw mode. |
216 | 61.9M | return BufferPtr == BufferEnd; |
217 | 61.9M | } |
218 | | |
219 | | /// isKeepWhitespaceMode - Return true if the lexer should return tokens for |
220 | | /// every character in the file, including whitespace and comments. This |
221 | | /// should only be used in raw mode, as the preprocessor is not prepared to |
222 | | /// deal with the excess tokens. |
223 | 807M | bool isKeepWhitespaceMode() const { |
224 | 807M | return ExtendedTokenMode > 1; |
225 | 807M | } |
226 | | |
227 | | /// SetKeepWhitespaceMode - This method lets clients enable or disable |
228 | | /// whitespace retention mode. |
229 | 79.9M | void SetKeepWhitespaceMode(bool Val) { |
230 | 79.9M | assert((!Val || LexingRawMode || LangOpts.TraditionalCPP) && |
231 | 79.9M | "Can only retain whitespace in raw mode or -traditional-cpp"); |
232 | 79.9M | ExtendedTokenMode = Val ? 279.1k : 079.8M ; |
233 | 79.9M | } |
234 | | |
235 | | /// inKeepCommentMode - Return true if the lexer should return comments as |
236 | | /// tokens. |
237 | 120M | bool inKeepCommentMode() const { |
238 | 120M | return ExtendedTokenMode > 0; |
239 | 120M | } |
240 | | |
241 | | /// SetCommentRetentionMode - Change the comment retention mode of the lexer |
242 | | /// to the specified mode. This is really only useful when lexing in raw |
243 | | /// mode, because otherwise the lexer needs to manage this. |
244 | 177M | void SetCommentRetentionState(bool Mode) { |
245 | 177M | assert(!isKeepWhitespaceMode() && |
246 | 177M | "Can't play with comment retention state when retaining whitespace"); |
247 | 177M | ExtendedTokenMode = Mode ? 146.6M : 0131M ; |
248 | 177M | } |
249 | | |
250 | | /// Sets the extended token mode back to its initial value, according to the |
251 | | /// language options and preprocessor. This controls whether the lexer |
252 | | /// produces comment and whitespace tokens. |
253 | | /// |
254 | | /// This requires the lexer to have an associated preprocessor. A standalone |
255 | | /// lexer has nothing to reset to. |
256 | | void resetExtendedTokenMode(); |
257 | | |
258 | | /// Gets source code buffer. |
259 | 5.22k | StringRef getBuffer() const { |
260 | 5.22k | return StringRef(BufferStart, BufferEnd - BufferStart); |
261 | 5.22k | } |
262 | | |
263 | | /// ReadToEndOfLine - Read the rest of the current preprocessor line as an |
264 | | /// uninterpreted string. This switches the lexer out of directive mode. |
265 | | void ReadToEndOfLine(SmallVectorImpl<char> *Result = nullptr); |
266 | | |
267 | | |
268 | | /// Diag - Forwarding function for diagnostics. This translate a source |
269 | | /// position in the current buffer into a SourceLocation object for rendering. |
270 | | DiagnosticBuilder Diag(const char *Loc, unsigned DiagID) const; |
271 | | |
272 | | /// getSourceLocation - Return a source location identifier for the specified |
273 | | /// offset in the current file. |
274 | | SourceLocation getSourceLocation(const char *Loc, unsigned TokLen = 1) const; |
275 | | |
276 | | /// getSourceLocation - Return a source location for the next character in |
277 | | /// the current file. |
278 | 1.66M | SourceLocation getSourceLocation() override { |
279 | 1.66M | return getSourceLocation(BufferPtr); |
280 | 1.66M | } |
281 | | |
282 | | /// Return the current location in the buffer. |
283 | 48.5k | const char *getBufferLocation() const { return BufferPtr; } |
284 | | |
285 | | /// Returns the current lexing offset. |
286 | 39.2k | unsigned getCurrentBufferOffset() { |
287 | 39.2k | assert(BufferPtr >= BufferStart && "Invalid buffer state"); |
288 | 0 | return BufferPtr - BufferStart; |
289 | 39.2k | } |
290 | | |
291 | | /// Skip over \p NumBytes bytes. |
292 | | /// |
293 | | /// If the skip is successful, the next token will be lexed from the new |
294 | | /// offset. The lexer also assumes that we skipped to the start of the line. |
295 | | /// |
296 | | /// \returns true if the skip failed (new offset would have been past the |
297 | | /// end of the buffer), false otherwise. |
298 | | bool skipOver(unsigned NumBytes); |
299 | | |
300 | | /// Stringify - Convert the specified string into a C string by i) escaping |
301 | | /// '\\' and " characters and ii) replacing newline character(s) with "\\n". |
302 | | /// If Charify is true, this escapes the ' character instead of ". |
303 | | static std::string Stringify(StringRef Str, bool Charify = false); |
304 | | |
305 | | /// Stringify - Convert the specified string into a C string by i) escaping |
306 | | /// '\\' and " characters and ii) replacing newline character(s) with "\\n". |
307 | | static void Stringify(SmallVectorImpl<char> &Str); |
308 | | |
309 | | /// getSpelling - This method is used to get the spelling of a token into a |
310 | | /// preallocated buffer, instead of as an std::string. The caller is required |
311 | | /// to allocate enough space for the token, which is guaranteed to be at least |
312 | | /// Tok.getLength() bytes long. The length of the actual result is returned. |
313 | | /// |
314 | | /// Note that this method may do two possible things: it may either fill in |
315 | | /// the buffer specified with characters, or it may *change the input pointer* |
316 | | /// to point to a constant buffer with the data already in it (avoiding a |
317 | | /// copy). The caller is not allowed to modify the returned buffer pointer |
318 | | /// if an internal buffer is returned. |
319 | | static unsigned getSpelling(const Token &Tok, const char *&Buffer, |
320 | | const SourceManager &SourceMgr, |
321 | | const LangOptions &LangOpts, |
322 | | bool *Invalid = nullptr); |
323 | | |
324 | | /// getSpelling() - Return the 'spelling' of the Tok token. The spelling of a |
325 | | /// token is the characters used to represent the token in the source file |
326 | | /// after trigraph expansion and escaped-newline folding. In particular, this |
327 | | /// wants to get the true, uncanonicalized, spelling of things like digraphs |
328 | | /// UCNs, etc. |
329 | | static std::string getSpelling(const Token &Tok, |
330 | | const SourceManager &SourceMgr, |
331 | | const LangOptions &LangOpts, |
332 | | bool *Invalid = nullptr); |
333 | | |
334 | | /// getSpelling - This method is used to get the spelling of the |
335 | | /// token at the given source location. If, as is usually true, it |
336 | | /// is not necessary to copy any data, then the returned string may |
337 | | /// not point into the provided buffer. |
338 | | /// |
339 | | /// This method lexes at the expansion depth of the given |
340 | | /// location and does not jump to the expansion or spelling |
341 | | /// location. |
342 | | static StringRef getSpelling(SourceLocation loc, |
343 | | SmallVectorImpl<char> &buffer, |
344 | | const SourceManager &SM, |
345 | | const LangOptions &options, |
346 | | bool *invalid = nullptr); |
347 | | |
348 | | /// MeasureTokenLength - Relex the token at the specified location and return |
349 | | /// its length in bytes in the input file. If the token needs cleaning (e.g. |
350 | | /// includes a trigraph or an escaped newline) then this count includes bytes |
351 | | /// that are part of that. |
352 | | static unsigned MeasureTokenLength(SourceLocation Loc, |
353 | | const SourceManager &SM, |
354 | | const LangOptions &LangOpts); |
355 | | |
356 | | /// Relex the token at the specified location. |
357 | | /// \returns true if there was a failure, false on success. |
358 | | static bool getRawToken(SourceLocation Loc, Token &Result, |
359 | | const SourceManager &SM, |
360 | | const LangOptions &LangOpts, |
361 | | bool IgnoreWhiteSpace = false); |
362 | | |
363 | | /// Given a location any where in a source buffer, find the location |
364 | | /// that corresponds to the beginning of the token in which the original |
365 | | /// source location lands. |
366 | | static SourceLocation GetBeginningOfToken(SourceLocation Loc, |
367 | | const SourceManager &SM, |
368 | | const LangOptions &LangOpts); |
369 | | |
370 | | /// Get the physical length (including trigraphs and escaped newlines) of the |
371 | | /// first \p Characters characters of the token starting at TokStart. |
372 | | static unsigned getTokenPrefixLength(SourceLocation TokStart, |
373 | | unsigned CharNo, |
374 | | const SourceManager &SM, |
375 | | const LangOptions &LangOpts); |
376 | | |
377 | | /// AdvanceToTokenCharacter - If the current SourceLocation specifies a |
378 | | /// location at the start of a token, return a new location that specifies a |
379 | | /// character within the token. This handles trigraphs and escaped newlines. |
380 | | static SourceLocation AdvanceToTokenCharacter(SourceLocation TokStart, |
381 | | unsigned Characters, |
382 | | const SourceManager &SM, |
383 | 54.4k | const LangOptions &LangOpts) { |
384 | 54.4k | return TokStart.getLocWithOffset( |
385 | 54.4k | getTokenPrefixLength(TokStart, Characters, SM, LangOpts)); |
386 | 54.4k | } |
387 | | |
388 | | /// Computes the source location just past the end of the |
389 | | /// token at this source location. |
390 | | /// |
391 | | /// This routine can be used to produce a source location that |
392 | | /// points just past the end of the token referenced by \p Loc, and |
393 | | /// is generally used when a diagnostic needs to point just after a |
394 | | /// token where it expected something different that it received. If |
395 | | /// the returned source location would not be meaningful (e.g., if |
396 | | /// it points into a macro), this routine returns an invalid |
397 | | /// source location. |
398 | | /// |
399 | | /// \param Offset an offset from the end of the token, where the source |
400 | | /// location should refer to. The default offset (0) produces a source |
401 | | /// location pointing just past the end of the token; an offset of 1 produces |
402 | | /// a source location pointing to the last character in the token, etc. |
403 | | static SourceLocation getLocForEndOfToken(SourceLocation Loc, unsigned Offset, |
404 | | const SourceManager &SM, |
405 | | const LangOptions &LangOpts); |
406 | | |
407 | | /// Given a token range, produce a corresponding CharSourceRange that |
408 | | /// is not a token range. This allows the source range to be used by |
409 | | /// components that don't have access to the lexer and thus can't find the |
410 | | /// end of the range for themselves. |
411 | | static CharSourceRange getAsCharRange(SourceRange Range, |
412 | | const SourceManager &SM, |
413 | 6.50k | const LangOptions &LangOpts) { |
414 | 6.50k | SourceLocation End = getLocForEndOfToken(Range.getEnd(), 0, SM, LangOpts); |
415 | 6.50k | return End.isInvalid() ? CharSourceRange()1 |
416 | 6.50k | : CharSourceRange::getCharRange( |
417 | 6.50k | Range.getBegin(), End); |
418 | 6.50k | } |
419 | | static CharSourceRange getAsCharRange(CharSourceRange Range, |
420 | | const SourceManager &SM, |
421 | 2.07k | const LangOptions &LangOpts) { |
422 | 2.07k | return Range.isTokenRange() |
423 | 2.07k | ? getAsCharRange(Range.getAsRange(), SM, LangOpts)2.03k |
424 | 2.07k | : Range41 ; |
425 | 2.07k | } |
426 | | |
427 | | /// Returns true if the given MacroID location points at the first |
428 | | /// token of the macro expansion. |
429 | | /// |
430 | | /// \param MacroBegin If non-null and function returns true, it is set to |
431 | | /// begin location of the macro. |
432 | | static bool isAtStartOfMacroExpansion(SourceLocation loc, |
433 | | const SourceManager &SM, |
434 | | const LangOptions &LangOpts, |
435 | | SourceLocation *MacroBegin = nullptr); |
436 | | |
437 | | /// Returns true if the given MacroID location points at the last |
438 | | /// token of the macro expansion. |
439 | | /// |
440 | | /// \param MacroEnd If non-null and function returns true, it is set to |
441 | | /// end location of the macro. |
442 | | static bool isAtEndOfMacroExpansion(SourceLocation loc, |
443 | | const SourceManager &SM, |
444 | | const LangOptions &LangOpts, |
445 | | SourceLocation *MacroEnd = nullptr); |
446 | | |
447 | | /// Accepts a range and returns a character range with file locations. |
448 | | /// |
449 | | /// Returns a null range if a part of the range resides inside a macro |
450 | | /// expansion or the range does not reside on the same FileID. |
451 | | /// |
452 | | /// This function is trying to deal with macros and return a range based on |
453 | | /// file locations. The cases where it can successfully handle macros are: |
454 | | /// |
455 | | /// -begin or end range lies at the start or end of a macro expansion, in |
456 | | /// which case the location will be set to the expansion point, e.g: |
457 | | /// \#define M 1 2 |
458 | | /// a M |
459 | | /// If you have a range [a, 2] (where 2 came from the macro), the function |
460 | | /// will return a range for "a M" |
461 | | /// if you have range [a, 1], the function will fail because the range |
462 | | /// overlaps with only a part of the macro |
463 | | /// |
464 | | /// -The macro is a function macro and the range can be mapped to the macro |
465 | | /// arguments, e.g: |
466 | | /// \#define M 1 2 |
467 | | /// \#define FM(x) x |
468 | | /// FM(a b M) |
469 | | /// if you have range [b, 2], the function will return the file range "b M" |
470 | | /// inside the macro arguments. |
471 | | /// if you have range [a, 2], the function will return the file range |
472 | | /// "FM(a b M)" since the range includes all of the macro expansion. |
473 | | static CharSourceRange makeFileCharRange(CharSourceRange Range, |
474 | | const SourceManager &SM, |
475 | | const LangOptions &LangOpts); |
476 | | |
477 | | /// Returns a string for the source that the range encompasses. |
478 | | static StringRef getSourceText(CharSourceRange Range, |
479 | | const SourceManager &SM, |
480 | | const LangOptions &LangOpts, |
481 | | bool *Invalid = nullptr); |
482 | | |
483 | | /// Retrieve the name of the immediate macro expansion. |
484 | | /// |
485 | | /// This routine starts from a source location, and finds the name of the macro |
486 | | /// responsible for its immediate expansion. It looks through any intervening |
487 | | /// macro argument expansions to compute this. It returns a StringRef which |
488 | | /// refers to the SourceManager-owned buffer of the source where that macro |
489 | | /// name is spelled. Thus, the result shouldn't out-live that SourceManager. |
490 | | static StringRef getImmediateMacroName(SourceLocation Loc, |
491 | | const SourceManager &SM, |
492 | | const LangOptions &LangOpts); |
493 | | |
494 | | /// Retrieve the name of the immediate macro expansion. |
495 | | /// |
496 | | /// This routine starts from a source location, and finds the name of the |
497 | | /// macro responsible for its immediate expansion. It looks through any |
498 | | /// intervening macro argument expansions to compute this. It returns a |
499 | | /// StringRef which refers to the SourceManager-owned buffer of the source |
500 | | /// where that macro name is spelled. Thus, the result shouldn't out-live |
501 | | /// that SourceManager. |
502 | | /// |
503 | | /// This differs from Lexer::getImmediateMacroName in that any macro argument |
504 | | /// location will result in the topmost function macro that accepted it. |
505 | | /// e.g. |
506 | | /// \code |
507 | | /// MAC1( MAC2(foo) ) |
508 | | /// \endcode |
509 | | /// for location of 'foo' token, this function will return "MAC1" while |
510 | | /// Lexer::getImmediateMacroName will return "MAC2". |
511 | | static StringRef getImmediateMacroNameForDiagnostics( |
512 | | SourceLocation Loc, const SourceManager &SM, const LangOptions &LangOpts); |
513 | | |
514 | | /// Compute the preamble of the given file. |
515 | | /// |
516 | | /// The preamble of a file contains the initial comments, include directives, |
517 | | /// and other preprocessor directives that occur before the code in this |
518 | | /// particular file actually begins. The preamble of the main source file is |
519 | | /// a potential prefix header. |
520 | | /// |
521 | | /// \param Buffer The memory buffer containing the file's contents. |
522 | | /// |
523 | | /// \param MaxLines If non-zero, restrict the length of the preamble |
524 | | /// to fewer than this number of lines. |
525 | | /// |
526 | | /// \returns The offset into the file where the preamble ends and the rest |
527 | | /// of the file begins along with a boolean value indicating whether |
528 | | /// the preamble ends at the beginning of a new line. |
529 | | static PreambleBounds ComputePreamble(StringRef Buffer, |
530 | | const LangOptions &LangOpts, |
531 | | unsigned MaxLines = 0); |
532 | | |
533 | | /// Finds the token that comes right after the given location. |
534 | | /// |
535 | | /// Returns the next token, or none if the location is inside a macro. |
536 | | static Optional<Token> findNextToken(SourceLocation Loc, |
537 | | const SourceManager &SM, |
538 | | const LangOptions &LangOpts); |
539 | | |
540 | | /// Checks that the given token is the first token that occurs after |
541 | | /// the given location (this excludes comments and whitespace). Returns the |
542 | | /// location immediately after the specified token. If the token is not found |
543 | | /// or the location is inside a macro, the returned source location will be |
544 | | /// invalid. |
545 | | static SourceLocation findLocationAfterToken(SourceLocation loc, |
546 | | tok::TokenKind TKind, |
547 | | const SourceManager &SM, |
548 | | const LangOptions &LangOpts, |
549 | | bool SkipTrailingWhitespaceAndNewLine); |
550 | | |
551 | | /// Returns true if the given character could appear in an identifier. |
552 | | static bool isAsciiIdentifierContinueChar(char c, |
553 | | const LangOptions &LangOpts); |
554 | | |
555 | | /// Checks whether new line pointed by Str is preceded by escape |
556 | | /// sequence. |
557 | | static bool isNewLineEscaped(const char *BufferStart, const char *Str); |
558 | | |
559 | | /// getCharAndSizeNoWarn - Like the getCharAndSize method, but does not ever |
560 | | /// emit a warning. |
561 | | static inline char getCharAndSizeNoWarn(const char *Ptr, unsigned &Size, |
562 | 267k | const LangOptions &LangOpts) { |
563 | | // If this is not a trigraph and not a UCN or escaped newline, return |
564 | | // quickly. |
565 | 267k | if (isObviouslySimpleCharacter(Ptr[0])) { |
566 | 256k | Size = 1; |
567 | 256k | return *Ptr; |
568 | 256k | } |
569 | | |
570 | 10.7k | Size = 0; |
571 | 10.7k | return getCharAndSizeSlowNoWarn(Ptr, Size, LangOpts); |
572 | 267k | } |
573 | | |
574 | | /// Returns the leading whitespace for line that corresponds to the given |
575 | | /// location \p Loc. |
576 | | static StringRef getIndentationForLine(SourceLocation Loc, |
577 | | const SourceManager &SM); |
578 | | |
579 | | /// Check if this is the first time we're lexing the input file. |
580 | 14 | bool isFirstTimeLexingFile() const { return IsFirstTimeLexingFile; } |
581 | | |
582 | | private: |
583 | | //===--------------------------------------------------------------------===// |
584 | | // Internal implementation interfaces. |
585 | | |
586 | | /// LexTokenInternal - Internal interface to lex a preprocessing token. Called |
587 | | /// by Lex. |
588 | | /// |
589 | | bool LexTokenInternal(Token &Result, bool TokAtPhysicalStartOfLine); |
590 | | |
591 | | bool CheckUnicodeWhitespace(Token &Result, uint32_t C, const char *CurPtr); |
592 | | |
593 | | bool LexUnicodeIdentifierStart(Token &Result, uint32_t C, const char *CurPtr); |
594 | | |
595 | | /// FormTokenWithChars - When we lex a token, we have identified a span |
596 | | /// starting at BufferPtr, going to TokEnd that forms the token. This method |
597 | | /// takes that range and assigns it to the token as its location and size. In |
598 | | /// addition, since tokens cannot overlap, this also updates BufferPtr to be |
599 | | /// TokEnd. |
600 | | void FormTokenWithChars(Token &Result, const char *TokEnd, |
601 | 1.58G | tok::TokenKind Kind) { |
602 | 1.58G | unsigned TokLen = TokEnd-BufferPtr; |
603 | 1.58G | Result.setLength(TokLen); |
604 | 1.58G | Result.setLocation(getSourceLocation(BufferPtr, TokLen)); |
605 | 1.58G | Result.setKind(Kind); |
606 | 1.58G | BufferPtr = TokEnd; |
607 | 1.58G | } |
608 | | |
609 | | /// isNextPPTokenLParen - Return 1 if the next unexpanded token will return a |
610 | | /// tok::l_paren token, 0 if it is something else and 2 if there are no more |
611 | | /// tokens in the buffer controlled by this lexer. |
612 | | unsigned isNextPPTokenLParen(); |
613 | | |
614 | | //===--------------------------------------------------------------------===// |
615 | | // Lexer character reading interfaces. |
616 | | |
617 | | // This lexer is built on two interfaces for reading characters, both of which |
618 | | // automatically provide phase 1/2 translation. getAndAdvanceChar is used |
619 | | // when we know that we will be reading a character from the input buffer and |
620 | | // that this character will be part of the result token. This occurs in (f.e.) |
621 | | // string processing, because we know we need to read until we find the |
622 | | // closing '"' character. |
623 | | // |
624 | | // The second interface is the combination of getCharAndSize with |
625 | | // ConsumeChar. getCharAndSize reads a phase 1/2 translated character, |
626 | | // returning it and its size. If the lexer decides that this character is |
627 | | // part of the current token, it calls ConsumeChar on it. This two stage |
628 | | // approach allows us to emit diagnostics for characters (e.g. warnings about |
629 | | // trigraphs), knowing that they only are emitted if the character is |
630 | | // consumed. |
631 | | |
632 | | /// isObviouslySimpleCharacter - Return true if the specified character is |
633 | | /// obviously the same in translation phase 1 and translation phase 3. This |
634 | | /// can return false for characters that end up being the same, but it will |
635 | | /// never return true for something that needs to be mapped. |
636 | 3.11G | static bool isObviouslySimpleCharacter(char C) { |
637 | 3.11G | return C != '?' && C != '\\'3.11G ; |
638 | 3.11G | } |
639 | | |
640 | | /// getAndAdvanceChar - Read a single 'character' from the specified buffer, |
641 | | /// advance over it, and return it. This is tricky in several cases. Here we |
642 | | /// just handle the trivial case and fall-back to the non-inlined |
643 | | /// getCharAndSizeSlow method to handle the hard case. |
644 | 1.95G | inline char getAndAdvanceChar(const char *&Ptr, Token &Tok) { |
645 | | // If this is not a trigraph and not a UCN or escaped newline, return |
646 | | // quickly. |
647 | 1.95G | if (isObviouslySimpleCharacter(Ptr[0])) return *Ptr++1.94G ; |
648 | | |
649 | 7.18M | unsigned Size = 0; |
650 | 7.18M | char C = getCharAndSizeSlow(Ptr, Size, &Tok); |
651 | 7.18M | Ptr += Size; |
652 | 7.18M | return C; |
653 | 1.95G | } |
654 | | |
655 | | /// ConsumeChar - When a character (identified by getCharAndSize) is consumed |
656 | | /// and added to a given token, check to see if there are diagnostics that |
657 | | /// need to be emitted or flags that need to be set on the token. If so, do |
658 | | /// it. |
659 | 202M | const char *ConsumeChar(const char *Ptr, unsigned Size, Token &Tok) { |
660 | | // Normal case, we consumed exactly one token. Just return it. |
661 | 202M | if (Size == 1) |
662 | 202M | return Ptr+Size; |
663 | | |
664 | | // Otherwise, re-lex the character with a current token, allowing |
665 | | // diagnostics to be emitted and flags to be set. |
666 | 82 | Size = 0; |
667 | 82 | getCharAndSizeSlow(Ptr, Size, &Tok); |
668 | 82 | return Ptr+Size; |
669 | 202M | } |
670 | | |
671 | | /// getCharAndSize - Peek a single 'character' from the specified buffer, |
672 | | /// get its size, and return it. This is tricky in several cases. Here we |
673 | | /// just handle the trivial case and fall-back to the non-inlined |
674 | | /// getCharAndSizeSlow method to handle the hard case. |
675 | 1.15G | inline char getCharAndSize(const char *Ptr, unsigned &Size) { |
676 | | // If this is not a trigraph and not a UCN or escaped newline, return |
677 | | // quickly. |
678 | 1.15G | if (isObviouslySimpleCharacter(Ptr[0])) { |
679 | 1.15G | Size = 1; |
680 | 1.15G | return *Ptr; |
681 | 1.15G | } |
682 | | |
683 | 7.47k | Size = 0; |
684 | 7.47k | return getCharAndSizeSlow(Ptr, Size); |
685 | 1.15G | } |
686 | | |
  /// getCharAndSizeSlow - Handle the slow/uncommon case of the getCharAndSize
  /// method.
  char getCharAndSizeSlow(const char *Ptr, unsigned &Size,
                          Token *Tok = nullptr);

  /// getEscapedNewLineSize - Return the size of the specified escaped newline,
  /// or 0 if it is not an escaped newline. P[-1] is known to be a "\" on entry
  /// to this function.
  static unsigned getEscapedNewLineSize(const char *P);

  /// SkipEscapedNewLines - If P points to an escaped newline (or a series of
  /// them), skip over them and return the first non-escaped-newline found,
  /// otherwise return P.
  static const char *SkipEscapedNewLines(const char *P);

  /// getCharAndSizeSlowNoWarn - Same as getCharAndSizeSlow, but never emits a
  /// diagnostic.
  static char getCharAndSizeSlowNoWarn(const char *Ptr, unsigned &Size,
                                       const LangOptions &LangOpts);

  //===--------------------------------------------------------------------===//
  // Other lexer functions.

  void SetByteOffset(unsigned Offset, bool StartOfLine);

  void PropagateLineStartLeadingSpaceInfo(Token &Result);

  const char *LexUDSuffix(Token &Result, const char *CurPtr,
                          bool IsStringLiteral);

  // Helper functions to lex the remainder of a token of the specific type.

  // This function handles both ASCII and Unicode identifiers after
  // the first codepoint of the identifier has been parsed.
  bool LexIdentifierContinue(Token &Result, const char *CurPtr);

  bool LexNumericConstant  (Token &Result, const char *CurPtr);
  bool LexStringLiteral    (Token &Result, const char *CurPtr,
                            tok::TokenKind Kind);
  bool LexRawStringLiteral (Token &Result, const char *CurPtr,
                            tok::TokenKind Kind);
  bool LexAngledStringLiteral(Token &Result, const char *CurPtr);
  bool LexCharConstant     (Token &Result, const char *CurPtr,
                            tok::TokenKind Kind);
  bool LexEndOfFile        (Token &Result, const char *CurPtr);
  bool SkipWhitespace      (Token &Result, const char *CurPtr,
                            bool &TokAtPhysicalStartOfLine);
  bool SkipLineComment     (Token &Result, const char *CurPtr,
                            bool &TokAtPhysicalStartOfLine);
  bool SkipBlockComment    (Token &Result, const char *CurPtr,
                            bool &TokAtPhysicalStartOfLine);
  bool SaveLineComment     (Token &Result, const char *CurPtr);

  bool IsStartOfConflictMarker(const char *CurPtr);
  bool HandleEndOfConflictMarker(const char *CurPtr);

  bool lexEditorPlaceholder(Token &Result, const char *CurPtr);

  bool isCodeCompletionPoint(const char *CurPtr) const;

  // Abandon lexing: with BufferPtr at BufferEnd, only end-of-file remains.
  void cutOffLexing() { BufferPtr = BufferEnd; }
747 | | |
  bool isHexaLiteral(const char *Start, const LangOptions &LangOpts);

  void codeCompleteIncludedFile(const char *PathStart,
                                const char *CompletionPoint, bool IsAngled);

  /// Read a universal character name.
  ///
  /// \param StartPtr The position in the source buffer after the initial '\'.
  ///                 If the UCN is syntactically well-formed (but not
  ///                 necessarily valid), this parameter will be updated to
  ///                 point to the character after the UCN.
  /// \param SlashLoc The position in the source buffer of the '\'.
  /// \param Result   The token being formed. Pass \c nullptr to suppress
  ///                 diagnostics and handle token formation in the caller.
  ///
  /// \return The Unicode codepoint specified by the UCN, or 0 if the UCN is
  ///         invalid.
  uint32_t tryReadUCN(const char *&StartPtr, const char *SlashLoc, Token *Result);

  /// Try to consume a UCN as part of an identifier at the current
  /// location.
  /// \param CurPtr Initially points to the range of characters in the source
  ///               buffer containing the '\'. Updated to point past the end of
  ///               the UCN on success.
  /// \param Size The number of characters occupied by the '\' (including
  ///             trigraphs and escaped newlines).
  /// \param Result The token being produced. Marked as containing a UCN on
  ///               success.
  /// \return \c true if a UCN was lexed and it produced an acceptable
  ///         identifier character, \c false otherwise.
  bool tryConsumeIdentifierUCN(const char *&CurPtr, unsigned Size,
                               Token &Result);

  /// Try to consume an identifier character encoded in UTF-8.
  /// \param CurPtr Points to the start of the (potential) UTF-8 code unit
  ///               sequence. On success, updated to point past the end of it.
  /// \return \c true if a UTF-8 sequence mapping to an acceptable identifier
  ///         character was lexed, \c false otherwise.
  bool tryConsumeIdentifierUTF8Char(const char *&CurPtr);
787 | | }; |
788 | | |
789 | | } // namespace clang |
790 | | |
791 | | #endif // LLVM_CLANG_LEX_LEXER_H |