Coverage Report

Created: 2022-05-17 06:19

/Users/buildslave/jenkins/workspace/coverage/llvm-project/clang/include/clang/Lex/Lexer.h
Line
Count
Source (jump to first uncovered line)
1
//===- Lexer.h - C Language Family Lexer ------------------------*- C++ -*-===//
2
//
3
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4
// See https://llvm.org/LICENSE.txt for license information.
5
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6
//
7
//===----------------------------------------------------------------------===//
8
//
9
//  This file defines the Lexer interface.
10
//
11
//===----------------------------------------------------------------------===//
12
13
#ifndef LLVM_CLANG_LEX_LEXER_H
14
#define LLVM_CLANG_LEX_LEXER_H
15
16
#include "clang/Basic/LangOptions.h"
17
#include "clang/Basic/SourceLocation.h"
18
#include "clang/Basic/TokenKinds.h"
19
#include "clang/Lex/PreprocessorLexer.h"
20
#include "clang/Lex/Token.h"
21
#include "llvm/ADT/Optional.h"
22
#include "llvm/ADT/SmallVector.h"
23
#include "llvm/ADT/StringRef.h"
24
#include <cassert>
25
#include <cstdint>
26
#include <string>
27
28
namespace llvm {
29
30
class MemoryBufferRef;
31
32
} // namespace llvm
33
34
namespace clang {
35
36
class DiagnosticBuilder;
37
class Preprocessor;
38
class SourceManager;
39
class LangOptions;
40
41
/// ConflictMarkerKind - Kinds of conflict marker which the lexer might be
42
/// recovering from.
43
enum ConflictMarkerKind {
44
  /// Not within a conflict marker.
45
  CMK_None,
46
47
  /// A normal or diff3 conflict marker, initiated by at least 7 "<"s,
48
  /// separated by at least 7 "="s or "|"s, and terminated by at least 7 ">"s.
49
  CMK_Normal,
50
51
  /// A Perforce-style conflict marker, initiated by 4 ">"s,
52
  /// separated by 4 "="s, and terminated by 4 "<"s.
53
  CMK_Perforce
54
};
55
56
/// Describes the bounds (start, size) of the preamble and a flag required by
57
/// PreprocessorOptions::PrecompiledPreambleBytes.
58
/// The preamble includes the BOM, if any.
59
struct PreambleBounds {
60
  /// Size of the preamble in bytes.
61
  unsigned Size;
62
63
  /// Whether the preamble ends at the start of a new line.
64
  ///
65
  /// Used to inform the lexer as to whether it's starting at the beginning of
66
  /// a line after skipping the preamble.
67
  bool PreambleEndsAtStartOfLine;
68
69
  PreambleBounds(unsigned Size, bool PreambleEndsAtStartOfLine)
70
1.39k
      : Size(Size), PreambleEndsAtStartOfLine(PreambleEndsAtStartOfLine) {}
71
};
72
73
/// Lexer - This provides a simple interface that turns a text buffer into a
74
/// stream of tokens.  This provides no support for file reading or buffering,
75
/// or buffering/seeking of tokens, only forward lexing is supported.  It relies
76
/// on the specified Preprocessor object to handle preprocessor directives, etc.
77
class Lexer : public PreprocessorLexer {
78
  friend class Preprocessor;
79
80
  void anchor() override;
81
82
  //===--------------------------------------------------------------------===//
83
  // Constant configuration values for this lexer.
84
85
  // Start of the buffer.
86
  const char *BufferStart;
87
88
  // End of the buffer.
89
  const char *BufferEnd;
90
91
  // Location for start of file.
92
  SourceLocation FileLoc;
93
94
  // LangOpts enabled by this language.
95
  // Storing LangOptions as reference here is important from performance point
96
  // of view. Lack of reference means that LangOptions copy constructor would be
97
  // called by Lexer(..., const LangOptions &LangOpts,...). Given that local
98
  // Lexer objects are created thousands times (in Lexer::getRawToken,
99
  // Preprocessor::EnterSourceFile and other places) during single module
100
  // processing in frontend it would make std::vector<std::string> copy
101
  // constructors surprisingly hot.
102
  const LangOptions &LangOpts;
103
104
  // True if '//' line comments are enabled.
105
  bool LineComment;
106
107
  // True if lexer for _Pragma handling.
108
  bool Is_PragmaLexer;
109
110
  //===--------------------------------------------------------------------===//
111
  // Context-specific lexing flags set by the preprocessor.
112
  //
113
114
  /// ExtendedTokenMode - The lexer can optionally keep comments and whitespace
115
  /// and return them as tokens.  This is used for -C and -CC modes, and
116
  /// whitespace preservation can be useful for some clients that want to lex
117
  /// the file in raw mode and get every character from the file.
118
  ///
119
  /// When this is set to 2 it returns comments and whitespace.  When set to 1
120
  /// it returns comments, when it is set to 0 it returns normal tokens only.
121
  unsigned char ExtendedTokenMode;
122
123
  //===--------------------------------------------------------------------===//
124
  // Context that changes as the file is lexed.
125
  // NOTE: any state that mutates when in raw mode must have save/restore code
126
  // in Lexer::isNextPPTokenLParen.
127
128
  // BufferPtr - Current pointer into the buffer.  This is the next character
129
  // to be lexed.
130
  const char *BufferPtr;
131
132
  // IsAtStartOfLine - True if the next lexed token should get the "start of
133
  // line" flag set on it.
134
  bool IsAtStartOfLine;
135
136
  bool IsAtPhysicalStartOfLine;
137
138
  bool HasLeadingSpace;
139
140
  bool HasLeadingEmptyMacro;
141
142
  /// True if this is the first time we're lexing the input file.
143
  bool IsFirstTimeLexingFile;
144
145
  // NewLinePtr - A pointer to new line character '\n' being lexed. For '\r\n',
146
// it also points to '\n'.
147
  const char *NewLinePtr;
148
149
  // CurrentConflictMarkerState - The kind of conflict marker we are handling.
150
  ConflictMarkerKind CurrentConflictMarkerState;
151
152
  void InitLexer(const char *BufStart, const char *BufPtr, const char *BufEnd);
153
154
public:
155
  /// Lexer constructor - Create a new lexer object for the specified buffer
156
  /// with the specified preprocessor managing the lexing process.  This lexer
157
  /// assumes that the associated file buffer and Preprocessor objects will
158
  /// outlive it, so it doesn't take ownership of either of them.
159
  Lexer(FileID FID, const llvm::MemoryBufferRef &InputFile, Preprocessor &PP,
160
        bool IsFirstIncludeOfFile = true);
161
162
  /// Lexer constructor - Create a new raw lexer object.  This object is only
163
  /// suitable for calls to 'LexFromRawLexer'.  This lexer assumes that the
164
  /// text range will outlive it, so it doesn't take ownership of it.
165
  Lexer(SourceLocation FileLoc, const LangOptions &LangOpts,
166
        const char *BufStart, const char *BufPtr, const char *BufEnd,
167
        bool IsFirstIncludeOfFile = true);
168
169
  /// Lexer constructor - Create a new raw lexer object.  This object is only
170
  /// suitable for calls to 'LexFromRawLexer'.  This lexer assumes that the
171
  /// text range will outlive it, so it doesn't take ownership of it.
172
  Lexer(FileID FID, const llvm::MemoryBufferRef &FromFile,
173
        const SourceManager &SM, const LangOptions &LangOpts,
174
        bool IsFirstIncludeOfFile = true);
175
176
  Lexer(const Lexer &) = delete;
177
  Lexer &operator=(const Lexer &) = delete;
178
179
  /// Create_PragmaLexer: Lexer constructor - Create a new lexer object for
180
  /// _Pragma expansion.  This has a variety of magic semantics that this method
181
  /// sets up.  It returns a new'd Lexer that must be delete'd when done.
182
  static Lexer *Create_PragmaLexer(SourceLocation SpellingLoc,
183
                                   SourceLocation ExpansionLocStart,
184
                                   SourceLocation ExpansionLocEnd,
185
                                   unsigned TokLen, Preprocessor &PP);
186
187
  /// getFileLoc - Return the File Location for the file we are lexing out of.
188
  /// The physical location encodes the location where the characters come from,
189
  /// the virtual location encodes where we should *claim* the characters came
190
  /// from.  Currently this is only used by _Pragma handling.
191
1.50M
  SourceLocation getFileLoc() const { return FileLoc; }
192
193
private:
194
  /// Lex - Return the next token in the file.  If this is the end of file, it
195
  /// return the tok::eof token.  This implicitly involves the preprocessor.
196
  bool Lex(Token &Result);
197
198
public:
199
  /// isPragmaLexer - Returns true if this Lexer is being used to lex a pragma.
200
3.09M
  bool isPragmaLexer() const { return Is_PragmaLexer; }
201
202
private:
203
  /// IndirectLex - An indirect call to 'Lex' that can be invoked via
204
  ///  the PreprocessorLexer interface.
205
0
  void IndirectLex(Token &Result) override { Lex(Result); }
206
207
public:
208
  /// LexFromRawLexer - Lex a token from a designated raw lexer (one with no
209
  /// associated preprocessor object).  Return true if the 'next character to
210
  /// read' pointer points at the end of the lexer buffer, false otherwise.
211
61.9M
  bool LexFromRawLexer(Token &Result) {
212
61.9M
    assert(LexingRawMode && "Not already in raw mode!");
213
0
    Lex(Result);
214
    // Note that lexing to the end of the buffer doesn't implicitly delete the
215
    // lexer when in raw mode.
216
61.9M
    return BufferPtr == BufferEnd;
217
61.9M
  }
218
219
  /// isKeepWhitespaceMode - Return true if the lexer should return tokens for
220
  /// every character in the file, including whitespace and comments.  This
221
  /// should only be used in raw mode, as the preprocessor is not prepared to
222
  /// deal with the excess tokens.
223
807M
  bool isKeepWhitespaceMode() const {
224
807M
    return ExtendedTokenMode > 1;
225
807M
  }
226
227
  /// SetKeepWhitespaceMode - This method lets clients enable or disable
228
  /// whitespace retention mode.
229
79.9M
  void SetKeepWhitespaceMode(bool Val) {
230
79.9M
    assert((!Val || LexingRawMode || LangOpts.TraditionalCPP) &&
231
79.9M
           "Can only retain whitespace in raw mode or -traditional-cpp");
232
79.9M
    ExtendedTokenMode = Val ? 2 : 0;   [branch counts: "2" taken 79.1k, "0" taken 79.8M]
233
79.9M
  }
234
235
  /// inKeepCommentMode - Return true if the lexer should return comments as
236
  /// tokens.
237
120M
  bool inKeepCommentMode() const {
238
120M
    return ExtendedTokenMode > 0;
239
120M
  }
240
241
  /// SetCommentRetentionMode - Change the comment retention mode of the lexer
242
  /// to the specified mode.  This is really only useful when lexing in raw
243
  /// mode, because otherwise the lexer needs to manage this.
244
177M
  void SetCommentRetentionState(bool Mode) {
245
177M
    assert(!isKeepWhitespaceMode() &&
246
177M
           "Can't play with comment retention state when retaining whitespace");
247
177M
    ExtendedTokenMode = Mode ? 1 : 0;   [branch counts: "1" taken 46.6M, "0" taken 131M]
248
177M
  }
249
250
  /// Sets the extended token mode back to its initial value, according to the
251
  /// language options and preprocessor. This controls whether the lexer
252
  /// produces comment and whitespace tokens.
253
  ///
254
  /// This requires the lexer to have an associated preprocessor. A standalone
255
  /// lexer has nothing to reset to.
256
  void resetExtendedTokenMode();
257
258
  /// Gets source code buffer.
259
5.22k
  StringRef getBuffer() const {
260
5.22k
    return StringRef(BufferStart, BufferEnd - BufferStart);
261
5.22k
  }
262
263
  /// ReadToEndOfLine - Read the rest of the current preprocessor line as an
264
  /// uninterpreted string.  This switches the lexer out of directive mode.
265
  void ReadToEndOfLine(SmallVectorImpl<char> *Result = nullptr);
266
267
268
  /// Diag - Forwarding function for diagnostics.  This translate a source
269
  /// position in the current buffer into a SourceLocation object for rendering.
270
  DiagnosticBuilder Diag(const char *Loc, unsigned DiagID) const;
271
272
  /// getSourceLocation - Return a source location identifier for the specified
273
  /// offset in the current file.
274
  SourceLocation getSourceLocation(const char *Loc, unsigned TokLen = 1) const;
275
276
  /// getSourceLocation - Return a source location for the next character in
277
  /// the current file.
278
1.66M
  SourceLocation getSourceLocation() override {
279
1.66M
    return getSourceLocation(BufferPtr);
280
1.66M
  }
281
282
  /// Return the current location in the buffer.
283
48.5k
  const char *getBufferLocation() const { return BufferPtr; }
284
285
  /// Returns the current lexing offset.
286
39.2k
  unsigned getCurrentBufferOffset() {
287
39.2k
    assert(BufferPtr >= BufferStart && "Invalid buffer state");
288
0
    return BufferPtr - BufferStart;
289
39.2k
  }
290
291
  /// Skip over \p NumBytes bytes.
292
  ///
293
  /// If the skip is successful, the next token will be lexed from the new
294
  /// offset. The lexer also assumes that we skipped to the start of the line.
295
  ///
296
  /// \returns true if the skip failed (new offset would have been past the
297
  /// end of the buffer), false otherwise.
298
  bool skipOver(unsigned NumBytes);
299
300
  /// Stringify - Convert the specified string into a C string by i) escaping
301
  /// '\\' and " characters and ii) replacing newline character(s) with "\\n".
302
  /// If Charify is true, this escapes the ' character instead of ".
303
  static std::string Stringify(StringRef Str, bool Charify = false);
304
305
  /// Stringify - Convert the specified string into a C string by i) escaping
306
  /// '\\' and " characters and ii) replacing newline character(s) with "\\n".
307
  static void Stringify(SmallVectorImpl<char> &Str);
308
309
  /// getSpelling - This method is used to get the spelling of a token into a
310
  /// preallocated buffer, instead of as an std::string.  The caller is required
311
  /// to allocate enough space for the token, which is guaranteed to be at least
312
  /// Tok.getLength() bytes long.  The length of the actual result is returned.
313
  ///
314
  /// Note that this method may do two possible things: it may either fill in
315
  /// the buffer specified with characters, or it may *change the input pointer*
316
  /// to point to a constant buffer with the data already in it (avoiding a
317
  /// copy).  The caller is not allowed to modify the returned buffer pointer
318
  /// if an internal buffer is returned.
319
  static unsigned getSpelling(const Token &Tok, const char *&Buffer,
320
                              const SourceManager &SourceMgr,
321
                              const LangOptions &LangOpts,
322
                              bool *Invalid = nullptr);
323
324
  /// getSpelling() - Return the 'spelling' of the Tok token.  The spelling of a
325
  /// token is the characters used to represent the token in the source file
326
  /// after trigraph expansion and escaped-newline folding.  In particular, this
327
  /// wants to get the true, uncanonicalized, spelling of things like digraphs
328
  /// UCNs, etc.
329
  static std::string getSpelling(const Token &Tok,
330
                                 const SourceManager &SourceMgr,
331
                                 const LangOptions &LangOpts,
332
                                 bool *Invalid = nullptr);
333
334
  /// getSpelling - This method is used to get the spelling of the
335
  /// token at the given source location.  If, as is usually true, it
336
  /// is not necessary to copy any data, then the returned string may
337
  /// not point into the provided buffer.
338
  ///
339
  /// This method lexes at the expansion depth of the given
340
  /// location and does not jump to the expansion or spelling
341
  /// location.
342
  static StringRef getSpelling(SourceLocation loc,
343
                               SmallVectorImpl<char> &buffer,
344
                               const SourceManager &SM,
345
                               const LangOptions &options,
346
                               bool *invalid = nullptr);
347
348
  /// MeasureTokenLength - Relex the token at the specified location and return
349
  /// its length in bytes in the input file.  If the token needs cleaning (e.g.
350
  /// includes a trigraph or an escaped newline) then this count includes bytes
351
  /// that are part of that.
352
  static unsigned MeasureTokenLength(SourceLocation Loc,
353
                                     const SourceManager &SM,
354
                                     const LangOptions &LangOpts);
355
356
  /// Relex the token at the specified location.
357
  /// \returns true if there was a failure, false on success.
358
  static bool getRawToken(SourceLocation Loc, Token &Result,
359
                          const SourceManager &SM,
360
                          const LangOptions &LangOpts,
361
                          bool IgnoreWhiteSpace = false);
362
363
  /// Given a location any where in a source buffer, find the location
364
  /// that corresponds to the beginning of the token in which the original
365
  /// source location lands.
366
  static SourceLocation GetBeginningOfToken(SourceLocation Loc,
367
                                            const SourceManager &SM,
368
                                            const LangOptions &LangOpts);
369
370
  /// Get the physical length (including trigraphs and escaped newlines) of the
371
  /// first \p Characters characters of the token starting at TokStart.
372
  static unsigned getTokenPrefixLength(SourceLocation TokStart,
373
                                       unsigned CharNo,
374
                                       const SourceManager &SM,
375
                                       const LangOptions &LangOpts);
376
377
  /// AdvanceToTokenCharacter - If the current SourceLocation specifies a
378
  /// location at the start of a token, return a new location that specifies a
379
  /// character within the token.  This handles trigraphs and escaped newlines.
380
  static SourceLocation AdvanceToTokenCharacter(SourceLocation TokStart,
381
                                                unsigned Characters,
382
                                                const SourceManager &SM,
383
54.4k
                                                const LangOptions &LangOpts) {
384
54.4k
    return TokStart.getLocWithOffset(
385
54.4k
        getTokenPrefixLength(TokStart, Characters, SM, LangOpts));
386
54.4k
  }
387
388
  /// Computes the source location just past the end of the
389
  /// token at this source location.
390
  ///
391
  /// This routine can be used to produce a source location that
392
  /// points just past the end of the token referenced by \p Loc, and
393
  /// is generally used when a diagnostic needs to point just after a
394
  /// token where it expected something different that it received. If
395
  /// the returned source location would not be meaningful (e.g., if
396
  /// it points into a macro), this routine returns an invalid
397
  /// source location.
398
  ///
399
  /// \param Offset an offset from the end of the token, where the source
400
  /// location should refer to. The default offset (0) produces a source
401
  /// location pointing just past the end of the token; an offset of 1 produces
402
  /// a source location pointing to the last character in the token, etc.
403
  static SourceLocation getLocForEndOfToken(SourceLocation Loc, unsigned Offset,
404
                                            const SourceManager &SM,
405
                                            const LangOptions &LangOpts);
406
407
  /// Given a token range, produce a corresponding CharSourceRange that
408
  /// is not a token range. This allows the source range to be used by
409
  /// components that don't have access to the lexer and thus can't find the
410
  /// end of the range for themselves.
411
  static CharSourceRange getAsCharRange(SourceRange Range,
412
                                        const SourceManager &SM,
413
6.50k
                                        const LangOptions &LangOpts) {
414
6.50k
    SourceLocation End = getLocForEndOfToken(Range.getEnd(), 0, SM, LangOpts);
415
6.50k
    return End.isInvalid() ? CharSourceRange()   [branch count: 1]
416
6.50k
                           : CharSourceRange::getCharRange(
417
6.50k
                                 Range.getBegin(), End);
418
6.50k
  }
419
  static CharSourceRange getAsCharRange(CharSourceRange Range,
420
                                        const SourceManager &SM,
421
2.07k
                                        const LangOptions &LangOpts) {
422
2.07k
    return Range.isTokenRange()
423
2.07k
               ? getAsCharRange(Range.getAsRange(), SM, LangOpts)   [branch count: 2.03k]
424
2.07k
               : Range;   [branch count: 41]
425
2.07k
  }
426
427
  /// Returns true if the given MacroID location points at the first
428
  /// token of the macro expansion.
429
  ///
430
  /// \param MacroBegin If non-null and function returns true, it is set to
431
  /// begin location of the macro.
432
  static bool isAtStartOfMacroExpansion(SourceLocation loc,
433
                                        const SourceManager &SM,
434
                                        const LangOptions &LangOpts,
435
                                        SourceLocation *MacroBegin = nullptr);
436
437
  /// Returns true if the given MacroID location points at the last
438
  /// token of the macro expansion.
439
  ///
440
  /// \param MacroEnd If non-null and function returns true, it is set to
441
  /// end location of the macro.
442
  static bool isAtEndOfMacroExpansion(SourceLocation loc,
443
                                      const SourceManager &SM,
444
                                      const LangOptions &LangOpts,
445
                                      SourceLocation *MacroEnd = nullptr);
446
447
  /// Accepts a range and returns a character range with file locations.
448
  ///
449
  /// Returns a null range if a part of the range resides inside a macro
450
  /// expansion or the range does not reside on the same FileID.
451
  ///
452
  /// This function is trying to deal with macros and return a range based on
453
  /// file locations. The cases where it can successfully handle macros are:
454
  ///
455
  /// -begin or end range lies at the start or end of a macro expansion, in
456
  ///  which case the location will be set to the expansion point, e.g:
457
  ///    \#define M 1 2
458
  ///    a M
459
  /// If you have a range [a, 2] (where 2 came from the macro), the function
460
  /// will return a range for "a M"
461
  /// if you have range [a, 1], the function will fail because the range
462
  /// overlaps with only a part of the macro
463
  ///
464
  /// -The macro is a function macro and the range can be mapped to the macro
465
  ///  arguments, e.g:
466
  ///    \#define M 1 2
467
  ///    \#define FM(x) x
468
  ///    FM(a b M)
469
  /// if you have range [b, 2], the function will return the file range "b M"
470
  /// inside the macro arguments.
471
  /// if you have range [a, 2], the function will return the file range
472
  /// "FM(a b M)" since the range includes all of the macro expansion.
473
  static CharSourceRange makeFileCharRange(CharSourceRange Range,
474
                                           const SourceManager &SM,
475
                                           const LangOptions &LangOpts);
476
477
  /// Returns a string for the source that the range encompasses.
478
  static StringRef getSourceText(CharSourceRange Range,
479
                                 const SourceManager &SM,
480
                                 const LangOptions &LangOpts,
481
                                 bool *Invalid = nullptr);
482
483
  /// Retrieve the name of the immediate macro expansion.
484
  ///
485
  /// This routine starts from a source location, and finds the name of the macro
486
  /// responsible for its immediate expansion. It looks through any intervening
487
  /// macro argument expansions to compute this. It returns a StringRef which
488
  /// refers to the SourceManager-owned buffer of the source where that macro
489
  /// name is spelled. Thus, the result shouldn't out-live that SourceManager.
490
  static StringRef getImmediateMacroName(SourceLocation Loc,
491
                                         const SourceManager &SM,
492
                                         const LangOptions &LangOpts);
493
494
  /// Retrieve the name of the immediate macro expansion.
495
  ///
496
  /// This routine starts from a source location, and finds the name of the
497
  /// macro responsible for its immediate expansion. It looks through any
498
  /// intervening macro argument expansions to compute this. It returns a
499
  /// StringRef which refers to the SourceManager-owned buffer of the source
500
  /// where that macro name is spelled. Thus, the result shouldn't out-live
501
  /// that SourceManager.
502
  ///
503
  /// This differs from Lexer::getImmediateMacroName in that any macro argument
504
  /// location will result in the topmost function macro that accepted it.
505
  /// e.g.
506
  /// \code
507
  ///   MAC1( MAC2(foo) )
508
  /// \endcode
509
  /// for location of 'foo' token, this function will return "MAC1" while
510
  /// Lexer::getImmediateMacroName will return "MAC2".
511
  static StringRef getImmediateMacroNameForDiagnostics(
512
      SourceLocation Loc, const SourceManager &SM, const LangOptions &LangOpts);
513
514
  /// Compute the preamble of the given file.
515
  ///
516
  /// The preamble of a file contains the initial comments, include directives,
517
  /// and other preprocessor directives that occur before the code in this
518
  /// particular file actually begins. The preamble of the main source file is
519
  /// a potential prefix header.
520
  ///
521
  /// \param Buffer The memory buffer containing the file's contents.
522
  ///
523
  /// \param MaxLines If non-zero, restrict the length of the preamble
524
  /// to fewer than this number of lines.
525
  ///
526
  /// \returns The offset into the file where the preamble ends and the rest
527
  /// of the file begins along with a boolean value indicating whether
528
  /// the preamble ends at the beginning of a new line.
529
  static PreambleBounds ComputePreamble(StringRef Buffer,
530
                                        const LangOptions &LangOpts,
531
                                        unsigned MaxLines = 0);
532
533
  /// Finds the token that comes right after the given location.
534
  ///
535
  /// Returns the next token, or none if the location is inside a macro.
536
  static Optional<Token> findNextToken(SourceLocation Loc,
537
                                       const SourceManager &SM,
538
                                       const LangOptions &LangOpts);
539
540
  /// Checks that the given token is the first token that occurs after
541
  /// the given location (this excludes comments and whitespace). Returns the
542
  /// location immediately after the specified token. If the token is not found
543
  /// or the location is inside a macro, the returned source location will be
544
  /// invalid.
545
  static SourceLocation findLocationAfterToken(SourceLocation loc,
546
                                         tok::TokenKind TKind,
547
                                         const SourceManager &SM,
548
                                         const LangOptions &LangOpts,
549
                                         bool SkipTrailingWhitespaceAndNewLine);
550
551
  /// Returns true if the given character could appear in an identifier.
552
  static bool isAsciiIdentifierContinueChar(char c,
553
                                            const LangOptions &LangOpts);
554
555
  /// Checks whether new line pointed by Str is preceded by escape
556
  /// sequence.
557
  static bool isNewLineEscaped(const char *BufferStart, const char *Str);
558
559
  /// getCharAndSizeNoWarn - Like the getCharAndSize method, but does not ever
560
  /// emit a warning.
561
  static inline char getCharAndSizeNoWarn(const char *Ptr, unsigned &Size,
562
267k
                                          const LangOptions &LangOpts) {
563
    // If this is not a trigraph and not a UCN or escaped newline, return
564
    // quickly.
565
267k
    if (isObviouslySimpleCharacter(Ptr[0])) {
566
256k
      Size = 1;
567
256k
      return *Ptr;
568
256k
    }
569
570
10.7k
    Size = 0;
571
10.7k
    return getCharAndSizeSlowNoWarn(Ptr, Size, LangOpts);
572
267k
  }
573
574
  /// Returns the leading whitespace for line that corresponds to the given
575
  /// location \p Loc.
576
  static StringRef getIndentationForLine(SourceLocation Loc,
577
                                         const SourceManager &SM);
578
579
  /// Check if this is the first time we're lexing the input file.
580
14
  bool isFirstTimeLexingFile() const { return IsFirstTimeLexingFile; }
581
582
private:
583
  //===--------------------------------------------------------------------===//
584
  // Internal implementation interfaces.
585
586
  /// LexTokenInternal - Internal interface to lex a preprocessing token. Called
587
  /// by Lex.
588
  ///
589
  bool LexTokenInternal(Token &Result, bool TokAtPhysicalStartOfLine);
590
591
  bool CheckUnicodeWhitespace(Token &Result, uint32_t C, const char *CurPtr);
592
593
  bool LexUnicodeIdentifierStart(Token &Result, uint32_t C, const char *CurPtr);
594
595
  /// FormTokenWithChars - When we lex a token, we have identified a span
596
  /// starting at BufferPtr, going to TokEnd that forms the token.  This method
597
  /// takes that range and assigns it to the token as its location and size.  In
598
  /// addition, since tokens cannot overlap, this also updates BufferPtr to be
599
  /// TokEnd.
600
  void FormTokenWithChars(Token &Result, const char *TokEnd,
601
1.58G
                          tok::TokenKind Kind) {
602
1.58G
    unsigned TokLen = TokEnd-BufferPtr;
603
1.58G
    Result.setLength(TokLen);
604
1.58G
    Result.setLocation(getSourceLocation(BufferPtr, TokLen));
605
1.58G
    Result.setKind(Kind);
606
1.58G
    BufferPtr = TokEnd;
607
1.58G
  }
608
609
  /// isNextPPTokenLParen - Return 1 if the next unexpanded token will return a
610
  /// tok::l_paren token, 0 if it is something else and 2 if there are no more
611
  /// tokens in the buffer controlled by this lexer.
612
  unsigned isNextPPTokenLParen();
613
614
  //===--------------------------------------------------------------------===//
615
  // Lexer character reading interfaces.
616
617
  // This lexer is built on two interfaces for reading characters, both of which
618
  // automatically provide phase 1/2 translation.  getAndAdvanceChar is used
619
  // when we know that we will be reading a character from the input buffer and
620
  // that this character will be part of the result token. This occurs in (f.e.)
621
  // string processing, because we know we need to read until we find the
622
  // closing '"' character.
623
  //
624
  // The second interface is the combination of getCharAndSize with
625
  // ConsumeChar.  getCharAndSize reads a phase 1/2 translated character,
626
  // returning it and its size.  If the lexer decides that this character is
627
  // part of the current token, it calls ConsumeChar on it.  This two stage
628
  // approach allows us to emit diagnostics for characters (e.g. warnings about
629
  // trigraphs), knowing that they only are emitted if the character is
630
  // consumed.
631
632
  /// isObviouslySimpleCharacter - Return true if the specified character is
633
  /// obviously the same in translation phase 1 and translation phase 3.  This
634
  /// can return false for characters that end up being the same, but it will
635
  /// never return true for something that needs to be mapped.
636
3.11G
  static bool isObviouslySimpleCharacter(char C) {
637
3.11G
    return C != '?' && C != '\\';   [short-circuit RHS evaluated 3.11G times]
638
3.11G
  }
639
640
  /// getAndAdvanceChar - Read a single 'character' from the specified buffer,
641
  /// advance over it, and return it.  This is tricky in several cases.  Here we
642
  /// just handle the trivial case and fall-back to the non-inlined
643
  /// getCharAndSizeSlow method to handle the hard case.
644
1.95G
  inline char getAndAdvanceChar(const char *&Ptr, Token &Tok) {
645
    // If this is not a trigraph and not a UCN or escaped newline, return
646
    // quickly.
647
1.95G
    if (isObviouslySimpleCharacter(Ptr[0])) 
return *Ptr++1.94G
;
648
649
7.18M
    unsigned Size = 0;
650
7.18M
    char C = getCharAndSizeSlow(Ptr, Size, &Tok);
651
7.18M
    Ptr += Size;
652
7.18M
    return C;
653
1.95G
  }
654
655
  /// ConsumeChar - When a character (identified by getCharAndSize) is consumed
656
  /// and added to a given token, check to see if there are diagnostics that
657
  /// need to be emitted or flags that need to be set on the token.  If so, do
658
  /// it.
659
202M
  const char *ConsumeChar(const char *Ptr, unsigned Size, Token &Tok) {
660
    // Normal case, we consumed exactly one token.  Just return it.
661
202M
    if (Size == 1)
662
202M
      return Ptr+Size;
663
664
    // Otherwise, re-lex the character with a current token, allowing
665
    // diagnostics to be emitted and flags to be set.
666
82
    Size = 0;
667
82
    getCharAndSizeSlow(Ptr, Size, &Tok);
668
82
    return Ptr+Size;
669
202M
  }
670
671
  /// getCharAndSize - Peek a single 'character' from the specified buffer,
672
  /// get its size, and return it.  This is tricky in several cases.  Here we
673
  /// just handle the trivial case and fall-back to the non-inlined
674
  /// getCharAndSizeSlow method to handle the hard case.
675
1.15G
  inline char getCharAndSize(const char *Ptr, unsigned &Size) {
676
    // If this is not a trigraph and not a UCN or escaped newline, return
677
    // quickly.
678
1.15G
    if (isObviouslySimpleCharacter(Ptr[0])) {
679
1.15G
      Size = 1;
680
1.15G
      return *Ptr;
681
1.15G
    }
682
683
7.47k
    Size = 0;
684
7.47k
    return getCharAndSizeSlow(Ptr, Size);
685
1.15G
  }
686
687
  /// getCharAndSizeSlow - Handle the slow/uncommon case of the getCharAndSize
688
  /// method.
689
  char getCharAndSizeSlow(const char *Ptr, unsigned &Size,
690
                          Token *Tok = nullptr);
691
692
  /// getEscapedNewLineSize - Return the size of the specified escaped newline,
693
  /// or 0 if it is not an escaped newline. P[-1] is known to be a "\" on entry
694
  /// to this function.
695
  static unsigned getEscapedNewLineSize(const char *P);
696
697
  /// SkipEscapedNewLines - If P points to an escaped newline (or a series of
698
  /// them), skip over them and return the first non-escaped-newline found,
699
  /// otherwise return P.
700
  static const char *SkipEscapedNewLines(const char *P);
701
702
  /// getCharAndSizeSlowNoWarn - Same as getCharAndSizeSlow, but never emits a
703
  /// diagnostic.
704
  static char getCharAndSizeSlowNoWarn(const char *Ptr, unsigned &Size,
705
                                       const LangOptions &LangOpts);
706
707
  //===--------------------------------------------------------------------===//
708
  // Other lexer functions.
709
710
  void SetByteOffset(unsigned Offset, bool StartOfLine);
711
712
  void PropagateLineStartLeadingSpaceInfo(Token &Result);
713
714
  const char *LexUDSuffix(Token &Result, const char *CurPtr,
715
                          bool IsStringLiteral);
716
717
  // Helper functions to lex the remainder of a token of the specific type.
718
719
  // This function handles both ASCII and Unicode identifiers after
720
  // the first codepoint of the identifier has been parsed.
721
  bool LexIdentifierContinue(Token &Result, const char *CurPtr);
722
723
  bool LexNumericConstant    (Token &Result, const char *CurPtr);
724
  bool LexStringLiteral      (Token &Result, const char *CurPtr,
725
                              tok::TokenKind Kind);
726
  bool LexRawStringLiteral   (Token &Result, const char *CurPtr,
727
                              tok::TokenKind Kind);
728
  bool LexAngledStringLiteral(Token &Result, const char *CurPtr);
729
  bool LexCharConstant       (Token &Result, const char *CurPtr,
730
                              tok::TokenKind Kind);
731
  bool LexEndOfFile          (Token &Result, const char *CurPtr);
732
  bool SkipWhitespace        (Token &Result, const char *CurPtr,
733
                              bool &TokAtPhysicalStartOfLine);
734
  bool SkipLineComment       (Token &Result, const char *CurPtr,
735
                              bool &TokAtPhysicalStartOfLine);
736
  bool SkipBlockComment      (Token &Result, const char *CurPtr,
737
                              bool &TokAtPhysicalStartOfLine);
738
  bool SaveLineComment       (Token &Result, const char *CurPtr);
739
740
  bool IsStartOfConflictMarker(const char *CurPtr);
741
  bool HandleEndOfConflictMarker(const char *CurPtr);
742
743
  bool lexEditorPlaceholder(Token &Result, const char *CurPtr);
744
745
  bool isCodeCompletionPoint(const char *CurPtr) const;
746
  /// Terminate lexing of the current buffer immediately by moving BufferPtr
  /// to BufferEnd, so the next lex attempt observes end-of-buffer.
  void cutOffLexing() { BufferPtr = BufferEnd; }
747
748
  bool isHexaLiteral(const char *Start, const LangOptions &LangOpts);
749
750
  void codeCompleteIncludedFile(const char *PathStart,
751
                                const char *CompletionPoint, bool IsAngled);
752
753
  /// Read a universal character name.
754
  ///
755
  /// \param StartPtr The position in the source buffer after the initial '\'.
756
  ///                 If the UCN is syntactically well-formed (but not
757
  ///                 necessarily valid), this parameter will be updated to
758
  ///                 point to the character after the UCN.
759
  /// \param SlashLoc The position in the source buffer of the '\'.
760
  /// \param Result   The token being formed. Pass \c nullptr to suppress
761
  ///                 diagnostics and handle token formation in the caller.
762
  ///
763
  /// \return The Unicode codepoint specified by the UCN, or 0 if the UCN is
764
  ///         invalid.
765
  uint32_t tryReadUCN(const char *&StartPtr, const char *SlashLoc, Token *Result);
766
767
  /// Try to consume a UCN as part of an identifier at the current
768
  /// location.
769
  /// \param CurPtr Initially points to the range of characters in the source
770
  ///               buffer containing the '\'. Updated to point past the end of
771
  ///               the UCN on success.
772
  /// \param Size The number of characters occupied by the '\' (including
773
  ///             trigraphs and escaped newlines).
774
  /// \param Result The token being produced. Marked as containing a UCN on
775
  ///               success.
776
  /// \return \c true if a UCN was lexed and it produced an acceptable
777
  ///         identifier character, \c false otherwise.
778
  bool tryConsumeIdentifierUCN(const char *&CurPtr, unsigned Size,
779
                               Token &Result);
780
781
  /// Try to consume an identifier character encoded in UTF-8.
782
  /// \param CurPtr Points to the start of the (potential) UTF-8 code unit
783
  ///        sequence. On success, updated to point past the end of it.
784
  /// \return \c true if a UTF-8 sequence mapping to an acceptable identifier
785
  ///         character was lexed, \c false otherwise.
786
  bool tryConsumeIdentifierUTF8Char(const char *&CurPtr);
787
};
788
789
} // namespace clang
790
791
#endif // LLVM_CLANG_LEX_LEXER_H