Coverage Report

Created: 2019-07-24 05:18

/Users/buildslave/jenkins/workspace/clang-stage2-coverage-R/llvm/tools/clang/include/clang/Lex/Lexer.h
//===- Lexer.h - C Language Family Lexer ------------------------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
//  This file defines the Lexer interface.
//
//===----------------------------------------------------------------------===//

#ifndef LLVM_CLANG_LEX_LEXER_H
#define LLVM_CLANG_LEX_LEXER_H

#include "clang/Basic/LangOptions.h"
#include "clang/Basic/SourceLocation.h"
#include "clang/Basic/TokenKinds.h"
#include "clang/Lex/PreprocessorLexer.h"
#include "clang/Lex/Token.h"
#include "llvm/ADT/Optional.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringRef.h"
#include <cassert>
#include <cstdint>
#include <string>

namespace llvm {

class MemoryBuffer;

} // namespace llvm

namespace clang {

class DiagnosticBuilder;
class Preprocessor;
class SourceManager;

/// ConflictMarkerKind - Kinds of conflict marker which the lexer might be
/// recovering from.
enum ConflictMarkerKind {
  /// Not within a conflict marker.
  CMK_None,

  /// A normal or diff3 conflict marker, initiated by at least 7 "<"s,
  /// separated by at least 7 "="s or "|"s, and terminated by at least 7 ">"s.
  CMK_Normal,

  /// A Perforce-style conflict marker, initiated by 4 ">"s,
  /// separated by 4 "="s, and terminated by 4 "<"s.
  CMK_Perforce
};

/// Describes the bounds (start, size) of the preamble and a flag required by
/// PreprocessorOptions::PrecompiledPreambleBytes.
/// The preamble includes the BOM, if any.
struct PreambleBounds {
  /// Size of the preamble in bytes.
  unsigned Size;

  /// Whether the preamble ends at the start of a new line.
  ///
  /// Used to inform the lexer as to whether it's starting at the beginning of
  /// a line after skipping the preamble.
  bool PreambleEndsAtStartOfLine;

  PreambleBounds(unsigned Size, bool PreambleEndsAtStartOfLine)
      : Size(Size), PreambleEndsAtStartOfLine(PreambleEndsAtStartOfLine) {}
};

/// Lexer - This provides a simple interface that turns a text buffer into a
/// stream of tokens.  This provides no support for file reading or buffering,
/// or buffering/seeking of tokens; only forward lexing is supported.  It relies
/// on the specified Preprocessor object to handle preprocessor directives, etc.
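///
/// A minimal raw-lexing sketch (illustrative only; it assumes the caller
/// already has a SourceManager \c SM, a LangOptions \c Opts, and a FileID
/// \c FID whose buffer should be tokenized):
/// \code
///   const llvm::MemoryBuffer *Buf = SM.getBuffer(FID);
///   Lexer RawLex(FID, Buf, SM, Opts);   // raw mode: no Preprocessor needed
///   Token Tok;
///   do {
///     RawLex.LexFromRawLexer(Tok);
///     // ... inspect Tok.getKind(), Tok.getLocation(), etc. ...
///   } while (Tok.isNot(tok::eof));
/// \endcode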
class Lexer : public PreprocessorLexer {
  friend class Preprocessor;

  void anchor() override;

  //===--------------------------------------------------------------------===//
  // Constant configuration values for this lexer.

  // Start of the buffer.
  const char *BufferStart;

  // End of the buffer.
  const char *BufferEnd;

  // Location for start of file.
  SourceLocation FileLoc;

  // LangOpts enabled by this language (cache).
  LangOptions LangOpts;

  // True if lexer for _Pragma handling.
  bool Is_PragmaLexer;

  //===--------------------------------------------------------------------===//
  // Context-specific lexing flags set by the preprocessor.
  //

  /// ExtendedTokenMode - The lexer can optionally keep comments and whitespace
  /// and return them as tokens.  This is used for -C and -CC modes, and
  /// whitespace preservation can be useful for some clients that want to lex
  /// the file in raw mode and get every character from the file.
  ///
  /// When this is set to 2 it returns comments and whitespace.  When set to 1
  /// it returns comments, when it is set to 0 it returns normal tokens only.
  unsigned char ExtendedTokenMode;

  //===--------------------------------------------------------------------===//
  // Context that changes as the file is lexed.
  // NOTE: any state that mutates when in raw mode must have save/restore code
  // in Lexer::isNextPPTokenLParen.

  // BufferPtr - Current pointer into the buffer.  This is the next character
  // to be lexed.
  const char *BufferPtr;

  // IsAtStartOfLine - True if the next lexed token should get the "start of
  // line" flag set on it.
  bool IsAtStartOfLine;

  bool IsAtPhysicalStartOfLine;

  bool HasLeadingSpace;

  bool HasLeadingEmptyMacro;

  // CurrentConflictMarkerState - The kind of conflict marker we are handling.
  ConflictMarkerKind CurrentConflictMarkerState;

  void InitLexer(const char *BufStart, const char *BufPtr, const char *BufEnd);

public:
  /// Lexer constructor - Create a new lexer object for the specified buffer
  /// with the specified preprocessor managing the lexing process.  This lexer
  /// assumes that the associated file buffer and Preprocessor objects will
  /// outlive it, so it doesn't take ownership of either of them.
  Lexer(FileID FID, const llvm::MemoryBuffer *InputFile, Preprocessor &PP);

  /// Lexer constructor - Create a new raw lexer object.  This object is only
  /// suitable for calls to 'LexFromRawLexer'.  This lexer assumes that the
  /// text range will outlive it, so it doesn't take ownership of it.
  Lexer(SourceLocation FileLoc, const LangOptions &LangOpts,
        const char *BufStart, const char *BufPtr, const char *BufEnd);

  /// Lexer constructor - Create a new raw lexer object.  This object is only
  /// suitable for calls to 'LexFromRawLexer'.  This lexer assumes that the
  /// text range will outlive it, so it doesn't take ownership of it.
  Lexer(FileID FID, const llvm::MemoryBuffer *FromFile,
        const SourceManager &SM, const LangOptions &LangOpts);

  Lexer(const Lexer &) = delete;
  Lexer &operator=(const Lexer &) = delete;

  /// Create_PragmaLexer: Lexer constructor - Create a new lexer object for
  /// _Pragma expansion.  This has a variety of magic semantics that this method
  /// sets up.  It returns a new'd Lexer that must be delete'd when done.
  static Lexer *Create_PragmaLexer(SourceLocation SpellingLoc,
                                   SourceLocation ExpansionLocStart,
                                   SourceLocation ExpansionLocEnd,
                                   unsigned TokLen, Preprocessor &PP);

  /// getLangOpts - Return the language features currently enabled.
  /// NOTE: this lexer modifies features as a file is parsed!
  const LangOptions &getLangOpts() const { return LangOpts; }

  /// getFileLoc - Return the File Location for the file we are lexing out of.
  /// The physical location encodes the location where the characters come from,
  /// the virtual location encodes where we should *claim* the characters came
  /// from.  Currently this is only used by _Pragma handling.
  SourceLocation getFileLoc() const { return FileLoc; }

private:
  /// Lex - Return the next token in the file.  If this is the end of file, it
  /// returns the tok::eof token.  This implicitly involves the preprocessor.
  bool Lex(Token &Result);

public:
  /// isPragmaLexer - Returns true if this Lexer is being used to lex a pragma.
  bool isPragmaLexer() const { return Is_PragmaLexer; }

private:
  /// IndirectLex - An indirect call to 'Lex' that can be invoked via
  ///  the PreprocessorLexer interface.
  void IndirectLex(Token &Result) override { Lex(Result); }

public:
  /// LexFromRawLexer - Lex a token from a designated raw lexer (one with no
  /// associated preprocessor object).  Return true if the 'next character to
  /// read' pointer points at the end of the lexer buffer, false otherwise.
  bool LexFromRawLexer(Token &Result) {
    assert(LexingRawMode && "Not already in raw mode!");
    Lex(Result);
    // Note that lexing to the end of the buffer doesn't implicitly delete the
    // lexer when in raw mode.
    return BufferPtr == BufferEnd;
  }

  /// isKeepWhitespaceMode - Return true if the lexer should return tokens for
  /// every character in the file, including whitespace and comments.  This
  /// should only be used in raw mode, as the preprocessor is not prepared to
  /// deal with the excess tokens.
  bool isKeepWhitespaceMode() const {
    return ExtendedTokenMode > 1;
  }

  /// SetKeepWhitespaceMode - This method lets clients enable or disable
  /// whitespace retention mode.
  void SetKeepWhitespaceMode(bool Val) {
    assert((!Val || LexingRawMode || LangOpts.TraditionalCPP) &&
           "Can only retain whitespace in raw mode or -traditional-cpp");
    ExtendedTokenMode = Val ? 2 : 0;
  }

  /// inKeepCommentMode - Return true if the lexer should return comments as
  /// tokens.
  bool inKeepCommentMode() const {
    return ExtendedTokenMode > 0;
  }

  /// SetCommentRetentionState - Change the comment retention mode of the lexer
  /// to the specified mode.  This is really only useful when lexing in raw
  /// mode, because otherwise the lexer needs to manage this.
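  ///
  /// Illustrative use on a raw lexer (assumes \c RawLex was constructed with
  /// one of the raw-lexer constructors above and \c Comments is a caller-owned
  /// container of tokens):
  /// \code
  ///   RawLex.SetCommentRetentionState(true);
  ///   Token Tok;
  ///   do {
  ///     RawLex.LexFromRawLexer(Tok);
  ///     if (Tok.is(tok::comment))
  ///       Comments.push_back(Tok);
  ///   } while (Tok.isNot(tok::eof));
  /// \endcode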
  void SetCommentRetentionState(bool Mode) {
    assert(!isKeepWhitespaceMode() &&
           "Can't play with comment retention state when retaining whitespace");
    ExtendedTokenMode = Mode ? 1 : 0;
  }

  /// Sets the extended token mode back to its initial value, according to the
  /// language options and preprocessor. This controls whether the lexer
  /// produces comment and whitespace tokens.
  ///
  /// This requires the lexer to have an associated preprocessor. A standalone
  /// lexer has nothing to reset to.
  void resetExtendedTokenMode();

  /// Gets source code buffer.
  StringRef getBuffer() const {
    return StringRef(BufferStart, BufferEnd - BufferStart);
  }

  /// ReadToEndOfLine - Read the rest of the current preprocessor line as an
  /// uninterpreted string.  This switches the lexer out of directive mode.
  void ReadToEndOfLine(SmallVectorImpl<char> *Result = nullptr);


  /// Diag - Forwarding function for diagnostics.  This translates a source
  /// position in the current buffer into a SourceLocation object for rendering.
  DiagnosticBuilder Diag(const char *Loc, unsigned DiagID) const;

  /// getSourceLocation - Return a source location identifier for the specified
  /// offset in the current file.
  SourceLocation getSourceLocation(const char *Loc, unsigned TokLen = 1) const;

  /// getSourceLocation - Return a source location for the next character in
  /// the current file.
  SourceLocation getSourceLocation() override {
    return getSourceLocation(BufferPtr);
  }

  /// Return the current location in the buffer.
  const char *getBufferLocation() const { return BufferPtr; }

  /// Stringify - Convert the specified string into a C string by i) escaping
  /// '\\' and " characters and ii) replacing newline character(s) with "\\n".
  /// If Charify is true, this escapes the ' character instead of ".
  static std::string Stringify(StringRef Str, bool Charify = false);

  /// Stringify - Convert the specified string into a C string by i) escaping
  /// '\\' and " characters and ii) replacing newline character(s) with "\\n".
  static void Stringify(SmallVectorImpl<char> &Str);

  /// getSpelling - This method is used to get the spelling of a token into a
  /// preallocated buffer, instead of as an std::string.  The caller is required
  /// to allocate enough space for the token, which is guaranteed to be at least
  /// Tok.getLength() bytes long.  The length of the actual result is returned.
  ///
  /// Note that this method may do two possible things: it may either fill in
  /// the buffer specified with characters, or it may *change the input pointer*
  /// to point to a constant buffer with the data already in it (avoiding a
  /// copy).  The caller is not allowed to modify the returned buffer pointer
  /// if an internal buffer is returned.
  static unsigned getSpelling(const Token &Tok, const char *&Buffer,
                              const SourceManager &SourceMgr,
                              const LangOptions &LangOpts,
                              bool *Invalid = nullptr);

  /// getSpelling() - Return the 'spelling' of the Tok token.  The spelling of a
  /// token is the characters used to represent the token in the source file
  /// after trigraph expansion and escaped-newline folding.  In particular, this
  /// wants to get the true, uncanonicalized, spelling of things like digraphs,
  /// UCNs, etc.
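  ///
  /// Typical use (illustrative; \c Tok, \c SM and \c LangOpts are assumed to
  /// be in scope):
  /// \code
  ///   std::string Text = Lexer::getSpelling(Tok, SM, LangOpts);
  /// \endcode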
  static std::string getSpelling(const Token &Tok,
                                 const SourceManager &SourceMgr,
                                 const LangOptions &LangOpts,
                                 bool *Invalid = nullptr);

  /// getSpelling - This method is used to get the spelling of the
  /// token at the given source location.  If, as is usually true, it
  /// is not necessary to copy any data, then the returned string may
  /// not point into the provided buffer.
  ///
  /// This method lexes at the expansion depth of the given
  /// location and does not jump to the expansion or spelling
  /// location.
  static StringRef getSpelling(SourceLocation loc,
                               SmallVectorImpl<char> &buffer,
                               const SourceManager &SM,
                               const LangOptions &options,
                               bool *invalid = nullptr);

  /// MeasureTokenLength - Relex the token at the specified location and return
  /// its length in bytes in the input file.  If the token needs cleaning (e.g.
  /// includes a trigraph or an escaped newline) then this count includes bytes
  /// that are part of that.
  static unsigned MeasureTokenLength(SourceLocation Loc,
                                     const SourceManager &SM,
                                     const LangOptions &LangOpts);

  /// Relex the token at the specified location.
  /// \returns true if there was a failure, false on success.
  static bool getRawToken(SourceLocation Loc, Token &Result,
                          const SourceManager &SM,
                          const LangOptions &LangOpts,
                          bool IgnoreWhiteSpace = false);

  /// Given a location anywhere in a source buffer, find the location
  /// that corresponds to the beginning of the token in which the original
  /// source location lands.
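  ///
  /// Combined with MeasureTokenLength, this recovers the full extent of the
  /// token containing a location (illustrative; \c Loc, \c SM and \c LangOpts
  /// are assumed):
  /// \code
  ///   SourceLocation Begin = Lexer::GetBeginningOfToken(Loc, SM, LangOpts);
  ///   unsigned Len = Lexer::MeasureTokenLength(Begin, SM, LangOpts);
  ///   SourceLocation End = Begin.getLocWithOffset(Len);
  /// \endcode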
  static SourceLocation GetBeginningOfToken(SourceLocation Loc,
                                            const SourceManager &SM,
                                            const LangOptions &LangOpts);

  /// Get the physical length (including trigraphs and escaped newlines) of the
  /// first \p CharNo characters of the token starting at TokStart.
  static unsigned getTokenPrefixLength(SourceLocation TokStart,
                                       unsigned CharNo,
                                       const SourceManager &SM,
                                       const LangOptions &LangOpts);

  /// AdvanceToTokenCharacter - If the current SourceLocation specifies a
  /// location at the start of a token, return a new location that specifies a
  /// character within the token.  This handles trigraphs and escaped newlines.
  static SourceLocation AdvanceToTokenCharacter(SourceLocation TokStart,
                                                unsigned Characters,
                                                const SourceManager &SM,
                                                const LangOptions &LangOpts) {
    return TokStart.getLocWithOffset(
        getTokenPrefixLength(TokStart, Characters, SM, LangOpts));
  }

  /// Computes the source location just past the end of the
  /// token at this source location.
  ///
  /// This routine can be used to produce a source location that
  /// points just past the end of the token referenced by \p Loc, and
  /// is generally used when a diagnostic needs to point just after a
  /// token where it expected something different than it received. If
  /// the returned source location would not be meaningful (e.g., if
  /// it points into a macro), this routine returns an invalid
  /// source location.
  ///
  /// \param Offset an offset from the end of the token, where the source
  /// location should refer to. The default offset (0) produces a source
  /// location pointing just past the end of the token; an offset of 1 produces
  /// a source location pointing to the last character in the token, etc.
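  ///
  /// For example, to anchor a fix-it immediately after the token at \c Loc
  /// (illustrative; \c Loc, \c SM and \c LangOpts are assumed):
  /// \code
  ///   SourceLocation AfterTok =
  ///       Lexer::getLocForEndOfToken(Loc, /*Offset=*/0, SM, LangOpts);
  ///   if (AfterTok.isValid()) {
  ///     // e.g. attach FixItHint::CreateInsertion(AfterTok, ";") to a diagnostic.
  ///   }
  /// \endcode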
  static SourceLocation getLocForEndOfToken(SourceLocation Loc, unsigned Offset,
                                            const SourceManager &SM,
                                            const LangOptions &LangOpts);

  /// Given a token range, produce a corresponding CharSourceRange that
  /// is not a token range. This allows the source range to be used by
  /// components that don't have access to the lexer and thus can't find the
  /// end of the range for themselves.
  static CharSourceRange getAsCharRange(SourceRange Range,
                                        const SourceManager &SM,
                                        const LangOptions &LangOpts) {
    SourceLocation End = getLocForEndOfToken(Range.getEnd(), 0, SM, LangOpts);
    return End.isInvalid() ? CharSourceRange()
                           : CharSourceRange::getCharRange(
                                 Range.getBegin(), End);
  }
  static CharSourceRange getAsCharRange(CharSourceRange Range,
                                        const SourceManager &SM,
                                        const LangOptions &LangOpts) {
    return Range.isTokenRange()
               ? getAsCharRange(Range.getAsRange(), SM, LangOpts)
               : Range;
  }

  /// Returns true if the given MacroID location points at the first
  /// token of the macro expansion.
  ///
  /// \param MacroBegin If non-null and function returns true, it is set to
  /// begin location of the macro.
  static bool isAtStartOfMacroExpansion(SourceLocation loc,
                                        const SourceManager &SM,
                                        const LangOptions &LangOpts,
                                        SourceLocation *MacroBegin = nullptr);

  /// Returns true if the given MacroID location points at the last
  /// token of the macro expansion.
  ///
  /// \param MacroEnd If non-null and function returns true, it is set to
  /// end location of the macro.
  static bool isAtEndOfMacroExpansion(SourceLocation loc,
                                      const SourceManager &SM,
                                      const LangOptions &LangOpts,
                                      SourceLocation *MacroEnd = nullptr);

  /// Accepts a range and returns a character range with file locations.
  ///
  /// Returns a null range if a part of the range resides inside a macro
  /// expansion or the range does not reside on the same FileID.
  ///
  /// This function is trying to deal with macros and return a range based on
  /// file locations. The cases where it can successfully handle macros are:
  ///
  /// -begin or end range lies at the start or end of a macro expansion, in
  ///  which case the location will be set to the expansion point, e.g:
  ///    \#define M 1 2
  ///    a M
  /// If you have a range [a, 2] (where 2 came from the macro), the function
  /// will return a range for "a M".
  /// If you have range [a, 1], the function will fail because the range
  /// overlaps with only a part of the macro.
  ///
  /// -The macro is a function macro and the range can be mapped to the macro
  ///  arguments, e.g:
  ///    \#define M 1 2
  ///    \#define FM(x) x
  ///    FM(a b M)
  /// If you have range [b, 2], the function will return the file range "b M"
  /// inside the macro arguments.
  /// If you have range [a, 2], the function will return the file range
  /// "FM(a b M)" since the range includes all of the macro expansion.
  static CharSourceRange makeFileCharRange(CharSourceRange Range,
                                           const SourceManager &SM,
                                           const LangOptions &LangOpts);

  /// Returns a string for the source that the range encompasses.
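  ///
  /// A common pattern is to pair this with makeFileCharRange (illustrative;
  /// \c Range is a token CharSourceRange and \c SM, \c LangOpts are assumed):
  /// \code
  ///   CharSourceRange FileRange = Lexer::makeFileCharRange(Range, SM, LangOpts);
  ///   StringRef Snippet = Lexer::getSourceText(FileRange, SM, LangOpts);
  /// \endcode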
  static StringRef getSourceText(CharSourceRange Range,
                                 const SourceManager &SM,
                                 const LangOptions &LangOpts,
                                 bool *Invalid = nullptr);

  /// Retrieve the name of the immediate macro expansion.
  ///
  /// This routine starts from a source location, and finds the name of the macro
  /// responsible for its immediate expansion. It looks through any intervening
  /// macro argument expansions to compute this. It returns a StringRef which
  /// refers to the SourceManager-owned buffer of the source where that macro
  /// name is spelled. Thus, the result shouldn't out-live that SourceManager.
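  ///
  /// Illustrative use (only meaningful for macro locations; \c Loc, \c SM and
  /// \c LangOpts are assumed):
  /// \code
  ///   StringRef Name;
  ///   if (Loc.isMacroID())
  ///     Name = Lexer::getImmediateMacroName(Loc, SM, LangOpts);
  /// \endcode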
  static StringRef getImmediateMacroName(SourceLocation Loc,
                                         const SourceManager &SM,
                                         const LangOptions &LangOpts);

  /// Retrieve the name of the immediate macro expansion.
  ///
  /// This routine starts from a source location, and finds the name of the
  /// macro responsible for its immediate expansion. It looks through any
  /// intervening macro argument expansions to compute this. It returns a
  /// StringRef which refers to the SourceManager-owned buffer of the source
  /// where that macro name is spelled. Thus, the result shouldn't out-live
  /// that SourceManager.
  ///
  /// This differs from Lexer::getImmediateMacroName in that any macro argument
  /// location will result in the topmost function macro that accepted it.
  /// e.g.
  /// \code
  ///   MAC1( MAC2(foo) )
  /// \endcode
  /// for location of 'foo' token, this function will return "MAC1" while
  /// Lexer::getImmediateMacroName will return "MAC2".
  static StringRef getImmediateMacroNameForDiagnostics(
      SourceLocation Loc, const SourceManager &SM, const LangOptions &LangOpts);

  /// Compute the preamble of the given file.
  ///
  /// The preamble of a file contains the initial comments, include directives,
  /// and other preprocessor directives that occur before the code in this
  /// particular file actually begins. The preamble of the main source file is
  /// a potential prefix header.
  ///
  /// \param Buffer The memory buffer containing the file's contents.
  ///
  /// \param MaxLines If non-zero, restrict the length of the preamble
  /// to fewer than this number of lines.
  ///
  /// \returns The offset into the file where the preamble ends and the rest
  /// of the file begins along with a boolean value indicating whether
  /// the preamble ends at the beginning of a new line.
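  ///
  /// Illustrative use (\c Code is the main file's contents as a StringRef and
  /// \c LangOpts is assumed):
  /// \code
  ///   PreambleBounds Bounds = Lexer::ComputePreamble(Code, LangOpts);
  ///   // Bounds.Size is the byte offset at which the preamble ends.
  /// \endcode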
  static PreambleBounds ComputePreamble(StringRef Buffer,
                                        const LangOptions &LangOpts,
                                        unsigned MaxLines = 0);

  /// Finds the token that comes right after the given location.
  ///
  /// Returns the next token, or none if the location is inside a macro.
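  ///
  /// Illustrative use (\c Loc, \c SM and \c LangOpts are assumed):
  /// \code
  ///   Optional<Token> Next = Lexer::findNextToken(Loc, SM, LangOpts);
  ///   if (Next && Next->is(tok::semi)) {
  ///     // e.g. the statement is already terminated.
  ///   }
  /// \endcode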
  static Optional<Token> findNextToken(SourceLocation Loc,
                                       const SourceManager &SM,
                                       const LangOptions &LangOpts);

  /// Checks that the given token is the first token that occurs after
  /// the given location (this excludes comments and whitespace). Returns the
  /// location immediately after the specified token. If the token is not found
  /// or the location is inside a macro, the returned source location will be
  /// invalid.
  static SourceLocation findLocationAfterToken(SourceLocation loc,
                                         tok::TokenKind TKind,
                                         const SourceManager &SM,
                                         const LangOptions &LangOpts,
                                         bool SkipTrailingWhitespaceAndNewLine);

  /// Returns true if the given character could appear in an identifier.
  static bool isIdentifierBodyChar(char c, const LangOptions &LangOpts);

  /// Checks whether the newline pointed to by \p Str is preceded by an
  /// escape sequence.
  static bool isNewLineEscaped(const char *BufferStart, const char *Str);

  /// getCharAndSizeNoWarn - Like the getCharAndSize method, but does not ever
  /// emit a warning.
  static inline char getCharAndSizeNoWarn(const char *Ptr, unsigned &Size,
                                          const LangOptions &LangOpts) {
    // If this is not a trigraph and not a UCN or escaped newline, return
    // quickly.
    if (isObviouslySimpleCharacter(Ptr[0])) {
      Size = 1;
      return *Ptr;
    }

    Size = 0;
    return getCharAndSizeSlowNoWarn(Ptr, Size, LangOpts);
  }

  /// Returns the leading whitespace for the line that corresponds to the given
  /// location \p Loc.
  static StringRef getIndentationForLine(SourceLocation Loc,
                                         const SourceManager &SM);

private:
  //===--------------------------------------------------------------------===//
  // Internal implementation interfaces.

  /// LexTokenInternal - Internal interface to lex a preprocessing token. Called
  /// by Lex.
  ///
  bool LexTokenInternal(Token &Result, bool TokAtPhysicalStartOfLine);

  bool CheckUnicodeWhitespace(Token &Result, uint32_t C, const char *CurPtr);

  /// Given that a token begins with the Unicode character \p C, figure out
  /// what kind of token it is and dispatch to the appropriate lexing helper
  /// function.
  bool LexUnicode(Token &Result, uint32_t C, const char *CurPtr);

  /// FormTokenWithChars - When we lex a token, we have identified a span
  /// starting at BufferPtr, going to TokEnd that forms the token.  This method
  /// takes that range and assigns it to the token as its location and size.  In
  /// addition, since tokens cannot overlap, this also updates BufferPtr to be
  /// TokEnd.
  void FormTokenWithChars(Token &Result, const char *TokEnd,
                          tok::TokenKind Kind) {
    unsigned TokLen = TokEnd-BufferPtr;
    Result.setLength(TokLen);
    Result.setLocation(getSourceLocation(BufferPtr, TokLen));
    Result.setKind(Kind);
    BufferPtr = TokEnd;
  }

  /// isNextPPTokenLParen - Return 1 if the next unexpanded token will return a
  /// tok::l_paren token, 0 if it is something else and 2 if there are no more
  /// tokens in the buffer controlled by this lexer.
  unsigned isNextPPTokenLParen();

  //===--------------------------------------------------------------------===//
  // Lexer character reading interfaces.

  // This lexer is built on two interfaces for reading characters, both of which
  // automatically provide phase 1/2 translation.  getAndAdvanceChar is used
  // when we know that we will be reading a character from the input buffer and
  // that this character will be part of the result token. This occurs in, for
  // example, string processing, because we know we need to read until we find
  // the closing '"' character.
  //
  // The second interface is the combination of getCharAndSize with
  // ConsumeChar.  getCharAndSize reads a phase 1/2 translated character,
  // returning it and its size.  If the lexer decides that this character is
  // part of the current token, it calls ConsumeChar on it.  This two stage
  // approach allows us to emit diagnostics for characters (e.g. warnings about
  // trigraphs), knowing that they only are emitted if the character is
  // consumed.
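  //
  // Illustrative shape of the second interface (a sketch, not actual lexer
  // code): peek with getCharAndSize, decide, and only then ConsumeChar, so
  // trigraph/escaped-newline diagnostics fire solely for consumed characters.
  //
  //   unsigned Size;
  //   char C = getCharAndSize(CurPtr, Size);
  //   if (C == '=')                       // e.g. growing '+' into '+='
  //     CurPtr = ConsumeChar(CurPtr, Size, Result);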

  /// isObviouslySimpleCharacter - Return true if the specified character is
  /// obviously the same in translation phase 1 and translation phase 3.  This
  /// can return false for characters that end up being the same, but it will
  /// never return true for something that needs to be mapped.
  static bool isObviouslySimpleCharacter(char C) {
    return C != '?' && C != '\\';
  }

  /// getAndAdvanceChar - Read a single 'character' from the specified buffer,
  /// advance over it, and return it.  This is tricky in several cases.  Here we
  /// just handle the trivial case and fall-back to the non-inlined
  /// getCharAndSizeSlow method to handle the hard case.
  inline char getAndAdvanceChar(const char *&Ptr, Token &Tok) {
    // If this is not a trigraph and not a UCN or escaped newline, return
    // quickly.
    if (isObviouslySimpleCharacter(Ptr[0])) return *Ptr++;

    unsigned Size = 0;
    char C = getCharAndSizeSlow(Ptr, Size, &Tok);
    Ptr += Size;
    return C;
  }

  /// ConsumeChar - When a character (identified by getCharAndSize) is consumed
  /// and added to a given token, check to see if there are diagnostics that
  /// need to be emitted or flags that need to be set on the token.  If so, do
  /// it.
  const char *ConsumeChar(const char *Ptr, unsigned Size, Token &Tok) {
    // Normal case, we consumed exactly one token.  Just return it.
    if (Size == 1)
      return Ptr+Size;

    // Otherwise, re-lex the character with a current token, allowing
    // diagnostics to be emitted and flags to be set.
    Size = 0;
    getCharAndSizeSlow(Ptr, Size, &Tok);
    return Ptr+Size;
  }

  /// getCharAndSize - Peek a single 'character' from the specified buffer,
  /// get its size, and return it.  This is tricky in several cases.  Here we
  /// just handle the trivial case and fall-back to the non-inlined
  /// getCharAndSizeSlow method to handle the hard case.
  inline char getCharAndSize(const char *Ptr, unsigned &Size) {
    // If this is not a trigraph and not a UCN or escaped newline, return
    // quickly.
    if (isObviouslySimpleCharacter(Ptr[0])) {
      Size = 1;
      return *Ptr;
    }

    Size = 0;
    return getCharAndSizeSlow(Ptr, Size);
  }

  /// getCharAndSizeSlow - Handle the slow/uncommon case of the getCharAndSize
  /// method.
  char getCharAndSizeSlow(const char *Ptr, unsigned &Size,
                          Token *Tok = nullptr);

  /// getEscapedNewLineSize - Return the size of the specified escaped newline,
  /// or 0 if it is not an escaped newline. P[-1] is known to be a "\" on entry
  /// to this function.
  static unsigned getEscapedNewLineSize(const char *P);

  /// SkipEscapedNewLines - If P points to an escaped newline (or a series of
  /// them), skip over them and return the first non-escaped-newline found,
  /// otherwise return P.
  static const char *SkipEscapedNewLines(const char *P);

  /// getCharAndSizeSlowNoWarn - Same as getCharAndSizeSlow, but never emits a
  /// diagnostic.
  static char getCharAndSizeSlowNoWarn(const char *Ptr, unsigned &Size,
                                       const LangOptions &LangOpts);

  //===--------------------------------------------------------------------===//
  // Other lexer functions.

  void SetByteOffset(unsigned Offset, bool StartOfLine);

  void PropagateLineStartLeadingSpaceInfo(Token &Result);

  const char *LexUDSuffix(Token &Result, const char *CurPtr,
                          bool IsStringLiteral);

  // Helper functions to lex the remainder of a token of the specific type.
  bool LexIdentifier         (Token &Result, const char *CurPtr);
  bool LexNumericConstant    (Token &Result, const char *CurPtr);
  bool LexStringLiteral      (Token &Result, const char *CurPtr,
                              tok::TokenKind Kind);
  bool LexRawStringLiteral   (Token &Result, const char *CurPtr,
                              tok::TokenKind Kind);
  bool LexAngledStringLiteral(Token &Result, const char *CurPtr);
  bool LexCharConstant       (Token &Result, const char *CurPtr,
                              tok::TokenKind Kind);
  bool LexEndOfFile          (Token &Result, const char *CurPtr);
  bool SkipWhitespace        (Token &Result, const char *CurPtr,
                              bool &TokAtPhysicalStartOfLine);
  bool SkipLineComment       (Token &Result, const char *CurPtr,
                              bool &TokAtPhysicalStartOfLine);
  bool SkipBlockComment      (Token &Result, const char *CurPtr,
                              bool &TokAtPhysicalStartOfLine);
  bool SaveLineComment       (Token &Result, const char *CurPtr);

  bool IsStartOfConflictMarker(const char *CurPtr);
  bool HandleEndOfConflictMarker(const char *CurPtr);

  bool lexEditorPlaceholder(Token &Result, const char *CurPtr);

  bool isCodeCompletionPoint(const char *CurPtr) const;
  void cutOffLexing() { BufferPtr = BufferEnd; }

  bool isHexaLiteral(const char *Start, const LangOptions &LangOpts);

  void codeCompleteIncludedFile(const char *PathStart,
                                const char *CompletionPoint, bool IsAngled);

  /// Read a universal character name.
  ///
  /// \param StartPtr The position in the source buffer after the initial '\'.
  ///                 If the UCN is syntactically well-formed (but not
  ///                 necessarily valid), this parameter will be updated to
  ///                 point to the character after the UCN.
  /// \param SlashLoc The position in the source buffer of the '\'.
  /// \param Result   The token being formed. Pass \c nullptr to suppress
  ///                 diagnostics and handle token formation in the caller.
  ///
  /// \return The Unicode codepoint specified by the UCN, or 0 if the UCN is
  ///         invalid.
  uint32_t tryReadUCN(const char *&StartPtr, const char *SlashLoc, Token *Result);

  /// Try to consume a UCN as part of an identifier at the current
  /// location.
  /// \param CurPtr Initially points to the range of characters in the source
  ///               buffer containing the '\'. Updated to point past the end of
  ///               the UCN on success.
  /// \param Size The number of characters occupied by the '\' (including
  ///             trigraphs and escaped newlines).
  /// \param Result The token being produced. Marked as containing a UCN on
  ///               success.
  /// \return \c true if a UCN was lexed and it produced an acceptable
  ///         identifier character, \c false otherwise.
  bool tryConsumeIdentifierUCN(const char *&CurPtr, unsigned Size,
                               Token &Result);

  /// Try to consume an identifier character encoded in UTF-8.
  /// \param CurPtr Points to the start of the (potential) UTF-8 code unit
  ///        sequence. On success, updated to point past the end of it.
  /// \return \c true if a UTF-8 sequence mapping to an acceptable identifier
  ///         character was lexed, \c false otherwise.
  bool tryConsumeIdentifierUTF8Char(const char *&CurPtr);
};

} // namespace clang

#endif // LLVM_CLANG_LEX_LEXER_H