Coverage Report

Created: 2019-07-24 05:18

/Users/buildslave/jenkins/workspace/clang-stage2-coverage-R/llvm/tools/clang/lib/Lex/Lexer.cpp
Line
Count
Source (jump to first uncovered line)
1
//===- Lexer.cpp - C Language Family Lexer --------------------------------===//
2
//
3
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4
// See https://llvm.org/LICENSE.txt for license information.
5
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6
//
7
//===----------------------------------------------------------------------===//
8
//
9
//  This file implements the Lexer and Token interfaces.
10
//
11
//===----------------------------------------------------------------------===//
12
13
#include "clang/Lex/Lexer.h"
14
#include "UnicodeCharSets.h"
15
#include "clang/Basic/CharInfo.h"
16
#include "clang/Basic/IdentifierTable.h"
17
#include "clang/Basic/LangOptions.h"
18
#include "clang/Basic/SourceLocation.h"
19
#include "clang/Basic/SourceManager.h"
20
#include "clang/Basic/TokenKinds.h"
21
#include "clang/Lex/LexDiagnostic.h"
22
#include "clang/Lex/LiteralSupport.h"
23
#include "clang/Lex/MultipleIncludeOpt.h"
24
#include "clang/Lex/Preprocessor.h"
25
#include "clang/Lex/PreprocessorOptions.h"
26
#include "clang/Lex/Token.h"
27
#include "clang/Basic/Diagnostic.h"
28
#include "clang/Basic/LLVM.h"
29
#include "clang/Basic/TokenKinds.h"
30
#include "llvm/ADT/None.h"
31
#include "llvm/ADT/Optional.h"
32
#include "llvm/ADT/StringExtras.h"
33
#include "llvm/ADT/StringSwitch.h"
34
#include "llvm/ADT/StringRef.h"
35
#include "llvm/Support/Compiler.h"
36
#include "llvm/Support/ConvertUTF.h"
37
#include "llvm/Support/MathExtras.h"
38
#include "llvm/Support/MemoryBuffer.h"
39
#include "llvm/Support/NativeFormatting.h"
40
#include "llvm/Support/UnicodeCharRanges.h"
41
#include <algorithm>
42
#include <cassert>
43
#include <cstddef>
44
#include <cstdint>
45
#include <cstring>
46
#include <string>
47
#include <tuple>
48
#include <utility>
49
50
using namespace clang;
51
52
//===----------------------------------------------------------------------===//
53
// Token Class Implementation
54
//===----------------------------------------------------------------------===//
55
56
/// isObjCAtKeyword - Return true if we have an ObjC keyword identifier.
57
152k
bool Token::isObjCAtKeyword(tok::ObjCKeywordKind objcKey) const {
58
152k
  if (isAnnotation())
59
2
    return false;
60
152k
  if (IdentifierInfo *II = getIdentifierInfo())
61
140k
    return II->getObjCKeywordID() == objcKey;
62
11.8k
  return false;
63
11.8k
}
64
65
/// getObjCKeywordID - Return the ObjC keyword kind.
66
470k
tok::ObjCKeywordKind Token::getObjCKeywordID() const {
67
470k
  if (isAnnotation())
68
1
    return tok::objc_not_keyword;
69
470k
  IdentifierInfo *specId = getIdentifierInfo();
70
470k
  return specId ? 
specId->getObjCKeywordID()266k
:
tok::objc_not_keyword203k
;
71
470k
}
72
73
//===----------------------------------------------------------------------===//
74
// Lexer Class Implementation
75
//===----------------------------------------------------------------------===//
76
77
0
void Lexer::anchor() {}
78
79
void Lexer::InitLexer(const char *BufStart, const char *BufPtr,
80
13.6M
                      const char *BufEnd) {
81
13.6M
  BufferStart = BufStart;
82
13.6M
  BufferPtr = BufPtr;
83
13.6M
  BufferEnd = BufEnd;
84
13.6M
85
13.6M
  assert(BufEnd[0] == 0 &&
86
13.6M
         "We assume that the input buffer has a null character at the end"
87
13.6M
         " to simplify lexing!");
88
13.6M
89
13.6M
  // Check whether we have a BOM in the beginning of the buffer. If yes - act
90
13.6M
  // accordingly. Right now we support only UTF-8 with and without BOM, so, just
91
13.6M
  // skip the UTF-8 BOM if it's present.
92
13.6M
  if (BufferStart == BufferPtr) {
93
638k
    // Determine the size of the BOM.
94
638k
    StringRef Buf(BufferStart, BufferEnd - BufferStart);
95
638k
    size_t BOMLength = llvm::StringSwitch<size_t>(Buf)
96
638k
      .StartsWith("\xEF\xBB\xBF", 3) // UTF-8 BOM
97
638k
      .Default(0);
98
638k
99
638k
    // Skip the BOM.
100
638k
    BufferPtr += BOMLength;
101
638k
  }
102
13.6M
103
13.6M
  Is_PragmaLexer = false;
104
13.6M
  CurrentConflictMarkerState = CMK_None;
105
13.6M
106
13.6M
  // Start of the file is a start of line.
107
13.6M
  IsAtStartOfLine = true;
108
13.6M
  IsAtPhysicalStartOfLine = true;
109
13.6M
110
13.6M
  HasLeadingSpace = false;
111
13.6M
  HasLeadingEmptyMacro = false;
112
13.6M
113
13.6M
  // We are not after parsing a #.
114
13.6M
  ParsingPreprocessorDirective = false;
115
13.6M
116
13.6M
  // We are not after parsing #include.
117
13.6M
  ParsingFilename = false;
118
13.6M
119
13.6M
  // We are not in raw mode.  Raw mode disables diagnostics and interpretation
120
13.6M
  // of tokens (e.g. identifiers, thus disabling macro expansion).  It is used
121
13.6M
  // to quickly lex the tokens of the buffer, e.g. when handling a "#if 0" block
122
13.6M
  // or otherwise skipping over tokens.
123
13.6M
  LexingRawMode = false;
124
13.6M
125
13.6M
  // Default to not keeping comments.
126
13.6M
  ExtendedTokenMode = 0;
127
13.6M
}
128
129
/// Lexer constructor - Create a new lexer object for the specified buffer
130
/// with the specified preprocessor managing the lexing process.  This lexer
131
/// assumes that the associated file buffer and Preprocessor objects will
132
/// outlive it, so it doesn't take ownership of either of them.
133
Lexer::Lexer(FileID FID, const llvm::MemoryBuffer *InputFile, Preprocessor &PP)
134
    : PreprocessorLexer(&PP, FID),
135
      FileLoc(PP.getSourceManager().getLocForStartOfFile(FID)),
136
602k
      LangOpts(PP.getLangOpts()) {
137
602k
  InitLexer(InputFile->getBufferStart(), InputFile->getBufferStart(),
138
602k
            InputFile->getBufferEnd());
139
602k
140
602k
  resetExtendedTokenMode();
141
602k
}
142
143
/// Lexer constructor - Create a new raw lexer object.  This object is only
144
/// suitable for calls to 'LexFromRawLexer'.  This lexer assumes that the text
145
/// range will outlive it, so it doesn't take ownership of it.
146
Lexer::Lexer(SourceLocation fileloc, const LangOptions &langOpts,
147
             const char *BufStart, const char *BufPtr, const char *BufEnd)
148
13.0M
    : FileLoc(fileloc), LangOpts(langOpts) {
149
13.0M
  InitLexer(BufStart, BufPtr, BufEnd);
150
13.0M
151
13.0M
  // We *are* in raw mode.
152
13.0M
  LexingRawMode = true;
153
13.0M
}
154
155
/// Lexer constructor - Create a new raw lexer object.  This object is only
156
/// suitable for calls to 'LexFromRawLexer'.  This lexer assumes that the text
157
/// range will outlive it, so it doesn't take ownership of it.
158
Lexer::Lexer(FileID FID, const llvm::MemoryBuffer *FromFile,
159
             const SourceManager &SM, const LangOptions &langOpts)
160
    : Lexer(SM.getLocForStartOfFile(FID), langOpts, FromFile->getBufferStart(),
161
31.2k
            FromFile->getBufferStart(), FromFile->getBufferEnd()) {}
Unexecuted instantiation: clang::Lexer::Lexer(clang::FileID, llvm::MemoryBuffer const*, clang::SourceManager const&, clang::LangOptions const&)
clang::Lexer::Lexer(clang::FileID, llvm::MemoryBuffer const*, clang::SourceManager const&, clang::LangOptions const&)
Line
Count
Source
161
31.2k
            FromFile->getBufferStart(), FromFile->getBufferEnd()) {}
162
163
168M
void Lexer::resetExtendedTokenMode() {
164
168M
  assert(PP && "Cannot reset token mode without a preprocessor");
165
168M
  if (LangOpts.TraditionalCPP)
166
1.10k
    SetKeepWhitespaceMode(true);
167
168M
  else
168
168M
    SetCommentRetentionState(PP->getCommentRetentionState());
169
168M
}
170
171
/// Create_PragmaLexer: Lexer constructor - Create a new lexer object for
172
/// _Pragma expansion.  This has a variety of magic semantics that this method
173
/// sets up.  It returns a new'd Lexer that must be delete'd when done.
174
///
175
/// On entrance to this routine, TokStartLoc is a macro location which has a
176
/// spelling loc that indicates the bytes to be lexed for the token and an
177
/// expansion location that indicates where all lexed tokens should be
178
/// "expanded from".
179
///
180
/// TODO: It would really be nice to make _Pragma just be a wrapper around a
181
/// normal lexer that remaps tokens as they fly by.  This would require making
182
/// Preprocessor::Lex virtual.  Given that, we could just dump in a magic lexer
183
/// interface that could handle this stuff.  This would pull GetMappedTokenLoc
184
/// out of the critical path of the lexer!
185
///
186
Lexer *Lexer::Create_PragmaLexer(SourceLocation SpellingLoc,
187
                                 SourceLocation ExpansionLocStart,
188
                                 SourceLocation ExpansionLocEnd,
189
36.4k
                                 unsigned TokLen, Preprocessor &PP) {
190
36.4k
  SourceManager &SM = PP.getSourceManager();
191
36.4k
192
36.4k
  // Create the lexer as if we were going to lex the file normally.
193
36.4k
  FileID SpellingFID = SM.getFileID(SpellingLoc);
194
36.4k
  const llvm::MemoryBuffer *InputFile = SM.getBuffer(SpellingFID);
195
36.4k
  Lexer *L = new Lexer(SpellingFID, InputFile, PP);
196
36.4k
197
36.4k
  // Now that the lexer is created, change the start/end locations so that we
198
36.4k
  // just lex the subsection of the file that we want.  This is lexing from a
199
36.4k
  // scratch buffer.
200
36.4k
  const char *StrData = SM.getCharacterData(SpellingLoc);
201
36.4k
202
36.4k
  L->BufferPtr = StrData;
203
36.4k
  L->BufferEnd = StrData+TokLen;
204
36.4k
  assert(L->BufferEnd[0] == 0 && "Buffer is not nul terminated!");
205
36.4k
206
36.4k
  // Set the SourceLocation with the remapping information.  This ensures that
207
36.4k
  // GetMappedTokenLoc will remap the tokens as they are lexed.
208
36.4k
  L->FileLoc = SM.createExpansionLoc(SM.getLocForStartOfFile(SpellingFID),
209
36.4k
                                     ExpansionLocStart,
210
36.4k
                                     ExpansionLocEnd, TokLen);
211
36.4k
212
36.4k
  // Ensure that the lexer thinks it is inside a directive, so that end \n will
213
36.4k
  // return an EOD token.
214
36.4k
  L->ParsingPreprocessorDirective = true;
215
36.4k
216
36.4k
  // This lexer really is for _Pragma.
217
36.4k
  L->Is_PragmaLexer = true;
218
36.4k
  return L;
219
36.4k
}
220
221
116k
template <typename T> static void StringifyImpl(T &Str, char Quote) {
222
116k
  typename T::size_type i = 0, e = Str.size();
223
15.5M
  while (i < e) {
224
15.4M
    if (Str[i] == '\\' || 
Str[i] == Quote15.4M
) {
225
7.10k
      Str.insert(Str.begin() + i, '\\');
226
7.10k
      i += 2;
227
7.10k
      ++e;
228
15.4M
    } else if (Str[i] == '\n' || 
Str[i] == '\r'15.4M
) {
229
17
      // Replace '\r\n' and '\n\r' to '\\' followed by 'n'.
230
17
      if ((i < e - 1) && (Str[i + 1] == '\n' || 
Str[i + 1] == '\r'13
) &&
231
17
          
Str[i] != Str[i + 1]4
) {
232
0
        Str[i] = '\\';
233
0
        Str[i + 1] = 'n';
234
17
      } else {
235
17
        // Replace '\n' and '\r' to '\\' followed by 'n'.
236
17
        Str[i] = '\\';
237
17
        Str.insert(Str.begin() + i + 1, 'n');
238
17
        ++e;
239
17
      }
240
17
      i += 2;
241
17
    } else
242
15.4M
      ++i;
243
15.4M
  }
244
116k
}
Lexer.cpp:void StringifyImpl<std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > >(std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> >&, char)
Line
Count
Source
221
3.58k
template <typename T> static void StringifyImpl(T &Str, char Quote) {
222
3.58k
  typename T::size_type i = 0, e = Str.size();
223
99.5k
  while (i < e) {
224
95.9k
    if (Str[i] == '\\' || 
Str[i] == Quote95.9k
) {
225
7.09k
      Str.insert(Str.begin() + i, '\\');
226
7.09k
      i += 2;
227
7.09k
      ++e;
228
88.8k
    } else if (Str[i] == '\n' || 
Str[i] == '\r'88.8k
) {
229
9
      // Replace '\r\n' and '\n\r' to '\\' followed by 'n'.
230
9
      if ((i < e - 1) && (Str[i + 1] == '\n' || 
Str[i + 1] == '\r'7
) &&
231
9
          
Str[i] != Str[i + 1]2
) {
232
0
        Str[i] = '\\';
233
0
        Str[i + 1] = 'n';
234
9
      } else {
235
9
        // Replace '\n' and '\r' to '\\' followed by 'n'.
236
9
        Str[i] = '\\';
237
9
        Str.insert(Str.begin() + i + 1, 'n');
238
9
        ++e;
239
9
      }
240
9
      i += 2;
241
9
    } else
242
88.8k
      ++i;
243
95.9k
  }
244
3.58k
}
Lexer.cpp:void StringifyImpl<llvm::SmallVectorImpl<char> >(llvm::SmallVectorImpl<char>&, char)
Line
Count
Source
221
112k
template <typename T> static void StringifyImpl(T &Str, char Quote) {
222
112k
  typename T::size_type i = 0, e = Str.size();
223
15.4M
  while (i < e) {
224
15.3M
    if (Str[i] == '\\' || 
Str[i] == Quote15.3M
) {
225
11
      Str.insert(Str.begin() + i, '\\');
226
11
      i += 2;
227
11
      ++e;
228
15.3M
    } else if (Str[i] == '\n' || 
Str[i] == '\r'15.3M
) {
229
8
      // Replace '\r\n' and '\n\r' to '\\' followed by 'n'.
230
8
      if ((i < e - 1) && (Str[i + 1] == '\n' || 
Str[i + 1] == '\r'6
) &&
231
8
          
Str[i] != Str[i + 1]2
) {
232
0
        Str[i] = '\\';
233
0
        Str[i + 1] = 'n';
234
8
      } else {
235
8
        // Replace '\n' and '\r' to '\\' followed by 'n'.
236
8
        Str[i] = '\\';
237
8
        Str.insert(Str.begin() + i + 1, 'n');
238
8
        ++e;
239
8
      }
240
8
      i += 2;
241
8
    } else
242
15.3M
      ++i;
243
15.3M
  }
244
112k
}
245
246
3.58k
std::string Lexer::Stringify(StringRef Str, bool Charify) {
247
3.58k
  std::string Result = Str;
248
3.58k
  char Quote = Charify ? 
'\''0
: '"';
249
3.58k
  StringifyImpl(Result, Quote);
250
3.58k
  return Result;
251
3.58k
}
252
253
112k
void Lexer::Stringify(SmallVectorImpl<char> &Str) { StringifyImpl(Str, '"'); }
254
255
//===----------------------------------------------------------------------===//
256
// Token Spelling
257
//===----------------------------------------------------------------------===//
258
259
/// Slow case of getSpelling. Extract the characters comprising the
260
/// spelling of this token from the provided input buffer.
261
static size_t getSpellingSlow(const Token &Tok, const char *BufPtr,
262
13.7k
                              const LangOptions &LangOpts, char *Spelling) {
263
13.7k
  assert(Tok.needsCleaning() && "getSpellingSlow called on simple token");
264
13.7k
265
13.7k
  size_t Length = 0;
266
13.7k
  const char *BufEnd = BufPtr + Tok.getLength();
267
13.7k
268
13.7k
  if (tok::isStringLiteral(Tok.getKind())) {
269
1.33k
    // Munch the encoding-prefix and opening double-quote.
270
1.35k
    while (BufPtr < BufEnd) {
271
1.35k
      unsigned Size;
272
1.35k
      Spelling[Length++] = Lexer::getCharAndSizeNoWarn(BufPtr, Size, LangOpts);
273
1.35k
      BufPtr += Size;
274
1.35k
275
1.35k
      if (Spelling[Length - 1] == '"')
276
1.33k
        break;
277
1.35k
    }
278
1.33k
279
1.33k
    // Raw string literals need special handling; trigraph expansion and line
280
1.33k
    // splicing do not occur within their d-char-sequence nor within their
281
1.33k
    // r-char-sequence.
282
1.33k
    if (Length >= 2 &&
283
1.33k
        
Spelling[Length - 2] == 'R'11
&&
Spelling[Length - 1] == '"'11
) {
284
11
      // Search backwards from the end of the token to find the matching closing
285
11
      // quote.
286
11
      const char *RawEnd = BufEnd;
287
20
      do --RawEnd; while (*RawEnd != '"');
288
11
      size_t RawLength = RawEnd - BufPtr + 1;
289
11
290
11
      // Everything between the quotes is included verbatim in the spelling.
291
11
      memcpy(Spelling + Length, BufPtr, RawLength);
292
11
      Length += RawLength;
293
11
      BufPtr += RawLength;
294
11
295
11
      // The rest of the token is lexed normally.
296
11
    }
297
1.33k
  }
298
13.7k
299
273k
  while (BufPtr < BufEnd) {
300
260k
    unsigned Size;
301
260k
    Spelling[Length++] = Lexer::getCharAndSizeNoWarn(BufPtr, Size, LangOpts);
302
260k
    BufPtr += Size;
303
260k
  }
304
13.7k
305
13.7k
  assert(Length < Tok.getLength() &&
306
13.7k
         "NeedsCleaning flag set on token that didn't need cleaning!");
307
13.7k
  return Length;
308
13.7k
}
309
310
/// getSpelling() - Return the 'spelling' of this token.  The spelling of a
311
/// token are the characters used to represent the token in the source file
312
/// after trigraph expansion and escaped-newline folding.  In particular, this
313
/// wants to get the true, uncanonicalized, spelling of things like digraphs
314
/// UCNs, etc.
315
StringRef Lexer::getSpelling(SourceLocation loc,
316
                             SmallVectorImpl<char> &buffer,
317
                             const SourceManager &SM,
318
                             const LangOptions &options,
319
479
                             bool *invalid) {
320
479
  // Break down the source location.
321
479
  std::pair<FileID, unsigned> locInfo = SM.getDecomposedLoc(loc);
322
479
323
479
  // Try to the load the file buffer.
324
479
  bool invalidTemp = false;
325
479
  StringRef file = SM.getBufferData(locInfo.first, &invalidTemp);
326
479
  if (invalidTemp) {
327
0
    if (invalid) *invalid = true;
328
0
    return {};
329
0
  }
330
479
331
479
  const char *tokenBegin = file.data() + locInfo.second;
332
479
333
479
  // Lex from the start of the given location.
334
479
  Lexer lexer(SM.getLocForStartOfFile(locInfo.first), options,
335
479
              file.begin(), tokenBegin, file.end());
336
479
  Token token;
337
479
  lexer.LexFromRawLexer(token);
338
479
339
479
  unsigned length = token.getLength();
340
479
341
479
  // Common case:  no need for cleaning.
342
479
  if (!token.needsCleaning())
343
478
    return StringRef(tokenBegin, length);
344
1
345
1
  // Hard case, we need to relex the characters into the string.
346
1
  buffer.resize(length);
347
1
  buffer.resize(getSpellingSlow(token, tokenBegin, options, buffer.data()));
348
1
  return StringRef(buffer.data(), buffer.size());
349
1
}
350
351
/// getSpelling() - Return the 'spelling' of this token.  The spelling of a
352
/// token are the characters used to represent the token in the source file
353
/// after trigraph expansion and escaped-newline folding.  In particular, this
354
/// wants to get the true, uncanonicalized, spelling of things like digraphs
355
/// UCNs, etc.
356
std::string Lexer::getSpelling(const Token &Tok, const SourceManager &SourceMgr,
357
1.27M
                               const LangOptions &LangOpts, bool *Invalid) {
358
1.27M
  assert((int)Tok.getLength() >= 0 && "Token character range is bogus!");
359
1.27M
360
1.27M
  bool CharDataInvalid = false;
361
1.27M
  const char *TokStart = SourceMgr.getCharacterData(Tok.getLocation(),
362
1.27M
                                                    &CharDataInvalid);
363
1.27M
  if (Invalid)
364
3.57k
    *Invalid = CharDataInvalid;
365
1.27M
  if (CharDataInvalid)
366
0
    return {};
367
1.27M
368
1.27M
  // If this token contains nothing interesting, return it directly.
369
1.27M
  if (!Tok.needsCleaning())
370
1.27M
    return std::string(TokStart, TokStart + Tok.getLength());
371
3
372
3
  std::string Result;
373
3
  Result.resize(Tok.getLength());
374
3
  Result.resize(getSpellingSlow(Tok, TokStart, LangOpts, &*Result.begin()));
375
3
  return Result;
376
3
}
377
378
/// getSpelling - This method is used to get the spelling of a token into a
379
/// preallocated buffer, instead of as an std::string.  The caller is required
380
/// to allocate enough space for the token, which is guaranteed to be at least
381
/// Tok.getLength() bytes long.  The actual length of the token is returned.
382
///
383
/// Note that this method may do two possible things: it may either fill in
384
/// the buffer specified with characters, or it may *change the input pointer*
385
/// to point to a constant buffer with the data already in it (avoiding a
386
/// copy).  The caller is not allowed to modify the returned buffer pointer
387
/// if an internal buffer is returned.
388
unsigned Lexer::getSpelling(const Token &Tok, const char *&Buffer,
389
                            const SourceManager &SourceMgr,
390
16.7M
                            const LangOptions &LangOpts, bool *Invalid) {
391
16.7M
  assert((int)Tok.getLength() >= 0 && "Token character range is bogus!");
392
16.7M
393
16.7M
  const char *TokStart = nullptr;
394
16.7M
  // NOTE: this has to be checked *before* testing for an IdentifierInfo.
395
16.7M
  if (Tok.is(tok::raw_identifier))
396
27.3k
    TokStart = Tok.getRawIdentifier().data();
397
16.7M
  else if (!Tok.hasUCN()) {
398
16.7M
    if (const IdentifierInfo *II = Tok.getIdentifierInfo()) {
399
1.96M
      // Just return the string from the identifier table, which is very quick.
400
1.96M
      Buffer = II->getNameStart();
401
1.96M
      return II->getLength();
402
1.96M
    }
403
14.7M
  }
404
14.7M
405
14.7M
  // NOTE: this can be checked even after testing for an IdentifierInfo.
406
14.7M
  if (Tok.isLiteral())
407
13.6M
    TokStart = Tok.getLiteralData();
408
14.7M
409
14.7M
  if (!TokStart) {
410
1.09M
    // Compute the start of the token in the input lexer buffer.
411
1.09M
    bool CharDataInvalid = false;
412
1.09M
    TokStart = SourceMgr.getCharacterData(Tok.getLocation(), &CharDataInvalid);
413
1.09M
    if (Invalid)
414
700k
      *Invalid = CharDataInvalid;
415
1.09M
    if (CharDataInvalid) {
416
0
      Buffer = "";
417
0
      return 0;
418
0
    }
419
14.7M
  }
420
14.7M
421
14.7M
  // If this token contains nothing interesting, return it directly.
422
14.7M
  if (!Tok.needsCleaning()) {
423
14.7M
    Buffer = TokStart;
424
14.7M
    return Tok.getLength();
425
14.7M
  }
426
13.7k
427
13.7k
  // Otherwise, hard case, relex the characters into the string.
428
13.7k
  return getSpellingSlow(Tok, TokStart, LangOpts, const_cast<char*>(Buffer));
429
13.7k
}
430
431
/// MeasureTokenLength - Relex the token at the specified location and return
432
/// its length in bytes in the input file.  If the token needs cleaning (e.g.
433
/// includes a trigraph or an escaped newline) then this count includes bytes
434
/// that are part of that.
435
unsigned Lexer::MeasureTokenLength(SourceLocation Loc,
436
                                   const SourceManager &SM,
437
12.5M
                                   const LangOptions &LangOpts) {
438
12.5M
  Token TheTok;
439
12.5M
  if (getRawToken(Loc, TheTok, SM, LangOpts))
440
1.46k
    return 0;
441
12.5M
  return TheTok.getLength();
442
12.5M
}
443
444
/// Relex the token at the specified location.
445
/// \returns true if there was a failure, false on success.
446
bool Lexer::getRawToken(SourceLocation Loc, Token &Result,
447
                        const SourceManager &SM,
448
                        const LangOptions &LangOpts,
449
12.5M
                        bool IgnoreWhiteSpace) {
450
12.5M
  // TODO: this could be special cased for common tokens like identifiers, ')',
451
12.5M
  // etc to make this faster, if it mattered.  Just look at StrData[0] to handle
452
12.5M
  // all obviously single-char tokens.  This could use
453
12.5M
  // Lexer::isObviouslySimpleCharacter for example to handle identifiers or
454
12.5M
  // something.
455
12.5M
456
12.5M
  // If this comes from a macro expansion, we really do want the macro name, not
457
12.5M
  // the token this macro expanded to.
458
12.5M
  Loc = SM.getExpansionLoc(Loc);
459
12.5M
  std::pair<FileID, unsigned> LocInfo = SM.getDecomposedLoc(Loc);
460
12.5M
  bool Invalid = false;
461
12.5M
  StringRef Buffer = SM.getBufferData(LocInfo.first, &Invalid);
462
12.5M
  if (Invalid)
463
4
    return true;
464
12.5M
465
12.5M
  const char *StrData = Buffer.data()+LocInfo.second;
466
12.5M
467
12.5M
  if (!IgnoreWhiteSpace && 
isWhitespace(StrData[0])12.5M
)
468
1.46k
    return true;
469
12.5M
470
12.5M
  // Create a lexer starting at the beginning of this token.
471
12.5M
  Lexer TheLexer(SM.getLocForStartOfFile(LocInfo.first), LangOpts,
472
12.5M
                 Buffer.begin(), StrData, Buffer.end());
473
12.5M
  TheLexer.SetCommentRetentionState(true);
474
12.5M
  TheLexer.LexFromRawLexer(Result);
475
12.5M
  return false;
476
12.5M
}
477
478
/// Returns the pointer that points to the beginning of line that contains
479
/// the given offset, or null if the offset if invalid.
480
10.7k
static const char *findBeginningOfLine(StringRef Buffer, unsigned Offset) {
481
10.7k
  const char *BufStart = Buffer.data();
482
10.7k
  if (Offset >= Buffer.size())
483
6
    return nullptr;
484
10.7k
485
10.7k
  const char *LexStart = BufStart + Offset;
486
283k
  for (; LexStart != BufStart; 
--LexStart272k
) {
487
282k
    if (isVerticalWhitespace(LexStart[0]) &&
488
282k
        
!Lexer::isNewLineEscaped(BufStart, LexStart)10.4k
) {
489
10.3k
      // LexStart should point at first character of logical line.
490
10.3k
      ++LexStart;
491
10.3k
      break;
492
10.3k
    }
493
282k
  }
494
10.7k
  return LexStart;
495
10.7k
}
496
497
static SourceLocation getBeginningOfFileToken(SourceLocation Loc,
498
                                              const SourceManager &SM,
499
10.4k
                                              const LangOptions &LangOpts) {
500
10.4k
  assert(Loc.isFileID());
501
10.4k
  std::pair<FileID, unsigned> LocInfo = SM.getDecomposedLoc(Loc);
502
10.4k
  if (LocInfo.first.isInvalid())
503
0
    return Loc;
504
10.4k
505
10.4k
  bool Invalid = false;
506
10.4k
  StringRef Buffer = SM.getBufferData(LocInfo.first, &Invalid);
507
10.4k
  if (Invalid)
508
0
    return Loc;
509
10.4k
510
10.4k
  // Back up from the current location until we hit the beginning of a line
511
10.4k
  // (or the buffer). We'll relex from that point.
512
10.4k
  const char *StrData = Buffer.data() + LocInfo.second;
513
10.4k
  const char *LexStart = findBeginningOfLine(Buffer, LocInfo.second);
514
10.4k
  if (!LexStart || 
LexStart == StrData10.4k
)
515
254
    return Loc;
516
10.1k
517
10.1k
  // Create a lexer starting at the beginning of this token.
518
10.1k
  SourceLocation LexerStartLoc = Loc.getLocWithOffset(-LocInfo.second);
519
10.1k
  Lexer TheLexer(LexerStartLoc, LangOpts, Buffer.data(), LexStart,
520
10.1k
                 Buffer.end());
521
10.1k
  TheLexer.SetCommentRetentionState(true);
522
10.1k
523
10.1k
  // Lex tokens until we find the token that contains the source location.
524
10.1k
  Token TheTok;
525
16.1k
  do {
526
16.1k
    TheLexer.LexFromRawLexer(TheTok);
527
16.1k
528
16.1k
    if (TheLexer.getBufferLocation() > StrData) {
529
10.1k
      // Lexing this token has taken the lexer past the source location we're
530
10.1k
      // looking for. If the current token encompasses our source location,
531
10.1k
      // return the beginning of that token.
532
10.1k
      if (TheLexer.getBufferLocation() - TheTok.getLength() <= StrData)
533
9.68k
        return TheTok.getLocation();
534
492
535
492
      // We ended up skipping over the source location entirely, which means
536
492
      // that it points into whitespace. We're done here.
537
492
      break;
538
492
    }
539
16.1k
  } while (
TheTok.getKind() != tok::eof5.99k
);
540
10.1k
541
10.1k
  // We've passed our source location; just return the original source location.
542
10.1k
  
return Loc492
;
543
10.1k
}
544
545
SourceLocation Lexer::GetBeginningOfToken(SourceLocation Loc,
546
                                          const SourceManager &SM,
547
10.4k
                                          const LangOptions &LangOpts) {
548
10.4k
  if (Loc.isFileID())
549
10.4k
    return getBeginningOfFileToken(Loc, SM, LangOpts);
550
20
551
20
  if (!SM.isMacroArgExpansion(Loc))
552
0
    return Loc;
553
20
554
20
  SourceLocation FileLoc = SM.getSpellingLoc(Loc);
555
20
  SourceLocation BeginFileLoc = getBeginningOfFileToken(FileLoc, SM, LangOpts);
556
20
  std::pair<FileID, unsigned> FileLocInfo = SM.getDecomposedLoc(FileLoc);
557
20
  std::pair<FileID, unsigned> BeginFileLocInfo =
558
20
      SM.getDecomposedLoc(BeginFileLoc);
559
20
  assert(FileLocInfo.first == BeginFileLocInfo.first &&
560
20
         FileLocInfo.second >= BeginFileLocInfo.second);
561
20
  return Loc.getLocWithOffset(BeginFileLocInfo.second - FileLocInfo.second);
562
20
}
563
564
namespace {
565
566
enum PreambleDirectiveKind {
567
  PDK_Skipped,
568
  PDK_Unknown
569
};
570
571
} // namespace
572
573
PreambleBounds Lexer::ComputePreamble(StringRef Buffer,
574
                                      const LangOptions &LangOpts,
575
501
                                      unsigned MaxLines) {
576
501
  // Create a lexer starting at the beginning of the file. Note that we use a
577
501
  // "fake" file source location at offset 1 so that the lexer will track our
578
501
  // position within the file.
579
501
  const unsigned StartOffset = 1;
580
501
  SourceLocation FileLoc = SourceLocation::getFromRawEncoding(StartOffset);
581
501
  Lexer TheLexer(FileLoc, LangOpts, Buffer.begin(), Buffer.begin(),
582
501
                 Buffer.end());
583
501
  TheLexer.SetCommentRetentionState(true);
584
501
585
501
  bool InPreprocessorDirective = false;
586
501
  Token TheTok;
587
501
  SourceLocation ActiveCommentLoc;
588
501
589
501
  unsigned MaxLineOffset = 0;
590
501
  if (MaxLines) {
591
84
    const char *CurPtr = Buffer.begin();
592
84
    unsigned CurLine = 0;
593
12.3k
    while (CurPtr != Buffer.end()) {
594
12.3k
      char ch = *CurPtr++;
595
12.3k
      if (ch == '\n') {
596
642
        ++CurLine;
597
642
        if (CurLine == MaxLines)
598
83
          break;
599
642
      }
600
12.3k
    }
601
84
    if (CurPtr != Buffer.end())
602
78
      MaxLineOffset = CurPtr - Buffer.begin();
603
84
  }
604
501
605
3.89k
  do {
606
3.89k
    TheLexer.LexFromRawLexer(TheTok);
607
3.89k
608
3.89k
    if (InPreprocessorDirective) {
609
2.82k
      // If we've hit the end of the file, we're done.
610
2.82k
      if (TheTok.getKind() == tok::eof) {
611
14
        break;
612
14
      }
613
2.81k
614
2.81k
      // If we haven't hit the end of the preprocessor directive, skip this
615
2.81k
      // token.
616
2.81k
      if (!TheTok.isAtStartOfLine())
617
1.89k
        continue;
618
915
619
915
      // We've passed the end of the preprocessor directive, and will look
620
915
      // at this token again below.
621
915
      InPreprocessorDirective = false;
622
915
    }
623
3.89k
624
3.89k
    // Keep track of the # of lines in the preamble.
625
3.89k
    
if (1.98k
TheTok.isAtStartOfLine()1.98k
) {
626
1.96k
      unsigned TokOffset = TheTok.getLocation().getRawEncoding() - StartOffset;
627
1.96k
628
1.96k
      // If we were asked to limit the number of lines in the preamble,
629
1.96k
      // and we're about to exceed that limit, we're done.
630
1.96k
      if (MaxLineOffset && 
TokOffset >= MaxLineOffset360
)
631
18
        break;
632
1.96k
    }
633
1.96k
634
1.96k
    // Comments are okay; skip over them.
635
1.96k
    if (TheTok.getKind() == tok::comment) {
636
569
      if (ActiveCommentLoc.isInvalid())
637
223
        ActiveCommentLoc = TheTok.getLocation();
638
569
      continue;
639
569
    }
640
1.39k
641
1.39k
    if (TheTok.isAtStartOfLine() && 
TheTok.getKind() == tok::hash1.37k
) {
642
929
      // This is the start of a preprocessor directive.
643
929
      Token HashTok = TheTok;
644
929
      InPreprocessorDirective = true;
645
929
      ActiveCommentLoc = SourceLocation();
646
929
647
929
      // Figure out which directive this is. Since we're lexing raw tokens,
648
929
      // we don't have an identifier table available. Instead, just look at
649
929
      // the raw identifier to recognize and categorize preprocessor directives.
650
929
      TheLexer.LexFromRawLexer(TheTok);
651
929
      if (TheTok.getKind() == tok::raw_identifier && !TheTok.needsCleaning()) {
652
929
        StringRef Keyword = TheTok.getRawIdentifier();
653
929
        PreambleDirectiveKind PDK
654
929
          = llvm::StringSwitch<PreambleDirectiveKind>(Keyword)
655
929
              .Case("include", PDK_Skipped)
656
929
              .Case("__include_macros", PDK_Skipped)
657
929
              .Case("define", PDK_Skipped)
658
929
              .Case("undef", PDK_Skipped)
659
929
              .Case("line", PDK_Skipped)
660
929
              .Case("error", PDK_Skipped)
661
929
              .Case("pragma", PDK_Skipped)
662
929
              .Case("import", PDK_Skipped)
663
929
              .Case("include_next", PDK_Skipped)
664
929
              .Case("warning", PDK_Skipped)
665
929
              .Case("ident", PDK_Skipped)
666
929
              .Case("sccs", PDK_Skipped)
667
929
              .Case("assert", PDK_Skipped)
668
929
              .Case("unassert", PDK_Skipped)
669
929
              .Case("if", PDK_Skipped)
670
929
              .Case("ifdef", PDK_Skipped)
671
929
              .Case("ifndef", PDK_Skipped)
672
929
              .Case("elif", PDK_Skipped)
673
929
              .Case("else", PDK_Skipped)
674
929
              .Case("endif", PDK_Skipped)
675
929
              .Default(PDK_Unknown);
676
929
677
929
        switch (PDK) {
678
929
        case PDK_Skipped:
679
929
          continue;
680
929
681
929
        case PDK_Unknown:
682
0
          // We don't know what this directive is; stop at the '#'.
683
0
          break;
684
0
        }
685
0
      }
686
0
687
0
      // We only end up here if we didn't recognize the preprocessor
688
0
      // directive or it was one that can't occur in the preamble at this
689
0
      // point. Roll back the current token to the location of the '#'.
690
0
      TheTok = HashTok;
691
0
    }
692
1.39k
693
1.39k
    // We hit a token that we don't recognize as being in the
694
1.39k
    // "preprocessing only" part of the file, so we're no longer in
695
1.39k
    // the preamble.
696
1.39k
    
break469
;
697
3.39k
  } while (true);
698
501
699
501
  SourceLocation End;
700
501
  if (ActiveCommentLoc.isValid())
701
80
    End = ActiveCommentLoc; // don't truncate a decl comment.
702
421
  else
703
421
    End = TheTok.getLocation();
704
501
705
501
  return PreambleBounds(End.getRawEncoding() - FileLoc.getRawEncoding(),
706
501
                        TheTok.isAtStartOfLine());
707
501
}
708
709
unsigned Lexer::getTokenPrefixLength(SourceLocation TokStart, unsigned CharNo,
710
                                     const SourceManager &SM,
711
206k
                                     const LangOptions &LangOpts) {
712
206k
  // Figure out how many physical characters away the specified expansion
713
206k
  // character is.  This needs to take into consideration newlines and
714
206k
  // trigraphs.
715
206k
  bool Invalid = false;
716
206k
  const char *TokPtr = SM.getCharacterData(TokStart, &Invalid);
717
206k
718
206k
  // If they request the first char of the token, we're trivially done.
719
206k
  if (Invalid || (CharNo == 0 && 
Lexer::isObviouslySimpleCharacter(*TokPtr)127k
))
720
127k
    return 0;
721
79.1k
722
79.1k
  unsigned PhysOffset = 0;
723
79.1k
724
79.1k
  // The usual case is that tokens don't contain anything interesting.  Skip
725
79.1k
  // over the uninteresting characters.  If a token only consists of simple
726
79.1k
  // chars, this method is extremely fast.
727
568k
  while (Lexer::isObviouslySimpleCharacter(*TokPtr)) {
728
567k
    if (CharNo == 0)
729
78.0k
      return PhysOffset;
730
489k
    ++TokPtr;
731
489k
    --CharNo;
732
489k
    ++PhysOffset;
733
489k
  }
734
79.1k
735
79.1k
  // If we have a character that may be a trigraph or escaped newline, use a
736
79.1k
  // lexer to parse it correctly.
737
79.1k
  
for (; 1.09k
CharNo8.73k
;
--CharNo7.64k
) {
738
7.64k
    unsigned Size;
739
7.64k
    Lexer::getCharAndSizeNoWarn(TokPtr, Size, LangOpts);
740
7.64k
    TokPtr += Size;
741
7.64k
    PhysOffset += Size;
742
7.64k
  }
743
1.09k
744
1.09k
  // Final detail: if we end up on an escaped newline, we want to return the
745
1.09k
  // location of the actual byte of the token.  For example foo\<newline>bar
746
1.09k
  // advanced by 3 should return the location of b, not of \\.  One compounding
747
1.09k
  // detail of this is that the escape may be made by a trigraph.
748
1.09k
  if (!Lexer::isObviouslySimpleCharacter(*TokPtr))
749
676
    PhysOffset += Lexer::SkipEscapedNewLines(TokPtr)-TokPtr;
750
1.09k
751
1.09k
  return PhysOffset;
752
79.1k
}
753
754
/// Computes the source location just past the end of the
755
/// token at this source location.
756
///
757
/// This routine can be used to produce a source location that
758
/// points just past the end of the token referenced by \p Loc, and
759
/// is generally used when a diagnostic needs to point just after a
760
/// token where it expected something different that it received. If
761
/// the returned source location would not be meaningful (e.g., if
762
/// it points into a macro), this routine returns an invalid
763
/// source location.
764
///
765
/// \param Offset an offset from the end of the token, where the source
766
/// location should refer to. The default offset (0) produces a source
767
/// location pointing just past the end of the token; an offset of 1 produces
768
/// a source location pointing to the last character in the token, etc.
769
SourceLocation Lexer::getLocForEndOfToken(SourceLocation Loc, unsigned Offset,
770
                                          const SourceManager &SM,
771
2.88M
                                          const LangOptions &LangOpts) {
772
2.88M
  if (Loc.isInvalid())
773
154
    return {};
774
2.88M
775
2.88M
  if (Loc.isMacroID()) {
776
10.4k
    if (Offset > 0 || !isAtEndOfMacroExpansion(Loc, SM, LangOpts, &Loc))
777
10.1k
      return {}; // Points inside the macro expansion.
778
2.87M
  }
779
2.87M
780
2.87M
  unsigned Len = Lexer::MeasureTokenLength(Loc, SM, LangOpts);
781
2.87M
  if (Len > Offset)
782
2.87M
    Len = Len - Offset;
783
405
  else
784
405
    return Loc;
785
2.87M
786
2.87M
  return Loc.getLocWithOffset(Len);
787
2.87M
}
788
789
/// Returns true if the given MacroID location points at the first
790
/// token of the macro expansion.
791
bool Lexer::isAtStartOfMacroExpansion(SourceLocation loc,
792
                                      const SourceManager &SM,
793
                                      const LangOptions &LangOpts,
794
9.60M
                                      SourceLocation *MacroBegin) {
795
9.60M
  assert(loc.isValid() && loc.isMacroID() && "Expected a valid macro loc");
796
9.60M
797
9.60M
  SourceLocation expansionLoc;
798
9.60M
  if (!SM.isAtStartOfImmediateMacroExpansion(loc, &expansionLoc))
799
1.70M
    return false;
800
7.90M
801
7.90M
  if (expansionLoc.isFileID()) {
802
3.39M
    // No other macro expansions, this is the first.
803
3.39M
    if (MacroBegin)
804
87
      *MacroBegin = expansionLoc;
805
3.39M
    return true;
806
3.39M
  }
807
4.51M
808
4.51M
  return isAtStartOfMacroExpansion(expansionLoc, SM, LangOpts, MacroBegin);
809
4.51M
}
810
811
/// Returns true if the given MacroID location points at the last
812
/// token of the macro expansion.
813
bool Lexer::isAtEndOfMacroExpansion(SourceLocation loc,
814
                                    const SourceManager &SM,
815
                                    const LangOptions &LangOpts,
816
9.28M
                                    SourceLocation *MacroEnd) {
817
9.28M
  assert(loc.isValid() && loc.isMacroID() && "Expected a valid macro loc");
818
9.28M
819
9.28M
  SourceLocation spellLoc = SM.getSpellingLoc(loc);
820
9.28M
  unsigned tokLen = MeasureTokenLength(spellLoc, SM, LangOpts);
821
9.28M
  if (tokLen == 0)
822
0
    return false;
823
9.28M
824
9.28M
  SourceLocation afterLoc = loc.getLocWithOffset(tokLen);
825
9.28M
  SourceLocation expansionLoc;
826
9.28M
  if (!SM.isAtEndOfImmediateMacroExpansion(afterLoc, &expansionLoc))
827
1.39M
    return false;
828
7.88M
829
7.88M
  if (expansionLoc.isFileID()) {
830
3.70M
    // No other macro expansions.
831
3.70M
    if (MacroEnd)
832
339
      *MacroEnd = expansionLoc;
833
3.70M
    return true;
834
3.70M
  }
835
4.18M
836
4.18M
  return isAtEndOfMacroExpansion(expansionLoc, SM, LangOpts, MacroEnd);
837
4.18M
}
838
839
static CharSourceRange makeRangeFromFileLocs(CharSourceRange Range,
840
                                             const SourceManager &SM,
841
2.19M
                                             const LangOptions &LangOpts) {
842
2.19M
  SourceLocation Begin = Range.getBegin();
843
2.19M
  SourceLocation End = Range.getEnd();
844
2.19M
  assert(Begin.isFileID() && End.isFileID());
845
2.19M
  if (Range.isTokenRange()) {
846
2.18M
    End = Lexer::getLocForEndOfToken(End, 0, SM,LangOpts);
847
2.18M
    if (End.isInvalid())
848
0
      return {};
849
2.19M
  }
850
2.19M
851
2.19M
  // Break down the source locations.
852
2.19M
  FileID FID;
853
2.19M
  unsigned BeginOffs;
854
2.19M
  std::tie(FID, BeginOffs) = SM.getDecomposedLoc(Begin);
855
2.19M
  if (FID.isInvalid())
856
0
    return {};
857
2.19M
858
2.19M
  unsigned EndOffs;
859
2.19M
  if (!SM.isInFileID(End, FID, &EndOffs) ||
860
2.19M
      BeginOffs > EndOffs)
861
0
    return {};
862
2.19M
863
2.19M
  return CharSourceRange::getCharRange(Begin, End);
864
2.19M
}
865
866
CharSourceRange Lexer::makeFileCharRange(CharSourceRange Range,
867
                                         const SourceManager &SM,
868
2.19M
                                         const LangOptions &LangOpts) {
869
2.19M
  SourceLocation Begin = Range.getBegin();
870
2.19M
  SourceLocation End = Range.getEnd();
871
2.19M
  if (Begin.isInvalid() || 
End.isInvalid()2.19M
)
872
9
    return {};
873
2.19M
874
2.19M
  if (Begin.isFileID() && 
End.isFileID()2.19M
)
875
2.19M
    return makeRangeFromFileLocs(Range, SM, LangOpts);
876
189
877
189
  if (Begin.isMacroID() && 
End.isFileID()182
) {
878
20
    if (!isAtStartOfMacroExpansion(Begin, SM, LangOpts, &Begin))
879
0
      return {};
880
20
    Range.setBegin(Begin);
881
20
    return makeRangeFromFileLocs(Range, SM, LangOpts);
882
20
  }
883
169
884
169
  if (Begin.isFileID() && 
End.isMacroID()7
) {
885
7
    if ((Range.isTokenRange() && !isAtEndOfMacroExpansion(End, SM, LangOpts,
886
7
                                                          &End)) ||
887
7
        (Range.isCharRange() && !isAtStartOfMacroExpansion(End, SM, LangOpts,
888
0
                                                           &End)))
889
0
      return {};
890
7
    Range.setEnd(End);
891
7
    return makeRangeFromFileLocs(Range, SM, LangOpts);
892
7
  }
893
162
894
162
  assert(Begin.isMacroID() && End.isMacroID());
895
162
  SourceLocation MacroBegin, MacroEnd;
896
162
  if (isAtStartOfMacroExpansion(Begin, SM, LangOpts, &MacroBegin) &&
897
162
      
(54
(54
Range.isTokenRange()54
&& isAtEndOfMacroExpansion(End, SM, LangOpts,
898
52
                                                        &MacroEnd)) ||
899
54
       
(18
Range.isCharRange()18
&& isAtStartOfMacroExpansion(End, SM, LangOpts,
900
36
                                                         &MacroEnd)))) {
901
36
    Range.setBegin(MacroBegin);
902
36
    Range.setEnd(MacroEnd);
903
36
    return makeRangeFromFileLocs(Range, SM, LangOpts);
904
36
  }
905
126
906
126
  bool Invalid = false;
907
126
  const SrcMgr::SLocEntry &BeginEntry = SM.getSLocEntry(SM.getFileID(Begin),
908
126
                                                        &Invalid);
909
126
  if (Invalid)
910
0
    return {};
911
126
912
126
  if (BeginEntry.getExpansion().isMacroArgExpansion()) {
913
107
    const SrcMgr::SLocEntry &EndEntry = SM.getSLocEntry(SM.getFileID(End),
914
107
                                                        &Invalid);
915
107
    if (Invalid)
916
0
      return {};
917
107
918
107
    if (EndEntry.getExpansion().isMacroArgExpansion() &&
919
107
        BeginEntry.getExpansion().getExpansionLocStart() ==
920
106
            EndEntry.getExpansion().getExpansionLocStart()) {
921
105
      Range.setBegin(SM.getImmediateSpellingLoc(Begin));
922
105
      Range.setEnd(SM.getImmediateSpellingLoc(End));
923
105
      return makeFileCharRange(Range, SM, LangOpts);
924
105
    }
925
21
  }
926
21
927
21
  return {};
928
21
}
929
930
StringRef Lexer::getSourceText(CharSourceRange Range,
931
                               const SourceManager &SM,
932
                               const LangOptions &LangOpts,
933
2.18M
                               bool *Invalid) {
934
2.18M
  Range = makeFileCharRange(Range, SM, LangOpts);
935
2.18M
  if (Range.isInvalid()) {
936
13
    if (Invalid) 
*Invalid = true2
;
937
13
    return {};
938
13
  }
939
2.18M
940
2.18M
  // Break down the source location.
941
2.18M
  std::pair<FileID, unsigned> beginInfo = SM.getDecomposedLoc(Range.getBegin());
942
2.18M
  if (beginInfo.first.isInvalid()) {
943
0
    if (Invalid) *Invalid = true;
944
0
    return {};
945
0
  }
946
2.18M
947
2.18M
  unsigned EndOffs;
948
2.18M
  if (!SM.isInFileID(Range.getEnd(), beginInfo.first, &EndOffs) ||
949
2.18M
      beginInfo.second > EndOffs) {
950
0
    if (Invalid) *Invalid = true;
951
0
    return {};
952
0
  }
953
2.18M
954
2.18M
  // Try to the load the file buffer.
955
2.18M
  bool invalidTemp = false;
956
2.18M
  StringRef file = SM.getBufferData(beginInfo.first, &invalidTemp);
957
2.18M
  if (invalidTemp) {
958
0
    if (Invalid) *Invalid = true;
959
0
    return {};
960
0
  }
961
2.18M
962
2.18M
  if (Invalid) 
*Invalid = false240
;
963
2.18M
  return file.substr(beginInfo.second, EndOffs - beginInfo.second);
964
2.18M
}
965
966
StringRef Lexer::getImmediateMacroName(SourceLocation Loc,
967
                                       const SourceManager &SM,
968
6.27k
                                       const LangOptions &LangOpts) {
969
6.27k
  assert(Loc.isMacroID() && "Only reasonable to call this on macros");
970
6.27k
971
6.27k
  // Find the location of the immediate macro expansion.
972
20.5k
  while (true) {
973
20.5k
    FileID FID = SM.getFileID(Loc);
974
20.5k
    const SrcMgr::SLocEntry *E = &SM.getSLocEntry(FID);
975
20.5k
    const SrcMgr::ExpansionInfo &Expansion = E->getExpansion();
976
20.5k
    Loc = Expansion.getExpansionLocStart();
977
20.5k
    if (!Expansion.isMacroArgExpansion())
978
4.10k
      break;
979
16.4k
980
16.4k
    // For macro arguments we need to check that the argument did not come
981
16.4k
    // from an inner macro, e.g: "MAC1( MAC2(foo) )"
982
16.4k
983
16.4k
    // Loc points to the argument id of the macro definition, move to the
984
16.4k
    // macro expansion.
985
16.4k
    Loc = SM.getImmediateExpansionRange(Loc).getBegin();
986
16.4k
    SourceLocation SpellLoc = Expansion.getSpellingLoc();
987
16.4k
    if (SpellLoc.isFileID())
988
2.16k
      break; // No inner macro.
989
14.3k
990
14.3k
    // If spelling location resides in the same FileID as macro expansion
991
14.3k
    // location, it means there is no inner macro.
992
14.3k
    FileID MacroFID = SM.getFileID(Loc);
993
14.3k
    if (SM.isInFileID(SpellLoc, MacroFID))
994
1
      break;
995
14.3k
996
14.3k
    // Argument came from inner macro.
997
14.3k
    Loc = SpellLoc;
998
14.3k
  }
999
6.27k
1000
6.27k
  // Find the spelling location of the start of the non-argument expansion
1001
6.27k
  // range. This is where the macro name was spelled in order to begin
1002
6.27k
  // expanding this macro.
1003
6.27k
  Loc = SM.getSpellingLoc(Loc);
1004
6.27k
1005
6.27k
  // Dig out the buffer where the macro name was spelled and the extents of the
1006
6.27k
  // name so that we can render it into the expansion note.
1007
6.27k
  std::pair<FileID, unsigned> ExpansionInfo = SM.getDecomposedLoc(Loc);
1008
6.27k
  unsigned MacroTokenLength = Lexer::MeasureTokenLength(Loc, SM, LangOpts);
1009
6.27k
  StringRef ExpansionBuffer = SM.getBufferData(ExpansionInfo.first);
1010
6.27k
  return ExpansionBuffer.substr(ExpansionInfo.second, MacroTokenLength);
1011
6.27k
}
1012
1013
StringRef Lexer::getImmediateMacroNameForDiagnostics(
1014
540
    SourceLocation Loc, const SourceManager &SM, const LangOptions &LangOpts) {
1015
540
  assert(Loc.isMacroID() && "Only reasonable to call this on macros");
1016
540
  // Walk past macro argument expansions.
1017
540
  while (SM.isMacroArgExpansion(Loc))
1018
0
    Loc = SM.getImmediateExpansionRange(Loc).getBegin();
1019
540
1020
540
  // If the macro's spelling has no FileID, then it's actually a token paste
1021
540
  // or stringization (or similar) and not a macro at all.
1022
540
  if (!SM.getFileEntryForID(SM.getFileID(SM.getSpellingLoc(Loc))))
1023
45
    return {};
1024
495
1025
495
  // Find the spelling location of the start of the non-argument expansion
1026
495
  // range. This is where the macro name was spelled in order to begin
1027
495
  // expanding this macro.
1028
495
  Loc = SM.getSpellingLoc(SM.getImmediateExpansionRange(Loc).getBegin());
1029
495
1030
495
  // Dig out the buffer where the macro name was spelled and the extents of the
1031
495
  // name so that we can render it into the expansion note.
1032
495
  std::pair<FileID, unsigned> ExpansionInfo = SM.getDecomposedLoc(Loc);
1033
495
  unsigned MacroTokenLength = Lexer::MeasureTokenLength(Loc, SM, LangOpts);
1034
495
  StringRef ExpansionBuffer = SM.getBufferData(ExpansionInfo.first);
1035
495
  return ExpansionBuffer.substr(ExpansionInfo.second, MacroTokenLength);
1036
495
}
1037
1038
1.26k
bool Lexer::isIdentifierBodyChar(char c, const LangOptions &LangOpts) {
1039
1.26k
  return isIdentifierBody(c, LangOpts.DollarIdents);
1040
1.26k
}
1041
1042
10.4k
bool Lexer::isNewLineEscaped(const char *BufferStart, const char *Str) {
1043
10.4k
  assert(isVerticalWhitespace(Str[0]));
1044
10.4k
  if (Str - 1 < BufferStart)
1045
2
    return false;
1046
10.4k
1047
10.4k
  if ((Str[0] == '\n' && 
Str[-1] == '\r'10.4k
) ||
1048
10.4k
      
(10.4k
Str[0] == '\r'10.4k
&&
Str[-1] == '\n'18
)) {
1049
18
    if (Str - 2 < BufferStart)
1050
2
      return false;
1051
16
    --Str;
1052
16
  }
1053
10.4k
  --Str;
1054
10.4k
1055
10.4k
  // Rewind to first non-space character:
1056
10.5k
  while (Str > BufferStart && 
isHorizontalWhitespace(*Str)10.5k
)
1057
106
    --Str;
1058
10.4k
1059
10.4k
  return *Str == '\\';
1060
10.4k
}
1061
1062
StringRef Lexer::getIndentationForLine(SourceLocation Loc,
1063
275
                                       const SourceManager &SM) {
1064
275
  if (Loc.isInvalid() || Loc.isMacroID())
1065
0
    return {};
1066
275
  std::pair<FileID, unsigned> LocInfo = SM.getDecomposedLoc(Loc);
1067
275
  if (LocInfo.first.isInvalid())
1068
0
    return {};
1069
275
  bool Invalid = false;
1070
275
  StringRef Buffer = SM.getBufferData(LocInfo.first, &Invalid);
1071
275
  if (Invalid)
1072
0
    return {};
1073
275
  const char *Line = findBeginningOfLine(Buffer, LocInfo.second);
1074
275
  if (!Line)
1075
0
    return {};
1076
275
  StringRef Rest = Buffer.substr(Line - Buffer.data());
1077
275
  size_t NumWhitespaceChars = Rest.find_first_not_of(" \t");
1078
275
  return NumWhitespaceChars == StringRef::npos
1079
275
             ? 
""0
1080
275
             : Rest.take_front(NumWhitespaceChars);
1081
275
}
1082
1083
//===----------------------------------------------------------------------===//
1084
// Diagnostics forwarding code.
1085
//===----------------------------------------------------------------------===//
1086
1087
/// GetMappedTokenLoc - If lexing out of a 'mapped buffer', where we pretend the
1088
/// lexer buffer was all expanded at a single point, perform the mapping.
1089
/// This is currently only used for _Pragma implementation, so it is the slow
1090
/// path of the hot getSourceLocation method.  Do not allow it to be inlined.
1091
static LLVM_ATTRIBUTE_NOINLINE SourceLocation GetMappedTokenLoc(
1092
    Preprocessor &PP, SourceLocation FileLoc, unsigned CharNo, unsigned TokLen);
1093
static SourceLocation GetMappedTokenLoc(Preprocessor &PP,
1094
                                        SourceLocation FileLoc,
1095
183k
                                        unsigned CharNo, unsigned TokLen) {
1096
183k
  assert(FileLoc.isMacroID() && "Must be a macro expansion");
1097
183k
1098
183k
  // Otherwise, we're lexing "mapped tokens".  This is used for things like
1099
183k
  // _Pragma handling.  Combine the expansion location of FileLoc with the
1100
183k
  // spelling location.
1101
183k
  SourceManager &SM = PP.getSourceManager();
1102
183k
1103
183k
  // Create a new SLoc which is expanded from Expansion(FileLoc) but whose
1104
183k
  // characters come from spelling(FileLoc)+Offset.
1105
183k
  SourceLocation SpellingLoc = SM.getSpellingLoc(FileLoc);
1106
183k
  SpellingLoc = SpellingLoc.getLocWithOffset(CharNo);
1107
183k
1108
183k
  // Figure out the expansion loc range, which is the range covered by the
1109
183k
  // original _Pragma(...) sequence.
1110
183k
  CharSourceRange II = SM.getImmediateExpansionRange(FileLoc);
1111
183k
1112
183k
  return SM.createExpansionLoc(SpellingLoc, II.getBegin(), II.getEnd(), TokLen);
1113
183k
}
1114
1115
/// getSourceLocation - Return a source location identifier for the specified
1116
/// offset in the current file.
1117
SourceLocation Lexer::getSourceLocation(const char *Loc,
1118
1.45G
                                        unsigned TokLen) const {
1119
1.45G
  assert(Loc >= BufferStart && Loc <= BufferEnd &&
1120
1.45G
         "Location out of range for this buffer!");
1121
1.45G
1122
1.45G
  // In the normal case, we're just lexing from a simple file buffer, return
1123
1.45G
  // the file id from FileLoc with the offset specified.
1124
1.45G
  unsigned CharNo = Loc-BufferStart;
1125
1.45G
  if (FileLoc.isFileID())
1126
1.45G
    return FileLoc.getLocWithOffset(CharNo);
1127
183k
1128
183k
  // Otherwise, this is the _Pragma lexer case, which pretends that all of the
1129
183k
  // tokens are lexed from where the _Pragma was defined.
1130
183k
  assert(PP && "This doesn't work on raw lexers");
1131
183k
  return GetMappedTokenLoc(*PP, FileLoc, CharNo, TokLen);
1132
183k
}
1133
1134
/// Diag - Forwarding function for diagnostics.  This translate a source
1135
/// position in the current buffer into a SourceLocation object for rendering.
1136
12.0k
DiagnosticBuilder Lexer::Diag(const char *Loc, unsigned DiagID) const {
1137
12.0k
  return PP->Diag(getSourceLocation(Loc), DiagID);
1138
12.0k
}
1139
1140
//===----------------------------------------------------------------------===//
1141
// Trigraph and Escaped Newline Handling Code.
1142
//===----------------------------------------------------------------------===//
1143
1144
/// GetTrigraphCharForLetter - Given a character that occurs after a ?? pair,
1145
/// return the decoded trigraph letter it corresponds to, or '\0' if nothing.
1146
796
static char GetTrigraphCharForLetter(char Letter) {
1147
796
  switch (Letter) {
1148
796
  
default: return 0665
;
1149
796
  
case '=': return '#'21
;
1150
796
  
case ')': return ']'31
;
1151
796
  
case '(': return '['30
;
1152
796
  
case '!': return '|'6
;
1153
796
  
case '\'': return '^'1
;
1154
796
  
case '>': return '}'4
;
1155
796
  
case '/': return '\\'32
;
1156
796
  
case '<': return '{'3
;
1157
796
  
case '-': return '~'3
;
1158
796
  }
1159
796
}
1160
1161
/// DecodeTrigraphChar - If the specified character is a legal trigraph when
1162
/// prefixed with ??, emit a trigraph warning.  If trigraphs are enabled,
1163
/// return the result character.  Finally, emit a warning about trigraph use
1164
/// whether trigraphs are enabled or not.
1165
768
static char DecodeTrigraphChar(const char *CP, Lexer *L) {
1166
768
  char Res = GetTrigraphCharForLetter(*CP);
1167
768
  if (!Res || 
!L103
)
return Res678
;
1168
90
1169
90
  if (!L->getLangOpts().Trigraphs) {
1170
33
    if (!L->isLexingRawMode())
1171
23
      L->Diag(CP-2, diag::trigraph_ignored);
1172
33
    return 0;
1173
33
  }
1174
57
1175
57
  if (!L->isLexingRawMode())
1176
46
    L->Diag(CP-2, diag::trigraph_converted) << StringRef(&Res, 1);
1177
57
  return Res;
1178
57
}
1179
1180
/// getEscapedNewLineSize - Return the size of the specified escaped newline,
1181
/// or 0 if it is not an escaped newline. P[-1] is known to be a "\" or a
1182
/// trigraph equivalent on entry to this function.
1183
4.29M
unsigned Lexer::getEscapedNewLineSize(const char *Ptr) {
1184
4.29M
  unsigned Size = 0;
1185
4.29M
  while (isWhitespace(Ptr[Size])) {
1186
4.29M
    ++Size;
1187
4.29M
1188
4.29M
    if (Ptr[Size-1] != '\n' && 
Ptr[Size-1] != '\r'374
)
1189
287
      continue;
1190
4.29M
1191
4.29M
    // If this is a \r\n or \n\r, skip the other half.
1192
4.29M
    if ((Ptr[Size] == '\r' || 
Ptr[Size] == '\n'4.29M
) &&
1193
4.29M
        
Ptr[Size-1] != Ptr[Size]1.39k
)
1194
87
      ++Size;
1195
4.29M
1196
4.29M
    return Size;
1197
4.29M
  }
1198
4.29M
1199
4.29M
  // Not an escaped newline, must be a \t or something else.
1200
4.29M
  
return 0705
;
1201
4.29M
}
1202
1203
/// SkipEscapedNewLines - If P points to an escaped newline (or a series of
1204
/// them), skip over them and return the first non-escaped-newline found,
1205
/// otherwise return P.
1206
676
const char *Lexer::SkipEscapedNewLines(const char *P) {
1207
689
  while (true) {
1208
689
    const char *AfterEscape;
1209
689
    if (*P == '\\') {
1210
676
      AfterEscape = P+1;
1211
676
    } else 
if (13
*P == '?'13
) {
1212
0
      // If not a trigraph for escape, bail out.
1213
0
      if (P[1] != '?' || P[2] != '/')
1214
0
        return P;
1215
0
      // FIXME: Take LangOpts into account; the language might not
1216
0
      // support trigraphs.
1217
0
      AfterEscape = P+3;
1218
13
    } else {
1219
13
      return P;
1220
13
    }
1221
676
1222
676
    unsigned NewLineSize = Lexer::getEscapedNewLineSize(AfterEscape);
1223
676
    if (NewLineSize == 0) 
return P663
;
1224
13
    P = AfterEscape+NewLineSize;
1225
13
  }
1226
676
}
1227
1228
Optional<Token> Lexer::findNextToken(SourceLocation Loc,
1229
                                     const SourceManager &SM,
1230
592
                                     const LangOptions &LangOpts) {
1231
592
  if (Loc.isMacroID()) {
1232
3
    if (!Lexer::isAtEndOfMacroExpansion(Loc, SM, LangOpts, &Loc))
1233
3
      return None;
1234
589
  }
1235
589
  Loc = Lexer::getLocForEndOfToken(Loc, 0, SM, LangOpts);
1236
589
1237
589
  // Break down the source location.
1238
589
  std::pair<FileID, unsigned> LocInfo = SM.getDecomposedLoc(Loc);
1239
589
1240
589
  // Try to load the file buffer.
1241
589
  bool InvalidTemp = false;
1242
589
  StringRef File = SM.getBufferData(LocInfo.first, &InvalidTemp);
1243
589
  if (InvalidTemp)
1244
0
    return None;
1245
589
1246
589
  const char *TokenBegin = File.data() + LocInfo.second;
1247
589
1248
589
  // Lex from the start of the given location.
1249
589
  Lexer lexer(SM.getLocForStartOfFile(LocInfo.first), LangOpts, File.begin(),
1250
589
                                      TokenBegin, File.end());
1251
589
  // Find the token.
1252
589
  Token Tok;
1253
589
  lexer.LexFromRawLexer(Tok);
1254
589
  return Tok;
1255
589
}
1256
1257
/// Checks that the given token is the first token that occurs after the
1258
/// given location (this excludes comments and whitespace). Returns the location
1259
/// immediately after the specified token. If the token is not found or the
1260
/// location is inside a macro, the returned source location will be invalid.
1261
SourceLocation Lexer::findLocationAfterToken(
1262
    SourceLocation Loc, tok::TokenKind TKind, const SourceManager &SM,
1263
554
    const LangOptions &LangOpts, bool SkipTrailingWhitespaceAndNewLine) {
1264
554
  Optional<Token> Tok = findNextToken(Loc, SM, LangOpts);
1265
554
  if (!Tok || 
Tok->isNot(TKind)552
)
1266
65
    return {};
1267
489
  SourceLocation TokenLoc = Tok->getLocation();
1268
489
1269
489
  // Calculate how much whitespace needs to be skipped if any.
1270
489
  unsigned NumWhitespaceChars = 0;
1271
489
  if (SkipTrailingWhitespaceAndNewLine) {
1272
210
    const char *TokenEnd = SM.getCharacterData(TokenLoc) + Tok->getLength();
1273
210
    unsigned char C = *TokenEnd;
1274
419
    while (isHorizontalWhitespace(C)) {
1275
209
      C = *(++TokenEnd);
1276
209
      NumWhitespaceChars++;
1277
209
    }
1278
210
1279
210
    // Skip \r, \n, \r\n, or \n\r
1280
210
    if (C == '\n' || 
C == '\r'113
) {
1281
98
      char PrevC = C;
1282
98
      C = *(++TokenEnd);
1283
98
      NumWhitespaceChars++;
1284
98
      if ((C == '\n' || 
C == '\r'97
) &&
C != PrevC1
)
1285
1
        NumWhitespaceChars++;
1286
98
    }
1287
210
  }
1288
489
1289
489
  return TokenLoc.getLocWithOffset(Tok->getLength() + NumWhitespaceChars);
1290
489
}
1291
1292
/// getCharAndSizeSlow - Peek a single 'character' from the specified buffer,
1293
/// get its size, and return it.  This is tricky in several cases:
1294
///   1. If currently at the start of a trigraph, we warn about the trigraph,
1295
///      then either return the trigraph (skipping 3 chars) or the '?',
1296
///      depending on whether trigraphs are enabled or not.
1297
///   2. If this is an escaped newline (potentially with whitespace between
1298
///      the backslash and newline), implicitly skip the newline and return
1299
///      the char after it.
1300
///
1301
/// This handles the slow/uncommon case of the getCharAndSize method.  Here we
1302
/// know that we can accumulate into Size, and that we have already incremented
1303
/// Ptr by Size bytes.
1304
///
1305
/// NOTE: When this method is updated, getCharAndSizeSlowNoWarn (below) should
1306
/// be updated to match.
1307
char Lexer::getCharAndSizeSlow(const char *Ptr, unsigned &Size,
1308
8.99M
                               Token *Tok) {
1309
8.99M
  // If we have a slash, look for an escaped newline.
1310
8.99M
  if (Ptr[0] == '\\') {
1311
4.51M
    ++Size;
1312
4.51M
    ++Ptr;
1313
4.51M
Slash:
1314
4.51M
    // Common case, backslash-char where the char is not whitespace.
1315
4.51M
    if (!isWhitespace(Ptr[0])) 
return '\\'234k
;
1316
4.27M
1317
4.27M
    // See if we have optional whitespace characters between the slash and
1318
4.27M
    // newline.
1319
4.27M
    if (unsigned EscapedNewLineSize = getEscapedNewLineSize(Ptr)) {
1320
4.27M
      // Remember that this token needs to be cleaned.
1321
4.27M
      if (Tok) 
Tok->setFlag(Token::NeedsCleaning)4.27M
;
1322
4.27M
1323
4.27M
      // Warn if there was whitespace between the backslash and newline.
1324
4.27M
      if (Ptr[0] != '\n' && 
Ptr[0] != '\r'97
&&
Tok23
&&
!isLexingRawMode()16
)
1325
8
        Diag(Ptr, diag::backslash_newline_space);
1326
4.27M
1327
4.27M
      // Found backslash<whitespace><newline>.  Parse the char after it.
1328
4.27M
      Size += EscapedNewLineSize;
1329
4.27M
      Ptr  += EscapedNewLineSize;
1330
4.27M
1331
4.27M
      // Use slow version to accumulate a correct size field.
1332
4.27M
      return getCharAndSizeSlow(Ptr, Size, Tok);
1333
4.27M
    }
1334
42
1335
42
    // Otherwise, this is not an escaped newline, just return the slash.
1336
42
    return '\\';
1337
42
  }
1338
4.48M
1339
4.48M
  // If this is a trigraph, process it.
1340
4.48M
  if (Ptr[0] == '?' && 
Ptr[1] == '?'208k
) {
1341
768
    // If this is actually a legal trigraph (not something like "??x"), emit
1342
768
    // a trigraph warning.  If so, and if trigraphs are enabled, return it.
1343
768
    if (char C = DecodeTrigraphChar(Ptr+2, Tok ? this : nullptr)) {
1344
70
      // Remember that this token needs to be cleaned.
1345
70
      if (Tok) 
Tok->setFlag(Token::NeedsCleaning)57
;
1346
70
1347
70
      Ptr += 3;
1348
70
      Size += 3;
1349
70
      if (C == '\\') 
goto Slash18
;
1350
52
      return C;
1351
52
    }
1352
768
  }
1353
4.48M
1354
4.48M
  // If this is neither, return a single character.
1355
4.48M
  ++Size;
1356
4.48M
  return *Ptr;
1357
4.48M
}
1358
1359
/// getCharAndSizeSlowNoWarn - Handle the slow/uncommon case of the
1360
/// getCharAndSizeNoWarn method.  Here we know that we can accumulate into Size,
1361
/// and that we have already incremented Ptr by Size bytes.
1362
///
1363
/// NOTE: When this method is updated, getCharAndSizeSlow (above) should
1364
/// be updated to match.
1365
char Lexer::getCharAndSizeSlowNoWarn(const char *Ptr, unsigned &Size,
1366
36.6k
                                     const LangOptions &LangOpts) {
1367
36.6k
  // If we have a slash, look for an escaped newline.
1368
36.6k
  if (Ptr[0] == '\\') {
1369
23.7k
    ++Size;
1370
23.7k
    ++Ptr;
1371
23.7k
Slash:
1372
23.7k
    // Common case, backslash-char where the char is not whitespace.
1373
23.7k
    if (!isWhitespace(Ptr[0])) 
return '\\'7.05k
;
1374
16.6k
1375
16.6k
    // See if we have optional whitespace characters followed by a newline.
1376
16.6k
    if (unsigned EscapedNewLineSize = getEscapedNewLineSize(Ptr)) {
1377
16.6k
      // Found backslash<whitespace><newline>.  Parse the char after it.
1378
16.6k
      Size += EscapedNewLineSize;
1379
16.6k
      Ptr  += EscapedNewLineSize;
1380
16.6k
1381
16.6k
      // Use slow version to accumulate a correct size field.
1382
16.6k
      return getCharAndSizeSlowNoWarn(Ptr, Size, LangOpts);
1383
16.6k
    }
1384
1
1385
1
    // Otherwise, this is not an escaped newline, just return the slash.
1386
1
    return '\\';
1387
1
  }
1388
12.8k
1389
12.8k
  // If this is a trigraph, process it.
1390
12.8k
  if (LangOpts.Trigraphs && 
Ptr[0] == '?'5.37k
&&
Ptr[1] == '?'28
) {
1391
28
    // If this is actually a legal trigraph (not something like "??x"), return
1392
28
    // it.
1393
28
    if (char C = GetTrigraphCharForLetter(Ptr[2])) {
1394
28
      Ptr += 3;
1395
28
      Size += 3;
1396
28
      if (C == '\\') 
goto Slash8
;
1397
20
      return C;
1398
20
    }
1399
28
  }
1400
12.8k
1401
12.8k
  // If this is neither, return a single character.
1402
12.8k
  ++Size;
1403
12.8k
  return *Ptr;
1404
12.8k
}
1405
1406
//===----------------------------------------------------------------------===//
1407
// Helper methods for lexing.
1408
//===----------------------------------------------------------------------===//
1409
1410
/// Routine that indiscriminately sets the offset into the source file.
1411
415
void Lexer::SetByteOffset(unsigned Offset, bool StartOfLine) {
1412
415
  BufferPtr = BufferStart + Offset;
1413
415
  if (BufferPtr > BufferEnd)
1414
0
    BufferPtr = BufferEnd;
1415
415
  // FIXME: What exactly does the StartOfLine bit mean?  There are two
1416
415
  // possible meanings for the "start" of the line: the first token on the
1417
415
  // unexpanded line, or the first token on the expanded line.
1418
415
  IsAtStartOfLine = StartOfLine;
1419
415
  IsAtPhysicalStartOfLine = StartOfLine;
1420
415
}
1421
1422
525
static bool isAllowedIDChar(uint32_t C, const LangOptions &LangOpts) {
1423
525
  if (LangOpts.AsmPreprocessor) {
1424
4
    return false;
1425
521
  } else if (LangOpts.CPlusPlus11 || 
LangOpts.C11330
) {
1426
387
    static const llvm::sys::UnicodeCharSet C11AllowedIDChars(
1427
387
        C11AllowedIDCharRanges);
1428
387
    return C11AllowedIDChars.contains(C);
1429
387
  } else 
if (134
LangOpts.CPlusPlus134
) {
1430
47
    static const llvm::sys::UnicodeCharSet CXX03AllowedIDChars(
1431
47
        CXX03AllowedIDCharRanges);
1432
47
    return CXX03AllowedIDChars.contains(C);
1433
87
  } else {
1434
87
    static const llvm::sys::UnicodeCharSet C99AllowedIDChars(
1435
87
        C99AllowedIDCharRanges);
1436
87
    return C99AllowedIDChars.contains(C);
1437
87
  }
1438
525
}
1439
1440
125
static bool isAllowedInitiallyIDChar(uint32_t C, const LangOptions &LangOpts) {
1441
125
  assert(isAllowedIDChar(C, LangOpts));
1442
125
  if (LangOpts.AsmPreprocessor) {
1443
0
    return false;
1444
125
  } else if (LangOpts.CPlusPlus11 || 
LangOpts.C1175
) {
1445
99
    static const llvm::sys::UnicodeCharSet C11DisallowedInitialIDChars(
1446
99
        C11DisallowedInitialIDCharRanges);
1447
99
    return !C11DisallowedInitialIDChars.contains(C);
1448
99
  } else 
if (26
LangOpts.CPlusPlus26
) {
1449
6
    return true;
1450
20
  } else {
1451
20
    static const llvm::sys::UnicodeCharSet C99DisallowedInitialIDChars(
1452
20
        C99DisallowedInitialIDCharRanges);
1453
20
    return !C99DisallowedInitialIDChars.contains(C);
1454
20
  }
1455
125
}
1456
1457
static inline CharSourceRange makeCharRange(Lexer &L, const char *Begin,
1458
568
                                            const char *End) {
1459
568
  return CharSourceRange::getCharRange(L.getSourceLocation(Begin),
1460
568
                                       L.getSourceLocation(End));
1461
568
}
1462
1463
static void maybeDiagnoseIDCharCompat(DiagnosticsEngine &Diags, uint32_t C,
1464
312
                                      CharSourceRange Range, bool IsFirst) {
1465
312
  // Check C99 compatibility.
1466
312
  if (!Diags.isIgnored(diag::warn_c99_compat_unicode_id, Range.getBegin())) {
1467
12
    enum {
1468
12
      CannotAppearInIdentifier = 0,
1469
12
      CannotStartIdentifier
1470
12
    };
1471
12
1472
12
    static const llvm::sys::UnicodeCharSet C99AllowedIDChars(
1473
12
        C99AllowedIDCharRanges);
1474
12
    static const llvm::sys::UnicodeCharSet C99DisallowedInitialIDChars(
1475
12
        C99DisallowedInitialIDCharRanges);
1476
12
    if (!C99AllowedIDChars.contains(C)) {
1477
5
      Diags.Report(Range.getBegin(), diag::warn_c99_compat_unicode_id)
1478
5
        << Range
1479
5
        << CannotAppearInIdentifier;
1480
7
    } else if (IsFirst && 
C99DisallowedInitialIDChars.contains(C)3
) {
1481
2
      Diags.Report(Range.getBegin(), diag::warn_c99_compat_unicode_id)
1482
2
        << Range
1483
2
        << CannotStartIdentifier;
1484
2
    }
1485
12
  }
1486
312
1487
312
  // Check C++98 compatibility.
1488
312
  if (!Diags.isIgnored(diag::warn_cxx98_compat_unicode_id, Range.getBegin())) {
1489
12
    static const llvm::sys::UnicodeCharSet CXX03AllowedIDChars(
1490
12
        CXX03AllowedIDCharRanges);
1491
12
    if (!CXX03AllowedIDChars.contains(C)) {
1492
5
      Diags.Report(Range.getBegin(), diag::warn_cxx98_compat_unicode_id)
1493
5
        << Range;
1494
5
    }
1495
12
  }
1496
312
}
1497
1498
/// After encountering UTF-8 character C and interpreting it as an identifier
1499
/// character, check whether it's a homoglyph for a common non-identifier
1500
/// source character that is unlikely to be an intentional identifier
1501
/// character and warn if so.
1502
static void maybeDiagnoseUTF8Homoglyph(DiagnosticsEngine &Diags, uint32_t C,
1503
225
                                       CharSourceRange Range) {
1504
225
  // FIXME: Handle Unicode quotation marks (smart quotes, fullwidth quotes).
1505
225
  struct HomoglyphPair {
1506
225
    uint32_t Character;
1507
225
    char LooksLike;
1508
1.32k
    bool operator<(HomoglyphPair R) const { return Character < R.Character; }
1509
225
  };
1510
225
  static constexpr HomoglyphPair SortedHomoglyphs[] = {
1511
225
    {U'\u00ad', 0},   // SOFT HYPHEN
1512
225
    {U'\u01c3', '!'}, // LATIN LETTER RETROFLEX CLICK
1513
225
    {U'\u037e', ';'}, // GREEK QUESTION MARK
1514
225
    {U'\u200b', 0},   // ZERO WIDTH SPACE
1515
225
    {U'\u200c', 0},   // ZERO WIDTH NON-JOINER
1516
225
    {U'\u200d', 0},   // ZERO WIDTH JOINER
1517
225
    {U'\u2060', 0},   // WORD JOINER
1518
225
    {U'\u2061', 0},   // FUNCTION APPLICATION
1519
225
    {U'\u2062', 0},   // INVISIBLE TIMES
1520
225
    {U'\u2063', 0},   // INVISIBLE SEPARATOR
1521
225
    {U'\u2064', 0},   // INVISIBLE PLUS
1522
225
    {U'\u2212', '-'}, // MINUS SIGN
1523
225
    {U'\u2215', '/'}, // DIVISION SLASH
1524
225
    {U'\u2216', '\\'}, // SET MINUS
1525
225
    {U'\u2217', '*'}, // ASTERISK OPERATOR
1526
225
    {U'\u2223', '|'}, // DIVIDES
1527
225
    {U'\u2227', '^'}, // LOGICAL AND
1528
225
    {U'\u2236', ':'}, // RATIO
1529
225
    {U'\u223c', '~'}, // TILDE OPERATOR
1530
225
    {U'\ua789', ':'}, // MODIFIER LETTER COLON
1531
225
    {U'\ufeff', 0},   // ZERO WIDTH NO-BREAK SPACE
1532
225
    {U'\uff01', '!'}, // FULLWIDTH EXCLAMATION MARK
1533
225
    {U'\uff03', '#'}, // FULLWIDTH NUMBER SIGN
1534
225
    {U'\uff04', '$'}, // FULLWIDTH DOLLAR SIGN
1535
225
    {U'\uff05', '%'}, // FULLWIDTH PERCENT SIGN
1536
225
    {U'\uff06', '&'}, // FULLWIDTH AMPERSAND
1537
225
    {U'\uff08', '('}, // FULLWIDTH LEFT PARENTHESIS
1538
225
    {U'\uff09', ')'}, // FULLWIDTH RIGHT PARENTHESIS
1539
225
    {U'\uff0a', '*'}, // FULLWIDTH ASTERISK
1540
225
    {U'\uff0b', '+'}, // FULLWIDTH ASTERISK
1541
225
    {U'\uff0c', ','}, // FULLWIDTH COMMA
1542
225
    {U'\uff0d', '-'}, // FULLWIDTH HYPHEN-MINUS
1543
225
    {U'\uff0e', '.'}, // FULLWIDTH FULL STOP
1544
225
    {U'\uff0f', '/'}, // FULLWIDTH SOLIDUS
1545
225
    {U'\uff1a', ':'}, // FULLWIDTH COLON
1546
225
    {U'\uff1b', ';'}, // FULLWIDTH SEMICOLON
1547
225
    {U'\uff1c', '<'}, // FULLWIDTH LESS-THAN SIGN
1548
225
    {U'\uff1d', '='}, // FULLWIDTH EQUALS SIGN
1549
225
    {U'\uff1e', '>'}, // FULLWIDTH GREATER-THAN SIGN
1550
225
    {U'\uff1f', '?'}, // FULLWIDTH QUESTION MARK
1551
225
    {U'\uff20', '@'}, // FULLWIDTH COMMERCIAL AT
1552
225
    {U'\uff3b', '['}, // FULLWIDTH LEFT SQUARE BRACKET
1553
225
    {U'\uff3c', '\\'}, // FULLWIDTH REVERSE SOLIDUS
1554
225
    {U'\uff3d', ']'}, // FULLWIDTH RIGHT SQUARE BRACKET
1555
225
    {U'\uff3e', '^'}, // FULLWIDTH CIRCUMFLEX ACCENT
1556
225
    {U'\uff5b', '{'}, // FULLWIDTH LEFT CURLY BRACKET
1557
225
    {U'\uff5c', '|'}, // FULLWIDTH VERTICAL LINE
1558
225
    {U'\uff5d', '}'}, // FULLWIDTH RIGHT CURLY BRACKET
1559
225
    {U'\uff5e', '~'}, // FULLWIDTH TILDE
1560
225
    {0, 0}
1561
225
  };
1562
225
  auto Homoglyph =
1563
225
      std::lower_bound(std::begin(SortedHomoglyphs),
1564
225
                       std::end(SortedHomoglyphs) - 1, HomoglyphPair{C, '\0'});
1565
225
  if (Homoglyph->Character == C) {
1566
59
    llvm::SmallString<5> CharBuf;
1567
59
    {
1568
59
      llvm::raw_svector_ostream CharOS(CharBuf);
1569
59
      llvm::write_hex(CharOS, C, llvm::HexPrintStyle::Upper, 4);
1570
59
    }
1571
59
    if (Homoglyph->LooksLike) {
1572
48
      const char LooksLikeStr[] = {Homoglyph->LooksLike, 0};
1573
48
      Diags.Report(Range.getBegin(), diag::warn_utf8_symbol_homoglyph)
1574
48
          << Range << CharBuf << LooksLikeStr;
1575
48
    } else {
1576
11
      Diags.Report(Range.getBegin(), diag::warn_utf8_symbol_zero_width)
1577
11
          << Range << CharBuf;
1578
11
    }
1579
59
  }
1580
225
}
1581
1582
bool Lexer::tryConsumeIdentifierUCN(const char *&CurPtr, unsigned Size,
1583
216
                                    Token &Result) {
1584
216
  const char *UCNPtr = CurPtr + Size;
1585
216
  uint32_t CodePoint = tryReadUCN(UCNPtr, CurPtr, /*Token=*/nullptr);
1586
216
  if (CodePoint == 0 || 
!isAllowedIDChar(CodePoint, LangOpts)116
)
1587
129
    return false;
1588
87
1589
87
  if (!isLexingRawMode())
1590
87
    maybeDiagnoseIDCharCompat(PP->getDiagnostics(), CodePoint,
1591
87
                              makeCharRange(*this, CurPtr, UCNPtr),
1592
87
                              /*IsFirst=*/false);
1593
87
1594
87
  Result.setFlag(Token::HasUCN);
1595
87
  if ((UCNPtr - CurPtr ==  6 && 
CurPtr[1] == 'u'61
) ||
1596
87
      
(26
UCNPtr - CurPtr == 1026
&&
CurPtr[1] == 'U'17
))
1597
72
    CurPtr = UCNPtr;
1598
15
  else
1599
105
    
while (15
CurPtr != UCNPtr)
1600
90
      (void)getAndAdvanceChar(CurPtr, Result);
1601
87
  return true;
1602
87
}
1603
1604
208
bool Lexer::tryConsumeIdentifierUTF8Char(const char *&CurPtr) {
1605
208
  const char *UnicodePtr = CurPtr;
1606
208
  llvm::UTF32 CodePoint;
1607
208
  llvm::ConversionResult Result =
1608
208
      llvm::convertUTF8Sequence((const llvm::UTF8 **)&UnicodePtr,
1609
208
                                (const llvm::UTF8 *)BufferEnd,
1610
208
                                &CodePoint,
1611
208
                                llvm::strictConversion);
1612
208
  if (Result != llvm::conversionOK ||
1613
208
      
!isAllowedIDChar(static_cast<uint32_t>(CodePoint), LangOpts)193
)
1614
45
    return false;
1615
163
1616
163
  if (!isLexingRawMode()) {
1617
143
    maybeDiagnoseIDCharCompat(PP->getDiagnostics(), CodePoint,
1618
143
                              makeCharRange(*this, CurPtr, UnicodePtr),
1619
143
                              /*IsFirst=*/false);
1620
143
    maybeDiagnoseUTF8Homoglyph(PP->getDiagnostics(), CodePoint,
1621
143
                               makeCharRange(*this, CurPtr, UnicodePtr));
1622
143
  }
1623
163
1624
163
  CurPtr = UnicodePtr;
1625
163
  return true;
1626
163
}
1627
1628
714M
bool Lexer::LexIdentifier(Token &Result, const char *CurPtr) {
1629
714M
  // Match [_A-Za-z0-9]*, we have already matched [_A-Za-z$]
1630
714M
  unsigned Size;
1631
714M
  unsigned char C = *CurPtr++;
1632
13.1G
  while (isIdentifierBody(C))
1633
12.4G
    C = *CurPtr++;
1634
714M
1635
714M
  --CurPtr;   // Back up over the skipped character.
1636
714M
1637
714M
  // Fast path, no $,\,? in identifier found.  '\' might be an escaped newline
1638
714M
  // or UCN, and ? might be a trigraph for '\', an escaped newline or UCN.
1639
714M
  //
1640
714M
  // TODO: Could merge these checks into an InfoTable flag to make the
1641
714M
  // comparison cheaper
1642
714M
  if (isASCII(C) && 
C != '\\'714M
&&
C != '?'714M
&&
1643
714M
      
(714M
C != '$'714M
||
!LangOpts.DollarIdents2.18k
)) {
1644
714M
FinishIdentifier:
1645
714M
    const char *IdStart = BufferPtr;
1646
714M
    FormTokenWithChars(Result, CurPtr, tok::raw_identifier);
1647
714M
    Result.setRawIdentifierData(IdStart);
1648
714M
1649
714M
    // If we are in raw mode, return this identifier raw.  There is no need to
1650
714M
    // look up identifier information or attempt to macro expand it.
1651
714M
    if (LexingRawMode)
1652
487M
      return true;
1653
227M
1654
227M
    // Fill in Result.IdentifierInfo and update the token kind,
1655
227M
    // looking up the identifier in the identifier table.
1656
227M
    IdentifierInfo *II = PP->LookUpIdentifierInfo(Result);
1657
227M
    // Note that we have to call PP->LookUpIdentifierInfo() even for code
1658
227M
    // completion, it writes IdentifierInfo into Result, and callers rely on it.
1659
227M
1660
227M
    // If the completion point is at the end of an identifier, we want to treat
1661
227M
    // the identifier as incomplete even if it resolves to a macro or a keyword.
1662
227M
    // This allows e.g. 'class^' to complete to 'classifier'.
1663
227M
    if (isCodeCompletionPoint(CurPtr)) {
1664
39
      // Return the code-completion token.
1665
39
      Result.setKind(tok::code_completion);
1666
39
      // Skip the code-completion char and all immediate identifier characters.
1667
39
      // This ensures we get consistent behavior when completing at any point in
1668
39
      // an identifier (i.e. at the start, in the middle, at the end). Note that
1669
39
      // only simple cases (i.e. [a-zA-Z0-9_]) are supported to keep the code
1670
39
      // simpler.
1671
39
      assert(*CurPtr == 0 && "Completion character must be 0");
1672
39
      ++CurPtr;
1673
39
      // Note that code completion token is not added as a separate character
1674
39
      // when the completion point is at the end of the buffer. Therefore, we need
1675
39
      // to check if the buffer has ended.
1676
39
      if (CurPtr < BufferEnd) {
1677
85
        while (isIdentifierBody(*CurPtr))
1678
47
          ++CurPtr;
1679
38
      }
1680
39
      BufferPtr = CurPtr;
1681
39
      return true;
1682
39
    }
1683
227M
1684
227M
    // Finally, now that we know we have an identifier, pass this off to the
1685
227M
    // preprocessor, which may macro expand it or something.
1686
227M
    if (II->isHandleIdentifierCase())
1687
15.5M
      return PP->HandleIdentifier(Result);
1688
211M
1689
211M
    return true;
1690
211M
  }
1691
5.05k
1692
5.05k
  // Otherwise, $,\,? in identifier found.  Enter slower path.
1693
5.05k
1694
5.05k
  C = getCharAndSize(CurPtr, Size);
1695
10.2k
  while (
true10.2k
) {
1696
10.2k
    if (C == '$') {
1697
2.46k
      // If we hit a $ and they are not supported in identifiers, we are done.
1698
2.46k
      if (!LangOpts.DollarIdents) 
goto FinishIdentifier0
;
1699
2.46k
1700
2.46k
      // Otherwise, emit a diagnostic and continue.
1701
2.46k
      if (!isLexingRawMode())
1702
2.42k
        Diag(CurPtr, diag::ext_dollar_in_identifier);
1703
2.46k
      CurPtr = ConsumeChar(CurPtr, Size, Result);
1704
2.46k
      C = getCharAndSize(CurPtr, Size);
1705
2.46k
      continue;
1706
7.77k
    } else if (C == '\\' && 
tryConsumeIdentifierUCN(CurPtr, Size, Result)186
) {
1707
65
      C = getCharAndSize(CurPtr, Size);
1708
65
      continue;
1709
7.71k
    } else if (!isASCII(C) && 
tryConsumeIdentifierUTF8Char(CurPtr)177
) {
1710
145
      C = getCharAndSize(CurPtr, Size);
1711
145
      continue;
1712
7.56k
    } else if (!isIdentifierBody(C)) {
1713
5.06k
      goto FinishIdentifier;
1714
5.06k
    }
1715
2.50k
1716
2.50k
    // Otherwise, this character is good, consume it.
1717
2.50k
    CurPtr = ConsumeChar(CurPtr, Size, Result);
1718
2.50k
1719
2.50k
    C = getCharAndSize(CurPtr, Size);
1720
20.8k
    while (isIdentifierBody(C)) {
1721
18.3k
      CurPtr = ConsumeChar(CurPtr, Size, Result);
1722
18.3k
      C = getCharAndSize(CurPtr, Size);
1723
18.3k
    }
1724
2.50k
  }
1725
5.05k
}
1726
1727
/// isHexaLiteral - Return true if Start points to a hex constant.
1728
/// in microsoft mode (where this is supposed to be several different tokens).
1729
39.2k
bool Lexer::isHexaLiteral(const char *Start, const LangOptions &LangOpts) {
1730
39.2k
  unsigned Size;
1731
39.2k
  char C1 = Lexer::getCharAndSizeNoWarn(Start, Size, LangOpts);
1732
39.2k
  if (C1 != '0')
1733
27.7k
    return false;
1734
11.5k
  char C2 = Lexer::getCharAndSizeNoWarn(Start + Size, Size, LangOpts);
1735
11.5k
  return (C2 == 'x' || 
C2 == 'X'15
);
1736
11.5k
}
1737
1738
/// LexNumericConstant - Lex the remainder of a integer or floating point
1739
/// constant. From[-1] is the first character lexed.  Return the end of the
1740
/// constant.
1741
46.8M
bool Lexer::LexNumericConstant(Token &Result, const char *CurPtr) {
1742
46.8M
  unsigned Size;
1743
46.8M
  char C = getCharAndSize(CurPtr, Size);
1744
46.8M
  char PrevCh = 0;
1745
159M
  while (isPreprocessingNumberBody(C)) {
1746
112M
    CurPtr = ConsumeChar(CurPtr, Size, Result);
1747
112M
    PrevCh = C;
1748
112M
    C = getCharAndSize(CurPtr, Size);
1749
112M
  }
1750
46.8M
1751
46.8M
  // If we fell out, check for a sign, due to 1e+12.  If we have one, continue.
1752
46.8M
  if ((C == '-' || 
C == '+'46.3M
) &&
(644k
PrevCh == 'E'644k
||
PrevCh == 'e'643k
)) {
1753
544k
    // If we are in Microsoft mode, don't continue if the constant is hex.
1754
544k
    // For example, MSVC will accept the following as 3 tokens: 0x1234567e+1
1755
544k
    if (!LangOpts.MicrosoftExt || 
!isHexaLiteral(BufferPtr, LangOpts)27.7k
)
1756
544k
      return LexNumericConstant(Result, ConsumeChar(CurPtr, Size, Result));
1757
46.2M
  }
1758
46.2M
1759
46.2M
  // If we have a hex FP constant, continue.
1760
46.2M
  if ((C == '-' || 
C == '+'46.2M
) &&
(99.9k
PrevCh == 'P'99.9k
||
PrevCh == 'p'99.9k
)) {
1761
19.6k
    // Outside C99 and C++17, we accept hexadecimal floating point numbers as a
1762
19.6k
    // not-quite-conforming extension. Only do so if this looks like it's
1763
19.6k
    // actually meant to be a hexfloat, and not if it has a ud-suffix.
1764
19.6k
    bool IsHexFloat = true;
1765
19.6k
    if (!LangOpts.C99) {
1766
11.5k
      if (!isHexaLiteral(BufferPtr, LangOpts))
1767
9
        IsHexFloat = false;
1768
11.5k
      else if (!getLangOpts().CPlusPlus17 &&
1769
11.5k
               
std::find(BufferPtr, CurPtr, '_') != CurPtr11.4k
)
1770
3
        IsHexFloat = false;
1771
11.5k
    }
1772
19.6k
    if (IsHexFloat)
1773
19.6k
      return LexNumericConstant(Result, ConsumeChar(CurPtr, Size, Result));
1774
46.2M
  }
1775
46.2M
1776
46.2M
  // If we have a digit separator, continue.
1777
46.2M
  if (C == '\'' && 
getLangOpts().CPlusPlus14249
) {
1778
242
    unsigned NextSize;
1779
242
    char Next = getCharAndSizeNoWarn(CurPtr + Size, NextSize, getLangOpts());
1780
242
    if (isIdentifierBody(Next)) {
1781
232
      if (!isLexingRawMode())
1782
218
        Diag(CurPtr, diag::warn_cxx11_compat_digit_separator);
1783
232
      CurPtr = ConsumeChar(CurPtr, Size, Result);
1784
232
      CurPtr = ConsumeChar(CurPtr, NextSize, Result);
1785
232
      return LexNumericConstant(Result, CurPtr);
1786
232
    }
1787
46.2M
  }
1788
46.2M
1789
46.2M
  // If we have a UCN or UTF-8 character (perhaps in a ud-suffix), continue.
1790
46.2M
  if (C == '\\' && 
tryConsumeIdentifierUCN(CurPtr, Size, Result)5
)
1791
4
    return LexNumericConstant(Result, CurPtr);
1792
46.2M
  if (!isASCII(C) && 
tryConsumeIdentifierUTF8Char(CurPtr)15
)
1793
4
    return LexNumericConstant(Result, CurPtr);
1794
46.2M
1795
46.2M
  // Update the location of token as well as BufferPtr.
1796
46.2M
  const char *TokStart = BufferPtr;
1797
46.2M
  FormTokenWithChars(Result, CurPtr, tok::numeric_constant);
1798
46.2M
  Result.setLiteralData(TokStart);
1799
46.2M
  return true;
1800
46.2M
}
1801
1802
/// LexUDSuffix - Lex the ud-suffix production for user-defined literal suffixes
1803
/// in C++11, or warn on a ud-suffix in C++98.
1804
const char *Lexer::LexUDSuffix(Token &Result, const char *CurPtr,
1805
3.48M
                               bool IsStringLiteral) {
1806
3.48M
  assert(getLangOpts().CPlusPlus);
1807
3.48M
1808
3.48M
  // Maximally munch an identifier.
1809
3.48M
  unsigned Size;
1810
3.48M
  char C = getCharAndSize(CurPtr, Size);
1811
3.48M
  bool Consumed = false;
1812
3.48M
1813
3.48M
  if (!isIdentifierHead(C)) {
1814
3.48M
    if (C == '\\' && 
tryConsumeIdentifierUCN(CurPtr, Size, Result)7
)
1815
0
      Consumed = true;
1816
3.48M
    else if (!isASCII(C) && 
tryConsumeIdentifierUTF8Char(CurPtr)0
)
1817
0
      Consumed = true;
1818
3.48M
    else
1819
3.48M
      return CurPtr;
1820
3.55k
  }
1821
3.55k
1822
3.55k
  if (!getLangOpts().CPlusPlus11) {
1823
17
    if (!isLexingRawMode())
1824
8
      Diag(CurPtr,
1825
8
           C == '_' ? 
diag::warn_cxx11_compat_user_defined_literal3
1826
8
                    : 
diag::warn_cxx11_compat_reserved_user_defined_literal5
)
1827
8
        << FixItHint::CreateInsertion(getSourceLocation(CurPtr), " ");
1828
17
    return CurPtr;
1829
17
  }
1830
3.54k
1831
3.54k
  // C++11 [lex.ext]p10, [usrlit.suffix]p1: A program containing a ud-suffix
1832
3.54k
  // that does not start with an underscore is ill-formed. As a conforming
1833
3.54k
  // extension, we treat all such suffixes as if they had whitespace before
1834
3.54k
  // them. We assume a suffix beginning with a UCN or UTF-8 character is more
1835
3.54k
  // likely to be a ud-suffix than a macro, however, and accept that.
1836
3.54k
  if (!Consumed) {
1837
3.54k
    bool IsUDSuffix = false;
1838
3.54k
    if (C == '_')
1839
220
      IsUDSuffix = true;
1840
3.32k
    else if (IsStringLiteral && 
getLangOpts().CPlusPlus143.30k
) {
1841
1.39k
      // In C++1y, we need to look ahead a few characters to see if this is a
1842
1.39k
      // valid suffix for a string literal or a numeric literal (this could be
1843
1.39k
      // the 'operator""if' defining a numeric literal operator).
1844
1.39k
      const unsigned MaxStandardSuffixLength = 3;
1845
1.39k
      char Buffer[MaxStandardSuffixLength] = { C };
1846
1.39k
      unsigned Consumed = Size;
1847
1.39k
      unsigned Chars = 1;
1848
2.38k
      while (true) {
1849
2.38k
        unsigned NextSize;
1850
2.38k
        char Next = getCharAndSizeNoWarn(CurPtr + Consumed, NextSize,
1851
2.38k
                                         getLangOpts());
1852
2.38k
        if (!isIdentifierBody(Next)) {
1853
1.38k
          // End of suffix. Check whether this is on the whitelist.
1854
1.38k
          const StringRef CompleteSuffix(Buffer, Chars);
1855
1.38k
          IsUDSuffix = StringLiteralParser::isValidUDSuffix(getLangOpts(),
1856
1.38k
                                                            CompleteSuffix);
1857
1.38k
          break;
1858
1.38k
        }
1859
996
1860
996
        if (Chars == MaxStandardSuffixLength)
1861
2
          // Too long: can't be a standard suffix.
1862
2
          break;
1863
994
1864
994
        Buffer[Chars++] = Next;
1865
994
        Consumed += NextSize;
1866
994
      }
1867
1.39k
    }
1868
3.54k
1869
3.54k
    if (!IsUDSuffix) {
1870
2.11k
      if (!isLexingRawMode())
1871
9
        Diag(CurPtr, getLangOpts().MSVCCompat
1872
9
                         ? 
diag::ext_ms_reserved_user_defined_literal0
1873
9
                         : diag::ext_reserved_user_defined_literal)
1874
9
          << FixItHint::CreateInsertion(getSourceLocation(CurPtr), " ");
1875
2.11k
      return CurPtr;
1876
2.11k
    }
1877
1.42k
1878
1.42k
    CurPtr = ConsumeChar(CurPtr, Size, Result);
1879
1.42k
  }
1880
3.54k
1881
3.54k
  Result.setFlag(Token::HasUDSuffix);
1882
3.00k
  while (true) {
1883
3.00k
    C = getCharAndSize(CurPtr, Size);
1884
3.00k
    if (isIdentifierBody(C)) 
{ CurPtr = ConsumeChar(CurPtr, Size, Result); }1.55k
1885
1.45k
    else if (C == '\\' && 
tryConsumeIdentifierUCN(CurPtr, Size, Result)18
)
{}18
1886
1.43k
    else if (!isASCII(C) && 
tryConsumeIdentifierUTF8Char(CurPtr)16
)
{}14
1887
1.42k
    else break;
1888
3.00k
  }
1889
1.42k
1890
1.42k
  return CurPtr;
1891
3.54k
}
1892
1893
/// LexStringLiteral - Lex the remainder of a string literal, after having lexed
1894
/// either " or L" or u8" or u" or U".
1895
bool Lexer::LexStringLiteral(Token &Result, const char *CurPtr,
1896
5.69M
                             tok::TokenKind Kind) {
1897
5.69M
  const char *AfterQuote = CurPtr;
1898
5.69M
  // Does this string contain the \0 character?
1899
5.69M
  const char *NulCharacter = nullptr;
1900
5.69M
1901
5.69M
  if (!isLexingRawMode() &&
1902
5.69M
      
(5.08M
Kind == tok::utf8_string_literal5.08M
||
1903
5.08M
       
Kind == tok::utf16_string_literal5.08M
||
1904
5.08M
       
Kind == tok::utf32_string_literal5.08M
))
1905
249
    Diag(BufferPtr, getLangOpts().CPlusPlus
1906
249
           ? 
diag::warn_cxx98_compat_unicode_literal176
1907
249
           : 
diag::warn_c99_compat_unicode_literal73
);
1908
5.69M
1909
5.69M
  char C = getAndAdvanceChar(CurPtr, Result);
1910
56.6M
  while (C != '"') {
1911
50.9M
    // Skip escaped characters.  Escaped newlines will already be processed by
1912
50.9M
    // getAndAdvanceChar.
1913
50.9M
    if (C == '\\')
1914
203k
      C = getAndAdvanceChar(CurPtr, Result);
1915
50.9M
1916
50.9M
    if (C == '\n' || 
C == '\r'50.9M
|| // Newline.
1917
50.9M
        
(50.9M
C == 050.9M
&&
CurPtr-1 == BufferEnd38
)) { // End of file.
1918
64
      if (!isLexingRawMode() && 
!LangOpts.AsmPreprocessor7
)
1919
5
        Diag(BufferPtr, diag::ext_unterminated_char_or_string) << 1;
1920
64
      FormTokenWithChars(Result, CurPtr-1, tok::unknown);
1921
64
      return true;
1922
64
    }
1923
50.9M
1924
50.9M
    if (C == 0) {
1925
17
      if (isCodeCompletionPoint(CurPtr-1)) {
1926
9
        if (ParsingFilename)
1927
3
          codeCompleteIncludedFile(AfterQuote, CurPtr - 1, /*IsAngled=*/false);
1928
6
        else
1929
6
          PP->CodeCompleteNaturalLanguage();
1930
9
        FormTokenWithChars(Result, CurPtr - 1, tok::unknown);
1931
9
        cutOffLexing();
1932
9
        return true;
1933
9
      }
1934
8
1935
8
      NulCharacter = CurPtr-1;
1936
8
    }
1937
50.9M
    C = getAndAdvanceChar(CurPtr, Result);
1938
50.9M
  }
1939
5.69M
1940
5.69M
  // If we are in C++11, lex the optional ud-suffix.
1941
5.69M
  
if (5.69M
getLangOpts().CPlusPlus5.69M
)
1942
3.40M
    CurPtr = LexUDSuffix(Result, CurPtr, true);
1943
5.69M
1944
5.69M
  // If a nul character existed in the string, warn about it.
1945
5.69M
  if (NulCharacter && 
!isLexingRawMode()8
)
1946
2
    Diag(NulCharacter, diag::null_in_char_or_string) << 1;
1947
5.69M
1948
5.69M
  // Update the location of the token as well as the BufferPtr instance var.
1949
5.69M
  const char *TokStart = BufferPtr;
1950
5.69M
  FormTokenWithChars(Result, CurPtr, Kind);
1951
5.69M
  Result.setLiteralData(TokStart);
1952
5.69M
  return true;
1953
5.69M
}
1954
1955
/// LexRawStringLiteral - Lex the remainder of a raw string literal, after
1956
/// having lexed R", LR", u8R", uR", or UR".
1957
bool Lexer::LexRawStringLiteral(Token &Result, const char *CurPtr,
1958
468
                                tok::TokenKind Kind) {
1959
468
  // This function doesn't use getAndAdvanceChar because C++0x [lex.pptoken]p3:
1960
468
  //  Between the initial and final double quote characters of the raw string,
1961
468
  //  any transformations performed in phases 1 and 2 (trigraphs,
1962
468
  //  universal-character-names, and line splicing) are reverted.
1963
468
1964
468
  if (!isLexingRawMode())
1965
99
    Diag(BufferPtr, diag::warn_cxx98_compat_raw_string_literal);
1966
468
1967
468
  unsigned PrefixLen = 0;
1968
468
1969
1.28k
  while (PrefixLen != 16 && 
isRawStringDelimBody(CurPtr[PrefixLen])1.28k
)
1970
814
    ++PrefixLen;
1971
468
1972
468
  // If the last character was not a '(', then we didn't lex a valid delimiter.
1973
468
  if (CurPtr[PrefixLen] != '(') {
1974
1
    if (!isLexingRawMode()) {
1975
1
      const char *PrefixEnd = &CurPtr[PrefixLen];
1976
1
      if (PrefixLen == 16) {
1977
1
        Diag(PrefixEnd, diag::err_raw_delim_too_long);
1978
1
      } else {
1979
0
        Diag(PrefixEnd, diag::err_invalid_char_raw_delim)
1980
0
          << StringRef(PrefixEnd, 1);
1981
0
      }
1982
1
    }
1983
1
1984
1
    // Search for the next '"' in hopes of salvaging the lexer. Unfortunately,
1985
1
    // it's possible the '"' was intended to be part of the raw string, but
1986
1
    // there's not much we can do about that.
1987
59
    while (true) {
1988
59
      char C = *CurPtr++;
1989
59
1990
59
      if (C == '"')
1991
1
        break;
1992
58
      if (C == 0 && 
CurPtr-1 == BufferEnd0
) {
1993
0
        --CurPtr;
1994
0
        break;
1995
0
      }
1996
58
    }
1997
1
1998
1
    FormTokenWithChars(Result, CurPtr, tok::unknown);
1999
1
    return true;
2000
1
  }
2001
467
2002
467
  // Save prefix and move CurPtr past it
2003
467
  const char *Prefix = CurPtr;
2004
467
  CurPtr += PrefixLen + 1; // skip over prefix and '('
2005
467
2006
8.19k
  while (true) {
2007
8.19k
    char C = *CurPtr++;
2008
8.19k
2009
8.19k
    if (C == ')') {
2010
471
      // Check for prefix match and closing quote.
2011
471
      if (strncmp(CurPtr, Prefix, PrefixLen) == 0 && 
CurPtr[PrefixLen] == '"'463
) {
2012
463
        CurPtr += PrefixLen + 1; // skip over prefix and '"'
2013
463
        break;
2014
463
      }
2015
7.72k
    } else if (C == 0 && 
CurPtr-1 == BufferEnd4
) { // End of file.
2016
4
      if (!isLexingRawMode())
2017
1
        Diag(BufferPtr, diag::err_unterminated_raw_string)
2018
1
          << StringRef(Prefix, PrefixLen);
2019
4
      FormTokenWithChars(Result, CurPtr-1, tok::unknown);
2020
4
      return true;
2021
4
    }
2022
8.19k
  }
2023
467
2024
467
  // If we are in C++11, lex the optional ud-suffix.
2025
467
  
if (463
getLangOpts().CPlusPlus463
)
2026
463
    CurPtr = LexUDSuffix(Result, CurPtr, true);
2027
463
2028
463
  // Update the location of token as well as BufferPtr.
2029
463
  const char *TokStart = BufferPtr;
2030
463
  FormTokenWithChars(Result, CurPtr, Kind);
2031
463
  Result.setLiteralData(TokStart);
2032
463
  return true;
2033
467
}
2034
2035
/// LexAngledStringLiteral - Lex the remainder of an angled string literal,
2036
/// after having lexed the '<' character.  This is used for #include filenames.
2037
841k
bool Lexer::LexAngledStringLiteral(Token &Result, const char *CurPtr) {
2038
841k
  // Does this string contain the \0 character?
2039
841k
  const char *NulCharacter = nullptr;
2040
841k
  const char *AfterLessPos = CurPtr;
2041
841k
  char C = getAndAdvanceChar(CurPtr, Result);
2042
13.7M
  while (C != '>') {
2043
12.8M
    // Skip escaped characters.  Escaped newlines will already be processed by
2044
12.8M
    // getAndAdvanceChar.
2045
12.8M
    if (C == '\\')
2046
6
      C = getAndAdvanceChar(CurPtr, Result);
2047
12.8M
2048
12.8M
    if (C == '\n' || 
C == '\r'12.8M
|| // Newline.
2049
12.8M
        
(12.8M
C == 012.8M
&&
(CurPtr - 1 == BufferEnd)10
)) { // End of file.
2050
10
      // If the filename is unterminated, then it must just be a lone <
2051
10
      // character.  Return this as such.
2052
10
      FormTokenWithChars(Result, AfterLessPos, tok::less);
2053
10
      return true;
2054
10
    }
2055
12.8M
2056
12.8M
    if (C == 0) {
2057
7
      if (isCodeCompletionPoint(CurPtr - 1)) {
2058
6
        codeCompleteIncludedFile(AfterLessPos, CurPtr - 1, /*IsAngled=*/true);
2059
6
        cutOffLexing();
2060
6
        FormTokenWithChars(Result, CurPtr - 1, tok::unknown);
2061
6
        return true;
2062
6
      }
2063
1
      NulCharacter = CurPtr-1;
2064
1
    }
2065
12.8M
    C = getAndAdvanceChar(CurPtr, Result);
2066
12.8M
  }
2067
841k
2068
841k
  // If a nul character existed in the string, warn about it.
2069
841k
  
if (841k
NulCharacter841k
&&
!isLexingRawMode()1
)
2070
1
    Diag(NulCharacter, diag::null_in_char_or_string) << 1;
2071
841k
2072
841k
  // Update the location of token as well as BufferPtr.
2073
841k
  const char *TokStart = BufferPtr;
2074
841k
  FormTokenWithChars(Result, CurPtr, tok::header_name);
2075
841k
  Result.setLiteralData(TokStart);
2076
841k
  return true;
2077
841k
}
2078
2079
void Lexer::codeCompleteIncludedFile(const char *PathStart,
2080
                                     const char *CompletionPoint,
2081
9
                                     bool IsAngled) {
2082
9
  // Completion only applies to the filename, after the last slash.
2083
9
  StringRef PartialPath(PathStart, CompletionPoint - PathStart);
2084
9
  auto Slash = PartialPath.find_last_of(LangOpts.MSVCCompat ? 
"/\\"1
:
"/"8
);
2085
9
  StringRef Dir =
2086
9
      (Slash == StringRef::npos) ? 
""5
:
PartialPath.take_front(Slash)4
;
2087
9
  const char *StartOfFilename =
2088
9
      (Slash == StringRef::npos) ? 
PathStart5
:
PathStart + Slash + 14
;
2089
9
  // Code completion filter range is the filename only, up to completion point.
2090
9
  PP->setCodeCompletionIdentifierInfo(&PP->getIdentifierTable().get(
2091
9
      StringRef(StartOfFilename, CompletionPoint - StartOfFilename)));
2092
9
  // We should replace the characters up to the closing quote, if any.
2093
69
  while (CompletionPoint < BufferEnd) {
2094
69
    char Next = *(CompletionPoint + 1);
2095
69
    if (Next == 0 || Next == '\r' || Next == '\n')
2096
0
      break;
2097
69
    ++CompletionPoint;
2098
69
    if (Next == (IsAngled ? 
'>'51
:
'"'18
))
2099
9
      break;
2100
69
  }
2101
9
  PP->setCodeCompletionTokenRange(
2102
9
      FileLoc.getLocWithOffset(StartOfFilename - BufferStart),
2103
9
      FileLoc.getLocWithOffset(CompletionPoint - BufferStart));
2104
9
  PP->CodeCompleteIncludedFile(Dir, IsAngled);
2105
9
}
2106
2107
/// LexCharConstant - Lex the remainder of a character constant, after having
2108
/// lexed either ' or L' or u8' or u' or U'.
2109
bool Lexer::LexCharConstant(Token &Result, const char *CurPtr,
2110
118k
                            tok::TokenKind Kind) {
2111
118k
  // Does this character contain the \0 character?
2112
118k
  const char *NulCharacter = nullptr;
2113
118k
2114
118k
  if (!isLexingRawMode()) {
2115
98.7k
    if (Kind == tok::utf16_char_constant || 
Kind == tok::utf32_char_constant98.6k
)
2116
143
      Diag(BufferPtr, getLangOpts().CPlusPlus
2117
143
                          ? 
diag::warn_cxx98_compat_unicode_literal131
2118
143
                          : 
diag::warn_c99_compat_unicode_literal12
);
2119
98.5k
    else if (Kind == tok::utf8_char_constant)
2120
24
      Diag(BufferPtr, diag::warn_cxx14_compat_u8_character_literal);
2121
98.7k
  }
2122
118k
2123
118k
  char C = getAndAdvanceChar(CurPtr, Result);
2124
118k
  if (C == '\'') {
2125
28
    if (!isLexingRawMode() && 
!LangOpts.AsmPreprocessor2
)
2126
0
      Diag(BufferPtr, diag::ext_empty_character);
2127
28
    FormTokenWithChars(Result, CurPtr, tok::unknown);
2128
28
    return true;
2129
28
  }
2130
118k
2131
249k
  
while (118k
C != '\'') {
2132
130k
    // Skip escaped characters.
2133
130k
    if (C == '\\')
2134
26.2k
      C = getAndAdvanceChar(CurPtr, Result);
2135
130k
2136
130k
    if (C == '\n' || 
C == '\r'130k
|| // Newline.
2137
130k
        
(130k
C == 0130k
&&
CurPtr-1 == BufferEnd19
)) { // End of file.
2138
41
      if (!isLexingRawMode() && 
!LangOpts.AsmPreprocessor5
)
2139
3
        Diag(BufferPtr, diag::ext_unterminated_char_or_string) << 0;
2140
41
      FormTokenWithChars(Result, CurPtr-1, tok::unknown);
2141
41
      return true;
2142
41
    }
2143
130k
2144
130k
    if (C == 0) {
2145
14
      if (isCodeCompletionPoint(CurPtr-1)) {
2146
6
        PP->CodeCompleteNaturalLanguage();
2147
6
        FormTokenWithChars(Result, CurPtr-1, tok::unknown);
2148
6
        cutOffLexing();
2149
6
        return true;
2150
6
      }
2151
8
2152
8
      NulCharacter = CurPtr-1;
2153
8
    }
2154
130k
    C = getAndAdvanceChar(CurPtr, Result);
2155
130k
  }
2156
118k
2157
118k
  // If we are in C++11, lex the optional ud-suffix.
2158
118k
  
if (118k
getLangOpts().CPlusPlus118k
)
2159
81.0k
    CurPtr = LexUDSuffix(Result, CurPtr, false);
2160
118k
2161
118k
  // If a nul character existed in the character, warn about it.
2162
118k
  if (NulCharacter && 
!isLexingRawMode()8
)
2163
2
    Diag(NulCharacter, diag::null_in_char_or_string) << 0;
2164
118k
2165
118k
  // Update the location of token as well as BufferPtr.
2166
118k
  const char *TokStart = BufferPtr;
2167
118k
  FormTokenWithChars(Result, CurPtr, Kind);
2168
118k
  Result.setLiteralData(TokStart);
2169
118k
  return true;
2170
118k
}
2171
2172
/// SkipWhitespace - Efficiently skip over a series of whitespace characters.
2173
/// Update BufferPtr to point to the next non-whitespace character and return.
2174
///
2175
/// This method forms a token and returns true if KeepWhitespaceMode is enabled.
2176
bool Lexer::SkipWhitespace(Token &Result, const char *CurPtr,
2177
170M
                           bool &TokAtPhysicalStartOfLine) {
2178
170M
  // Whitespace - Skip it, then return the token after the whitespace.
2179
170M
  bool SawNewline = isVerticalWhitespace(CurPtr[-1]);
2180
170M
2181
170M
  unsigned char Char = *CurPtr;
2182
170M
2183
170M
  // Skip consecutive spaces efficiently.
2184
177M
  while (
true177M
) {
2185
177M
    // Skip horizontal whitespace very aggressively.
2186
1.61G
    while (isHorizontalWhitespace(Char))
2187
1.43G
      Char = *++CurPtr;
2188
177M
2189
177M
    // Otherwise if we have something other than whitespace, we're done.
2190
177M
    if (!isVerticalWhitespace(Char))
2191
170M
      break;
2192
7.20M
2193
7.20M
    if (ParsingPreprocessorDirective) {
2194
4.23k
      // End of preprocessor directive line, let LexTokenInternal handle this.
2195
4.23k
      BufferPtr = CurPtr;
2196
4.23k
      return false;
2197
4.23k
    }
2198
7.20M
2199
7.20M
    // OK, but handle newline.
2200
7.20M
    SawNewline = true;
2201
7.20M
    Char = *++CurPtr;
2202
7.20M
  }
2203
170M
2204
170M
  // If the client wants us to return whitespace, return it now.
2205
170M
  
if (170M
isKeepWhitespaceMode()170M
) {
2206
196k
    FormTokenWithChars(Result, CurPtr, tok::unknown);
2207
196k
    if (SawNewline) {
2208
195k
      IsAtStartOfLine = true;
2209
195k
      IsAtPhysicalStartOfLine = true;
2210
195k
    }
2211
196k
    // FIXME: The next token will not have LeadingSpace set.
2212
196k
    return true;
2213
196k
  }
2214
170M
2215
170M
  // If this isn't immediately after a newline, there is leading space.
2216
170M
  char PrevChar = CurPtr[-1];
2217
170M
  bool HasLeadingSpace = !isVerticalWhitespace(PrevChar);
2218
170M
2219
170M
  Result.setFlagValue(Token::LeadingSpace, HasLeadingSpace);
2220
170M
  if (SawNewline) {
2221
166M
    Result.setFlag(Token::StartOfLine);
2222
166M
    TokAtPhysicalStartOfLine = true;
2223
166M
  }
2224
170M
2225
170M
  BufferPtr = CurPtr;
2226
170M
  return false;
2227
170M
}
2228
2229
/// We have just read the // characters from input.  Skip until we find the
2230
/// newline character that terminates the comment.  Then update BufferPtr and
2231
/// return.
2232
///
2233
/// If we're in KeepCommentMode or any CommentHandler has inserted
2234
/// some tokens, this will store the first token and return true.
2235
bool Lexer::SkipLineComment(Token &Result, const char *CurPtr,
2236
12.2M
                            bool &TokAtPhysicalStartOfLine) {
2237
12.2M
  // If Line comments aren't explicitly enabled for this language, emit an
2238
12.2M
  // extension warning.
2239
12.2M
  if (!LangOpts.LineComment && 
!isLexingRawMode()2.30k
) {
2240
48
    Diag(BufferPtr, diag::ext_line_comment);
2241
48
2242
48
    // Mark them enabled so we only emit one warning for this translation
2243
48
    // unit.
2244
48
    LangOpts.LineComment = true;
2245
48
  }
2246
12.2M
2247
12.2M
  // Scan over the body of the comment.  The common case, when scanning, is that
2248
12.2M
  // the comment contains normal ascii characters with nothing interesting in
2249
12.2M
  // them.  As such, optimize for this case with the inner loop.
2250
12.2M
  //
2251
12.2M
  // This loop terminates with CurPtr pointing at the newline (or end of buffer)
2252
12.2M
  // character that ends the line comment.
2253
12.2M
  char C;
2254
12.3M
  while (true) {
2255
12.3M
    C = *CurPtr;
2256
12.3M
    // Skip over characters in the fast loop.
2257
637M
    while (C != 0 &&                // Potentially EOF.
2258
637M
           
C != '\n'637M
&&
C != '\r'624M
) // Newline or DOS-style newline.
2259
624M
      C = *++CurPtr;
2260
12.3M
2261
12.3M
    const char *NextLine = CurPtr;
2262
12.3M
    if (C != 0) {
2263
12.3M
      // We found a newline, see if it's escaped.
2264
12.3M
      const char *EscapePtr = CurPtr-1;
2265
12.3M
      bool HasSpace = false;
2266
12.3M
      while (isHorizontalWhitespace(*EscapePtr)) { // Skip whitespace.
2267
16.2k
        --EscapePtr;
2268
16.2k
        HasSpace = true;
2269
16.2k
      }
2270
12.3M
2271
12.3M
      if (*EscapePtr == '\\')
2272
88.8k
        // Escaped newline.
2273
88.8k
        CurPtr = EscapePtr;
2274
12.2M
      else if (EscapePtr[0] == '/' && 
EscapePtr[-1] == '?'1.56M
&&
2275
12.2M
               
EscapePtr[-2] == '?'6
&&
LangOpts.Trigraphs6
)
2276
3
        // Trigraph-escaped newline.
2277
3
        CurPtr = EscapePtr-2;
2278
12.2M
      else
2279
12.2M
        break; // This is a newline, we're done.
2280
88.8k
2281
88.8k
      // If there was space between the backslash and newline, warn about it.
2282
88.8k
      if (HasSpace && 
!isLexingRawMode()8
)
2283
6
        Diag(EscapePtr, diag::backslash_newline_space);
2284
88.8k
    }
2285
12.3M
2286
12.3M
    // Otherwise, this is a hard case.  Fall back on getAndAdvanceChar to
2287
12.3M
    // properly decode the character.  Read it in raw mode to avoid emitting
2288
12.3M
    // diagnostics about things like trigraphs.  If we see an escaped newline,
2289
12.3M
    // we'll handle it below.
2290
12.3M
    const char *OldPtr = CurPtr;
2291
90.0k
    bool OldRawMode = isLexingRawMode();
2292
90.0k
    LexingRawMode = true;
2293
90.0k
    C = getAndAdvanceChar(CurPtr, Result);
2294
90.0k
    LexingRawMode = OldRawMode;
2295
90.0k
2296
90.0k
    // If we only read only one character, then no special handling is needed.
2297
90.0k
    // We're done and can skip forward to the newline.
2298
90.0k
    if (C != 0 && 
CurPtr == OldPtr+188.8k
) {
2299
0
      CurPtr = NextLine;
2300
0
      break;
2301
0
    }
2302
90.0k
2303
90.0k
    // If we read multiple characters, and one of those characters was a \r or
2304
90.0k
    // \n, then we had an escaped newline within the comment.  Emit diagnostic
2305
90.0k
    // unless the next line is also a // comment.
2306
90.0k
    if (CurPtr != OldPtr + 1 && 
C != '/'88.8k
&&
2307
90.0k
        
(3.92k
CurPtr == BufferEnd + 13.92k
||
CurPtr[0] != '/'3.92k
)) {
2308
7.84k
      for (; OldPtr != CurPtr; 
++OldPtr3.92k
)
2309
7.84k
        if (OldPtr[0] == '\n' || 
OldPtr[0] == '\r'3.92k
) {
2310
3.91k
          // Okay, we found a // comment that ends in a newline, if the next
2311
3.91k
          // line is also a // comment, but has spaces, don't emit a diagnostic.
2312
3.91k
          if (isWhitespace(C)) {
2313
3.74k
            const char *ForwardPtr = CurPtr;
2314
44.9k
            while (isWhitespace(*ForwardPtr))  // Skip whitespace.
2315
41.2k
              ++ForwardPtr;
2316
3.74k
            if (ForwardPtr[0] == '/' && 
ForwardPtr[1] == '/'3.32k
)
2317
3.32k
              break;
2318
593
          }
2319
593
2320
593
          if (!isLexingRawMode())
2321
543
            Diag(OldPtr-1, diag::ext_multi_line_line_comment);
2322
593
          break;
2323
593
        }
2324
3.91k
    }
2325
90.0k
2326
90.0k
    if (C == '\r' || C == '\n' || 
CurPtr == BufferEnd + 190.0k
) {
2327
1.20k
      --CurPtr;
2328
1.20k
      break;
2329
1.20k
    }
2330
88.8k
2331
88.8k
    if (C == '\0' && 
isCodeCompletionPoint(CurPtr-1)9
) {
2332
8
      PP->CodeCompleteNaturalLanguage();
2333
8
      cutOffLexing();
2334
8
      return false;
2335
8
    }
2336
88.8k
  }
2337
12.2M
2338
12.2M
  // Found but did not consume the newline.  Notify comment handlers about the
2339
12.2M
  // comment unless we're in a #if 0 block.
2340
12.2M
  
if (12.2M
PP12.2M
&&
!isLexingRawMode()12.1M
&&
2341
12.2M
      PP->HandleComment(Result, SourceRange(getSourceLocation(BufferPtr),
2342
9.40M
                                            getSourceLocation(CurPtr)))) {
2343
0
    BufferPtr = CurPtr;
2344
0
    return true; // A token has to be returned.
2345
0
  }
2346
12.2M
2347
12.2M
  // If we are returning comments as tokens, return this comment as a token.
2348
12.2M
  if (inKeepCommentMode())
2349
66.6k
    return SaveLineComment(Result, CurPtr);
2350
12.2M
2351
12.2M
  // If we are inside a preprocessor directive and we see the end of line,
2352
12.2M
  // return immediately, so that the lexer can return this as an EOD token.
2353
12.2M
  if (ParsingPreprocessorDirective || 
CurPtr == BufferEnd11.7M
) {
2354
476k
    BufferPtr = CurPtr;
2355
476k
    return false;
2356
476k
  }
2357
11.7M
2358
11.7M
  // Otherwise, eat the \n character.  We don't care if this is a \n\r or
2359
11.7M
  // \r\n sequence.  This is an efficiency hack (because we know the \n can't
2360
11.7M
  // contribute to another token), it isn't needed for correctness.  Note that
2361
11.7M
  // this is ok even in KeepWhitespaceMode, because we would have returned the
2362
11.7M
  /// comment above in that mode.
2363
11.7M
  ++CurPtr;
2364
11.7M
2365
11.7M
  // The next returned token is at the start of the line.
2366
11.7M
  Result.setFlag(Token::StartOfLine);
2367
11.7M
  TokAtPhysicalStartOfLine = true;
2368
11.7M
  // No leading whitespace seen so far.
2369
11.7M
  Result.clearFlag(Token::LeadingSpace);
2370
11.7M
  BufferPtr = CurPtr;
2371
11.7M
  return false;
2372
11.7M
}
2373
2374
/// If in save-comment mode, package up this Line comment in an appropriate
2375
/// way and return it.
2376
66.6k
bool Lexer::SaveLineComment(Token &Result, const char *CurPtr) {
2377
66.6k
  // If we're not in a preprocessor directive, just return the // comment
2378
66.6k
  // directly.
2379
66.6k
  FormTokenWithChars(Result, CurPtr, tok::comment);
2380
66.6k
2381
66.6k
  if (!ParsingPreprocessorDirective || 
LexingRawMode2
)
2382
66.6k
    return true;
2383
2
2384
2
  // If this Line-style comment is in a macro definition, transmogrify it into
2385
2
  // a C-style block comment.
2386
2
  bool Invalid = false;
2387
2
  std::string Spelling = PP->getSpelling(Result, &Invalid);
2388
2
  if (Invalid)
2389
0
    return true;
2390
2
2391
2
  assert(Spelling[0] == '/' && Spelling[1] == '/' && "Not line comment?");
2392
2
  Spelling[1] = '*';   // Change prefix to "/*".
2393
2
  Spelling += "*/";    // add suffix.
2394
2
2395
2
  Result.setKind(tok::comment);
2396
2
  PP->CreateString(Spelling, Result,
2397
2
                   Result.getLocation(), Result.getLocation());
2398
2
  return true;
2399
2
}
2400
2401
/// isBlockCommentEndOfEscapedNewLine - Return true if the specified newline
2402
/// character (either \\n or \\r) is part of an escaped newline sequence.  Issue
2403
/// a diagnostic if so.  We know that the newline is inside of a block comment.
2404
static bool isEndOfBlockCommentWithEscapedNewLine(const char *CurPtr,
2405
28.7k
                                                  Lexer *L) {
2406
28.7k
  assert(CurPtr[0] == '\n' || CurPtr[0] == '\r');
2407
28.7k
2408
28.7k
  // Back up off the newline.
2409
28.7k
  --CurPtr;
2410
28.7k
2411
28.7k
  // If this is a two-character newline sequence, skip the other character.
2412
28.7k
  if (CurPtr[0] == '\n' || 
CurPtr[0] == '\r'2.39k
) {
2413
26.3k
    // \n\n or \r\r -> not escaped newline.
2414
26.3k
    if (CurPtr[0] == CurPtr[1])
2415
26.3k
      return false;
2416
0
    // \n\r or \r\n -> skip the newline.
2417
0
    --CurPtr;
2418
0
  }
2419
28.7k
2420
28.7k
  // If we have horizontal whitespace, skip over it.  We allow whitespace
2421
28.7k
  // between the slash and newline.
2422
28.7k
  bool HasSpace = false;
2423
2.43k
  while (isHorizontalWhitespace(*CurPtr) || 
*CurPtr == 02.39k
) {
2424
39
    --CurPtr;
2425
39
    HasSpace = true;
2426
39
  }
2427
2.39k
2428
2.39k
  // If we have a slash, we know this is an escaped newline.
2429
2.39k
  if (*CurPtr == '\\') {
2430
8
    if (CurPtr[-1] != '*') 
return false0
;
2431
2.38k
  } else {
2432
2.38k
    // It isn't a slash, is it the ?? / trigraph?
2433
2.38k
    if (CurPtr[0] != '/' || 
CurPtr[-1] != '?'8
||
CurPtr[-2] != '?'5
||
2434
2.38k
        
CurPtr[-3] != '*'5
)
2435
2.38k
      return false;
2436
5
2437
5
    // This is the trigraph ending the comment.  Emit a stern warning!
2438
5
    CurPtr -= 2;
2439
5
2440
5
    // If no trigraphs are enabled, warn that we ignored this trigraph and
2441
5
    // ignore this * character.
2442
5
    if (!L->getLangOpts().Trigraphs) {
2443
0
      if (!L->isLexingRawMode())
2444
0
        L->Diag(CurPtr, diag::trigraph_ignored_block_comment);
2445
0
      return false;
2446
0
    }
2447
5
    if (!L->isLexingRawMode())
2448
5
      L->Diag(CurPtr, diag::trigraph_ends_block_comment);
2449
5
  }
2450
2.39k
2451
2.39k
  // Warn about having an escaped newline between the */ characters.
2452
2.39k
  
if (13
!L->isLexingRawMode()13
)
2453
10
    L->Diag(CurPtr, diag::escaped_newline_block_comment_end);
2454
13
2455
13
  // If there was space between the backslash and newline, warn about it.
2456
13
  if (HasSpace && !L->isLexingRawMode())
2457
10
    L->Diag(CurPtr, diag::backslash_newline_space);
2458
13
2459
13
  return true;
2460
2.39k
}
2461
2462
#ifdef __SSE2__
2463
#include <emmintrin.h>
2464
#elif __ALTIVEC__
2465
#include <altivec.h>
2466
#undef bool
2467
#endif
2468
2469
/// We have just read from input the / and * characters that started a comment.
2470
/// Read until we find the * and / characters that terminate the comment.
2471
/// Note that we don't bother decoding trigraphs or escaped newlines in block
2472
/// comments, because they cannot cause the comment to end.  The only thing
2473
/// that can happen is the comment could end with an escaped newline between
2474
/// the terminating * and /.
2475
///
2476
/// If we're in KeepCommentMode or any CommentHandler has inserted
2477
/// some tokens, this will store the first token and return true.
2478
bool Lexer::SkipBlockComment(Token &Result, const char *CurPtr,
2479
5.97M
                             bool &TokAtPhysicalStartOfLine) {
2480
5.97M
  // Scan one character past where we should, looking for a '/' character.  Once
2481
5.97M
  // we find it, check to see if it was preceded by a *.  This common
2482
5.97M
  // optimization helps people who like to put a lot of * characters in their
2483
5.97M
  // comments.
2484
5.97M
2485
5.97M
  // The first character we get with newlines and trigraphs skipped to handle
2486
5.97M
  // the degenerate /*/ case below correctly if the * has an escaped newline
2487
5.97M
  // after it.
2488
5.97M
  unsigned CharSize;
2489
5.97M
  unsigned char C = getCharAndSize(CurPtr, CharSize);
2490
5.97M
  CurPtr += CharSize;
2491
5.97M
  if (C == 0 && 
CurPtr == BufferEnd+12
) {
2492
1
    if (!isLexingRawMode())
2493
0
      Diag(BufferPtr, diag::err_unterminated_block_comment);
2494
1
    --CurPtr;
2495
1
2496
1
    // KeepWhitespaceMode should return this broken comment as a token.  Since
2497
1
    // it isn't a well formed comment, just return it as an 'unknown' token.
2498
1
    if (isKeepWhitespaceMode()) {
2499
0
      FormTokenWithChars(Result, CurPtr, tok::unknown);
2500
0
      return true;
2501
0
    }
2502
1
2503
1
    BufferPtr = CurPtr;
2504
1
    return false;
2505
1
  }
2506
5.97M
2507
5.97M
  // Check to see if the first character after the '/*' is another /.  If so,
2508
5.97M
  // then this slash does not end the block comment, it is part of it.
2509
5.97M
  if (C == '/')
2510
12
    C = *CurPtr++;
2511
5.97M
2512
9.18M
  while (true) {
2513
9.18M
    // Skip over all non-interesting characters until we find end of buffer or a
2514
9.18M
    // (probably ending) '/' character.
2515
9.18M
    if (CurPtr + 24 < BufferEnd &&
2516
9.18M
        // If there is a code-completion point avoid the fast scan because it
2517
9.18M
        // doesn't check for '\0'.
2518
9.18M
        
!(8.90M
PP8.90M
&&
PP->getCodeCompletionFileLoc() == FileLoc8.90M
)) {
2519
8.90M
      // While not aligned to a 16-byte boundary.
2520
70.0M
      while (C != '/' && 
((intptr_t)CurPtr & 0x0F) != 068.3M
)
2521
61.1M
        C = *CurPtr++;
2522
8.90M
2523
8.90M
      if (C == '/') 
goto FoundSlash1.63M
;
2524
7.26M
2525
7.26M
#ifdef __SSE2__
2526
7.26M
      __m128i Slashes = _mm_set1_epi8('/');
2527
61.7M
      while (CurPtr+16 <= BufferEnd) {
2528
61.7M
        int cmp = _mm_movemask_epi8(_mm_cmpeq_epi8(*(const __m128i*)CurPtr,
2529
61.7M
                                    Slashes));
2530
61.7M
        if (cmp != 0) {
2531
7.22M
          // Adjust the pointer to point directly after the first slash. It's
2532
7.22M
          // not necessary to set C here, it will be overwritten at the end of
2533
7.22M
          // the outer loop.
2534
7.22M
          CurPtr += llvm::countTrailingZeros<unsigned>(cmp) + 1;
2535
7.22M
          goto FoundSlash;
2536
7.22M
        }
2537
54.5M
        CurPtr += 16;
2538
54.5M
      }
2539
#elif __ALTIVEC__
2540
      __vector unsigned char Slashes = {
2541
        '/', '/', '/', '/',  '/', '/', '/', '/',
2542
        '/', '/', '/', '/',  '/', '/', '/', '/'
2543
      };
2544
      while (CurPtr+16 <= BufferEnd &&
2545
             !vec_any_eq(*(const vector unsigned char*)CurPtr, Slashes))
2546
        CurPtr += 16;
2547
#else
2548
      // Scan for '/' quickly.  Many block comments are very large.
2549
      while (CurPtr[0] != '/' &&
2550
             CurPtr[1] != '/' &&
2551
             CurPtr[2] != '/' &&
2552
             CurPtr[3] != '/' &&
2553
             CurPtr+4 < BufferEnd) {
2554
        CurPtr += 4;
2555
      }
2556
#endif
2557
2558
7.26M
      // It has to be one of the bytes scanned, increment to it and read one.
2559
7.26M
      C = *CurPtr++;
2560
35.8k
    }
2561
9.18M
2562
9.18M
    // Loop to scan the remainder.
2563
9.18M
    
while (313k
C != '/'4.51M
&&
C != '\0'4.19M
)
2564
4.19M
      C = *CurPtr++;
2565
313k
2566
313k
    if (C == '/') {
2567
9.18M
  FoundSlash:
2568
9.18M
      if (CurPtr[-2] == '*')  // We found the final */.  We're done!
2569
5.97M
        break;
2570
3.20M
2571
3.20M
      if ((CurPtr[-2] == '\n' || 
CurPtr[-2] == '\r'3.18M
)) {
2572
28.7k
        if (isEndOfBlockCommentWithEscapedNewLine(CurPtr-2, this)) {
2573
13
          // We found the final */, though it had an escaped newline between the
2574
13
          // * and /.  We're done!
2575
13
          break;
2576
13
        }
2577
3.20M
      }
2578
3.20M
      if (CurPtr[0] == '*' && 
CurPtr[1] != '/'401
) {
2579
401
        // If this is a /* inside of the comment, emit a warning.  Don't do this
2580
401
        // if this is a /*/, which will end the comment.  This misses cases with
2581
401
        // embedded escaped newlines, but oh well.
2582
401
        if (!isLexingRawMode())
2583
398
          Diag(CurPtr-1, diag::warn_nested_block_comment);
2584
401
      }
2585
3.20M
    } else 
if (14
C == 014
&&
CurPtr == BufferEnd+114
) {
2586
2
      if (!isLexingRawMode())
2587
2
        Diag(BufferPtr, diag::err_unterminated_block_comment);
2588
2
      // Note: the user probably forgot a */.  We could continue immediately
2589
2
      // after the /*, but this would involve lexing a lot of what really is the
2590
2
      // comment, which surely would confuse the parser.
2591
2
      --CurPtr;
2592
2
2593
2
      // KeepWhitespaceMode should return this broken comment as a token.  Since
2594
2
      // it isn't a well formed comment, just return it as an 'unknown' token.
2595
2
      if (isKeepWhitespaceMode()) {
2596
0
        FormTokenWithChars(Result, CurPtr, tok::unknown);
2597
0
        return true;
2598
0
      }
2599
2
2600
2
      BufferPtr = CurPtr;
2601
2
      return false;
2602
12
    } else if (C == '\0' && isCodeCompletionPoint(CurPtr-1)) {
2603
9
      PP->CodeCompleteNaturalLanguage();
2604
9
      cutOffLexing();
2605
9
      return false;
2606
9
    }
2607
3.20M
2608
3.20M
    C = *CurPtr++;
2609
3.20M
  }
2610
5.97M
2611
5.97M
  // Notify comment handlers about the comment unless we're in a #if 0 block.
2612
5.97M
  
if (5.97M
PP5.97M
&&
!isLexingRawMode()5.96M
&&
2613
5.97M
      PP->HandleComment(Result, SourceRange(getSourceLocation(BufferPtr),
2614
4.90M
                                            getSourceLocation(CurPtr)))) {
2615
0
    BufferPtr = CurPtr;
2616
0
    return true; // A token has to be returned.
2617
0
  }
2618
5.97M
2619
5.97M
  // If we are returning comments as tokens, return this comment as a token.
2620
5.97M
  if (inKeepCommentMode()) {
2621
2.05k
    FormTokenWithChars(Result, CurPtr, tok::comment);
2622
2.05k
    return true;
2623
2.05k
  }
2624
5.96M
2625
5.96M
  // It is common for the tokens immediately after a /**/ comment to be
2626
5.96M
  // whitespace.  Instead of going through the big switch, handle it
2627
5.96M
  // efficiently now.  This is safe even in KeepWhitespaceMode because we would
2628
5.96M
  // have already returned above with the comment as a token.
2629
5.96M
  if (isHorizontalWhitespace(*CurPtr)) {
2630
51.4k
    SkipWhitespace(Result, CurPtr+1, TokAtPhysicalStartOfLine);
2631
51.4k
    return false;
2632
51.4k
  }
2633
5.91M
2634
5.91M
  // Otherwise, just return so that the next character will be lexed as a token.
2635
5.91M
  BufferPtr = CurPtr;
2636
5.91M
  Result.setFlag(Token::LeadingSpace);
2637
5.91M
  return false;
2638
5.91M
}
2639
2640
//===----------------------------------------------------------------------===//
2641
// Primary Lexing Entry Points
2642
//===----------------------------------------------------------------------===//
2643
2644
/// ReadToEndOfLine - Read the rest of the current preprocessor line as an
2645
/// uninterpreted string.  This switches the lexer out of directive mode.
2646
1.80k
void Lexer::ReadToEndOfLine(SmallVectorImpl<char> *Result) {
2647
1.80k
  assert(ParsingPreprocessorDirective && ParsingFilename == false &&
2648
1.80k
         "Must be in a preprocessing directive!");
2649
1.80k
  Token Tmp;
2650
1.80k
2651
1.80k
  // CurPtr - Cache BufferPtr in an automatic variable.
2652
1.80k
  const char *CurPtr = BufferPtr;
2653
33.4k
  while (true) {
2654
33.4k
    char Char = getAndAdvanceChar(CurPtr, Tmp);
2655
33.4k
    switch (Char) {
2656
33.4k
    default:
2657
31.6k
      if (Result)
2658
4.68k
        Result->push_back(Char);
2659
31.6k
      break;
2660
33.4k
    case 0:  // Null.
2661
18
      // Found end of file?
2662
18
      if (CurPtr-1 != BufferEnd) {
2663
18
        if (isCodeCompletionPoint(CurPtr-1)) {
2664
18
          PP->CodeCompleteNaturalLanguage();
2665
18
          cutOffLexing();
2666
18
          return;
2667
18
        }
2668
0
2669
0
        // Nope, normal character, continue.
2670
0
        if (Result)
2671
0
          Result->push_back(Char);
2672
0
        break;
2673
0
      }
2674
0
      // FALL THROUGH.
2675
0
      LLVM_FALLTHROUGH;
2676
1.78k
    case '\r':
2677
1.78k
    case '\n':
2678
1.78k
      // Okay, we found the end of the line. First, back up past the \0, \r, \n.
2679
1.78k
      assert(CurPtr[-1] == Char && "Trigraphs for newline?");
2680
1.78k
      BufferPtr = CurPtr-1;
2681
1.78k
2682
1.78k
      // Next, lex the character, which should handle the EOD transition.
2683
1.78k
      Lex(Tmp);
2684
1.78k
      if (Tmp.is(tok::code_completion)) {
2685
0
        if (PP)
2686
0
          PP->CodeCompleteNaturalLanguage();
2687
0
        Lex(Tmp);
2688
0
      }
2689
1.78k
      assert(Tmp.is(tok::eod) && "Unexpected token!");
2690
1.78k
2691
1.78k
      // Finally, we're done;
2692
1.78k
      return;
2693
33.4k
    }
2694
33.4k
  }
2695
1.80k
}
2696
2697
/// LexEndOfFile - CurPtr points to the end of this file.  Handle this
2698
/// condition, reporting diagnostics and handling other edge cases as required.
2699
/// This returns true if Result contains a token, false if PP.Lex should be
2700
/// called again.
2701
637k
bool Lexer::LexEndOfFile(Token &Result, const char *CurPtr) {
2702
637k
  // If we hit the end of the file while parsing a preprocessor directive,
2703
637k
  // end the preprocessor directive first.  The next token returned will
2704
637k
  // then be the end of file.
2705
637k
  if (ParsingPreprocessorDirective) {
2706
1.13k
    // Done parsing the "line".
2707
1.13k
    ParsingPreprocessorDirective = false;
2708
1.13k
    // Update the location of token as well as BufferPtr.
2709
1.13k
    FormTokenWithChars(Result, CurPtr, tok::eod);
2710
1.13k
2711
1.13k
    // Restore comment saving mode, in case it was disabled for directive.
2712
1.13k
    if (PP)
2713
1.13k
      resetExtendedTokenMode();
2714
1.13k
    return true;  // Have a token.
2715
1.13k
  }
2716
636k
2717
636k
  // If we are in raw mode, return this event as an EOF token.  Let the caller
2718
636k
  // that put us in raw mode handle the event.
2719
636k
  if (isLexingRawMode()) {
2720
34.2k
    Result.startToken();
2721
34.2k
    BufferPtr = BufferEnd;
2722
34.2k
    FormTokenWithChars(Result, BufferEnd, tok::eof);
2723
34.2k
    return true;
2724
34.2k
  }
2725
601k
2726
601k
  if (PP->isRecordingPreamble() && 
PP->isInPrimaryFile()251
) {
2727
89
    PP->setRecordedPreambleConditionalStack(ConditionalStack);
2728
89
    ConditionalStack.clear();
2729
89
  }
2730
601k
2731
601k
  // Issue diagnostics for unterminated #if and missing newline.
2732
601k
2733
601k
  // If we are in a #if directive, emit an error.
2734
601k
  while (!ConditionalStack.empty()) {
2735
8
    if (PP->getCodeCompletionFileLoc() != FileLoc)
2736
8
      PP->Diag(ConditionalStack.back().IfLoc,
2737
8
               diag::err_pp_unterminated_conditional);
2738
8
    ConditionalStack.pop_back();
2739
8
  }
2740
601k
2741
601k
  // C99 5.1.1.2p2: If the file is non-empty and didn't end in a newline, issue
2742
601k
  // a pedwarn.
2743
601k
  if (CurPtr != BufferStart && 
(597k
CurPtr[-1] != '\n'597k
&&
CurPtr[-1] != '\r'7.24k
)) {
2744
7.24k
    DiagnosticsEngine &Diags = PP->getDiagnostics();
2745
7.24k
    SourceLocation EndLoc = getSourceLocation(BufferEnd);
2746
7.24k
    unsigned DiagID;
2747
7.24k
2748
7.24k
    if (LangOpts.CPlusPlus11) {
2749
5.16k
      // C++11 [lex.phases] 2.2 p2
2750
5.16k
      // Prefer the C++98 pedantic compatibility warning over the generic,
2751
5.16k
      // non-extension, user-requested "missing newline at EOF" warning.
2752
5.16k
      if (!Diags.isIgnored(diag::warn_cxx98_compat_no_newline_eof, EndLoc)) {
2753
2
        DiagID = diag::warn_cxx98_compat_no_newline_eof;
2754
5.16k
      } else {
2755
5.16k
        DiagID = diag::warn_no_newline_eof;
2756
5.16k
      }
2757
5.16k
    } else {
2758
2.07k
      DiagID = diag::ext_no_newline_eof;
2759
2.07k
    }
2760
7.24k
2761
7.24k
    Diag(BufferEnd, DiagID)
2762
7.24k
      << FixItHint::CreateInsertion(EndLoc, "\n");
2763
7.24k
  }
2764
601k
2765
601k
  BufferPtr = CurPtr;
2766
601k
2767
601k
  // Finally, let the preprocessor handle this.
2768
601k
  return PP->HandleEndOfFile(Result, isPragmaLexer());
2769
601k
}
2770
2771
/// isNextPPTokenLParen - Return 1 if the next unexpanded token lexed from
2772
/// the specified lexer will return a tok::l_paren token, 0 if it is something
2773
/// else and 2 if there are no more tokens in the buffer controlled by the
2774
/// lexer.
2775
1.31M
unsigned Lexer::isNextPPTokenLParen() {
2776
1.31M
  assert(!LexingRawMode && "How can we expand a macro from a skipping buffer?");
2777
1.31M
2778
1.31M
  // Switch to 'skipping' mode.  This will ensure that we can lex a token
2779
1.31M
  // without emitting diagnostics, disables macro expansion, and will cause EOF
2780
1.31M
  // to return an EOF token instead of popping the include stack.
2781
1.31M
  LexingRawMode = true;
2782
1.31M
2783
1.31M
  // Save state that can be changed while lexing so that we can restore it.
2784
1.31M
  const char *TmpBufferPtr = BufferPtr;
2785
1.31M
  bool inPPDirectiveMode = ParsingPreprocessorDirective;
2786
1.31M
  bool atStartOfLine = IsAtStartOfLine;
2787
1.31M
  bool atPhysicalStartOfLine = IsAtPhysicalStartOfLine;
2788
1.31M
  bool leadingSpace = HasLeadingSpace;
2789
1.31M
2790
1.31M
  Token Tok;
2791
1.31M
  Lex(Tok);
2792
1.31M
2793
1.31M
  // Restore state that may have changed.
2794
1.31M
  BufferPtr = TmpBufferPtr;
2795
1.31M
  ParsingPreprocessorDirective = inPPDirectiveMode;
2796
1.31M
  HasLeadingSpace = leadingSpace;
2797
1.31M
  IsAtStartOfLine = atStartOfLine;
2798
1.31M
  IsAtPhysicalStartOfLine = atPhysicalStartOfLine;
2799
1.31M
2800
1.31M
  // Restore the lexer back to non-skipping mode.
2801
1.31M
  LexingRawMode = false;
2802
1.31M
2803
1.31M
  if (Tok.is(tok::eof))
2804
3
    return 2;
2805
1.31M
  return Tok.is(tok::l_paren);
2806
1.31M
}
2807
2808
/// Find the end of a version control conflict marker.
2809
static const char *FindConflictEnd(const char *CurPtr, const char *BufferEnd,
2810
10
                                   ConflictMarkerKind CMK) {
2811
10
  const char *Terminator = CMK == CMK_Perforce ? 
"<<<<\n"5
:
">>>>>>>"5
;
2812
10
  size_t TermLen = CMK == CMK_Perforce ? 
55
:
75
;
2813
10
  auto RestOfBuffer = StringRef(CurPtr, BufferEnd - CurPtr).substr(TermLen);
2814
10
  size_t Pos = RestOfBuffer.find(Terminator);
2815
11
  while (Pos != StringRef::npos) {
2816
8
    // Must occur at start of line.
2817
8
    if (Pos == 0 ||
2818
8
        
(7
RestOfBuffer[Pos - 1] != '\r'7
&&
RestOfBuffer[Pos - 1] != '\n'7
)) {
2819
1
      RestOfBuffer = RestOfBuffer.substr(Pos+TermLen);
2820
1
      Pos = RestOfBuffer.find(Terminator);
2821
1
      continue;
2822
1
    }
2823
7
    return RestOfBuffer.data()+Pos;
2824
7
  }
2825
10
  
return nullptr3
;
2826
10
}
2827
2828
/// IsStartOfConflictMarker - If the specified pointer is the start of a version
2829
/// control conflict marker like '<<<<<<<', recognize it as such, emit an error
2830
/// and recover nicely.  This returns true if it is a conflict marker and false
2831
/// if not.
2832
2.74k
bool Lexer::IsStartOfConflictMarker(const char *CurPtr) {
2833
2.74k
  // Only a conflict marker if it starts at the beginning of a line.
2834
2.74k
  if (CurPtr != BufferStart &&
2835
2.74k
      CurPtr[-1] != '\n' && 
CurPtr[-1] != '\r'2.71k
)
2836
2.71k
    return false;
2837
28
2838
28
  // Check to see if we have <<<<<<< or >>>>.
2839
28
  if (!StringRef(CurPtr, BufferEnd - CurPtr).startswith("<<<<<<<") &&
2840
28
      
!StringRef(CurPtr, BufferEnd - CurPtr).startswith(">>>> ")19
)
2841
14
    return false;
2842
14
2843
14
  // If we have a situation where we don't care about conflict markers, ignore
2844
14
  // it.
2845
14
  if (CurrentConflictMarkerState || isLexingRawMode())
2846
9
    return false;
2847
5
2848
5
  ConflictMarkerKind Kind = *CurPtr == '<' ? 
CMK_Normal3
:
CMK_Perforce2
;
2849
5
2850
5
  // Check to see if there is an ending marker somewhere in the buffer at the
2851
5
  // start of a line to terminate this conflict marker.
2852
5
  if (FindConflictEnd(CurPtr, BufferEnd, Kind)) {
2853
4
    // We found a match.  We are really in a conflict marker.
2854
4
    // Diagnose this, and ignore to the end of line.
2855
4
    Diag(CurPtr, diag::err_conflict_marker);
2856
4
    CurrentConflictMarkerState = Kind;
2857
4
2858
4
    // Skip ahead to the end of line.  We know this exists because the
2859
4
    // end-of-conflict marker starts with \r or \n.
2860
76
    while (*CurPtr != '\r' && *CurPtr != '\n') {
2861
72
      assert(CurPtr != BufferEnd && "Didn't find end of line");
2862
72
      ++CurPtr;
2863
72
    }
2864
4
    BufferPtr = CurPtr;
2865
4
    return true;
2866
4
  }
2867
1
2868
1
  // No end of conflict marker found.
2869
1
  return false;
2870
1
}
2871
2872
/// HandleEndOfConflictMarker - If this is a '====' or '||||' or '>>>>', or if
2873
/// it is '<<<<' and the conflict marker started with a '>>>>' marker, then it
2874
/// is the end of a conflict marker.  Handle it by ignoring up until the end of
2875
/// the line.  This returns true if it is a conflict marker and false if not.
2876
2.79k
bool Lexer::HandleEndOfConflictMarker(const char *CurPtr) {
2877
2.79k
  // Only a conflict marker if it starts at the beginning of a line.
2878
2.79k
  if (CurPtr != BufferStart &&
2879
2.79k
      CurPtr[-1] != '\n' && 
CurPtr[-1] != '\r'2.75k
)
2880
2.75k
    return false;
2881
45
2882
45
  // If we have a situation where we don't care about conflict markers, ignore
2883
45
  // it.
2884
45
  if (!CurrentConflictMarkerState || 
isLexingRawMode()5
)
2885
40
    return false;
2886
5
2887
5
  // Check to see if we have the marker (4 characters in a row).
2888
20
  
for (unsigned i = 1; 5
i != 4;
++i15
)
2889
15
    if (CurPtr[i] != CurPtr[0])
2890
0
      return false;
2891
5
2892
5
  // If we do have it, search for the end of the conflict marker.  This could
2893
5
  // fail if it got skipped with a '#if 0' or something.  Note that CurPtr might
2894
5
  // be the end of conflict marker.
2895
5
  if (const char *End = FindConflictEnd(CurPtr, BufferEnd,
2896
3
                                        CurrentConflictMarkerState)) {
2897
3
    CurPtr = End;
2898
3
2899
3
    // Skip ahead to the end of line.
2900
37
    while (CurPtr != BufferEnd && *CurPtr != '\r' && *CurPtr != '\n')
2901
34
      ++CurPtr;
2902
3
2903
3
    BufferPtr = CurPtr;
2904
3
2905
3
    // No longer in the conflict marker.
2906
3
    CurrentConflictMarkerState = CMK_None;
2907
3
    return true;
2908
3
  }
2909
2
2910
2
  return false;
2911
2
}
2912
2913
static const char *findPlaceholderEnd(const char *CurPtr,
2914
43
                                      const char *BufferEnd) {
2915
43
  if (CurPtr == BufferEnd)
2916
0
    return nullptr;
2917
43
  BufferEnd -= 1; // Scan until the second last character.
2918
404
  for (; CurPtr != BufferEnd; 
++CurPtr361
) {
2919
404
    if (CurPtr[0] == '#' && 
CurPtr[1] == '>'43
)
2920
43
      return CurPtr + 2;
2921
404
  }
2922
43
  
return nullptr0
;
2923
43
}
2924
2925
45
bool Lexer::lexEditorPlaceholder(Token &Result, const char *CurPtr) {
2926
45
  assert(CurPtr[-1] == '<' && CurPtr[0] == '#' && "Not a placeholder!");
2927
45
  if (!PP || !PP->getPreprocessorOpts().LexEditorPlaceholders || 
LexingRawMode44
)
2928
2
    return false;
2929
43
  const char *End = findPlaceholderEnd(CurPtr + 1, BufferEnd);
2930
43
  if (!End)
2931
0
    return false;
2932
43
  const char *Start = CurPtr - 1;
2933
43
  if (!LangOpts.AllowEditorPlaceholders)
2934
22
    Diag(Start, diag::err_placeholder_in_source);
2935
43
  Result.startToken();
2936
43
  FormTokenWithChars(Result, End, tok::raw_identifier);
2937
43
  Result.setRawIdentifierData(Start);
2938
43
  PP->LookUpIdentifierInfo(Result);
2939
43
  Result.setFlag(Token::IsEditorPlaceholder);
2940
43
  BufferPtr = End;
2941
43
  return true;
2942
43
}
2943
2944
227M
bool Lexer::isCodeCompletionPoint(const char *CurPtr) const {
2945
227M
  if (PP && 
PP->isCodeCompletionEnabled()227M
) {
2946
917k
    SourceLocation Loc = FileLoc.getLocWithOffset(CurPtr-BufferStart);
2947
917k
    return Loc == PP->getCodeCompletionLoc();
2948
917k
  }
2949
226M
2950
226M
  return false;
2951
226M
}
2952
2953
uint32_t Lexer::tryReadUCN(const char *&StartPtr, const char *SlashLoc,
2954
827
                           Token *Result) {
2955
827
  unsigned CharSize;
2956
827
  char Kind = getCharAndSize(StartPtr, CharSize);
2957
827
2958
827
  unsigned NumHexDigits;
2959
827
  if (Kind == 'u')
2960
232
    NumHexDigits = 4;
2961
595
  else if (Kind == 'U')
2962
36
    NumHexDigits = 8;
2963
559
  else
2964
559
    return 0;
2965
268
2966
268
  if (!LangOpts.CPlusPlus && 
!LangOpts.C99132
) {
2967
5
    if (Result && 
!isLexingRawMode()3
)
2968
3
      Diag(SlashLoc, diag::warn_ucn_not_valid_in_c89);
2969
5
    return 0;
2970
5
  }
2971
263
2972
263
  const char *CurPtr = StartPtr + CharSize;
2973
263
  const char *KindLoc = &CurPtr[-1];
2974
263
2975
263
  uint32_t CodePoint = 0;
2976
1.39k
  for (unsigned i = 0; i < NumHexDigits; 
++i1.12k
) {
2977
1.14k
    char C = getCharAndSize(CurPtr, CharSize);
2978
1.14k
2979
1.14k
    unsigned Value = llvm::hexDigitValue(C);
2980
1.14k
    if (Value == -1U) {
2981
21
      if (Result && 
!isLexingRawMode()18
) {
2982
18
        if (i == 0) {
2983
6
          Diag(BufferPtr, diag::warn_ucn_escape_no_digits)
2984
6
            << StringRef(KindLoc, 1);
2985
12
        } else {
2986
12
          Diag(BufferPtr, diag::warn_ucn_escape_incomplete);
2987
12
2988
12
          // If the user wrote \U1234, suggest a fixit to \u.
2989
12
          if (i == 4 && 
NumHexDigits == 83
) {
2990
3
            CharSourceRange URange = makeCharRange(*this, KindLoc, KindLoc + 1);
2991
3
            Diag(KindLoc, diag::note_ucn_four_not_eight)
2992
3
              << FixItHint::CreateReplacement(URange, "u");
2993
3
          }
2994
12
        }
2995
18
      }
2996
21
2997
21
      return 0;
2998
21
    }
2999
1.12k
3000
1.12k
    CodePoint <<= 4;
3001
1.12k
    CodePoint += Value;
3002
1.12k
3003
1.12k
    CurPtr += CharSize;
3004
1.12k
  }
3005
263
3006
263
  
if (242
Result242
) {
3007
123
    Result->setFlag(Token::HasUCN);
3008
123
    if (CurPtr - StartPtr == (ptrdiff_t)NumHexDigits + 2)
3009
0
      StartPtr = CurPtr;
3010
123
    else
3011
802
      
while (123
StartPtr != CurPtr)
3012
679
        (void)getAndAdvanceChar(StartPtr, *Result);
3013
123
  } else {
3014
119
    StartPtr = CurPtr;
3015
119
  }
3016
242
3017
242
  // Don't apply C family restrictions to UCNs in assembly mode
3018
242
  if (LangOpts.AsmPreprocessor)
3019
6
    return CodePoint;
3020
236
3021
236
  // C99 6.4.3p2: A universal character name shall not specify a character whose
3022
236
  //   short identifier is less than 00A0 other than 0024 ($), 0040 (@), or
3023
236
  //   0060 (`), nor one in the range D800 through DFFF inclusive.)
3024
236
  // C++11 [lex.charset]p2: If the hexadecimal value for a
3025
236
  //   universal-character-name corresponds to a surrogate code point (in the
3026
236
  //   range 0xD800-0xDFFF, inclusive), the program is ill-formed. Additionally,
3027
236
  //   if the hexadecimal value for a universal-character-name outside the
3028
236
  //   c-char-sequence, s-char-sequence, or r-char-sequence of a character or
3029
236
  //   string literal corresponds to a control character (in either of the
3030
236
  //   ranges 0x00-0x1F or 0x7F-0x9F, both inclusive) or to a character in the
3031
236
  //   basic source character set, the program is ill-formed.
3032
236
  if (CodePoint < 0xA0) {
3033
30
    if (CodePoint == 0x24 || 
CodePoint == 0x4024
||
CodePoint == 0x6024
)
3034
6
      return CodePoint;
3035
24
3036
24
    // We don't use isLexingRawMode() here because we need to warn about bad
3037
24
    // UCNs even when skipping preprocessing tokens in a #if block.
3038
24
    if (Result && 
PP23
) {
3039
23
      if (CodePoint < 0x20 || 
CodePoint >= 0x7F15
)
3040
16
        Diag(BufferPtr, diag::err_ucn_control_character);
3041
7
      else {
3042
7
        char C = static_cast<char>(CodePoint);
3043
7
        Diag(BufferPtr, diag::err_ucn_escape_basic_scs) << StringRef(&C, 1);
3044
7
      }
3045
23
    }
3046
24
3047
24
    return 0;
3048
206
  } else if (CodePoint >= 0xD800 && 
CodePoint <= 0xDFFF30
) {
3049
4
    // C++03 allows UCNs representing surrogate characters. C99 and C++11 don't.
3050
4
    // We don't use isLexingRawMode() here because we need to diagnose bad
3051
4
    // UCNs even when skipping preprocessing tokens in a #if block.
3052
4
    if (Result && PP) {
3053
4
      if (LangOpts.CPlusPlus && 
!LangOpts.CPlusPlus112
)
3054
1
        Diag(BufferPtr, diag::warn_ucn_escape_surrogate);
3055
3
      else
3056
3
        Diag(BufferPtr, diag::err_ucn_escape_invalid);
3057
4
    }
3058
4
    return 0;
3059
4
  }
3060
202
3061
202
  return CodePoint;
3062
202
}
3063
3064
bool Lexer::CheckUnicodeWhitespace(Token &Result, uint32_t C,
3065
197
                                   const char *CurPtr) {
3066
197
  static const llvm::sys::UnicodeCharSet UnicodeWhitespaceChars(
3067
197
      UnicodeWhitespaceCharRanges);
3068
197
  if (!isLexingRawMode() && 
!PP->isPreprocessedOutput()172
&&
3069
197
      
UnicodeWhitespaceChars.contains(C)158
) {
3070
6
    Diag(BufferPtr, diag::ext_unicode_whitespace)
3071
6
      << makeCharRange(*this, BufferPtr, CurPtr);
3072
6
3073
6
    Result.setFlag(Token::LeadingSpace);
3074
6
    return true;
3075
6
  }
3076
191
  return false;
3077
191
}
3078
3079
191
bool Lexer::LexUnicode(Token &Result, uint32_t C, const char *CurPtr) {
3080
191
  if (isAllowedIDChar(C, LangOpts) && 
isAllowedInitiallyIDChar(C, LangOpts)125
) {
3081
119
    if (!isLexingRawMode() && 
!ParsingPreprocessorDirective106
&&
3082
119
        
!PP->isPreprocessedOutput()91
) {
3083
82
      maybeDiagnoseIDCharCompat(PP->getDiagnostics(), C,
3084
82
                                makeCharRange(*this, BufferPtr, CurPtr),
3085
82
                                /*IsFirst=*/true);
3086
82
      maybeDiagnoseUTF8Homoglyph(PP->getDiagnostics(), C,
3087
82
                                 makeCharRange(*this, BufferPtr, CurPtr));
3088
82
    }
3089
119
3090
119
    MIOpt.ReadToken();
3091
119
    return LexIdentifier(Result, CurPtr);
3092
119
  }
3093
72
3094
72
  if (!isLexingRawMode() && 
!ParsingPreprocessorDirective60
&&
3095
72
      
!PP->isPreprocessedOutput()45
&&
3096
72
      
!isASCII(*BufferPtr)41
&&
!isAllowedIDChar(C, LangOpts)25
) {
3097
22
    // Non-ASCII characters tend to creep into source code unintentionally.
3098
22
    // Instead of letting the parser complain about the unknown token,
3099
22
    // just drop the character.
3100
22
    // Note that we can /only/ do this when the non-ASCII character is actually
3101
22
    // spelled as Unicode, not written as a UCN. The standard requires that
3102
22
    // we not throw away any possible preprocessor tokens, but there's a
3103
22
    // loophole in the mapping of Unicode characters to basic character set
3104
22
    // characters that allows us to map these particular characters to, say,
3105
22
    // whitespace.
3106
22
    Diag(BufferPtr, diag::err_non_ascii)
3107
22
      << FixItHint::CreateRemoval(makeCharRange(*this, BufferPtr, CurPtr));
3108
22
3109
22
    BufferPtr = CurPtr;
3110
22
    return false;
3111
22
  }
3112
50
3113
50
  // Otherwise, we have an explicit UCN or a character that's unlikely to show
3114
50
  // up by accident.
3115
50
  MIOpt.ReadToken();
3116
50
  FormTokenWithChars(Result, CurPtr, tok::unknown);
3117
50
  return true;
3118
50
}
3119
3120
6.99M
void Lexer::PropagateLineStartLeadingSpaceInfo(Token &Result) {
3121
6.99M
  IsAtStartOfLine = Result.isAtStartOfLine();
3122
6.99M
  HasLeadingSpace = Result.hasLeadingSpace();
3123
6.99M
  HasLeadingEmptyMacro = Result.hasLeadingEmptyMacro();
3124
6.99M
  // Note that this doesn't affect IsAtPhysicalStartOfLine.
3125
6.99M
}
3126
3127
1.42G
bool Lexer::Lex(Token &Result) {
3128
1.42G
  // Start a new token.
3129
1.42G
  Result.startToken();
3130
1.42G
3131
1.42G
  // Set up misc whitespace flags for LexTokenInternal.
3132
1.42G
  if (IsAtStartOfLine) {
3133
64.2M
    Result.setFlag(Token::StartOfLine);
3134
64.2M
    IsAtStartOfLine = false;
3135
64.2M
  }
3136
1.42G
3137
1.42G
  if (HasLeadingSpace) {
3138
416k
    Result.setFlag(Token::LeadingSpace);
3139
416k
    HasLeadingSpace = false;
3140
416k
  }
3141
1.42G
3142
1.42G
  if (HasLeadingEmptyMacro) {
3143
584k
    Result.setFlag(Token::LeadingEmptyMacro);
3144
584k
    HasLeadingEmptyMacro = false;
3145
584k
  }
3146
1.42G
3147
1.42G
  bool atPhysicalStartOfLine = IsAtPhysicalStartOfLine;
3148
1.42G
  IsAtPhysicalStartOfLine = false;
3149
1.42G
  bool isRawLex = isLexingRawMode();
3150
1.42G
  (void) isRawLex;
3151
1.42G
  bool returnedToken = LexTokenInternal(Result, atPhysicalStartOfLine);
3152
1.42G
  // (After the LexTokenInternal call, the lexer might be destroyed.)
3153
1.42G
  assert((returnedToken || !isRawLex) && "Raw lex must succeed");
3154
1.42G
  return returnedToken;
3155
1.42G
}
3156
3157
/// LexTokenInternal - This implements a simple C family lexer.  It is an
3158
/// extremely performance critical piece of code.  This assumes that the buffer
3159
/// has a null character at the end of the file.  This returns a preprocessing
3160
/// token, not a normal token, as such, it is an internal interface.  It assumes
3161
/// that the Flags of result have been cleared before calling this.
3162
1.42G
bool Lexer::LexTokenInternal(Token &Result, bool TokAtPhysicalStartOfLine) {
3163
1.59G
LexNextToken:
3164
1.59G
  // New token, can't need cleaning yet.
3165
1.59G
  Result.clearFlag(Token::NeedsCleaning);
3166
1.59G
  Result.setIdentifierInfo(nullptr);
3167
1.59G
3168
1.59G
  // CurPtr - Cache BufferPtr in an automatic variable.
3169
1.59G
  const char *CurPtr = BufferPtr;
3170
1.59G
3171
1.59G
  // Small amounts of horizontal whitespace is very common between tokens.
3172
1.59G
  if ((*CurPtr == ' ') || 
(*CurPtr == '\t')1.16G
) {
3173
444M
    ++CurPtr;
3174
1.53G
    while ((*CurPtr == ' ') || 
(*CurPtr == '\t')447M
)
3175
1.08G
      ++CurPtr;
3176
444M
3177
444M
    // If we are keeping whitespace and other tokens, just return what we just
3178
444M
    // skipped.  The next lexer invocation will return the token after the
3179
444M
    // whitespace.
3180
444M
    if (isKeepWhitespaceMode()) {
3181
377k
      FormTokenWithChars(Result, CurPtr, tok::unknown);
3182
377k
      // FIXME: The next token will not have LeadingSpace set.
3183
377k
      return true;
3184
377k
    }
3185
443M
3186
443M
    BufferPtr = CurPtr;
3187
443M
    Result.setFlag(Token::LeadingSpace);
3188
443M
  }
3189
1.59G
3190
1.59G
  unsigned SizeTmp, SizeTmp2;   // Temporaries for use in cases below.
3191
1.59G
3192
1.59G
  // Read a character, advancing over it.
3193
1.59G
  char Char = getAndAdvanceChar(CurPtr, Result);
3194
1.59G
  tok::TokenKind Kind;
3195
1.59G
3196
1.59G
  switch (Char) {
3197
1.59G
  case 0:  // Null.
3198
638k
    // Found end of file?
3199
638k
    if (CurPtr-1 == BufferEnd)
3200
637k
      return LexEndOfFile(Result, CurPtr-1);
3201
1.01k
3202
1.01k
    // Check if we are performing code completion.
3203
1.01k
    if (isCodeCompletionPoint(CurPtr-1)) {
3204
1.01k
      // Return the code-completion token.
3205
1.01k
      Result.startToken();
3206
1.01k
      FormTokenWithChars(Result, CurPtr, tok::code_completion);
3207
1.01k
      return true;
3208
1.01k
    }
3209
3
3210
3
    if (!isLexingRawMode())
3211
2
      Diag(CurPtr-1, diag::null_in_file);
3212
3
    Result.setFlag(Token::LeadingSpace);
3213
3
    if (SkipWhitespace(Result, CurPtr, TokAtPhysicalStartOfLine))
3214
0
      return true; // KeepWhitespaceMode
3215
3
3216
3
    // We know the lexer hasn't changed, so just try again with this lexer.
3217
3
    // (We manually eliminate the tail call to avoid recursion.)
3218
3
    goto LexNextToken;
3219
3
3220
3
  case 26:  // DOS & CP/M EOF: "^Z".
3221
1
    // If we're in Microsoft extensions mode, treat this as end of file.
3222
1
    if (LangOpts.MicrosoftExt) {
3223
1
      if (!isLexingRawMode())
3224
1
        Diag(CurPtr-1, diag::ext_ctrl_z_eof_microsoft);
3225
1
      return LexEndOfFile(Result, CurPtr-1);
3226
1
    }
3227
0
3228
0
    // If Microsoft extensions are disabled, this is just random garbage.
3229
0
    Kind = tok::unknown;
3230
0
    break;
3231
0
3232
99.7k
  case '\r':
3233
99.7k
    if (CurPtr[0] == '\n')
3234
99.7k
      (void)getAndAdvanceChar(CurPtr, Result);
3235
99.7k
    LLVM_FALLTHROUGH;
3236
214M
  case '\n':
3237
214M
    // If we are inside a preprocessor directive and we see the end of line,
3238
214M
    // we know we are done with the directive, so return an EOD token.
3239
214M
    if (ParsingPreprocessorDirective) {
3240
50.2M
      // Done parsing the "line".
3241
50.2M
      ParsingPreprocessorDirective = false;
3242
50.2M
3243
50.2M
      // Restore comment saving mode, in case it was disabled for directive.
3244
50.2M
      if (PP)
3245
50.2M
        resetExtendedTokenMode();
3246
50.2M
3247
50.2M
      // Since we consumed a newline, we are back at the start of a line.
3248
50.2M
      IsAtStartOfLine = true;
3249
50.2M
      IsAtPhysicalStartOfLine = true;
3250
50.2M
3251
50.2M
      Kind = tok::eod;
3252
50.2M
      break;
3253
50.2M
    }
3254
164M
3255
164M
    // No leading whitespace seen so far.
3256
164M
    Result.clearFlag(Token::LeadingSpace);
3257
164M
3258
164M
    if (SkipWhitespace(Result, CurPtr, TokAtPhysicalStartOfLine))
3259
195k
      return true; // KeepWhitespaceMode
3260
164M
3261
164M
    // We only saw whitespace, so just try again with this lexer.
3262
164M
    // (We manually eliminate the tail call to avoid recursion.)
3263
164M
    goto LexNextToken;
3264
164M
  case ' ':
3265
3.77M
  case '\t':
3266
3.77M
  case '\f':
3267
3.77M
  case '\v':
3268
5.82M
  SkipHorizontalWhitespace:
3269
5.82M
    Result.setFlag(Token::LeadingSpace);
3270
5.82M
    if (SkipWhitespace(Result, CurPtr, TokAtPhysicalStartOfLine))
3271
816
      return true; // KeepWhitespaceMode
3272
18.0M
3273
18.0M
  SkipIgnoredUnits:
3274
18.0M
    CurPtr = BufferPtr;
3275
18.0M
3276
18.0M
    // If the next token is obviously a // or /* */ comment, skip it efficiently
3277
18.0M
    // too (without going through the big switch stmt).
3278
18.0M
    if (CurPtr[0] == '/' && 
CurPtr[1] == '/'8.98M
&&
!inKeepCommentMode()8.98M
&&
3279
18.0M
        
LangOpts.LineComment8.98M
&&
3280
18.0M
        
(8.98M
LangOpts.CPlusPlus8.98M
||
!LangOpts.TraditionalCPP4.52M
)) {
3281
8.98M
      if (SkipLineComment(Result, CurPtr+2, TokAtPhysicalStartOfLine))
3282
0
        return true; // There is a token to return.
3283
8.98M
      goto SkipIgnoredUnits;
3284
9.04M
    } else if (CurPtr[0] == '/' && 
CurPtr[1] == '*'3.67k
&&
!inKeepCommentMode()1.79k
) {
3285
1.79k
      if (SkipBlockComment(Result, CurPtr+2, TokAtPhysicalStartOfLine))
3286
0
        return true; // There is a token to return.
3287
1.79k
      goto SkipIgnoredUnits;
3288
9.04M
    } else if (isHorizontalWhitespace(*CurPtr)) {
3289
2.05M
      goto SkipHorizontalWhitespace;
3290
2.05M
    }
3291
6.99M
    // We only saw whitespace, so just try again with this lexer.
3292
6.99M
    // (We manually eliminate the tail call to avoid recursion.)
3293
6.99M
    goto LexNextToken;
3294
6.99M
3295
6.99M
  // C99 6.4.4.1: Integer Constants.
3296
6.99M
  // C99 6.4.4.2: Floating Constants.
3297
46.2M
  case '0': case '1': case '2': case '3': case '4':
3298
46.2M
  case '5': case '6': case '7': case '8': case '9':
3299
46.2M
    // Notify MIOpt that we read a non-whitespace/non-comment token.
3300
46.2M
    MIOpt.ReadToken();
3301
46.2M
    return LexNumericConstant(Result, CurPtr);
3302
46.2M
3303
46.2M
  case 'u':   // Identifier (uber) or C11/C++11 UTF-8 or UTF-16 string literal
3304
10.5M
    // Notify MIOpt that we read a non-whitespace/non-comment token.
3305
10.5M
    MIOpt.ReadToken();
3306
10.5M
3307
10.5M
    if (LangOpts.CPlusPlus11 || 
LangOpts.C113.90M
) {
3308
10.0M
      Char = getCharAndSize(CurPtr, SizeTmp);
3309
10.0M
3310
10.0M
      // UTF-16 string literal
3311
10.0M
      if (Char == '"')
3312
88
        return LexStringLiteral(Result, ConsumeChar(CurPtr, SizeTmp, Result),
3313
88
                                tok::utf16_string_literal);
3314
10.0M
3315
10.0M
      // UTF-16 character constant
3316
10.0M
      if (Char == '\'')
3317
95
        return LexCharConstant(Result, ConsumeChar(CurPtr, SizeTmp, Result),
3318
95
                               tok::utf16_char_constant);
3319
10.0M
3320
10.0M
      // UTF-16 raw string literal
3321
10.0M
      if (Char == 'R' && 
LangOpts.CPlusPlus1129
&&
3322
10.0M
          
getCharAndSize(CurPtr + SizeTmp, SizeTmp2) == '"'21
)
3323
19
        return LexRawStringLiteral(Result,
3324
19
                               ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
3325
19
                                           SizeTmp2, Result),
3326
19
                               tok::utf16_string_literal);
3327
10.0M
3328
10.0M
      if (Char == '8') {
3329
44.9k
        char Char2 = getCharAndSize(CurPtr + SizeTmp, SizeTmp2);
3330
44.9k
3331
44.9k
        // UTF-8 string literal
3332
44.9k
        if (Char2 == '"')
3333
113
          return LexStringLiteral(Result,
3334
113
                               ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
3335
113
                                           SizeTmp2, Result),
3336
113
                               tok::utf8_string_literal);
3337
44.8k
        if (Char2 == '\'' && 
LangOpts.CPlusPlus1743
)
3338
31
          return LexCharConstant(
3339
31
              Result, ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
3340
31
                                  SizeTmp2, Result),
3341
31
              tok::utf8_char_constant);
3342
44.8k
3343
44.8k
        if (Char2 == 'R' && 
LangOpts.CPlusPlus1133
) {
3344
25
          unsigned SizeTmp3;
3345
25
          char Char3 = getCharAndSize(CurPtr + SizeTmp + SizeTmp2, SizeTmp3);
3346
25
          // UTF-8 raw string literal
3347
25
          if (Char3 == '"') {
3348
23
            return LexRawStringLiteral(Result,
3349
23
                   ConsumeChar(ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
3350
23
                                           SizeTmp2, Result),
3351
23
                               SizeTmp3, Result),
3352
23
                   tok::utf8_string_literal);
3353
23
          }
3354
10.5M
        }
3355
44.8k
      }
3356
10.0M
    }
3357
10.5M
3358
10.5M
    // treat u like the start of an identifier.
3359
10.5M
    return LexIdentifier(Result, CurPtr);
3360
10.5M
3361
10.5M
  case 'U':   // Identifier (Uber) or C11/C++11 UTF-32 string literal
3362
1.26M
    // Notify MIOpt that we read a non-whitespace/non-comment token.
3363
1.26M
    MIOpt.ReadToken();
3364
1.26M
3365
1.26M
    if (LangOpts.CPlusPlus11 || 
LangOpts.C11877k
) {
3366
1.19M
      Char = getCharAndSize(CurPtr, SizeTmp);
3367
1.19M
3368
1.19M
      // UTF-32 string literal
3369
1.19M
      if (Char == '"')
3370
87
        return LexStringLiteral(Result, ConsumeChar(CurPtr, SizeTmp, Result),
3371
87
                                tok::utf32_string_literal);
3372
1.19M
3373
1.19M
      // UTF-32 character constant
3374
1.19M
      if (Char == '\'')
3375
86
        return LexCharConstant(Result, ConsumeChar(CurPtr, SizeTmp, Result),
3376
86
                               tok::utf32_char_constant);
3377
1.19M
3378
1.19M
      // UTF-32 raw string literal
3379
1.19M
      if (Char == 'R' && 
LangOpts.CPlusPlus111.90k
&&
3380
1.19M
          
getCharAndSize(CurPtr + SizeTmp, SizeTmp2) == '"'470
)
3381
21
        return LexRawStringLiteral(Result,
3382
21
                               ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
3383
21
                                           SizeTmp2, Result),
3384
21
                               tok::utf32_string_literal);
3385
1.26M
    }
3386
1.26M
3387
1.26M
    // treat U like the start of an identifier.
3388
1.26M
    return LexIdentifier(Result, CurPtr);
3389
1.26M
3390
1.26M
  case 'R': // Identifier or C++0x raw string literal
3391
662k
    // Notify MIOpt that we read a non-whitespace/non-comment token.
3392
662k
    MIOpt.ReadToken();
3393
662k
3394
662k
    if (LangOpts.CPlusPlus11) {
3395
402k
      Char = getCharAndSize(CurPtr, SizeTmp);
3396
402k
3397
402k
      if (Char == '"')
3398
388
        return LexRawStringLiteral(Result,
3399
388
                                   ConsumeChar(CurPtr, SizeTmp, Result),
3400
388
                                   tok::string_literal);
3401
662k
    }
3402
662k
3403
662k
    // treat R like the start of an identifier.
3404
662k
    return LexIdentifier(Result, CurPtr);
3405
662k
3406
662k
  case 'L':   // Identifier (Loony) or wide literal (L'x' or L"xyz").
3407
594k
    // Notify MIOpt that we read a non-whitespace/non-comment token.
3408
594k
    MIOpt.ReadToken();
3409
594k
    Char = getCharAndSize(CurPtr, SizeTmp);
3410
594k
3411
594k
    // Wide string literal.
3412
594k
    if (Char == '"')
3413
2.66k
      return LexStringLiteral(Result, ConsumeChar(CurPtr, SizeTmp, Result),
3414
2.66k
                              tok::wide_string_literal);
3415
591k
3416
591k
    // Wide raw string literal.
3417
591k
    if (LangOpts.CPlusPlus11 && 
Char == 'R'256k
&&
3418
591k
        
getCharAndSize(CurPtr + SizeTmp, SizeTmp2) == '"'329
)
3419
17
      return LexRawStringLiteral(Result,
3420
17
                               ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
3421
17
                                           SizeTmp2, Result),
3422
17
                               tok::wide_string_literal);
3423
591k
3424
591k
    // Wide character constant.
3425
591k
    if (Char == '\'')
3426
1.62k
      return LexCharConstant(Result, ConsumeChar(CurPtr, SizeTmp, Result),
3427
1.62k
                             tok::wide_char_constant);
3428
590k
    // FALL THROUGH, treating L like the start of an identifier.
3429
590k
    LLVM_FALLTHROUGH;
3430
590k
3431
590k
  // C99 6.4.2: Identifiers.
3432
702M
  case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': case 'G':
3433
702M
  case 'H': case 'I': case 'J': case 'K':    /*'L'*/case 'M': case 'N':
3434
702M
  case 'O': case 'P': case 'Q':    /*'R'*/case 'S': case 'T':    /*'U'*/
3435
702M
  case 'V': case 'W': case 'X': case 'Y': case 'Z':
3436
702M
  case 'a': case 'b': case 'c': case 'd': case 'e': case 'f': case 'g':
3437
702M
  case 'h': case 'i': case 'j': case 'k': case 'l': case 'm': case 'n':
3438
702M
  case 'o': case 'p': case 'q': case 'r': case 's': case 't':    /*'u'*/
3439
702M
  case 'v': case 'w': case 'x': case 'y': case 'z':
3440
702M
  case '_':
3441
702M
    // Notify MIOpt that we read a non-whitespace/non-comment token.
3442
702M
    MIOpt.ReadToken();
3443
702M
    return LexIdentifier(Result, CurPtr);
3444
702M
3445
702M
  case '$':   // $ in identifiers.
3446
317
    if (LangOpts.DollarIdents) {
3447
20
      if (!isLexingRawMode())
3448
6
        Diag(CurPtr-1, diag::ext_dollar_in_identifier);
3449
20
      // Notify MIOpt that we read a non-whitespace/non-comment token.
3450
20
      MIOpt.ReadToken();
3451
20
      return LexIdentifier(Result, CurPtr);
3452
20
    }
3453
297
3454
297
    Kind = tok::unknown;
3455
297
    break;
3456
297
3457
297
  // C99 6.4.4: Character Constants.
3458
117k
  case '\'':
3459
117k
    // Notify MIOpt that we read a non-whitespace/non-comment token.
3460
117k
    MIOpt.ReadToken();
3461
117k
    return LexCharConstant(Result, CurPtr, tok::char_constant);
3462
297
3463
297
  // C99 6.4.5: String Literals.
3464
5.69M
  case '"':
3465
5.69M
    // Notify MIOpt that we read a non-whitespace/non-comment token.
3466
5.69M
    MIOpt.ReadToken();
3467
5.69M
    return LexStringLiteral(Result, CurPtr,
3468
5.69M
                            ParsingFilename ? 
tok::header_name181k
3469
5.69M
                                            : 
tok::string_literal5.51M
);
3470
297
3471
297
  // C99 6.4.6: Punctuators.
3472
203k
  case '?':
3473
203k
    Kind = tok::question;
3474
203k
    break;
3475
2.24M
  case '[':
3476
2.24M
    Kind = tok::l_square;
3477
2.24M
    break;
3478
2.24M
  case ']':
3479
2.24M
    Kind = tok::r_square;
3480
2.24M
    break;
3481
132M
  case '(':
3482
132M
    Kind = tok::l_paren;
3483
132M
    break;
3484
137M
  case ')':
3485
137M
    Kind = tok::r_paren;
3486
137M
    break;
3487
7.51M
  case '{':
3488
7.51M
    Kind = tok::l_brace;
3489
7.51M
    break;
3490
7.52M
  case '}':
3491
7.52M
    Kind = tok::r_brace;
3492
7.52M
    break;
3493
3.50M
  case '.':
3494
3.50M
    Char = getCharAndSize(CurPtr, SizeTmp);
3495
3.50M
    if (Char >= '0' && 
Char <= '9'2.71M
) {
3496
1.22k
      // Notify MIOpt that we read a non-whitespace/non-comment token.
3497
1.22k
      MIOpt.ReadToken();
3498
1.22k
3499
1.22k
      return LexNumericConstant(Result, ConsumeChar(CurPtr, SizeTmp, Result));
3500
3.50M
    } else if (LangOpts.CPlusPlus && 
Char == '*'2.30M
) {
3501
10.8k
      Kind = tok::periodstar;
3502
10.8k
      CurPtr += SizeTmp;
3503
3.49M
    } else if (Char == '.' &&
3504
3.49M
               
getCharAndSize(CurPtr+SizeTmp, SizeTmp2) == '.'755k
) {
3505
755k
      Kind = tok::ellipsis;
3506
755k
      CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
3507
755k
                           SizeTmp2, Result);
3508
2.73M
    } else {
3509
2.73M
      Kind = tok::period;
3510
2.73M
    }
3511
3.50M
    
break3.50M
;
3512
3.74M
  case '&':
3513
3.74M
    Char = getCharAndSize(CurPtr, SizeTmp);
3514
3.74M
    if (Char == '&') {
3515
1.50M
      Kind = tok::ampamp;
3516
1.50M
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
3517
2.23M
    } else if (Char == '=') {
3518
24.8k
      Kind = tok::ampequal;
3519
24.8k
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
3520
2.20M
    } else {
3521
2.20M
      Kind = tok::amp;
3522
2.20M
    }
3523
3.74M
    break;
3524
6.66M
  case '*':
3525
6.66M
    if (getCharAndSize(CurPtr, SizeTmp) == '=') {
3526
9.21k
      Kind = tok::starequal;
3527
9.21k
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
3528
6.65M
    } else {
3529
6.65M
      Kind = tok::star;
3530
6.65M
    }
3531
6.66M
    break;
3532
3.50M
  case '+':
3533
1.35M
    Char = getCharAndSize(CurPtr, SizeTmp);
3534
1.35M
    if (Char == '+') {
3535
475k
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
3536
475k
      Kind = tok::plusplus;
3537
882k
    } else if (Char == '=') {
3538
145k
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
3539
145k
      Kind = tok::plusequal;
3540
737k
    } else {
3541
737k
      Kind = tok::plus;
3542
737k
    }
3543
1.35M
    break;
3544
3.50M
  case '-':
3545
2.82M
    Char = getCharAndSize(CurPtr, SizeTmp);
3546
2.82M
    if (Char == '-') {      // --
3547
96.0k
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
3548
96.0k
      Kind = tok::minusminus;
3549
2.72M
    } else if (Char == '>' && 
LangOpts.CPlusPlus1.17M
&&
3550
2.72M
               
getCharAndSize(CurPtr+SizeTmp, SizeTmp2) == '*'900k
) { // C++ ->*
3551
1.51k
      CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
3552
1.51k
                           SizeTmp2, Result);
3553
1.51k
      Kind = tok::arrowstar;
3554
2.72M
    } else if (Char == '>') {   // ->
3555
1.17M
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
3556
1.17M
      Kind = tok::arrow;
3557
1.55M
    } else if (Char == '=') {   // -=
3558
47.5k
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
3559
47.5k
      Kind = tok::minusequal;
3560
1.50M
    } else {
3561
1.50M
      Kind = tok::minus;
3562
1.50M
    }
3563
2.82M
    break;
3564
3.50M
  case '~':
3565
186k
    Kind = tok::tilde;
3566
186k
    break;
3567
3.50M
  case '!':
3568
1.50M
    if (getCharAndSize(CurPtr, SizeTmp) == '=') {
3569
288k
      Kind = tok::exclaimequal;
3570
288k
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
3571
1.21M
    } else {
3572
1.21M
      Kind = tok::exclaim;
3573
1.21M
    }
3574
1.50M
    break;
3575
9.52M
  case '/':
3576
9.52M
    // 6.4.9: Comments
3577
9.52M
    Char = getCharAndSize(CurPtr, SizeTmp);
3578
9.52M
    if (Char == '/') {         // Line comment.
3579
3.28M
      // Even if Line comments are disabled (e.g. in C89 mode), we generally
3580
3.28M
      // want to lex this as a comment.  There is one problem with this though,
3581
3.28M
      // that in one particular corner case, this can change the behavior of the
3582
3.28M
      // resultant program.  For example, In  "foo //**/ bar", C89 would lex
3583
3.28M
      // this as "foo / bar" and languages with Line comments would lex it as
3584
3.28M
      // "foo".  Check to see if the character after the second slash is a '*'.
3585
3.28M
      // If so, we will lex that as a "/" instead of the start of a comment.
3586
3.28M
      // However, we never do this if we are just preprocessing.
3587
3.28M
      bool TreatAsComment = LangOpts.LineComment &&
3588
3.28M
                            
(3.27M
LangOpts.CPlusPlus3.27M
||
!LangOpts.TraditionalCPP732k
);
3589
3.28M
      if (!TreatAsComment)
3590
2.39k
        if (!(PP && 
PP->isPreprocessedOutput()2.14k
))
3591
2.31k
          TreatAsComment = getCharAndSize(CurPtr+SizeTmp, SizeTmp2) != '*';
3592
3.28M
3593
3.28M
      if (TreatAsComment) {
3594
3.28M
        if (SkipLineComment(Result, ConsumeChar(CurPtr, SizeTmp, Result),
3595
3.28M
                            TokAtPhysicalStartOfLine))
3596
66.6k
          return true; // There is a token to return.
3597
3.21M
3598
3.21M
        // It is common for the tokens immediately after a // comment to be
3599
3.21M
        // whitespace (indentation for the next line).  Instead of going through
3600
3.21M
        // the big switch, handle it efficiently now.
3601
3.21M
        goto SkipIgnoredUnits;
3602
3.21M
      }
3603
3.28M
    }
3604
6.24M
3605
6.24M
    if (Char == '*') {  // /**/ comment.
3606
5.96M
      if (SkipBlockComment(Result, ConsumeChar(CurPtr, SizeTmp, Result),
3607
5.96M
                           TokAtPhysicalStartOfLine))
3608
2.05k
        return true; // There is a token to return.
3609
5.96M
3610
5.96M
      // We only saw whitespace, so just try again with this lexer.
3611
5.96M
      // (We manually eliminate the tail call to avoid recursion.)
3612
5.96M
      goto LexNextToken;
3613
5.96M
    }
3614
272k
3615
272k
    if (Char == '=') {
3616
4.95k
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
3617
4.95k
      Kind = tok::slashequal;
3618
267k
    } else {
3619
267k
      Kind = tok::slash;
3620
267k
    }
3621
272k
    break;
3622
272k
  case '%':
3623
58.7k
    Char = getCharAndSize(CurPtr, SizeTmp);
3624
58.7k
    if (Char == '=') {
3625
1.28k
      Kind = tok::percentequal;
3626
1.28k
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
3627
57.4k
    } else if (LangOpts.Digraphs && 
Char == '>'57.1k
) {
3628
10
      Kind = tok::r_brace;                             // '%>' -> '}'
3629
10
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
3630
57.4k
    } else if (LangOpts.Digraphs && 
Char == ':'57.1k
) {
3631
15
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
3632
15
      Char = getCharAndSize(CurPtr, SizeTmp);
3633
15
      if (Char == '%' && 
getCharAndSize(CurPtr+SizeTmp, SizeTmp2) == ':'0
) {
3634
0
        Kind = tok::hashhash;                          // '%:%:' -> '##'
3635
0
        CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
3636
0
                             SizeTmp2, Result);
3637
15
      } else if (Char == '@' && 
LangOpts.MicrosoftExt0
) {// %:@ -> #@ -> Charize
3638
0
        CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
3639
0
        if (!isLexingRawMode())
3640
0
          Diag(BufferPtr, diag::ext_charize_microsoft);
3641
0
        Kind = tok::hashat;
3642
15
      } else {                                         // '%:' -> '#'
3643
15
        // We parsed a # character.  If this occurs at the start of the line,
3644
15
        // it's actually the start of a preprocessing directive.  Callback to
3645
15
        // the preprocessor to handle it.
3646
15
        // TODO: -fpreprocessed mode??
3647
15
        if (TokAtPhysicalStartOfLine && !LexingRawMode && 
!Is_PragmaLexer12
)
3648
12
          goto HandleDirective;
3649
3
3650
3
        Kind = tok::hash;
3651
3
      }
3652
57.3k
    } else {
3653
57.3k
      Kind = tok::percent;
3654
57.3k
    }
3655
58.7k
    
break58.6k
;
3656
10.0M
  case '<':
3657
10.0M
    Char = getCharAndSize(CurPtr, SizeTmp);
3658
10.0M
    if (ParsingFilename) {
3659
841k
      return LexAngledStringLiteral(Result, CurPtr);
3660
9.18M
    } else if (Char == '<') {
3661
292k
      char After = getCharAndSize(CurPtr+SizeTmp, SizeTmp2);
3662
292k
      if (After == '=') {
3663
4.57k
        Kind = tok::lesslessequal;
3664
4.57k
        CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
3665
4.57k
                             SizeTmp2, Result);
3666
287k
      } else if (After == '<' && 
IsStartOfConflictMarker(CurPtr-1)155
) {
3667
2
        // If this is actually a '<<<<<<<' version control conflict marker,
3668
2
        // recognize it as such and recover nicely.
3669
2
        goto LexNextToken;
3670
287k
      } else if (After == '<' && 
HandleEndOfConflictMarker(CurPtr-1)153
) {
3671
0
        // If this is '<<<<' and we're in a Perforce-style conflict marker,
3672
0
        // ignore it.
3673
0
        goto LexNextToken;
3674
287k
      } else if (LangOpts.CUDA && 
After == '<'57
) {
3675
57
        Kind = tok::lesslessless;
3676
57
        CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
3677
57
                             SizeTmp2, Result);
3678
287k
      } else {
3679
287k
        CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
3680
287k
        Kind = tok::lessless;
3681
287k
      }
3682
8.88M
    } else if (Char == '=') {
3683
118k
      char After = getCharAndSize(CurPtr+SizeTmp, SizeTmp2);
3684
118k
      if (After == '>') {
3685
408
        if (getLangOpts().CPlusPlus2a) {
3686
392
          if (!isLexingRawMode())
3687
369
            Diag(BufferPtr, diag::warn_cxx17_compat_spaceship);
3688
392
          CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
3689
392
                               SizeTmp2, Result);
3690
392
          Kind = tok::spaceship;
3691
392
          break;
3692
392
        }
3693
16
        // Suggest adding a space between the '<=' and the '>' to avoid a
3694
16
        // change in semantics if this turns up in C++ <=17 mode.
3695
16
        if (getLangOpts().CPlusPlus && !isLexingRawMode()) {
3696
6
          Diag(BufferPtr, diag::warn_cxx2a_compat_spaceship)
3697
6
            << FixItHint::CreateInsertion(
3698
6
                   getSourceLocation(CurPtr + SizeTmp, SizeTmp2), " ");
3699
6
        }
3700
16
      }
3701
118k
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
3702
118k
      Kind = tok::lessequal;
3703
8.76M
    } else if (LangOpts.Digraphs && 
Char == ':'8.75M
) { // '<:' -> '['
3704
81
      if (LangOpts.CPlusPlus11 &&
3705
81
          
getCharAndSize(CurPtr + SizeTmp, SizeTmp2) == ':'50
) {
3706
35
        // C++0x [lex.pptoken]p3:
3707
35
        //  Otherwise, if the next three characters are <:: and the subsequent
3708
35
        //  character is neither : nor >, the < is treated as a preprocessor
3709
35
        //  token by itself and not as the first character of the alternative
3710
35
        //  token <:.
3711
35
        unsigned SizeTmp3;
3712
35
        char After = getCharAndSize(CurPtr + SizeTmp + SizeTmp2, SizeTmp3);
3713
35
        if (After != ':' && 
After != '>'34
) {
3714
33
          Kind = tok::less;
3715
33
          if (!isLexingRawMode())
3716
30
            Diag(BufferPtr, diag::warn_cxx98_compat_less_colon_colon);
3717
33
          break;
3718
33
        }
3719
48
      }
3720
48
3721
48
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
3722
48
      Kind = tok::l_square;
3723
8.76M
    } else if (LangOpts.Digraphs && 
Char == '%'8.75M
) { // '<%' -> '{'
3724
9
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
3725
9
      Kind = tok::l_brace;
3726
8.76M
    } else if (Char == '#' && /*Not a trigraph*/ 
SizeTmp == 147
&&
3727
8.76M
               
lexEditorPlaceholder(Result, CurPtr)45
) {
3728
43
      return true;
3729
8.76M
    } else {
3730
8.76M
      Kind = tok::less;
3731
8.76M
    }
3732
10.0M
    
break9.17M
;
3733
10.0M
  case '>':
3734
7.39M
    Char = getCharAndSize(CurPtr, SizeTmp);
3735
7.39M
    if (Char == '=') {
3736
776k
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
3737
776k
      Kind = tok::greaterequal;
3738
6.61M
    } else if (Char == '>') {
3739
235k
      char After = getCharAndSize(CurPtr+SizeTmp, SizeTmp2);
3740
235k
      if (After == '=') {
3741
3.56k
        CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
3742
3.56k
                             SizeTmp2, Result);
3743
3.56k
        Kind = tok::greatergreaterequal;
3744
231k
      } else if (After == '>' && 
IsStartOfConflictMarker(CurPtr-1)2.58k
) {
3745
2
        // If this is actually a '>>>>' conflict marker, recognize it as such
3746
2
        // and recover nicely.
3747
2
        goto LexNextToken;
3748
231k
      } else if (After == '>' && 
HandleEndOfConflictMarker(CurPtr-1)2.58k
) {
3749
0
        // If this is '>>>>>>>' and we're in a conflict marker, ignore it.
3750
0
        goto LexNextToken;
3751
231k
      } else if (LangOpts.CUDA && 
After == '>'73
) {
3752
71
        Kind = tok::greatergreatergreater;
3753
71
        CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
3754
71
                             SizeTmp2, Result);
3755
231k
      } else {
3756
231k
        CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
3757
231k
        Kind = tok::greatergreater;
3758
231k
      }
3759
6.37M
    } else {
3760
6.37M
      Kind = tok::greater;
3761
6.37M
    }
3762
7.39M
    
break7.39M
;
3763
7.39M
  case '^':
3764
64.3k
    Char = getCharAndSize(CurPtr, SizeTmp);
3765
64.3k
    if (Char == '=') {
3766
9.70k
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
3767
9.70k
      Kind = tok::caretequal;
3768
54.6k
    } else if (LangOpts.OpenCL && 
Char == '^'341
) {
3769
1
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
3770
1
      Kind = tok::caretcaret;
3771
54.6k
    } else {
3772
54.6k
      Kind = tok::caret;
3773
54.6k
    }
3774
64.3k
    break;
3775
7.39M
  case '|':
3776
1.18M
    Char = getCharAndSize(CurPtr, SizeTmp);
3777
1.18M
    if (Char == '=') {
3778
44.2k
      Kind = tok::pipeequal;
3779
44.2k
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
3780
1.14M
    } else if (Char == '|') {
3781
943k
      // If this is '|||||||' and we're in a conflict marker, ignore it.
3782
943k
      if (CurPtr[1] == '|' && 
HandleEndOfConflictMarker(CurPtr-1)19
)
3783
1
        goto LexNextToken;
3784
943k
      Kind = tok::pipepipe;
3785
943k
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
3786
943k
    } else {
3787
198k
      Kind = tok::pipe;
3788
198k
    }
3789
1.18M
    
break1.18M
;
3790
5.51M
  case ':':
3791
5.51M
    Char = getCharAndSize(CurPtr, SizeTmp);
3792
5.51M
    if (LangOpts.Digraphs && 
Char == '>'5.47M
) {
3793
21
      Kind = tok::r_square; // ':>' -> ']'
3794
21
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
3795
5.51M
    } else if ((LangOpts.CPlusPlus ||
3796
5.51M
                
LangOpts.DoubleSquareBracketAttributes363k
) &&
3797
5.51M
               
Char == ':'5.15M
) {
3798
3.89M
      Kind = tok::coloncolon;
3799
3.89M
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
3800
3.89M
    } else {
3801
1.62M
      Kind = tok::colon;
3802
1.62M
    }
3803
5.51M
    break;
3804
27.2M
  case ';':
3805
27.2M
    Kind = tok::semi;
3806
27.2M
    break;
3807
28.5M
  case '=':
3808
28.5M
    Char = getCharAndSize(CurPtr, SizeTmp);
3809
28.5M
    if (Char == '=') {
3810
699k
      // If this is '====' and we're in a conflict marker, ignore it.
3811
699k
      if (CurPtr[1] == '=' && 
HandleEndOfConflictMarker(CurPtr-1)38
)
3812
2
        goto LexNextToken;
3813
699k
3814
699k
      Kind = tok::equalequal;
3815
699k
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
3816
27.8M
    } else {
3817
27.8M
      Kind = tok::equal;
3818
27.8M
    }
3819
28.5M
    
break28.5M
;
3820
52.8M
  case ',':
3821
52.8M
    Kind = tok::comma;
3822
52.8M
    break;
3823
160M
  case '#':
3824
160M
    Char = getCharAndSize(CurPtr, SizeTmp);
3825
160M
    if (Char == '#') {
3826
236k
      Kind = tok::hashhash;
3827
236k
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
3828
159M
    } else if (Char == '@' && 
LangOpts.MicrosoftExt3
) { // #@ -> Charize
3829
3
      Kind = tok::hashat;
3830
3
      if (!isLexingRawMode())
3831
3
        Diag(BufferPtr, diag::ext_charize_microsoft);
3832
3
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
3833
159M
    } else {
3834
159M
      // We parsed a # character.  If this occurs at the start of the line,
3835
159M
      // it's actually the start of a preprocessing directive.  Callback to
3836
159M
      // the preprocessor to handle it.
3837
159M
      // TODO: -fpreprocessed mode??
3838
159M
      if (TokAtPhysicalStartOfLine && 
!LexingRawMode159M
&&
!Is_PragmaLexer37.4M
)
3839
37.4M
        goto HandleDirective;
3840
122M
3841
122M
      Kind = tok::hash;
3842
122M
    }
3843
160M
    
break122M
;
3844
160M
3845
160M
  case '@':
3846
91.1k
    // Objective C support.
3847
91.1k
    if (CurPtr[-1] == '@' && LangOpts.ObjC)
3848
88.6k
      Kind = tok::at;
3849
2.52k
    else
3850
2.52k
      Kind = tok::unknown;
3851
91.1k
    break;
3852
160M
3853
160M
  // UCNs (C99 6.4.3, C++11 [lex.charset]p2)
3854
160M
  case '\\':
3855
729
    if (!LangOpts.AsmPreprocessor) {
3856
611
      if (uint32_t CodePoint = tryReadUCN(CurPtr, BufferPtr, &Result)) {
3857
96
        if (CheckUnicodeWhitespace(Result, CodePoint, CurPtr)) {
3858
0
          if (SkipWhitespace(Result, CurPtr, TokAtPhysicalStartOfLine))
3859
0
            return true; // KeepWhitespaceMode
3860
0
3861
0
          // We only saw whitespace, so just try again with this lexer.
3862
0
          // (We manually eliminate the tail call to avoid recursion.)
3863
0
          goto LexNextToken;
3864
0
        }
3865
96
3866
96
        return LexUnicode(Result, CodePoint, CurPtr);
3867
96
      }
3868
611
    }
3869
633
3870
633
    Kind = tok::unknown;
3871
633
    break;
3872
633
3873
633
  default: {
3874
332
    if (isASCII(Char)) {
3875
165
      Kind = tok::unknown;
3876
165
      break;
3877
165
    }
3878
167
3879
167
    llvm::UTF32 CodePoint;
3880
167
3881
167
    // We can't just reset CurPtr to BufferPtr because BufferPtr may point to
3882
167
    // an escaped newline.
3883
167
    --CurPtr;
3884
167
    llvm::ConversionResult Status =
3885
167
        llvm::convertUTF8Sequence((const llvm::UTF8 **)&CurPtr,
3886
167
                                  (const llvm::UTF8 *)BufferEnd,
3887
167
                                  &CodePoint,
3888
167
                                  llvm::strictConversion);
3889
167
    if (Status == llvm::conversionOK) {
3890
101
      if (CheckUnicodeWhitespace(Result, CodePoint, CurPtr)) {
3891
6
        if (SkipWhitespace(Result, CurPtr, TokAtPhysicalStartOfLine))
3892
0
          return true; // KeepWhitespaceMode
3893
6
3894
6
        // We only saw whitespace, so just try again with this lexer.
3895
6
        // (We manually eliminate the tail call to avoid recursion.)
3896
6
        goto LexNextToken;
3897
6
      }
3898
95
      return LexUnicode(Result, CodePoint, CurPtr);
3899
95
    }
3900
66
3901
66
    if (isLexingRawMode() || 
ParsingPreprocessorDirective4
||
3902
66
        
PP->isPreprocessedOutput()2
) {
3903
65
      ++CurPtr;
3904
65
      Kind = tok::unknown;
3905
65
      break;
3906
65
    }
3907
1
3908
1
    // Non-ASCII characters tend to creep into source code unintentionally.
3909
1
    // Instead of letting the parser complain about the unknown token,
3910
1
    // just diagnose the invalid UTF-8, then drop the character.
3911
1
    Diag(CurPtr, diag::err_invalid_utf8);
3912
1
3913
1
    BufferPtr = CurPtr+1;
3914
1
    // We're pretending the character didn't exist, so just try again with
3915
1
    // this lexer.
3916
1
    // (We manually eliminate the tail call to avoid recursion.)
3917
1
    goto LexNextToken;
3918
1
  }
3919
615M
  }
3920
615M
3921
615M
  // Notify MIOpt that we read a non-whitespace/non-comment token.
3922
615M
  MIOpt.ReadToken();
3923
615M
3924
615M
  // Update the location of token as well as BufferPtr.
3925
615M
  FormTokenWithChars(Result, CurPtr, Kind);
3926
615M
  return true;
3927
37.4M
3928
37.4M
HandleDirective:
3929
37.4M
  // We parsed a # character and it's the start of a preprocessing directive.
3930
37.4M
3931
37.4M
  FormTokenWithChars(Result, CurPtr, tok::hash);
3932
37.4M
  PP->HandleDirective(Result);
3933
37.4M
3934
37.4M
  if (PP->hadModuleLoaderFatalFailure()) {
3935
1
    // With a fatal failure in the module loader, we abort parsing.
3936
1
    assert(Result.is(tok::eof) && "Preprocessor did not set tok:eof");
3937
1
    return true;
3938
1
  }
3939
37.4M
3940
37.4M
  // We parsed the directive; lex a token with the new state.
3941
37.4M
  return false;
3942
37.4M
}