Coverage Report

Created: 2021-01-19 06:58

/Users/buildslave/jenkins/workspace/coverage/llvm-project/clang/lib/AST/CommentLexer.cpp
Line
Count
Source (jump to first uncovered line)
1
//===--- CommentLexer.cpp -------------------------------------------------===//
2
//
3
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4
// See https://llvm.org/LICENSE.txt for license information.
5
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6
//
7
//===----------------------------------------------------------------------===//
8
9
#include "clang/AST/CommentLexer.h"
10
#include "clang/AST/CommentCommandTraits.h"
11
#include "clang/AST/CommentDiagnostic.h"
12
#include "clang/Basic/CharInfo.h"
13
#include "llvm/ADT/StringExtras.h"
14
#include "llvm/ADT/StringSwitch.h"
15
#include "llvm/Support/ConvertUTF.h"
16
#include "llvm/Support/ErrorHandling.h"
17
18
namespace clang {
19
namespace comments {
20
21
0
void Token::dump(const Lexer &L, const SourceManager &SM) const {
22
0
  llvm::errs() << "comments::Token Kind=" << Kind << " ";
23
0
  Loc.print(llvm::errs(), SM);
24
0
  llvm::errs() << " " << Length << " \"" << L.getSpelling(*this, SM) << "\"\n";
25
0
}
26
27
341
static inline bool isHTMLNamedCharacterReferenceCharacter(char C) {
28
341
  return isLetter(C);
29
341
}
30
31
165
static inline bool isHTMLDecimalCharacterReferenceCharacter(char C) {
32
165
  return isDigit(C);
33
165
}
34
35
101
static inline bool isHTMLHexCharacterReferenceCharacter(char C) {
36
101
  return isHexDigit(C);
37
101
}
38
39
/// Encodes \p CodePoint as UTF-8 into storage obtained from \p Allocator.
///
/// \returns the encoded bytes, or an empty StringRef when
/// llvm::ConvertCodePointToUTF8 reports failure.
static inline StringRef
convertCodePointToUTF8(llvm::BumpPtrAllocator &Allocator, unsigned CodePoint) {
  char *const Begin =
      Allocator.Allocate<char>(UNI_MAX_UTF8_BYTES_PER_CODE_POINT);
  // The conversion routine advances this pointer past the bytes it wrote.
  char *End = Begin;
  if (!llvm::ConvertCodePointToUTF8(CodePoint, End))
    return StringRef();
  return StringRef(Begin, End - Begin);
}
49
50
namespace {
51
52
#include "clang/AST/CommentHTMLTags.inc"
53
#include "clang/AST/CommentHTMLNamedCharacterReferences.inc"
54
55
} // end anonymous namespace
56
57
52
/// Translates the name of an HTML named character reference (e.g. "amp" for
/// "&amp;") to the text it stands for.
///
/// \param Name the reference name, without the leading '&' and trailing ';'.
/// \returns the replacement text; whatever the generated
/// translateHTMLNamedCharacterReferenceToUTF8 table yields when \p Name is
/// not one of the hard-coded common names.
StringRef Lexer::resolveHTMLNamedCharacterReference(StringRef Name) const {
  // Fast path, first check a few most widely used named character references.
  // The fallback is deliberately NOT passed to StringSwitch::Default():
  // function arguments are evaluated eagerly, which would run the slow table
  // lookup on every call and defeat the purpose of the fast path.
  const StringRef Common = llvm::StringSwitch<StringRef>(Name)
                               .Case("amp", "&")
                               .Case("lt", "<")
                               .Case("gt", ">")
                               .Case("quot", "\"")
                               .Case("apos", "\'")
                               .Default(StringRef());
  if (!Common.empty())
    return Common;

  // Slow path.
  return translateHTMLNamedCharacterReferenceToUTF8(Name);
}
68
69
17
/// Resolves the numeric part of a decimal HTML character reference (the "38"
/// in "&#38;") to UTF-8 text.
///
/// \param Name the decimal digits only; every character must satisfy
/// isHTMLDecimalCharacterReferenceCharacter.
/// \returns the UTF-8 encoding of the code point, or whatever
/// convertCodePointToUTF8 returns for a value it cannot encode (an empty
/// StringRef).
StringRef Lexer::resolveHTMLDecimalCharacterReference(StringRef Name) const {
  // First value past the Unicode range (U+10FFFF).  Saturating here keeps the
  // accumulator from wrapping around on very long digit strings, which would
  // otherwise silently resolve e.g. "&#4294967361;" to an unrelated character.
  // NOTE(review): relies on convertCodePointToUTF8 rejecting this value.
  const unsigned FirstInvalidCodePoint = 0x110000;

  unsigned CodePoint = 0;
  for (const char Digit : Name) {
    assert(isHTMLDecimalCharacterReferenceCharacter(Digit));
    // CodePoint < 0x110000 here, so this cannot overflow an unsigned.
    CodePoint = CodePoint * 10 + (Digit - '0');
    if (CodePoint >= FirstInvalidCodePoint) {
      CodePoint = FirstInvalidCodePoint;
      break;
    }
  }
  return convertCodePointToUTF8(Allocator, CodePoint);
}
78
79
19
/// Resolves the numeric part of a hexadecimal HTML character reference (the
/// "26" in "&#x26;") to UTF-8 text.
///
/// \param Name the hex digits only; every character must satisfy
/// isHTMLHexCharacterReferenceCharacter.
/// \returns the UTF-8 encoding of the code point, or whatever
/// convertCodePointToUTF8 returns for a value it cannot encode (an empty
/// StringRef).
StringRef Lexer::resolveHTMLHexCharacterReference(StringRef Name) const {
  // First value past the Unicode range (U+10FFFF).  Saturating here keeps the
  // accumulator from wrapping around on very long digit strings, which would
  // otherwise silently resolve e.g. "&#x100000041;" to an unrelated character.
  // NOTE(review): relies on convertCodePointToUTF8 rejecting this value.
  const unsigned FirstInvalidCodePoint = 0x110000;

  unsigned CodePoint = 0;
  for (const char Digit : Name) {
    assert(isHTMLHexCharacterReferenceCharacter(Digit));
    // CodePoint < 0x110000 here, so this cannot overflow an unsigned.
    CodePoint = CodePoint * 16 + llvm::hexDigitValue(Digit);
    if (CodePoint >= FirstInvalidCodePoint) {
      CodePoint = FirstInvalidCodePoint;
      break;
    }
  }
  return convertCodePointToUTF8(Allocator, CodePoint);
}
89
90
1.73k
void Lexer::skipLineStartingDecorations() {
91
  // This function should be called only for C comments
92
1.73k
  assert(CommentState == LCS_InsideCComment);
93
94
1.73k
  if (BufferPtr == CommentEnd)
95
381
    return;
96
97
1.35k
  switch (*BufferPtr) {
98
993
  case ' ':
99
995
  case '\t':
100
995
  case '\f':
101
995
  case '\v': {
102
995
    const char *NewBufferPtr = BufferPtr;
103
995
    NewBufferPtr++;
104
995
    if (NewBufferPtr == CommentEnd)
105
122
      return;
106
107
873
    char C = *NewBufferPtr;
108
1.21k
    while (isHorizontalWhitespace(C)) {
109
364
      NewBufferPtr++;
110
364
      if (NewBufferPtr == CommentEnd)
111
18
        return;
112
346
      C = *NewBufferPtr;
113
346
    }
114
855
    if (C == '*')
115
716
      BufferPtr = NewBufferPtr + 1;
116
855
    break;
117
873
  }
118
8
  case '*':
119
8
    BufferPtr++;
120
8
    break;
121
1.35k
  }
122
1.35k
}
123
124
namespace {
125
/// Returns pointer to the first newline character in the string.
126
286
const char *findNewline(const char *BufferPtr, const char *BufferEnd) {
127
4.93k
  for ( ; BufferPtr != BufferEnd; 
++BufferPtr4.64k
) {
128
4.77k
    if (isVerticalWhitespace(*BufferPtr))
129
126
      return BufferPtr;
130
4.77k
  }
131
160
  return BufferEnd;
132
286
}
133
134
1.76k
/// Skips one line terminator ("\n", "\r" or "\r\n") starting at \p BufferPtr.
///
/// \p BufferPtr must either equal \p BufferEnd or point at '\n' or '\r'.
/// \returns a pointer just past the terminator, or \p BufferPtr unchanged if
/// it already equals \p BufferEnd.
const char *skipNewline(const char *BufferPtr, const char *BufferEnd) {
  if (BufferPtr == BufferEnd)
    return BufferPtr;

  const char First = *BufferPtr++;
  if (First == '\n')
    return BufferPtr;

  assert(First == '\r');
  // Treat "\r\n" as a single terminator.
  if (BufferPtr != BufferEnd && *BufferPtr == '\n')
    ++BufferPtr;
  return BufferPtr;
}
148
149
const char *skipNamedCharacterReference(const char *BufferPtr,
150
54
                                        const char *BufferEnd) {
151
243
  for ( ; BufferPtr != BufferEnd; 
++BufferPtr189
) {
152
242
    if (!isHTMLNamedCharacterReferenceCharacter(*BufferPtr))
153
53
      return BufferPtr;
154
242
  }
155
1
  return BufferEnd;
156
54
}
157
158
const char *skipDecimalCharacterReference(const char *BufferPtr,
159
19
                                          const char *BufferEnd) {
160
73
  for ( ; BufferPtr != BufferEnd; 
++BufferPtr54
) {
161
72
    if (!isHTMLDecimalCharacterReferenceCharacter(*BufferPtr))
162
18
      return BufferPtr;
163
72
  }
164
1
  return BufferEnd;
165
19
}
166
167
const char *skipHexCharacterReference(const char *BufferPtr,
168
23
                                      const char *BufferEnd) {
169
65
  for ( ; BufferPtr != BufferEnd; 
++BufferPtr42
) {
170
63
    if (!isHTMLHexCharacterReferenceCharacter(*BufferPtr))
171
21
      return BufferPtr;
172
63
  }
173
2
  return BufferEnd;
174
23
}
175
176
794
bool isHTMLIdentifierStartingCharacter(char C) {
177
794
  return isLetter(C);
178
794
}
179
180
1.41k
bool isHTMLIdentifierCharacter(char C) {
181
1.41k
  return isAlphanumeric(C);
182
1.41k
}
183
184
388
const char *skipHTMLIdentifier(const char *BufferPtr, const char *BufferEnd) {
185
1.07k
  for ( ; BufferPtr != BufferEnd; 
++BufferPtr683
) {
186
1.06k
    if (!isHTMLIdentifierCharacter(*BufferPtr))
187
377
      return BufferPtr;
188
1.06k
  }
189
11
  return BufferEnd;
190
388
}
191
192
/// Skips an HTML string delimited by single or double quotes.  A quote that
/// is immediately preceded by a backslash does not terminate the string.
///
/// \p BufferPtr must point at the opening quote.
/// \returns a pointer to the closing quote, or \p BufferEnd if the string is
/// not terminated.
const char *skipHTMLQuotedString(const char *BufferPtr, const char *BufferEnd) {
  const char Quote = *BufferPtr;
  assert(Quote == '\"' || Quote == '\'');

  const char *Cursor = BufferPtr + 1;
  while (Cursor != BufferEnd) {
    // Cursor[-1] is always valid: it is at worst the opening quote itself.
    const bool IsEscaped = Cursor[-1] == '\\';
    if (*Cursor == Quote && !IsEscaped)
      return Cursor;
    ++Cursor;
  }
  return BufferEnd;
}
209
210
635
const char *skipWhitespace(const char *BufferPtr, const char *BufferEnd) {
211
802
  for ( ; BufferPtr != BufferEnd; 
++BufferPtr167
) {
212
754
    if (!isWhitespace(*BufferPtr))
213
587
      return BufferPtr;
214
754
  }
215
48
  return BufferEnd;
216
635
}
217
218
39
bool isWhitespace(const char *BufferPtr, const char *BufferEnd) {
219
39
  return skipWhitespace(BufferPtr, BufferEnd) == BufferEnd;
220
39
}
221
222
3.06k
bool isCommandNameStartCharacter(char C) {
223
3.06k
  return isLetter(C);
224
3.06k
}
225
226
18.9k
bool isCommandNameCharacter(char C) {
227
18.9k
  return isAlphanumeric(C);
228
18.9k
}
229
230
3.05k
const char *skipCommandName(const char *BufferPtr, const char *BufferEnd) {
231
19.1k
  for ( ; BufferPtr != BufferEnd; 
++BufferPtr16.0k
) {
232
18.9k
    if (!isCommandNameCharacter(*BufferPtr))
233
2.83k
      return BufferPtr;
234
18.9k
  }
235
222
  return BufferEnd;
236
3.05k
}
237
238
/// Return the one past end pointer for BCPL comments.
239
/// Handles newlines escaped with backslash or trigraph for backslahs.
240
3.19k
const char *findBCPLCommentEnd(const char *BufferPtr, const char *BufferEnd) {
241
3.19k
  const char *CurPtr = BufferPtr;
242
3.20k
  while (CurPtr != BufferEnd) {
243
54.0k
    while (!isVerticalWhitespace(*CurPtr)) {
244
52.7k
      CurPtr++;
245
52.7k
      if (CurPtr == BufferEnd)
246
1.91k
        return BufferEnd;
247
52.7k
    }
248
    // We found a newline, check if it is escaped.
249
1.26k
    const char *EscapePtr = CurPtr - 1;
250
1.27k
    while(isHorizontalWhitespace(*EscapePtr))
251
12
      EscapePtr--;
252
253
1.26k
    if (*EscapePtr == '\\' ||
254
1.25k
        (EscapePtr - 2 >= BufferPtr && 
EscapePtr[0] == '/'1.09k
&&
255
9
         
EscapePtr[-1] == '?'3
&&
EscapePtr[-2] == '?'3
)) {
256
      // We found an escaped newline.
257
9
      CurPtr = skipNewline(CurPtr, BufferEnd);
258
9
    } else
259
1.25k
      return CurPtr; // Not an escaped newline.
260
1.26k
  }
261
25
  return BufferEnd;
262
3.19k
}
263
264
/// Return the one past end pointer for C comments.
265
/// Very dumb, does not handle escaped newlines or trigraphs.
266
704
const char *findCCommentEnd(const char *BufferPtr, const char *BufferEnd) {
267
30.2k
  for ( ; BufferPtr != BufferEnd; 
++BufferPtr29.5k
) {
268
30.2k
    if (*BufferPtr == '*') {
269
1.44k
      assert(BufferPtr + 1 != BufferEnd);
270
1.44k
      if (*(BufferPtr + 1) == '/')
271
704
        return BufferPtr;
272
1.44k
    }
273
30.2k
  }
274
704
  
llvm_unreachable0
("buffer end hit before '*/' was seen");
275
704
}
276
277
} // end anonymous namespace
278
279
void Lexer::formTokenWithChars(Token &Result, const char *TokEnd,
280
20.6k
                               tok::TokenKind Kind) {
281
20.6k
  const unsigned TokLen = TokEnd - BufferPtr;
282
20.6k
  Result.setLocation(getSourceLocation(BufferPtr));
283
20.6k
  Result.setKind(Kind);
284
20.6k
  Result.setLength(TokLen);
285
20.6k
#ifndef NDEBUG
286
20.6k
  Result.TextPtr = "<UNSET>";
287
20.6k
  Result.IntVal = 7;
288
20.6k
#endif
289
20.6k
  BufferPtr = TokEnd;
290
20.6k
}
291
292
12.3k
/// Lexes one token from the body of the current comment.
///
/// When command parsing is disabled only text and newline tokens are
/// produced.  Otherwise the current lexer state selects a specialized
/// sub-lexer, and in the normal state the first character decides between a
/// command, an HTML character reference, an HTML tag, or plain text.
///
/// Requires BufferPtr < CommentEnd.
void Lexer::lexCommentText(Token &T) {
  assert(CommentState == LCS_InsideBCPLComment ||
         CommentState == LCS_InsideCComment);

  // Handles lexing non-command text, i.e. text and newline.
  auto LexTextOrNewline = [&]() -> void {
    assert(State == LS_Normal);

    const char *Cursor = BufferPtr;
    assert(Cursor < CommentEnd);
    if (*Cursor == '\n' || *Cursor == '\r') {
      Cursor = skipNewline(Cursor, CommentEnd);
      formTokenWithChars(T, Cursor, tok::newline);

      if (CommentState == LCS_InsideCComment)
        skipLineStartingDecorations();
      return;
    }

    // The text token extends up to the next character that could start a
    // different kind of token.
    StringRef TokStartSymbols = ParseCommands ? "\n\r\\@&<" : "\n\r";
    const size_t Offset = StringRef(Cursor, CommentEnd - Cursor)
                              .find_first_of(TokStartSymbols);
    formTextToken(T, Offset == StringRef::npos ? CommentEnd : Cursor + Offset);
  };

  if (!ParseCommands)
    return LexTextOrNewline();

  switch (State) {
  case LS_Normal:
    break;
  case LS_VerbatimBlockFirstLine:
    lexVerbatimBlockFirstLine(T);
    return;
  case LS_VerbatimBlockBody:
    lexVerbatimBlockBody(T);
    return;
  case LS_VerbatimLineText:
    lexVerbatimLineText(T);
    return;
  case LS_HTMLStartTag:
    lexHTMLStartTag(T);
    return;
  case LS_HTMLEndTag:
    lexHTMLEndTag(T);
    return;
  }

  assert(State == LS_Normal);
  const char *TokenPtr = BufferPtr;
  assert(TokenPtr < CommentEnd);
  const char Lead = *TokenPtr;

  if (Lead == '&') {
    lexHTMLCharacterReference(T);
    return;
  }

  if (Lead == '<') {
    ++TokenPtr;
    if (TokenPtr == CommentEnd) {
      formTextToken(T, TokenPtr);
      return;
    }
    const char Next = *TokenPtr;
    if (isHTMLIdentifierStartingCharacter(Next))
      setupAndLexHTMLStartTag(T);
    else if (Next == '/')
      setupAndLexHTMLEndTag(T);
    else
      formTextToken(T, TokenPtr);
    return;
  }

  if (Lead != '\\' && Lead != '@')
    return LexTextOrNewline();

  // Commands that start with a backslash and commands that start with
  // 'at' have equivalent semantics.  But we keep information about the
  // exact syntax in AST for comments.
  const tok::TokenKind CommandKind =
      Lead == '@' ? tok::at_command : tok::backslash_command;
  ++TokenPtr;
  if (TokenPtr == CommentEnd) {
    formTextToken(T, TokenPtr);
    return;
  }

  char C = *TokenPtr;
  switch (C) {
  default:
    break;

  case '\\': case '@': case '&': case '$':
  case '#':  case '<': case '>': case '%':
  case '\"': case '.': case ':': {
    // This is one of \\ \@ \& \$ etc escape sequences.
    ++TokenPtr;
    if (C == ':' && TokenPtr != CommentEnd && *TokenPtr == ':') {
      // This is the \:: escape sequence.
      ++TokenPtr;
    }
    // Capture the text before formTokenWithChars() moves BufferPtr.
    StringRef UnescapedText(BufferPtr + 1, TokenPtr - (BufferPtr + 1));
    formTokenWithChars(T, TokenPtr, tok::text);
    T.setText(UnescapedText);
    return;
  }
  }

  // Don't make zero-length commands.
  if (!isCommandNameStartCharacter(*TokenPtr)) {
    formTextToken(T, TokenPtr);
    return;
  }

  TokenPtr = skipCommandName(TokenPtr, CommentEnd);
  unsigned Length = TokenPtr - (BufferPtr + 1);

  // Hardcoded support for lexing LaTeX formula commands
  // \f$ \f[ \f] \f{ \f} as a single command.
  if (Length == 1 && TokenPtr[-1] == 'f' && TokenPtr != CommentEnd) {
    C = *TokenPtr;
    if (C == '$' || C == '[' || C == ']' || C == '{' || C == '}') {
      ++TokenPtr;
      ++Length;
    }
  }

  StringRef CommandName(BufferPtr + 1, Length);

  const CommandInfo *Info = Traits.getCommandInfoOrNULL(CommandName);
  if (!Info) {
    Info = Traits.getTypoCorrectCommandInfo(CommandName);
    if (!Info) {
      // Neither a known command nor a correctable typo.
      formTokenWithChars(T, TokenPtr, tok::unknown_command);
      T.setUnknownCommandName(CommandName);
      Diag(T.getLocation(), diag::warn_unknown_comment_command_name)
          << SourceRange(T.getLocation(), T.getEndLocation());
      return;
    }

    // Warn about the misspelling and carry on with the corrected command.
    StringRef CorrectedName = Info->Name;
    SourceLocation Loc = getSourceLocation(BufferPtr);
    SourceLocation EndLoc = getSourceLocation(TokenPtr);
    SourceRange FullRange = SourceRange(Loc, EndLoc);
    SourceRange CommandRange(Loc.getLocWithOffset(1), EndLoc);
    Diag(Loc, diag::warn_correct_comment_command_name)
        << FullRange << CommandName << CorrectedName
        << FixItHint::CreateReplacement(CommandRange, CorrectedName);
  }

  if (Info->IsVerbatimBlockCommand) {
    setupAndLexVerbatimBlock(T, TokenPtr, Lead, Info);
    return;
  }
  if (Info->IsVerbatimLineCommand) {
    setupAndLexVerbatimLine(T, TokenPtr, Info);
    return;
  }
  formTokenWithChars(T, TokenPtr, CommandKind);
  T.setCommandID(Info->getID());
}
462
463
void Lexer::setupAndLexVerbatimBlock(Token &T,
464
                                     const char *TextBegin,
465
59
                                     char Marker, const CommandInfo *Info) {
466
59
  assert(Info->IsVerbatimBlockCommand);
467
468
59
  VerbatimBlockEndCommandName.clear();
469
53
  VerbatimBlockEndCommandName.append(Marker == '\\' ? "\\" : 
"@"6
);
470
59
  VerbatimBlockEndCommandName.append(Info->EndCommandName);
471
472
59
  formTokenWithChars(T, TextBegin, tok::verbatim_block_begin);
473
59
  T.setVerbatimBlockID(Info->getID());
474
475
  // If there is a newline following the verbatim opening command, skip the
476
  // newline so that we don't create an tok::verbatim_block_line with empty
477
  // text content.
478
59
  if (BufferPtr != CommentEnd &&
479
41
      isVerticalWhitespace(*BufferPtr)) {
480
10
    BufferPtr = skipNewline(BufferPtr, CommentEnd);
481
10
    State = LS_VerbatimBlockBody;
482
10
    return;
483
10
  }
484
485
49
  State = LS_VerbatimBlockFirstLine;
486
49
}
487
488
120
/// Lexes one token inside a verbatim block: either a line of verbatim text
/// (tok::verbatim_block_line) or the command that ends the block
/// (tok::verbatim_block_end).
///
/// Requires BufferPtr < CommentEnd.
void Lexer::lexVerbatimBlockFirstLine(Token &T) {
  const char *TextEnd;
  const char *NextLine;

  for (;;) {
    assert(BufferPtr < CommentEnd);

    // FIXME: It would be better to scan the text once, finding either the
    // block end command or newline.
    //
    // Extract current line.
    const char *Newline = findNewline(BufferPtr, CommentEnd);
    StringRef Line(BufferPtr, Newline - BufferPtr);

    // Look for end command in current line.
    const size_t EndCommandPos = Line.find(VerbatimBlockEndCommandName);

    if (EndCommandPos == StringRef::npos) {
      // Current line is completely verbatim.
      TextEnd = Newline;
      NextLine = skipNewline(Newline, CommentEnd);
      break;
    }

    if (EndCommandPos == 0) {
      // Current line starts with the end command.
      const char *End = BufferPtr + VerbatimBlockEndCommandName.size();
      // Capture the name (without its marker) before BufferPtr moves.
      StringRef Name(BufferPtr + 1, End - (BufferPtr + 1));
      formTokenWithChars(T, End, tok::verbatim_block_end);
      T.setVerbatimBlockID(Traits.getCommandInfo(Name)->getID());
      State = LS_Normal;
      return;
    }

    // There is some text, followed by end command.  Extract text first.
    TextEnd = BufferPtr + EndCommandPos;
    NextLine = TextEnd;
    if (!isWhitespace(BufferPtr, TextEnd))
      break;

    // If there is only whitespace before end command, skip whitespace and
    // rescan from the end command.
    BufferPtr = TextEnd;
  }

  StringRef Text(BufferPtr, TextEnd - BufferPtr);
  formTokenWithChars(T, NextLine, tok::verbatim_block_line);
  T.setVerbatimBlockText(Text);

  State = LS_VerbatimBlockBody;
}
532
533
77
/// Lexes one token from the body of a verbatim block.
///
/// Inside a C comment the line-starting decoration is skipped first.  If the
/// comment ends here an empty tok::verbatim_block_line is produced; otherwise
/// the line is lexed by lexVerbatimBlockFirstLine().
void Lexer::lexVerbatimBlockBody(Token &T) {
  assert(State == LS_VerbatimBlockBody);

  if (CommentState == LCS_InsideCComment)
    skipLineStartingDecorations();

  if (BufferPtr != CommentEnd) {
    lexVerbatimBlockFirstLine(T);
    return;
  }

  formTokenWithChars(T, BufferPtr, tok::verbatim_block_line);
  T.setVerbatimBlockText("");
}
547
548
/// Forms the tok::verbatim_line_name token for a verbatim line command and
/// arranges for the rest of the line to be lexed as verbatim text.
///
/// \param T token to fill in.
/// \param TextBegin first character after the command name.
/// \param Info traits of the command; must be a verbatim line command.
void Lexer::setupAndLexVerbatimLine(Token &T, const char *TextBegin,
                                    const CommandInfo *Info) {
  assert(Info->IsVerbatimLineCommand);

  formTokenWithChars(T, TextBegin, tok::verbatim_line_name);
  T.setVerbatimLineID(Info->getID());
  State = LS_VerbatimLineText;
}
556
557
147
/// Lexes the remainder of the current line as a single
/// tok::verbatim_line_text token and returns the lexer to the normal state.
/// The newline itself is not consumed.
void Lexer::lexVerbatimLineText(Token &T) {
  assert(State == LS_VerbatimLineText);

  // Extract current line.
  const char *LineEnd = findNewline(BufferPtr, CommentEnd);
  // Capture the text before formTokenWithChars() moves BufferPtr.
  const StringRef LineText(BufferPtr, LineEnd - BufferPtr);
  formTokenWithChars(T, LineEnd, tok::verbatim_line_text);
  T.setVerbatimLineText(LineText);

  State = LS_Normal;
}
568
569
100
/// Lexes an HTML character reference starting at the '&' under BufferPtr.
///
/// Named ("&amp;"), decimal ("&#38;") and hexadecimal ("&#x26;") references
/// are recognized.  A well-formed reference that resolves to non-empty text
/// becomes a tok::text token carrying the resolved text; anything else is
/// emitted as a plain text token covering the characters examined so far.
void Lexer::lexHTMLCharacterReference(Token &T) {
  const char *Cursor = BufferPtr;
  assert(*Cursor == '&');
  ++Cursor;
  if (Cursor == CommentEnd) {
    formTextToken(T, Cursor);
    return;
  }

  enum { RK_Named, RK_Decimal, RK_Hex } RefKind;
  const char *NameBegin;

  if (isHTMLNamedCharacterReferenceCharacter(*Cursor)) {
    RefKind = RK_Named;
    NameBegin = Cursor;
    Cursor = skipNamedCharacterReference(Cursor, CommentEnd);
  } else if (*Cursor == '#') {
    ++Cursor;
    if (Cursor == CommentEnd) {
      formTextToken(T, Cursor);
      return;
    }
    const char C = *Cursor;
    if (isHTMLDecimalCharacterReferenceCharacter(C)) {
      RefKind = RK_Decimal;
      NameBegin = Cursor;
      Cursor = skipDecimalCharacterReference(Cursor, CommentEnd);
    } else if (C == 'x' || C == 'X') {
      RefKind = RK_Hex;
      ++Cursor; // Skip the radix marker.
      NameBegin = Cursor;
      Cursor = skipHexCharacterReference(Cursor, CommentEnd);
    } else {
      formTextToken(T, Cursor);
      return;
    }
  } else {
    formTextToken(T, Cursor);
    return;
  }

  // The reference needs a non-empty name and a terminating semicolon.
  const bool IsTerminated =
      NameBegin != Cursor && Cursor != CommentEnd && *Cursor == ';';
  if (!IsTerminated) {
    formTextToken(T, Cursor);
    return;
  }

  const StringRef Name(NameBegin, Cursor - NameBegin);
  ++Cursor; // Skip semicolon.

  StringRef Resolved;
  switch (RefKind) {
  case RK_Named:
    Resolved = resolveHTMLNamedCharacterReference(Name);
    break;
  case RK_Decimal:
    Resolved = resolveHTMLDecimalCharacterReference(Name);
    break;
  case RK_Hex:
    Resolved = resolveHTMLHexCharacterReference(Name);
    break;
  }

  if (Resolved.empty()) {
    // Could not resolve the reference; keep the source spelling as text.
    formTextToken(T, Cursor);
    return;
  }
  formTokenWithChars(T, Cursor, tok::text);
  T.setText(Resolved);
}
630
631
203
/// Lexes the opening of an HTML start tag ("<name") at BufferPtr.
///
/// If the name is not a known HTML tag the characters are emitted as a text
/// token.  Otherwise a tok::html_start_tag token is formed, trailing
/// whitespace is skipped, and the lexer enters LS_HTMLStartTag when the next
/// character can continue the tag ('>', '/' or the start of an attribute
/// name).
void Lexer::setupAndLexHTMLStartTag(Token &T) {
  assert(BufferPtr[0] == '<' &&
         isHTMLIdentifierStartingCharacter(BufferPtr[1]));
  const char *TagNameEnd = skipHTMLIdentifier(BufferPtr + 2, CommentEnd);
  StringRef Name(BufferPtr + 1, TagNameEnd - (BufferPtr + 1));
  if (!isHTMLTagName(Name)) {
    formTextToken(T, TagNameEnd);
    return;
  }

  formTokenWithChars(T, TagNameEnd, tok::html_start_tag);
  T.setHTMLTagStartName(Name);

  BufferPtr = skipWhitespace(BufferPtr, CommentEnd);

  // Check for the end of the comment BEFORE looking at the next character:
  // BufferPtr may equal CommentEnd here, and that position must not be read.
  if (BufferPtr == CommentEnd)
    return;

  const char C = *BufferPtr;
  if (C == '>' || C == '/' || isHTMLIdentifierStartingCharacter(C))
    State = LS_HTMLStartTag;
}
651
652
356
/// Lexes one token inside an HTML start tag: an attribute name, '=', a quoted
/// attribute value, '>' or "/>".
///
/// '>' and '/' end the tag and return the lexer to the normal state.  After
/// any other token the lexer looks ahead past whitespace and stays in
/// LS_HTMLStartTag only if the next character can begin another tag token.
void Lexer::lexHTMLStartTag(Token &T) {
  assert(State == LS_HTMLStartTag);

  const char *TokenPtr = BufferPtr;
  char C = *TokenPtr;
  if (isHTMLIdentifierCharacter(C)) {
    TokenPtr = skipHTMLIdentifier(TokenPtr, CommentEnd);
    StringRef Ident(BufferPtr, TokenPtr - BufferPtr);
    formTokenWithChars(T, TokenPtr, tok::html_ident);
    T.setHTMLIdent(Ident);
  } else {
    switch (C) {
    case '=':
      TokenPtr++;
      formTokenWithChars(T, TokenPtr, tok::html_equals);
      break;
    case '\"':
    case '\'': {
      const char *OpenQuote = TokenPtr;
      TokenPtr = skipHTMLQuotedString(TokenPtr, CommentEnd);
      const char *ClosingQuote = TokenPtr;
      if (TokenPtr != CommentEnd) // Skip closing quote.
        TokenPtr++;
      formTokenWithChars(T, TokenPtr, tok::html_quoted_string);
      // The token text excludes the surrounding quotes.
      T.setHTMLQuotedString(StringRef(OpenQuote + 1,
                                      ClosingQuote - (OpenQuote + 1)));
      break;
    }
    case '>':
      TokenPtr++;
      formTokenWithChars(T, TokenPtr, tok::html_greater);
      State = LS_Normal;
      return;
    case '/':
      TokenPtr++;
      if (TokenPtr != CommentEnd && *TokenPtr == '>') {
        TokenPtr++;
        formTokenWithChars(T, TokenPtr, tok::html_slash_greater);
      } else
        formTextToken(T, TokenPtr);

      State = LS_Normal;
      return;
    }
  }

  // Now look ahead and return to normal state if we don't see any HTML tokens
  // ahead.
  BufferPtr = skipWhitespace(BufferPtr, CommentEnd);
  if (BufferPtr == CommentEnd) {
    State = LS_Normal;
    return;
  }

  // '/' is part of this set so that a self-closing tag with attributes, such
  // as <img src="x" />, still reaches the '/' case above and produces
  // tok::html_slash_greater instead of falling back to plain text.
  C = *BufferPtr;
  if (!isHTMLIdentifierStartingCharacter(C) &&
      C != '=' && C != '\"' && C != '\'' && C != '>' && C != '/')
    State = LS_Normal;
}
713
714
99
/// Lexes an HTML end tag ("</name") at BufferPtr.
///
/// If the name is not a known HTML tag the characters are emitted as a text
/// token.  Otherwise a tok::html_end_tag token covering the name and any
/// trailing whitespace is formed, and the lexer enters LS_HTMLEndTag when the
/// closing '>' follows.
void Lexer::setupAndLexHTMLEndTag(Token &T) {
  assert(BufferPtr[0] == '<' && BufferPtr[1] == '/');

  const char *NameBegin = skipWhitespace(BufferPtr + 2, CommentEnd);
  const char *NameEnd = skipHTMLIdentifier(NameBegin, CommentEnd);
  const StringRef TagName(NameBegin, NameEnd - NameBegin);
  if (!isHTMLTagName(TagName)) {
    formTextToken(T, NameEnd);
    return;
  }

  formTokenWithChars(T, skipWhitespace(NameEnd, CommentEnd),
                     tok::html_end_tag);
  T.setHTMLTagEndName(TagName);

  const bool ClosingAngleFollows =
      BufferPtr != CommentEnd && *BufferPtr == '>';
  if (ClosingAngleFollows)
    State = LS_HTMLEndTag;
}
733
734
94
/// Lexes the '>' that closes an HTML end tag as tok::html_greater and returns
/// the lexer to the normal state.  BufferPtr must point at that '>'.
void Lexer::lexHTMLEndTag(Token &T) {
  assert(BufferPtr != CommentEnd && *BufferPtr == '>');

  const char *AfterAngle = BufferPtr + 1;
  formTokenWithChars(T, AfterAngle, tok::html_greater);
  State = LS_Normal;
}
740
741
/// Creates a lexer over the comment text in [BufferStart, BufferEnd).
///
/// The lexer starts positioned at \p BufferStart, before any comment, in the
/// normal lexing state.
///
/// \param ParseCommands when false, lexCommentText() produces only text and
/// newline tokens.
Lexer::Lexer(llvm::BumpPtrAllocator &Allocator, DiagnosticsEngine &Diags,
             const CommandTraits &Traits, SourceLocation FileLoc,
             const char *BufferStart, const char *BufferEnd, bool ParseCommands)
    : Allocator(Allocator),
      Diags(Diags),
      Traits(Traits),
      BufferStart(BufferStart),
      BufferEnd(BufferEnd),
      BufferPtr(BufferStart),
      FileLoc(FileLoc),
      ParseCommands(ParseCommands),
      CommentState(LCS_BeforeComment),
      State(LS_Normal) {}
748
749
20.6k
/// Lexes the next token from the buffer into \p T.
///
/// Drives the comment-level state machine: recognizes the start of a BCPL
/// ("//") or C ("/*") comment, delegates comment bodies to lexCommentText(),
/// and produces newline tokens for the gaps between comments and a tok::eof
/// token at the end of the buffer.
void Lexer::lex(Token &T) {
  for (;;) {
    switch (CommentState) {
    case LCS_BeforeComment:
      if (BufferPtr == BufferEnd) {
        formTokenWithChars(T, BufferPtr, tok::eof);
        return;
      }

      assert(*BufferPtr == '/');
      ++BufferPtr; // Skip first slash.
      switch (*BufferPtr) {
      case '/': { // BCPL comment.
        ++BufferPtr; // Skip second slash.

        if (BufferPtr != BufferEnd) {
          // Skip Doxygen magic marker, if it is present.
          // It might be missing because of a typo //< or /*<, or because we
          // merged this non-Doxygen comment into a bunch of Doxygen comments
          // around it: /** ... */ /* ... */ /** ... */
          const char Marker = *BufferPtr;
          if (Marker == '/' || Marker == '!')
            ++BufferPtr;
        }

        // Skip less-than symbol that marks trailing comments.
        // Skip it even if the comment is not a Doxygen one, because //< and
        // /*< are frequent typos.
        if (BufferPtr != BufferEnd && *BufferPtr == '<')
          ++BufferPtr;

        CommentState = LCS_InsideBCPLComment;
        // A verbatim block may continue across consecutive BCPL comments, so
        // the verbatim-block states are kept.
        if (State != LS_VerbatimBlockBody &&
            State != LS_VerbatimBlockFirstLine)
          State = LS_Normal;
        CommentEnd = findBCPLCommentEnd(BufferPtr, BufferEnd);
        continue;
      }
      case '*': { // C comment.
        ++BufferPtr; // Skip star.

        // Skip Doxygen magic marker.  A '*' that is immediately followed by
        // '/' is the start of the closing sequence and is not skipped.
        const char Marker = *BufferPtr;
        if ((Marker == '*' && BufferPtr[1] != '/') || Marker == '!')
          ++BufferPtr;

        // Skip less-than symbol that marks trailing comments.
        if (BufferPtr != BufferEnd && *BufferPtr == '<')
          ++BufferPtr;

        CommentState = LCS_InsideCComment;
        State = LS_Normal;
        CommentEnd = findCCommentEnd(BufferPtr, BufferEnd);
        continue;
      }
      default:
        llvm_unreachable("second character of comment should be '/' or '*'");
      }

    case LCS_BetweenComments: {
      // Consecutive comments are extracted only if there is only whitespace
      // between them.  So we can search for the start of the next comment.
      const char *NextComment = BufferPtr;
      while (NextComment != BufferEnd && *NextComment != '/')
        ++NextComment;

      // Turn any whitespace between comments (and there is only whitespace
      // between them -- guaranteed by comment extraction) into a newline.  We
      // have two newlines between C comments in total (first one was
      // synthesized after a comment).
      formTokenWithChars(T, NextComment, tok::newline);

      CommentState = LCS_BeforeComment;
      return;
    }

    case LCS_InsideBCPLComment:
    case LCS_InsideCComment:
      if (BufferPtr != CommentEnd) {
        lexCommentText(T);
        return;
      }

      if (CommentState == LCS_InsideCComment) {
        // Skip C comment closing sequence.
        assert(BufferPtr[0] == '*' && BufferPtr[1] == '/');
        BufferPtr += 2;
        assert(BufferPtr <= BufferEnd);

        // Synthesize a newline just after the C comment, regardless of
        // whether there actually is a newline.
        formTokenWithChars(T, BufferPtr, tok::newline);

        CommentState = LCS_BetweenComments;
        return;
      }

      // Don't synthesize a newline after a BCPL comment.
      CommentState = LCS_BetweenComments;
      continue;
    }
    return;
  }
}
850
851
/// Returns the source text that \p Tok was lexed from.
///
/// \param Tok the token whose spelling is requested.
/// \param SourceMgr source manager used to locate the token's buffer.
/// \returns the token's characters as they appear in the buffer, or an empty
/// StringRef if the buffer data is reported as invalid.
StringRef Lexer::getSpelling(const Token &Tok,
                             const SourceManager &SourceMgr) const {
  const std::pair<FileID, unsigned> LocInfo =
      SourceMgr.getDecomposedLoc(Tok.getLocation());

  bool Invalid = false;
  const StringRef File = SourceMgr.getBufferData(LocInfo.first, &Invalid);
  if (Invalid)
    return StringRef();

  // LocInfo.second is the token's offset within the file buffer.
  return StringRef(File.data() + LocInfo.second, Tok.getLength());
}
864
865
} // end namespace comments
866
} // end namespace clang