Coverage Report

Created: 2022-07-16 07:03

/Users/buildslave/jenkins/workspace/coverage/llvm-project/clang/lib/AST/CommentLexer.cpp
Line
Count
Source (jump to first uncovered line)
1
//===--- CommentLexer.cpp -------------------------------------------------===//
2
//
3
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4
// See https://llvm.org/LICENSE.txt for license information.
5
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6
//
7
//===----------------------------------------------------------------------===//
8
9
#include "clang/AST/CommentLexer.h"
10
#include "clang/AST/CommentCommandTraits.h"
11
#include "clang/AST/CommentDiagnostic.h"
12
#include "clang/Basic/CharInfo.h"
13
#include "llvm/ADT/StringExtras.h"
14
#include "llvm/ADT/StringSwitch.h"
15
#include "llvm/Support/ConvertUTF.h"
16
#include "llvm/Support/ErrorHandling.h"
17
18
namespace clang {
19
namespace comments {
20
21
0
void Token::dump(const Lexer &L, const SourceManager &SM) const {
22
0
  llvm::errs() << "comments::Token Kind=" << Kind << " ";
23
0
  Loc.print(llvm::errs(), SM);
24
0
  llvm::errs() << " " << Length << " \"" << L.getSpelling(*this, SM) << "\"\n";
25
0
}
26
27
341
static inline bool isHTMLNamedCharacterReferenceCharacter(char C) {
28
341
  return isLetter(C);
29
341
}
30
31
165
static inline bool isHTMLDecimalCharacterReferenceCharacter(char C) {
32
165
  return isDigit(C);
33
165
}
34
35
101
static inline bool isHTMLHexCharacterReferenceCharacter(char C) {
36
101
  return isHexDigit(C);
37
101
}
38
39
static inline StringRef convertCodePointToUTF8(
40
                                      llvm::BumpPtrAllocator &Allocator,
41
36
                                      unsigned CodePoint) {
42
36
  char *Resolved = Allocator.Allocate<char>(UNI_MAX_UTF8_BYTES_PER_CODE_POINT);
43
36
  char *ResolvedPtr = Resolved;
44
36
  if (llvm::ConvertCodePointToUTF8(CodePoint, ResolvedPtr))
45
36
    return StringRef(Resolved, ResolvedPtr - Resolved);
46
0
  else
47
0
    return StringRef();
48
36
}
49
50
namespace {
51
52
#include "clang/AST/CommentHTMLTags.inc"
53
#include "clang/AST/CommentHTMLNamedCharacterReferences.inc"
54
55
} // end anonymous namespace
56
57
52
StringRef Lexer::resolveHTMLNamedCharacterReference(StringRef Name) const {
58
  // Fast path, first check a few most widely used named character references.
59
52
  return llvm::StringSwitch<StringRef>(Name)
60
52
      .Case("amp", "&")
61
52
      .Case("lt", "<")
62
52
      .Case("gt", ">")
63
52
      .Case("quot", "\"")
64
52
      .Case("apos", "\'")
65
      // Slow path.
66
52
      .Default(translateHTMLNamedCharacterReferenceToUTF8(Name));
67
52
}
68
69
17
StringRef Lexer::resolveHTMLDecimalCharacterReference(StringRef Name) const {
70
17
  unsigned CodePoint = 0;
71
67
  for (unsigned i = 0, e = Name.size(); i != e; 
++i50
) {
72
50
    assert(isHTMLDecimalCharacterReferenceCharacter(Name[i]));
73
0
    CodePoint *= 10;
74
50
    CodePoint += Name[i] - '0';
75
50
  }
76
17
  return convertCodePointToUTF8(Allocator, CodePoint);
77
17
}
78
79
19
StringRef Lexer::resolveHTMLHexCharacterReference(StringRef Name) const {
80
19
  unsigned CodePoint = 0;
81
57
  for (unsigned i = 0, e = Name.size(); i != e; 
++i38
) {
82
38
    CodePoint *= 16;
83
38
    const char C = Name[i];
84
38
    assert(isHTMLHexCharacterReferenceCharacter(C));
85
0
    CodePoint += llvm::hexDigitValue(C);
86
38
  }
87
19
  return convertCodePointToUTF8(Allocator, CodePoint);
88
19
}
89
90
2.12k
void Lexer::skipLineStartingDecorations() {
91
  // This function should be called only for C comments
92
2.12k
  assert(CommentState == LCS_InsideCComment);
93
94
2.12k
  if (BufferPtr == CommentEnd)
95
381
    return;
96
97
1.74k
  const char *NewBufferPtr = BufferPtr;
98
3.37k
  while (isHorizontalWhitespace(*NewBufferPtr))
99
1.80k
    if (++NewBufferPtr == CommentEnd)
100
183
      return;
101
1.56k
  if (*NewBufferPtr == '*')
102
1.07k
    BufferPtr = NewBufferPtr + 1;
103
1.56k
}
104
105
namespace {
106
/// Returns pointer to the first newline character in the string.
107
1.25k
const char *findNewline(const char *BufferPtr, const char *BufferEnd) {
108
31.4k
  for ( ; BufferPtr != BufferEnd; 
++BufferPtr30.1k
) {
109
30.2k
    if (isVerticalWhitespace(*BufferPtr))
110
132
      return BufferPtr;
111
30.2k
  }
112
1.12k
  return BufferEnd;
113
1.25k
}
114
115
2.98k
const char *skipNewline(const char *BufferPtr, const char *BufferEnd) {
116
2.98k
  if (BufferPtr == BufferEnd)
117
852
    return BufferPtr;
118
119
2.13k
  if (*BufferPtr == '\n')
120
2.11k
    BufferPtr++;
121
15
  else {
122
15
    assert(*BufferPtr == '\r');
123
0
    BufferPtr++;
124
15
    if (BufferPtr != BufferEnd && 
*BufferPtr == '\n'13
)
125
8
      BufferPtr++;
126
15
  }
127
0
  return BufferPtr;
128
2.98k
}
129
130
const char *skipNamedCharacterReference(const char *BufferPtr,
131
54
                                        const char *BufferEnd) {
132
243
  for ( ; BufferPtr != BufferEnd; 
++BufferPtr189
) {
133
242
    if (!isHTMLNamedCharacterReferenceCharacter(*BufferPtr))
134
53
      return BufferPtr;
135
242
  }
136
1
  return BufferEnd;
137
54
}
138
139
const char *skipDecimalCharacterReference(const char *BufferPtr,
140
19
                                          const char *BufferEnd) {
141
73
  for ( ; BufferPtr != BufferEnd; 
++BufferPtr54
) {
142
72
    if (!isHTMLDecimalCharacterReferenceCharacter(*BufferPtr))
143
18
      return BufferPtr;
144
72
  }
145
1
  return BufferEnd;
146
19
}
147
148
const char *skipHexCharacterReference(const char *BufferPtr,
149
23
                                      const char *BufferEnd) {
150
65
  for ( ; BufferPtr != BufferEnd; 
++BufferPtr42
) {
151
63
    if (!isHTMLHexCharacterReferenceCharacter(*BufferPtr))
152
21
      return BufferPtr;
153
63
  }
154
2
  return BufferEnd;
155
23
}
156
157
7.97k
bool isHTMLIdentifierStartingCharacter(char C) {
158
7.97k
  return isLetter(C);
159
7.97k
}
160
161
19.4k
bool isHTMLIdentifierCharacter(char C) {
162
19.4k
  return isAlphanumeric(C);
163
19.4k
}
164
165
4.66k
const char *skipHTMLIdentifier(const char *BufferPtr, const char *BufferEnd) {
166
19.1k
  for ( ; BufferPtr != BufferEnd; 
++BufferPtr14.4k
) {
167
19.1k
    if (!isHTMLIdentifierCharacter(*BufferPtr))
168
4.64k
      return BufferPtr;
169
19.1k
  }
170
11
  return BufferEnd;
171
4.66k
}
172
173
/// Skip HTML string quoted in single or double quotes.  Escaping quotes inside
174
/// string allowed.
175
///
176
/// Returns pointer to closing quote.
177
const char *skipHTMLQuotedString(const char *BufferPtr, const char *BufferEnd)
178
48
{
179
48
  const char Quote = *BufferPtr;
180
48
  assert(Quote == '\"' || Quote == '\'');
181
182
0
  BufferPtr++;
183
434
  for ( ; BufferPtr != BufferEnd; 
++BufferPtr386
) {
184
428
    const char C = *BufferPtr;
185
428
    if (C == Quote && 
BufferPtr[-1] != '\\'48
)
186
42
      return BufferPtr;
187
428
  }
188
6
  return BufferEnd;
189
48
}
190
191
2.11k
const char *skipWhitespace(const char *BufferPtr, const char *BufferEnd) {
192
2.36k
  for ( ; BufferPtr != BufferEnd; 
++BufferPtr248
) {
193
2.26k
    if (!isWhitespace(*BufferPtr))
194
2.01k
      return BufferPtr;
195
2.26k
  }
196
98
  return BufferEnd;
197
2.11k
}
198
199
106
bool isWhitespace(const char *BufferPtr, const char *BufferEnd) {
200
106
  return skipWhitespace(BufferPtr, BufferEnd) == BufferEnd;
201
106
}
202
203
10.3k
bool isCommandNameStartCharacter(char C) {
204
10.3k
  return isLetter(C);
205
10.3k
}
206
207
67.2k
bool isCommandNameCharacter(char C) {
208
67.2k
  return isAlphanumeric(C);
209
67.2k
}
210
211
10.3k
const char *skipCommandName(const char *BufferPtr, const char *BufferEnd) {
212
68.0k
  for ( ; BufferPtr != BufferEnd; 
++BufferPtr57.7k
) {
213
67.2k
    if (!isCommandNameCharacter(*BufferPtr))
214
9.52k
      return BufferPtr;
215
67.2k
  }
216
775
  return BufferEnd;
217
10.3k
}
218
219
/// Return the one past end pointer for BCPL comments.
220
/// Handles newlines escaped with backslash or trigraph for backslahs.
221
26.9k
const char *findBCPLCommentEnd(const char *BufferPtr, const char *BufferEnd) {
222
26.9k
  const char *CurPtr = BufferPtr;
223
26.9k
  while (CurPtr != BufferEnd) {
224
962k
    while (!isVerticalWhitespace(*CurPtr)) {
225
939k
      CurPtr++;
226
939k
      if (CurPtr == BufferEnd)
227
3.48k
        return BufferEnd;
228
939k
    }
229
    // We found a newline, check if it is escaped.
230
23.4k
    const char *EscapePtr = CurPtr - 1;
231
23.4k
    while(isHorizontalWhitespace(*EscapePtr))
232
12
      EscapePtr--;
233
234
23.4k
    if (*EscapePtr == '\\' ||
235
23.4k
        
(23.4k
EscapePtr - 2 >= BufferPtr23.4k
&&
EscapePtr[0] == '/'18.5k
&&
236
23.4k
         
EscapePtr[-1] == '?'3
&&
EscapePtr[-2] == '?'3
)) {
237
      // We found an escaped newline.
238
9
      CurPtr = skipNewline(CurPtr, BufferEnd);
239
9
    } else
240
23.4k
      return CurPtr; // Not an escaped newline.
241
23.4k
  }
242
42
  return BufferEnd;
243
26.9k
}
244
245
/// Return the one past end pointer for C comments.
246
/// Very dumb, does not handle escaped newlines or trigraphs.
247
747
const char *findCCommentEnd(const char *BufferPtr, const char *BufferEnd) {
248
42.6k
  for ( ; BufferPtr != BufferEnd; 
++BufferPtr41.8k
) {
249
42.6k
    if (*BufferPtr == '*') {
250
1.83k
      assert(BufferPtr + 1 != BufferEnd);
251
1.83k
      if (*(BufferPtr + 1) == '/')
252
747
        return BufferPtr;
253
1.83k
    }
254
42.6k
  }
255
0
  llvm_unreachable("buffer end hit before '*/' was seen");
256
0
}
257
258
} // end anonymous namespace
259
260
void Lexer::formTokenWithChars(Token &Result, const char *TokEnd,
261
89.8k
                               tok::TokenKind Kind) {
262
89.8k
  const unsigned TokLen = TokEnd - BufferPtr;
263
89.8k
  Result.setLocation(getSourceLocation(BufferPtr));
264
89.8k
  Result.setKind(Kind);
265
89.8k
  Result.setLength(TokLen);
266
89.8k
#ifndef NDEBUG
267
89.8k
  Result.TextPtr = "<UNSET>";
268
89.8k
  Result.IntVal = 7;
269
89.8k
#endif
270
89.8k
  BufferPtr = TokEnd;
271
89.8k
}
272
273
35.6k
const char *Lexer::skipTextToken() {
274
35.6k
  const char *TokenPtr = BufferPtr;
275
35.6k
  assert(TokenPtr < CommentEnd);
276
35.6k
  StringRef TokStartSymbols = ParseCommands ? 
"\n\r\\@\"&<"35.6k
:
"\n\r"42
;
277
278
35.7k
again:
279
35.7k
  size_t End =
280
35.7k
      StringRef(TokenPtr, CommentEnd - TokenPtr).find_first_of(TokStartSymbols);
281
35.7k
  if (End == StringRef::npos)
282
20.2k
    return CommentEnd;
283
284
  // Doxygen doesn't recognize any commands in a one-line double quotation.
285
  // If we don't find an ending quotation mark, we pretend it never began.
286
15.5k
  if (*(TokenPtr + End) == '\"') {
287
65
    TokenPtr += End + 1;
288
65
    End = StringRef(TokenPtr, CommentEnd - TokenPtr).find_first_of("\n\r\"");
289
65
    if (End != StringRef::npos && 
*(TokenPtr + End) == '\"'51
)
290
51
      TokenPtr += End + 1;
291
65
    goto again;
292
65
  }
293
15.4k
  return TokenPtr + End;
294
15.5k
}
295
296
54.5k
void Lexer::lexCommentText(Token &T) {
297
54.5k
  assert(CommentState == LCS_InsideBCPLComment ||
298
54.5k
         CommentState == LCS_InsideCComment);
299
300
  // Handles lexing non-command text, i.e. text and newline.
301
37.7k
  auto HandleNonCommandToken = [&]() -> void {
302
37.7k
    assert(State == LS_Normal);
303
304
0
    const char *TokenPtr = BufferPtr;
305
37.7k
    assert(TokenPtr < CommentEnd);
306
0
    switch (*TokenPtr) {
307
2.08k
      case '\n':
308
2.08k
      case '\r':
309
2.08k
          TokenPtr = skipNewline(TokenPtr, CommentEnd);
310
2.08k
          formTokenWithChars(T, TokenPtr, tok::newline);
311
312
2.08k
          if (CommentState == LCS_InsideCComment)
313
2.07k
            skipLineStartingDecorations();
314
2.08k
          return;
315
316
35.6k
      default:
317
35.6k
        return formTextToken(T, skipTextToken());
318
37.7k
    }
319
37.7k
  };
320
321
54.5k
  if (!ParseCommands)
322
60
    return HandleNonCommandToken();
323
324
54.4k
  switch (State) {
325
52.8k
  case LS_Normal:
326
52.8k
    break;
327
114
  case LS_VerbatimBlockFirstLine:
328
114
    lexVerbatimBlockFirstLine(T);
329
114
    return;
330
917
  case LS_VerbatimBlockBody:
331
917
    lexVerbatimBlockBody(T);
332
917
    return;
333
157
  case LS_VerbatimLineText:
334
157
    lexVerbatimLineText(T);
335
157
    return;
336
380
  case LS_HTMLStartTag:
337
380
    lexHTMLStartTag(T);
338
380
    return;
339
118
  case LS_HTMLEndTag:
340
118
    lexHTMLEndTag(T);
341
118
    return;
342
54.4k
  }
343
344
52.8k
  assert(State == LS_Normal);
345
0
  const char *TokenPtr = BufferPtr;
346
52.8k
  assert(TokenPtr < CommentEnd);
347
0
  switch(*TokenPtr) {
348
9.80k
    case '\\':
349
10.3k
    case '@': {
350
      // Commands that start with a backslash and commands that start with
351
      // 'at' have equivalent semantics.  But we keep information about the
352
      // exact syntax in AST for comments.
353
10.3k
      tok::TokenKind CommandKind =
354
10.3k
          (*TokenPtr == '@') ? 
tok::at_command596
:
tok::backslash_command9.80k
;
355
10.3k
      TokenPtr++;
356
10.3k
      if (TokenPtr == CommentEnd) {
357
12
        formTextToken(T, TokenPtr);
358
12
        return;
359
12
      }
360
10.3k
      char C = *TokenPtr;
361
10.3k
      switch (C) {
362
10.3k
      default:
363
10.3k
        break;
364
365
10.3k
      
case '\\': 6
case '@': 12
case '&': 18
case '$':
366
50
      
case '#': 31
case '<': 37
case '>': 43
case '%':
367
68
      
case '\"': 56
case '.': 62
case ':':
368
        // This is one of \\ \@ \& \$ etc escape sequences.
369
68
        TokenPtr++;
370
68
        if (C == ':' && 
TokenPtr != CommentEnd6
&&
*TokenPtr == ':'6
) {
371
          // This is the \:: escape sequence.
372
6
          TokenPtr++;
373
6
        }
374
68
        StringRef UnescapedText(BufferPtr + 1, TokenPtr - (BufferPtr + 1));
375
68
        formTokenWithChars(T, TokenPtr, tok::text);
376
68
        T.setText(UnescapedText);
377
68
        return;
378
10.3k
      }
379
380
      // Don't make zero-length commands.
381
10.3k
      if (!isCommandNameStartCharacter(*TokenPtr)) {
382
12
        formTextToken(T, TokenPtr);
383
12
        return;
384
12
      }
385
386
10.3k
      TokenPtr = skipCommandName(TokenPtr, CommentEnd);
387
10.3k
      unsigned Length = TokenPtr - (BufferPtr + 1);
388
389
      // Hardcoded support for lexing LaTeX formula commands
390
      // \f$ \f( \f) \f[ \f] \f{ \f} as a single command.
391
10.3k
      if (Length == 1 && 
TokenPtr[-1] == 'f'1.39k
&&
TokenPtr != CommentEnd34
) {
392
34
        C = *TokenPtr;
393
34
        if (C == '$' || 
C == '('17
||
C == ')'12
||
C == '['12
||
C == ']'7
||
394
34
            
C == '{'7
||
C == '}'2
) {
395
32
          TokenPtr++;
396
32
          Length++;
397
32
        }
398
34
      }
399
400
10.3k
      StringRef CommandName(BufferPtr + 1, Length);
401
402
10.3k
      const CommandInfo *Info = Traits.getCommandInfoOrNULL(CommandName);
403
10.3k
      if (!Info) {
404
314
        if ((Info = Traits.getTypoCorrectCommandInfo(CommandName))) {
405
23
          StringRef CorrectedName = Info->Name;
406
23
          SourceLocation Loc = getSourceLocation(BufferPtr);
407
23
          SourceLocation EndLoc = getSourceLocation(TokenPtr);
408
23
          SourceRange FullRange = SourceRange(Loc, EndLoc);
409
23
          SourceRange CommandRange(Loc.getLocWithOffset(1), EndLoc);
410
23
          Diag(Loc, diag::warn_correct_comment_command_name)
411
23
            << FullRange << CommandName << CorrectedName
412
23
            << FixItHint::CreateReplacement(CommandRange, CorrectedName);
413
291
        } else {
414
291
          formTokenWithChars(T, TokenPtr, tok::unknown_command);
415
291
          T.setUnknownCommandName(CommandName);
416
291
          Diag(T.getLocation(), diag::warn_unknown_comment_command_name)
417
291
              << SourceRange(T.getLocation(), T.getEndLocation());
418
291
          return;
419
291
        }
420
314
      }
421
10.0k
      if (Info->IsVerbatimBlockCommand) {
422
126
        setupAndLexVerbatimBlock(T, TokenPtr, *BufferPtr, Info);
423
126
        return;
424
126
      }
425
9.88k
      if (Info->IsVerbatimLineCommand) {
426
161
        setupAndLexVerbatimLine(T, TokenPtr, Info);
427
161
        return;
428
161
      }
429
9.72k
      formTokenWithChars(T, TokenPtr, CommandKind);
430
9.72k
      T.setCommandID(Info->getID());
431
9.72k
      return;
432
9.88k
    }
433
434
100
    case '&':
435
100
      lexHTMLCharacterReference(T);
436
100
      return;
437
438
4.59k
    case '<': {
439
4.59k
      TokenPtr++;
440
4.59k
      if (TokenPtr == CommentEnd) {
441
1
        formTextToken(T, TokenPtr);
442
1
        return;
443
1
      }
444
4.59k
      const char C = *TokenPtr;
445
4.59k
      if (isHTMLIdentifierStartingCharacter(C))
446
3.11k
        setupAndLexHTMLStartTag(T);
447
1.48k
      else if (C == '/')
448
1.46k
        setupAndLexHTMLEndTag(T);
449
22
      else
450
22
        formTextToken(T, TokenPtr);
451
4.59k
      return;
452
4.59k
    }
453
454
37.7k
    default:
455
37.7k
      return HandleNonCommandToken();
456
52.8k
  }
457
52.8k
}
458
459
void Lexer::setupAndLexVerbatimBlock(Token &T,
460
                                     const char *TextBegin,
461
126
                                     char Marker, const CommandInfo *Info) {
462
126
  assert(Info->IsVerbatimBlockCommand);
463
464
0
  VerbatimBlockEndCommandName.clear();
465
126
  VerbatimBlockEndCommandName.append(Marker == '\\' ? 
"\\"120
:
"@"6
);
466
126
  VerbatimBlockEndCommandName.append(Info->EndCommandName);
467
468
126
  formTokenWithChars(T, TextBegin, tok::verbatim_block_begin);
469
126
  T.setVerbatimBlockID(Info->getID());
470
471
  // If there is a newline following the verbatim opening command, skip the
472
  // newline so that we don't create an tok::verbatim_block_line with empty
473
  // text content.
474
126
  if (BufferPtr != CommentEnd &&
475
126
      
isVerticalWhitespace(*BufferPtr)102
) {
476
10
    BufferPtr = skipNewline(BufferPtr, CommentEnd);
477
10
    State = LS_VerbatimBlockBody;
478
10
    return;
479
10
  }
480
481
116
  State = LS_VerbatimBlockFirstLine;
482
116
}
483
484
1.02k
void Lexer::lexVerbatimBlockFirstLine(Token &T) {
485
1.09k
again:
486
1.09k
  assert(BufferPtr < CommentEnd);
487
488
  // FIXME: It would be better to scan the text once, finding either the block
489
  // end command or newline.
490
  //
491
  // Extract current line.
492
0
  const char *Newline = findNewline(BufferPtr, CommentEnd);
493
1.09k
  StringRef Line(BufferPtr, Newline - BufferPtr);
494
495
  // Look for end command in current line.
496
1.09k
  size_t Pos = Line.find(VerbatimBlockEndCommandName);
497
1.09k
  const char *TextEnd;
498
1.09k
  const char *NextLine;
499
1.09k
  if (Pos == StringRef::npos) {
500
    // Current line is completely verbatim.
501
877
    TextEnd = Newline;
502
877
    NextLine = skipNewline(Newline, CommentEnd);
503
877
  } else 
if (219
Pos == 0219
) {
504
    // Current line contains just an end command.
505
113
    const char *End = BufferPtr + VerbatimBlockEndCommandName.size();
506
113
    StringRef Name(BufferPtr + 1, End - (BufferPtr + 1));
507
113
    formTokenWithChars(T, End, tok::verbatim_block_end);
508
113
    T.setVerbatimBlockID(Traits.getCommandInfo(Name)->getID());
509
113
    State = LS_Normal;
510
113
    return;
511
113
  } else {
512
    // There is some text, followed by end command.  Extract text first.
513
106
    TextEnd = BufferPtr + Pos;
514
106
    NextLine = TextEnd;
515
    // If there is only whitespace before end command, skip whitespace.
516
106
    if (isWhitespace(BufferPtr, TextEnd)) {
517
69
      BufferPtr = TextEnd;
518
69
      goto again;
519
69
    }
520
106
  }
521
522
914
  StringRef Text(BufferPtr, TextEnd - BufferPtr);
523
914
  formTokenWithChars(T, NextLine, tok::verbatim_block_line);
524
914
  T.setVerbatimBlockText(Text);
525
526
914
  State = LS_VerbatimBlockBody;
527
914
}
528
529
917
void Lexer::lexVerbatimBlockBody(Token &T) {
530
917
  assert(State == LS_VerbatimBlockBody);
531
532
917
  if (CommentState == LCS_InsideCComment)
533
50
    skipLineStartingDecorations();
534
535
917
  if (BufferPtr == CommentEnd) {
536
4
    formTokenWithChars(T, BufferPtr, tok::verbatim_block_line);
537
4
    T.setVerbatimBlockText("");
538
4
    return;
539
4
  }
540
541
913
  lexVerbatimBlockFirstLine(T);
542
913
}
543
544
void Lexer::setupAndLexVerbatimLine(Token &T, const char *TextBegin,
545
161
                                    const CommandInfo *Info) {
546
161
  assert(Info->IsVerbatimLineCommand);
547
0
  formTokenWithChars(T, TextBegin, tok::verbatim_line_name);
548
161
  T.setVerbatimLineID(Info->getID());
549
550
161
  State = LS_VerbatimLineText;
551
161
}
552
553
157
void Lexer::lexVerbatimLineText(Token &T) {
554
157
  assert(State == LS_VerbatimLineText);
555
556
  // Extract current line.
557
0
  const char *Newline = findNewline(BufferPtr, CommentEnd);
558
157
  StringRef Text(BufferPtr, Newline - BufferPtr);
559
157
  formTokenWithChars(T, Newline, tok::verbatim_line_text);
560
157
  T.setVerbatimLineText(Text);
561
562
157
  State = LS_Normal;
563
157
}
564
565
100
void Lexer::lexHTMLCharacterReference(Token &T) {
566
100
  const char *TokenPtr = BufferPtr;
567
100
  assert(*TokenPtr == '&');
568
0
  TokenPtr++;
569
100
  if (TokenPtr == CommentEnd) {
570
1
    formTextToken(T, TokenPtr);
571
1
    return;
572
1
  }
573
99
  const char *NamePtr;
574
99
  bool isNamed = false;
575
99
  bool isDecimal = false;
576
99
  char C = *TokenPtr;
577
99
  if (isHTMLNamedCharacterReferenceCharacter(C)) {
578
54
    NamePtr = TokenPtr;
579
54
    TokenPtr = skipNamedCharacterReference(TokenPtr, CommentEnd);
580
54
    isNamed = true;
581
54
  } else 
if (45
C == '#'45
) {
582
44
    TokenPtr++;
583
44
    if (TokenPtr == CommentEnd) {
584
1
      formTextToken(T, TokenPtr);
585
1
      return;
586
1
    }
587
43
    C = *TokenPtr;
588
43
    if (isHTMLDecimalCharacterReferenceCharacter(C)) {
589
19
      NamePtr = TokenPtr;
590
19
      TokenPtr = skipDecimalCharacterReference(TokenPtr, CommentEnd);
591
19
      isDecimal = true;
592
24
    } else if (C == 'x' || 
C == 'X'7
) {
593
23
      TokenPtr++;
594
23
      NamePtr = TokenPtr;
595
23
      TokenPtr = skipHexCharacterReference(TokenPtr, CommentEnd);
596
23
    } else {
597
1
      formTextToken(T, TokenPtr);
598
1
      return;
599
1
    }
600
43
  } else {
601
1
    formTextToken(T, TokenPtr);
602
1
    return;
603
1
  }
604
96
  if (NamePtr == TokenPtr || 
TokenPtr == CommentEnd94
||
605
96
      
*TokenPtr != ';'91
) {
606
8
    formTextToken(T, TokenPtr);
607
8
    return;
608
8
  }
609
88
  StringRef Name(NamePtr, TokenPtr - NamePtr);
610
88
  TokenPtr++; // Skip semicolon.
611
88
  StringRef Resolved;
612
88
  if (isNamed)
613
52
    Resolved = resolveHTMLNamedCharacterReference(Name);
614
36
  else if (isDecimal)
615
17
    Resolved = resolveHTMLDecimalCharacterReference(Name);
616
19
  else
617
19
    Resolved = resolveHTMLHexCharacterReference(Name);
618
619
88
  if (Resolved.empty()) {
620
0
    formTextToken(T, TokenPtr);
621
0
    return;
622
0
  }
623
88
  formTokenWithChars(T, TokenPtr, tok::text);
624
88
  T.setText(Resolved);
625
88
}
626
627
3.11k
void Lexer::setupAndLexHTMLStartTag(Token &T) {
628
3.11k
  assert(BufferPtr[0] == '<' &&
629
3.11k
         isHTMLIdentifierStartingCharacter(BufferPtr[1]));
630
0
  const char *TagNameEnd = skipHTMLIdentifier(BufferPtr + 2, CommentEnd);
631
3.11k
  StringRef Name(BufferPtr + 1, TagNameEnd - (BufferPtr + 1));
632
3.11k
  if (!isHTMLTagName(Name)) {
633
2.89k
    formTextToken(T, TagNameEnd);
634
2.89k
    return;
635
2.89k
  }
636
637
219
  formTokenWithChars(T, TagNameEnd, tok::html_start_tag);
638
219
  T.setHTMLTagStartName(Name);
639
640
219
  BufferPtr = skipWhitespace(BufferPtr, CommentEnd);
641
642
219
  const char C = *BufferPtr;
643
219
  if (BufferPtr != CommentEnd &&
644
219
      
(216
C == '>'216
||
C == '/'101
||
isHTMLIdentifierStartingCharacter(C)84
))
645
206
    State = LS_HTMLStartTag;
646
219
}
647
648
380
void Lexer::lexHTMLStartTag(Token &T) {
649
380
  assert(State == LS_HTMLStartTag);
650
651
0
  const char *TokenPtr = BufferPtr;
652
380
  char C = *TokenPtr;
653
380
  if (isHTMLIdentifierCharacter(C)) {
654
86
    TokenPtr = skipHTMLIdentifier(TokenPtr, CommentEnd);
655
86
    StringRef Ident(BufferPtr, TokenPtr - BufferPtr);
656
86
    formTokenWithChars(T, TokenPtr, tok::html_ident);
657
86
    T.setHTMLIdent(Ident);
658
294
  } else {
659
294
    switch (C) {
660
72
    case '=':
661
72
      TokenPtr++;
662
72
      formTokenWithChars(T, TokenPtr, tok::html_equals);
663
72
      break;
664
43
    case '\"':
665
48
    case '\'': {
666
48
      const char *OpenQuote = TokenPtr;
667
48
      TokenPtr = skipHTMLQuotedString(TokenPtr, CommentEnd);
668
48
      const char *ClosingQuote = TokenPtr;
669
48
      if (TokenPtr != CommentEnd) // Skip closing quote.
670
42
        TokenPtr++;
671
48
      formTokenWithChars(T, TokenPtr, tok::html_quoted_string);
672
48
      T.setHTMLQuotedString(StringRef(OpenQuote + 1,
673
48
                                      ClosingQuote - (OpenQuote + 1)));
674
48
      break;
675
43
    }
676
157
    case '>':
677
157
      TokenPtr++;
678
157
      formTokenWithChars(T, TokenPtr, tok::html_greater);
679
157
      State = LS_Normal;
680
157
      return;
681
17
    case '/':
682
17
      TokenPtr++;
683
17
      if (TokenPtr != CommentEnd && *TokenPtr == '>') {
684
15
        TokenPtr++;
685
15
        formTokenWithChars(T, TokenPtr, tok::html_slash_greater);
686
15
      } else
687
2
        formTextToken(T, TokenPtr);
688
689
17
      State = LS_Normal;
690
17
      return;
691
294
    }
692
294
  }
693
694
  // Now look ahead and return to normal state if we don't see any HTML tokens
695
  // ahead.
696
206
  BufferPtr = skipWhitespace(BufferPtr, CommentEnd);
697
206
  if (BufferPtr == CommentEnd) {
698
23
    State = LS_Normal;
699
23
    return;
700
23
  }
701
702
183
  C = *BufferPtr;
703
183
  if (!isHTMLIdentifierStartingCharacter(C) &&
704
183
      
C != '='171
&&
C != '\"'99
&&
C != '\''56
&&
C != '>'51
) {
705
9
    State = LS_Normal;
706
9
    return;
707
9
  }
708
183
}
709
710
1.46k
void Lexer::setupAndLexHTMLEndTag(Token &T) {
711
1.46k
  assert(BufferPtr[0] == '<' && BufferPtr[1] == '/');
712
713
0
  const char *TagNameBegin = skipWhitespace(BufferPtr + 2, CommentEnd);
714
1.46k
  const char *TagNameEnd = skipHTMLIdentifier(TagNameBegin, CommentEnd);
715
1.46k
  StringRef Name(TagNameBegin, TagNameEnd - TagNameBegin);
716
1.46k
  if (!isHTMLTagName(Name)) {
717
1.34k
    formTextToken(T, TagNameEnd);
718
1.34k
    return;
719
1.34k
  }
720
721
120
  const char *End = skipWhitespace(TagNameEnd, CommentEnd);
722
723
120
  formTokenWithChars(T, End, tok::html_end_tag);
724
120
  T.setHTMLTagEndName(Name);
725
726
120
  if (BufferPtr != CommentEnd && 
*BufferPtr == '>'118
)
727
118
    State = LS_HTMLEndTag;
728
120
}
729
730
118
void Lexer::lexHTMLEndTag(Token &T) {
731
118
  assert(BufferPtr != CommentEnd && *BufferPtr == '>');
732
733
0
  formTokenWithChars(T, BufferPtr + 1, tok::html_greater);
734
118
  State = LS_Normal;
735
118
}
736
737
Lexer::Lexer(llvm::BumpPtrAllocator &Allocator, DiagnosticsEngine &Diags,
738
             const CommandTraits &Traits, SourceLocation FileLoc,
739
             const char *BufferStart, const char *BufferEnd, bool ParseCommands)
740
    : Allocator(Allocator), Diags(Diags), Traits(Traits),
741
      BufferStart(BufferStart), BufferEnd(BufferEnd), BufferPtr(BufferStart),
742
      FileLoc(FileLoc), ParseCommands(ParseCommands),
743
4.32k
      CommentState(LCS_BeforeComment), State(LS_Normal) {}
744
745
89.8k
void Lexer::lex(Token &T) {
746
144k
again:
747
144k
  switch (CommentState) {
748
34.6k
  case LCS_BeforeComment:
749
34.6k
    if (BufferPtr == BufferEnd) {
750
6.94k
      formTokenWithChars(T, BufferPtr, tok::eof);
751
6.94k
      return;
752
6.94k
    }
753
754
27.7k
    assert(*BufferPtr == '/');
755
0
    BufferPtr++; // Skip first slash.
756
27.7k
    switch(*BufferPtr) {
757
26.9k
    case '/': { // BCPL comment.
758
26.9k
      BufferPtr++; // Skip second slash.
759
760
26.9k
      if (BufferPtr != BufferEnd) {
761
        // Skip Doxygen magic marker, if it is present.
762
        // It might be missing because of a typo //< or /*<, or because we
763
        // merged this non-Doxygen comment into a bunch of Doxygen comments
764
        // around it: /** ... */ /* ... */ /** ... */
765
26.9k
        const char C = *BufferPtr;
766
26.9k
        if (C == '/' || 
C == '!'303
)
767
26.7k
          BufferPtr++;
768
26.9k
      }
769
770
      // Skip less-than symbol that marks trailing comments.
771
      // Skip it even if the comment is not a Doxygen one, because //< and /*<
772
      // are frequent typos.
773
26.9k
      if (BufferPtr != BufferEnd && 
*BufferPtr == '<'26.9k
)
774
138
        BufferPtr++;
775
776
26.9k
      CommentState = LCS_InsideBCPLComment;
777
26.9k
      if (State != LS_VerbatimBlockBody && 
State != LS_VerbatimBlockFirstLine26.1k
)
778
26.1k
        State = LS_Normal;
779
26.9k
      CommentEnd = findBCPLCommentEnd(BufferPtr, BufferEnd);
780
26.9k
      goto again;
781
0
    }
782
747
    case '*': { // C comment.
783
747
      BufferPtr++; // Skip star.
784
785
      // Skip Doxygen magic marker.
786
747
      const char C = *BufferPtr;
787
747
      if ((C == '*' && 
*(BufferPtr + 1) != '/'596
) ||
C == '!'152
)
788
722
        BufferPtr++;
789
790
      // Skip less-than symbol that marks trailing comments.
791
747
      if (BufferPtr != BufferEnd && *BufferPtr == '<')
792
40
        BufferPtr++;
793
794
747
      CommentState = LCS_InsideCComment;
795
747
      State = LS_Normal;
796
747
      CommentEnd = findCCommentEnd(BufferPtr, BufferEnd);
797
747
      goto again;
798
0
    }
799
0
    default:
800
0
      llvm_unreachable("second character of comment should be '/' or '*'");
801
27.7k
    }
802
803
27.6k
  case LCS_BetweenComments: {
804
    // Consecutive comments are extracted only if there is only whitespace
805
    // between them.  So we can search for the start of the next comment.
806
27.6k
    const char *EndWhitespace = BufferPtr;
807
51.3k
    while(EndWhitespace != BufferEnd && 
*EndWhitespace != '/'47.1k
)
808
23.7k
      EndWhitespace++;
809
810
    // Turn any whitespace between comments (and there is only whitespace
811
    // between them -- guaranteed by comment extraction) into a newline.  We
812
    // have two newlines between C comments in total (first one was synthesized
813
    // after a comment).
814
27.6k
    formTokenWithChars(T, EndWhitespace, tok::newline);
815
816
27.6k
    CommentState = LCS_BeforeComment;
817
27.6k
    break;
818
27.7k
  }
819
820
75.8k
  case LCS_InsideBCPLComment:
821
82.2k
  case LCS_InsideCComment:
822
82.2k
    if (BufferPtr != CommentEnd) {
823
54.5k
      lexCommentText(T);
824
54.5k
      break;
825
54.5k
    } else {
826
      // Skip C comment closing sequence.
827
27.6k
      if (CommentState == LCS_InsideCComment) {
828
744
        assert(BufferPtr[0] == '*' && BufferPtr[1] == '/');
829
0
        BufferPtr += 2;
830
744
        assert(BufferPtr <= BufferEnd);
831
832
        // Synthenize newline just after the C comment, regardless if there is
833
        // actually a newline.
834
0
        formTokenWithChars(T, BufferPtr, tok::newline);
835
836
744
        CommentState = LCS_BetweenComments;
837
744
        break;
838
26.9k
      } else {
839
        // Don't synthesized a newline after BCPL comment.
840
26.9k
        CommentState = LCS_BetweenComments;
841
26.9k
        goto again;
842
26.9k
      }
843
27.6k
    }
844
144k
  }
845
144k
}
846
847
StringRef Lexer::getSpelling(const Token &Tok,
848
42
                             const SourceManager &SourceMgr) const {
849
42
  SourceLocation Loc = Tok.getLocation();
850
42
  std::pair<FileID, unsigned> LocInfo = SourceMgr.getDecomposedLoc(Loc);
851
852
42
  bool InvalidTemp = false;
853
42
  StringRef File = SourceMgr.getBufferData(LocInfo.first, &InvalidTemp);
854
42
  if (InvalidTemp)
855
0
    return StringRef();
856
857
42
  const char *Begin = File.data() + LocInfo.second;
858
42
  return StringRef(Begin, Tok.getLength());
859
42
}
860
861
} // end namespace comments
862
} // end namespace clang