/Users/buildslave/jenkins/workspace/coverage/llvm-project/clang/lib/AST/CommentLexer.cpp
Line | Count | Source (jump to first uncovered line) |
1 | | //===--- CommentLexer.cpp -------------------------------------------------===// |
2 | | // |
3 | | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
4 | | // See https://llvm.org/LICENSE.txt for license information. |
5 | | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
6 | | // |
7 | | //===----------------------------------------------------------------------===// |
8 | | |
9 | | #include "clang/AST/CommentLexer.h" |
10 | | #include "clang/AST/CommentCommandTraits.h" |
11 | | #include "clang/AST/CommentDiagnostic.h" |
12 | | #include "clang/Basic/CharInfo.h" |
13 | | #include "llvm/ADT/StringExtras.h" |
14 | | #include "llvm/ADT/StringSwitch.h" |
15 | | #include "llvm/Support/ConvertUTF.h" |
16 | | #include "llvm/Support/ErrorHandling.h" |
17 | | |
18 | | namespace clang { |
19 | | namespace comments { |
20 | | |
21 | 0 | void Token::dump(const Lexer &L, const SourceManager &SM) const { |
22 | 0 | llvm::errs() << "comments::Token Kind=" << Kind << " "; |
23 | 0 | Loc.print(llvm::errs(), SM); |
24 | 0 | llvm::errs() << " " << Length << " \"" << L.getSpelling(*this, SM) << "\"\n"; |
25 | 0 | } |
26 | | |
27 | 341 | static inline bool isHTMLNamedCharacterReferenceCharacter(char C) { |
28 | 341 | return isLetter(C); |
29 | 341 | } |
30 | | |
31 | 165 | static inline bool isHTMLDecimalCharacterReferenceCharacter(char C) { |
32 | 165 | return isDigit(C); |
33 | 165 | } |
34 | | |
35 | 101 | static inline bool isHTMLHexCharacterReferenceCharacter(char C) { |
36 | 101 | return isHexDigit(C); |
37 | 101 | } |
38 | | |
39 | | static inline StringRef convertCodePointToUTF8( |
40 | | llvm::BumpPtrAllocator &Allocator, |
41 | 36 | unsigned CodePoint) { |
42 | 36 | char *Resolved = Allocator.Allocate<char>(UNI_MAX_UTF8_BYTES_PER_CODE_POINT); |
43 | 36 | char *ResolvedPtr = Resolved; |
44 | 36 | if (llvm::ConvertCodePointToUTF8(CodePoint, ResolvedPtr)) |
45 | 36 | return StringRef(Resolved, ResolvedPtr - Resolved); |
46 | 0 | else |
47 | 0 | return StringRef(); |
48 | 36 | } |
49 | | |
50 | | namespace { |
51 | | |
52 | | #include "clang/AST/CommentHTMLTags.inc" |
53 | | #include "clang/AST/CommentHTMLNamedCharacterReferences.inc" |
54 | | |
55 | | } // end anonymous namespace |
56 | | |
57 | 52 | StringRef Lexer::resolveHTMLNamedCharacterReference(StringRef Name) const { |
58 | | // Fast path, first check a few most widely used named character references. |
59 | 52 | return llvm::StringSwitch<StringRef>(Name) |
60 | 52 | .Case("amp", "&") |
61 | 52 | .Case("lt", "<") |
62 | 52 | .Case("gt", ">") |
63 | 52 | .Case("quot", "\"") |
64 | 52 | .Case("apos", "\'") |
65 | | // Slow path. |
66 | 52 | .Default(translateHTMLNamedCharacterReferenceToUTF8(Name)); |
67 | 52 | } |
68 | | |
69 | 17 | StringRef Lexer::resolveHTMLDecimalCharacterReference(StringRef Name) const { |
70 | 17 | unsigned CodePoint = 0; |
71 | 67 | for (unsigned i = 0, e = Name.size(); i != e; ++i50 ) { |
72 | 50 | assert(isHTMLDecimalCharacterReferenceCharacter(Name[i])); |
73 | 50 | CodePoint *= 10; |
74 | 50 | CodePoint += Name[i] - '0'; |
75 | 50 | } |
76 | 17 | return convertCodePointToUTF8(Allocator, CodePoint); |
77 | 17 | } |
78 | | |
79 | 19 | StringRef Lexer::resolveHTMLHexCharacterReference(StringRef Name) const { |
80 | 19 | unsigned CodePoint = 0; |
81 | 57 | for (unsigned i = 0, e = Name.size(); i != e; ++i38 ) { |
82 | 38 | CodePoint *= 16; |
83 | 38 | const char C = Name[i]; |
84 | 38 | assert(isHTMLHexCharacterReferenceCharacter(C)); |
85 | 38 | CodePoint += llvm::hexDigitValue(C); |
86 | 38 | } |
87 | 19 | return convertCodePointToUTF8(Allocator, CodePoint); |
88 | 19 | } |
89 | | |
90 | 2.12k | void Lexer::skipLineStartingDecorations() { |
91 | | // This function should be called only for C comments |
92 | 2.12k | assert(CommentState == LCS_InsideCComment); |
93 | | |
94 | 2.12k | if (BufferPtr == CommentEnd) |
95 | 381 | return; |
96 | | |
97 | 1.74k | const char *NewBufferPtr = BufferPtr; |
98 | 3.37k | while (isHorizontalWhitespace(*NewBufferPtr)) |
99 | 1.80k | if (++NewBufferPtr == CommentEnd) |
100 | 183 | return; |
101 | 1.56k | if (*NewBufferPtr == '*') |
102 | 1.07k | BufferPtr = NewBufferPtr + 1; |
103 | 1.56k | } |
104 | | |
105 | | namespace { |
106 | | /// Returns pointer to the first newline character in the string. |
107 | 2.41k | const char *findNewline(const char *BufferPtr, const char *BufferEnd) { |
108 | 66.5k | for ( ; BufferPtr != BufferEnd; ++BufferPtr64.1k ) { |
109 | 64.2k | if (isVerticalWhitespace(*BufferPtr)) |
110 | 126 | return BufferPtr; |
111 | 64.2k | } |
112 | 2.29k | return BufferEnd; |
113 | 2.41k | } |
114 | | |
/// Skips one line terminator ("\n", "\r" or "\r\n") starting at \p BufferPtr.
///
/// \p BufferPtr must either equal \p BufferEnd (nothing is skipped) or point at
/// a '\n' or '\r' character.
///
/// \returns a pointer just past the line terminator.
const char *skipNewline(const char *BufferPtr, const char *BufferEnd) {
  if (BufferPtr == BufferEnd)
    return BufferPtr;

  const char First = *BufferPtr++;
  if (First == '\n')
    return BufferPtr;

  assert(First == '\r');
  // Treat "\r\n" as a single line terminator.
  if (BufferPtr != BufferEnd && *BufferPtr == '\n')
    ++BufferPtr;
  return BufferPtr;
}
129 | | |
130 | | const char *skipNamedCharacterReference(const char *BufferPtr, |
131 | 54 | const char *BufferEnd) { |
132 | 243 | for ( ; BufferPtr != BufferEnd; ++BufferPtr189 ) { |
133 | 242 | if (!isHTMLNamedCharacterReferenceCharacter(*BufferPtr)) |
134 | 53 | return BufferPtr; |
135 | 242 | } |
136 | 1 | return BufferEnd; |
137 | 54 | } |
138 | | |
139 | | const char *skipDecimalCharacterReference(const char *BufferPtr, |
140 | 19 | const char *BufferEnd) { |
141 | 73 | for ( ; BufferPtr != BufferEnd; ++BufferPtr54 ) { |
142 | 72 | if (!isHTMLDecimalCharacterReferenceCharacter(*BufferPtr)) |
143 | 18 | return BufferPtr; |
144 | 72 | } |
145 | 1 | return BufferEnd; |
146 | 19 | } |
147 | | |
148 | | const char *skipHexCharacterReference(const char *BufferPtr, |
149 | 23 | const char *BufferEnd) { |
150 | 65 | for ( ; BufferPtr != BufferEnd; ++BufferPtr42 ) { |
151 | 63 | if (!isHTMLHexCharacterReferenceCharacter(*BufferPtr)) |
152 | 21 | return BufferPtr; |
153 | 63 | } |
154 | 2 | return BufferEnd; |
155 | 23 | } |
156 | | |
157 | 8.68k | bool isHTMLIdentifierStartingCharacter(char C) { |
158 | 8.68k | return isLetter(C); |
159 | 8.68k | } |
160 | | |
161 | 21.9k | bool isHTMLIdentifierCharacter(char C) { |
162 | 21.9k | return isAlphanumeric(C); |
163 | 21.9k | } |
164 | | |
165 | 4.99k | const char *skipHTMLIdentifier(const char *BufferPtr, const char *BufferEnd) { |
166 | 21.4k | for ( ; BufferPtr != BufferEnd; ++BufferPtr16.4k ) { |
167 | 21.4k | if (!isHTMLIdentifierCharacter(*BufferPtr)) |
168 | 4.98k | return BufferPtr; |
169 | 21.4k | } |
170 | 11 | return BufferEnd; |
171 | 4.99k | } |
172 | | |
/// Skip an HTML string quoted in single or double quotes. A quote character
/// that is directly preceded by a backslash does not terminate the string.
///
/// \p BufferPtr must point at the opening quote.
///
/// \returns a pointer to the closing quote, or BufferEnd if the string is not
/// terminated within the range.
const char *skipHTMLQuotedString(const char *BufferPtr, const char *BufferEnd)
{
  const char Quote = *BufferPtr;
  assert(Quote == '\"' || Quote == '\'');

  for (const char *Cur = BufferPtr + 1; Cur != BufferEnd; ++Cur) {
    // Cur[-1] is always readable: at worst it is the opening quote itself.
    const bool Escaped = Cur[-1] == '\\';
    if (*Cur == Quote && !Escaped)
      return Cur;
  }
  return BufferEnd;
}
190 | | |
191 | 2.43k | const char *skipWhitespace(const char *BufferPtr, const char *BufferEnd) { |
192 | 2.89k | for ( ; BufferPtr != BufferEnd; ++BufferPtr462 ) { |
193 | 2.62k | if (!isWhitespace(*BufferPtr)) |
194 | 2.16k | return BufferPtr; |
195 | 2.62k | } |
196 | 272 | return BufferEnd; |
197 | 2.43k | } |
198 | | |
199 | 280 | bool isWhitespace(const char *BufferPtr, const char *BufferEnd) { |
200 | 280 | return skipWhitespace(BufferPtr, BufferEnd) == BufferEnd; |
201 | 280 | } |
202 | | |
203 | 11.9k | bool isCommandNameStartCharacter(char C) { |
204 | 11.9k | return isLetter(C); |
205 | 11.9k | } |
206 | | |
207 | 76.0k | bool isCommandNameCharacter(char C) { |
208 | 76.0k | return isAlphanumeric(C); |
209 | 76.0k | } |
210 | | |
211 | 11.9k | const char *skipCommandName(const char *BufferPtr, const char *BufferEnd) { |
212 | 76.9k | for ( ; BufferPtr != BufferEnd; ++BufferPtr65.0k ) { |
213 | 76.0k | if (!isCommandNameCharacter(*BufferPtr)) |
214 | 11.0k | return BufferPtr; |
215 | 76.0k | } |
216 | 901 | return BufferEnd; |
217 | 11.9k | } |
218 | | |
219 | | /// Return the one past end pointer for BCPL comments. |
220 | | /// Handles newlines escaped with backslash or trigraph for backslahs. |
221 | 31.6k | const char *findBCPLCommentEnd(const char *BufferPtr, const char *BufferEnd) { |
222 | 31.6k | const char *CurPtr = BufferPtr; |
223 | 31.6k | while (CurPtr != BufferEnd) { |
224 | 1.12M | while (!isVerticalWhitespace(*CurPtr)) { |
225 | 1.09M | CurPtr++; |
226 | 1.09M | if (CurPtr == BufferEnd) |
227 | 3.86k | return BufferEnd; |
228 | 1.09M | } |
229 | | // We found a newline, check if it is escaped. |
230 | 27.7k | const char *EscapePtr = CurPtr - 1; |
231 | 27.7k | while(isHorizontalWhitespace(*EscapePtr)) |
232 | 12 | EscapePtr--; |
233 | | |
234 | 27.7k | if (*EscapePtr == '\\' || |
235 | 27.7k | (27.7k EscapePtr - 2 >= BufferPtr27.7k && EscapePtr[0] == '/'21.9k && |
236 | 27.7k | EscapePtr[-1] == '?'3 && EscapePtr[-2] == '?'3 )) { |
237 | | // We found an escaped newline. |
238 | 9 | CurPtr = skipNewline(CurPtr, BufferEnd); |
239 | 9 | } else |
240 | 27.7k | return CurPtr; // Not an escaped newline. |
241 | 27.7k | } |
242 | 42 | return BufferEnd; |
243 | 31.6k | } |
244 | | |
245 | | /// Return the one past end pointer for C comments. |
246 | | /// Very dumb, does not handle escaped newlines or trigraphs. |
247 | 771 | const char *findCCommentEnd(const char *BufferPtr, const char *BufferEnd) { |
248 | 43.6k | for ( ; BufferPtr != BufferEnd; ++BufferPtr42.8k ) { |
249 | 43.6k | if (*BufferPtr == '*') { |
250 | 1.86k | assert(BufferPtr + 1 != BufferEnd); |
251 | 1.86k | if (*(BufferPtr + 1) == '/') |
252 | 771 | return BufferPtr; |
253 | 1.86k | } |
254 | 43.6k | } |
255 | 0 | llvm_unreachable("buffer end hit before '*/' was seen"); |
256 | 0 | } |
257 | | |
258 | | } // end anonymous namespace |
259 | | |
260 | | void Lexer::formTokenWithChars(Token &Result, const char *TokEnd, |
261 | 102k | tok::TokenKind Kind) { |
262 | 102k | const unsigned TokLen = TokEnd - BufferPtr; |
263 | 102k | Result.setLocation(getSourceLocation(BufferPtr)); |
264 | 102k | Result.setKind(Kind); |
265 | 102k | Result.setLength(TokLen); |
266 | 102k | #ifndef NDEBUG |
267 | 102k | Result.TextPtr = "<UNSET>"; |
268 | 102k | Result.IntVal = 7; |
269 | 102k | #endif |
270 | 102k | BufferPtr = TokEnd; |
271 | 102k | } |
272 | | |
273 | 40.3k | const char *Lexer::skipTextToken() { |
274 | 40.3k | const char *TokenPtr = BufferPtr; |
275 | 40.3k | assert(TokenPtr < CommentEnd); |
276 | 40.3k | StringRef TokStartSymbols = ParseCommands ? "\n\r\\@\"&<"40.1k : "\n\r"155 ; |
277 | | |
278 | 40.3k | again: |
279 | 40.3k | size_t End = |
280 | 40.3k | StringRef(TokenPtr, CommentEnd - TokenPtr).find_first_of(TokStartSymbols); |
281 | 40.3k | if (End == StringRef::npos) |
282 | 22.8k | return CommentEnd; |
283 | | |
284 | | // Doxygen doesn't recognize any commands in a one-line double quotation. |
285 | | // If we don't find an ending quotation mark, we pretend it never began. |
286 | 17.4k | if (*(TokenPtr + End) == '\"') { |
287 | 67 | TokenPtr += End + 1; |
288 | 67 | End = StringRef(TokenPtr, CommentEnd - TokenPtr).find_first_of("\n\r\""); |
289 | 67 | if (End != StringRef::npos && *(TokenPtr + End) == '\"'53 ) |
290 | 53 | TokenPtr += End + 1; |
291 | 67 | goto again; |
292 | 67 | } |
293 | 17.4k | return TokenPtr + End; |
294 | 17.4k | } |
295 | | |
296 | 62.2k | void Lexer::lexCommentText(Token &T) { |
297 | 62.2k | assert(CommentState == LCS_InsideBCPLComment || |
298 | 62.2k | CommentState == LCS_InsideCComment); |
299 | | |
300 | | // Handles lexing non-command text, i.e. text and newline. |
301 | 62.2k | auto HandleNonCommandToken = [&]() -> void { |
302 | 42.4k | assert(State == LS_Normal); |
303 | | |
304 | 42.4k | const char *TokenPtr = BufferPtr; |
305 | 42.4k | assert(TokenPtr < CommentEnd); |
306 | 42.4k | switch (*TokenPtr) { |
307 | 2.08k | case '\n': |
308 | 2.08k | case '\r': |
309 | 2.08k | TokenPtr = skipNewline(TokenPtr, CommentEnd); |
310 | 2.08k | formTokenWithChars(T, TokenPtr, tok::newline); |
311 | | |
312 | 2.08k | if (CommentState == LCS_InsideCComment) |
313 | 2.07k | skipLineStartingDecorations(); |
314 | 2.08k | return; |
315 | | |
316 | 40.3k | default: |
317 | 40.3k | return formTextToken(T, skipTextToken()); |
318 | 42.4k | } |
319 | 42.4k | }; |
320 | | |
321 | 62.2k | if (!ParseCommands) |
322 | 173 | return HandleNonCommandToken(); |
323 | | |
324 | 62.1k | switch (State) { |
325 | 59.3k | case LS_Normal: |
326 | 59.3k | break; |
327 | 288 | case LS_VerbatimBlockFirstLine: |
328 | 288 | lexVerbatimBlockFirstLine(T); |
329 | 288 | return; |
330 | 1.73k | case LS_VerbatimBlockBody: |
331 | 1.73k | lexVerbatimBlockBody(T); |
332 | 1.73k | return; |
333 | 151 | case LS_VerbatimLineText: |
334 | 151 | lexVerbatimLineText(T); |
335 | 151 | return; |
336 | 508 | case LS_HTMLStartTag: |
337 | 508 | lexHTMLStartTag(T); |
338 | 508 | return; |
339 | 118 | case LS_HTMLEndTag: |
340 | 118 | lexHTMLEndTag(T); |
341 | 118 | return; |
342 | 62.1k | } |
343 | | |
344 | 59.3k | assert(State == LS_Normal); |
345 | 59.3k | const char *TokenPtr = BufferPtr; |
346 | 59.3k | assert(TokenPtr < CommentEnd); |
347 | 59.3k | switch(*TokenPtr) { |
348 | 11.4k | case '\\': |
349 | 12.0k | case '@': { |
350 | | // Commands that start with a backslash and commands that start with |
351 | | // 'at' have equivalent semantics. But we keep information about the |
352 | | // exact syntax in AST for comments. |
353 | 12.0k | tok::TokenKind CommandKind = |
354 | 12.0k | (*TokenPtr == '@') ? tok::at_command596 : tok::backslash_command11.4k ; |
355 | 12.0k | TokenPtr++; |
356 | 12.0k | if (TokenPtr == CommentEnd) { |
357 | 12 | formTextToken(T, TokenPtr); |
358 | 12 | return; |
359 | 12 | } |
360 | 12.0k | char C = *TokenPtr; |
361 | 12.0k | switch (C) { |
362 | 11.9k | default: |
363 | 11.9k | break; |
364 | | |
365 | 11.9k | case '\\': 6 case '@': 12 case '&': 18 case '$': |
366 | 50 | case '#': 31 case '<': 37 case '>': 43 case '%': |
367 | 68 | case '\"': 56 case '.': 62 case ':': |
368 | | // This is one of \\ \@ \& \$ etc escape sequences. |
369 | 68 | TokenPtr++; |
370 | 68 | if (C == ':' && TokenPtr != CommentEnd6 && *TokenPtr == ':'6 ) { |
371 | | // This is the \:: escape sequence. |
372 | 6 | TokenPtr++; |
373 | 6 | } |
374 | 68 | StringRef UnescapedText(BufferPtr + 1, TokenPtr - (BufferPtr + 1)); |
375 | 68 | formTokenWithChars(T, TokenPtr, tok::text); |
376 | 68 | T.setText(UnescapedText); |
377 | 68 | return; |
378 | 12.0k | } |
379 | | |
380 | | // Don't make zero-length commands. |
381 | 11.9k | if (!isCommandNameStartCharacter(*TokenPtr)) { |
382 | 12 | formTextToken(T, TokenPtr); |
383 | 12 | return; |
384 | 12 | } |
385 | | |
386 | 11.9k | TokenPtr = skipCommandName(TokenPtr, CommentEnd); |
387 | 11.9k | unsigned Length = TokenPtr - (BufferPtr + 1); |
388 | | |
389 | | // Hardcoded support for lexing LaTeX formula commands |
390 | | // \f$ \f( \f) \f[ \f] \f{ \f} as a single command. |
391 | 11.9k | if (Length == 1 && TokenPtr[-1] == 'f'2.01k && TokenPtr != CommentEnd34 ) { |
392 | 34 | C = *TokenPtr; |
393 | 34 | if (C == '$' || C == '('17 || C == ')'12 || C == '['12 || C == ']'7 || |
394 | 34 | C == '{'7 || C == '}'2 ) { |
395 | 32 | TokenPtr++; |
396 | 32 | Length++; |
397 | 32 | } |
398 | 34 | } |
399 | | |
400 | 11.9k | StringRef CommandName(BufferPtr + 1, Length); |
401 | | |
402 | 11.9k | const CommandInfo *Info = Traits.getCommandInfoOrNULL(CommandName); |
403 | 11.9k | if (!Info) { |
404 | 314 | if ((Info = Traits.getTypoCorrectCommandInfo(CommandName))) { |
405 | 11 | StringRef CorrectedName = Info->Name; |
406 | 11 | SourceLocation Loc = getSourceLocation(BufferPtr); |
407 | 11 | SourceLocation EndLoc = getSourceLocation(TokenPtr); |
408 | 11 | SourceRange FullRange = SourceRange(Loc, EndLoc); |
409 | 11 | SourceRange CommandRange(Loc.getLocWithOffset(1), EndLoc); |
410 | 11 | Diag(Loc, diag::warn_correct_comment_command_name) |
411 | 11 | << FullRange << CommandName << CorrectedName |
412 | 11 | << FixItHint::CreateReplacement(CommandRange, CorrectedName); |
413 | 303 | } else { |
414 | 303 | formTokenWithChars(T, TokenPtr, tok::unknown_command); |
415 | 303 | T.setUnknownCommandName(CommandName); |
416 | 303 | Diag(T.getLocation(), diag::warn_unknown_comment_command_name) |
417 | 303 | << SourceRange(T.getLocation(), T.getEndLocation()); |
418 | 303 | return; |
419 | 303 | } |
420 | 314 | } |
421 | 11.6k | if (Info->IsVerbatimBlockCommand) { |
422 | 300 | setupAndLexVerbatimBlock(T, TokenPtr, *BufferPtr, Info); |
423 | 300 | return; |
424 | 300 | } |
425 | 11.3k | if (Info->IsVerbatimLineCommand) { |
426 | 155 | setupAndLexVerbatimLine(T, TokenPtr, Info); |
427 | 155 | return; |
428 | 155 | } |
429 | 11.2k | formTokenWithChars(T, TokenPtr, CommandKind); |
430 | 11.2k | T.setCommandID(Info->getID()); |
431 | 11.2k | return; |
432 | 11.3k | } |
433 | | |
434 | 100 | case '&': |
435 | 100 | lexHTMLCharacterReference(T); |
436 | 100 | return; |
437 | | |
438 | 4.90k | case '<': { |
439 | 4.90k | TokenPtr++; |
440 | 4.90k | if (TokenPtr == CommentEnd) { |
441 | 1 | formTextToken(T, TokenPtr); |
442 | 1 | return; |
443 | 1 | } |
444 | 4.90k | const char C = *TokenPtr; |
445 | 4.90k | if (isHTMLIdentifierStartingCharacter(C)) |
446 | 3.39k | setupAndLexHTMLStartTag(T); |
447 | 1.50k | else if (C == '/') |
448 | 1.48k | setupAndLexHTMLEndTag(T); |
449 | 22 | else |
450 | 22 | formTextToken(T, TokenPtr); |
451 | 4.90k | return; |
452 | 4.90k | } |
453 | | |
454 | 42.2k | default: |
455 | 42.2k | return HandleNonCommandToken(); |
456 | 59.3k | } |
457 | 59.3k | } |
458 | | |
459 | | void Lexer::setupAndLexVerbatimBlock(Token &T, |
460 | | const char *TextBegin, |
461 | 300 | char Marker, const CommandInfo *Info) { |
462 | 300 | assert(Info->IsVerbatimBlockCommand); |
463 | | |
464 | 300 | VerbatimBlockEndCommandName.clear(); |
465 | 300 | VerbatimBlockEndCommandName.append(Marker == '\\' ? "\\"294 : "@"6 ); |
466 | 300 | VerbatimBlockEndCommandName.append(Info->EndCommandName); |
467 | | |
468 | 300 | formTokenWithChars(T, TextBegin, tok::verbatim_block_begin); |
469 | 300 | T.setVerbatimBlockID(Info->getID()); |
470 | | |
471 | | // If there is a newline following the verbatim opening command, skip the |
472 | | // newline so that we don't create an tok::verbatim_block_line with empty |
473 | | // text content. |
474 | 300 | if (BufferPtr != CommentEnd && |
475 | 300 | isVerticalWhitespace(*BufferPtr)198 ) { |
476 | 10 | BufferPtr = skipNewline(BufferPtr, CommentEnd); |
477 | 10 | State = LS_VerbatimBlockBody; |
478 | 10 | return; |
479 | 10 | } |
480 | | |
481 | 290 | State = LS_VerbatimBlockFirstLine; |
482 | 290 | } |
483 | | |
484 | 2.02k | void Lexer::lexVerbatimBlockFirstLine(Token &T) { |
485 | 2.26k | again: |
486 | 2.26k | assert(BufferPtr < CommentEnd); |
487 | | |
488 | | // FIXME: It would be better to scan the text once, finding either the block |
489 | | // end command or newline. |
490 | | // |
491 | | // Extract current line. |
492 | 2.26k | const char *Newline = findNewline(BufferPtr, CommentEnd); |
493 | 2.26k | StringRef Line(BufferPtr, Newline - BufferPtr); |
494 | | |
495 | | // Look for end command in current line. |
496 | 2.26k | size_t Pos = Line.find(VerbatimBlockEndCommandName); |
497 | 2.26k | const char *TextEnd; |
498 | 2.26k | const char *NextLine; |
499 | 2.26k | if (Pos == StringRef::npos) { |
500 | | // Current line is completely verbatim. |
501 | 1.69k | TextEnd = Newline; |
502 | 1.69k | NextLine = skipNewline(Newline, CommentEnd); |
503 | 1.69k | } else if (567 Pos == 0567 ) { |
504 | | // Current line contains just an end command. |
505 | 287 | const char *End = BufferPtr + VerbatimBlockEndCommandName.size(); |
506 | 287 | StringRef Name(BufferPtr + 1, End - (BufferPtr + 1)); |
507 | 287 | formTokenWithChars(T, End, tok::verbatim_block_end); |
508 | 287 | T.setVerbatimBlockID(Traits.getCommandInfo(Name)->getID()); |
509 | 287 | State = LS_Normal; |
510 | 287 | return; |
511 | 287 | } else { |
512 | | // There is some text, followed by end command. Extract text first. |
513 | 280 | TextEnd = BufferPtr + Pos; |
514 | 280 | NextLine = TextEnd; |
515 | | // If there is only whitespace before end command, skip whitespace. |
516 | 280 | if (isWhitespace(BufferPtr, TextEnd)) { |
517 | 243 | BufferPtr = TextEnd; |
518 | 243 | goto again; |
519 | 243 | } |
520 | 280 | } |
521 | | |
522 | 1.73k | StringRef Text(BufferPtr, TextEnd - BufferPtr); |
523 | 1.73k | formTokenWithChars(T, NextLine, tok::verbatim_block_line); |
524 | 1.73k | T.setVerbatimBlockText(Text); |
525 | | |
526 | 1.73k | State = LS_VerbatimBlockBody; |
527 | 1.73k | } |
528 | | |
529 | 1.73k | void Lexer::lexVerbatimBlockBody(Token &T) { |
530 | 1.73k | assert(State == LS_VerbatimBlockBody); |
531 | | |
532 | 1.73k | if (CommentState == LCS_InsideCComment) |
533 | 50 | skipLineStartingDecorations(); |
534 | | |
535 | 1.73k | if (BufferPtr == CommentEnd) { |
536 | 4 | formTokenWithChars(T, BufferPtr, tok::verbatim_block_line); |
537 | 4 | T.setVerbatimBlockText(""); |
538 | 4 | return; |
539 | 4 | } |
540 | | |
541 | 1.73k | lexVerbatimBlockFirstLine(T); |
542 | 1.73k | } |
543 | | |
544 | | void Lexer::setupAndLexVerbatimLine(Token &T, const char *TextBegin, |
545 | 155 | const CommandInfo *Info) { |
546 | 155 | assert(Info->IsVerbatimLineCommand); |
547 | 155 | formTokenWithChars(T, TextBegin, tok::verbatim_line_name); |
548 | 155 | T.setVerbatimLineID(Info->getID()); |
549 | | |
550 | 155 | State = LS_VerbatimLineText; |
551 | 155 | } |
552 | | |
553 | 151 | void Lexer::lexVerbatimLineText(Token &T) { |
554 | 151 | assert(State == LS_VerbatimLineText); |
555 | | |
556 | | // Extract current line. |
557 | 151 | const char *Newline = findNewline(BufferPtr, CommentEnd); |
558 | 151 | StringRef Text(BufferPtr, Newline - BufferPtr); |
559 | 151 | formTokenWithChars(T, Newline, tok::verbatim_line_text); |
560 | 151 | T.setVerbatimLineText(Text); |
561 | | |
562 | 151 | State = LS_Normal; |
563 | 151 | } |
564 | | |
565 | 100 | void Lexer::lexHTMLCharacterReference(Token &T) { |
566 | 100 | const char *TokenPtr = BufferPtr; |
567 | 100 | assert(*TokenPtr == '&'); |
568 | 100 | TokenPtr++; |
569 | 100 | if (TokenPtr == CommentEnd) { |
570 | 1 | formTextToken(T, TokenPtr); |
571 | 1 | return; |
572 | 1 | } |
573 | 99 | const char *NamePtr; |
574 | 99 | bool isNamed = false; |
575 | 99 | bool isDecimal = false; |
576 | 99 | char C = *TokenPtr; |
577 | 99 | if (isHTMLNamedCharacterReferenceCharacter(C)) { |
578 | 54 | NamePtr = TokenPtr; |
579 | 54 | TokenPtr = skipNamedCharacterReference(TokenPtr, CommentEnd); |
580 | 54 | isNamed = true; |
581 | 54 | } else if (45 C == '#'45 ) { |
582 | 44 | TokenPtr++; |
583 | 44 | if (TokenPtr == CommentEnd) { |
584 | 1 | formTextToken(T, TokenPtr); |
585 | 1 | return; |
586 | 1 | } |
587 | 43 | C = *TokenPtr; |
588 | 43 | if (isHTMLDecimalCharacterReferenceCharacter(C)) { |
589 | 19 | NamePtr = TokenPtr; |
590 | 19 | TokenPtr = skipDecimalCharacterReference(TokenPtr, CommentEnd); |
591 | 19 | isDecimal = true; |
592 | 24 | } else if (C == 'x' || C == 'X'7 ) { |
593 | 23 | TokenPtr++; |
594 | 23 | NamePtr = TokenPtr; |
595 | 23 | TokenPtr = skipHexCharacterReference(TokenPtr, CommentEnd); |
596 | 23 | } else { |
597 | 1 | formTextToken(T, TokenPtr); |
598 | 1 | return; |
599 | 1 | } |
600 | 43 | } else { |
601 | 1 | formTextToken(T, TokenPtr); |
602 | 1 | return; |
603 | 1 | } |
604 | 96 | if (NamePtr == TokenPtr || TokenPtr == CommentEnd94 || |
605 | 96 | *TokenPtr != ';'91 ) { |
606 | 8 | formTextToken(T, TokenPtr); |
607 | 8 | return; |
608 | 8 | } |
609 | 88 | StringRef Name(NamePtr, TokenPtr - NamePtr); |
610 | 88 | TokenPtr++; // Skip semicolon. |
611 | 88 | StringRef Resolved; |
612 | 88 | if (isNamed) |
613 | 52 | Resolved = resolveHTMLNamedCharacterReference(Name); |
614 | 36 | else if (isDecimal) |
615 | 17 | Resolved = resolveHTMLDecimalCharacterReference(Name); |
616 | 19 | else |
617 | 19 | Resolved = resolveHTMLHexCharacterReference(Name); |
618 | | |
619 | 88 | if (Resolved.empty()) { |
620 | 0 | formTextToken(T, TokenPtr); |
621 | 0 | return; |
622 | 0 | } |
623 | 88 | formTokenWithChars(T, TokenPtr, tok::text); |
624 | 88 | T.setText(Resolved); |
625 | 88 | } |
626 | | |
627 | 3.39k | void Lexer::setupAndLexHTMLStartTag(Token &T) { |
628 | 3.39k | assert(BufferPtr[0] == '<' && |
629 | 3.39k | isHTMLIdentifierStartingCharacter(BufferPtr[1])); |
630 | 3.39k | const char *TagNameEnd = skipHTMLIdentifier(BufferPtr + 2, CommentEnd); |
631 | 3.39k | StringRef Name(BufferPtr + 1, TagNameEnd - (BufferPtr + 1)); |
632 | 3.39k | if (!isHTMLTagName(Name)) { |
633 | 3.13k | formTextToken(T, TagNameEnd); |
634 | 3.13k | return; |
635 | 3.13k | } |
636 | | |
637 | 263 | formTokenWithChars(T, TagNameEnd, tok::html_start_tag); |
638 | 263 | T.setHTMLTagStartName(Name); |
639 | | |
640 | 263 | BufferPtr = skipWhitespace(BufferPtr, CommentEnd); |
641 | | |
642 | 263 | const char C = *BufferPtr; |
643 | 263 | if (BufferPtr != CommentEnd && |
644 | 263 | (260 C == '>'260 || C == '/'145 || isHTMLIdentifierStartingCharacter(C)112 )) |
645 | 250 | State = LS_HTMLStartTag; |
646 | 263 | } |
647 | | |
648 | 508 | void Lexer::lexHTMLStartTag(Token &T) { |
649 | 508 | assert(State == LS_HTMLStartTag); |
650 | | |
651 | 508 | const char *TokenPtr = BufferPtr; |
652 | 508 | char C = *TokenPtr; |
653 | 508 | if (isHTMLIdentifierCharacter(C)) { |
654 | 114 | TokenPtr = skipHTMLIdentifier(TokenPtr, CommentEnd); |
655 | 114 | StringRef Ident(BufferPtr, TokenPtr - BufferPtr); |
656 | 114 | formTokenWithChars(T, TokenPtr, tok::html_ident); |
657 | 114 | T.setHTMLIdent(Ident); |
658 | 394 | } else { |
659 | 394 | switch (C) { |
660 | 100 | case '=': |
661 | 100 | TokenPtr++; |
662 | 100 | formTokenWithChars(T, TokenPtr, tok::html_equals); |
663 | 100 | break; |
664 | 71 | case '\"': |
665 | 76 | case '\'': { |
666 | 76 | const char *OpenQuote = TokenPtr; |
667 | 76 | TokenPtr = skipHTMLQuotedString(TokenPtr, CommentEnd); |
668 | 76 | const char *ClosingQuote = TokenPtr; |
669 | 76 | if (TokenPtr != CommentEnd) // Skip closing quote. |
670 | 70 | TokenPtr++; |
671 | 76 | formTokenWithChars(T, TokenPtr, tok::html_quoted_string); |
672 | 76 | T.setHTMLQuotedString(StringRef(OpenQuote + 1, |
673 | 76 | ClosingQuote - (OpenQuote + 1))); |
674 | 76 | break; |
675 | 71 | } |
676 | 169 | case '>': |
677 | 169 | TokenPtr++; |
678 | 169 | formTokenWithChars(T, TokenPtr, tok::html_greater); |
679 | 169 | State = LS_Normal; |
680 | 169 | return; |
681 | 49 | case '/': |
682 | 49 | TokenPtr++; |
683 | 49 | if (TokenPtr != CommentEnd && *TokenPtr == '>') { |
684 | 47 | TokenPtr++; |
685 | 47 | formTokenWithChars(T, TokenPtr, tok::html_slash_greater); |
686 | 47 | } else |
687 | 2 | formTextToken(T, TokenPtr); |
688 | | |
689 | 49 | State = LS_Normal; |
690 | 49 | return; |
691 | 394 | } |
692 | 394 | } |
693 | | |
694 | | // Now look ahead and return to normal state if we don't see any HTML tokens |
695 | | // ahead. |
696 | 290 | BufferPtr = skipWhitespace(BufferPtr, CommentEnd); |
697 | 290 | if (BufferPtr == CommentEnd) { |
698 | 23 | State = LS_Normal; |
699 | 23 | return; |
700 | 23 | } |
701 | | |
702 | 267 | C = *BufferPtr; |
703 | 267 | if (!isHTMLIdentifierStartingCharacter(C) && |
704 | 267 | C != '='255 && C != '\"'155 && C != '\''84 && C != '>'79 && C != '/'25 ) { |
705 | 9 | State = LS_Normal; |
706 | 9 | return; |
707 | 9 | } |
708 | 267 | } |
709 | | |
710 | 1.48k | void Lexer::setupAndLexHTMLEndTag(Token &T) { |
711 | 1.48k | assert(BufferPtr[0] == '<' && BufferPtr[1] == '/'); |
712 | | |
713 | 1.48k | const char *TagNameBegin = skipWhitespace(BufferPtr + 2, CommentEnd); |
714 | 1.48k | const char *TagNameEnd = skipHTMLIdentifier(TagNameBegin, CommentEnd); |
715 | 1.48k | StringRef Name(TagNameBegin, TagNameEnd - TagNameBegin); |
716 | 1.48k | if (!isHTMLTagName(Name)) { |
717 | 1.36k | formTextToken(T, TagNameEnd); |
718 | 1.36k | return; |
719 | 1.36k | } |
720 | | |
721 | 120 | const char *End = skipWhitespace(TagNameEnd, CommentEnd); |
722 | | |
723 | 120 | formTokenWithChars(T, End, tok::html_end_tag); |
724 | 120 | T.setHTMLTagEndName(Name); |
725 | | |
726 | 120 | if (BufferPtr != CommentEnd && *BufferPtr == '>'118 ) |
727 | 118 | State = LS_HTMLEndTag; |
728 | 120 | } |
729 | | |
730 | 118 | void Lexer::lexHTMLEndTag(Token &T) { |
731 | 118 | assert(BufferPtr != CommentEnd && *BufferPtr == '>'); |
732 | | |
733 | 118 | formTokenWithChars(T, BufferPtr + 1, tok::html_greater); |
734 | 118 | State = LS_Normal; |
735 | 118 | } |
736 | | |
737 | | Lexer::Lexer(llvm::BumpPtrAllocator &Allocator, DiagnosticsEngine &Diags, |
738 | | const CommandTraits &Traits, SourceLocation FileLoc, |
739 | | const char *BufferStart, const char *BufferEnd, bool ParseCommands) |
740 | | : Allocator(Allocator), Diags(Diags), Traits(Traits), |
741 | | BufferStart(BufferStart), BufferEnd(BufferEnd), BufferPtr(BufferStart), |
742 | | FileLoc(FileLoc), ParseCommands(ParseCommands), |
743 | 4.73k | CommentState(LCS_BeforeComment), State(LS_Normal) {} |
744 | | |
745 | 102k | void Lexer::lex(Token &T) { |
746 | 166k | again: |
747 | 166k | switch (CommentState) { |
748 | 39.9k | case LCS_BeforeComment: |
749 | 39.9k | if (BufferPtr == BufferEnd) { |
750 | 7.54k | formTokenWithChars(T, BufferPtr, tok::eof); |
751 | 7.54k | return; |
752 | 7.54k | } |
753 | | |
754 | 32.3k | assert(*BufferPtr == '/'); |
755 | 32.3k | BufferPtr++; // Skip first slash. |
756 | 32.3k | switch(*BufferPtr) { |
757 | 31.6k | case '/': { // BCPL comment. |
758 | 31.6k | BufferPtr++; // Skip second slash. |
759 | | |
760 | 31.6k | if (BufferPtr != BufferEnd) { |
761 | | // Skip Doxygen magic marker, if it is present. |
762 | | // It might be missing because of a typo //< or /*<, or because we |
763 | | // merged this non-Doxygen comment into a bunch of Doxygen comments |
764 | | // around it: /** ... */ /* ... */ /** ... */ |
765 | 31.6k | const char C = *BufferPtr; |
766 | 31.6k | if (C == '/' || C == '!'303 ) |
767 | 31.3k | BufferPtr++; |
768 | 31.6k | } |
769 | | |
770 | | // Skip less-than symbol that marks trailing comments. |
771 | | // Skip it even if the comment is not a Doxygen one, because //< and /*< |
772 | | // are frequent typos. |
773 | 31.6k | if (BufferPtr != BufferEnd && *BufferPtr == '<'31.5k ) |
774 | 138 | BufferPtr++; |
775 | | |
776 | 31.6k | CommentState = LCS_InsideBCPLComment; |
777 | 31.6k | if (State != LS_VerbatimBlockBody && State != LS_VerbatimBlockFirstLine29.9k ) |
778 | 29.8k | State = LS_Normal; |
779 | 31.6k | CommentEnd = findBCPLCommentEnd(BufferPtr, BufferEnd); |
780 | 31.6k | goto again; |
781 | 0 | } |
782 | 771 | case '*': { // C comment. |
783 | 771 | BufferPtr++; // Skip star. |
784 | | |
785 | | // Skip Doxygen magic marker. |
786 | 771 | const char C = *BufferPtr; |
787 | 771 | if ((C == '*' && *(BufferPtr + 1) != '/'620 ) || C == '!'152 ) |
788 | 746 | BufferPtr++; |
789 | | |
790 | | // Skip less-than symbol that marks trailing comments. |
791 | 771 | if (BufferPtr != BufferEnd && *BufferPtr == '<') |
792 | 40 | BufferPtr++; |
793 | | |
794 | 771 | CommentState = LCS_InsideCComment; |
795 | 771 | State = LS_Normal; |
796 | 771 | CommentEnd = findCCommentEnd(BufferPtr, BufferEnd); |
797 | 771 | goto again; |
798 | 0 | } |
799 | 0 | default: |
800 | 0 | llvm_unreachable("second character of comment should be '/' or '*'"); |
801 | 32.3k | } |
802 | | |
803 | 32.3k | case LCS_BetweenComments: { |
804 | | // Consecutive comments are extracted only if there is only whitespace |
805 | | // between them. So we can search for the start of the next comment. |
806 | 32.3k | const char *EndWhitespace = BufferPtr; |
807 | 60.2k | while(EndWhitespace != BufferEnd && *EndWhitespace != '/'55.6k ) |
808 | 27.9k | EndWhitespace++; |
809 | | |
810 | | // Turn any whitespace between comments (and there is only whitespace |
811 | | // between them -- guaranteed by comment extraction) into a newline. We |
812 | | // have two newlines between C comments in total (first one was synthesized |
813 | | // after a comment). |
814 | 32.3k | formTokenWithChars(T, EndWhitespace, tok::newline); |
815 | | |
816 | 32.3k | CommentState = LCS_BeforeComment; |
817 | 32.3k | break; |
818 | 32.3k | } |
819 | | |
820 | 88.1k | case LCS_InsideBCPLComment: |
821 | 94.5k | case LCS_InsideCComment: |
822 | 94.5k | if (BufferPtr != CommentEnd) { |
823 | 62.2k | lexCommentText(T); |
824 | 62.2k | break; |
825 | 62.2k | } else { |
826 | | // Skip C comment closing sequence. |
827 | 32.3k | if (CommentState == LCS_InsideCComment) { |
828 | 768 | assert(BufferPtr[0] == '*' && BufferPtr[1] == '/'); |
829 | 768 | BufferPtr += 2; |
830 | 768 | assert(BufferPtr <= BufferEnd); |
831 | | |
832 | | // Synthenize newline just after the C comment, regardless if there is |
833 | | // actually a newline. |
834 | 768 | formTokenWithChars(T, BufferPtr, tok::newline); |
835 | | |
836 | 768 | CommentState = LCS_BetweenComments; |
837 | 768 | break; |
838 | 31.5k | } else { |
839 | | // Don't synthesized a newline after BCPL comment. |
840 | 31.5k | CommentState = LCS_BetweenComments; |
841 | 31.5k | goto again; |
842 | 31.5k | } |
843 | 32.3k | } |
844 | 166k | } |
845 | 166k | } |
846 | | |
847 | | StringRef Lexer::getSpelling(const Token &Tok, |
848 | 155 | const SourceManager &SourceMgr) const { |
849 | 155 | SourceLocation Loc = Tok.getLocation(); |
850 | 155 | std::pair<FileID, unsigned> LocInfo = SourceMgr.getDecomposedLoc(Loc); |
851 | | |
852 | 155 | bool InvalidTemp = false; |
853 | 155 | StringRef File = SourceMgr.getBufferData(LocInfo.first, &InvalidTemp); |
854 | 155 | if (InvalidTemp) |
855 | 0 | return StringRef(); |
856 | | |
857 | 155 | const char *Begin = File.data() + LocInfo.second; |
858 | 155 | return StringRef(Begin, Tok.getLength()); |
859 | 155 | } |
860 | | |
861 | | } // end namespace comments |
862 | | } // end namespace clang |