/Users/buildslave/jenkins/workspace/coverage/llvm-project/clang/lib/Format/FormatTokenLexer.cpp
Line | Count | Source (jump to first uncovered line) |
1 | | //===--- FormatTokenLexer.cpp - Lex FormatTokens -------------*- C++ ----*-===// |
2 | | // |
3 | | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
4 | | // See https://llvm.org/LICENSE.txt for license information. |
5 | | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
6 | | // |
7 | | //===----------------------------------------------------------------------===// |
8 | | /// |
9 | | /// \file |
10 | | /// This file implements FormatTokenLexer, which tokenizes a source file |
11 | | /// into a FormatToken stream suitable for ClangFormat. |
12 | | /// |
13 | | //===----------------------------------------------------------------------===// |
14 | | |
15 | | #include "FormatTokenLexer.h" |
16 | | #include "FormatToken.h" |
17 | | #include "clang/Basic/SourceLocation.h" |
18 | | #include "clang/Basic/SourceManager.h" |
19 | | #include "clang/Format/Format.h" |
20 | | #include "llvm/Support/Regex.h" |
21 | | |
22 | | namespace clang { |
23 | | namespace format { |
24 | | |
25 | | FormatTokenLexer::FormatTokenLexer( |
26 | | const SourceManager &SourceMgr, FileID ID, unsigned Column, |
27 | | const FormatStyle &Style, encoding::Encoding Encoding, |
28 | | llvm::SpecificBumpPtrAllocator<FormatToken> &Allocator, |
29 | | IdentifierTable &IdentTable) |
30 | | : FormatTok(nullptr), IsFirstToken(true), StateStack({LexerState::NORMAL}), |
31 | | Column(Column), TrailingWhitespace(0), |
32 | | LangOpts(getFormattingLangOpts(Style)), SourceMgr(SourceMgr), ID(ID), |
33 | | Style(Style), IdentTable(IdentTable), Keywords(IdentTable), |
34 | | Encoding(Encoding), Allocator(Allocator), FirstInLineIndex(0), |
35 | | FormattingDisabled(false), MacroBlockBeginRegex(Style.MacroBlockBegin), |
36 | 65.5k | MacroBlockEndRegex(Style.MacroBlockEnd) { |
37 | 65.5k | Lex.reset(new Lexer(ID, SourceMgr.getBufferOrFake(ID), SourceMgr, LangOpts)); |
38 | 65.5k | Lex->SetKeepWhitespaceMode(true); |
39 | | |
40 | 196k | for (const std::string &ForEachMacro : Style.ForEachMacros) { |
41 | 196k | auto Identifier = &IdentTable.get(ForEachMacro); |
42 | 196k | Macros.insert({Identifier, TT_ForEachMacro}); |
43 | 196k | } |
44 | 68.5k | for (const std::string &IfMacro : Style.IfMacros) { |
45 | 68.5k | auto Identifier = &IdentTable.get(IfMacro); |
46 | 68.5k | Macros.insert({Identifier, TT_IfMacro}); |
47 | 68.5k | } |
48 | 66.7k | for (const std::string &AttributeMacro : Style.AttributeMacros) { |
49 | 66.7k | auto Identifier = &IdentTable.get(AttributeMacro); |
50 | 66.7k | Macros.insert({Identifier, TT_AttributeMacro}); |
51 | 66.7k | } |
52 | 131k | for (const std::string &StatementMacro : Style.StatementMacros) { |
53 | 131k | auto Identifier = &IdentTable.get(StatementMacro); |
54 | 131k | Macros.insert({Identifier, TT_StatementMacro}); |
55 | 131k | } |
56 | 65.5k | for (const std::string &TypenameMacro : Style.TypenameMacros) { |
57 | 765 | auto Identifier = &IdentTable.get(TypenameMacro); |
58 | 765 | Macros.insert({Identifier, TT_TypenameMacro}); |
59 | 765 | } |
60 | 65.5k | for (const std::string &NamespaceMacro : Style.NamespaceMacros) { |
61 | 191 | auto Identifier = &IdentTable.get(NamespaceMacro); |
62 | 191 | Macros.insert({Identifier, TT_NamespaceMacro}); |
63 | 191 | } |
64 | 65.5k | for (const std::string &WhitespaceSensitiveMacro : |
65 | 327k | Style.WhitespaceSensitiveMacros) { |
66 | 327k | auto Identifier = &IdentTable.get(WhitespaceSensitiveMacro); |
67 | 327k | Macros.insert({Identifier, TT_UntouchableMacroFunc}); |
68 | 327k | } |
69 | 65.5k | for (const std::string &StatementAttributeLikeMacro : |
70 | 65.5k | Style.StatementAttributeLikeMacros) { |
71 | 65.5k | auto Identifier = &IdentTable.get(StatementAttributeLikeMacro); |
72 | 65.5k | Macros.insert({Identifier, TT_StatementAttributeLikeMacro}); |
73 | 65.5k | } |
74 | 65.5k | } |
75 | | |
76 | 65.5k | ArrayRef<FormatToken *> FormatTokenLexer::lex() { |
77 | 65.5k | assert(Tokens.empty()); |
78 | 0 | assert(FirstInLineIndex == 0); |
79 | 1.10M | do { |
80 | 1.10M | Tokens.push_back(getNextToken()); |
81 | 1.10M | if (Style.isJavaScript()) { |
82 | 37.9k | tryParseJSRegexLiteral(); |
83 | 37.9k | handleTemplateStrings(); |
84 | 37.9k | } |
85 | 1.10M | if (Style.Language == FormatStyle::LK_TextProto) |
86 | 6.73k | tryParsePythonComment(); |
87 | 1.10M | tryMergePreviousTokens(); |
88 | 1.10M | if (Style.isCSharp()) { |
89 | | // This needs to come after tokens have been merged so that C# |
90 | | // string literals are correctly identified. |
91 | 9.11k | handleCSharpVerbatimAndInterpolatedStrings(); |
92 | 9.11k | } |
93 | 1.10M | if (Tokens.back()->NewlinesBefore > 0 || Tokens.back()->IsMultiline1.01M ) |
94 | 88.2k | FirstInLineIndex = Tokens.size() - 1; |
95 | 1.10M | } while (Tokens.back()->isNot(tok::eof)); |
96 | 65.5k | return Tokens; |
97 | 65.5k | } |
98 | | |
99 | 1.10M | void FormatTokenLexer::tryMergePreviousTokens() { |
100 | 1.10M | if (tryMerge_TMacro()) |
101 | 18 | return; |
102 | 1.10M | if (tryMergeConflictMarkers()) |
103 | 45 | return; |
104 | 1.10M | if (tryMergeLessLess()) |
105 | 1.29k | return; |
106 | 1.10M | if (tryMergeForEach()) |
107 | 9 | return; |
108 | 1.10M | if (Style.isCpp() && tryTransformTryUsageForC()1.03M ) |
109 | 36 | return; |
110 | | |
111 | 1.10M | if (Style.isJavaScript() || Style.isCSharp()1.06M ) { |
112 | 47.0k | static const tok::TokenKind NullishCoalescingOperator[] = {tok::question, |
113 | 47.0k | tok::question}; |
114 | 47.0k | static const tok::TokenKind NullPropagatingOperator[] = {tok::question, |
115 | 47.0k | tok::period}; |
116 | 47.0k | static const tok::TokenKind FatArrow[] = {tok::equal, tok::greater}; |
117 | | |
118 | 47.0k | if (tryMergeTokens(FatArrow, TT_FatArrow)) |
119 | 244 | return; |
120 | 46.8k | if (tryMergeTokens(NullishCoalescingOperator, TT_NullCoalescingOperator)) { |
121 | | // Treat like the "||" operator (as opposed to the ternary ?). |
122 | 38 | Tokens.back()->Tok.setKind(tok::pipepipe); |
123 | 38 | return; |
124 | 38 | } |
125 | 46.7k | if (tryMergeTokens(NullPropagatingOperator, TT_NullPropagatingOperator)) { |
126 | | // Treat like a regular "." access. |
127 | 22 | Tokens.back()->Tok.setKind(tok::period); |
128 | 22 | return; |
129 | 22 | } |
130 | 46.7k | if (tryMergeNullishCoalescingEqual()) |
131 | 14 | return; |
132 | 46.7k | } |
133 | | |
134 | 1.09M | if (Style.isCSharp()) { |
135 | 8.98k | static const tok::TokenKind CSharpNullConditionalLSquare[] = { |
136 | 8.98k | tok::question, tok::l_square}; |
137 | | |
138 | 8.98k | if (tryMergeCSharpKeywordVariables()) |
139 | 8 | return; |
140 | 8.98k | if (tryMergeCSharpStringLiteral()) |
141 | 43 | return; |
142 | 8.93k | if (tryTransformCSharpForEach()) |
143 | 8 | return; |
144 | 8.92k | if (tryMergeTokens(CSharpNullConditionalLSquare, |
145 | 8.92k | TT_CSharpNullConditionalLSquare)) { |
146 | | // Treat like a regular "[" operator. |
147 | 8 | Tokens.back()->Tok.setKind(tok::l_square); |
148 | 8 | return; |
149 | 8 | } |
150 | 8.92k | } |
151 | | |
152 | 1.09M | if (tryMergeNSStringLiteral()) |
153 | 483 | return; |
154 | | |
155 | 1.09M | if (Style.isJavaScript()) { |
156 | 37.7k | static const tok::TokenKind JSIdentity[] = {tok::equalequal, tok::equal}; |
157 | 37.7k | static const tok::TokenKind JSNotIdentity[] = {tok::exclaimequal, |
158 | 37.7k | tok::equal}; |
159 | 37.7k | static const tok::TokenKind JSShiftEqual[] = {tok::greater, tok::greater, |
160 | 37.7k | tok::greaterequal}; |
161 | 37.7k | static const tok::TokenKind JSExponentiation[] = {tok::star, tok::star}; |
162 | 37.7k | static const tok::TokenKind JSExponentiationEqual[] = {tok::star, |
163 | 37.7k | tok::starequal}; |
164 | 37.7k | static const tok::TokenKind JSPipePipeEqual[] = {tok::pipepipe, tok::equal}; |
165 | 37.7k | static const tok::TokenKind JSAndAndEqual[] = {tok::ampamp, tok::equal}; |
166 | | |
167 | | // FIXME: Investigate what token type gives the correct operator priority. |
168 | 37.7k | if (tryMergeTokens(JSIdentity, TT_BinaryOperator)) |
169 | 12 | return; |
170 | 37.7k | if (tryMergeTokens(JSNotIdentity, TT_BinaryOperator)) |
171 | 12 | return; |
172 | 37.7k | if (tryMergeTokens(JSShiftEqual, TT_BinaryOperator)) |
173 | 10 | return; |
174 | 37.7k | if (tryMergeTokens(JSExponentiation, TT_JsExponentiation)) |
175 | 4 | return; |
176 | 37.7k | if (tryMergeTokens(JSExponentiationEqual, TT_JsExponentiationEqual)) { |
177 | 4 | Tokens.back()->Tok.setKind(tok::starequal); |
178 | 4 | return; |
179 | 4 | } |
180 | 37.7k | if (tryMergeTokens(JSAndAndEqual, TT_JsAndAndEqual) || |
181 | 37.7k | tryMergeTokens(JSPipePipeEqual, TT_JsPipePipeEqual)37.7k ) { |
182 | | // Treat like the "=" assignment operator. |
183 | 8 | Tokens.back()->Tok.setKind(tok::equal); |
184 | 8 | return; |
185 | 8 | } |
186 | 37.7k | if (tryMergeJSPrivateIdentifier()) |
187 | 32 | return; |
188 | 37.7k | } |
189 | | |
190 | 1.09M | if (Style.Language == FormatStyle::LK_Java) { |
191 | 4.54k | static const tok::TokenKind JavaRightLogicalShiftAssign[] = { |
192 | 4.54k | tok::greater, tok::greater, tok::greaterequal}; |
193 | 4.54k | if (tryMergeTokens(JavaRightLogicalShiftAssign, TT_BinaryOperator)) |
194 | 2 | return; |
195 | 4.54k | } |
196 | 1.09M | } |
197 | | |
198 | 1.09M | bool FormatTokenLexer::tryMergeNSStringLiteral() { |
199 | 1.09M | if (Tokens.size() < 2) |
200 | 65.5k | return false; |
201 | 1.03M | auto &At = *(Tokens.end() - 2); |
202 | 1.03M | auto &String = *(Tokens.end() - 1); |
203 | 1.03M | if (!At->is(tok::at) || !String->is(tok::string_literal)2.83k ) |
204 | 1.03M | return false; |
205 | 483 | At->Tok.setKind(tok::string_literal); |
206 | 483 | At->TokenText = StringRef(At->TokenText.begin(), |
207 | 483 | String->TokenText.end() - At->TokenText.begin()); |
208 | 483 | At->ColumnWidth += String->ColumnWidth; |
209 | 483 | At->setType(TT_ObjCStringLiteral); |
210 | 483 | Tokens.erase(Tokens.end() - 1); |
211 | 483 | return true; |
212 | 1.03M | } |
213 | | |
214 | 37.7k | bool FormatTokenLexer::tryMergeJSPrivateIdentifier() { |
215 | | // Merges #idenfier into a single identifier with the text #identifier |
216 | | // but the token tok::identifier. |
217 | 37.7k | if (Tokens.size() < 2) |
218 | 3.02k | return false; |
219 | 34.6k | auto &Hash = *(Tokens.end() - 2); |
220 | 34.6k | auto &Identifier = *(Tokens.end() - 1); |
221 | 34.6k | if (!Hash->is(tok::hash) || !Identifier->is(tok::identifier)36 ) |
222 | 34.6k | return false; |
223 | 32 | Hash->Tok.setKind(tok::identifier); |
224 | 32 | Hash->TokenText = |
225 | 32 | StringRef(Hash->TokenText.begin(), |
226 | 32 | Identifier->TokenText.end() - Hash->TokenText.begin()); |
227 | 32 | Hash->ColumnWidth += Identifier->ColumnWidth; |
228 | 32 | Hash->setType(TT_JsPrivateIdentifier); |
229 | 32 | Tokens.erase(Tokens.end() - 1); |
230 | 32 | return true; |
231 | 34.6k | } |
232 | | |
233 | | // Search for verbatim or interpolated string literals @"ABC" or |
234 | | // $"aaaaa{abc}aaaaa" i and mark the token as TT_CSharpStringLiteral, and to |
235 | | // prevent splitting of @, $ and ". |
236 | | // Merging of multiline verbatim strings with embedded '"' is handled in |
237 | | // handleCSharpVerbatimAndInterpolatedStrings with lower-level lexing. |
238 | 8.98k | bool FormatTokenLexer::tryMergeCSharpStringLiteral() { |
239 | 8.98k | if (Tokens.size() < 2) |
240 | 527 | return false; |
241 | | |
242 | | // Interpolated strings could contain { } with " characters inside. |
243 | | // $"{x ?? "null"}" |
244 | | // should not be split into $"{x ?? ", null, "}" but should treated as a |
245 | | // single string-literal. |
246 | | // |
247 | | // We opt not to try and format expressions inside {} within a C# |
248 | | // interpolated string. Formatting expressions within an interpolated string |
249 | | // would require similar work as that done for JavaScript template strings |
250 | | // in `handleTemplateStrings()`. |
251 | 8.45k | auto &CSharpInterpolatedString = *(Tokens.end() - 2); |
252 | 8.45k | if (CSharpInterpolatedString->getType() == TT_CSharpStringLiteral && |
253 | 8.45k | (43 CSharpInterpolatedString->TokenText.startswith(R"($")")43 || |
254 | 43 | CSharpInterpolatedString->TokenText.startswith(R"($@")")12 )) { |
255 | 37 | int UnmatchedOpeningBraceCount = 0; |
256 | | |
257 | 37 | auto TokenTextSize = CSharpInterpolatedString->TokenText.size(); |
258 | 989 | for (size_t Index = 0; Index < TokenTextSize; ++Index952 ) { |
259 | 952 | char C = CSharpInterpolatedString->TokenText[Index]; |
260 | 952 | if (C == '{') { |
261 | | // "{{" inside an interpolated string is an escaped '{' so skip it. |
262 | 49 | if (Index + 1 < TokenTextSize && |
263 | 49 | CSharpInterpolatedString->TokenText[Index + 1] == '{') { |
264 | 6 | ++Index; |
265 | 6 | continue; |
266 | 6 | } |
267 | 43 | ++UnmatchedOpeningBraceCount; |
268 | 903 | } else if (C == '}') { |
269 | | // "}}" inside an interpolated string is an escaped '}' so skip it. |
270 | 43 | if (Index + 1 < TokenTextSize && |
271 | 43 | CSharpInterpolatedString->TokenText[Index + 1] == '}') { |
272 | 6 | ++Index; |
273 | 6 | continue; |
274 | 6 | } |
275 | 37 | --UnmatchedOpeningBraceCount; |
276 | 37 | } |
277 | 952 | } |
278 | | |
279 | 37 | if (UnmatchedOpeningBraceCount > 0) { |
280 | 6 | auto &NextToken = *(Tokens.end() - 1); |
281 | 6 | CSharpInterpolatedString->TokenText = |
282 | 6 | StringRef(CSharpInterpolatedString->TokenText.begin(), |
283 | 6 | NextToken->TokenText.end() - |
284 | 6 | CSharpInterpolatedString->TokenText.begin()); |
285 | 6 | CSharpInterpolatedString->ColumnWidth += NextToken->ColumnWidth; |
286 | 6 | Tokens.erase(Tokens.end() - 1); |
287 | 6 | return true; |
288 | 6 | } |
289 | 37 | } |
290 | | |
291 | | // Look for @"aaaaaa" or $"aaaaaa". |
292 | 8.44k | auto &String = *(Tokens.end() - 1); |
293 | 8.44k | if (!String->is(tok::string_literal)) |
294 | 8.32k | return false; |
295 | | |
296 | 119 | auto &At = *(Tokens.end() - 2); |
297 | 119 | if (!(At->is(tok::at) || At->TokenText == "$"107 )) |
298 | 82 | return false; |
299 | | |
300 | 37 | if (Tokens.size() > 2 && At->is(tok::at)35 ) { |
301 | 12 | auto &Dollar = *(Tokens.end() - 3); |
302 | 12 | if (Dollar->TokenText == "$") { |
303 | | // This looks like $@"aaaaa" so we need to combine all 3 tokens. |
304 | 6 | Dollar->Tok.setKind(tok::string_literal); |
305 | 6 | Dollar->TokenText = |
306 | 6 | StringRef(Dollar->TokenText.begin(), |
307 | 6 | String->TokenText.end() - Dollar->TokenText.begin()); |
308 | 6 | Dollar->ColumnWidth += (At->ColumnWidth + String->ColumnWidth); |
309 | 6 | Dollar->setType(TT_CSharpStringLiteral); |
310 | 6 | Tokens.erase(Tokens.end() - 2); |
311 | 6 | Tokens.erase(Tokens.end() - 1); |
312 | 6 | return true; |
313 | 6 | } |
314 | 12 | } |
315 | | |
316 | | // Convert back into just a string_literal. |
317 | 31 | At->Tok.setKind(tok::string_literal); |
318 | 31 | At->TokenText = StringRef(At->TokenText.begin(), |
319 | 31 | String->TokenText.end() - At->TokenText.begin()); |
320 | 31 | At->ColumnWidth += String->ColumnWidth; |
321 | 31 | At->setType(TT_CSharpStringLiteral); |
322 | 31 | Tokens.erase(Tokens.end() - 1); |
323 | 31 | return true; |
324 | 37 | } |
325 | | |
326 | | // Valid C# attribute targets: |
327 | | // https://docs.microsoft.com/en-us/dotnet/csharp/programming-guide/concepts/attributes/#attribute-targets |
328 | | const llvm::StringSet<> FormatTokenLexer::CSharpAttributeTargets = { |
329 | | "assembly", "module", "field", "event", "method", |
330 | | "param", "property", "return", "type", |
331 | | }; |
332 | | |
333 | 46.7k | bool FormatTokenLexer::tryMergeNullishCoalescingEqual() { |
334 | 46.7k | if (Tokens.size() < 2) |
335 | 3.55k | return false; |
336 | 43.2k | auto &NullishCoalescing = *(Tokens.end() - 2); |
337 | 43.2k | auto &Equal = *(Tokens.end() - 1); |
338 | 43.2k | if (NullishCoalescing->getType() != TT_NullCoalescingOperator || |
339 | 43.2k | !Equal->is(tok::equal)38 ) { |
340 | 43.2k | return false; |
341 | 43.2k | } |
342 | 14 | NullishCoalescing->Tok.setKind(tok::equal); // no '??=' in clang tokens. |
343 | 14 | NullishCoalescing->TokenText = |
344 | 14 | StringRef(NullishCoalescing->TokenText.begin(), |
345 | 14 | Equal->TokenText.end() - NullishCoalescing->TokenText.begin()); |
346 | 14 | NullishCoalescing->ColumnWidth += Equal->ColumnWidth; |
347 | 14 | NullishCoalescing->setType(TT_NullCoalescingEqual); |
348 | 14 | Tokens.erase(Tokens.end() - 1); |
349 | 14 | return true; |
350 | 43.2k | } |
351 | | |
352 | 8.98k | bool FormatTokenLexer::tryMergeCSharpKeywordVariables() { |
353 | 8.98k | if (Tokens.size() < 2) |
354 | 527 | return false; |
355 | 8.46k | auto &At = *(Tokens.end() - 2); |
356 | 8.46k | auto &Keyword = *(Tokens.end() - 1); |
357 | 8.46k | if (!At->is(tok::at)) |
358 | 8.43k | return false; |
359 | 28 | if (!Keywords.isCSharpKeyword(*Keyword)) |
360 | 20 | return false; |
361 | | |
362 | 8 | At->Tok.setKind(tok::identifier); |
363 | 8 | At->TokenText = StringRef(At->TokenText.begin(), |
364 | 8 | Keyword->TokenText.end() - At->TokenText.begin()); |
365 | 8 | At->ColumnWidth += Keyword->ColumnWidth; |
366 | 8 | At->setType(Keyword->getType()); |
367 | 8 | Tokens.erase(Tokens.end() - 1); |
368 | 8 | return true; |
369 | 28 | } |
370 | | |
371 | | // In C# transform identifier foreach into kw_foreach |
372 | 8.93k | bool FormatTokenLexer::tryTransformCSharpForEach() { |
373 | 8.93k | if (Tokens.size() < 1) |
374 | 0 | return false; |
375 | 8.93k | auto &Identifier = *(Tokens.end() - 1); |
376 | 8.93k | if (!Identifier->is(tok::identifier)) |
377 | 6.42k | return false; |
378 | 2.51k | if (Identifier->TokenText != "foreach") |
379 | 2.50k | return false; |
380 | | |
381 | 8 | Identifier->setType(TT_ForEachMacro); |
382 | 8 | Identifier->Tok.setKind(tok::kw_for); |
383 | 8 | return true; |
384 | 2.51k | } |
385 | | |
386 | 1.10M | bool FormatTokenLexer::tryMergeForEach() { |
387 | 1.10M | if (Tokens.size() < 2) |
388 | 65.5k | return false; |
389 | 1.03M | auto &For = *(Tokens.end() - 2); |
390 | 1.03M | auto &Each = *(Tokens.end() - 1); |
391 | 1.03M | if (!For->is(tok::kw_for)) |
392 | 1.03M | return false; |
393 | 1.91k | if (!Each->is(tok::identifier)) |
394 | 1.89k | return false; |
395 | 20 | if (Each->TokenText != "each") |
396 | 11 | return false; |
397 | | |
398 | 9 | For->setType(TT_ForEachMacro); |
399 | 9 | For->Tok.setKind(tok::kw_for); |
400 | | |
401 | 9 | For->TokenText = StringRef(For->TokenText.begin(), |
402 | 9 | Each->TokenText.end() - For->TokenText.begin()); |
403 | 9 | For->ColumnWidth += Each->ColumnWidth; |
404 | 9 | Tokens.erase(Tokens.end() - 1); |
405 | 9 | return true; |
406 | 20 | } |
407 | | |
408 | 1.03M | bool FormatTokenLexer::tryTransformTryUsageForC() { |
409 | 1.03M | if (Tokens.size() < 2) |
410 | 60.7k | return false; |
411 | 975k | auto &Try = *(Tokens.end() - 2); |
412 | 975k | if (!Try->is(tok::kw_try)) |
413 | 974k | return false; |
414 | 337 | auto &Next = *(Tokens.end() - 1); |
415 | 337 | if (Next->isOneOf(tok::l_brace, tok::colon, tok::hash, tok::comment)) |
416 | 295 | return false; |
417 | | |
418 | 42 | if (Tokens.size() > 2) { |
419 | 33 | auto &At = *(Tokens.end() - 3); |
420 | 33 | if (At->is(tok::at)) |
421 | 6 | return false; |
422 | 33 | } |
423 | | |
424 | 36 | Try->Tok.setKind(tok::identifier); |
425 | 36 | return true; |
426 | 42 | } |
427 | | |
428 | 1.10M | bool FormatTokenLexer::tryMergeLessLess() { |
429 | | // Merge X,less,less,Y into X,lessless,Y unless X or Y is less. |
430 | 1.10M | if (Tokens.size() < 3) |
431 | 130k | return false; |
432 | | |
433 | 970k | auto First = Tokens.end() - 3; |
434 | 970k | if (First[0]->isNot(tok::less) || First[1]->isNot(tok::less)20.3k ) |
435 | 969k | return false; |
436 | | |
437 | | // Only merge if there currently is no whitespace between the two "<". |
438 | 1.55k | if (First[1]->hasWhitespaceBefore()) |
439 | 36 | return false; |
440 | | |
441 | 1.51k | auto X = Tokens.size() > 3 ? First[-1]1.51k : nullptr3 ; |
442 | 1.51k | auto Y = First[2]; |
443 | 1.51k | if ((X && X->is(tok::less)1.51k ) || Y->is(tok::less)1.39k ) |
444 | 225 | return false; |
445 | | |
446 | | // Do not remove a whitespace between the two "<" e.g. "operator< <>". |
447 | 1.29k | if (X && X->is(tok::kw_operator) && Y->is(tok::greater)30 ) |
448 | 0 | return false; |
449 | | |
450 | 1.29k | First[0]->Tok.setKind(tok::lessless); |
451 | 1.29k | First[0]->TokenText = "<<"; |
452 | 1.29k | First[0]->ColumnWidth += 1; |
453 | 1.29k | Tokens.erase(Tokens.end() - 2); |
454 | 1.29k | return true; |
455 | 1.29k | } |
456 | | |
457 | | bool FormatTokenLexer::tryMergeTokens(ArrayRef<tok::TokenKind> Kinds, |
458 | 418k | TokenType NewType) { |
459 | 418k | if (Tokens.size() < Kinds.size()) |
460 | 35.9k | return false; |
461 | | |
462 | 382k | SmallVectorImpl<FormatToken *>::const_iterator First = |
463 | 382k | Tokens.end() - Kinds.size(); |
464 | 382k | if (!First[0]->is(Kinds[0])) |
465 | 379k | return false; |
466 | 3.26k | unsigned AddLength = 0; |
467 | 3.69k | for (unsigned i = 1; i < Kinds.size(); ++i426 ) { |
468 | 3.32k | if (!First[i]->is(Kinds[i]) || First[i]->hasWhitespaceBefore()438 ) |
469 | 2.90k | return false; |
470 | 426 | AddLength += First[i]->TokenText.size(); |
471 | 426 | } |
472 | 364 | Tokens.resize(Tokens.size() - Kinds.size() + 1); |
473 | 364 | First[0]->TokenText = StringRef(First[0]->TokenText.data(), |
474 | 364 | First[0]->TokenText.size() + AddLength); |
475 | 364 | First[0]->ColumnWidth += AddLength; |
476 | 364 | First[0]->setType(NewType); |
477 | 364 | return true; |
478 | 3.26k | } |
479 | | |
480 | | // Returns \c true if \p Tok can only be followed by an operand in JavaScript. |
481 | 336 | bool FormatTokenLexer::precedesOperand(FormatToken *Tok) { |
482 | | // NB: This is not entirely correct, as an r_paren can introduce an operand |
483 | | // location in e.g. `if (foo) /bar/.exec(...);`. That is a rare enough |
484 | | // corner case to not matter in practice, though. |
485 | 336 | return Tok->isOneOf(tok::period, tok::l_paren, tok::comma, tok::l_brace, |
486 | 336 | tok::r_brace, tok::l_square, tok::semi, tok::exclaim, |
487 | 336 | tok::colon, tok::question, tok::tilde) || |
488 | 336 | Tok->isOneOf(tok::kw_return, tok::kw_do, tok::kw_case, tok::kw_throw, |
489 | 292 | tok::kw_else, tok::kw_new, tok::kw_delete, tok::kw_void, |
490 | 292 | tok::kw_typeof, Keywords.kw_instanceof, Keywords.kw_in) || |
491 | 336 | Tok->isBinaryOperator()288 ; |
492 | 336 | } |
493 | | |
494 | 340 | bool FormatTokenLexer::canPrecedeRegexLiteral(FormatToken *Prev) { |
495 | 340 | if (!Prev) |
496 | 4 | return true; |
497 | | |
498 | | // Regex literals can only follow after prefix unary operators, not after |
499 | | // postfix unary operators. If the '++' is followed by a non-operand |
500 | | // introducing token, the slash here is the operand and not the start of a |
501 | | // regex. |
502 | | // `!` is an unary prefix operator, but also a post-fix operator that casts |
503 | | // away nullability, so the same check applies. |
504 | 336 | if (Prev->isOneOf(tok::plusplus, tok::minusminus, tok::exclaim)) |
505 | 20 | return Tokens.size() < 3 || precedesOperand(Tokens[Tokens.size() - 3]); |
506 | | |
507 | | // The previous token must introduce an operand location where regex |
508 | | // literals can occur. |
509 | 316 | if (!precedesOperand(Prev)) |
510 | 24 | return false; |
511 | | |
512 | 292 | return true; |
513 | 316 | } |
514 | | |
515 | | // Tries to parse a JavaScript Regex literal starting at the current token, |
516 | | // if that begins with a slash and is in a location where JavaScript allows |
517 | | // regex literals. Changes the current token to a regex literal and updates |
518 | | // its text if successful. |
519 | 37.9k | void FormatTokenLexer::tryParseJSRegexLiteral() { |
520 | 37.9k | FormatToken *RegexToken = Tokens.back(); |
521 | 37.9k | if (!RegexToken->isOneOf(tok::slash, tok::slashequal)) |
522 | 37.6k | return; |
523 | | |
524 | 340 | FormatToken *Prev = nullptr; |
525 | 344 | for (FormatToken *FT : llvm::drop_begin(llvm::reverse(Tokens))) { |
526 | | // NB: Because previous pointers are not initialized yet, this cannot use |
527 | | // Token.getPreviousNonComment. |
528 | 344 | if (FT->isNot(tok::comment)) { |
529 | 336 | Prev = FT; |
530 | 336 | break; |
531 | 336 | } |
532 | 344 | } |
533 | | |
534 | 340 | if (!canPrecedeRegexLiteral(Prev)) |
535 | 36 | return; |
536 | | |
537 | | // 'Manually' lex ahead in the current file buffer. |
538 | 304 | const char *Offset = Lex->getBufferLocation(); |
539 | 304 | const char *RegexBegin = Offset - RegexToken->TokenText.size(); |
540 | 304 | StringRef Buffer = Lex->getBuffer(); |
541 | 304 | bool InCharacterClass = false; |
542 | 304 | bool HaveClosingSlash = false; |
543 | 1.91k | for (; !HaveClosingSlash && Offset != Buffer.end()1.61k ; ++Offset1.61k ) { |
544 | | // Regular expressions are terminated with a '/', which can only be |
545 | | // escaped using '\' or a character class between '[' and ']'. |
546 | | // See http://www.ecma-international.org/ecma-262/5.1/#sec-7.8.5. |
547 | 1.61k | switch (*Offset) { |
548 | 116 | case '\\': |
549 | | // Skip the escaped character. |
550 | 116 | ++Offset; |
551 | 116 | break; |
552 | 40 | case '[': |
553 | 40 | InCharacterClass = true; |
554 | 40 | break; |
555 | 40 | case ']': |
556 | 40 | InCharacterClass = false; |
557 | 40 | break; |
558 | 320 | case '/': |
559 | 320 | if (!InCharacterClass) |
560 | 304 | HaveClosingSlash = true; |
561 | 320 | break; |
562 | 1.61k | } |
563 | 1.61k | } |
564 | | |
565 | 304 | RegexToken->setType(TT_RegexLiteral); |
566 | | // Treat regex literals like other string_literals. |
567 | 304 | RegexToken->Tok.setKind(tok::string_literal); |
568 | 304 | RegexToken->TokenText = StringRef(RegexBegin, Offset - RegexBegin); |
569 | 304 | RegexToken->ColumnWidth = RegexToken->TokenText.size(); |
570 | | |
571 | 304 | resetLexer(SourceMgr.getFileOffset(Lex->getSourceLocation(Offset))); |
572 | 304 | } |
573 | | |
574 | 9.11k | void FormatTokenLexer::handleCSharpVerbatimAndInterpolatedStrings() { |
575 | 9.11k | FormatToken *CSharpStringLiteral = Tokens.back(); |
576 | | |
577 | 9.11k | if (CSharpStringLiteral->getType() != TT_CSharpStringLiteral) |
578 | 9.06k | return; |
579 | | |
580 | | // Deal with multiline strings. |
581 | 43 | if (!(CSharpStringLiteral->TokenText.startswith(R"(@")") || |
582 | 43 | CSharpStringLiteral->TokenText.startswith(R"($@")")37 )) { |
583 | 31 | return; |
584 | 31 | } |
585 | | |
586 | 12 | const char *StrBegin = |
587 | 12 | Lex->getBufferLocation() - CSharpStringLiteral->TokenText.size(); |
588 | 12 | const char *Offset = StrBegin; |
589 | 12 | if (CSharpStringLiteral->TokenText.startswith(R"(@")")) |
590 | 6 | Offset += 2; |
591 | 6 | else // CSharpStringLiteral->TokenText.startswith(R"($@")") |
592 | 6 | Offset += 3; |
593 | | |
594 | | // Look for a terminating '"' in the current file buffer. |
595 | | // Make no effort to format code within an interpolated or verbatim string. |
596 | 288 | for (; Offset != Lex->getBuffer().end(); ++Offset276 ) { |
597 | 288 | if (Offset[0] == '"') { |
598 | | // "" within a verbatim string is an escaped double quote: skip it. |
599 | 22 | if (Offset + 1 < Lex->getBuffer().end() && Offset[1] == '"') |
600 | 10 | ++Offset; |
601 | 12 | else |
602 | 12 | break; |
603 | 22 | } |
604 | 288 | } |
605 | | |
606 | | // Make no attempt to format code properly if a verbatim string is |
607 | | // unterminated. |
608 | 12 | if (Offset == Lex->getBuffer().end()) |
609 | 0 | return; |
610 | | |
611 | 12 | StringRef LiteralText(StrBegin, Offset - StrBegin + 1); |
612 | 12 | CSharpStringLiteral->TokenText = LiteralText; |
613 | | |
614 | | // Adjust width for potentially multiline string literals. |
615 | 12 | size_t FirstBreak = LiteralText.find('\n'); |
616 | 12 | StringRef FirstLineText = FirstBreak == StringRef::npos |
617 | 12 | ? LiteralText10 |
618 | 12 | : LiteralText.substr(0, FirstBreak)2 ; |
619 | 12 | CSharpStringLiteral->ColumnWidth = encoding::columnWidthWithTabs( |
620 | 12 | FirstLineText, CSharpStringLiteral->OriginalColumn, Style.TabWidth, |
621 | 12 | Encoding); |
622 | 12 | size_t LastBreak = LiteralText.rfind('\n'); |
623 | 12 | if (LastBreak != StringRef::npos) { |
624 | 2 | CSharpStringLiteral->IsMultiline = true; |
625 | 2 | unsigned StartColumn = 0; |
626 | 2 | CSharpStringLiteral->LastLineColumnWidth = |
627 | 2 | encoding::columnWidthWithTabs(LiteralText.substr(LastBreak + 1), |
628 | 2 | StartColumn, Style.TabWidth, Encoding); |
629 | 2 | } |
630 | | |
631 | 12 | SourceLocation loc = Offset < Lex->getBuffer().end() |
632 | 12 | ? Lex->getSourceLocation(Offset + 1) |
633 | 12 | : SourceMgr.getLocForEndOfFile(ID)0 ; |
634 | 12 | resetLexer(SourceMgr.getFileOffset(loc)); |
635 | 12 | } |
636 | | |
637 | 37.9k | void FormatTokenLexer::handleTemplateStrings() { |
638 | 37.9k | FormatToken *BacktickToken = Tokens.back(); |
639 | | |
640 | 37.9k | if (BacktickToken->is(tok::l_brace)) { |
641 | 2.07k | StateStack.push(LexerState::NORMAL); |
642 | 2.07k | return; |
643 | 2.07k | } |
644 | 35.8k | if (BacktickToken->is(tok::r_brace)) { |
645 | 2.17k | if (StateStack.size() == 1) |
646 | 4 | return; |
647 | 2.16k | StateStack.pop(); |
648 | 2.16k | if (StateStack.top() != LexerState::TEMPLATE_STRING) |
649 | 2.06k | return; |
650 | | // If back in TEMPLATE_STRING, fallthrough and continue parsing the |
651 | 33.7k | } else if (BacktickToken->is(tok::unknown) && |
652 | 33.7k | BacktickToken->TokenText == "`"148 ) { |
653 | 148 | StateStack.push(LexerState::TEMPLATE_STRING); |
654 | 33.5k | } else { |
655 | 33.5k | return; // Not actually a template |
656 | 33.5k | } |
657 | | |
658 | | // 'Manually' lex ahead in the current file buffer. |
659 | 248 | const char *Offset = Lex->getBufferLocation(); |
660 | 248 | const char *TmplBegin = Offset - BacktickToken->TokenText.size(); // at "`" |
661 | 1.90k | for (; Offset != Lex->getBuffer().end(); ++Offset1.65k ) { |
662 | 1.90k | if (Offset[0] == '`') { |
663 | 148 | StateStack.pop(); |
664 | 148 | break; |
665 | 148 | } |
666 | 1.75k | if (Offset[0] == '\\') { |
667 | 8 | ++Offset; // Skip the escaped character. |
668 | 1.74k | } else if (Offset + 1 < Lex->getBuffer().end() && Offset[0] == '$' && |
669 | 1.74k | Offset[1] == '{'100 ) { |
670 | | // '${' introduces an expression interpolation in the template string. |
671 | 100 | StateStack.push(LexerState::NORMAL); |
672 | 100 | ++Offset; |
673 | 100 | break; |
674 | 100 | } |
675 | 1.75k | } |
676 | | |
677 | 248 | StringRef LiteralText(TmplBegin, Offset - TmplBegin + 1); |
678 | 248 | BacktickToken->setType(TT_TemplateString); |
679 | 248 | BacktickToken->Tok.setKind(tok::string_literal); |
680 | 248 | BacktickToken->TokenText = LiteralText; |
681 | | |
682 | | // Adjust width for potentially multiline string literals. |
683 | 248 | size_t FirstBreak = LiteralText.find('\n'); |
684 | 248 | StringRef FirstLineText = FirstBreak == StringRef::npos |
685 | 248 | ? LiteralText212 |
686 | 248 | : LiteralText.substr(0, FirstBreak)36 ; |
687 | 248 | BacktickToken->ColumnWidth = encoding::columnWidthWithTabs( |
688 | 248 | FirstLineText, BacktickToken->OriginalColumn, Style.TabWidth, Encoding); |
689 | 248 | size_t LastBreak = LiteralText.rfind('\n'); |
690 | 248 | if (LastBreak != StringRef::npos) { |
691 | 36 | BacktickToken->IsMultiline = true; |
692 | 36 | unsigned StartColumn = 0; // The template tail spans the entire line. |
693 | 36 | BacktickToken->LastLineColumnWidth = |
694 | 36 | encoding::columnWidthWithTabs(LiteralText.substr(LastBreak + 1), |
695 | 36 | StartColumn, Style.TabWidth, Encoding); |
696 | 36 | } |
697 | | |
698 | 248 | SourceLocation loc = Offset < Lex->getBuffer().end() |
699 | 248 | ? Lex->getSourceLocation(Offset + 1) |
700 | 248 | : SourceMgr.getLocForEndOfFile(ID)0 ; |
701 | 248 | resetLexer(SourceMgr.getFileOffset(loc)); |
702 | 248 | } |
703 | | |
704 | 6.73k | void FormatTokenLexer::tryParsePythonComment() { |
705 | 6.73k | FormatToken *HashToken = Tokens.back(); |
706 | 6.73k | if (!HashToken->isOneOf(tok::hash, tok::hashhash)) |
707 | 6.66k | return; |
708 | | // Turn the remainder of this line into a comment. |
709 | 68 | const char *CommentBegin = |
710 | 68 | Lex->getBufferLocation() - HashToken->TokenText.size(); // at "#" |
711 | 68 | size_t From = CommentBegin - Lex->getBuffer().begin(); |
712 | 68 | size_t To = Lex->getBuffer().find_first_of('\n', From); |
713 | 68 | if (To == StringRef::npos) |
714 | 8 | To = Lex->getBuffer().size(); |
715 | 68 | size_t Len = To - From; |
716 | 68 | HashToken->setType(TT_LineComment); |
717 | 68 | HashToken->Tok.setKind(tok::comment); |
718 | 68 | HashToken->TokenText = Lex->getBuffer().substr(From, Len); |
719 | 68 | SourceLocation Loc = To < Lex->getBuffer().size() |
720 | 68 | ? Lex->getSourceLocation(CommentBegin + Len)60 |
721 | 68 | : SourceMgr.getLocForEndOfFile(ID)8 ; |
722 | 68 | resetLexer(SourceMgr.getFileOffset(Loc)); |
723 | 68 | } |
724 | | |
725 | 1.10M | bool FormatTokenLexer::tryMerge_TMacro() { |
726 | 1.10M | if (Tokens.size() < 4) |
727 | 195k | return false; |
728 | 906k | FormatToken *Last = Tokens.back(); |
729 | 906k | if (!Last->is(tok::r_paren)) |
730 | 824k | return false; |
731 | | |
732 | 81.5k | FormatToken *String = Tokens[Tokens.size() - 2]; |
733 | 81.5k | if (!String->is(tok::string_literal) || String->IsMultiline795 ) |
734 | 80.7k | return false; |
735 | | |
736 | 762 | if (!Tokens[Tokens.size() - 3]->is(tok::l_paren)) |
737 | 359 | return false; |
738 | | |
739 | 403 | FormatToken *Macro = Tokens[Tokens.size() - 4]; |
740 | 403 | if (Macro->TokenText != "_T") |
741 | 385 | return false; |
742 | | |
743 | 18 | const char *Start = Macro->TokenText.data(); |
744 | 18 | const char *End = Last->TokenText.data() + Last->TokenText.size(); |
745 | 18 | String->TokenText = StringRef(Start, End - Start); |
746 | 18 | String->IsFirst = Macro->IsFirst; |
747 | 18 | String->LastNewlineOffset = Macro->LastNewlineOffset; |
748 | 18 | String->WhitespaceRange = Macro->WhitespaceRange; |
749 | 18 | String->OriginalColumn = Macro->OriginalColumn; |
750 | 18 | String->ColumnWidth = encoding::columnWidthWithTabs( |
751 | 18 | String->TokenText, String->OriginalColumn, Style.TabWidth, Encoding); |
752 | 18 | String->NewlinesBefore = Macro->NewlinesBefore; |
753 | 18 | String->HasUnescapedNewline = Macro->HasUnescapedNewline; |
754 | | |
755 | 18 | Tokens.pop_back(); |
756 | 18 | Tokens.pop_back(); |
757 | 18 | Tokens.pop_back(); |
758 | 18 | Tokens.back() = String; |
759 | 18 | if (FirstInLineIndex >= Tokens.size()) |
760 | 3 | FirstInLineIndex = Tokens.size() - 1; |
761 | 18 | return true; |
762 | 403 | } |
763 | | |
764 | 1.10M | bool FormatTokenLexer::tryMergeConflictMarkers() { |
765 | 1.10M | if (Tokens.back()->NewlinesBefore == 0 && Tokens.back()->isNot(tok::eof)1.01M ) |
766 | 953k | return false; |
767 | | |
768 | | // Conflict lines look like: |
769 | | // <marker> <text from the vcs> |
770 | | // For example: |
771 | | // >>>>>>> /file/in/file/system at revision 1234 |
772 | | // |
773 | | // We merge all tokens in a line that starts with a conflict marker |
774 | | // into a single token with a special token type that the unwrapped line |
775 | | // parser will use to correctly rebuild the underlying code. |
776 | | |
777 | 147k | FileID ID; |
778 | | // Get the position of the first token in the line. |
779 | 147k | unsigned FirstInLineOffset; |
780 | 147k | std::tie(ID, FirstInLineOffset) = SourceMgr.getDecomposedLoc( |
781 | 147k | Tokens[FirstInLineIndex]->getStartOfNonWhitespace()); |
782 | 147k | StringRef Buffer = SourceMgr.getBufferOrFake(ID).getBuffer(); |
783 | | // Calculate the offset of the start of the current line. |
784 | 147k | auto LineOffset = Buffer.rfind('\n', FirstInLineOffset); |
785 | 147k | if (LineOffset == StringRef::npos) |
786 | 64.3k | LineOffset = 0; |
787 | 83.4k | else |
788 | 83.4k | ++LineOffset; |
789 | | |
790 | 147k | auto FirstSpace = Buffer.find_first_of(" \n", LineOffset); |
791 | 147k | StringRef LineStart; |
792 | 147k | if (FirstSpace == StringRef::npos) |
793 | 8.72k | LineStart = Buffer.substr(LineOffset); |
794 | 139k | else |
795 | 139k | LineStart = Buffer.substr(LineOffset, FirstSpace - LineOffset); |
796 | | |
797 | 147k | TokenType Type = TT_Unknown; |
798 | 147k | if (LineStart == "<<<<<<<" || LineStart == ">>>>"147k ) { |
799 | 9 | Type = TT_ConflictStart; |
800 | 147k | } else if (LineStart == "|||||||" || LineStart == "======="147k || |
801 | 147k | LineStart == "===="147k ) { |
802 | 27 | Type = TT_ConflictAlternative; |
803 | 147k | } else if (LineStart == ">>>>>>>" || LineStart == "<<<<"147k ) { |
804 | 9 | Type = TT_ConflictEnd; |
805 | 9 | } |
806 | | |
807 | 147k | if (Type != TT_Unknown) { |
808 | 45 | FormatToken *Next = Tokens.back(); |
809 | | |
810 | 45 | Tokens.resize(FirstInLineIndex + 1); |
811 | | // We do not need to build a complete token here, as we will skip it |
812 | | // during parsing anyway (as we must not touch whitespace around conflict |
813 | | // markers). |
814 | 45 | Tokens.back()->setType(Type); |
815 | 45 | Tokens.back()->Tok.setKind(tok::kw___unknown_anytype); |
816 | | |
817 | 45 | Tokens.push_back(Next); |
818 | 45 | return true; |
819 | 45 | } |
820 | | |
821 | 147k | return false; |
822 | 147k | } |
823 | | |
824 | 2.66k | FormatToken *FormatTokenLexer::getStashedToken() { |
825 | | // Create a synthesized second '>' or '<' token. |
826 | 2.66k | Token Tok = FormatTok->Tok; |
827 | 2.66k | StringRef TokenText = FormatTok->TokenText; |
828 | | |
829 | 2.66k | unsigned OriginalColumn = FormatTok->OriginalColumn; |
830 | 2.66k | FormatTok = new (Allocator.Allocate()) FormatToken; |
831 | 2.66k | FormatTok->Tok = Tok; |
832 | 2.66k | SourceLocation TokLocation = |
833 | 2.66k | FormatTok->Tok.getLocation().getLocWithOffset(Tok.getLength() - 1); |
834 | 2.66k | FormatTok->Tok.setLocation(TokLocation); |
835 | 2.66k | FormatTok->WhitespaceRange = SourceRange(TokLocation, TokLocation); |
836 | 2.66k | FormatTok->TokenText = TokenText; |
837 | 2.66k | FormatTok->ColumnWidth = 1; |
838 | 2.66k | FormatTok->OriginalColumn = OriginalColumn + 1; |
839 | | |
840 | 2.66k | return FormatTok; |
841 | 2.66k | } |
842 | | |
843 | | /// Truncate the current token to the new length and make the lexer continue |
844 | | /// from the end of the truncated token. Used for other languages that have |
845 | | /// different token boundaries, like JavaScript in which a comment ends at a |
846 | | /// line break regardless of whether the line break follows a backslash. Also |
847 | | /// used to set the lexer to the end of whitespace if the lexer regards |
848 | | /// whitespace and an unrecognized symbol as one token. |
849 | 96 | void FormatTokenLexer::truncateToken(size_t NewLen) { |
850 | 96 | assert(NewLen <= FormatTok->TokenText.size()); |
851 | 0 | resetLexer(SourceMgr.getFileOffset(Lex->getSourceLocation( |
852 | 96 | Lex->getBufferLocation() - FormatTok->TokenText.size() + NewLen))); |
853 | 96 | FormatTok->TokenText = FormatTok->TokenText.substr(0, NewLen); |
854 | 96 | FormatTok->ColumnWidth = encoding::columnWidthWithTabs( |
855 | 96 | FormatTok->TokenText, FormatTok->OriginalColumn, Style.TabWidth, |
856 | 96 | Encoding); |
857 | 96 | FormatTok->Tok.setLength(NewLen); |
858 | 96 | } |
859 | | |
860 | | /// Count the length of leading whitespace in a token. |
861 | 1.52M | static size_t countLeadingWhitespace(StringRef Text) { |
862 | | // Basically counting the length matched by this regex. |
863 | | // "^([\n\r\f\v \t]|(\\\\|\\?\\?/)[\n\r])+" |
864 | | // Directly using the regex turned out to be slow. With the regex |
865 | | // version formatting all files in this directory took about 1.25 |
866 | | // seconds. This version took about 0.5 seconds. |
867 | 1.52M | const unsigned char *const Begin = Text.bytes_begin(); |
868 | 1.52M | const unsigned char *const End = Text.bytes_end(); |
869 | 1.52M | const unsigned char *Cur = Begin; |
870 | 2.35M | while (Cur < End) { |
871 | 1.86M | if (isspace(Cur[0])) { |
872 | 834k | ++Cur; |
873 | 1.03M | } else if (Cur[0] == '\\' && (931 Cur[1] == '\n'931 || Cur[1] == '\r'87 )) { |
874 | | // A '\' followed by a newline always escapes the newline, regardless |
875 | | // of whether there is another '\' before it. |
876 | | // The source has a null byte at the end. So the end of the entire input |
877 | | // isn't reached yet. Also the lexer doesn't break apart an escaped |
878 | | // newline. |
879 | 868 | assert(End - Cur >= 2); |
880 | 0 | Cur += 2; |
881 | 1.03M | } else if (Cur[0] == '?' && Cur[1] == '?'2.48k && Cur[2] == '/'38 && |
882 | 1.03M | (0 Cur[3] == '\n'0 || Cur[3] == '\r'0 )) { |
883 | | // Newlines can also be escaped by a '?' '?' '/' trigraph. By the way, the |
884 | | // characters are quoted individually in this comment because if we write |
885 | | // them together some compilers warn that we have a trigraph in the code. |
886 | 0 | assert(End - Cur >= 4); |
887 | 0 | Cur += 4; |
888 | 1.03M | } else { |
889 | 1.03M | break; |
890 | 1.03M | } |
891 | 1.86M | } |
892 | 1.52M | return Cur - Begin; |
893 | 1.52M | } |
894 | | |
/// Produces the next significant token: returns a stashed '>' or '<' half if
/// one is pending, otherwise reads raw tokens, folding leading whitespace
/// into the token's whitespace range while tracking newlines and the current
/// column, then applies language-specific adjustments.
FormatToken *FormatTokenLexer::getNextToken() {
  if (StateStack.top() == LexerState::TOKEN_STASHED) {
    // A '>>' or '<<' was split earlier; emit the synthesized second half now.
    StateStack.pop();
    return getStashedToken();
  }

  FormatTok = new (Allocator.Allocate()) FormatToken;
  readRawToken(*FormatTok);
  // Trailing whitespace trimmed off the previous comment token still counts
  // as whitespace in front of this token.
  SourceLocation WhitespaceStart =
      FormatTok->Tok.getLocation().getLocWithOffset(-TrailingWhitespace);
  FormatTok->IsFirst = IsFirstToken;
  IsFirstToken = false;

  // Consume and record whitespace until we find a significant token.
  // Some tok::unknown tokens are not just whitespace, e.g. whitespace
  // followed by a symbol such as backtick. Those symbols may be
  // significant in other languages.
  unsigned WhitespaceLength = TrailingWhitespace;
  while (FormatTok->isNot(tok::eof)) {
    auto LeadingWhitespace = countLeadingWhitespace(FormatTok->TokenText);
    if (LeadingWhitespace == 0)
      break;
    // Mixed whitespace + symbol token: keep only the whitespace part here;
    // the symbol is re-lexed on the next iteration of the outer loop.
    if (LeadingWhitespace < FormatTok->TokenText.size())
      truncateToken(LeadingWhitespace);
    StringRef Text = FormatTok->TokenText;
    bool InEscape = false;
    // Walk the whitespace character by character to update NewlinesBefore,
    // LastNewlineOffset, and the current Column.
    for (int i = 0, e = Text.size(); i != e; ++i) {
      switch (Text[i]) {
      case '\r':
        // If this is a CRLF sequence, break here and the LF will be handled on
        // the next loop iteration. Otherwise, this is a single Mac CR, treat it
        // the same as a single LF.
        if (i + 1 < e && Text[i + 1] == '\n')
          break;
        LLVM_FALLTHROUGH;
      case '\n':
        ++FormatTok->NewlinesBefore;
        if (!InEscape)
          FormatTok->HasUnescapedNewline = true;
        else
          InEscape = false;
        FormatTok->LastNewlineOffset = WhitespaceLength + i + 1;
        Column = 0;
        break;
      case '\f':
      case '\v':
        Column = 0;
        break;
      case ' ':
        ++Column;
        break;
      case '\t':
        // Advance to the next tab stop (guard against TabWidth == 0).
        Column +=
            Style.TabWidth - (Style.TabWidth ? Column % Style.TabWidth : 0);
        break;
      case '\\':
      case '?':
      case '/':
        // The text was entirely whitespace when this loop was entered. Thus
        // this has to be an escape sequence.
        assert(Text.substr(i, 2) == "\\\r" || Text.substr(i, 2) == "\\\n" ||
               Text.substr(i, 4) == "\?\?/\r" ||
               Text.substr(i, 4) == "\?\?/\n" ||
               (i >= 1 && (Text.substr(i - 1, 4) == "\?\?/\r" ||
                           Text.substr(i - 1, 4) == "\?\?/\n")) ||
               (i >= 2 && (Text.substr(i - 2, 4) == "\?\?/\r" ||
                           Text.substr(i - 2, 4) == "\?\?/\n")));
        InEscape = true;
        break;
      default:
        // This shouldn't happen.
        assert(false);
        break;
      }
    }
    WhitespaceLength += Text.size();
    readRawToken(*FormatTok);
  }

  if (FormatTok->is(tok::unknown))
    FormatTok->setType(TT_ImplicitStringLiteral);

  // JavaScript and Java do not allow to escape the end of the line with a
  // backslash. Backslashes are syntax errors in plain source, but can occur in
  // comments. When a single line comment ends with a \, it'll cause the next
  // line of code to be lexed as a comment, breaking formatting. The code below
  // finds comments that contain a backslash followed by a line break, truncates
  // the comment token at the backslash, and resets the lexer to restart behind
  // the backslash.
  if ((Style.isJavaScript() || Style.Language == FormatStyle::LK_Java) &&
      FormatTok->is(tok::comment) && FormatTok->TokenText.startswith("//")) {
    size_t BackslashPos = FormatTok->TokenText.find('\\');
    while (BackslashPos != StringRef::npos) {
      if (BackslashPos + 1 < FormatTok->TokenText.size() &&
          FormatTok->TokenText[BackslashPos + 1] == '\n') {
        truncateToken(BackslashPos + 1);
        break;
      }
      BackslashPos = FormatTok->TokenText.find('\\', BackslashPos + 1);
    }
  }

  if (Style.isVerilog()) {
    // Verilog uses the backtick instead of the hash for preprocessor stuff.
    // And it uses the hash for delays and parameter lists. In order to continue
    // using `tok::hash` in other places, the backtick gets marked as the hash
    // here. And in order to tell the backtick and hash apart for
    // Verilog-specific stuff, the hash becomes an identifier.
    if (FormatTok->isOneOf(tok::hash, tok::hashhash)) {
      FormatTok->Tok.setKind(tok::raw_identifier);
    } else if (FormatTok->is(tok::raw_identifier)) {
      if (FormatTok->TokenText == "`") {
        FormatTok->Tok.setIdentifierInfo(nullptr);
        FormatTok->Tok.setKind(tok::hash);
      } else if (FormatTok->TokenText == "``") {
        FormatTok->Tok.setIdentifierInfo(nullptr);
        FormatTok->Tok.setKind(tok::hashhash);
      }
    }
  }

  FormatTok->WhitespaceRange = SourceRange(
      WhitespaceStart, WhitespaceStart.getLocWithOffset(WhitespaceLength));

  FormatTok->OriginalColumn = Column;

  TrailingWhitespace = 0;
  if (FormatTok->is(tok::comment)) {
    // FIXME: Add the trimmed whitespace to Column.
    StringRef UntrimmedText = FormatTok->TokenText;
    FormatTok->TokenText = FormatTok->TokenText.rtrim(" \t\v\f");
    TrailingWhitespace = UntrimmedText.size() - FormatTok->TokenText.size();
  } else if (FormatTok->is(tok::raw_identifier)) {
    // Resolve the identifier against the keyword table; some C++ keywords are
    // plain identifiers in Java/JavaScript and are demoted below.
    IdentifierInfo &Info = IdentTable.get(FormatTok->TokenText);
    FormatTok->Tok.setIdentifierInfo(&Info);
    FormatTok->Tok.setKind(Info.getTokenID());
    if (Style.Language == FormatStyle::LK_Java &&
        FormatTok->isOneOf(tok::kw_struct, tok::kw_union, tok::kw_delete,
                           tok::kw_operator)) {
      FormatTok->Tok.setKind(tok::identifier);
      FormatTok->Tok.setIdentifierInfo(nullptr);
    } else if (Style.isJavaScript() &&
               FormatTok->isOneOf(tok::kw_struct, tok::kw_union,
                                  tok::kw_operator)) {
      FormatTok->Tok.setKind(tok::identifier);
      FormatTok->Tok.setIdentifierInfo(nullptr);
    }
  } else if (FormatTok->is(tok::greatergreater)) {
    // Split '>>' into two '>' tokens; the second half is stashed and returned
    // by the next getNextToken() call.
    FormatTok->Tok.setKind(tok::greater);
    FormatTok->TokenText = FormatTok->TokenText.substr(0, 1);
    ++Column;
    StateStack.push(LexerState::TOKEN_STASHED);
  } else if (FormatTok->is(tok::lessless)) {
    // Likewise split '<<' into two '<' tokens.
    FormatTok->Tok.setKind(tok::less);
    FormatTok->TokenText = FormatTok->TokenText.substr(0, 1);
    ++Column;
    StateStack.push(LexerState::TOKEN_STASHED);
  }

  // Now FormatTok is the next non-whitespace token.

  StringRef Text = FormatTok->TokenText;
  size_t FirstNewlinePos = Text.find('\n');
  if (FirstNewlinePos == StringRef::npos) {
    // FIXME: ColumnWidth actually depends on the start column, we need to
    // take this into account when the token is moved.
    FormatTok->ColumnWidth =
        encoding::columnWidthWithTabs(Text, Column, Style.TabWidth, Encoding);
    Column += FormatTok->ColumnWidth;
  } else {
    FormatTok->IsMultiline = true;
    // FIXME: ColumnWidth actually depends on the start column, we need to
    // take this into account when the token is moved.
    FormatTok->ColumnWidth = encoding::columnWidthWithTabs(
        Text.substr(0, FirstNewlinePos), Column, Style.TabWidth, Encoding);

    // The last line of the token always starts in column 0.
    // Thus, the length can be precomputed even in the presence of tabs.
    FormatTok->LastLineColumnWidth = encoding::columnWidthWithTabs(
        Text.substr(Text.find_last_of('\n') + 1), 0, Style.TabWidth, Encoding);
    Column = FormatTok->LastLineColumnWidth;
  }

  if (Style.isCpp()) {
    auto it = Macros.find(FormatTok->Tok.getIdentifierInfo());
    // Apply configured macro token types, except for the macro name directly
    // following "#define" (the previous token's PP keyword check below).
    if (!(Tokens.size() > 0 && Tokens.back()->Tok.getIdentifierInfo() &&
          Tokens.back()->Tok.getIdentifierInfo()->getPPKeywordID() ==
              tok::pp_define) &&
        it != Macros.end()) {
      FormatTok->setType(it->second);
      if (it->second == TT_IfMacro) {
        // The lexer token currently has type tok::kw_unknown. However, for this
        // substitution to be treated correctly in the TokenAnnotator, faking
        // the tok value seems to be needed. Not sure if there's a more elegant
        // way.
        FormatTok->Tok.setKind(tok::kw_if);
      }
    } else if (FormatTok->is(tok::identifier)) {
      if (MacroBlockBeginRegex.match(Text))
        FormatTok->setType(TT_MacroBlockBegin);
      else if (MacroBlockEndRegex.match(Text))
        FormatTok->setType(TT_MacroBlockEnd);
    }
  }

  return FormatTok;
}
1102 | | |
/// Tries to lex a token that only exists in Verilog (the quote, a backtick or
/// double backtick, or a backslash-escaped identifier) directly from the
/// buffer. Returns true and fills \p Tok on success; on failure the caller
/// falls back to the normal raw lexer.
bool FormatTokenLexer::readRawTokenVerilogSpecific(Token &Tok) {
  // In Verilog the quote is not a character literal.
  //
  // Make the backtick and double backtick identifiers to match against them
  // more easily.
  //
  // In Verilog an escaped identifier starts with backslash and ends with
  // whitespace. Unless that whitespace is an escaped newline. A backslash can
  // also begin an escaped newline outside of an escaped identifier. We check
  // for that outside of the Regex since we can't use negative lookhead
  // assertions. Simply changing the '*' to '+' breaks stuff as the escaped
  // identifier may have a length of 0 according to Section A.9.3.
  // FIXME: If there is an escaped newline in the middle of an escaped
  // identifier, allow for pasting the two lines together, But escaped
  // identifiers usually occur only in generated code anyway.
  static const llvm::Regex VerilogToken(R"re(^('|``?|\\(\\)re"
                                        "(\r?\n|\r)|[^[:space:]])*)");

  SmallVector<StringRef, 4> Matches;
  const char *Start = Lex->getBufferLocation();
  if (!VerilogToken.match(StringRef(Start, Lex->getBuffer().end() - Start),
                          &Matches)) {
    return false;
  }
  // There is a null byte at the end of the buffer, so we don't have to check
  // Start[1] is within the buffer.
  if (Start[0] == '\\' && (Start[1] == '\r' || Start[1] == '\n'))
    return false;
  size_t Len = Matches[0].size();

  // The kind has to be an identifier so we can match it against those defined
  // in Keywords. The kind has to be set before the length because the setLength
  // function checks that the kind is not an annotation.
  Tok.setKind(tok::raw_identifier);
  Tok.setLength(Len);
  Tok.setLocation(Lex->getSourceLocation(Start, Len));
  Tok.setRawIdentifierData(Start);
  // Advance the lexer past the matched text so normal lexing resumes there.
  Lex->seek(Lex->getCurrentBufferOffset() + Len, /*IsAtStartofline=*/false);
  return true;
}
1143 | | |
1144 | 1.58M | void FormatTokenLexer::readRawToken(FormatToken &Tok) { |
1145 | | // For Verilog, first see if there is a special token, and fall back to the |
1146 | | // normal lexer if there isn't one. |
1147 | 1.58M | if (!Style.isVerilog() || !readRawTokenVerilogSpecific(Tok.Tok)1.48k ) |
1148 | 1.58M | Lex->LexFromRawLexer(Tok.Tok); |
1149 | 1.58M | Tok.TokenText = StringRef(SourceMgr.getCharacterData(Tok.Tok.getLocation()), |
1150 | 1.58M | Tok.Tok.getLength()); |
1151 | | // For formatting, treat unterminated string literals like normal string |
1152 | | // literals. |
1153 | 1.58M | if (Tok.is(tok::unknown)) { |
1154 | 491k | if (!Tok.TokenText.empty() && Tok.TokenText[0] == '"') { |
1155 | 34 | Tok.Tok.setKind(tok::string_literal); |
1156 | 34 | Tok.IsUnterminatedLiteral = true; |
1157 | 491k | } else if (Style.isJavaScript() && Tok.TokenText == "''"17.3k ) { |
1158 | 12 | Tok.Tok.setKind(tok::string_literal); |
1159 | 12 | } |
1160 | 491k | } |
1161 | | |
1162 | 1.58M | if ((Style.isJavaScript() || Style.Language == FormatStyle::LK_Proto1.53M || |
1163 | 1.58M | Style.Language == FormatStyle::LK_TextProto1.52M ) && |
1164 | 1.58M | Tok.is(tok::char_constant)71.7k ) { |
1165 | 844 | Tok.Tok.setKind(tok::string_literal); |
1166 | 844 | } |
1167 | | |
1168 | 1.58M | if (Tok.is(tok::comment) && (15.9k Tok.TokenText == "// clang-format on"15.9k || |
1169 | 15.9k | Tok.TokenText == "/* clang-format on */"15.8k )) { |
1170 | 137 | FormattingDisabled = false; |
1171 | 137 | } |
1172 | | |
1173 | 1.58M | Tok.Finalized = FormattingDisabled; |
1174 | | |
1175 | 1.58M | if (Tok.is(tok::comment) && (15.9k Tok.TokenText == "// clang-format off"15.9k || |
1176 | 15.9k | Tok.TokenText == "/* clang-format off */"15.8k )) { |
1177 | 137 | FormattingDisabled = true; |
1178 | 137 | } |
1179 | 1.58M | } |
1180 | | |
1181 | 728 | void FormatTokenLexer::resetLexer(unsigned Offset) { |
1182 | 728 | StringRef Buffer = SourceMgr.getBufferData(ID); |
1183 | 728 | LangOpts = getFormattingLangOpts(Style); |
1184 | 728 | Lex.reset(new Lexer(SourceMgr.getLocForStartOfFile(ID), LangOpts, |
1185 | 728 | Buffer.begin(), Buffer.begin() + Offset, Buffer.end())); |
1186 | 728 | Lex->SetKeepWhitespaceMode(true); |
1187 | 728 | TrailingWhitespace = 0; |
1188 | 728 | } |
1189 | | |
1190 | | } // namespace format |
1191 | | } // namespace clang |