Coverage Report

Created: 2022-07-16 07:03

/Users/buildslave/jenkins/workspace/coverage/llvm-project/clang/lib/Format/FormatTokenLexer.cpp
Line
Count
Source (jump to first uncovered line)
1
//===--- FormatTokenLexer.cpp - Lex FormatTokens -------------*- C++ ----*-===//
2
//
3
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4
// See https://llvm.org/LICENSE.txt for license information.
5
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6
//
7
//===----------------------------------------------------------------------===//
8
///
9
/// \file
10
/// This file implements FormatTokenLexer, which tokenizes a source file
11
/// into a FormatToken stream suitable for ClangFormat.
12
///
13
//===----------------------------------------------------------------------===//
14
15
#include "FormatTokenLexer.h"
16
#include "FormatToken.h"
17
#include "clang/Basic/SourceLocation.h"
18
#include "clang/Basic/SourceManager.h"
19
#include "clang/Format/Format.h"
20
#include "llvm/Support/Regex.h"
21
22
namespace clang {
23
namespace format {
24
25
// Sets up a whitespace-preserving Lexer over the buffer of file ID and records,
// for every macro name listed in Style, the token type it is mapped to.
FormatTokenLexer::FormatTokenLexer(
    const SourceManager &SourceMgr, FileID ID, unsigned Column,
    const FormatStyle &Style, encoding::Encoding Encoding,
    llvm::SpecificBumpPtrAllocator<FormatToken> &Allocator,
    IdentifierTable &IdentTable)
    : FormatTok(nullptr), IsFirstToken(true), StateStack({LexerState::NORMAL}),
      Column(Column), TrailingWhitespace(0),
      LangOpts(getFormattingLangOpts(Style)), SourceMgr(SourceMgr), ID(ID),
      Style(Style), IdentTable(IdentTable), Keywords(IdentTable),
      Encoding(Encoding), Allocator(Allocator), FirstInLineIndex(0),
      FormattingDisabled(false), MacroBlockBeginRegex(Style.MacroBlockBegin),
      MacroBlockEndRegex(Style.MacroBlockEnd) {
  Lex.reset(new Lexer(ID, SourceMgr.getBufferOrFake(ID), SourceMgr, LangOpts));
  Lex->SetKeepWhitespaceMode(true);

  // Inserts an entry into Macros for each name in Names, mapping the name's
  // identifier to Type.
  const auto RegisterMacros = [&](const auto &Names, TokenType Type) {
    for (const std::string &Name : Names)
      Macros.insert({&IdentTable.get(Name), Type});
  };

  // The order of these calls is the order in which the lists are inserted
  // into Macros; keep it stable in case a name appears in several lists.
  RegisterMacros(Style.ForEachMacros, TT_ForEachMacro);
  RegisterMacros(Style.IfMacros, TT_IfMacro);
  RegisterMacros(Style.AttributeMacros, TT_AttributeMacro);
  RegisterMacros(Style.StatementMacros, TT_StatementMacro);
  RegisterMacros(Style.TypenameMacros, TT_TypenameMacro);
  RegisterMacros(Style.NamespaceMacros, TT_NamespaceMacro);
  RegisterMacros(Style.WhitespaceSensitiveMacros, TT_UntouchableMacroFunc);
  RegisterMacros(Style.StatementAttributeLikeMacros,
                 TT_StatementAttributeLikeMacro);
}
75
76
65.5k
// Lexes tokens one at a time until an eof token has been appended, running the
// language-specific fix-ups after each token, and returns the token list.
ArrayRef<FormatToken *> FormatTokenLexer::lex() {
  assert(Tokens.empty());
  assert(FirstInLineIndex == 0);
  for (;;) {
    Tokens.push_back(getNextToken());

    if (Style.isJavaScript()) {
      tryParseJSRegexLiteral();
      handleTemplateStrings();
    }
    if (Style.Language == FormatStyle::LK_TextProto)
      tryParsePythonComment();

    tryMergePreviousTokens();

    // This needs to come after tokens have been merged so that C#
    // string literals are correctly identified.
    if (Style.isCSharp())
      handleCSharpVerbatimAndInterpolatedStrings();

    // Remember where the current line starts: a token preceded by a newline,
    // or a multiline token, becomes the new first-in-line index.
    FormatToken *Newest = Tokens.back();
    if (Newest->NewlinesBefore > 0 || Newest->IsMultiline)
      FirstInLineIndex = Tokens.size() - 1;

    if (Newest->is(tok::eof))
      break;
  }
  return Tokens;
}
98
99
1.10M
void FormatTokenLexer::tryMergePreviousTokens() {
100
1.10M
  if (tryMerge_TMacro())
101
18
    return;
102
1.10M
  if (tryMergeConflictMarkers())
103
45
    return;
104
1.10M
  if (tryMergeLessLess())
105
1.29k
    return;
106
1.10M
  if (tryMergeForEach())
107
9
    return;
108
1.10M
  if (Style.isCpp() && 
tryTransformTryUsageForC()1.03M
)
109
36
    return;
110
111
1.10M
  if (Style.isJavaScript() || 
Style.isCSharp()1.06M
) {
112
47.0k
    static const tok::TokenKind NullishCoalescingOperator[] = {tok::question,
113
47.0k
                                                               tok::question};
114
47.0k
    static const tok::TokenKind NullPropagatingOperator[] = {tok::question,
115
47.0k
                                                             tok::period};
116
47.0k
    static const tok::TokenKind FatArrow[] = {tok::equal, tok::greater};
117
118
47.0k
    if (tryMergeTokens(FatArrow, TT_FatArrow))
119
244
      return;
120
46.8k
    if (tryMergeTokens(NullishCoalescingOperator, TT_NullCoalescingOperator)) {
121
      // Treat like the "||" operator (as opposed to the ternary ?).
122
38
      Tokens.back()->Tok.setKind(tok::pipepipe);
123
38
      return;
124
38
    }
125
46.7k
    if (tryMergeTokens(NullPropagatingOperator, TT_NullPropagatingOperator)) {
126
      // Treat like a regular "." access.
127
22
      Tokens.back()->Tok.setKind(tok::period);
128
22
      return;
129
22
    }
130
46.7k
    if (tryMergeNullishCoalescingEqual())
131
14
      return;
132
46.7k
  }
133
134
1.09M
  if (Style.isCSharp()) {
135
8.98k
    static const tok::TokenKind CSharpNullConditionalLSquare[] = {
136
8.98k
        tok::question, tok::l_square};
137
138
8.98k
    if (tryMergeCSharpKeywordVariables())
139
8
      return;
140
8.98k
    if (tryMergeCSharpStringLiteral())
141
43
      return;
142
8.93k
    if (tryTransformCSharpForEach())
143
8
      return;
144
8.92k
    if (tryMergeTokens(CSharpNullConditionalLSquare,
145
8.92k
                       TT_CSharpNullConditionalLSquare)) {
146
      // Treat like a regular "[" operator.
147
8
      Tokens.back()->Tok.setKind(tok::l_square);
148
8
      return;
149
8
    }
150
8.92k
  }
151
152
1.09M
  if (tryMergeNSStringLiteral())
153
483
    return;
154
155
1.09M
  if (Style.isJavaScript()) {
156
37.7k
    static const tok::TokenKind JSIdentity[] = {tok::equalequal, tok::equal};
157
37.7k
    static const tok::TokenKind JSNotIdentity[] = {tok::exclaimequal,
158
37.7k
                                                   tok::equal};
159
37.7k
    static const tok::TokenKind JSShiftEqual[] = {tok::greater, tok::greater,
160
37.7k
                                                  tok::greaterequal};
161
37.7k
    static const tok::TokenKind JSExponentiation[] = {tok::star, tok::star};
162
37.7k
    static const tok::TokenKind JSExponentiationEqual[] = {tok::star,
163
37.7k
                                                           tok::starequal};
164
37.7k
    static const tok::TokenKind JSPipePipeEqual[] = {tok::pipepipe, tok::equal};
165
37.7k
    static const tok::TokenKind JSAndAndEqual[] = {tok::ampamp, tok::equal};
166
167
    // FIXME: Investigate what token type gives the correct operator priority.
168
37.7k
    if (tryMergeTokens(JSIdentity, TT_BinaryOperator))
169
12
      return;
170
37.7k
    if (tryMergeTokens(JSNotIdentity, TT_BinaryOperator))
171
12
      return;
172
37.7k
    if (tryMergeTokens(JSShiftEqual, TT_BinaryOperator))
173
10
      return;
174
37.7k
    if (tryMergeTokens(JSExponentiation, TT_JsExponentiation))
175
4
      return;
176
37.7k
    if (tryMergeTokens(JSExponentiationEqual, TT_JsExponentiationEqual)) {
177
4
      Tokens.back()->Tok.setKind(tok::starequal);
178
4
      return;
179
4
    }
180
37.7k
    if (tryMergeTokens(JSAndAndEqual, TT_JsAndAndEqual) ||
181
37.7k
        
tryMergeTokens(JSPipePipeEqual, TT_JsPipePipeEqual)37.7k
) {
182
      // Treat like the "=" assignment operator.
183
8
      Tokens.back()->Tok.setKind(tok::equal);
184
8
      return;
185
8
    }
186
37.7k
    if (tryMergeJSPrivateIdentifier())
187
32
      return;
188
37.7k
  }
189
190
1.09M
  if (Style.Language == FormatStyle::LK_Java) {
191
4.54k
    static const tok::TokenKind JavaRightLogicalShiftAssign[] = {
192
4.54k
        tok::greater, tok::greater, tok::greaterequal};
193
4.54k
    if (tryMergeTokens(JavaRightLogicalShiftAssign, TT_BinaryOperator))
194
2
      return;
195
4.54k
  }
196
1.09M
}
197
198
1.09M
bool FormatTokenLexer::tryMergeNSStringLiteral() {
199
1.09M
  if (Tokens.size() < 2)
200
65.5k
    return false;
201
1.03M
  auto &At = *(Tokens.end() - 2);
202
1.03M
  auto &String = *(Tokens.end() - 1);
203
1.03M
  if (!At->is(tok::at) || 
!String->is(tok::string_literal)2.83k
)
204
1.03M
    return false;
205
483
  At->Tok.setKind(tok::string_literal);
206
483
  At->TokenText = StringRef(At->TokenText.begin(),
207
483
                            String->TokenText.end() - At->TokenText.begin());
208
483
  At->ColumnWidth += String->ColumnWidth;
209
483
  At->setType(TT_ObjCStringLiteral);
210
483
  Tokens.erase(Tokens.end() - 1);
211
483
  return true;
212
1.03M
}
213
214
37.7k
bool FormatTokenLexer::tryMergeJSPrivateIdentifier() {
215
  // Merges #idenfier into a single identifier with the text #identifier
216
  // but the token tok::identifier.
217
37.7k
  if (Tokens.size() < 2)
218
3.02k
    return false;
219
34.6k
  auto &Hash = *(Tokens.end() - 2);
220
34.6k
  auto &Identifier = *(Tokens.end() - 1);
221
34.6k
  if (!Hash->is(tok::hash) || 
!Identifier->is(tok::identifier)36
)
222
34.6k
    return false;
223
32
  Hash->Tok.setKind(tok::identifier);
224
32
  Hash->TokenText =
225
32
      StringRef(Hash->TokenText.begin(),
226
32
                Identifier->TokenText.end() - Hash->TokenText.begin());
227
32
  Hash->ColumnWidth += Identifier->ColumnWidth;
228
32
  Hash->setType(TT_JsPrivateIdentifier);
229
32
  Tokens.erase(Tokens.end() - 1);
230
32
  return true;
231
34.6k
}
232
233
// Search for verbatim or interpolated string literals @"ABC" or
234
// $"aaaaa{abc}aaaaa" and mark the token as TT_CSharpStringLiteral, to
235
// prevent splitting of @, $ and ".
236
// Merging of multiline verbatim strings with embedded '"' is handled in
237
// handleCSharpVerbatimAndInterpolatedStrings with lower-level lexing.
238
8.98k
bool FormatTokenLexer::tryMergeCSharpStringLiteral() {
239
8.98k
  if (Tokens.size() < 2)
240
527
    return false;
241
242
  // Interpolated strings could contain { } with " characters inside.
243
  // $"{x ?? "null"}"
244
  // should not be split into $"{x ?? ", null, "}" but should treated as a
245
  // single string-literal.
246
  //
247
  // We opt not to try and format expressions inside {} within a C#
248
  // interpolated string. Formatting expressions within an interpolated string
249
  // would require similar work as that done for JavaScript template strings
250
  // in `handleTemplateStrings()`.
251
8.45k
  auto &CSharpInterpolatedString = *(Tokens.end() - 2);
252
8.45k
  if (CSharpInterpolatedString->getType() == TT_CSharpStringLiteral &&
253
8.45k
      
(43
CSharpInterpolatedString->TokenText.startswith(R"($")")43
||
254
43
       
CSharpInterpolatedString->TokenText.startswith(R"($@")")12
)) {
255
37
    int UnmatchedOpeningBraceCount = 0;
256
257
37
    auto TokenTextSize = CSharpInterpolatedString->TokenText.size();
258
989
    for (size_t Index = 0; Index < TokenTextSize; 
++Index952
) {
259
952
      char C = CSharpInterpolatedString->TokenText[Index];
260
952
      if (C == '{') {
261
        // "{{"  inside an interpolated string is an escaped '{' so skip it.
262
49
        if (Index + 1 < TokenTextSize &&
263
49
            CSharpInterpolatedString->TokenText[Index + 1] == '{') {
264
6
          ++Index;
265
6
          continue;
266
6
        }
267
43
        ++UnmatchedOpeningBraceCount;
268
903
      } else if (C == '}') {
269
        // "}}"  inside an interpolated string is an escaped '}' so skip it.
270
43
        if (Index + 1 < TokenTextSize &&
271
43
            CSharpInterpolatedString->TokenText[Index + 1] == '}') {
272
6
          ++Index;
273
6
          continue;
274
6
        }
275
37
        --UnmatchedOpeningBraceCount;
276
37
      }
277
952
    }
278
279
37
    if (UnmatchedOpeningBraceCount > 0) {
280
6
      auto &NextToken = *(Tokens.end() - 1);
281
6
      CSharpInterpolatedString->TokenText =
282
6
          StringRef(CSharpInterpolatedString->TokenText.begin(),
283
6
                    NextToken->TokenText.end() -
284
6
                        CSharpInterpolatedString->TokenText.begin());
285
6
      CSharpInterpolatedString->ColumnWidth += NextToken->ColumnWidth;
286
6
      Tokens.erase(Tokens.end() - 1);
287
6
      return true;
288
6
    }
289
37
  }
290
291
  // Look for @"aaaaaa" or $"aaaaaa".
292
8.44k
  auto &String = *(Tokens.end() - 1);
293
8.44k
  if (!String->is(tok::string_literal))
294
8.32k
    return false;
295
296
119
  auto &At = *(Tokens.end() - 2);
297
119
  if (!(At->is(tok::at) || 
At->TokenText == "$"107
))
298
82
    return false;
299
300
37
  if (Tokens.size() > 2 && 
At->is(tok::at)35
) {
301
12
    auto &Dollar = *(Tokens.end() - 3);
302
12
    if (Dollar->TokenText == "$") {
303
      // This looks like $@"aaaaa" so we need to combine all 3 tokens.
304
6
      Dollar->Tok.setKind(tok::string_literal);
305
6
      Dollar->TokenText =
306
6
          StringRef(Dollar->TokenText.begin(),
307
6
                    String->TokenText.end() - Dollar->TokenText.begin());
308
6
      Dollar->ColumnWidth += (At->ColumnWidth + String->ColumnWidth);
309
6
      Dollar->setType(TT_CSharpStringLiteral);
310
6
      Tokens.erase(Tokens.end() - 2);
311
6
      Tokens.erase(Tokens.end() - 1);
312
6
      return true;
313
6
    }
314
12
  }
315
316
  // Convert back into just a string_literal.
317
31
  At->Tok.setKind(tok::string_literal);
318
31
  At->TokenText = StringRef(At->TokenText.begin(),
319
31
                            String->TokenText.end() - At->TokenText.begin());
320
31
  At->ColumnWidth += String->ColumnWidth;
321
31
  At->setType(TT_CSharpStringLiteral);
322
31
  Tokens.erase(Tokens.end() - 1);
323
31
  return true;
324
37
}
325
326
// Valid C# attribute targets:
327
// https://docs.microsoft.com/en-us/dotnet/csharp/programming-guide/concepts/attributes/#attribute-targets
328
// Out-of-class definition of the FormatTokenLexer::CSharpAttributeTargets
// string set, holding the attribute-target names listed below.
const llvm::StringSet<> FormatTokenLexer::CSharpAttributeTargets = {
329
    "assembly", "module",   "field",  "event", "method",
330
    "param",    "property", "return", "type",
331
};
332
333
46.7k
bool FormatTokenLexer::tryMergeNullishCoalescingEqual() {
334
46.7k
  if (Tokens.size() < 2)
335
3.55k
    return false;
336
43.2k
  auto &NullishCoalescing = *(Tokens.end() - 2);
337
43.2k
  auto &Equal = *(Tokens.end() - 1);
338
43.2k
  if (NullishCoalescing->getType() != TT_NullCoalescingOperator ||
339
43.2k
      
!Equal->is(tok::equal)38
) {
340
43.2k
    return false;
341
43.2k
  }
342
14
  NullishCoalescing->Tok.setKind(tok::equal); // no '??=' in clang tokens.
343
14
  NullishCoalescing->TokenText =
344
14
      StringRef(NullishCoalescing->TokenText.begin(),
345
14
                Equal->TokenText.end() - NullishCoalescing->TokenText.begin());
346
14
  NullishCoalescing->ColumnWidth += Equal->ColumnWidth;
347
14
  NullishCoalescing->setType(TT_NullCoalescingEqual);
348
14
  Tokens.erase(Tokens.end() - 1);
349
14
  return true;
350
43.2k
}
351
352
8.98k
bool FormatTokenLexer::tryMergeCSharpKeywordVariables() {
353
8.98k
  if (Tokens.size() < 2)
354
527
    return false;
355
8.46k
  auto &At = *(Tokens.end() - 2);
356
8.46k
  auto &Keyword = *(Tokens.end() - 1);
357
8.46k
  if (!At->is(tok::at))
358
8.43k
    return false;
359
28
  if (!Keywords.isCSharpKeyword(*Keyword))
360
20
    return false;
361
362
8
  At->Tok.setKind(tok::identifier);
363
8
  At->TokenText = StringRef(At->TokenText.begin(),
364
8
                            Keyword->TokenText.end() - At->TokenText.begin());
365
8
  At->ColumnWidth += Keyword->ColumnWidth;
366
8
  At->setType(Keyword->getType());
367
8
  Tokens.erase(Tokens.end() - 1);
368
8
  return true;
369
28
}
370
371
// In C#, turn the identifier `foreach` into a tok::kw_for token typed
// TT_ForEachMacro.
372
8.93k
bool FormatTokenLexer::tryTransformCSharpForEach() {
373
8.93k
  if (Tokens.size() < 1)
374
0
    return false;
375
8.93k
  auto &Identifier = *(Tokens.end() - 1);
376
8.93k
  if (!Identifier->is(tok::identifier))
377
6.42k
    return false;
378
2.51k
  if (Identifier->TokenText != "foreach")
379
2.50k
    return false;
380
381
8
  Identifier->setType(TT_ForEachMacro);
382
8
  Identifier->Tok.setKind(tok::kw_for);
383
8
  return true;
384
2.51k
}
385
386
1.10M
bool FormatTokenLexer::tryMergeForEach() {
387
1.10M
  if (Tokens.size() < 2)
388
65.5k
    return false;
389
1.03M
  auto &For = *(Tokens.end() - 2);
390
1.03M
  auto &Each = *(Tokens.end() - 1);
391
1.03M
  if (!For->is(tok::kw_for))
392
1.03M
    return false;
393
1.91k
  if (!Each->is(tok::identifier))
394
1.89k
    return false;
395
20
  if (Each->TokenText != "each")
396
11
    return false;
397
398
9
  For->setType(TT_ForEachMacro);
399
9
  For->Tok.setKind(tok::kw_for);
400
401
9
  For->TokenText = StringRef(For->TokenText.begin(),
402
9
                             Each->TokenText.end() - For->TokenText.begin());
403
9
  For->ColumnWidth += Each->ColumnWidth;
404
9
  Tokens.erase(Tokens.end() - 1);
405
9
  return true;
406
20
}
407
408
1.03M
bool FormatTokenLexer::tryTransformTryUsageForC() {
409
1.03M
  if (Tokens.size() < 2)
410
60.7k
    return false;
411
975k
  auto &Try = *(Tokens.end() - 2);
412
975k
  if (!Try->is(tok::kw_try))
413
974k
    return false;
414
337
  auto &Next = *(Tokens.end() - 1);
415
337
  if (Next->isOneOf(tok::l_brace, tok::colon, tok::hash, tok::comment))
416
295
    return false;
417
418
42
  if (Tokens.size() > 2) {
419
33
    auto &At = *(Tokens.end() - 3);
420
33
    if (At->is(tok::at))
421
6
      return false;
422
33
  }
423
424
36
  Try->Tok.setKind(tok::identifier);
425
36
  return true;
426
42
}
427
428
1.10M
bool FormatTokenLexer::tryMergeLessLess() {
429
  // Merge X,less,less,Y into X,lessless,Y unless X or Y is less.
430
1.10M
  if (Tokens.size() < 3)
431
130k
    return false;
432
433
970k
  auto First = Tokens.end() - 3;
434
970k
  if (First[0]->isNot(tok::less) || 
First[1]->isNot(tok::less)20.3k
)
435
969k
    return false;
436
437
  // Only merge if there currently is no whitespace between the two "<".
438
1.55k
  if (First[1]->hasWhitespaceBefore())
439
36
    return false;
440
441
1.51k
  auto X = Tokens.size() > 3 ? 
First[-1]1.51k
:
nullptr3
;
442
1.51k
  auto Y = First[2];
443
1.51k
  if ((X && 
X->is(tok::less)1.51k
) ||
Y->is(tok::less)1.39k
)
444
225
    return false;
445
446
  // Do not remove a whitespace between the two "<" e.g. "operator< <>".
447
1.29k
  if (X && X->is(tok::kw_operator) && 
Y->is(tok::greater)30
)
448
0
    return false;
449
450
1.29k
  First[0]->Tok.setKind(tok::lessless);
451
1.29k
  First[0]->TokenText = "<<";
452
1.29k
  First[0]->ColumnWidth += 1;
453
1.29k
  Tokens.erase(Tokens.end() - 2);
454
1.29k
  return true;
455
1.29k
}
456
457
bool FormatTokenLexer::tryMergeTokens(ArrayRef<tok::TokenKind> Kinds,
458
418k
                                      TokenType NewType) {
459
418k
  if (Tokens.size() < Kinds.size())
460
35.9k
    return false;
461
462
382k
  SmallVectorImpl<FormatToken *>::const_iterator First =
463
382k
      Tokens.end() - Kinds.size();
464
382k
  if (!First[0]->is(Kinds[0]))
465
379k
    return false;
466
3.26k
  unsigned AddLength = 0;
467
3.69k
  for (unsigned i = 1; i < Kinds.size(); 
++i426
) {
468
3.32k
    if (!First[i]->is(Kinds[i]) || 
First[i]->hasWhitespaceBefore()438
)
469
2.90k
      return false;
470
426
    AddLength += First[i]->TokenText.size();
471
426
  }
472
364
  Tokens.resize(Tokens.size() - Kinds.size() + 1);
473
364
  First[0]->TokenText = StringRef(First[0]->TokenText.data(),
474
364
                                  First[0]->TokenText.size() + AddLength);
475
364
  First[0]->ColumnWidth += AddLength;
476
364
  First[0]->setType(NewType);
477
364
  return true;
478
3.26k
}
479
480
// Returns \c true if \p Tok can only be followed by an operand in JavaScript.
481
336
bool FormatTokenLexer::precedesOperand(FormatToken *Tok) {
  // NB: This is not entirely correct, as an r_paren can introduce an operand
  // location in e.g. `if (foo) /bar/.exec(...);`. That is a rare enough
  // corner case to not matter in practice, though.

  // Punctuation after which an operand is expected.
  if (Tok->isOneOf(tok::period, tok::l_paren, tok::comma, tok::l_brace,
                   tok::r_brace, tok::l_square, tok::semi, tok::exclaim,
                   tok::colon, tok::question, tok::tilde))
    return true;

  // Keywords after which an operand is expected.
  if (Tok->isOneOf(tok::kw_return, tok::kw_do, tok::kw_case, tok::kw_throw,
                   tok::kw_else, tok::kw_new, tok::kw_delete, tok::kw_void,
                   tok::kw_typeof, Keywords.kw_instanceof, Keywords.kw_in))
    return true;

  return Tok->isBinaryOperator();
}
493
494
340
// Returns true if a regex literal may start right after Prev. A null Prev
// (no preceding non-comment token) always allows one.
bool FormatTokenLexer::canPrecedeRegexLiteral(FormatToken *Prev) {
  if (!Prev)
    return true;

  // Regex literals can only follow after prefix unary operators, not after
  // postfix unary operators. If the '++' is followed by a non-operand
  // introducing token, the slash here is the operand and not the start of a
  // regex.
  // `!` is an unary prefix operator, but also a post-fix operator that casts
  // away nullability, so the same check applies.
  if (Prev->isOneOf(tok::plusplus, tok::minusminus, tok::exclaim)) {
    const size_t Count = Tokens.size();
    return Count < 3 || precedesOperand(Tokens[Count - 3]);
  }

  // Otherwise the previous token must introduce an operand location where
  // regex literals can occur.
  return precedesOperand(Prev);
}
514
515
// Tries to parse a JavaScript Regex literal starting at the current token,
516
// if that begins with a slash and is in a location where JavaScript allows
517
// regex literals. Changes the current token to a regex literal and updates
518
// its text if successful.
519
37.9k
void FormatTokenLexer::tryParseJSRegexLiteral() {
520
37.9k
  FormatToken *RegexToken = Tokens.back();
521
37.9k
  if (!RegexToken->isOneOf(tok::slash, tok::slashequal))
522
37.6k
    return;
523
524
340
  FormatToken *Prev = nullptr;
525
344
  for (FormatToken *FT : llvm::drop_begin(llvm::reverse(Tokens))) {
526
    // NB: Because previous pointers are not initialized yet, this cannot use
527
    // Token.getPreviousNonComment.
528
344
    if (FT->isNot(tok::comment)) {
529
336
      Prev = FT;
530
336
      break;
531
336
    }
532
344
  }
533
534
340
  if (!canPrecedeRegexLiteral(Prev))
535
36
    return;
536
537
  // 'Manually' lex ahead in the current file buffer.
538
304
  const char *Offset = Lex->getBufferLocation();
539
304
  const char *RegexBegin = Offset - RegexToken->TokenText.size();
540
304
  StringRef Buffer = Lex->getBuffer();
541
304
  bool InCharacterClass = false;
542
304
  bool HaveClosingSlash = false;
543
1.91k
  for (; !HaveClosingSlash && 
Offset != Buffer.end()1.61k
;
++Offset1.61k
) {
544
    // Regular expressions are terminated with a '/', which can only be
545
    // escaped using '\' or a character class between '[' and ']'.
546
    // See http://www.ecma-international.org/ecma-262/5.1/#sec-7.8.5.
547
1.61k
    switch (*Offset) {
548
116
    case '\\':
549
      // Skip the escaped character.
550
116
      ++Offset;
551
116
      break;
552
40
    case '[':
553
40
      InCharacterClass = true;
554
40
      break;
555
40
    case ']':
556
40
      InCharacterClass = false;
557
40
      break;
558
320
    case '/':
559
320
      if (!InCharacterClass)
560
304
        HaveClosingSlash = true;
561
320
      break;
562
1.61k
    }
563
1.61k
  }
564
565
304
  RegexToken->setType(TT_RegexLiteral);
566
  // Treat regex literals like other string_literals.
567
304
  RegexToken->Tok.setKind(tok::string_literal);
568
304
  RegexToken->TokenText = StringRef(RegexBegin, Offset - RegexBegin);
569
304
  RegexToken->ColumnWidth = RegexToken->TokenText.size();
570
571
304
  resetLexer(SourceMgr.getFileOffset(Lex->getSourceLocation(Offset)));
572
304
}
573
574
9.11k
void FormatTokenLexer::handleCSharpVerbatimAndInterpolatedStrings() {
575
9.11k
  FormatToken *CSharpStringLiteral = Tokens.back();
576
577
9.11k
  if (CSharpStringLiteral->getType() != TT_CSharpStringLiteral)
578
9.06k
    return;
579
580
  // Deal with multiline strings.
581
43
  if (!(CSharpStringLiteral->TokenText.startswith(R"(@")") ||
582
43
        
CSharpStringLiteral->TokenText.startswith(R"($@")")37
)) {
583
31
    return;
584
31
  }
585
586
12
  const char *StrBegin =
587
12
      Lex->getBufferLocation() - CSharpStringLiteral->TokenText.size();
588
12
  const char *Offset = StrBegin;
589
12
  if (CSharpStringLiteral->TokenText.startswith(R"(@")"))
590
6
    Offset += 2;
591
6
  else // CSharpStringLiteral->TokenText.startswith(R"($@")")
592
6
    Offset += 3;
593
594
  // Look for a terminating '"' in the current file buffer.
595
  // Make no effort to format code within an interpolated or verbatim string.
596
288
  for (; Offset != Lex->getBuffer().end(); 
++Offset276
) {
597
288
    if (Offset[0] == '"') {
598
      // "" within a verbatim string is an escaped double quote: skip it.
599
22
      if (Offset + 1 < Lex->getBuffer().end() && Offset[1] == '"')
600
10
        ++Offset;
601
12
      else
602
12
        break;
603
22
    }
604
288
  }
605
606
  // Make no attempt to format code properly if a verbatim string is
607
  // unterminated.
608
12
  if (Offset == Lex->getBuffer().end())
609
0
    return;
610
611
12
  StringRef LiteralText(StrBegin, Offset - StrBegin + 1);
612
12
  CSharpStringLiteral->TokenText = LiteralText;
613
614
  // Adjust width for potentially multiline string literals.
615
12
  size_t FirstBreak = LiteralText.find('\n');
616
12
  StringRef FirstLineText = FirstBreak == StringRef::npos
617
12
                                ? 
LiteralText10
618
12
                                : 
LiteralText.substr(0, FirstBreak)2
;
619
12
  CSharpStringLiteral->ColumnWidth = encoding::columnWidthWithTabs(
620
12
      FirstLineText, CSharpStringLiteral->OriginalColumn, Style.TabWidth,
621
12
      Encoding);
622
12
  size_t LastBreak = LiteralText.rfind('\n');
623
12
  if (LastBreak != StringRef::npos) {
624
2
    CSharpStringLiteral->IsMultiline = true;
625
2
    unsigned StartColumn = 0;
626
2
    CSharpStringLiteral->LastLineColumnWidth =
627
2
        encoding::columnWidthWithTabs(LiteralText.substr(LastBreak + 1),
628
2
                                      StartColumn, Style.TabWidth, Encoding);
629
2
  }
630
631
12
  SourceLocation loc = Offset < Lex->getBuffer().end()
632
12
                           ? Lex->getSourceLocation(Offset + 1)
633
12
                           : 
SourceMgr.getLocForEndOfFile(ID)0
;
634
12
  resetLexer(SourceMgr.getFileOffset(loc));
635
12
}
636
637
37.9k
void FormatTokenLexer::handleTemplateStrings() {
638
37.9k
  FormatToken *BacktickToken = Tokens.back();
639
640
37.9k
  if (BacktickToken->is(tok::l_brace)) {
641
2.07k
    StateStack.push(LexerState::NORMAL);
642
2.07k
    return;
643
2.07k
  }
644
35.8k
  if (BacktickToken->is(tok::r_brace)) {
645
2.17k
    if (StateStack.size() == 1)
646
4
      return;
647
2.16k
    StateStack.pop();
648
2.16k
    if (StateStack.top() != LexerState::TEMPLATE_STRING)
649
2.06k
      return;
650
    // If back in TEMPLATE_STRING, fallthrough and continue parsing the
651
33.7k
  } else if (BacktickToken->is(tok::unknown) &&
652
33.7k
             
BacktickToken->TokenText == "`"148
) {
653
148
    StateStack.push(LexerState::TEMPLATE_STRING);
654
33.5k
  } else {
655
33.5k
    return; // Not actually a template
656
33.5k
  }
657
658
  // 'Manually' lex ahead in the current file buffer.
659
248
  const char *Offset = Lex->getBufferLocation();
660
248
  const char *TmplBegin = Offset - BacktickToken->TokenText.size(); // at "`"
661
1.90k
  for (; Offset != Lex->getBuffer().end(); 
++Offset1.65k
) {
662
1.90k
    if (Offset[0] == '`') {
663
148
      StateStack.pop();
664
148
      break;
665
148
    }
666
1.75k
    if (Offset[0] == '\\') {
667
8
      ++Offset; // Skip the escaped character.
668
1.74k
    } else if (Offset + 1 < Lex->getBuffer().end() && Offset[0] == '$' &&
669
1.74k
               
Offset[1] == '{'100
) {
670
      // '${' introduces an expression interpolation in the template string.
671
100
      StateStack.push(LexerState::NORMAL);
672
100
      ++Offset;
673
100
      break;
674
100
    }
675
1.75k
  }
676
677
248
  StringRef LiteralText(TmplBegin, Offset - TmplBegin + 1);
678
248
  BacktickToken->setType(TT_TemplateString);
679
248
  BacktickToken->Tok.setKind(tok::string_literal);
680
248
  BacktickToken->TokenText = LiteralText;
681
682
  // Adjust width for potentially multiline string literals.
683
248
  size_t FirstBreak = LiteralText.find('\n');
684
248
  StringRef FirstLineText = FirstBreak == StringRef::npos
685
248
                                ? 
LiteralText212
686
248
                                : 
LiteralText.substr(0, FirstBreak)36
;
687
248
  BacktickToken->ColumnWidth = encoding::columnWidthWithTabs(
688
248
      FirstLineText, BacktickToken->OriginalColumn, Style.TabWidth, Encoding);
689
248
  size_t LastBreak = LiteralText.rfind('\n');
690
248
  if (LastBreak != StringRef::npos) {
691
36
    BacktickToken->IsMultiline = true;
692
36
    unsigned StartColumn = 0; // The template tail spans the entire line.
693
36
    BacktickToken->LastLineColumnWidth =
694
36
        encoding::columnWidthWithTabs(LiteralText.substr(LastBreak + 1),
695
36
                                      StartColumn, Style.TabWidth, Encoding);
696
36
  }
697
698
248
  SourceLocation loc = Offset < Lex->getBuffer().end()
699
248
                           ? Lex->getSourceLocation(Offset + 1)
700
248
                           : 
SourceMgr.getLocForEndOfFile(ID)0
;
701
248
  resetLexer(SourceMgr.getFileOffset(loc));
702
248
}
703
704
6.73k
void FormatTokenLexer::tryParsePythonComment() {
705
6.73k
  FormatToken *HashToken = Tokens.back();
706
6.73k
  if (!HashToken->isOneOf(tok::hash, tok::hashhash))
707
6.66k
    return;
708
  // Turn the remainder of this line into a comment.
709
68
  const char *CommentBegin =
710
68
      Lex->getBufferLocation() - HashToken->TokenText.size(); // at "#"
711
68
  size_t From = CommentBegin - Lex->getBuffer().begin();
712
68
  size_t To = Lex->getBuffer().find_first_of('\n', From);
713
68
  if (To == StringRef::npos)
714
8
    To = Lex->getBuffer().size();
715
68
  size_t Len = To - From;
716
68
  HashToken->setType(TT_LineComment);
717
68
  HashToken->Tok.setKind(tok::comment);
718
68
  HashToken->TokenText = Lex->getBuffer().substr(From, Len);
719
68
  SourceLocation Loc = To < Lex->getBuffer().size()
720
68
                           ? 
Lex->getSourceLocation(CommentBegin + Len)60
721
68
                           : 
SourceMgr.getLocForEndOfFile(ID)8
;
722
68
  resetLexer(SourceMgr.getFileOffset(Loc));
723
68
}
724
725
1.10M
bool FormatTokenLexer::tryMerge_TMacro() {
726
1.10M
  if (Tokens.size() < 4)
727
195k
    return false;
728
906k
  FormatToken *Last = Tokens.back();
729
906k
  if (!Last->is(tok::r_paren))
730
824k
    return false;
731
732
81.5k
  FormatToken *String = Tokens[Tokens.size() - 2];
733
81.5k
  if (!String->is(tok::string_literal) || 
String->IsMultiline795
)
734
80.7k
    return false;
735
736
762
  if (!Tokens[Tokens.size() - 3]->is(tok::l_paren))
737
359
    return false;
738
739
403
  FormatToken *Macro = Tokens[Tokens.size() - 4];
740
403
  if (Macro->TokenText != "_T")
741
385
    return false;
742
743
18
  const char *Start = Macro->TokenText.data();
744
18
  const char *End = Last->TokenText.data() + Last->TokenText.size();
745
18
  String->TokenText = StringRef(Start, End - Start);
746
18
  String->IsFirst = Macro->IsFirst;
747
18
  String->LastNewlineOffset = Macro->LastNewlineOffset;
748
18
  String->WhitespaceRange = Macro->WhitespaceRange;
749
18
  String->OriginalColumn = Macro->OriginalColumn;
750
18
  String->ColumnWidth = encoding::columnWidthWithTabs(
751
18
      String->TokenText, String->OriginalColumn, Style.TabWidth, Encoding);
752
18
  String->NewlinesBefore = Macro->NewlinesBefore;
753
18
  String->HasUnescapedNewline = Macro->HasUnescapedNewline;
754
755
18
  Tokens.pop_back();
756
18
  Tokens.pop_back();
757
18
  Tokens.pop_back();
758
18
  Tokens.back() = String;
759
18
  if (FirstInLineIndex >= Tokens.size())
760
3
    FirstInLineIndex = Tokens.size() - 1;
761
18
  return true;
762
403
}
763
764
1.10M
bool FormatTokenLexer::tryMergeConflictMarkers() {
765
1.10M
  if (Tokens.back()->NewlinesBefore == 0 && 
Tokens.back()->isNot(tok::eof)1.01M
)
766
953k
    return false;
767
768
  // Conflict lines look like:
769
  // <marker> <text from the vcs>
770
  // For example:
771
  // >>>>>>> /file/in/file/system at revision 1234
772
  //
773
  // We merge all tokens in a line that starts with a conflict marker
774
  // into a single token with a special token type that the unwrapped line
775
  // parser will use to correctly rebuild the underlying code.
776
777
147k
  FileID ID;
778
  // Get the position of the first token in the line.
779
147k
  unsigned FirstInLineOffset;
780
147k
  std::tie(ID, FirstInLineOffset) = SourceMgr.getDecomposedLoc(
781
147k
      Tokens[FirstInLineIndex]->getStartOfNonWhitespace());
782
147k
  StringRef Buffer = SourceMgr.getBufferOrFake(ID).getBuffer();
783
  // Calculate the offset of the start of the current line.
784
147k
  auto LineOffset = Buffer.rfind('\n', FirstInLineOffset);
785
147k
  if (LineOffset == StringRef::npos)
786
64.3k
    LineOffset = 0;
787
83.4k
  else
788
83.4k
    ++LineOffset;
789
790
147k
  auto FirstSpace = Buffer.find_first_of(" \n", LineOffset);
791
147k
  StringRef LineStart;
792
147k
  if (FirstSpace == StringRef::npos)
793
8.72k
    LineStart = Buffer.substr(LineOffset);
794
139k
  else
795
139k
    LineStart = Buffer.substr(LineOffset, FirstSpace - LineOffset);
796
797
147k
  TokenType Type = TT_Unknown;
798
147k
  if (LineStart == "<<<<<<<" || 
LineStart == ">>>>"147k
) {
799
9
    Type = TT_ConflictStart;
800
147k
  } else if (LineStart == "|||||||" || 
LineStart == "======="147k
||
801
147k
             
LineStart == "===="147k
) {
802
27
    Type = TT_ConflictAlternative;
803
147k
  } else if (LineStart == ">>>>>>>" || 
LineStart == "<<<<"147k
) {
804
9
    Type = TT_ConflictEnd;
805
9
  }
806
807
147k
  if (Type != TT_Unknown) {
808
45
    FormatToken *Next = Tokens.back();
809
810
45
    Tokens.resize(FirstInLineIndex + 1);
811
    // We do not need to build a complete token here, as we will skip it
812
    // during parsing anyway (as we must not touch whitespace around conflict
813
    // markers).
814
45
    Tokens.back()->setType(Type);
815
45
    Tokens.back()->Tok.setKind(tok::kw___unknown_anytype);
816
817
45
    Tokens.push_back(Next);
818
45
    return true;
819
45
  }
820
821
147k
  return false;
822
147k
}
823
824
2.66k
/// Creates the synthesized second '>' or '<' token for a '>>' or '<<' that was
/// split in two. The new token is derived from the current FormatTok, is
/// placed on the last character of the original lexer token, and becomes the
/// new FormatTok.
FormatToken *FormatTokenLexer::getStashedToken() {
  // Tokens come from a bump allocator, so the first half stays valid while
  // the second half is being built.
  const FormatToken *FirstHalf = FormatTok;
  FormatToken *SecondHalf = new (Allocator.Allocate()) FormatToken;

  SecondHalf->Tok = FirstHalf->Tok;
  const SourceLocation Loc = SecondHalf->Tok.getLocation().getLocWithOffset(
      FirstHalf->Tok.getLength() - 1);
  SecondHalf->Tok.setLocation(Loc);
  // No whitespace precedes the second half: the range is empty.
  SecondHalf->WhitespaceRange = SourceRange(Loc, Loc);
  SecondHalf->TokenText = FirstHalf->TokenText;
  SecondHalf->ColumnWidth = 1;
  SecondHalf->OriginalColumn = FirstHalf->OriginalColumn + 1;

  FormatTok = SecondHalf;
  return FormatTok;
}
842
843
/// Truncate the current token to the new length and make the lexer continue
844
/// from the end of the truncated token. Used for other languages that have
845
/// different token boundaries, like JavaScript in which a comment ends at a
846
/// line break regardless of whether the line break follows a backslash. Also
847
/// used to set the lexer to the end of whitespace if the lexer regards
848
/// whitespace and an unrecognized symbol as one token.
849
96
void FormatTokenLexer::truncateToken(size_t NewLen) {
850
96
  assert(NewLen <= FormatTok->TokenText.size());
851
0
  resetLexer(SourceMgr.getFileOffset(Lex->getSourceLocation(
852
96
      Lex->getBufferLocation() - FormatTok->TokenText.size() + NewLen)));
853
96
  FormatTok->TokenText = FormatTok->TokenText.substr(0, NewLen);
854
96
  FormatTok->ColumnWidth = encoding::columnWidthWithTabs(
855
96
      FormatTok->TokenText, FormatTok->OriginalColumn, Style.TabWidth,
856
96
      Encoding);
857
96
  FormatTok->Tok.setLength(NewLen);
858
96
}
859
860
/// Count the length of leading whitespace in a token.
861
1.52M
static size_t countLeadingWhitespace(StringRef Text) {
862
  // Basically counting the length matched by this regex.
863
  // "^([\n\r\f\v \t]|(\\\\|\\?\\?/)[\n\r])+"
864
  // Directly using the regex turned out to be slow. With the regex
865
  // version formatting all files in this directory took about 1.25
866
  // seconds. This version took about 0.5 seconds.
867
1.52M
  const unsigned char *const Begin = Text.bytes_begin();
868
1.52M
  const unsigned char *const End = Text.bytes_end();
869
1.52M
  const unsigned char *Cur = Begin;
870
2.35M
  while (Cur < End) {
871
1.86M
    if (isspace(Cur[0])) {
872
834k
      ++Cur;
873
1.03M
    } else if (Cur[0] == '\\' && 
(931
Cur[1] == '\n'931
||
Cur[1] == '\r'87
)) {
874
      // A '\' followed by a newline always escapes the newline, regardless
875
      // of whether there is another '\' before it.
876
      // The source has a null byte at the end. So the end of the entire input
877
      // isn't reached yet. Also the lexer doesn't break apart an escaped
878
      // newline.
879
868
      assert(End - Cur >= 2);
880
0
      Cur += 2;
881
1.03M
    } else if (Cur[0] == '?' && 
Cur[1] == '?'2.48k
&&
Cur[2] == '/'38
&&
882
1.03M
               
(0
Cur[3] == '\n'0
||
Cur[3] == '\r'0
)) {
883
      // Newlines can also be escaped by a '?' '?' '/' trigraph. By the way, the
884
      // characters are quoted individually in this comment because if we write
885
      // them together some compilers warn that we have a trigraph in the code.
886
0
      assert(End - Cur >= 4);
887
0
      Cur += 4;
888
1.03M
    } else {
889
1.03M
      break;
890
1.03M
    }
891
1.86M
  }
892
1.52M
  return Cur - Begin;
893
1.52M
}
894
895
1.10M
/// Lexes and returns the next non-whitespace token.
///
/// Leading whitespace is consumed and recorded on the token (newline counts,
/// whitespace range, original column). Language-specific fixups are applied,
/// '>>' and '<<' are split so that the second half is returned by the next
/// call, and the token's column widths are computed.
FormatToken *FormatTokenLexer::getNextToken() {
  // A previous call split '>>' or '<<'; hand out the stashed second half.
  if (StateStack.top() == LexerState::TOKEN_STASHED) {
    StateStack.pop();
    return getStashedToken();
  }

  FormatTok = new (Allocator.Allocate()) FormatToken;
  readRawToken(*FormatTok);
  // Whitespace trimmed from the end of the previous comment counts as leading
  // whitespace of this token, so the range starts that many bytes earlier.
  const SourceLocation WhitespaceStart =
      FormatTok->Tok.getLocation().getLocWithOffset(-TrailingWhitespace);
  FormatTok->IsFirst = IsFirstToken;
  IsFirstToken = false;

  // Consume and record whitespace until we find a significant token.
  // Some tok::unknown tokens are not just whitespace, e.g. whitespace
  // followed by a symbol such as backtick. Those symbols may be
  // significant in other languages.
  unsigned WhitespaceLength = TrailingWhitespace;
  while (FormatTok->isNot(tok::eof)) {
    const size_t LeadingWhitespace =
        countLeadingWhitespace(FormatTok->TokenText);
    if (LeadingWhitespace == 0)
      break;
    // Keep only the whitespace part; the rest is lexed again afterwards.
    if (LeadingWhitespace < FormatTok->TokenText.size())
      truncateToken(LeadingWhitespace);

    const StringRef Text = FormatTok->TokenText;
    bool InEscape = false;
    for (int Idx = 0, Size = Text.size(); Idx != Size; ++Idx) {
      switch (Text[Idx]) {
      case '\r':
        // If this is a CRLF sequence, break here and the LF will be handled on
        // the next loop iteration. Otherwise, this is a single Mac CR, treat it
        // the same as a single LF.
        if (Idx + 1 < Size && Text[Idx + 1] == '\n')
          break;
        LLVM_FALLTHROUGH;
      case '\n':
        ++FormatTok->NewlinesBefore;
        if (InEscape)
          InEscape = false;
        else
          FormatTok->HasUnescapedNewline = true;
        FormatTok->LastNewlineOffset = WhitespaceLength + Idx + 1;
        Column = 0;
        break;
      case '\f':
      case '\v':
        Column = 0;
        break;
      case ' ':
        ++Column;
        break;
      case '\t':
        // Advance to the next tab stop; a tab width of zero adds nothing.
        Column +=
            Style.TabWidth - (Style.TabWidth ? Column % Style.TabWidth : 0);
        break;
      case '\\':
      case '?':
      case '/':
        // The text was entirely whitespace when this loop was entered. Thus
        // this has to be an escape sequence.
        assert(Text.substr(Idx, 2) == "\\\r" || Text.substr(Idx, 2) == "\\\n" ||
               Text.substr(Idx, 4) == "\?\?/\r" ||
               Text.substr(Idx, 4) == "\?\?/\n" ||
               (Idx >= 1 && (Text.substr(Idx - 1, 4) == "\?\?/\r" ||
                             Text.substr(Idx - 1, 4) == "\?\?/\n")) ||
               (Idx >= 2 && (Text.substr(Idx - 2, 4) == "\?\?/\r" ||
                             Text.substr(Idx - 2, 4) == "\?\?/\n")));
        InEscape = true;
        break;
      default:
        // This shouldn't happen.
        assert(false);
        break;
      }
    }
    WhitespaceLength += Text.size();
    readRawToken(*FormatTok);
  }

  if (FormatTok->is(tok::unknown))
    FormatTok->setType(TT_ImplicitStringLiteral);

  // JavaScript and Java do not allow to escape the end of the line with a
  // backslash. Backslashes are syntax errors in plain source, but can occur in
  // comments. When a single line comment ends with a \, it'll cause the next
  // line of code to be lexed as a comment, breaking formatting. The code below
  // finds comments that contain a backslash followed by a line break, truncates
  // the comment token at the backslash, and resets the lexer to restart behind
  // the backslash.
  const bool IsJavaOrJavaScript =
      Style.isJavaScript() || Style.Language == FormatStyle::LK_Java;
  if (IsJavaOrJavaScript && FormatTok->is(tok::comment) &&
      FormatTok->TokenText.startswith("//")) {
    for (size_t BackslashPos = FormatTok->TokenText.find('\\');
         BackslashPos != StringRef::npos;
         BackslashPos = FormatTok->TokenText.find('\\', BackslashPos + 1)) {
      if (BackslashPos + 1 < FormatTok->TokenText.size() &&
          FormatTok->TokenText[BackslashPos + 1] == '\n') {
        truncateToken(BackslashPos + 1);
        break;
      }
    }
  }

  if (Style.isVerilog()) {
    // Verilog uses the backtick instead of the hash for preprocessor stuff.
    // And it uses the hash for delays and parameter lists. In order to continue
    // using `tok::hash` in other places, the backtick gets marked as the hash
    // here.  And in order to tell the backtick and hash apart for
    // Verilog-specific stuff, the hash becomes an identifier.
    if (FormatTok->isOneOf(tok::hash, tok::hashhash)) {
      FormatTok->Tok.setKind(tok::raw_identifier);
    } else if (FormatTok->is(tok::raw_identifier)) {
      if (FormatTok->TokenText == "`") {
        FormatTok->Tok.setIdentifierInfo(nullptr);
        FormatTok->Tok.setKind(tok::hash);
      } else if (FormatTok->TokenText == "``") {
        FormatTok->Tok.setIdentifierInfo(nullptr);
        FormatTok->Tok.setKind(tok::hashhash);
      }
    }
  }

  FormatTok->WhitespaceRange = SourceRange(
      WhitespaceStart, WhitespaceStart.getLocWithOffset(WhitespaceLength));

  FormatTok->OriginalColumn = Column;

  TrailingWhitespace = 0;
  if (FormatTok->is(tok::comment)) {
    // FIXME: Add the trimmed whitespace to Column.
    const size_t UntrimmedSize = FormatTok->TokenText.size();
    FormatTok->TokenText = FormatTok->TokenText.rtrim(" \t\v\f");
    TrailingWhitespace = UntrimmedSize - FormatTok->TokenText.size();
  } else if (FormatTok->is(tok::raw_identifier)) {
    // Resolve the raw identifier to a keyword or identifier kind.
    IdentifierInfo &Info = IdentTable.get(FormatTok->TokenText);
    FormatTok->Tok.setIdentifierInfo(&Info);
    FormatTok->Tok.setKind(Info.getTokenID());

    // Some C++ keywords are demoted to plain identifiers in Java and
    // JavaScript.
    const bool DemoteForJava =
        Style.Language == FormatStyle::LK_Java &&
        FormatTok->isOneOf(tok::kw_struct, tok::kw_union, tok::kw_delete,
                           tok::kw_operator);
    const bool DemoteForJavaScript =
        Style.isJavaScript() &&
        FormatTok->isOneOf(tok::kw_struct, tok::kw_union, tok::kw_operator);
    if (DemoteForJava || DemoteForJavaScript) {
      FormatTok->Tok.setKind(tok::identifier);
      FormatTok->Tok.setIdentifierInfo(nullptr);
    }
  } else if (FormatTok->isOneOf(tok::greatergreater, tok::lessless)) {
    // Split '>>' / '<<': return the first half now and remember that the
    // second half must be produced by the next call.
    const tok::TokenKind FirstHalf =
        FormatTok->is(tok::greatergreater) ? tok::greater : tok::less;
    FormatTok->Tok.setKind(FirstHalf);
    FormatTok->TokenText = FormatTok->TokenText.substr(0, 1);
    ++Column;
    StateStack.push(LexerState::TOKEN_STASHED);
  }

  // Now FormatTok is the next non-whitespace token.

  const StringRef Text = FormatTok->TokenText;
  const size_t FirstNewlinePos = Text.find('\n');
  if (FirstNewlinePos == StringRef::npos) {
    // FIXME: ColumnWidth actually depends on the start column, we need to
    // take this into account when the token is moved.
    FormatTok->ColumnWidth =
        encoding::columnWidthWithTabs(Text, Column, Style.TabWidth, Encoding);
    Column += FormatTok->ColumnWidth;
  } else {
    FormatTok->IsMultiline = true;
    // FIXME: ColumnWidth actually depends on the start column, we need to
    // take this into account when the token is moved.
    FormatTok->ColumnWidth = encoding::columnWidthWithTabs(
        Text.substr(0, FirstNewlinePos), Column, Style.TabWidth, Encoding);

    // The last line of the token always starts in column 0.
    // Thus, the length can be precomputed even in the presence of tabs.
    FormatTok->LastLineColumnWidth = encoding::columnWidthWithTabs(
        Text.substr(Text.find_last_of('\n') + 1), 0, Style.TabWidth, Encoding);
    Column = FormatTok->LastLineColumnWidth;
  }

  if (Style.isCpp()) {
    auto MacroIt = Macros.find(FormatTok->Tok.getIdentifierInfo());
    // A name directly following '#define' is being defined, so it is not
    // treated as a use of a configured macro.
    const IdentifierInfo *PreviousInfo =
        Tokens.empty() ? nullptr : Tokens.back()->Tok.getIdentifierInfo();
    const bool FollowsDefine =
        PreviousInfo && PreviousInfo->getPPKeywordID() == tok::pp_define;
    if (!FollowsDefine && MacroIt != Macros.end()) {
      FormatTok->setType(MacroIt->second);
      if (MacroIt->second == TT_IfMacro) {
        // The lexer token currently has type tok::kw_unknown. However, for this
        // substitution to be treated correctly in the TokenAnnotator, faking
        // the tok value seems to be needed. Not sure if there's a more elegant
        // way.
        FormatTok->Tok.setKind(tok::kw_if);
      }
    } else if (FormatTok->is(tok::identifier)) {
      if (MacroBlockBeginRegex.match(Text))
        FormatTok->setType(TT_MacroBlockBegin);
      else if (MacroBlockEndRegex.match(Text))
        FormatTok->setType(TT_MacroBlockEnd);
    }
  }

  return FormatTok;
}
1102
1103
1.48k
/// Tries to lex a Verilog-specific token at the current lexer position.
///
/// On success \p Tok is filled in as a raw identifier covering the matched
/// text and the lexer is advanced past it.
///
/// \returns true if a Verilog-specific token was produced, false if the
/// caller should fall back to the regular lexer.
bool FormatTokenLexer::readRawTokenVerilogSpecific(Token &Tok) {
  // In Verilog the quote is not a character literal.
  //
  // Make the backtick and double backtick identifiers to match against them
  // more easily.
  //
  // In Verilog an escaped identifier starts with backslash and ends with
  // whitespace. Unless that whitespace is an escaped newline. A backslash can
  // also begin an escaped newline outside of an escaped identifier. We check
  // for that outside of the Regex since we can't use negative lookhead
  // assertions. Simply changing the '*' to '+' breaks stuff as the escaped
  // identifier may have a length of 0 according to Section A.9.3.
  // FIXME: If there is an escaped newline in the middle of an escaped
  // identifier, allow for pasting the two lines together, But escaped
  // identifiers usually occur only in generated code anyway.
  static const llvm::Regex VerilogToken(R"re(^('|``?|\\(\\)re"
                                        "(\r?\n|\r)|[^[:space:]])*)");

  const char *Start = Lex->getBufferLocation();
  const StringRef Remaining(Start, Lex->getBuffer().end() - Start);
  SmallVector<StringRef, 4> Matches;
  if (!VerilogToken.match(Remaining, &Matches))
    return false;

  // There is a null byte at the end of the buffer, so we don't have to check
  // Start[1] is within the buffer.
  const bool IsEscapedNewline =
      Start[0] == '\\' && (Start[1] == '\r' || Start[1] == '\n');
  if (IsEscapedNewline)
    return false;

  const size_t Len = Matches[0].size();

  // The kind has to be an identifier so we can match it against those defined
  // in Keywords. The kind has to be set before the length because the setLength
  // function checks that the kind is not an annotation.
  Tok.setKind(tok::raw_identifier);
  Tok.setLength(Len);
  Tok.setLocation(Lex->getSourceLocation(Start, Len));
  Tok.setRawIdentifierData(Start);
  Lex->seek(Lex->getCurrentBufferOffset() + Len, /*IsAtStartofline=*/false);
  return true;
}
1143
1144
1.58M
/// Reads one raw token into \p Tok, sets its text, applies language-specific
/// literal fixups and records whether formatting is currently disabled by
/// "clang-format off" / "clang-format on" comments.
void FormatTokenLexer::readRawToken(FormatToken &Tok) {
  // For Verilog, first see if there is a special token, and fall back to the
  // normal lexer if there isn't one.
  const bool LexedVerilogToken =
      Style.isVerilog() && readRawTokenVerilogSpecific(Tok.Tok);
  if (!LexedVerilogToken)
    Lex->LexFromRawLexer(Tok.Tok);

  const char *TextBegin = SourceMgr.getCharacterData(Tok.Tok.getLocation());
  Tok.TokenText = StringRef(TextBegin, Tok.Tok.getLength());

  // For formatting, treat unterminated string literals like normal string
  // literals.
  if (Tok.is(tok::unknown)) {
    if (!Tok.TokenText.empty() && Tok.TokenText[0] == '"') {
      Tok.Tok.setKind(tok::string_literal);
      Tok.IsUnterminatedLiteral = true;
    } else if (Style.isJavaScript() && Tok.TokenText == "''") {
      Tok.Tok.setKind(tok::string_literal);
    }
  }

  // In JavaScript and the proto languages, character constants are handled as
  // string literals.
  const bool CharConstantIsString =
      Style.isJavaScript() || Style.Language == FormatStyle::LK_Proto ||
      Style.Language == FormatStyle::LK_TextProto;
  if (CharConstantIsString && Tok.is(tok::char_constant))
    Tok.Tok.setKind(tok::string_literal);

  // The "on" marker is checked before the flag is stored and the "off" marker
  // after it, so neither marker comment takes its own new state into account
  // the wrong way round: "on" is seen with formatting already re-enabled and
  // "off" with formatting not yet disabled.
  const bool IsComment = Tok.is(tok::comment);
  if (IsComment && (Tok.TokenText == "// clang-format on" ||
                    Tok.TokenText == "/* clang-format on */")) {
    FormattingDisabled = false;
  }

  Tok.Finalized = FormattingDisabled;

  if (IsComment && (Tok.TokenText == "// clang-format off" ||
                    Tok.TokenText == "/* clang-format off */")) {
    FormattingDisabled = true;
  }
}
1180
1181
728
void FormatTokenLexer::resetLexer(unsigned Offset) {
1182
728
  StringRef Buffer = SourceMgr.getBufferData(ID);
1183
728
  LangOpts = getFormattingLangOpts(Style);
1184
728
  Lex.reset(new Lexer(SourceMgr.getLocForStartOfFile(ID), LangOpts,
1185
728
                      Buffer.begin(), Buffer.begin() + Offset, Buffer.end()));
1186
728
  Lex->SetKeepWhitespaceMode(true);
1187
728
  TrailingWhitespace = 0;
1188
728
}
1189
1190
} // namespace format
1191
} // namespace clang