Coverage Report

Created: 2022-05-17 06:19

/Users/buildslave/jenkins/workspace/coverage/llvm-project/clang/lib/Format/FormatTokenLexer.cpp
Line
Count
Source (jump to first uncovered line)
1
//===--- FormatTokenLexer.cpp - Lex FormatTokens -------------*- C++ ----*-===//
2
//
3
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4
// See https://llvm.org/LICENSE.txt for license information.
5
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6
//
7
//===----------------------------------------------------------------------===//
8
///
9
/// \file
10
/// This file implements FormatTokenLexer, which tokenizes a source file
11
/// into a FormatToken stream suitable for ClangFormat.
12
///
13
//===----------------------------------------------------------------------===//
14
15
#include "FormatTokenLexer.h"
16
#include "FormatToken.h"
17
#include "clang/Basic/SourceLocation.h"
18
#include "clang/Basic/SourceManager.h"
19
#include "clang/Format/Format.h"
20
#include "llvm/Support/Regex.h"
21
22
namespace clang {
23
namespace format {
24
25
// Sets up a raw lexer (in keep-whitespace mode) over the buffer of file ID and
// records, for every macro name listed in Style, which token type identifiers
// with that name are mapped to in Macros.
FormatTokenLexer::FormatTokenLexer(
    const SourceManager &SourceMgr, FileID ID, unsigned Column,
    const FormatStyle &Style, encoding::Encoding Encoding,
    llvm::SpecificBumpPtrAllocator<FormatToken> &Allocator,
    IdentifierTable &IdentTable)
    : FormatTok(nullptr), IsFirstToken(true), StateStack({LexerState::NORMAL}),
      Column(Column), TrailingWhitespace(0),
      LangOpts(getFormattingLangOpts(Style)), SourceMgr(SourceMgr), ID(ID),
      Style(Style), IdentTable(IdentTable), Keywords(IdentTable),
      Encoding(Encoding), Allocator(Allocator), FirstInLineIndex(0),
      FormattingDisabled(false), MacroBlockBeginRegex(Style.MacroBlockBegin),
      MacroBlockEndRegex(Style.MacroBlockEnd) {
  Lex.reset(new Lexer(ID, SourceMgr.getBufferOrFake(ID), SourceMgr, LangOpts));
  Lex->SetKeepWhitespaceMode(true);

  // Inserts one (identifier, Type) entry into Macros per configured name.
  const auto RegisterMacros = [&](const auto &Names, TokenType Type) {
    for (const std::string &Name : Names) {
      auto Identifier = &IdentTable.get(Name);
      Macros.insert({Identifier, Type});
    }
  };

  // The categories are registered in this fixed order.
  RegisterMacros(Style.ForEachMacros, TT_ForEachMacro);
  RegisterMacros(Style.IfMacros, TT_IfMacro);
  RegisterMacros(Style.AttributeMacros, TT_AttributeMacro);
  RegisterMacros(Style.StatementMacros, TT_StatementMacro);
  RegisterMacros(Style.TypenameMacros, TT_TypenameMacro);
  RegisterMacros(Style.NamespaceMacros, TT_NamespaceMacro);
  RegisterMacros(Style.WhitespaceSensitiveMacros, TT_UntouchableMacroFunc);
  RegisterMacros(Style.StatementAttributeLikeMacros,
                 TT_StatementAttributeLikeMacro);
}
75
76
64.4k
// Lexes the whole buffer into Tokens and returns them. Must only be called on
// a lexer that has not produced any tokens yet. After each raw token is
// appended, the language-specific fix-ups and the token merges run, so the
// tail of Tokens may be rewritten before the next token is read.
ArrayRef<FormatToken *> FormatTokenLexer::lex() {
  assert(Tokens.empty());
  assert(FirstInLineIndex == 0);
  for (;;) {
    Tokens.push_back(getNextToken());

    if (Style.isJavaScript()) {
      tryParseJSRegexLiteral();
      handleTemplateStrings();
    }
    if (Style.Language == FormatStyle::LK_TextProto)
      tryParsePythonComment();

    tryMergePreviousTokens();

    // This needs to come after tokens have been merged so that C# string
    // literals are correctly identified.
    if (Style.isCSharp())
      handleCSharpVerbatimAndInterpolatedStrings();

    FormatToken *Newest = Tokens.back();
    if (Newest->NewlinesBefore > 0 || Newest->IsMultiline)
      FirstInLineIndex = Tokens.size() - 1;

    // The eof token is kept in the stream and terminates lexing.
    if (Newest->is(tok::eof))
      break;
  }
  return Tokens;
}
97
98
1.08M
void FormatTokenLexer::tryMergePreviousTokens() {
99
1.08M
  if (tryMerge_TMacro())
100
18
    return;
101
1.08M
  if (tryMergeConflictMarkers())
102
45
    return;
103
1.08M
  if (tryMergeLessLess())
104
1.29k
    return;
105
1.08M
  if (tryMergeForEach())
106
9
    return;
107
1.08M
  if (Style.isCpp() && 
tryTransformTryUsageForC()1.02M
)
108
36
    return;
109
110
1.08M
  if (Style.isJavaScript() || 
Style.isCSharp()1.04M
) {
111
47.0k
    static const tok::TokenKind NullishCoalescingOperator[] = {tok::question,
112
47.0k
                                                               tok::question};
113
47.0k
    static const tok::TokenKind NullPropagatingOperator[] = {tok::question,
114
47.0k
                                                             tok::period};
115
47.0k
    static const tok::TokenKind FatArrow[] = {tok::equal, tok::greater};
116
117
47.0k
    if (tryMergeTokens(FatArrow, TT_FatArrow))
118
244
      return;
119
46.8k
    if (tryMergeTokens(NullishCoalescingOperator, TT_NullCoalescingOperator)) {
120
      // Treat like the "||" operator (as opposed to the ternary ?).
121
38
      Tokens.back()->Tok.setKind(tok::pipepipe);
122
38
      return;
123
38
    }
124
46.7k
    if (tryMergeTokens(NullPropagatingOperator, TT_NullPropagatingOperator)) {
125
      // Treat like a regular "." access.
126
22
      Tokens.back()->Tok.setKind(tok::period);
127
22
      return;
128
22
    }
129
46.7k
    if (tryMergeNullishCoalescingEqual())
130
14
      return;
131
46.7k
  }
132
133
1.08M
  if (Style.isCSharp()) {
134
8.98k
    static const tok::TokenKind CSharpNullConditionalLSquare[] = {
135
8.98k
        tok::question, tok::l_square};
136
137
8.98k
    if (tryMergeCSharpKeywordVariables())
138
8
      return;
139
8.98k
    if (tryMergeCSharpStringLiteral())
140
43
      return;
141
8.93k
    if (tryTransformCSharpForEach())
142
8
      return;
143
8.92k
    if (tryMergeTokens(CSharpNullConditionalLSquare,
144
8.92k
                       TT_CSharpNullConditionalLSquare)) {
145
      // Treat like a regular "[" operator.
146
8
      Tokens.back()->Tok.setKind(tok::l_square);
147
8
      return;
148
8
    }
149
8.92k
  }
150
151
1.08M
  if (tryMergeNSStringLiteral())
152
483
    return;
153
154
1.08M
  if (Style.isJavaScript()) {
155
37.7k
    static const tok::TokenKind JSIdentity[] = {tok::equalequal, tok::equal};
156
37.7k
    static const tok::TokenKind JSNotIdentity[] = {tok::exclaimequal,
157
37.7k
                                                   tok::equal};
158
37.7k
    static const tok::TokenKind JSShiftEqual[] = {tok::greater, tok::greater,
159
37.7k
                                                  tok::greaterequal};
160
37.7k
    static const tok::TokenKind JSExponentiation[] = {tok::star, tok::star};
161
37.7k
    static const tok::TokenKind JSExponentiationEqual[] = {tok::star,
162
37.7k
                                                           tok::starequal};
163
37.7k
    static const tok::TokenKind JSPipePipeEqual[] = {tok::pipepipe, tok::equal};
164
37.7k
    static const tok::TokenKind JSAndAndEqual[] = {tok::ampamp, tok::equal};
165
166
    // FIXME: Investigate what token type gives the correct operator priority.
167
37.7k
    if (tryMergeTokens(JSIdentity, TT_BinaryOperator))
168
12
      return;
169
37.7k
    if (tryMergeTokens(JSNotIdentity, TT_BinaryOperator))
170
12
      return;
171
37.7k
    if (tryMergeTokens(JSShiftEqual, TT_BinaryOperator))
172
10
      return;
173
37.7k
    if (tryMergeTokens(JSExponentiation, TT_JsExponentiation))
174
4
      return;
175
37.7k
    if (tryMergeTokens(JSExponentiationEqual, TT_JsExponentiationEqual)) {
176
4
      Tokens.back()->Tok.setKind(tok::starequal);
177
4
      return;
178
4
    }
179
37.7k
    if (tryMergeTokens(JSAndAndEqual, TT_JsAndAndEqual) ||
180
37.7k
        
tryMergeTokens(JSPipePipeEqual, TT_JsPipePipeEqual)37.7k
) {
181
      // Treat like the "=" assignment operator.
182
8
      Tokens.back()->Tok.setKind(tok::equal);
183
8
      return;
184
8
    }
185
37.7k
    if (tryMergeJSPrivateIdentifier())
186
32
      return;
187
37.7k
  }
188
189
1.08M
  if (Style.Language == FormatStyle::LK_Java) {
190
4.52k
    static const tok::TokenKind JavaRightLogicalShiftAssign[] = {
191
4.52k
        tok::greater, tok::greater, tok::greaterequal};
192
4.52k
    if (tryMergeTokens(JavaRightLogicalShiftAssign, TT_BinaryOperator))
193
2
      return;
194
4.52k
  }
195
1.08M
}
196
197
1.08M
bool FormatTokenLexer::tryMergeNSStringLiteral() {
198
1.08M
  if (Tokens.size() < 2)
199
64.4k
    return false;
200
1.02M
  auto &At = *(Tokens.end() - 2);
201
1.02M
  auto &String = *(Tokens.end() - 1);
202
1.02M
  if (!At->is(tok::at) || 
!String->is(tok::string_literal)2.83k
)
203
1.01M
    return false;
204
483
  At->Tok.setKind(tok::string_literal);
205
483
  At->TokenText = StringRef(At->TokenText.begin(),
206
483
                            String->TokenText.end() - At->TokenText.begin());
207
483
  At->ColumnWidth += String->ColumnWidth;
208
483
  At->setType(TT_ObjCStringLiteral);
209
483
  Tokens.erase(Tokens.end() - 1);
210
483
  return true;
211
1.02M
}
212
213
37.7k
bool FormatTokenLexer::tryMergeJSPrivateIdentifier() {
214
  // Merges #idenfier into a single identifier with the text #identifier
215
  // but the token tok::identifier.
216
37.7k
  if (Tokens.size() < 2)
217
3.02k
    return false;
218
34.6k
  auto &Hash = *(Tokens.end() - 2);
219
34.6k
  auto &Identifier = *(Tokens.end() - 1);
220
34.6k
  if (!Hash->is(tok::hash) || 
!Identifier->is(tok::identifier)36
)
221
34.6k
    return false;
222
32
  Hash->Tok.setKind(tok::identifier);
223
32
  Hash->TokenText =
224
32
      StringRef(Hash->TokenText.begin(),
225
32
                Identifier->TokenText.end() - Hash->TokenText.begin());
226
32
  Hash->ColumnWidth += Identifier->ColumnWidth;
227
32
  Hash->setType(TT_JsPrivateIdentifier);
228
32
  Tokens.erase(Tokens.end() - 1);
229
32
  return true;
230
34.6k
}
231
232
// Search for verbatim or interpolated string literals @"ABC" or
233
// $"aaaaa{abc}aaaaa" i and mark the token as TT_CSharpStringLiteral, and to
234
// prevent splitting of @, $ and ".
235
// Merging of multiline verbatim strings with embedded '"' is handled in
236
// handleCSharpVerbatimAndInterpolatedStrings with lower-level lexing.
237
8.98k
bool FormatTokenLexer::tryMergeCSharpStringLiteral() {
238
8.98k
  if (Tokens.size() < 2)
239
527
    return false;
240
241
  // Interpolated strings could contain { } with " characters inside.
242
  // $"{x ?? "null"}"
243
  // should not be split into $"{x ?? ", null, "}" but should treated as a
244
  // single string-literal.
245
  //
246
  // We opt not to try and format expressions inside {} within a C#
247
  // interpolated string. Formatting expressions within an interpolated string
248
  // would require similar work as that done for JavaScript template strings
249
  // in `handleTemplateStrings()`.
250
8.45k
  auto &CSharpInterpolatedString = *(Tokens.end() - 2);
251
8.45k
  if (CSharpInterpolatedString->getType() == TT_CSharpStringLiteral &&
252
8.45k
      
(43
CSharpInterpolatedString->TokenText.startswith(R"($")")43
||
253
43
       
CSharpInterpolatedString->TokenText.startswith(R"($@")")12
)) {
254
37
    int UnmatchedOpeningBraceCount = 0;
255
256
37
    auto TokenTextSize = CSharpInterpolatedString->TokenText.size();
257
989
    for (size_t Index = 0; Index < TokenTextSize; 
++Index952
) {
258
952
      char C = CSharpInterpolatedString->TokenText[Index];
259
952
      if (C == '{') {
260
        // "{{"  inside an interpolated string is an escaped '{' so skip it.
261
49
        if (Index + 1 < TokenTextSize &&
262
49
            CSharpInterpolatedString->TokenText[Index + 1] == '{') {
263
6
          ++Index;
264
6
          continue;
265
6
        }
266
43
        ++UnmatchedOpeningBraceCount;
267
903
      } else if (C == '}') {
268
        // "}}"  inside an interpolated string is an escaped '}' so skip it.
269
43
        if (Index + 1 < TokenTextSize &&
270
43
            CSharpInterpolatedString->TokenText[Index + 1] == '}') {
271
6
          ++Index;
272
6
          continue;
273
6
        }
274
37
        --UnmatchedOpeningBraceCount;
275
37
      }
276
952
    }
277
278
37
    if (UnmatchedOpeningBraceCount > 0) {
279
6
      auto &NextToken = *(Tokens.end() - 1);
280
6
      CSharpInterpolatedString->TokenText =
281
6
          StringRef(CSharpInterpolatedString->TokenText.begin(),
282
6
                    NextToken->TokenText.end() -
283
6
                        CSharpInterpolatedString->TokenText.begin());
284
6
      CSharpInterpolatedString->ColumnWidth += NextToken->ColumnWidth;
285
6
      Tokens.erase(Tokens.end() - 1);
286
6
      return true;
287
6
    }
288
37
  }
289
290
  // Look for @"aaaaaa" or $"aaaaaa".
291
8.44k
  auto &String = *(Tokens.end() - 1);
292
8.44k
  if (!String->is(tok::string_literal))
293
8.32k
    return false;
294
295
119
  auto &At = *(Tokens.end() - 2);
296
119
  if (!(At->is(tok::at) || 
At->TokenText == "$"107
))
297
82
    return false;
298
299
37
  if (Tokens.size() > 2 && 
At->is(tok::at)35
) {
300
12
    auto &Dollar = *(Tokens.end() - 3);
301
12
    if (Dollar->TokenText == "$") {
302
      // This looks like $@"aaaaa" so we need to combine all 3 tokens.
303
6
      Dollar->Tok.setKind(tok::string_literal);
304
6
      Dollar->TokenText =
305
6
          StringRef(Dollar->TokenText.begin(),
306
6
                    String->TokenText.end() - Dollar->TokenText.begin());
307
6
      Dollar->ColumnWidth += (At->ColumnWidth + String->ColumnWidth);
308
6
      Dollar->setType(TT_CSharpStringLiteral);
309
6
      Tokens.erase(Tokens.end() - 2);
310
6
      Tokens.erase(Tokens.end() - 1);
311
6
      return true;
312
6
    }
313
12
  }
314
315
  // Convert back into just a string_literal.
316
31
  At->Tok.setKind(tok::string_literal);
317
31
  At->TokenText = StringRef(At->TokenText.begin(),
318
31
                            String->TokenText.end() - At->TokenText.begin());
319
31
  At->ColumnWidth += String->ColumnWidth;
320
31
  At->setType(TT_CSharpStringLiteral);
321
31
  Tokens.erase(Tokens.end() - 1);
322
31
  return true;
323
37
}
324
325
// Valid C# attribute targets:
326
// https://docs.microsoft.com/en-us/dotnet/csharp/programming-guide/concepts/attributes/#attribute-targets
327
const llvm::StringSet<> FormatTokenLexer::CSharpAttributeTargets = {
328
    "assembly", "module",   "field",  "event", "method",
329
    "param",    "property", "return", "type",
330
};
331
332
46.7k
bool FormatTokenLexer::tryMergeNullishCoalescingEqual() {
333
46.7k
  if (Tokens.size() < 2)
334
3.55k
    return false;
335
43.2k
  auto &NullishCoalescing = *(Tokens.end() - 2);
336
43.2k
  auto &Equal = *(Tokens.end() - 1);
337
43.2k
  if (NullishCoalescing->getType() != TT_NullCoalescingOperator ||
338
43.2k
      
!Equal->is(tok::equal)38
)
339
43.2k
    return false;
340
14
  NullishCoalescing->Tok.setKind(tok::equal); // no '??=' in clang tokens.
341
14
  NullishCoalescing->TokenText =
342
14
      StringRef(NullishCoalescing->TokenText.begin(),
343
14
                Equal->TokenText.end() - NullishCoalescing->TokenText.begin());
344
14
  NullishCoalescing->ColumnWidth += Equal->ColumnWidth;
345
14
  NullishCoalescing->setType(TT_NullCoalescingEqual);
346
14
  Tokens.erase(Tokens.end() - 1);
347
14
  return true;
348
43.2k
}
349
350
8.98k
bool FormatTokenLexer::tryMergeCSharpKeywordVariables() {
351
8.98k
  if (Tokens.size() < 2)
352
527
    return false;
353
8.46k
  auto &At = *(Tokens.end() - 2);
354
8.46k
  auto &Keyword = *(Tokens.end() - 1);
355
8.46k
  if (!At->is(tok::at))
356
8.43k
    return false;
357
28
  if (!Keywords.isCSharpKeyword(*Keyword))
358
20
    return false;
359
360
8
  At->Tok.setKind(tok::identifier);
361
8
  At->TokenText = StringRef(At->TokenText.begin(),
362
8
                            Keyword->TokenText.end() - At->TokenText.begin());
363
8
  At->ColumnWidth += Keyword->ColumnWidth;
364
8
  At->setType(Keyword->getType());
365
8
  Tokens.erase(Tokens.end() - 1);
366
8
  return true;
367
28
}
368
369
// In C# transform identifier foreach into kw_foreach
370
8.93k
bool FormatTokenLexer::tryTransformCSharpForEach() {
371
8.93k
  if (Tokens.size() < 1)
372
0
    return false;
373
8.93k
  auto &Identifier = *(Tokens.end() - 1);
374
8.93k
  if (!Identifier->is(tok::identifier))
375
6.42k
    return false;
376
2.51k
  if (Identifier->TokenText != "foreach")
377
2.50k
    return false;
378
379
8
  Identifier->setType(TT_ForEachMacro);
380
8
  Identifier->Tok.setKind(tok::kw_for);
381
8
  return true;
382
2.51k
}
383
384
1.08M
bool FormatTokenLexer::tryMergeForEach() {
385
1.08M
  if (Tokens.size() < 2)
386
64.4k
    return false;
387
1.02M
  auto &For = *(Tokens.end() - 2);
388
1.02M
  auto &Each = *(Tokens.end() - 1);
389
1.02M
  if (!For->is(tok::kw_for))
390
1.01M
    return false;
391
1.85k
  if (!Each->is(tok::identifier))
392
1.83k
    return false;
393
20
  if (Each->TokenText != "each")
394
11
    return false;
395
396
9
  For->setType(TT_ForEachMacro);
397
9
  For->Tok.setKind(tok::kw_for);
398
399
9
  For->TokenText = StringRef(For->TokenText.begin(),
400
9
                             Each->TokenText.end() - For->TokenText.begin());
401
9
  For->ColumnWidth += Each->ColumnWidth;
402
9
  Tokens.erase(Tokens.end() - 1);
403
9
  return true;
404
20
}
405
406
1.02M
bool FormatTokenLexer::tryTransformTryUsageForC() {
407
1.02M
  if (Tokens.size() < 2)
408
59.7k
    return false;
409
962k
  auto &Try = *(Tokens.end() - 2);
410
962k
  if (!Try->is(tok::kw_try))
411
961k
    return false;
412
337
  auto &Next = *(Tokens.end() - 1);
413
337
  if (Next->isOneOf(tok::l_brace, tok::colon, tok::hash, tok::comment))
414
295
    return false;
415
416
42
  if (Tokens.size() > 2) {
417
33
    auto &At = *(Tokens.end() - 3);
418
33
    if (At->is(tok::at))
419
6
      return false;
420
33
  }
421
422
36
  Try->Tok.setKind(tok::identifier);
423
36
  return true;
424
42
}
425
426
1.08M
bool FormatTokenLexer::tryMergeLessLess() {
427
  // Merge X,less,less,Y into X,lessless,Y unless X or Y is less.
428
1.08M
  if (Tokens.size() < 3)
429
128k
    return false;
430
431
957k
  auto First = Tokens.end() - 3;
432
957k
  if (First[0]->isNot(tok::less) || 
First[1]->isNot(tok::less)20.2k
)
433
956k
    return false;
434
435
  // Only merge if there currently is no whitespace between the two "<".
436
1.55k
  if (First[1]->hasWhitespaceBefore())
437
36
    return false;
438
439
1.51k
  auto X = Tokens.size() > 3 ? 
First[-1]1.51k
:
nullptr3
;
440
1.51k
  auto Y = First[2];
441
1.51k
  if ((X && 
X->is(tok::less)1.51k
) ||
Y->is(tok::less)1.39k
)
442
225
    return false;
443
444
  // Do not remove a whitespace between the two "<" e.g. "operator< <>".
445
1.29k
  if (X && X->is(tok::kw_operator) && 
Y->is(tok::greater)30
)
446
0
    return false;
447
448
1.29k
  First[0]->Tok.setKind(tok::lessless);
449
1.29k
  First[0]->TokenText = "<<";
450
1.29k
  First[0]->ColumnWidth += 1;
451
1.29k
  Tokens.erase(Tokens.end() - 2);
452
1.29k
  return true;
453
1.29k
}
454
455
bool FormatTokenLexer::tryMergeTokens(ArrayRef<tok::TokenKind> Kinds,
456
418k
                                      TokenType NewType) {
457
418k
  if (Tokens.size() < Kinds.size())
458
35.9k
    return false;
459
460
382k
  SmallVectorImpl<FormatToken *>::const_iterator First =
461
382k
      Tokens.end() - Kinds.size();
462
382k
  if (!First[0]->is(Kinds[0]))
463
379k
    return false;
464
3.26k
  unsigned AddLength = 0;
465
3.69k
  for (unsigned i = 1; i < Kinds.size(); 
++i426
) {
466
3.32k
    if (!First[i]->is(Kinds[i]) || 
First[i]->hasWhitespaceBefore()438
)
467
2.90k
      return false;
468
426
    AddLength += First[i]->TokenText.size();
469
426
  }
470
364
  Tokens.resize(Tokens.size() - Kinds.size() + 1);
471
364
  First[0]->TokenText = StringRef(First[0]->TokenText.data(),
472
364
                                  First[0]->TokenText.size() + AddLength);
473
364
  First[0]->ColumnWidth += AddLength;
474
364
  First[0]->setType(NewType);
475
364
  return true;
476
3.26k
}
477
478
// Returns \c true if \p Tok can only be followed by an operand in JavaScript.
479
336
bool FormatTokenLexer::precedesOperand(FormatToken *Tok) {
480
  // NB: This is not entirely correct, as an r_paren can introduce an operand
481
  // location in e.g. `if (foo) /bar/.exec(...);`. That is a rare enough
482
  // corner case to not matter in practice, though.
483
336
  return Tok->isOneOf(tok::period, tok::l_paren, tok::comma, tok::l_brace,
484
336
                      tok::r_brace, tok::l_square, tok::semi, tok::exclaim,
485
336
                      tok::colon, tok::question, tok::tilde) ||
486
336
         Tok->isOneOf(tok::kw_return, tok::kw_do, tok::kw_case, tok::kw_throw,
487
292
                      tok::kw_else, tok::kw_new, tok::kw_delete, tok::kw_void,
488
292
                      tok::kw_typeof, Keywords.kw_instanceof, Keywords.kw_in) ||
489
336
         
Tok->isBinaryOperator()288
;
490
336
}
491
492
340
// Returns true if a JavaScript regex literal may start right after \p Prev.
// A null \p Prev (no preceding token) always allows a regex literal.
bool FormatTokenLexer::canPrecedeRegexLiteral(FormatToken *Prev) {
  if (!Prev)
    return true;

  // Regex literals can only follow after prefix unary operators, not after
  // postfix unary operators. If the '++' is followed by a non-operand
  // introducing token, the slash here is the operand and not the start of a
  // regex.
  // `!` is an unary prefix operator, but also a post-fix operator that casts
  // away nullability, so the same check applies.
  if (Prev->isOneOf(tok::plusplus, tok::minusminus, tok::exclaim)) {
    const auto Count = Tokens.size();
    return Count < 3 || precedesOperand(Tokens[Count - 3]);
  }

  // Otherwise the previous token itself must introduce an operand location
  // where regex literals can occur.
  return precedesOperand(Prev);
}
512
513
// Tries to parse a JavaScript Regex literal starting at the current token,
514
// if that begins with a slash and is in a location where JavaScript allows
515
// regex literals. Changes the current token to a regex literal and updates
516
// its text if successful.
517
37.9k
void FormatTokenLexer::tryParseJSRegexLiteral() {
518
37.9k
  FormatToken *RegexToken = Tokens.back();
519
37.9k
  if (!RegexToken->isOneOf(tok::slash, tok::slashequal))
520
37.6k
    return;
521
522
340
  FormatToken *Prev = nullptr;
523
344
  for (FormatToken *FT : llvm::drop_begin(llvm::reverse(Tokens))) {
524
    // NB: Because previous pointers are not initialized yet, this cannot use
525
    // Token.getPreviousNonComment.
526
344
    if (FT->isNot(tok::comment)) {
527
336
      Prev = FT;
528
336
      break;
529
336
    }
530
344
  }
531
532
340
  if (!canPrecedeRegexLiteral(Prev))
533
36
    return;
534
535
  // 'Manually' lex ahead in the current file buffer.
536
304
  const char *Offset = Lex->getBufferLocation();
537
304
  const char *RegexBegin = Offset - RegexToken->TokenText.size();
538
304
  StringRef Buffer = Lex->getBuffer();
539
304
  bool InCharacterClass = false;
540
304
  bool HaveClosingSlash = false;
541
1.91k
  for (; !HaveClosingSlash && 
Offset != Buffer.end()1.61k
;
++Offset1.61k
) {
542
    // Regular expressions are terminated with a '/', which can only be
543
    // escaped using '\' or a character class between '[' and ']'.
544
    // See http://www.ecma-international.org/ecma-262/5.1/#sec-7.8.5.
545
1.61k
    switch (*Offset) {
546
116
    case '\\':
547
      // Skip the escaped character.
548
116
      ++Offset;
549
116
      break;
550
40
    case '[':
551
40
      InCharacterClass = true;
552
40
      break;
553
40
    case ']':
554
40
      InCharacterClass = false;
555
40
      break;
556
320
    case '/':
557
320
      if (!InCharacterClass)
558
304
        HaveClosingSlash = true;
559
320
      break;
560
1.61k
    }
561
1.61k
  }
562
563
304
  RegexToken->setType(TT_RegexLiteral);
564
  // Treat regex literals like other string_literals.
565
304
  RegexToken->Tok.setKind(tok::string_literal);
566
304
  RegexToken->TokenText = StringRef(RegexBegin, Offset - RegexBegin);
567
304
  RegexToken->ColumnWidth = RegexToken->TokenText.size();
568
569
304
  resetLexer(SourceMgr.getFileOffset(Lex->getSourceLocation(Offset)));
570
304
}
571
572
9.11k
void FormatTokenLexer::handleCSharpVerbatimAndInterpolatedStrings() {
573
9.11k
  FormatToken *CSharpStringLiteral = Tokens.back();
574
575
9.11k
  if (CSharpStringLiteral->getType() != TT_CSharpStringLiteral)
576
9.06k
    return;
577
578
  // Deal with multiline strings.
579
43
  if (!(CSharpStringLiteral->TokenText.startswith(R"(@")") ||
580
43
        
CSharpStringLiteral->TokenText.startswith(R"($@")")37
))
581
31
    return;
582
583
12
  const char *StrBegin =
584
12
      Lex->getBufferLocation() - CSharpStringLiteral->TokenText.size();
585
12
  const char *Offset = StrBegin;
586
12
  if (CSharpStringLiteral->TokenText.startswith(R"(@")"))
587
6
    Offset += 2;
588
6
  else // CSharpStringLiteral->TokenText.startswith(R"($@")")
589
6
    Offset += 3;
590
591
  // Look for a terminating '"' in the current file buffer.
592
  // Make no effort to format code within an interpolated or verbatim string.
593
288
  for (; Offset != Lex->getBuffer().end(); 
++Offset276
) {
594
288
    if (Offset[0] == '"') {
595
      // "" within a verbatim string is an escaped double quote: skip it.
596
22
      if (Offset + 1 < Lex->getBuffer().end() && Offset[1] == '"')
597
10
        ++Offset;
598
12
      else
599
12
        break;
600
22
    }
601
288
  }
602
603
  // Make no attempt to format code properly if a verbatim string is
604
  // unterminated.
605
12
  if (Offset == Lex->getBuffer().end())
606
0
    return;
607
608
12
  StringRef LiteralText(StrBegin, Offset - StrBegin + 1);
609
12
  CSharpStringLiteral->TokenText = LiteralText;
610
611
  // Adjust width for potentially multiline string literals.
612
12
  size_t FirstBreak = LiteralText.find('\n');
613
12
  StringRef FirstLineText = FirstBreak == StringRef::npos
614
12
                                ? 
LiteralText10
615
12
                                : 
LiteralText.substr(0, FirstBreak)2
;
616
12
  CSharpStringLiteral->ColumnWidth = encoding::columnWidthWithTabs(
617
12
      FirstLineText, CSharpStringLiteral->OriginalColumn, Style.TabWidth,
618
12
      Encoding);
619
12
  size_t LastBreak = LiteralText.rfind('\n');
620
12
  if (LastBreak != StringRef::npos) {
621
2
    CSharpStringLiteral->IsMultiline = true;
622
2
    unsigned StartColumn = 0;
623
2
    CSharpStringLiteral->LastLineColumnWidth =
624
2
        encoding::columnWidthWithTabs(LiteralText.substr(LastBreak + 1),
625
2
                                      StartColumn, Style.TabWidth, Encoding);
626
2
  }
627
628
12
  SourceLocation loc = Offset < Lex->getBuffer().end()
629
12
                           ? Lex->getSourceLocation(Offset + 1)
630
12
                           : 
SourceMgr.getLocForEndOfFile(ID)0
;
631
12
  resetLexer(SourceMgr.getFileOffset(loc));
632
12
}
633
634
37.9k
void FormatTokenLexer::handleTemplateStrings() {
635
37.9k
  FormatToken *BacktickToken = Tokens.back();
636
637
37.9k
  if (BacktickToken->is(tok::l_brace)) {
638
2.07k
    StateStack.push(LexerState::NORMAL);
639
2.07k
    return;
640
2.07k
  }
641
35.8k
  if (BacktickToken->is(tok::r_brace)) {
642
2.17k
    if (StateStack.size() == 1)
643
4
      return;
644
2.16k
    StateStack.pop();
645
2.16k
    if (StateStack.top() != LexerState::TEMPLATE_STRING)
646
2.06k
      return;
647
    // If back in TEMPLATE_STRING, fallthrough and continue parsing the
648
33.7k
  } else if (BacktickToken->is(tok::unknown) &&
649
33.7k
             
BacktickToken->TokenText == "`"148
) {
650
148
    StateStack.push(LexerState::TEMPLATE_STRING);
651
33.5k
  } else {
652
33.5k
    return; // Not actually a template
653
33.5k
  }
654
655
  // 'Manually' lex ahead in the current file buffer.
656
248
  const char *Offset = Lex->getBufferLocation();
657
248
  const char *TmplBegin = Offset - BacktickToken->TokenText.size(); // at "`"
658
1.90k
  for (; Offset != Lex->getBuffer().end(); 
++Offset1.65k
) {
659
1.90k
    if (Offset[0] == '`') {
660
148
      StateStack.pop();
661
148
      break;
662
148
    }
663
1.75k
    if (Offset[0] == '\\') {
664
8
      ++Offset; // Skip the escaped character.
665
1.74k
    } else if (Offset + 1 < Lex->getBuffer().end() && Offset[0] == '$' &&
666
1.74k
               
Offset[1] == '{'100
) {
667
      // '${' introduces an expression interpolation in the template string.
668
100
      StateStack.push(LexerState::NORMAL);
669
100
      ++Offset;
670
100
      break;
671
100
    }
672
1.75k
  }
673
674
248
  StringRef LiteralText(TmplBegin, Offset - TmplBegin + 1);
675
248
  BacktickToken->setType(TT_TemplateString);
676
248
  BacktickToken->Tok.setKind(tok::string_literal);
677
248
  BacktickToken->TokenText = LiteralText;
678
679
  // Adjust width for potentially multiline string literals.
680
248
  size_t FirstBreak = LiteralText.find('\n');
681
248
  StringRef FirstLineText = FirstBreak == StringRef::npos
682
248
                                ? 
LiteralText212
683
248
                                : 
LiteralText.substr(0, FirstBreak)36
;
684
248
  BacktickToken->ColumnWidth = encoding::columnWidthWithTabs(
685
248
      FirstLineText, BacktickToken->OriginalColumn, Style.TabWidth, Encoding);
686
248
  size_t LastBreak = LiteralText.rfind('\n');
687
248
  if (LastBreak != StringRef::npos) {
688
36
    BacktickToken->IsMultiline = true;
689
36
    unsigned StartColumn = 0; // The template tail spans the entire line.
690
36
    BacktickToken->LastLineColumnWidth =
691
36
        encoding::columnWidthWithTabs(LiteralText.substr(LastBreak + 1),
692
36
                                      StartColumn, Style.TabWidth, Encoding);
693
36
  }
694
695
248
  SourceLocation loc = Offset < Lex->getBuffer().end()
696
248
                           ? Lex->getSourceLocation(Offset + 1)
697
248
                           : 
SourceMgr.getLocForEndOfFile(ID)0
;
698
248
  resetLexer(SourceMgr.getFileOffset(loc));
699
248
}
700
701
6.73k
void FormatTokenLexer::tryParsePythonComment() {
702
6.73k
  FormatToken *HashToken = Tokens.back();
703
6.73k
  if (!HashToken->isOneOf(tok::hash, tok::hashhash))
704
6.66k
    return;
705
  // Turn the remainder of this line into a comment.
706
68
  const char *CommentBegin =
707
68
      Lex->getBufferLocation() - HashToken->TokenText.size(); // at "#"
708
68
  size_t From = CommentBegin - Lex->getBuffer().begin();
709
68
  size_t To = Lex->getBuffer().find_first_of('\n', From);
710
68
  if (To == StringRef::npos)
711
8
    To = Lex->getBuffer().size();
712
68
  size_t Len = To - From;
713
68
  HashToken->setType(TT_LineComment);
714
68
  HashToken->Tok.setKind(tok::comment);
715
68
  HashToken->TokenText = Lex->getBuffer().substr(From, Len);
716
68
  SourceLocation Loc = To < Lex->getBuffer().size()
717
68
                           ? 
Lex->getSourceLocation(CommentBegin + Len)60
718
68
                           : 
SourceMgr.getLocForEndOfFile(ID)8
;
719
68
  resetLexer(SourceMgr.getFileOffset(Loc));
720
68
}
721
722
1.08M
bool FormatTokenLexer::tryMerge_TMacro() {
723
1.08M
  if (Tokens.size() < 4)
724
191k
    return false;
725
894k
  FormatToken *Last = Tokens.back();
726
894k
  if (!Last->is(tok::r_paren))
727
814k
    return false;
728
729
80.3k
  FormatToken *String = Tokens[Tokens.size() - 2];
730
80.3k
  if (!String->is(tok::string_literal) || 
String->IsMultiline795
)
731
79.5k
    return false;
732
733
762
  if (!Tokens[Tokens.size() - 3]->is(tok::l_paren))
734
359
    return false;
735
736
403
  FormatToken *Macro = Tokens[Tokens.size() - 4];
737
403
  if (Macro->TokenText != "_T")
738
385
    return false;
739
740
18
  const char *Start = Macro->TokenText.data();
741
18
  const char *End = Last->TokenText.data() + Last->TokenText.size();
742
18
  String->TokenText = StringRef(Start, End - Start);
743
18
  String->IsFirst = Macro->IsFirst;
744
18
  String->LastNewlineOffset = Macro->LastNewlineOffset;
745
18
  String->WhitespaceRange = Macro->WhitespaceRange;
746
18
  String->OriginalColumn = Macro->OriginalColumn;
747
18
  String->ColumnWidth = encoding::columnWidthWithTabs(
748
18
      String->TokenText, String->OriginalColumn, Style.TabWidth, Encoding);
749
18
  String->NewlinesBefore = Macro->NewlinesBefore;
750
18
  String->HasUnescapedNewline = Macro->HasUnescapedNewline;
751
752
18
  Tokens.pop_back();
753
18
  Tokens.pop_back();
754
18
  Tokens.pop_back();
755
18
  Tokens.back() = String;
756
18
  if (FirstInLineIndex >= Tokens.size())
757
3
    FirstInLineIndex = Tokens.size() - 1;
758
18
  return true;
759
403
}
760
761
1.08M
bool FormatTokenLexer::tryMergeConflictMarkers() {
762
1.08M
  if (Tokens.back()->NewlinesBefore == 0 && 
Tokens.back()->isNot(tok::eof)1.00M
)
763
941k
    return false;
764
765
  // Conflict lines look like:
766
  // <marker> <text from the vcs>
767
  // For example:
768
  // >>>>>>> /file/in/file/system at revision 1234
769
  //
770
  // We merge all tokens in a line that starts with a conflict marker
771
  // into a single token with a special token type that the unwrapped line
772
  // parser will use to correctly rebuild the underlying code.
773
774
145k
  FileID ID;
775
  // Get the position of the first token in the line.
776
145k
  unsigned FirstInLineOffset;
777
145k
  std::tie(ID, FirstInLineOffset) = SourceMgr.getDecomposedLoc(
778
145k
      Tokens[FirstInLineIndex]->getStartOfNonWhitespace());
779
145k
  StringRef Buffer = SourceMgr.getBufferOrFake(ID).getBuffer();
780
  // Calculate the offset of the start of the current line.
781
145k
  auto LineOffset = Buffer.rfind('\n', FirstInLineOffset);
782
145k
  if (LineOffset == StringRef::npos)
783
63.1k
    LineOffset = 0;
784
82.3k
  else
785
82.3k
    ++LineOffset;
786
787
145k
  auto FirstSpace = Buffer.find_first_of(" \n", LineOffset);
788
145k
  StringRef LineStart;
789
145k
  if (FirstSpace == StringRef::npos)
790
8.31k
    LineStart = Buffer.substr(LineOffset);
791
137k
  else
792
137k
    LineStart = Buffer.substr(LineOffset, FirstSpace - LineOffset);
793
794
145k
  TokenType Type = TT_Unknown;
795
145k
  if (LineStart == "<<<<<<<" || 
LineStart == ">>>>"145k
)
796
9
    Type = TT_ConflictStart;
797
145k
  else if (LineStart == "|||||||" || 
LineStart == "======="145k
||
798
145k
           
LineStart == "===="145k
)
799
27
    Type = TT_ConflictAlternative;
800
145k
  else if (LineStart == ">>>>>>>" || 
LineStart == "<<<<"145k
)
801
9
    Type = TT_ConflictEnd;
802
803
145k
  if (Type != TT_Unknown) {
804
45
    FormatToken *Next = Tokens.back();
805
806
45
    Tokens.resize(FirstInLineIndex + 1);
807
    // We do not need to build a complete token here, as we will skip it
808
    // during parsing anyway (as we must not touch whitespace around conflict
809
    // markers).
810
45
    Tokens.back()->setType(Type);
811
45
    Tokens.back()->Tok.setKind(tok::kw___unknown_anytype);
812
813
45
    Tokens.push_back(Next);
814
45
    return true;
815
45
  }
816
817
145k
  return false;
818
145k
}
819
820
2.66k
// Creates the synthesized second '>' or '<' token for an operator that was
// split in two, makes it the current token and returns it.
FormatToken *FormatTokenLexer::getStashedToken() {
  // The first half is still the current token; the second half copies it.
  const FormatToken &FirstHalf = *FormatTok;

  FormatToken *SecondHalf = new (Allocator.Allocate()) FormatToken;
  SecondHalf->Tok = FirstHalf.Tok;

  // The second half sits on the last character of the original token.
  const SourceLocation Loc = FirstHalf.Tok.getLocation().getLocWithOffset(
      FirstHalf.Tok.getLength() - 1);
  SecondHalf->Tok.setLocation(Loc);
  // No whitespace separates the two halves.
  SecondHalf->WhitespaceRange = SourceRange(Loc, Loc);
  SecondHalf->TokenText = FirstHalf.TokenText;
  SecondHalf->ColumnWidth = 1;
  SecondHalf->OriginalColumn = FirstHalf.OriginalColumn + 1;

  FormatTok = SecondHalf;
  return FormatTok;
}
838
839
1.08M
// Lexes the next significant token. Preceding whitespace is consumed and
// recorded on the token (newline counts, whitespace range, original column),
// a few token kinds are adjusted, column widths are computed and, for C++,
// configured macro types are applied.
FormatToken *FormatTokenLexer::getNextToken() {
  // The second half of a split '>>' / '<<' is still pending.
  if (StateStack.top() == LexerState::TOKEN_STASHED) {
    StateStack.pop();
    return getStashedToken();
  }

  FormatTok = new (Allocator.Allocate()) FormatToken;
  readRawToken(*FormatTok);
  const SourceLocation WhitespaceBegin =
      FormatTok->Tok.getLocation().getLocWithOffset(-TrailingWhitespace);
  FormatTok->IsFirst = IsFirstToken;
  IsFirstToken = false;

  // Consume and record whitespace until we find a significant token.
  unsigned WhitespaceLength = TrailingWhitespace;
  while (FormatTok->is(tok::unknown)) {
    const StringRef Raw = FormatTok->TokenText;
    const int RawSize = Raw.size();

    // Reports whether the newline that follows position Pos is escaped.
    auto IsEscapedAt = [&Raw](int Pos) -> bool {
      // A '\r' here is just part of '\r\n'. Skip it.
      if (Pos >= 0 && Raw[Pos] == '\r')
        --Pos;
      // See whether there is an odd number of '\' before this.
      // FIXME: This is wrong. A '\' followed by a newline is always removed,
      // regardless of whether there is another '\' before it.
      // FIXME: Newlines can also be escaped by a '?' '?' '/' trigraph.
      unsigned Backslashes = 0;
      while (Pos >= 0 && Raw[Pos] == '\\') {
        --Pos;
        ++Backslashes;
      }
      return (Backslashes % 2) != 0;
    };

    // FIXME: This miscounts tok:unknown tokens that are not just
    // whitespace, e.g. a '`' character.
    for (int Idx = 0; Idx != RawSize; ++Idx) {
      const char C = Raw[Idx];
      if (C == '\n' || C == '\r') {
        if (C == '\n') {
          ++FormatTok->NewlinesBefore;
          FormatTok->HasUnescapedNewline = !IsEscapedAt(Idx - 1);
        }
        FormatTok->LastNewlineOffset = WhitespaceLength + Idx + 1;
        Column = 0;
      } else if (C == '\f' || C == '\v') {
        Column = 0;
      } else if (C == ' ') {
        ++Column;
      } else if (C == '\t') {
        // Advance to the next tab stop; a tab width of 0 adds nothing.
        const unsigned PastTabStop =
            Style.TabWidth ? Column % Style.TabWidth : 0;
        Column += Style.TabWidth - PastTabStop;
      } else if (C == '\\') {
        // A backslash counts as whitespace only when it escapes a line break.
        const bool EscapesLineBreak =
            Idx + 1 != RawSize &&
            (Raw[Idx + 1] == '\r' || Raw[Idx + 1] == '\n');
        if (!EscapesLineBreak)
          FormatTok->setType(TT_ImplicitStringLiteral);
      } else {
        FormatTok->setType(TT_ImplicitStringLiteral);
      }

      if (FormatTok->getType() == TT_ImplicitStringLiteral)
        break;
    }

    if (FormatTok->is(TT_ImplicitStringLiteral))
      break;
    WhitespaceLength += FormatTok->Tok.getLength();

    readRawToken(*FormatTok);
  }

  // JavaScript and Java do not allow to escape the end of the line with a
  // backslash. Backslashes are syntax errors in plain source, but can occur in
  // comments. When a single line comment ends with a \, it'll cause the next
  // line of code to be lexed as a comment, breaking formatting. The code below
  // finds comments that contain a backslash followed by a line break,
  // truncates the comment token at the backslash, and resets the lexer to
  // restart behind the backslash.
  const bool NoLineContinuations =
      Style.isJavaScript() || Style.Language == FormatStyle::LK_Java;
  if (NoLineContinuations && FormatTok->is(tok::comment) &&
      FormatTok->TokenText.startswith("//")) {
    for (size_t Pos = FormatTok->TokenText.find('\\'); Pos != StringRef::npos;
         Pos = FormatTok->TokenText.find('\\', Pos + 1)) {
      const bool BeforeNewline = Pos + 1 < FormatTok->TokenText.size() &&
                                 FormatTok->TokenText[Pos + 1] == '\n';
      if (!BeforeNewline)
        continue;

      const char *Resume = Lex->getBufferLocation();
      Resume -= FormatTok->TokenText.size();
      Resume += Pos + 1;
      resetLexer(SourceMgr.getFileOffset(Lex->getSourceLocation(Resume)));
      FormatTok->TokenText = FormatTok->TokenText.substr(0, Pos + 1);
      FormatTok->ColumnWidth = encoding::columnWidthWithTabs(
          FormatTok->TokenText, FormatTok->OriginalColumn, Style.TabWidth,
          Encoding);
      break;
    }
  }

  // In case the token starts with escaped newlines, we want to
  // take them into account as whitespace - this pattern is quite frequent
  // in macro definitions.
  // FIXME: Add a more explicit test.
  for (;;) {
    const StringRef Current = FormatTok->TokenText;
    unsigned SkippedWhitespace;
    if (Current.startswith("\\\r\n"))
      SkippedWhitespace = 3;
    else if (Current.startswith("\\\n"))
      SkippedWhitespace = 2;
    else
      break;

    ++FormatTok->NewlinesBefore;
    WhitespaceLength += SkippedWhitespace;
    FormatTok->LastNewlineOffset = SkippedWhitespace;
    Column = 0;
    FormatTok->TokenText = Current.substr(SkippedWhitespace);
  }

  FormatTok->WhitespaceRange = SourceRange(
      WhitespaceBegin, WhitespaceBegin.getLocWithOffset(WhitespaceLength));

  FormatTok->OriginalColumn = Column;

  TrailingWhitespace = 0;
  if (FormatTok->is(tok::comment)) {
    // Strip trailing blanks from the comment and remember how many were
    // removed.
    // FIXME: Add the trimmed whitespace to Column.
    const size_t UntrimmedSize = FormatTok->TokenText.size();
    FormatTok->TokenText = FormatTok->TokenText.rtrim(" \t\v\f");
    TrailingWhitespace = UntrimmedSize - FormatTok->TokenText.size();
  } else if (FormatTok->is(tok::raw_identifier)) {
    // Resolve the raw identifier through the identifier table.
    IdentifierInfo &Info = IdentTable.get(FormatTok->TokenText);
    FormatTok->Tok.setIdentifierInfo(&Info);
    FormatTok->Tok.setKind(Info.getTokenID());

    // For Java and JavaScript, these keyword kinds are turned back into plain
    // identifiers.
    const bool DemoteToIdentifier =
        (Style.Language == FormatStyle::LK_Java &&
         FormatTok->isOneOf(tok::kw_struct, tok::kw_union, tok::kw_delete,
                            tok::kw_operator)) ||
        (Style.isJavaScript() &&
         FormatTok->isOneOf(tok::kw_struct, tok::kw_union, tok::kw_operator));
    if (DemoteToIdentifier) {
      FormatTok->Tok.setKind(tok::identifier);
      FormatTok->Tok.setIdentifierInfo(nullptr);
    }
  } else if (FormatTok->isOneOf(tok::greatergreater, tok::lessless)) {
    // Split '>>' / '<<' in two: return the first character now and stash the
    // second one for the next call.
    const bool IsShiftRight = FormatTok->is(tok::greatergreater);
    FormatTok->Tok.setKind(IsShiftRight ? tok::greater : tok::less);
    FormatTok->TokenText = FormatTok->TokenText.substr(0, 1);
    ++Column;
    StateStack.push(LexerState::TOKEN_STASHED);
  }

  // Now FormatTok is the next non-whitespace token.

  const StringRef Text = FormatTok->TokenText;
  const size_t FirstNewlinePos = Text.find('\n');
  if (FirstNewlinePos != StringRef::npos) {
    FormatTok->IsMultiline = true;
    // FIXME: ColumnWidth actually depends on the start column, we need to
    // take this into account when the token is moved.
    FormatTok->ColumnWidth = encoding::columnWidthWithTabs(
        Text.substr(0, FirstNewlinePos), Column, Style.TabWidth, Encoding);

    // The last line of the token always starts in column 0.
    // Thus, the length can be precomputed even in the presence of tabs.
    FormatTok->LastLineColumnWidth = encoding::columnWidthWithTabs(
        Text.substr(Text.rfind('\n') + 1), 0, Style.TabWidth, Encoding);
    Column = FormatTok->LastLineColumnWidth;
  } else {
    // FIXME: ColumnWidth actually depends on the start column, we need to
    // take this into account when the token is moved.
    FormatTok->ColumnWidth =
        encoding::columnWidthWithTabs(Text, Column, Style.TabWidth, Encoding);
    Column += FormatTok->ColumnWidth;
  }

  if (Style.isCpp()) {
    const auto MacroIt = Macros.find(FormatTok->Tok.getIdentifierInfo());

    // The macro type is not applied to the token directly following `define`.
    bool FollowsDefine = false;
    if (Tokens.size() > 0) {
      const IdentifierInfo *PrevInfo = Tokens.back()->Tok.getIdentifierInfo();
      FollowsDefine = PrevInfo && PrevInfo->getPPKeywordID() == tok::pp_define;
    }

    if (!FollowsDefine && MacroIt != Macros.end()) {
      if (MacroIt->second == TT_UntouchableMacroFunc)
        FormatTok->setFinalizedType(TT_UntouchableMacroFunc);
      else
        FormatTok->setType(MacroIt->second);
      if (MacroIt->second == TT_IfMacro) {
        // The lexer token currently has type tok::kw_unknown. However, for this
        // substitution to be treated correctly in the TokenAnnotator, faking
        // the tok value seems to be needed. Not sure if there's a more elegant
        // way.
        FormatTok->Tok.setKind(tok::kw_if);
      }
    } else if (FormatTok->is(tok::identifier)) {
      if (MacroBlockBeginRegex.match(Text))
        FormatTok->setType(TT_MacroBlockBegin);
      else if (MacroBlockEndRegex.match(Text))
        FormatTok->setType(TT_MacroBlockEnd);
    }
  }

  return FormatTok;
}
1051
1052
1.56M
// Lexes one raw token into Tok, records its text, applies a few
// language-specific kind fix-ups and tracks the `clang-format on/off` comments.
void FormatTokenLexer::readRawToken(FormatToken &Tok) {
  Lex->LexFromRawLexer(Tok.Tok);
  const char *TextBegin = SourceMgr.getCharacterData(Tok.Tok.getLocation());
  Tok.TokenText = StringRef(TextBegin, Tok.Tok.getLength());

  // For formatting, treat unterminated string literals like normal string
  // literals.
  if (Tok.is(tok::unknown)) {
    if (Tok.TokenText.startswith("\"")) {
      Tok.Tok.setKind(tok::string_literal);
      Tok.IsUnterminatedLiteral = true;
    } else if (Style.isJavaScript() && Tok.TokenText == "''") {
      Tok.Tok.setKind(tok::string_literal);
    }
  }

  // In JavaScript and the proto languages, character constants are handled as
  // string literals.
  const bool CharConstantIsString =
      Style.isJavaScript() || Style.Language == FormatStyle::LK_Proto ||
      Style.Language == FormatStyle::LK_TextProto;
  if (CharConstantIsString && Tok.is(tok::char_constant))
    Tok.Tok.setKind(tok::string_literal);

  const bool IsComment = Tok.is(tok::comment);

  // Formatting is re-enabled before Finalized is set, so the "on" comment
  // itself is not finalized.
  if (IsComment && (Tok.TokenText == "// clang-format on" ||
                    Tok.TokenText == "/* clang-format on */")) {
    FormattingDisabled = false;
  }

  Tok.Finalized = FormattingDisabled;

  // Formatting is disabled after Finalized is set, so the "off" comment itself
  // keeps the previous state.
  if (IsComment && (Tok.TokenText == "// clang-format off" ||
                    Tok.TokenText == "/* clang-format off */")) {
    FormattingDisabled = true;
  }
}
1082
1083
644
void FormatTokenLexer::resetLexer(unsigned Offset) {
1084
644
  StringRef Buffer = SourceMgr.getBufferData(ID);
1085
644
  LangOpts = getFormattingLangOpts(Style);
1086
644
  Lex.reset(new Lexer(SourceMgr.getLocForStartOfFile(ID), LangOpts,
1087
644
                      Buffer.begin(), Buffer.begin() + Offset, Buffer.end()));
1088
644
  Lex->SetKeepWhitespaceMode(true);
1089
644
  TrailingWhitespace = 0;
1090
644
}
1091
1092
} // namespace format
1093
} // namespace clang