Coverage Report

Created: 2021-04-24 07:00

/Users/buildslave/jenkins/workspace/coverage/llvm-project/clang/lib/Format/FormatTokenLexer.cpp
Line
Count
Source (jump to first uncovered line)
1
//===--- FormatTokenLexer.cpp - Lex FormatTokens -------------*- C++ ----*-===//
2
//
3
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4
// See https://llvm.org/LICENSE.txt for license information.
5
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6
//
7
//===----------------------------------------------------------------------===//
8
///
9
/// \file
10
/// This file implements FormatTokenLexer, which tokenizes a source file
11
/// into a FormatToken stream suitable for ClangFormat.
12
///
13
//===----------------------------------------------------------------------===//
14
15
#include "FormatTokenLexer.h"
16
#include "FormatToken.h"
17
#include "clang/Basic/SourceLocation.h"
18
#include "clang/Basic/SourceManager.h"
19
#include "clang/Format/Format.h"
20
#include "llvm/Support/Regex.h"
21
22
namespace clang {
23
namespace format {
24
25
FormatTokenLexer::FormatTokenLexer(
26
    const SourceManager &SourceMgr, FileID ID, unsigned Column,
27
    const FormatStyle &Style, encoding::Encoding Encoding,
28
    llvm::SpecificBumpPtrAllocator<FormatToken> &Allocator,
29
    IdentifierTable &IdentTable)
30
    : FormatTok(nullptr), IsFirstToken(true), StateStack({LexerState::NORMAL}),
31
      Column(Column), TrailingWhitespace(0), SourceMgr(SourceMgr), ID(ID),
32
      Style(Style), IdentTable(IdentTable), Keywords(IdentTable),
33
      Encoding(Encoding), Allocator(Allocator), FirstInLineIndex(0),
34
      FormattingDisabled(false), MacroBlockBeginRegex(Style.MacroBlockBegin),
35
38.7k
      MacroBlockEndRegex(Style.MacroBlockEnd) {
36
38.7k
  Lex.reset(new Lexer(ID, SourceMgr.getBufferOrFake(ID), SourceMgr,
37
38.7k
                      getFormattingLangOpts(Style)));
38
38.7k
  Lex->SetKeepWhitespaceMode(true);
39
40
38.7k
  for (const std::string &ForEachMacro : Style.ForEachMacros)
41
116k
    Macros.insert({&IdentTable.get(ForEachMacro), TT_ForEachMacro});
42
38.7k
  for (const std::string &AttributeMacro : Style.AttributeMacros)
43
39.4k
    Macros.insert({&IdentTable.get(AttributeMacro), TT_AttributeMacro});
44
38.7k
  for (const std::string &StatementMacro : Style.StatementMacros)
45
77.6k
    Macros.insert({&IdentTable.get(StatementMacro), TT_StatementMacro});
46
38.7k
  for (const std::string &TypenameMacro : Style.TypenameMacros)
47
553
    Macros.insert({&IdentTable.get(TypenameMacro), TT_TypenameMacro});
48
38.7k
  for (const std::string &NamespaceMacro : Style.NamespaceMacros)
49
159
    Macros.insert({&IdentTable.get(NamespaceMacro), TT_NamespaceMacro});
50
38.7k
  for (const std::string &WhitespaceSensitiveMacro :
51
193k
       Style.WhitespaceSensitiveMacros) {
52
193k
    Macros.insert(
53
193k
        {&IdentTable.get(WhitespaceSensitiveMacro), TT_UntouchableMacroFunc});
54
193k
  }
55
38.7k
  for (const std::string &StatementAttributeLikeMacro :
56
38.7k
       Style.StatementAttributeLikeMacros)
57
38.7k
    Macros.insert({&IdentTable.get(StatementAttributeLikeMacro),
58
38.7k
                   TT_StatementAttributeLikeMacro});
59
38.7k
}
60
61
38.7k
ArrayRef<FormatToken *> FormatTokenLexer::lex() {
62
38.7k
  assert(Tokens.empty());
63
0
  assert(FirstInLineIndex == 0);
64
664k
  do {
65
664k
    Tokens.push_back(getNextToken());
66
664k
    if (Style.Language == FormatStyle::LK_JavaScript) {
67
35.1k
      tryParseJSRegexLiteral();
68
35.1k
      handleTemplateStrings();
69
35.1k
    }
70
664k
    if (Style.Language == FormatStyle::LK_TextProto)
71
6.72k
      tryParsePythonComment();
72
664k
    tryMergePreviousTokens();
73
664k
    if (Style.isCSharp())
74
      // This needs to come after tokens have been merged so that C#
75
      // string literals are correctly identified.
76
4.86k
      handleCSharpVerbatimAndInterpolatedStrings();
77
664k
    if (Tokens.back()->NewlinesBefore > 0 || 
Tokens.back()->IsMultiline599k
)
78
65.6k
      FirstInLineIndex = Tokens.size() - 1;
79
664k
  } while (Tokens.back()->Tok.isNot(tok::eof));
80
38.7k
  return Tokens;
81
38.7k
}
82
83
664k
void FormatTokenLexer::tryMergePreviousTokens() {
84
664k
  if (tryMerge_TMacro())
85
15
    return;
86
664k
  if (tryMergeConflictMarkers())
87
36
    return;
88
664k
  if (tryMergeLessLess())
89
982
    return;
90
663k
  if (tryMergeForEach())
91
7
    return;
92
663k
  if (Style.isCpp() && 
tryTransformTryUsageForC()608k
)
93
28
    return;
94
95
663k
  if (Style.isCSharp()) {
96
4.86k
    if (tryMergeCSharpKeywordVariables())
97
2
      return;
98
4.86k
    if (tryMergeCSharpStringLiteral())
99
43
      return;
100
4.81k
    if (tryMergeCSharpDoubleQuestion())
101
6
      return;
102
4.81k
    if (tryMergeCSharpNullConditional())
103
14
      return;
104
4.79k
    if (tryTransformCSharpForEach())
105
8
      return;
106
4.79k
    static const tok::TokenKind JSRightArrow[] = {tok::equal, tok::greater};
107
4.79k
    if (tryMergeTokens(JSRightArrow, TT_JsFatArrow))
108
34
      return;
109
4.79k
  }
110
111
663k
  if (tryMergeNSStringLiteral())
112
222
    return;
113
114
663k
  if (Style.Language == FormatStyle::LK_JavaScript) {
115
35.1k
    static const tok::TokenKind JSIdentity[] = {tok::equalequal, tok::equal};
116
35.1k
    static const tok::TokenKind JSNotIdentity[] = {tok::exclaimequal,
117
35.1k
                                                   tok::equal};
118
35.1k
    static const tok::TokenKind JSShiftEqual[] = {tok::greater, tok::greater,
119
35.1k
                                                  tok::greaterequal};
120
35.1k
    static const tok::TokenKind JSRightArrow[] = {tok::equal, tok::greater};
121
35.1k
    static const tok::TokenKind JSExponentiation[] = {tok::star, tok::star};
122
35.1k
    static const tok::TokenKind JSExponentiationEqual[] = {tok::star,
123
35.1k
                                                           tok::starequal};
124
35.1k
    static const tok::TokenKind JSNullPropagatingOperator[] = {tok::question,
125
35.1k
                                                               tok::period};
126
35.1k
    static const tok::TokenKind JSNullishOperator[] = {tok::question,
127
35.1k
                                                       tok::question};
128
35.1k
    static const tok::TokenKind JSNullishEqual[] = {tok::question,
129
35.1k
                                                    tok::question, tok::equal};
130
35.1k
    static const tok::TokenKind JSPipePipeEqual[] = {tok::pipepipe, tok::equal};
131
35.1k
    static const tok::TokenKind JSAndAndEqual[] = {tok::ampamp, tok::equal};
132
133
    // FIXME: Investigate what token type gives the correct operator priority.
134
35.1k
    if (tryMergeTokens(JSIdentity, TT_BinaryOperator))
135
12
      return;
136
35.1k
    if (tryMergeTokens(JSNotIdentity, TT_BinaryOperator))
137
12
      return;
138
35.1k
    if (tryMergeTokens(JSShiftEqual, TT_BinaryOperator))
139
10
      return;
140
35.1k
    if (tryMergeTokens(JSRightArrow, TT_JsFatArrow))
141
148
      return;
142
34.9k
    if (tryMergeTokens(JSExponentiation, TT_JsExponentiation))
143
4
      return;
144
34.9k
    if (tryMergeTokens(JSExponentiationEqual, TT_JsExponentiationEqual)) {
145
4
      Tokens.back()->Tok.setKind(tok::starequal);
146
4
      return;
147
4
    }
148
34.9k
    if (tryMergeTokens(JSNullishOperator, TT_JsNullishCoalescingOperator)) {
149
      // Treat like the "||" operator (as opposed to the ternary ?).
150
20
      Tokens.back()->Tok.setKind(tok::pipepipe);
151
20
      return;
152
20
    }
153
34.9k
    if (tryMergeTokens(JSNullPropagatingOperator,
154
34.9k
                       TT_JsNullPropagatingOperator)) {
155
      // Treat like a regular "." access.
156
16
      Tokens.back()->Tok.setKind(tok::period);
157
16
      return;
158
16
    }
159
34.9k
    if (tryMergeTokens(JSAndAndEqual, TT_JsAndAndEqual) ||
160
34.9k
        
tryMergeTokens(JSPipePipeEqual, TT_JsPipePipeEqual)34.9k
||
161
34.9k
        
tryMergeTokens(JSNullishEqual, TT_JsNullishCoalescingEqual)34.9k
) {
162
      // Treat like the "=" assignment operator.
163
12
      Tokens.back()->Tok.setKind(tok::equal);
164
12
      return;
165
12
    }
166
34.9k
    if (tryMergeJSPrivateIdentifier())
167
32
      return;
168
34.9k
  }
169
170
663k
  if (Style.Language == FormatStyle::LK_Java) {
171
4.46k
    static const tok::TokenKind JavaRightLogicalShiftAssign[] = {
172
4.46k
        tok::greater, tok::greater, tok::greaterequal};
173
4.46k
    if (tryMergeTokens(JavaRightLogicalShiftAssign, TT_BinaryOperator))
174
2
      return;
175
4.46k
  }
176
663k
}
177
178
663k
bool FormatTokenLexer::tryMergeNSStringLiteral() {
179
663k
  if (Tokens.size() < 2)
180
38.7k
    return false;
181
624k
  auto &At = *(Tokens.end() - 2);
182
624k
  auto &String = *(Tokens.end() - 1);
183
624k
  if (!At->is(tok::at) || 
!String->is(tok::string_literal)1.28k
)
184
624k
    return false;
185
222
  At->Tok.setKind(tok::string_literal);
186
222
  At->TokenText = StringRef(At->TokenText.begin(),
187
222
                            String->TokenText.end() - At->TokenText.begin());
188
222
  At->ColumnWidth += String->ColumnWidth;
189
222
  At->setType(TT_ObjCStringLiteral);
190
222
  Tokens.erase(Tokens.end() - 1);
191
222
  return true;
192
624k
}
193
194
34.9k
bool FormatTokenLexer::tryMergeJSPrivateIdentifier() {
195
  // Merges #idenfier into a single identifier with the text #identifier
196
  // but the token tok::identifier.
197
34.9k
  if (Tokens.size() < 2)
198
2.85k
    return false;
199
32.0k
  auto &Hash = *(Tokens.end() - 2);
200
32.0k
  auto &Identifier = *(Tokens.end() - 1);
201
32.0k
  if (!Hash->is(tok::hash) || 
!Identifier->is(tok::identifier)36
)
202
32.0k
    return false;
203
32
  Hash->Tok.setKind(tok::identifier);
204
32
  Hash->TokenText =
205
32
      StringRef(Hash->TokenText.begin(),
206
32
                Identifier->TokenText.end() - Hash->TokenText.begin());
207
32
  Hash->ColumnWidth += Identifier->ColumnWidth;
208
32
  Hash->setType(TT_JsPrivateIdentifier);
209
32
  Tokens.erase(Tokens.end() - 1);
210
32
  return true;
211
32.0k
}
212
213
// Search for verbatim or interpolated string literals @"ABC" or
214
// $"aaaaa{abc}aaaaa" i and mark the token as TT_CSharpStringLiteral, and to
215
// prevent splitting of @, $ and ".
216
// Merging of multiline verbatim strings with embedded '"' is handled in
217
// handleCSharpVerbatimAndInterpolatedStrings with lower-level lexing.
218
4.86k
bool FormatTokenLexer::tryMergeCSharpStringLiteral() {
219
4.86k
  if (Tokens.size() < 2)
220
335
    return false;
221
222
  // Interpolated strings could contain { } with " characters inside.
223
  // $"{x ?? "null"}"
224
  // should not be split into $"{x ?? ", null, "}" but should treated as a
225
  // single string-literal.
226
  //
227
  // We opt not to try and format expressions inside {} within a C#
228
  // interpolated string. Formatting expressions within an interpolated string
229
  // would require similar work as that done for JavaScript template strings
230
  // in `handleTemplateStrings()`.
231
4.52k
  auto &CSharpInterpolatedString = *(Tokens.end() - 2);
232
4.52k
  if (CSharpInterpolatedString->getType() == TT_CSharpStringLiteral &&
233
4.52k
      
(43
CSharpInterpolatedString->TokenText.startswith(R"($")")43
||
234
43
       
CSharpInterpolatedString->TokenText.startswith(R"($@")")12
)) {
235
37
    int UnmatchedOpeningBraceCount = 0;
236
237
37
    auto TokenTextSize = CSharpInterpolatedString->TokenText.size();
238
989
    for (size_t Index = 0; Index < TokenTextSize; 
++Index952
) {
239
952
      char C = CSharpInterpolatedString->TokenText[Index];
240
952
      if (C == '{') {
241
        // "{{"  inside an interpolated string is an escaped '{' so skip it.
242
49
        if (Index + 1 < TokenTextSize &&
243
49
            CSharpInterpolatedString->TokenText[Index + 1] == '{') {
244
6
          ++Index;
245
6
          continue;
246
6
        }
247
43
        ++UnmatchedOpeningBraceCount;
248
903
      } else if (C == '}') {
249
        // "}}"  inside an interpolated string is an escaped '}' so skip it.
250
43
        if (Index + 1 < TokenTextSize &&
251
43
            CSharpInterpolatedString->TokenText[Index + 1] == '}') {
252
6
          ++Index;
253
6
          continue;
254
6
        }
255
37
        --UnmatchedOpeningBraceCount;
256
37
      }
257
952
    }
258
259
37
    if (UnmatchedOpeningBraceCount > 0) {
260
6
      auto &NextToken = *(Tokens.end() - 1);
261
6
      CSharpInterpolatedString->TokenText =
262
6
          StringRef(CSharpInterpolatedString->TokenText.begin(),
263
6
                    NextToken->TokenText.end() -
264
6
                        CSharpInterpolatedString->TokenText.begin());
265
6
      CSharpInterpolatedString->ColumnWidth += NextToken->ColumnWidth;
266
6
      Tokens.erase(Tokens.end() - 1);
267
6
      return true;
268
6
    }
269
37
  }
270
271
  // Look for @"aaaaaa" or $"aaaaaa".
272
4.52k
  auto &String = *(Tokens.end() - 1);
273
4.52k
  if (!String->is(tok::string_literal))
274
4.42k
    return false;
275
276
93
  auto &At = *(Tokens.end() - 2);
277
93
  if (!(At->is(tok::at) || 
At->TokenText == "$"81
))
278
56
    return false;
279
280
37
  if (Tokens.size() > 2 && 
At->is(tok::at)35
) {
281
12
    auto &Dollar = *(Tokens.end() - 3);
282
12
    if (Dollar->TokenText == "$") {
283
      // This looks like $@"aaaaa" so we need to combine all 3 tokens.
284
6
      Dollar->Tok.setKind(tok::string_literal);
285
6
      Dollar->TokenText =
286
6
          StringRef(Dollar->TokenText.begin(),
287
6
                    String->TokenText.end() - Dollar->TokenText.begin());
288
6
      Dollar->ColumnWidth += (At->ColumnWidth + String->ColumnWidth);
289
6
      Dollar->setType(TT_CSharpStringLiteral);
290
6
      Tokens.erase(Tokens.end() - 2);
291
6
      Tokens.erase(Tokens.end() - 1);
292
6
      return true;
293
6
    }
294
12
  }
295
296
  // Convert back into just a string_literal.
297
31
  At->Tok.setKind(tok::string_literal);
298
31
  At->TokenText = StringRef(At->TokenText.begin(),
299
31
                            String->TokenText.end() - At->TokenText.begin());
300
31
  At->ColumnWidth += String->ColumnWidth;
301
31
  At->setType(TT_CSharpStringLiteral);
302
31
  Tokens.erase(Tokens.end() - 1);
303
31
  return true;
304
37
}
305
306
// Valid C# attribute targets:
307
// https://docs.microsoft.com/en-us/dotnet/csharp/programming-guide/concepts/attributes/#attribute-targets
308
const llvm::StringSet<> FormatTokenLexer::CSharpAttributeTargets = {
309
    "assembly", "module",   "field",  "event", "method",
310
    "param",    "property", "return", "type",
311
};
312
313
4.81k
bool FormatTokenLexer::tryMergeCSharpDoubleQuestion() {
314
4.81k
  if (Tokens.size() < 2)
315
335
    return false;
316
4.48k
  auto &FirstQuestion = *(Tokens.end() - 2);
317
4.48k
  auto &SecondQuestion = *(Tokens.end() - 1);
318
4.48k
  if (!FirstQuestion->is(tok::question) || 
!SecondQuestion->is(tok::question)58
)
319
4.47k
    return false;
320
6
  FirstQuestion->Tok.setKind(tok::question); // no '??' in clang tokens.
321
6
  FirstQuestion->TokenText = StringRef(FirstQuestion->TokenText.begin(),
322
6
                                       SecondQuestion->TokenText.end() -
323
6
                                           FirstQuestion->TokenText.begin());
324
6
  FirstQuestion->ColumnWidth += SecondQuestion->ColumnWidth;
325
6
  FirstQuestion->setType(TT_CSharpNullCoalescing);
326
6
  Tokens.erase(Tokens.end() - 1);
327
6
  return true;
328
4.48k
}
329
330
// Merge '?[' and '?.' pairs into single tokens.
331
4.81k
bool FormatTokenLexer::tryMergeCSharpNullConditional() {
332
4.81k
  if (Tokens.size() < 2)
333
335
    return false;
334
4.47k
  auto &Question = *(Tokens.end() - 2);
335
4.47k
  auto &PeriodOrLSquare = *(Tokens.end() - 1);
336
4.47k
  if (!Question->is(tok::question) ||
337
4.47k
      
!PeriodOrLSquare->isOneOf(tok::l_square, tok::period)52
)
338
4.46k
    return false;
339
14
  Question->TokenText =
340
14
      StringRef(Question->TokenText.begin(),
341
14
                PeriodOrLSquare->TokenText.end() - Question->TokenText.begin());
342
14
  Question->ColumnWidth += PeriodOrLSquare->ColumnWidth;
343
344
14
  if (PeriodOrLSquare->is(tok::l_square)) {
345
8
    Question->Tok.setKind(tok::question); // no '?[' in clang tokens.
346
8
    Question->setType(TT_CSharpNullConditionalLSquare);
347
8
  } else {
348
6
    Question->Tok.setKind(tok::question); // no '?.' in clang tokens.
349
6
    Question->setType(TT_CSharpNullConditional);
350
6
  }
351
352
14
  Tokens.erase(Tokens.end() - 1);
353
14
  return true;
354
4.47k
}
355
356
4.86k
bool FormatTokenLexer::tryMergeCSharpKeywordVariables() {
357
4.86k
  if (Tokens.size() < 2)
358
335
    return false;
359
4.52k
  auto &At = *(Tokens.end() - 2);
360
4.52k
  auto &Keyword = *(Tokens.end() - 1);
361
4.52k
  if (!At->is(tok::at))
362
4.51k
    return false;
363
16
  if (!Keywords.isCSharpKeyword(*Keyword))
364
14
    return false;
365
366
2
  At->Tok.setKind(tok::identifier);
367
2
  At->TokenText = StringRef(At->TokenText.begin(),
368
2
                            Keyword->TokenText.end() - At->TokenText.begin());
369
2
  At->ColumnWidth += Keyword->ColumnWidth;
370
2
  At->setType(Keyword->getType());
371
2
  Tokens.erase(Tokens.end() - 1);
372
2
  return true;
373
16
}
374
375
// In C# transform identifier foreach into kw_foreach
376
4.79k
bool FormatTokenLexer::tryTransformCSharpForEach() {
377
4.79k
  if (Tokens.size() < 1)
378
0
    return false;
379
4.79k
  auto &Identifier = *(Tokens.end() - 1);
380
4.79k
  if (!Identifier->is(tok::identifier))
381
3.39k
    return false;
382
1.40k
  if (Identifier->TokenText != "foreach")
383
1.39k
    return false;
384
385
8
  Identifier->setType(TT_ForEachMacro);
386
8
  Identifier->Tok.setKind(tok::kw_for);
387
8
  return true;
388
1.40k
}
389
390
663k
bool FormatTokenLexer::tryMergeForEach() {
391
663k
  if (Tokens.size() < 2)
392
38.7k
    return false;
393
624k
  auto &For = *(Tokens.end() - 2);
394
624k
  auto &Each = *(Tokens.end() - 1);
395
624k
  if (!For->is(tok::kw_for))
396
624k
    return false;
397
751
  if (!Each->is(tok::identifier))
398
736
    return false;
399
15
  if (Each->TokenText != "each")
400
8
    return false;
401
402
7
  For->setType(TT_ForEachMacro);
403
7
  For->Tok.setKind(tok::kw_for);
404
405
7
  For->TokenText = StringRef(For->TokenText.begin(),
406
7
                             Each->TokenText.end() - For->TokenText.begin());
407
7
  For->ColumnWidth += Each->ColumnWidth;
408
7
  Tokens.erase(Tokens.end() - 1);
409
7
  return true;
410
15
}
411
412
608k
bool FormatTokenLexer::tryTransformTryUsageForC() {
413
608k
  if (Tokens.size() < 2)
414
34.5k
    return false;
415
573k
  auto &Try = *(Tokens.end() - 2);
416
573k
  if (!Try->is(tok::kw_try))
417
573k
    return false;
418
225
  auto &Next = *(Tokens.end() - 1);
419
225
  if (Next->isOneOf(tok::l_brace, tok::colon, tok::hash, tok::comment))
420
195
    return false;
421
422
30
  if (Tokens.size() > 2) {
423
23
    auto &At = *(Tokens.end() - 3);
424
23
    if (At->is(tok::at))
425
2
      return false;
426
23
  }
427
428
28
  Try->Tok.setKind(tok::identifier);
429
28
  return true;
430
30
}
431
432
664k
bool FormatTokenLexer::tryMergeLessLess() {
433
  // Merge X,less,less,Y into X,lessless,Y unless X or Y is less.
434
664k
  if (Tokens.size() < 3)
435
77.2k
    return false;
436
437
587k
  bool FourthTokenIsLess = false;
438
587k
  if (Tokens.size() > 3)
439
549k
    FourthTokenIsLess = (Tokens.end() - 4)[0]->is(tok::less);
440
441
587k
  auto First = Tokens.end() - 3;
442
587k
  if (First[2]->is(tok::less) || 
First[1]->isNot(tok::less)577k
||
443
587k
      
First[0]->isNot(tok::less)10.9k
||
FourthTokenIsLess1.05k
)
444
586k
    return false;
445
446
  // Only merge if there currently is no whitespace between the two "<".
447
982
  if (First[1]->WhitespaceRange.getBegin() !=
448
982
      First[1]->WhitespaceRange.getEnd())
449
0
    return false;
450
451
982
  First[0]->Tok.setKind(tok::lessless);
452
982
  First[0]->TokenText = "<<";
453
982
  First[0]->ColumnWidth += 1;
454
982
  Tokens.erase(Tokens.end() - 2);
455
982
  return true;
456
982
}
457
458
bool FormatTokenLexer::tryMergeTokens(ArrayRef<tok::TokenKind> Kinds,
459
394k
                                      TokenType NewType) {
460
394k
  if (Tokens.size() < Kinds.size())
461
38.0k
    return false;
462
463
356k
  SmallVectorImpl<FormatToken *>::const_iterator First =
464
356k
      Tokens.end() - Kinds.size();
465
356k
  if (!First[0]->is(Kinds[0]))
466
353k
    return false;
467
2.82k
  unsigned AddLength = 0;
468
3.15k
  for (unsigned i = 1; i < Kinds.size(); 
++i336
) {
469
2.88k
    if (!First[i]->is(Kinds[i]) || First[i]->WhitespaceRange.getBegin() !=
470
348
                                       First[i]->WhitespaceRange.getEnd())
471
2.54k
      return false;
472
336
    AddLength += First[i]->TokenText.size();
473
336
  }
474
274
  Tokens.resize(Tokens.size() - Kinds.size() + 1);
475
274
  First[0]->TokenText = StringRef(First[0]->TokenText.data(),
476
274
                                  First[0]->TokenText.size() + AddLength);
477
274
  First[0]->ColumnWidth += AddLength;
478
274
  First[0]->setType(NewType);
479
274
  return true;
480
2.82k
}
481
482
// Returns \c true if \p Tok can only be followed by an operand in JavaScript.
483
336
bool FormatTokenLexer::precedesOperand(FormatToken *Tok) {
484
  // NB: This is not entirely correct, as an r_paren can introduce an operand
485
  // location in e.g. `if (foo) /bar/.exec(...);`. That is a rare enough
486
  // corner case to not matter in practice, though.
487
336
  return Tok->isOneOf(tok::period, tok::l_paren, tok::comma, tok::l_brace,
488
336
                      tok::r_brace, tok::l_square, tok::semi, tok::exclaim,
489
336
                      tok::colon, tok::question, tok::tilde) ||
490
336
         Tok->isOneOf(tok::kw_return, tok::kw_do, tok::kw_case, tok::kw_throw,
491
292
                      tok::kw_else, tok::kw_new, tok::kw_delete, tok::kw_void,
492
292
                      tok::kw_typeof, Keywords.kw_instanceof, Keywords.kw_in) ||
493
336
         
Tok->isBinaryOperator()288
;
494
336
}
495
496
340
bool FormatTokenLexer::canPrecedeRegexLiteral(FormatToken *Prev) {
497
340
  if (!Prev)
498
4
    return true;
499
500
  // Regex literals can only follow after prefix unary operators, not after
501
  // postfix unary operators. If the '++' is followed by a non-operand
502
  // introducing token, the slash here is the operand and not the start of a
503
  // regex.
504
  // `!` is an unary prefix operator, but also a post-fix operator that casts
505
  // away nullability, so the same check applies.
506
336
  if (Prev->isOneOf(tok::plusplus, tok::minusminus, tok::exclaim))
507
20
    return (Tokens.size() < 3 || precedesOperand(Tokens[Tokens.size() - 3]));
508
509
  // The previous token must introduce an operand location where regex
510
  // literals can occur.
511
316
  if (!precedesOperand(Prev))
512
24
    return false;
513
514
292
  return true;
515
316
}
516
517
// Tries to parse a JavaScript Regex literal starting at the current token,
518
// if that begins with a slash and is in a location where JavaScript allows
519
// regex literals. Changes the current token to a regex literal and updates
520
// its text if successful.
521
35.1k
void FormatTokenLexer::tryParseJSRegexLiteral() {
522
35.1k
  FormatToken *RegexToken = Tokens.back();
523
35.1k
  if (!RegexToken->isOneOf(tok::slash, tok::slashequal))
524
34.8k
    return;
525
526
340
  FormatToken *Prev = nullptr;
527
348
  for (auto I = Tokens.rbegin() + 1, E = Tokens.rend(); I != E; 
++I8
) {
528
    // NB: Because previous pointers are not initialized yet, this cannot use
529
    // Token.getPreviousNonComment.
530
344
    if ((*I)->isNot(tok::comment)) {
531
336
      Prev = *I;
532
336
      break;
533
336
    }
534
344
  }
535
536
340
  if (!canPrecedeRegexLiteral(Prev))
537
36
    return;
538
539
  // 'Manually' lex ahead in the current file buffer.
540
304
  const char *Offset = Lex->getBufferLocation();
541
304
  const char *RegexBegin = Offset - RegexToken->TokenText.size();
542
304
  StringRef Buffer = Lex->getBuffer();
543
304
  bool InCharacterClass = false;
544
304
  bool HaveClosingSlash = false;
545
1.91k
  for (; !HaveClosingSlash && 
Offset != Buffer.end()1.61k
;
++Offset1.61k
) {
546
    // Regular expressions are terminated with a '/', which can only be
547
    // escaped using '\' or a character class between '[' and ']'.
548
    // See http://www.ecma-international.org/ecma-262/5.1/#sec-7.8.5.
549
1.61k
    switch (*Offset) {
550
116
    case '\\':
551
      // Skip the escaped character.
552
116
      ++Offset;
553
116
      break;
554
40
    case '[':
555
40
      InCharacterClass = true;
556
40
      break;
557
40
    case ']':
558
40
      InCharacterClass = false;
559
40
      break;
560
320
    case '/':
561
320
      if (!InCharacterClass)
562
304
        HaveClosingSlash = true;
563
320
      break;
564
1.61k
    }
565
1.61k
  }
566
567
304
  RegexToken->setType(TT_RegexLiteral);
568
  // Treat regex literals like other string_literals.
569
304
  RegexToken->Tok.setKind(tok::string_literal);
570
304
  RegexToken->TokenText = StringRef(RegexBegin, Offset - RegexBegin);
571
304
  RegexToken->ColumnWidth = RegexToken->TokenText.size();
572
573
304
  resetLexer(SourceMgr.getFileOffset(Lex->getSourceLocation(Offset)));
574
304
}
575
576
4.86k
void FormatTokenLexer::handleCSharpVerbatimAndInterpolatedStrings() {
577
4.86k
  FormatToken *CSharpStringLiteral = Tokens.back();
578
579
4.86k
  if (CSharpStringLiteral->getType() != TT_CSharpStringLiteral)
580
4.82k
    return;
581
582
  // Deal with multiline strings.
583
43
  if (!(CSharpStringLiteral->TokenText.startswith(R"(@")") ||
584
43
        
CSharpStringLiteral->TokenText.startswith(R"($@")")37
))
585
31
    return;
586
587
12
  const char *StrBegin =
588
12
      Lex->getBufferLocation() - CSharpStringLiteral->TokenText.size();
589
12
  const char *Offset = StrBegin;
590
12
  if (CSharpStringLiteral->TokenText.startswith(R"(@")"))
591
6
    Offset += 2;
592
6
  else // CSharpStringLiteral->TokenText.startswith(R"($@")")
593
6
    Offset += 3;
594
595
  // Look for a terminating '"' in the current file buffer.
596
  // Make no effort to format code within an interpolated or verbatim string.
597
288
  for (; Offset != Lex->getBuffer().end(); 
++Offset276
) {
598
288
    if (Offset[0] == '"') {
599
      // "" within a verbatim string is an escaped double quote: skip it.
600
22
      if (Offset + 1 < Lex->getBuffer().end() && Offset[1] == '"')
601
10
        ++Offset;
602
12
      else
603
12
        break;
604
22
    }
605
288
  }
606
607
  // Make no attempt to format code properly if a verbatim string is
608
  // unterminated.
609
12
  if (Offset == Lex->getBuffer().end())
610
0
    return;
611
612
12
  StringRef LiteralText(StrBegin, Offset - StrBegin + 1);
613
12
  CSharpStringLiteral->TokenText = LiteralText;
614
615
  // Adjust width for potentially multiline string literals.
616
12
  size_t FirstBreak = LiteralText.find('\n');
617
12
  StringRef FirstLineText = FirstBreak == StringRef::npos
618
12
                                ? 
LiteralText10
619
12
                                : 
LiteralText.substr(0, FirstBreak)2
;
620
12
  CSharpStringLiteral->ColumnWidth = encoding::columnWidthWithTabs(
621
12
      FirstLineText, CSharpStringLiteral->OriginalColumn, Style.TabWidth,
622
12
      Encoding);
623
12
  size_t LastBreak = LiteralText.rfind('\n');
624
12
  if (LastBreak != StringRef::npos) {
625
2
    CSharpStringLiteral->IsMultiline = true;
626
2
    unsigned StartColumn = 0;
627
2
    CSharpStringLiteral->LastLineColumnWidth = encoding::columnWidthWithTabs(
628
2
        LiteralText.substr(LastBreak + 1, LiteralText.size()), StartColumn,
629
2
        Style.TabWidth, Encoding);
630
2
  }
631
632
12
  SourceLocation loc = Offset < Lex->getBuffer().end()
633
12
                           ? Lex->getSourceLocation(Offset + 1)
634
12
                           : 
SourceMgr.getLocForEndOfFile(ID)0
;
635
12
  resetLexer(SourceMgr.getFileOffset(loc));
636
12
}
637
638
35.1k
void FormatTokenLexer::handleTemplateStrings() {
639
35.1k
  FormatToken *BacktickToken = Tokens.back();
640
641
35.1k
  if (BacktickToken->is(tok::l_brace)) {
642
1.90k
    StateStack.push(LexerState::NORMAL);
643
1.90k
    return;
644
1.90k
  }
645
33.2k
  if (BacktickToken->is(tok::r_brace)) {
646
2.00k
    if (StateStack.size() == 1)
647
4
      return;
648
1.99k
    StateStack.pop();
649
1.99k
    if (StateStack.top() != LexerState::TEMPLATE_STRING)
650
1.89k
      return;
651
    // If back in TEMPLATE_STRING, fallthrough and continue parsing the
652
31.2k
  } else if (BacktickToken->is(tok::unknown) &&
653
31.2k
             
BacktickToken->TokenText == "`"148
) {
654
148
    StateStack.push(LexerState::TEMPLATE_STRING);
655
31.1k
  } else {
656
31.1k
    return; // Not actually a template
657
31.1k
  }
658
659
  // 'Manually' lex ahead in the current file buffer.
660
248
  const char *Offset = Lex->getBufferLocation();
661
248
  const char *TmplBegin = Offset - BacktickToken->TokenText.size(); // at "`"
662
1.90k
  for (; Offset != Lex->getBuffer().end(); 
++Offset1.65k
) {
663
1.90k
    if (Offset[0] == '`') {
664
148
      StateStack.pop();
665
148
      break;
666
148
    }
667
1.75k
    if (Offset[0] == '\\') {
668
8
      ++Offset; // Skip the escaped character.
669
1.74k
    } else if (Offset + 1 < Lex->getBuffer().end() && Offset[0] == '$' &&
670
1.74k
               
Offset[1] == '{'100
) {
671
      // '${' introduces an expression interpolation in the template string.
672
100
      StateStack.push(LexerState::NORMAL);
673
100
      ++Offset;
674
100
      break;
675
100
    }
676
1.75k
  }
677
678
248
  StringRef LiteralText(TmplBegin, Offset - TmplBegin + 1);
679
248
  BacktickToken->setType(TT_TemplateString);
680
248
  BacktickToken->Tok.setKind(tok::string_literal);
681
248
  BacktickToken->TokenText = LiteralText;
682
683
  // Adjust width for potentially multiline string literals.
684
248
  size_t FirstBreak = LiteralText.find('\n');
685
248
  StringRef FirstLineText = FirstBreak == StringRef::npos
686
248
                                ? 
LiteralText212
687
248
                                : 
LiteralText.substr(0, FirstBreak)36
;
688
248
  BacktickToken->ColumnWidth = encoding::columnWidthWithTabs(
689
248
      FirstLineText, BacktickToken->OriginalColumn, Style.TabWidth, Encoding);
690
248
  size_t LastBreak = LiteralText.rfind('\n');
691
248
  if (LastBreak != StringRef::npos) {
692
36
    BacktickToken->IsMultiline = true;
693
36
    unsigned StartColumn = 0; // The template tail spans the entire line.
694
36
    BacktickToken->LastLineColumnWidth = encoding::columnWidthWithTabs(
695
36
        LiteralText.substr(LastBreak + 1, LiteralText.size()), StartColumn,
696
36
        Style.TabWidth, Encoding);
697
36
  }
698
699
248
  SourceLocation loc = Offset < Lex->getBuffer().end()
700
248
                           ? Lex->getSourceLocation(Offset + 1)
701
248
                           : 
SourceMgr.getLocForEndOfFile(ID)0
;
702
248
  resetLexer(SourceMgr.getFileOffset(loc));
703
248
}
704
705
6.72k
void FormatTokenLexer::tryParsePythonComment() {
706
6.72k
  FormatToken *HashToken = Tokens.back();
707
6.72k
  if (!HashToken->isOneOf(tok::hash, tok::hashhash))
708
6.66k
    return;
709
  // Turn the remainder of this line into a comment.
710
65
  const char *CommentBegin =
711
65
      Lex->getBufferLocation() - HashToken->TokenText.size(); // at "#"
712
65
  size_t From = CommentBegin - Lex->getBuffer().begin();
713
65
  size_t To = Lex->getBuffer().find_first_of('\n', From);
714
65
  if (To == StringRef::npos)
715
7
    To = Lex->getBuffer().size();
716
65
  size_t Len = To - From;
717
65
  HashToken->setType(TT_LineComment);
718
65
  HashToken->Tok.setKind(tok::comment);
719
65
  HashToken->TokenText = Lex->getBuffer().substr(From, Len);
720
65
  SourceLocation Loc = To < Lex->getBuffer().size()
721
65
                           ? 
Lex->getSourceLocation(CommentBegin + Len)58
722
65
                           : 
SourceMgr.getLocForEndOfFile(ID)7
;
723
65
  resetLexer(SourceMgr.getFileOffset(Loc));
724
65
}
725
726
664k
bool FormatTokenLexer::tryMerge_TMacro() {
727
664k
  if (Tokens.size() < 4)
728
115k
    return false;
729
549k
  FormatToken *Last = Tokens.back();
730
549k
  if (!Last->is(tok::r_paren))
731
501k
    return false;
732
733
48.1k
  FormatToken *String = Tokens[Tokens.size() - 2];
734
48.1k
  if (!String->is(tok::string_literal) || 
String->IsMultiline635
)
735
47.5k
    return false;
736
737
602
  if (!Tokens[Tokens.size() - 3]->is(tok::l_paren))
738
276
    return false;
739
740
326
  FormatToken *Macro = Tokens[Tokens.size() - 4];
741
326
  if (Macro->TokenText != "_T")
742
311
    return false;
743
744
15
  const char *Start = Macro->TokenText.data();
745
15
  const char *End = Last->TokenText.data() + Last->TokenText.size();
746
15
  String->TokenText = StringRef(Start, End - Start);
747
15
  String->IsFirst = Macro->IsFirst;
748
15
  String->LastNewlineOffset = Macro->LastNewlineOffset;
749
15
  String->WhitespaceRange = Macro->WhitespaceRange;
750
15
  String->OriginalColumn = Macro->OriginalColumn;
751
15
  String->ColumnWidth = encoding::columnWidthWithTabs(
752
15
      String->TokenText, String->OriginalColumn, Style.TabWidth, Encoding);
753
15
  String->NewlinesBefore = Macro->NewlinesBefore;
754
15
  String->HasUnescapedNewline = Macro->HasUnescapedNewline;
755
756
15
  Tokens.pop_back();
757
15
  Tokens.pop_back();
758
15
  Tokens.pop_back();
759
15
  Tokens.back() = String;
760
15
  return true;
761
326
}
762
763
664k
bool FormatTokenLexer::tryMergeConflictMarkers() {
764
664k
  if (Tokens.back()->NewlinesBefore == 0 && 
Tokens.back()->isNot(tok::eof)599k
)
765
564k
    return false;
766
767
  // Conflict lines look like:
768
  // <marker> <text from the vcs>
769
  // For example:
770
  // >>>>>>> /file/in/file/system at revision 1234
771
  //
772
  // We merge all tokens in a line that starts with a conflict marker
773
  // into a single token with a special token type that the unwrapped line
774
  // parser will use to correctly rebuild the underlying code.
775
776
100k
  FileID ID;
777
  // Get the position of the first token in the line.
778
100k
  unsigned FirstInLineOffset;
779
100k
  std::tie(ID, FirstInLineOffset) = SourceMgr.getDecomposedLoc(
780
100k
      Tokens[FirstInLineIndex]->getStartOfNonWhitespace());
781
100k
  StringRef Buffer = SourceMgr.getBufferOrFake(ID).getBuffer();
782
  // Calculate the offset of the start of the current line.
783
100k
  auto LineOffset = Buffer.rfind('\n', FirstInLineOffset);
784
100k
  if (LineOffset == StringRef::npos) {
785
37.5k
    LineOffset = 0;
786
62.5k
  } else {
787
62.5k
    ++LineOffset;
788
62.5k
  }
789
790
100k
  auto FirstSpace = Buffer.find_first_of(" \n", LineOffset);
791
100k
  StringRef LineStart;
792
100k
  if (FirstSpace == StringRef::npos) {
793
5.80k
    LineStart = Buffer.substr(LineOffset);
794
94.2k
  } else {
795
94.2k
    LineStart = Buffer.substr(LineOffset, FirstSpace - LineOffset);
796
94.2k
  }
797
798
100k
  TokenType Type = TT_Unknown;
799
100k
  if (LineStart == "<<<<<<<" || 
LineStart == ">>>>"100k
) {
800
9
    Type = TT_ConflictStart;
801
100k
  } else if (LineStart == "|||||||" || 
LineStart == "======="100k
||
802
100k
             
LineStart == "===="100k
) {
803
18
    Type = TT_ConflictAlternative;
804
100k
  } else if (LineStart == ">>>>>>>" || 
LineStart == "<<<<"100k
) {
805
9
    Type = TT_ConflictEnd;
806
9
  }
807
808
100k
  if (Type != TT_Unknown) {
809
36
    FormatToken *Next = Tokens.back();
810
811
36
    Tokens.resize(FirstInLineIndex + 1);
812
    // We do not need to build a complete token here, as we will skip it
813
    // during parsing anyway (as we must not touch whitespace around conflict
814
    // markers).
815
36
    Tokens.back()->setType(Type);
816
36
    Tokens.back()->Tok.setKind(tok::kw___unknown_anytype);
817
818
36
    Tokens.push_back(Next);
819
36
    return true;
820
36
  }
821
822
100k
  return false;
823
100k
}
824
825
1.54k
// Builds the synthesized second '>' or '<' token that follows a token which
// getNextToken() cut down to its first character. The new token becomes
// FormatTok and is returned.
FormatToken *FormatTokenLexer::getStashedToken() {
  const FormatToken *FirstHalf = FormatTok;

  FormatToken *SecondHalf = new (Allocator.Allocate()) FormatToken;
  SecondHalf->Tok = FirstHalf->Tok;

  // The second token sits on the last character of the original lexer token
  // and has no whitespace in front of it.
  const SourceLocation SecondLoc =
      FirstHalf->Tok.getLocation().getLocWithOffset(
          FirstHalf->Tok.getLength() - 1);
  SecondHalf->Tok.setLocation(SecondLoc);
  SecondHalf->WhitespaceRange = SourceRange(SecondLoc, SecondLoc);

  SecondHalf->TokenText = FirstHalf->TokenText;
  SecondHalf->ColumnWidth = 1;
  SecondHalf->OriginalColumn = FirstHalf->OriginalColumn + 1;

  FormatTok = SecondHalf;
  return FormatTok;
}
843
844
664k
// Produces the next FormatToken. This is either the stashed second half of a
// split '>>' / '<<' token, or a freshly lexed token whose preceding
// whitespace has been folded into its WhitespaceRange, NewlinesBefore,
// LastNewlineOffset and OriginalColumn.
FormatToken *FormatTokenLexer::getNextToken() {
  if (StateStack.top() == LexerState::TOKEN_STASHED) {
    StateStack.pop();
    return getStashedToken();
  }

  FormatTok = new (Allocator.Allocate()) FormatToken;
  readRawToken(*FormatTok);
  // Whitespace that was trimmed from the end of a comment token (see below)
  // is counted as leading whitespace of this token.
  SourceLocation WhitespaceStart =
      FormatTok->Tok.getLocation().getLocWithOffset(-TrailingWhitespace);
  FormatTok->IsFirst = IsFirstToken;
  IsFirstToken = false;

  // Consume and record whitespace until we find a significant token.
  unsigned WhitespaceLength = TrailingWhitespace;
  while (FormatTok->Tok.is(tok::unknown)) {
    StringRef Blank = FormatTok->TokenText;

    // Reports whether the newline that follows position Index is escaped.
    auto IsEscapedAt = [&Blank](int Index) {
      // A '\r' here is just part of '\r\n'. Skip it.
      if (Index >= 0 && Blank[Index] == '\r')
        --Index;
      // See whether there is an odd number of '\' before this.
      // FIXME: This is wrong. A '\' followed by a newline is always removed,
      // regardless of whether there is another '\' before it.
      // FIXME: Newlines can also be escaped by a '?' '?' '/' trigraph.
      unsigned Backslashes = 0;
      while (Index >= 0 && Blank[Index] == '\\') {
        --Index;
        ++Backslashes;
      }
      return (Backslashes % 2) != 0;
    };

    // FIXME: This miscounts tok::unknown tokens that are not just
    // whitespace, e.g. a '`' character.
    const int BlankSize = Blank.size();
    for (int Pos = 0; Pos != BlankSize; ++Pos) {
      switch (Blank[Pos]) {
      case '\n':
        ++FormatTok->NewlinesBefore;
        FormatTok->HasUnescapedNewline = !IsEscapedAt(Pos - 1);
        FormatTok->LastNewlineOffset = WhitespaceLength + Pos + 1;
        Column = 0;
        break;
      case '\r':
        FormatTok->LastNewlineOffset = WhitespaceLength + Pos + 1;
        Column = 0;
        break;
      case '\f':
      case '\v':
        Column = 0;
        break;
      case ' ':
        ++Column;
        break;
      case '\t': {
        // Advance to the next tab stop; a tab width of zero adds nothing.
        const unsigned PastTabStop =
            Style.TabWidth ? Column % Style.TabWidth : 0;
        Column += Style.TabWidth - PastTabStop;
        break;
      }
      case '\\': {
        // A backslash is only whitespace when it is directly followed by a
        // line break.
        const bool FollowedByLineBreak =
            Pos + 1 != BlankSize &&
            (Blank[Pos + 1] == '\r' || Blank[Pos + 1] == '\n');
        if (!FollowedByLineBreak)
          FormatTok->setType(TT_ImplicitStringLiteral);
        break;
      }
      default:
        FormatTok->setType(TT_ImplicitStringLiteral);
        break;
      }
      if (FormatTok->is(TT_ImplicitStringLiteral))
        break;
    }

    if (FormatTok->is(TT_ImplicitStringLiteral))
      break;
    WhitespaceLength += FormatTok->Tok.getLength();

    readRawToken(*FormatTok);
  }

  // JavaScript and Java do not allow escaping the end of the line with a
  // backslash. Backslashes are syntax errors in plain source, but can occur in
  // comments. When a single line comment ends with a \, it'll cause the next
  // line of code to be lexed as a comment, breaking formatting. The code below
  // finds comments that contain a backslash followed by a line break, truncates
  // the comment token at the backslash, and resets the lexer to restart behind
  // the backslash.
  const bool IsJavaOrJavaScript =
      Style.Language == FormatStyle::LK_JavaScript ||
      Style.Language == FormatStyle::LK_Java;
  if (IsJavaOrJavaScript && FormatTok->is(tok::comment) &&
      FormatTok->TokenText.startswith("//")) {
    for (size_t Slash = FormatTok->TokenText.find('\\');
         Slash != StringRef::npos;
         Slash = FormatTok->TokenText.find('\\', Slash + 1)) {
      const bool EndsLine = Slash + 1 < FormatTok->TokenText.size() &&
                            FormatTok->TokenText[Slash + 1] == '\n';
      if (!EndsLine)
        continue;

      // Restart lexing at the character right after the backslash.
      const char *Restart = Lex->getBufferLocation() -
                            FormatTok->TokenText.size() + (Slash + 1);
      resetLexer(SourceMgr.getFileOffset(Lex->getSourceLocation(Restart)));
      FormatTok->TokenText = FormatTok->TokenText.substr(0, Slash + 1);
      FormatTok->ColumnWidth = encoding::columnWidthWithTabs(
          FormatTok->TokenText, FormatTok->OriginalColumn, Style.TabWidth,
          Encoding);
      break;
    }
  }

  // In case the token starts with escaped newlines, we want to
  // take them into account as whitespace - this pattern is quite frequent
  // in macro definitions.
  // FIXME: Add a more explicit test.
  while (FormatTok->TokenText.size() > 1 && FormatTok->TokenText[0] == '\\') {
    const StringRef Current = FormatTok->TokenText;
    unsigned Skipped = 0;
    if (Current.size() > 2 && Current[1] == '\r' && Current[2] == '\n')
      Skipped = 3; // "\\\r\n"
    else if (Current[1] == '\n')
      Skipped = 2; // "\\\n"
    else
      break;

    ++FormatTok->NewlinesBefore;
    WhitespaceLength += Skipped;
    FormatTok->LastNewlineOffset = Skipped;
    Column = 0;
    FormatTok->TokenText = Current.substr(Skipped);
  }

  FormatTok->WhitespaceRange = SourceRange(
      WhitespaceStart, WhitespaceStart.getLocWithOffset(WhitespaceLength));

  FormatTok->OriginalColumn = Column;

  TrailingWhitespace = 0;
  if (FormatTok->Tok.is(tok::comment)) {
    // Strip trailing blanks from the comment and remember how many were
    // removed so that the next token can claim them as leading whitespace.
    // FIXME: Add the trimmed whitespace to Column.
    const size_t UntrimmedSize = FormatTok->TokenText.size();
    FormatTok->TokenText = FormatTok->TokenText.rtrim(" \t\v\f");
    TrailingWhitespace = UntrimmedSize - FormatTok->TokenText.size();
  } else if (FormatTok->Tok.is(tok::raw_identifier)) {
    // Resolve the raw identifier through the identifier table so that
    // keywords get their keyword token kind.
    IdentifierInfo &Info = IdentTable.get(FormatTok->TokenText);
    FormatTok->Tok.setIdentifierInfo(&Info);
    FormatTok->Tok.setKind(Info.getTokenID());

    // Some C++ keywords are demoted to plain identifiers for Java and
    // JavaScript input.
    const bool DemoteForJava =
        Style.Language == FormatStyle::LK_Java &&
        FormatTok->isOneOf(tok::kw_struct, tok::kw_union, tok::kw_delete,
                           tok::kw_operator);
    const bool DemoteForJavaScript =
        Style.Language == FormatStyle::LK_JavaScript &&
        FormatTok->isOneOf(tok::kw_struct, tok::kw_union, tok::kw_operator);
    if (DemoteForJava || DemoteForJavaScript) {
      FormatTok->Tok.setKind(tok::identifier);
      FormatTok->Tok.setIdentifierInfo(nullptr);
    }
  } else if (FormatTok->Tok.isOneOf(tok::greatergreater, tok::lessless)) {
    // Split '>>' / '<<' into two single-character tokens: keep the first
    // character here and stash the second for the next call.
    FormatTok->Tok.setKind(FormatTok->Tok.is(tok::greatergreater)
                               ? tok::greater
                               : tok::less);
    FormatTok->TokenText = FormatTok->TokenText.substr(0, 1);
    ++Column;
    StateStack.push(LexerState::TOKEN_STASHED);
  }

  // Now FormatTok is the next non-whitespace token.

  StringRef TokText = FormatTok->TokenText;
  const size_t FirstBreak = TokText.find('\n');
  if (FirstBreak != StringRef::npos) {
    FormatTok->IsMultiline = true;
    // FIXME: ColumnWidth actually depends on the start column, we need to
    // take this into account when the token is moved.
    FormatTok->ColumnWidth = encoding::columnWidthWithTabs(
        TokText.substr(0, FirstBreak), Column, Style.TabWidth, Encoding);

    // The last line of the token always starts in column 0.
    // Thus, the length can be precomputed even in the presence of tabs.
    FormatTok->LastLineColumnWidth = encoding::columnWidthWithTabs(
        TokText.substr(TokText.find_last_of('\n') + 1), 0, Style.TabWidth,
        Encoding);
    Column = FormatTok->LastLineColumnWidth;
  } else {
    // FIXME: ColumnWidth actually depends on the start column, we need to
    // take this into account when the token is moved.
    FormatTok->ColumnWidth = encoding::columnWidthWithTabs(
        TokText, Column, Style.TabWidth, Encoding);
    Column += FormatTok->ColumnWidth;
  }

  if (Style.isCpp()) {
    auto MacroIt = Macros.find(FormatTok->Tok.getIdentifierInfo());

    // A configured macro name directly after a token whose preprocessor
    // keyword is 'define' does not get the macro's token type.
    const IdentifierInfo *PrevInfo =
        Tokens.size() > 0 ? Tokens.back()->Tok.getIdentifierInfo() : nullptr;
    const bool FollowsDefine =
        PrevInfo && PrevInfo->getPPKeywordID() == tok::pp_define;

    if (!FollowsDefine && MacroIt != Macros.end()) {
      FormatTok->setType(MacroIt->second);
    } else if (FormatTok->is(tok::identifier)) {
      if (MacroBlockBeginRegex.match(TokText))
        FormatTok->setType(TT_MacroBlockBegin);
      else if (MacroBlockEndRegex.match(TokText))
        FormatTok->setType(TT_MacroBlockEnd);
    }
  }

  return FormatTok;
}
1048
1049
965k
// Lexes one raw token into Tok, records its source text, adjusts the token
// kind for a few string-like cases, and applies the
// "clang-format on" / "clang-format off" comment switches to Tok.Finalized.
void FormatTokenLexer::readRawToken(FormatToken &Tok) {
  Lex->LexFromRawLexer(Tok.Tok);
  const char *TextStart = SourceMgr.getCharacterData(Tok.Tok.getLocation());
  Tok.TokenText = StringRef(TextStart, Tok.Tok.getLength());

  const bool IsJavaScript = Style.Language == FormatStyle::LK_JavaScript;

  // For formatting, treat unterminated string literals like normal string
  // literals.
  if (Tok.is(tok::unknown)) {
    const bool StartsWithQuote =
        !Tok.TokenText.empty() && Tok.TokenText[0] == '"';
    if (StartsWithQuote) {
      Tok.Tok.setKind(tok::string_literal);
      Tok.IsUnterminatedLiteral = true;
    } else if (IsJavaScript && Tok.TokenText == "''") {
      Tok.Tok.setKind(tok::string_literal);
    }
  }

  // In JavaScript, Proto and TextProto a character constant is handled as a
  // string literal.
  const bool CharConstantIsString =
      IsJavaScript || Style.Language == FormatStyle::LK_Proto ||
      Style.Language == FormatStyle::LK_TextProto;
  if (CharConstantIsString && Tok.is(tok::char_constant))
    Tok.Tok.setKind(tok::string_literal);

  const bool IsComment = Tok.is(tok::comment);

  // Re-enable formatting before finalizing, so the "on" comment itself is
  // not finalized.
  if (IsComment && (Tok.TokenText == "// clang-format on" ||
                    Tok.TokenText == "/* clang-format on */"))
    FormattingDisabled = false;

  Tok.Finalized = FormattingDisabled;

  // Disable formatting after finalizing, so the "off" comment itself is not
  // finalized by its own switch.
  if (IsComment && (Tok.TokenText == "// clang-format off" ||
                    Tok.TokenText == "/* clang-format off */"))
    FormattingDisabled = true;
}
1084
1085
641
void FormatTokenLexer::resetLexer(unsigned Offset) {
1086
641
  StringRef Buffer = SourceMgr.getBufferData(ID);
1087
641
  Lex.reset(new Lexer(SourceMgr.getLocForStartOfFile(ID),
1088
641
                      getFormattingLangOpts(Style), Buffer.begin(),
1089
641
                      Buffer.begin() + Offset, Buffer.end()));
1090
641
  Lex->SetKeepWhitespaceMode(true);
1091
641
  TrailingWhitespace = 0;
1092
641
}
1093
1094
} // namespace format
1095
} // namespace clang