Coverage Report

Created: 2020-09-15 12:33

/Users/buildslave/jenkins/workspace/coverage/llvm-project/clang/lib/Format/FormatTokenLexer.cpp
Line
Count
Source (jump to first uncovered line)
1
//===--- FormatTokenLexer.cpp - Lex FormatTokens -------------*- C++ ----*-===//
2
//
3
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4
// See https://llvm.org/LICENSE.txt for license information.
5
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6
//
7
//===----------------------------------------------------------------------===//
8
///
9
/// \file
10
/// This file implements FormatTokenLexer, which tokenizes a source file
11
/// into a FormatToken stream suitable for ClangFormat.
12
///
13
//===----------------------------------------------------------------------===//
14
15
#include "FormatTokenLexer.h"
16
#include "FormatToken.h"
17
#include "clang/Basic/SourceLocation.h"
18
#include "clang/Basic/SourceManager.h"
19
#include "clang/Format/Format.h"
20
#include "llvm/Support/Regex.h"
21
22
namespace clang {
23
namespace format {
24
25
// Creates the underlying clang Lexer over the buffer of ID (keeping
// whitespace tokens) and registers every macro name configured in Style
// together with the token type it is to be tagged with.
FormatTokenLexer::FormatTokenLexer(
    const SourceManager &SourceMgr, FileID ID, unsigned Column,
    const FormatStyle &Style, encoding::Encoding Encoding,
    llvm::SpecificBumpPtrAllocator<FormatToken> &Allocator,
    IdentifierTable &IdentTable)
    : FormatTok(nullptr), IsFirstToken(true), StateStack({LexerState::NORMAL}),
      Column(Column), TrailingWhitespace(0), SourceMgr(SourceMgr), ID(ID),
      Style(Style), IdentTable(IdentTable), Keywords(IdentTable),
      Encoding(Encoding), Allocator(Allocator), FirstInLineIndex(0),
      FormattingDisabled(false), MacroBlockBeginRegex(Style.MacroBlockBegin),
      MacroBlockEndRegex(Style.MacroBlockEnd) {
  Lex.reset(new Lexer(ID, SourceMgr.getBuffer(ID), SourceMgr,
                      getFormattingLangOpts(Style)));
  Lex->SetKeepWhitespaceMode(true);

  // Associates each identifier in Names with Type. The categories are
  // registered in the same order as before, one category at a time.
  const auto RegisterMacros = [&](const auto &Names, TokenType Type) {
    for (const std::string &Name : Names)
      Macros.insert({&IdentTable.get(Name), Type});
  };

  RegisterMacros(Style.ForEachMacros, TT_ForEachMacro);
  RegisterMacros(Style.AttributeMacros, TT_AttributeMacro);
  RegisterMacros(Style.StatementMacros, TT_StatementMacro);
  RegisterMacros(Style.TypenameMacros, TT_TypenameMacro);
  RegisterMacros(Style.NamespaceMacros, TT_NamespaceMacro);
  RegisterMacros(Style.WhitespaceSensitiveMacros, TT_UntouchableMacroFunc);
}
56
57
36.3k
// Lexes the complete buffer into Tokens and returns them. After each raw
// token is appended, the language-specific fix-ups and the generic token
// merges run; the loop ends once the eof token has been appended.
// Must only be called on a lexer that has not produced tokens yet.
ArrayRef<FormatToken *> FormatTokenLexer::lex() {
  assert(Tokens.empty());
  assert(FirstInLineIndex == 0);

  for (;;) {
    Tokens.push_back(getNextToken());

    if (Style.Language == FormatStyle::LK_JavaScript) {
      tryParseJSRegexLiteral();
      handleTemplateStrings();
    }
    if (Style.Language == FormatStyle::LK_TextProto)
      tryParsePythonComment();

    tryMergePreviousTokens();

    // This needs to come after tokens have been merged so that C#
    // string literals are correctly identified.
    if (Style.isCSharp())
      handleCSharpVerbatimAndInterpolatedStrings();

    // Look at the last token only now: the steps above may have replaced or
    // merged it.
    FormatToken *Newest = Tokens.back();
    if (Newest->NewlinesBefore > 0 || Newest->IsMultiline)
      FirstInLineIndex = Tokens.size() - 1;

    if (!Newest->Tok.isNot(tok::eof))
      break;
  }
  return Tokens;
}
78
79
613k
void FormatTokenLexer::tryMergePreviousTokens() {
80
613k
  if (tryMerge_TMacro())
81
15
    return;
82
613k
  if (tryMergeConflictMarkers())
83
36
    return;
84
613k
  if (tryMergeLessLess())
85
982
    return;
86
612k
  if (tryMergeForEach())
87
7
    return;
88
612k
  if (Style.isCpp() && 
tryTransformTryUsageForC()558k
)
89
28
    return;
90
612k
91
612k
  if (Style.isCSharp()) {
92
4.87k
    if (tryMergeCSharpKeywordVariables())
93
2
      return;
94
4.86k
    if (tryMergeCSharpStringLiteral())
95
43
      return;
96
4.82k
    if (tryMergeCSharpDoubleQuestion())
97
6
      return;
98
4.81k
    if (tryMergeCSharpNullConditional())
99
14
      return;
100
4.80k
    if (tryTransformCSharpForEach())
101
8
      return;
102
4.79k
    static const tok::TokenKind JSRightArrow[] = {tok::equal, tok::greater};
103
4.79k
    if (tryMergeTokens(JSRightArrow, TT_JsFatArrow))
104
34
      return;
105
612k
  }
106
612k
107
612k
  if (tryMergeNSStringLiteral())
108
222
    return;
109
612k
110
612k
  if (Style.Language == FormatStyle::LK_JavaScript) {
111
34.3k
    static const tok::TokenKind JSIdentity[] = {tok::equalequal, tok::equal};
112
34.3k
    static const tok::TokenKind JSNotIdentity[] = {tok::exclaimequal,
113
34.3k
                                                   tok::equal};
114
34.3k
    static const tok::TokenKind JSShiftEqual[] = {tok::greater, tok::greater,
115
34.3k
                                                  tok::greaterequal};
116
34.3k
    static const tok::TokenKind JSRightArrow[] = {tok::equal, tok::greater};
117
34.3k
    static const tok::TokenKind JSExponentiation[] = {tok::star, tok::star};
118
34.3k
    static const tok::TokenKind JSExponentiationEqual[] = {tok::star,
119
34.3k
                                                           tok::starequal};
120
34.3k
    static const tok::TokenKind JSNullPropagatingOperator[] = {tok::question,
121
34.3k
                                                               tok::period};
122
34.3k
    static const tok::TokenKind JSNullishOperator[] = {tok::question,
123
34.3k
                                                       tok::question};
124
34.3k
125
    // FIXME: Investigate what token type gives the correct operator priority.
126
34.3k
    if (tryMergeTokens(JSIdentity, TT_BinaryOperator))
127
12
      return;
128
34.3k
    if (tryMergeTokens(JSNotIdentity, TT_BinaryOperator))
129
12
      return;
130
34.3k
    if (tryMergeTokens(JSShiftEqual, TT_BinaryOperator))
131
10
      return;
132
34.3k
    if (tryMergeTokens(JSRightArrow, TT_JsFatArrow))
133
148
      return;
134
34.1k
    if (tryMergeTokens(JSExponentiation, TT_JsExponentiation))
135
4
      return;
136
34.1k
    if (tryMergeTokens(JSExponentiationEqual, TT_JsExponentiationEqual)) {
137
4
      Tokens.back()->Tok.setKind(tok::starequal);
138
4
      return;
139
4
    }
140
34.1k
    if (tryMergeTokens(JSNullishOperator, TT_JsNullishCoalescingOperator)) {
141
      // Treat like the "||" operator (as opposed to the ternary ?).
142
16
      Tokens.back()->Tok.setKind(tok::pipepipe);
143
16
      return;
144
16
    }
145
34.1k
    if (tryMergeTokens(JSNullPropagatingOperator,
146
16
                       TT_JsNullPropagatingOperator)) {
147
      // Treat like a regular "." access.
148
16
      Tokens.back()->Tok.setKind(tok::period);
149
16
      return;
150
16
    }
151
34.1k
    if (tryMergeJSPrivateIdentifier())
152
32
      return;
153
612k
  }
154
612k
155
612k
  if (Style.Language == FormatStyle::LK_Java) {
156
4.35k
    static const tok::TokenKind JavaRightLogicalShiftAssign[] = {
157
4.35k
        tok::greater, tok::greater, tok::greaterequal};
158
4.35k
    if (tryMergeTokens(JavaRightLogicalShiftAssign, TT_BinaryOperator))
159
2
      return;
160
4.35k
  }
161
612k
}
162
163
612k
bool FormatTokenLexer::tryMergeNSStringLiteral() {
164
612k
  if (Tokens.size() < 2)
165
36.3k
    return false;
166
576k
  auto &At = *(Tokens.end() - 2);
167
576k
  auto &String = *(Tokens.end() - 1);
168
576k
  if (!At->is(tok::at) || 
!String->is(tok::string_literal)1.26k
)
169
576k
    return false;
170
222
  At->Tok.setKind(tok::string_literal);
171
222
  At->TokenText = StringRef(At->TokenText.begin(),
172
222
                            String->TokenText.end() - At->TokenText.begin());
173
222
  At->ColumnWidth += String->ColumnWidth;
174
222
  At->setType(TT_ObjCStringLiteral);
175
222
  Tokens.erase(Tokens.end() - 1);
176
222
  return true;
177
222
}
178
179
34.1k
bool FormatTokenLexer::tryMergeJSPrivateIdentifier() {
180
  // Merges #idenfier into a single identifier with the text #identifier
181
  // but the token tok::identifier.
182
34.1k
  if (Tokens.size() < 2)
183
2.77k
    return false;
184
31.3k
  auto &Hash = *(Tokens.end() - 2);
185
31.3k
  auto &Identifier = *(Tokens.end() - 1);
186
31.3k
  if (!Hash->is(tok::hash) || 
!Identifier->is(tok::identifier)36
)
187
31.3k
    return false;
188
32
  Hash->Tok.setKind(tok::identifier);
189
32
  Hash->TokenText =
190
32
      StringRef(Hash->TokenText.begin(),
191
32
                Identifier->TokenText.end() - Hash->TokenText.begin());
192
32
  Hash->ColumnWidth += Identifier->ColumnWidth;
193
32
  Hash->setType(TT_JsPrivateIdentifier);
194
32
  Tokens.erase(Tokens.end() - 1);
195
32
  return true;
196
32
}
197
198
// Search for verbatim or interpolated string literals @"ABC" or
199
// $"aaaaa{abc}aaaaa" i and mark the token as TT_CSharpStringLiteral, and to
200
// prevent splitting of @, $ and ".
201
// Merging of multiline verbatim strings with embedded '"' is handled in
202
// handleCSharpVerbatimAndInterpolatedStrings with lower-level lexing.
203
4.86k
bool FormatTokenLexer::tryMergeCSharpStringLiteral() {
204
4.86k
  if (Tokens.size() < 2)
205
335
    return false;
206
4.53k
207
  // Interpolated strings could contain { } with " characters inside.
208
  // $"{x ?? "null"}"
209
  // should not be split into $"{x ?? ", null, "}" but should treated as a
210
  // single string-literal.
211
  //
212
  // We opt not to try and format expressions inside {} within a C#
213
  // interpolated string. Formatting expressions within an interpolated string
214
  // would require similar work as that done for JavaScript template strings
215
  // in `handleTemplateStrings()`.
216
4.53k
  auto &CSharpInterpolatedString = *(Tokens.end() - 2);
217
4.53k
  if (CSharpInterpolatedString->getType() == TT_CSharpStringLiteral &&
218
43
      (CSharpInterpolatedString->TokenText.startswith(R"($")") ||
219
37
       
CSharpInterpolatedString->TokenText.startswith(R"($@")")12
)) {
220
37
    int UnmatchedOpeningBraceCount = 0;
221
37
222
37
    auto TokenTextSize = CSharpInterpolatedString->TokenText.size();
223
989
    for (size_t Index = 0; Index < TokenTextSize; 
++Index952
) {
224
952
      char C = CSharpInterpolatedString->TokenText[Index];
225
952
      if (C == '{') {
226
        // "{{"  inside an interpolated string is an escaped '{' so skip it.
227
49
        if (Index + 1 < TokenTextSize &&
228
49
            CSharpInterpolatedString->TokenText[Index + 1] == '{') {
229
6
          ++Index;
230
6
          continue;
231
6
        }
232
43
        ++UnmatchedOpeningBraceCount;
233
903
      } else if (C == '}') {
234
        // "}}"  inside an interpolated string is an escaped '}' so skip it.
235
43
        if (Index + 1 < TokenTextSize &&
236
43
            CSharpInterpolatedString->TokenText[Index + 1] == '}') {
237
6
          ++Index;
238
6
          continue;
239
6
        }
240
37
        --UnmatchedOpeningBraceCount;
241
37
      }
242
952
    }
243
37
244
37
    if (UnmatchedOpeningBraceCount > 0) {
245
6
      auto &NextToken = *(Tokens.end() - 1);
246
6
      CSharpInterpolatedString->TokenText =
247
6
          StringRef(CSharpInterpolatedString->TokenText.begin(),
248
6
                    NextToken->TokenText.end() -
249
6
                        CSharpInterpolatedString->TokenText.begin());
250
6
      CSharpInterpolatedString->ColumnWidth += NextToken->ColumnWidth;
251
6
      Tokens.erase(Tokens.end() - 1);
252
6
      return true;
253
6
    }
254
4.52k
  }
255
4.52k
256
  // Look for @"aaaaaa" or $"aaaaaa".
257
4.52k
  auto &String = *(Tokens.end() - 1);
258
4.52k
  if (!String->is(tok::string_literal))
259
4.43k
    return false;
260
93
261
93
  auto &At = *(Tokens.end() - 2);
262
93
  if (!(At->is(tok::at) || 
At->TokenText == "$"81
))
263
56
    return false;
264
37
265
37
  if (Tokens.size() > 2 && 
At->is(tok::at)35
) {
266
12
    auto &Dollar = *(Tokens.end() - 3);
267
12
    if (Dollar->TokenText == "$") {
268
      // This looks like $@"aaaaa" so we need to combine all 3 tokens.
269
6
      Dollar->Tok.setKind(tok::string_literal);
270
6
      Dollar->TokenText =
271
6
          StringRef(Dollar->TokenText.begin(),
272
6
                    String->TokenText.end() - Dollar->TokenText.begin());
273
6
      Dollar->ColumnWidth += (At->ColumnWidth + String->ColumnWidth);
274
6
      Dollar->setType(TT_CSharpStringLiteral);
275
6
      Tokens.erase(Tokens.end() - 2);
276
6
      Tokens.erase(Tokens.end() - 1);
277
6
      return true;
278
6
    }
279
31
  }
280
31
281
  // Convert back into just a string_literal.
282
31
  At->Tok.setKind(tok::string_literal);
283
31
  At->TokenText = StringRef(At->TokenText.begin(),
284
31
                            String->TokenText.end() - At->TokenText.begin());
285
31
  At->ColumnWidth += String->ColumnWidth;
286
31
  At->setType(TT_CSharpStringLiteral);
287
31
  Tokens.erase(Tokens.end() - 1);
288
31
  return true;
289
31
}
290
291
// Valid C# attribute targets:
292
// https://docs.microsoft.com/en-us/dotnet/csharp/programming-guide/concepts/attributes/#attribute-targets
293
const llvm::StringSet<> FormatTokenLexer::CSharpAttributeTargets = {
294
    "assembly", "module",   "field",  "event", "method",
295
    "param",    "property", "return", "type",
296
};
297
298
4.82k
bool FormatTokenLexer::tryMergeCSharpDoubleQuestion() {
299
4.82k
  if (Tokens.size() < 2)
300
335
    return false;
301
4.49k
  auto &FirstQuestion = *(Tokens.end() - 2);
302
4.49k
  auto &SecondQuestion = *(Tokens.end() - 1);
303
4.49k
  if (!FirstQuestion->is(tok::question) || 
!SecondQuestion->is(tok::question)58
)
304
4.48k
    return false;
305
6
  FirstQuestion->Tok.setKind(tok::question); // no '??' in clang tokens.
306
6
  FirstQuestion->TokenText = StringRef(FirstQuestion->TokenText.begin(),
307
6
                                       SecondQuestion->TokenText.end() -
308
6
                                           FirstQuestion->TokenText.begin());
309
6
  FirstQuestion->ColumnWidth += SecondQuestion->ColumnWidth;
310
6
  FirstQuestion->setType(TT_CSharpNullCoalescing);
311
6
  Tokens.erase(Tokens.end() - 1);
312
6
  return true;
313
6
}
314
315
// Merge '?[' and '?.' pairs into single tokens.
316
4.81k
bool FormatTokenLexer::tryMergeCSharpNullConditional() {
317
4.81k
  if (Tokens.size() < 2)
318
335
    return false;
319
4.48k
  auto &Question = *(Tokens.end() - 2);
320
4.48k
  auto &PeriodOrLSquare = *(Tokens.end() - 1);
321
4.48k
  if (!Question->is(tok::question) ||
322
52
      !PeriodOrLSquare->isOneOf(tok::l_square, tok::period))
323
4.47k
    return false;
324
14
  Question->TokenText =
325
14
      StringRef(Question->TokenText.begin(),
326
14
                PeriodOrLSquare->TokenText.end() - Question->TokenText.begin());
327
14
  Question->ColumnWidth += PeriodOrLSquare->ColumnWidth;
328
14
329
14
  if (PeriodOrLSquare->is(tok::l_square)) {
330
8
    Question->Tok.setKind(tok::question); // no '?[' in clang tokens.
331
8
    Question->setType(TT_CSharpNullConditionalLSquare);
332
6
  } else {
333
6
    Question->Tok.setKind(tok::question); // no '?.' in clang tokens.
334
6
    Question->setType(TT_CSharpNullConditional);
335
6
  }
336
14
337
14
  Tokens.erase(Tokens.end() - 1);
338
14
  return true;
339
14
}
340
341
4.87k
bool FormatTokenLexer::tryMergeCSharpKeywordVariables() {
342
4.87k
  if (Tokens.size() < 2)
343
335
    return false;
344
4.53k
  auto &At = *(Tokens.end() - 2);
345
4.53k
  auto &Keyword = *(Tokens.end() - 1);
346
4.53k
  if (!At->is(tok::at))
347
4.51k
    return false;
348
16
  if (!Keywords.isCSharpKeyword(*Keyword))
349
14
    return false;
350
2
351
2
  At->Tok.setKind(tok::identifier);
352
2
  At->TokenText = StringRef(At->TokenText.begin(),
353
2
                            Keyword->TokenText.end() - At->TokenText.begin());
354
2
  At->ColumnWidth += Keyword->ColumnWidth;
355
2
  At->setType(Keyword->getType());
356
2
  Tokens.erase(Tokens.end() - 1);
357
2
  return true;
358
2
}
359
360
// In C# transform identifier foreach into kw_foreach
361
4.80k
bool FormatTokenLexer::tryTransformCSharpForEach() {
362
4.80k
  if (Tokens.size() < 1)
363
0
    return false;
364
4.80k
  auto &Identifier = *(Tokens.end() - 1);
365
4.80k
  if (!Identifier->is(tok::identifier))
366
3.40k
    return false;
367
1.40k
  if (Identifier->TokenText != "foreach")
368
1.39k
    return false;
369
8
370
8
  Identifier->setType(TT_ForEachMacro);
371
8
  Identifier->Tok.setKind(tok::kw_for);
372
8
  return true;
373
8
}
374
375
612k
bool FormatTokenLexer::tryMergeForEach() {
376
612k
  if (Tokens.size() < 2)
377
36.3k
    return false;
378
576k
  auto &For = *(Tokens.end() - 2);
379
576k
  auto &Each = *(Tokens.end() - 1);
380
576k
  if (!For->is(tok::kw_for))
381
575k
    return false;
382
674
  if (!Each->is(tok::identifier))
383
659
    return false;
384
15
  if (Each->TokenText != "each")
385
8
    return false;
386
7
387
7
  For->setType(TT_ForEachMacro);
388
7
  For->Tok.setKind(tok::kw_for);
389
7
390
7
  For->TokenText = StringRef(For->TokenText.begin(),
391
7
                             Each->TokenText.end() - For->TokenText.begin());
392
7
  For->ColumnWidth += Each->ColumnWidth;
393
7
  Tokens.erase(Tokens.end() - 1);
394
7
  return true;
395
7
}
396
397
558k
bool FormatTokenLexer::tryTransformTryUsageForC() {
398
558k
  if (Tokens.size() < 2)
399
32.2k
    return false;
400
525k
  auto &Try = *(Tokens.end() - 2);
401
525k
  if (!Try->is(tok::kw_try))
402
525k
    return false;
403
176
  auto &Next = *(Tokens.end() - 1);
404
176
  if (Next->isOneOf(tok::l_brace, tok::colon))
405
146
    return false;
406
30
407
30
  if (Tokens.size() > 2) {
408
23
    auto &At = *(Tokens.end() - 3);
409
23
    if (At->is(tok::at))
410
2
      return false;
411
28
  }
412
28
413
28
  Try->Tok.setKind(tok::identifier);
414
28
  return true;
415
28
}
416
417
613k
bool FormatTokenLexer::tryMergeLessLess() {
418
  // Merge X,less,less,Y into X,lessless,Y unless X or Y is less.
419
613k
  if (Tokens.size() < 3)
420
72.4k
    return false;
421
541k
422
541k
  bool FourthTokenIsLess = false;
423
541k
  if (Tokens.size() > 3)
424
505k
    FourthTokenIsLess = (Tokens.end() - 4)[0]->is(tok::less);
425
541k
426
541k
  auto First = Tokens.end() - 3;
427
541k
  if (First[2]->is(tok::less) || 
First[1]->isNot(tok::less)532k
||
428
9.88k
      First[0]->isNot(tok::less) || 
FourthTokenIsLess1.05k
)
429
540k
    return false;
430
982
431
  // Only merge if there currently is no whitespace between the two "<".
432
982
  if (First[1]->WhitespaceRange.getBegin() !=
433
982
      First[1]->WhitespaceRange.getEnd())
434
0
    return false;
435
982
436
982
  First[0]->Tok.setKind(tok::lessless);
437
982
  First[0]->TokenText = "<<";
438
982
  First[0]->ColumnWidth += 1;
439
982
  Tokens.erase(Tokens.end() - 2);
440
982
  return true;
441
982
}
442
443
bool FormatTokenLexer::tryMergeTokens(ArrayRef<tok::TokenKind> Kinds,
444
283k
                                      TokenType NewType) {
445
283k
  if (Tokens.size() < Kinds.size())
446
25.8k
    return false;
447
257k
448
257k
  SmallVectorImpl<FormatToken *>::const_iterator First =
449
257k
      Tokens.end() - Kinds.size();
450
257k
  if (!First[0]->is(Kinds[0]))
451
254k
    return false;
452
2.61k
  unsigned AddLength = 0;
453
2.93k
  for (unsigned i = 1; i < Kinds.size(); 
++i320
) {
454
2.67k
    if (!First[i]->is(Kinds[i]) || First[i]->WhitespaceRange.getBegin() !=
455
332
                                       First[i]->WhitespaceRange.getEnd())
456
2.35k
      return false;
457
320
    AddLength += First[i]->TokenText.size();
458
320
  }
459
258
  Tokens.resize(Tokens.size() - Kinds.size() + 1);
460
258
  First[0]->TokenText = StringRef(First[0]->TokenText.data(),
461
258
                                  First[0]->TokenText.size() + AddLength);
462
258
  First[0]->ColumnWidth += AddLength;
463
258
  First[0]->setType(NewType);
464
258
  return true;
465
2.61k
}
466
467
// Returns \c true if \p Tok can only be followed by an operand in JavaScript.
468
336
bool FormatTokenLexer::precedesOperand(FormatToken *Tok) {
469
  // NB: This is not entirely correct, as an r_paren can introduce an operand
470
  // location in e.g. `if (foo) /bar/.exec(...);`. That is a rare enough
471
  // corner case to not matter in practice, though.
472
336
  return Tok->isOneOf(tok::period, tok::l_paren, tok::comma, tok::l_brace,
473
336
                      tok::r_brace, tok::l_square, tok::semi, tok::exclaim,
474
336
                      tok::colon, tok::question, tok::tilde) ||
475
292
         Tok->isOneOf(tok::kw_return, tok::kw_do, tok::kw_case, tok::kw_throw,
476
292
                      tok::kw_else, tok::kw_new, tok::kw_delete, tok::kw_void,
477
292
                      tok::kw_typeof, Keywords.kw_instanceof, Keywords.kw_in) ||
478
288
         Tok->isBinaryOperator();
479
336
}
480
481
340
// Returns true if a JavaScript regex literal may start right after Prev.
// A null Prev (no preceding non-comment token) always allows a regex.
bool FormatTokenLexer::canPrecedeRegexLiteral(FormatToken *Prev) {
  if (!Prev)
    return true;

  // Regex literals can only follow after prefix unary operators, not after
  // postfix unary operators. If the '++' is followed by a non-operand
  // introducing token, the slash here is the operand and not the start of a
  // regex.
  // `!` is an unary prefix operator, but also a post-fix operator that casts
  // away nullability, so the same check applies.
  if (Prev->isOneOf(tok::plusplus, tok::minusminus, tok::exclaim)) {
    if (Tokens.size() < 3)
      return true;
    return precedesOperand(Tokens[Tokens.size() - 3]);
  }

  // The previous token must introduce an operand location where regex
  // literals can occur.
  return precedesOperand(Prev);
}
501
502
// Tries to parse a JavaScript Regex literal starting at the current token,
503
// if that begins with a slash and is in a location where JavaScript allows
504
// regex literals. Changes the current token to a regex literal and updates
505
// its text if successful.
506
34.3k
void FormatTokenLexer::tryParseJSRegexLiteral() {
507
34.3k
  FormatToken *RegexToken = Tokens.back();
508
34.3k
  if (!RegexToken->isOneOf(tok::slash, tok::slashequal))
509
34.0k
    return;
510
340
511
340
  FormatToken *Prev = nullptr;
512
348
  for (auto I = Tokens.rbegin() + 1, E = Tokens.rend(); I != E; 
++I8
) {
513
    // NB: Because previous pointers are not initialized yet, this cannot use
514
    // Token.getPreviousNonComment.
515
344
    if ((*I)->isNot(tok::comment)) {
516
336
      Prev = *I;
517
336
      break;
518
336
    }
519
344
  }
520
340
521
340
  if (!canPrecedeRegexLiteral(Prev))
522
36
    return;
523
304
524
  // 'Manually' lex ahead in the current file buffer.
525
304
  const char *Offset = Lex->getBufferLocation();
526
304
  const char *RegexBegin = Offset - RegexToken->TokenText.size();
527
304
  StringRef Buffer = Lex->getBuffer();
528
304
  bool InCharacterClass = false;
529
304
  bool HaveClosingSlash = false;
530
1.91k
  for (; !HaveClosingSlash && 
Offset != Buffer.end()1.61k
;
++Offset1.61k
) {
531
    // Regular expressions are terminated with a '/', which can only be
532
    // escaped using '\' or a character class between '[' and ']'.
533
    // See http://www.ecma-international.org/ecma-262/5.1/#sec-7.8.5.
534
1.61k
    switch (*Offset) {
535
116
    case '\\':
536
      // Skip the escaped character.
537
116
      ++Offset;
538
116
      break;
539
40
    case '[':
540
40
      InCharacterClass = true;
541
40
      break;
542
40
    case ']':
543
40
      InCharacterClass = false;
544
40
      break;
545
320
    case '/':
546
320
      if (!InCharacterClass)
547
304
        HaveClosingSlash = true;
548
320
      break;
549
1.61k
    }
550
1.61k
  }
551
304
552
304
  RegexToken->setType(TT_RegexLiteral);
553
  // Treat regex literals like other string_literals.
554
304
  RegexToken->Tok.setKind(tok::string_literal);
555
304
  RegexToken->TokenText = StringRef(RegexBegin, Offset - RegexBegin);
556
304
  RegexToken->ColumnWidth = RegexToken->TokenText.size();
557
304
558
304
  resetLexer(SourceMgr.getFileOffset(Lex->getSourceLocation(Offset)));
559
304
}
560
561
4.87k
void FormatTokenLexer::handleCSharpVerbatimAndInterpolatedStrings() {
562
4.87k
  FormatToken *CSharpStringLiteral = Tokens.back();
563
4.87k
564
4.87k
  if (CSharpStringLiteral->getType() != TT_CSharpStringLiteral)
565
4.82k
    return;
566
43
567
  // Deal with multiline strings.
568
43
  if (!(CSharpStringLiteral->TokenText.startswith(R"(@")") ||
569
37
        CSharpStringLiteral->TokenText.startswith(R"($@")")))
570
31
    return;
571
12
572
12
  const char *StrBegin =
573
12
      Lex->getBufferLocation() - CSharpStringLiteral->TokenText.size();
574
12
  const char *Offset = StrBegin;
575
12
  if (CSharpStringLiteral->TokenText.startswith(R"(@")"))
576
6
    Offset += 2;
577
6
  else // CSharpStringLiteral->TokenText.startswith(R"($@")")
578
6
    Offset += 3;
579
12
580
  // Look for a terminating '"' in the current file buffer.
581
  // Make no effort to format code within an interpolated or verbatim string.
582
288
  for (; Offset != Lex->getBuffer().end(); 
++Offset276
) {
583
288
    if (Offset[0] == '"') {
584
      // "" within a verbatim string is an escaped double quote: skip it.
585
22
      if (Offset + 1 < Lex->getBuffer().end() && Offset[1] == '"')
586
10
        ++Offset;
587
12
      else
588
12
        break;
589
22
    }
590
288
  }
591
12
592
  // Make no attempt to format code properly if a verbatim string is
593
  // unterminated.
594
12
  if (Offset == Lex->getBuffer().end())
595
0
    return;
596
12
597
12
  StringRef LiteralText(StrBegin, Offset - StrBegin + 1);
598
12
  CSharpStringLiteral->TokenText = LiteralText;
599
12
600
  // Adjust width for potentially multiline string literals.
601
12
  size_t FirstBreak = LiteralText.find('\n');
602
12
  StringRef FirstLineText = FirstBreak == StringRef::npos
603
10
                                ? LiteralText
604
2
                                : LiteralText.substr(0, FirstBreak);
605
12
  CSharpStringLiteral->ColumnWidth = encoding::columnWidthWithTabs(
606
12
      FirstLineText, CSharpStringLiteral->OriginalColumn, Style.TabWidth,
607
12
      Encoding);
608
12
  size_t LastBreak = LiteralText.rfind('\n');
609
12
  if (LastBreak != StringRef::npos) {
610
2
    CSharpStringLiteral->IsMultiline = true;
611
2
    unsigned StartColumn = 0;
612
2
    CSharpStringLiteral->LastLineColumnWidth = encoding::columnWidthWithTabs(
613
2
        LiteralText.substr(LastBreak + 1, LiteralText.size()), StartColumn,
614
2
        Style.TabWidth, Encoding);
615
2
  }
616
12
617
12
  SourceLocation loc = Offset < Lex->getBuffer().end()
618
12
                           ? Lex->getSourceLocation(Offset + 1)
619
0
                           : SourceMgr.getLocForEndOfFile(ID);
620
12
  resetLexer(SourceMgr.getFileOffset(loc));
621
12
}
622
623
34.3k
void FormatTokenLexer::handleTemplateStrings() {
624
34.3k
  FormatToken *BacktickToken = Tokens.back();
625
34.3k
626
34.3k
  if (BacktickToken->is(tok::l_brace)) {
627
1.85k
    StateStack.push(LexerState::NORMAL);
628
1.85k
    return;
629
1.85k
  }
630
32.5k
  if (BacktickToken->is(tok::r_brace)) {
631
1.95k
    if (StateStack.size() == 1)
632
4
      return;
633
1.94k
    StateStack.pop();
634
1.94k
    if (StateStack.top() != LexerState::TEMPLATE_STRING)
635
1.84k
      return;
636
    // If back in TEMPLATE_STRING, fallthrough and continue parsing the
637
30.5k
  } else if (BacktickToken->is(tok::unknown) &&
638
148
             BacktickToken->TokenText == "`") {
639
148
    StateStack.push(LexerState::TEMPLATE_STRING);
640
30.4k
  } else {
641
30.4k
    return; // Not actually a template
642
30.4k
  }
643
248
644
  // 'Manually' lex ahead in the current file buffer.
645
248
  const char *Offset = Lex->getBufferLocation();
646
248
  const char *TmplBegin = Offset - BacktickToken->TokenText.size(); // at "`"
647
1.90k
  for (; Offset != Lex->getBuffer().end(); 
++Offset1.65k
) {
648
1.90k
    if (Offset[0] == '`') {
649
148
      StateStack.pop();
650
148
      break;
651
148
    }
652
1.75k
    if (Offset[0] == '\\') {
653
8
      ++Offset; // Skip the escaped character.
654
1.74k
    } else if (Offset + 1 < Lex->getBuffer().end() && Offset[0] == '$' &&
655
100
               Offset[1] == '{') {
656
      // '${' introduces an expression interpolation in the template string.
657
100
      StateStack.push(LexerState::NORMAL);
658
100
      ++Offset;
659
100
      break;
660
100
    }
661
1.75k
  }
662
248
663
248
  StringRef LiteralText(TmplBegin, Offset - TmplBegin + 1);
664
248
  BacktickToken->setType(TT_TemplateString);
665
248
  BacktickToken->Tok.setKind(tok::string_literal);
666
248
  BacktickToken->TokenText = LiteralText;
667
248
668
  // Adjust width for potentially multiline string literals.
669
248
  size_t FirstBreak = LiteralText.find('\n');
670
248
  StringRef FirstLineText = FirstBreak == StringRef::npos
671
212
                                ? LiteralText
672
36
                                : LiteralText.substr(0, FirstBreak);
673
248
  BacktickToken->ColumnWidth = encoding::columnWidthWithTabs(
674
248
      FirstLineText, BacktickToken->OriginalColumn, Style.TabWidth, Encoding);
675
248
  size_t LastBreak = LiteralText.rfind('\n');
676
248
  if (LastBreak != StringRef::npos) {
677
36
    BacktickToken->IsMultiline = true;
678
36
    unsigned StartColumn = 0; // The template tail spans the entire line.
679
36
    BacktickToken->LastLineColumnWidth = encoding::columnWidthWithTabs(
680
36
        LiteralText.substr(LastBreak + 1, LiteralText.size()), StartColumn,
681
36
        Style.TabWidth, Encoding);
682
36
  }
683
248
684
248
  SourceLocation loc = Offset < Lex->getBuffer().end()
685
248
                           ? Lex->getSourceLocation(Offset + 1)
686
0
                           : SourceMgr.getLocForEndOfFile(ID);
687
248
  resetLexer(SourceMgr.getFileOffset(loc));
688
248
}
689
690
6.72k
void FormatTokenLexer::tryParsePythonComment() {
691
6.72k
  FormatToken *HashToken = Tokens.back();
692
6.72k
  if (!HashToken->isOneOf(tok::hash, tok::hashhash))
693
6.66k
    return;
694
  // Turn the remainder of this line into a comment.
695
63
  const char *CommentBegin =
696
63
      Lex->getBufferLocation() - HashToken->TokenText.size(); // at "#"
697
63
  size_t From = CommentBegin - Lex->getBuffer().begin();
698
63
  size_t To = Lex->getBuffer().find_first_of('\n', From);
699
63
  if (To == StringRef::npos)
700
7
    To = Lex->getBuffer().size();
701
63
  size_t Len = To - From;
702
63
  HashToken->setType(TT_LineComment);
703
63
  HashToken->Tok.setKind(tok::comment);
704
63
  HashToken->TokenText = Lex->getBuffer().substr(From, Len);
705
63
  SourceLocation Loc = To < Lex->getBuffer().size()
706
56
                           ? Lex->getSourceLocation(CommentBegin + Len)
707
7
                           : SourceMgr.getLocForEndOfFile(ID);
708
63
  resetLexer(SourceMgr.getFileOffset(Loc));
709
63
}
710
711
613k
/// Tries to fold the last four tokens, when they have the shape
/// `_T ( "string" )`, into a single string-literal token.
///
/// \returns true if the merge happened; false if fewer than four tokens are
/// available or the tail is not: an identifier spelled "_T", tok::l_paren, a
/// single-line tok::string_literal, tok::r_paren.
bool FormatTokenLexer::tryMerge_TMacro() {
712
613k
  if (Tokens.size() < 4)
713
107k
    return false;
714
506k
  FormatToken *Last = Tokens.back();
715
506k
  if (!Last->is(tok::r_paren))
716
460k
    return false;
717
45.7k
718
45.7k
  FormatToken *String = Tokens[Tokens.size() - 2];
719
45.7k
  if (!String->is(tok::string_literal) || 
String->IsMultiline629
)
720
45.1k
    return false;
721
596
722
596
  if (!Tokens[Tokens.size() - 3]->is(tok::l_paren))
723
276
    return false;
724
320
725
320
  FormatToken *Macro = Tokens[Tokens.size() - 4];
726
320
  if (Macro->TokenText != "_T")
727
305
    return false;
728
15
729
15
  // Widen the string token's text so that it spans from the start of "_T"
  // through the closing ')'. NOTE(review): this forms one StringRef from two
  // tokens' text pointers, so it presumably relies on both pointing into the
  // same source buffer -- confirm.
  const char *Start = Macro->TokenText.data();
730
15
  const char *End = Last->TokenText.data() + Last->TokenText.size();
731
15
  String->TokenText = StringRef(Start, End - Start);
732
15
  // The merged token takes over the "_T" token's position and
  // leading-whitespace bookkeeping.
  String->IsFirst = Macro->IsFirst;
733
15
  String->LastNewlineOffset = Macro->LastNewlineOffset;
734
15
  String->WhitespaceRange = Macro->WhitespaceRange;
735
15
  String->OriginalColumn = Macro->OriginalColumn;
736
15
  String->ColumnWidth = encoding::columnWidthWithTabs(
737
15
      String->TokenText, String->OriginalColumn, Style.TabWidth, Encoding);
738
15
  String->NewlinesBefore = Macro->NewlinesBefore;
739
15
  String->HasUnescapedNewline = Macro->HasUnescapedNewline;
740
15
741
15
  // Drop ')', the string and '(' from the list, then store the merged token
  // in the slot that held "_T".
  Tokens.pop_back();
742
15
  Tokens.pop_back();
743
15
  Tokens.pop_back();
744
15
  Tokens.back() = String;
745
15
  return true;
746
15
}
747
748
613k
/// Detects a version-control conflict marker at the start of the line that
/// Tokens[FirstInLineIndex] is on and, if one is found, collapses that line's
/// tokens into a single marker token.
///
/// The check only runs when the newest token starts a new line
/// (NewlinesBefore != 0) or is tok::eof. FirstInLineIndex is maintained
/// outside this function.
///
/// \returns true if a conflict-marker line was merged, false otherwise.
bool FormatTokenLexer::tryMergeConflictMarkers() {
749
613k
  if (Tokens.back()->NewlinesBefore == 0 && 
Tokens.back()->isNot(tok::eof)555k
)
750
522k
    return false;
751
91.5k
752
  // Conflict lines look like:
753
  // <marker> <text from the vcs>
754
  // For example:
755
  // >>>>>>> /file/in/file/system at revision 1234
756
  //
757
  // We merge all tokens in a line that starts with a conflict marker
758
  // into a single token with a special token type that the unwrapped line
759
  // parser will use to correctly rebuild the underlying code.
760
91.5k
761
91.5k
  // Note: this local hides the ID member for the rest of the function.
  FileID ID;
762
  // Get the position of the first token in the line.
763
91.5k
  unsigned FirstInLineOffset;
764
91.5k
  std::tie(ID, FirstInLineOffset) = SourceMgr.getDecomposedLoc(
765
91.5k
      Tokens[FirstInLineIndex]->getStartOfNonWhitespace());
766
91.5k
  StringRef Buffer = SourceMgr.getBuffer(ID)->getBuffer();
767
  // Calculate the offset of the start of the current line.
768
91.5k
  auto LineOffset = Buffer.rfind('\n', FirstInLineOffset);
769
91.5k
  if (LineOffset == StringRef::npos) {
770
35.1k
    LineOffset = 0;
771
56.3k
  } else {
772
56.3k
    ++LineOffset;
773
56.3k
  }
774
91.5k
775
91.5k
  // The candidate marker is the text from the start of the line up to the
  // first ' ' or '\n', or up to the end of the buffer if neither follows.
  auto FirstSpace = Buffer.find_first_of(" \n", LineOffset);
776
91.5k
  StringRef LineStart;
777
91.5k
  if (FirstSpace == StringRef::npos) {
778
5.45k
    LineStart = Buffer.substr(LineOffset);
779
86.0k
  } else {
780
86.0k
    LineStart = Buffer.substr(LineOffset, FirstSpace - LineOffset);
781
86.0k
  }
782
91.5k
783
91.5k
  // Seven-character markers plus four-character variants; in the short form
  // ">>>>" opens and "<<<<" closes a conflict. NOTE(review): the short forms
  // are presumably emitted by another VCS (Perforce?) -- confirm.
  TokenType Type = TT_Unknown;
784
91.5k
  if (LineStart == "<<<<<<<" || 
LineStart == ">>>>"91.5k
) {
785
9
    Type = TT_ConflictStart;
786
91.5k
  } else if (LineStart == "|||||||" || 
LineStart == "======="91.5k
||
787
91.5k
             LineStart == "====") {
788
18
    Type = TT_ConflictAlternative;
789
91.5k
  } else if (LineStart == ">>>>>>>" || 
LineStart == "<<<<"91.5k
) {
790
9
    Type = TT_ConflictEnd;
791
9
  }
792
91.5k
793
91.5k
  if (Type != TT_Unknown) {
794
36
    // Remember the newest token, cut the list back to the first token of the
    // marker line, retag that token, then re-append the newest token.
    FormatToken *Next = Tokens.back();
795
36
796
36
    Tokens.resize(FirstInLineIndex + 1);
797
    // We do not need to build a complete token here, as we will skip it
798
    // during parsing anyway (as we must not touch whitespace around conflict
799
    // markers).
800
36
    Tokens.back()->setType(Type);
801
36
    Tokens.back()->Tok.setKind(tok::kw___unknown_anytype);
802
36
803
36
    Tokens.push_back(Next);
804
36
    return true;
805
36
  }
806
91.5k
807
91.5k
  return false;
808
91.5k
}
809
810
1.48k
/// Allocates the second half of a ">>" or "<<" token that getNextToken() split
/// in two.
///
/// The new token copies the lexer Token and the token text of the current
/// FormatTok, is located at the last character of the original token
/// (Tok.getLength() - 1), has an empty whitespace range, a ColumnWidth of 1
/// and an OriginalColumn one past the first half.
///
/// \returns the new token, which also replaces FormatTok.
FormatToken *FormatTokenLexer::getStashedToken() {
811
  // Create a synthesized second '>' or '<' token.
812
1.48k
  // Copy what is needed from the first half before FormatTok is replaced.
  Token Tok = FormatTok->Tok;
813
1.48k
  StringRef TokenText = FormatTok->TokenText;
814
1.48k
815
1.48k
  unsigned OriginalColumn = FormatTok->OriginalColumn;
816
1.48k
  FormatTok = new (Allocator.Allocate()) FormatToken;
817
1.48k
  FormatTok->Tok = Tok;
818
1.48k
  SourceLocation TokLocation =
819
1.48k
      FormatTok->Tok.getLocation().getLocWithOffset(Tok.getLength() - 1);
820
1.48k
  FormatTok->Tok.setLocation(TokLocation);
821
1.48k
  // Begin and end coincide: no whitespace precedes the second half.
  FormatTok->WhitespaceRange = SourceRange(TokLocation, TokLocation);
822
1.48k
  FormatTok->TokenText = TokenText;
823
1.48k
  FormatTok->ColumnWidth = 1;
824
1.48k
  FormatTok->OriginalColumn = OriginalColumn + 1;
825
1.48k
826
1.48k
  return FormatTok;
827
1.48k
}
828
829
613k
/// Produces the next significant FormatToken.
///
/// In order, this function:
///  - returns the pending second half of a split ">>" / "<<" if one is stashed;
///  - allocates a fresh FormatToken and reads raw tokens, treating tok::unknown
///    tokens as leading whitespace (updating NewlinesBefore,
///    HasUnescapedNewline, LastNewlineOffset and Column) until it reaches a
///    non-unknown token or an unknown token that contains non-whitespace, which
///    is typed TT_ImplicitStringLiteral;
///  - for JavaScript and Java, truncates a "//" comment at a backslash that is
///    directly followed by '\n' and restarts the lexer behind the backslash;
///  - strips escaped newlines from the front of the token text and counts them
///    as whitespace;
///  - records WhitespaceRange and OriginalColumn, trims trailing whitespace
///    from comments, resolves raw identifiers through IdentTable, and splits
///    ">>" / "<<" into single-character tokens;
///  - computes ColumnWidth (plus IsMultiline / LastLineColumnWidth for tokens
///    containing '\n') and advances Column;
///  - for C++ styles, applies the type of a matching Macros entry or of the
///    macro-block begin/end regexes.
///
/// \returns the token, which is also stored in FormatTok.
FormatToken *FormatTokenLexer::getNextToken() {
830
613k
  // A previous call split a ">>" or "<<"; hand out the pending second half.
  if (StateStack.top() == LexerState::TOKEN_STASHED) {
831
1.48k
    StateStack.pop();
832
1.48k
    return getStashedToken();
833
1.48k
  }
834
612k
835
612k
  FormatTok = new (Allocator.Allocate()) FormatToken;
836
612k
  readRawToken(*FormatTok);
837
612k
  // Whitespace trimmed off the end of the previous comment token counts as
  // leading whitespace of this token.
  SourceLocation WhitespaceStart =
838
612k
      FormatTok->Tok.getLocation().getLocWithOffset(-TrailingWhitespace);
839
612k
  FormatTok->IsFirst = IsFirstToken;
840
612k
  IsFirstToken = false;
841
612k
842
  // Consume and record whitespace until we find a significant token.
843
612k
  unsigned WhitespaceLength = TrailingWhitespace;
844
888k
  while (FormatTok->Tok.is(tok::unknown)) {
845
275k
    StringRef Text = FormatTok->TokenText;
846
60.0k
    // Given the index just before a '\n', returns nonzero if that newline is
    // preceded by an odd number of backslashes.
    auto EscapesNewline = [&](int pos) {
847
      // A '\r' here is just part of '\r\n'. Skip it.
848
60.0k
      if (pos >= 0 && 
Text[pos] == '\r'2.61k
)
849
102
        --pos;
850
      // See whether there is an odd number of '\' before this.
851
      // FIXME: This is wrong. A '\' followed by a newline is always removed,
852
      // regardless of whether there is another '\' before it.
853
      // FIXME: Newlines can also be escaped by a '?' '?' '/' trigraph.
854
60.0k
      unsigned count = 0;
855
60.7k
      for (; pos >= 0; 
--pos, ++count716
)
856
2.54k
        if (Text[pos] != '\\')
857
1.83k
          break;
858
60.0k
      return count & 1;
859
60.0k
    };
860
    // FIXME: This miscounts tok:unknown tokens that are not just
861
    // whitespace, e.g. a '`' character.
862
840k
    for (int i = 0, e = Text.size(); i != e; 
++i564k
) {
863
564k
      switch (Text[i]) {
864
60.0k
      case '\n':
865
60.0k
        ++FormatTok->NewlinesBefore;
866
60.0k
        FormatTok->HasUnescapedNewline = !EscapesNewline(i - 1);
867
60.0k
        FormatTok->LastNewlineOffset = WhitespaceLength + i + 1;
868
60.0k
        Column = 0;
869
60.0k
        break;
870
109
      case '\r':
871
109
        FormatTok->LastNewlineOffset = WhitespaceLength + i + 1;
872
109
        Column = 0;
873
109
        break;
874
20
      case '\f':
875
20
      case '\v':
876
20
        Column = 0;
877
20
        break;
878
501k
      case ' ':
879
501k
        ++Column;
880
501k
        break;
881
1.83k
      case '\t':
882
1.83k
        // Advance to the next tab stop; a TabWidth of 0 leaves Column
        // unchanged.
        Column +=
883
1.78k
            Style.TabWidth - (Style.TabWidth ? Column % Style.TabWidth : 
045
);
884
1.83k
        break;
885
767
      case '\\':
886
767
        // A backslash that is not directly followed by a line break is not
        // whitespace.
        if (i + 1 == e || 
(716
Text[i + 1] != '\r'716
&&
Text[i + 1] != '\n'701
))
887
51
          FormatTok->setType(TT_ImplicitStringLiteral);
888
767
        break;
889
235
      default:
890
235
        FormatTok->setType(TT_ImplicitStringLiteral);
891
235
        break;
892
564k
      }
893
564k
      if (FormatTok->getType() == TT_ImplicitStringLiteral)
894
286
        break;
895
564k
    }
896
275k
897
275k
    // This unknown token contains non-whitespace: stop skipping and keep it as
    // the token to return.
    if (FormatTok->is(TT_ImplicitStringLiteral))
898
286
      break;
899
275k
    WhitespaceLength += FormatTok->Tok.getLength();
900
275k
901
275k
    readRawToken(*FormatTok);
902
275k
  }
903
612k
904
  // JavaScript and Java do not allow to escape the end of the line with a
905
  // backslash. Backslashes are syntax errors in plain source, but can occur in
906
  // comments. When a single line comment ends with a \, it'll cause the next
907
  // line of code to be lexed as a comment, breaking formatting. The code below
908
  // finds comments that contain a backslash followed by a line break, truncates
909
  // the comment token at the backslash, and resets the lexer to restart behind
910
  // the backslash.
911
612k
  if ((Style.Language == FormatStyle::LK_JavaScript ||
912
578k
       Style.Language == FormatStyle::LK_Java) &&
913
38.6k
      FormatTok->is(tok::comment) && 
FormatTok->TokenText.startswith("//")448
) {
914
307
    size_t BackslashPos = FormatTok->TokenText.find('\\');
915
311
    while (BackslashPos != StringRef::npos) {
916
16
      if (BackslashPos + 1 < FormatTok->TokenText.size() &&
917
16
          FormatTok->TokenText[BackslashPos + 1] == '\n') {
918
12
        // Translate "one past the backslash" inside the token text into a
        // buffer pointer: step back from the lexer position by the token
        // length, then forward to just behind the backslash.
        const char *Offset = Lex->getBufferLocation();
919
12
        Offset -= FormatTok->TokenText.size();
920
12
        Offset += BackslashPos + 1;
921
12
        resetLexer(SourceMgr.getFileOffset(Lex->getSourceLocation(Offset)));
922
12
        FormatTok->TokenText = FormatTok->TokenText.substr(0, BackslashPos + 1);
923
12
        FormatTok->ColumnWidth = encoding::columnWidthWithTabs(
924
12
            FormatTok->TokenText, FormatTok->OriginalColumn, Style.TabWidth,
925
12
            Encoding);
926
12
        break;
927
12
      }
928
4
      BackslashPos = FormatTok->TokenText.find('\\', BackslashPos + 1);
929
4
    }
930
307
  }
931
612k
932
  // In case the token starts with escaped newlines, we want to
933
  // take them into account as whitespace - this pattern is quite frequent
934
  // in macro definitions.
935
  // FIXME: Add a more explicit test.
936
612k
  while (FormatTok->TokenText.size() > 1 && 
FormatTok->TokenText[0] == '\\'207k
) {
937
77
    // Number of leading characters to strip: 3 for "\\\r\n", 2 for "\\\n".
    unsigned SkippedWhitespace = 0;
938
77
    if (FormatTok->TokenText.size() > 2 &&
939
77
        (FormatTok->TokenText[1] == '\r' && 
FormatTok->TokenText[2] == '\n'9
))
940
9
      SkippedWhitespace = 3;
941
68
    else if (FormatTok->TokenText[1] == '\n')
942
68
      SkippedWhitespace = 2;
943
0
    else
944
0
      break;
945
77
946
77
    ++FormatTok->NewlinesBefore;
947
77
    WhitespaceLength += SkippedWhitespace;
948
77
    FormatTok->LastNewlineOffset = SkippedWhitespace;
949
77
    Column = 0;
950
77
    FormatTok->TokenText = FormatTok->TokenText.substr(SkippedWhitespace);
951
77
  }
952
612k
953
612k
  FormatTok->WhitespaceRange = SourceRange(
954
612k
      WhitespaceStart, WhitespaceStart.getLocWithOffset(WhitespaceLength));
955
612k
956
612k
  FormatTok->OriginalColumn = Column;
957
612k
958
612k
  TrailingWhitespace = 0;
959
612k
  if (FormatTok->Tok.is(tok::comment)) {
960
    // FIXME: Add the trimmed whitespace to Column.
961
9.77k
    // Remember how much was trimmed so that the next call can count it as
    // that token's leading whitespace.
    StringRef UntrimmedText = FormatTok->TokenText;
962
9.77k
    FormatTok->TokenText = FormatTok->TokenText.rtrim(" \t\v\f");
963
9.77k
    TrailingWhitespace = UntrimmedText.size() - FormatTok->TokenText.size();
964
602k
  } else if (FormatTok->Tok.is(tok::raw_identifier)) {
965
228k
    // Turn the raw identifier into whatever token kind IdentTable assigns to
    // its spelling.
    IdentifierInfo &Info = IdentTable.get(FormatTok->TokenText);
966
228k
    FormatTok->Tok.setIdentifierInfo(&Info);
967
228k
    FormatTok->Tok.setKind(Info.getTokenID());
968
228k
    // When formatting Java or JavaScript, the keywords listed below are
    // demoted to plain identifiers.
    if (Style.Language == FormatStyle::LK_Java &&
969
1.76k
        FormatTok->isOneOf(tok::kw_struct, tok::kw_union, tok::kw_delete,
970
8
                           tok::kw_operator)) {
971
8
      FormatTok->Tok.setKind(tok::identifier);
972
8
      FormatTok->Tok.setIdentifierInfo(nullptr);
973
228k
    } else if (Style.Language == FormatStyle::LK_JavaScript &&
974
11.7k
               FormatTok->isOneOf(tok::kw_struct, tok::kw_union,
975
28
                                  tok::kw_operator)) {
976
28
      FormatTok->Tok.setKind(tok::identifier);
977
28
      FormatTok->Tok.setIdentifierInfo(nullptr);
978
28
    }
979
373k
  } else if (FormatTok->Tok.is(tok::greatergreater)) {
980
418
    // Split ">>" (and "<<" below) into two single-character tokens; the
    // second one is produced by the next call via the TOKEN_STASHED state.
    FormatTok->Tok.setKind(tok::greater);
981
418
    FormatTok->TokenText = FormatTok->TokenText.substr(0, 1);
982
418
    ++Column;
983
418
    StateStack.push(LexerState::TOKEN_STASHED);
984
373k
  } else if (FormatTok->Tok.is(tok::lessless)) {
985
1.07k
    FormatTok->Tok.setKind(tok::less);
986
1.07k
    FormatTok->TokenText = FormatTok->TokenText.substr(0, 1);
987
1.07k
    ++Column;
988
1.07k
    StateStack.push(LexerState::TOKEN_STASHED);
989
1.07k
  }
990
612k
991
  // Now FormatTok is the next non-whitespace token.
992
612k
993
612k
  StringRef Text = FormatTok->TokenText;
994
612k
  size_t FirstNewlinePos = Text.find('\n');
995
612k
  if (FirstNewlinePos == StringRef::npos) {
996
    // FIXME: ColumnWidth actually depends on the start column, we need to
997
    // take this into account when the token is moved.
998
611k
    FormatTok->ColumnWidth =
999
611k
        encoding::columnWidthWithTabs(Text, Column, Style.TabWidth, Encoding);
1000
611k
    Column += FormatTok->ColumnWidth;
1001
638
  } else {
1002
638
    FormatTok->IsMultiline = true;
1003
    // FIXME: ColumnWidth actually depends on the start column, we need to
1004
    // take this into account when the token is moved.
1005
638
    FormatTok->ColumnWidth = encoding::columnWidthWithTabs(
1006
638
        Text.substr(0, FirstNewlinePos), Column, Style.TabWidth, Encoding);
1007
638
1008
    // The last line of the token always starts in column 0.
1009
    // Thus, the length can be precomputed even in the presence of tabs.
1010
638
    FormatTok->LastLineColumnWidth = encoding::columnWidthWithTabs(
1011
638
        Text.substr(Text.find_last_of('\n') + 1), 0, Style.TabWidth, Encoding);
1012
638
    Column = FormatTok->LastLineColumnWidth;
1013
638
  }
1014
612k
1015
612k
  if (Style.isCpp()) {
1016
557k
    // A token found in Macros gets the type stored there, unless the previous
    // token is the "define" preprocessor keyword. Otherwise identifiers are
    // matched against the macro-block begin/end regexes.
    auto it = Macros.find(FormatTok->Tok.getIdentifierInfo());
1017
557k
    if (!(Tokens.size() > 0 && 
Tokens.back()->Tok.getIdentifierInfo()525k
&&
1018
209k
          Tokens.back()->Tok.getIdentifierInfo()->getPPKeywordID() ==
1019
209k
              tok::pp_define) &&
1020
555k
        it != Macros.end()) {
1021
723
      FormatTok->setType(it->second);
1022
557k
    } else if (FormatTok->is(tok::identifier)) {
1023
138k
      if (MacroBlockBeginRegex.match(Text)) {
1024
28
        FormatTok->setType(TT_MacroBlockBegin);
1025
138k
      } else if (MacroBlockEndRegex.match(Text)) {
1026
28
        FormatTok->setType(TT_MacroBlockEnd);
1027
28
      }
1028
138k
    }
1029
557k
  }
1030
612k
1031
612k
  return FormatTok;
1032
612k
}
1033
1034
888k
/// Lexes one raw token into \p Tok and applies formatter-specific fix-ups.
///
/// Sets Tok.TokenText to the token's spelling in the source buffer, reclassifies
/// some tokens as string literals (see the comments below), sets Tok.Finalized
/// from FormattingDisabled, and updates FormattingDisabled when the token is a
/// "clang-format on" / "clang-format off" comment.
void FormatTokenLexer::readRawToken(FormatToken &Tok) {
1035
888k
  Lex->LexFromRawLexer(Tok.Tok);
1036
888k
  Tok.TokenText = StringRef(SourceMgr.getCharacterData(Tok.Tok.getLocation()),
1037
888k
                            Tok.Tok.getLength());
1038
  // For formatting, treat unterminated string literals like normal string
1039
  // literals.
1040
888k
  if (Tok.is(tok::unknown)) {
1041
275k
    if (!Tok.TokenText.empty() && Tok.TokenText[0] == '"') {
1042
27
      Tok.Tok.setKind(tok::string_literal);
1043
27
      Tok.IsUnterminatedLiteral = true;
1044
275k
    // In JavaScript, an unknown token spelled '' is treated as a string
    // literal as well.
    } else if (Style.Language == FormatStyle::LK_JavaScript &&
1045
15.1k
               Tok.TokenText == "''") {
1046
12
      Tok.Tok.setKind(tok::string_literal);
1047
12
    }
1048
275k
  }
1049
888k
1050
888k
  // For JavaScript, Proto and TextProto, character constants are treated as
  // string literals.
  if ((Style.Language == FormatStyle::LK_JavaScript ||
1051
838k
       Style.Language == FormatStyle::LK_Proto ||
1052
831k
       Style.Language == FormatStyle::LK_TextProto) &&
1053
66.0k
      Tok.is(tok::char_constant)) {
1054
728
    Tok.Tok.setKind(tok::string_literal);
1055
728
  }
1056
888k
1057
888k
  // Order matters: the "on" check runs before Finalized is assigned and the
  // "off" check runs after it, so a marker comment is never finalized by its
  // own directive.
  if (Tok.is(tok::comment) && 
(9.77k
Tok.TokenText == "// clang-format on"9.77k
||
1058
9.76k
                               Tok.TokenText == "/* clang-format on */")) {
1059
26
    FormattingDisabled = false;
1060
26
  }
1061
888k
1062
888k
  Tok.Finalized = FormattingDisabled;
1063
888k
1064
888k
  if (Tok.is(tok::comment) && 
(9.77k
Tok.TokenText == "// clang-format off"9.77k
||
1065
9.75k
                               Tok.TokenText == "/* clang-format off */")) {
1066
29
    FormattingDisabled = true;
1067
29
  }
1068
888k
}
1069
1070
639
/// Replaces Lex with a new Lexer over the buffer of file ID that starts lexing
/// at byte \p Offset from the beginning of that buffer.
///
/// The new lexer keeps whitespace tokens, as the one created in the constructor
/// does, and TrailingWhitespace is reset to 0.
void FormatTokenLexer::resetLexer(unsigned Offset) {
1071
639
  StringRef Buffer = SourceMgr.getBufferData(ID);
1072
639
  Lex.reset(new Lexer(SourceMgr.getLocForStartOfFile(ID),
1073
639
                      getFormattingLangOpts(Style), Buffer.begin(),
1074
639
                      Buffer.begin() + Offset, Buffer.end()));
1075
639
  Lex->SetKeepWhitespaceMode(true);
1076
639
  TrailingWhitespace = 0;
1077
639
}
1078
1079
} // namespace format
1080
} // namespace clang