Coverage Report

Created: 2021-08-24 07:12

/Users/buildslave/jenkins/workspace/coverage/llvm-project/clang/lib/Format/FormatTokenLexer.cpp
Line
Count
Source (jump to first uncovered line)
1
//===--- FormatTokenLexer.cpp - Lex FormatTokens -------------*- C++ ----*-===//
2
//
3
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4
// See https://llvm.org/LICENSE.txt for license information.
5
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6
//
7
//===----------------------------------------------------------------------===//
8
///
9
/// \file
10
/// This file implements FormatTokenLexer, which tokenizes a source file
11
/// into a FormatToken stream suitable for ClangFormat.
12
///
13
//===----------------------------------------------------------------------===//
14
15
#include "FormatTokenLexer.h"
16
#include "FormatToken.h"
17
#include "clang/Basic/SourceLocation.h"
18
#include "clang/Basic/SourceManager.h"
19
#include "clang/Format/Format.h"
20
#include "llvm/Support/Regex.h"
21
22
namespace clang {
23
namespace format {
24
25
// Sets up the raw lexer for the file identified by ID and records every
// macro name configured in Style together with the token type it maps to.
FormatTokenLexer::FormatTokenLexer(
    const SourceManager &SourceMgr, FileID ID, unsigned Column,
    const FormatStyle &Style, encoding::Encoding Encoding,
    llvm::SpecificBumpPtrAllocator<FormatToken> &Allocator,
    IdentifierTable &IdentTable)
    : FormatTok(nullptr), IsFirstToken(true), StateStack({LexerState::NORMAL}),
      Column(Column), TrailingWhitespace(0), SourceMgr(SourceMgr), ID(ID),
      Style(Style), IdentTable(IdentTable), Keywords(IdentTable),
      Encoding(Encoding), Allocator(Allocator), FirstInLineIndex(0),
      FormattingDisabled(false), MacroBlockBeginRegex(Style.MacroBlockBegin),
      MacroBlockEndRegex(Style.MacroBlockEnd) {
  // The underlying lexer runs in keep-whitespace mode.
  Lex.reset(new Lexer(ID, SourceMgr.getBufferOrFake(ID), SourceMgr,
                      getFormattingLangOpts(Style)));
  Lex->SetKeepWhitespaceMode(true);

  // Inserts every name of one style list into Macros with the given type.
  // The lists are registered in the order below; an identifier that is
  // already present keeps the type it was first registered with only if
  // Macros.insert does not overwrite (same call as before, unchanged).
  const auto RegisterMacros = [&](const auto &Names, TokenType Type) {
    for (const std::string &Name : Names)
      Macros.insert({&IdentTable.get(Name), Type});
  };

  RegisterMacros(Style.ForEachMacros, TT_ForEachMacro);
  RegisterMacros(Style.IfMacros, TT_IfMacro);
  RegisterMacros(Style.AttributeMacros, TT_AttributeMacro);
  RegisterMacros(Style.StatementMacros, TT_StatementMacro);
  RegisterMacros(Style.TypenameMacros, TT_TypenameMacro);
  RegisterMacros(Style.NamespaceMacros, TT_NamespaceMacro);
  RegisterMacros(Style.WhitespaceSensitiveMacros, TT_UntouchableMacroFunc);
  RegisterMacros(Style.StatementAttributeLikeMacros,
                 TT_StatementAttributeLikeMacro);
}
62
63
41.2k
// Lexes the whole input into Tokens and returns them. After each new token
// the language-specific fix-ups and the token-merging rules are applied.
// Must be called with an empty token list.
ArrayRef<FormatToken *> FormatTokenLexer::lex() {
  assert(Tokens.empty());
  assert(FirstInLineIndex == 0);
  for (;;) {
    Tokens.push_back(getNextToken());

    switch (Style.Language) {
    case FormatStyle::LK_JavaScript:
      tryParseJSRegexLiteral();
      handleTemplateStrings();
      break;
    case FormatStyle::LK_TextProto:
      tryParsePythonComment();
      break;
    default:
      break;
    }

    tryMergePreviousTokens();

    // This needs to come after tokens have been merged so that C#
    // string literals are correctly identified.
    if (Style.isCSharp())
      handleCSharpVerbatimAndInterpolatedStrings();

    // Re-read the last token: the steps above may have replaced or merged it.
    const FormatToken *Current = Tokens.back();
    if (Current->NewlinesBefore > 0 || Current->IsMultiline)
      FirstInLineIndex = Tokens.size() - 1;
    if (Current->Tok.is(tok::eof))
      break;
  }
  return Tokens;
}
84
85
716k
void FormatTokenLexer::tryMergePreviousTokens() {
86
716k
  if (tryMerge_TMacro())
87
15
    return;
88
716k
  if (tryMergeConflictMarkers())
89
36
    return;
90
715k
  if (tryMergeLessLess())
91
982
    return;
92
714k
  if (tryMergeForEach())
93
7
    return;
94
714k
  if (Style.isCpp() && 
tryTransformTryUsageForC()656k
)
95
28
    return;
96
97
714k
  if (Style.Language == FormatStyle::LK_JavaScript || 
Style.isCSharp()679k
) {
98
42.1k
    static const tok::TokenKind NullishCoalescingOperator[] = {tok::question,
99
42.1k
                                                               tok::question};
100
42.1k
    static const tok::TokenKind NullPropagatingOperator[] = {tok::question,
101
42.1k
                                                             tok::period};
102
42.1k
    static const tok::TokenKind FatArrow[] = {tok::equal, tok::greater};
103
104
42.1k
    if (tryMergeTokens(FatArrow, TT_FatArrow))
105
202
      return;
106
41.9k
    if (tryMergeTokens(NullishCoalescingOperator, TT_NullCoalescingOperator)) {
107
      // Treat like the "||" operator (as opposed to the ternary ?).
108
38
      Tokens.back()->Tok.setKind(tok::pipepipe);
109
38
      return;
110
38
    }
111
41.8k
    if (tryMergeTokens(NullPropagatingOperator, TT_NullPropagatingOperator)) {
112
      // Treat like a regular "." access.
113
22
      Tokens.back()->Tok.setKind(tok::period);
114
22
      return;
115
22
    }
116
41.8k
    if (tryMergeNullishCoalescingEqual()) {
117
14
      return;
118
14
    }
119
41.8k
  }
120
121
714k
  if (Style.isCSharp()) {
122
6.34k
    static const tok::TokenKind CSharpNullConditionalLSquare[] = {
123
6.34k
        tok::question, tok::l_square};
124
125
6.34k
    if (tryMergeCSharpKeywordVariables())
126
2
      return;
127
6.34k
    if (tryMergeCSharpStringLiteral())
128
43
      return;
129
6.30k
    if (tryTransformCSharpForEach())
130
8
      return;
131
6.29k
    if (tryMergeTokens(CSharpNullConditionalLSquare,
132
6.29k
                       TT_CSharpNullConditionalLSquare)) {
133
      // Treat like a regular "[" operator.
134
8
      Tokens.back()->Tok.setKind(tok::l_square);
135
8
      return;
136
8
    }
137
6.29k
  }
138
139
714k
  if (tryMergeNSStringLiteral())
140
222
    return;
141
142
714k
  if (Style.Language == FormatStyle::LK_JavaScript) {
143
35.5k
    static const tok::TokenKind JSIdentity[] = {tok::equalequal, tok::equal};
144
35.5k
    static const tok::TokenKind JSNotIdentity[] = {tok::exclaimequal,
145
35.5k
                                                   tok::equal};
146
35.5k
    static const tok::TokenKind JSShiftEqual[] = {tok::greater, tok::greater,
147
35.5k
                                                  tok::greaterequal};
148
35.5k
    static const tok::TokenKind JSExponentiation[] = {tok::star, tok::star};
149
35.5k
    static const tok::TokenKind JSExponentiationEqual[] = {tok::star,
150
35.5k
                                                           tok::starequal};
151
35.5k
    static const tok::TokenKind JSPipePipeEqual[] = {tok::pipepipe, tok::equal};
152
35.5k
    static const tok::TokenKind JSAndAndEqual[] = {tok::ampamp, tok::equal};
153
154
    // FIXME: Investigate what token type gives the correct operator priority.
155
35.5k
    if (tryMergeTokens(JSIdentity, TT_BinaryOperator))
156
12
      return;
157
35.4k
    if (tryMergeTokens(JSNotIdentity, TT_BinaryOperator))
158
12
      return;
159
35.4k
    if (tryMergeTokens(JSShiftEqual, TT_BinaryOperator))
160
10
      return;
161
35.4k
    if (tryMergeTokens(JSExponentiation, TT_JsExponentiation))
162
4
      return;
163
35.4k
    if (tryMergeTokens(JSExponentiationEqual, TT_JsExponentiationEqual)) {
164
4
      Tokens.back()->Tok.setKind(tok::starequal);
165
4
      return;
166
4
    }
167
35.4k
    if (tryMergeTokens(JSAndAndEqual, TT_JsAndAndEqual) ||
168
35.4k
        
tryMergeTokens(JSPipePipeEqual, TT_JsPipePipeEqual)35.4k
) {
169
      // Treat like the "=" assignment operator.
170
8
      Tokens.back()->Tok.setKind(tok::equal);
171
8
      return;
172
8
    }
173
35.4k
    if (tryMergeJSPrivateIdentifier())
174
32
      return;
175
35.4k
  }
176
177
714k
  if (Style.Language == FormatStyle::LK_Java) {
178
4.46k
    static const tok::TokenKind JavaRightLogicalShiftAssign[] = {
179
4.46k
        tok::greater, tok::greater, tok::greaterequal};
180
4.46k
    if (tryMergeTokens(JavaRightLogicalShiftAssign, TT_BinaryOperator))
181
2
      return;
182
4.46k
  }
183
714k
}
184
185
714k
bool FormatTokenLexer::tryMergeNSStringLiteral() {
186
714k
  if (Tokens.size() < 2)
187
41.2k
    return false;
188
673k
  auto &At = *(Tokens.end() - 2);
189
673k
  auto &String = *(Tokens.end() - 1);
190
673k
  if (!At->is(tok::at) || 
!String->is(tok::string_literal)1.28k
)
191
673k
    return false;
192
222
  At->Tok.setKind(tok::string_literal);
193
222
  At->TokenText = StringRef(At->TokenText.begin(),
194
222
                            String->TokenText.end() - At->TokenText.begin());
195
222
  At->ColumnWidth += String->ColumnWidth;
196
222
  At->setType(TT_ObjCStringLiteral);
197
222
  Tokens.erase(Tokens.end() - 1);
198
222
  return true;
199
673k
}
200
201
35.4k
bool FormatTokenLexer::tryMergeJSPrivateIdentifier() {
202
  // Merges #idenfier into a single identifier with the text #identifier
203
  // but the token tok::identifier.
204
35.4k
  if (Tokens.size() < 2)
205
2.88k
    return false;
206
32.5k
  auto &Hash = *(Tokens.end() - 2);
207
32.5k
  auto &Identifier = *(Tokens.end() - 1);
208
32.5k
  if (!Hash->is(tok::hash) || 
!Identifier->is(tok::identifier)36
)
209
32.5k
    return false;
210
32
  Hash->Tok.setKind(tok::identifier);
211
32
  Hash->TokenText =
212
32
      StringRef(Hash->TokenText.begin(),
213
32
                Identifier->TokenText.end() - Hash->TokenText.begin());
214
32
  Hash->ColumnWidth += Identifier->ColumnWidth;
215
32
  Hash->setType(TT_JsPrivateIdentifier);
216
32
  Tokens.erase(Tokens.end() - 1);
217
32
  return true;
218
32.5k
}
219
220
// Search for verbatim or interpolated string literals @"ABC" or
221
// $"aaaaa{abc}aaaaa" i and mark the token as TT_CSharpStringLiteral, and to
222
// prevent splitting of @, $ and ".
223
// Merging of multiline verbatim strings with embedded '"' is handled in
224
// handleCSharpVerbatimAndInterpolatedStrings with lower-level lexing.
225
6.34k
bool FormatTokenLexer::tryMergeCSharpStringLiteral() {
226
6.34k
  if (Tokens.size() < 2)
227
401
    return false;
228
229
  // Interpolated strings could contain { } with " characters inside.
230
  // $"{x ?? "null"}"
231
  // should not be split into $"{x ?? ", null, "}" but should treated as a
232
  // single string-literal.
233
  //
234
  // We opt not to try and format expressions inside {} within a C#
235
  // interpolated string. Formatting expressions within an interpolated string
236
  // would require similar work as that done for JavaScript template strings
237
  // in `handleTemplateStrings()`.
238
5.94k
  auto &CSharpInterpolatedString = *(Tokens.end() - 2);
239
5.94k
  if (CSharpInterpolatedString->getType() == TT_CSharpStringLiteral &&
240
5.94k
      
(43
CSharpInterpolatedString->TokenText.startswith(R"($")")43
||
241
43
       
CSharpInterpolatedString->TokenText.startswith(R"($@")")12
)) {
242
37
    int UnmatchedOpeningBraceCount = 0;
243
244
37
    auto TokenTextSize = CSharpInterpolatedString->TokenText.size();
245
989
    for (size_t Index = 0; Index < TokenTextSize; 
++Index952
) {
246
952
      char C = CSharpInterpolatedString->TokenText[Index];
247
952
      if (C == '{') {
248
        // "{{"  inside an interpolated string is an escaped '{' so skip it.
249
49
        if (Index + 1 < TokenTextSize &&
250
49
            CSharpInterpolatedString->TokenText[Index + 1] == '{') {
251
6
          ++Index;
252
6
          continue;
253
6
        }
254
43
        ++UnmatchedOpeningBraceCount;
255
903
      } else if (C == '}') {
256
        // "}}"  inside an interpolated string is an escaped '}' so skip it.
257
43
        if (Index + 1 < TokenTextSize &&
258
43
            CSharpInterpolatedString->TokenText[Index + 1] == '}') {
259
6
          ++Index;
260
6
          continue;
261
6
        }
262
37
        --UnmatchedOpeningBraceCount;
263
37
      }
264
952
    }
265
266
37
    if (UnmatchedOpeningBraceCount > 0) {
267
6
      auto &NextToken = *(Tokens.end() - 1);
268
6
      CSharpInterpolatedString->TokenText =
269
6
          StringRef(CSharpInterpolatedString->TokenText.begin(),
270
6
                    NextToken->TokenText.end() -
271
6
                        CSharpInterpolatedString->TokenText.begin());
272
6
      CSharpInterpolatedString->ColumnWidth += NextToken->ColumnWidth;
273
6
      Tokens.erase(Tokens.end() - 1);
274
6
      return true;
275
6
    }
276
37
  }
277
278
  // Look for @"aaaaaa" or $"aaaaaa".
279
5.93k
  auto &String = *(Tokens.end() - 1);
280
5.93k
  if (!String->is(tok::string_literal))
281
5.82k
    return false;
282
283
113
  auto &At = *(Tokens.end() - 2);
284
113
  if (!(At->is(tok::at) || 
At->TokenText == "$"101
))
285
76
    return false;
286
287
37
  if (Tokens.size() > 2 && 
At->is(tok::at)35
) {
288
12
    auto &Dollar = *(Tokens.end() - 3);
289
12
    if (Dollar->TokenText == "$") {
290
      // This looks like $@"aaaaa" so we need to combine all 3 tokens.
291
6
      Dollar->Tok.setKind(tok::string_literal);
292
6
      Dollar->TokenText =
293
6
          StringRef(Dollar->TokenText.begin(),
294
6
                    String->TokenText.end() - Dollar->TokenText.begin());
295
6
      Dollar->ColumnWidth += (At->ColumnWidth + String->ColumnWidth);
296
6
      Dollar->setType(TT_CSharpStringLiteral);
297
6
      Tokens.erase(Tokens.end() - 2);
298
6
      Tokens.erase(Tokens.end() - 1);
299
6
      return true;
300
6
    }
301
12
  }
302
303
  // Convert back into just a string_literal.
304
31
  At->Tok.setKind(tok::string_literal);
305
31
  At->TokenText = StringRef(At->TokenText.begin(),
306
31
                            String->TokenText.end() - At->TokenText.begin());
307
31
  At->ColumnWidth += String->ColumnWidth;
308
31
  At->setType(TT_CSharpStringLiteral);
309
31
  Tokens.erase(Tokens.end() - 1);
310
31
  return true;
311
37
}
312
313
// Definition of the static string set FormatTokenLexer::CSharpAttributeTargets.
// Valid C# attribute targets:
314
// https://docs.microsoft.com/en-us/dotnet/csharp/programming-guide/concepts/attributes/#attribute-targets
315
const llvm::StringSet<> FormatTokenLexer::CSharpAttributeTargets = {
316
    "assembly", "module",   "field",  "event", "method",
317
    "param",    "property", "return", "type",
318
};
319
320
41.8k
bool FormatTokenLexer::tryMergeNullishCoalescingEqual() {
321
41.8k
  if (Tokens.size() < 2)
322
3.28k
    return false;
323
38.5k
  auto &NullishCoalescing = *(Tokens.end() - 2);
324
38.5k
  auto &Equal = *(Tokens.end() - 1);
325
38.5k
  if (NullishCoalescing->getType() != TT_NullCoalescingOperator ||
326
38.5k
      
!Equal->is(tok::equal)38
)
327
38.5k
    return false;
328
14
  NullishCoalescing->Tok.setKind(tok::equal); // no '??=' in clang tokens.
329
14
  NullishCoalescing->TokenText =
330
14
      StringRef(NullishCoalescing->TokenText.begin(),
331
14
                Equal->TokenText.end() - NullishCoalescing->TokenText.begin());
332
14
  NullishCoalescing->ColumnWidth += Equal->ColumnWidth;
333
14
  NullishCoalescing->setType(TT_NullCoalescingEqual);
334
14
  Tokens.erase(Tokens.end() - 1);
335
14
  return true;
336
38.5k
}
337
338
6.34k
bool FormatTokenLexer::tryMergeCSharpKeywordVariables() {
339
6.34k
  if (Tokens.size() < 2)
340
401
    return false;
341
5.94k
  auto &At = *(Tokens.end() - 2);
342
5.94k
  auto &Keyword = *(Tokens.end() - 1);
343
5.94k
  if (!At->is(tok::at))
344
5.93k
    return false;
345
16
  if (!Keywords.isCSharpKeyword(*Keyword))
346
14
    return false;
347
348
2
  At->Tok.setKind(tok::identifier);
349
2
  At->TokenText = StringRef(At->TokenText.begin(),
350
2
                            Keyword->TokenText.end() - At->TokenText.begin());
351
2
  At->ColumnWidth += Keyword->ColumnWidth;
352
2
  At->setType(Keyword->getType());
353
2
  Tokens.erase(Tokens.end() - 1);
354
2
  return true;
355
16
}
356
357
// In C# transform identifier foreach into kw_foreach
358
6.30k
bool FormatTokenLexer::tryTransformCSharpForEach() {
359
6.30k
  if (Tokens.size() < 1)
360
0
    return false;
361
6.30k
  auto &Identifier = *(Tokens.end() - 1);
362
6.30k
  if (!Identifier->is(tok::identifier))
363
4.52k
    return false;
364
1.77k
  if (Identifier->TokenText != "foreach")
365
1.76k
    return false;
366
367
8
  Identifier->setType(TT_ForEachMacro);
368
8
  Identifier->Tok.setKind(tok::kw_for);
369
8
  return true;
370
1.77k
}
371
372
714k
bool FormatTokenLexer::tryMergeForEach() {
373
714k
  if (Tokens.size() < 2)
374
41.2k
    return false;
375
673k
  auto &For = *(Tokens.end() - 2);
376
673k
  auto &Each = *(Tokens.end() - 1);
377
673k
  if (!For->is(tok::kw_for))
378
673k
    return false;
379
751
  if (!Each->is(tok::identifier))
380
736
    return false;
381
15
  if (Each->TokenText != "each")
382
8
    return false;
383
384
7
  For->setType(TT_ForEachMacro);
385
7
  For->Tok.setKind(tok::kw_for);
386
387
7
  For->TokenText = StringRef(For->TokenText.begin(),
388
7
                             Each->TokenText.end() - For->TokenText.begin());
389
7
  For->ColumnWidth += Each->ColumnWidth;
390
7
  Tokens.erase(Tokens.end() - 1);
391
7
  return true;
392
15
}
393
394
656k
bool FormatTokenLexer::tryTransformTryUsageForC() {
395
656k
  if (Tokens.size() < 2)
396
36.8k
    return false;
397
619k
  auto &Try = *(Tokens.end() - 2);
398
619k
  if (!Try->is(tok::kw_try))
399
619k
    return false;
400
225
  auto &Next = *(Tokens.end() - 1);
401
225
  if (Next->isOneOf(tok::l_brace, tok::colon, tok::hash, tok::comment))
402
195
    return false;
403
404
30
  if (Tokens.size() > 2) {
405
23
    auto &At = *(Tokens.end() - 3);
406
23
    if (At->is(tok::at))
407
2
      return false;
408
23
  }
409
410
28
  Try->Tok.setKind(tok::identifier);
411
28
  return true;
412
30
}
413
414
715k
bool FormatTokenLexer::tryMergeLessLess() {
415
  // Merge X,less,less,Y into X,lessless,Y unless X or Y is less.
416
715k
  if (Tokens.size() < 3)
417
82.2k
    return false;
418
419
633k
  bool FourthTokenIsLess = false;
420
633k
  if (Tokens.size() > 3)
421
593k
    FourthTokenIsLess = (Tokens.end() - 4)[0]->is(tok::less);
422
423
633k
  auto First = Tokens.end() - 3;
424
633k
  if (First[2]->is(tok::less) || 
First[1]->isNot(tok::less)623k
||
425
633k
      
First[0]->isNot(tok::less)11.2k
||
FourthTokenIsLess1.05k
)
426
632k
    return false;
427
428
  // Only merge if there currently is no whitespace between the two "<".
429
982
  if (First[1]->WhitespaceRange.getBegin() !=
430
982
      First[1]->WhitespaceRange.getEnd())
431
0
    return false;
432
433
982
  First[0]->Tok.setKind(tok::lessless);
434
982
  First[0]->TokenText = "<<";
435
982
  First[0]->ColumnWidth += 1;
436
982
  Tokens.erase(Tokens.end() - 2);
437
982
  return true;
438
982
}
439
440
bool FormatTokenLexer::tryMergeTokens(ArrayRef<tok::TokenKind> Kinds,
441
385k
                                      TokenType NewType) {
442
385k
  if (Tokens.size() < Kinds.size())
443
33.9k
    return false;
444
445
351k
  SmallVectorImpl<FormatToken *>::const_iterator First =
446
351k
      Tokens.end() - Kinds.size();
447
351k
  if (!First[0]->is(Kinds[0]))
448
348k
    return false;
449
2.91k
  unsigned AddLength = 0;
450
3.30k
  for (unsigned i = 1; i < Kinds.size(); 
++i384
) {
451
2.98k
    if (!First[i]->is(Kinds[i]) || First[i]->WhitespaceRange.getBegin() !=
452
396
                                       First[i]->WhitespaceRange.getEnd())
453
2.59k
      return false;
454
384
    AddLength += First[i]->TokenText.size();
455
384
  }
456
322
  Tokens.resize(Tokens.size() - Kinds.size() + 1);
457
322
  First[0]->TokenText = StringRef(First[0]->TokenText.data(),
458
322
                                  First[0]->TokenText.size() + AddLength);
459
322
  First[0]->ColumnWidth += AddLength;
460
322
  First[0]->setType(NewType);
461
322
  return true;
462
2.91k
}
463
464
// Returns \c true if \p Tok can only be followed by an operand in JavaScript.
465
336
bool FormatTokenLexer::precedesOperand(FormatToken *Tok) {
466
  // NB: This is not entirely correct, as an r_paren can introduce an operand
467
  // location in e.g. `if (foo) /bar/.exec(...);`. That is a rare enough
468
  // corner case to not matter in practice, though.
469
336
  return Tok->isOneOf(tok::period, tok::l_paren, tok::comma, tok::l_brace,
470
336
                      tok::r_brace, tok::l_square, tok::semi, tok::exclaim,
471
336
                      tok::colon, tok::question, tok::tilde) ||
472
336
         Tok->isOneOf(tok::kw_return, tok::kw_do, tok::kw_case, tok::kw_throw,
473
292
                      tok::kw_else, tok::kw_new, tok::kw_delete, tok::kw_void,
474
292
                      tok::kw_typeof, Keywords.kw_instanceof, Keywords.kw_in) ||
475
336
         
Tok->isBinaryOperator()288
;
476
336
}
477
478
340
// Returns true if a regex literal may start right after \p Prev. A null
// \p Prev (nothing before the slash) always allows one.
bool FormatTokenLexer::canPrecedeRegexLiteral(FormatToken *Prev) {
  if (!Prev)
    return true;

  // Regex literals can only follow after prefix unary operators, not after
  // postfix unary operators. If the '++' is followed by a non-operand
  // introducing token, the slash here is the operand and not the start of a
  // regex.
  // `!` is an unary prefix operator, but also a post-fix operator that casts
  // away nullability, so the same check applies.
  if (Prev->isOneOf(tok::plusplus, tok::minusminus, tok::exclaim)) {
    if (Tokens.size() < 3)
      return true;
    return precedesOperand(Tokens[Tokens.size() - 3]);
  }

  // The previous token must introduce an operand location where regex
  // literals can occur.
  return precedesOperand(Prev);
}
498
499
// Tries to parse a JavaScript Regex literal starting at the current token,
500
// if that begins with a slash and is in a location where JavaScript allows
501
// regex literals. Changes the current token to a regex literal and updates
502
// its text if successful.
503
35.6k
void FormatTokenLexer::tryParseJSRegexLiteral() {
504
35.6k
  FormatToken *RegexToken = Tokens.back();
505
35.6k
  if (!RegexToken->isOneOf(tok::slash, tok::slashequal))
506
35.3k
    return;
507
508
340
  FormatToken *Prev = nullptr;
509
348
  for (auto I = Tokens.rbegin() + 1, E = Tokens.rend(); I != E; 
++I8
) {
510
    // NB: Because previous pointers are not initialized yet, this cannot use
511
    // Token.getPreviousNonComment.
512
344
    if ((*I)->isNot(tok::comment)) {
513
336
      Prev = *I;
514
336
      break;
515
336
    }
516
344
  }
517
518
340
  if (!canPrecedeRegexLiteral(Prev))
519
36
    return;
520
521
  // 'Manually' lex ahead in the current file buffer.
522
304
  const char *Offset = Lex->getBufferLocation();
523
304
  const char *RegexBegin = Offset - RegexToken->TokenText.size();
524
304
  StringRef Buffer = Lex->getBuffer();
525
304
  bool InCharacterClass = false;
526
304
  bool HaveClosingSlash = false;
527
1.91k
  for (; !HaveClosingSlash && 
Offset != Buffer.end()1.61k
;
++Offset1.61k
) {
528
    // Regular expressions are terminated with a '/', which can only be
529
    // escaped using '\' or a character class between '[' and ']'.
530
    // See http://www.ecma-international.org/ecma-262/5.1/#sec-7.8.5.
531
1.61k
    switch (*Offset) {
532
116
    case '\\':
533
      // Skip the escaped character.
534
116
      ++Offset;
535
116
      break;
536
40
    case '[':
537
40
      InCharacterClass = true;
538
40
      break;
539
40
    case ']':
540
40
      InCharacterClass = false;
541
40
      break;
542
320
    case '/':
543
320
      if (!InCharacterClass)
544
304
        HaveClosingSlash = true;
545
320
      break;
546
1.61k
    }
547
1.61k
  }
548
549
304
  RegexToken->setType(TT_RegexLiteral);
550
  // Treat regex literals like other string_literals.
551
304
  RegexToken->Tok.setKind(tok::string_literal);
552
304
  RegexToken->TokenText = StringRef(RegexBegin, Offset - RegexBegin);
553
304
  RegexToken->ColumnWidth = RegexToken->TokenText.size();
554
555
304
  resetLexer(SourceMgr.getFileOffset(Lex->getSourceLocation(Offset)));
556
304
}
557
558
6.43k
void FormatTokenLexer::handleCSharpVerbatimAndInterpolatedStrings() {
559
6.43k
  FormatToken *CSharpStringLiteral = Tokens.back();
560
561
6.43k
  if (CSharpStringLiteral->getType() != TT_CSharpStringLiteral)
562
6.39k
    return;
563
564
  // Deal with multiline strings.
565
43
  if (!(CSharpStringLiteral->TokenText.startswith(R"(@")") ||
566
43
        
CSharpStringLiteral->TokenText.startswith(R"($@")")37
))
567
31
    return;
568
569
12
  const char *StrBegin =
570
12
      Lex->getBufferLocation() - CSharpStringLiteral->TokenText.size();
571
12
  const char *Offset = StrBegin;
572
12
  if (CSharpStringLiteral->TokenText.startswith(R"(@")"))
573
6
    Offset += 2;
574
6
  else // CSharpStringLiteral->TokenText.startswith(R"($@")")
575
6
    Offset += 3;
576
577
  // Look for a terminating '"' in the current file buffer.
578
  // Make no effort to format code within an interpolated or verbatim string.
579
288
  for (; Offset != Lex->getBuffer().end(); 
++Offset276
) {
580
288
    if (Offset[0] == '"') {
581
      // "" within a verbatim string is an escaped double quote: skip it.
582
22
      if (Offset + 1 < Lex->getBuffer().end() && Offset[1] == '"')
583
10
        ++Offset;
584
12
      else
585
12
        break;
586
22
    }
587
288
  }
588
589
  // Make no attempt to format code properly if a verbatim string is
590
  // unterminated.
591
12
  if (Offset == Lex->getBuffer().end())
592
0
    return;
593
594
12
  StringRef LiteralText(StrBegin, Offset - StrBegin + 1);
595
12
  CSharpStringLiteral->TokenText = LiteralText;
596
597
  // Adjust width for potentially multiline string literals.
598
12
  size_t FirstBreak = LiteralText.find('\n');
599
12
  StringRef FirstLineText = FirstBreak == StringRef::npos
600
12
                                ? 
LiteralText10
601
12
                                : 
LiteralText.substr(0, FirstBreak)2
;
602
12
  CSharpStringLiteral->ColumnWidth = encoding::columnWidthWithTabs(
603
12
      FirstLineText, CSharpStringLiteral->OriginalColumn, Style.TabWidth,
604
12
      Encoding);
605
12
  size_t LastBreak = LiteralText.rfind('\n');
606
12
  if (LastBreak != StringRef::npos) {
607
2
    CSharpStringLiteral->IsMultiline = true;
608
2
    unsigned StartColumn = 0;
609
2
    CSharpStringLiteral->LastLineColumnWidth = encoding::columnWidthWithTabs(
610
2
        LiteralText.substr(LastBreak + 1, LiteralText.size()), StartColumn,
611
2
        Style.TabWidth, Encoding);
612
2
  }
613
614
12
  SourceLocation loc = Offset < Lex->getBuffer().end()
615
12
                           ? Lex->getSourceLocation(Offset + 1)
616
12
                           : 
SourceMgr.getLocForEndOfFile(ID)0
;
617
12
  resetLexer(SourceMgr.getFileOffset(loc));
618
12
}
619
620
35.6k
void FormatTokenLexer::handleTemplateStrings() {
621
35.6k
  FormatToken *BacktickToken = Tokens.back();
622
623
35.6k
  if (BacktickToken->is(tok::l_brace)) {
624
1.96k
    StateStack.push(LexerState::NORMAL);
625
1.96k
    return;
626
1.96k
  }
627
33.7k
  if (BacktickToken->is(tok::r_brace)) {
628
2.06k
    if (StateStack.size() == 1)
629
4
      return;
630
2.05k
    StateStack.pop();
631
2.05k
    if (StateStack.top() != LexerState::TEMPLATE_STRING)
632
1.95k
      return;
633
    // If back in TEMPLATE_STRING, fallthrough and continue parsing the
634
31.6k
  } else if (BacktickToken->is(tok::unknown) &&
635
31.6k
             
BacktickToken->TokenText == "`"148
) {
636
148
    StateStack.push(LexerState::TEMPLATE_STRING);
637
31.5k
  } else {
638
31.5k
    return; // Not actually a template
639
31.5k
  }
640
641
  // 'Manually' lex ahead in the current file buffer.
642
248
  const char *Offset = Lex->getBufferLocation();
643
248
  const char *TmplBegin = Offset - BacktickToken->TokenText.size(); // at "`"
644
1.90k
  for (; Offset != Lex->getBuffer().end(); 
++Offset1.65k
) {
645
1.90k
    if (Offset[0] == '`') {
646
148
      StateStack.pop();
647
148
      break;
648
148
    }
649
1.75k
    if (Offset[0] == '\\') {
650
8
      ++Offset; // Skip the escaped character.
651
1.74k
    } else if (Offset + 1 < Lex->getBuffer().end() && Offset[0] == '$' &&
652
1.74k
               
Offset[1] == '{'100
) {
653
      // '${' introduces an expression interpolation in the template string.
654
100
      StateStack.push(LexerState::NORMAL);
655
100
      ++Offset;
656
100
      break;
657
100
    }
658
1.75k
  }
659
660
248
  StringRef LiteralText(TmplBegin, Offset - TmplBegin + 1);
661
248
  BacktickToken->setType(TT_TemplateString);
662
248
  BacktickToken->Tok.setKind(tok::string_literal);
663
248
  BacktickToken->TokenText = LiteralText;
664
665
  // Adjust width for potentially multiline string literals.
666
248
  size_t FirstBreak = LiteralText.find('\n');
667
248
  StringRef FirstLineText = FirstBreak == StringRef::npos
668
248
                                ? 
LiteralText212
669
248
                                : 
LiteralText.substr(0, FirstBreak)36
;
670
248
  BacktickToken->ColumnWidth = encoding::columnWidthWithTabs(
671
248
      FirstLineText, BacktickToken->OriginalColumn, Style.TabWidth, Encoding);
672
248
  size_t LastBreak = LiteralText.rfind('\n');
673
248
  if (LastBreak != StringRef::npos) {
674
36
    BacktickToken->IsMultiline = true;
675
36
    unsigned StartColumn = 0; // The template tail spans the entire line.
676
36
    BacktickToken->LastLineColumnWidth = encoding::columnWidthWithTabs(
677
36
        LiteralText.substr(LastBreak + 1, LiteralText.size()), StartColumn,
678
36
        Style.TabWidth, Encoding);
679
36
  }
680
681
248
  SourceLocation loc = Offset < Lex->getBuffer().end()
682
248
                           ? Lex->getSourceLocation(Offset + 1)
683
248
                           : 
SourceMgr.getLocForEndOfFile(ID)0
;
684
248
  resetLexer(SourceMgr.getFileOffset(loc));
685
248
}
686
687
6.72k
void FormatTokenLexer::tryParsePythonComment() {
688
6.72k
  FormatToken *HashToken = Tokens.back();
689
6.72k
  if (!HashToken->isOneOf(tok::hash, tok::hashhash))
690
6.66k
    return;
691
  // Turn the remainder of this line into a comment.
692
65
  const char *CommentBegin =
693
65
      Lex->getBufferLocation() - HashToken->TokenText.size(); // at "#"
694
65
  size_t From = CommentBegin - Lex->getBuffer().begin();
695
65
  size_t To = Lex->getBuffer().find_first_of('\n', From);
696
65
  if (To == StringRef::npos)
697
7
    To = Lex->getBuffer().size();
698
65
  size_t Len = To - From;
699
65
  HashToken->setType(TT_LineComment);
700
65
  HashToken->Tok.setKind(tok::comment);
701
65
  HashToken->TokenText = Lex->getBuffer().substr(From, Len);
702
65
  SourceLocation Loc = To < Lex->getBuffer().size()
703
65
                           ? 
Lex->getSourceLocation(CommentBegin + Len)58
704
65
                           : 
SourceMgr.getLocForEndOfFile(ID)7
;
705
65
  resetLexer(SourceMgr.getFileOffset(Loc));
706
65
}
707
708
716k
bool FormatTokenLexer::tryMerge_TMacro() {
709
716k
  if (Tokens.size() < 4)
710
122k
    return false;
711
593k
  FormatToken *Last = Tokens.back();
712
593k
  if (!Last->is(tok::r_paren))
713
539k
    return false;
714
715
53.6k
  FormatToken *String = Tokens[Tokens.size() - 2];
716
53.6k
  if (!String->is(tok::string_literal) || 
String->IsMultiline655
)
717
53.0k
    return false;
718
719
622
  if (!Tokens[Tokens.size() - 3]->is(tok::l_paren))
720
276
    return false;
721
722
346
  FormatToken *Macro = Tokens[Tokens.size() - 4];
723
346
  if (Macro->TokenText != "_T")
724
331
    return false;
725
726
15
  const char *Start = Macro->TokenText.data();
727
15
  const char *End = Last->TokenText.data() + Last->TokenText.size();
728
15
  String->TokenText = StringRef(Start, End - Start);
729
15
  String->IsFirst = Macro->IsFirst;
730
15
  String->LastNewlineOffset = Macro->LastNewlineOffset;
731
15
  String->WhitespaceRange = Macro->WhitespaceRange;
732
15
  String->OriginalColumn = Macro->OriginalColumn;
733
15
  String->ColumnWidth = encoding::columnWidthWithTabs(
734
15
      String->TokenText, String->OriginalColumn, Style.TabWidth, Encoding);
735
15
  String->NewlinesBefore = Macro->NewlinesBefore;
736
15
  String->HasUnescapedNewline = Macro->HasUnescapedNewline;
737
738
15
  Tokens.pop_back();
739
15
  Tokens.pop_back();
740
15
  Tokens.pop_back();
741
15
  Tokens.back() = String;
742
15
  return true;
743
346
}
744
745
716k
bool FormatTokenLexer::tryMergeConflictMarkers() {
746
716k
  if (Tokens.back()->NewlinesBefore == 0 && 
Tokens.back()->isNot(tok::eof)647k
)
747
609k
    return false;
748
749
  // Conflict lines look like:
750
  // <marker> <text from the vcs>
751
  // For example:
752
  // >>>>>>> /file/in/file/system at revision 1234
753
  //
754
  // We merge all tokens in a line that starts with a conflict marker
755
  // into a single token with a special token type that the unwrapped line
756
  // parser will use to correctly rebuild the underlying code.
757
758
106k
  FileID ID;
759
  // Get the position of the first token in the line.
760
106k
  unsigned FirstInLineOffset;
761
106k
  std::tie(ID, FirstInLineOffset) = SourceMgr.getDecomposedLoc(
762
106k
      Tokens[FirstInLineIndex]->getStartOfNonWhitespace());
763
106k
  StringRef Buffer = SourceMgr.getBufferOrFake(ID).getBuffer();
764
  // Calculate the offset of the start of the current line.
765
106k
  auto LineOffset = Buffer.rfind('\n', FirstInLineOffset);
766
106k
  if (LineOffset == StringRef::npos) {
767
39.9k
    LineOffset = 0;
768
66.0k
  } else {
769
66.0k
    ++LineOffset;
770
66.0k
  }
771
772
106k
  auto FirstSpace = Buffer.find_first_of(" \n", LineOffset);
773
106k
  StringRef LineStart;
774
106k
  if (FirstSpace == StringRef::npos) {
775
6.18k
    LineStart = Buffer.substr(LineOffset);
776
99.8k
  } else {
777
99.8k
    LineStart = Buffer.substr(LineOffset, FirstSpace - LineOffset);
778
99.8k
  }
779
780
106k
  TokenType Type = TT_Unknown;
781
106k
  if (LineStart == "<<<<<<<" || 
LineStart == ">>>>"106k
) {
782
9
    Type = TT_ConflictStart;
783
106k
  } else if (LineStart == "|||||||" || 
LineStart == "======="106k
||
784
106k
             
LineStart == "===="106k
) {
785
18
    Type = TT_ConflictAlternative;
786
106k
  } else if (LineStart == ">>>>>>>" || 
LineStart == "<<<<"106k
) {
787
9
    Type = TT_ConflictEnd;
788
9
  }
789
790
106k
  if (Type != TT_Unknown) {
791
36
    FormatToken *Next = Tokens.back();
792
793
36
    Tokens.resize(FirstInLineIndex + 1);
794
    // We do not need to build a complete token here, as we will skip it
795
    // during parsing anyway (as we must not touch whitespace around conflict
796
    // markers).
797
36
    Tokens.back()->setType(Type);
798
36
    Tokens.back()->Tok.setKind(tok::kw___unknown_anytype);
799
800
36
    Tokens.push_back(Next);
801
36
    return true;
802
36
  }
803
804
106k
  return false;
805
106k
}
806
807
1.57k
/// Creates the synthesized second '>' or '<' token that follows the current
/// token.
///
/// The new token copies the lexer token and text of the current FormatTok, is
/// located at the last character of that lexer token, has an empty whitespace
/// range and a column width of one, and sits one column further to the right.
/// It becomes the new current token.
FormatToken *FormatTokenLexer::getStashedToken() {
  const FormatToken *FirstHalf = FormatTok;

  FormatToken *SecondHalf = new (Allocator.Allocate()) FormatToken;
  SecondHalf->Tok = FirstHalf->Tok;
  SecondHalf->TokenText = FirstHalf->TokenText;
  SecondHalf->ColumnWidth = 1;
  SecondHalf->OriginalColumn = FirstHalf->OriginalColumn + 1;

  // Point at the final character of the original lexer token.
  const SourceLocation Loc = FirstHalf->Tok.getLocation().getLocWithOffset(
      FirstHalf->Tok.getLength() - 1);
  SecondHalf->Tok.setLocation(Loc);
  SecondHalf->WhitespaceRange = SourceRange(Loc, Loc);

  FormatTok = SecondHalf;
  return FormatTok;
}
825
826
716k
/// Lexes and returns the next non-whitespace token.
///
/// If a token was stashed (the second half of a split '>>' or '<<'), that one
/// is returned instead. Otherwise raw tokens are read until a significant one
/// is found; the whitespace in front of it is recorded on the token (newline
/// counts, whitespace range, original column). Afterwards the token is
/// post-processed: comments are right-trimmed, raw identifiers are resolved
/// through the identifier table, '>>' and '<<' are split, column widths are
/// computed and, for C++, macro-related token types are assigned.
FormatToken *FormatTokenLexer::getNextToken() {
  if (StateStack.top() == LexerState::TOKEN_STASHED) {
    StateStack.pop();
    return getStashedToken();
  }

  FormatTok = new (Allocator.Allocate()) FormatToken;
  readRawToken(*FormatTok);
  // Whitespace trimmed off the end of the previous token counts as leading
  // whitespace of this one.
  const SourceLocation WhitespaceBegin =
      FormatTok->Tok.getLocation().getLocWithOffset(-TrailingWhitespace);
  FormatTok->IsFirst = IsFirstToken;
  IsFirstToken = false;

  // Consume and record whitespace until we find a significant token.
  unsigned WhitespaceLen = TrailingWhitespace;
  while (FormatTok->Tok.is(tok::unknown)) {
    const StringRef Raw = FormatTok->TokenText;

    // Tells whether the newline following position Pos is escaped.
    const auto EscapesNewline = [&Raw](int Pos) {
      // A '\r' here is just part of '\r\n'. Skip it.
      if (Pos >= 0 && Raw[Pos] == '\r')
        --Pos;
      // See whether there is an odd number of '\' before this.
      // FIXME: This is wrong. A '\' followed by a newline is always removed,
      // regardless of whether there is another '\' before it.
      // FIXME: Newlines can also be escaped by a '?' '?' '/' trigraph.
      unsigned Backslashes = 0;
      while (Pos >= 0 && Raw[Pos] == '\\') {
        --Pos;
        ++Backslashes;
      }
      return Backslashes & 1;
    };

    // FIXME: This miscounts tok:unknown tokens that are not just
    // whitespace, e.g. a '`' character.
    const int End = Raw.size();
    for (int Idx = 0; Idx != End; ++Idx) {
      switch (Raw[Idx]) {
      case ' ':
        ++Column;
        break;
      case '\t':
        // Advance to the next tab stop; a tab width of zero adds nothing.
        if (Style.TabWidth)
          Column += Style.TabWidth - Column % Style.TabWidth;
        break;
      case '\n':
        ++FormatTok->NewlinesBefore;
        FormatTok->HasUnescapedNewline = !EscapesNewline(Idx - 1);
        FormatTok->LastNewlineOffset = WhitespaceLen + Idx + 1;
        Column = 0;
        break;
      case '\r':
        FormatTok->LastNewlineOffset = WhitespaceLen + Idx + 1;
        Column = 0;
        break;
      case '\f':
      case '\v':
        Column = 0;
        break;
      case '\\': {
        // A backslash only counts as whitespace when it is directly followed
        // by a line break.
        const bool BeforeLineBreak =
            Idx + 1 != End && (Raw[Idx + 1] == '\r' || Raw[Idx + 1] == '\n');
        if (!BeforeLineBreak)
          FormatTok->setType(TT_ImplicitStringLiteral);
        break;
      }
      default:
        FormatTok->setType(TT_ImplicitStringLiteral);
        break;
      }
      if (FormatTok->getType() == TT_ImplicitStringLiteral)
        break;
    }

    if (FormatTok->is(TT_ImplicitStringLiteral))
      break;
    WhitespaceLen += FormatTok->Tok.getLength();

    readRawToken(*FormatTok);
  }

  // JavaScript and Java do not allow to escape the end of the line with a
  // backslash. Backslashes are syntax errors in plain source, but can occur in
  // comments. When a single line comment ends with a \, it'll cause the next
  // line of code to be lexed as a comment, breaking formatting. The code below
  // finds comments that contain a backslash followed by a line break, truncates
  // the comment token at the backslash, and resets the lexer to restart behind
  // the backslash.
  const bool IsJavaLike = Style.Language == FormatStyle::LK_JavaScript ||
                          Style.Language == FormatStyle::LK_Java;
  if (IsJavaLike && FormatTok->is(tok::comment) &&
      FormatTok->TokenText.startswith("//")) {
    const StringRef Comment = FormatTok->TokenText;
    for (size_t Pos = Comment.find('\\'); Pos != StringRef::npos;
         Pos = Comment.find('\\', Pos + 1)) {
      if (Pos + 1 >= Comment.size() || Comment[Pos + 1] != '\n')
        continue;
      // The lexer currently sits behind the whole comment; move back to the
      // character right after the backslash.
      const char *Resume = Lex->getBufferLocation();
      Resume -= Comment.size();
      Resume += Pos + 1;
      resetLexer(SourceMgr.getFileOffset(Lex->getSourceLocation(Resume)));
      FormatTok->TokenText = Comment.substr(0, Pos + 1);
      FormatTok->ColumnWidth = encoding::columnWidthWithTabs(
          FormatTok->TokenText, FormatTok->OriginalColumn, Style.TabWidth,
          Encoding);
      break;
    }
  }

  // In case the token starts with escaped newlines, we want to
  // take them into account as whitespace - this pattern is quite frequent
  // in macro definitions.
  // FIXME: Add a more explicit test.
  for (;;) {
    const StringRef Head = FormatTok->TokenText;
    unsigned Skipped = 0;
    if (Head.startswith("\\\r\n"))
      Skipped = 3;
    else if (Head.startswith("\\\n"))
      Skipped = 2;
    else
      break;

    ++FormatTok->NewlinesBefore;
    WhitespaceLen += Skipped;
    FormatTok->LastNewlineOffset = Skipped;
    Column = 0;
    FormatTok->TokenText = Head.substr(Skipped);
  }

  FormatTok->WhitespaceRange = SourceRange(
      WhitespaceBegin, WhitespaceBegin.getLocWithOffset(WhitespaceLen));

  FormatTok->OriginalColumn = Column;

  TrailingWhitespace = 0;
  switch (FormatTok->Tok.getKind()) {
  case tok::comment: {
    // FIXME: Add the trimmed whitespace to Column.
    const size_t UntrimmedSize = FormatTok->TokenText.size();
    FormatTok->TokenText = FormatTok->TokenText.rtrim(" \t\v\f");
    TrailingWhitespace = UntrimmedSize - FormatTok->TokenText.size();
    break;
  }
  case tok::raw_identifier: {
    IdentifierInfo &Info = IdentTable.get(FormatTok->TokenText);
    FormatTok->Tok.setIdentifierInfo(&Info);
    FormatTok->Tok.setKind(Info.getTokenID());
    // Some C++ keywords are turned back into plain identifiers for Java and
    // JavaScript.
    const bool DemoteToIdentifier =
        (Style.Language == FormatStyle::LK_Java &&
         FormatTok->isOneOf(tok::kw_struct, tok::kw_union, tok::kw_delete,
                            tok::kw_operator)) ||
        (Style.Language == FormatStyle::LK_JavaScript &&
         FormatTok->isOneOf(tok::kw_struct, tok::kw_union, tok::kw_operator));
    if (DemoteToIdentifier) {
      FormatTok->Tok.setKind(tok::identifier);
      FormatTok->Tok.setIdentifierInfo(nullptr);
    }
    break;
  }
  case tok::greatergreater:
  case tok::lessless: {
    // Split '>>' / '<<' into two single-character tokens; the second one is
    // produced by the next call via the stashed-token state.
    const bool IsShiftRight = FormatTok->Tok.is(tok::greatergreater);
    FormatTok->Tok.setKind(IsShiftRight ? tok::greater : tok::less);
    FormatTok->TokenText = FormatTok->TokenText.substr(0, 1);
    ++Column;
    StateStack.push(LexerState::TOKEN_STASHED);
    break;
  }
  default:
    break;
  }

  // Now FormatTok is the next non-whitespace token.

  const StringRef TokText = FormatTok->TokenText;
  const size_t FirstNewline = TokText.find('\n');
  // FIXME: ColumnWidth actually depends on the start column, we need to
  // take this into account when the token is moved.
  // For single-line tokens FirstNewline is npos, so the substr below is the
  // whole text.
  FormatTok->ColumnWidth = encoding::columnWidthWithTabs(
      TokText.substr(0, FirstNewline), Column, Style.TabWidth, Encoding);
  if (FirstNewline != StringRef::npos) {
    FormatTok->IsMultiline = true;
    // The last line of the token always starts in column 0.
    // Thus, the length can be precomputed even in the presence of tabs.
    FormatTok->LastLineColumnWidth = encoding::columnWidthWithTabs(
        TokText.substr(TokText.rfind('\n') + 1), 0, Style.TabWidth, Encoding);
    Column = FormatTok->LastLineColumnWidth;
  } else {
    Column += FormatTok->ColumnWidth;
  }

  if (!Style.isCpp())
    return FormatTok;

  const auto MacroIt = Macros.find(FormatTok->Tok.getIdentifierInfo());
  // A name that directly follows '#define' is being defined, not used, so it
  // is not given its configured macro type.
  const IdentifierInfo *PrevInfo =
      Tokens.empty() ? nullptr : Tokens.back()->Tok.getIdentifierInfo();
  const bool FollowsDefine =
      PrevInfo && PrevInfo->getPPKeywordID() == tok::pp_define;

  if (!FollowsDefine && MacroIt != Macros.end()) {
    FormatTok->setType(MacroIt->second);
    if (MacroIt->second == TT_IfMacro) {
      // The lexer token currently has type tok::kw_unknown. However, for this
      // substitution to be treated correctly in the TokenAnnotator, faking
      // the tok value seems to be needed. Not sure if there's a more elegant
      // way.
      FormatTok->Tok.setKind(tok::kw_if);
    }
  } else if (FormatTok->is(tok::identifier)) {
    if (MacroBlockBeginRegex.match(TokText))
      FormatTok->setType(TT_MacroBlockBegin);
    else if (MacroBlockEndRegex.match(TokText))
      FormatTok->setType(TT_MacroBlockEnd);
  }

  return FormatTok;
}
1037
1038
1.03M
/// Lexes one raw token into \p Tok and applies formatter-specific fixups.
///
/// Sets the token text from the source buffer, reclassifies some tokens as
/// string literals, and tracks the "clang-format on/off" comments. The
/// token's Finalized flag reflects whether formatting is disabled; the "on"
/// comment takes effect for the comment token itself, the "off" comment only
/// for the tokens that follow it.
void FormatTokenLexer::readRawToken(FormatToken &Tok) {
  Lex->LexFromRawLexer(Tok.Tok);
  Tok.TokenText = StringRef(SourceMgr.getCharacterData(Tok.Tok.getLocation()),
                            Tok.Tok.getLength());

  const bool IsJavaScript = Style.Language == FormatStyle::LK_JavaScript;

  // For formatting, treat unterminated string literals like normal string
  // literals.
  if (Tok.is(tok::unknown)) {
    if (Tok.TokenText.startswith("\"")) {
      Tok.Tok.setKind(tok::string_literal);
      Tok.IsUnterminatedLiteral = true;
    } else if (IsJavaScript && Tok.TokenText == "''") {
      Tok.Tok.setKind(tok::string_literal);
    }
  }

  // In these languages character constants are handled as string literals.
  const bool CharConstantIsString =
      IsJavaScript || Style.Language == FormatStyle::LK_Proto ||
      Style.Language == FormatStyle::LK_TextProto;
  if (CharConstantIsString && Tok.is(tok::char_constant))
    Tok.Tok.setKind(tok::string_literal);

  const bool IsComment = Tok.is(tok::comment);
  const StringRef Text = Tok.TokenText;

  if (IsComment &&
      (Text == "// clang-format on" || Text == "/* clang-format on */"))
    FormattingDisabled = false;

  Tok.Finalized = FormattingDisabled;

  if (IsComment &&
      (Text == "// clang-format off" || Text == "/* clang-format off */"))
    FormattingDisabled = true;
}
1073
1074
641
void FormatTokenLexer::resetLexer(unsigned Offset) {
1075
641
  StringRef Buffer = SourceMgr.getBufferData(ID);
1076
641
  Lex.reset(new Lexer(SourceMgr.getLocForStartOfFile(ID),
1077
641
                      getFormattingLangOpts(Style), Buffer.begin(),
1078
641
                      Buffer.begin() + Offset, Buffer.end()));
1079
641
  Lex->SetKeepWhitespaceMode(true);
1080
641
  TrailingWhitespace = 0;
1081
641
}
1082
1083
} // namespace format
1084
} // namespace clang