Coverage Report

Created: 2022-01-18 06:27

/Users/buildslave/jenkins/workspace/coverage/llvm-project/clang/lib/Format/FormatTokenLexer.cpp
Line
Count
Source (jump to first uncovered line)
1
//===--- FormatTokenLexer.cpp - Lex FormatTokens -------------*- C++ ----*-===//
2
//
3
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4
// See https://llvm.org/LICENSE.txt for license information.
5
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6
//
7
//===----------------------------------------------------------------------===//
8
///
9
/// \file
10
/// This file implements FormatTokenLexer, which tokenizes a source file
11
/// into a FormatToken stream suitable for ClangFormat.
12
///
13
//===----------------------------------------------------------------------===//
14
15
#include "FormatTokenLexer.h"
16
#include "FormatToken.h"
17
#include "clang/Basic/SourceLocation.h"
18
#include "clang/Basic/SourceManager.h"
19
#include "clang/Format/Format.h"
20
#include "llvm/Support/Regex.h"
21
22
namespace clang {
23
namespace format {
24
25
FormatTokenLexer::FormatTokenLexer(
26
    const SourceManager &SourceMgr, FileID ID, unsigned Column,
27
    const FormatStyle &Style, encoding::Encoding Encoding,
28
    llvm::SpecificBumpPtrAllocator<FormatToken> &Allocator,
29
    IdentifierTable &IdentTable)
30
    : FormatTok(nullptr), IsFirstToken(true), StateStack({LexerState::NORMAL}),
31
      Column(Column), TrailingWhitespace(0), SourceMgr(SourceMgr), ID(ID),
32
      Style(Style), IdentTable(IdentTable), Keywords(IdentTable),
33
      Encoding(Encoding), Allocator(Allocator), FirstInLineIndex(0),
34
      FormattingDisabled(false), MacroBlockBeginRegex(Style.MacroBlockBegin),
35
49.7k
      MacroBlockEndRegex(Style.MacroBlockEnd) {
36
49.7k
  Lex.reset(new Lexer(ID, SourceMgr.getBufferOrFake(ID), SourceMgr,
37
49.7k
                      getFormattingLangOpts(Style)));
38
49.7k
  Lex->SetKeepWhitespaceMode(true);
39
40
149k
  for (const std::string &ForEachMacro : Style.ForEachMacros) {
41
149k
    auto Identifier = &IdentTable.get(ForEachMacro);
42
149k
    Macros.insert({Identifier, TT_ForEachMacro});
43
149k
  }
44
51.9k
  for (const std::string &IfMacro : Style.IfMacros) {
45
51.9k
    auto Identifier = &IdentTable.get(IfMacro);
46
51.9k
    Macros.insert({Identifier, TT_IfMacro});
47
51.9k
  }
48
50.5k
  for (const std::string &AttributeMacro : Style.AttributeMacros) {
49
50.5k
    auto Identifier = &IdentTable.get(AttributeMacro);
50
50.5k
    Macros.insert({Identifier, TT_AttributeMacro});
51
50.5k
  }
52
99.6k
  for (const std::string &StatementMacro : Style.StatementMacros) {
53
99.6k
    auto Identifier = &IdentTable.get(StatementMacro);
54
99.6k
    Macros.insert({Identifier, TT_StatementMacro});
55
99.6k
  }
56
49.7k
  for (const std::string &TypenameMacro : Style.TypenameMacros) {
57
595
    auto Identifier = &IdentTable.get(TypenameMacro);
58
595
    Macros.insert({Identifier, TT_TypenameMacro});
59
595
  }
60
49.7k
  for (const std::string &NamespaceMacro : Style.NamespaceMacros) {
61
169
    auto Identifier = &IdentTable.get(NamespaceMacro);
62
169
    Macros.insert({Identifier, TT_NamespaceMacro});
63
169
  }
64
49.7k
  for (const std::string &WhitespaceSensitiveMacro :
65
248k
       Style.WhitespaceSensitiveMacros) {
66
248k
    auto Identifier = &IdentTable.get(WhitespaceSensitiveMacro);
67
248k
    Macros.insert({Identifier, TT_UntouchableMacroFunc});
68
248k
  }
69
49.7k
  for (const std::string &StatementAttributeLikeMacro :
70
49.7k
       Style.StatementAttributeLikeMacros) {
71
49.7k
    auto Identifier = &IdentTable.get(StatementAttributeLikeMacro);
72
49.7k
    Macros.insert({Identifier, TT_StatementAttributeLikeMacro});
73
49.7k
  }
74
49.7k
}
75
76
49.7k
ArrayRef<FormatToken *> FormatTokenLexer::lex() {
77
49.7k
  assert(Tokens.empty());
78
0
  assert(FirstInLineIndex == 0);
79
832k
  do {
80
832k
    Tokens.push_back(getNextToken());
81
832k
    if (Style.isJavaScript()) {
82
37.5k
      tryParseJSRegexLiteral();
83
37.5k
      handleTemplateStrings();
84
37.5k
    }
85
832k
    if (Style.Language == FormatStyle::LK_TextProto)
86
6.72k
      tryParsePythonComment();
87
832k
    tryMergePreviousTokens();
88
832k
    if (Style.isCSharp())
89
      // This needs to come after tokens have been merged so that C#
90
      // string literals are correctly identified.
91
8.82k
      handleCSharpVerbatimAndInterpolatedStrings();
92
832k
    if (Tokens.back()->NewlinesBefore > 0 || 
Tokens.back()->IsMultiline755k
)
93
77.6k
      FirstInLineIndex = Tokens.size() - 1;
94
832k
  } while (Tokens.back()->Tok.isNot(tok::eof));
95
49.7k
  return Tokens;
96
49.7k
}
97
98
832k
void FormatTokenLexer::tryMergePreviousTokens() {
99
832k
  if (tryMerge_TMacro())
100
18
    return;
101
832k
  if (tryMergeConflictMarkers())
102
43
    return;
103
832k
  if (tryMergeLessLess())
104
982
    return;
105
831k
  if (tryMergeForEach())
106
7
    return;
107
831k
  if (Style.isCpp() && 
tryTransformTryUsageForC()768k
)
108
28
    return;
109
110
831k
  if (Style.isJavaScript() || 
Style.isCSharp()793k
) {
111
46.3k
    static const tok::TokenKind NullishCoalescingOperator[] = {tok::question,
112
46.3k
                                                               tok::question};
113
46.3k
    static const tok::TokenKind NullPropagatingOperator[] = {tok::question,
114
46.3k
                                                             tok::period};
115
46.3k
    static const tok::TokenKind FatArrow[] = {tok::equal, tok::greater};
116
117
46.3k
    if (tryMergeTokens(FatArrow, TT_FatArrow))
118
236
      return;
119
46.1k
    if (tryMergeTokens(NullishCoalescingOperator, TT_NullCoalescingOperator)) {
120
      // Treat like the "||" operator (as opposed to the ternary ?).
121
38
      Tokens.back()->Tok.setKind(tok::pipepipe);
122
38
      return;
123
38
    }
124
46.1k
    if (tryMergeTokens(NullPropagatingOperator, TT_NullPropagatingOperator)) {
125
      // Treat like a regular "." access.
126
22
      Tokens.back()->Tok.setKind(tok::period);
127
22
      return;
128
22
    }
129
46.0k
    if (tryMergeNullishCoalescingEqual()) {
130
14
      return;
131
14
    }
132
46.0k
  }
133
134
830k
  if (Style.isCSharp()) {
135
8.70k
    static const tok::TokenKind CSharpNullConditionalLSquare[] = {
136
8.70k
        tok::question, tok::l_square};
137
138
8.70k
    if (tryMergeCSharpKeywordVariables())
139
8
      return;
140
8.70k
    if (tryMergeCSharpStringLiteral())
141
43
      return;
142
8.65k
    if (tryTransformCSharpForEach())
143
8
      return;
144
8.64k
    if (tryMergeTokens(CSharpNullConditionalLSquare,
145
8.64k
                       TT_CSharpNullConditionalLSquare)) {
146
      // Treat like a regular "[" operator.
147
8
      Tokens.back()->Tok.setKind(tok::l_square);
148
8
      return;
149
8
    }
150
8.64k
  }
151
152
830k
  if (tryMergeNSStringLiteral())
153
222
    return;
154
155
830k
  if (Style.isJavaScript()) {
156
37.3k
    static const tok::TokenKind JSIdentity[] = {tok::equalequal, tok::equal};
157
37.3k
    static const tok::TokenKind JSNotIdentity[] = {tok::exclaimequal,
158
37.3k
                                                   tok::equal};
159
37.3k
    static const tok::TokenKind JSShiftEqual[] = {tok::greater, tok::greater,
160
37.3k
                                                  tok::greaterequal};
161
37.3k
    static const tok::TokenKind JSExponentiation[] = {tok::star, tok::star};
162
37.3k
    static const tok::TokenKind JSExponentiationEqual[] = {tok::star,
163
37.3k
                                                           tok::starequal};
164
37.3k
    static const tok::TokenKind JSPipePipeEqual[] = {tok::pipepipe, tok::equal};
165
37.3k
    static const tok::TokenKind JSAndAndEqual[] = {tok::ampamp, tok::equal};
166
167
    // FIXME: Investigate what token type gives the correct operator priority.
168
37.3k
    if (tryMergeTokens(JSIdentity, TT_BinaryOperator))
169
12
      return;
170
37.3k
    if (tryMergeTokens(JSNotIdentity, TT_BinaryOperator))
171
12
      return;
172
37.3k
    if (tryMergeTokens(JSShiftEqual, TT_BinaryOperator))
173
10
      return;
174
37.3k
    if (tryMergeTokens(JSExponentiation, TT_JsExponentiation))
175
4
      return;
176
37.3k
    if (tryMergeTokens(JSExponentiationEqual, TT_JsExponentiationEqual)) {
177
4
      Tokens.back()->Tok.setKind(tok::starequal);
178
4
      return;
179
4
    }
180
37.3k
    if (tryMergeTokens(JSAndAndEqual, TT_JsAndAndEqual) ||
181
37.3k
        
tryMergeTokens(JSPipePipeEqual, TT_JsPipePipeEqual)37.3k
) {
182
      // Treat like the "=" assignment operator.
183
8
      Tokens.back()->Tok.setKind(tok::equal);
184
8
      return;
185
8
    }
186
37.3k
    if (tryMergeJSPrivateIdentifier())
187
32
      return;
188
37.3k
  }
189
190
830k
  if (Style.Language == FormatStyle::LK_Java) {
191
4.52k
    static const tok::TokenKind JavaRightLogicalShiftAssign[] = {
192
4.52k
        tok::greater, tok::greater, tok::greaterequal};
193
4.52k
    if (tryMergeTokens(JavaRightLogicalShiftAssign, TT_BinaryOperator))
194
2
      return;
195
4.52k
  }
196
830k
}
197
198
830k
bool FormatTokenLexer::tryMergeNSStringLiteral() {
199
830k
  if (Tokens.size() < 2)
200
49.7k
    return false;
201
780k
  auto &At = *(Tokens.end() - 2);
202
780k
  auto &String = *(Tokens.end() - 1);
203
780k
  if (!At->is(tok::at) || 
!String->is(tok::string_literal)1.30k
)
204
780k
    return false;
205
222
  At->Tok.setKind(tok::string_literal);
206
222
  At->TokenText = StringRef(At->TokenText.begin(),
207
222
                            String->TokenText.end() - At->TokenText.begin());
208
222
  At->ColumnWidth += String->ColumnWidth;
209
222
  At->setType(TT_ObjCStringLiteral);
210
222
  Tokens.erase(Tokens.end() - 1);
211
222
  return true;
212
780k
}
213
214
37.3k
bool FormatTokenLexer::tryMergeJSPrivateIdentifier() {
215
  // Merges #idenfier into a single identifier with the text #identifier
216
  // but the token tok::identifier.
217
37.3k
  if (Tokens.size() < 2)
218
3.01k
    return false;
219
34.3k
  auto &Hash = *(Tokens.end() - 2);
220
34.3k
  auto &Identifier = *(Tokens.end() - 1);
221
34.3k
  if (!Hash->is(tok::hash) || 
!Identifier->is(tok::identifier)36
)
222
34.2k
    return false;
223
32
  Hash->Tok.setKind(tok::identifier);
224
32
  Hash->TokenText =
225
32
      StringRef(Hash->TokenText.begin(),
226
32
                Identifier->TokenText.end() - Hash->TokenText.begin());
227
32
  Hash->ColumnWidth += Identifier->ColumnWidth;
228
32
  Hash->setType(TT_JsPrivateIdentifier);
229
32
  Tokens.erase(Tokens.end() - 1);
230
32
  return true;
231
34.3k
}
232
233
// Search for verbatim or interpolated string literals @"ABC" or
234
// $"aaaaa{abc}aaaaa" i and mark the token as TT_CSharpStringLiteral, and to
235
// prevent splitting of @, $ and ".
236
// Merging of multiline verbatim strings with embedded '"' is handled in
237
// handleCSharpVerbatimAndInterpolatedStrings with lower-level lexing.
238
8.70k
bool FormatTokenLexer::tryMergeCSharpStringLiteral() {
239
8.70k
  if (Tokens.size() < 2)
240
507
    return false;
241
242
  // Interpolated strings could contain { } with " characters inside.
243
  // $"{x ?? "null"}"
244
  // should not be split into $"{x ?? ", null, "}" but should treated as a
245
  // single string-literal.
246
  //
247
  // We opt not to try and format expressions inside {} within a C#
248
  // interpolated string. Formatting expressions within an interpolated string
249
  // would require similar work as that done for JavaScript template strings
250
  // in `handleTemplateStrings()`.
251
8.19k
  auto &CSharpInterpolatedString = *(Tokens.end() - 2);
252
8.19k
  if (CSharpInterpolatedString->getType() == TT_CSharpStringLiteral &&
253
8.19k
      
(43
CSharpInterpolatedString->TokenText.startswith(R"($")")43
||
254
43
       
CSharpInterpolatedString->TokenText.startswith(R"($@")")12
)) {
255
37
    int UnmatchedOpeningBraceCount = 0;
256
257
37
    auto TokenTextSize = CSharpInterpolatedString->TokenText.size();
258
989
    for (size_t Index = 0; Index < TokenTextSize; 
++Index952
) {
259
952
      char C = CSharpInterpolatedString->TokenText[Index];
260
952
      if (C == '{') {
261
        // "{{"  inside an interpolated string is an escaped '{' so skip it.
262
49
        if (Index + 1 < TokenTextSize &&
263
49
            CSharpInterpolatedString->TokenText[Index + 1] == '{') {
264
6
          ++Index;
265
6
          continue;
266
6
        }
267
43
        ++UnmatchedOpeningBraceCount;
268
903
      } else if (C == '}') {
269
        // "}}"  inside an interpolated string is an escaped '}' so skip it.
270
43
        if (Index + 1 < TokenTextSize &&
271
43
            CSharpInterpolatedString->TokenText[Index + 1] == '}') {
272
6
          ++Index;
273
6
          continue;
274
6
        }
275
37
        --UnmatchedOpeningBraceCount;
276
37
      }
277
952
    }
278
279
37
    if (UnmatchedOpeningBraceCount > 0) {
280
6
      auto &NextToken = *(Tokens.end() - 1);
281
6
      CSharpInterpolatedString->TokenText =
282
6
          StringRef(CSharpInterpolatedString->TokenText.begin(),
283
6
                    NextToken->TokenText.end() -
284
6
                        CSharpInterpolatedString->TokenText.begin());
285
6
      CSharpInterpolatedString->ColumnWidth += NextToken->ColumnWidth;
286
6
      Tokens.erase(Tokens.end() - 1);
287
6
      return true;
288
6
    }
289
37
  }
290
291
  // Look for @"aaaaaa" or $"aaaaaa".
292
8.18k
  auto &String = *(Tokens.end() - 1);
293
8.18k
  if (!String->is(tok::string_literal))
294
8.06k
    return false;
295
296
119
  auto &At = *(Tokens.end() - 2);
297
119
  if (!(At->is(tok::at) || 
At->TokenText == "$"107
))
298
82
    return false;
299
300
37
  if (Tokens.size() > 2 && 
At->is(tok::at)35
) {
301
12
    auto &Dollar = *(Tokens.end() - 3);
302
12
    if (Dollar->TokenText == "$") {
303
      // This looks like $@"aaaaa" so we need to combine all 3 tokens.
304
6
      Dollar->Tok.setKind(tok::string_literal);
305
6
      Dollar->TokenText =
306
6
          StringRef(Dollar->TokenText.begin(),
307
6
                    String->TokenText.end() - Dollar->TokenText.begin());
308
6
      Dollar->ColumnWidth += (At->ColumnWidth + String->ColumnWidth);
309
6
      Dollar->setType(TT_CSharpStringLiteral);
310
6
      Tokens.erase(Tokens.end() - 2);
311
6
      Tokens.erase(Tokens.end() - 1);
312
6
      return true;
313
6
    }
314
12
  }
315
316
  // Convert back into just a string_literal.
317
31
  At->Tok.setKind(tok::string_literal);
318
31
  At->TokenText = StringRef(At->TokenText.begin(),
319
31
                            String->TokenText.end() - At->TokenText.begin());
320
31
  At->ColumnWidth += String->ColumnWidth;
321
31
  At->setType(TT_CSharpStringLiteral);
322
31
  Tokens.erase(Tokens.end() - 1);
323
31
  return true;
324
37
}
325
326
// Valid C# attribute targets:
327
// https://docs.microsoft.com/en-us/dotnet/csharp/programming-guide/concepts/attributes/#attribute-targets
328
const llvm::StringSet<> FormatTokenLexer::CSharpAttributeTargets = {
329
    "assembly", "module",   "field",  "event", "method",
330
    "param",    "property", "return", "type",
331
};
332
333
46.0k
bool FormatTokenLexer::tryMergeNullishCoalescingEqual() {
334
46.0k
  if (Tokens.size() < 2)
335
3.51k
    return false;
336
42.5k
  auto &NullishCoalescing = *(Tokens.end() - 2);
337
42.5k
  auto &Equal = *(Tokens.end() - 1);
338
42.5k
  if (NullishCoalescing->getType() != TT_NullCoalescingOperator ||
339
42.5k
      
!Equal->is(tok::equal)38
)
340
42.5k
    return false;
341
14
  NullishCoalescing->Tok.setKind(tok::equal); // no '??=' in clang tokens.
342
14
  NullishCoalescing->TokenText =
343
14
      StringRef(NullishCoalescing->TokenText.begin(),
344
14
                Equal->TokenText.end() - NullishCoalescing->TokenText.begin());
345
14
  NullishCoalescing->ColumnWidth += Equal->ColumnWidth;
346
14
  NullishCoalescing->setType(TT_NullCoalescingEqual);
347
14
  Tokens.erase(Tokens.end() - 1);
348
14
  return true;
349
42.5k
}
350
351
8.70k
bool FormatTokenLexer::tryMergeCSharpKeywordVariables() {
352
8.70k
  if (Tokens.size() < 2)
353
507
    return false;
354
8.20k
  auto &At = *(Tokens.end() - 2);
355
8.20k
  auto &Keyword = *(Tokens.end() - 1);
356
8.20k
  if (!At->is(tok::at))
357
8.17k
    return false;
358
28
  if (!Keywords.isCSharpKeyword(*Keyword))
359
20
    return false;
360
361
8
  At->Tok.setKind(tok::identifier);
362
8
  At->TokenText = StringRef(At->TokenText.begin(),
363
8
                            Keyword->TokenText.end() - At->TokenText.begin());
364
8
  At->ColumnWidth += Keyword->ColumnWidth;
365
8
  At->setType(Keyword->getType());
366
8
  Tokens.erase(Tokens.end() - 1);
367
8
  return true;
368
28
}
369
370
// In C# transform identifier foreach into kw_foreach
371
8.65k
bool FormatTokenLexer::tryTransformCSharpForEach() {
372
8.65k
  if (Tokens.size() < 1)
373
0
    return false;
374
8.65k
  auto &Identifier = *(Tokens.end() - 1);
375
8.65k
  if (!Identifier->is(tok::identifier))
376
6.23k
    return false;
377
2.42k
  if (Identifier->TokenText != "foreach")
378
2.41k
    return false;
379
380
8
  Identifier->setType(TT_ForEachMacro);
381
8
  Identifier->Tok.setKind(tok::kw_for);
382
8
  return true;
383
2.42k
}
384
385
831k
bool FormatTokenLexer::tryMergeForEach() {
386
831k
  if (Tokens.size() < 2)
387
49.7k
    return false;
388
781k
  auto &For = *(Tokens.end() - 2);
389
781k
  auto &Each = *(Tokens.end() - 1);
390
781k
  if (!For->is(tok::kw_for))
391
779k
    return false;
392
1.35k
  if (!Each->is(tok::identifier))
393
1.34k
    return false;
394
15
  if (Each->TokenText != "each")
395
8
    return false;
396
397
7
  For->setType(TT_ForEachMacro);
398
7
  For->Tok.setKind(tok::kw_for);
399
400
7
  For->TokenText = StringRef(For->TokenText.begin(),
401
7
                             Each->TokenText.end() - For->TokenText.begin());
402
7
  For->ColumnWidth += Each->ColumnWidth;
403
7
  Tokens.erase(Tokens.end() - 1);
404
7
  return true;
405
15
}
406
407
768k
bool FormatTokenLexer::tryTransformTryUsageForC() {
408
768k
  if (Tokens.size() < 2)
409
45.1k
    return false;
410
723k
  auto &Try = *(Tokens.end() - 2);
411
723k
  if (!Try->is(tok::kw_try))
412
723k
    return false;
413
251
  auto &Next = *(Tokens.end() - 1);
414
251
  if (Next->isOneOf(tok::l_brace, tok::colon, tok::hash, tok::comment))
415
221
    return false;
416
417
30
  if (Tokens.size() > 2) {
418
23
    auto &At = *(Tokens.end() - 3);
419
23
    if (At->is(tok::at))
420
2
      return false;
421
23
  }
422
423
28
  Try->Tok.setKind(tok::identifier);
424
28
  return true;
425
30
}
426
427
832k
bool FormatTokenLexer::tryMergeLessLess() {
428
  // Merge X,less,less,Y into X,lessless,Y unless X or Y is less.
429
832k
  if (Tokens.size() < 3)
430
99.3k
    return false;
431
432
732k
  bool FourthTokenIsLess = false;
433
732k
  if (Tokens.size() > 3)
434
683k
    FourthTokenIsLess = (Tokens.end() - 4)[0]->is(tok::less);
435
436
732k
  auto First = Tokens.end() - 3;
437
732k
  if (First[2]->is(tok::less) || 
First[1]->isNot(tok::less)720k
||
438
732k
      
First[0]->isNot(tok::less)13.9k
||
FourthTokenIsLess1.06k
)
439
731k
    return false;
440
441
  // Only merge if there currently is no whitespace between the two "<".
442
982
  if (First[1]->WhitespaceRange.getBegin() !=
443
982
      First[1]->WhitespaceRange.getEnd())
444
0
    return false;
445
446
982
  First[0]->Tok.setKind(tok::lessless);
447
982
  First[0]->TokenText = "<<";
448
982
  First[0]->ColumnWidth += 1;
449
982
  Tokens.erase(Tokens.end() - 2);
450
982
  return true;
451
982
}
452
453
bool FormatTokenLexer::tryMergeTokens(ArrayRef<tok::TokenKind> Kinds,
454
413k
                                      TokenType NewType) {
455
413k
  if (Tokens.size() < Kinds.size())
456
35.6k
    return false;
457
458
377k
  SmallVectorImpl<FormatToken *>::const_iterator First =
459
377k
      Tokens.end() - Kinds.size();
460
377k
  if (!First[0]->is(Kinds[0]))
461
374k
    return false;
462
3.21k
  unsigned AddLength = 0;
463
3.63k
  for (unsigned i = 1; i < Kinds.size(); 
++i418
) {
464
3.27k
    if (!First[i]->is(Kinds[i]) || First[i]->WhitespaceRange.getBegin() !=
465
430
                                       First[i]->WhitespaceRange.getEnd())
466
2.85k
      return false;
467
418
    AddLength += First[i]->TokenText.size();
468
418
  }
469
356
  Tokens.resize(Tokens.size() - Kinds.size() + 1);
470
356
  First[0]->TokenText = StringRef(First[0]->TokenText.data(),
471
356
                                  First[0]->TokenText.size() + AddLength);
472
356
  First[0]->ColumnWidth += AddLength;
473
356
  First[0]->setType(NewType);
474
356
  return true;
475
3.21k
}
476
477
// Returns \c true if \p Tok can only be followed by an operand in JavaScript.
478
336
bool FormatTokenLexer::precedesOperand(FormatToken *Tok) {
479
  // NB: This is not entirely correct, as an r_paren can introduce an operand
480
  // location in e.g. `if (foo) /bar/.exec(...);`. That is a rare enough
481
  // corner case to not matter in practice, though.
482
336
  return Tok->isOneOf(tok::period, tok::l_paren, tok::comma, tok::l_brace,
483
336
                      tok::r_brace, tok::l_square, tok::semi, tok::exclaim,
484
336
                      tok::colon, tok::question, tok::tilde) ||
485
336
         Tok->isOneOf(tok::kw_return, tok::kw_do, tok::kw_case, tok::kw_throw,
486
292
                      tok::kw_else, tok::kw_new, tok::kw_delete, tok::kw_void,
487
292
                      tok::kw_typeof, Keywords.kw_instanceof, Keywords.kw_in) ||
488
336
         
Tok->isBinaryOperator()288
;
489
336
}
490
491
340
bool FormatTokenLexer::canPrecedeRegexLiteral(FormatToken *Prev) {
492
340
  if (!Prev)
493
4
    return true;
494
495
  // Regex literals can only follow after prefix unary operators, not after
496
  // postfix unary operators. If the '++' is followed by a non-operand
497
  // introducing token, the slash here is the operand and not the start of a
498
  // regex.
499
  // `!` is an unary prefix operator, but also a post-fix operator that casts
500
  // away nullability, so the same check applies.
501
336
  if (Prev->isOneOf(tok::plusplus, tok::minusminus, tok::exclaim))
502
20
    return (Tokens.size() < 3 || precedesOperand(Tokens[Tokens.size() - 3]));
503
504
  // The previous token must introduce an operand location where regex
505
  // literals can occur.
506
316
  if (!precedesOperand(Prev))
507
24
    return false;
508
509
292
  return true;
510
316
}
511
512
// Tries to parse a JavaScript Regex literal starting at the current token,
513
// if that begins with a slash and is in a location where JavaScript allows
514
// regex literals. Changes the current token to a regex literal and updates
515
// its text if successful.
516
37.5k
void FormatTokenLexer::tryParseJSRegexLiteral() {
517
37.5k
  FormatToken *RegexToken = Tokens.back();
518
37.5k
  if (!RegexToken->isOneOf(tok::slash, tok::slashequal))
519
37.2k
    return;
520
521
340
  FormatToken *Prev = nullptr;
522
344
  for (FormatToken *FT : llvm::drop_begin(llvm::reverse(Tokens))) {
523
    // NB: Because previous pointers are not initialized yet, this cannot use
524
    // Token.getPreviousNonComment.
525
344
    if (FT->isNot(tok::comment)) {
526
336
      Prev = FT;
527
336
      break;
528
336
    }
529
344
  }
530
531
340
  if (!canPrecedeRegexLiteral(Prev))
532
36
    return;
533
534
  // 'Manually' lex ahead in the current file buffer.
535
304
  const char *Offset = Lex->getBufferLocation();
536
304
  const char *RegexBegin = Offset - RegexToken->TokenText.size();
537
304
  StringRef Buffer = Lex->getBuffer();
538
304
  bool InCharacterClass = false;
539
304
  bool HaveClosingSlash = false;
540
1.91k
  for (; !HaveClosingSlash && 
Offset != Buffer.end()1.61k
;
++Offset1.61k
) {
541
    // Regular expressions are terminated with a '/', which can only be
542
    // escaped using '\' or a character class between '[' and ']'.
543
    // See http://www.ecma-international.org/ecma-262/5.1/#sec-7.8.5.
544
1.61k
    switch (*Offset) {
545
116
    case '\\':
546
      // Skip the escaped character.
547
116
      ++Offset;
548
116
      break;
549
40
    case '[':
550
40
      InCharacterClass = true;
551
40
      break;
552
40
    case ']':
553
40
      InCharacterClass = false;
554
40
      break;
555
320
    case '/':
556
320
      if (!InCharacterClass)
557
304
        HaveClosingSlash = true;
558
320
      break;
559
1.61k
    }
560
1.61k
  }
561
562
304
  RegexToken->setType(TT_RegexLiteral);
563
  // Treat regex literals like other string_literals.
564
304
  RegexToken->Tok.setKind(tok::string_literal);
565
304
  RegexToken->TokenText = StringRef(RegexBegin, Offset - RegexBegin);
566
304
  RegexToken->ColumnWidth = RegexToken->TokenText.size();
567
568
304
  resetLexer(SourceMgr.getFileOffset(Lex->getSourceLocation(Offset)));
569
304
}
570
571
8.82k
void FormatTokenLexer::handleCSharpVerbatimAndInterpolatedStrings() {
572
8.82k
  FormatToken *CSharpStringLiteral = Tokens.back();
573
574
8.82k
  if (CSharpStringLiteral->getType() != TT_CSharpStringLiteral)
575
8.78k
    return;
576
577
  // Deal with multiline strings.
578
43
  if (!(CSharpStringLiteral->TokenText.startswith(R"(@")") ||
579
43
        
CSharpStringLiteral->TokenText.startswith(R"($@")")37
))
580
31
    return;
581
582
12
  const char *StrBegin =
583
12
      Lex->getBufferLocation() - CSharpStringLiteral->TokenText.size();
584
12
  const char *Offset = StrBegin;
585
12
  if (CSharpStringLiteral->TokenText.startswith(R"(@")"))
586
6
    Offset += 2;
587
6
  else // CSharpStringLiteral->TokenText.startswith(R"($@")")
588
6
    Offset += 3;
589
590
  // Look for a terminating '"' in the current file buffer.
591
  // Make no effort to format code within an interpolated or verbatim string.
592
288
  for (; Offset != Lex->getBuffer().end(); 
++Offset276
) {
593
288
    if (Offset[0] == '"') {
594
      // "" within a verbatim string is an escaped double quote: skip it.
595
22
      if (Offset + 1 < Lex->getBuffer().end() && Offset[1] == '"')
596
10
        ++Offset;
597
12
      else
598
12
        break;
599
22
    }
600
288
  }
601
602
  // Make no attempt to format code properly if a verbatim string is
603
  // unterminated.
604
12
  if (Offset == Lex->getBuffer().end())
605
0
    return;
606
607
12
  StringRef LiteralText(StrBegin, Offset - StrBegin + 1);
608
12
  CSharpStringLiteral->TokenText = LiteralText;
609
610
  // Adjust width for potentially multiline string literals.
611
12
  size_t FirstBreak = LiteralText.find('\n');
612
12
  StringRef FirstLineText = FirstBreak == StringRef::npos
613
12
                                ? 
LiteralText10
614
12
                                : 
LiteralText.substr(0, FirstBreak)2
;
615
12
  CSharpStringLiteral->ColumnWidth = encoding::columnWidthWithTabs(
616
12
      FirstLineText, CSharpStringLiteral->OriginalColumn, Style.TabWidth,
617
12
      Encoding);
618
12
  size_t LastBreak = LiteralText.rfind('\n');
619
12
  if (LastBreak != StringRef::npos) {
620
2
    CSharpStringLiteral->IsMultiline = true;
621
2
    unsigned StartColumn = 0;
622
2
    CSharpStringLiteral->LastLineColumnWidth = encoding::columnWidthWithTabs(
623
2
        LiteralText.substr(LastBreak + 1, LiteralText.size()), StartColumn,
624
2
        Style.TabWidth, Encoding);
625
2
  }
626
627
12
  SourceLocation loc = Offset < Lex->getBuffer().end()
628
12
                           ? Lex->getSourceLocation(Offset + 1)
629
12
                           : 
SourceMgr.getLocForEndOfFile(ID)0
;
630
12
  resetLexer(SourceMgr.getFileOffset(loc));
631
12
}
632
633
37.5k
void FormatTokenLexer::handleTemplateStrings() {
634
37.5k
  FormatToken *BacktickToken = Tokens.back();
635
636
37.5k
  if (BacktickToken->is(tok::l_brace)) {
637
2.04k
    StateStack.push(LexerState::NORMAL);
638
2.04k
    return;
639
2.04k
  }
640
35.5k
  if (BacktickToken->is(tok::r_brace)) {
641
2.14k
    if (StateStack.size() == 1)
642
4
      return;
643
2.14k
    StateStack.pop();
644
2.14k
    if (StateStack.top() != LexerState::TEMPLATE_STRING)
645
2.04k
      return;
646
    // If back in TEMPLATE_STRING, fallthrough and continue parsing the
647
33.3k
  } else if (BacktickToken->is(tok::unknown) &&
648
33.3k
             
BacktickToken->TokenText == "`"148
) {
649
148
    StateStack.push(LexerState::TEMPLATE_STRING);
650
33.2k
  } else {
651
33.2k
    return; // Not actually a template
652
33.2k
  }
653
654
  // 'Manually' lex ahead in the current file buffer.
655
248
  const char *Offset = Lex->getBufferLocation();
656
248
  const char *TmplBegin = Offset - BacktickToken->TokenText.size(); // at "`"
657
1.90k
  for (; Offset != Lex->getBuffer().end(); 
++Offset1.65k
) {
658
1.90k
    if (Offset[0] == '`') {
659
148
      StateStack.pop();
660
148
      break;
661
148
    }
662
1.75k
    if (Offset[0] == '\\') {
663
8
      ++Offset; // Skip the escaped character.
664
1.74k
    } else if (Offset + 1 < Lex->getBuffer().end() && Offset[0] == '$' &&
665
1.74k
               
Offset[1] == '{'100
) {
666
      // '${' introduces an expression interpolation in the template string.
667
100
      StateStack.push(LexerState::NORMAL);
668
100
      ++Offset;
669
100
      break;
670
100
    }
671
1.75k
  }
672
673
248
  StringRef LiteralText(TmplBegin, Offset - TmplBegin + 1);
674
248
  BacktickToken->setType(TT_TemplateString);
675
248
  BacktickToken->Tok.setKind(tok::string_literal);
676
248
  BacktickToken->TokenText = LiteralText;
677
678
  // Adjust width for potentially multiline string literals.
679
248
  size_t FirstBreak = LiteralText.find('\n');
680
248
  StringRef FirstLineText = FirstBreak == StringRef::npos
681
248
                                ? 
LiteralText212
682
248
                                : 
LiteralText.substr(0, FirstBreak)36
;
683
248
  BacktickToken->ColumnWidth = encoding::columnWidthWithTabs(
684
248
      FirstLineText, BacktickToken->OriginalColumn, Style.TabWidth, Encoding);
685
248
  size_t LastBreak = LiteralText.rfind('\n');
686
248
  if (LastBreak != StringRef::npos) {
687
36
    BacktickToken->IsMultiline = true;
688
36
    unsigned StartColumn = 0; // The template tail spans the entire line.
689
36
    BacktickToken->LastLineColumnWidth = encoding::columnWidthWithTabs(
690
36
        LiteralText.substr(LastBreak + 1, LiteralText.size()), StartColumn,
691
36
        Style.TabWidth, Encoding);
692
36
  }
693
694
248
  SourceLocation loc = Offset < Lex->getBuffer().end()
695
248
                           ? Lex->getSourceLocation(Offset + 1)
696
248
                           : 
SourceMgr.getLocForEndOfFile(ID)0
;
697
248
  resetLexer(SourceMgr.getFileOffset(loc));
698
248
}
699
700
6.72k
void FormatTokenLexer::tryParsePythonComment() {
701
6.72k
  FormatToken *HashToken = Tokens.back();
702
6.72k
  if (!HashToken->isOneOf(tok::hash, tok::hashhash))
703
6.66k
    return;
704
  // Turn the remainder of this line into a comment.
705
65
  const char *CommentBegin =
706
65
      Lex->getBufferLocation() - HashToken->TokenText.size(); // at "#"
707
65
  size_t From = CommentBegin - Lex->getBuffer().begin();
708
65
  size_t To = Lex->getBuffer().find_first_of('\n', From);
709
65
  if (To == StringRef::npos)
710
7
    To = Lex->getBuffer().size();
711
65
  size_t Len = To - From;
712
65
  HashToken->setType(TT_LineComment);
713
65
  HashToken->Tok.setKind(tok::comment);
714
65
  HashToken->TokenText = Lex->getBuffer().substr(From, Len);
715
65
  SourceLocation Loc = To < Lex->getBuffer().size()
716
65
                           ? 
Lex->getSourceLocation(CommentBegin + Len)58
717
65
                           : 
SourceMgr.getLocForEndOfFile(ID)7
;
718
65
  resetLexer(SourceMgr.getFileOffset(Loc));
719
65
}
720
721
832k
bool FormatTokenLexer::tryMerge_TMacro() {
722
832k
  if (Tokens.size() < 4)
723
148k
    return false;
724
683k
  FormatToken *Last = Tokens.back();
725
683k
  if (!Last->is(tok::r_paren))
726
623k
    return false;
727
728
60.2k
  FormatToken *String = Tokens[Tokens.size() - 2];
729
60.2k
  if (!String->is(tok::string_literal) || 
String->IsMultiline689
)
730
59.6k
    return false;
731
732
656
  if (!Tokens[Tokens.size() - 3]->is(tok::l_paren))
733
297
    return false;
734
735
359
  FormatToken *Macro = Tokens[Tokens.size() - 4];
736
359
  if (Macro->TokenText != "_T")
737
341
    return false;
738
739
18
  const char *Start = Macro->TokenText.data();
740
18
  const char *End = Last->TokenText.data() + Last->TokenText.size();
741
18
  String->TokenText = StringRef(Start, End - Start);
742
18
  String->IsFirst = Macro->IsFirst;
743
18
  String->LastNewlineOffset = Macro->LastNewlineOffset;
744
18
  String->WhitespaceRange = Macro->WhitespaceRange;
745
18
  String->OriginalColumn = Macro->OriginalColumn;
746
18
  String->ColumnWidth = encoding::columnWidthWithTabs(
747
18
      String->TokenText, String->OriginalColumn, Style.TabWidth, Encoding);
748
18
  String->NewlinesBefore = Macro->NewlinesBefore;
749
18
  String->HasUnescapedNewline = Macro->HasUnescapedNewline;
750
751
18
  Tokens.pop_back();
752
18
  Tokens.pop_back();
753
18
  Tokens.pop_back();
754
18
  Tokens.back() = String;
755
18
  if (FirstInLineIndex >= Tokens.size())
756
3
    FirstInLineIndex = Tokens.size() - 1;
757
18
  return true;
758
359
}
759
760
832k
bool FormatTokenLexer::tryMergeConflictMarkers() {
761
832k
  if (Tokens.back()->NewlinesBefore == 0 && 
Tokens.back()->isNot(tok::eof)755k
)
762
709k
    return false;
763
764
  // Conflict lines look like:
765
  // <marker> <text from the vcs>
766
  // For example:
767
  // >>>>>>> /file/in/file/system at revision 1234
768
  //
769
  // We merge all tokens in a line that starts with a conflict marker
770
  // into a single token with a special token type that the unwrapped line
771
  // parser will use to correctly rebuild the underlying code.
772
773
122k
  FileID ID;
774
  // Get the position of the first token in the line.
775
122k
  unsigned FirstInLineOffset;
776
122k
  std::tie(ID, FirstInLineOffset) = SourceMgr.getDecomposedLoc(
777
122k
      Tokens[FirstInLineIndex]->getStartOfNonWhitespace());
778
122k
  StringRef Buffer = SourceMgr.getBufferOrFake(ID).getBuffer();
779
  // Calculate the offset of the start of the current line.
780
122k
  auto LineOffset = Buffer.rfind('\n', FirstInLineOffset);
781
122k
  if (LineOffset == StringRef::npos) {
782
48.5k
    LineOffset = 0;
783
73.8k
  } else {
784
73.8k
    ++LineOffset;
785
73.8k
  }
786
787
122k
  auto FirstSpace = Buffer.find_first_of(" \n", LineOffset);
788
122k
  StringRef LineStart;
789
122k
  if (FirstSpace == StringRef::npos) {
790
6.95k
    LineStart = Buffer.substr(LineOffset);
791
115k
  } else {
792
115k
    LineStart = Buffer.substr(LineOffset, FirstSpace - LineOffset);
793
115k
  }
794
795
122k
  TokenType Type = TT_Unknown;
796
122k
  if (LineStart == "<<<<<<<" || 
LineStart == ">>>>"122k
) {
797
9
    Type = TT_ConflictStart;
798
122k
  } else if (LineStart == "|||||||" || 
LineStart == "======="122k
||
799
122k
             
LineStart == "===="122k
) {
800
25
    Type = TT_ConflictAlternative;
801
122k
  } else if (LineStart == ">>>>>>>" || 
LineStart == "<<<<"122k
) {
802
9
    Type = TT_ConflictEnd;
803
9
  }
804
805
122k
  if (Type != TT_Unknown) {
806
43
    FormatToken *Next = Tokens.back();
807
808
43
    Tokens.resize(FirstInLineIndex + 1);
809
    // We do not need to build a complete token here, as we will skip it
810
    // during parsing anyway (as we must not touch whitespace around conflict
811
    // markers).
812
43
    Tokens.back()->setType(Type);
813
43
    Tokens.back()->Tok.setKind(tok::kw___unknown_anytype);
814
815
43
    Tokens.push_back(Next);
816
43
    return true;
817
43
  }
818
819
122k
  return false;
820
122k
}
821
822
1.89k
FormatToken *FormatTokenLexer::getStashedToken() {
823
  // Create a synthesized second '>' or '<' token.
824
1.89k
  Token Tok = FormatTok->Tok;
825
1.89k
  StringRef TokenText = FormatTok->TokenText;
826
827
1.89k
  unsigned OriginalColumn = FormatTok->OriginalColumn;
828
1.89k
  FormatTok = new (Allocator.Allocate()) FormatToken;
829
1.89k
  FormatTok->Tok = Tok;
830
1.89k
  SourceLocation TokLocation =
831
1.89k
      FormatTok->Tok.getLocation().getLocWithOffset(Tok.getLength() - 1);
832
1.89k
  FormatTok->Tok.setLocation(TokLocation);
833
1.89k
  FormatTok->WhitespaceRange = SourceRange(TokLocation, TokLocation);
834
1.89k
  FormatTok->TokenText = TokenText;
835
1.89k
  FormatTok->ColumnWidth = 1;
836
1.89k
  FormatTok->OriginalColumn = OriginalColumn + 1;
837
838
1.89k
  return FormatTok;
839
1.89k
}
840
841
832k
FormatToken *FormatTokenLexer::getNextToken() {
842
832k
  if (StateStack.top() == LexerState::TOKEN_STASHED) {
843
1.89k
    StateStack.pop();
844
1.89k
    return getStashedToken();
845
1.89k
  }
846
847
830k
  FormatTok = new (Allocator.Allocate()) FormatToken;
848
830k
  readRawToken(*FormatTok);
849
830k
  SourceLocation WhitespaceStart =
850
830k
      FormatTok->Tok.getLocation().getLocWithOffset(-TrailingWhitespace);
851
830k
  FormatTok->IsFirst = IsFirstToken;
852
830k
  IsFirstToken = false;
853
854
  // Consume and record whitespace until we find a significant token.
855
830k
  unsigned WhitespaceLength = TrailingWhitespace;
856
1.20M
  while (FormatTok->Tok.is(tok::unknown)) {
857
377k
    StringRef Text = FormatTok->TokenText;
858
377k
    auto EscapesNewline = [&](int pos) {
859
      // A '\r' here is just part of '\r\n'. Skip it.
860
80.7k
      if (pos >= 0 && 
Text[pos] == '\r'4.85k
)
861
369
        --pos;
862
      // See whether there is an odd number of '\' before this.
863
      // FIXME: This is wrong. A '\' followed by a newline is always removed,
864
      // regardless of whether there is another '\' before it.
865
      // FIXME: Newlines can also be escaped by a '?' '?' '/' trigraph.
866
80.7k
      unsigned count = 0;
867
81.4k
      for (; pos >= 0; 
--pos, ++count761
)
868
4.55k
        if (Text[pos] != '\\')
869
3.79k
          break;
870
80.7k
      return count & 1;
871
80.7k
    };
872
    // FIXME: This miscounts tok:unknown tokens that are not just
873
    // whitespace, e.g. a '`' character.
874
1.07M
    for (int i = 0, e = Text.size(); i != e; 
++i698k
) {
875
699k
      switch (Text[i]) {
876
80.7k
      case '\n':
877
80.7k
        ++FormatTok->NewlinesBefore;
878
80.7k
        FormatTok->HasUnescapedNewline = !EscapesNewline(i - 1);
879
80.7k
        FormatTok->LastNewlineOffset = WhitespaceLength + i + 1;
880
80.7k
        Column = 0;
881
80.7k
        break;
882
376
      case '\r':
883
376
        FormatTok->LastNewlineOffset = WhitespaceLength + i + 1;
884
376
        Column = 0;
885
376
        break;
886
10
      case '\f':
887
20
      case '\v':
888
20
        Column = 0;
889
20
        break;
890
615k
      case ' ':
891
615k
        ++Column;
892
615k
        break;
893
1.88k
      case '\t':
894
1.88k
        Column +=
895
1.88k
            Style.TabWidth - (Style.TabWidth ? 
Column % Style.TabWidth1.83k
:
045
);
896
1.88k
        break;
897
812
      case '\\':
898
812
        if (i + 1 == e || 
(761
Text[i + 1] != '\r'761
&&
Text[i + 1] != '\n'746
))
899
51
          FormatTok->setType(TT_ImplicitStringLiteral);
900
812
        break;
901
238
      default:
902
238
        FormatTok->setType(TT_ImplicitStringLiteral);
903
238
        break;
904
699k
      }
905
699k
      if (FormatTok->getType() == TT_ImplicitStringLiteral)
906
289
        break;
907
699k
    }
908
909
377k
    if (FormatTok->is(TT_ImplicitStringLiteral))
910
289
      break;
911
377k
    WhitespaceLength += FormatTok->Tok.getLength();
912
913
377k
    readRawToken(*FormatTok);
914
377k
  }
915
916
  // JavaScript and Java do not allow to escape the end of the line with a
917
  // backslash. Backslashes are syntax errors in plain source, but can occur in
918
  // comments. When a single line comment ends with a \, it'll cause the next
919
  // line of code to be lexed as a comment, breaking formatting. The code below
920
  // finds comments that contain a backslash followed by a line break, truncates
921
  // the comment token at the backslash, and resets the lexer to restart behind
922
  // the backslash.
923
830k
  if ((Style.isJavaScript() || 
Style.Language == FormatStyle::LK_Java792k
) &&
924
830k
      
FormatTok->is(tok::comment)42.0k
&&
FormatTok->TokenText.startswith("//")537
) {
925
360
    size_t BackslashPos = FormatTok->TokenText.find('\\');
926
364
    while (BackslashPos != StringRef::npos) {
927
16
      if (BackslashPos + 1 < FormatTok->TokenText.size() &&
928
16
          FormatTok->TokenText[BackslashPos + 1] == '\n') {
929
12
        const char *Offset = Lex->getBufferLocation();
930
12
        Offset -= FormatTok->TokenText.size();
931
12
        Offset += BackslashPos + 1;
932
12
        resetLexer(SourceMgr.getFileOffset(Lex->getSourceLocation(Offset)));
933
12
        FormatTok->TokenText = FormatTok->TokenText.substr(0, BackslashPos + 1);
934
12
        FormatTok->ColumnWidth = encoding::columnWidthWithTabs(
935
12
            FormatTok->TokenText, FormatTok->OriginalColumn, Style.TabWidth,
936
12
            Encoding);
937
12
        break;
938
12
      }
939
4
      BackslashPos = FormatTok->TokenText.find('\\', BackslashPos + 1);
940
4
    }
941
360
  }
942
943
  // In case the token starts with escaped newlines, we want to
944
  // take them into account as whitespace - this pattern is quite frequent
945
  // in macro definitions.
946
  // FIXME: Add a more explicit test.
947
830k
  while (FormatTok->TokenText.size() > 1 && 
FormatTok->TokenText[0] == '\\'281k
) {
948
80
    unsigned SkippedWhitespace = 0;
949
80
    if (FormatTok->TokenText.size() > 2 &&
950
80
        (FormatTok->TokenText[1] == '\r' && 
FormatTok->TokenText[2] == '\n'9
))
951
9
      SkippedWhitespace = 3;
952
71
    else if (FormatTok->TokenText[1] == '\n')
953
71
      SkippedWhitespace = 2;
954
0
    else
955
0
      break;
956
957
80
    ++FormatTok->NewlinesBefore;
958
80
    WhitespaceLength += SkippedWhitespace;
959
80
    FormatTok->LastNewlineOffset = SkippedWhitespace;
960
80
    Column = 0;
961
80
    FormatTok->TokenText = FormatTok->TokenText.substr(SkippedWhitespace);
962
80
  }
963
964
830k
  FormatTok->WhitespaceRange = SourceRange(
965
830k
      WhitespaceStart, WhitespaceStart.getLocWithOffset(WhitespaceLength));
966
967
830k
  FormatTok->OriginalColumn = Column;
968
969
830k
  TrailingWhitespace = 0;
970
830k
  if (FormatTok->Tok.is(tok::comment)) {
971
    // FIXME: Add the trimmed whitespace to Column.
972
13.2k
    StringRef UntrimmedText = FormatTok->TokenText;
973
13.2k
    FormatTok->TokenText = FormatTok->TokenText.rtrim(" \t\v\f");
974
13.2k
    TrailingWhitespace = UntrimmedText.size() - FormatTok->TokenText.size();
975
817k
  } else if (FormatTok->Tok.is(tok::raw_identifier)) {
976
316k
    IdentifierInfo &Info = IdentTable.get(FormatTok->TokenText);
977
316k
    FormatTok->Tok.setIdentifierInfo(&Info);
978
316k
    FormatTok->Tok.setKind(Info.getTokenID());
979
316k
    if (Style.Language == FormatStyle::LK_Java &&
980
316k
        FormatTok->isOneOf(tok::kw_struct, tok::kw_union, tok::kw_delete,
981
1.85k
                           tok::kw_operator)) {
982
8
      FormatTok->Tok.setKind(tok::identifier);
983
8
      FormatTok->Tok.setIdentifierInfo(nullptr);
984
316k
    } else if (Style.isJavaScript() &&
985
316k
               FormatTok->isOneOf(tok::kw_struct, tok::kw_union,
986
13.0k
                                  tok::kw_operator)) {
987
28
      FormatTok->Tok.setKind(tok::identifier);
988
28
      FormatTok->Tok.setIdentifierInfo(nullptr);
989
28
    }
990
500k
  } else if (FormatTok->Tok.is(tok::greatergreater)) {
991
815
    FormatTok->Tok.setKind(tok::greater);
992
815
    FormatTok->TokenText = FormatTok->TokenText.substr(0, 1);
993
815
    ++Column;
994
815
    StateStack.push(LexerState::TOKEN_STASHED);
995
499k
  } else if (FormatTok->Tok.is(tok::lessless)) {
996
1.07k
    FormatTok->Tok.setKind(tok::less);
997
1.07k
    FormatTok->TokenText = FormatTok->TokenText.substr(0, 1);
998
1.07k
    ++Column;
999
1.07k
    StateStack.push(LexerState::TOKEN_STASHED);
1000
1.07k
  }
1001
1002
  // Now FormatTok is the next non-whitespace token.
1003
1004
830k
  StringRef Text = FormatTok->TokenText;
1005
830k
  size_t FirstNewlinePos = Text.find('\n');
1006
830k
  if (FirstNewlinePos == StringRef::npos) {
1007
    // FIXME: ColumnWidth actually depends on the start column, we need to
1008
    // take this into account when the token is moved.
1009
829k
    FormatTok->ColumnWidth =
1010
829k
        encoding::columnWidthWithTabs(Text, Column, Style.TabWidth, Encoding);
1011
829k
    Column += FormatTok->ColumnWidth;
1012
829k
  } else {
1013
877
    FormatTok->IsMultiline = true;
1014
    // FIXME: ColumnWidth actually depends on the start column, we need to
1015
    // take this into account when the token is moved.
1016
877
    FormatTok->ColumnWidth = encoding::columnWidthWithTabs(
1017
877
        Text.substr(0, FirstNewlinePos), Column, Style.TabWidth, Encoding);
1018
1019
    // The last line of the token always starts in column 0.
1020
    // Thus, the length can be precomputed even in the presence of tabs.
1021
877
    FormatTok->LastLineColumnWidth = encoding::columnWidthWithTabs(
1022
877
        Text.substr(Text.find_last_of('\n') + 1), 0, Style.TabWidth, Encoding);
1023
877
    Column = FormatTok->LastLineColumnWidth;
1024
877
  }
1025
1026
830k
  if (Style.isCpp()) {
1027
767k
    auto it = Macros.find(FormatTok->Tok.getIdentifierInfo());
1028
767k
    if (!(Tokens.size() > 0 && 
Tokens.back()->Tok.getIdentifierInfo()722k
&&
1029
767k
          Tokens.back()->Tok.getIdentifierInfo()->getPPKeywordID() ==
1030
294k
              tok::pp_define) &&
1031
767k
        
it != Macros.end()765k
) {
1032
1.98k
      FormatTok->setType(it->second);
1033
1.98k
      if (it->second == TT_IfMacro) {
1034
        // The lexer token currently has type tok::kw_unknown. However, for this
1035
        // substitution to be treated correctly in the TokenAnnotator, faking
1036
        // the tok value seems to be needed. Not sure if there's a more elegant
1037
        // way.
1038
1.04k
        FormatTok->Tok.setKind(tok::kw_if);
1039
1.04k
      }
1040
765k
    } else if (FormatTok->is(tok::identifier)) {
1041
185k
      if (MacroBlockBeginRegex.match(Text)) {
1042
28
        FormatTok->setType(TT_MacroBlockBegin);
1043
185k
      } else if (MacroBlockEndRegex.match(Text)) {
1044
28
        FormatTok->setType(TT_MacroBlockEnd);
1045
28
      }
1046
185k
    }
1047
767k
  }
1048
1049
830k
  return FormatTok;
1050
830k
}
1051
1052
1.20M
void FormatTokenLexer::readRawToken(FormatToken &Tok) {
1053
1.20M
  Lex->LexFromRawLexer(Tok.Tok);
1054
1.20M
  Tok.TokenText = StringRef(SourceMgr.getCharacterData(Tok.Tok.getLocation()),
1055
1.20M
                            Tok.Tok.getLength());
1056
  // For formatting, treat unterminated string literals like normal string
1057
  // literals.
1058
1.20M
  if (Tok.is(tok::unknown)) {
1059
378k
    if (!Tok.TokenText.empty() && Tok.TokenText[0] == '"') {
1060
30
      Tok.Tok.setKind(tok::string_literal);
1061
30
      Tok.IsUnterminatedLiteral = true;
1062
377k
    } else if (Style.isJavaScript() && 
Tok.TokenText == "''"17.1k
) {
1063
12
      Tok.Tok.setKind(tok::string_literal);
1064
12
    }
1065
378k
  }
1066
1067
1.20M
  if ((Style.isJavaScript() || 
Style.Language == FormatStyle::LK_Proto1.15M
||
1068
1.20M
       
Style.Language == FormatStyle::LK_TextProto1.14M
) &&
1069
1.20M
      
Tok.is(tok::char_constant)71.1k
) {
1070
826
    Tok.Tok.setKind(tok::string_literal);
1071
826
  }
1072
1073
1.20M
  if (Tok.is(tok::comment) && 
(13.2k
Tok.TokenText == "// clang-format on"13.2k
||
1074
13.2k
                               
Tok.TokenText == "/* clang-format on */"13.1k
)) {
1075
64
    FormattingDisabled = false;
1076
64
  }
1077
1078
1.20M
  Tok.Finalized = FormattingDisabled;
1079
1080
1.20M
  if (Tok.is(tok::comment) && 
(13.2k
Tok.TokenText == "// clang-format off"13.2k
||
1081
13.2k
                               
Tok.TokenText == "/* clang-format off */"13.1k
)) {
1082
67
    FormattingDisabled = true;
1083
67
  }
1084
1.20M
}
1085
1086
641
void FormatTokenLexer::resetLexer(unsigned Offset) {
1087
641
  StringRef Buffer = SourceMgr.getBufferData(ID);
1088
641
  Lex.reset(new Lexer(SourceMgr.getLocForStartOfFile(ID),
1089
641
                      getFormattingLangOpts(Style), Buffer.begin(),
1090
641
                      Buffer.begin() + Offset, Buffer.end()));
1091
641
  Lex->SetKeepWhitespaceMode(true);
1092
641
  TrailingWhitespace = 0;
1093
641
}
1094
1095
} // namespace format
1096
} // namespace clang