Coverage Report

Created: 2020-11-24 06:42

/Users/buildslave/jenkins/workspace/coverage/llvm-project/clang/lib/Format/FormatTokenLexer.cpp
Line
Count
Source (jump to first uncovered line)
1
//===--- FormatTokenLexer.cpp - Lex FormatTokens -------------*- C++ ----*-===//
2
//
3
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4
// See https://llvm.org/LICENSE.txt for license information.
5
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6
//
7
//===----------------------------------------------------------------------===//
8
///
9
/// \file
10
/// This file implements FormatTokenLexer, which tokenizes a source file
11
/// into a FormatToken stream suitable for ClangFormat.
12
///
13
//===----------------------------------------------------------------------===//
14
15
#include "FormatTokenLexer.h"
16
#include "FormatToken.h"
17
#include "clang/Basic/SourceLocation.h"
18
#include "clang/Basic/SourceManager.h"
19
#include "clang/Format/Format.h"
20
#include "llvm/Support/Regex.h"
21
22
namespace clang {
23
namespace format {
24
25
/// Builds the token lexer for the buffer identified by \p ID.
///
/// Creates the underlying raw lexer in keep-whitespace mode and records every
/// macro name listed in \p Style together with the token type it maps to.
FormatTokenLexer::FormatTokenLexer(
    const SourceManager &SourceMgr, FileID ID, unsigned Column,
    const FormatStyle &Style, encoding::Encoding Encoding,
    llvm::SpecificBumpPtrAllocator<FormatToken> &Allocator,
    IdentifierTable &IdentTable)
    : FormatTok(nullptr), IsFirstToken(true), StateStack({LexerState::NORMAL}),
      Column(Column), TrailingWhitespace(0), SourceMgr(SourceMgr), ID(ID),
      Style(Style), IdentTable(IdentTable), Keywords(IdentTable),
      Encoding(Encoding), Allocator(Allocator), FirstInLineIndex(0),
      FormattingDisabled(false), MacroBlockBeginRegex(Style.MacroBlockBegin),
      MacroBlockEndRegex(Style.MacroBlockEnd) {
  Lex.reset(new Lexer(ID, SourceMgr.getBufferOrFake(ID), SourceMgr,
                      getFormattingLangOpts(Style)));
  Lex->SetKeepWhitespaceMode(true);

  // Registers each name of one style list under the given token type. The
  // lists are processed in a fixed order; insert() keeps the first entry when
  // a name appears in more than one list.
  const auto RegisterMacros = [&](const auto &Names, TokenType Type) {
    for (const std::string &Name : Names)
      Macros.insert({&IdentTable.get(Name), Type});
  };

  RegisterMacros(Style.ForEachMacros, TT_ForEachMacro);
  RegisterMacros(Style.AttributeMacros, TT_AttributeMacro);
  RegisterMacros(Style.StatementMacros, TT_StatementMacro);
  RegisterMacros(Style.TypenameMacros, TT_TypenameMacro);
  RegisterMacros(Style.NamespaceMacros, TT_NamespaceMacro);
  RegisterMacros(Style.WhitespaceSensitiveMacros, TT_UntouchableMacroFunc);
}
56
57
36.8k
/// Lexes the whole buffer into \c Tokens and returns them.
///
/// After each raw token is appended, the language-specific fix-ups and the
/// merge passes run on the tail of the token list. Lexing stops once the
/// last token is the end-of-file token.
ArrayRef<FormatToken *> FormatTokenLexer::lex() {
  assert(Tokens.empty());
  assert(FirstInLineIndex == 0);

  while (true) {
    Tokens.push_back(getNextToken());

    if (Style.Language == FormatStyle::LK_JavaScript) {
      tryParseJSRegexLiteral();
      handleTemplateStrings();
    } else if (Style.Language == FormatStyle::LK_TextProto) {
      tryParsePythonComment();
    }

    tryMergePreviousTokens();

    // This needs to come after tokens have been merged so that C#
    // string literals are correctly identified.
    if (Style.isCSharp())
      handleCSharpVerbatimAndInterpolatedStrings();

    // Fetched only now: the passes above may have replaced the last token.
    const FormatToken *Newest = Tokens.back();
    if (Newest->NewlinesBefore > 0 || Newest->IsMultiline)
      FirstInLineIndex = Tokens.size() - 1;

    if (Newest->Tok.is(tok::eof))
      break;
  }
  return Tokens;
}
78
79
622k
void FormatTokenLexer::tryMergePreviousTokens() {
80
622k
  if (tryMerge_TMacro())
81
15
    return;
82
622k
  if (tryMergeConflictMarkers())
83
36
    return;
84
622k
  if (tryMergeLessLess())
85
982
    return;
86
621k
  if (tryMergeForEach())
87
7
    return;
88
621k
  if (Style.isCpp() && 
tryTransformTryUsageForC()566k
)
89
28
    return;
90
91
621k
  if (Style.isCSharp()) {
92
4.87k
    if (tryMergeCSharpKeywordVariables())
93
2
      return;
94
4.86k
    if (tryMergeCSharpStringLiteral())
95
43
      return;
96
4.82k
    if (tryMergeCSharpDoubleQuestion())
97
6
      return;
98
4.81k
    if (tryMergeCSharpNullConditional())
99
14
      return;
100
4.80k
    if (tryTransformCSharpForEach())
101
8
      return;
102
4.79k
    static const tok::TokenKind JSRightArrow[] = {tok::equal, tok::greater};
103
4.79k
    if (tryMergeTokens(JSRightArrow, TT_JsFatArrow))
104
34
      return;
105
621k
  }
106
107
621k
  if (tryMergeNSStringLiteral())
108
222
    return;
109
110
621k
  if (Style.Language == FormatStyle::LK_JavaScript) {
111
34.4k
    static const tok::TokenKind JSIdentity[] = {tok::equalequal, tok::equal};
112
34.4k
    static const tok::TokenKind JSNotIdentity[] = {tok::exclaimequal,
113
34.4k
                                                   tok::equal};
114
34.4k
    static const tok::TokenKind JSShiftEqual[] = {tok::greater, tok::greater,
115
34.4k
                                                  tok::greaterequal};
116
34.4k
    static const tok::TokenKind JSRightArrow[] = {tok::equal, tok::greater};
117
34.4k
    static const tok::TokenKind JSExponentiation[] = {tok::star, tok::star};
118
34.4k
    static const tok::TokenKind JSExponentiationEqual[] = {tok::star,
119
34.4k
                                                           tok::starequal};
120
34.4k
    static const tok::TokenKind JSNullPropagatingOperator[] = {tok::question,
121
34.4k
                                                               tok::period};
122
34.4k
    static const tok::TokenKind JSNullishOperator[] = {tok::question,
123
34.4k
                                                       tok::question};
124
34.4k
    static const tok::TokenKind JSNullishEqual[] = {tok::question,
125
34.4k
                                                    tok::question, tok::equal};
126
34.4k
    static const tok::TokenKind JSPipePipeEqual[] = {tok::pipepipe, tok::equal};
127
34.4k
    static const tok::TokenKind JSAndAndEqual[] = {tok::ampamp, tok::equal};
128
129
    // FIXME: Investigate what token type gives the correct operator priority.
130
34.4k
    if (tryMergeTokens(JSIdentity, TT_BinaryOperator))
131
12
      return;
132
34.4k
    if (tryMergeTokens(JSNotIdentity, TT_BinaryOperator))
133
12
      return;
134
34.4k
    if (tryMergeTokens(JSShiftEqual, TT_BinaryOperator))
135
10
      return;
136
34.4k
    if (tryMergeTokens(JSRightArrow, TT_JsFatArrow))
137
148
      return;
138
34.3k
    if (tryMergeTokens(JSExponentiation, TT_JsExponentiation))
139
4
      return;
140
34.3k
    if (tryMergeTokens(JSExponentiationEqual, TT_JsExponentiationEqual)) {
141
4
      Tokens.back()->Tok.setKind(tok::starequal);
142
4
      return;
143
4
    }
144
34.2k
    if (tryMergeTokens(JSNullishOperator, TT_JsNullishCoalescingOperator)) {
145
      // Treat like the "||" operator (as opposed to the ternary ?).
146
20
      Tokens.back()->Tok.setKind(tok::pipepipe);
147
20
      return;
148
20
    }
149
34.2k
    if (tryMergeTokens(JSNullPropagatingOperator,
150
16
                       TT_JsNullPropagatingOperator)) {
151
      // Treat like a regular "." access.
152
16
      Tokens.back()->Tok.setKind(tok::period);
153
16
      return;
154
16
    }
155
34.2k
    if (tryMergeTokens(JSAndAndEqual, TT_JsAndAndEqual) ||
156
34.2k
        tryMergeTokens(JSPipePipeEqual, TT_JsPipePipeEqual) ||
157
34.2k
        tryMergeTokens(JSNullishEqual, TT_JsNullishCoalescingEqual)) {
158
      // Treat like the "=" assignment operator.
159
12
      Tokens.back()->Tok.setKind(tok::equal);
160
12
      return;
161
12
    }
162
34.2k
    if (tryMergeJSPrivateIdentifier())
163
32
      return;
164
620k
  }
165
166
620k
  if (Style.Language == FormatStyle::LK_Java) {
167
4.46k
    static const tok::TokenKind JavaRightLogicalShiftAssign[] = {
168
4.46k
        tok::greater, tok::greater, tok::greaterequal};
169
4.46k
    if (tryMergeTokens(JavaRightLogicalShiftAssign, TT_BinaryOperator))
170
2
      return;
171
4.46k
  }
172
620k
}
173
174
621k
bool FormatTokenLexer::tryMergeNSStringLiteral() {
175
621k
  if (Tokens.size() < 2)
176
36.8k
    return false;
177
584k
  auto &At = *(Tokens.end() - 2);
178
584k
  auto &String = *(Tokens.end() - 1);
179
584k
  if (!At->is(tok::at) || 
!String->is(tok::string_literal)1.28k
)
180
584k
    return false;
181
222
  At->Tok.setKind(tok::string_literal);
182
222
  At->TokenText = StringRef(At->TokenText.begin(),
183
222
                            String->TokenText.end() - At->TokenText.begin());
184
222
  At->ColumnWidth += String->ColumnWidth;
185
222
  At->setType(TT_ObjCStringLiteral);
186
222
  Tokens.erase(Tokens.end() - 1);
187
222
  return true;
188
222
}
189
190
34.2k
bool FormatTokenLexer::tryMergeJSPrivateIdentifier() {
191
  // Merges #idenfier into a single identifier with the text #identifier
192
  // but the token tok::identifier.
193
34.2k
  if (Tokens.size() < 2)
194
2.81k
    return false;
195
31.4k
  auto &Hash = *(Tokens.end() - 2);
196
31.4k
  auto &Identifier = *(Tokens.end() - 1);
197
31.4k
  if (!Hash->is(tok::hash) || 
!Identifier->is(tok::identifier)36
)
198
31.4k
    return false;
199
32
  Hash->Tok.setKind(tok::identifier);
200
32
  Hash->TokenText =
201
32
      StringRef(Hash->TokenText.begin(),
202
32
                Identifier->TokenText.end() - Hash->TokenText.begin());
203
32
  Hash->ColumnWidth += Identifier->ColumnWidth;
204
32
  Hash->setType(TT_JsPrivateIdentifier);
205
32
  Tokens.erase(Tokens.end() - 1);
206
32
  return true;
207
32
}
208
209
// Search for verbatim or interpolated string literals @"ABC" or
210
// $"aaaaa{abc}aaaaa" i and mark the token as TT_CSharpStringLiteral, and to
211
// prevent splitting of @, $ and ".
212
// Merging of multiline verbatim strings with embedded '"' is handled in
213
// handleCSharpVerbatimAndInterpolatedStrings with lower-level lexing.
214
4.86k
bool FormatTokenLexer::tryMergeCSharpStringLiteral() {
215
4.86k
  if (Tokens.size() < 2)
216
335
    return false;
217
218
  // Interpolated strings could contain { } with " characters inside.
219
  // $"{x ?? "null"}"
220
  // should not be split into $"{x ?? ", null, "}" but should treated as a
221
  // single string-literal.
222
  //
223
  // We opt not to try and format expressions inside {} within a C#
224
  // interpolated string. Formatting expressions within an interpolated string
225
  // would require similar work as that done for JavaScript template strings
226
  // in `handleTemplateStrings()`.
227
4.53k
  auto &CSharpInterpolatedString = *(Tokens.end() - 2);
228
4.53k
  if (CSharpInterpolatedString->getType() == TT_CSharpStringLiteral &&
229
43
      (CSharpInterpolatedString->TokenText.startswith(R"($")") ||
230
37
       
CSharpInterpolatedString->TokenText.startswith(R"($@")")12
)) {
231
37
    int UnmatchedOpeningBraceCount = 0;
232
233
37
    auto TokenTextSize = CSharpInterpolatedString->TokenText.size();
234
989
    for (size_t Index = 0; Index < TokenTextSize; 
++Index952
) {
235
952
      char C = CSharpInterpolatedString->TokenText[Index];
236
952
      if (C == '{') {
237
        // "{{"  inside an interpolated string is an escaped '{' so skip it.
238
49
        if (Index + 1 < TokenTextSize &&
239
49
            CSharpInterpolatedString->TokenText[Index + 1] == '{') {
240
6
          ++Index;
241
6
          continue;
242
6
        }
243
43
        ++UnmatchedOpeningBraceCount;
244
903
      } else if (C == '}') {
245
        // "}}"  inside an interpolated string is an escaped '}' so skip it.
246
43
        if (Index + 1 < TokenTextSize &&
247
43
            CSharpInterpolatedString->TokenText[Index + 1] == '}') {
248
6
          ++Index;
249
6
          continue;
250
6
        }
251
37
        --UnmatchedOpeningBraceCount;
252
37
      }
253
952
    }
254
255
37
    if (UnmatchedOpeningBraceCount > 0) {
256
6
      auto &NextToken = *(Tokens.end() - 1);
257
6
      CSharpInterpolatedString->TokenText =
258
6
          StringRef(CSharpInterpolatedString->TokenText.begin(),
259
6
                    NextToken->TokenText.end() -
260
6
                        CSharpInterpolatedString->TokenText.begin());
261
6
      CSharpInterpolatedString->ColumnWidth += NextToken->ColumnWidth;
262
6
      Tokens.erase(Tokens.end() - 1);
263
6
      return true;
264
6
    }
265
4.52k
  }
266
267
  // Look for @"aaaaaa" or $"aaaaaa".
268
4.52k
  auto &String = *(Tokens.end() - 1);
269
4.52k
  if (!String->is(tok::string_literal))
270
4.43k
    return false;
271
272
93
  auto &At = *(Tokens.end() - 2);
273
93
  if (!(At->is(tok::at) || 
At->TokenText == "$"81
))
274
56
    return false;
275
276
37
  if (Tokens.size() > 2 && 
At->is(tok::at)35
) {
277
12
    auto &Dollar = *(Tokens.end() - 3);
278
12
    if (Dollar->TokenText == "$") {
279
      // This looks like $@"aaaaa" so we need to combine all 3 tokens.
280
6
      Dollar->Tok.setKind(tok::string_literal);
281
6
      Dollar->TokenText =
282
6
          StringRef(Dollar->TokenText.begin(),
283
6
                    String->TokenText.end() - Dollar->TokenText.begin());
284
6
      Dollar->ColumnWidth += (At->ColumnWidth + String->ColumnWidth);
285
6
      Dollar->setType(TT_CSharpStringLiteral);
286
6
      Tokens.erase(Tokens.end() - 2);
287
6
      Tokens.erase(Tokens.end() - 1);
288
6
      return true;
289
6
    }
290
31
  }
291
292
  // Convert back into just a string_literal.
293
31
  At->Tok.setKind(tok::string_literal);
294
31
  At->TokenText = StringRef(At->TokenText.begin(),
295
31
                            String->TokenText.end() - At->TokenText.begin());
296
31
  At->ColumnWidth += String->ColumnWidth;
297
31
  At->setType(TT_CSharpStringLiteral);
298
31
  Tokens.erase(Tokens.end() - 1);
299
31
  return true;
300
31
}
301
302
// Valid C# attribute targets:
303
// https://docs.microsoft.com/en-us/dotnet/csharp/programming-guide/concepts/attributes/#attribute-targets
304
const llvm::StringSet<> FormatTokenLexer::CSharpAttributeTargets = {
305
    "assembly", "module",   "field",  "event", "method",
306
    "param",    "property", "return", "type",
307
};
308
309
4.82k
bool FormatTokenLexer::tryMergeCSharpDoubleQuestion() {
310
4.82k
  if (Tokens.size() < 2)
311
335
    return false;
312
4.49k
  auto &FirstQuestion = *(Tokens.end() - 2);
313
4.49k
  auto &SecondQuestion = *(Tokens.end() - 1);
314
4.49k
  if (!FirstQuestion->is(tok::question) || 
!SecondQuestion->is(tok::question)58
)
315
4.48k
    return false;
316
6
  FirstQuestion->Tok.setKind(tok::question); // no '??' in clang tokens.
317
6
  FirstQuestion->TokenText = StringRef(FirstQuestion->TokenText.begin(),
318
6
                                       SecondQuestion->TokenText.end() -
319
6
                                           FirstQuestion->TokenText.begin());
320
6
  FirstQuestion->ColumnWidth += SecondQuestion->ColumnWidth;
321
6
  FirstQuestion->setType(TT_CSharpNullCoalescing);
322
6
  Tokens.erase(Tokens.end() - 1);
323
6
  return true;
324
6
}
325
326
// Merge '?[' and '?.' pairs into single tokens.
327
4.81k
bool FormatTokenLexer::tryMergeCSharpNullConditional() {
328
4.81k
  if (Tokens.size() < 2)
329
335
    return false;
330
4.48k
  auto &Question = *(Tokens.end() - 2);
331
4.48k
  auto &PeriodOrLSquare = *(Tokens.end() - 1);
332
4.48k
  if (!Question->is(tok::question) ||
333
52
      !PeriodOrLSquare->isOneOf(tok::l_square, tok::period))
334
4.47k
    return false;
335
14
  Question->TokenText =
336
14
      StringRef(Question->TokenText.begin(),
337
14
                PeriodOrLSquare->TokenText.end() - Question->TokenText.begin());
338
14
  Question->ColumnWidth += PeriodOrLSquare->ColumnWidth;
339
340
14
  if (PeriodOrLSquare->is(tok::l_square)) {
341
8
    Question->Tok.setKind(tok::question); // no '?[' in clang tokens.
342
8
    Question->setType(TT_CSharpNullConditionalLSquare);
343
6
  } else {
344
6
    Question->Tok.setKind(tok::question); // no '?.' in clang tokens.
345
6
    Question->setType(TT_CSharpNullConditional);
346
6
  }
347
348
14
  Tokens.erase(Tokens.end() - 1);
349
14
  return true;
350
14
}
351
352
4.87k
bool FormatTokenLexer::tryMergeCSharpKeywordVariables() {
353
4.87k
  if (Tokens.size() < 2)
354
335
    return false;
355
4.53k
  auto &At = *(Tokens.end() - 2);
356
4.53k
  auto &Keyword = *(Tokens.end() - 1);
357
4.53k
  if (!At->is(tok::at))
358
4.51k
    return false;
359
16
  if (!Keywords.isCSharpKeyword(*Keyword))
360
14
    return false;
361
362
2
  At->Tok.setKind(tok::identifier);
363
2
  At->TokenText = StringRef(At->TokenText.begin(),
364
2
                            Keyword->TokenText.end() - At->TokenText.begin());
365
2
  At->ColumnWidth += Keyword->ColumnWidth;
366
2
  At->setType(Keyword->getType());
367
2
  Tokens.erase(Tokens.end() - 1);
368
2
  return true;
369
2
}
370
371
// In C# transform identifier foreach into kw_foreach
372
4.80k
bool FormatTokenLexer::tryTransformCSharpForEach() {
373
4.80k
  if (Tokens.size() < 1)
374
0
    return false;
375
4.80k
  auto &Identifier = *(Tokens.end() - 1);
376
4.80k
  if (!Identifier->is(tok::identifier))
377
3.40k
    return false;
378
1.40k
  if (Identifier->TokenText != "foreach")
379
1.39k
    return false;
380
381
8
  Identifier->setType(TT_ForEachMacro);
382
8
  Identifier->Tok.setKind(tok::kw_for);
383
8
  return true;
384
8
}
385
386
621k
bool FormatTokenLexer::tryMergeForEach() {
387
621k
  if (Tokens.size() < 2)
388
36.8k
    return false;
389
584k
  auto &For = *(Tokens.end() - 2);
390
584k
  auto &Each = *(Tokens.end() - 1);
391
584k
  if (!For->is(tok::kw_for))
392
583k
    return false;
393
716
  if (!Each->is(tok::identifier))
394
701
    return false;
395
15
  if (Each->TokenText != "each")
396
8
    return false;
397
398
7
  For->setType(TT_ForEachMacro);
399
7
  For->Tok.setKind(tok::kw_for);
400
401
7
  For->TokenText = StringRef(For->TokenText.begin(),
402
7
                             Each->TokenText.end() - For->TokenText.begin());
403
7
  For->ColumnWidth += Each->ColumnWidth;
404
7
  Tokens.erase(Tokens.end() - 1);
405
7
  return true;
406
7
}
407
408
566k
bool FormatTokenLexer::tryTransformTryUsageForC() {
409
566k
  if (Tokens.size() < 2)
410
32.6k
    return false;
411
533k
  auto &Try = *(Tokens.end() - 2);
412
533k
  if (!Try->is(tok::kw_try))
413
533k
    return false;
414
204
  auto &Next = *(Tokens.end() - 1);
415
204
  if (Next->isOneOf(tok::l_brace, tok::colon, tok::hash, tok::comment))
416
174
    return false;
417
418
30
  if (Tokens.size() > 2) {
419
23
    auto &At = *(Tokens.end() - 3);
420
23
    if (At->is(tok::at))
421
2
      return false;
422
28
  }
423
424
28
  Try->Tok.setKind(tok::identifier);
425
28
  return true;
426
28
}
427
428
622k
bool FormatTokenLexer::tryMergeLessLess() {
429
  // Merge X,less,less,Y into X,lessless,Y unless X or Y is less.
430
622k
  if (Tokens.size() < 3)
431
73.5k
    return false;
432
433
549k
  bool FourthTokenIsLess = false;
434
549k
  if (Tokens.size() > 3)
435
513k
    FourthTokenIsLess = (Tokens.end() - 4)[0]->is(tok::less);
436
437
549k
  auto First = Tokens.end() - 3;
438
549k
  if (First[2]->is(tok::less) || 
First[1]->isNot(tok::less)540k
||
439
10.0k
      First[0]->isNot(tok::less) || 
FourthTokenIsLess1.05k
)
440
548k
    return false;
441
442
  // Only merge if there currently is no whitespace between the two "<".
443
982
  if (First[1]->WhitespaceRange.getBegin() !=
444
982
      First[1]->WhitespaceRange.getEnd())
445
0
    return false;
446
447
982
  First[0]->Tok.setKind(tok::lessless);
448
982
  First[0]->TokenText = "<<";
449
982
  First[0]->ColumnWidth += 1;
450
982
  Tokens.erase(Tokens.end() - 2);
451
982
  return true;
452
982
}
453
454
bool FormatTokenLexer::tryMergeTokens(ArrayRef<tok::TokenKind> Kinds,
455
387k
                                      TokenType NewType) {
456
387k
  if (Tokens.size() < Kinds.size())
457
37.4k
    return false;
458
459
349k
  SmallVectorImpl<FormatToken *>::const_iterator First =
460
349k
      Tokens.end() - Kinds.size();
461
349k
  if (!First[0]->is(Kinds[0]))
462
346k
    return false;
463
2.79k
  unsigned AddLength = 0;
464
3.13k
  for (unsigned i = 1; i < Kinds.size(); 
++i336
) {
465
2.85k
    if (!First[i]->is(Kinds[i]) || First[i]->WhitespaceRange.getBegin() !=
466
348
                                       First[i]->WhitespaceRange.getEnd())
467
2.52k
      return false;
468
336
    AddLength += First[i]->TokenText.size();
469
336
  }
470
274
  Tokens.resize(Tokens.size() - Kinds.size() + 1);
471
274
  First[0]->TokenText = StringRef(First[0]->TokenText.data(),
472
274
                                  First[0]->TokenText.size() + AddLength);
473
274
  First[0]->ColumnWidth += AddLength;
474
274
  First[0]->setType(NewType);
475
274
  return true;
476
2.79k
}
477
478
// Returns \c true if \p Tok can only be followed by an operand in JavaScript.
479
336
bool FormatTokenLexer::precedesOperand(FormatToken *Tok) {
480
  // NB: This is not entirely correct, as an r_paren can introduce an operand
481
  // location in e.g. `if (foo) /bar/.exec(...);`. That is a rare enough
482
  // corner case to not matter in practice, though.
483
336
  return Tok->isOneOf(tok::period, tok::l_paren, tok::comma, tok::l_brace,
484
336
                      tok::r_brace, tok::l_square, tok::semi, tok::exclaim,
485
336
                      tok::colon, tok::question, tok::tilde) ||
486
292
         Tok->isOneOf(tok::kw_return, tok::kw_do, tok::kw_case, tok::kw_throw,
487
292
                      tok::kw_else, tok::kw_new, tok::kw_delete, tok::kw_void,
488
292
                      tok::kw_typeof, Keywords.kw_instanceof, Keywords.kw_in) ||
489
288
         Tok->isBinaryOperator();
490
336
}
491
492
340
/// Decides whether a regex literal may start right after \p Prev, the last
/// non-comment token before the slash (null if there is none).
bool FormatTokenLexer::canPrecedeRegexLiteral(FormatToken *Prev) {
  // Nothing precedes the slash, so it can start a regex literal.
  if (!Prev)
    return true;

  // Regex literals can only follow after prefix unary operators, not after
  // postfix unary operators. If the '++' is followed by a non-operand
  // introducing token, the slash here is the operand and not the start of a
  // regex.
  // `!` is an unary prefix operator, but also a post-fix operator that casts
  // away nullability, so the same check applies.
  if (Prev->isOneOf(tok::plusplus, tok::minusminus, tok::exclaim)) {
    const size_t Count = Tokens.size();
    return Count < 3 || precedesOperand(Tokens[Count - 3]);
  }

  // Otherwise the previous token must introduce an operand location where
  // regex literals can occur.
  return precedesOperand(Prev);
}
512
513
// Tries to parse a JavaScript Regex literal starting at the current token,
514
// if that begins with a slash and is in a location where JavaScript allows
515
// regex literals. Changes the current token to a regex literal and updates
516
// its text if successful.
517
34.4k
void FormatTokenLexer::tryParseJSRegexLiteral() {
518
34.4k
  FormatToken *RegexToken = Tokens.back();
519
34.4k
  if (!RegexToken->isOneOf(tok::slash, tok::slashequal))
520
34.1k
    return;
521
522
340
  FormatToken *Prev = nullptr;
523
348
  for (auto I = Tokens.rbegin() + 1, E = Tokens.rend(); I != E; 
++I8
) {
524
    // NB: Because previous pointers are not initialized yet, this cannot use
525
    // Token.getPreviousNonComment.
526
344
    if ((*I)->isNot(tok::comment)) {
527
336
      Prev = *I;
528
336
      break;
529
336
    }
530
344
  }
531
532
340
  if (!canPrecedeRegexLiteral(Prev))
533
36
    return;
534
535
  // 'Manually' lex ahead in the current file buffer.
536
304
  const char *Offset = Lex->getBufferLocation();
537
304
  const char *RegexBegin = Offset - RegexToken->TokenText.size();
538
304
  StringRef Buffer = Lex->getBuffer();
539
304
  bool InCharacterClass = false;
540
304
  bool HaveClosingSlash = false;
541
1.91k
  for (; !HaveClosingSlash && 
Offset != Buffer.end()1.61k
;
++Offset1.61k
) {
542
    // Regular expressions are terminated with a '/', which can only be
543
    // escaped using '\' or a character class between '[' and ']'.
544
    // See http://www.ecma-international.org/ecma-262/5.1/#sec-7.8.5.
545
1.61k
    switch (*Offset) {
546
116
    case '\\':
547
      // Skip the escaped character.
548
116
      ++Offset;
549
116
      break;
550
40
    case '[':
551
40
      InCharacterClass = true;
552
40
      break;
553
40
    case ']':
554
40
      InCharacterClass = false;
555
40
      break;
556
320
    case '/':
557
320
      if (!InCharacterClass)
558
304
        HaveClosingSlash = true;
559
320
      break;
560
1.61k
    }
561
1.61k
  }
562
563
304
  RegexToken->setType(TT_RegexLiteral);
564
  // Treat regex literals like other string_literals.
565
304
  RegexToken->Tok.setKind(tok::string_literal);
566
304
  RegexToken->TokenText = StringRef(RegexBegin, Offset - RegexBegin);
567
304
  RegexToken->ColumnWidth = RegexToken->TokenText.size();
568
569
304
  resetLexer(SourceMgr.getFileOffset(Lex->getSourceLocation(Offset)));
570
304
}
571
572
4.87k
void FormatTokenLexer::handleCSharpVerbatimAndInterpolatedStrings() {
573
4.87k
  FormatToken *CSharpStringLiteral = Tokens.back();
574
575
4.87k
  if (CSharpStringLiteral->getType() != TT_CSharpStringLiteral)
576
4.82k
    return;
577
578
  // Deal with multiline strings.
579
43
  if (!(CSharpStringLiteral->TokenText.startswith(R"(@")") ||
580
37
        CSharpStringLiteral->TokenText.startswith(R"($@")")))
581
31
    return;
582
583
12
  const char *StrBegin =
584
12
      Lex->getBufferLocation() - CSharpStringLiteral->TokenText.size();
585
12
  const char *Offset = StrBegin;
586
12
  if (CSharpStringLiteral->TokenText.startswith(R"(@")"))
587
6
    Offset += 2;
588
6
  else // CSharpStringLiteral->TokenText.startswith(R"($@")")
589
6
    Offset += 3;
590
591
  // Look for a terminating '"' in the current file buffer.
592
  // Make no effort to format code within an interpolated or verbatim string.
593
288
  for (; Offset != Lex->getBuffer().end(); 
++Offset276
) {
594
288
    if (Offset[0] == '"') {
595
      // "" within a verbatim string is an escaped double quote: skip it.
596
22
      if (Offset + 1 < Lex->getBuffer().end() && Offset[1] == '"')
597
10
        ++Offset;
598
12
      else
599
12
        break;
600
22
    }
601
288
  }
602
603
  // Make no attempt to format code properly if a verbatim string is
604
  // unterminated.
605
12
  if (Offset == Lex->getBuffer().end())
606
0
    return;
607
608
12
  StringRef LiteralText(StrBegin, Offset - StrBegin + 1);
609
12
  CSharpStringLiteral->TokenText = LiteralText;
610
611
  // Adjust width for potentially multiline string literals.
612
12
  size_t FirstBreak = LiteralText.find('\n');
613
12
  StringRef FirstLineText = FirstBreak == StringRef::npos
614
10
                                ? LiteralText
615
2
                                : LiteralText.substr(0, FirstBreak);
616
12
  CSharpStringLiteral->ColumnWidth = encoding::columnWidthWithTabs(
617
12
      FirstLineText, CSharpStringLiteral->OriginalColumn, Style.TabWidth,
618
12
      Encoding);
619
12
  size_t LastBreak = LiteralText.rfind('\n');
620
12
  if (LastBreak != StringRef::npos) {
621
2
    CSharpStringLiteral->IsMultiline = true;
622
2
    unsigned StartColumn = 0;
623
2
    CSharpStringLiteral->LastLineColumnWidth = encoding::columnWidthWithTabs(
624
2
        LiteralText.substr(LastBreak + 1, LiteralText.size()), StartColumn,
625
2
        Style.TabWidth, Encoding);
626
2
  }
627
628
12
  SourceLocation loc = Offset < Lex->getBuffer().end()
629
12
                           ? Lex->getSourceLocation(Offset + 1)
630
0
                           : SourceMgr.getLocForEndOfFile(ID);
631
12
  resetLexer(SourceMgr.getFileOffset(loc));
632
12
}
633
634
34.4k
void FormatTokenLexer::handleTemplateStrings() {
635
34.4k
  FormatToken *BacktickToken = Tokens.back();
636
637
34.4k
  if (BacktickToken->is(tok::l_brace)) {
638
1.85k
    StateStack.push(LexerState::NORMAL);
639
1.85k
    return;
640
1.85k
  }
641
32.6k
  if (BacktickToken->is(tok::r_brace)) {
642
1.95k
    if (StateStack.size() == 1)
643
4
      return;
644
1.94k
    StateStack.pop();
645
1.94k
    if (StateStack.top() != LexerState::TEMPLATE_STRING)
646
1.84k
      return;
647
    // If back in TEMPLATE_STRING, fallthrough and continue parsing the
648
30.6k
  } else if (BacktickToken->is(tok::unknown) &&
649
148
             BacktickToken->TokenText == "`") {
650
148
    StateStack.push(LexerState::TEMPLATE_STRING);
651
30.5k
  } else {
652
30.5k
    return; // Not actually a template
653
30.5k
  }
654
655
  // 'Manually' lex ahead in the current file buffer.
656
248
  const char *Offset = Lex->getBufferLocation();
657
248
  const char *TmplBegin = Offset - BacktickToken->TokenText.size(); // at "`"
658
1.90k
  for (; Offset != Lex->getBuffer().end(); 
++Offset1.65k
) {
659
1.90k
    if (Offset[0] == '`') {
660
148
      StateStack.pop();
661
148
      break;
662
148
    }
663
1.75k
    if (Offset[0] == '\\') {
664
8
      ++Offset; // Skip the escaped character.
665
1.74k
    } else if (Offset + 1 < Lex->getBuffer().end() && Offset[0] == '$' &&
666
100
               Offset[1] == '{') {
667
      // '${' introduces an expression interpolation in the template string.
668
100
      StateStack.push(LexerState::NORMAL);
669
100
      ++Offset;
670
100
      break;
671
100
    }
672
1.75k
  }
673
674
248
  StringRef LiteralText(TmplBegin, Offset - TmplBegin + 1);
675
248
  BacktickToken->setType(TT_TemplateString);
676
248
  BacktickToken->Tok.setKind(tok::string_literal);
677
248
  BacktickToken->TokenText = LiteralText;
678
679
  // Adjust width for potentially multiline string literals.
680
248
  size_t FirstBreak = LiteralText.find('\n');
681
248
  StringRef FirstLineText = FirstBreak == StringRef::npos
682
212
                                ? LiteralText
683
36
                                : LiteralText.substr(0, FirstBreak);
684
248
  BacktickToken->ColumnWidth = encoding::columnWidthWithTabs(
685
248
      FirstLineText, BacktickToken->OriginalColumn, Style.TabWidth, Encoding);
686
248
  size_t LastBreak = LiteralText.rfind('\n');
687
248
  if (LastBreak != StringRef::npos) {
688
36
    BacktickToken->IsMultiline = true;
689
36
    unsigned StartColumn = 0; // The template tail spans the entire line.
690
36
    BacktickToken->LastLineColumnWidth = encoding::columnWidthWithTabs(
691
36
        LiteralText.substr(LastBreak + 1, LiteralText.size()), StartColumn,
692
36
        Style.TabWidth, Encoding);
693
36
  }
694
695
248
  SourceLocation loc = Offset < Lex->getBuffer().end()
696
248
                           ? Lex->getSourceLocation(Offset + 1)
697
0
                           : SourceMgr.getLocForEndOfFile(ID);
698
248
  resetLexer(SourceMgr.getFileOffset(loc));
699
248
}
700
701
6.72k
void FormatTokenLexer::tryParsePythonComment() {
702
6.72k
  FormatToken *HashToken = Tokens.back();
703
6.72k
  if (!HashToken->isOneOf(tok::hash, tok::hashhash))
704
6.66k
    return;
705
  // Turn the remainder of this line into a comment.
706
63
  const char *CommentBegin =
707
63
      Lex->getBufferLocation() - HashToken->TokenText.size(); // at "#"
708
63
  size_t From = CommentBegin - Lex->getBuffer().begin();
709
63
  size_t To = Lex->getBuffer().find_first_of('\n', From);
710
63
  if (To == StringRef::npos)
711
7
    To = Lex->getBuffer().size();
712
63
  size_t Len = To - From;
713
63
  HashToken->setType(TT_LineComment);
714
63
  HashToken->Tok.setKind(tok::comment);
715
63
  HashToken->TokenText = Lex->getBuffer().substr(From, Len);
716
63
  SourceLocation Loc = To < Lex->getBuffer().size()
717
56
                           ? Lex->getSourceLocation(CommentBegin + Len)
718
7
                           : SourceMgr.getLocForEndOfFile(ID);
719
63
  resetLexer(SourceMgr.getFileOffset(Loc));
720
63
}
721
722
622k
bool FormatTokenLexer::tryMerge_TMacro() {
723
622k
  if (Tokens.size() < 4)
724
109k
    return false;
725
513k
  FormatToken *Last = Tokens.back();
726
513k
  if (!Last->is(tok::r_paren))
727
466k
    return false;
728
729
46.2k
  FormatToken *String = Tokens[Tokens.size() - 2];
730
46.2k
  if (!String->is(tok::string_literal) || 
String->IsMultiline629
)
731
45.6k
    return false;
732
733
596
  if (!Tokens[Tokens.size() - 3]->is(tok::l_paren))
734
276
    return false;
735
736
320
  FormatToken *Macro = Tokens[Tokens.size() - 4];
737
320
  if (Macro->TokenText != "_T")
738
305
    return false;
739
740
15
  const char *Start = Macro->TokenText.data();
741
15
  const char *End = Last->TokenText.data() + Last->TokenText.size();
742
15
  String->TokenText = StringRef(Start, End - Start);
743
15
  String->IsFirst = Macro->IsFirst;
744
15
  String->LastNewlineOffset = Macro->LastNewlineOffset;
745
15
  String->WhitespaceRange = Macro->WhitespaceRange;
746
15
  String->OriginalColumn = Macro->OriginalColumn;
747
15
  String->ColumnWidth = encoding::columnWidthWithTabs(
748
15
      String->TokenText, String->OriginalColumn, Style.TabWidth, Encoding);
749
15
  String->NewlinesBefore = Macro->NewlinesBefore;
750
15
  String->HasUnescapedNewline = Macro->HasUnescapedNewline;
751
752
15
  Tokens.pop_back();
753
15
  Tokens.pop_back();
754
15
  Tokens.pop_back();
755
15
  Tokens.back() = String;
756
15
  return true;
757
15
}
758
759
622k
bool FormatTokenLexer::tryMergeConflictMarkers() {
760
622k
  if (Tokens.back()->NewlinesBefore == 0 && 
Tokens.back()->isNot(tok::eof)563k
)
761
529k
    return false;
762
763
  // Conflict lines look like:
764
  // <marker> <text from the vcs>
765
  // For example:
766
  // >>>>>>> /file/in/file/system at revision 1234
767
  //
768
  // We merge all tokens in a line that starts with a conflict marker
769
  // into a single token with a special token type that the unwrapped line
770
  // parser will use to correctly rebuild the underlying code.
771
772
92.9k
  FileID ID;
773
  // Get the position of the first token in the line.
774
92.9k
  unsigned FirstInLineOffset;
775
92.9k
  std::tie(ID, FirstInLineOffset) = SourceMgr.getDecomposedLoc(
776
92.9k
      Tokens[FirstInLineIndex]->getStartOfNonWhitespace());
777
92.9k
  StringRef Buffer = SourceMgr.getBufferOrFake(ID).getBuffer();
778
  // Calculate the offset of the start of the current line.
779
92.9k
  auto LineOffset = Buffer.rfind('\n', FirstInLineOffset);
780
92.9k
  if (LineOffset == StringRef::npos) {
781
35.6k
    LineOffset = 0;
782
57.2k
  } else {
783
57.2k
    ++LineOffset;
784
57.2k
  }
785
786
92.9k
  auto FirstSpace = Buffer.find_first_of(" \n", LineOffset);
787
92.9k
  StringRef LineStart;
788
92.9k
  if (FirstSpace == StringRef::npos) {
789
5.52k
    LineStart = Buffer.substr(LineOffset);
790
87.4k
  } else {
791
87.4k
    LineStart = Buffer.substr(LineOffset, FirstSpace - LineOffset);
792
87.4k
  }
793
794
92.9k
  TokenType Type = TT_Unknown;
795
92.9k
  if (LineStart == "<<<<<<<" || 
LineStart == ">>>>"92.9k
) {
796
9
    Type = TT_ConflictStart;
797
92.9k
  } else if (LineStart == "|||||||" || 
LineStart == "======="92.9k
||
798
92.9k
             LineStart == "====") {
799
18
    Type = TT_ConflictAlternative;
800
92.9k
  } else if (LineStart == ">>>>>>>" || 
LineStart == "<<<<"92.9k
) {
801
9
    Type = TT_ConflictEnd;
802
9
  }
803
804
92.9k
  if (Type != TT_Unknown) {
805
36
    FormatToken *Next = Tokens.back();
806
807
36
    Tokens.resize(FirstInLineIndex + 1);
808
    // We do not need to build a complete token here, as we will skip it
809
    // during parsing anyway (as we must not touch whitespace around conflict
810
    // markers).
811
36
    Tokens.back()->setType(Type);
812
36
    Tokens.back()->Tok.setKind(tok::kw___unknown_anytype);
813
814
36
    Tokens.push_back(Next);
815
36
    return true;
816
36
  }
817
818
92.9k
  return false;
819
92.9k
}
820
821
1.49k
// Creates a synthesized second '>' or '<' token from the current FormatTok:
// a fresh token with the same kind and text, located at the last character of
// the original token and one column further right. FormatTok is repointed to
// the new token, which is also returned.
FormatToken *FormatTokenLexer::getStashedToken() {
  // Snapshot what is needed from the current token before replacing it.
  const Token PrevTok = FormatTok->Tok;
  const StringRef PrevText = FormatTok->TokenText;
  const unsigned PrevColumn = FormatTok->OriginalColumn;

  FormatToken *Second = new (Allocator.Allocate()) FormatToken;
  Second->Tok = PrevTok;

  // Point at the final character of the original token.
  const SourceLocation SecondLoc =
      PrevTok.getLocation().getLocWithOffset(PrevTok.getLength() - 1);
  Second->Tok.setLocation(SecondLoc);
  // Empty range: there is no whitespace in front of the second half.
  Second->WhitespaceRange = SourceRange(SecondLoc, SecondLoc);
  Second->TokenText = PrevText;
  Second->ColumnWidth = 1;
  Second->OriginalColumn = PrevColumn + 1;

  FormatTok = Second;
  return FormatTok;
}
839
840
622k
// Produces the next significant token. Raw tok::unknown tokens consisting of
// whitespace are consumed and recorded on the returned token (NewlinesBefore,
// LastNewlineOffset, HasUnescapedNewline, WhitespaceRange) while the running
// Column is updated. '>>' and '<<' are split in two: the first half is
// returned here and the second half by the next call via getStashedToken().
FormatToken *FormatTokenLexer::getNextToken() {
  if (StateStack.top() == LexerState::TOKEN_STASHED) {
    StateStack.pop();
    return getStashedToken();
  }

  FormatTok = new (Allocator.Allocate()) FormatToken;
  readRawToken(*FormatTok);
  // Whitespace trimmed from the end of the preceding comment is counted as
  // leading whitespace of this token.
  SourceLocation WhitespaceStart =
      FormatTok->Tok.getLocation().getLocWithOffset(-TrailingWhitespace);
  FormatTok->IsFirst = IsFirstToken;
  IsFirstToken = false;

  // Consume and record whitespace until we find a significant token.
  unsigned WhitespaceLength = TrailingWhitespace;
  while (FormatTok->Tok.is(tok::unknown)) {
    const StringRef Chunk = FormatTok->TokenText;

    // Tells whether the characters ending at index Pos escape a newline that
    // directly follows them.
    auto EndsInEscape = [&Chunk](int Pos) {
      // A '\r' here is just part of '\r\n'. Skip it.
      if (Pos >= 0 && Chunk[Pos] == '\r')
        --Pos;
      // See whether there is an odd number of '\' before this.
      // FIXME: This is wrong. A '\' followed by a newline is always removed,
      // regardless of whether there is another '\' before it.
      // FIXME: Newlines can also be escaped by a '?' '?' '/' trigraph.
      unsigned Backslashes = 0;
      while (Pos >= 0 && Chunk[Pos] == '\\') {
        --Pos;
        ++Backslashes;
      }
      return (Backslashes & 1) != 0;
    };

    // FIXME: This miscounts tok:unknown tokens that are not just
    // whitespace, e.g. a '`' character.
    const int ChunkSize = Chunk.size();
    for (int Idx = 0; Idx != ChunkSize; ++Idx) {
      switch (Chunk[Idx]) {
      case '\n':
        ++FormatTok->NewlinesBefore;
        FormatTok->HasUnescapedNewline = !EndsInEscape(Idx - 1);
        FormatTok->LastNewlineOffset = WhitespaceLength + Idx + 1;
        Column = 0;
        break;
      case '\r':
        FormatTok->LastNewlineOffset = WhitespaceLength + Idx + 1;
        Column = 0;
        break;
      case '\f':
      case '\v':
        Column = 0;
        break;
      case ' ':
        ++Column;
        break;
      case '\t':
        // Advance to the next tab stop; a TabWidth of 0 adds nothing.
        Column +=
            Style.TabWidth - (Style.TabWidth ? Column % Style.TabWidth : 0);
        break;
      case '\\': {
        // A backslash that does not start a line continuation is not
        // whitespace.
        const bool IsLastChar = Idx + 1 == ChunkSize;
        if (IsLastChar ||
            (Chunk[Idx + 1] != '\r' && Chunk[Idx + 1] != '\n'))
          FormatTok->setType(TT_ImplicitStringLiteral);
        break;
      }
      default:
        FormatTok->setType(TT_ImplicitStringLiteral);
        break;
      }
      if (FormatTok->getType() == TT_ImplicitStringLiteral)
        break;
    }

    // A non-whitespace unknown token is itself the significant token.
    if (FormatTok->is(TT_ImplicitStringLiteral))
      break;
    WhitespaceLength += FormatTok->Tok.getLength();

    readRawToken(*FormatTok);
  }

  // JavaScript and Java do not allow to escape the end of the line with a
  // backslash. Backslashes are syntax errors in plain source, but can occur in
  // comments. When a single line comment ends with a \, it'll cause the next
  // line of code to be lexed as a comment, breaking formatting. The code below
  // finds comments that contain a backslash followed by a line break, truncates
  // the comment token at the backslash, and resets the lexer to restart behind
  // the backslash.
  const bool NoBackslashContinuation =
      Style.Language == FormatStyle::LK_JavaScript ||
      Style.Language == FormatStyle::LK_Java;
  if (NoBackslashContinuation && FormatTok->is(tok::comment) &&
      FormatTok->TokenText.startswith("//")) {
    for (size_t Pos = FormatTok->TokenText.find('\\'); Pos != StringRef::npos;
         Pos = FormatTok->TokenText.find('\\', Pos + 1)) {
      const bool FollowedByNewline =
          Pos + 1 < FormatTok->TokenText.size() &&
          FormatTok->TokenText[Pos + 1] == '\n';
      if (!FollowedByNewline)
        continue;

      // Restart lexing at the character right behind the backslash.
      const char *Resume =
          (Lex->getBufferLocation() - FormatTok->TokenText.size()) + (Pos + 1);
      resetLexer(SourceMgr.getFileOffset(Lex->getSourceLocation(Resume)));
      FormatTok->TokenText = FormatTok->TokenText.substr(0, Pos + 1);
      FormatTok->ColumnWidth = encoding::columnWidthWithTabs(
          FormatTok->TokenText, FormatTok->OriginalColumn, Style.TabWidth,
          Encoding);
      break;
    }
  }

  // In case the token starts with escaped newlines, we want to
  // take them into account as whitespace - this pattern is quite frequent
  // in macro definitions.
  // FIXME: Add a more explicit test.
  while (FormatTok->TokenText.size() > 1 && FormatTok->TokenText[0] == '\\') {
    unsigned SkippedWhitespace = 0;
    if (FormatTok->TokenText.startswith("\\\r\n"))
      SkippedWhitespace = 3;
    else if (FormatTok->TokenText[1] == '\n')
      SkippedWhitespace = 2;
    else
      break;

    ++FormatTok->NewlinesBefore;
    WhitespaceLength += SkippedWhitespace;
    FormatTok->LastNewlineOffset = SkippedWhitespace;
    Column = 0;
    FormatTok->TokenText = FormatTok->TokenText.substr(SkippedWhitespace);
  }

  FormatTok->WhitespaceRange = SourceRange(
      WhitespaceStart, WhitespaceStart.getLocWithOffset(WhitespaceLength));

  FormatTok->OriginalColumn = Column;

  TrailingWhitespace = 0;

  // Turns a doubled '>>' / '<<' into its first half and schedules the second
  // half for the next call.
  auto SplitDoubledToken = [this](tok::TokenKind Single) {
    FormatTok->Tok.setKind(Single);
    FormatTok->TokenText = FormatTok->TokenText.substr(0, 1);
    ++Column;
    StateStack.push(LexerState::TOKEN_STASHED);
  };

  switch (FormatTok->Tok.getKind()) {
  case tok::comment: {
    // FIXME: Add the trimmed whitespace to Column.
    const StringRef UntrimmedText = FormatTok->TokenText;
    FormatTok->TokenText = UntrimmedText.rtrim(" \t\v\f");
    TrailingWhitespace = UntrimmedText.size() - FormatTok->TokenText.size();
    break;
  }
  case tok::raw_identifier: {
    // Resolve the raw identifier to its keyword or identifier kind.
    IdentifierInfo &Info = IdentTable.get(FormatTok->TokenText);
    FormatTok->Tok.setIdentifierInfo(&Info);
    FormatTok->Tok.setKind(Info.getTokenID());

    // These keywords are demoted to plain identifiers in Java / JavaScript.
    const bool DemoteInJava =
        Style.Language == FormatStyle::LK_Java &&
        FormatTok->isOneOf(tok::kw_struct, tok::kw_union, tok::kw_delete,
                           tok::kw_operator);
    const bool DemoteInJavaScript =
        Style.Language == FormatStyle::LK_JavaScript &&
        FormatTok->isOneOf(tok::kw_struct, tok::kw_union, tok::kw_operator);
    if (DemoteInJava || DemoteInJavaScript) {
      FormatTok->Tok.setKind(tok::identifier);
      FormatTok->Tok.setIdentifierInfo(nullptr);
    }
    break;
  }
  case tok::greatergreater:
    SplitDoubledToken(tok::greater);
    break;
  case tok::lessless:
    SplitDoubledToken(tok::less);
    break;
  default:
    break;
  }

  // Now FormatTok is the next non-whitespace token.

  const StringRef Text = FormatTok->TokenText;
  const size_t FirstNewlinePos = Text.find('\n');
  if (FirstNewlinePos != StringRef::npos) {
    FormatTok->IsMultiline = true;
    // FIXME: ColumnWidth actually depends on the start column, we need to
    // take this into account when the token is moved.
    FormatTok->ColumnWidth = encoding::columnWidthWithTabs(
        Text.substr(0, FirstNewlinePos), Column, Style.TabWidth, Encoding);

    // The last line of the token always starts in column 0.
    // Thus, the length can be precomputed even in the presence of tabs.
    FormatTok->LastLineColumnWidth = encoding::columnWidthWithTabs(
        Text.substr(Text.rfind('\n') + 1), 0, Style.TabWidth, Encoding);
    Column = FormatTok->LastLineColumnWidth;
  } else {
    // FIXME: ColumnWidth actually depends on the start column, we need to
    // take this into account when the token is moved.
    FormatTok->ColumnWidth =
        encoding::columnWidthWithTabs(Text, Column, Style.TabWidth, Encoding);
    Column += FormatTok->ColumnWidth;
  }

  if (Style.isCpp()) {
    auto MacroIt = Macros.find(FormatTok->Tok.getIdentifierInfo());
    // A registered macro name directly after `define` is not given its
    // registered type.
    const IdentifierInfo *PrevInfo =
        Tokens.empty() ? nullptr : Tokens.back()->Tok.getIdentifierInfo();
    const bool FollowsDefine =
        PrevInfo && PrevInfo->getPPKeywordID() == tok::pp_define;

    if (!FollowsDefine && MacroIt != Macros.end()) {
      FormatTok->setType(MacroIt->second);
    } else if (FormatTok->is(tok::identifier)) {
      if (MacroBlockBeginRegex.match(Text))
        FormatTok->setType(TT_MacroBlockBegin);
      else if (MacroBlockEndRegex.match(Text))
        FormatTok->setType(TT_MacroBlockEnd);
    }
  }

  return FormatTok;
}
1044
1045
900k
// Lexes one raw token into Tok, records its source text, adjusts its kind for
// a few special cases, and applies the `clang-format on` / `clang-format off`
// comment switches to Tok.Finalized.
void FormatTokenLexer::readRawToken(FormatToken &Tok) {
  Lex->LexFromRawLexer(Tok.Tok);
  const char *TextBegin = SourceMgr.getCharacterData(Tok.Tok.getLocation());
  Tok.TokenText = StringRef(TextBegin, Tok.Tok.getLength());

  const bool IsJavaScript = Style.Language == FormatStyle::LK_JavaScript;

  // For formatting, treat unterminated string literals like normal string
  // literals.
  if (Tok.is(tok::unknown)) {
    if (Tok.TokenText.startswith("\"")) {
      Tok.Tok.setKind(tok::string_literal);
      Tok.IsUnterminatedLiteral = true;
    } else if (IsJavaScript && Tok.TokenText == "''") {
      Tok.Tok.setKind(tok::string_literal);
    }
  }

  // In these languages character constants are handled as string literals.
  const bool CharConstantIsString =
      IsJavaScript || Style.Language == FormatStyle::LK_Proto ||
      Style.Language == FormatStyle::LK_TextProto;
  if (CharConstantIsString && Tok.is(tok::char_constant))
    Tok.Tok.setKind(tok::string_literal);

  const bool IsComment = Tok.is(tok::comment);

  // The "on" comment is checked before Finalized is set and the "off" comment
  // afterwards, so neither switch comment is itself marked as finalized
  // because of the state change it causes.
  if (IsComment && (Tok.TokenText == "// clang-format on" ||
                    Tok.TokenText == "/* clang-format on */"))
    FormattingDisabled = false;

  Tok.Finalized = FormattingDisabled;

  if (IsComment && (Tok.TokenText == "// clang-format off" ||
                    Tok.TokenText == "/* clang-format off */"))
    FormattingDisabled = true;
}
1080
1081
639
void FormatTokenLexer::resetLexer(unsigned Offset) {
1082
639
  StringRef Buffer = SourceMgr.getBufferData(ID);
1083
639
  Lex.reset(new Lexer(SourceMgr.getLocForStartOfFile(ID),
1084
639
                      getFormattingLangOpts(Style), Buffer.begin(),
1085
639
                      Buffer.begin() + Offset, Buffer.end()));
1086
639
  Lex->SetKeepWhitespaceMode(true);
1087
639
  TrailingWhitespace = 0;
1088
639
}
1089
1090
} // namespace format
1091
} // namespace clang