Coverage Report

Created: 2018-10-23 15:26

/Users/buildslave/jenkins/workspace/clang-stage2-coverage-R/llvm/tools/lld/ELF/ScriptLexer.cpp
Line
Count
Source (jump to first uncovered line)
1
//===- ScriptLexer.cpp ----------------------------------------------------===//
2
//
3
//                             The LLVM Linker
4
//
5
// This file is distributed under the University of Illinois Open Source
6
// License. See LICENSE.TXT for details.
7
//
8
//===----------------------------------------------------------------------===//
9
//
10
// This file defines a lexer for the linker script.
11
//
12
// The linker script's grammar is not complex but ambiguous due to the
13
// lack of the formal specification of the language. What we are trying to
14
// do in this and other files in LLD is to make a "reasonable" linker
15
// script processor.
16
//
17
// Among simplicity, compatibility and efficiency, we put the most
18
// emphasis on simplicity when we wrote this lexer. Compatibility with the
19
// GNU linkers is important, but we did not try to clone every tiny corner
20
// case of their lexers, as even ld.bfd and ld.gold are subtly different
21
// in various corner cases. We do not care much about efficiency because
22
// the time spent in parsing linker scripts is usually negligible.
23
//
24
// Our grammar of the linker script is LL(2), meaning that it needs at
25
// most two-token lookahead to parse. The only place we need two-token
26
// lookahead is labels in version scripts, where we need to parse "local :"
27
// as if "local:".
28
//
29
// Overall, this lexer works fine for most linker scripts. There might
30
// be room for improving compatibility, but that's probably not at the
31
// top of our todo list.
32
//
33
//===----------------------------------------------------------------------===//
34
35
#include "ScriptLexer.h"
36
#include "lld/Common/ErrorHandler.h"
37
#include "llvm/ADT/Twine.h"
38
39
using namespace llvm;
40
using namespace lld;
41
using namespace lld::elf;
42
43
// Returns a whole line containing the current token.
44
130
StringRef ScriptLexer::getLine() {
45
130
  StringRef S = getCurrentMB().getBuffer();
46
130
  StringRef Tok = Tokens[Pos - 1];
47
130
48
130
  size_t Pos = S.rfind('\n', Tok.data() - S.data());
49
130
  if (Pos != StringRef::npos)
50
24
    S = S.substr(Pos + 1);
51
130
  return S.substr(0, S.find_first_of("\r\n"));
52
130
}
53
54
// Returns 1-based line number of the current token.
55
3.23k
size_t ScriptLexer::getLineNumber() {
56
3.23k
  StringRef S = getCurrentMB().getBuffer();
57
3.23k
  StringRef Tok = Tokens[Pos - 1];
58
3.23k
  return S.substr(0, Tok.data() - S.data()).count('\n') + 1;
59
3.23k
}
60
61
// Returns 0-based column number of the current token.
62
65
size_t ScriptLexer::getColumnNumber() {
63
65
  StringRef Tok = Tokens[Pos - 1];
64
65
  return Tok.data() - getLine().data();
65
65
}
66
67
3.23k
std::string ScriptLexer::getCurrentLocation() {
68
3.23k
  std::string Filename = getCurrentMB().getBufferIdentifier();
69
3.23k
  return (Filename + ":" + Twine(getLineNumber())).str();
70
3.23k
}
71
72
759
// Constructs a lexer and immediately tokenizes the given buffer.
ScriptLexer::ScriptLexer(MemoryBufferRef MB) {
  tokenize(MB);
}
73
74
// We don't want to record cascading errors. Keep only the first one.
75
86
void ScriptLexer::setError(const Twine &Msg) {
76
86
  if (errorCount())
77
21
    return;
78
65
79
65
  std::string S = (getCurrentLocation() + ": " + Msg).str();
80
65
  if (Pos)
81
65
    S += "\n>>> " + getLine().str() + "\n>>> " +
82
65
         std::string(getColumnNumber(), ' ') + "^";
83
65
  error(S);
84
65
}
85
86
// Split S into linker script tokens.
87
777
void ScriptLexer::tokenize(MemoryBufferRef MB) {
88
777
  std::vector<StringRef> Vec;
89
777
  MBs.push_back(MB);
90
777
  StringRef S = MB.getBuffer();
91
777
  StringRef Begin = S;
92
777
93
15.1k
  for (;;) {
94
15.1k
    S = skipSpace(S);
95
15.1k
    if (S.empty())
96
775
      break;
97
14.3k
98
14.3k
    // Quoted token. Note that double-quote characters are parts of a token
99
14.3k
    // because, in a glob match context, only unquoted tokens are interpreted
100
14.3k
    // as glob patterns. Double-quoted tokens are literal patterns in that
101
14.3k
    // context.
102
14.3k
    if (S.startswith("\"")) {
103
118
      size_t E = S.find("\"", 1);
104
118
      if (E == StringRef::npos) {
105
2
        StringRef Filename = MB.getBufferIdentifier();
106
2
        size_t Lineno = Begin.substr(0, S.data() - Begin.data()).count('\n');
107
2
        error(Filename + ":" + Twine(Lineno + 1) + ": unclosed quote");
108
2
        return;
109
2
      }
110
116
111
116
      Vec.push_back(S.take_front(E + 1));
112
116
      S = S.substr(E + 1);
113
116
      continue;
114
116
    }
115
14.2k
116
14.2k
    // ">foo" is parsed to ">" and "foo", but ">>" is parsed to ">>".
117
14.2k
    // "|", "||", "&" and "&&" are different operators.
118
14.2k
    if (S.startswith("<<") || 
S.startswith("<=")14.2k
||
S.startswith(">>")14.2k
||
119
14.2k
        
S.startswith(">=")14.2k
||
S.startswith("||")14.2k
||
S.startswith("&&")14.2k
) {
120
18
      Vec.push_back(S.substr(0, 2));
121
18
      S = S.substr(2);
122
18
      continue;
123
18
    }
124
14.1k
125
14.1k
    // Unquoted token. This is more relaxed than tokens in C-like language,
126
14.1k
    // so that you can write "file-name.cpp" as one bare token, for example.
127
14.1k
    size_t Pos = S.find_first_not_of(
128
14.1k
        "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"
129
14.1k
        "0123456789_.$/\\~=+[]*?-!^:");
130
14.1k
131
14.1k
    // A character that cannot start a word (which is usually a
132
14.1k
    // punctuation) forms a single character token.
133
14.1k
    if (Pos == 0)
134
6.64k
      Pos = 1;
135
14.1k
    Vec.push_back(S.substr(0, Pos));
136
14.1k
    S = S.substr(Pos);
137
14.1k
  }
138
777
139
777
  Tokens.insert(Tokens.begin() + Pos, Vec.begin(), Vec.end());
140
775
}
141
142
// Skip leading whitespace characters or comments.
143
15.1k
StringRef ScriptLexer::skipSpace(StringRef S) {
144
26.0k
  for (;;) {
145
26.0k
    if (S.startswith("/*")) {
146
5
      size_t E = S.find("*/", 2);
147
5
      if (E == StringRef::npos) {
148
2
        error("unclosed comment in a linker script");
149
2
        return "";
150
2
      }
151
3
      S = S.substr(E + 2);
152
3
      continue;
153
3
    }
154
26.0k
    if (S.startswith("#")) {
155
1.45k
      size_t E = S.find('\n', 1);
156
1.45k
      if (E == StringRef::npos)
157
3
        E = S.size() - 1;
158
1.45k
      S = S.substr(E + 1);
159
1.45k
      continue;
160
1.45k
    }
161
24.5k
    size_t Size = S.size();
162
24.5k
    S = S.ltrim();
163
24.5k
    if (S.size() == Size)
164
15.1k
      return S;
165
24.5k
  }
166
15.1k
}
167
168
// An erroneous token is handled as if it were the last token before EOF.
169
61.7k
bool ScriptLexer::atEOF() { return errorCount() || 
Tokens.size() == Pos61.5k
; }
170
171
// Split a given string as an expression.
172
// This function returns "3", "*" and "5" for "3*5" for example.
173
8.65k
static std::vector<StringRef> tokenizeExpr(StringRef S) {
174
8.65k
  StringRef Ops = "+-*/:!~"; // List of operators
175
8.65k
176
8.65k
  // Quoted strings are literal strings, so we don't want to split it.
177
8.65k
  if (S.startswith("\""))
178
5
    return {S};
179
8.65k
180
8.65k
  // Split S with operators as separators.
181
8.65k
  std::vector<StringRef> Ret;
182
9.24k
  while (!S.empty()) {
183
8.66k
    size_t E = S.find_first_of(Ops);
184
8.66k
185
8.66k
    // No need to split if there is no operator.
186
8.66k
    if (E == StringRef::npos) {
187
8.07k
      Ret.push_back(S);
188
8.07k
      break;
189
8.07k
    }
190
593
191
593
    // Get a token before the opreator.
192
593
    if (E != 0)
193
8
      Ret.push_back(S.substr(0, E));
194
593
195
593
    // Get the operator as a token. Keep != as one token.
196
593
    if (S.substr(E).startswith("!=")) {
197
7
      Ret.push_back(S.substr(E, 2));
198
7
      S = S.substr(E + 2);
199
586
    } else {
200
586
      Ret.push_back(S.substr(E, 1));
201
586
      S = S.substr(E + 1);
202
586
    }
203
593
  }
204
8.65k
  return Ret;
205
8.65k
}
206
207
// In contexts where expressions are expected, the lexer should apply
208
// different tokenization rules than the default one. By default,
209
// arithmetic operator characters are regular characters, but in the
210
// expression context, they should be independent tokens.
211
//
212
// For example, "foo*3" should be tokenized to "foo", "*" and "3" only
213
// in the expression context.
214
//
215
// This function may split the current token into multiple tokens.
216
49.7k
void ScriptLexer::maybeSplitExpr() {
217
49.7k
  if (!InExpr || 
errorCount()8.67k
||
atEOF()8.66k
)
218
41.0k
    return;
219
8.65k
220
8.65k
  std::vector<StringRef> V = tokenizeExpr(Tokens[Pos]);
221
8.65k
  if (V.size() == 1)
222
8.64k
    return;
223
12
  Tokens.erase(Tokens.begin() + Pos);
224
12
  Tokens.insert(Tokens.begin() + Pos, V.begin(), V.end());
225
12
}
226
227
49.7k
// Returns the next token and advances past it. Returns an empty string if an
// error has already been reported, or if the input is exhausted (in which
// case "unexpected EOF" is reported).
StringRef ScriptLexer::next() {
  maybeSplitExpr();

  if (errorCount() != 0)
    return "";

  if (!atEOF()) {
    StringRef Tok = Tokens[Pos];
    ++Pos;
    return Tok;
  }

  setError("unexpected EOF");
  return "";
}
238
239
35.5k
// Returns the next token without consuming it. Implemented as next()
// followed by stepping Pos back; returns an empty string if an error is
// pending after the read.
StringRef ScriptLexer::peek() {
  StringRef Ahead = next();
  if (errorCount() == 0) {
    --Pos;
    return Ahead;
  }
  return "";
}
246
247
94
// Returns the token after the next one without consuming either. Two tokens
// are read and Pos is then moved back by two; returns an empty string if an
// error is pending after the reads.
StringRef ScriptLexer::peek2() {
  skip();
  StringRef Second = next();
  if (errorCount() == 0) {
    Pos -= 2;
    return Second;
  }
  return "";
}
255
256
22.3k
// If the next token equals Tok, consumes it and returns true. Otherwise
// leaves the position untouched and returns false.
bool ScriptLexer::consume(StringRef Tok) {
  if (peek() != Tok)
    return false;
  skip();
  return true;
}
263
264
// Consumes Tok followed by ":". Space is allowed between Tok and ":".
265
555
bool ScriptLexer::consumeLabel(StringRef Tok) {
266
555
  if (consume((Tok + ":").str()))
267
129
    return true;
268
426
  if (Tokens.size() >= Pos + 2 && 
Tokens[Pos] == Tok424
&&
269
426
      
Tokens[Pos + 1] == ":"6
) {
270
5
    Pos += 2;
271
5
    return true;
272
5
  }
273
421
  return false;
274
421
}
275
276
3.10k
// Consumes the next token and discards it.
void ScriptLexer::skip() {
  (void)next();
}
277
278
5.02k
// Consumes the next token and reports an error if it is not Expect.
// Does nothing if an error has already been reported.
void ScriptLexer::expect(StringRef Expect) {
  if (errorCount() != 0)
    return;

  StringRef Got = next();
  if (Got == Expect)
    return;
  setError(Expect + " expected, but got " + Got);
}
285
286
// Returns true if S encloses T.
287
6.67k
static bool encloses(StringRef S, StringRef T) {
288
6.67k
  return S.bytes_begin() <= T.bytes_begin() && 
T.bytes_end() <= S.bytes_end()6.66k
;
289
6.67k
}
290
291
6.60k
// Returns the input buffer that contains the current token, i.e. the token
// at Pos - 1.
//
// If no token has been consumed yet (Pos == 0) there is no current token to
// look up. In that case we return the most recently added buffer rather than
// indexing Tokens[-1], so that an error reported before the first token
// (e.g. on an empty script) still gets a location.
MemoryBufferRef ScriptLexer::getCurrentMB() {
  assert(!MBs.empty());
  if (Pos == 0)
    return MBs.back();

  // Tokens are views into their source buffers, so the owning buffer is the
  // one whose byte range encloses the token.
  for (MemoryBufferRef MB : MBs)
    if (encloses(MB.getBuffer(), Tokens[Pos - 1]))
      return MB;
  llvm_unreachable("getCurrentMB: failed to find a token");
}