Coverage Report

Created: 2018-08-19 21:11

/Users/buildslave/jenkins/workspace/clang-stage2-coverage-R/llvm/tools/lld/ELF/ScriptLexer.cpp
Line
Count
Source (jump to first uncovered line)
1
//===- ScriptLexer.cpp ----------------------------------------------------===//
2
//
3
//                             The LLVM Linker
4
//
5
// This file is distributed under the University of Illinois Open Source
6
// License. See LICENSE.TXT for details.
7
//
8
//===----------------------------------------------------------------------===//
9
//
10
// This file defines a lexer for the linker script.
11
//
12
// The linker script's grammar is not complex but ambiguous due to the
13
// lack of a formal specification of the language. What we are trying to
14
// do in this and other files in LLD is to make a "reasonable" linker
15
// script processor.
16
//
17
// Among simplicity, compatibility and efficiency, we put the most
18
// emphasis on simplicity when we wrote this lexer. Compatibility with the
19
// GNU linkers is important, but we did not try to clone every tiny corner
20
// case of their lexers, as even ld.bfd and ld.gold are subtly different
21
// in various corner cases. We do not care much about efficiency because
22
// the time spent in parsing linker scripts is usually negligible.
23
//
24
// Our grammar of the linker script is LL(2), meaning that it needs at
25
// most two-token lookahead to parse. The only place we need two-token
26
// lookahead is labels in version scripts, where we need to parse "local :"
27
// as if "local:".
28
//
29
// Overall, this lexer works fine for most linker scripts. There might
30
// be room for improving compatibility, but that's probably not at the
31
// top of our todo list.
32
//
33
//===----------------------------------------------------------------------===//
34
35
#include "ScriptLexer.h"
36
#include "lld/Common/ErrorHandler.h"
37
#include "llvm/ADT/Twine.h"
38
39
using namespace llvm;
40
using namespace lld;
41
using namespace lld::elf;
42
43
// Returns a whole line containing the current token.
44
126
StringRef ScriptLexer::getLine() {
45
126
  StringRef S = getCurrentMB().getBuffer();
46
126
  StringRef Tok = Tokens[Pos - 1];
47
126
48
126
  size_t Pos = S.rfind('\n', Tok.data() - S.data());
49
126
  if (Pos != StringRef::npos)
50
24
    S = S.substr(Pos + 1);
51
126
  return S.substr(0, S.find_first_of("\r\n"));
52
126
}
53
54
// Returns 1-based line number of the current token.
55
3.08k
size_t ScriptLexer::getLineNumber() {
56
3.08k
  StringRef S = getCurrentMB().getBuffer();
57
3.08k
  StringRef Tok = Tokens[Pos - 1];
58
3.08k
  return S.substr(0, Tok.data() - S.data()).count('\n') + 1;
59
3.08k
}
60
61
// Returns 0-based column number of the current token.
62
63
size_t ScriptLexer::getColumnNumber() {
63
63
  StringRef Tok = Tokens[Pos - 1];
64
63
  return Tok.data() - getLine().data();
65
63
}
66
67
3.08k
std::string ScriptLexer::getCurrentLocation() {
68
3.08k
  std::string Filename = getCurrentMB().getBufferIdentifier();
69
3.08k
  return (Filename + ":" + Twine(getLineNumber())).str();
70
3.08k
}
71
72
703
ScriptLexer::ScriptLexer(MemoryBufferRef MB) { tokenize(MB); }
73
74
// We don't want to record cascading errors. Keep only the first one.
75
84
void ScriptLexer::setError(const Twine &Msg) {
76
84
  if (errorCount())
77
21
    return;
78
63
79
63
  std::string S = (getCurrentLocation() + ": " + Msg).str();
80
63
  if (Pos)
81
63
    S += "\n>>> " + getLine().str() + "\n>>> " +
82
63
         std::string(getColumnNumber(), ' ') + "^";
83
63
  error(S);
84
63
}
85
86
// Split the contents of MB into linker script tokens.
87
716
void ScriptLexer::tokenize(MemoryBufferRef MB) {
88
716
  std::vector<StringRef> Vec;
89
716
  MBs.push_back(MB);
90
716
  StringRef S = MB.getBuffer();
91
716
  StringRef Begin = S;
92
716
93
14.5k
  for (;;) {
94
14.5k
    S = skipSpace(S);
95
14.5k
    if (S.empty())
96
714
      break;
97
13.8k
98
13.8k
    // Quoted token. Note that double-quote characters are parts of a token
99
13.8k
    // because, in a glob match context, only unquoted tokens are interpreted
100
13.8k
    // as glob patterns. Double-quoted tokens are literal patterns in that
101
13.8k
    // context.
102
13.8k
    if (S.startswith("\"")) {
103
113
      size_t E = S.find("\"", 1);
104
113
      if (E == StringRef::npos) {
105
2
        StringRef Filename = MB.getBufferIdentifier();
106
2
        size_t Lineno = Begin.substr(0, S.data() - Begin.data()).count('\n');
107
2
        error(Filename + ":" + Twine(Lineno + 1) + ": unclosed quote");
108
2
        return;
109
2
      }
110
111
111
111
      Vec.push_back(S.take_front(E + 1));
112
111
      S = S.substr(E + 1);
113
111
      continue;
114
111
    }
115
13.7k
116
13.7k
    // ">foo" is parsed to ">" and "foo", but ">>" is parsed to ">>".
117
13.7k
    // "|", "||", "&" and "&&" are different operators.
118
13.7k
    if (S.startswith("<<") || 
S.startswith("<=")13.7k
||
S.startswith(">>")13.7k
||
119
13.7k
        
S.startswith(">=")13.7k
||
S.startswith("||")13.7k
||
S.startswith("&&")13.7k
) {
120
18
      Vec.push_back(S.substr(0, 2));
121
18
      S = S.substr(2);
122
18
      continue;
123
18
    }
124
13.7k
125
13.7k
    // Unquoted token. This is more relaxed than tokens in C-like languages,
126
13.7k
    // so that you can write "file-name.cpp" as one bare token, for example.
127
13.7k
    size_t Pos = S.find_first_not_of(
128
13.7k
        "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"
129
13.7k
        "0123456789_.$/\\~=+[]*?-!^:");
130
13.7k
131
13.7k
    // A character that cannot start a word (which is usually a
132
13.7k
    // punctuation) forms a single character token.
133
13.7k
    if (Pos == 0)
134
6.47k
      Pos = 1;
135
13.7k
    Vec.push_back(S.substr(0, Pos));
136
13.7k
    S = S.substr(Pos);
137
13.7k
  }
138
716
139
716
  Tokens.insert(Tokens.begin() + Pos, Vec.begin(), Vec.end());
140
714
}
141
142
// Skip leading whitespace characters or comments.
143
14.5k
StringRef ScriptLexer::skipSpace(StringRef S) {
144
25.1k
  for (;;) {
145
25.1k
    if (S.startswith("/*")) {
146
5
      size_t E = S.find("*/", 2);
147
5
      if (E == StringRef::npos) {
148
2
        error("unclosed comment in a linker script");
149
2
        return "";
150
2
      }
151
3
      S = S.substr(E + 2);
152
3
      continue;
153
3
    }
154
25.1k
    if (S.startswith("#")) {
155
1.38k
      size_t E = S.find('\n', 1);
156
1.38k
      if (E == StringRef::npos)
157
3
        E = S.size() - 1;
158
1.38k
      S = S.substr(E + 1);
159
1.38k
      continue;
160
1.38k
    }
161
23.7k
    size_t Size = S.size();
162
23.7k
    S = S.ltrim();
163
23.7k
    if (S.size() == Size)
164
14.5k
      return S;
165
23.7k
  }
166
14.5k
}
167
168
// An erroneous token is handled as if it were the last token before EOF.
169
59.4k
bool ScriptLexer::atEOF() { return errorCount() || 
Tokens.size() == Pos59.3k
; }
170
171
// Split a given string as an expression.
172
// This function returns "3", "*" and "5" for "3*5" for example.
173
8.21k
static std::vector<StringRef> tokenizeExpr(StringRef S) {
174
8.21k
  StringRef Ops = "+-*/:!~"; // List of operators
175
8.21k
176
8.21k
  // Quoted strings are literal strings, so we don't want to split them.
177
8.21k
  if (S.startswith("\""))
178
5
    return {S};
179
8.20k
180
8.20k
  // Split S with operators as separators.
181
8.20k
  std::vector<StringRef> Ret;
182
8.79k
  while (!S.empty()) {
183
8.22k
    size_t E = S.find_first_of(Ops);
184
8.22k
185
8.22k
    // No need to split if there is no operator.
186
8.22k
    if (E == StringRef::npos) {
187
7.63k
      Ret.push_back(S);
188
7.63k
      break;
189
7.63k
    }
190
583
191
583
    // Get a token before the operator.
192
583
    if (E != 0)
193
8
      Ret.push_back(S.substr(0, E));
194
583
195
583
    // Get the operator as a token. Keep != as one token.
196
583
    if (S.substr(E).startswith("!=")) {
197
7
      Ret.push_back(S.substr(E, 2));
198
7
      S = S.substr(E + 2);
199
576
    } else {
200
576
      Ret.push_back(S.substr(E, 1));
201
576
      S = S.substr(E + 1);
202
576
    }
203
583
  }
204
8.20k
  return Ret;
205
8.20k
}
206
207
// In contexts where expressions are expected, the lexer should apply
208
// different tokenization rules than the default one. By default,
209
// arithmetic operator characters are regular characters, but in the
210
// expression context, they should be independent tokens.
211
//
212
// For example, "foo*3" should be tokenized to "foo", "*" and "3" only
213
// in the expression context.
214
//
215
// This function may split the current token into multiple tokens.
216
48.1k
void ScriptLexer::maybeSplitExpr() {
217
48.1k
  if (!InExpr || 
errorCount()8.23k
||
atEOF()8.21k
)
218
39.8k
    return;
219
8.21k
220
8.21k
  std::vector<StringRef> V = tokenizeExpr(Tokens[Pos]);
221
8.21k
  if (V.size() == 1)
222
8.20k
    return;
223
12
  Tokens.erase(Tokens.begin() + Pos);
224
12
  Tokens.insert(Tokens.begin() + Pos, V.begin(), V.end());
225
12
}
226
227
48.1k
StringRef ScriptLexer::next() {
228
48.1k
  maybeSplitExpr();
229
48.1k
230
48.1k
  if (errorCount())
231
116
    return "";
232
47.9k
  if (atEOF()) {
233
9
    setError("unexpected EOF");
234
9
    return "";
235
9
  }
236
47.9k
  return Tokens[Pos++];
237
47.9k
}
238
239
34.5k
StringRef ScriptLexer::peek() {
240
34.5k
  StringRef Tok = next();
241
34.5k
  if (errorCount())
242
108
    return "";
243
34.4k
  Pos = Pos - 1;
244
34.4k
  return Tok;
245
34.4k
}
246
247
21.8k
bool ScriptLexer::consume(StringRef Tok) {
248
21.8k
  if (peek() == Tok) {
249
2.74k
    skip();
250
2.74k
    return true;
251
2.74k
  }
252
19.1k
  return false;
253
19.1k
}
254
255
// Consumes Tok followed by ":". Space is allowed between Tok and ":".
256
548
bool ScriptLexer::consumeLabel(StringRef Tok) {
257
548
  if (consume((Tok + ":").str()))
258
127
    return true;
259
421
  if (Tokens.size() >= Pos + 2 && 
Tokens[Pos] == Tok419
&&
260
421
      
Tokens[Pos + 1] == ":"6
) {
261
5
    Pos += 2;
262
5
    return true;
263
5
  }
264
416
  return false;
265
416
}
266
267
2.92k
void ScriptLexer::skip() { (void)next(); }
268
269
4.86k
void ScriptLexer::expect(StringRef Expect) {
270
4.86k
  if (errorCount())
271
45
    return;
272
4.82k
  StringRef Tok = next();
273
4.82k
  if (Tok != Expect)
274
17
    setError(Expect + " expected, but got " + Tok);
275
4.82k
}
276
277
// Returns true if S encloses T.
278
6.35k
static bool encloses(StringRef S, StringRef T) {
279
6.35k
  return S.bytes_begin() <= T.bytes_begin() && 
T.bytes_end() <= S.bytes_end()6.33k
;
280
6.35k
}
281
282
6.29k
MemoryBufferRef ScriptLexer::getCurrentMB() {
283
6.29k
  // Find input buffer containing the current token.
284
6.29k
  assert(!MBs.empty() && Pos > 0);
285
6.29k
  for (MemoryBufferRef MB : MBs)
286
6.35k
    if (encloses(MB.getBuffer(), Tokens[Pos - 1]))
287
6.29k
      return MB;
288
6.29k
  
llvm_unreachable0
("getCurrentMB: failed to find a token");
289
}