Coverage Report

Created: 2017-09-19 22:28

/Users/buildslave/jenkins/sharedspace/clang-stage2-coverage-R@2/llvm/tools/lld/ELF/ScriptLexer.cpp
Line
Count
Source (jump to first uncovered line)
1
//===- ScriptLexer.cpp ----------------------------------------------------===//
2
//
3
//                             The LLVM Linker
4
//
5
// This file is distributed under the University of Illinois Open Source
6
// License. See LICENSE.TXT for details.
7
//
8
//===----------------------------------------------------------------------===//
9
//
10
// This file defines a lexer for the linker script.
11
//
12
// The linker script's grammar is not complex but ambiguous due to the
13
// lack of a formal specification of the language. What we are trying to
14
// do in this and other files in LLD is to make a "reasonable" linker
15
// script processor.
16
//
17
// Among simplicity, compatibility and efficiency, we put the most
18
// emphasis on simplicity when we wrote this lexer. Compatibility with the
19
// GNU linkers is important, but we did not try to clone every tiny corner
20
// case of their lexers, as even ld.bfd and ld.gold are subtly different
21
// in various corner cases. We do not care much about efficiency because
22
// the time spent in parsing linker scripts is usually negligible.
23
//
24
// Our grammar of the linker script is LL(2), meaning that it needs at
25
// most two-token lookahead to parse. The only place we need two-token
26
// lookahead is labels in version scripts, where we need to parse "local :"
27
// as if "local:".
28
//
29
// Overall, this lexer works fine for most linker scripts. There might
30
// be room for improving compatibility, but that's probably not at the
31
// top of our todo list.
32
//
33
//===----------------------------------------------------------------------===//
34
35
#include "ScriptLexer.h"
36
#include "Error.h"
37
#include "llvm/ADT/Twine.h"
38
39
using namespace llvm;
40
using namespace lld;
41
using namespace lld::elf;
42
43
// Returns the whole source line containing the current token, i.e. the
// token most recently consumed (Tokens[Pos - 1]). The returned StringRef
// points into the input buffer and excludes the trailing "\r" / "\n".
// Reads Tokens[Pos - 1], so it must only be called when Pos is nonzero.
44
100
StringRef ScriptLexer::getLine() {
45
100
  StringRef S = getCurrentMB().getBuffer();
46
100
  StringRef Tok = Tokens[Pos - 1];
47
100
48
100
  // Note: from here on the local Pos (a byte offset into S) shadows the
  // lexer's token index of the same name. It is the offset of the last
  // newline at or before the token, or npos if the token is on line one.
  size_t Pos = S.rfind('\n', Tok.data() - S.data());
49
100
  if (Pos != StringRef::npos)
50
20
    S = S.substr(Pos + 1);
51
100
  return S.substr(0, S.find_first_of("\r\n"));
52
100
}
53
54
// Returns the 1-based line number of the current token (Tokens[Pos - 1]),
// computed by counting the '\n' characters that precede the token in the
// buffer that contains it. Must only be called when Pos is nonzero.
55
1.52k
size_t ScriptLexer::getLineNumber() {
56
1.52k
  StringRef S = getCurrentMB().getBuffer();
57
1.52k
  StringRef Tok = Tokens[Pos - 1];
58
1.52k
  return S.substr(0, Tok.data() - S.data()).count('\n') + 1;
59
1.52k
}
60
61
// Returns the 0-based column number of the current token (Tokens[Pos - 1]):
// the byte distance from the start of the token's line, as returned by
// getLine(), to the start of the token. Must only be called when Pos is
// nonzero.
62
50
size_t ScriptLexer::getColumnNumber() {
63
50
  StringRef Tok = Tokens[Pos - 1];
64
50
  return Tok.data() - getLine().data();
65
50
}
66
67
1.52k
// Returns a human-readable location for diagnostics: "<buffer>:<line>" for
// the current token, or just "<buffer>" when no token has been consumed yet
// (Pos == 0) and therefore no line number is available.
std::string ScriptLexer::getCurrentLocation() {
68
1.52k
  std::string Filename = getCurrentMB().getBufferIdentifier();
69
1.52k
  if (!Pos)
70
0
    return Filename;
71
1.52k
  return (Filename + ":" + Twine(getLineNumber())).str();
72
1.52k
}
73
74
480
// Constructs a lexer by immediately tokenizing the whole of MB.
ScriptLexer::ScriptLexer(MemoryBufferRef MB) { tokenize(MB); }
75
76
// Reports Msg as an error at the current location. We don't want to record
// cascading errors, so only the first one is kept: if ErrorCount is already
// nonzero this is a no-op. When a token has been consumed (Pos != 0) the
// message also shows the offending source line and a '^' under the token.
77
64
void ScriptLexer::setError(const Twine &Msg) {
78
64
  // NOTE(review): ErrorCount is declared elsewhere; presumably error()
  // increments it -- confirm against Error.h.
  if (ErrorCount)
79
14
    return;
80
50
81
50
  std::string S = (getCurrentLocation() + ": " + Msg).str();
82
50
  if (Pos)
83
50
    S += "\n>>> " + getLine().str() + "\n>>> " +
84
50
         std::string(getColumnNumber(), ' ') + "^";
85
64
  error(S);
86
64
}
87
88
// Splits the contents of MB into linker script tokens and inserts them into
// Tokens at the current position Pos. MB is also appended to MBs.
//
// If a double-quoted token is never closed, an error is reported and the
// function returns without inserting any of the tokens collected so far.
// If skipSpace() hits an unclosed comment it returns an empty string, which
// ends the loop just like end of input does.
89
494
void ScriptLexer::tokenize(MemoryBufferRef MB) {
90
494
  std::vector<StringRef> Vec;
91
494
  MBs.push_back(MB);
92
494
  StringRef S = MB.getBuffer();
93
494
  StringRef Begin = S;
94
494
95
9.77k
  for (;;) {
96
9.77k
    S = skipSpace(S);
97
9.77k
    if (S.empty())
98
490
      break;
99
9.28k
100
9.28k
    // Quoted token. Note that double-quote characters are parts of a token
101
9.28k
    // because, in a glob match context, only unquoted tokens are interpreted
102
9.28k
    // as glob patterns. Double-quoted tokens are literal patterns in that
103
9.28k
    // context.
104
9.28k
    
if (9.28k
S.startswith("\"")9.28k
) {
105
100
      size_t E = S.find("\"", 1);
106
100
      if (
E == StringRef::npos100
) {
107
4
        StringRef Filename = MB.getBufferIdentifier();
108
4
        // Count newlines before the opening quote to get its 0-based line.
        size_t Lineno = Begin.substr(0, S.data() - Begin.data()).count('\n');
109
4
        error(Filename + ":" + Twine(Lineno + 1) + ": unclosed quote");
110
4
        return;
111
4
      }
112
96
113
96
      Vec.push_back(S.take_front(E + 1));
114
96
      S = S.substr(E + 1);
115
96
      continue;
116
96
    }
117
9.18k
118
9.18k
    // Unquoted token. This is more relaxed than tokens in C-like language,
119
9.18k
    // so that you can write "file-name.cpp" as one bare token, for example.
120
9.18k
    // Note: this local Pos (length of the token) shadows the lexer's token
    // index of the same name for the rest of the loop body.
    size_t Pos = S.find_first_not_of(
121
9.18k
        "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"
122
9.18k
        "0123456789_.$/\\~=+[]*?-!<>^:");
123
9.18k
124
9.18k
    // A character that cannot start a word (which is usually a
125
9.18k
    // punctuation) forms a single character token.
126
9.18k
    if (Pos == 0)
127
4.36k
      Pos = 1;
128
9.77k
    Vec.push_back(S.substr(0, Pos));
129
9.77k
    S = S.substr(Pos);
130
9.77k
  }
131
494
132
490
  // Here Pos is the lexer's token index again (the local one is out of scope).
  Tokens.insert(Tokens.begin() + Pos, Vec.begin(), Vec.end());
133
490
}
134
135
// Skips leading whitespace characters and comments, and returns the rest of
// S. Two comment forms are recognized: C-style block comments and '#'
// comments that run to the end of the line. The loop repeats until a pass
// removes nothing. An unterminated block comment is reported as an error
// and an empty string is returned.
136
9.77k
StringRef ScriptLexer::skipSpace(StringRef S) {
137
15.7k
  for (;;) {
138
15.7k
    if (
S.startswith("/*")15.7k
) {
139
9
      size_t E = S.find("*/", 2);
140
9
      if (
E == StringRef::npos9
) {
141
1
        error("unclosed comment in a linker script");
142
1
        return "";
143
1
      }
144
8
      S = S.substr(E + 2);
145
8
      continue;
146
8
    }
147
15.7k
    
if (15.7k
S.startswith("#")15.7k
) {
148
3
      size_t E = S.find('\n', 1);
149
3
      if (E == StringRef::npos)
150
1
        // No newline: the comment runs to the end of the input, so make
        // E + 1 equal to S.size() and leave S empty below.
        E = S.size() - 1;
151
3
      S = S.substr(E + 1);
152
3
      continue;
153
3
    }
154
15.7k
    size_t Size = S.size();
155
15.7k
    S = S.ltrim();
156
15.7k
    // Nothing was trimmed and no comment matched, so S starts at a token
    // (or is empty).
    if (S.size() == Size)
157
9.77k
      return S;
158
0
  }
159
9.77k
}
160
161
// Returns true if there are no more tokens to read (Pos == Tokens.size()).
// An erroneous token is handled as if it were the last token before EOF, so
// this also returns true whenever ErrorCount is nonzero.
162
38.6k
bool ScriptLexer::atEOF() 
{ return ErrorCount || 38.6k
Tokens.size() == Pos38.5k
; }
163
164
// Split a given string as an expression. Every operator character listed in
// Ops becomes its own token, except that "!=" is kept as a single token.
165
// This function returns "3", "*" and "5" for "3*5" for example. A string
// that starts with a double quote is returned unsplit, and an empty string
// yields an empty vector.
166
5.32k
static std::vector<StringRef> tokenizeExpr(StringRef S) {
167
5.32k
  StringRef Ops = "+-*/:!"; // List of operators
168
5.32k
169
5.32k
  // Quoted strings are literal strings, so we don't want to split them.
170
5.32k
  if (S.startswith("\""))
171
5
    return {S};
172
5.32k
173
5.32k
  // Split S at each operator character listed in Ops.
174
5.32k
  std::vector<StringRef> Ret;
175
5.72k
  while (
!S.empty()5.72k
) {
176
5.32k
    size_t E = S.find_first_of(Ops);
177
5.32k
178
5.32k
    // No need to split if there is no operator.
179
5.32k
    if (
E == StringRef::npos5.32k
) {
180
4.92k
      Ret.push_back(S);
181
4.92k
      break;
182
4.92k
    }
183
404
184
404
    // Get a token before the operator, if there is any text before it.
185
404
    
if (404
E != 0404
)
186
5
      Ret.push_back(S.substr(0, E));
187
404
188
404
    // Get the operator as a token. Keep != as one token.
189
404
    if (
S.substr(E).startswith("!=")404
) {
190
7
      Ret.push_back(S.substr(E, 2));
191
7
      S = S.substr(E + 2);
192
404
    } else {
193
397
      Ret.push_back(S.substr(E, 1));
194
397
      S = S.substr(E + 1);
195
397
    }
196
5.32k
  }
197
5.32k
  return Ret;
198
5.32k
}
199
200
// In contexts where expressions are expected, the lexer should apply
201
// different tokenization rules than the default one. By default,
202
// arithmetic operator characters are regular characters, but in the
203
// expression context, they should be independent tokens.
204
//
205
// For example, "foo*3" should be tokenized to "foo", "*" and "3" only
206
// in the expression context.
207
//
208
// This function may split the current token into multiple tokens: when
// InExpr is set, no error has been recorded and we are not at EOF, the
// token at Tokens[Pos] is replaced in place by the pieces returned by
// tokenizeExpr(). Otherwise, or if the token does not split into more than
// one piece, Tokens is left untouched.
209
31.5k
void ScriptLexer::maybeSplitExpr() {
210
31.5k
  if (
!InExpr || 31.5k
ErrorCount5.34k
||
atEOF()5.32k
)
211
26.2k
    return;
212
5.32k
213
5.32k
  std::vector<StringRef> V = tokenizeExpr(Tokens[Pos]);
214
5.32k
  if (V.size() == 1)
215
5.31k
    return;
216
8
  Tokens.erase(Tokens.begin() + Pos);
217
8
  Tokens.insert(Tokens.begin() + Pos, V.begin(), V.end());
218
8
}
219
220
31.5k
// Consumes and returns the next token, advancing Pos by one. The token is
// first given a chance to be split by maybeSplitExpr(). Returns an empty
// string if an error has already been recorded, or if there are no more
// tokens, in which case "unexpected EOF" is reported.
StringRef ScriptLexer::next() {
221
31.5k
  maybeSplitExpr();
222
31.5k
223
31.5k
  if (ErrorCount)
224
116
    return "";
225
31.4k
  
if (31.4k
atEOF()31.4k
) {
226
7
    setError("unexpected EOF");
227
7
    return "";
228
7
  }
229
31.4k
  return Tokens[Pos++];
230
31.4k
}
231
232
22.5k
// Returns the next token without consuming it. Implemented as next()
// followed by stepping Pos back by one. If an error is recorded (before or
// during the call to next()), returns an empty string and leaves Pos as
// next() left it.
StringRef ScriptLexer::peek() {
233
22.5k
  StringRef Tok = next();
234
22.5k
  if (ErrorCount)
235
112
    return "";
236
22.4k
  Pos = Pos - 1;
237
22.4k
  return Tok;
238
22.4k
}
239
240
13.8k
// If the next token equals Tok, consumes it and returns true. Otherwise
// leaves the token in place and returns false.
bool ScriptLexer::consume(StringRef Tok) {
241
13.8k
  if (
peek() == Tok13.8k
) {
242
1.77k
    skip();
243
1.77k
    return true;
244
1.77k
  }
245
12.0k
  return false;
246
12.0k
}
247
248
// Consumes Tok followed by ":". Space is allowed between Tok and ":".
// First tries to consume the single token "Tok:". If that fails, it looks
// directly at Tokens[Pos] and Tokens[Pos + 1] for Tok and ":" as two
// separate tokens and, if both match, advances Pos past them. Returns true
// if the label was consumed in either form.
249
430
bool ScriptLexer::consumeLabel(StringRef Tok) {
250
430
  if (consume((Tok + ":").str()))
251
100
    return true;
252
330
  
if (330
Tokens.size() >= Pos + 2 && 330
Tokens[Pos] == Tok328
&&
253
330
      
Tokens[Pos + 1] == ":"4
) {
254
3
    Pos += 2;
255
3
    return true;
256
3
  }
257
327
  return false;
258
327
}
259
260
1.90k
// Consumes the next token and discards it.
void ScriptLexer::skip() { (void)next(); }
261
262
3.28k
// Consumes the next token and reports an error if it is not Expect.
// Does nothing if an error has already been recorded.
void ScriptLexer::expect(StringRef Expect) {
263
3.28k
  if (ErrorCount)
264
39
    return;
265
3.24k
  StringRef Tok = next();
266
3.24k
  if (Tok != Expect)
267
13
    setError(Expect + " expected, but got " + Tok);
268
3.28k
}
269
270
// Returns true if S encloses T, i.e. the byte range referenced by T lies
// entirely within the byte range referenced by S. This compares addresses,
// not string contents.
271
3.18k
static bool encloses(StringRef S, StringRef T) {
272
3.18k
  return S.bytes_begin() <= T.bytes_begin() && T.bytes_end() <= S.bytes_end();
273
3.18k
}
274
275
3.14k
// Returns the input buffer that the current token (Tokens[Pos - 1]) points
// into. If no token has been consumed yet (Pos == 0), returns the first
// buffer in MBs. MBs must not be empty.
MemoryBufferRef ScriptLexer::getCurrentMB() {
276
3.14k
  // Find input buffer containing the current token.
277
3.14k
  assert(!MBs.empty());
278
3.14k
  if (!Pos)
279
0
    return MBs[0];
280
3.14k
281
3.14k
  for (MemoryBufferRef MB : MBs)
282
3.18k
    
if (3.18k
encloses(MB.getBuffer(), Tokens[Pos - 1])3.18k
)
283
3.14k
      return MB;
284
0
  
llvm_unreachable0
("getCurrentMB: failed to find a token");
285
0
}