@@ -36,24 +36,24 @@ using namespace llvm;
3636using namespace lld ;
3737using namespace lld ::elf;
3838
// Constructs a lexer over `mb`: the buffer initializes curBuf (the buffer
// being lexed) and is stored as the single initial element of mbs.
// No tokenization happens here; the constructor removed further down in this
// change called tokenize(mb) instead.
39+ ScriptLexer::ScriptLexer (MemoryBufferRef mb) : curBuf(mb), mbs(1 , mb) {}
40+
3941// Returns a whole line containing the current token.
// After this change the token in question is prevTok (assigned by next());
// the removed lines looked it up as tokens[pos - 1].
// NOTE(review): the pointer arithmetic below assumes prevTok points into the
// buffer returned by getCurrentMB(), which now matches on curBuf.s rather
// than on prevTok -- confirm the two always refer to the same buffer.
4042StringRef ScriptLexer::getLine () {
4143 StringRef s = getCurrentMB ().getBuffer ();
42- StringRef tok = tokens[pos - 1 ];
4344
 // Search backwards from the token's offset within the buffer for the
 // newline that terminates the preceding line.
44- size_t pos = s.rfind (' \n ' , tok .data () - s.data ());
45+ size_t pos = s.rfind (' \n ' , prevTok .data () - s.data ());
 // If one was found, drop everything up to and including it; otherwise the
 // token is on the first line and s already starts at the line's beginning.
4546 if (pos != StringRef::npos)
4647 s = s.substr (pos + 1 );
 // Cut at the first CR or LF so that only this one line is returned.
4748 return s.substr (0 , s.find_first_of (" \r\n " ));
4849}
4950
5051// Returns 1-based line number of the current token.
5152size_t ScriptLexer::getLineNumber () {
52- if (pos == 0 )
53+ if (prevTok. empty () )
5354 return 1 ;
5455 StringRef s = getCurrentMB ().getBuffer ();
55- StringRef tok = tokens[pos - 1 ];
56- const size_t tokOffset = tok.data () - s.data ();
56+ const size_t tokOffset = prevTok.data () - s.data ();
5757
5858 // For the first token, or when going backwards, start from the beginning of
5959 // the buffer. If this token is after the previous token, start from the
@@ -76,40 +76,41 @@ size_t ScriptLexer::getLineNumber() {
7676
7777// Returns 0-based column number of the current token.
// Computed as the distance between the start of the token and the start of
// the line returned by getLine(). After this change the token is prevTok;
// the removed lines used tokens[pos - 1].
7878size_t ScriptLexer::getColumnNumber () {
79- StringRef tok = tokens[pos - 1 ];
80- return tok.data () - getLine ().data ();
79+ return prevTok.data () - getLine ().data ();
8180}
8281
// Returns the location string used as the prefix of diagnostics (see
// setError()): the identifier of the buffer returned by getCurrentMB(),
// a colon, and the line number from getLineNumber().
8382std::string ScriptLexer::getCurrentLocation () {
8483 std::string filename = std::string (getCurrentMB ().getBufferIdentifier ());
 // Twine performs the integer-to-text conversion; .str() materializes the
 // concatenation as a std::string.
8584 return (filename + " :" + Twine (getLineNumber ())).str ();
8685}
8786
88- ScriptLexer::ScriptLexer (MemoryBufferRef mb) { tokenize (mb); }
89-
9087// We don't want to record cascading errors. Keep only the first one.
// Reports `msg` through error(), prefixed with getCurrentLocation(). Does
// nothing if errorCount() is already non-zero.
9188void ScriptLexer::setError (const Twine &msg) {
9289 if (errorCount ())
9390 return ;
9491
9592 std::string s = (getCurrentLocation () + " : " + msg).str ();
 // When a token has been recorded in prevTok (the removed line tested
 // pos instead), also append the source line from getLine() followed by a
 // caret line: getColumnNumber() spaces and then the caret.
96- if (pos )
93+ if (prevTok. size () )
9794 s += " \n >>> " + getLine ().str () + " \n >>> " +
9895 std::string (getColumnNumber (), ' ' ) + " ^" ;
9996 error (s);
10097}
10198
102- // Split S into linker script tokens.
103- void ScriptLexer::tokenize (MemoryBufferRef mb) {
104- std::vector<StringRef> vec;
105- mbs.push_back (mb);
106- StringRef s = mb.getBuffer ();
107- StringRef begin = s;
108-
// Lexes a single token from curBuf into curTok. This replaces the removed
// tokenize(), which split a whole buffer into a vector of tokens and
// inserted them into `tokens`.
//  - Leading whitespace and comments are skipped with skipSpace().
//  - When the current buffer is exhausted, lexing continues with the buffer
//    popped off `buffers`; if `buffers` is empty, `eof` is set and the
//    function returns without assigning curTok.
//  - curTokState records the value of inExpr the token was lexed under.
//  - An unclosed quote reports an error and returns without assigning curTok.
// NOTE(review): the hunk boundary inside the "Quoted token" comment below
// elides a couple of lines that are not visible in this view.
99+ void ScriptLexer::lex () {
109100 for (;;) {
 // s is a reference to curBuf.s, so every assignment to s below also
 // advances the current buffer's read position.
101+ StringRef &s = curBuf.s ;
110102 s = skipSpace (s);
111- if (s.empty ())
112- break ;
103+ if (s.empty ()) {
104+ // If this buffer is from an INCLUDE command, switch to the "return
105+ // value"; otherwise, mark EOF.
106+ if (buffers.empty ()) {
107+ eof = true ;
108+ return ;
109+ }
110+ curBuf = buffers.pop_back_val ();
111+ continue ;
112+ }
 // Remember which mode (expression or not) this token is lexed in; the
 // unquoted-token rules further down depend on inExpr.
113+ curTokState = inExpr;
113114
114115 // Quoted token. Note that double-quote characters are parts of a token
115116 // because, in a glob match context, only unquoted tokens are interpreted
@@ -118,45 +119,53 @@ void ScriptLexer::tokenize(MemoryBufferRef mb) {
118119 if (s.starts_with (" \" " )) {
119120 size_t e = s.find (" \" " , 1 );
120121 if (e == StringRef::npos) {
121- StringRef filename = mb. getBufferIdentifier ();
122- size_t lineno = begin. substr ( 0 , s.data () - begin. data () ).count (' \n ' );
123- error (filename + " :" + Twine (lineno + 1 ) + " : unclosed quote" );
 // Count the newlines between curBuf.begin (presumably the start of the
 // buffer, as `begin` was in the removed lines) and the opening quote;
 // the message reports that count plus one as the line number.
122+ size_t lineno =
123+ StringRef (curBuf. begin , s.data () - curBuf. begin ).count (' \n ' );
124+ error (curBuf. filename + " :" + Twine (lineno + 1 ) + " : unclosed quote" );
124125 return ;
125126 }
126127
 // The token spans both quote characters (e + 1 characters in total).
127- vec. push_back ( s.take_front (e + 1 ) );
128+ curTok = s.take_front (e + 1 );
128129 s = s.substr (e + 1 );
129- continue ;
130+ return ;
130131 }
131132
132133 // Some operators form separate tokens.
133134 if (s.starts_with (" <<=" ) || s.starts_with (" >>=" )) {
134- vec. push_back ( s.substr (0 , 3 ) );
135+ curTok = s.substr (0 , 3 );
135136 s = s.substr (3 );
136- continue ;
137+ return ;
137138 }
 // Two-character operators whose second character is '=' are split off
 // regardless of inExpr. The doubled operators that the removed condition
 // also handled here are now recognized only in the inExpr branch below.
138- if (s.size () > 1 && ((s[1 ] == ' =' && strchr (" */+-<>&^|" , s[0 ])) ||
139- (s[0 ] == s[1 ] && strchr (" <>&|" , s[0 ])))) {
140- vec.push_back (s.substr (0 , 2 ));
139+ if (s.size () > 1 && (s[1 ] == ' =' && strchr (" +-*/!&^|" , s[0 ]))) {
140+ curTok = s.substr (0 , 2 );
141141 s = s.substr (2 );
142- continue ;
142+ return ;
143143 }
144144
145- // Unquoted token. This is more relaxed than tokens in C-like language,
146- // so that you can write "file-name.cpp" as one bare token, for example.
147- size_t pos = s.find_first_not_of (
148- " ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"
149- " 0123456789_.$/\\ ~=+[]*?-!^:" );
145+ // Unquoted token. The non-expression token is more relaxed than tokens in
146+ // C-like languages, so that you can write "file-name.cpp" as one bare
147+ // token.
148+ size_t pos;
 // Expression context: a word consists only of letters, digits, '_', '.'
 // and '$'. If no word character is at the front, a doubled <, >, & or |
 // or one of the listed comparison/shift operators becomes a
 // two-character token.
149+ if (inExpr) {
150+ pos = s.find_first_not_of (
151+ " ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"
152+ " 0123456789_.$" );
153+ if (pos == 0 && s.size () >= 2 &&
154+ ((s[0 ] == s[1 ] && strchr (" <>&|" , s[0 ])) ||
155+ is_contained ({" ==" , " !=" , " <=" , " >=" , " <<" , " >>" }, s.substr (0 , 2 ))))
156+ pos = 2 ;
 // Non-expression context: the wider character set of the removed code is
 // kept, so operator-like characters stay part of a bare word.
157+ } else {
158+ pos = s.find_first_not_of (
159+ " ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"
160+ " 0123456789_.$/\\ ~=+[]*?-!^:" );
161+ }
150162
151- // A character that cannot start a word (which is usually a
152- // punctuation) forms a single character token.
 // Nothing matched at the front of s: emit that single character as the
 // token.
153163 if (pos == 0 )
154164 pos = 1 ;
155- vec. push_back ( s.substr (0 , pos) );
165+ curTok = s.substr (0 , pos);
156166 s = s.substr (pos);
167+ break ;
157168 }
158-
159- tokens.insert (tokens.begin () + pos, vec.begin (), vec.end ());
160169}
161170
162171// Skip leading whitespace characters or comments.
@@ -185,93 +194,30 @@ StringRef ScriptLexer::skipSpace(StringRef s) {
185194 }
186195}
187196
188- // An erroneous token is handled as if it were the last token before EOF.
189- bool ScriptLexer::atEOF () { return errorCount () || tokens.size () == pos; }
190-
191- // Split a given string as an expression.
192- // This function returns "3", "*" and "5" for "3*5" for example.
193- static std::vector<StringRef> tokenizeExpr (StringRef s) {
194- StringRef ops = " !~*/+-<>?^:=" ; // List of operators
195-
196- // Quoted strings are literal strings, so we don't want to split it.
197- if (s.starts_with (" \" " ))
198- return {s};
199-
200- // Split S with operators as separators.
201- std::vector<StringRef> ret;
202- while (!s.empty ()) {
203- size_t e = s.find_first_of (ops);
204-
205- // No need to split if there is no operator.
206- if (e == StringRef::npos) {
207- ret.push_back (s);
208- break ;
209- }
210-
211- // Get a token before the operator.
212- if (e != 0 )
213- ret.push_back (s.substr (0 , e));
214-
215- // Get the operator as a token.
216- // Keep !=, ==, >=, <=, << and >> operators as a single tokens.
217- if (s.substr (e).starts_with (" !=" ) || s.substr (e).starts_with (" ==" ) ||
218- s.substr (e).starts_with (" >=" ) || s.substr (e).starts_with (" <=" ) ||
219- s.substr (e).starts_with (" <<" ) || s.substr (e).starts_with (" >>" )) {
220- ret.push_back (s.substr (e, 2 ));
221- s = s.substr (e + 2 );
222- } else {
223- ret.push_back (s.substr (e, 1 ));
224- s = s.substr (e + 1 );
225- }
226- }
227- return ret;
228- }
229-
230- // In contexts where expressions are expected, the lexer should apply
231- // different tokenization rules than the default one. By default,
232- // arithmetic operator characters are regular characters, but in the
233- // expression context, they should be independent tokens.
234- //
235- // For example, "foo*3" should be tokenized to "foo", "*" and "3" only
236- // in the expression context.
237- //
238- // This function may split the current token into multiple tokens.
239- void ScriptLexer::maybeSplitExpr () {
240- if (!inExpr || errorCount () || atEOF ())
241- return ;
242-
243- std::vector<StringRef> v = tokenizeExpr (tokens[pos]);
244- if (v.size () == 1 )
245- return ;
246- tokens.erase (tokens.begin () + pos);
247- tokens.insert (tokens.begin () + pos, v.begin (), v.end ());
248- }
197+ // Used to determine whether to stop parsing. Treat errors like EOF.
// `eof` is set by lex() once the current buffer is exhausted and no saved
// buffer remains in `buffers`; a non-zero errorCount() also yields true.
198+ bool ScriptLexer::atEOF () { return eof || errorCount (); }
249199
// Consumes and returns the current token.
// The token obtained from peek() is remembered in prevTok, which getLine(),
// getLineNumber(), getColumnNumber() and setError() read when producing
// diagnostics. curTok is then replaced by an empty StringRef anchored at the
// unread remainder (curBuf.s), so that the following peek() calls lex().
// If lex() produced no token (EOF or an unclosed-quote error), the returned
// token is empty. Unlike the removed body, nothing is reported here on EOF.
250200StringRef ScriptLexer::next () {
251- maybeSplitExpr ();
252-
253- if (errorCount ())
254- return " " ;
255- if (atEOF ()) {
256- setError (" unexpected EOF" );
257- return " " ;
258- }
259- return tokens[pos++];
201+ prevTok = peek ();
 // std::exchange returns the old curTok while storing the empty placeholder.
202+ return std::exchange (curTok, StringRef (curBuf.s .data (), 0 ));
260203}
261204
// Returns the current token without consuming it, lexing one on demand.
// A cached curTok is only reused if it was lexed under the current inExpr
// value (lex() records that value in curTokState). The removed body called
// next() and then stepped pos back by one.
262205StringRef ScriptLexer::peek () {
263- StringRef tok = next ();
264- if (errorCount ())
265- return " " ;
266- pos = pos - 1 ;
267- return tok;
206+ // curTok is invalid if curTokState and inExpr mismatch.
207+ if (curTok.size () && curTokState != inExpr) {
 // Rewind: make curBuf.s start at the stale token again while keeping its
 // original end, then drop the token so that it is lexed anew below.
208+ curBuf.s = StringRef (curTok.data (), curBuf.s .end () - curTok.data ());
209+ curTok = {};
210+ }
211+ if (curTok.empty ())
212+ lex ();
213+ return curTok;
268214}
269215
// If the current token equals `tok`, consumes it and returns true; otherwise
// leaves the token in place and returns false.
// The new lines compare against peek() first, so a mismatch needs no undo;
// the removed lines called next() and decremented pos on mismatch.
270216bool ScriptLexer::consume (StringRef tok) {
271- if (next () = = tok)
272- return true ;
273- --pos ;
274- return false ;
217+ if (peek () ! = tok)
218+ return false ;
219+ next () ;
220+ return true ;
275221}
276222
// Consumes the current token and discards it; the (void) cast marks the
// ignored return value of next() as intentional.
277223void ScriptLexer::skip () { (void )next (); }
@@ -280,8 +226,12 @@ void ScriptLexer::expect(StringRef expect) {
280226 if (errorCount ())
281227 return ;
282228 StringRef tok = next ();
283- if (tok != expect)
284- setError (expect + " expected, but got " + tok);
229+ if (tok != expect) {
230+ if (atEOF ())
231+ setError (" unexpected EOF" );
232+ else
233+ setError (expect + " expected, but got " + tok);
234+ }
285235}
286236
287237// Returns true if S encloses T.
@@ -292,10 +242,8 @@ static bool encloses(StringRef s, StringRef t) {
// Returns the element of mbs whose buffer encloses curBuf.s, i.e. the unread
// remainder of the buffer currently being lexed. The removed lines matched
// on the previous token (tokens[pos - 1]) and returned mbs.back() when pos
// was 0; neither special case exists after this change.
// NOTE(review): callers such as getLine() subtract pointers of prevTok and
// this buffer -- confirm prevTok always lies in the buffer selected here.
292242MemoryBufferRef ScriptLexer::getCurrentMB () {
293243 // Find input buffer containing the current token.
294244 assert (!mbs.empty ());
295- if (pos == 0 )
296- return mbs.back ();
297245 for (MemoryBufferRef mb : mbs)
298- if (encloses (mb.getBuffer (), tokens[pos - 1 ] ))
246+ if (encloses (mb.getBuffer (), curBuf. s ))
299247 return mb;
 // Reached only if no buffer in mbs encloses curBuf.s.
300248 llvm_unreachable (" getCurrentMB: failed to find a token" );
301249}
0 commit comments