% Uncomment for handout
%\def\HANDOUT{}
\ifdefined\HANDOUT
\documentclass[handout]{beamer}
\usepackage{pgfpages}
\pgfpagesuselayout{4 on 1}[letterpaper,landscape,border shrink=5mm]
\else
\documentclass{beamer}
\fi
\mode<presentation>
{
\usetheme{Warsaw}
\definecolor{sered}{rgb}{0.78, 0.06, 0.18}
\definecolor{richblack}{rgb}{0.0, 0.0, 0.0}
\setbeamercolor{structure}{fg=sered,bg=richblack}
%\setbeamercovered{transparent}
}
\usepackage[english]{babel}
\usepackage[latin1]{inputenc}
\usepackage{times}
\usepackage[T1]{fontenc}
\usepackage{tikz}
\usepackage{graphicx}
\usepackage[export]{adjustbox}
\usepackage{fancyvrb}
\usepackage{amsmath}
\usepackage{amssymb}
\usepackage{esvect}
\makeatletter
\newcommand{\imagesource}[1]{{\centering\hfill\break\hbox{\scriptsize Image Source:\thinspace{\tiny\itshape #1}}\par}}
\newcommand{\image}[3][\@nil]{%
\def\tmp{#1}%
\begin{center}
\ifx\tmp\@nnil
\includegraphics[max height = 0.55\textheight, max width = \textwidth]{images/#2}
\else
\includegraphics[max height = 0.50\textheight, max width = \textwidth]{images/#2}
\linebreak
#1
\fi
\linebreak
{\tiny Image Source:\thinspace{\tiny #3}}
\end{center}
}
\newenvironment{code}{%
\VerbatimEnvironment
\begin{adjustbox}{max width=\textwidth, max height=0.7\textheight}
\begin{BVerbatim}
}{
\end{BVerbatim}
\end{adjustbox}
}
\newenvironment{scaled}{%
\begin{adjustbox}{max width=\textwidth, max height=0.7\textheight}
}{
\end{adjustbox}
}
\title{Tokens and Lexemes}
\author{Robert Lowe}
\institute[Southeast Missouri State University]
{
Department of Computer Science\\
Southeast Missouri State University
}
\date[]{}
\subject{}
\pgfdeclareimage[height=1.0cm]{university-logo}{images/semo-logo}
\logo{\pgfuseimage{university-logo}}
\AtBeginSection[]
{
\begin{frame}<beamer>{Outline}
\tableofcontents[currentsection]
\end{frame}
}
\begin{document}
\begin{frame}
\titlepage
\end{frame}
\begin{frame}{Outline}
\tableofcontents
\end{frame}
\section{Tokens and Lexemes}
\begin{frame}{Lexical Elements}
\begin{block}{Tokens vs Lexemes}
A {\bf token} is a generalization, like a category. A {\bf lexeme} is a specific string.
One token may match several lexemes.
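{\em Example:} the single token \texttt{IDENTIFIER} matches the lexemes \texttt{x}, \texttt{sum}, and \texttt{count}.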
\end{block}
\begin{description}
\item[Token] -- A label identifying the category of a lexical element.
\item[Lexeme] -- The actual sequence of characters which makes up the lexical element.
\end{description}
\end{frame}
\begin{frame}{Lexical Element Information}
A lexical analyzer (lexer) provides a stream of tokens to the parser. The parser often
needs additional information to do its work:
\begin{itemize}
\item {\bf Token} -- An easily comparable representation of the element's category.
\item {\bf Lexeme} -- The specific string which was matched to the token.
\item {\bf Value} -- In some cases, such as literal values, the lexer
provides the concrete value of the lexeme.
\item {\bf Line, Column} -- The location of the start of the lexical element in the
source code. Useful for error reporting.
\end{itemize}
\end{frame}
\begin{frame}[fragile]{Example Lexical Elements}
\begin{code}
while( x < 10 )
\end{code}
\vspace{1cm}
\begin{code}
Lexeme(token=<Token.WHILE: 12>, lex='while', value='while', line=7, col=3)
Lexeme(token=<Token.LPAREN: 3>, lex='(', value='(', line=7, col=8)
Lexeme(token=<Token.IDENTIFIER: 23>, lex='x', value='x', line=7, col=10)
Lexeme(token=<Token.LT: 13>, lex='<', value='<', line=7, col=12)
Lexeme(token=<Token.INTNUM: 21>, lex='10', value=10, line=7, col=14)
Lexeme(token=<Token.RPAREN: 4>, lex=')', value=')', line=7, col=17)
\end{code}
\end{frame}
\section{Designing a Lexical Analyzer}
\begin{frame}[fragile]{Representing Tokens}
\begin{columns}
\column{0.6\textwidth}
\begin{itemize}
\item Tokens can simply be distinct integers.
\item Enumerated constants provide meaningful labels to the integers.
\item Python's \texttt{Enum} class from the \texttt{enum} module provides additional capabilities, such as the ability to easily print a token's name.
\item We usually add two additional tokens: \texttt{INVALID} and \texttt{EOF}.
\end{itemize}
\column{0.4\textwidth}
\begin{block}{Toy C's Tokens}
\begin{code}
from enum import Enum, auto

class Token(Enum):
    INVALID    = auto()
    EOF        = auto()
    LPAREN     = auto()
    RPAREN     = auto()
    COMMA      = auto()
    LBRACE     = auto()
    RBRACE     = auto()
    SEMI       = auto()
    REAL       = auto()
    INT        = auto()
    EQUAL      = auto()
    WHILE      = auto()
    LT         = auto()
    PLUS       = auto()
    MINUS      = auto()
    TIMES      = auto()
    DIV        = auto()
    DIGIT      = auto()
    LETTER     = auto()
    DOT        = auto()
    INTNUM     = auto()
    REALNUM    = auto()
    IDENTIFIER = auto()
\end{code}
\end{block}
\end{columns}
\end{frame}
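\begin{frame}[fragile]{Using the Token Enumeration}
Enumerated constants give us cheap comparisons and readable output. A short interactive session (illustrative; any member works the same way):
\begin{code}
>>> Token.WHILE
<Token.WHILE: 12>
>>> Token.WHILE.name
'WHILE'
>>> Token.WHILE == Token.LT
False
\end{code}
\end{frame}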
\begin{frame}[fragile]{Representing Lexical Elements}
\begin{block}{Toy C's Lexical Element}
\begin{code}
from collections import namedtuple

Lexeme = namedtuple("Lexeme", ("token", "lex", "value", "line", "col"))
...
self.cur_tok = Lexeme(Token.IDENTIFIER, s, s, line, col)
\end{code}
\end{block}
\begin{itemize}
\item A lexical element is usually a heterogeneous data structure.
\item Typically we desire these to be {\bf immutable}.
\item Python's \texttt{namedtuple} from the \texttt{collections} module is a good choice.
\end{itemize}
\end{frame}
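\begin{frame}[fragile]{Working with Lexemes}
A sketch of how the parser side sees a lexeme: fields are read by name, and ``modifying'' one really builds a new tuple.
\begin{code}
lx = Lexeme(Token.IDENTIFIER, "x", "x", 7, 10)
print(lx.token, lx.lex, lx.line, lx.col)  # fields read by name
lx2 = lx._replace(col=12)                 # a new, altered copy
# lx.col = 12 would raise AttributeError: the tuple is immutable
\end{code}
\end{frame}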
\begin{frame}[fragile]{The Lexer Object}
\begin{block}{Running the Lexer}
\begin{code}
lexer = Lexer(file)
lexer.next()
while lexer.cur_tok.token != Token.EOF:
    print(lexer.cur_tok)
    lexer.next()
\end{code}
\end{block}
\begin{itemize}
\item Lexers are typically complex objects.
\item A lexer's public interface typically consists of just two things:
\begin{description}
\item[\texttt{next()}] -- A function which advances the lexer to the next lexical element.
\item[\texttt{cur\_tok}] -- The current lexical element.
\end{description}
\end{itemize}
\end{frame}
\begin{frame}{Lexer State}
\begin{itemize}
\item A lexer's internal state typically consists of:
\begin{description}
\item[\texttt{file}] -- The stream being analyzed (typically a source file)
\item[\texttt{line}] -- The line number being analyzed
\item[\texttt{col}] -- The current column number being analyzed
\item[\texttt{cur\_char}] -- The current character being analyzed
\item[\texttt{cur\_tok}] -- The current lexical element
\end{description}
\end{itemize}
\end{frame}
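\begin{frame}[fragile]{Initializing the Lexer State}
A minimal constructor consistent with this list. (The starting values and the priming \texttt{read()} call are assumptions; \texttt{read()} is defined on the next slide.)
\begin{code}
class Lexer:
    def __init__(self, file):
        self.file = file        # stream being analyzed
        self.line = 1           # line number being analyzed
        self.col = 0            # column number being analyzed
        self.cur_char = None    # character under consideration
        self.cur_tok = None     # current lexical element
        self.read()             # prime cur_char with the first character
\end{code}
\end{frame}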
\begin{frame}[fragile]{Consuming Characters}
\begin{columns}
\column{0.6\textwidth}
\begin{itemize}
\item A lexer processes its input character by character.
\item The character under current consideration is stored in \texttt{cur\_char}.
\item Characters are consumed as they are matched/discarded.
\item The lexer keeps track of \texttt{line} and \texttt{col} fields as it reads the file.
\end{itemize}
\column{0.4\textwidth}
\begin{block}{Character Processing}
\begin{code}
def read(self):
    # fetch the next character, tracking the column
    self.cur_char = self.file.read(1)
    if self.cur_char:
        self.col += 1

def consume(self):
    self.read()
    # detect end of line
    if self.cur_char == '\n':
        self.line += 1
        self.col = 0

def skip_space(self):
    # discard whitespace between lexical elements
    while self.cur_char.isspace():
        self.consume()
\end{code}
\end{block}
\end{columns}
\end{frame}
\begin{frame}{Grouping the Lexeme Grammar}
Tokens can be broadly categorized into three groups.
\begin{description}
\item[Group 1] -- Single-character tokens which are not the prefix of any other token.\newline
{\em Example:} \texttt{+, -, *}
\vspace{0.5cm}
\item[Group 2] -- Tokens of fixed length which have prefixes in common.\newline
{\em Example:} \texttt{=, ==, <, <=}
\vspace{0.5cm}
\item[Group 3] -- Tokens which are either of variable length or share a prefix with a token of variable length.
\newline
{\em Example:} \texttt{while, whileX, x, y, 12, 12.5, int, integer}
\end{description}
\end{frame}
\begin{frame}[fragile]{Group 1 - Single Match Algorithm}
\begin{code}
1.) Build a list of single match tokens and the characters they match.
2.) For each token in the list:
2.1) if cur_char matches, cur_tok = token, return True
3.) If no match was made, return False
\end{code}
\end{frame}
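\begin{frame}[fragile]{Group 1 - A Python Sketch}
One possible realization of the single match algorithm. (The character table and the trailing \texttt{consume()} are assumptions about Toy C.)
\begin{code}
SINGLE = {'(': Token.LPAREN, ')': Token.RPAREN, ',': Token.COMMA,
          '{': Token.LBRACE, '}': Token.RBRACE, ';': Token.SEMI,
          '+': Token.PLUS,   '-': Token.MINUS,
          '*': Token.TIMES,  '/': Token.DIV}

def group1(self):
    token = SINGLE.get(self.cur_char)
    if token is None:
        return False            # no single-character match
    self.cur_tok = Lexeme(token, self.cur_char, self.cur_char,
                          self.line, self.col)
    self.consume()              # move past the matched character
    return True
\end{code}
\end{frame}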
\begin{frame}[fragile]{Group 2 - Common Prefix Strings}
\begin{code}
1.) Build a list of common prefix tokens and the matching strings.
2.) s = cur_char
3.) Build a list of possible tokens with s as a prefix.
4.) while len(possible) > 1:
4.1) consume()
4.2) s = s + cur_char
4.3) Remove any elements from possible if they do not have s as a
prefix.
5.) if len(possible) == 0, return False
6.) token = possible[0]
7.) while len(s) < len(token):
7.1) consume()
7.2) s += cur_char
7.3) if s is not the prefix of token, cur_tok = invalid, return True
8.) cur_tok = token, return True
\end{code}
\end{frame}
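\begin{frame}[fragile]{Group 2 - A Python Sketch}
A sketch of common prefix matching. It differs from the pseudocode in one respect: it remembers the longest exact match seen so far, so \texttt{<} still matches when \texttt{<=} falls through. (\texttt{EQEQ} and \texttt{LTEQ} are hypothetical tokens.)
\begin{code}
PREFIXED = {'=': Token.EQUAL, '==': Token.EQEQ,
            '<': Token.LT,    '<=': Token.LTEQ}

def group2(self):
    line, col = self.line, self.col
    s, match = "", None
    # absorb characters while s can still grow into a table entry
    while self.cur_char and any(p.startswith(s + self.cur_char)
                                for p in PREFIXED):
        s += self.cur_char
        self.consume()
        if s in PREFIXED:
            match = s           # longest exact match so far
    if not s:
        return False            # cur_char starts no group-2 token
    if match is None:
        self.cur_tok = Lexeme(Token.INVALID, s, None, line, col)
    else:
        self.cur_tok = Lexeme(PREFIXED[match], match, match, line, col)
    return True
\end{code}
\end{frame}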
\begin{frame}[fragile]{Group 3 - Repeating Patterns}
\begin{code}
1.) Build a list of fixed tokens.
2.) s = ""
3.) while cur_char is consistent with the pattern:
3.1) s = s + cur_char
3.2) consume()
4.) if len(s) == 0, return False
5.) for token in tokens:
5.1) If s matches token: cur_tok = token, return True
6.) cur_tok = matching variable length token.
7.) return True
\end{code}
\end{frame}
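\begin{frame}[fragile]{Group 3 - A Python Sketch}
A sketch of repeating pattern matching for identifiers, keywords, and numbers. (The keyword table and the exact character classes are assumptions about Toy C.)
\begin{code}
KEYWORDS = {'while': Token.WHILE, 'int': Token.INT, 'real': Token.REAL}

def group3(self):
    line, col = self.line, self.col
    s = ""
    if self.cur_char.isalpha():
        # identifier or keyword: a run of letters and digits
        while self.cur_char.isalnum():
            s += self.cur_char
            self.consume()
        token = KEYWORDS.get(s, Token.IDENTIFIER)
        self.cur_tok = Lexeme(token, s, s, line, col)
        return True
    if self.cur_char.isdigit():
        # number: digits with an optional fractional part
        while self.cur_char.isdigit():
            s += self.cur_char
            self.consume()
        if self.cur_char != '.':
            self.cur_tok = Lexeme(Token.INTNUM, s, int(s), line, col)
            return True
        s += self.cur_char
        self.consume()
        while self.cur_char.isdigit():
            s += self.cur_char
            self.consume()
        self.cur_tok = Lexeme(Token.REALNUM, s, float(s), line, col)
        return True
    return False                # cur_char starts no group-3 token
\end{code}
\end{frame}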
\begin{frame}[fragile]{Lexical Processing}
\begin{code}
def next(self):
    self.skip_space()
    # handle end of file
    if not self.cur_char:
        self.cur_tok = Lexeme(Token.EOF, None, None, self.line, self.col)
        return self.cur_tok
    # try each lexing possibility
    if self.group1():
        return self.cur_tok
    elif self.group2():
        return self.cur_tok
    elif self.group3():
        return self.cur_tok
    else:
        # must be an invalid token
        self.cur_tok = Lexeme(Token.INVALID, self.cur_char, None,
                              self.line, self.col)
        self.consume()
        return self.cur_tok
\end{code}
\end{frame}
\end{document}