
Commit 02637fc

1.3 Lexer (basic)
1 parent 136d6ff commit 02637fc

File tree

lexer/lexer.go
lexer/lexer_test.go
token/token.go

3 files changed: +229 -0 lines changed

lexer/lexer.go

Lines changed: 134 additions & 0 deletions
@@ -0,0 +1,134 @@
// Package lexer implements the lexical analysis that is used to transform
// the source code input into a stream of tokens for parsing by the parser.
// The lexer only supports ASCII characters instead of the full Unicode range
// for now, to keep things simple.
package lexer

import "github.com/cedrickchee/hou/token"

// Lexer represents the lexer and contains the source input and internal state.
type Lexer struct {
    input        string
    position     int  // current position in input (points to current char)
    readPosition int  // current reading position in input (after current char)
    ch           byte // current char under examination
}

// New returns a new Lexer.
func New(input string) *Lexer {
    l := &Lexer{input: input}
    l.readChar()
    return l
}

// NextToken returns the next token read from the input stream.
func (l *Lexer) NextToken() token.Token {
    var tok token.Token

    l.skipWhitespace()

    switch l.ch {
    case '=':
        tok = newToken(token.ASSIGN, l.ch)
    case ';':
        tok = newToken(token.SEMICOLON, l.ch)
    case '(':
        tok = newToken(token.LPAREN, l.ch)
    case ')':
        tok = newToken(token.RPAREN, l.ch)
    case ',':
        tok = newToken(token.COMMA, l.ch)
    case '+':
        tok = newToken(token.PLUS, l.ch)
    case '{':
        tok = newToken(token.LBRACE, l.ch)
    case '}':
        tok = newToken(token.RBRACE, l.ch)
    case 0:
        tok.Literal = ""
        tok.Type = token.EOF
    default:
        if isLetter(l.ch) {
            tok.Literal = l.readIdentifier()
            tok.Type = token.LookupIdent(tok.Literal)
            // Early exit: readIdentifier() has already advanced past the
            // identifier, so we must not call readChar() below.
            return tok
        } else if isDigit(l.ch) {
            tok.Type = token.INT
            tok.Literal = l.readNumber()
            return tok
        } else {
            tok = newToken(token.ILLEGAL, l.ch)
        }
    }

    l.readChar()
    return tok
}

// readChar gives us the next character and advances our position in the input
// string. It is a helper method that makes the usage of the lexer fields
// easier to understand.
func (l *Lexer) readChar() {
    // First, check whether we've reached the end of the input.
    if l.readPosition >= len(l.input) {
        // 0 is the ASCII code for the "NUL" character and signifies either
        // "we haven't read anything yet" or "end of file".
        l.ch = 0
    } else {
        l.ch = l.input[l.readPosition]
    }
    // After that, l.readPosition always points to the next position we're
    // going to read from, and l.position always points to the position we
    // last read.
    l.position = l.readPosition
    l.readPosition++

    // Note: Unicode support
    // ---------------------
    // In order to fully support Unicode and UTF-8 we would need to change
    // l.ch from a byte to a rune and change the way we read the next
    // characters, since they could now be multiple bytes wide.
}

// readIdentifier reads in an identifier and advances the lexer's positions
// until it encounters a non-letter character.
func (l *Lexer) readIdentifier() string {
    position := l.position
    for isLetter(l.ch) {
        l.readChar()
    }
    return l.input[position:l.position]
}

// readNumber reads in an integer literal and advances the lexer's positions
// until it encounters a non-digit character.
func (l *Lexer) readNumber() string {
    position := l.position
    for isDigit(l.ch) {
        l.readChar()
    }
    return l.input[position:l.position]
}

// In Monkey, whitespace only acts as a separator of tokens and doesn't carry
// meaning, so we skip over it entirely. Otherwise we would get an ILLEGAL
// token for each whitespace character, e.g. for the space between "let" and
// "five".
func (l *Lexer) skipWhitespace() {
    for l.ch == ' ' || l.ch == '\t' || l.ch == '\n' || l.ch == '\r' {
        l.readChar()
    }
}

// newToken is a small helper for creating single-character tokens.
func newToken(tokenType token.TokenType, ch byte) token.Token {
    return token.Token{Type: tokenType, Literal: string(ch)}
}

// isLetter checks whether the given argument is a letter (or an underscore,
// which we treat as a letter so identifiers like foo_bar work).
func isLetter(ch byte) bool {
    return 'a' <= ch && ch <= 'z' || 'A' <= ch && ch <= 'Z' || ch == '_'
}

// isDigit returns whether the passed in byte is a Latin digit between 0 and 9.
func isDigit(ch byte) bool {
    return '0' <= ch && ch <= '9'
}
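
The Unicode note inside readChar sketches future work. A minimal illustration of what that could look like follows. It is not part of this commit; runeLexer is a hypothetical stand-in for a Lexer whose ch field is a rune instead of a byte.

package lexer

import "unicode/utf8"

// runeLexer is a hypothetical variant of Lexer (not in this commit) whose
// ch field is a rune so it can hold multi-byte UTF-8 characters.
type runeLexer struct {
    input        string
    position     int  // byte offset where the current rune starts
    readPosition int  // byte offset of the next rune to read
    ch           rune // current rune under examination
}

// readChar decodes the next UTF-8 rune instead of reading a single byte.
func (l *runeLexer) readChar() {
    if l.readPosition >= len(l.input) {
        l.ch = 0 // end of input, mirrors the byte-based version
        l.position = l.readPosition
        l.readPosition++
        return
    }
    // DecodeRuneInString reports the rune and its width in bytes (1 to 4),
    // so the read position may advance by more than one per character.
    r, width := utf8.DecodeRuneInString(l.input[l.readPosition:])
    l.ch = r
    l.position = l.readPosition
    l.readPosition += width
}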

lexer/lexer_test.go

Lines changed: 80 additions & 0 deletions
@@ -0,0 +1,80 @@
package lexer

import (
    "testing"

    "github.com/cedrickchee/hou/token"
)

func TestNextToken(t *testing.T) {
    // input looks like a subset of the Monkey language. It contains all the
    // symbols we already successfully turned into tokens.
    input := `let five = 5;
let ten = 10;

let add = fn(x, y) {
    x + y;
};

let result = add(five, ten);
`

    tests := []struct {
        expectedType    token.TokenType
        expectedLiteral string
    }{
        {token.LET, "let"},
        {token.IDENT, "five"},
        {token.ASSIGN, "="},
        {token.INT, "5"},
        {token.SEMICOLON, ";"},
        {token.LET, "let"},
        {token.IDENT, "ten"},
        {token.ASSIGN, "="},
        {token.INT, "10"},
        {token.SEMICOLON, ";"},
        {token.LET, "let"},
        {token.IDENT, "add"},
        {token.ASSIGN, "="},
        {token.FUNCTION, "fn"},
        {token.LPAREN, "("},
        {token.IDENT, "x"},
        {token.COMMA, ","},
        {token.IDENT, "y"},
        {token.RPAREN, ")"},
        {token.LBRACE, "{"},
        {token.IDENT, "x"},
        {token.PLUS, "+"},
        {token.IDENT, "y"},
        {token.SEMICOLON, ";"},
        {token.RBRACE, "}"},
        {token.SEMICOLON, ";"},
        {token.LET, "let"},
        {token.IDENT, "result"},
        {token.ASSIGN, "="},
        {token.IDENT, "add"},
        {token.LPAREN, "("},
        {token.IDENT, "five"},
        {token.COMMA, ","},
        {token.IDENT, "ten"},
        {token.RPAREN, ")"},
        {token.SEMICOLON, ";"},
        {token.EOF, ""},
    }

    l := New(input)

    for i, tt := range tests {
        tok := l.NextToken()

        if tok.Type != tt.expectedType {
            t.Fatalf("tests[%d] - tokentype wrong. expected=%q, got=%q",
                i, tt.expectedType, tok.Type)
        }

        if tok.Literal != tt.expectedLiteral {
            t.Fatalf("tests[%d] - literal wrong. expected=%q, got=%q",
                i, tt.expectedLiteral, tok.Literal)
        }
    }
}
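
To see the lexer in action outside the test suite, here is a minimal sketch (not part of this commit) that tokenizes a line of Monkey source and prints each token. It relies only on New, NextToken, and token.EOF from the code in this commit:

package main

import (
    "fmt"

    "github.com/cedrickchee/hou/lexer"
    "github.com/cedrickchee/hou/token"
)

func main() {
    l := lexer.New("let five = 5;")
    // Pull tokens from the stream until the lexer signals EOF.
    for tok := l.NextToken(); tok.Type != token.EOF; tok = l.NextToken() {
        fmt.Printf("%+v\n", tok)
    }
}

The committed test itself runs with "go test ./lexer" from the repository root.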

token/token.go

Lines changed: 15 additions & 0 deletions
@@ -32,6 +32,12 @@ const (
 	LET = "LET"
 )

+// Language keywords table
+var keywords = map[string]TokenType{
+	"fn":  FUNCTION,
+	"let": LET,
+}
+
 // TokenType distinguishes between different types of tokens.
 type TokenType string

@@ -40,3 +46,12 @@ type Token struct {
 	Type    TokenType
 	Literal string
 }
+
+// LookupIdent looks up the identifier in ident and returns the appropriate
+// token type depending on whether the identifier is user-defined or a keyword.
+func LookupIdent(ident string) TokenType {
+	if tok, ok := keywords[ident]; ok {
+		return tok // language keyword
+	}
+	return IDENT // user-defined identifier
+}
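
A hypothetical companion test (not included in this commit) would pin down LookupIdent's contract: keywords map to their dedicated token types, and everything else falls back to IDENT.

package token

import "testing"

// TestLookupIdent is a hypothetical test (not in this commit) checking
// LookupIdent's behavior for both keywords and ordinary identifiers.
func TestLookupIdent(t *testing.T) {
    cases := map[string]TokenType{
        "fn":     FUNCTION, // language keyword
        "let":    LET,      // language keyword
        "foobar": IDENT,    // anything else is a user-defined identifier
    }
    for ident, want := range cases {
        if got := LookupIdent(ident); got != want {
            t.Errorf("LookupIdent(%q) = %q, want %q", ident, got, want)
        }
    }
}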
