Commit 0b36e8b

Merge pull request #2 from kemurphy/upstream-regexp

Switch to new regexp library

2 parents f62180f + 54ac347

4 files changed: +100 -696 lines

Makefile

Lines changed: 3 additions & 3 deletions

@@ -8,13 +8,13 @@ docs: doc/regexp/index.html
 doc/%/index.html: %.rs
 	rustdoc $<
 
-MC_FILES := src/main.rs src/regexp.rs src/lexer.rs
+MC_FILES := src/main.rs src/lexer.rs
 
 mc: $(MC_FILES)
-	rustc $< -o $@
+	rustc $< -o $@ -L.
 
 mc-tests: $(MC_FILES)
-	rustc --test $< -o $@
+	rustc --test $< -o $@ -L.
 
 .PHONY: all test docs clean
 clean:

src/lexer.rs

Lines changed: 92 additions & 59 deletions

@@ -1,102 +1,135 @@
 use regexp::Regexp;
-use regexp::NFA;
+use std::vec::Vec;
+use std::slice::CloneableVector;
 
 /// A single regexp for a token.
+#[deriving(Clone)]
 struct LexerRule {
-    matcher: NFA,
+    matcher: Regexp,
     token: Token,
 }
 
+#[deriving(Eq, Clone)]
 pub enum Token {
-    WhiteSpace,
+    // Whitespace
+    WS,
+
+    // Reserved words
+    Let,
+
+    // Symbols
+    LParen,
+    RParen,
+    LBrace,
+    RBrace,
+    LBracket,
+    RBracket,
+    Less,
+    Greater,
+    And,
+    Or,
+    Xor,
+    AndAnd,
+    OrOr,
+    Add,
+    Sub,
+    Mul,
+    Div,
+    Lsh,
+    Rsh,
+    Colon,
+    Semi,
+    Eq,
+    Bang,
+
+    // Literals
     Ident,
     Number,
     HexNumber,
     String,
-    Let,
-    LogicalAnd,
-    LogicalOr,
-    Character(char),
 }
 
 pub struct Lexer {
     rules: ~[LexerRule],
 }
 
 impl Lexer {
-    fn add_rule(&mut self, regexp: &str, token: Token) {
-        let nfa = Regexp::parse(regexp).compile();
-        self.rules.push(LexerRule {
-            matcher: nfa,
-            token: token,
-        }
-        )
-    }
-
-    fn add_char_rule(&mut self, c: char) {
-        if c == '@' || c == '#' || c == 'a' || c == ' ' {
-            self.add_rule(format!("{:c}", c), Character(c));
-        } else {
-            self.add_rule(format!("\\\\{:c}", c), Character(c));
-        }
-    }
-
-    fn add_char_rules(&mut self, s: &str) {
-        for c in s.chars() {
-            self.add_char_rule(c);
+    pub fn new() -> Lexer {
+        macro_rules! lexer {
+            ( $( $t:expr => $r:expr ),*) => (
+                Lexer { rules: ~[ $( LexerRule { matcher: regexp!(concat!("^(?:", $r, ")")), token: $t } ),* ] }
+            )
         }
-    }
 
-    pub fn new() -> Lexer {
-        let mut l: Lexer = Lexer { rules: ~[] };
         // Note: rules are in decreasing order of priority if there's a
         // conflict. In particular, reserved words must go before Ident.
+        lexer! {
+            // Whitespace
+            WS => r"\s|//.*|(?s)/\*.*\*/",
+
+            // Reserved words
+            Let => r"let",
 
-        l.add_rule("let", Let);
-        l.add_rule("*(\\ )", WhiteSpace);
-        l.add_rule("\\a*(\\@)", Ident);
-        l.add_rule("\\#*(\\#)", Number);
-        l.add_rule("0x|(\\#,a,b,c,d,e,f,A,B,C,D,E,F)"+
-                   "*(|(\\#,a,b,c,d,e,f,A,B,C,D,E,F))", HexNumber);
-        // TODO: this needs to be improved.
-        l.add_rule("\"*(|(\\@,\\ ,\\\\\"))\"", String);
-        // TODO: this too.
-        l.add_rule("/\\**(|(\\@,\\ ,\\\\\"))\\*/", WhiteSpace);
-        l.add_rule("&&", LogicalAnd);
-        l.add_rule("\\|\\|", LogicalOr);
-
-        // All individual characters that are valid on their own as tokens.
-        l.add_char_rules("()+-*/;:=!%^&|");
-
-        l
+            // Symbols
+            LParen => r"\(",
+            RParen => r"\)",
+            LBrace => r"\{",
+            RBrace => r"\}",
+            LBracket => r"\[",
+            RBracket => r"\]",
+            Less => r"<",
+            Greater => r">",
+            And => r"&",
+            Or => r"\|",
+            Xor => r"\^",
+            AndAnd => r"&&",
+            OrOr => r"\|\|",
+            Add => r"\+",
+            Sub => r"-",
+            Mul => r"\*",
+            Div => r"/",
+            Lsh => r"<<",
+            Rsh => r">>",
+            Colon => r":",
+            Semi => r";",
+            Eq => r"=",
+            Bang => r"!",
+
+            // Literals
+            Ident => r"[a-zA-Z_]\w*",
+            Number => r"\d",
+            HexNumber => r"0[xX][:xdigit:]+",
+            String => r#""(?:\\"|[^"])*""#
+        }
     }
 
     pub fn tokenize(&self, s: &str) -> (~[(Token, ~str)]) {
         let mut pos = 0u;
-        let mut result = ~[];
+        let mut result = vec!();
         while pos < s.len() {
             let mut longest = 0u;
-            let mut best_token = None;
-            let mut best_str = ~"";
+            let mut best = None;
             for rule in self.rules.iter() {
-                let m = rule.matcher.match_string(s.slice_from(pos));
+                let m = rule.matcher.find(s.slice_from(pos));
                 match m {
-                    Some(ref s) if s.len() > longest => {
-                        best_token = Some(rule.token);
-                        best_str = s.clone();
-                        longest = s.len();
+                    Some((begin, end)) if begin == 0 => {
+                        let s = s.slice(pos, pos + end);
+                        if s.len() > longest {
+                            best = Some((rule.token, s));
+                            longest = s.len();
+                        }
                     },
                     _ => {},
                 }
             }
             pos += longest;
-            match best_token.unwrap() {
-                WhiteSpace => {},
-                x => result.push((x, best_str))
+            match best.unwrap() {
+                (WS, _) => {},
+                (t, s) => result.push((t, s.to_owned()))
             }
         }
 
-        result
+        result.as_slice().to_owned()
    }
 }
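
The notable move in the new new() is the local lexer! macro: it turns a flat token => pattern table into the rules vector, wrapping every pattern in "^(?:...)" so a rule can only match at the current scan position. A minimal sketch of that shape in modern Rust, using today's regex crate rather than the commit's pre-1.0 regexp library (Token, LexerRule, and rules here are illustrative stand-ins, not the commit's API):

use regex::Regex;

#[derive(Clone, Copy, PartialEq, Debug)]
enum Token { WS, Let, Ident }

struct LexerRule {
    matcher: Regex,
    token: Token,
}

macro_rules! lexer {
    ( $( $t:expr => $r:expr ),* $(,)? ) => {
        // Anchor each pattern so it can only match at the start of the
        // remaining input, as the commit's concat!("^(?:", $r, ")") does.
        vec![ $( LexerRule {
            matcher: Regex::new(&format!("^(?:{})", $r)).unwrap(),
            token: $t,
        } ),* ]
    };
}

fn rules() -> Vec<LexerRule> {
    lexer! {
        Token::WS    => r"\s|//.*",
        Token::Let   => r"let",
        Token::Ident => r"[a-zA-Z_]\w*",
    }
}

The commit's regexp! runs at macro-expansion time, which is why its patterns must be assembled with concat! from string literals; the runtime format! above trades that compile-time check for simplicity.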

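The tokenize loop implements maximal munch: at each position every rule is tried, the longest match wins, and earlier rules win ties, which is exactly why the comment insists reserved words go before Ident. A sketch of the same loop in modern Rust, reusing the Token, LexerRule, and rules definitions from the previous sketch:

fn tokenize(rules: &[LexerRule], s: &str) -> Vec<(Token, String)> {
    let mut result = Vec::new();
    let mut pos = 0;
    while pos < s.len() {
        let mut longest = 0;
        let mut best = None;
        for rule in rules {
            // Patterns are ^-anchored, so any match starts exactly at pos.
            if let Some(m) = rule.matcher.find(&s[pos..]) {
                // Strictly-greater keeps the earlier rule on ties; that is
                // how "let" lexes as Let rather than Ident.
                if m.end() > longest {
                    best = Some((rule.token, &s[pos..pos + m.end()]));
                    longest = m.end();
                }
            }
        }
        // As in the commit's code, input that no rule matches panics here.
        let (token, text) = best.unwrap();
        pos += longest;
        if token != Token::WS {
            result.push((token, text.to_string()));
        }
    }
    result
}

fn main() {
    let toks = tokenize(&rules(), "let x");
    assert_eq!(toks, vec![(Token::Let, "let".to_string()),
                          (Token::Ident, "x".to_string())]);
}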

src/main.rs

Lines changed: 5 additions & 2 deletions

@@ -1,9 +1,12 @@
-#![feature(globs)]
+#![feature(globs,phase,macro_rules)]
 #![allow(dead_code,unused_imports)]
 
+#[phase(syntax)]
+extern crate regexp_macros;
+
 extern crate collections;
+extern crate regexp;
 
-mod regexp;
 mod lexer;
 
 fn main() {
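
The #[phase(syntax)] attribute is what loads regexp_macros as a compiler plugin so that regexp! is available as a syntax extension: each pattern is parsed and compiled while the program itself compiles, making a bad regex a build error. That plugin mechanism predates Rust 1.0 and is long gone; a common modern stand-in, sketched here with the regex crate and std::sync::LazyLock (stable since Rust 1.80; LET_RE is an illustrative name, not from the commit), is to compile each pattern lazily, once:

use std::sync::LazyLock;
use regex::Regex;

// Compiled on first use and cached for the rest of the program; an invalid
// pattern now panics at first use instead of failing the build as regexp! did.
static LET_RE: LazyLock<Regex> =
    LazyLock::new(|| Regex::new(r"^(?:let)").expect("valid pattern"));

fn main() {
    assert!(LET_RE.is_match("let x = 5;"));
}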
