added disc 7

cmsc330fall24 · Oct 11, 2024 · b6d97bb · b6d97bb
1 parent 0cdefe0
commit b6d97bb
Show file tree

Hide file tree

Showing 15 changed files with 353 additions and 0 deletions.
diff --git a/README.md b/README.md
@@ -15,6 +15,7 @@ This repository will have links to projects and discussions.
   + [Discussion 4](https://github.com/cmsc330fall24/fall2024/blob/main/discussions/discussion4.md)
   + [Discussion 5](https://github.com/cmsc330fall24/fall2024/blob/main/discussions/d5_nfa_dfa/README.md)
   + [Discussion 6](https://github.com/cmsc330fall24/fall2024/tree/main/discussions/d6_nfa_review_cfg)
+  + [Discussion 7](https://github.com/cmsc330fall24/fall2024/tree/main/discussions/d7_parsing)
 
 ## Discussion Solutions
   + [Discussion 2 Solutions](https://github.com/cmsc330fall24/fall2024/blob/main/discussions/discussion2_sol.md)

diff --git a/discussions/d7_parsing/.ocamlinit b/discussions/d7_parsing/.ocamlinit
@@ -0,0 +1,3 @@
+open D8.Lexer
+open D8.Parser
+open D8.Interpreter
diff --git a/discussions/d7_parsing/README.md b/discussions/d7_parsing/README.md
@@ -0,0 +1,59 @@
+# Discussion 7 - Friday, October 11th
+
+## Reminders
+
+1. Project 3 due **Monday, October 14 @ 11:59 PM**
+
+## Coding Exercise
+
+- To go from source code to a running program, there are 3 steps (at least for our purposes):
+
+  - Tokenizing/Lexing (separating text into smaller tokens)
+  - Parsing (generating something meaningful from the tokens - an AST)
+  - Interpreting (evaluating the result of the AST)
+
+- Consider the following grammar:
+
+  ```
+  S -> M + S | M
+  M -> N * M | N
+  N -> n | (S)
+
+  * where n is any integer
+  ```
+
+  - This grammar is right associative/recursive. Why did we provide a right associative grammar? What would you do if we didn't?
+
+  - What is the relative precedence of the + and \* operators here? How is it determined? How can we use CFGs to enforce precedence?
+
+### Lexer
+
+- Open `lexer.ml`.
+- **NOTES:**
+  - Take a look at the variant type `token` we have defined
+  - Keep an index that keeps track of where we are in the string, and move forward as we keep tokenizing.
+  - It's probably also a good idea to define all the regexes up front and store them in variables at the top.
+
+### Parser
+
+- Open `parser.ml`.
+- **NOTES:**
+  - Take a look at the variant type `expr` we have defined
+  - Use `let rec ... and` to write mutually recursive functions.
+  - `lookahead` returns the head of the list.
+  - `match_token` "consumes" the head of the list (provided that the token and head of the list match).
+- **IMPORTANT:**
+  - We're going to write a function named `parse_X` for each nonterminal `X` in our grammar.
+  - Each of these functions will parse (consume) some tokens, and return (1) the unparsed tokens and (2) the AST which corresponds to the parsed tokens.
+
+### Interpreter
+
+- Open `interpreter.ml`.
+- **NOTES:**
+  - Our `eval` function will take in an AST created by `parser` and evaluate it into an integer
+  - Recursion is your friend!
+
+## Resources & Additional Readings
+
+- [Cliff's Notes on Grammars](https://bakalian.cs.umd.edu/assets/notes/grammars.pdf)
+- [Anwar's Parsing Slides](https://bakalian.cs.umd.edu/assets/slides/16-parsing1.pdf)
diff --git a/discussions/d7_parsing/dune-project b/discussions/d7_parsing/dune-project
@@ -0,0 +1 @@
+(lang dune 2.3)
diff --git a/discussions/d7_parsing/sols/interpreter.ml b/discussions/d7_parsing/sols/interpreter.ml
@@ -0,0 +1,13 @@
+open Parser
+
+(* Evaluator *)
+
+(* [eval ast] reduces an expression tree to the integer it denotes:
+   an [Int] leaf is its own value, while [Plus] and [Mult] combine the values
+   of their two sub-expressions (left operand evaluated first). *)
+let rec eval (ast : expr) : int =
+  let binop op lhs rhs =
+    let l = eval lhs in
+    let r = eval rhs in
+    op l r
+  in
+  match ast with
+  | Int n -> n
+  | Plus (a, b) -> binop ( + ) a b
+  | Mult (a, b) -> binop ( * ) a b
diff --git a/discussions/d7_parsing/sols/lexer.ml b/discussions/d7_parsing/sols/lexer.ml
@@ -0,0 +1,53 @@
+(* Type: the lexical tokens built by [lexer] below and rendered back to text
+   by [string_of_token]. *)
+type token =
+| Tok_Int of int  (* integer literal, carrying its value *)
+| Tok_Mult        (* multiplication operator *)
+| Tok_Plus        (* addition operator *)
+| Tok_LParen      (* opening parenthesis *)
+| Tok_RParen      (* closing parenthesis *)
+| Tok_EOF         (* end-of-input marker; [lexer] puts it last in its result *)
+
+(* Renders a token as the source text it stands for. An integer token prints
+   as its decimal value and [Tok_EOF] prints as the empty string. *)
+let string_of_token = function
+  | Tok_Int n -> string_of_int n
+  | Tok_Plus -> "+"
+  | Tok_Mult -> "*"
+  | Tok_LParen -> "("
+  | Tok_RParen -> ")"
+  | Tok_EOF -> ""
+
+
+(* [string_of_list conv lst] converts every element of [lst] with [conv] and
+   joins the results with a single space. The empty list gives "". *)
+let string_of_list conv lst =
+  String.concat " " (List.map conv lst)
+
+(* Given source code returns a token list.
+
+   The input is scanned left to right. Parentheses, plus and star each become
+   one token, a run of digits with an optional leading minus becomes a
+   [Tok_Int], and whitespace (spaces, tabs, newlines, carriage returns) is
+   skipped. The result always ends with [Tok_EOF].
+
+   Raises [Failure "lexing error"] at the first character that starts no
+   token. [int_of_string] raises [Failure] if a literal does not fit in an
+   [int]. *)
+let lexer (input : string) : token list =
+  let length = String.length input in
+
+  (* Compile every pattern once per call instead of once per position. *)
+  let re_lparen = Str.regexp "(" in
+  let re_rparen = Str.regexp ")" in
+  let re_plus = Str.regexp "\\+" in
+  let re_mult = Str.regexp "\\*" in
+  let re_int = Str.regexp "-?[0-9]+" in
+  let re_space = Str.regexp "[ \t\n\r]+" in
+
+  (* [tok pos] lexes the suffix of [input] that starts at index [pos]. *)
+  let rec tok pos =
+    if pos >= length then
+      [Tok_EOF]
+
+    else if Str.string_match re_lparen input pos then
+      Tok_LParen :: tok (pos + 1)
+
+    else if Str.string_match re_rparen input pos then
+      Tok_RParen :: tok (pos + 1)
+
+    else if Str.string_match re_plus input pos then
+      Tok_Plus :: tok (pos + 1)
+
+    else if Str.string_match re_mult input pos then
+      Tok_Mult :: tok (pos + 1)
+
+    else if Str.string_match re_int input pos then
+      (* Read the matched text before recursing: the next [string_match]
+         overwrites the match state kept by [Str]. *)
+      let value = Str.matched_string input in
+      Tok_Int (int_of_string value) :: tok (pos + String.length value)
+
+    else if Str.string_match re_space input pos then
+      (* Skip the whole whitespace run in one step. *)
+      let blank = Str.matched_string input in
+      tok (pos + String.length blank)
+
+    else
+      failwith "lexing error"
+
+  in tok 0;;
diff --git a/discussions/d7_parsing/sols/lexer_re.ml b/discussions/d7_parsing/sols/lexer_re.ml
@@ -0,0 +1,57 @@
+(* Type: the lexical tokens built by [lexer] below and rendered back to text
+   by [string_of_token]. *)
+type token =
+| Tok_Int of int  (* integer literal, carrying its value *)
+| Tok_Mult        (* multiplication operator *)
+| Tok_Plus        (* addition operator *)
+| Tok_LParen      (* opening parenthesis *)
+| Tok_RParen      (* closing parenthesis *)
+| Tok_EOF         (* end-of-input marker; printed as the empty string *)
+
+(* Renders a token as the source text it stands for. An integer token prints
+   as its decimal value and [Tok_EOF] prints as the empty string. *)
+let string_of_token = function
+  | Tok_Int n -> string_of_int n
+  | Tok_Plus -> "+"
+  | Tok_Mult -> "*"
+  | Tok_LParen -> "("
+  | Tok_RParen -> ")"
+  | Tok_EOF -> ""
+
+(* [string_of_list conv lst] converts every element of [lst] with [conv] and
+   joins the results with a single space. The empty list gives "". *)
+let string_of_list conv lst =
+  String.concat " " (List.map conv lst)
+
+(* Given source code returns a token list.
+
+   Variant of the lexer built on the [Re] library. Each step matches one
+   anchored pattern against the front of the remaining input, emits the
+   matching token (whitespace emits nothing) and continues on the rest.
+
+   The result always ends with [Tok_EOF], which is what the parser checks
+   for once it has consumed a whole expression.
+
+   Raises [Failure "lexing error"] when the remaining input starts with a
+   character that begins no token. [int_of_string] raises [Failure] if a
+   literal does not fit in an [int]. *)
+let lexer (input : string) : token list =
+  (* Compile the patterns once per call, not once per token. Backslashes are
+     doubled so that the regex engine, not the OCaml string lexer, sees the
+     escape. *)
+  let numre = Re.compile (Re.Perl.re "^(-?[0-9]+)") in
+  let addre = Re.compile (Re.Perl.re "^\\+") in
+  let mulre = Re.compile (Re.Perl.re "^\\*") in
+  let lpre = Re.compile (Re.Perl.re "^\\(") in
+  let rpre = Re.compile (Re.Perl.re "^\\)") in
+  let wsre = Re.compile (Re.Perl.re "^(\\s+)") in
+
+  (* [drop n s] is [s] without its first [n] characters. *)
+  let drop n s = String.sub s n (String.length s - n) in
+
+  (* [tok rest] lexes the not-yet-consumed suffix [rest]. *)
+  let rec tok rest =
+    if rest = "" then [Tok_EOF]
+    else if Re.execp lpre rest then
+      Tok_LParen :: tok (drop 1 rest)
+    else if Re.execp rpre rest then
+      Tok_RParen :: tok (drop 1 rest)
+    else if Re.execp addre rest then
+      Tok_Plus :: tok (drop 1 rest)
+    else if Re.execp mulre rest then
+      Tok_Mult :: tok (drop 1 rest)
+    else if Re.execp numre rest then
+      (* Group 1 is the literal itself; its length says how far to advance. *)
+      let num = Re.Group.get (Re.exec numre rest) 1 in
+      Tok_Int (int_of_string num) :: tok (drop (String.length num) rest)
+    else if Re.execp wsre rest then
+      (* Group 1 is the whole whitespace run, skipped in one step. *)
+      let ws = Re.Group.get (Re.exec wsre rest) 1 in
+      tok (drop (String.length ws) rest)
+    else
+      failwith "lexing error"
+  in
+  tok input
+;;
diff --git a/discussions/d7_parsing/sols/parser.ml b/discussions/d7_parsing/sols/parser.ml
@@ -0,0 +1,62 @@
+open Lexer
+
+(* Types: the abstract syntax tree returned by [parser]. *)
+type expr =
+| Int of int           (* integer leaf *)
+| Plus of expr * expr  (* addition of a left and a right sub-expression *)
+| Mult of expr * expr  (* multiplication of a left and a right sub-expression *)
+
+(* Provided helper function - takes a token list and an expected token.
+ * When the head of the list equals the expected token, returns the tail.
+ * Otherwise raises [Failure]: on an empty list the message is just the
+ * expected token, on a mismatch it names the expected token, the remaining
+ * input and the token actually found. *)
+let match_token (toks : token list) (tok : token) : token list =
+  match toks with
+  | [] -> failwith (string_of_token tok)
+  | head :: rest ->
+    if head = tok then rest
+    else
+      failwith
+        (Printf.sprintf "Expected %s from input %s, got %s"
+           (string_of_token tok)
+           (string_of_list string_of_token toks)
+           (string_of_token head))
+
+(* Returns the first element of [toks] without consuming it.
+   Raises [Failure "Empty input to lookahead"] on the empty list. *)
+let lookahead toks =
+  match toks with
+  | [] -> failwith "Empty input to lookahead"
+  | first :: _ -> first
+
+
+(* Parses a token list: runs [parse_S] on the whole input and returns its AST.
+   Raises [Failure "did not reach EOF"] unless exactly [[Tok_EOF]] is left
+   over afterwards. *)
+let rec parser (toks : token list) : expr =
+  match parse_S toks with
+  | ([Tok_EOF], ast) -> ast
+  | _ -> failwith "did not reach EOF"
+
+(* Parses the S rule (S -> M + S | M): one M, then, if a plus follows,
+   the plus and a further S. Returns the unconsumed tokens and the AST. *)
+and parse_S toks =
+  let (after_m, lhs) = parse_M toks in
+  if lookahead after_m = Tok_Plus then
+    let (remaining, rhs) = parse_S (match_token after_m Tok_Plus) in
+    (remaining, Plus (lhs, rhs))
+  else
+    (after_m, lhs)
+
+(* Parses the M rule (M -> N * M | N): one N, then, if a star follows,
+   the star and a further M. Returns the unconsumed tokens and the AST. *)
+and parse_M toks =
+  let (after_n, lhs) = parse_N toks in
+  if lookahead after_n = Tok_Mult then
+    let (remaining, rhs) = parse_M (match_token after_n Tok_Mult) in
+    (remaining, Mult (lhs, rhs))
+  else
+    (after_n, lhs)
+
+(* Parses the N rule (N -> n | (S)): an integer literal, or an S wrapped in
+   parentheses. Raises [Failure "parse_N failed"] on any other leading
+   token. *)
+and parse_N toks =
+  match lookahead toks with
+  | Tok_Int value -> (match_token toks (Tok_Int value), Int value)
+  | Tok_LParen ->
+    let (after_s, inner) = parse_S (match_token toks Tok_LParen) in
+    (match_token after_s Tok_RParen, inner)
+  | Tok_Mult | Tok_Plus | Tok_RParen | Tok_EOF -> failwith "parse_N failed"
diff --git a/discussions/d7_parsing/src/dune b/discussions/d7_parsing/src/dune
@@ -0,0 +1,7 @@
+(library
+  (name d8)
+  (modules lexer parser interpreter)
+  (libraries str))
+(env
+  (dev
+      (flags (:standard -w -27-39-33-32))))
diff --git a/discussions/d7_parsing/src/interpreter.ml b/discussions/d7_parsing/src/interpreter.ml
@@ -0,0 +1,6 @@
+open Parser
+
+(* Evaluator
+   Exercise stub: [eval] is declared to take a Parser [expr] and return an
+   [int], but the body below always raises [Failure "unimplemented"]. *)
+
+let rec eval (ast : expr) : int =
+  failwith "unimplemented"
diff --git a/discussions/d7_parsing/src/interpreter.mli b/discussions/d7_parsing/src/interpreter.mli
@@ -0,0 +1 @@
+val eval : Parser.expr -> int  (** [eval ast] evaluates the expression tree [ast] to an integer. *)
diff --git a/discussions/d7_parsing/src/lexer.ml b/discussions/d7_parsing/src/lexer.ml
@@ -0,0 +1,26 @@
+(* Type: the lexical tokens that [lexer] is declared to return and that
+   [string_of_token] renders back to text. *)
+type token =
+| Tok_Int of int  (* integer literal, carrying its value *)
+| Tok_Mult        (* multiplication operator *)
+| Tok_Plus        (* addition operator *)
+| Tok_LParen      (* opening parenthesis *)
+| Tok_RParen      (* closing parenthesis *)
+| Tok_EOF         (* end-of-input marker; printed as the empty string *)
+
+(* Renders a token as the source text it stands for. An integer token prints
+   as its decimal value and [Tok_EOF] prints as the empty string. *)
+let string_of_token = function
+  | Tok_Int n -> string_of_int n
+  | Tok_Plus -> "+"
+  | Tok_Mult -> "*"
+  | Tok_LParen -> "("
+  | Tok_RParen -> ")"
+  | Tok_EOF -> ""
+
+(* [string_of_list conv lst] converts every element of [lst] with [conv] and
+   joins the results with a single space. The empty list gives "". *)
+let string_of_list conv lst =
+  String.concat " " (List.map conv lst)
+
+(* Given source code returns a token list.
+   Exercise stub: the body below always raises [Failure "unimplemented"]. *)
+let rec lexer (input : string) : token list =
+  failwith "unimplemented"
diff --git a/discussions/d7_parsing/src/lexer.mli b/discussions/d7_parsing/src/lexer.mli
@@ -0,0 +1,13 @@
+type token =
+| Tok_Int of int  (* integer literal, carrying its value *)
+| Tok_Mult        (* multiplication operator *)
+| Tok_Plus        (* addition operator *)
+| Tok_LParen      (* opening parenthesis *)
+| Tok_RParen      (* closing parenthesis *)
+| Tok_EOF         (* end-of-input marker *)
+
+val lexer : string -> token list  (** Given source code, returns its token list. *)
+
+val string_of_token : token -> string  (** Renders one token as text. *)
+
+val string_of_list : ('a -> string) -> 'a list -> string  (** Renders a list, converting each element with the given function. *)
diff --git a/discussions/d7_parsing/src/parser.ml b/discussions/d7_parsing/src/parser.ml
@@ -0,0 +1,42 @@
+open Lexer
+
+(* Types: the abstract syntax tree that [parser] is declared to return. *)
+type expr =
+| Int of int           (* integer leaf *)
+| Plus of expr * expr  (* addition of a left and a right sub-expression *)
+| Mult of expr * expr  (* multiplication of a left and a right sub-expression *)
+
+(* Provided helper function - takes a token list and an expected token.
+ * When the head of the list equals the expected token, returns the tail.
+ * Otherwise raises [Failure]: on an empty list the message is just the
+ * expected token, on a mismatch it names the expected token, the remaining
+ * input and the token actually found. *)
+let match_token (toks : token list) (tok : token) : token list =
+  match toks with
+  | [] -> failwith (string_of_token tok)
+  | head :: rest ->
+    if head = tok then rest
+    else
+      failwith
+        (Printf.sprintf "Expected %s from input %s, got %s"
+           (string_of_token tok)
+           (string_of_list string_of_token toks)
+           (string_of_token head))
+
+(* Returns the first element of [toks] without consuming it.
+   Raises [Failure "Empty input to lookahead"] on the empty list. *)
+let lookahead toks =
+  match toks with
+  | [] -> failwith "Empty input to lookahead"
+  | first :: _ -> first
+
+
+
+(* Parses a token list.
+   Exercise stub: declared to return an [expr], but the body below always
+   raises [Failure "unimplemented"]. *)
+let rec parser (toks : token list) : expr =
+  failwith "unimplemented"
+
+(* Parses the S rule.
+   Declared to return the remaining tokens paired with an [expr].
+   Exercise stub: always raises [Failure "unimplemented"]. *)
+and parse_S (toks : token list) : (token list * expr) =
+  failwith "unimplemented"
+
+(* Parses the M rule.
+   Declared to return the remaining tokens paired with an [expr].
+   Exercise stub: always raises [Failure "unimplemented"]. *)
+and parse_M (toks : token list) : (token list * expr) =
+  failwith "unimplemented"
+
+(* Parses the N rule.
+   Declared to return the remaining tokens paired with an [expr].
+   Exercise stub: always raises [Failure "unimplemented"]. *)
+and parse_N (toks : token list) : (token list * expr) =
+  failwith "unimplemented"
diff --git a/discussions/d7_parsing/src/parser.mli b/discussions/d7_parsing/src/parser.mli
@@ -0,0 +1,9 @@
+type expr =
+| Int of int           (* integer leaf *)
+| Plus of expr * expr  (* addition of a left and a right sub-expression *)
+| Mult of expr * expr  (* multiplication of a left and a right sub-expression *)
+
+val parser : Lexer.token list -> expr  (** Parses a whole token list into an [expr]. *)
+val parse_S : Lexer.token list -> Lexer.token list * expr  (** Parses the S rule; returns remaining tokens and the AST. *)
+val parse_M : Lexer.token list -> Lexer.token list * expr  (** Parses the M rule; returns remaining tokens and the AST. *)
+val parse_N : Lexer.token list -> Lexer.token list * expr  (** Parses the N rule; returns remaining tokens and the AST. *)