Skip to content

New regex parser #170

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 2 commits into from
Apr 5, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ Breaking changes:
prevents a compiler error (i.e. `MixedAssociativityError`)
without causing issues with `<$>`.
- Rename module prefix from `Text.Parsing.Parser` to `Parsing` (#169 by @jamesdbrock)
- Delete the `regex` parser and replace it with `mkRegex`. (#170 by @jamesdbrock)

New features:

Expand Down
22 changes: 15 additions & 7 deletions bench/Json/Parsing.purs
Original file line number Diff line number Diff line change
Expand Up @@ -4,13 +4,18 @@ import Prelude hiding (between)

import Bench.Json.Common (Json(..))
import Control.Lazy (defer)
import Data.Either (Either(..))
import Data.List (List)
import Data.Maybe (Maybe(..))
import Data.Number as Number
import Data.String.Regex.Flags (noFlags)
import Data.Tuple (Tuple(..))
import Effect.Exception (throw)
import Effect.Unsafe (unsafePerformEffect)
import Parsing (ParserT, fail)
import Parsing.Combinators (between, choice, sepBy, try)
import Parsing.String (regex, skipSpaces, string)
import Partial.Unsafe (unsafeCrashWith)

json :: forall m. Monad m => ParserT String m Json
json = defer \_ ->
Expand Down Expand Up @@ -38,15 +43,18 @@ jsonArray = defer \_ ->
json `sepBy` (try (skipSpaces *> string ","))

jsonString :: forall m. Monad m => ParserT String m String
jsonString = between (string "\"") (string "\"") do
regex {} """\\"|[^"]*"""
jsonString = case regex """\\"|[^"]*""" noFlags of
Left err -> unsafeCrashWith err
Right p -> between (string "\"") (string "\"") p

jsonNumber :: forall m. Monad m => ParserT String m Number
jsonNumber = do
n <- regex {} """(\+|-)?(\d+(\.\d*)?|\d*\.\d+)([eE](\+|-)?\d+)?"""
case Number.fromString n of
Just n' -> pure n'
Nothing -> fail "Expected number"
jsonNumber = case regex """(\+|-)?(\d+(\.\d*)?|\d*\.\d+)([eE](\+|-)?\d+)?""" noFlags of
Left err -> unsafeCrashWith err
Right p -> do
n <- p
case Number.fromString n of
Just n' -> pure n'
Nothing -> fail "Expected number"

jsonBoolean :: forall m. Monad m => ParserT String m Boolean
jsonBoolean = choice
Expand Down
1 change: 0 additions & 1 deletion spago.dhall
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,6 @@
, "numbers"
, "partial"
, "prelude"
, "record"
, "strings"
, "tailrec"
, "transformers"
Expand Down
125 changes: 46 additions & 79 deletions src/Parsing/String.purs
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,6 @@ module Parsing.String
, noneOfCodePoints
, match
, regex
, RegexFlagsRow
, consumeWith
) where

Expand All @@ -55,12 +54,10 @@ import Data.String (CodePoint, Pattern(..), codePointAt, length, null, singleton
import Data.String as String
import Data.String.CodeUnits as SCU
import Data.String.Regex as Regex
import Data.String.Regex.Flags (RegexFlags(..), RegexFlagsRec)
import Data.String.Regex.Flags (RegexFlags)
import Data.Tuple (Tuple(..), fst)
import Partial.Unsafe (unsafePartial)
import Prim.Row (class Nub, class Union)
import Record (merge)
import Parsing (ParseError(..), ParseState(..), ParserT(..), fail)
import Parsing (ParseError(..), ParseState(..), ParserT(..))
import Parsing.Combinators ((<?>), (<~?>))
import Parsing.Pos (Position(..))

Expand Down Expand Up @@ -229,101 +226,71 @@ match p = do
-- boundary.
pure $ Tuple (SCU.take (SCU.length input1 - SCU.length input2) input1) x

-- | Parser which uses the `Data.String.Regex` module to match the regular
-- | expression pattern passed as the `String`
-- | argument to the parser.
-- | Compile a regular expression string into a regular expression parser.
-- |
-- | This function will use the `Data.String.Regex.regex` function to compile and return a parser which can be used
-- | in a `ParserT String m` monad.
-- |
-- | This parser will try to match the regular expression pattern starting
-- | at the current parser position. On success, it will return the matched
-- | substring.
-- |
-- | If the `Regex` pattern string fails to compile then this parser will fail.
-- | (Note: It’s not possible to use a precompiled `Regex` because this parser
-- | must set flags and make adjustments to the `Regex` pattern string.)
-- | [*MDN Regular Expressions Cheatsheet*](https://developer.mozilla.org/en-US/docs/Web/JavaScript/Guide/Regular_Expressions/Cheatsheet)
-- |
-- | This function should be called outside the context of a `ParserT String m` monad, because this function might
-- | fail with a `Left` RegExp compilation error message.
-- | If you call this function inside of the `ParserT String m` monad and then `fail` the parse when the compilation fails,
-- | then that could be confusing because a parser failure is supposed to indicate an invalid input string.
-- | If the compilation failure occurs in an `alt` then the compilation failure might not be reported at all and instead
-- | the input string would be parsed incorrectly.
-- |
-- | This parser may be useful for quickly consuming a large section of the
-- | input `String`, because in a JavaScript runtime environment the `RegExp`
-- | input `String`, because in a JavaScript runtime environment the RegExp
-- | runtime is a lot faster than primitive parsers.
-- |
-- | [*MDN Regular Expressions Cheatsheet*](https://developer.mozilla.org/en-US/docs/Web/JavaScript/Guide/Regular_Expressions/Cheatsheet)
-- | #### Example
-- |
-- | This example shows how to compile and run the `xMany` parser which will
-- | capture the regular expression pattern `x*`.
-- |
-- | ```purescript
-- | case regex "x*" noFlags of
-- | Left compileError -> unsafeCrashWith $ "xMany failed to compile: " <> compileError
-- | Right xMany -> runParser "xxxZ" do
-- | xMany
-- | ```
-- |
-- | #### Flags
-- |
-- | The `Record flags` argument to the parser is for `Regex` flags. Here are
-- | the default flags.
-- | Set `RegexFlags` with the `Semigroup` instance like this.
-- |
-- | ```purescript
-- | { dotAll: true
-- | ignoreCase: false
-- | unicode: true
-- | }
-- | regex "x*" (dotAll <> ignoreCase)
-- | ```
-- |
-- | To use the defaults, pass
-- | `{}` as the flags argument. For case-insensitive pattern matching, pass
-- | `{ignoreCase: true}` as the flags argument.
-- |
-- | The other `Data.String.Regex.Flags.RegexFlagsRec` fields are mostly
-- | nonsense in the context of parsing
-- | and use of the other flags may cause strange behavior in the parser.
-- | The `dotAll`, `unicode`, and `ignoreCase` flags might make sense for a `regex` parser. The other flags will
-- | probably cause surprising behavior and you should avoid them.
-- |
-- | [*MDN Advanced searching with flags*](https://developer.mozilla.org/en-US/docs/Web/JavaScript/Guide/Regular_Expressions#advanced_searching_with_flags)
regex :: forall m. String -> RegexFlags -> Either String (ParserT String m String)
regex pattern flags =
Regex.regex ("^(" <> pattern <> ")") flags <#> \regexobj ->
consumeWith \input -> do
case NonEmptyArray.head <$> Regex.match regexobj input of
Just (Just consumed) -> do
let remainder = SCU.drop (SCU.length consumed) input
Right { value: consumed, consumed, remainder }
_ ->
Left "No Regex pattern match"

-- | Consume a portion of the input string while yielding a value.
-- |
-- | #### Example
-- | Takes a consumption function which takes the remaining input `String`
-- | as its argument and returns three fields:
-- |
-- | ```
-- | runParser "ababXX" (regex {} "(ab)+")
-- | ```
-- | ```
-- | (Right "abab")
-- | ```
regex
:: forall m flags f_
. Monad m
=> Union flags RegexFlagsRow f_
=> Nub f_ RegexFlagsRow
=> Record flags
-> String
-> ParserT String m String
regex flags pattern =
-- Prefix a ^ to ensure the pattern only matches the current position in the parse
case Regex.regex ("^(" <> pattern <> ")") flags' of
Left paterr ->
fail $ "Regex pattern error " <> paterr
Right regexobj ->
consumeWith \input -> do
case NonEmptyArray.head <$> Regex.match regexobj input of
Just (Just consumed) -> do
let remainder = SCU.drop (SCU.length consumed) input
Right { value: consumed, consumed, remainder }
_ ->
Left "No Regex pattern match"
where
flags' = RegexFlags
( merge flags
{ dotAll: true
, global: false
, ignoreCase: false
, multiline: false
, sticky: false
, unicode: true
} :: RegexFlagsRec
)

-- | The fields from `Data.String.Regex.Flags.RegexFlagsRec`.
type RegexFlagsRow =
( dotAll :: Boolean
, global :: Boolean
, ignoreCase :: Boolean
, multiline :: Boolean
, sticky :: Boolean
, unicode :: Boolean
)

-- | Consumes a portion of the input string while yielding a value.
-- | * `value` is the value to return.
-- | * `consumed` is the input that was consumed and is used to update the parser position.
-- | * `remainder` is the new input state.
-- | * `consumed` is the input `String` that was consumed. It is used to update the parser position.
-- | * `remainder` is the new remaining input `String`.
consumeWith
:: forall m a
. (String -> Either String { value :: a, consumed :: String, remainder :: String })
Expand Down
62 changes: 41 additions & 21 deletions test/Main.purs
Original file line number Diff line number Diff line change
@@ -1,3 +1,8 @@
-- Run tests:
--
-- spago -x spago-dev.dhall test
--

module Test.Main where

import Prelude hiding (between, when)
Expand All @@ -16,17 +21,18 @@ import Data.Number (infinity, isNaN)
import Data.String.CodePoints as SCP
import Data.String.CodeUnits (fromCharArray, singleton)
import Data.String.CodeUnits as SCU
import Data.String.Regex.Flags (RegexFlags, ignoreCase, noFlags)
import Data.Tuple (Tuple(..))
import Effect (Effect)
import Effect.Console (logShow)
import Effect.Console (log, logShow)
import Partial.Unsafe (unsafePartial)
import Test.Assert (assert')
import Parsing (ParseError(..), Parser, ParserT, fail, parseErrorMessage, parseErrorPosition, position, region, runParser)
import Parsing.Combinators (between, chainl, chainl1Rec, chainlRec, chainr1Rec, chainrRec, choice, endBy1, endBy1Rec, endByRec, many1Rec, many1TillRec, many1TillRec_, many1Till_, manyTillRec, manyTillRec_, manyTill_, notFollowedBy, optionMaybe, sepBy1, sepBy1Rec, sepByRec, sepEndBy1Rec, sepEndByRec, skipMany1Rec, skipManyRec, try, (<?>), (<??>), (<~?>))
import Parsing.Expr (Assoc(..), Operator(..), buildExprParser)
import Parsing.Language (haskellDef, haskellStyle, javaStyle)
import Parsing.Pos (Position(..), initialPos)
import Parsing.String (anyChar, anyCodePoint, char, eof, noneOfCodePoints, oneOfCodePoints, regex, rest, satisfy, string, takeN, whiteSpace)
import Parsing.String (anyChar, anyCodePoint, char, eof, regex, noneOfCodePoints, oneOfCodePoints, rest, satisfy, string, takeN, whiteSpace)
import Parsing.String.Basic (intDecimal, number, letter)
import Parsing.Token (TokenParser, makeTokenParser, match, token, when)
import Parsing.Token as Parser.Token
Expand Down Expand Up @@ -94,6 +100,14 @@ manySatisfyTest = do
_ <- char '?'
pure (fromCharArray r)

mkRegexTest :: String -> String -> String -> RegexFlags -> (Parser String String -> Parser String String) -> Effect Unit
mkRegexTest input expected pattern flags pars =
case regex pattern flags of
Left err -> assert' ("error: " <> show err) false
Right p -> parseTest input expected $ pars p

-- TODO everything is stack-safe now.
--
-- This test doesn't test the actual stack safety of these combinators, mainly
-- because I don't know how to come up with an example guaranteed to be large
-- enough to overflow the stack. But thankfully, their stack safety is more or
Expand Down Expand Up @@ -559,6 +573,7 @@ javaStyleTest = do
main :: Effect Unit
main = do

log "\nTESTS String\n"
parseErrorTestPosition
(many $ char 'f' *> char '?')
"foo"
Expand Down Expand Up @@ -667,6 +682,8 @@ main = do
parseErrorTestPosition (string "a\nb\nc\n" *> eof) "a\nb\nc\nd\n" (Position { column: 1, line: 4 })
parseErrorTestPosition (string "\ta" *> eof) "\tab" (Position { column: 10, line: 1 })

log "\nTESTS number\n"

parseTest "Infinity" infinity number
parseTest "+Infinity" infinity number
parseTest "-Infinity" (negate infinity) number
Expand All @@ -681,6 +698,7 @@ main = do
parseTest "-6.0" (-6.0) number
parseTest "+6.0" (6.0) number

log "\nTESTS Operator\n"
-- test from issue #161
-- all the below operators should play well together
parseErrorTestMessage
Expand Down Expand Up @@ -749,24 +767,23 @@ main = do
-- TODO This shows the current limitations of the number parser. Ideally this parse should fail.
parseTest "1..3" 1.0 $ number <* eof

log "\nTESTS intDecimal\n"
parseTest "-300" (-300) intDecimal

parseTest "regex-" "regex" (regex {} "regex" <* char '-' <* eof)
parseTest "-regex" "regex" (char '-' *> regex {} "regex" <* eof)
parseTest "regexregex" "regexregex" (regex {} "(regex)*")
parseTest "regexregex" "regex" (regex {} "(^regex)*")
parseTest "ReGeX" "ReGeX" (regex { ignoreCase: true } "regex")
parseTest "regexcapregexcap" "regexcap" (regex {} "(?<CaptureGroupName>regexcap)")
parseTest "regexcapregexcap" "regexcap" (regex {} "(((?<CaptureGroupName>(r)e(g)excap)))")

-- Maybe it is nonsense to allow multiline regex.
-- Because an end-of-line regex pattern `$` will match but then the
-- newline character will not be consumed.
-- Also why does this test fail? I think it should succeed.
-- parseTest "regex\nregex\n" "regex\nregex\n" (regex {dotAll: false, multiline: true} "(^regex$)+")
log "\nTESTS Regex\n"
mkRegexTest "regex-" "regex" "regex" noFlags (\regex -> regex <* char '-' <* eof)
mkRegexTest "-regex" "regex" "regex" noFlags (\regex -> char '-' *> regex <* eof)
mkRegexTest "regexregex" "regexregex" "(regex)*" noFlags identity
mkRegexTest "regexregex" "regex" "(^regex)*" noFlags identity
mkRegexTest "ReGeX" "ReGeX" "regex" ignoreCase identity
mkRegexTest "regexcapregexcap" "regexcap" "(?<CaptureGroupName>regexcap)" noFlags identity
mkRegexTest "regexcapregexcap" "regexcap" "(((?<CaptureGroupName>(r)e(g)excap)))" noFlags identity

log "\nTESTS Stack Safe Loops\n"
stackSafeLoopsTest

log "\nTESTS Token Parser\n"

tokenParserIdentifierTest
tokenParserReservedTest
tokenParserOperatorTest
Expand Down Expand Up @@ -799,18 +816,21 @@ main = do
tokenParserCommaSepTest
tokenParserCommaSep1Test

log "\nTESTS Haskell Style\n"
haskellStyleTest
log "\nTESTS Java Style\n"
javaStyleTest

log "\nTESTS region\n"
let
prependContext m' (ParseError m pos) = ParseError (m' <> m) pos
p = region (prependContext "context1 ") $ do
_ <- string "a"
region (prependContext "context2 ") $ do
string "b"
case runParser "aa" p of
Right _ -> assert' "error: ParseError expected!" false
Left (ParseError message _) -> do
let messageExpected = "context1 context2 Expected \"b\""
assert' ("expected message: " <> messageExpected <> ", message: " <> message) (message == messageExpected)
logShow messageExpected
where
prependContext m' (ParseError m pos) = ParseError (m' <> m) pos
p = region (prependContext "context1 ") $ do
_ <- string "a"
region (prependContext "context2 ") $ do
string "b"