Skip to content

Parser.String.regex #153

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Mar 14, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 4 additions & 1 deletion .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -26,12 +26,15 @@ jobs:
.spago
output

- name: Install dev dependencies
- name: Install dependencies
run: spago install

- name: Build source
run: spago build --no-install --purs-args '--censor-lib --strict --censor-codes='UserDefinedWarning''

- name: Install dev dependencies
run: spago -x spago-dev.dhall install

- name: Run tests
run: spago -x spago-dev.dhall test --no-install

Expand Down
2 changes: 2 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,8 @@ Breaking changes:

New features:

Add `regex` parser.

Bugfixes:

Other improvements:
Expand Down
1 change: 1 addition & 0 deletions bower.json
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@
"purescript-newtype": "^v4.0.0",
"purescript-numbers": "^v8.0.0",
"purescript-prelude": "^v5.0.1",
"purescript-record": "^v3.0.0",
"purescript-strings": "^v5.0.0",
"purescript-tailrec": "^v5.0.1",
"purescript-transformers": "^v5.1.0",
Expand Down
1 change: 1 addition & 0 deletions spago.dhall
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
, "newtype"
, "numbers"
, "prelude"
, "record"
, "strings"
, "tailrec"
, "transformers"
Expand Down
87 changes: 87 additions & 0 deletions src/Text/Parsing/Parser/String.purs
Original file line number Diff line number Diff line change
Expand Up @@ -36,19 +36,27 @@ module Text.Parsing.Parser.String
, noneOf
, noneOfCodePoints
, match
, regex
, RegexFlagsRow
) where

import Prelude hiding (between)

import Control.Monad.State (get, put)
import Data.Array (notElem)
import Data.Array.NonEmpty as NonEmptyArray
import Data.Char (fromCharCode)
import Data.CodePoint.Unicode (isSpace)
import Data.Either (Either(..))
import Data.Foldable (elem)
import Data.Maybe (Maybe(..))
import Data.String (CodePoint, Pattern(..), length, null, singleton, splitAt, stripPrefix, uncons)
import Data.String.CodeUnits as SCU
import Data.String.Regex as Regex
import Data.String.Regex.Flags (RegexFlags(..), RegexFlagsRec)
import Data.Tuple (Tuple(..), fst)
import Prim.Row (class Nub, class Union)
import Record (merge)
import Text.Parsing.Parser (ParseState(..), ParserT, consume, fail)
import Text.Parsing.Parser.Combinators (skipMany, tryRethrow, (<?>), (<~?>))
import Text.Parsing.Parser.Pos (Position(..))
Expand Down Expand Up @@ -208,3 +216,82 @@ match p = do
-- | to something other than `newtype CodePoint = CodePoint Int`.
unCodePoint :: CodePoint -> Int
unCodePoint = unsafeCoerce

-- | Parser which uses the `Data.String.Regex` module to match the regular
-- | expression pattern passed as the `String`
-- | argument to the parser.
-- |
-- | This parser will try to match the regular expression pattern starting
-- | at the current parser position. On success, it will return the matched
-- | substring.
-- |
-- | This parser may be useful for quickly consuming a large section of the
-- | input `String`, because in a JavaScript runtime environment the `RegExp`
-- | runtime is a lot faster than primitive parsers.
-- |
-- | [*MDN Regular Expressions Cheatsheet*](https://developer.mozilla.org/en-US/docs/Web/JavaScript/Guide/Regular_Expressions/Cheatsheet)
-- |
-- | The `Record flags` argument to the parser is for `Regex` flags. Here are
-- | the default flags.
-- |
-- | ```purescript
-- | { dotAll: true
-- | ignoreCase: false
-- | unicode: true
-- | }
-- | ```
-- |
-- | If you want to use the defaults then pass
-- | `{}` as the flags argument. For case-insensitive pattern matching, pass
-- | `{ignoreCase: true}` as the flags argument.
-- | The other `Data.String.Regex.Flags.RegexFlagsRec` fields are mostly
-- | nonsense in the context of parsing
-- | and use of the other flags may cause strange behavior in the parser.
-- |
-- | [*MDN Advanced searching with flags*](https://developer.mozilla.org/en-US/docs/Web/JavaScript/Guide/Regular_Expressions#advanced_searching_with_flags)
-- |
-- | If the `Regex` pattern string fails to compile then this parser will fail.
-- | (Note: It’s not possible to use a precompiled `Regex` because this parser
-- | must set flags and make adjustments to the `Regex` pattern string.)
regex
:: forall m flags f_
. Monad m
=> Union flags RegexFlagsRow f_
=> Nub f_ RegexFlagsRow
=> Record flags
-> String
-> ParserT String m String
regex flags pattern =
-- Prefix a ^ to ensure the pattern only matches the current position in the parse
case Regex.regex ("^(" <> pattern <> ")") flags' of
Left paterr ->
fail $ "Regex pattern error " <> paterr
Right regexobj -> do
ParseState input position _ <- get
case NonEmptyArray.head <$> Regex.match regexobj input of
Just (Just matched) -> do
let remainder = SCU.drop (SCU.length matched) input
put $ ParseState remainder (updatePosString position matched) true
pure matched
_ -> fail $ "No Regex pattern match"
where
flags' = RegexFlags
( merge flags
{ dotAll: true
, global: false
, ignoreCase: false
, multiline: false
, sticky: false
, unicode: true
} :: RegexFlagsRec
)

-- | The fields from `Data.String.Regex.Flags.RegexFlagsRec`.
type RegexFlagsRow =
( dotAll :: Boolean
, global :: Boolean
, ignoreCase :: Boolean
, multiline :: Boolean
, sticky :: Boolean
, unicode :: Boolean
)
14 changes: 13 additions & 1 deletion test/Main.purs
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ import Text.Parsing.Parser.Combinators (between, chainl, chainl1Rec, chainlRec,
import Text.Parsing.Parser.Expr (Assoc(..), Operator(..), buildExprParser)
import Text.Parsing.Parser.Language (haskellDef, haskellStyle, javaStyle)
import Text.Parsing.Parser.Pos (Position(..), initialPos)
import Text.Parsing.Parser.String (anyChar, anyCodePoint, char, eof, noneOfCodePoints, oneOfCodePoints, rest, satisfy, string, takeN, whiteSpace)
import Text.Parsing.Parser.String (anyChar, anyCodePoint, char, eof, noneOfCodePoints, oneOfCodePoints, regex, rest, satisfy, string, takeN, whiteSpace)
import Text.Parsing.Parser.String.Basic (intDecimal, number, letter)
import Text.Parsing.Parser.Token (TokenParser, makeTokenParser, match, token, when)
import Text.Parsing.Parser.Token as Parser.Token
Expand Down Expand Up @@ -683,6 +683,18 @@ main = do

parseTest "-300" (-300) intDecimal

parseTest "regex-" "regex" (regex {} "regex" <* char '-' <* eof)
parseTest "-regex" "regex" (char '-' *> regex {} "regex" <* eof)
parseTest "regexregex" "regexregex" (regex {} "(regex)*")
parseTest "regexregex" "regex" (regex {} "(^regex)*")
parseTest "ReGeX" "ReGeX" (regex { ignoreCase: true } "regex")

-- Maybe it is nonsense to allow multiline regex.
-- Because an end-of-line regex pattern `$` will match but then the
-- newline character will not be consumed.
-- Also why does this test fail? I think it should succeed.
-- parseTest "regex\nregex\n" "regex\nregex\n" (regex {dotAll: false, multiline: true} "(^regex$)+")

stackSafeLoopsTest

tokenParserIdentifierTest
Expand Down