Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

try binary-parsers #467

Open
wants to merge 6 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
use binary-parsers
  • Loading branch information
winterland1989 committed Sep 27, 2016
commit 1c016246954db857b8218683a7b305720f06cf8f
99 changes: 38 additions & 61 deletions Data/Aeson/Parser/Internal.hs
Original file line number Diff line number Diff line change
Expand Up @@ -37,24 +37,19 @@ import Prelude ()
import Prelude.Compat

import Data.Aeson.Types.Internal (IResult(..), JSONPath, Result(..), Value(..))
import Data.Attoparsec.ByteString.Char8 (Parser, char, endOfInput, scientific, skipSpace, string)
import Data.Attoparsec.ByteString.Char8 (Parser, char, endOfInput, scientific, skipSpace, string)
import Data.Binary.Parser (Parser, endOfInput, scientific, skipSpaces, string)
import Data.Binary.Parser.Char8 (char)
import Data.Bits ((.|.), shiftL)
import Data.ByteString.Internal (ByteString(..))
import Data.Char (chr)
import Data.Text (Text)
import Data.Vector as Vector (Vector, empty, fromListN, reverse)
import qualified Data.Attoparsec.ByteString as A
import qualified Data.Attoparsec.Lazy as L
import qualified Data.Binary.Parser as BP
import qualified Data.ByteString as B
import qualified Data.ByteString.Lazy as L
import qualified Data.HashMap.Strict as H
import Data.Aeson.Parser.Unescape

#if MIN_VERSION_ghc_prim(0,3,1)
import GHC.Base (Int#, (==#), isTrue#, word2Int#)
import GHC.Word (Word8(W8#))
#endif

#define BACKSLASH 92
#define CLOSE_CURLY 125
Expand Down Expand Up @@ -111,21 +106,21 @@ object_' = {-# SCC "object_'" #-} do

objectValues :: Parser Text -> Parser Value -> Parser (H.HashMap Text Value)
objectValues str val = do
skipSpace
w <- A.peekWord8'
skipSpaces
w <- BP.peek
if w == CLOSE_CURLY
then A.anyWord8 >> return H.empty
then BP.skipN 1 >> return H.empty
else loop []
where
-- Why use acc pattern here, you may ask? because 'H.fromList' use 'unsafeInsert'
-- and it's much faster because it's doing in place update to the 'HashMap'!
loop acc = do
k <- str <* skipSpace <* char ':'
v <- val <* skipSpace
ch <- A.satisfy $ \w -> w == COMMA || w == CLOSE_CURLY
k <- str <* skipSpaces <* char ':'
v <- val <* skipSpaces
ch <- BP.satisfy $ \w -> w == COMMA || w == CLOSE_CURLY
let acc' = (k, v) : acc
if ch == COMMA
then skipSpace >> loop acc'
then skipSpaces >> loop acc'
else return (H.fromList acc')
{-# INLINE objectValues #-}

Expand All @@ -139,17 +134,17 @@ array_' = {-# SCC "array_'" #-} do

arrayValues :: Parser Value -> Parser (Vector Value)
arrayValues val = do
skipSpace
w <- A.peekWord8'
skipSpaces
w <- BP.peek
if w == CLOSE_SQUARE
then A.anyWord8 >> return Vector.empty
then BP.skipN 1 >> return Vector.empty
else loop [] 1
where
loop acc !len = do
v <- val <* skipSpace
ch <- A.satisfy $ \w -> w == COMMA || w == CLOSE_SQUARE
v <- val <* skipSpaces
ch <- BP.satisfy $ \w -> w == COMMA || w == CLOSE_SQUARE
if ch == COMMA
then skipSpace >> loop (v:acc) (len+1)
then skipSpaces >> loop (v:acc) (len+1)
else return (Vector.reverse (Vector.fromListN len (v:acc)))
{-# INLINE arrayValues #-}

Expand All @@ -165,12 +160,12 @@ arrayValues val = do
-- to preserve interoperability and security.
value :: Parser Value
value = do
skipSpace
w <- A.peekWord8'
skipSpaces
w <- BP.peek
case w of
DOUBLE_QUOTE -> A.anyWord8 *> (String <$> jstring_)
OPEN_CURLY -> A.anyWord8 *> object_
OPEN_SQUARE -> A.anyWord8 *> array_
DOUBLE_QUOTE -> BP.skipN 1 *> (String <$> jstring_)
OPEN_CURLY -> BP.skipN 1 *> object_
OPEN_SQUARE -> BP.skipN 1 *> array_
C_f -> string "false" *> pure (Bool False)
C_t -> string "true" *> pure (Bool True)
C_n -> string "null" *> pure Null
Expand All @@ -181,14 +176,14 @@ value = do
-- | Strict version of 'value'. See also 'json''.
value' :: Parser Value
value' = do
skipSpace
w <- A.peekWord8'
skipSpaces
w <- BP.peek
case w of
DOUBLE_QUOTE -> do
!s <- A.anyWord8 *> jstring_
!s <- BP.skipN 1 *> jstring_
return (String s)
OPEN_CURLY -> A.anyWord8 *> object_'
OPEN_SQUARE -> A.anyWord8 *> array_'
OPEN_CURLY -> BP.skipN 1 *> object_'
OPEN_SQUARE -> BP.skipN 1 *> array_'
C_f -> string "false" *> pure (Bool False)
C_t -> string "true" *> pure (Bool True)
C_n -> string "null" *> pure Null
Expand All @@ -200,40 +195,22 @@ value' = do

-- | Parse a quoted JSON string.
jstring :: Parser Text
jstring = A.word8 DOUBLE_QUOTE *> jstring_
jstring = BP.word8 DOUBLE_QUOTE *> jstring_
{-# INLINE jstring #-}

-- | Parse a string without a leading quote.
jstring_ :: Parser Text
{-# INLINE jstring_ #-}
jstring_ = {-# SCC "jstring_" #-} do
s <- A.scan startState go <* A.anyWord8
s <- BP.scanChunks (-1) unescapeTextScanner <* BP.skipN 1
case unescapeText s of
Right r -> return r
Left err -> fail $ show err
where
#if MIN_VERSION_ghc_prim(0,3,1)
startState = S 0#
go (S a) (W8# c)
| isTrue# a = Just (S 0#)
| isTrue# (word2Int# c ==# 34#) = Nothing -- double quote
| otherwise = let a' = word2Int# c ==# 92# -- backslash
in Just (S a')

data S = S Int#
#else
startState = False
go a c
| a = Just False
| c == DOUBLE_QUOTE = Nothing
| otherwise = let a' = c == backslash
in Just a'
where backslash = BACKSLASH
#endif

decodeWith :: Parser Value -> (Value -> Result a) -> L.ByteString -> Maybe a
decodeWith p to s =
case L.parse p s of
L.Done _ v -> case to v of
case BP.parseLazy p s of
Right v -> case to v of
Success a -> Just a
_ -> Nothing
_ -> Nothing
Expand All @@ -242,25 +219,25 @@ decodeWith p to s =
decodeStrictWith :: Parser Value -> (Value -> Result a) -> B.ByteString
-> Maybe a
decodeStrictWith p to s =
case either Error to (A.parseOnly p s) of
case either Error to (BP.parseOnly p s) of
Success a -> Just a
_ -> Nothing
{-# INLINE decodeStrictWith #-}

eitherDecodeWith :: Parser Value -> (Value -> IResult a) -> L.ByteString
-> Either (JSONPath, String) a
eitherDecodeWith p to s =
case L.parse p s of
L.Done _ v -> case to v of
case BP.parseLazy p s of
Right v -> case to v of
ISuccess a -> Right a
IError path msg -> Left (path, msg)
L.Fail _ _ msg -> Left ([], msg)
Left msg -> Left ([], msg)
{-# INLINE eitherDecodeWith #-}

eitherDecodeStrictWith :: Parser Value -> (Value -> IResult a) -> B.ByteString
-> Either (JSONPath, String) a
eitherDecodeStrictWith p to s =
case either (IError []) to (A.parseOnly p s) of
case either (IError []) to (BP.parseOnly p s) of
ISuccess a -> Right a
IError path msg -> Left (path, msg)
{-# INLINE eitherDecodeStrictWith #-}
Expand All @@ -287,9 +264,9 @@ eitherDecodeStrictWith p to s =
-- | Parse a top-level JSON value followed by optional whitespace and
-- end-of-input. See also: 'json'.
jsonEOF :: Parser Value
jsonEOF = json <* skipSpace <* endOfInput
jsonEOF = json <* skipSpaces <* endOfInput

-- | Parse a top-level JSON value followed by optional whitespace and
-- end-of-input. See also: 'json''.
jsonEOF' :: Parser Value
jsonEOF' = json' <* skipSpace <* endOfInput
jsonEOF' = json' <* skipSpaces <* endOfInput
15 changes: 14 additions & 1 deletion Data/Aeson/Parser/Unescape.hs
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,8 @@
{-# LANGUAGE UnliftedFFITypes #-}

module Data.Aeson.Parser.Unescape (
unescapeText
unescapeTextScanner
, unescapeText
) where

import Control.Exception (evaluate, throw, try)
Expand All @@ -27,6 +28,18 @@ foreign import ccall unsafe "_js_decode_string" c_js_decode
:: MutableByteArray# s -> Ptr CSize
-> Ptr Word8 -> Ptr Word8 -> IO CInt

foreign import ccall unsafe "_js_find_string_end" c_js_find_string_end
:: CInt -> Ptr Word8 -> Ptr Word8 -> IO CInt

unescapeTextScanner :: CInt -> ByteString -> Either CInt (ByteString, ByteString)
unescapeTextScanner backslashed bs@(PS fp off len) = unsafeDupablePerformIO $
withForeignPtr fp $ \ptr -> do
s <- c_js_find_string_end backslashed (ptr `plusPtr` off) (ptr `plusPtr` (off + len))
if s >= 0
then return . Right $ B.splitAt (fromIntegral s) bs
else return (Left s)
{-# INLINE unescapeTextScanner #-}

unescapeText' :: ByteString -> Text
unescapeText' (PS fp off len) = runText $ \done -> do
let go dest = withForeignPtr fp $ \ptr ->
Expand Down
2 changes: 2 additions & 0 deletions aeson.cabal
Original file line number Diff line number Diff line change
Expand Up @@ -102,6 +102,7 @@ library

build-depends:
attoparsec >= 0.13.0.1,
binary-parsers >= 0.2.1,
base >= 4.5 && < 5,
base-compat >= 0.9.1 && < 0.10,
containers >= 0.2.4.1,
Expand Down Expand Up @@ -176,6 +177,7 @@ test-suite tests
QuickCheck >= 2.7 && <2.9.3,
aeson,
attoparsec,
binary-parsers,
base,
base-compat,
base-orphans >= 0.5.3 && <0.6,
Expand Down
23 changes: 21 additions & 2 deletions benchmarks/AesonEncode.hs
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ import Control.DeepSeq
import Control.Exception
import Control.Monad
import Data.Aeson
import Data.Attoparsec.ByteString (IResult(..), parseWith)
import Data.Binary.Parser (parse, Parser, Decoder(..))
import Data.Char (isDigit)
import Data.Time.Clock
import System.Environment (getArgs)
Expand All @@ -30,7 +30,7 @@ main = do
let refill = B.hGet h 16384
result0 <- parseWith refill json =<< refill
r0 <- case result0 of
Done _ r -> return r
Done _ _ r -> return r
_ -> fail $ "failed to read " ++ show arg
start <- getCurrentTime
let loop !n r
Expand All @@ -42,3 +42,22 @@ main = do
let rate = fromIntegral count / realToFrac delta :: Double
putStrLn $ " " ++ cnt ++ " good, " ++ show delta
putStrLn $ " " ++ show (round rate :: Int) ++ " per second"

-- | Run a parser with an initial input string, and a monadic action
-- that can supply more input if needed.
parseWith :: Monad m =>
(m B.ByteString)
-- ^ An action that will be executed to provide the parser
-- with more input, if necessary. The action must return an
-- 'B.empty' string when there is no more input available.
-> Parser a
-> B.ByteString
-- ^ Initial input for the parser.
-> m (Decoder a)
parseWith refill p s = step $ parse p s
where step (Partial k) = do
bs <- refill
if B.null bs then step (k Nothing)
else step (k (Just bs))
step r = return r
{-# INLINE parseWith #-}
27 changes: 23 additions & 4 deletions benchmarks/AesonParse.hs
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ import Prelude.Compat
import "aeson-benchmarks" Data.Aeson
import Control.Exception
import Control.Monad
import Data.Attoparsec.ByteString (IResult(..), parseWith)
import Data.Binary.Parser (parse, Parser, Decoder(..))
import Data.Time.Clock
import System.Environment (getArgs)
import System.IO
Expand All @@ -20,7 +20,7 @@ main :: IO ()
main = do
(bs:cnt:args) <- getArgs
let count = read cnt :: Int
blkSize = read bs
blkSize = read bs :: Int
forM_ args $ \arg -> bracket (openFile arg ReadMode) hClose $ \h -> do
putStrLn $ arg ++ ":"
start <- getCurrentTime
Expand All @@ -31,10 +31,29 @@ main = do
let refill = B.hGet h blkSize
result <- parseWith refill json =<< refill
case result of
Done _ _ -> loop (good+1) bad
_ -> loop good (bad+1)
Done _ _ _ -> loop (good+1) bad
_ -> loop good (bad+1)
(good, _) <- loop 0 0
delta <- flip diffUTCTime start `fmap` getCurrentTime
putStrLn $ " " ++ show good ++ " good, " ++ show delta
let rate = fromIntegral count / realToFrac delta :: Double
putStrLn $ " " ++ show (round rate :: Int) ++ " per second"

-- | Run a parser with an initial input string, and a monadic action
-- that can supply more input if needed.
parseWith :: Monad m =>
(m B.ByteString)
-- ^ An action that will be executed to provide the parser
-- with more input, if necessary. The action must return an
-- 'B.empty' string when there is no more input available.
-> Parser a
-> B.ByteString
-- ^ Initial input for the parser.
-> m (Decoder a)
parseWith refill p s = step $ parse p s
where step (Partial k) = do
bs <- refill
if B.null bs then step (k Nothing)
else step (k (Just bs))
step r = return r
{-# INLINE parseWith #-}
3 changes: 3 additions & 0 deletions benchmarks/aeson-benchmarks.cabal
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@ library

build-depends:
attoparsec >= 0.13.0.1,
binary-parsers >= 0.2.1,
base == 4.*,
base-compat >= 0.9.1 && <0.10,
time-locale-compat >=0.1.1 && <0.2,
Expand Down Expand Up @@ -147,6 +148,7 @@ executable aeson-benchmark-aeson-encode
build-depends:
aeson-benchmarks,
attoparsec,
binary-parsers,
base,
base-compat,
bytestring,
Expand All @@ -159,6 +161,7 @@ executable aeson-benchmark-aeson-parse
build-depends:
aeson-benchmarks,
attoparsec,
binary-parsers,
base,
base-compat,
bytestring,
Expand Down
27 changes: 27 additions & 0 deletions cbits/unescape_string.c
Original file line number Diff line number Diff line change
Expand Up @@ -148,3 +148,30 @@ int _js_decode_string(uint16_t *const dest, size_t *destoff,
DISPATCH_ASCII(unicode1)
}

// if return >= 0, it's the split offset
// if return == -1, then string ends without a tailing backslash
// if return == -2, then string ends with a tailing backslash
int _js_find_string_end(const int bs, uint8_t *const start, uint8_t *const end){
uint8_t *s = start;
int backslash = bs;
while (s < end) {
if (backslash == -2){
backslash = -1;
s++;
if (s == end) {
break;
} else {
s++;
}
}
else if (*s == 92) { // backslash
backslash = -2;
s++;
}
else if (*s == 34) { // double quote
return (s - start);
}
else { s++; }
}
return backslash;
}
Loading