Skip to content

Commit

Permalink
Merge pull request #296 from phadej/bytestring-0.11
Browse files Browse the repository at this point in the history
bytestring-0.11
  • Loading branch information
phadej authored Oct 17, 2020
2 parents 7d3130b + 21281e7 commit b7bff71
Show file tree
Hide file tree
Showing 10 changed files with 183 additions and 84 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
/GNUmakefile
/.ghc.environment.*
/cabal.project.local
/cabal.test.project.local

# Test data repo ignored. Please see instruction in tests-and-benchmarks.markdown
/tests/text-test-data/
1 change: 1 addition & 0 deletions benchmarks/text-benchmarks.cabal
Original file line number Diff line number Diff line change
Expand Up @@ -101,6 +101,7 @@ executable text-benchmarks
Data.Text.Internal.Builder.Functions
Data.Text.Internal.Builder.Int.Digits
Data.Text.Internal.Builder.RealFloat.Functions
Data.Text.Internal.ByteStringCompat
Data.Text.Internal.Encoding.Fusion
Data.Text.Internal.Encoding.Fusion.Common
Data.Text.Internal.Encoding.Utf16
Expand Down
5 changes: 5 additions & 0 deletions cabal.tests.project
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
-- this project doesn't have local 'text' package,
-- so tests build faster.

packages: tests
tests: True
30 changes: 30 additions & 0 deletions scripts/tests.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
#!/bin/sh

set -ex

runtest() {
HC=$1
shift

# EDIT last line to pass arguments

cabal run text-tests:test:tests \
--project-file=cabal.tests.project \
--builddir="dist-newstyle/$HC" \
--with-compiler="$HC" \
-- "$@"
}

runtest ghc-8.10.2 "$@"
runtest ghc-8.8.4 "$@"
runtest ghc-8.6.5 "$@"
runtest ghc-8.4.4 "$@"
runtest ghc-8.2.2 "$@"
runtest ghc-8.0.2 "$@"

runtest ghc-7.10.3 "$@"
runtest ghc-7.8.4 "$@"
runtest ghc-7.6.3 "$@"
runtest ghc-7.4.2 "$@"
runtest ghc-7.2.2 "$@"
runtest ghc-7.0.4 "$@"
164 changes: 84 additions & 80 deletions src/Data/Text/Encoding.hs
Original file line number Diff line number Diff line change
Expand Up @@ -96,6 +96,7 @@ import qualified Data.Text.Array as A
import qualified Data.Text.Internal.Encoding.Fusion as E
import qualified Data.Text.Internal.Encoding.Utf16 as U16
import qualified Data.Text.Internal.Fusion as F
import Data.Text.Internal.ByteStringCompat

#include "text_cbits.h"

Expand Down Expand Up @@ -123,12 +124,13 @@ decodeASCII = decodeUtf8
-- 'decodeLatin1' is semantically equivalent to
-- @Data.Text.pack . Data.ByteString.Char8.unpack@
decodeLatin1 :: ByteString -> Text
decodeLatin1 (PS fp off len) = text a 0 len
where
a = A.run (A.new len >>= unsafeIOToST . go)
go dest = withForeignPtr fp $ \ptr -> do
c_decode_latin1 (A.maBA dest) (ptr `plusPtr` off) (ptr `plusPtr` (off+len))
return dest
decodeLatin1 bs = withBS bs aux where
aux fp len = text a 0 len
where
a = A.run (A.new len >>= unsafeIOToST . go)
go dest = withForeignPtr fp $ \ptr -> do
c_decode_latin1 (A.maBA dest) ptr (ptr `plusPtr` len)
return dest

-- | Decode a 'ByteString' containing UTF-8 encoded text.
--
Expand All @@ -139,36 +141,38 @@ decodeLatin1 (PS fp off len) = text a 0 len
-- 'error' (/since 1.2.3.1/); For earlier versions of @text@ using
-- those unsupported code points would result in undefined behavior.
decodeUtf8With :: OnDecodeError -> ByteString -> Text
decodeUtf8With onErr (PS fp off len) = runText $ \done -> do
let go dest = withForeignPtr fp $ \ptr ->
with (0::CSize) $ \destOffPtr -> do
let end = ptr `plusPtr` (off + len)
loop curPtr = do
curPtr' <- c_decode_utf8 (A.maBA dest) destOffPtr curPtr end
if curPtr' == end
then do
n <- peek destOffPtr
unsafeSTToIO (done dest (fromIntegral n))
else do
x <- peek curPtr'
case onErr desc (Just x) of
Nothing -> loop $ curPtr' `plusPtr` 1
Just c
| c > '\xFFFF' -> throwUnsupportedReplChar
| otherwise -> do
destOff <- peek destOffPtr
w <- unsafeSTToIO $
unsafeWrite dest (fromIntegral destOff)
(safe c)
poke destOffPtr (destOff + fromIntegral w)
loop $ curPtr' `plusPtr` 1
loop (ptr `plusPtr` off)
(unsafeIOToST . go) =<< A.new len
decodeUtf8With onErr bs = withBS bs aux
where
desc = "Data.Text.Internal.Encoding.decodeUtf8: Invalid UTF-8 stream"
aux fp len = runText $ \done -> do
let go dest = withForeignPtr fp $ \ptr ->
with (0::CSize) $ \destOffPtr -> do
let end = ptr `plusPtr` len
loop curPtr = do
curPtr' <- c_decode_utf8 (A.maBA dest) destOffPtr curPtr end
if curPtr' == end
then do
n <- peek destOffPtr
unsafeSTToIO (done dest (fromIntegral n))
else do
x <- peek curPtr'
case onErr desc (Just x) of
Nothing -> loop $ curPtr' `plusPtr` 1
Just c
| c > '\xFFFF' -> throwUnsupportedReplChar
| otherwise -> do
destOff <- peek destOffPtr
w <- unsafeSTToIO $
unsafeWrite dest (fromIntegral destOff)
(safe c)
poke destOffPtr (destOff + fromIntegral w)
loop $ curPtr' `plusPtr` 1
loop ptr
(unsafeIOToST . go) =<< A.new len
where
desc = "Data.Text.Internal.Encoding.decodeUtf8: Invalid UTF-8 stream"

throwUnsupportedReplChar = throwIO $
ErrorCall "decodeUtf8With: non-BMP replacement characters not supported"
throwUnsupportedReplChar = throwIO $
ErrorCall "decodeUtf8With: non-BMP replacement characters not supported"
-- TODO: The code currently assumes that the transcoded UTF-16
-- stream is at most twice as long (in bytes) as the input UTF-8
-- stream. To justify this assumption one has to assume that the
Expand Down Expand Up @@ -292,50 +296,50 @@ streamDecodeUtf8With onErr = decodeChunk B.empty 0 0
-- potential surrogate pair started in the last buffer
decodeChunk :: ByteString -> CodePoint -> DecoderState -> ByteString
-> Decoding
decodeChunk undecoded0 codepoint0 state0 bs@(PS fp off len) =
runST $ (unsafeIOToST . decodeChunkToBuffer) =<< A.new (len+1)
where
decodeChunkToBuffer :: A.MArray s -> IO Decoding
decodeChunkToBuffer dest = withForeignPtr fp $ \ptr ->
with (0::CSize) $ \destOffPtr ->
with codepoint0 $ \codepointPtr ->
with state0 $ \statePtr ->
with nullPtr $ \curPtrPtr ->
let end = ptr `plusPtr` (off + len)
loop curPtr = do
poke curPtrPtr curPtr
curPtr' <- c_decode_utf8_with_state (A.maBA dest) destOffPtr
curPtrPtr end codepointPtr statePtr
state <- peek statePtr
case state of
UTF8_REJECT -> do
-- We encountered an encoding error
x <- peek curPtr'
poke statePtr 0
case onErr desc (Just x) of
Nothing -> loop $ curPtr' `plusPtr` 1
Just c -> do
destOff <- peek destOffPtr
w <- unsafeSTToIO $
unsafeWrite dest (fromIntegral destOff) (safe c)
poke destOffPtr (destOff + fromIntegral w)
loop $ curPtr' `plusPtr` 1

_ -> do
-- We encountered the end of the buffer while decoding
n <- peek destOffPtr
codepoint <- peek codepointPtr
chunkText <- unsafeSTToIO $ do
arr <- A.unsafeFreeze dest
return $! text arr 0 (fromIntegral n)
lastPtr <- peek curPtrPtr
let left = lastPtr `minusPtr` curPtr
!undecoded = case state of
UTF8_ACCEPT -> B.empty
_ -> B.append undecoded0 (B.drop left bs)
return $ Some chunkText undecoded
(decodeChunk undecoded codepoint state)
in loop (ptr `plusPtr` off)
decodeChunk undecoded0 codepoint0 state0 bs = withBS bs aux where
aux fp len = runST $ (unsafeIOToST . decodeChunkToBuffer) =<< A.new (len+1)
where
decodeChunkToBuffer :: A.MArray s -> IO Decoding
decodeChunkToBuffer dest = withForeignPtr fp $ \ptr ->
with (0::CSize) $ \destOffPtr ->
with codepoint0 $ \codepointPtr ->
with state0 $ \statePtr ->
with nullPtr $ \curPtrPtr ->
let end = ptr `plusPtr` len
loop curPtr = do
poke curPtrPtr curPtr
curPtr' <- c_decode_utf8_with_state (A.maBA dest) destOffPtr
curPtrPtr end codepointPtr statePtr
state <- peek statePtr
case state of
UTF8_REJECT -> do
-- We encountered an encoding error
x <- peek curPtr'
poke statePtr 0
case onErr desc (Just x) of
Nothing -> loop $ curPtr' `plusPtr` 1
Just c -> do
destOff <- peek destOffPtr
w <- unsafeSTToIO $
unsafeWrite dest (fromIntegral destOff) (safe c)
poke destOffPtr (destOff + fromIntegral w)
loop $ curPtr' `plusPtr` 1

_ -> do
-- We encountered the end of the buffer while decoding
n <- peek destOffPtr
codepoint <- peek codepointPtr
chunkText <- unsafeSTToIO $ do
arr <- A.unsafeFreeze dest
return $! text arr 0 (fromIntegral n)
lastPtr <- peek curPtrPtr
let left = lastPtr `minusPtr` curPtr
!undecoded = case state of
UTF8_ACCEPT -> B.empty
_ -> B.append undecoded0 (B.drop left bs)
return $ Some chunkText undecoded
(decodeChunk undecoded codepoint state)
in loop ptr
desc = "Data.Text.Internal.Encoding.streamDecodeUtf8With: Invalid UTF-8 stream"

-- | Decode a 'ByteString' containing UTF-8 encoded text that is known
Expand Down Expand Up @@ -436,12 +440,12 @@ encodeUtf8 (Text arr off len)
newDest <- peek destPtr
let utf8len = newDest `minusPtr` ptr
if utf8len >= len `shiftR` 1
then return (PS fp 0 utf8len)
then return (mkBS fp utf8len)
else do
fp' <- mallocByteString utf8len
withForeignPtr fp' $ \ptr' -> do
memcpy ptr' ptr (fromIntegral utf8len)
return (PS fp' 0 utf8len)
return (mkBS fp' utf8len)

-- | Decode text from little endian UTF-16 encoding.
decodeUtf16LEWith :: OnDecodeError -> ByteString -> Text
Expand Down
55 changes: 55 additions & 0 deletions src/Data/Text/Internal/ByteStringCompat.hs
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
{-# LANGUAGE BangPatterns #-}
{-# LANGUAGE CPP #-}
{-# LANGUAGE MagicHash #-}
module Data.Text.Internal.ByteStringCompat (mkBS, withBS) where

import Data.ByteString.Internal (ByteString (..))
import Data.Word (Word8)
import Foreign.ForeignPtr (ForeignPtr)

#if !MIN_VERSION_bytestring(0,11,0)
#if MIN_VERSION_base(4,10,0)
import GHC.ForeignPtr (plusForeignPtr)
#else
import GHC.ForeignPtr (ForeignPtr(ForeignPtr))
import GHC.Types (Int (..))
import GHC.Prim (plusAddr#)
#endif
#endif

mkBS :: ForeignPtr Word8 -> Int -> ByteString
#if MIN_VERSION_bytestring(0,11,0)
mkBS dfp n = BS dfp n
#else
mkBS dfp n = PS dfp 0 n
#endif
{-# INLINE mkBS #-}

withBS :: ByteString -> (ForeignPtr Word8 -> Int -> r) -> r
#if MIN_VERSION_bytestring(0,11,0)
withBS (BS !sfp !slen) kont = kont sfp slen
#else
withBS (PS !sfp !soff !slen) kont = kont (plusForeignPtr sfp soff) slen
#endif
{-# INLINE withBS #-}

#if !MIN_VERSION_bytestring(0,11,0)
#if !MIN_VERSION_base(4,10,0)
-- |Advances the given address by the given offset in bytes.
--
-- The new 'ForeignPtr' shares the finalizer of the original,
-- equivalent from a finalization standpoint to just creating another
-- reference to the original. That is, the finalizer will not be
-- called before the new 'ForeignPtr' is unreachable, nor will it be
-- called an additional time due to this call, and the finalizer will
-- be called with the same address that it would have had this call
-- not happened, *not* the new address.
plusForeignPtr :: ForeignPtr a -> Int -> ForeignPtr b
plusForeignPtr (ForeignPtr addr guts) (I# offset) = ForeignPtr (plusAddr# addr offset) guts
{-# INLINE [0] plusForeignPtr #-}
{-# RULES
"ByteString plusForeignPtr/0" forall fp .
plusForeignPtr fp 0 = fp
#-}
#endif
#endif
3 changes: 2 additions & 1 deletion src/Data/Text/Internal/Encoding/Fusion.hs
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,7 @@ import qualified Data.Text.Internal.Encoding.Utf8 as U8
import qualified Data.Text.Internal.Encoding.Utf16 as U16
import qualified Data.Text.Internal.Encoding.Utf32 as U32
import Data.Text.Unsafe (unsafeDupablePerformIO)
import Data.Text.Internal.ByteStringCompat

streamASCII :: ByteString -> Stream Char
streamASCII bs = Stream next 0 (maxSize l)
Expand Down Expand Up @@ -185,7 +186,7 @@ unstream (Stream next s0 len) = unsafeDupablePerformIO $ do
withForeignPtr fp' $ \p -> pokeByteOff p off x
loop n' (off+1) s fp'
{-# NOINLINE trimUp #-}
trimUp fp _ off = return $! PS fp 0 off
trimUp fp _ off = return $! mkBS fp off
copy0 :: ForeignPtr Word8 -> Int -> Int -> IO (ForeignPtr Word8)
copy0 !src !srcLen !destLen =
#if defined(ASSERTS)
Expand Down
4 changes: 2 additions & 2 deletions src/Data/Text/Internal/Lazy/Encoding/Fusion.hs
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@ import Data.ByteString.Internal (mallocByteString, memcpy)
#if defined(ASSERTS)
import Control.Exception (assert)
#endif
import qualified Data.ByteString.Internal as B
import Data.Text.Internal.ByteStringCompat

data S = S0
| S1 {-# UNPACK #-} !Word8
Expand Down Expand Up @@ -297,7 +297,7 @@ unstreamChunks chunkSize (Stream next s0 len0) = chunk s0 (upperBound 4 len0)
fp' <- copy0 fp n n'
withForeignPtr fp' $ \p -> pokeByteOff p off x
loop n' (off+1) s fp'
trimUp fp off = B.PS fp 0 off
trimUp fp off = mkBS fp off
copy0 :: ForeignPtr Word8 -> Int -> Int -> IO (ForeignPtr Word8)
copy0 !src !srcLen !destLen =
#if defined(ASSERTS)
Expand Down
1 change: 1 addition & 0 deletions tests/text-tests.cabal
Original file line number Diff line number Diff line change
Expand Up @@ -121,6 +121,7 @@ test-suite tests
Data.Text.Internal.Builder.Functions
Data.Text.Internal.Builder.Int.Digits
Data.Text.Internal.Builder.RealFloat.Functions
Data.Text.Internal.ByteStringCompat
Data.Text.Internal.Encoding.Fusion
Data.Text.Internal.Encoding.Fusion.Common
Data.Text.Internal.Encoding.Utf16
Expand Down
3 changes: 2 additions & 1 deletion text.cabal
Original file line number Diff line number Diff line change
Expand Up @@ -120,6 +120,7 @@ library
Data.Text.Internal.Builder.Functions
Data.Text.Internal.Builder.Int.Digits
Data.Text.Internal.Builder.RealFloat.Functions
Data.Text.Internal.ByteStringCompat
Data.Text.Internal.Encoding.Fusion
Data.Text.Internal.Encoding.Fusion.Common
Data.Text.Internal.Encoding.Utf16
Expand Down Expand Up @@ -168,7 +169,7 @@ library
build-depends: bytestring >= 0.9 && < 0.10.4,
bytestring-builder >= 0.10.4.0.2 && < 0.11
else
build-depends: bytestring >= 0.10.4 && < 0.11
build-depends: bytestring >= 0.10.4 && < 0.12

ghc-options: -Wall -fwarn-tabs -funbox-strict-fields -O2
if flag(developer)
Expand Down

0 comments on commit b7bff71

Please sign in to comment.