diff --git a/.gitignore b/.gitignore index 44e8ce9b..f26c6f6b 100644 --- a/.gitignore +++ b/.gitignore @@ -8,6 +8,7 @@ /GNUmakefile /.ghc.environment.* /cabal.project.local +/cabal.test.project.local # Test data repo ignored. Please see instruction in tests-and-benchmarks.markdown /tests/text-test-data/ diff --git a/benchmarks/text-benchmarks.cabal b/benchmarks/text-benchmarks.cabal index 7f606f3c..c31f82b2 100644 --- a/benchmarks/text-benchmarks.cabal +++ b/benchmarks/text-benchmarks.cabal @@ -101,6 +101,7 @@ executable text-benchmarks Data.Text.Internal.Builder.Functions Data.Text.Internal.Builder.Int.Digits Data.Text.Internal.Builder.RealFloat.Functions + Data.Text.Internal.ByteStringCompat Data.Text.Internal.Encoding.Fusion Data.Text.Internal.Encoding.Fusion.Common Data.Text.Internal.Encoding.Utf16 diff --git a/cabal.tests.project b/cabal.tests.project new file mode 100644 index 00000000..f6e61142 --- /dev/null +++ b/cabal.tests.project @@ -0,0 +1,5 @@ +-- this project doesn't have local 'text' package, +-- so tests build faster. + +packages: tests +tests: True diff --git a/scripts/tests.sh b/scripts/tests.sh new file mode 100644 index 00000000..b3d6e87f --- /dev/null +++ b/scripts/tests.sh @@ -0,0 +1,30 @@ +#!/bin/sh + +set -ex + +runtest() { + HC=$1 + shift + + # EDIT last line to pass arguments + + cabal run text-tests:test:tests \ + --project-file=cabal.tests.project \ + --builddir="dist-newstyle/$HC" \ + --with-compiler="$HC" \ + -- "$@" +} + +runtest ghc-8.10.2 "$@" +runtest ghc-8.8.4 "$@" +runtest ghc-8.6.5 "$@" +runtest ghc-8.4.4 "$@" +runtest ghc-8.2.2 "$@" +runtest ghc-8.0.2 "$@" + +runtest ghc-7.10.3 "$@" +runtest ghc-7.8.4 "$@" +runtest ghc-7.6.3 "$@" +runtest ghc-7.4.2 "$@" +runtest ghc-7.2.2 "$@" +runtest ghc-7.0.4 "$@" diff --git a/src/Data/Text/Encoding.hs b/src/Data/Text/Encoding.hs index d8936796..239e15e3 100644 --- a/src/Data/Text/Encoding.hs +++ b/src/Data/Text/Encoding.hs @@ -96,6 +96,7 @@ import qualified Data.Text.Array as A import qualified Data.Text.Internal.Encoding.Fusion as E import qualified Data.Text.Internal.Encoding.Utf16 as U16 import qualified Data.Text.Internal.Fusion as F +import Data.Text.Internal.ByteStringCompat #include "text_cbits.h" @@ -123,12 +124,13 @@ decodeASCII = decodeUtf8 -- 'decodeLatin1' is semantically equivalent to -- @Data.Text.pack . Data.ByteString.Char8.unpack@ decodeLatin1 :: ByteString -> Text -decodeLatin1 (PS fp off len) = text a 0 len - where - a = A.run (A.new len >>= unsafeIOToST . go) - go dest = withForeignPtr fp $ \ptr -> do - c_decode_latin1 (A.maBA dest) (ptr `plusPtr` off) (ptr `plusPtr` (off+len)) - return dest +decodeLatin1 bs = withBS bs aux where + aux fp len = text a 0 len + where + a = A.run (A.new len >>= unsafeIOToST . go) + go dest = withForeignPtr fp $ \ptr -> do + c_decode_latin1 (A.maBA dest) ptr (ptr `plusPtr` len) + return dest -- | Decode a 'ByteString' containing UTF-8 encoded text. -- @@ -139,36 +141,38 @@ decodeLatin1 (PS fp off len) = text a 0 len -- 'error' (/since 1.2.3.1/); For earlier versions of @text@ using -- those unsupported code points would result in undefined behavior. decodeUtf8With :: OnDecodeError -> ByteString -> Text -decodeUtf8With onErr (PS fp off len) = runText $ \done -> do - let go dest = withForeignPtr fp $ \ptr -> - with (0::CSize) $ \destOffPtr -> do - let end = ptr `plusPtr` (off + len) - loop curPtr = do - curPtr' <- c_decode_utf8 (A.maBA dest) destOffPtr curPtr end - if curPtr' == end - then do - n <- peek destOffPtr - unsafeSTToIO (done dest (fromIntegral n)) - else do - x <- peek curPtr' - case onErr desc (Just x) of - Nothing -> loop $ curPtr' `plusPtr` 1 - Just c - | c > '\xFFFF' -> throwUnsupportedReplChar - | otherwise -> do - destOff <- peek destOffPtr - w <- unsafeSTToIO $ - unsafeWrite dest (fromIntegral destOff) - (safe c) - poke destOffPtr (destOff + fromIntegral w) - loop $ curPtr' `plusPtr` 1 - loop (ptr `plusPtr` off) - (unsafeIOToST . go) =<< A.new len +decodeUtf8With onErr bs = withBS bs aux where - desc = "Data.Text.Internal.Encoding.decodeUtf8: Invalid UTF-8 stream" + aux fp len = runText $ \done -> do + let go dest = withForeignPtr fp $ \ptr -> + with (0::CSize) $ \destOffPtr -> do + let end = ptr `plusPtr` len + loop curPtr = do + curPtr' <- c_decode_utf8 (A.maBA dest) destOffPtr curPtr end + if curPtr' == end + then do + n <- peek destOffPtr + unsafeSTToIO (done dest (fromIntegral n)) + else do + x <- peek curPtr' + case onErr desc (Just x) of + Nothing -> loop $ curPtr' `plusPtr` 1 + Just c + | c > '\xFFFF' -> throwUnsupportedReplChar + | otherwise -> do + destOff <- peek destOffPtr + w <- unsafeSTToIO $ + unsafeWrite dest (fromIntegral destOff) + (safe c) + poke destOffPtr (destOff + fromIntegral w) + loop $ curPtr' `plusPtr` 1 + loop ptr + (unsafeIOToST . go) =<< A.new len + where + desc = "Data.Text.Internal.Encoding.decodeUtf8: Invalid UTF-8 stream" - throwUnsupportedReplChar = throwIO $ - ErrorCall "decodeUtf8With: non-BMP replacement characters not supported" + throwUnsupportedReplChar = throwIO $ + ErrorCall "decodeUtf8With: non-BMP replacement characters not supported" -- TODO: The code currently assumes that the transcoded UTF-16 -- stream is at most twice as long (in bytes) as the input UTF-8 -- stream. To justify this assumption one has to assume that the @@ -292,50 +296,50 @@ streamDecodeUtf8With onErr = decodeChunk B.empty 0 0 -- potential surrogate pair started in the last buffer decodeChunk :: ByteString -> CodePoint -> DecoderState -> ByteString -> Decoding - decodeChunk undecoded0 codepoint0 state0 bs@(PS fp off len) = - runST $ (unsafeIOToST . decodeChunkToBuffer) =<< A.new (len+1) - where - decodeChunkToBuffer :: A.MArray s -> IO Decoding - decodeChunkToBuffer dest = withForeignPtr fp $ \ptr -> - with (0::CSize) $ \destOffPtr -> - with codepoint0 $ \codepointPtr -> - with state0 $ \statePtr -> - with nullPtr $ \curPtrPtr -> - let end = ptr `plusPtr` (off + len) - loop curPtr = do - poke curPtrPtr curPtr - curPtr' <- c_decode_utf8_with_state (A.maBA dest) destOffPtr - curPtrPtr end codepointPtr statePtr - state <- peek statePtr - case state of - UTF8_REJECT -> do - -- We encountered an encoding error - x <- peek curPtr' - poke statePtr 0 - case onErr desc (Just x) of - Nothing -> loop $ curPtr' `plusPtr` 1 - Just c -> do - destOff <- peek destOffPtr - w <- unsafeSTToIO $ - unsafeWrite dest (fromIntegral destOff) (safe c) - poke destOffPtr (destOff + fromIntegral w) - loop $ curPtr' `plusPtr` 1 - - _ -> do - -- We encountered the end of the buffer while decoding - n <- peek destOffPtr - codepoint <- peek codepointPtr - chunkText <- unsafeSTToIO $ do - arr <- A.unsafeFreeze dest - return $! text arr 0 (fromIntegral n) - lastPtr <- peek curPtrPtr - let left = lastPtr `minusPtr` curPtr - !undecoded = case state of - UTF8_ACCEPT -> B.empty - _ -> B.append undecoded0 (B.drop left bs) - return $ Some chunkText undecoded - (decodeChunk undecoded codepoint state) - in loop (ptr `plusPtr` off) + decodeChunk undecoded0 codepoint0 state0 bs = withBS bs aux where + aux fp len = runST $ (unsafeIOToST . decodeChunkToBuffer) =<< A.new (len+1) + where + decodeChunkToBuffer :: A.MArray s -> IO Decoding + decodeChunkToBuffer dest = withForeignPtr fp $ \ptr -> + with (0::CSize) $ \destOffPtr -> + with codepoint0 $ \codepointPtr -> + with state0 $ \statePtr -> + with nullPtr $ \curPtrPtr -> + let end = ptr `plusPtr` len + loop curPtr = do + poke curPtrPtr curPtr + curPtr' <- c_decode_utf8_with_state (A.maBA dest) destOffPtr + curPtrPtr end codepointPtr statePtr + state <- peek statePtr + case state of + UTF8_REJECT -> do + -- We encountered an encoding error + x <- peek curPtr' + poke statePtr 0 + case onErr desc (Just x) of + Nothing -> loop $ curPtr' `plusPtr` 1 + Just c -> do + destOff <- peek destOffPtr + w <- unsafeSTToIO $ + unsafeWrite dest (fromIntegral destOff) (safe c) + poke destOffPtr (destOff + fromIntegral w) + loop $ curPtr' `plusPtr` 1 + + _ -> do + -- We encountered the end of the buffer while decoding + n <- peek destOffPtr + codepoint <- peek codepointPtr + chunkText <- unsafeSTToIO $ do + arr <- A.unsafeFreeze dest + return $! text arr 0 (fromIntegral n) + lastPtr <- peek curPtrPtr + let left = lastPtr `minusPtr` curPtr + !undecoded = case state of + UTF8_ACCEPT -> B.empty + _ -> B.append undecoded0 (B.drop left bs) + return $ Some chunkText undecoded + (decodeChunk undecoded codepoint state) + in loop ptr desc = "Data.Text.Internal.Encoding.streamDecodeUtf8With: Invalid UTF-8 stream" -- | Decode a 'ByteString' containing UTF-8 encoded text that is known @@ -436,12 +440,12 @@ encodeUtf8 (Text arr off len) newDest <- peek destPtr let utf8len = newDest `minusPtr` ptr if utf8len >= len `shiftR` 1 - then return (PS fp 0 utf8len) + then return (mkBS fp utf8len) else do fp' <- mallocByteString utf8len withForeignPtr fp' $ \ptr' -> do memcpy ptr' ptr (fromIntegral utf8len) - return (PS fp' 0 utf8len) + return (mkBS fp' utf8len) -- | Decode text from little endian UTF-16 encoding. decodeUtf16LEWith :: OnDecodeError -> ByteString -> Text diff --git a/src/Data/Text/Internal/ByteStringCompat.hs b/src/Data/Text/Internal/ByteStringCompat.hs new file mode 100644 index 00000000..ee6dc18d --- /dev/null +++ b/src/Data/Text/Internal/ByteStringCompat.hs @@ -0,0 +1,55 @@ +{-# LANGUAGE BangPatterns #-} +{-# LANGUAGE CPP #-} +{-# LANGUAGE MagicHash #-} +module Data.Text.Internal.ByteStringCompat (mkBS, withBS) where + +import Data.ByteString.Internal (ByteString (..)) +import Data.Word (Word8) +import Foreign.ForeignPtr (ForeignPtr) + +#if !MIN_VERSION_bytestring(0,11,0) +#if MIN_VERSION_base(4,10,0) +import GHC.ForeignPtr (plusForeignPtr) +#else +import GHC.ForeignPtr (ForeignPtr(ForeignPtr)) +import GHC.Types (Int (..)) +import GHC.Prim (plusAddr#) +#endif +#endif + +mkBS :: ForeignPtr Word8 -> Int -> ByteString +#if MIN_VERSION_bytestring(0,11,0) +mkBS dfp n = BS dfp n +#else +mkBS dfp n = PS dfp 0 n +#endif +{-# INLINE mkBS #-} + +withBS :: ByteString -> (ForeignPtr Word8 -> Int -> r) -> r +#if MIN_VERSION_bytestring(0,11,0) +withBS (BS !sfp !slen) kont = kont sfp slen +#else +withBS (PS !sfp !soff !slen) kont = kont (plusForeignPtr sfp soff) slen +#endif +{-# INLINE withBS #-} + +#if !MIN_VERSION_bytestring(0,11,0) +#if !MIN_VERSION_base(4,10,0) +-- |Advances the given address by the given offset in bytes. +-- +-- The new 'ForeignPtr' shares the finalizer of the original, +-- equivalent from a finalization standpoint to just creating another +-- reference to the original. That is, the finalizer will not be +-- called before the new 'ForeignPtr' is unreachable, nor will it be +-- called an additional time due to this call, and the finalizer will +-- be called with the same address that it would have had this call +-- not happened, *not* the new address. +plusForeignPtr :: ForeignPtr a -> Int -> ForeignPtr b +plusForeignPtr (ForeignPtr addr guts) (I# offset) = ForeignPtr (plusAddr# addr offset) guts +{-# INLINE [0] plusForeignPtr #-} +{-# RULES +"ByteString plusForeignPtr/0" forall fp . + plusForeignPtr fp 0 = fp + #-} +#endif +#endif diff --git a/src/Data/Text/Internal/Encoding/Fusion.hs b/src/Data/Text/Internal/Encoding/Fusion.hs index 66b3d0bc..41e0926f 100644 --- a/src/Data/Text/Internal/Encoding/Fusion.hs +++ b/src/Data/Text/Internal/Encoding/Fusion.hs @@ -53,6 +53,7 @@ import qualified Data.Text.Internal.Encoding.Utf8 as U8 import qualified Data.Text.Internal.Encoding.Utf16 as U16 import qualified Data.Text.Internal.Encoding.Utf32 as U32 import Data.Text.Unsafe (unsafeDupablePerformIO) +import Data.Text.Internal.ByteStringCompat streamASCII :: ByteString -> Stream Char streamASCII bs = Stream next 0 (maxSize l) @@ -185,7 +186,7 @@ unstream (Stream next s0 len) = unsafeDupablePerformIO $ do withForeignPtr fp' $ \p -> pokeByteOff p off x loop n' (off+1) s fp' {-# NOINLINE trimUp #-} - trimUp fp _ off = return $! PS fp 0 off + trimUp fp _ off = return $! mkBS fp off copy0 :: ForeignPtr Word8 -> Int -> Int -> IO (ForeignPtr Word8) copy0 !src !srcLen !destLen = #if defined(ASSERTS) diff --git a/src/Data/Text/Internal/Lazy/Encoding/Fusion.hs b/src/Data/Text/Internal/Lazy/Encoding/Fusion.hs index 7dafc0a2..eff06071 100644 --- a/src/Data/Text/Internal/Lazy/Encoding/Fusion.hs +++ b/src/Data/Text/Internal/Lazy/Encoding/Fusion.hs @@ -52,7 +52,7 @@ import Data.ByteString.Internal (mallocByteString, memcpy) #if defined(ASSERTS) import Control.Exception (assert) #endif -import qualified Data.ByteString.Internal as B +import Data.Text.Internal.ByteStringCompat data S = S0 | S1 {-# UNPACK #-} !Word8 @@ -297,7 +297,7 @@ unstreamChunks chunkSize (Stream next s0 len0) = chunk s0 (upperBound 4 len0) fp' <- copy0 fp n n' withForeignPtr fp' $ \p -> pokeByteOff p off x loop n' (off+1) s fp' - trimUp fp off = B.PS fp 0 off + trimUp fp off = mkBS fp off copy0 :: ForeignPtr Word8 -> Int -> Int -> IO (ForeignPtr Word8) copy0 !src !srcLen !destLen = #if defined(ASSERTS) diff --git a/tests/text-tests.cabal b/tests/text-tests.cabal index 7a84c901..e0041154 100644 --- a/tests/text-tests.cabal +++ b/tests/text-tests.cabal @@ -121,6 +121,7 @@ test-suite tests Data.Text.Internal.Builder.Functions Data.Text.Internal.Builder.Int.Digits Data.Text.Internal.Builder.RealFloat.Functions + Data.Text.Internal.ByteStringCompat Data.Text.Internal.Encoding.Fusion Data.Text.Internal.Encoding.Fusion.Common Data.Text.Internal.Encoding.Utf16 diff --git a/text.cabal b/text.cabal index 3991b30f..b7060c2e 100644 --- a/text.cabal +++ b/text.cabal @@ -120,6 +120,7 @@ library Data.Text.Internal.Builder.Functions Data.Text.Internal.Builder.Int.Digits Data.Text.Internal.Builder.RealFloat.Functions + Data.Text.Internal.ByteStringCompat Data.Text.Internal.Encoding.Fusion Data.Text.Internal.Encoding.Fusion.Common Data.Text.Internal.Encoding.Utf16 @@ -168,7 +169,7 @@ library build-depends: bytestring >= 0.9 && < 0.10.4, bytestring-builder >= 0.10.4.0.2 && < 0.11 else - build-depends: bytestring >= 0.10.4 && < 0.11 + build-depends: bytestring >= 0.10.4 && < 0.12 ghc-options: -Wall -fwarn-tabs -funbox-strict-fields -O2 if flag(developer)