Merge pull request #296 from phadej/bytestring-0.11

bytestring-0.11
haskell · Oct 17, 2020 · b7bff71 · b7bff71
2 parents 7d3130b + 21281e7
commit b7bff71
Show file tree

Hide file tree

Showing 10 changed files with 183 additions and 84 deletions.
diff --git a/.gitignore b/.gitignore
@@ -8,6 +8,7 @@
 /GNUmakefile
 /.ghc.environment.*
 /cabal.project.local
+/cabal.test.project.local
 
 # Test data repo ignored. Please see instruction in tests-and-benchmarks.markdown
 /tests/text-test-data/
diff --git a/benchmarks/text-benchmarks.cabal b/benchmarks/text-benchmarks.cabal
@@ -101,6 +101,7 @@ executable text-benchmarks
     Data.Text.Internal.Builder.Functions
     Data.Text.Internal.Builder.Int.Digits
     Data.Text.Internal.Builder.RealFloat.Functions
+    Data.Text.Internal.ByteStringCompat
     Data.Text.Internal.Encoding.Fusion
     Data.Text.Internal.Encoding.Fusion.Common
     Data.Text.Internal.Encoding.Utf16

diff --git a/cabal.tests.project b/cabal.tests.project
@@ -0,0 +1,5 @@
+-- this project doesn't have local 'text' package,
+-- so tests build faster.
+
+packages: tests
+tests: True
diff --git a/scripts/tests.sh b/scripts/tests.sh
@@ -0,0 +1,30 @@
+#!/bin/sh
+
+set -ex
+
+runtest() {
+  HC=$1
+  shift
+
+  # EDIT last line to pass arguments
+
+  cabal run text-tests:test:tests \
+      --project-file=cabal.tests.project \
+      --builddir="dist-newstyle/$HC" \
+      --with-compiler="$HC" \
+      -- "$@"
+}
+
+runtest ghc-8.10.2 "$@"
+runtest ghc-8.8.4  "$@"
+runtest ghc-8.6.5  "$@"
+runtest ghc-8.4.4  "$@"
+runtest ghc-8.2.2  "$@"
+runtest ghc-8.0.2  "$@"
+
+runtest ghc-7.10.3 "$@"
+runtest ghc-7.8.4  "$@"
+runtest ghc-7.6.3  "$@"
+runtest ghc-7.4.2  "$@"
+runtest ghc-7.2.2  "$@"
+runtest ghc-7.0.4  "$@"
diff --git a/src/Data/Text/Encoding.hs b/src/Data/Text/Encoding.hs
@@ -96,6 +96,7 @@ import qualified Data.Text.Array as A
 import qualified Data.Text.Internal.Encoding.Fusion as E
 import qualified Data.Text.Internal.Encoding.Utf16 as U16
 import qualified Data.Text.Internal.Fusion as F
+import Data.Text.Internal.ByteStringCompat
 
 #include "text_cbits.h"
 
@@ -123,12 +124,13 @@ decodeASCII = decodeUtf8
 -- 'decodeLatin1' is semantically equivalent to
 --  @Data.Text.pack . Data.ByteString.Char8.unpack@
 decodeLatin1 :: ByteString -> Text
-decodeLatin1 (PS fp off len) = text a 0 len
- where
-  a = A.run (A.new len >>= unsafeIOToST . go)
-  go dest = withForeignPtr fp $ \ptr -> do
-    c_decode_latin1 (A.maBA dest) (ptr `plusPtr` off) (ptr `plusPtr` (off+len))
-    return dest
+decodeLatin1 bs = withBS bs aux where
+  aux fp len = text a 0 len
+   where
+    a = A.run (A.new len >>= unsafeIOToST . go)
+    go dest = withForeignPtr fp $ \ptr -> do
+      c_decode_latin1 (A.maBA dest) ptr (ptr `plusPtr` len)
+      return dest
 
 -- | Decode a 'ByteString' containing UTF-8 encoded text.
 --
@@ -139,36 +141,38 @@ decodeLatin1 (PS fp off len) = text a 0 len
 -- 'error' (/since 1.2.3.1/); For earlier versions of @text@ using
 -- those unsupported code points would result in undefined behavior.
 decodeUtf8With :: OnDecodeError -> ByteString -> Text
-decodeUtf8With onErr (PS fp off len) = runText $ \done -> do
-  let go dest = withForeignPtr fp $ \ptr ->
-        with (0::CSize) $ \destOffPtr -> do
-          let end = ptr `plusPtr` (off + len)
-              loop curPtr = do
-                curPtr' <- c_decode_utf8 (A.maBA dest) destOffPtr curPtr end
-                if curPtr' == end
-                  then do
-                    n <- peek destOffPtr
-                    unsafeSTToIO (done dest (fromIntegral n))
-                  else do
-                    x <- peek curPtr'
-                    case onErr desc (Just x) of
-                      Nothing -> loop $ curPtr' `plusPtr` 1
-                      Just c
-                        | c > '\xFFFF' -> throwUnsupportedReplChar
-                        | otherwise -> do
-                            destOff <- peek destOffPtr
-                            w <- unsafeSTToIO $
-                                 unsafeWrite dest (fromIntegral destOff)
-                                             (safe c)
-                            poke destOffPtr (destOff + fromIntegral w)
-                            loop $ curPtr' `plusPtr` 1
-          loop (ptr `plusPtr` off)
-  (unsafeIOToST . go) =<< A.new len
+decodeUtf8With onErr bs = withBS bs aux
  where
-  desc = "Data.Text.Internal.Encoding.decodeUtf8: Invalid UTF-8 stream"
+  aux fp len = runText $ \done -> do
+    let go dest = withForeignPtr fp $ \ptr ->
+          with (0::CSize) $ \destOffPtr -> do
+            let end = ptr `plusPtr` len
+                loop curPtr = do
+                  curPtr' <- c_decode_utf8 (A.maBA dest) destOffPtr curPtr end
+                  if curPtr' == end
+                    then do
+                      n <- peek destOffPtr
+                      unsafeSTToIO (done dest (fromIntegral n))
+                    else do
+                      x <- peek curPtr'
+                      case onErr desc (Just x) of
+                        Nothing -> loop $ curPtr' `plusPtr` 1
+                        Just c
+                          | c > '\xFFFF' -> throwUnsupportedReplChar
+                          | otherwise -> do
+                              destOff <- peek destOffPtr
+                              w <- unsafeSTToIO $
+                                   unsafeWrite dest (fromIntegral destOff)
+                                               (safe c)
+                              poke destOffPtr (destOff + fromIntegral w)
+                              loop $ curPtr' `plusPtr` 1
+            loop ptr
+    (unsafeIOToST . go) =<< A.new len
+   where
+    desc = "Data.Text.Internal.Encoding.decodeUtf8: Invalid UTF-8 stream"
 
-  throwUnsupportedReplChar = throwIO $
-    ErrorCall "decodeUtf8With: non-BMP replacement characters not supported"
+    throwUnsupportedReplChar = throwIO $
+      ErrorCall "decodeUtf8With: non-BMP replacement characters not supported"
   -- TODO: The code currently assumes that the transcoded UTF-16
   -- stream is at most twice as long (in bytes) as the input UTF-8
   -- stream. To justify this assumption one has to assume that the
@@ -292,50 +296,50 @@ streamDecodeUtf8With onErr = decodeChunk B.empty 0 0
   -- potential surrogate pair started in the last buffer
   decodeChunk :: ByteString -> CodePoint -> DecoderState -> ByteString
               -> Decoding
-  decodeChunk undecoded0 codepoint0 state0 bs@(PS fp off len) =
-    runST $ (unsafeIOToST . decodeChunkToBuffer) =<< A.new (len+1)
-   where
-    decodeChunkToBuffer :: A.MArray s -> IO Decoding
-    decodeChunkToBuffer dest = withForeignPtr fp $ \ptr ->
-      with (0::CSize) $ \destOffPtr ->
-      with codepoint0 $ \codepointPtr ->
-      with state0 $ \statePtr ->
-      with nullPtr $ \curPtrPtr ->
-        let end = ptr `plusPtr` (off + len)
-            loop curPtr = do
-              poke curPtrPtr curPtr
-              curPtr' <- c_decode_utf8_with_state (A.maBA dest) destOffPtr
-                         curPtrPtr end codepointPtr statePtr
-              state <- peek statePtr
-              case state of
-                UTF8_REJECT -> do
-                  -- We encountered an encoding error
-                  x <- peek curPtr'
-                  poke statePtr 0
-                  case onErr desc (Just x) of
-                    Nothing -> loop $ curPtr' `plusPtr` 1
-                    Just c -> do
-                      destOff <- peek destOffPtr
-                      w <- unsafeSTToIO $
-                           unsafeWrite dest (fromIntegral destOff) (safe c)
-                      poke destOffPtr (destOff + fromIntegral w)
-                      loop $ curPtr' `plusPtr` 1
-
-                _ -> do
-                  -- We encountered the end of the buffer while decoding
-                  n <- peek destOffPtr
-                  codepoint <- peek codepointPtr
-                  chunkText <- unsafeSTToIO $ do
-                      arr <- A.unsafeFreeze dest
-                      return $! text arr 0 (fromIntegral n)
-                  lastPtr <- peek curPtrPtr
-                  let left = lastPtr `minusPtr` curPtr
-                      !undecoded = case state of
-                        UTF8_ACCEPT -> B.empty
-                        _           -> B.append undecoded0 (B.drop left bs)
-                  return $ Some chunkText undecoded
-                           (decodeChunk undecoded codepoint state)
-        in loop (ptr `plusPtr` off)
+  decodeChunk undecoded0 codepoint0 state0 bs = withBS bs aux where
+    aux fp len = runST $ (unsafeIOToST . decodeChunkToBuffer) =<< A.new (len+1)
+       where
+        decodeChunkToBuffer :: A.MArray s -> IO Decoding
+        decodeChunkToBuffer dest = withForeignPtr fp $ \ptr ->
+          with (0::CSize) $ \destOffPtr ->
+          with codepoint0 $ \codepointPtr ->
+          with state0 $ \statePtr ->
+          with nullPtr $ \curPtrPtr ->
+            let end = ptr `plusPtr` len
+                loop curPtr = do
+                  poke curPtrPtr curPtr
+                  curPtr' <- c_decode_utf8_with_state (A.maBA dest) destOffPtr
+                             curPtrPtr end codepointPtr statePtr
+                  state <- peek statePtr
+                  case state of
+                    UTF8_REJECT -> do
+                      -- We encountered an encoding error
+                      x <- peek curPtr'
+                      poke statePtr 0
+                      case onErr desc (Just x) of
+                        Nothing -> loop $ curPtr' `plusPtr` 1
+                        Just c -> do
+                          destOff <- peek destOffPtr
+                          w <- unsafeSTToIO $
+                               unsafeWrite dest (fromIntegral destOff) (safe c)
+                          poke destOffPtr (destOff + fromIntegral w)
+                          loop $ curPtr' `plusPtr` 1
+
+                    _ -> do
+                      -- We encountered the end of the buffer while decoding
+                      n <- peek destOffPtr
+                      codepoint <- peek codepointPtr
+                      chunkText <- unsafeSTToIO $ do
+                          arr <- A.unsafeFreeze dest
+                          return $! text arr 0 (fromIntegral n)
+                      lastPtr <- peek curPtrPtr
+                      let left = lastPtr `minusPtr` curPtr
+                          !undecoded = case state of
+                            UTF8_ACCEPT -> B.empty
+                            _           -> B.append undecoded0 (B.drop left bs)
+                      return $ Some chunkText undecoded
+                               (decodeChunk undecoded codepoint state)
+            in loop ptr
   desc = "Data.Text.Internal.Encoding.streamDecodeUtf8With: Invalid UTF-8 stream"
 
 -- | Decode a 'ByteString' containing UTF-8 encoded text that is known
@@ -436,12 +440,12 @@ encodeUtf8 (Text arr off len)
       newDest <- peek destPtr
       let utf8len = newDest `minusPtr` ptr
       if utf8len >= len `shiftR` 1
-        then return (PS fp 0 utf8len)
+        then return (mkBS fp utf8len)
         else do
           fp' <- mallocByteString utf8len
           withForeignPtr fp' $ \ptr' -> do
             memcpy ptr' ptr (fromIntegral utf8len)
-            return (PS fp' 0 utf8len)
+            return (mkBS fp' utf8len)
 
 -- | Decode text from little endian UTF-16 encoding.
 decodeUtf16LEWith :: OnDecodeError -> ByteString -> Text

diff --git a/src/Data/Text/Internal/ByteStringCompat.hs b/src/Data/Text/Internal/ByteStringCompat.hs
@@ -0,0 +1,55 @@
+{-# LANGUAGE BangPatterns #-}
+{-# LANGUAGE CPP #-}
+{-# LANGUAGE MagicHash #-}
+module Data.Text.Internal.ByteStringCompat (mkBS, withBS) where
+
+import Data.ByteString.Internal (ByteString (..))
+import Data.Word (Word8)
+import Foreign.ForeignPtr (ForeignPtr)
+
+#if !MIN_VERSION_bytestring(0,11,0)
+#if MIN_VERSION_base(4,10,0)
+import GHC.ForeignPtr (plusForeignPtr)
+#else
+import GHC.ForeignPtr (ForeignPtr(ForeignPtr))
+import GHC.Types (Int (..))
+import GHC.Prim (plusAddr#)
+#endif
+#endif
+
+mkBS :: ForeignPtr Word8 -> Int -> ByteString
+#if MIN_VERSION_bytestring(0,11,0)
+mkBS dfp n = BS dfp n
+#else
+mkBS dfp n = PS dfp 0 n
+#endif
+{-# INLINE mkBS #-}
+
+withBS :: ByteString -> (ForeignPtr Word8 -> Int -> r) -> r
+#if MIN_VERSION_bytestring(0,11,0)
+withBS (BS !sfp !slen)       kont = kont sfp slen
+#else
+withBS (PS !sfp !soff !slen) kont = kont (plusForeignPtr sfp soff) slen
+#endif
+{-# INLINE withBS #-}
+
+#if !MIN_VERSION_bytestring(0,11,0)
+#if !MIN_VERSION_base(4,10,0)
+-- |Advances the given address by the given offset in bytes.
+--
+-- The new 'ForeignPtr' shares the finalizer of the original,
+-- equivalent from a finalization standpoint to just creating another
+-- reference to the original. That is, the finalizer will not be
+-- called before the new 'ForeignPtr' is unreachable, nor will it be
+-- called an additional time due to this call, and the finalizer will
+-- be called with the same address that it would have had this call
+-- not happened, *not* the new address.
+plusForeignPtr :: ForeignPtr a -> Int -> ForeignPtr b
+plusForeignPtr (ForeignPtr addr guts) (I# offset) = ForeignPtr (plusAddr# addr offset) guts
+{-# INLINE [0] plusForeignPtr #-}
+{-# RULES
+"ByteString plusForeignPtr/0" forall fp .
+   plusForeignPtr fp 0 = fp
+ #-}
+#endif
+#endif
diff --git a/src/Data/Text/Internal/Encoding/Fusion.hs b/src/Data/Text/Internal/Encoding/Fusion.hs
@@ -53,6 +53,7 @@ import qualified Data.Text.Internal.Encoding.Utf8 as U8
 import qualified Data.Text.Internal.Encoding.Utf16 as U16
 import qualified Data.Text.Internal.Encoding.Utf32 as U32
 import Data.Text.Unsafe (unsafeDupablePerformIO)
+import Data.Text.Internal.ByteStringCompat
 
 streamASCII :: ByteString -> Stream Char
 streamASCII bs = Stream next 0 (maxSize l)
@@ -185,7 +186,7 @@ unstream (Stream next s0 len) = unsafeDupablePerformIO $ do
         withForeignPtr fp' $ \p -> pokeByteOff p off x
         loop n' (off+1) s fp'
       {-# NOINLINE trimUp #-}
-      trimUp fp _ off = return $! PS fp 0 off
+      trimUp fp _ off = return $! mkBS fp off
       copy0 :: ForeignPtr Word8 -> Int -> Int -> IO (ForeignPtr Word8)
       copy0 !src !srcLen !destLen =
 #if defined(ASSERTS)

diff --git a/src/Data/Text/Internal/Lazy/Encoding/Fusion.hs b/src/Data/Text/Internal/Lazy/Encoding/Fusion.hs
@@ -52,7 +52,7 @@ import Data.ByteString.Internal (mallocByteString, memcpy)
 #if defined(ASSERTS)
 import Control.Exception (assert)
 #endif
-import qualified Data.ByteString.Internal as B
+import Data.Text.Internal.ByteStringCompat
 
 data S = S0
        | S1 {-# UNPACK #-} !Word8
@@ -297,7 +297,7 @@ unstreamChunks chunkSize (Stream next s0 len0) = chunk s0 (upperBound 4 len0)
               fp' <- copy0 fp n n'
               withForeignPtr fp' $ \p -> pokeByteOff p off x
               loop n' (off+1) s fp'
-            trimUp fp off = B.PS fp 0 off
+            trimUp fp off = mkBS fp off
             copy0 :: ForeignPtr Word8 -> Int -> Int -> IO (ForeignPtr Word8)
             copy0 !src !srcLen !destLen =
 #if defined(ASSERTS)

diff --git a/tests/text-tests.cabal b/tests/text-tests.cabal
@@ -121,6 +121,7 @@ test-suite tests
     Data.Text.Internal.Builder.Functions
     Data.Text.Internal.Builder.Int.Digits
     Data.Text.Internal.Builder.RealFloat.Functions
+    Data.Text.Internal.ByteStringCompat
     Data.Text.Internal.Encoding.Fusion
     Data.Text.Internal.Encoding.Fusion.Common
     Data.Text.Internal.Encoding.Utf16

diff --git a/text.cabal b/text.cabal
@@ -120,6 +120,7 @@ library
     Data.Text.Internal.Builder.Functions
     Data.Text.Internal.Builder.Int.Digits
     Data.Text.Internal.Builder.RealFloat.Functions
+    Data.Text.Internal.ByteStringCompat
     Data.Text.Internal.Encoding.Fusion
     Data.Text.Internal.Encoding.Fusion.Common
     Data.Text.Internal.Encoding.Utf16
@@ -168,7 +169,7 @@ library
     build-depends: bytestring         >= 0.9    && < 0.10.4,
                    bytestring-builder >= 0.10.4.0.2 && < 0.11
   else
-    build-depends: bytestring         >= 0.10.4 && < 0.11
+    build-depends: bytestring         >= 0.10.4 && < 0.12
 
   ghc-options: -Wall -fwarn-tabs -funbox-strict-fields -O2
   if flag(developer)