diff --git a/src/Data/Text/Encoding.hs b/src/Data/Text/Encoding.hs
index 6ecdcc77..4491b699 100644
--- a/src/Data/Text/Encoding.hs
+++ b/src/Data/Text/Encoding.hs
@@ -64,6 +64,7 @@ import Control.Monad.ST (runST)
 import Data.Bits ((.&.))
 import Data.ByteString as B
 import qualified Data.ByteString.Internal as B
+import Data.Foldable (traverse_)
 import Data.Text.Encoding.Error (OnDecodeError, UnicodeException, strictDecode)
 import Data.Text.Internal (Text(..), safe, text)
 import Data.Text.Internal.Functions
@@ -275,7 +276,7 @@ newtype DecoderState = DecoderState Word32
   deriving (Eq, Show, Num, Storable)
 
 streamDecodeUtf8 :: ByteString -> Decoding
 streamDecodeUtf8 = streamDecodeUtf8With strictDecode
 
--- | Decode, in a stream oriented way, a 'ByteString' containing UTF-8
+-- | Decode, in a stream oriented way, a lazy 'ByteString' containing UTF-8
 -- encoded text.
 --
 -- @since 1.0.0.0
@@ -283,11 +284,14 @@ streamDecodeUtf8With :: OnDecodeError -> ByteString -> Decoding
 streamDecodeUtf8With onErr = decodeChunk B.empty 0 0
  where
   -- We create a slightly larger than necessary buffer to accommodate a
-  -- potential surrogate pair started in the last buffer
+  -- potential surrogate pair started in the last buffer (@undecoded0@), or
+  -- replacement characters for each byte in @undecoded0@ if the
+  -- sequence turns out to be invalid. There can be up to three bytes there,
+  -- hence we allocate @len+3@ 16-bit words.
   decodeChunk :: ByteString -> CodePoint -> DecoderState -> ByteString -> Decoding
   decodeChunk undecoded0 codepoint0 state0 bs = withBS bs aux
    where
-    aux fp len = runST $ (unsafeIOToST . decodeChunkToBuffer) =<< A.new (len+1)
+    aux fp len = runST $ (unsafeIOToST . decodeChunkToBuffer) =<< A.new (len+3)
      where
       decodeChunkToBuffer :: A.MArray s -> IO Decoding
       decodeChunkToBuffer dest = unsafeWithForeignPtr fp $ \ptr ->
@@ -297,23 +301,32 @@ streamDecodeUtf8With onErr = decodeChunk B.empty 0 0
         with nullPtr $ \curPtrPtr ->
           let end = ptr `plusPtr` len
               loop curPtr = do
+                prevState <- peek statePtr
                 poke curPtrPtr curPtr
-                curPtr' <- c_decode_utf8_with_state (A.maBA dest) destOffPtr
+                lastPtr <- c_decode_utf8_with_state (A.maBA dest) destOffPtr
                            curPtrPtr end codepointPtr statePtr
                 state <- peek statePtr
                 case state of
                   UTF8_REJECT -> do
                     -- We encountered an encoding error
-                    x <- peek curPtr'
                     poke statePtr 0
-                    case onErr desc (Just x) of
-                      Nothing -> loop $ curPtr' `plusPtr` 1
-                      Just c -> do
-                        destOff <- peek destOffPtr
-                        w <- unsafeSTToIO $
-                             unsafeWrite dest (fromIntegral destOff) (safe c)
-                        poke destOffPtr (destOff + fromIntegral w)
-                        loop $ curPtr' `plusPtr` 1
+                    let skipByte x = case onErr desc (Just x) of
+                          Nothing -> return ()
+                          Just c -> do
+                            destOff <- peek destOffPtr
+                            w <- unsafeSTToIO $
+                                 unsafeWrite dest (fromIntegral destOff) (safe c)
+                            poke destOffPtr (destOff + fromIntegral w)
+                    if ptr == lastPtr && prevState /= UTF8_ACCEPT then do
+                      -- If we can't complete the sequence @undecoded0@ from
+                      -- the previous chunk, we invalidate the bytes from
+                      -- @undecoded0@ and retry decoding the current chunk from
+                      -- the initial state.
+                      traverse_ skipByte (B.unpack undecoded0)
+                      loop lastPtr
+                    else do
+                      peek lastPtr >>= skipByte
+                      loop (lastPtr `plusPtr` 1)
 
                   _ -> do
                     -- We encountered the end of the buffer while decoding
@@ -322,11 +335,11 @@ streamDecodeUtf8With onErr = decodeChunk B.empty 0 0
                     chunkText <- unsafeSTToIO $ do
                         arr <- A.unsafeFreeze dest
                         return $! text arr 0 (fromIntegral n)
-                    lastPtr <- peek curPtrPtr
-                    let left = lastPtr `minusPtr` curPtr
+                    let left = lastPtr `minusPtr` ptr
                         !undecoded = case state of
                           UTF8_ACCEPT -> B.empty
-                          _ -> B.append undecoded0 (B.drop left bs)
+                          _ | left == 0 && prevState /= UTF8_ACCEPT -> B.append undecoded0 bs
+                            | otherwise -> B.drop left bs
                     return $ Some chunkText undecoded
                              (decodeChunk undecoded codepoint state)
           in loop ptr
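Note on the hunks above: `streamDecodeUtf8With` now holds back an incomplete trailing sequence, and if the next chunk cannot complete it, emits one replacement per held-back byte before rescanning that chunk from the initial state. A minimal sketch of the intended behaviour follows; this program is illustrative, not part of the patch, and the printed results assume the patched library together with `lenientDecode`:

    import qualified Data.ByteString as B
    import qualified Data.Text.Encoding as TE
    import qualified Data.Text.Encoding.Error as TEE

    main :: IO ()
    main = do
      -- 0xC2 opens a two-byte sequence, so the first chunk decodes to no
      -- text and the byte is carried over as the "undecoded" remainder.
      let TE.Some t0 undecoded0 cont =
            TE.streamDecodeUtf8With TEE.lenientDecode (B.pack [0xC2])
      print (t0, undecoded0)   -- expected: ("","\194")
      -- 0x61 cannot continue the sequence: the carried-over 0xC2 becomes
      -- one U+FFFD and the new chunk is rescanned from a clean state.
      let TE.Some t1 undecoded1 _ = cont (B.pack [0x61, 0x62])
      print (t1, undecoded1)   -- expected: ("\65533ab","")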
diff --git a/src/Data/Text/Lazy/Encoding.hs b/src/Data/Text/Lazy/Encoding.hs
index 522a0a81..60de7d1a 100644
--- a/src/Data/Text/Lazy/Encoding.hs
+++ b/src/Data/Text/Lazy/Encoding.hs
@@ -100,9 +100,13 @@ decodeUtf8With onErr (B.Chunk b0 bs0) =
           TE.Some t l f -> chunk t (go f l bs)
     go _ l _
       | S.null l = empty
-      | otherwise = case onErr desc (Just (B.unsafeHead l)) of
-          Nothing -> empty
-          Just c -> Chunk (T.singleton c) Empty
+      | otherwise =
+        let !t = T.pack (skipBytes l)
+            skipBytes = S.foldr (\x s' ->
+              case onErr desc (Just x) of
+                Just c -> c : s'
+                Nothing -> s') [] in
+        Chunk t Empty
 
     desc = "Data.Text.Lazy.Encoding.decodeUtf8With: Invalid UTF-8 stream"
 decodeUtf8With _ _ = empty
diff --git a/tests/Tests/Properties/Transcoding.hs b/tests/Tests/Properties/Transcoding.hs
index 33af0b0e..f84293bc 100644
--- a/tests/Tests/Properties/Transcoding.hs
+++ b/tests/Tests/Properties/Transcoding.hs
@@ -24,6 +24,7 @@ import qualified Data.ByteString as B
 import qualified Data.ByteString.Lazy as BL
 import qualified Data.Text as T
 import qualified Data.Text.Encoding as E
+import qualified Data.Text.Encoding.Error as E
 import qualified Data.Text.Lazy as TL
 import qualified Data.Text.Lazy.Encoding as EL
 
@@ -152,6 +153,18 @@ genInvalidUTF8 = B.pack <$> oneof [
   k <- choose (0,n)
   vectorOf k gen
 
+decodeLL :: BL.ByteString -> TL.Text
+decodeLL = EL.decodeUtf8With E.lenientDecode
+
+decodeL :: B.ByteString -> T.Text
+decodeL = E.decodeUtf8With E.lenientDecode
+
+-- The lenient decoding of lazy bytestrings should not depend on how they are
+-- chunked, and it should behave the same as decoding of strict bytestrings.
+t_decode_utf8_lenient :: Property
+t_decode_utf8_lenient = forAllShrinkShow arbitrary shrink (show . BL.toChunks) $ \bs ->
+  decodeLL bs === (TL.fromStrict . decodeL . B.concat . BL.toChunks) bs
+
 -- See http://unicode.org/faq/utf_bom.html#gen8
 -- A sequence such as <110xxxxx2 0xxxxxxx2> is illegal ...
 -- When faced with this illegal byte sequence ... a UTF-8 conformant process
@@ -206,6 +219,7 @@ testTranscoding =
     testProperty "t_utf8_err'" t_utf8_err'
   ],
   testGroup "error recovery" [
+    testProperty "t_decode_utf8_lenient" t_decode_utf8_lenient,
     testProperty "t_decode_with_error2" t_decode_with_error2,
     testProperty "t_decode_with_error3" t_decode_with_error3,
     testProperty "t_decode_with_error4" t_decode_with_error4,
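The regression these hunks guard against is easiest to see at a chunk boundary. Here is a small sketch, illustrative rather than part of the patch, of the equality that `t_decode_utf8_lenient` (and the `t330` regression test below) checks, assuming the patched decoders:

    import qualified Data.ByteString as B
    import qualified Data.ByteString.Lazy as BL
    import qualified Data.Text.Encoding.Error as TEE
    import qualified Data.Text.Lazy.Encoding as TLE

    main :: IO ()
    main = do
      let bytes = [0xC2, 0x61, 0x62, 0x63]  -- truncated sequence, then "abc"
          oneChunk = BL.fromChunks [B.pack bytes]
          perByte  = BL.fromChunks (map B.singleton bytes)
      -- Both are expected to print "\65533abc"; before the fix, the result
      -- depended on where the chunk boundaries happened to fall.
      print (TLE.decodeUtf8With TEE.lenientDecode oneChunk)
      print (TLE.decodeUtf8With TEE.lenientDecode perByte)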
diff --git a/tests/Tests/QuickCheckUtils.hs b/tests/Tests/QuickCheckUtils.hs
index fa1eca7c..d2aa3b73 100644
--- a/tests/Tests/QuickCheckUtils.hs
+++ b/tests/Tests/QuickCheckUtils.hs
@@ -49,6 +49,7 @@ import Test.QuickCheck.Monadic (assert, monadicIO, run)
 import Test.QuickCheck.Unicode (string)
 import Tests.Utils
 import qualified Data.ByteString as B
+import qualified Data.ByteString.Lazy as BL
 import qualified Data.Text as T
 import qualified Data.Text.Encoding.Error as T
 import qualified Data.Text.Internal.Fusion as TF
@@ -61,6 +62,9 @@ import qualified System.IO as IO
 genUnicode :: IsString a => Gen a
 genUnicode = fromString <$> string
 
+genWord8 :: Gen Word8
+genWord8 = chooseAny
+
 instance Random I16 where
     randomR = integralRandomR
     random  = randomR (minBound,maxBound)
@@ -70,9 +74,23 @@ instance Arbitrary I16 where
     shrink = shrinkIntegral
 
 instance Arbitrary B.ByteString where
-    arbitrary = B.pack `fmap` arbitrary
+    arbitrary = B.pack `fmap` listOf genWord8
     shrink = map B.pack . shrink . B.unpack
 
+instance Arbitrary BL.ByteString where
+    arbitrary = oneof
+      [ BL.fromChunks <$> arbitrary
+      -- so that a single utf8 code point could appear split over up to 4 chunks
+      , BL.fromChunks . map B.singleton <$> listOf genWord8
+      -- so that a code point with 4 byte long utf8 representation
+      -- could appear split over 3 non-singleton chunks
+      , (\a b c -> BL.fromChunks [a, b, c])
+        <$> arbitrary
+        <*> ((\a b -> B.pack [a, b]) <$> genWord8 <*> genWord8)
+        <*> arbitrary
+      ]
+    shrink xs = BL.fromChunks <$> shrink (BL.toChunks xs)
+
 -- For tests that have O(n^2) running times or input sizes, resize
 -- their inputs to the square root of the originals.
 unsquare :: (Arbitrary a, Show a, Testable b) => (a -> b) -> Property
diff --git a/tests/Tests/Regressions.hs b/tests/Tests/Regressions.hs
index d334b971..157d0e89 100644
--- a/tests/Tests/Regressions.hs
+++ b/tests/Tests/Regressions.hs
@@ -20,6 +20,7 @@ import qualified Data.ByteString.Lazy as LB
 import qualified Data.Text as T
 import qualified Data.Text.Array as TA
 import qualified Data.Text.Encoding as TE
+import qualified Data.Text.Encoding.Error as E
 import qualified Data.Text.Internal as T
 import qualified Data.Text.IO as T
 import qualified Data.Text.Lazy as LT
@@ -136,6 +137,13 @@ t301 = do
     original@(T.Text originalArr originalOff originalLen) = T.pack "1234567890"
     T.Text newArr _off _len = T.take 1 $ T.drop 1 original
 
+t330 :: IO ()
+t330 = do
+  let decodeL = LE.decodeUtf8With E.lenientDecode
+  assertEqual "The lenient decoding of lazy bytestrings should not depend on how they are chunked"
+    (decodeL (LB.fromChunks [B.pack [194], B.pack [97, 98, 99]]))
+    (decodeL (LB.fromChunks [B.pack [194, 97, 98, 99]]))
+
 tests :: F.TestTree
 tests = F.testGroup "Regressions"
     [ F.testCase "hGetContents_crash" hGetContents_crash
@@ -149,4 +157,5 @@ tests = F.testGroup "Regressions"
     , F.testCase "t280/fromString" t280_fromString
     , F.testCase "t280/singleton" t280_singleton
     , F.testCase "t301" t301
+    , F.testCase "t330" t330
     ]
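For quick experimentation outside the test suite, the new property can be approximated without the custom chunk-splitting generator above. The program below is a hypothetical stand-in, not code from this patch; `prop_chunkingIrrelevant` is an illustrative name:

    import Data.Word (Word8)
    import Test.QuickCheck
    import qualified Data.ByteString as B
    import qualified Data.ByteString.Lazy as BL
    import qualified Data.Text.Encoding as TE
    import qualified Data.Text.Encoding.Error as TEE
    import qualified Data.Text.Lazy as TL
    import qualified Data.Text.Lazy.Encoding as TLE

    -- Leniently decoding a lazy ByteString must agree with decoding the
    -- concatenation of its chunks as a single strict ByteString.
    prop_chunkingIrrelevant :: [[Word8]] -> Property
    prop_chunkingIrrelevant chunks =
        TLE.decodeUtf8With TEE.lenientDecode lazyBytes
          === TL.fromStrict (TE.decodeUtf8With TEE.lenientDecode strictBytes)
      where
        lazyBytes   = BL.fromChunks (map B.pack chunks)
        strictBytes = B.concat (map B.pack chunks)

    main :: IO ()
    main = quickCheck prop_chunkingIrrelevant

This relies on QuickCheck's stock generator for [[Word8]], so it exercises chunk boundaries less aggressively than the Arbitrary BL.ByteString instance added above, which deliberately splits code points across singleton and two-byte chunks.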