Fix UTF-8 decoding of lazy bytestrings #333

Merged 1 commit on May 22, 2021.

45 changes: 29 additions & 16 deletions src/Data/Text/Encoding.hs
@@ -64,6 +64,7 @@ import Control.Monad.ST (runST)
import Data.Bits ((.&.))
import Data.ByteString as B
import qualified Data.ByteString.Internal as B
+ import Data.Foldable (traverse_)
import Data.Text.Encoding.Error (OnDecodeError, UnicodeException, strictDecode)
import Data.Text.Internal (Text(..), safe, text)
import Data.Text.Internal.Functions
@@ -275,19 +276,22 @@ newtype DecoderState = DecoderState Word32 deriving (Eq, Show, Num, Storable)
streamDecodeUtf8 :: ByteString -> Decoding
streamDecodeUtf8 = streamDecodeUtf8With strictDecode

- -- | Decode, in a stream oriented way, a 'ByteString' containing UTF-8
+ -- | Decode, in a stream oriented way, a lazy 'ByteString' containing UTF-8
-- encoded text.
--
-- @since 1.0.0.0
streamDecodeUtf8With :: OnDecodeError -> ByteString -> Decoding
streamDecodeUtf8With onErr = decodeChunk B.empty 0 0
where
-- We create a slightly larger than necessary buffer to accommodate a
- -- potential surrogate pair started in the last buffer
+ -- potential surrogate pair started in the last buffer (@undecoded0@), or
+ -- replacement characters for each byte in @undecoded0@ if the
+ -- sequence turns out to be invalid. There can be up to three bytes there,
+ -- hence we allocate @len+3@ 16-bit words.
decodeChunk :: ByteString -> CodePoint -> DecoderState -> ByteString
-> Decoding
decodeChunk undecoded0 codepoint0 state0 bs = withBS bs aux where
- aux fp len = runST $ (unsafeIOToST . decodeChunkToBuffer) =<< A.new (len+1)
+ aux fp len = runST $ (unsafeIOToST . decodeChunkToBuffer) =<< A.new (len+3)
where
decodeChunkToBuffer :: A.MArray s -> IO Decoding
decodeChunkToBuffer dest = unsafeWithForeignPtr fp $ \ptr ->
@@ -297,23 +301,32 @@ streamDecodeUtf8With onErr = decodeChunk B.empty 0 0
with nullPtr $ \curPtrPtr ->
let end = ptr `plusPtr` len
loop curPtr = do
+ prevState <- peek statePtr
poke curPtrPtr curPtr
- curPtr' <- c_decode_utf8_with_state (A.maBA dest) destOffPtr
+ lastPtr <- c_decode_utf8_with_state (A.maBA dest) destOffPtr
curPtrPtr end codepointPtr statePtr
state <- peek statePtr
case state of
UTF8_REJECT -> do
-- We encountered an encoding error
- x <- peek curPtr'
poke statePtr 0
- case onErr desc (Just x) of
- Nothing -> loop $ curPtr' `plusPtr` 1
- Just c -> do
- destOff <- peek destOffPtr
- w <- unsafeSTToIO $
- unsafeWrite dest (fromIntegral destOff) (safe c)
- poke destOffPtr (destOff + fromIntegral w)
- loop $ curPtr' `plusPtr` 1
+ let skipByte x = case onErr desc (Just x) of
+ Nothing -> return ()
+ Just c -> do
+ destOff <- peek destOffPtr
+ w <- unsafeSTToIO $
+ unsafeWrite dest (fromIntegral destOff) (safe c)
+ poke destOffPtr (destOff + fromIntegral w)
+ if ptr == lastPtr && prevState /= UTF8_ACCEPT then do
+ -- If we can't complete the sequence @undecoded0@ from
+ -- the previous chunk, we invalidate the bytes from
+ -- @undecoded0@ and retry decoding the current chunk from
+ -- the initial state.
+ traverse_ skipByte (B.unpack undecoded0)
+ loop lastPtr
+ else do
+ peek lastPtr >>= skipByte
+ loop (lastPtr `plusPtr` 1)

_ -> do
-- We encountered the end of the buffer while decoding
@@ -322,11 +335,11 @@ streamDecodeUtf8With onErr = decodeChunk B.empty 0 0
chunkText <- unsafeSTToIO $ do
arr <- A.unsafeFreeze dest
return $! text arr 0 (fromIntegral n)
- lastPtr <- peek curPtrPtr
- let left = lastPtr `minusPtr` curPtr
+ let left = lastPtr `minusPtr` ptr
!undecoded = case state of
UTF8_ACCEPT -> B.empty
- _ -> B.append undecoded0 (B.drop left bs)
+ _ | left == 0 && prevState /= UTF8_ACCEPT -> B.append undecoded0 bs
+ | otherwise -> B.drop left bs
return $ Some chunkText undecoded
(decodeChunk undecoded codepoint state)
in loop ptr
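
For orientation, here is a minimal sketch (not part of the patch; byte values are illustrative) of the streaming API this hunk fixes. `Some decoded undecoded continue` hands trailing incomplete bytes back to the caller, and the fix makes the decoder invalidate them, one replacement character per byte, when the next chunk cannot complete the sequence:

import qualified Data.ByteString as B
import Data.Text.Encoding (Decoding (..), streamDecodeUtf8With)
import Data.Text.Encoding.Error (lenientDecode)

demoStream :: IO ()
demoStream = do
  -- 0xC2 starts a two-byte sequence: it is held back, not decoded yet.
  let Some t1 undecoded continue = streamDecodeUtf8With lenientDecode (B.pack [0xC2])
  print (t1, undecoded)    -- expected ("","\194")
  -- Completing the sequence in the next chunk decodes U+00A9.
  let Some t2 _ _ = continue (B.pack [0xA9])
  print t2                 -- expected "\169"
  -- If the next chunk cannot complete it, the held-back byte becomes
  -- one U+FFFD and decoding restarts from the initial state.
  let Some t3 _ _ = continue (B.pack [0x61])
  print t3                 -- expected "\65533a"
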
10 changes: 7 additions & 3 deletions src/Data/Text/Lazy/Encoding.hs
@@ -100,9 +100,13 @@ decodeUtf8With onErr (B.Chunk b0 bs0) =
TE.Some t l f -> chunk t (go f l bs)
go _ l _
| S.null l = empty
- | otherwise = case onErr desc (Just (B.unsafeHead l)) of
- Nothing -> empty
- Just c -> Chunk (T.singleton c) Empty
+ | otherwise =
+ let !t = T.pack (skipBytes l)
+ skipBytes = S.foldr (\x s' ->
+ case onErr desc (Just x) of
+ Just c -> c : s'
+ Nothing -> s') [] in
+ Chunk t Empty
desc = "Data.Text.Lazy.Encoding.decodeUtf8With: Invalid UTF-8 stream"
decodeUtf8With _ _ = empty
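
The observable effect of this hunk, sketched under lenientDecode (illustrative bytes; 0xF0 0x90 is a valid prefix of a four-byte sequence that is never completed): every leftover byte at the end of the lazy stream now goes through the error handler, where previously at most one replacement character was emitted for the whole leftover:

import qualified Data.ByteString.Lazy as BL
import qualified Data.Text.Lazy.Encoding as EL
import Data.Text.Encoding.Error (lenientDecode)

demoLeftover :: IO ()
demoLeftover =
  -- expected "hi\65533\65533": one U+FFFD per trailing undecoded byte,
  -- matching what the strict decoder produces for the same input.
  print (EL.decodeUtf8With lenientDecode (BL.pack [0x68, 0x69, 0xF0, 0x90]))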

14 changes: 14 additions & 0 deletions tests/Tests/Properties/Transcoding.hs
@@ -24,6 +24,7 @@ import qualified Data.ByteString as B
import qualified Data.ByteString.Lazy as BL
import qualified Data.Text as T
import qualified Data.Text.Encoding as E
+ import qualified Data.Text.Encoding.Error as E
import qualified Data.Text.Lazy as TL
import qualified Data.Text.Lazy.Encoding as EL

@@ -152,6 +153,18 @@ genInvalidUTF8 = B.pack <$> oneof [
k <- choose (0,n)
vectorOf k gen

+ decodeLL :: BL.ByteString -> TL.Text
+ decodeLL = EL.decodeUtf8With E.lenientDecode
+
+ decodeL :: B.ByteString -> T.Text
+ decodeL = E.decodeUtf8With E.lenientDecode
+
+ -- The lenient decoding of lazy bytestrings should not depend on how they are chunked,
+ -- and it should behave the same as decoding of strict bytestrings.
+ t_decode_utf8_lenient :: Property
+ t_decode_utf8_lenient = forAllShrinkShow arbitrary shrink (show . BL.toChunks) $ \bs ->
+ decodeLL bs === (TL.fromStrict . decodeL . B.concat . BL.toChunks) bs
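
A concrete instance of the property (a hypothetical check, not in the patch): the same bytes chunked differently, including a two-byte code point split across a chunk boundary, must decode identically:

t_decode_utf8_lenient_example :: Bool
t_decode_utf8_lenient_example =
  decodeLL (BL.fromChunks [B.pack [0xC2], B.pack [0xA9]])  -- "©" split in two
    == decodeLL (BL.fromChunks [B.pack [0xC2, 0xA9]])      -- "©" in one chunk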

-- See http://unicode.org/faq/utf_bom.html#gen8
-- A sequence such as <110xxxxx2 0xxxxxxx2> is illegal ...
-- When faced with this illegal byte sequence ... a UTF-8 conformant process
@@ -206,6 +219,7 @@ testTranscoding =
testProperty "t_utf8_err'" t_utf8_err'
],
testGroup "error recovery" [
testProperty "t_decode_utf8_lenient" t_decode_utf8_lenient,
testProperty "t_decode_with_error2" t_decode_with_error2,
testProperty "t_decode_with_error3" t_decode_with_error3,
testProperty "t_decode_with_error4" t_decode_with_error4,
20 changes: 19 additions & 1 deletion tests/Tests/QuickCheckUtils.hs
@@ -49,6 +49,7 @@ import Test.QuickCheck.Monadic (assert, monadicIO, run)
import Test.QuickCheck.Unicode (string)
import Tests.Utils
import qualified Data.ByteString as B
+ import qualified Data.ByteString.Lazy as BL
import qualified Data.Text as T
import qualified Data.Text.Encoding.Error as T
import qualified Data.Text.Internal.Fusion as TF
@@ -61,6 +62,9 @@ import qualified System.IO as IO
genUnicode :: IsString a => Gen a
genUnicode = fromString <$> string

+ genWord8 :: Gen Word8
+ genWord8 = chooseAny

instance Random I16 where
randomR = integralRandomR
random = randomR (minBound,maxBound)
@@ -70,9 +74,23 @@ instance Arbitrary I16 where
shrink = shrinkIntegral

instance Arbitrary B.ByteString where
- arbitrary = B.pack `fmap` arbitrary
+ arbitrary = B.pack `fmap` listOf genWord8
shrink = map B.pack . shrink . B.unpack

+ instance Arbitrary BL.ByteString where
+ arbitrary = oneof
+ [ BL.fromChunks <$> arbitrary
+ -- so that a single UTF-8 code point can appear split across up to 4 chunks
+ , BL.fromChunks . map B.singleton <$> listOf genWord8
+ -- so that a code point with a four-byte UTF-8 representation
+ -- can appear split across 3 non-singleton chunks
+ , (\a b c -> BL.fromChunks [a, b, c])
+ <$> arbitrary
+ <*> ((\a b -> B.pack [a, b]) <$> genWord8 <*> genWord8)
+ <*> arbitrary
+ ]
+ shrink xs = BL.fromChunks <$> shrink (BL.toChunks xs)
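
To see what chunkings this instance actually produces, one could print a few samples with QuickCheck's `sample'` (a hypothetical debugging helper, not part of the patch). The second and third generators deliberately bias toward chunk boundaries inside multi-byte sequences, which is exactly where the pre-fix decoder went wrong:

printSampleChunkings :: IO ()
printSampleChunkings = do
  -- each printed line is one lazy ByteString, rendered chunk by chunk
  bss <- sample' (arbitrary :: Gen BL.ByteString)
  mapM_ (print . map B.unpack . BL.toChunks) bss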

-- For tests that have O(n^2) running times or input sizes, resize
-- their inputs to the square root of the originals.
unsquare :: (Arbitrary a, Show a, Testable b) => (a -> b) -> Property
9 changes: 9 additions & 0 deletions tests/Tests/Regressions.hs
@@ -20,6 +20,7 @@ import qualified Data.ByteString.Lazy as LB
import qualified Data.Text as T
import qualified Data.Text.Array as TA
import qualified Data.Text.Encoding as TE
+ import qualified Data.Text.Encoding.Error as E
import qualified Data.Text.Internal as T
import qualified Data.Text.IO as T
import qualified Data.Text.Lazy as LT
@@ -136,6 +137,13 @@ t301 = do
original@(T.Text originalArr originalOff originalLen) = T.pack "1234567890"
T.Text newArr _off _len = T.take 1 $ T.drop 1 original

+ t330 :: IO ()
+ t330 = do
+ let decodeL = LE.decodeUtf8With E.lenientDecode
+ assertEqual "The lenient decoding of lazy bytestrings should not depend on how they are chunked"
+ (decodeL (LB.fromChunks [B.pack [194], B.pack [97, 98, 99]]))
+ (decodeL (LB.fromChunks [B.pack [194, 97, 98, 99]]))
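
Under lenientDecode both chunkings should decode to "\65533abc": byte 194 (0xC2) opens a two-byte sequence that 97 ('a') cannot continue, so the lone lead byte is replaced with U+FFFD regardless of where the chunk boundary falls.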

tests :: F.TestTree
tests = F.testGroup "Regressions"
[ F.testCase "hGetContents_crash" hGetContents_crash
@@ -149,4 +157,5 @@ tests = F.testGroup "Regressions"
, F.testCase "t280/fromString" t280_fromString
, F.testCase "t280/singleton" t280_singleton
, F.testCase "t301" t301
, F.testCase "t330" t330
]