Fix UTF-8 decoding of lazy bytestrings
At the beginning of a new chunk we may be trying to complete a UTF-8
sequence started in the previous chunk (contained in the `undecoded0`
buffer). If it turns out to be invalid, we must apply the `onErr`
handler to every byte in that buffer.

When we reach the end of the chunk, we must also be more careful
about when to keep the previous buffer: a UTF-8 sequence (up to 4 bytes)
can span more than two chunks, when those chunks are very short
(of length 0, 1, or 2).
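
A minimal reproduction of the problem, mirroring the t330 regression test added below (imports and aliases are illustrative):

import qualified Data.ByteString as B
import qualified Data.ByteString.Lazy as BL
import qualified Data.Text.Encoding.Error as E
import qualified Data.Text.Lazy.Encoding as TLE

-- 0xC2 opens a two-byte sequence, but 0x61 ('a') is not a continuation
-- byte, so the 0xC2 is invalid; lenient decoding should replace it with
-- U+FFFD regardless of chunking.
main :: IO ()
main = do
  let decode = TLE.decodeUtf8With E.lenientDecode
  print (decode (BL.fromChunks [B.pack [0xC2], B.pack [0x61, 0x62, 0x63]]))
  print (decode (BL.fromChunks [B.pack [0xC2, 0x61, 0x62, 0x63]]))
  -- Both lines should print "\65533abc"; before this fix the chunked
  -- form differed.
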
Lysxia committed May 22, 2021
1 parent 8d1b6ff commit c9874d3
Showing 5 changed files with 78 additions and 20 deletions.
45 changes: 29 additions & 16 deletions src/Data/Text/Encoding.hs
@@ -64,6 +64,7 @@ import Control.Monad.ST (runST)
import Data.Bits ((.&.))
import Data.ByteString as B
import qualified Data.ByteString.Internal as B
+import Data.Foldable (traverse_)
import Data.Text.Encoding.Error (OnDecodeError, UnicodeException, strictDecode)
import Data.Text.Internal (Text(..), safe, text)
import Data.Text.Internal.Functions
@@ -275,19 +276,22 @@ newtype DecoderState = DecoderState Word32 deriving (Eq, Show, Num, Storable)
streamDecodeUtf8 :: ByteString -> Decoding
streamDecodeUtf8 = streamDecodeUtf8With strictDecode

--- | Decode, in a stream oriented way, a 'ByteString' containing UTF-8
+-- | Decode, in a stream oriented way, a lazy 'ByteString' containing UTF-8
-- encoded text.
--
-- @since 1.0.0.0
streamDecodeUtf8With :: OnDecodeError -> ByteString -> Decoding
streamDecodeUtf8With onErr = decodeChunk B.empty 0 0
where
-- We create a slightly larger than necessary buffer to accommodate a
--- potential surrogate pair started in the last buffer
+-- potential surrogate pair started in the last buffer (@undecoded0@), or
+-- replacement characters for each byte in @undecoded0@ if the
+-- sequence turns out to be invalid. There can be up to three bytes there,
+-- hence we allocate @len+3@ 16-bit words.
decodeChunk :: ByteString -> CodePoint -> DecoderState -> ByteString
-> Decoding
decodeChunk undecoded0 codepoint0 state0 bs = withBS bs aux where
-aux fp len = runST $ (unsafeIOToST . decodeChunkToBuffer) =<< A.new (len+1)
+aux fp len = runST $ (unsafeIOToST . decodeChunkToBuffer) =<< A.new (len+3)
where
decodeChunkToBuffer :: A.MArray s -> IO Decoding
decodeChunkToBuffer dest = unsafeWithForeignPtr fp $ \ptr ->
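
The @len+3@ bound covers the worst case: @undecoded0@ can hold at most the first three bytes of a four-byte sequence, and invalidating it emits at most one UTF-16 code unit (U+FFFD) per byte on top of the @len@ units for the chunk itself. A hypothetical GHCi session, assuming Data.Text.Encoding and Data.Text.Encoding.Error are in scope:

-- ghci> let Some _ undecoded k = streamDecodeUtf8With lenientDecode (B.pack [0xF0, 0x9F, 0x92])
-- ghci> undecoded
-- "\240\159\146"            -- three bytes buffered, nothing decoded yet
-- ghci> k (B.pack [0x61])   -- 0x61 cannot continue the sequence
-- Some "\65533\65533\65533a" "" _
-- Four code units are written while decoding a chunk of length 1,
-- which is why the old @len+1@ allocation was too small.
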
Expand All @@ -297,23 +301,32 @@ streamDecodeUtf8With onErr = decodeChunk B.empty 0 0
with nullPtr $ \curPtrPtr ->
let end = ptr `plusPtr` len
loop curPtr = do
+prevState <- peek statePtr
poke curPtrPtr curPtr
-curPtr' <- c_decode_utf8_with_state (A.maBA dest) destOffPtr
+lastPtr <- c_decode_utf8_with_state (A.maBA dest) destOffPtr
curPtrPtr end codepointPtr statePtr
state <- peek statePtr
case state of
UTF8_REJECT -> do
-- We encountered an encoding error
-x <- peek curPtr'
poke statePtr 0
-case onErr desc (Just x) of
-Nothing -> loop $ curPtr' `plusPtr` 1
-Just c -> do
-destOff <- peek destOffPtr
-w <- unsafeSTToIO $
-unsafeWrite dest (fromIntegral destOff) (safe c)
-poke destOffPtr (destOff + fromIntegral w)
-loop $ curPtr' `plusPtr` 1
+let skipByte x = case onErr desc (Just x) of
+Nothing -> return ()
+Just c -> do
+destOff <- peek destOffPtr
+w <- unsafeSTToIO $
+unsafeWrite dest (fromIntegral destOff) (safe c)
+poke destOffPtr (destOff + fromIntegral w)
+if ptr == lastPtr && prevState /= UTF8_ACCEPT then do
+-- If we can't complete the sequence @undecoded0@ from
+-- the previous chunk, we invalidate the bytes from
+-- @undecoded0@ and retry decoding the current chunk from
+-- the initial state.
+traverse_ skipByte (B.unpack undecoded0)
+loop lastPtr
+else do
+peek lastPtr >>= skipByte
+loop (lastPtr `plusPtr` 1)

_ -> do
-- We encountered the end of the buffer while decoding
@@ -322,11 +335,11 @@ streamDecodeUtf8With onErr = decodeChunk B.empty 0 0
chunkText <- unsafeSTToIO $ do
arr <- A.unsafeFreeze dest
return $! text arr 0 (fromIntegral n)
-lastPtr <- peek curPtrPtr
-let left = lastPtr `minusPtr` curPtr
+let left = lastPtr `minusPtr` ptr
!undecoded = case state of
UTF8_ACCEPT -> B.empty
-_ -> B.append undecoded0 (B.drop left bs)
+_ | left == 0 && prevState /= UTF8_ACCEPT -> B.append undecoded0 bs
+| otherwise -> B.drop left bs
return $ Some chunkText undecoded
(decodeChunk undecoded codepoint state)
in loop ptr
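
Together the two changes handle a sequence spanning three or more chunks: the end-of-chunk case keeps growing @undecoded0@ while @left == 0@, and the error case then invalidates every buffered byte at once. A hypothetical GHCi session with the lenient handler:

-- ghci> let Some _ u1 k1 = streamDecodeUtf8With lenientDecode (B.pack [0xF0])
-- ghci> let Some _ u2 k2 = k1 (B.pack [0x9F])
-- ghci> u2
-- "\240\159"                -- both bytes of the unfinished sequence kept
-- ghci> k2 (B.pack [0x61])
-- Some "\65533\65533a" "" _ -- one replacement per buffered byte, then 'a'
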
10 changes: 7 additions & 3 deletions src/Data/Text/Lazy/Encoding.hs
@@ -100,9 +100,13 @@ decodeUtf8With onErr (B.Chunk b0 bs0) =
TE.Some t l f -> chunk t (go f l bs)
go _ l _
| S.null l = empty
-| otherwise = case onErr desc (Just (B.unsafeHead l)) of
-Nothing -> empty
-Just c -> Chunk (T.singleton c) Empty
+| otherwise =
+let !t = T.pack (skipBytes l)
+skipBytes = S.foldr (\x s' ->
+case onErr desc (Just x) of
+Just c -> c : s'
+Nothing -> s') [] in
+Chunk t Empty
desc = "Data.Text.Lazy.Encoding.decodeUtf8With: Invalid UTF-8 stream"
decodeUtf8With _ _ = empty
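
With this change the final leftover bytes are each passed through onErr (via skipBytes) instead of consulting it once for the whole buffer. A hypothetical GHCi session:

-- ghci> decodeUtf8With lenientDecode (BL.pack [0x61, 0xF0, 0x9F, 0x92])
-- "a\65533\65533\65533"     -- one U+FFFD per leftover byte; the old code
--                           -- produced a single U+FFFD for all three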

14 changes: 14 additions & 0 deletions tests/Tests/Properties/Transcoding.hs
@@ -24,6 +24,7 @@ import qualified Data.ByteString as B
import qualified Data.ByteString.Lazy as BL
import qualified Data.Text as T
import qualified Data.Text.Encoding as E
+import qualified Data.Text.Encoding.Error as E
import qualified Data.Text.Lazy as TL
import qualified Data.Text.Lazy.Encoding as EL

@@ -152,6 +153,18 @@ genInvalidUTF8 = B.pack <$> oneof [
k <- choose (0,n)
vectorOf k gen

+decodeLL :: BL.ByteString -> TL.Text
+decodeLL = EL.decodeUtf8With E.lenientDecode
+
+decodeL :: B.ByteString -> T.Text
+decodeL = E.decodeUtf8With E.lenientDecode
+
+-- The lenient decoding of lazy bytestrings should not depend on how they are chunked,
+-- and it should behave the same as decoding of strict bytestrings.
+t_decode_utf8_lenient :: Property
+t_decode_utf8_lenient = forAllShrinkShow arbitrary shrink (show . BL.toChunks) $ \bs ->
+decodeLL bs === (TL.fromStrict . decodeL . B.concat . BL.toChunks) bs
+
-- See http://unicode.org/faq/utf_bom.html#gen8
-- A sequence such as <110xxxxx2 0xxxxxxx2> is illegal ...
-- When faced with this illegal byte sequence ... a UTF-8 conformant process
Expand Down Expand Up @@ -206,6 +219,7 @@ testTranscoding =
testProperty "t_utf8_err'" t_utf8_err'
],
testGroup "error recovery" [
testProperty "t_decode_utf8_lenient" t_decode_utf8_lenient,
testProperty "t_decode_with_error2" t_decode_with_error2,
testProperty "t_decode_with_error3" t_decode_with_error3,
testProperty "t_decode_with_error4" t_decode_with_error4,
20 changes: 19 additions & 1 deletion tests/Tests/QuickCheckUtils.hs
@@ -49,6 +49,7 @@ import Test.QuickCheck.Monadic (assert, monadicIO, run)
import Test.QuickCheck.Unicode (string)
import Tests.Utils
import qualified Data.ByteString as B
+import qualified Data.ByteString.Lazy as BL
import qualified Data.Text as T
import qualified Data.Text.Encoding.Error as T
import qualified Data.Text.Internal.Fusion as TF
@@ -61,6 +62,9 @@ import qualified System.IO as IO
genUnicode :: IsString a => Gen a
genUnicode = fromString <$> string

+genWord8 :: Gen Word8
+genWord8 = chooseAny
+
instance Random I16 where
randomR = integralRandomR
random = randomR (minBound,maxBound)
Expand All @@ -70,9 +74,23 @@ instance Arbitrary I16 where
shrink = shrinkIntegral

instance Arbitrary B.ByteString where
-arbitrary = B.pack `fmap` arbitrary
+arbitrary = B.pack `fmap` listOf genWord8
shrink = map B.pack . shrink . B.unpack

+instance Arbitrary BL.ByteString where
+arbitrary = oneof
+[ BL.fromChunks <$> arbitrary
+-- so that a single utf8 code point could appear split over up to 4 chunks
+, BL.fromChunks . map B.singleton <$> listOf genWord8
+-- so that a code point with 4 byte long utf8 representation
+-- could appear split over 3 non-singleton chunks
+, (\a b c -> BL.fromChunks [a, b, c])
+<$> arbitrary
+<*> ((\a b -> B.pack [a, b]) <$> genWord8 <*> genWord8)
+<*> arbitrary
+]
+shrink xs = BL.fromChunks <$> shrink (BL.toChunks xs)
+
-- For tests that have O(n^2) running times or input sizes, resize
-- their inputs to the square root of the originals.
unsquare :: (Arbitrary a, Show a, Testable b) => (a -> b) -> Property
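
The Arbitrary instance for lazy bytestrings above is deliberately biased toward pathological chunkings. A quick, hypothetical way to inspect what it produces in GHCi:

-- ghci> sample' (arbitrary :: Gen BL.ByteString) >>= mapM_ (print . BL.toChunks)
-- Empty, singleton, and two-byte chunks show up regularly, so UTF-8
-- sequences straddling several chunk boundaries are actually exercised.
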
9 changes: 9 additions & 0 deletions tests/Tests/Regressions.hs
@@ -20,6 +20,7 @@ import qualified Data.ByteString.Lazy as LB
import qualified Data.Text as T
import qualified Data.Text.Array as TA
import qualified Data.Text.Encoding as TE
+import qualified Data.Text.Encoding.Error as E
import qualified Data.Text.Internal as T
import qualified Data.Text.IO as T
import qualified Data.Text.Lazy as LT
@@ -136,6 +137,13 @@ t301 = do
original@(T.Text originalArr originalOff originalLen) = T.pack "1234567890"
T.Text newArr _off _len = T.take 1 $ T.drop 1 original

+t330 :: IO ()
+t330 = do
+let decodeL = LE.decodeUtf8With E.lenientDecode
+assertEqual "The lenient decoding of lazy bytestrings should not depend on how they are chunked"
+(decodeL (LB.fromChunks [B.pack [194], B.pack [97, 98, 99]]))
+(decodeL (LB.fromChunks [B.pack [194, 97, 98, 99]]))
+
tests :: F.TestTree
tests = F.testGroup "Regressions"
[ F.testCase "hGetContents_crash" hGetContents_crash
@@ -149,4 +157,5 @@ tests = F.testGroup "Regressions"
, F.testCase "t280/fromString" t280_fromString
, F.testCase "t280/singleton" t280_singleton
, F.testCase "t301" t301
, F.testCase "t330" t330
]
