Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix isValidUtf8ByteArray #553

Merged
merged 4 commits into from
Jan 27, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
53 changes: 4 additions & 49 deletions src/Data/Text/Internal/Validate.hs
Original file line number Diff line number Diff line change
Expand Up @@ -37,14 +37,8 @@ import Data.Text.Internal.ByteStringCompat (withBS)
import Data.Text.Internal.Unsafe (unsafeWithForeignPtr)
import Data.Text.Internal.Validate.Simd (c_is_valid_utf8_bytearray_safe,c_is_valid_utf8_bytearray_unsafe,c_is_valid_utf8_ptr_unsafe)
#else
import GHC.Exts (ByteArray#)
import Data.Text.Internal.Encoding.Utf8 (CodePoint(..),DecoderResult(..),utf8DecodeStart,utf8DecodeContinue)
import GHC.Exts (Int(I#),indexWord8Array#)
import GHC.Word (Word8(W8#))
import qualified Data.ByteString as B
#if !MIN_VERSION_bytestring(0,11,2)
import qualified Data.ByteString.Unsafe as B
#endif
import qualified Data.Text.Internal.Validate.Native as N
#endif

-- | Is the ByteString a valid UTF-8 byte sequence?
Expand All @@ -56,21 +50,7 @@ isValidUtf8ByteString bs = withBS bs $ \fp len -> unsafeDupablePerformIO $
#if MIN_VERSION_bytestring(0,11,2)
isValidUtf8ByteString = B.isValidUtf8
#else
isValidUtf8ByteString bs = start 0
where
start ix
| ix >= B.length bs = True
| otherwise = case utf8DecodeStart (B.unsafeIndex bs ix) of
Accept{} -> start (ix + 1)
Reject{} -> False
Incomplete st _ -> step (ix + 1) st
step ix st
| ix >= B.length bs = False
-- We do not use decoded code point, so passing a dummy value to save an argument.
| otherwise = case utf8DecodeContinue (B.unsafeIndex bs ix) st (CodePoint 0) of
Accept{} -> start (ix + 1)
Reject{} -> False
Incomplete st' _ -> step (ix + 1) st'
isValidUtf8ByteString = N.isValidUtf8ByteStringHaskell
#endif
#endif

Expand Down Expand Up @@ -103,7 +83,7 @@ isValidUtf8ByteArrayUnpinned ::
isValidUtf8ByteArrayUnpinned (ByteArray bs) !off !len =
unsafeDupablePerformIO $ (/= 0) <$> c_is_valid_utf8_bytearray_unsafe bs (fromIntegral off) (fromIntegral len)
#else
isValidUtf8ByteArrayUnpinned (ByteArray bs) = isValidUtf8ByteArrayHaskell# bs
isValidUtf8ByteArrayUnpinned = N.isValidUtf8ByteArrayHaskell
#endif

-- | This uses the @safe@ FFI. GC may run concurrently with @safe@
Expand All @@ -120,30 +100,5 @@ isValidUtf8ByteArrayPinned ::
isValidUtf8ByteArrayPinned (ByteArray bs) !off !len =
unsafeDupablePerformIO $ (/= 0) <$> c_is_valid_utf8_bytearray_safe bs (fromIntegral off) (fromIntegral len)
#else
isValidUtf8ByteArrayPinned (ByteArray bs) = isValidUtf8ByteArrayHaskell# bs
#endif

#ifndef SIMDUTF
isValidUtf8ByteArrayHaskell# ::
ByteArray# -- ^ Bytes
-> Int -- ^ Offset
-> Int -- ^ Length
-> Bool
isValidUtf8ByteArrayHaskell# b !off !len = start off
where
indexWord8 :: ByteArray# -> Int -> Word8
indexWord8 !x (I# i) = W8# (indexWord8Array# x i)
start ix
| ix >= len = True
| otherwise = case utf8DecodeStart (indexWord8 b ix) of
Accept{} -> start (ix + 1)
Reject{} -> False
Incomplete st _ -> step (ix + 1) st
step ix st
| ix >= len = False
-- We do not use decoded code point, so passing a dummy value to save an argument.
| otherwise = case utf8DecodeContinue (indexWord8 b ix) st (CodePoint 0) of
Accept{} -> start (ix + 1)
Reject{} -> False
Incomplete st' _ -> step (ix + 1) st'
isValidUtf8ByteArrayPinned = N.isValidUtf8ByteArrayHaskell
#endif
60 changes: 60 additions & 0 deletions src/Data/Text/Internal/Validate/Native.hs
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
{-# LANGUAGE BangPatterns #-}
{-# LANGUAGE MagicHash #-}

-- | Native implementation of 'Data.Text.Internal.Validate'.
module Data.Text.Internal.Validate.Native
( isValidUtf8ByteStringHaskell
, isValidUtf8ByteArrayHaskell
) where

import Data.Array.Byte (ByteArray(ByteArray))
import Data.ByteString (ByteString)
import GHC.Exts (ByteArray#,Int(I#),indexWord8Array#)
import GHC.Word (Word8(W8#))
import Data.Text.Internal.Encoding.Utf8 (CodePoint(..),DecoderResult(..),utf8DecodeStart,utf8DecodeContinue)
import qualified Data.ByteString as B
import qualified Data.ByteString.Unsafe as B

-- | Native implementation of 'Data.Text.Internal.Validate.isValidUtf8ByteString'.
isValidUtf8ByteStringHaskell :: ByteString -> Bool
isValidUtf8ByteStringHaskell bs = start 0
where
start ix
| ix >= B.length bs = True
| otherwise = case utf8DecodeStart (B.unsafeIndex bs ix) of
Accept{} -> start (ix + 1)
Reject{} -> False
Incomplete st _ -> step (ix + 1) st
step ix st
| ix >= B.length bs = False
-- We do not use decoded code point, so passing a dummy value to save an argument.
| otherwise = case utf8DecodeContinue (B.unsafeIndex bs ix) st (CodePoint 0) of
Accept{} -> start (ix + 1)
Reject{} -> False
Incomplete st' _ -> step (ix + 1) st'

-- | Native implementation of
-- 'Data.Text.Internal.Validate.isValidUtf8ByteArrayUnpinned'
-- and 'Data.Text.Internal.Validate.isValidUtf8ByteArrayPinned'.
isValidUtf8ByteArrayHaskell ::
ByteArray -- ^ Bytes
-> Int -- ^ Offset
-> Int -- ^ Length
-> Bool
isValidUtf8ByteArrayHaskell (ByteArray b) !off !len = start off
where
indexWord8 :: ByteArray# -> Int -> Word8
indexWord8 !x (I# i) = W8# (indexWord8Array# x i)
start ix
| ix >= off + len = True
| otherwise = case utf8DecodeStart (indexWord8 b ix) of
Accept{} -> start (ix + 1)
Reject{} -> False
Incomplete st _ -> step (ix + 1) st
step ix st
| ix >= off + len = False
-- We do not use decoded code point, so passing a dummy value to save an argument.
| otherwise = case utf8DecodeContinue (indexWord8 b ix) st (CodePoint 0) of
Accept{} -> start (ix + 1)
Reject{} -> False
Incomplete st' _ -> step (ix + 1) st'
4 changes: 3 additions & 1 deletion tests/Tests/Properties.hs
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ import Tests.Properties.Substrings (testSubstrings)
import Tests.Properties.Read (testRead)
import Tests.Properties.Text (testText)
import Tests.Properties.Transcoding (testTranscoding)
import Tests.Properties.Validate (testValidate)

tests :: TestTree
tests =
Expand All @@ -28,5 +29,6 @@ tests =
testSubstrings,
testBuilder,
testLowLevel,
testRead
testRead,
testValidate
]
51 changes: 51 additions & 0 deletions tests/Tests/Properties/Validate.hs
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
{-# LANGUAGE CPP #-}
module Tests.Properties.Validate (testValidate) where

import Data.Array.Byte (ByteArray)
import Data.ByteString (ByteString)
import qualified Data.ByteString as B
import Data.ByteString.Short (toShort)
import Data.Either (isRight)
import Data.Text.Encoding (decodeUtf8', encodeUtf8)
import Data.Text.Internal.Validate (isValidUtf8ByteString, isValidUtf8ByteArray)
import Test.Tasty (TestTree, testGroup)
import Test.Tasty.QuickCheck ((===), Gen, Property,
testProperty, arbitrary, forAllShrink, oneof, shrink)
import Tests.QuickCheckUtils ()
#if MIN_VERSION_bytestring(0,12,0)
import Data.ByteString.Short (unShortByteString)
#else
#if MIN_VERSION_bytestring(0,11,1)
import Data.ByteString.Short (ShortByteString(SBS))
#else
import Data.ByteString.Short.Internal (ShortByteString(SBS))
#endif
import Data.Array.Byte (ByteArray(ByteArray))

unShortByteString :: ShortByteString -> ByteArray
unShortByteString (SBS ba) = ByteArray ba
#endif

testValidate :: TestTree
testValidate = testGroup "validate"
[ testProperty "bytestring" $ forAllShrink genByteString shrink $ \bs ->
isValidUtf8ByteString bs === isRight (decodeUtf8' bs)
-- We have all we need to shrink here but I'm too lazy to do that now.
, testProperty "bytearray" $ forAllByteArray $ \ba off len bs ->
isValidUtf8ByteArray ba off len === isRight (decodeUtf8' bs)
]

genByteString :: Gen ByteString
genByteString = oneof
[ arbitrary
, encodeUtf8 <$> arbitrary
]

-- | We want to test 'isValidUtf8ByteArray' with various offsets, so we insert a random
-- prefix and remember its length.
forAllByteArray :: (ByteArray -> Int -> Int -> ByteString -> Property) -> Property
forAllByteArray prop =
forAllShrink genByteString shrink $ \mainSlice ->
forAllShrink arbitrary shrink $ \prefix ->
let bs2ba = unShortByteString . toShort in
prop (bs2ba (prefix `B.append` mainSlice)) (B.length prefix) (B.length mainSlice) mainSlice
5 changes: 4 additions & 1 deletion text.cabal
Original file line number Diff line number Diff line change
Expand Up @@ -192,6 +192,7 @@ library
Data.Text.Internal.Unsafe
Data.Text.Internal.Unsafe.Char
Data.Text.Internal.Validate
Data.Text.Internal.Validate.Native
Data.Text.Lazy
Data.Text.Lazy.Builder
Data.Text.Lazy.Builder.Int
Expand Down Expand Up @@ -274,6 +275,7 @@ test-suite tests
Tests.Properties.Substrings
Tests.Properties.Text
Tests.Properties.Transcoding
Tests.Properties.Validate
Tests.QuickCheckUtils
Tests.RebindableSyntaxTest
Tests.Regressions
Expand All @@ -294,7 +296,8 @@ test-suite tests
template-haskell,
transformers,
text

if impl(ghc < 9.4)
build-depends: data-array-byte >= 0.1 && < 0.2
-- Plugin infrastructure does not work properly in 8.6.1, and
-- ghc-9.2.1 library depends on parsec, which causes a circular dependency.
if impl(ghc >= 8.2.1 && < 8.6 || >= 8.6.2 && < 9.2 || >= 9.2.2)
Expand Down
Loading