haskell · Bodigrim · Jan 27, 2024 · Jan 25, 2024 · Jan 25, 2024 · Jan 25, 2024
diff --git a/src/Data/Text/Internal/Validate.hs b/src/Data/Text/Internal/Validate.hs
@@ -37,14 +37,8 @@ import Data.Text.Internal.ByteStringCompat (withBS)
 import Data.Text.Internal.Unsafe (unsafeWithForeignPtr)
 import Data.Text.Internal.Validate.Simd (c_is_valid_utf8_bytearray_safe,c_is_valid_utf8_bytearray_unsafe,c_is_valid_utf8_ptr_unsafe)
 #else
-import GHC.Exts (ByteArray#)
-import Data.Text.Internal.Encoding.Utf8 (CodePoint(..),DecoderResult(..),utf8DecodeStart,utf8DecodeContinue)
-import GHC.Exts (Int(I#),indexWord8Array#)
-import GHC.Word (Word8(W8#))
 import qualified Data.ByteString as B
-#if !MIN_VERSION_bytestring(0,11,2)
-import qualified Data.ByteString.Unsafe as B
-#endif
+import qualified Data.Text.Internal.Validate.Native as N
 #endif
 
 -- | Is the ByteString a valid UTF-8 byte sequence?
@@ -56,21 +50,7 @@ isValidUtf8ByteString bs = withBS bs $ \fp len -> unsafeDupablePerformIO $
 #if MIN_VERSION_bytestring(0,11,2)
 isValidUtf8ByteString = B.isValidUtf8
 #else
-isValidUtf8ByteString bs = start 0
-  where
-    start ix
-      | ix >= B.length bs = True
-      | otherwise = case utf8DecodeStart (B.unsafeIndex bs ix) of
-        Accept{} -> start (ix + 1)
-        Reject{} -> False
-        Incomplete st _ -> step (ix + 1) st
-    step ix st
-      | ix >= B.length bs = False
-      -- We do not use decoded code point, so passing a dummy value to save an argument.
-      | otherwise = case utf8DecodeContinue (B.unsafeIndex bs ix) st (CodePoint 0) of
-        Accept{} -> start (ix + 1)
-        Reject{} -> False
-        Incomplete st' _ -> step (ix + 1) st'
+isValidUtf8ByteString = N.isValidUtf8ByteStringHaskell
 #endif
 #endif
 
@@ -103,7 +83,7 @@ isValidUtf8ByteArrayUnpinned ::
 isValidUtf8ByteArrayUnpinned (ByteArray bs) !off !len =
   unsafeDupablePerformIO $ (/= 0) <$> c_is_valid_utf8_bytearray_unsafe bs (fromIntegral off) (fromIntegral len)
 #else
-isValidUtf8ByteArrayUnpinned (ByteArray bs) = isValidUtf8ByteArrayHaskell# bs
+isValidUtf8ByteArrayUnpinned = N.isValidUtf8ByteArrayHaskell
 #endif
 
 -- | This uses the @safe@ FFI. GC may run concurrently with @safe@
@@ -120,30 +100,5 @@ isValidUtf8ByteArrayPinned ::
 isValidUtf8ByteArrayPinned (ByteArray bs) !off !len =
   unsafeDupablePerformIO $ (/= 0) <$> c_is_valid_utf8_bytearray_safe bs (fromIntegral off) (fromIntegral len)
 #else
-isValidUtf8ByteArrayPinned (ByteArray bs) = isValidUtf8ByteArrayHaskell# bs
-#endif
-
-#ifndef SIMDUTF
-isValidUtf8ByteArrayHaskell# ::
-     ByteArray# -- ^ Bytes
-  -> Int -- ^ Offset
-  -> Int -- ^ Length
-  -> Bool
-isValidUtf8ByteArrayHaskell# b !off !len = start off
-  where
-    indexWord8 :: ByteArray# -> Int -> Word8
-    indexWord8 !x (I# i) = W8# (indexWord8Array# x i)
-    start ix
-      | ix >= len = True
-      | otherwise = case utf8DecodeStart (indexWord8 b ix) of
-        Accept{} -> start (ix + 1)
-        Reject{} -> False
-        Incomplete st _ -> step (ix + 1) st
-    step ix st
-      | ix >= len = False
-      -- We do not use decoded code point, so passing a dummy value to save an argument.
-      | otherwise = case utf8DecodeContinue (indexWord8 b ix) st (CodePoint 0) of
-        Accept{} -> start (ix + 1)
-        Reject{} -> False
-        Incomplete st' _ -> step (ix + 1) st'
+isValidUtf8ByteArrayPinned = N.isValidUtf8ByteArrayHaskell
 #endif
diff --git a/src/Data/Text/Internal/Validate/Native.hs b/src/Data/Text/Internal/Validate/Native.hs
@@ -0,0 +1,60 @@
+{-# LANGUAGE BangPatterns #-}
+{-# LANGUAGE MagicHash #-}
+
+-- | Native implementation of 'Data.Text.Internal.Validate'.
+module Data.Text.Internal.Validate.Native
+  ( isValidUtf8ByteStringHaskell
+  , isValidUtf8ByteArrayHaskell
+  ) where
+
+import Data.Array.Byte (ByteArray(ByteArray))
+import Data.ByteString (ByteString)
+import GHC.Exts (ByteArray#,Int(I#),indexWord8Array#)
+import GHC.Word (Word8(W8#))
+import Data.Text.Internal.Encoding.Utf8 (CodePoint(..),DecoderResult(..),utf8DecodeStart,utf8DecodeContinue)
+import qualified Data.ByteString as B
+import qualified Data.ByteString.Unsafe as B
+
+-- | Native implementation of 'Data.Text.Internal.Validate.isValidUtf8ByteString'.
+isValidUtf8ByteStringHaskell :: ByteString -> Bool
+isValidUtf8ByteStringHaskell bs = start 0
+  where
+    start ix
+      | ix >= B.length bs = True
+      | otherwise = case utf8DecodeStart (B.unsafeIndex bs ix) of
+        Accept{} -> start (ix + 1)
+        Reject{} -> False
+        Incomplete st _ -> step (ix + 1) st
+    step ix st
+      | ix >= B.length bs = False
+      -- We do not use decoded code point, so passing a dummy value to save an argument.
+      | otherwise = case utf8DecodeContinue (B.unsafeIndex bs ix) st (CodePoint 0) of
+        Accept{} -> start (ix + 1)
+        Reject{} -> False
+        Incomplete st' _ -> step (ix + 1) st'
+
+-- | Native implementation of
+-- 'Data.Text.Internal.Validate.isValidUtf8ByteArrayUnpinned'
+-- and 'Data.Text.Internal.Validate.isValidUtf8ByteArrayPinned'.
+isValidUtf8ByteArrayHaskell ::
+     ByteArray -- ^ Bytes
+  -> Int -- ^ Offset
+  -> Int -- ^ Length
+  -> Bool
+isValidUtf8ByteArrayHaskell (ByteArray b) !off !len = start off
+  where
+    indexWord8 :: ByteArray# -> Int -> Word8
+    indexWord8 !x (I# i) = W8# (indexWord8Array# x i)
+    start ix
+      | ix >= off + len = True
+      | otherwise = case utf8DecodeStart (indexWord8 b ix) of
+        Accept{} -> start (ix + 1)
+        Reject{} -> False
+        Incomplete st _ -> step (ix + 1) st
+    step ix st
+      | ix >= off + len = False
+      -- We do not use decoded code point, so passing a dummy value to save an argument.
+      | otherwise = case utf8DecodeContinue (indexWord8 b ix) st (CodePoint 0) of
+        Accept{} -> start (ix + 1)
+        Reject{} -> False
+        Incomplete st' _ -> step (ix + 1) st'
diff --git a/tests/Tests/Properties.hs b/tests/Tests/Properties.hs
@@ -16,6 +16,7 @@ import Tests.Properties.Substrings (testSubstrings)
 import Tests.Properties.Read (testRead)
 import Tests.Properties.Text (testText)
 import Tests.Properties.Transcoding (testTranscoding)
+import Tests.Properties.Validate (testValidate)
 
 tests :: TestTree
 tests =
@@ -28,5 +29,6 @@ tests =
     testSubstrings,
     testBuilder,
     testLowLevel,
-    testRead
+    testRead,
+    testValidate
   ]
diff --git a/tests/Tests/Properties/Validate.hs b/tests/Tests/Properties/Validate.hs
@@ -0,0 +1,51 @@
+{-# LANGUAGE CPP #-}
+module Tests.Properties.Validate (testValidate) where
+
+import Data.Array.Byte (ByteArray)
+import Data.ByteString (ByteString)
+import qualified Data.ByteString as B
+import Data.ByteString.Short (toShort)
+import Data.Either (isRight)
+import Data.Text.Encoding (decodeUtf8', encodeUtf8)
+import Data.Text.Internal.Validate (isValidUtf8ByteString, isValidUtf8ByteArray)
+import Test.Tasty (TestTree, testGroup)
+import Test.Tasty.QuickCheck ((===), Gen, Property,
+  testProperty, arbitrary, forAllShrink, oneof, shrink)
+import Tests.QuickCheckUtils ()
+#if MIN_VERSION_bytestring(0,12,0)
+import Data.ByteString.Short (unShortByteString)
+#else
+#if MIN_VERSION_bytestring(0,11,1)
+import Data.ByteString.Short (ShortByteString(SBS))
+#else
+import Data.ByteString.Short.Internal (ShortByteString(SBS))
+#endif
+import Data.Array.Byte (ByteArray(ByteArray))
+
+unShortByteString :: ShortByteString -> ByteArray
+unShortByteString (SBS ba) = ByteArray ba
+#endif
+
+testValidate :: TestTree
+testValidate = testGroup "validate"
+  [ testProperty "bytestring" $ forAllShrink genByteString shrink $ \bs ->
+      isValidUtf8ByteString bs === isRight (decodeUtf8' bs)
+    -- We have all we need to shrink here but I'm too lazy to do that now.
+  , testProperty "bytearray" $ forAllByteArray $ \ba off len bs ->
+      isValidUtf8ByteArray ba off len === isRight (decodeUtf8' bs)
+  ]
+
+genByteString :: Gen ByteString
+genByteString = oneof
+  [ arbitrary
+  , encodeUtf8 <$> arbitrary
+  ]
+
+-- | We want to test 'isValidUtf8ByteArray' with various offsets, so we insert a random
+-- prefix and remember its length.
+forAllByteArray :: (ByteArray -> Int -> Int -> ByteString -> Property) -> Property
+forAllByteArray prop =
+  forAllShrink genByteString shrink $ \mainSlice ->
+  forAllShrink arbitrary shrink $ \prefix ->
+  let bs2ba = unShortByteString . toShort in
+  prop (bs2ba (prefix `B.append` mainSlice)) (B.length prefix) (B.length mainSlice) mainSlice
diff --git a/text.cabal b/text.cabal
@@ -192,6 +192,7 @@ library
     Data.Text.Internal.Unsafe
     Data.Text.Internal.Unsafe.Char
     Data.Text.Internal.Validate
+    Data.Text.Internal.Validate.Native
     Data.Text.Lazy
     Data.Text.Lazy.Builder
     Data.Text.Lazy.Builder.Int
@@ -274,6 +275,7 @@ test-suite tests
     Tests.Properties.Substrings
     Tests.Properties.Text
     Tests.Properties.Transcoding
+    Tests.Properties.Validate
     Tests.QuickCheckUtils
     Tests.RebindableSyntaxTest
     Tests.Regressions
@@ -294,7 +296,8 @@ test-suite tests
     template-haskell,
     transformers,
     text
-
+  if impl(ghc < 9.4)
+    build-depends: data-array-byte >= 0.1 && < 0.2
   -- Plugin infrastructure does not work properly in 8.6.1, and
   -- ghc-9.2.1 library depends on parsec, which causes a circular dependency.
   if impl(ghc >= 8.2.1 && < 8.6 || >= 8.6.2 && < 9.2 || >= 9.2.2)