Skip to content

Commit

Permalink
Further tuneup of cstring{,Utf8}
Browse files Browse the repository at this point in the history
Moved to Data.ByteString.Builder.Internal, as these no longer
have anything to do with 'BoundedPrim', and can benefit from
supporting internal code in their new home.
  • Loading branch information
hs-viktor committed Jan 14, 2023
1 parent 266d6da commit e6cc4a2
Show file tree
Hide file tree
Showing 7 changed files with 102 additions and 115 deletions.
78 changes: 77 additions & 1 deletion Data/ByteString/Builder/Internal.hs
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
{-# LANGUAGE ScopedTypeVariables, CPP, BangPatterns, RankNTypes, TupleSections #-}
{-# LANGUAGE Unsafe #-}
{-# LANGUAGE MagicHash, ViewPatterns, Unsafe #-}
{-# OPTIONS_HADDOCK not-home #-}
-- | Copyright : (c) 2010 - 2011 Simon Meier
-- License : BSD3-style (see LICENSE)
Expand Down Expand Up @@ -86,6 +86,8 @@ module Data.ByteString.Builder.Internal (
, byteStringCopy
, byteStringInsert
, byteStringThreshold
, cstring
, cstringUtf8

, lazyByteStringCopy
, lazyByteStringInsert
Expand Down Expand Up @@ -127,6 +129,7 @@ module Data.ByteString.Builder.Internal (
) where

import Control.Arrow (second)
import Control.Monad (when)

#if !(MIN_VERSION_base(4,11,0))
import Data.Semigroup (Semigroup((<>)))
Expand All @@ -140,10 +143,12 @@ import qualified Data.ByteString.Short.Internal as Sh
import qualified GHC.IO.Buffer as IO (Buffer(..), newByteBuffer)
import GHC.IO.Handle.Internals (wantWritableHandle, flushWriteBuffer)
import GHC.IO.Handle.Types (Handle__, haByteBuffer, haBufferMode)
import GHC.Exts
import System.IO (hFlush, BufferMode(..), Handle)
import Data.IORef

import Foreign
import Foreign.C.String (CString)
import Foreign.ForeignPtr.Unsafe (unsafeForeignPtrToPtr)
import System.IO.Unsafe (unsafeDupablePerformIO)

Expand Down Expand Up @@ -857,6 +862,77 @@ byteStringInsert :: S.ByteString -> Builder
byteStringInsert =
\bs -> builder $ \k (BufferRange op _) -> return $ insertChunk op bs k


------------------------------------------------------------------------------
-- Raw CString encoding
------------------------------------------------------------------------------

-- | A null-terminated ASCII encoded 'Foreign.C.String.CString'.
-- Null characters are not representable.
--
-- @since 0.11.5.0
{-# INLINABLE cstring #-}
cstring :: Addr# -> Builder
cstring = \addr -> builder $ \k br -> do
let ip = Ptr addr
#if __GLASGOW_HASKELL__ >= 811
ipe = Ptr (addr `plusAddr#` (cstringLength# addr))
#else
!ipe <- plusPtr ip . fromIntegral <$> S.c_strlen ip
#endif
wrappedBytesCopyStep (BufferRange ip ipe) k br

-- | A null-terminated UTF-8 encoded 'Foreign.C.String.CString'.
-- Null characters can be encoded as @0xc0 0x80@.
--
-- @since 0.11.5.0
cstringUtf8 :: Addr# -> Builder
cstringUtf8 = \addr0 -> builder $ \k br -> do
#if __GLASGOW_HASKELL__ >= 811
let len = cstringLength# addr0
#else
(I# len) <- fromIntegral <$> S.c_strlen (Ptr addr0)
#endif
nullAt <- c_strstr (Ptr addr0) (Ptr "\xc0\x80"#)
cstringUtf8_step addr0 len nullAt k br
{-# INLINABLE cstringUtf8 #-}

cstringUtf8_step :: Addr# -> Int# -> Ptr Word8 -> BuildStep r -> BuildStep r
cstringUtf8_step addr len ((== nullPtr) -> True) k br =
-- Contains no encoded nulls, use simple copy codepath
wrappedBytesCopyStep (BufferRange ip ipe) k br
where
ip = Ptr addr
ipe = Ptr (addr `plusAddr#` len)
cstringUtf8_step addr len !nullAt k (BufferRange op0 ope)
-- Copy as much of the null-free portion of the string as fits into the
-- available buffer space. If the string is long enough, we may have asked
-- for less than its full length, filling the buffer with the rest will go
-- into the next builder step.
| avail > nullFree = do
when (nullFree > 0) (S.memcpy op0 (Ptr addr) nullFree)
pokeElemOff op0 nullFree 0
let !op' = op0 `plusPtr` (nullFree + 1)
nread# = nullFree# +# 2#
addr' = addr `plusAddr#` nread#
len' = len -# nread#
nullAt' <- c_strstr (Ptr addr') (Ptr "\xc0\x80"#)
cstringUtf8_step addr' len' nullAt' k (BufferRange op' ope)
| otherwise = do
let !copy@(I# copy#) = min avail nullFree
when (copy > 0) (S.memcpy op0 (Ptr addr) copy)
let !op' = op0 `plusPtr` copy
addr' = addr `plusAddr#` copy#
len' = len -# copy#
return $ bufferFull 1 op' (cstringUtf8_step addr' len' nullAt k)
where
!avail = ope `minusPtr` op0
!nullFree@(I# nullFree#) = nullAt `minusPtr` (Ptr addr)

foreign import ccall unsafe "string.h strstr" c_strstr
:: CString -> CString -> IO (Ptr Word8)


-- Short bytestrings
------------------------------------------------------------------------------

Expand Down
98 changes: 4 additions & 94 deletions Data/ByteString/Builder/Prim.hs
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
{-# LANGUAGE BangPatterns, CPP, ScopedTypeVariables #-}
{-# LANGUAGE BangPatterns, ScopedTypeVariables #-}
{-# LANGUAGE MagicHash, UnboxedTuples, PatternGuards #-}
{-# OPTIONS_GHC -fno-warn-unused-imports #-}
{-# LANGUAGE Trustworthy #-}
Expand Down Expand Up @@ -433,8 +433,8 @@ module Data.ByteString.Builder.Prim (
-- a decimal number with UTF-8 encoded characters.
, charUtf8

, cstring
, cstringUtf8
, cstring -- Backwards-compatibility re-exports from Internal.hs
, cstringUtf8 -- these no longer make use of the BoundPrim API.

{-
-- * Testing support
Expand Down Expand Up @@ -468,6 +468,7 @@ import Data.ByteString.Builder.Prim.ASCII

import Foreign
import Foreign.C.Types
import Foreign.C.String (CString)
import Foreign.ForeignPtr.Unsafe (unsafeForeignPtrToPtr)
import GHC.Int (Int (..))
import GHC.Word (Word8 (..))
Expand Down Expand Up @@ -664,97 +665,6 @@ primMapLazyByteStringBounded w =
L.foldrChunks (\x b -> primMapByteStringBounded w x `mappend` b) mempty


------------------------------------------------------------------------------
-- Raw CString encoding
------------------------------------------------------------------------------

-- | A null-terminated ASCII encoded 'Foreign.C.String.CString'.
-- Null characters are not representable.
--
-- @since 0.11.0.0
cstring :: Addr# -> Builder
cstring = \addr0 -> builder $ \k br -> do
#if __GLASGOW_HASKELL__ >= 811
let len = cstringLength# addr0
#else
(I# len) <- fromIntegral <$> S.c_strlen (Ptr addr0)
#endif
cstring_step addr0 len k br
{-# INLINE cstring #-}

cstring_step :: Addr# -> Int# -> BuildStep r -> BuildStep r
cstring_step !addr !len !k br@(BufferRange op0 ope)
-- String is empty, process the continuation
| (I# len) == 0 = k br
-- Buffer is full, allocate some more... We ask for just one more
-- byte, but the builder allocation strategy will in practice give
-- us more space, which we'll consume in a single step.
| op0 == ope =
return $ bufferFull 1 op0 (cstring_step addr len k)
-- Copy as much of the string as fits into the available buffer space.
-- If the string is long enough, we may have asked for less than its
-- full length, filling the buffer with the rest will go into the next
-- builder step.
| otherwise = do
let !avail@(I# avail#) = min (I# len) (ope `minusPtr` op0)
br' = BufferRange (op0 `plusPtr` avail) ope
addr' = addr `plusAddr#` avail#
len' = len -# avail#
S.memcpy op0 (Ptr addr) avail
cstring_step addr' len' k br'

-- | A null-terminated UTF-8 encoded 'Foreign.C.String.CString'.
-- Null characters can be encoded as @0xc0 0x80@.
--
-- @since 0.11.0.0
cstringUtf8 :: Addr# -> Builder
cstringUtf8 = \addr0 -> builder $ \k br -> do
#if __GLASGOW_HASKELL__ >= 811
let len = cstringLength# addr0
#else
(I# len) <- fromIntegral <$> S.c_strlen (Ptr addr0)
#endif
nullAt <- S.c_strstr (Ptr addr0) (Ptr "\xc0\x80"#)
cstringUtf8_step addr0 len nullAt k br
{-# INLINE cstringUtf8 #-}

cstringUtf8_step :: Addr# -> Int# -> Ptr Word8 -> BuildStep r -> BuildStep r
cstringUtf8_step !addr !len !nullAt !k br@(BufferRange op0@(Ptr op0#) ope)
-- String is empty, process the continuation
| (I# len) == 0 = k br
-- Contains no encoded nulls, use simpler 'cstring' code
| nullPtr == nullAt =
cstring_step addr len k br
-- Buffer is full, allocate some more... We ask for just one more
-- byte, but the builder allocation strategy will in practice give
-- us more space, which we'll consume in a single step.
| op0 == ope =
return $ bufferFull 1 op0 (cstringUtf8_step addr len nullAt k)
-- We're at the encoded null-byte, append a '\0' to the buffer and
-- continue with the rest of the input string, after locating the
-- next encoded null-byte, if any.
| (Ptr addr) == nullAt = do
let !(W8# nullByte#) = 0
IO $ \s -> case writeWord8OffAddr# op0# 0# nullByte# s of
s' -> (# s', () #)
let br' = BufferRange (op0 `plusPtr` 1) ope
addr' = addr `plusAddr#` 2#
len' = len -# 2#
nullAt' <- S.c_strstr (Ptr addr') (Ptr "\xc0\x80"#)
cstringUtf8_step addr' len' nullAt' k br'
-- Copy as much of the null-free portion of the string as fits into the
-- available buffer space. If the string is long enough, we may have asked
-- for less than its full length, filling the buffer with the rest will go
-- into the next builder step.
| otherwise = do
let !nullFree = nullAt `minusPtr` (Ptr addr)
!avail@(I# avail#) = min nullFree (ope `minusPtr` op0)
br' = BufferRange (op0 `plusPtr` avail) ope
addr' = addr `plusAddr#` avail#
len' = len -# avail#
S.memcpy op0 (Ptr addr) avail
cstringUtf8_step addr' len' nullAt k br'

------------------------------------------------------------------------------
-- Char8 encoding
------------------------------------------------------------------------------
Expand Down
1 change: 0 additions & 1 deletion Data/ByteString/Internal.hs
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,6 @@ module Data.ByteString.Internal (

-- * Standard C Functions
c_strlen,
c_strstr,
c_free_finalizer,

memchr,
Expand Down
4 changes: 0 additions & 4 deletions Data/ByteString/Internal/Type.hs
Original file line number Diff line number Diff line change
Expand Up @@ -82,7 +82,6 @@ module Data.ByteString.Internal.Type (

-- * Standard C Functions
c_strlen,
c_strstr,
c_free_finalizer,

memchr,
Expand Down Expand Up @@ -1002,9 +1001,6 @@ accursedUnutterablePerformIO (IO m) = case m realWorld# of (# _, r #) -> r
foreign import ccall unsafe "string.h strlen" c_strlen
:: CString -> IO CSize

foreign import ccall unsafe "string.h strstr" c_strstr
:: CString -> CString -> IO (Ptr Word8)

foreign import ccall unsafe "static stdlib.h &free" c_free_finalizer
:: FunPtr (Ptr Word8 -> IO ())

Expand Down
2 changes: 2 additions & 0 deletions bench/BenchAll.hs
Original file line number Diff line number Diff line change
Expand Up @@ -259,6 +259,8 @@ main = do
, benchB' "UTF-8 String" () $ \() -> P.cstringUtf8 "hello world\0"#
, benchB' "String (naive)" "hello world!" fromString
, benchB' "String" () $ \() -> P.cstring "hello world!"#
, benchB' "AsciiLit64" () $ \() -> P.cstring "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"#
, benchB' "Utf8Lit64" () $ \() -> P.cstringUtf8 "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx\xc0\x80xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"#
]

, bgroup "Encoding wrappers"
Expand Down
15 changes: 1 addition & 14 deletions tests/builder/Data/ByteString/Builder/Prim/Tests.hs
Original file line number Diff line number Diff line change
Expand Up @@ -24,20 +24,7 @@ import Test.Tasty.QuickCheck

tests :: [TestTree]
tests = concat [ testsBinary, testsASCII, testsChar8, testsUtf8
, testsCombinatorsB, [testCString, testCStringUtf8] ]

testCString :: TestTree
testCString = testProperty "cstring" $
toLazyByteString (BP.cstring "hello world!"#) ==
LC.pack "hello" `L.append` L.singleton 0x20 `L.append` LC.pack "world!"

testCStringUtf8 :: TestTree
testCStringUtf8 = testProperty "cstringUtf8" $
toLazyByteString (BP.cstringUtf8 "hello\xc0\x80world\xc0\x80!"#) ==
LC.pack "hello" `L.append` L.singleton 0x00
`L.append` LC.pack "world"
`L.append` L.singleton 0x00
`L.append` LC.singleton '!'
, testsCombinatorsB ]

------------------------------------------------------------------------------
-- Binary
Expand Down
19 changes: 18 additions & 1 deletion tests/builder/Data/ByteString/Builder/Tests.hs
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@ import Data.Word
import qualified Data.ByteString as S
import qualified Data.ByteString.Internal as S
import qualified Data.ByteString.Lazy as L
import qualified Data.ByteString.Lazy.Char8 as LC
import qualified Data.ByteString.Short as Sh

import Data.ByteString.Builder
Expand Down Expand Up @@ -73,7 +74,8 @@ tests =
testsASCII ++
testsFloating ++
testsChar8 ++
testsUtf8
testsUtf8 ++
testCString


------------------------------------------------------------------------------
Expand Down Expand Up @@ -988,3 +990,18 @@ testsUtf8 =
[ testBuilderConstr "charUtf8" charUtf8_list charUtf8
, testBuilderConstr "stringUtf8" (foldMap charUtf8_list) stringUtf8
]

testCString :: [TestTree]
testCString =
[ testProperty "cstring" $
toLazyByteString (BI.cstring "hello world!"#) ==
LC.pack "hello" `L.append` L.singleton 0x20
`L.append` LC.pack "world!"
, testProperty "cstringUtf8" $
toLazyByteString (BI.cstringUtf8 "hello\xc0\x80\xc0\x80world\xc0\x80!"#) ==
LC.pack "hello" `L.append` L.singleton 0x00
`L.append` L.singleton 0x00
`L.append` LC.pack "world"
`L.append` L.singleton 0x00
`L.append` LC.singleton '!'
]

0 comments on commit e6cc4a2

Please sign in to comment.