]> git.immae.eu Git - github/fretlink/text-pipes.git/blame - Pipes/Text/Internal.hs
renamed fold foldChars and began updating documentation
[github/fretlink/text-pipes.git] / Pipes / Text / Internal.hs
CommitLineData
64e03122 1{-# LANGUAGE BangPatterns, CPP, ForeignFunctionInterface #-}
2{-# LANGUAGE GeneralizedNewtypeDeriving, MagicHash, UnliftedFFITypes #-}
3{-# LANGUAGE DeriveDataTypeable, RankNTypes #-}
4
c9d1c945 5-- This module lifts assorted materials from Brian O'Sullivan's text package
8c482809 6-- especially Data.Text.Encoding in order to define a pipes-appropriate
7-- streamDecodeUtf8
8module Pipes.Text.Internal
9 ( Decoding(..)
8c482809 10 , streamDecodeUtf8
3694350a 11 , decodeSomeUtf8
64e03122 12 , Codec(..)
13 , TextException(..)
14 , utf8
8c482809 15 ) where
8c482809 16import Control.Monad.ST.Unsafe (unsafeIOToST, unsafeSTToIO)
17import Control.Monad.ST (ST, runST)
8c482809 18import Data.Bits ((.&.))
3694350a 19import Data.ByteString as B
64e03122 20import Data.ByteString (ByteString)
3694350a 21import Data.ByteString.Internal as B
64e03122 22import Data.ByteString.Char8 as B8
23import Data.Text (Text)
24import qualified Data.Text as T
25import qualified Data.Text.Encoding as TE
3694350a 26import Data.Text.Encoding.Error ()
27import Data.Text.Internal (Text, textP)
8c482809 28import Foreign.C.Types (CSize)
29import Foreign.ForeignPtr (withForeignPtr)
30import Foreign.Marshal.Utils (with)
31import Foreign.Ptr (Ptr, minusPtr, nullPtr, plusPtr)
32import Foreign.Storable (Storable, peek, poke)
3694350a 33import GHC.Base (Char(..), Int(..), MutableByteArray#, ord#, iShiftRA#)
34import GHC.Word (Word8, Word32)
8c482809 35import qualified Data.Text.Array as A
64e03122 36import Data.Word (Word8, Word16)
37import System.IO.Unsafe (unsafePerformIO)
38import qualified Control.Exception as Exc
39import Data.Bits ((.&.), (.|.), shiftL)
40import Data.Typeable
41import Control.Arrow (first)
42import Data.Maybe (catMaybes)
8c482809 43#include "pipes_text_cbits.h"
44
64e03122 45
46-- | A specific character encoding.
47--
48-- Since 0.3.0
49data Codec = Codec
50 { codecName :: Text
51 , codecEncode :: Text -> (ByteString, Maybe (TextException, Text))
52 , codecDecode :: ByteString -> Decoding -- (Text, Either (TextException, ByteString) ByteString)
53 }
54
55instance Show Codec where
56 showsPrec d c = showParen (d > 10) $ showString "Codec " . shows (codecName c)
57
58-- Since 0.3.0
59data TextException = DecodeException Codec Word8
60 | EncodeException Codec Char
61 | LengthExceeded Int
62 | TextException Exc.SomeException
63 deriving (Show, Typeable)
64instance Exc.Exception TextException
65
66toDecoding :: (ByteString -> (Text, Either (TextException, ByteString) ByteString))
67 -> (ByteString -> Decoding)
68toDecoding op = loop B.empty where
69 loop extra bs0 = case op (B.append extra bs0) of
70 (txt, Right bs) -> Some txt bs (loop bs)
71 (txt, Left (_,bs)) -> Other txt bs
72
73
74splitSlowly :: (ByteString -> Text)
75 -> ByteString
76 -> (Text, Either (TextException, ByteString) ByteString)
77splitSlowly dec bytes = valid where
78 valid:_ = catMaybes $ Prelude.map decFirst $ splits (B.length bytes)
79 splits 0 = [(B.empty, bytes)]
80 splits n = B.splitAt n bytes : splits (n - 1)
81 decFirst (a, b) = case tryEvaluate (dec a) of
82 Left _ -> Nothing
83 Right text -> let trouble = case tryEvaluate (dec b) of
84 Left exc -> Left (TextException exc, b)
85 Right _ -> Right B.empty
86 in Just (text, trouble)
87 -- this case shouldn't occur,
88 -- since splitSlowly is only called
89 -- when parsing failed somewhere
90
91utf8 :: Codec
92utf8 = Codec name enc (toDecoding dec) where
93 name = T.pack "UTF-8"
94 enc text = (TE.encodeUtf8 text, Nothing)
95 dec bytes = case decodeSomeUtf8 bytes of
96 (t,b) -> (t, Right b)
97
98-- -- Whether the given byte is a continuation byte.
99-- isContinuation byte = byte .&. 0xC0 == 0x80
100--
101-- -- The number of continuation bytes needed by the given
102-- -- non-continuation byte. Returns -1 for an illegal UTF-8
103-- -- non-continuation byte and the whole split quickly must fail so
104-- -- as the input is passed to TE.decodeUtf8, which will issue a
105-- -- suitable error.
106-- required x0
107-- | x0 .&. 0x80 == 0x00 = 0
108-- | x0 .&. 0xE0 == 0xC0 = 1
109-- | x0 .&. 0xF0 == 0xE0 = 2
110-- | x0 .&. 0xF8 == 0xF0 = 3
111-- | otherwise = -1
112--
113-- splitQuickly bytes
114-- | B.null l || req == -1 = Nothing
115-- | req == B.length r = Just (TE.decodeUtf8 bytes, B.empty)
116-- | otherwise = Just (TE.decodeUtf8 l', r')
117-- where
118-- (l, r) = B.spanEnd isContinuation bytes
119-- req = required (B.last l)
120-- l' = B.init l
121-- r' = B.cons (B.last l) r
122
123-- |
124-- Since 0.3.0
125utf16_le :: Codec
126utf16_le = Codec name enc (toDecoding dec) where
127 name = T.pack "UTF-16-LE"
128 enc text = (TE.encodeUtf16LE text, Nothing)
129 dec bytes = case splitQuickly bytes of
130 Just (text, extra) -> (text, Right extra)
131 Nothing -> splitSlowly TE.decodeUtf16LE bytes
132
133 splitQuickly bytes = maybeDecode (loop 0) where
134 maxN = B.length bytes
135
136 loop n | n == maxN = decodeAll
137 | (n + 1) == maxN = decodeTo n
138 loop n = let
139 req = utf16Required
140 (B.index bytes n)
141 (B.index bytes (n + 1))
142 decodeMore = loop $! n + req
143 in if n + req > maxN
144 then decodeTo n
145 else decodeMore
146
147 decodeTo n = first TE.decodeUtf16LE (B.splitAt n bytes)
148 decodeAll = (TE.decodeUtf16LE bytes, B.empty)
149
150-- |
151-- Since 0.3.0
152utf16_be :: Codec
153utf16_be = Codec name enc (toDecoding dec) where
154 name = T.pack "UTF-16-BE"
155 enc text = (TE.encodeUtf16BE text, Nothing)
156 dec bytes = case splitQuickly bytes of
157 Just (text, extra) -> (text, Right extra)
158 Nothing -> splitSlowly TE.decodeUtf16BE bytes
159
160 splitQuickly bytes = maybeDecode (loop 0) where
161 maxN = B.length bytes
162
163 loop n | n == maxN = decodeAll
164 | (n + 1) == maxN = decodeTo n
165 loop n = let
166 req = utf16Required
167 (B.index bytes (n + 1))
168 (B.index bytes n)
169 decodeMore = loop $! n + req
170 in if n + req > maxN
171 then decodeTo n
172 else decodeMore
173
174 decodeTo n = first TE.decodeUtf16BE (B.splitAt n bytes)
175 decodeAll = (TE.decodeUtf16BE bytes, B.empty)
176
177utf16Required :: Word8 -> Word8 -> Int
178utf16Required x0 x1 = if x >= 0xD800 && x <= 0xDBFF then 4 else 2 where
179 x :: Word16
180 x = (fromIntegral x1 `shiftL` 8) .|. fromIntegral x0
181
182-- |
183-- Since 0.3.0
184utf32_le :: Codec
185utf32_le = Codec name enc (toDecoding dec) where
186 name = T.pack "UTF-32-LE"
187 enc text = (TE.encodeUtf32LE text, Nothing)
188 dec bs = case utf32SplitBytes TE.decodeUtf32LE bs of
189 Just (text, extra) -> (text, Right extra)
190 Nothing -> splitSlowly TE.decodeUtf32LE bs
191
192-- |
193-- Since 0.3.0
194utf32_be :: Codec
195utf32_be = Codec name enc (toDecoding dec) where
196 name = T.pack "UTF-32-BE"
197 enc text = (TE.encodeUtf32BE text, Nothing)
198 dec bs = case utf32SplitBytes TE.decodeUtf32BE bs of
199 Just (text, extra) -> (text, Right extra)
200 Nothing -> splitSlowly TE.decodeUtf32BE bs
201
202utf32SplitBytes :: (ByteString -> Text)
203 -> ByteString
204 -> Maybe (Text, ByteString)
205utf32SplitBytes dec bytes = split where
206 split = maybeDecode (dec toDecode, extra)
207 len = B.length bytes
208 lenExtra = mod len 4
209
210 lenToDecode = len - lenExtra
211 (toDecode, extra) = if lenExtra == 0
212 then (bytes, B.empty)
213 else B.splitAt lenToDecode bytes
214
215-- |
216-- Since 0.3.0
217ascii :: Codec
218ascii = Codec name enc (toDecoding dec) where
219 name = T.pack "ASCII"
220 enc text = (bytes, extra) where
221 (safe, unsafe) = T.span (\c -> ord c <= 0x7F) text
222 bytes = B8.pack (T.unpack safe)
223 extra = if T.null unsafe
224 then Nothing
225 else Just (EncodeException ascii (T.head unsafe), unsafe)
226
227 dec bytes = (text, extra) where
228 (safe, unsafe) = B.span (<= 0x7F) bytes
229 text = T.pack (B8.unpack safe)
230 extra = if B.null unsafe
231 then Right B.empty
232 else Left (DecodeException ascii (B.head unsafe), unsafe)
233
234-- |
235-- Since 0.3.0
236iso8859_1 :: Codec
237iso8859_1 = Codec name enc (toDecoding dec) where
238 name = T.pack "ISO-8859-1"
239 enc text = (bytes, extra) where
240 (safe, unsafe) = T.span (\c -> ord c <= 0xFF) text
241 bytes = B8.pack (T.unpack safe)
242 extra = if T.null unsafe
243 then Nothing
244 else Just (EncodeException iso8859_1 (T.head unsafe), unsafe)
245
246 dec bytes = (T.pack (B8.unpack bytes), Right B.empty)
247
248tryEvaluate :: a -> Either Exc.SomeException a
249tryEvaluate = unsafePerformIO . Exc.try . Exc.evaluate
250
251maybeDecode :: (a, b) -> Maybe (a, b)
252maybeDecode (a, b) = case tryEvaluate a of
253 Left _ -> Nothing
254 Right _ -> Just (a, b)
255
8c482809 256-- | A stream oriented decoding result.
257data Decoding = Some Text ByteString (ByteString -> Decoding)
258 | Other Text ByteString
259instance Show Decoding where
260 showsPrec d (Some t bs _) = showParen (d > prec) $
261 showString "Some " . showsPrec prec' t .
262 showChar ' ' . showsPrec prec' bs .
263 showString " _"
264 where prec = 10; prec' = prec + 1
265 showsPrec d (Other t bs) = showParen (d > prec) $
266 showString "Other " . showsPrec prec' t .
267 showChar ' ' . showsPrec prec' bs .
268 showString " _"
269 where prec = 10; prec' = prec + 1
3694350a 270
8c482809 271newtype CodePoint = CodePoint Word32 deriving (Eq, Show, Num, Storable)
272newtype DecoderState = DecoderState Word32 deriving (Eq, Show, Num, Storable)
273
8c482809 274streamDecodeUtf8 :: ByteString -> Decoding
3694350a 275streamDecodeUtf8 = decodeChunkUtf8 B.empty 0 0
276 where
277 decodeChunkUtf8 :: ByteString -> CodePoint -> DecoderState -> ByteString -> Decoding
278 decodeChunkUtf8 old codepoint0 state0 bs@(PS fp off len) =
279 runST $ do marray <- A.new (len+1)
280 unsafeIOToST (decodeChunkToBuffer marray)
281 where
282 decodeChunkToBuffer :: A.MArray s -> IO Decoding
283 decodeChunkToBuffer dest = withForeignPtr fp $ \ptr ->
284 with (0::CSize) $ \destOffPtr ->
285 with codepoint0 $ \codepointPtr ->
286 with state0 $ \statePtr ->
287 with nullPtr $ \curPtrPtr ->
288 do let end = ptr `plusPtr` (off + len)
289 curPtr = ptr `plusPtr` off
290 poke curPtrPtr curPtr
291 c_decode_utf8_with_state (A.maBA dest) destOffPtr curPtrPtr end codepointPtr statePtr
292 state <- peek statePtr
293 lastPtr <- peek curPtrPtr
294 codepoint <- peek codepointPtr
295 n <- peek destOffPtr
296 chunkText <- mkText dest n
297 let left = lastPtr `minusPtr` curPtr
298 remaining = B.drop left bs
299 accum = if T.null chunkText then B.append old remaining else remaining
300 return $! case state of
301 UTF8_REJECT -> Other chunkText accum -- We encountered an encoding error
302 _ -> Some chunkText accum (decodeChunkUtf8 accum codepoint state)
303 {-# INLINE decodeChunkToBuffer #-}
304 {-# INLINE decodeChunkUtf8 #-}
305{-# INLINE streamDecodeUtf8 #-}
8c482809 306
3694350a 307decodeSomeUtf8 :: ByteString -> (Text, ByteString)
308decodeSomeUtf8 bs@(PS fp off len) = runST $ do
309 dest <- A.new (len+1)
310 unsafeIOToST $
311 withForeignPtr fp $ \ptr ->
312 with (0::CSize) $ \destOffPtr ->
313 with (0::CodePoint) $ \codepointPtr ->
314 with (0::DecoderState) $ \statePtr ->
315 with nullPtr $ \curPtrPtr ->
c9d1c945 316 do let end = ptr `plusPtr` (off + len)
317 curPtr = ptr `plusPtr` off
318 poke curPtrPtr curPtr
3694350a 319 c_decode_utf8_with_state (A.maBA dest) destOffPtr
320 curPtrPtr end codepointPtr statePtr
c9d1c945 321 state <- peek statePtr
322 lastPtr <- peek curPtrPtr
323 codepoint <- peek codepointPtr
324 n <- peek destOffPtr
3694350a 325 chunkText <- unsafeSTToIO $ do arr <- A.unsafeFreeze dest
326 return $! textP arr 0 (fromIntegral n)
c9d1c945 327 let left = lastPtr `minusPtr` curPtr
328 remaining = B.drop left bs
3694350a 329 return $! (chunkText, remaining)
330{-# INLINE decodeSomeUtf8 #-}
331
c9d1c945 332mkText :: A.MArray s -> CSize -> IO Text
333mkText dest n = unsafeSTToIO $ do arr <- A.unsafeFreeze dest
334 return $! textP arr 0 (fromIntegral n)
3694350a 335{-# INLINE mkText #-}
8c482809 336
337ord :: Char -> Int
338ord (C# c#) = I# (ord# c#)
339{-# INLINE ord #-}
340
8c482809 341unsafeWrite :: A.MArray s -> Int -> Char -> ST s Int
342unsafeWrite marr i c
c9d1c945 343 | n < 0x10000 = do A.unsafeWrite marr i (fromIntegral n)
344 return 1
345 | otherwise = do A.unsafeWrite marr i lo
346 A.unsafeWrite marr (i+1) hi
347 return 2
8c482809 348 where n = ord c
349 m = n - 0x10000
350 lo = fromIntegral $ (m `shiftR` 10) + 0xD800
351 hi = fromIntegral $ (m .&. 0x3FF) + 0xDC00
352 shiftR (I# x#) (I# i#) = I# (x# `iShiftRA#` i#)
3694350a 353 {-# INLINE shiftR #-}
8c482809 354{-# INLINE unsafeWrite #-}
355
356foreign import ccall unsafe "_hs_pipes_text_decode_utf8_state" c_decode_utf8_with_state
357 :: MutableByteArray# s -> Ptr CSize
358 -> Ptr (Ptr Word8) -> Ptr Word8
359 -> Ptr CodePoint -> Ptr DecoderState -> IO (Ptr Word8)