]> git.immae.eu Git - github/fretlink/text-pipes.git/blame - Pipes/Text/Internal/Decoding.hs
little Haddock notes on Internal material
[github/fretlink/text-pipes.git] / Pipes / Text / Internal / Decoding.hs
CommitLineData
409759e8 1{-# LANGUAGE BangPatterns, CPP, ForeignFunctionInterface #-}
2{-# LANGUAGE GeneralizedNewtypeDeriving, MagicHash, UnliftedFFITypes #-}
3{-# LANGUAGE DeriveDataTypeable, RankNTypes #-}
4
4da91c37 5{- |
6This module lifts assorted materials from Brian O'Sullivan's text package
7especially @Data.Text.Encoding@ in order to define a pipes-appropriate
8'streamDecodeUtf8'
9-}
409759e8 10module Pipes.Text.Internal.Decoding
11 ( Decoding(..)
12 , streamDecodeUtf8
13 , decodeSomeUtf8
14 ) where
15import Control.Monad.ST.Unsafe (unsafeIOToST, unsafeSTToIO)
16import Control.Monad.ST (ST, runST)
17import Data.Bits ((.&.))
18import Data.ByteString as B
19import Data.ByteString (ByteString)
20import Data.ByteString.Internal as B
21import Data.ByteString.Char8 as B8
22import Data.Text (Text)
23import qualified Data.Text as T
24import qualified Data.Text.Encoding as TE
25import Data.Text.Encoding.Error ()
26import Data.Text.Internal (Text, textP)
27import Foreign.C.Types (CSize)
28import Foreign.ForeignPtr (withForeignPtr)
29import Foreign.Marshal.Utils (with)
30import Foreign.Ptr (Ptr, minusPtr, nullPtr, plusPtr)
31import Foreign.Storable (Storable, peek, poke)
32import GHC.Base (Char(..), Int(..), MutableByteArray#, ord#, iShiftRA#)
33import GHC.Word (Word8, Word32)
34import qualified Data.Text.Array as A
35import Data.Word (Word8, Word16)
36import System.IO.Unsafe (unsafePerformIO)
37import qualified Control.Exception as Exc
38import Data.Bits ((.&.), (.|.), shiftL)
39import Data.Typeable
40import Control.Arrow (first)
41import Data.Maybe (catMaybes)
42#include "pipes_text_cbits.h"
43
44
45
4da91c37 46-- | A stream oriented decoding result. Distinct from the similar type in @Data.Text.Encoding@
47data Decoding = Some Text ByteString (ByteString -> Decoding) -- | Text, continuation and any undecoded fragment.
48 | Other Text ByteString -- | Text followed by an undecodable ByteString
409759e8 49instance Show Decoding where
50 showsPrec d (Some t bs _) = showParen (d > prec) $
51 showString "Some " . showsPrec prec' t .
52 showChar ' ' . showsPrec prec' bs .
53 showString " _"
54 where prec = 10; prec' = prec + 1
55 showsPrec d (Other t bs) = showParen (d > prec) $
56 showString "Other " . showsPrec prec' t .
57 showChar ' ' . showsPrec prec' bs .
58 showString " _"
59 where prec = 10; prec' = prec + 1
60
61newtype CodePoint = CodePoint Word32 deriving (Eq, Show, Num, Storable)
62newtype DecoderState = DecoderState Word32 deriving (Eq, Show, Num, Storable)
63
4da91c37 64-- | Resolve a 'ByteString' into 'Text' and a continuation that can handle further 'ByteStrings'.
409759e8 65streamDecodeUtf8 :: ByteString -> Decoding
66streamDecodeUtf8 = decodeChunkUtf8 B.empty 0 0
67 where
68 decodeChunkUtf8 :: ByteString -> CodePoint -> DecoderState -> ByteString -> Decoding
69 decodeChunkUtf8 old codepoint0 state0 bs@(PS fp off len) =
70 runST $ do marray <- A.new (len+1)
71 unsafeIOToST (decodeChunkToBuffer marray)
72 where
73 decodeChunkToBuffer :: A.MArray s -> IO Decoding
74 decodeChunkToBuffer dest = withForeignPtr fp $ \ptr ->
75 with (0::CSize) $ \destOffPtr ->
76 with codepoint0 $ \codepointPtr ->
77 with state0 $ \statePtr ->
78 with nullPtr $ \curPtrPtr ->
79 do let end = ptr `plusPtr` (off + len)
80 curPtr = ptr `plusPtr` off
81 poke curPtrPtr curPtr
82 c_decode_utf8_with_state (A.maBA dest) destOffPtr curPtrPtr end codepointPtr statePtr
83 state <- peek statePtr
84 lastPtr <- peek curPtrPtr
85 codepoint <- peek codepointPtr
86 n <- peek destOffPtr
87 chunkText <- mkText dest n
88 let left = lastPtr `minusPtr` curPtr
89 remaining = B.drop left bs
90 accum = if T.null chunkText then B.append old remaining else remaining
91 return $! case state of
92 UTF8_REJECT -> Other chunkText accum -- We encountered an encoding error
93 _ -> Some chunkText accum (decodeChunkUtf8 accum codepoint state)
94 {-# INLINE decodeChunkToBuffer #-}
95 {-# INLINE decodeChunkUtf8 #-}
96{-# INLINE streamDecodeUtf8 #-}
97
4da91c37 98-- | Resolve a ByteString into an initial segment of intelligible 'Text' and whatever is unintelligble
409759e8 99decodeSomeUtf8 :: ByteString -> (Text, ByteString)
100decodeSomeUtf8 bs@(PS fp off len) = runST $ do
101 dest <- A.new (len+1)
102 unsafeIOToST $
103 withForeignPtr fp $ \ptr ->
104 with (0::CSize) $ \destOffPtr ->
105 with (0::CodePoint) $ \codepointPtr ->
106 with (0::DecoderState) $ \statePtr ->
107 with nullPtr $ \curPtrPtr ->
108 do let end = ptr `plusPtr` (off + len)
109 curPtr = ptr `plusPtr` off
110 poke curPtrPtr curPtr
111 c_decode_utf8_with_state (A.maBA dest) destOffPtr
112 curPtrPtr end codepointPtr statePtr
113 state <- peek statePtr
114 lastPtr <- peek curPtrPtr
115 codepoint <- peek codepointPtr
116 n <- peek destOffPtr
117 chunkText <- unsafeSTToIO $ do arr <- A.unsafeFreeze dest
118 return $! textP arr 0 (fromIntegral n)
119 let left = lastPtr `minusPtr` curPtr
120 remaining = B.drop left bs
121 return $! (chunkText, remaining)
122{-# INLINE decodeSomeUtf8 #-}
123
124mkText :: A.MArray s -> CSize -> IO Text
125mkText dest n = unsafeSTToIO $ do arr <- A.unsafeFreeze dest
126 return $! textP arr 0 (fromIntegral n)
127{-# INLINE mkText #-}
128
129ord :: Char -> Int
130ord (C# c#) = I# (ord# c#)
131{-# INLINE ord #-}
132
133unsafeWrite :: A.MArray s -> Int -> Char -> ST s Int
134unsafeWrite marr i c
135 | n < 0x10000 = do A.unsafeWrite marr i (fromIntegral n)
136 return 1
137 | otherwise = do A.unsafeWrite marr i lo
138 A.unsafeWrite marr (i+1) hi
139 return 2
140 where n = ord c
141 m = n - 0x10000
142 lo = fromIntegral $ (m `shiftR` 10) + 0xD800
143 hi = fromIntegral $ (m .&. 0x3FF) + 0xDC00
144 shiftR (I# x#) (I# i#) = I# (x# `iShiftRA#` i#)
145 {-# INLINE shiftR #-}
146{-# INLINE unsafeWrite #-}
147
148foreign import ccall unsafe "_hs_pipes_text_decode_utf8_state" c_decode_utf8_with_state
149 :: MutableByteArray# s -> Ptr CSize
150 -> Ptr (Ptr Word8) -> Ptr Word8
151 -> Ptr CodePoint -> Ptr DecoderState -> IO (Ptr Word8)