diff options
-rw-r--r-- | LICENSE | 2 | ||||
-rw-r--r-- | Pipes/Text.hs | 20 | ||||
-rw-r--r-- | Pipes/Text/Internal.hs | 152 | ||||
-rw-r--r-- | pipes-text.cabal | 6 |
4 files changed, 24 insertions, 156 deletions
@@ -1,4 +1,4 @@ | |||
1 | Copyright (c) 2013, Gabriel Gonzalez, Tobias Florek, Michael Thompson | 1 | Copyright (c) 2013-14, Gabriel Gonzalez, Tobias Florek, Michael Thompson |
2 | 2 | ||
3 | All rights reserved. | 3 | All rights reserved. |
4 | 4 | ||
diff --git a/Pipes/Text.hs b/Pipes/Text.hs index 0957a7d..796f672 100644 --- a/Pipes/Text.hs +++ b/Pipes/Text.hs | |||
@@ -167,7 +167,7 @@ module Pipes.Text ( | |||
167 | , module Data.Word | 167 | , module Data.Word |
168 | , module Pipes.Parse | 168 | , module Pipes.Parse |
169 | , module Pipes.Group | 169 | , module Pipes.Group |
170 | , module Pipes.Text.Internal.Codec | 170 | , module Pipes.Text.Internal |
171 | ) where | 171 | ) where |
172 | 172 | ||
173 | import Control.Exception (throwIO, try) | 173 | import Control.Exception (throwIO, try) |
@@ -197,8 +197,8 @@ import Foreign.C.Error (Errno(Errno), ePIPE) | |||
197 | import qualified GHC.IO.Exception as G | 197 | import qualified GHC.IO.Exception as G |
198 | import Pipes | 198 | import Pipes |
199 | import qualified Pipes.ByteString as PB | 199 | import qualified Pipes.ByteString as PB |
200 | import qualified Pipes.Text.Internal.Decoding as PE | 200 | import qualified Pipes.Text.Internal as PI |
201 | import Pipes.Text.Internal.Codec | 201 | import Pipes.Text.Internal |
202 | import Pipes.Core (respond, Server') | 202 | import Pipes.Core (respond, Server') |
203 | import Pipes.Group (concats, intercalates, FreeT(..), FreeF(..)) | 203 | import Pipes.Group (concats, intercalates, FreeT(..), FreeF(..)) |
204 | import qualified Pipes.Group as PG | 204 | import qualified Pipes.Group as PG |
@@ -729,7 +729,7 @@ isEndOfChars = do | |||
729 | decodeUtf8 :: Monad m => Lens' (Producer ByteString m r) | 729 | decodeUtf8 :: Monad m => Lens' (Producer ByteString m r) |
730 | (Producer Text m (Producer ByteString m r)) | 730 | (Producer Text m (Producer ByteString m r)) |
731 | decodeUtf8 k p0 = fmap (\p -> join (for p (yield . TE.encodeUtf8))) | 731 | decodeUtf8 k p0 = fmap (\p -> join (for p (yield . TE.encodeUtf8))) |
732 | (k (go B.empty PE.streamDecodeUtf8 p0)) where | 732 | (k (go B.empty PI.streamDecodeUtf8 p0)) where |
733 | go !carry dec0 p = do | 733 | go !carry dec0 p = do |
734 | x <- lift (next p) | 734 | x <- lift (next p) |
735 | case x of Left r -> return (if B.null carry | 735 | case x of Left r -> return (if B.null carry |
@@ -738,9 +738,9 @@ decodeUtf8 k p0 = fmap (\p -> join (for p (yield . TE.encodeUtf8))) | |||
738 | return r)) | 738 | return r)) |
739 | 739 | ||
740 | Right (chunk, p') -> case dec0 chunk of | 740 | Right (chunk, p') -> case dec0 chunk of |
741 | PE.Some text carry2 dec -> do yield text | 741 | PI.Some text carry2 dec -> do yield text |
742 | go carry2 dec p' | 742 | go carry2 dec p' |
743 | PE.Other text bs -> do yield text | 743 | PI.Other text bs -> do yield text |
744 | return (do yield bs -- an invalid blob remains | 744 | return (do yield bs -- an invalid blob remains |
745 | p') | 745 | p') |
746 | {-# INLINABLE decodeUtf8 #-} | 746 | {-# INLINABLE decodeUtf8 #-} |
@@ -1093,19 +1093,19 @@ unwords = intercalate (yield $ T.singleton ' ') | |||
1093 | codec :: Monad m => Codec -> Lens' (Producer ByteString m r) (Producer Text m (Producer ByteString m r)) | 1093 | codec :: Monad m => Codec -> Lens' (Producer ByteString m r) (Producer Text m (Producer ByteString m r)) |
1094 | codec (Codec _ enc dec) k p0 = fmap (\p -> join (for p (yield . fst . enc))) | 1094 | codec (Codec _ enc dec) k p0 = fmap (\p -> join (for p (yield . fst . enc))) |
1095 | (k (decoder (dec B.empty) p0) ) where | 1095 | (k (decoder (dec B.empty) p0) ) where |
1096 | decoder :: Monad m => PE.Decoding -> Producer ByteString m r -> Producer Text m (Producer ByteString m r) | 1096 | decoder :: Monad m => PI.Decoding -> Producer ByteString m r -> Producer Text m (Producer ByteString m r) |
1097 | decoder !d p0 = case d of | 1097 | decoder !d p0 = case d of |
1098 | PE.Other txt bad -> do yield txt | 1098 | PI.Other txt bad -> do yield txt |
1099 | return (do yield bad | 1099 | return (do yield bad |
1100 | p0) | 1100 | p0) |
1101 | PE.Some txt extra dec -> do yield txt | 1101 | PI.Some txt extra dec -> do yield txt |
1102 | x <- lift (next p0) | 1102 | x <- lift (next p0) |
1103 | case x of Left r -> return (do yield extra | 1103 | case x of Left r -> return (do yield extra |
1104 | return r) | 1104 | return r) |
1105 | Right (chunk,p1) -> decoder (dec chunk) p1 | 1105 | Right (chunk,p1) -> decoder (dec chunk) p1 |
1106 | 1106 | ||
1107 | -- decodeUtf8 k p0 = fmap (\p -> join (for p (yield . TE.encodeUtf8))) | 1107 | -- decodeUtf8 k p0 = fmap (\p -> join (for p (yield . TE.encodeUtf8))) |
1108 | -- (k (go B.empty PE.streamDecodeUtf8 p0)) where | 1108 | -- (k (go B.empty PI.streamDecodeUtf8 p0)) where |
1109 | 1109 | ||
1110 | encodeAscii :: Monad m => Producer Text m r -> Producer ByteString m (Producer Text m r) | 1110 | encodeAscii :: Monad m => Producer Text m r -> Producer ByteString m (Producer Text m r) |
1111 | encodeAscii = go where | 1111 | encodeAscii = go where |
diff --git a/Pipes/Text/Internal.hs b/Pipes/Text/Internal.hs index bcee278..2530b23 100644 --- a/Pipes/Text/Internal.hs +++ b/Pipes/Text/Internal.hs | |||
@@ -1,147 +1,15 @@ | |||
1 | {-# LANGUAGE BangPatterns, CPP, ForeignFunctionInterface #-} | 1 | module Pipes.Text.Internal |
2 | {-# LANGUAGE GeneralizedNewtypeDeriving, MagicHash, UnliftedFFITypes #-} | ||
3 | {-# LANGUAGE DeriveDataTypeable, RankNTypes #-} | ||
4 | |||
5 | -- This module lifts assorted materials from Brian O'Sullivan's text package | ||
6 | -- especially Data.Text.Encoding in order to define a pipes-appropriate | ||
7 | -- streamDecodeUtf8 | ||
8 | module Pipes.Text.Internal | ||
9 | ( Decoding(..) | 2 | ( Decoding(..) |
10 | , streamDecodeUtf8 | 3 | , streamDecodeUtf8 |
11 | , decodeSomeUtf8 | 4 | , decodeSomeUtf8 |
5 | , Codec(..) | ||
6 | , TextException(..) | ||
7 | , utf8 | ||
8 | , utf16_le | ||
9 | , utf16_be | ||
10 | , utf32_le | ||
11 | , utf32_be | ||
12 | ) where | 12 | ) where |
13 | import Control.Monad.ST.Unsafe (unsafeIOToST, unsafeSTToIO) | ||
14 | import Control.Monad.ST (ST, runST) | ||
15 | import Data.Bits ((.&.)) | ||
16 | import Data.ByteString as B | ||
17 | import Data.ByteString (ByteString) | ||
18 | import Data.ByteString.Internal as B | ||
19 | import Data.ByteString.Char8 as B8 | ||
20 | import Data.Text (Text) | ||
21 | import qualified Data.Text as T | ||
22 | import qualified Data.Text.Encoding as TE | ||
23 | import Data.Text.Encoding.Error () | ||
24 | import Data.Text.Internal (Text, textP) | ||
25 | import Foreign.C.Types (CSize) | ||
26 | import Foreign.ForeignPtr (withForeignPtr) | ||
27 | import Foreign.Marshal.Utils (with) | ||
28 | import Foreign.Ptr (Ptr, minusPtr, nullPtr, plusPtr) | ||
29 | import Foreign.Storable (Storable, peek, poke) | ||
30 | import GHC.Base (Char(..), Int(..), MutableByteArray#, ord#, iShiftRA#) | ||
31 | import GHC.Word (Word8, Word32) | ||
32 | import qualified Data.Text.Array as A | ||
33 | import Data.Word (Word8, Word16) | ||
34 | import System.IO.Unsafe (unsafePerformIO) | ||
35 | import qualified Control.Exception as Exc | ||
36 | import Data.Bits ((.&.), (.|.), shiftL) | ||
37 | import Data.Typeable | ||
38 | import Control.Arrow (first) | ||
39 | import Data.Maybe (catMaybes) | ||
40 | #include "pipes_text_cbits.h" | ||
41 | |||
42 | |||
43 | |||
44 | -- | A stream oriented decoding result. | ||
45 | data Decoding = Some Text ByteString (ByteString -> Decoding) | ||
46 | | Other Text ByteString | ||
47 | instance Show Decoding where | ||
48 | showsPrec d (Some t bs _) = showParen (d > prec) $ | ||
49 | showString "Some " . showsPrec prec' t . | ||
50 | showChar ' ' . showsPrec prec' bs . | ||
51 | showString " _" | ||
52 | where prec = 10; prec' = prec + 1 | ||
53 | showsPrec d (Other t bs) = showParen (d > prec) $ | ||
54 | showString "Other " . showsPrec prec' t . | ||
55 | showChar ' ' . showsPrec prec' bs . | ||
56 | showString " _" | ||
57 | where prec = 10; prec' = prec + 1 | ||
58 | |||
59 | newtype CodePoint = CodePoint Word32 deriving (Eq, Show, Num, Storable) | ||
60 | newtype DecoderState = DecoderState Word32 deriving (Eq, Show, Num, Storable) | ||
61 | |||
62 | streamDecodeUtf8 :: ByteString -> Decoding | ||
63 | streamDecodeUtf8 = decodeChunkUtf8 B.empty 0 0 | ||
64 | where | ||
65 | decodeChunkUtf8 :: ByteString -> CodePoint -> DecoderState -> ByteString -> Decoding | ||
66 | decodeChunkUtf8 old codepoint0 state0 bs@(PS fp off len) = | ||
67 | runST $ do marray <- A.new (len+1) | ||
68 | unsafeIOToST (decodeChunkToBuffer marray) | ||
69 | where | ||
70 | decodeChunkToBuffer :: A.MArray s -> IO Decoding | ||
71 | decodeChunkToBuffer dest = withForeignPtr fp $ \ptr -> | ||
72 | with (0::CSize) $ \destOffPtr -> | ||
73 | with codepoint0 $ \codepointPtr -> | ||
74 | with state0 $ \statePtr -> | ||
75 | with nullPtr $ \curPtrPtr -> | ||
76 | do let end = ptr `plusPtr` (off + len) | ||
77 | curPtr = ptr `plusPtr` off | ||
78 | poke curPtrPtr curPtr | ||
79 | c_decode_utf8_with_state (A.maBA dest) destOffPtr curPtrPtr end codepointPtr statePtr | ||
80 | state <- peek statePtr | ||
81 | lastPtr <- peek curPtrPtr | ||
82 | codepoint <- peek codepointPtr | ||
83 | n <- peek destOffPtr | ||
84 | chunkText <- mkText dest n | ||
85 | let left = lastPtr `minusPtr` curPtr | ||
86 | remaining = B.drop left bs | ||
87 | accum = if T.null chunkText then B.append old remaining else remaining | ||
88 | return $! case state of | ||
89 | UTF8_REJECT -> Other chunkText accum -- We encountered an encoding error | ||
90 | _ -> Some chunkText accum (decodeChunkUtf8 accum codepoint state) | ||
91 | {-# INLINE decodeChunkToBuffer #-} | ||
92 | {-# INLINE decodeChunkUtf8 #-} | ||
93 | {-# INLINE streamDecodeUtf8 #-} | ||
94 | |||
95 | decodeSomeUtf8 :: ByteString -> (Text, ByteString) | ||
96 | decodeSomeUtf8 bs@(PS fp off len) = runST $ do | ||
97 | dest <- A.new (len+1) | ||
98 | unsafeIOToST $ | ||
99 | withForeignPtr fp $ \ptr -> | ||
100 | with (0::CSize) $ \destOffPtr -> | ||
101 | with (0::CodePoint) $ \codepointPtr -> | ||
102 | with (0::DecoderState) $ \statePtr -> | ||
103 | with nullPtr $ \curPtrPtr -> | ||
104 | do let end = ptr `plusPtr` (off + len) | ||
105 | curPtr = ptr `plusPtr` off | ||
106 | poke curPtrPtr curPtr | ||
107 | c_decode_utf8_with_state (A.maBA dest) destOffPtr | ||
108 | curPtrPtr end codepointPtr statePtr | ||
109 | state <- peek statePtr | ||
110 | lastPtr <- peek curPtrPtr | ||
111 | codepoint <- peek codepointPtr | ||
112 | n <- peek destOffPtr | ||
113 | chunkText <- unsafeSTToIO $ do arr <- A.unsafeFreeze dest | ||
114 | return $! textP arr 0 (fromIntegral n) | ||
115 | let left = lastPtr `minusPtr` curPtr | ||
116 | remaining = B.drop left bs | ||
117 | return $! (chunkText, remaining) | ||
118 | {-# INLINE decodeSomeUtf8 #-} | ||
119 | |||
120 | mkText :: A.MArray s -> CSize -> IO Text | ||
121 | mkText dest n = unsafeSTToIO $ do arr <- A.unsafeFreeze dest | ||
122 | return $! textP arr 0 (fromIntegral n) | ||
123 | {-# INLINE mkText #-} | ||
124 | |||
125 | ord :: Char -> Int | ||
126 | ord (C# c#) = I# (ord# c#) | ||
127 | {-# INLINE ord #-} | ||
128 | |||
129 | unsafeWrite :: A.MArray s -> Int -> Char -> ST s Int | ||
130 | unsafeWrite marr i c | ||
131 | | n < 0x10000 = do A.unsafeWrite marr i (fromIntegral n) | ||
132 | return 1 | ||
133 | | otherwise = do A.unsafeWrite marr i lo | ||
134 | A.unsafeWrite marr (i+1) hi | ||
135 | return 2 | ||
136 | where n = ord c | ||
137 | m = n - 0x10000 | ||
138 | lo = fromIntegral $ (m `shiftR` 10) + 0xD800 | ||
139 | hi = fromIntegral $ (m .&. 0x3FF) + 0xDC00 | ||
140 | shiftR (I# x#) (I# i#) = I# (x# `iShiftRA#` i#) | ||
141 | {-# INLINE shiftR #-} | ||
142 | {-# INLINE unsafeWrite #-} | ||
143 | 13 | ||
144 | foreign import ccall unsafe "_hs_pipes_text_decode_utf8_state" c_decode_utf8_with_state | 14 | import Pipes.Text.Internal.Decoding |
145 | :: MutableByteArray# s -> Ptr CSize | 15 | import Pipes.Text.Internal.Codec \ No newline at end of file |
146 | -> Ptr (Ptr Word8) -> Ptr Word8 | ||
147 | -> Ptr CodePoint -> Ptr DecoderState -> IO (Ptr Word8) \ No newline at end of file | ||
diff --git a/pipes-text.cabal b/pipes-text.cabal index 1333f92..9af2a09 100644 --- a/pipes-text.cabal +++ b/pipes-text.cabal | |||
@@ -1,5 +1,5 @@ | |||
1 | name: pipes-text | 1 | name: pipes-text |
2 | version: 0.0.1.0 | 2 | version: 0.0.0.0 |
3 | synopsis: Text pipes. | 3 | synopsis: Text pipes. |
4 | description: Many of the pipes and other operations defined here mirror those in | 4 | description: Many of the pipes and other operations defined here mirror those in |
5 | the `pipes-bytestring` library. Folds like `length` and grouping | 5 | the `pipes-bytestring` library. Folds like `length` and grouping |
@@ -34,8 +34,8 @@ extra-source-files: README.md | |||
34 | library | 34 | library |
35 | c-sources: cbits/cbits.c | 35 | c-sources: cbits/cbits.c |
36 | include-dirs: include | 36 | include-dirs: include |
37 | exposed-modules: Pipes.Text, Pipes.Text.Internal.Decoding, Pipes.Text.Internal.Codec | 37 | exposed-modules: Pipes.Text, Pipes.Text.Internal |
38 | -- other-modules: | 38 | other-modules: Pipes.Text.Internal.Decoding, Pipes.Text.Internal.Codec |
39 | other-extensions: RankNTypes | 39 | other-extensions: RankNTypes |
40 | build-depends: base >= 4 && < 5 , | 40 | build-depends: base >= 4 && < 5 , |
41 | bytestring >=0.10 && < 0.11, | 41 | bytestring >=0.10 && < 0.11, |