aboutsummaryrefslogtreecommitdiffhomepage
diff options
context:
space:
mode:
-rw-r--r--.DS_Storebin0 -> 6148 bytes
-rw-r--r--Pipes/Text.hs5
-rw-r--r--Pipes/Text/Internal/Codec.hs215
-rw-r--r--Pipes/Text/Internal/Decoding.hs147
-rw-r--r--pipes-text.cabal23
5 files changed, 384 insertions, 6 deletions
diff --git a/.DS_Store b/.DS_Store
new file mode 100644
index 0000000..1538af8
--- /dev/null
+++ b/.DS_Store
Binary files differ
diff --git a/Pipes/Text.hs b/Pipes/Text.hs
index 18ec8ec..0957a7d 100644
--- a/Pipes/Text.hs
+++ b/Pipes/Text.hs
@@ -167,6 +167,7 @@ module Pipes.Text (
167 , module Data.Word 167 , module Data.Word
168 , module Pipes.Parse 168 , module Pipes.Parse
169 , module Pipes.Group 169 , module Pipes.Group
170 , module Pipes.Text.Internal.Codec
170 ) where 171 ) where
171 172
172import Control.Exception (throwIO, try) 173import Control.Exception (throwIO, try)
@@ -196,8 +197,8 @@ import Foreign.C.Error (Errno(Errno), ePIPE)
196import qualified GHC.IO.Exception as G 197import qualified GHC.IO.Exception as G
197import Pipes 198import Pipes
198import qualified Pipes.ByteString as PB 199import qualified Pipes.ByteString as PB
199import qualified Pipes.Text.Internal as PE 200import qualified Pipes.Text.Internal.Decoding as PE
200import Pipes.Text.Codec 201import Pipes.Text.Internal.Codec
201import Pipes.Core (respond, Server') 202import Pipes.Core (respond, Server')
202import Pipes.Group (concats, intercalates, FreeT(..), FreeF(..)) 203import Pipes.Group (concats, intercalates, FreeT(..), FreeF(..))
203import qualified Pipes.Group as PG 204import qualified Pipes.Group as PG
diff --git a/Pipes/Text/Internal/Codec.hs b/Pipes/Text/Internal/Codec.hs
new file mode 100644
index 0000000..4b9367f
--- /dev/null
+++ b/Pipes/Text/Internal/Codec.hs
@@ -0,0 +1,215 @@
1
2{-# LANGUAGE DeriveDataTypeable, RankNTypes, BangPatterns #-}
3-- |
4-- Copyright: 2014 Michael Thompson, 2011 Michael Snoyman, 2010-2011 John Millikin
5-- License: MIT
6--
7-- Parts of this code were taken from enumerator and conduits, and adapted for pipes.
8
9module Pipes.Text.Internal.Codec
10 ( Decoding(..)
11 , streamDecodeUtf8
12 , decodeSomeUtf8
13 , Codec(..)
14 , TextException(..)
15 , utf8
16 , utf16_le
17 , utf16_be
18 , utf32_le
19 , utf32_be
20 ) where
21
22import Data.Bits ((.&.))
23import Data.Char (ord)
24import Data.ByteString as B
25import Data.ByteString (ByteString)
26import Data.ByteString.Internal as B
27import Data.ByteString.Char8 as B8
28import Data.Text (Text)
29import qualified Data.Text as T
30import qualified Data.Text.Encoding as TE
31import Data.Text.Encoding.Error ()
32import GHC.Word (Word8, Word32)
33import qualified Data.Text.Array as A
34import Data.Word (Word8, Word16)
35import System.IO.Unsafe (unsafePerformIO)
36import qualified Control.Exception as Exc
37import Data.Bits ((.&.), (.|.), shiftL)
38import Data.Typeable
39import Control.Arrow (first)
40import Data.Maybe (catMaybes)
41import Pipes.Text.Internal.Decoding
42import Pipes
43-- | A specific character encoding.
44--
45-- Since 0.3.0
46data Codec = Codec
47 { codecName :: Text
48 , codecEncode :: Text -> (ByteString, Maybe (TextException, Text))
49 , codecDecode :: ByteString -> Decoding -- (Text, Either (TextException, ByteString) ByteString)
50 }
51
52instance Show Codec where
53 showsPrec d c = showParen (d > 10) $
54 showString "Codec " . shows (codecName c)
55
56data TextException = DecodeException Codec Word8
57 | EncodeException Codec Char
58 | LengthExceeded Int
59 | TextException Exc.SomeException
60 deriving (Show, Typeable)
61instance Exc.Exception TextException
62
63
64toDecoding :: (ByteString -> (Text, Either (TextException, ByteString) ByteString))
65 -> (ByteString -> Decoding)
66toDecoding op = loop B.empty where
67 loop !extra bs0 = case op (B.append extra bs0) of
68 (txt, Right bs) -> Some txt bs (loop bs)
69 (txt, Left (_,bs)) -> Other txt bs
70-- To do: toDecoding should be inlined in each of the 'Codec' definitions
71-- or else Codec changed to the conduit/enumerator definition. We have
72-- altered it to use 'streamDecodeUtf8'
73
74splitSlowly :: (ByteString -> Text)
75 -> ByteString
76 -> (Text, Either (TextException, ByteString) ByteString)
77splitSlowly dec bytes = valid where
78 valid:_ = catMaybes $ Prelude.map decFirst $ splits (B.length bytes)
79 splits 0 = [(B.empty, bytes)]
80 splits n = B.splitAt n bytes : splits (n - 1)
81 decFirst (a, b) = case tryEvaluate (dec a) of
82 Left _ -> Nothing
83 Right text -> let trouble = case tryEvaluate (dec b) of
84 Left exc -> Left (TextException exc, b)
85 Right _ -> Right B.empty
86 in Just (text, trouble) -- this case shouldn't occur,
87 -- since splitSlowly is only called
88 -- when parsing failed somewhere
89
90utf8 :: Codec
91utf8 = Codec name enc (toDecoding dec) where
92 name = T.pack "UTF-8"
93 enc text = (TE.encodeUtf8 text, Nothing)
94 dec bytes = case decodeSomeUtf8 bytes of (t,b) -> (t, Right b)
95
96-- -- Whether the given byte is a continuation byte.
97-- isContinuation byte = byte .&. 0xC0 == 0x80
98--
99-- -- The number of continuation bytes needed by the given
100-- -- non-continuation byte. Returns -1 for an illegal UTF-8
101-- -- non-continuation byte and the whole split quickly must fail so
102-- -- as the input is passed to TE.decodeUtf8, which will issue a
103-- -- suitable error.
104-- required x0
105-- | x0 .&. 0x80 == 0x00 = 0
106-- | x0 .&. 0xE0 == 0xC0 = 1
107-- | x0 .&. 0xF0 == 0xE0 = 2
108-- | x0 .&. 0xF8 == 0xF0 = 3
109-- | otherwise = -1
110--
111-- splitQuickly bytes
112-- | B.null l || req == -1 = Nothing
113-- | req == B.length r = Just (TE.decodeUtf8 bytes, B.empty)
114-- | otherwise = Just (TE.decodeUtf8 l', r')
115-- where
116-- (l, r) = B.spanEnd isContinuation bytes
117-- req = required (B.last l)
118-- l' = B.init l
119-- r' = B.cons (B.last l) r
120
121
122utf16_le :: Codec
123utf16_le = Codec name enc (toDecoding dec) where
124 name = T.pack "UTF-16-LE"
125 enc text = (TE.encodeUtf16LE text, Nothing)
126 dec bytes = case splitQuickly bytes of
127 Just (text, extra) -> (text, Right extra)
128 Nothing -> splitSlowly TE.decodeUtf16LE bytes
129
130 splitQuickly bytes = maybeDecode (loop 0) where
131 maxN = B.length bytes
132
133 loop n | n == maxN = decodeAll
134 | (n + 1) == maxN = decodeTo n
135 loop n = let
136 req = utf16Required
137 (B.index bytes n)
138 (B.index bytes (n + 1))
139 decodeMore = loop $! n + req
140 in if n + req > maxN
141 then decodeTo n
142 else decodeMore
143
144 decodeTo n = first TE.decodeUtf16LE (B.splitAt n bytes)
145 decodeAll = (TE.decodeUtf16LE bytes, B.empty)
146
147utf16_be :: Codec
148utf16_be = Codec name enc (toDecoding dec) where
149 name = T.pack "UTF-16-BE"
150 enc text = (TE.encodeUtf16BE text, Nothing)
151 dec bytes = case splitQuickly bytes of
152 Just (text, extra) -> (text, Right extra)
153 Nothing -> splitSlowly TE.decodeUtf16BE bytes
154
155 splitQuickly bytes = maybeDecode (loop 0) where
156 maxN = B.length bytes
157
158 loop n | n == maxN = decodeAll
159 | (n + 1) == maxN = decodeTo n
160 loop n = let
161 req = utf16Required
162 (B.index bytes (n + 1))
163 (B.index bytes n)
164 decodeMore = loop $! n + req
165 in if n + req > maxN
166 then decodeTo n
167 else decodeMore
168
169 decodeTo n = first TE.decodeUtf16BE (B.splitAt n bytes)
170 decodeAll = (TE.decodeUtf16BE bytes, B.empty)
171
172utf16Required :: Word8 -> Word8 -> Int
173utf16Required x0 x1 = if x >= 0xD800 && x <= 0xDBFF then 4 else 2 where
174 x :: Word16
175 x = (fromIntegral x1 `shiftL` 8) .|. fromIntegral x0
176
177
178utf32_le :: Codec
179utf32_le = Codec name enc (toDecoding dec) where
180 name = T.pack "UTF-32-LE"
181 enc text = (TE.encodeUtf32LE text, Nothing)
182 dec bs = case utf32SplitBytes TE.decodeUtf32LE bs of
183 Just (text, extra) -> (text, Right extra)
184 Nothing -> splitSlowly TE.decodeUtf32LE bs
185
186
187utf32_be :: Codec
188utf32_be = Codec name enc (toDecoding dec) where
189 name = T.pack "UTF-32-BE"
190 enc text = (TE.encodeUtf32BE text, Nothing)
191 dec bs = case utf32SplitBytes TE.decodeUtf32BE bs of
192 Just (text, extra) -> (text, Right extra)
193 Nothing -> splitSlowly TE.decodeUtf32BE bs
194
195utf32SplitBytes :: (ByteString -> Text)
196 -> ByteString
197 -> Maybe (Text, ByteString)
198utf32SplitBytes dec bytes = split where
199 split = maybeDecode (dec toDecode, extra)
200 len = B.length bytes
201 lenExtra = mod len 4
202
203 lenToDecode = len - lenExtra
204 (toDecode, extra) = if lenExtra == 0
205 then (bytes, B.empty)
206 else B.splitAt lenToDecode bytes
207
208
209tryEvaluate :: a -> Either Exc.SomeException a
210tryEvaluate = unsafePerformIO . Exc.try . Exc.evaluate
211
212maybeDecode :: (a, b) -> Maybe (a, b)
213maybeDecode (a, b) = case tryEvaluate a of
214 Left _ -> Nothing
215 Right _ -> Just (a, b)
diff --git a/Pipes/Text/Internal/Decoding.hs b/Pipes/Text/Internal/Decoding.hs
new file mode 100644
index 0000000..531104a
--- /dev/null
+++ b/Pipes/Text/Internal/Decoding.hs
@@ -0,0 +1,147 @@
1{-# LANGUAGE BangPatterns, CPP, ForeignFunctionInterface #-}
2{-# LANGUAGE GeneralizedNewtypeDeriving, MagicHash, UnliftedFFITypes #-}
3{-# LANGUAGE DeriveDataTypeable, RankNTypes #-}
4
5-- This module lifts assorted materials from Brian O'Sullivan's text package
6-- especially Data.Text.Encoding in order to define a pipes-appropriate
7-- streamDecodeUtf8
8module Pipes.Text.Internal.Decoding
9 ( Decoding(..)
10 , streamDecodeUtf8
11 , decodeSomeUtf8
12 ) where
13import Control.Monad.ST.Unsafe (unsafeIOToST, unsafeSTToIO)
14import Control.Monad.ST (ST, runST)
15import Data.Bits ((.&.))
16import Data.ByteString as B
17import Data.ByteString (ByteString)
18import Data.ByteString.Internal as B
19import Data.ByteString.Char8 as B8
20import Data.Text (Text)
21import qualified Data.Text as T
22import qualified Data.Text.Encoding as TE
23import Data.Text.Encoding.Error ()
24import Data.Text.Internal (Text, textP)
25import Foreign.C.Types (CSize)
26import Foreign.ForeignPtr (withForeignPtr)
27import Foreign.Marshal.Utils (with)
28import Foreign.Ptr (Ptr, minusPtr, nullPtr, plusPtr)
29import Foreign.Storable (Storable, peek, poke)
30import GHC.Base (Char(..), Int(..), MutableByteArray#, ord#, iShiftRA#)
31import GHC.Word (Word8, Word32)
32import qualified Data.Text.Array as A
33import Data.Word (Word8, Word16)
34import System.IO.Unsafe (unsafePerformIO)
35import qualified Control.Exception as Exc
36import Data.Bits ((.&.), (.|.), shiftL)
37import Data.Typeable
38import Control.Arrow (first)
39import Data.Maybe (catMaybes)
40#include "pipes_text_cbits.h"
41
42
43
44-- | A stream oriented decoding result.
45data Decoding = Some Text ByteString (ByteString -> Decoding)
46 | Other Text ByteString
47instance Show Decoding where
48 showsPrec d (Some t bs _) = showParen (d > prec) $
49 showString "Some " . showsPrec prec' t .
50 showChar ' ' . showsPrec prec' bs .
51 showString " _"
52 where prec = 10; prec' = prec + 1
53 showsPrec d (Other t bs) = showParen (d > prec) $
54 showString "Other " . showsPrec prec' t .
55 showChar ' ' . showsPrec prec' bs .
56 showString " _"
57 where prec = 10; prec' = prec + 1
58
59newtype CodePoint = CodePoint Word32 deriving (Eq, Show, Num, Storable)
60newtype DecoderState = DecoderState Word32 deriving (Eq, Show, Num, Storable)
61
62streamDecodeUtf8 :: ByteString -> Decoding
63streamDecodeUtf8 = decodeChunkUtf8 B.empty 0 0
64 where
65 decodeChunkUtf8 :: ByteString -> CodePoint -> DecoderState -> ByteString -> Decoding
66 decodeChunkUtf8 old codepoint0 state0 bs@(PS fp off len) =
67 runST $ do marray <- A.new (len+1)
68 unsafeIOToST (decodeChunkToBuffer marray)
69 where
70 decodeChunkToBuffer :: A.MArray s -> IO Decoding
71 decodeChunkToBuffer dest = withForeignPtr fp $ \ptr ->
72 with (0::CSize) $ \destOffPtr ->
73 with codepoint0 $ \codepointPtr ->
74 with state0 $ \statePtr ->
75 with nullPtr $ \curPtrPtr ->
76 do let end = ptr `plusPtr` (off + len)
77 curPtr = ptr `plusPtr` off
78 poke curPtrPtr curPtr
79 c_decode_utf8_with_state (A.maBA dest) destOffPtr curPtrPtr end codepointPtr statePtr
80 state <- peek statePtr
81 lastPtr <- peek curPtrPtr
82 codepoint <- peek codepointPtr
83 n <- peek destOffPtr
84 chunkText <- mkText dest n
85 let left = lastPtr `minusPtr` curPtr
86 remaining = B.drop left bs
87 accum = if T.null chunkText then B.append old remaining else remaining
88 return $! case state of
89 UTF8_REJECT -> Other chunkText accum -- We encountered an encoding error
90 _ -> Some chunkText accum (decodeChunkUtf8 accum codepoint state)
91 {-# INLINE decodeChunkToBuffer #-}
92 {-# INLINE decodeChunkUtf8 #-}
93{-# INLINE streamDecodeUtf8 #-}
94
95decodeSomeUtf8 :: ByteString -> (Text, ByteString)
96decodeSomeUtf8 bs@(PS fp off len) = runST $ do
97 dest <- A.new (len+1)
98 unsafeIOToST $
99 withForeignPtr fp $ \ptr ->
100 with (0::CSize) $ \destOffPtr ->
101 with (0::CodePoint) $ \codepointPtr ->
102 with (0::DecoderState) $ \statePtr ->
103 with nullPtr $ \curPtrPtr ->
104 do let end = ptr `plusPtr` (off + len)
105 curPtr = ptr `plusPtr` off
106 poke curPtrPtr curPtr
107 c_decode_utf8_with_state (A.maBA dest) destOffPtr
108 curPtrPtr end codepointPtr statePtr
109 state <- peek statePtr
110 lastPtr <- peek curPtrPtr
111 codepoint <- peek codepointPtr
112 n <- peek destOffPtr
113 chunkText <- unsafeSTToIO $ do arr <- A.unsafeFreeze dest
114 return $! textP arr 0 (fromIntegral n)
115 let left = lastPtr `minusPtr` curPtr
116 remaining = B.drop left bs
117 return $! (chunkText, remaining)
118{-# INLINE decodeSomeUtf8 #-}
119
120mkText :: A.MArray s -> CSize -> IO Text
121mkText dest n = unsafeSTToIO $ do arr <- A.unsafeFreeze dest
122 return $! textP arr 0 (fromIntegral n)
123{-# INLINE mkText #-}
124
125ord :: Char -> Int
126ord (C# c#) = I# (ord# c#)
127{-# INLINE ord #-}
128
129unsafeWrite :: A.MArray s -> Int -> Char -> ST s Int
130unsafeWrite marr i c
131 | n < 0x10000 = do A.unsafeWrite marr i (fromIntegral n)
132 return 1
133 | otherwise = do A.unsafeWrite marr i lo
134 A.unsafeWrite marr (i+1) hi
135 return 2
136 where n = ord c
137 m = n - 0x10000
138 lo = fromIntegral $ (m `shiftR` 10) + 0xD800
139 hi = fromIntegral $ (m .&. 0x3FF) + 0xDC00
140 shiftR (I# x#) (I# i#) = I# (x# `iShiftRA#` i#)
141 {-# INLINE shiftR #-}
142{-# INLINE unsafeWrite #-}
143
144foreign import ccall unsafe "_hs_pipes_text_decode_utf8_state" c_decode_utf8_with_state
145 :: MutableByteArray# s -> Ptr CSize
146 -> Ptr (Ptr Word8) -> Ptr Word8
147 -> Ptr CodePoint -> Ptr DecoderState -> IO (Ptr Word8) \ No newline at end of file
diff --git a/pipes-text.cabal b/pipes-text.cabal
index 44bc551..fda6f59 100644
--- a/pipes-text.cabal
+++ b/pipes-text.cabal
@@ -1,20 +1,35 @@
1name: pipes-text 1name: pipes-text
2version: 0.1.0.0 2version: 0.1.0.0
3synopsis: Text pipes. 3synopsis: Text pipes.
4description: Text pipes. 4description: Most of the pipes and other operations defined in `pipes-text`
5 closely mirrors the `pipes-bytestring` library, simply adjusting for
6 the difference between strict `ByteString` and strict `Text`, and between
7 `Word8` and `Char`. To this core are added some simple functions
8 some akin to the `String` operations in `Pipes.Prelude`, some like the
9 utilities in `Data.Text`.
10
11 All of the `IO` operations defined here - e.g `readFile`, `stdout` etc.
12 - are conveniences akin to those in `Data.Text.IO` which e.g. try to
13 find the system encoding and use the exceptions defined in the `text`
14 library. Proper `IO` in the sense of this library will employ
15 `pipes-bytestring` in conjuntion with 'pure' operations like
16 `decodeUtf8` and `encodeUtf8` that are defined here.
17
5homepage: github.com/michaelt/text-pipes 18homepage: github.com/michaelt/text-pipes
6license: BSD3 19license: BSD3
7license-file: LICENSE 20license-file: LICENSE
8author: michaelt 21author: Michael Thompson
9maintainer: what_is_it_to_do_anything@yahoo.com 22maintainer: what_is_it_to_do_anything@yahoo.com
10category: Text 23category: Text, Pipes
11build-type: Simple 24build-type: Simple
12cabal-version: >=1.10 25cabal-version: >=1.10
26extra-source-files: README.md
27 include/*.h
13 28
14library 29library
15 c-sources: cbits/cbits.c 30 c-sources: cbits/cbits.c
16 include-dirs: include 31 include-dirs: include
17 exposed-modules: Pipes.Text, Pipes.Text.Internal, Pipes.Text.Codec 32 exposed-modules: Pipes.Text, Pipes.Text.Internal.Decoding, Pipes.Text.Internal.Codec
18 -- other-modules: 33 -- other-modules:
19 other-extensions: RankNTypes 34 other-extensions: RankNTypes
20 build-depends: base >= 4 && < 5 , 35 build-depends: base >= 4 && < 5 ,