aboutsummaryrefslogtreecommitdiffhomepage
path: root/Pipes/Text
diff options
context:
space:
mode:
Diffstat (limited to 'Pipes/Text')
-rw-r--r--Pipes/Text/Codec.hs240
-rw-r--r--Pipes/Text/Internal.hs212
2 files changed, 240 insertions, 212 deletions
diff --git a/Pipes/Text/Codec.hs b/Pipes/Text/Codec.hs
new file mode 100644
index 0000000..e4357b9
--- /dev/null
+++ b/Pipes/Text/Codec.hs
@@ -0,0 +1,240 @@
1
2{-# LANGUAGE DeriveDataTypeable, RankNTypes #-}
3-- |
4-- Copyright: 2014 Michael Thompson, 2011 Michael Snoyman, 2010-2011 John Millikin
5-- License: MIT
6--
7-- Handle streams of text.
8--
9-- Parts of this code were taken from enumerator and conduits, and adapted for pipes.
10
11module Pipes.Text.Codec
12 ( Decoding(..)
13 , streamDecodeUtf8
14 , decodeSomeUtf8
15 , Codec(..)
16 , TextException(..)
17 , utf8
18 ) where
19
20import Data.Bits ((.&.))
21import Data.Char (ord)
22import Data.ByteString as B
23import Data.ByteString (ByteString)
24import Data.ByteString.Internal as B
25import Data.ByteString.Char8 as B8
26import Data.Text (Text)
27import qualified Data.Text as T
28import qualified Data.Text.Encoding as TE
29import Data.Text.Encoding.Error ()
30import GHC.Word (Word8, Word32)
31import qualified Data.Text.Array as A
32import Data.Word (Word8, Word16)
33import System.IO.Unsafe (unsafePerformIO)
34import qualified Control.Exception as Exc
35import Data.Bits ((.&.), (.|.), shiftL)
36import Data.Typeable
37import Control.Arrow (first)
38import Data.Maybe (catMaybes)
39import Pipes.Text.Internal
40
41-- | A specific character encoding.
42--
43-- Since 0.3.0
44data Codec = Codec
45 { codecName :: Text
46 , codecEncode :: Text -> (ByteString, Maybe (TextException, Text))
47 , codecDecode :: ByteString -> Decoding -- (Text, Either (TextException, ByteString) ByteString)
48 }
49
50instance Show Codec where
51 showsPrec d c = showParen (d > 10) $
52 showString "Codec " . shows (codecName c)
53
54data TextException = DecodeException Codec Word8
55 | EncodeException Codec Char
56 | LengthExceeded Int
57 | TextException Exc.SomeException
58 deriving (Show, Typeable)
59instance Exc.Exception TextException
60
61
62toDecoding :: (ByteString -> (Text, Either (TextException, ByteString) ByteString))
63 -> (ByteString -> Decoding)
64toDecoding op = loop B.empty where
65 loop extra bs0 = case op (B.append extra bs0) of
66 (txt, Right bs) -> Some txt bs (loop bs)
67 (txt, Left (_,bs)) -> Other txt bs
68
69
70splitSlowly :: (ByteString -> Text)
71 -> ByteString
72 -> (Text, Either (TextException, ByteString) ByteString)
73splitSlowly dec bytes = valid where
74 valid:_ = catMaybes $ Prelude.map decFirst $ splits (B.length bytes)
75 splits 0 = [(B.empty, bytes)]
76 splits n = B.splitAt n bytes : splits (n - 1)
77 decFirst (a, b) = case tryEvaluate (dec a) of
78 Left _ -> Nothing
79 Right text -> let trouble = case tryEvaluate (dec b) of
80 Left exc -> Left (TextException exc, b)
81 Right _ -> Right B.empty
82 in Just (text, trouble) -- this case shouldn't occur,
83 -- since splitSlowly is only called
84 -- when parsing failed somewhere
85
86utf8 :: Codec
87utf8 = Codec name enc (toDecoding dec) where
88 name = T.pack "UTF-8"
89 enc text = (TE.encodeUtf8 text, Nothing)
90 dec bytes = case decodeSomeUtf8 bytes of
91 (t,b) -> (t, Right b)
92
93-- -- Whether the given byte is a continuation byte.
94-- isContinuation byte = byte .&. 0xC0 == 0x80
95--
96-- -- The number of continuation bytes needed by the given
97-- -- non-continuation byte. Returns -1 for an illegal UTF-8
98-- -- non-continuation byte and the whole split quickly must fail so
99-- -- as the input is passed to TE.decodeUtf8, which will issue a
100-- -- suitable error.
101-- required x0
102-- | x0 .&. 0x80 == 0x00 = 0
103-- | x0 .&. 0xE0 == 0xC0 = 1
104-- | x0 .&. 0xF0 == 0xE0 = 2
105-- | x0 .&. 0xF8 == 0xF0 = 3
106-- | otherwise = -1
107--
108-- splitQuickly bytes
109-- | B.null l || req == -1 = Nothing
110-- | req == B.length r = Just (TE.decodeUtf8 bytes, B.empty)
111-- | otherwise = Just (TE.decodeUtf8 l', r')
112-- where
113-- (l, r) = B.spanEnd isContinuation bytes
114-- req = required (B.last l)
115-- l' = B.init l
116-- r' = B.cons (B.last l) r
117
118
119utf16_le :: Codec
120utf16_le = Codec name enc (toDecoding dec) where
121 name = T.pack "UTF-16-LE"
122 enc text = (TE.encodeUtf16LE text, Nothing)
123 dec bytes = case splitQuickly bytes of
124 Just (text, extra) -> (text, Right extra)
125 Nothing -> splitSlowly TE.decodeUtf16LE bytes
126
127 splitQuickly bytes = maybeDecode (loop 0) where
128 maxN = B.length bytes
129
130 loop n | n == maxN = decodeAll
131 | (n + 1) == maxN = decodeTo n
132 loop n = let
133 req = utf16Required
134 (B.index bytes n)
135 (B.index bytes (n + 1))
136 decodeMore = loop $! n + req
137 in if n + req > maxN
138 then decodeTo n
139 else decodeMore
140
141 decodeTo n = first TE.decodeUtf16LE (B.splitAt n bytes)
142 decodeAll = (TE.decodeUtf16LE bytes, B.empty)
143
144utf16_be :: Codec
145utf16_be = Codec name enc (toDecoding dec) where
146 name = T.pack "UTF-16-BE"
147 enc text = (TE.encodeUtf16BE text, Nothing)
148 dec bytes = case splitQuickly bytes of
149 Just (text, extra) -> (text, Right extra)
150 Nothing -> splitSlowly TE.decodeUtf16BE bytes
151
152 splitQuickly bytes = maybeDecode (loop 0) where
153 maxN = B.length bytes
154
155 loop n | n == maxN = decodeAll
156 | (n + 1) == maxN = decodeTo n
157 loop n = let
158 req = utf16Required
159 (B.index bytes (n + 1))
160 (B.index bytes n)
161 decodeMore = loop $! n + req
162 in if n + req > maxN
163 then decodeTo n
164 else decodeMore
165
166 decodeTo n = first TE.decodeUtf16BE (B.splitAt n bytes)
167 decodeAll = (TE.decodeUtf16BE bytes, B.empty)
168
169utf16Required :: Word8 -> Word8 -> Int
170utf16Required x0 x1 = if x >= 0xD800 && x <= 0xDBFF then 4 else 2 where
171 x :: Word16
172 x = (fromIntegral x1 `shiftL` 8) .|. fromIntegral x0
173
174
175utf32_le :: Codec
176utf32_le = Codec name enc (toDecoding dec) where
177 name = T.pack "UTF-32-LE"
178 enc text = (TE.encodeUtf32LE text, Nothing)
179 dec bs = case utf32SplitBytes TE.decodeUtf32LE bs of
180 Just (text, extra) -> (text, Right extra)
181 Nothing -> splitSlowly TE.decodeUtf32LE bs
182
183
184utf32_be :: Codec
185utf32_be = Codec name enc (toDecoding dec) where
186 name = T.pack "UTF-32-BE"
187 enc text = (TE.encodeUtf32BE text, Nothing)
188 dec bs = case utf32SplitBytes TE.decodeUtf32BE bs of
189 Just (text, extra) -> (text, Right extra)
190 Nothing -> splitSlowly TE.decodeUtf32BE bs
191
192utf32SplitBytes :: (ByteString -> Text)
193 -> ByteString
194 -> Maybe (Text, ByteString)
195utf32SplitBytes dec bytes = split where
196 split = maybeDecode (dec toDecode, extra)
197 len = B.length bytes
198 lenExtra = mod len 4
199
200 lenToDecode = len - lenExtra
201 (toDecode, extra) = if lenExtra == 0
202 then (bytes, B.empty)
203 else B.splitAt lenToDecode bytes
204
205ascii :: Codec
206ascii = Codec name enc (toDecoding dec) where
207 name = T.pack "ASCII"
208 enc text = (bytes, extra) where
209 (safe, unsafe) = T.span (\c -> ord c <= 0x7F) text
210 bytes = B8.pack (T.unpack safe)
211 extra = if T.null unsafe
212 then Nothing
213 else Just (EncodeException ascii (T.head unsafe), unsafe)
214
215 dec bytes = (text, extra) where
216 (safe, unsafe) = B.span (<= 0x7F) bytes
217 text = T.pack (B8.unpack safe)
218 extra = if B.null unsafe
219 then Right B.empty
220 else Left (DecodeException ascii (B.head unsafe), unsafe)
221
222iso8859_1 :: Codec
223iso8859_1 = Codec name enc (toDecoding dec) where
224 name = T.pack "ISO-8859-1"
225 enc text = (bytes, extra) where
226 (safe, unsafe) = T.span (\c -> ord c <= 0xFF) text
227 bytes = B8.pack (T.unpack safe)
228 extra = if T.null unsafe
229 then Nothing
230 else Just (EncodeException iso8859_1 (T.head unsafe), unsafe)
231
232 dec bytes = (T.pack (B8.unpack bytes), Right B.empty)
233
234tryEvaluate :: a -> Either Exc.SomeException a
235tryEvaluate = unsafePerformIO . Exc.try . Exc.evaluate
236
237maybeDecode :: (a, b) -> Maybe (a, b)
238maybeDecode (a, b) = case tryEvaluate a of
239 Left _ -> Nothing
240 Right _ -> Just (a, b)
diff --git a/Pipes/Text/Internal.hs b/Pipes/Text/Internal.hs
index 76c2f4f..bcee278 100644
--- a/Pipes/Text/Internal.hs
+++ b/Pipes/Text/Internal.hs
@@ -9,9 +9,6 @@ module Pipes.Text.Internal
9 ( Decoding(..) 9 ( Decoding(..)
10 , streamDecodeUtf8 10 , streamDecodeUtf8
11 , decodeSomeUtf8 11 , decodeSomeUtf8
12 , Codec(..)
13 , TextException(..)
14 , utf8
15 ) where 12 ) where
16import Control.Monad.ST.Unsafe (unsafeIOToST, unsafeSTToIO) 13import Control.Monad.ST.Unsafe (unsafeIOToST, unsafeSTToIO)
17import Control.Monad.ST (ST, runST) 14import Control.Monad.ST (ST, runST)
@@ -43,215 +40,6 @@ import Data.Maybe (catMaybes)
43#include "pipes_text_cbits.h" 40#include "pipes_text_cbits.h"
44 41
45 42
46-- | A specific character encoding.
47--
48-- Since 0.3.0
49data Codec = Codec
50 { codecName :: Text
51 , codecEncode :: Text -> (ByteString, Maybe (TextException, Text))
52 , codecDecode :: ByteString -> Decoding -- (Text, Either (TextException, ByteString) ByteString)
53 }
54
55instance Show Codec where
56 showsPrec d c = showParen (d > 10) $ showString "Codec " . shows (codecName c)
57
58-- Since 0.3.0
59data TextException = DecodeException Codec Word8
60 | EncodeException Codec Char
61 | LengthExceeded Int
62 | TextException Exc.SomeException
63 deriving (Show, Typeable)
64instance Exc.Exception TextException
65
66toDecoding :: (ByteString -> (Text, Either (TextException, ByteString) ByteString))
67 -> (ByteString -> Decoding)
68toDecoding op = loop B.empty where
69 loop extra bs0 = case op (B.append extra bs0) of
70 (txt, Right bs) -> Some txt bs (loop bs)
71 (txt, Left (_,bs)) -> Other txt bs
72
73
74splitSlowly :: (ByteString -> Text)
75 -> ByteString
76 -> (Text, Either (TextException, ByteString) ByteString)
77splitSlowly dec bytes = valid where
78 valid:_ = catMaybes $ Prelude.map decFirst $ splits (B.length bytes)
79 splits 0 = [(B.empty, bytes)]
80 splits n = B.splitAt n bytes : splits (n - 1)
81 decFirst (a, b) = case tryEvaluate (dec a) of
82 Left _ -> Nothing
83 Right text -> let trouble = case tryEvaluate (dec b) of
84 Left exc -> Left (TextException exc, b)
85 Right _ -> Right B.empty
86 in Just (text, trouble)
87 -- this case shouldn't occur,
88 -- since splitSlowly is only called
89 -- when parsing failed somewhere
90
91utf8 :: Codec
92utf8 = Codec name enc (toDecoding dec) where
93 name = T.pack "UTF-8"
94 enc text = (TE.encodeUtf8 text, Nothing)
95 dec bytes = case decodeSomeUtf8 bytes of
96 (t,b) -> (t, Right b)
97
98-- -- Whether the given byte is a continuation byte.
99-- isContinuation byte = byte .&. 0xC0 == 0x80
100--
101-- -- The number of continuation bytes needed by the given
102-- -- non-continuation byte. Returns -1 for an illegal UTF-8
103-- -- non-continuation byte and the whole split quickly must fail so
104-- -- as the input is passed to TE.decodeUtf8, which will issue a
105-- -- suitable error.
106-- required x0
107-- | x0 .&. 0x80 == 0x00 = 0
108-- | x0 .&. 0xE0 == 0xC0 = 1
109-- | x0 .&. 0xF0 == 0xE0 = 2
110-- | x0 .&. 0xF8 == 0xF0 = 3
111-- | otherwise = -1
112--
113-- splitQuickly bytes
114-- | B.null l || req == -1 = Nothing
115-- | req == B.length r = Just (TE.decodeUtf8 bytes, B.empty)
116-- | otherwise = Just (TE.decodeUtf8 l', r')
117-- where
118-- (l, r) = B.spanEnd isContinuation bytes
119-- req = required (B.last l)
120-- l' = B.init l
121-- r' = B.cons (B.last l) r
122
123-- |
124-- Since 0.3.0
125utf16_le :: Codec
126utf16_le = Codec name enc (toDecoding dec) where
127 name = T.pack "UTF-16-LE"
128 enc text = (TE.encodeUtf16LE text, Nothing)
129 dec bytes = case splitQuickly bytes of
130 Just (text, extra) -> (text, Right extra)
131 Nothing -> splitSlowly TE.decodeUtf16LE bytes
132
133 splitQuickly bytes = maybeDecode (loop 0) where
134 maxN = B.length bytes
135
136 loop n | n == maxN = decodeAll
137 | (n + 1) == maxN = decodeTo n
138 loop n = let
139 req = utf16Required
140 (B.index bytes n)
141 (B.index bytes (n + 1))
142 decodeMore = loop $! n + req
143 in if n + req > maxN
144 then decodeTo n
145 else decodeMore
146
147 decodeTo n = first TE.decodeUtf16LE (B.splitAt n bytes)
148 decodeAll = (TE.decodeUtf16LE bytes, B.empty)
149
150-- |
151-- Since 0.3.0
152utf16_be :: Codec
153utf16_be = Codec name enc (toDecoding dec) where
154 name = T.pack "UTF-16-BE"
155 enc text = (TE.encodeUtf16BE text, Nothing)
156 dec bytes = case splitQuickly bytes of
157 Just (text, extra) -> (text, Right extra)
158 Nothing -> splitSlowly TE.decodeUtf16BE bytes
159
160 splitQuickly bytes = maybeDecode (loop 0) where
161 maxN = B.length bytes
162
163 loop n | n == maxN = decodeAll
164 | (n + 1) == maxN = decodeTo n
165 loop n = let
166 req = utf16Required
167 (B.index bytes (n + 1))
168 (B.index bytes n)
169 decodeMore = loop $! n + req
170 in if n + req > maxN
171 then decodeTo n
172 else decodeMore
173
174 decodeTo n = first TE.decodeUtf16BE (B.splitAt n bytes)
175 decodeAll = (TE.decodeUtf16BE bytes, B.empty)
176
177utf16Required :: Word8 -> Word8 -> Int
178utf16Required x0 x1 = if x >= 0xD800 && x <= 0xDBFF then 4 else 2 where
179 x :: Word16
180 x = (fromIntegral x1 `shiftL` 8) .|. fromIntegral x0
181
182-- |
183-- Since 0.3.0
184utf32_le :: Codec
185utf32_le = Codec name enc (toDecoding dec) where
186 name = T.pack "UTF-32-LE"
187 enc text = (TE.encodeUtf32LE text, Nothing)
188 dec bs = case utf32SplitBytes TE.decodeUtf32LE bs of
189 Just (text, extra) -> (text, Right extra)
190 Nothing -> splitSlowly TE.decodeUtf32LE bs
191
192-- |
193-- Since 0.3.0
194utf32_be :: Codec
195utf32_be = Codec name enc (toDecoding dec) where
196 name = T.pack "UTF-32-BE"
197 enc text = (TE.encodeUtf32BE text, Nothing)
198 dec bs = case utf32SplitBytes TE.decodeUtf32BE bs of
199 Just (text, extra) -> (text, Right extra)
200 Nothing -> splitSlowly TE.decodeUtf32BE bs
201
202utf32SplitBytes :: (ByteString -> Text)
203 -> ByteString
204 -> Maybe (Text, ByteString)
205utf32SplitBytes dec bytes = split where
206 split = maybeDecode (dec toDecode, extra)
207 len = B.length bytes
208 lenExtra = mod len 4
209
210 lenToDecode = len - lenExtra
211 (toDecode, extra) = if lenExtra == 0
212 then (bytes, B.empty)
213 else B.splitAt lenToDecode bytes
214
215-- |
216-- Since 0.3.0
217ascii :: Codec
218ascii = Codec name enc (toDecoding dec) where
219 name = T.pack "ASCII"
220 enc text = (bytes, extra) where
221 (safe, unsafe) = T.span (\c -> ord c <= 0x7F) text
222 bytes = B8.pack (T.unpack safe)
223 extra = if T.null unsafe
224 then Nothing
225 else Just (EncodeException ascii (T.head unsafe), unsafe)
226
227 dec bytes = (text, extra) where
228 (safe, unsafe) = B.span (<= 0x7F) bytes
229 text = T.pack (B8.unpack safe)
230 extra = if B.null unsafe
231 then Right B.empty
232 else Left (DecodeException ascii (B.head unsafe), unsafe)
233
234-- |
235-- Since 0.3.0
236iso8859_1 :: Codec
237iso8859_1 = Codec name enc (toDecoding dec) where
238 name = T.pack "ISO-8859-1"
239 enc text = (bytes, extra) where
240 (safe, unsafe) = T.span (\c -> ord c <= 0xFF) text
241 bytes = B8.pack (T.unpack safe)
242 extra = if T.null unsafe
243 then Nothing
244 else Just (EncodeException iso8859_1 (T.head unsafe), unsafe)
245
246 dec bytes = (T.pack (B8.unpack bytes), Right B.empty)
247
248tryEvaluate :: a -> Either Exc.SomeException a
249tryEvaluate = unsafePerformIO . Exc.try . Exc.evaluate
250
251maybeDecode :: (a, b) -> Maybe (a, b)
252maybeDecode (a, b) = case tryEvaluate a of
253 Left _ -> Nothing
254 Right _ -> Just (a, b)
255 43
256-- | A stream oriented decoding result. 44-- | A stream oriented decoding result.
257data Decoding = Some Text ByteString (ByteString -> Decoding) 45data Decoding = Some Text ByteString (ByteString -> Decoding)