]> git.immae.eu Git - github/fretlink/text-pipes.git/blame - Pipes/Text/Codec.hs
moved enumerator/conduit Codec business to its own module
[github/fretlink/text-pipes.git] / Pipes / Text / Codec.hs
CommitLineData
4cbc92cc 1
2{-# LANGUAGE DeriveDataTypeable, RankNTypes #-}
3-- |
4-- Copyright: 2014 Michael Thompson, 2011 Michael Snoyman, 2010-2011 John Millikin
5-- License: MIT
6--
7-- Handle streams of text.
8--
9-- Parts of this code were taken from enumerator and conduits, and adapted for pipes.
10
11module Pipes.Text.Codec
12 ( Decoding(..)
13 , streamDecodeUtf8
14 , decodeSomeUtf8
15 , Codec(..)
16 , TextException(..)
17 , utf8
18 ) where
19
20import Data.Bits ((.&.))
21import Data.Char (ord)
22import Data.ByteString as B
23import Data.ByteString (ByteString)
24import Data.ByteString.Internal as B
25import Data.ByteString.Char8 as B8
26import Data.Text (Text)
27import qualified Data.Text as T
28import qualified Data.Text.Encoding as TE
29import Data.Text.Encoding.Error ()
30import GHC.Word (Word8, Word32)
31import qualified Data.Text.Array as A
32import Data.Word (Word8, Word16)
33import System.IO.Unsafe (unsafePerformIO)
34import qualified Control.Exception as Exc
35import Data.Bits ((.&.), (.|.), shiftL)
36import Data.Typeable
37import Control.Arrow (first)
38import Data.Maybe (catMaybes)
39import Pipes.Text.Internal
40
41-- | A specific character encoding.
42--
43-- Since 0.3.0
44data Codec = Codec
45 { codecName :: Text
46 , codecEncode :: Text -> (ByteString, Maybe (TextException, Text))
47 , codecDecode :: ByteString -> Decoding -- (Text, Either (TextException, ByteString) ByteString)
48 }
49
50instance Show Codec where
51 showsPrec d c = showParen (d > 10) $
52 showString "Codec " . shows (codecName c)
53
54data TextException = DecodeException Codec Word8
55 | EncodeException Codec Char
56 | LengthExceeded Int
57 | TextException Exc.SomeException
58 deriving (Show, Typeable)
59instance Exc.Exception TextException
60
61
62toDecoding :: (ByteString -> (Text, Either (TextException, ByteString) ByteString))
63 -> (ByteString -> Decoding)
64toDecoding op = loop B.empty where
65 loop extra bs0 = case op (B.append extra bs0) of
66 (txt, Right bs) -> Some txt bs (loop bs)
67 (txt, Left (_,bs)) -> Other txt bs
68
69
70splitSlowly :: (ByteString -> Text)
71 -> ByteString
72 -> (Text, Either (TextException, ByteString) ByteString)
73splitSlowly dec bytes = valid where
74 valid:_ = catMaybes $ Prelude.map decFirst $ splits (B.length bytes)
75 splits 0 = [(B.empty, bytes)]
76 splits n = B.splitAt n bytes : splits (n - 1)
77 decFirst (a, b) = case tryEvaluate (dec a) of
78 Left _ -> Nothing
79 Right text -> let trouble = case tryEvaluate (dec b) of
80 Left exc -> Left (TextException exc, b)
81 Right _ -> Right B.empty
82 in Just (text, trouble) -- this case shouldn't occur,
83 -- since splitSlowly is only called
84 -- when parsing failed somewhere
85
86utf8 :: Codec
87utf8 = Codec name enc (toDecoding dec) where
88 name = T.pack "UTF-8"
89 enc text = (TE.encodeUtf8 text, Nothing)
90 dec bytes = case decodeSomeUtf8 bytes of
91 (t,b) -> (t, Right b)
92
93-- -- Whether the given byte is a continuation byte.
94-- isContinuation byte = byte .&. 0xC0 == 0x80
95--
96-- -- The number of continuation bytes needed by the given
97-- -- non-continuation byte. Returns -1 for an illegal UTF-8
98-- -- non-continuation byte and the whole split quickly must fail so
99-- -- as the input is passed to TE.decodeUtf8, which will issue a
100-- -- suitable error.
101-- required x0
102-- | x0 .&. 0x80 == 0x00 = 0
103-- | x0 .&. 0xE0 == 0xC0 = 1
104-- | x0 .&. 0xF0 == 0xE0 = 2
105-- | x0 .&. 0xF8 == 0xF0 = 3
106-- | otherwise = -1
107--
108-- splitQuickly bytes
109-- | B.null l || req == -1 = Nothing
110-- | req == B.length r = Just (TE.decodeUtf8 bytes, B.empty)
111-- | otherwise = Just (TE.decodeUtf8 l', r')
112-- where
113-- (l, r) = B.spanEnd isContinuation bytes
114-- req = required (B.last l)
115-- l' = B.init l
116-- r' = B.cons (B.last l) r
117
118
119utf16_le :: Codec
120utf16_le = Codec name enc (toDecoding dec) where
121 name = T.pack "UTF-16-LE"
122 enc text = (TE.encodeUtf16LE text, Nothing)
123 dec bytes = case splitQuickly bytes of
124 Just (text, extra) -> (text, Right extra)
125 Nothing -> splitSlowly TE.decodeUtf16LE bytes
126
127 splitQuickly bytes = maybeDecode (loop 0) where
128 maxN = B.length bytes
129
130 loop n | n == maxN = decodeAll
131 | (n + 1) == maxN = decodeTo n
132 loop n = let
133 req = utf16Required
134 (B.index bytes n)
135 (B.index bytes (n + 1))
136 decodeMore = loop $! n + req
137 in if n + req > maxN
138 then decodeTo n
139 else decodeMore
140
141 decodeTo n = first TE.decodeUtf16LE (B.splitAt n bytes)
142 decodeAll = (TE.decodeUtf16LE bytes, B.empty)
143
144utf16_be :: Codec
145utf16_be = Codec name enc (toDecoding dec) where
146 name = T.pack "UTF-16-BE"
147 enc text = (TE.encodeUtf16BE text, Nothing)
148 dec bytes = case splitQuickly bytes of
149 Just (text, extra) -> (text, Right extra)
150 Nothing -> splitSlowly TE.decodeUtf16BE bytes
151
152 splitQuickly bytes = maybeDecode (loop 0) where
153 maxN = B.length bytes
154
155 loop n | n == maxN = decodeAll
156 | (n + 1) == maxN = decodeTo n
157 loop n = let
158 req = utf16Required
159 (B.index bytes (n + 1))
160 (B.index bytes n)
161 decodeMore = loop $! n + req
162 in if n + req > maxN
163 then decodeTo n
164 else decodeMore
165
166 decodeTo n = first TE.decodeUtf16BE (B.splitAt n bytes)
167 decodeAll = (TE.decodeUtf16BE bytes, B.empty)
168
169utf16Required :: Word8 -> Word8 -> Int
170utf16Required x0 x1 = if x >= 0xD800 && x <= 0xDBFF then 4 else 2 where
171 x :: Word16
172 x = (fromIntegral x1 `shiftL` 8) .|. fromIntegral x0
173
174
175utf32_le :: Codec
176utf32_le = Codec name enc (toDecoding dec) where
177 name = T.pack "UTF-32-LE"
178 enc text = (TE.encodeUtf32LE text, Nothing)
179 dec bs = case utf32SplitBytes TE.decodeUtf32LE bs of
180 Just (text, extra) -> (text, Right extra)
181 Nothing -> splitSlowly TE.decodeUtf32LE bs
182
183
184utf32_be :: Codec
185utf32_be = Codec name enc (toDecoding dec) where
186 name = T.pack "UTF-32-BE"
187 enc text = (TE.encodeUtf32BE text, Nothing)
188 dec bs = case utf32SplitBytes TE.decodeUtf32BE bs of
189 Just (text, extra) -> (text, Right extra)
190 Nothing -> splitSlowly TE.decodeUtf32BE bs
191
192utf32SplitBytes :: (ByteString -> Text)
193 -> ByteString
194 -> Maybe (Text, ByteString)
195utf32SplitBytes dec bytes = split where
196 split = maybeDecode (dec toDecode, extra)
197 len = B.length bytes
198 lenExtra = mod len 4
199
200 lenToDecode = len - lenExtra
201 (toDecode, extra) = if lenExtra == 0
202 then (bytes, B.empty)
203 else B.splitAt lenToDecode bytes
204
205ascii :: Codec
206ascii = Codec name enc (toDecoding dec) where
207 name = T.pack "ASCII"
208 enc text = (bytes, extra) where
209 (safe, unsafe) = T.span (\c -> ord c <= 0x7F) text
210 bytes = B8.pack (T.unpack safe)
211 extra = if T.null unsafe
212 then Nothing
213 else Just (EncodeException ascii (T.head unsafe), unsafe)
214
215 dec bytes = (text, extra) where
216 (safe, unsafe) = B.span (<= 0x7F) bytes
217 text = T.pack (B8.unpack safe)
218 extra = if B.null unsafe
219 then Right B.empty
220 else Left (DecodeException ascii (B.head unsafe), unsafe)
221
222iso8859_1 :: Codec
223iso8859_1 = Codec name enc (toDecoding dec) where
224 name = T.pack "ISO-8859-1"
225 enc text = (bytes, extra) where
226 (safe, unsafe) = T.span (\c -> ord c <= 0xFF) text
227 bytes = B8.pack (T.unpack safe)
228 extra = if T.null unsafe
229 then Nothing
230 else Just (EncodeException iso8859_1 (T.head unsafe), unsafe)
231
232 dec bytes = (T.pack (B8.unpack bytes), Right B.empty)
233
234tryEvaluate :: a -> Either Exc.SomeException a
235tryEvaluate = unsafePerformIO . Exc.try . Exc.evaluate
236
237maybeDecode :: (a, b) -> Maybe (a, b)
238maybeDecode (a, b) = case tryEvaluate a of
239 Left _ -> Nothing
240 Right _ -> Just (a, b)