]> git.immae.eu Git - github/fretlink/text-pipes.git/blob - Pipes/Text/Codec.hs
finished draft of codec lens, ascii and iso8859 are not codecs, so they are given...
[github/fretlink/text-pipes.git] / Pipes / Text / Codec.hs
1
2 {-# LANGUAGE DeriveDataTypeable, RankNTypes, BangPatterns #-}
3 -- |
4 -- Copyright: 2014 Michael Thompson, 2011 Michael Snoyman, 2010-2011 John Millikin
5 -- License: MIT
6 --
7 -- Parts of this code were taken from enumerator and conduits, and adapted for pipes.
8
9 module Pipes.Text.Codec
10 ( Decoding(..)
11 , streamDecodeUtf8
12 , decodeSomeUtf8
13 , Codec(..)
14 , TextException(..)
15 , utf8
16 , utf16_le
17 , utf16_be
18 , utf32_le
19 , utf32_be
20 ) where
21
22 import Data.Bits ((.&.))
23 import Data.Char (ord)
24 import Data.ByteString as B
25 import Data.ByteString (ByteString)
26 import Data.ByteString.Internal as B
27 import Data.ByteString.Char8 as B8
28 import Data.Text (Text)
29 import qualified Data.Text as T
30 import qualified Data.Text.Encoding as TE
31 import Data.Text.Encoding.Error ()
32 import GHC.Word (Word8, Word32)
33 import qualified Data.Text.Array as A
34 import Data.Word (Word8, Word16)
35 import System.IO.Unsafe (unsafePerformIO)
36 import qualified Control.Exception as Exc
37 import Data.Bits ((.&.), (.|.), shiftL)
38 import Data.Typeable
39 import Control.Arrow (first)
40 import Data.Maybe (catMaybes)
41 import Pipes.Text.Internal
42 import Pipes
43 -- | A specific character encoding.
44 --
45 -- Since 0.3.0
46 data Codec = Codec
47 { codecName :: Text
48 , codecEncode :: Text -> (ByteString, Maybe (TextException, Text))
49 , codecDecode :: ByteString -> Decoding -- (Text, Either (TextException, ByteString) ByteString)
50 }
51
52 instance Show Codec where
53 showsPrec d c = showParen (d > 10) $
54 showString "Codec " . shows (codecName c)
55
56 data TextException = DecodeException Codec Word8
57 | EncodeException Codec Char
58 | LengthExceeded Int
59 | TextException Exc.SomeException
60 deriving (Show, Typeable)
61 instance Exc.Exception TextException
62
63
64 toDecoding :: (ByteString -> (Text, Either (TextException, ByteString) ByteString))
65 -> (ByteString -> Decoding)
66 toDecoding op = loop B.empty where
67 loop !extra bs0 = case op (B.append extra bs0) of
68 (txt, Right bs) -> Some txt bs (loop bs)
69 (txt, Left (_,bs)) -> Other txt bs
70 -- To do: toDecoding should be inlined in each of the 'Codec' definitions
71 -- or else Codec changed to the conduit/enumerator definition. We have
72 -- altered it to use 'streamDecodeUtf8'
73
74 splitSlowly :: (ByteString -> Text)
75 -> ByteString
76 -> (Text, Either (TextException, ByteString) ByteString)
77 splitSlowly dec bytes = valid where
78 valid:_ = catMaybes $ Prelude.map decFirst $ splits (B.length bytes)
79 splits 0 = [(B.empty, bytes)]
80 splits n = B.splitAt n bytes : splits (n - 1)
81 decFirst (a, b) = case tryEvaluate (dec a) of
82 Left _ -> Nothing
83 Right text -> let trouble = case tryEvaluate (dec b) of
84 Left exc -> Left (TextException exc, b)
85 Right _ -> Right B.empty
86 in Just (text, trouble) -- this case shouldn't occur,
87 -- since splitSlowly is only called
88 -- when parsing failed somewhere
89
90 utf8 :: Codec
91 utf8 = Codec name enc (toDecoding dec) where
92 name = T.pack "UTF-8"
93 enc text = (TE.encodeUtf8 text, Nothing)
94 dec bytes = case decodeSomeUtf8 bytes of (t,b) -> (t, Right b)
95
96 -- -- Whether the given byte is a continuation byte.
97 -- isContinuation byte = byte .&. 0xC0 == 0x80
98 --
99 -- -- The number of continuation bytes needed by the given
100 -- -- non-continuation byte. Returns -1 for an illegal UTF-8
101 -- -- non-continuation byte and the whole split quickly must fail so
102 -- -- as the input is passed to TE.decodeUtf8, which will issue a
103 -- -- suitable error.
104 -- required x0
105 -- | x0 .&. 0x80 == 0x00 = 0
106 -- | x0 .&. 0xE0 == 0xC0 = 1
107 -- | x0 .&. 0xF0 == 0xE0 = 2
108 -- | x0 .&. 0xF8 == 0xF0 = 3
109 -- | otherwise = -1
110 --
111 -- splitQuickly bytes
112 -- | B.null l || req == -1 = Nothing
113 -- | req == B.length r = Just (TE.decodeUtf8 bytes, B.empty)
114 -- | otherwise = Just (TE.decodeUtf8 l', r')
115 -- where
116 -- (l, r) = B.spanEnd isContinuation bytes
117 -- req = required (B.last l)
118 -- l' = B.init l
119 -- r' = B.cons (B.last l) r
120
121
122 utf16_le :: Codec
123 utf16_le = Codec name enc (toDecoding dec) where
124 name = T.pack "UTF-16-LE"
125 enc text = (TE.encodeUtf16LE text, Nothing)
126 dec bytes = case splitQuickly bytes of
127 Just (text, extra) -> (text, Right extra)
128 Nothing -> splitSlowly TE.decodeUtf16LE bytes
129
130 splitQuickly bytes = maybeDecode (loop 0) where
131 maxN = B.length bytes
132
133 loop n | n == maxN = decodeAll
134 | (n + 1) == maxN = decodeTo n
135 loop n = let
136 req = utf16Required
137 (B.index bytes n)
138 (B.index bytes (n + 1))
139 decodeMore = loop $! n + req
140 in if n + req > maxN
141 then decodeTo n
142 else decodeMore
143
144 decodeTo n = first TE.decodeUtf16LE (B.splitAt n bytes)
145 decodeAll = (TE.decodeUtf16LE bytes, B.empty)
146
147 utf16_be :: Codec
148 utf16_be = Codec name enc (toDecoding dec) where
149 name = T.pack "UTF-16-BE"
150 enc text = (TE.encodeUtf16BE text, Nothing)
151 dec bytes = case splitQuickly bytes of
152 Just (text, extra) -> (text, Right extra)
153 Nothing -> splitSlowly TE.decodeUtf16BE bytes
154
155 splitQuickly bytes = maybeDecode (loop 0) where
156 maxN = B.length bytes
157
158 loop n | n == maxN = decodeAll
159 | (n + 1) == maxN = decodeTo n
160 loop n = let
161 req = utf16Required
162 (B.index bytes (n + 1))
163 (B.index bytes n)
164 decodeMore = loop $! n + req
165 in if n + req > maxN
166 then decodeTo n
167 else decodeMore
168
169 decodeTo n = first TE.decodeUtf16BE (B.splitAt n bytes)
170 decodeAll = (TE.decodeUtf16BE bytes, B.empty)
171
172 utf16Required :: Word8 -> Word8 -> Int
173 utf16Required x0 x1 = if x >= 0xD800 && x <= 0xDBFF then 4 else 2 where
174 x :: Word16
175 x = (fromIntegral x1 `shiftL` 8) .|. fromIntegral x0
176
177
178 utf32_le :: Codec
179 utf32_le = Codec name enc (toDecoding dec) where
180 name = T.pack "UTF-32-LE"
181 enc text = (TE.encodeUtf32LE text, Nothing)
182 dec bs = case utf32SplitBytes TE.decodeUtf32LE bs of
183 Just (text, extra) -> (text, Right extra)
184 Nothing -> splitSlowly TE.decodeUtf32LE bs
185
186
187 utf32_be :: Codec
188 utf32_be = Codec name enc (toDecoding dec) where
189 name = T.pack "UTF-32-BE"
190 enc text = (TE.encodeUtf32BE text, Nothing)
191 dec bs = case utf32SplitBytes TE.decodeUtf32BE bs of
192 Just (text, extra) -> (text, Right extra)
193 Nothing -> splitSlowly TE.decodeUtf32BE bs
194
195 utf32SplitBytes :: (ByteString -> Text)
196 -> ByteString
197 -> Maybe (Text, ByteString)
198 utf32SplitBytes dec bytes = split where
199 split = maybeDecode (dec toDecode, extra)
200 len = B.length bytes
201 lenExtra = mod len 4
202
203 lenToDecode = len - lenExtra
204 (toDecode, extra) = if lenExtra == 0
205 then (bytes, B.empty)
206 else B.splitAt lenToDecode bytes
207
208
209 tryEvaluate :: a -> Either Exc.SomeException a
210 tryEvaluate = unsafePerformIO . Exc.try . Exc.evaluate
211
212 maybeDecode :: (a, b) -> Maybe (a, b)
213 maybeDecode (a, b) = case tryEvaluate a of
214 Left _ -> Nothing
215 Right _ -> Just (a, b)