]> git.immae.eu Git - github/fretlink/text-pipes.git/blob - Pipes/Text/Internal/Codec.hs
cabal file more descriptive
[github/fretlink/text-pipes.git] / Pipes / Text / Internal / Codec.hs
1
2 {-# LANGUAGE DeriveDataTypeable, RankNTypes, BangPatterns #-}
3 -- |
4 -- Copyright: 2014 Michael Thompson, 2011 Michael Snoyman, 2010-2011 John Millikin
5 -- License: MIT
6 -- This Parts of this code were taken from enumerator and conduits, and adapted for pipes
7 {- | This module follows the model of the enumerator and conduits libraries, and defines
8 'Codec' s for various encodings. Note that we do not export a 'Codec' for ascii and
9 iso8859_1. A 'Lens' in the sense of the pipes library cannot be defined for these, so
10 special functions appear in @Pipes.Text@
11 -}
12
13 module Pipes.Text.Internal.Codec
14 ( Decoding(..)
15 , streamDecodeUtf8
16 , decodeSomeUtf8
17 , Codec(..)
18 , TextException(..)
19 , utf8
20 , utf16_le
21 , utf16_be
22 , utf32_le
23 , utf32_be
24 ) where
25
26 import Data.Bits ((.&.))
27 import Data.Char (ord)
28 import Data.ByteString as B
29 import Data.ByteString (ByteString)
30 import Data.ByteString.Internal as B
31 import Data.ByteString.Char8 as B8
32 import Data.Text (Text)
33 import qualified Data.Text as T
34 import qualified Data.Text.Encoding as TE
35 import Data.Text.Encoding.Error ()
36 import GHC.Word (Word8, Word32)
37 import qualified Data.Text.Array as A
38 import Data.Word (Word8, Word16)
39 import System.IO.Unsafe (unsafePerformIO)
40 import qualified Control.Exception as Exc
41 import Data.Bits ((.&.), (.|.), shiftL)
42 import Data.Typeable
43 import Control.Arrow (first)
44 import Data.Maybe (catMaybes)
45 import Pipes.Text.Internal.Decoding
46 import Pipes
47 -- | A specific character encoding.
48
49 data Codec = Codec
50 { codecName :: Text
51 , codecEncode :: Text -> (ByteString, Maybe (TextException, Text))
52 , codecDecode :: ByteString -> Decoding
53 }
54
55 instance Show Codec where
56 showsPrec d c = showParen (d > 10) $
57 showString "Codec " . shows (codecName c)
58
59 data TextException = DecodeException Codec Word8
60 | EncodeException Codec Char
61 | LengthExceeded Int
62 | TextException Exc.SomeException
63 deriving (Show, Typeable)
64 instance Exc.Exception TextException
65
66
67 toDecoding :: (ByteString -> (Text, Either (TextException, ByteString) ByteString))
68 -> (ByteString -> Decoding)
69 toDecoding op = loop B.empty where
70 loop !extra bs0 = case op (B.append extra bs0) of
71 (txt, Right bs) -> Some txt bs (loop bs)
72 (txt, Left (_,bs)) -> Other txt bs
73 -- To do: toDecoding should be inlined in each of the 'Codec' definitions
74 -- or else Codec changed to the conduit/enumerator definition. We have
75 -- altered it to use 'streamDecodeUtf8'
76
77 splitSlowly :: (ByteString -> Text)
78 -> ByteString
79 -> (Text, Either (TextException, ByteString) ByteString)
80 splitSlowly dec bytes = valid where
81 valid:_ = catMaybes $ Prelude.map decFirst $ splits (B.length bytes)
82 splits 0 = [(B.empty, bytes)]
83 splits n = B.splitAt n bytes : splits (n - 1)
84 decFirst (a, b) = case tryEvaluate (dec a) of
85 Left _ -> Nothing
86 Right text -> let trouble = case tryEvaluate (dec b) of
87 Left exc -> Left (TextException exc, b)
88 Right _ -> Right B.empty
89 in Just (text, trouble) -- this case shouldn't occur,
90 -- since splitSlowly is only called
91 -- when parsing failed somewhere
92
93 utf8 :: Codec
94 utf8 = Codec name enc (toDecoding dec) where
95 name = T.pack "UTF-8"
96 enc text = (TE.encodeUtf8 text, Nothing)
97 dec bytes = case decodeSomeUtf8 bytes of (t,b) -> (t, Right b)
98
99 -- -- Whether the given byte is a continuation byte.
100 -- isContinuation byte = byte .&. 0xC0 == 0x80
101 --
102 -- -- The number of continuation bytes needed by the given
103 -- -- non-continuation byte. Returns -1 for an illegal UTF-8
104 -- -- non-continuation byte and the whole split quickly must fail so
105 -- -- as the input is passed to TE.decodeUtf8, which will issue a
106 -- -- suitable error.
107 -- required x0
108 -- | x0 .&. 0x80 == 0x00 = 0
109 -- | x0 .&. 0xE0 == 0xC0 = 1
110 -- | x0 .&. 0xF0 == 0xE0 = 2
111 -- | x0 .&. 0xF8 == 0xF0 = 3
112 -- | otherwise = -1
113 --
114 -- splitQuickly bytes
115 -- | B.null l || req == -1 = Nothing
116 -- | req == B.length r = Just (TE.decodeUtf8 bytes, B.empty)
117 -- | otherwise = Just (TE.decodeUtf8 l', r')
118 -- where
119 -- (l, r) = B.spanEnd isContinuation bytes
120 -- req = required (B.last l)
121 -- l' = B.init l
122 -- r' = B.cons (B.last l) r
123
124
125 utf16_le :: Codec
126 utf16_le = Codec name enc (toDecoding dec) where
127 name = T.pack "UTF-16-LE"
128 enc text = (TE.encodeUtf16LE text, Nothing)
129 dec bytes = case splitQuickly bytes of
130 Just (text, extra) -> (text, Right extra)
131 Nothing -> splitSlowly TE.decodeUtf16LE bytes
132
133 splitQuickly bytes = maybeDecode (loop 0) where
134 maxN = B.length bytes
135
136 loop n | n == maxN = decodeAll
137 | (n + 1) == maxN = decodeTo n
138 loop n = let
139 req = utf16Required
140 (B.index bytes n)
141 (B.index bytes (n + 1))
142 decodeMore = loop $! n + req
143 in if n + req > maxN
144 then decodeTo n
145 else decodeMore
146
147 decodeTo n = first TE.decodeUtf16LE (B.splitAt n bytes)
148 decodeAll = (TE.decodeUtf16LE bytes, B.empty)
149
150 utf16_be :: Codec
151 utf16_be = Codec name enc (toDecoding dec) where
152 name = T.pack "UTF-16-BE"
153 enc text = (TE.encodeUtf16BE text, Nothing)
154 dec bytes = case splitQuickly bytes of
155 Just (text, extra) -> (text, Right extra)
156 Nothing -> splitSlowly TE.decodeUtf16BE bytes
157
158 splitQuickly bytes = maybeDecode (loop 0) where
159 maxN = B.length bytes
160
161 loop n | n == maxN = decodeAll
162 | (n + 1) == maxN = decodeTo n
163 loop n = let
164 req = utf16Required
165 (B.index bytes (n + 1))
166 (B.index bytes n)
167 decodeMore = loop $! n + req
168 in if n + req > maxN
169 then decodeTo n
170 else decodeMore
171
172 decodeTo n = first TE.decodeUtf16BE (B.splitAt n bytes)
173 decodeAll = (TE.decodeUtf16BE bytes, B.empty)
174
175 utf16Required :: Word8 -> Word8 -> Int
176 utf16Required x0 x1 = if x >= 0xD800 && x <= 0xDBFF then 4 else 2 where
177 x :: Word16
178 x = (fromIntegral x1 `shiftL` 8) .|. fromIntegral x0
179
180
181 utf32_le :: Codec
182 utf32_le = Codec name enc (toDecoding dec) where
183 name = T.pack "UTF-32-LE"
184 enc text = (TE.encodeUtf32LE text, Nothing)
185 dec bs = case utf32SplitBytes TE.decodeUtf32LE bs of
186 Just (text, extra) -> (text, Right extra)
187 Nothing -> splitSlowly TE.decodeUtf32LE bs
188
189
190 utf32_be :: Codec
191 utf32_be = Codec name enc (toDecoding dec) where
192 name = T.pack "UTF-32-BE"
193 enc text = (TE.encodeUtf32BE text, Nothing)
194 dec bs = case utf32SplitBytes TE.decodeUtf32BE bs of
195 Just (text, extra) -> (text, Right extra)
196 Nothing -> splitSlowly TE.decodeUtf32BE bs
197
198 utf32SplitBytes :: (ByteString -> Text)
199 -> ByteString
200 -> Maybe (Text, ByteString)
201 utf32SplitBytes dec bytes = split where
202 split = maybeDecode (dec toDecode, extra)
203 len = B.length bytes
204 lenExtra = mod len 4
205
206 lenToDecode = len - lenExtra
207 (toDecode, extra) = if lenExtra == 0
208 then (bytes, B.empty)
209 else B.splitAt lenToDecode bytes
210
211
212 tryEvaluate :: a -> Either Exc.SomeException a
213 tryEvaluate = unsafePerformIO . Exc.try . Exc.evaluate
214
215 maybeDecode :: (a, b) -> Maybe (a, b)
216 maybeDecode (a, b) = case tryEvaluate a of
217 Left _ -> Nothing
218 Right _ -> Just (a, b)