]> git.immae.eu Git - github/fretlink/text-pipes.git/blob - Pipes/Text/Internal/Codec.hs
075a1520795b64e8be713cc26723db071c0df5b6
[github/fretlink/text-pipes.git] / Pipes / Text / Internal / Codec.hs
1
2 {-# LANGUAGE DeriveDataTypeable, RankNTypes, BangPatterns #-}
3 -- |
4 -- Copyright: 2014 Michael Thompson, 2011 Michael Snoyman, 2010-2011 John Millikin
5 -- License: MIT
6 -- This Parts of this code were taken from enumerator and conduits, and adapted for pipes
7
8 -- This module follows the model of the enumerator and conduits libraries, and defines
9 -- 'Codec' s for various encodings. Note that we do not export a 'Codec' for ascii and
10 -- iso8859_1. A 'Lens' in the sense of the pipes library cannot be defined for these, so
11 -- special functions appear in @Pipes.Text@
12
13
14 module Pipes.Text.Internal.Codec
15 ( Codec(..)
16 , TextException(..)
17 , utf8
18 , utf16_le
19 , utf16_be
20 , utf32_le
21 , utf32_be
22 ) where
23
24 import Data.Bits ((.&.))
25 import Data.Char (ord)
26 import Data.ByteString as B
27 import Data.ByteString (ByteString)
28 import Data.ByteString.Internal as B
29 import Data.ByteString.Char8 as B8
30 import Data.Text (Text)
31 import qualified Data.Text as T
32 import qualified Data.Text.Encoding as TE
33 import Data.Text.Encoding.Error ()
34 import GHC.Word (Word8, Word32)
35 import qualified Data.Text.Array as A
36 import Data.Word (Word8, Word16)
37 import System.IO.Unsafe (unsafePerformIO)
38 import qualified Control.Exception as Exc
39 import Data.Bits ((.&.), (.|.), shiftL)
40 import Data.Typeable
41 import Control.Arrow (first)
42 import Data.Maybe (catMaybes)
43 import Pipes.Text.Internal.Decoding
44 import Pipes
45 -- | A specific character encoding.
46
47 data Codec = Codec
48 { codecName :: Text
49 , codecEncode :: Text -> (ByteString, Maybe (TextException, Text))
50 , codecDecode :: ByteString -> Decoding
51 }
52
53 instance Show Codec where
54 showsPrec d c = showParen (d > 10) $
55 showString "Codec " . shows (codecName c)
56
57 data TextException = DecodeException Codec Word8
58 | EncodeException Codec Char
59 | LengthExceeded Int
60 | TextException Exc.SomeException
61 deriving (Show, Typeable)
62 instance Exc.Exception TextException
63
64
65 toDecoding :: (ByteString -> (Text, Either (TextException, ByteString) ByteString))
66 -> (ByteString -> Decoding)
67 toDecoding op = loop B.empty where
68 loop !extra bs0 = case op (B.append extra bs0) of
69 (txt, Right bs) -> Some txt bs (loop bs)
70 (txt, Left (_,bs)) -> Other txt bs
71 -- To do: toDecoding should be inlined in each of the 'Codec' definitions
72 -- or else Codec changed to the conduit/enumerator definition. We have
73 -- altered it to use 'streamDecodeUtf8'
74
75 splitSlowly :: (ByteString -> Text)
76 -> ByteString
77 -> (Text, Either (TextException, ByteString) ByteString)
78 splitSlowly dec bytes = valid where
79 valid:_ = catMaybes $ Prelude.map decFirst $ splits (B.length bytes)
80 splits 0 = [(B.empty, bytes)]
81 splits n = B.splitAt n bytes : splits (n - 1)
82 decFirst (a, b) = case tryEvaluate (dec a) of
83 Left _ -> Nothing
84 Right text -> let trouble = case tryEvaluate (dec b) of
85 Left exc -> Left (TextException exc, b)
86 Right _ -> Right B.empty
87 in Just (text, trouble) -- this case shouldn't occur,
88 -- since splitSlowly is only called
89 -- when parsing failed somewhere
90
91 utf8 :: Codec
92 utf8 = Codec name enc (toDecoding dec) where
93 name = T.pack "UTF-8"
94 enc text = (TE.encodeUtf8 text, Nothing)
95 dec bytes = case decodeSomeUtf8 bytes of (t,b) -> (t, Right b)
96
97 -- -- Whether the given byte is a continuation byte.
98 -- isContinuation byte = byte .&. 0xC0 == 0x80
99 --
100 -- -- The number of continuation bytes needed by the given
101 -- -- non-continuation byte. Returns -1 for an illegal UTF-8
102 -- -- non-continuation byte and the whole split quickly must fail so
103 -- -- as the input is passed to TE.decodeUtf8, which will issue a
104 -- -- suitable error.
105 -- required x0
106 -- | x0 .&. 0x80 == 0x00 = 0
107 -- | x0 .&. 0xE0 == 0xC0 = 1
108 -- | x0 .&. 0xF0 == 0xE0 = 2
109 -- | x0 .&. 0xF8 == 0xF0 = 3
110 -- | otherwise = -1
111 --
112 -- splitQuickly bytes
113 -- | B.null l || req == -1 = Nothing
114 -- | req == B.length r = Just (TE.decodeUtf8 bytes, B.empty)
115 -- | otherwise = Just (TE.decodeUtf8 l', r')
116 -- where
117 -- (l, r) = B.spanEnd isContinuation bytes
118 -- req = required (B.last l)
119 -- l' = B.init l
120 -- r' = B.cons (B.last l) r
121
122
123 utf16_le :: Codec
124 utf16_le = Codec name enc (toDecoding dec) where
125 name = T.pack "UTF-16-LE"
126 enc text = (TE.encodeUtf16LE text, Nothing)
127 dec bytes = case splitQuickly bytes of
128 Just (text, extra) -> (text, Right extra)
129 Nothing -> splitSlowly TE.decodeUtf16LE bytes
130
131 splitQuickly bytes = maybeDecode (loop 0) where
132 maxN = B.length bytes
133
134 loop n | n == maxN = decodeAll
135 | (n + 1) == maxN = decodeTo n
136 loop n = let
137 req = utf16Required
138 (B.index bytes n)
139 (B.index bytes (n + 1))
140 decodeMore = loop $! n + req
141 in if n + req > maxN
142 then decodeTo n
143 else decodeMore
144
145 decodeTo n = first TE.decodeUtf16LE (B.splitAt n bytes)
146 decodeAll = (TE.decodeUtf16LE bytes, B.empty)
147
148 utf16_be :: Codec
149 utf16_be = Codec name enc (toDecoding dec) where
150 name = T.pack "UTF-16-BE"
151 enc text = (TE.encodeUtf16BE text, Nothing)
152 dec bytes = case splitQuickly bytes of
153 Just (text, extra) -> (text, Right extra)
154 Nothing -> splitSlowly TE.decodeUtf16BE bytes
155
156 splitQuickly bytes = maybeDecode (loop 0) where
157 maxN = B.length bytes
158
159 loop n | n == maxN = decodeAll
160 | (n + 1) == maxN = decodeTo n
161 loop n = let
162 req = utf16Required
163 (B.index bytes (n + 1))
164 (B.index bytes n)
165 decodeMore = loop $! n + req
166 in if n + req > maxN
167 then decodeTo n
168 else decodeMore
169
170 decodeTo n = first TE.decodeUtf16BE (B.splitAt n bytes)
171 decodeAll = (TE.decodeUtf16BE bytes, B.empty)
172
173 utf16Required :: Word8 -> Word8 -> Int
174 utf16Required x0 x1 = if x >= 0xD800 && x <= 0xDBFF then 4 else 2 where
175 x :: Word16
176 x = (fromIntegral x1 `shiftL` 8) .|. fromIntegral x0
177
178
179 utf32_le :: Codec
180 utf32_le = Codec name enc (toDecoding dec) where
181 name = T.pack "UTF-32-LE"
182 enc text = (TE.encodeUtf32LE text, Nothing)
183 dec bs = case utf32SplitBytes TE.decodeUtf32LE bs of
184 Just (text, extra) -> (text, Right extra)
185 Nothing -> splitSlowly TE.decodeUtf32LE bs
186
187
188 utf32_be :: Codec
189 utf32_be = Codec name enc (toDecoding dec) where
190 name = T.pack "UTF-32-BE"
191 enc text = (TE.encodeUtf32BE text, Nothing)
192 dec bs = case utf32SplitBytes TE.decodeUtf32BE bs of
193 Just (text, extra) -> (text, Right extra)
194 Nothing -> splitSlowly TE.decodeUtf32BE bs
195
196 utf32SplitBytes :: (ByteString -> Text)
197 -> ByteString
198 -> Maybe (Text, ByteString)
199 utf32SplitBytes dec bytes = split where
200 split = maybeDecode (dec toDecode, extra)
201 len = B.length bytes
202 lenExtra = mod len 4
203
204 lenToDecode = len - lenExtra
205 (toDecode, extra) = if lenExtra == 0
206 then (bytes, B.empty)
207 else B.splitAt lenToDecode bytes
208
209
210 tryEvaluate :: a -> Either Exc.SomeException a
211 tryEvaluate = unsafePerformIO . Exc.try . Exc.evaluate
212
213 maybeDecode :: (a, b) -> Maybe (a, b)
214 maybeDecode (a, b) = case tryEvaluate a of
215 Left _ -> Nothing
216 Right _ -> Just (a, b)