[github/fretlink/text-pipes.git] / Pipes / Text / Internal / Codec.hs


{-# LANGUAGE DeriveDataTypeable, RankNTypes, BangPatterns #-}
-- |
-- Copyright: 2014 Michael Thompson, 2011 Michael Snoyman, 2010-2011 John Millikin
-- License: MIT
--  This Parts of this code were taken from enumerator and conduits, and adapted for pipes
{- | This module follows the model of the enumerator and conduits libraries, and defines
     'Codec' s for various encodings. Note that we do not export a 'Codec' for ascii and 
     iso8859_1. A 'Lens' in the sense of the pipes library cannot be defined for these, so
     special functions appear in @Pipes.Text@
-}

module Pipes.Text.Internal.Codec
    ( Decoding(..)
    , streamDecodeUtf8
    , decodeSomeUtf8
    , Codec(..)
    , TextException(..)
    , utf8
    , utf16_le
    , utf16_be
    , utf32_le
    , utf32_be
    ) where

import Data.Bits ((.&.))
import Data.Char (ord)
import Data.ByteString as B 
import Data.ByteString (ByteString)
import Data.ByteString.Internal as B 
import Data.ByteString.Char8 as B8
import Data.Text (Text)
import qualified Data.Text as T 
import qualified Data.Text.Encoding as TE 
import Data.Text.Encoding.Error ()
import GHC.Word (Word8, Word32)
import qualified Data.Text.Array as A
import Data.Word (Word8, Word16)
import System.IO.Unsafe (unsafePerformIO)
import qualified Control.Exception as Exc
import Data.Bits ((.&.), (.|.), shiftL)
import Data.Typeable
import Control.Arrow (first)
import Data.Maybe (catMaybes)
import Pipes.Text.Internal.Decoding
import Pipes
-- | A specific character encoding.

data Codec = Codec
  { codecName :: Text
  , codecEncode :: Text -> (ByteString, Maybe (TextException, Text))
  , codecDecode :: ByteString -> Decoding 
  }

instance Show Codec where
    showsPrec d c = showParen (d > 10) $ 
                    showString "Codec " . shows (codecName c)

data TextException = DecodeException Codec Word8
                   | EncodeException Codec Char
                   | LengthExceeded Int
                   | TextException Exc.SomeException
    deriving (Show, Typeable)
instance Exc.Exception TextException


toDecoding :: (ByteString -> (Text, Either (TextException, ByteString) ByteString))
           -> (ByteString -> Decoding)
toDecoding op = loop B.empty where
  loop !extra bs0 = case op (B.append extra bs0) of
                      (txt, Right bs) -> Some txt bs (loop bs)
                      (txt, Left (_,bs)) -> Other txt bs
-- To do: toDecoding should be inlined in each of the 'Codec' definitions
-- or else Codec changed to the conduit/enumerator definition.  We have
-- altered it to use 'streamDecodeUtf8'

splitSlowly :: (ByteString -> Text)
            -> ByteString 
            -> (Text, Either (TextException, ByteString) ByteString)
splitSlowly dec bytes = valid where
    valid:_ = catMaybes $ Prelude.map decFirst $ splits (B.length bytes)
    splits 0 = [(B.empty, bytes)]
    splits n = B.splitAt n bytes : splits (n - 1)
    decFirst (a, b) = case tryEvaluate (dec a) of
        Left _ -> Nothing
        Right text -> let trouble = case tryEvaluate (dec b) of
                            Left exc -> Left (TextException exc, b)
                            Right _  -> Right B.empty 
                      in Just (text, trouble) -- this case shouldn't occur, 
                                      -- since splitSlowly is only called
                                      -- when parsing failed somewhere

utf8 :: Codec
utf8 = Codec name enc (toDecoding dec) where
    name = T.pack "UTF-8"
    enc text = (TE.encodeUtf8 text, Nothing)
    dec bytes = case decodeSomeUtf8 bytes of (t,b) -> (t, Right b)

--     -- Whether the given byte is a continuation byte.
--     isContinuation byte = byte .&. 0xC0 == 0x80
-- 
--     -- The number of continuation bytes needed by the given
--     -- non-continuation byte. Returns -1 for an illegal UTF-8
--     -- non-continuation byte and the whole split quickly must fail so
--     -- as the input is passed to TE.decodeUtf8, which will issue a
--     -- suitable error.
--     required x0
--         | x0 .&. 0x80 == 0x00 = 0
--         | x0 .&. 0xE0 == 0xC0 = 1
--         | x0 .&. 0xF0 == 0xE0 = 2
--         | x0 .&. 0xF8 == 0xF0 = 3
--         | otherwise           = -1
-- 
--     splitQuickly bytes
--         | B.null l || req == -1 = Nothing
--         | req == B.length r = Just (TE.decodeUtf8 bytes, B.empty)
--         | otherwise = Just (TE.decodeUtf8 l', r')
--       where
--         (l, r) = B.spanEnd isContinuation bytes
--         req = required (B.last l)
--         l' = B.init l
--         r' = B.cons (B.last l) r


utf16_le :: Codec
utf16_le = Codec name enc (toDecoding dec) where
    name = T.pack "UTF-16-LE"
    enc text = (TE.encodeUtf16LE text, Nothing)
    dec bytes = case splitQuickly bytes of
        Just (text, extra) -> (text, Right extra)
        Nothing -> splitSlowly TE.decodeUtf16LE bytes

    splitQuickly bytes = maybeDecode (loop 0) where
        maxN = B.length bytes

        loop n |  n      == maxN = decodeAll
               | (n + 1) == maxN = decodeTo n
        loop n = let
            req = utf16Required
                (B.index bytes n)
                (B.index bytes (n + 1))
            decodeMore = loop $! n + req
            in if n + req > maxN
                then decodeTo n
                else decodeMore

        decodeTo n = first TE.decodeUtf16LE (B.splitAt n bytes)
        decodeAll = (TE.decodeUtf16LE bytes, B.empty)

utf16_be :: Codec
utf16_be = Codec name enc (toDecoding dec) where
    name = T.pack "UTF-16-BE"
    enc text = (TE.encodeUtf16BE text, Nothing)
    dec bytes = case splitQuickly bytes of
        Just (text, extra) -> (text, Right extra)
        Nothing -> splitSlowly TE.decodeUtf16BE bytes

    splitQuickly bytes = maybeDecode (loop 0) where
        maxN = B.length bytes

        loop n |  n      == maxN = decodeAll
               | (n + 1) == maxN = decodeTo n
        loop n = let
            req = utf16Required
                (B.index bytes (n + 1))
                (B.index bytes n)
            decodeMore = loop $! n + req
            in if n + req > maxN
                then decodeTo n
                else decodeMore

        decodeTo n = first TE.decodeUtf16BE (B.splitAt n bytes)
        decodeAll = (TE.decodeUtf16BE bytes, B.empty)

utf16Required :: Word8 -> Word8 -> Int
utf16Required x0 x1 = if x >= 0xD800 && x <= 0xDBFF then 4 else 2 where
    x :: Word16
    x = (fromIntegral x1 `shiftL` 8) .|. fromIntegral x0


utf32_le :: Codec
utf32_le = Codec name enc (toDecoding dec) where
    name = T.pack "UTF-32-LE"
    enc text = (TE.encodeUtf32LE text, Nothing)
    dec bs = case utf32SplitBytes TE.decodeUtf32LE bs of
        Just (text, extra) -> (text, Right extra)
        Nothing -> splitSlowly TE.decodeUtf32LE bs


utf32_be :: Codec
utf32_be = Codec name enc (toDecoding dec) where
    name = T.pack "UTF-32-BE"
    enc text = (TE.encodeUtf32BE text, Nothing)
    dec bs = case utf32SplitBytes TE.decodeUtf32BE bs of
        Just (text, extra) -> (text, Right extra)
        Nothing -> splitSlowly TE.decodeUtf32BE bs

utf32SplitBytes :: (ByteString -> Text)
                -> ByteString
                -> Maybe (Text, ByteString)
utf32SplitBytes dec bytes = split where
    split = maybeDecode (dec toDecode, extra)
    len = B.length bytes
    lenExtra = mod len 4

    lenToDecode = len - lenExtra
    (toDecode, extra) = if lenExtra == 0
        then (bytes, B.empty)
        else B.splitAt lenToDecode bytes


tryEvaluate :: a -> Either Exc.SomeException a
tryEvaluate = unsafePerformIO . Exc.try . Exc.evaluate

maybeDecode :: (a, b) -> Maybe (a, b)
maybeDecode (a, b) = case tryEvaluate a of
    Left _ -> Nothing
    Right _ -> Just (a, b)
Commit	Line	Data
409759e8	1
	2	{-# LANGUAGE DeriveDataTypeable, RankNTypes, BangPatterns #-}
	3	-- \|
	4	-- Copyright: 2014 Michael Thompson, 2011 Michael Snoyman, 2010-2011 John Millikin
	5	-- License: MIT
4da91c37	6	-- This Parts of this code were taken from enumerator and conduits, and adapted for pipes
	7	{- \| This module follows the model of the enumerator and conduits libraries, and defines
	8	'Codec' s for various encodings. Note that we do not export a 'Codec' for ascii and
	9	iso8859_1. A 'Lens' in the sense of the pipes library cannot be defined for these, so
	10	special functions appear in @Pipes.Text@
	11	-}
409759e8	12
	13	module Pipes.Text.Internal.Codec
	14	( Decoding(..)
	15	, streamDecodeUtf8
	16	, decodeSomeUtf8
	17	, Codec(..)
	18	, TextException(..)
	19	, utf8
	20	, utf16_le
	21	, utf16_be
	22	, utf32_le
	23	, utf32_be
	24	) where
	25
	26	import Data.Bits ((.&.))
	27	import Data.Char (ord)
	28	import Data.ByteString as B
	29	import Data.ByteString (ByteString)
	30	import Data.ByteString.Internal as B
	31	import Data.ByteString.Char8 as B8
	32	import Data.Text (Text)
	33	import qualified Data.Text as T
	34	import qualified Data.Text.Encoding as TE
	35	import Data.Text.Encoding.Error ()
	36	import GHC.Word (Word8, Word32)
	37	import qualified Data.Text.Array as A
	38	import Data.Word (Word8, Word16)
	39	import System.IO.Unsafe (unsafePerformIO)
	40	import qualified Control.Exception as Exc
	41	import Data.Bits ((.&.), (.\|.), shiftL)
	42	import Data.Typeable
	43	import Control.Arrow (first)
	44	import Data.Maybe (catMaybes)
	45	import Pipes.Text.Internal.Decoding
	46	import Pipes
	47	-- \| A specific character encoding.
4da91c37	48
409759e8	49	data Codec = Codec
	50	{ codecName :: Text
	51	, codecEncode :: Text -> (ByteString, Maybe (TextException, Text))
4da91c37	52	, codecDecode :: ByteString -> Decoding
409759e8	53	}
	54
	55	instance Show Codec where
	56	showsPrec d c = showParen (d > 10) $
	57	showString "Codec " . shows (codecName c)
	58
	59	data TextException = DecodeException Codec Word8
	60	\| EncodeException Codec Char
	61	\| LengthExceeded Int
	62	\| TextException Exc.SomeException
	63	deriving (Show, Typeable)
	64	instance Exc.Exception TextException
	65
	66
	67	toDecoding :: (ByteString -> (Text, Either (TextException, ByteString) ByteString))
	68	-> (ByteString -> Decoding)
	69	toDecoding op = loop B.empty where
	70	loop !extra bs0 = case op (B.append extra bs0) of
	71	(txt, Right bs) -> Some txt bs (loop bs)
	72	(txt, Left (_,bs)) -> Other txt bs
	73	-- To do: toDecoding should be inlined in each of the 'Codec' definitions
	74	-- or else Codec changed to the conduit/enumerator definition. We have
	75	-- altered it to use 'streamDecodeUtf8'
	76
	77	splitSlowly :: (ByteString -> Text)
	78	-> ByteString
	79	-> (Text, Either (TextException, ByteString) ByteString)
	80	splitSlowly dec bytes = valid where
	81	valid:_ = catMaybes $ Prelude.map decFirst $ splits (B.length bytes)
	82	splits 0 = [(B.empty, bytes)]
	83	splits n = B.splitAt n bytes : splits (n - 1)
	84	decFirst (a, b) = case tryEvaluate (dec a) of
	85	Left _ -> Nothing
	86	Right text -> let trouble = case tryEvaluate (dec b) of
	87	Left exc -> Left (TextException exc, b)
	88	Right _ -> Right B.empty
	89	in Just (text, trouble) -- this case shouldn't occur,
	90	-- since splitSlowly is only called
	91	-- when parsing failed somewhere
	92
	93	utf8 :: Codec
	94	utf8 = Codec name enc (toDecoding dec) where
	95	name = T.pack "UTF-8"
	96	enc text = (TE.encodeUtf8 text, Nothing)
	97	dec bytes = case decodeSomeUtf8 bytes of (t,b) -> (t, Right b)
	98
	99	-- -- Whether the given byte is a continuation byte.
	100	-- isContinuation byte = byte .&. 0xC0 == 0x80
	101	--
	102	-- -- The number of continuation bytes needed by the given
	103	-- -- non-continuation byte. Returns -1 for an illegal UTF-8
	104	-- -- non-continuation byte and the whole split quickly must fail so
	105	-- -- as the input is passed to TE.decodeUtf8, which will issue a
	106	-- -- suitable error.
	107	-- required x0
	108	-- \| x0 .&. 0x80 == 0x00 = 0
	109	-- \| x0 .&. 0xE0 == 0xC0 = 1
	110	-- \| x0 .&. 0xF0 == 0xE0 = 2
	111	-- \| x0 .&. 0xF8 == 0xF0 = 3
	112	-- \| otherwise = -1
	113	--
	114	-- splitQuickly bytes
	115	-- \| B.null l \|\| req == -1 = Nothing
	116	-- \| req == B.length r = Just (TE.decodeUtf8 bytes, B.empty)
117	-- \| otherwise = Just (TE.decodeUtf8 l', r')
118	-- where
119	-- (l, r) = B.spanEnd isContinuation bytes
120	-- req = required (B.last l)
121	-- l' = B.init l
122	-- r' = B.cons (B.last l) r
123
124
125	utf16_le :: Codec
126	utf16_le = Codec name enc (toDecoding dec) where
127	name = T.pack "UTF-16-LE"
128	enc text = (TE.encodeUtf16LE text, Nothing)
129	dec bytes = case splitQuickly bytes of
130	Just (text, extra) -> (text, Right extra)
131	Nothing -> splitSlowly TE.decodeUtf16LE bytes
132
133	splitQuickly bytes = maybeDecode (loop 0) where
134	maxN = B.length bytes
135
136	loop n \| n == maxN = decodeAll
137	\| (n + 1) == maxN = decodeTo n
138	loop n = let
139	req = utf16Required
140	(B.index bytes n)
141	(B.index bytes (n + 1))
142	decodeMore = loop $! n + req
143	in if n + req > maxN
144	then decodeTo n
145	else decodeMore
146
147	decodeTo n = first TE.decodeUtf16LE (B.splitAt n bytes)
148	decodeAll = (TE.decodeUtf16LE bytes, B.empty)
149
150	utf16_be :: Codec
151	utf16_be = Codec name enc (toDecoding dec) where
152	name = T.pack "UTF-16-BE"
153	enc text = (TE.encodeUtf16BE text, Nothing)
154	dec bytes = case splitQuickly bytes of
155	Just (text, extra) -> (text, Right extra)
156	Nothing -> splitSlowly TE.decodeUtf16BE bytes
157
158	splitQuickly bytes = maybeDecode (loop 0) where
159	maxN = B.length bytes
160
161	loop n \| n == maxN = decodeAll
162	\| (n + 1) == maxN = decodeTo n
163	loop n = let
164	req = utf16Required
165	(B.index bytes (n + 1))
166	(B.index bytes n)
167	decodeMore = loop $! n + req
168	in if n + req > maxN
169	then decodeTo n
170	else decodeMore
171
172	decodeTo n = first TE.decodeUtf16BE (B.splitAt n bytes)
173	decodeAll = (TE.decodeUtf16BE bytes, B.empty)
174
175	utf16Required :: Word8 -> Word8 -> Int
176	utf16Required x0 x1 = if x >= 0xD800 && x <= 0xDBFF then 4 else 2 where
177	x :: Word16
178	x = (fromIntegral x1 `shiftL` 8) .\|. fromIntegral x0
179
180
181	utf32_le :: Codec
182	utf32_le = Codec name enc (toDecoding dec) where
183	name = T.pack "UTF-32-LE"
184	enc text = (TE.encodeUtf32LE text, Nothing)
185	dec bs = case utf32SplitBytes TE.decodeUtf32LE bs of
186	Just (text, extra) -> (text, Right extra)
187	Nothing -> splitSlowly TE.decodeUtf32LE bs
188
189
190	utf32_be :: Codec
191	utf32_be = Codec name enc (toDecoding dec) where
192	name = T.pack "UTF-32-BE"
193	enc text = (TE.encodeUtf32BE text, Nothing)
194	dec bs = case utf32SplitBytes TE.decodeUtf32BE bs of
195	Just (text, extra) -> (text, Right extra)
196	Nothing -> splitSlowly TE.decodeUtf32BE bs
197
198	utf32SplitBytes :: (ByteString -> Text)
199	-> ByteString
200	-> Maybe (Text, ByteString)
201	utf32SplitBytes dec bytes = split where
202	split = maybeDecode (dec toDecode, extra)
203	len = B.length bytes
204	lenExtra = mod len 4
205
206	lenToDecode = len - lenExtra
207	(toDecode, extra) = if lenExtra == 0
208	then (bytes, B.empty)
209	else B.splitAt lenToDecode bytes
210
211
212	tryEvaluate :: a -> Either Exc.SomeException a
213	tryEvaluate = unsafePerformIO . Exc.try . Exc.evaluate
214
215	maybeDecode :: (a, b) -> Maybe (a, b)
216	maybeDecode (a, b) = case tryEvaluate a of
217	Left _ -> Nothing
218	Right _ -> Just (a, b)