[github/fretlink/text-pipes.git] / Pipes / Text / Encoding.hs

{-# LANGUAGE RankNTypes, BangPatterns #-}

-- | This module uses the stream decoding functions from Michael Snoyman's new
--  <http://hackage.haskell.org/package/text-stream-decode text-stream-decode> 
--  package to define decoding functions and lenses.  The exported names
--  conflict with names in @Data.Text.Encoding@ but the module can otherwise be 
--  imported unqualified. 

module Pipes.Text.Encoding
    ( 
    -- * The Lens or Codec type
    -- $lenses
    Codec
    , decode
    , eof
    -- * \'Viewing\' the Text in a byte stream
    -- $codecs
    , utf8
    , utf8Pure
    , utf16LE
    , utf16BE
    , utf32LE
    , utf32BE
    -- * Non-lens decoding functions 
    -- $decoders
    , decodeUtf8
    , decodeUtf8Pure
    , decodeUtf16LE
    , decodeUtf16BE
    , decodeUtf32LE
    , decodeUtf32BE
    -- * Re-encoding functions
    -- $encoders
    , encodeUtf8
    , encodeUtf16LE
    , encodeUtf16BE
    , encodeUtf32LE
    , encodeUtf32BE
    -- * Functions for latin and ascii text
    -- $ascii
    , encodeAscii
    , decodeAscii
    , encodeIso8859_1
    , decodeIso8859_1
    ) 
    where

import Data.Functor.Constant (Constant(..))
import Data.Char (ord)
import Data.ByteString as B 
import Data.ByteString (ByteString)
import Data.ByteString.Char8 as B8
import Data.Text (Text)
import qualified Data.Text as T 
import qualified Data.Text.Encoding as TE 
import qualified Data.Streaming.Text as Stream
import Data.Streaming.Text (DecodeResult(..))
import Control.Monad (join)
import Data.Word (Word8)
import Pipes

type Lens' a b = forall f . Functor f => (b -> f b) -> (a -> f a)

{- $lenses
    The 'Codec' type is a simple specializion of 
    the @Lens'@ type synonymn used by the standard lens libraries, 
    <http://hackage.haskell.org/package/lens lens> and 
    <http://hackage.haskell.org/package/lens-family lens-family>. That type, 
    
>   type Lens' a b = forall f . Functor f => (b -> f b) -> (a -> f a)

    is just an alias for a Prelude type. Thus you use any particular codec with
    the @view@ / @(^.)@ , @zoom@ and @over@ functions from either of those libraries;
    we presuppose neither since we already have access to the types they require.

    -}

type Codec
    =  forall m r
    .  Monad m
    => Lens' (Producer ByteString m r)
             (Producer Text m (Producer ByteString m r))

{- | 'decode' is just the ordinary @view@ or @(^.)@ of the lens libraries;
      exported here under a name appropriate to the material. All of these are
      the same: 

>    decode utf8 p = decodeUtf8 p = view utf8 p = p ^. utf8

-}


decode :: ((b -> Constant b b) -> (a -> Constant b a)) -> a -> b
decode codec a = getConstant (codec Constant a)

{- | 'eof' tells you explicitly when decoding stops due to bad bytes or instead
      reaches end-of-file happily. (Without it one just makes an explicit test 
      for emptiness of the resulting bytestring production using 'next') 
      Thus

>     decode (utf8 . eof) p =  view (utf8 . eof) p = p^.utf8.eof
 
      will be a text producer. If we hit undecodable bytes, the remaining
      bytestring producer will be returned as a 'Left' value; 
      in the happy case, a 'Right' value is returned with the anticipated 
      return value for the original bytestring producer. 
      ) 

-}

eof :: Monad m => Lens' (Producer Text m (Producer ByteString m r))
                        (Producer Text m (Either (Producer ByteString m r) r))
eof k p = fmap fromEither (k (toEither p)) where

  fromEither = liftM (either id return)

  toEither pp = do p <- pp
                   check p

  check p = do e <- lift (next p)
               case e of 
                 Left r -> return (Right r)
                 Right (bs,pb) ->  if B.null bs 
                                     then check pb
                                     else return (Left (do yield bs
                                                           pb))


{- $codecs
    
    Each Codec-lens looks into a byte stream that is supposed to contain text.
    The particular \'Codec\' lenses are named in accordance with the expected 
    encoding, 'utf8', 'utf16LE' etc. To turn a Codec into an ordinary function, 
    use @view@ / @(^.)@ -- here also called 'decode':

>   view utf8 :: Producer ByteString m r -> Producer Text m (Producer ByteString m r)
>   decode utf8 Byte.stdin :: Producer Text IO (Producer ByteString IO r)
>   Bytes.stdin ^. utf8 ::  Producer Text IO (Producer ByteString IO r)

    Uses of a codec with @view@ or @(^.)@ or 'decode' can always be replaced by the specialized 
    decoding functions exported here, e.g. 

>   decodeUtf8 ::  Producer ByteString m r -> Producer Text m (Producer ByteString m r)
>   decodeUtf8 Byte.stdin :: Producer Text IO (Producer ByteString IO r)

    The stream of text that a @Codec@ \'sees\' in the stream of bytes begins at its head. 
    At any point of decoding failure, the stream of text ends and reverts to (returns) 
    the original byte stream. Thus if the first bytes are already
    un-decodable, the whole ByteString producer will be returned, i.e.

>   view utf8 bytestream 

    will just come to the same as 

>   return bytestream

    Where there is no decoding failure, the return value of the text stream will be
    an empty byte stream followed by its own return value.  In all cases you must
    deal with the fact that it is a /ByteString producer/ that is returned, even if
    it can be thrown away with @Control.Monad.void@

>   void (Bytes.stdin ^. utf8) :: Producer Text IO ()
    
    @zoom@ converts a Text parser into a ByteString parser:

>   zoom utf8 drawChar :: Monad m => StateT (Producer ByteString m r) m (Maybe Char)

    or, using the type synonymn from @Pipes.Parse@:
    
>   zoom utf8 drawChar :: Monad m => Parser ByteString m (Maybe Char)

    Thus we can define a ByteString parser like this:
    
>   withNextByte :: Parser ByteString m (Maybe Char, Maybe Word8))) 
>   withNextByte = do char_ <- zoom utf8 Text.drawChar
>                     byte_ <- Bytes.peekByte
>                     return (char_, byte_)

     Though @withNextByte@ is partly defined with a Text parser 'drawChar'; 
     but it is a ByteString parser; it will return the first valid utf8-encoded 
     Char in a ByteString, whatever its length, 
     and the first byte of the next character, if they exist. Because 
     we \'draw\' one and \'peek\' at the other, the parser as a whole only 
     advances one Char's length along the bytestring, whatever that length may be.
     See the slightly more complex example \'decode.hs\' in the 
     <http://www.haskellforall.com/2014/02/pipes-parse-30-lens-based-parsing.html#batteries-included haskellforall> 
     discussion of this type of byte stream parsing.
    -}

utf8 :: Codec
utf8 = mkCodec decodeUtf8 TE.encodeUtf8

utf8Pure :: Codec
utf8Pure = mkCodec decodeUtf8Pure TE.encodeUtf8

utf16LE :: Codec
utf16LE = mkCodec decodeUtf16LE TE.encodeUtf16LE

utf16BE :: Codec
utf16BE = mkCodec decodeUtf16BE TE.encodeUtf16BE

utf32LE :: Codec
utf32LE = mkCodec decodeUtf32LE TE.encodeUtf32LE

utf32BE :: Codec
utf32BE = mkCodec decodeUtf32BE TE.encodeUtf32BE

decodeStream :: Monad m 
       => (B.ByteString -> DecodeResult) 
       -> Producer ByteString m r -> Producer Text m (Producer ByteString m r)
decodeStream = loop where
  loop dec0 p = 
    do x <- lift (next p) 
       case x of Left r -> return (return r)
                 Right (chunk, p') -> case dec0 chunk of 
                    DecodeResultSuccess text dec -> do yield text
                                                       loop dec p'
                    DecodeResultFailure text bs -> do yield text 
                                                      return (do yield bs 
                                                                 p')
{-# INLINABLE decodeStream#-}


{- $decoders
   These are functions with the simple type:
   
>   decodeUtf8 :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r)

   Thus in general 

>     decodeUtf8 = view utf8
>     decodeUtf16LE = view utf16LE

   and so forth, but these forms
   may be more convenient (and give better type errors!) where lenses are
   not desired.
-}


decodeUtf8 :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r)
decodeUtf8 = decodeStream Stream.decodeUtf8
{-# INLINE decodeUtf8 #-}

decodeUtf8Pure :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r)
decodeUtf8Pure = decodeStream Stream.decodeUtf8Pure
{-# INLINE decodeUtf8Pure #-}

decodeUtf16LE :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r)
decodeUtf16LE = decodeStream Stream.decodeUtf16LE
{-# INLINE decodeUtf16LE #-}

decodeUtf16BE :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r)
decodeUtf16BE = decodeStream Stream.decodeUtf16BE
{-# INLINE decodeUtf16BE #-}

decodeUtf32LE :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r)
decodeUtf32LE = decodeStream Stream.decodeUtf32LE
{-# INLINE decodeUtf32LE #-}

decodeUtf32BE :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r)
decodeUtf32BE = decodeStream Stream.decodeUtf32BE
{-# INLINE decodeUtf32BE #-}


{- $encoders
   These are simply defined 
   
>      encodeUtf8 = yield . TE.encodeUtf8
   
   They are intended for use with 'for'
   
>      for Text.stdin encodeUtf8 :: Producer ByteString IO ()

   which would have the effect of 
   
>      Text.stdin >-> Pipes.Prelude.map (TE.encodeUtf8)

   using the encoding functions from Data.Text.Encoding 
-}

encodeUtf8 :: Monad m => Text -> Producer' ByteString m ()
encodeUtf8 = yield . TE.encodeUtf8
encodeUtf16LE :: Monad m => Text -> Producer' ByteString m ()
encodeUtf16LE = yield . TE.encodeUtf16LE
encodeUtf16BE :: Monad m => Text -> Producer' ByteString m ()
encodeUtf16BE = yield . TE.encodeUtf16BE
encodeUtf32LE :: Monad m => Text -> Producer' ByteString m ()
encodeUtf32LE = yield . TE.encodeUtf32LE
encodeUtf32BE :: Monad m => Text -> Producer' ByteString m ()
encodeUtf32BE = yield . TE.encodeUtf32BE

mkCodec :: (forall r m . Monad m => 
           Producer ByteString m r -> Producer Text m (Producer ByteString m r ))
        -> (Text -> ByteString)
        -> Codec
mkCodec dec enc = \k p0 -> fmap (\p -> join (for p (yield . enc)))  (k (dec p0))


{- $ascii
   ascii and latin encodings only use a small number of the characters 'Text'
     recognizes; thus we cannot use the pipes @Lens@ style to work with them. 
     Rather we simply define functions each way. 
-}


-- | 'encodeAscii' reduces as much of your stream of 'Text' actually is ascii to a byte stream,
--   returning the rest of the 'Text' at the first non-ascii 'Char'

encodeAscii :: Monad m => Producer Text m r -> Producer ByteString m (Producer Text m r)
encodeAscii = go where
  go p = do e <- lift (next p)
            case e of 
              Left r -> return (return r)
              Right (chunk, p') -> 
                 if T.null chunk 
                   then go p'
                   else let (safe, unsafe)  = T.span (\c -> ord c <= 0x7F) chunk
                        in do yield (B8.pack (T.unpack safe))
                              if T.null unsafe
                                then go p'
                                else return $ do yield unsafe 
                                                 p'
                                                 
{- | Reduce as much of your stream of 'Text' actually is iso8859 or latin1 to a byte stream,
     returning the rest of the 'Text' upon hitting any non-latin 'Char'
   -}
encodeIso8859_1 :: Monad m => Producer Text m r -> Producer ByteString m (Producer Text m r)
encodeIso8859_1 = go where
  go p = do e <- lift (next p)
            case e of 
              Left r -> return (return r)
              Right (txt, p') -> 
                 if T.null txt 
                   then go p'
                   else let (safe, unsafe)  = T.span (\c -> ord c <= 0xFF) txt
                        in do yield (B8.pack (T.unpack safe))
                              if T.null unsafe
                                then go p'
                                else return $ do yield unsafe 
                                                 p'

{- | Reduce a byte stream to a corresponding stream of ascii chars, returning the
     unused 'ByteString' upon hitting an un-ascii byte.
   -}
decodeAscii :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r)
decodeAscii = go where
  go p = do e <- lift (next p)
            case e of 
              Left r -> return (return r)
              Right (chunk, p') -> 
                 if B.null chunk 
                   then go p'
                   else let (safe, unsafe) = B.span (<= 0x7F) chunk
                        in do yield (T.pack (B8.unpack safe))
                              if B.null unsafe
                                then go p'
                                else return (do yield unsafe 
                                                p')

{- | Reduce a byte stream to a corresponding stream of ascii chars, returning the
     unused 'ByteString' upon hitting the rare un-latinizable byte.
     -}
decodeIso8859_1 :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r)
decodeIso8859_1 = go where
  go p = do e <- lift (next p)
            case e of 
              Left r -> return (return r)
              Right (chunk, p') -> 
                 if B.null chunk 
                    then go p'
                    else do let (safe, unsafe) = B.span (<= 0xFF) chunk
                            yield (T.pack (B8.unpack safe))
                            if B.null unsafe 
                               then go p'
                               else return (do yield unsafe 
                                               p')
Commit	Line	Data
bbdfd305	1	{-# LANGUAGE RankNTypes, BangPatterns #-}
89d80557	2
0ac0c414	3	-- \| This module uses the stream decoding functions from Michael Snoyman's new
0ac0c414	4	-- <http://hackage.haskell.org/package/text-stream-decode text-stream-decode>
4ea59a8b	5	-- package to define decoding functions and lenses. The exported names
	6	-- conflict with names in @Data.Text.Encoding@ but the module can otherwise be
	7	-- imported unqualified.
bbdfd305	8
bbdfd305	9	module Pipes.Text.Encoding
fafcbeb5	10	(
0ac0c414	11	-- * The Lens or Codec type
fafcbeb5	12	-- $lenses
fafcbeb5	13	Codec
4ea59a8b	14	, decode
6c2fffdc	15	, eof
a4913c42	16	-- * \'Viewing\' the Text in a byte stream
fafcbeb5	17	-- $codecs
bbdfd305	18	, utf8
	19	, utf8Pure
	20	, utf16LE
	21	, utf16BE
	22	, utf32LE
	23	, utf32BE
fafcbeb5	24	-- * Non-lens decoding functions
0ac0c414	25	-- $decoders
89d80557	26	, decodeUtf8
	27	, decodeUtf8Pure
	28	, decodeUtf16LE
	29	, decodeUtf16BE
	30	, decodeUtf32LE
	31	, decodeUtf32BE
0ac0c414	32	-- * Re-encoding functions
	33	-- $encoders
	34	, encodeUtf8
	35	, encodeUtf16LE
	36	, encodeUtf16BE
	37	, encodeUtf32LE
	38	, encodeUtf32BE
fafcbeb5	39	-- * Functions for latin and ascii text
fafcbeb5	40	-- $ascii
bbdfd305	41	, encodeAscii
	42	, decodeAscii
	43	, encodeIso8859_1
	44	, decodeIso8859_1
	45	)
	46	where
	47
0ac0c414	48	import Data.Functor.Constant (Constant(..))
bbdfd305	49	import Data.Char (ord)
	50	import Data.ByteString as B
	51	import Data.ByteString (ByteString)
bbdfd305	52	import Data.ByteString.Char8 as B8
	53	import Data.Text (Text)
	54	import qualified Data.Text as T
	55	import qualified Data.Text.Encoding as TE
eae50557	56	import qualified Data.Streaming.Text as Stream
eae50557	57	import Data.Streaming.Text (DecodeResult(..))
70125641	58	import Control.Monad (join)
89d80557	59	import Data.Word (Word8)
bbdfd305	60	import Pipes
bbdfd305	61
2f4a83f8	62	type Lens' a b = forall f . Functor f => (b -> f b) -> (a -> f a)
bbdfd305	63
fafcbeb5	64	{- $lenses
0ac0c414	65	The 'Codec' type is a simple specializion of
2f4a83f8	66	the @Lens'@ type synonymn used by the standard lens libraries,
0ac0c414	67	<http://hackage.haskell.org/package/lens lens> and
0ac0c414	68	<http://hackage.haskell.org/package/lens-family lens-family>. That type,
fafcbeb5	69
2f4a83f8	70	> type Lens' a b = forall f . Functor f => (b -> f b) -> (a -> f a)
0ac0c414	71
4ea59a8b	72	is just an alias for a Prelude type. Thus you use any particular codec with
	73	the @view@ / @(^.)@ , @zoom@ and @over@ functions from either of those libraries;
	74	we presuppose neither since we already have access to the types they require.
0ac0c414	75
fafcbeb5	76	-}
fafcbeb5	77
21eb409c	78	type Codec
d199072b	79	= forall m r
21eb409c	80	. Monad m
2f4a83f8	81	=> Lens' (Producer ByteString m r)
d199072b	82	(Producer Text m (Producer ByteString m r))
d199072b	83
0ac0c414	84	{- \| 'decode' is just the ordinary @view@ or @(^.)@ of the lens libraries;
4ea59a8b	85	exported here under a name appropriate to the material. All of these are
4ea59a8b	86	the same:
0ac0c414	87
4ea59a8b	88	> decode utf8 p = decodeUtf8 p = view utf8 p = p ^. utf8
0ac0c414	89
	90	-}
	91
6c2fffdc	92
0ac0c414	93	decode :: ((b -> Constant b b) -> (a -> Constant b a)) -> a -> b
	94	decode codec a = getConstant (codec Constant a)
	95
6c2fffdc	96	{- \| 'eof' tells you explicitly when decoding stops due to bad bytes or instead
	97	reaches end-of-file happily. (Without it one just makes an explicit test
	98	for emptiness of the resulting bytestring production using 'next')
	99	Thus
	100
	101	> decode (utf8 . eof) p = view (utf8 . eof) p = p^.utf8.eof
	102
	103	will be a text producer. If we hit undecodable bytes, the remaining
	104	bytestring producer will be returned as a 'Left' value;
	105	in the happy case, a 'Right' value is returned with the anticipated
	106	return value for the original bytestring producer.
	107	)
	108
	109	-}
	110
	111	eof :: Monad m => Lens' (Producer Text m (Producer ByteString m r))
	112	(Producer Text m (Either (Producer ByteString m r) r))
	113	eof k p = fmap fromEither (k (toEither p)) where
	114
	115	fromEither = liftM (either id return)
	116
	117	toEither pp = do p <- pp
	118	check p
	119
	120	check p = do e <- lift (next p)
	121	case e of
	122	Left r -> return (Right r)
	123	Right (bs,pb) -> if B.null bs
	124	then check pb
	125	else return (Left (do yield bs
	126	pb))
	127
bbdfd305	128
4ea59a8b	129	{- $codecs
	130
	131	Each Codec-lens looks into a byte stream that is supposed to contain text.
	132	The particular \'Codec\' lenses are named in accordance with the expected
	133	encoding, 'utf8', 'utf16LE' etc. To turn a Codec into an ordinary function,
	134	use @view@ / @(^.)@ -- here also called 'decode':
	135
	136	> view utf8 :: Producer ByteString m r -> Producer Text m (Producer ByteString m r)
	137	> decode utf8 Byte.stdin :: Producer Text IO (Producer ByteString IO r)
	138	> Bytes.stdin ^. utf8 :: Producer Text IO (Producer ByteString IO r)
	139
	140	Uses of a codec with @view@ or @(^.)@ or 'decode' can always be replaced by the specialized
	141	decoding functions exported here, e.g.
	142
	143	> decodeUtf8 :: Producer ByteString m r -> Producer Text m (Producer ByteString m r)
	144	> decodeUtf8 Byte.stdin :: Producer Text IO (Producer ByteString IO r)
	145
	146	The stream of text that a @Codec@ \'sees\' in the stream of bytes begins at its head.
	147	At any point of decoding failure, the stream of text ends and reverts to (returns)
	148	the original byte stream. Thus if the first bytes are already
	149	un-decodable, the whole ByteString producer will be returned, i.e.
	150
	151	> view utf8 bytestream
	152
	153	will just come to the same as
	154
	155	> return bytestream
	156
	157	Where there is no decoding failure, the return value of the text stream will be
	158	an empty byte stream followed by its own return value. In all cases you must
	159	deal with the fact that it is a /ByteString producer/ that is returned, even if
	160	it can be thrown away with @Control.Monad.void@
	161
	162	> void (Bytes.stdin ^. utf8) :: Producer Text IO ()
	163
	164	@zoom@ converts a Text parser into a ByteString parser:
	165
	166	> zoom utf8 drawChar :: Monad m => StateT (Producer ByteString m r) m (Maybe Char)
	167
a4913c42	168	or, using the type synonymn from @Pipes.Parse@:
4ea59a8b	169
	170	> zoom utf8 drawChar :: Monad m => Parser ByteString m (Maybe Char)
	171
a4913c42	172	Thus we can define a ByteString parser like this:
4ea59a8b	173
	174	> withNextByte :: Parser ByteString m (Maybe Char, Maybe Word8)))
	175	> withNextByte = do char_ <- zoom utf8 Text.drawChar
	176	> byte_ <- Bytes.peekByte
	177	> return (char_, byte_)
	178
	179	Though @withNextByte@ is partly defined with a Text parser 'drawChar';
	180	but it is a ByteString parser; it will return the first valid utf8-encoded
	181	Char in a ByteString, whatever its length,
	182	and the first byte of the next character, if they exist. Because
	183	we \'draw\' one and \'peek\' at the other, the parser as a whole only
	184	advances one Char's length along the bytestring, whatever that length may be.
	185	See the slightly more complex example \'decode.hs\' in the
	186	<http://www.haskellforall.com/2014/02/pipes-parse-30-lens-based-parsing.html#batteries-included haskellforall>
	187	discussion of this type of byte stream parsing.
	188	-}
	189
	190	utf8 :: Codec
	191	utf8 = mkCodec decodeUtf8 TE.encodeUtf8
	192
	193	utf8Pure :: Codec
	194	utf8Pure = mkCodec decodeUtf8Pure TE.encodeUtf8
	195
	196	utf16LE :: Codec
	197	utf16LE = mkCodec decodeUtf16LE TE.encodeUtf16LE
	198
	199	utf16BE :: Codec
	200	utf16BE = mkCodec decodeUtf16BE TE.encodeUtf16BE
	201
	202	utf32LE :: Codec
	203	utf32LE = mkCodec decodeUtf32LE TE.encodeUtf32LE
	204
	205	utf32BE :: Codec
	206	utf32BE = mkCodec decodeUtf32BE TE.encodeUtf32BE
	207
bbdfd305	208	decodeStream :: Monad m
	209	=> (B.ByteString -> DecodeResult)
	210	-> Producer ByteString m r -> Producer Text m (Producer ByteString m r)
	211	decodeStream = loop where
	212	loop dec0 p =
	213	do x <- lift (next p)
	214	case x of Left r -> return (return r)
	215	Right (chunk, p') -> case dec0 chunk of
	216	DecodeResultSuccess text dec -> do yield text
	217	loop dec p'
	218	DecodeResultFailure text bs -> do yield text
	219	return (do yield bs
	220	p')
	221	{-# INLINABLE decodeStream#-}
	222
6c2fffdc	223
0ac0c414	224	{- $decoders
	225	These are functions with the simple type:
	226
	227	> decodeUtf8 :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r)
	228
	229	Thus in general
	230
	231	> decodeUtf8 = view utf8
	232	> decodeUtf16LE = view utf16LE
fafcbeb5	233
0ac0c414	234	and so forth, but these forms
	235	may be more convenient (and give better type errors!) where lenses are
	236	not desired.
	237	-}
fafcbeb5	238
fafcbeb5	239
bbdfd305	240	decodeUtf8 :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r)
eae50557	241	decodeUtf8 = decodeStream Stream.decodeUtf8
bbdfd305	242	{-# INLINE decodeUtf8 #-}
	243
	244	decodeUtf8Pure :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r)
eae50557	245	decodeUtf8Pure = decodeStream Stream.decodeUtf8Pure
bbdfd305	246	{-# INLINE decodeUtf8Pure #-}
	247
	248	decodeUtf16LE :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r)
eae50557	249	decodeUtf16LE = decodeStream Stream.decodeUtf16LE
bbdfd305	250	{-# INLINE decodeUtf16LE #-}
	251
	252	decodeUtf16BE :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r)
eae50557	253	decodeUtf16BE = decodeStream Stream.decodeUtf16BE
bbdfd305	254	{-# INLINE decodeUtf16BE #-}
	255
	256	decodeUtf32LE :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r)
eae50557	257	decodeUtf32LE = decodeStream Stream.decodeUtf32LE
bbdfd305	258	{-# INLINE decodeUtf32LE #-}
	259
	260	decodeUtf32BE :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r)
eae50557	261	decodeUtf32BE = decodeStream Stream.decodeUtf32BE
bbdfd305	262	{-# INLINE decodeUtf32BE #-}
bbdfd305	263
0ac0c414	264
	265	{- $encoders
	266	These are simply defined
	267
	268	> encodeUtf8 = yield . TE.encodeUtf8
	269
	270	They are intended for use with 'for'
	271
	272	> for Text.stdin encodeUtf8 :: Producer ByteString IO ()
	273
	274	which would have the effect of
	275
	276	> Text.stdin >-> Pipes.Prelude.map (TE.encodeUtf8)
	277
	278	using the encoding functions from Data.Text.Encoding
	279	-}
	280
10cfd90e	281	encodeUtf8 :: Monad m => Text -> Producer' ByteString m ()
0ac0c414	282	encodeUtf8 = yield . TE.encodeUtf8
10cfd90e	283	encodeUtf16LE :: Monad m => Text -> Producer' ByteString m ()
0ac0c414	284	encodeUtf16LE = yield . TE.encodeUtf16LE
10cfd90e	285	encodeUtf16BE :: Monad m => Text -> Producer' ByteString m ()
0ac0c414	286	encodeUtf16BE = yield . TE.encodeUtf16BE
10cfd90e	287	encodeUtf32LE :: Monad m => Text -> Producer' ByteString m ()
0ac0c414	288	encodeUtf32LE = yield . TE.encodeUtf32LE
10cfd90e	289	encodeUtf32BE :: Monad m => Text -> Producer' ByteString m ()
0ac0c414	290	encodeUtf32BE = yield . TE.encodeUtf32BE
0ac0c414	291
bbdfd305	292	mkCodec :: (forall r m . Monad m =>
	293	Producer ByteString m r -> Producer Text m (Producer ByteString m r ))
	294	-> (Text -> ByteString)
	295	-> Codec
	296	mkCodec dec enc = \k p0 -> fmap (\p -> join (for p (yield . enc))) (k (dec p0))
	297
	298
bbdfd305	299
fafcbeb5	300	{- $ascii
	301	ascii and latin encodings only use a small number of the characters 'Text'
	302	recognizes; thus we cannot use the pipes @Lens@ style to work with them.
bbdfd305	303	Rather we simply define functions each way.
bbdfd305	304	-}
bbdfd305	305
fafcbeb5	306
0ac0c414	307	-- \| 'encodeAscii' reduces as much of your stream of 'Text' actually is ascii to a byte stream,
fafcbeb5	308	-- returning the rest of the 'Text' at the first non-ascii 'Char'
fafcbeb5	309
bbdfd305	310	encodeAscii :: Monad m => Producer Text m r -> Producer ByteString m (Producer Text m r)
	311	encodeAscii = go where
	312	go p = do e <- lift (next p)
	313	case e of
	314	Left r -> return (return r)
	315	Right (chunk, p') ->
	316	if T.null chunk
	317	then go p'
	318	else let (safe, unsafe) = T.span (\c -> ord c <= 0x7F) chunk
	319	in do yield (B8.pack (T.unpack safe))
	320	if T.null unsafe
	321	then go p'
	322	else return $ do yield unsafe
	323	p'
	324
	325	{- \| Reduce as much of your stream of 'Text' actually is iso8859 or latin1 to a byte stream,
	326	returning the rest of the 'Text' upon hitting any non-latin 'Char'
	327	-}
	328	encodeIso8859_1 :: Monad m => Producer Text m r -> Producer ByteString m (Producer Text m r)
	329	encodeIso8859_1 = go where
	330	go p = do e <- lift (next p)
	331	case e of
	332	Left r -> return (return r)
	333	Right (txt, p') ->
	334	if T.null txt
	335	then go p'
	336	else let (safe, unsafe) = T.span (\c -> ord c <= 0xFF) txt
	337	in do yield (B8.pack (T.unpack safe))
	338	if T.null unsafe
	339	then go p'
	340	else return $ do yield unsafe
	341	p'
	342
	343	{- \| Reduce a byte stream to a corresponding stream of ascii chars, returning the
	344	unused 'ByteString' upon hitting an un-ascii byte.
	345	-}
	346	decodeAscii :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r)
	347	decodeAscii = go where
	348	go p = do e <- lift (next p)
	349	case e of
	350	Left r -> return (return r)
	351	Right (chunk, p') ->
	352	if B.null chunk
	353	then go p'
	354	else let (safe, unsafe) = B.span (<= 0x7F) chunk
	355	in do yield (T.pack (B8.unpack safe))
	356	if B.null unsafe
	357	then go p'
	358	else return (do yield unsafe
	359	p')
	360
	361	{- \| Reduce a byte stream to a corresponding stream of ascii chars, returning the
	362	unused 'ByteString' upon hitting the rare un-latinizable byte.
	363	-}
	364	decodeIso8859_1 :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r)
	365	decodeIso8859_1 = go where
	366	go p = do e <- lift (next p)
	367	case e of
	368	Left r -> return (return r)
	369	Right (chunk, p') ->
	370	if B.null chunk
	371	then go p'
	372	else do let (safe, unsafe) = B.span (<= 0xFF) chunk
	373	yield (T.pack (B8.unpack safe))
374	if B.null unsafe
375	then go p'
376	else return (do yield unsafe
377	p')
378
379
380