Pipes/Text/Encoding.hs

   1 {-# LANGUAGE RankNTypes, BangPatterns #-}
   2
   3 -- | This module uses the stream decoding functions from Michael Snoyman's new
   4 --  <http://hackage.haskell.org/package/text-stream-decode text-stream-decode>
   5 --  package to define decoding functions and lenses.  The exported names
   6 --  conflict with names in @Data.Text.Encoding@ but the module can otherwise be
   7 --  imported unqualified.
   8
   9 module Pipes.Text.Encoding
  10     (
  11     -- * The Lens or Codec type
  12     -- $lenses
  13     Codec
  14     , decode
  15     -- * \'Viewing\' the Text in a byte stream
  16     -- $codecs
  17     , utf8
  18     , utf8Pure
  19     , utf16LE
  20     , utf16BE
  21     , utf32LE
  22     , utf32BE
  23     -- * Non-lens decoding functions
  24     -- $decoders
  25     , decodeUtf8
  26     , decodeUtf8Pure
  27     , decodeUtf16LE
  28     , decodeUtf16BE
  29     , decodeUtf32LE
  30     , decodeUtf32BE
  31     -- * Re-encoding functions
  32     -- $encoders
  33     , encodeUtf8
  34     , encodeUtf16LE
  35     , encodeUtf16BE
  36     , encodeUtf32LE
  37     , encodeUtf32BE
  38     -- * Functions for latin and ascii text
  39     -- $ascii
  40     , encodeAscii
  41     , decodeAscii
  42     , encodeIso8859_1
  43     , decodeIso8859_1
  44     )
  45     where
  46
  47 import Data.Functor.Constant (Constant(..))
  48 import Data.Char (ord)
  49 import Data.ByteString as B
  50 import Data.ByteString (ByteString)
  51 import Data.ByteString.Char8 as B8
  52 import Data.Text (Text)
  53 import qualified Data.Text as T
  54 import qualified Data.Text.Encoding as TE
  55 import Data.Text.StreamDecoding
  56 import Control.Monad (join)
  57 import Data.Word (Word8)
  58 import Pipes
  59
  60 type Lens' a b = forall f . Functor f => (b -> f b) -> (a -> f a)
  61
  62 {- $lenses
  63     The 'Codec' type is a simple specializion of
  64     the @Lens'@ type synonymn used by the standard lens libraries,
  65     <http://hackage.haskell.org/package/lens lens> and
  66     <http://hackage.haskell.org/package/lens-family lens-family>. That type,
  67
  68 >   type Lens' a b = forall f . Functor f => (b -> f b) -> (a -> f a)
  69
  70     is just an alias for a Prelude type. Thus you use any particular codec with
  71     the @view@ / @(^.)@ , @zoom@ and @over@ functions from either of those libraries;
  72     we presuppose neither since we already have access to the types they require.
  73
  74     -}
  75
  76 type Codec
  77     =  forall m r
  78     .  Monad m
  79     => Lens' (Producer ByteString m r)
  80              (Producer Text m (Producer ByteString m r))
  81
  82 {- | 'decode' is just the ordinary @view@ or @(^.)@ of the lens libraries;
  83       exported here under a name appropriate to the material. All of these are
  84       the same:
  85
  86 >    decode utf8 p = decodeUtf8 p = view utf8 p = p ^. utf8
  87
  88 -}
  89
  90 decode :: ((b -> Constant b b) -> (a -> Constant b a)) -> a -> b
  91 decode codec a = getConstant (codec Constant a)
  92
  93
  94 {- $codecs
  95
  96     Each Codec-lens looks into a byte stream that is supposed to contain text.
  97     The particular \'Codec\' lenses are named in accordance with the expected
  98     encoding, 'utf8', 'utf16LE' etc. To turn a Codec into an ordinary function,
  99     use @view@ / @(^.)@ -- here also called 'decode':
 100
 101 >   view utf8 :: Producer ByteString m r -> Producer Text m (Producer ByteString m r)
 102 >   decode utf8 Byte.stdin :: Producer Text IO (Producer ByteString IO r)
 103 >   Bytes.stdin ^. utf8 ::  Producer Text IO (Producer ByteString IO r)
 104
 105     Uses of a codec with @view@ or @(^.)@ or 'decode' can always be replaced by the specialized
 106     decoding functions exported here, e.g.
 107
 108 >   decodeUtf8 ::  Producer ByteString m r -> Producer Text m (Producer ByteString m r)
 109 >   decodeUtf8 Byte.stdin :: Producer Text IO (Producer ByteString IO r)
 110
 111     The stream of text that a @Codec@ \'sees\' in the stream of bytes begins at its head.
 112     At any point of decoding failure, the stream of text ends and reverts to (returns)
 113     the original byte stream. Thus if the first bytes are already
 114     un-decodable, the whole ByteString producer will be returned, i.e.
 115
 116 >   view utf8 bytestream
 117
 118     will just come to the same as
 119
 120 >   return bytestream
 121
 122     Where there is no decoding failure, the return value of the text stream will be
 123     an empty byte stream followed by its own return value.  In all cases you must
 124     deal with the fact that it is a /ByteString producer/ that is returned, even if
 125     it can be thrown away with @Control.Monad.void@
 126
 127 >   void (Bytes.stdin ^. utf8) :: Producer Text IO ()
 128
 129     @zoom@ converts a Text parser into a ByteString parser:
 130
 131 >   zoom utf8 drawChar :: Monad m => StateT (Producer ByteString m r) m (Maybe Char)
 132
 133     or, using the type synonymn from @Pipes.Parse@:
 134
 135 >   zoom utf8 drawChar :: Monad m => Parser ByteString m (Maybe Char)
 136
 137     Thus we can define a ByteString parser like this:
 138
 139 >   withNextByte :: Parser ByteString m (Maybe Char, Maybe Word8)))
 140 >   withNextByte = do char_ <- zoom utf8 Text.drawChar
 141 >                     byte_ <- Bytes.peekByte
 142 >                     return (char_, byte_)
 143
 144      Though @withNextByte@ is partly defined with a Text parser 'drawChar';
 145      but it is a ByteString parser; it will return the first valid utf8-encoded
 146      Char in a ByteString, whatever its length,
 147      and the first byte of the next character, if they exist. Because
 148      we \'draw\' one and \'peek\' at the other, the parser as a whole only
 149      advances one Char's length along the bytestring, whatever that length may be.
 150      See the slightly more complex example \'decode.hs\' in the
 151      <http://www.haskellforall.com/2014/02/pipes-parse-30-lens-based-parsing.html#batteries-included haskellforall>
 152      discussion of this type of byte stream parsing.
 153     -}
 154
 155 utf8 :: Codec
 156 utf8 = mkCodec decodeUtf8 TE.encodeUtf8
 157
 158 utf8Pure :: Codec
 159 utf8Pure = mkCodec decodeUtf8Pure TE.encodeUtf8
 160
 161 utf16LE :: Codec
 162 utf16LE = mkCodec decodeUtf16LE TE.encodeUtf16LE
 163
 164 utf16BE :: Codec
 165 utf16BE = mkCodec decodeUtf16BE TE.encodeUtf16BE
 166
 167 utf32LE :: Codec
 168 utf32LE = mkCodec decodeUtf32LE TE.encodeUtf32LE
 169
 170 utf32BE :: Codec
 171 utf32BE = mkCodec decodeUtf32BE TE.encodeUtf32BE
 172
 173 decodeStream :: Monad m
 174        => (B.ByteString -> DecodeResult)
 175        -> Producer ByteString m r -> Producer Text m (Producer ByteString m r)
 176 decodeStream = loop where
 177   loop dec0 p =
 178     do x <- lift (next p)
 179        case x of Left r -> return (return r)
 180                  Right (chunk, p') -> case dec0 chunk of
 181                     DecodeResultSuccess text dec -> do yield text
 182                                                        loop dec p'
 183                     DecodeResultFailure text bs -> do yield text
 184                                                       return (do yield bs
 185                                                                  p')
 186 {-# INLINABLE decodeStream#-}
 187
 188 {- $decoders
 189    These are functions with the simple type:
 190
 191 >   decodeUtf8 :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r)
 192
 193    Thus in general
 194
 195 >     decodeUtf8 = view utf8
 196 >     decodeUtf16LE = view utf16LE
 197
 198    and so forth, but these forms
 199    may be more convenient (and give better type errors!) where lenses are
 200    not desired.
 201 -}
 202
 203
 204 decodeUtf8 :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r)
 205 decodeUtf8 = decodeStream streamUtf8
 206 {-# INLINE decodeUtf8 #-}
 207
 208 decodeUtf8Pure :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r)
 209 decodeUtf8Pure = decodeStream streamUtf8Pure
 210 {-# INLINE decodeUtf8Pure #-}
 211
 212 decodeUtf16LE :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r)
 213 decodeUtf16LE = decodeStream streamUtf16LE
 214 {-# INLINE decodeUtf16LE #-}
 215
 216 decodeUtf16BE :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r)
 217 decodeUtf16BE = decodeStream streamUtf16BE
 218 {-# INLINE decodeUtf16BE #-}
 219
 220 decodeUtf32LE :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r)
 221 decodeUtf32LE = decodeStream streamUtf32LE
 222 {-# INLINE decodeUtf32LE #-}
 223
 224 decodeUtf32BE :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r)
 225 decodeUtf32BE = decodeStream streamUtf32BE
 226 {-# INLINE decodeUtf32BE #-}
 227
 228
 229 {- $encoders
 230    These are simply defined
 231
 232 >      encodeUtf8 = yield . TE.encodeUtf8
 233
 234    They are intended for use with 'for'
 235
 236 >      for Text.stdin encodeUtf8 :: Producer ByteString IO ()
 237
 238    which would have the effect of
 239
 240 >      Text.stdin >-> Pipes.Prelude.map (TE.encodeUtf8)
 241
 242    using the encoding functions from Data.Text.Encoding
 243 -}
 244
 245 encodeUtf8 :: Monad m => Text -> Producer' ByteString m ()
 246 encodeUtf8 = yield . TE.encodeUtf8
 247 encodeUtf16LE :: Monad m => Text -> Producer' ByteString m ()
 248 encodeUtf16LE = yield . TE.encodeUtf16LE
 249 encodeUtf16BE :: Monad m => Text -> Producer' ByteString m ()
 250 encodeUtf16BE = yield . TE.encodeUtf16BE
 251 encodeUtf32LE :: Monad m => Text -> Producer' ByteString m ()
 252 encodeUtf32LE = yield . TE.encodeUtf32LE
 253 encodeUtf32BE :: Monad m => Text -> Producer' ByteString m ()
 254 encodeUtf32BE = yield . TE.encodeUtf32BE
 255
 256 mkCodec :: (forall r m . Monad m =>
 257            Producer ByteString m r -> Producer Text m (Producer ByteString m r ))
 258         -> (Text -> ByteString)
 259         -> Codec
 260 mkCodec dec enc = \k p0 -> fmap (\p -> join (for p (yield . enc)))  (k (dec p0))
 261
 262
 263
 264 {- $ascii
 265    ascii and latin encodings only use a small number of the characters 'Text'
 266      recognizes; thus we cannot use the pipes @Lens@ style to work with them.
 267      Rather we simply define functions each way.
 268 -}
 269
 270
 271 -- | 'encodeAscii' reduces as much of your stream of 'Text' actually is ascii to a byte stream,
 272 --   returning the rest of the 'Text' at the first non-ascii 'Char'
 273
 274 encodeAscii :: Monad m => Producer Text m r -> Producer ByteString m (Producer Text m r)
 275 encodeAscii = go where
 276   go p = do e <- lift (next p)
 277             case e of
 278               Left r -> return (return r)
 279               Right (chunk, p') ->
 280                  if T.null chunk
 281                    then go p'
 282                    else let (safe, unsafe)  = T.span (\c -> ord c <= 0x7F) chunk
 283                         in do yield (B8.pack (T.unpack safe))
 284                               if T.null unsafe
 285                                 then go p'
 286                                 else return $ do yield unsafe
 287                                                  p'
 288
 289 {- | Reduce as much of your stream of 'Text' actually is iso8859 or latin1 to a byte stream,
 290      returning the rest of the 'Text' upon hitting any non-latin 'Char'
 291    -}
 292 encodeIso8859_1 :: Monad m => Producer Text m r -> Producer ByteString m (Producer Text m r)
 293 encodeIso8859_1 = go where
 294   go p = do e <- lift (next p)
 295             case e of
 296               Left r -> return (return r)
 297               Right (txt, p') ->
 298                  if T.null txt
 299                    then go p'
 300                    else let (safe, unsafe)  = T.span (\c -> ord c <= 0xFF) txt
 301                         in do yield (B8.pack (T.unpack safe))
 302                               if T.null unsafe
 303                                 then go p'
 304                                 else return $ do yield unsafe
 305                                                  p'
 306
 307 {- | Reduce a byte stream to a corresponding stream of ascii chars, returning the
 308      unused 'ByteString' upon hitting an un-ascii byte.
 309    -}
 310 decodeAscii :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r)
 311 decodeAscii = go where
 312   go p = do e <- lift (next p)
 313             case e of
 314               Left r -> return (return r)
 315               Right (chunk, p') ->
 316                  if B.null chunk
 317                    then go p'
 318                    else let (safe, unsafe) = B.span (<= 0x7F) chunk
 319                         in do yield (T.pack (B8.unpack safe))
 320                               if B.null unsafe
 321                                 then go p'
 322                                 else return (do yield unsafe
 323                                                 p')
 324
 325 {- | Reduce a byte stream to a corresponding stream of ascii chars, returning the
 326      unused 'ByteString' upon hitting the rare un-latinizable byte.
 327      -}
 328 decodeIso8859_1 :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r)
 329 decodeIso8859_1 = go where
 330   go p = do e <- lift (next p)
 331             case e of
 332               Left r -> return (return r)
 333               Right (chunk, p') ->
 334                  if B.null chunk
 335                     then go p'
 336                     else do let (safe, unsafe) = B.span (<= 0xFF) chunk
 337                             yield (T.pack (B8.unpack safe))
 338                             if B.null unsafe
 339                                then go p'
 340                                else return (do yield unsafe
 341                                                p')
 342
 343
 344