Pipes/Text/Encoding.hs

   1 {-# LANGUAGE RankNTypes, BangPatterns #-}
   2
   3 -- | This module uses the stream decoding functions from
   4 --  <http://hackage.haskell.org/package/streaming-commons streaming-commons>
   5 --  package to define decoding functions and lenses.  The exported names
   6 --  conflict with names in @Data.Text.Encoding@ but not with the @Prelude@
   7
   8 module Pipes.Text.Encoding
   9     (
  10     -- * Decoding ByteStrings and Encoding Texts
  11     -- ** Simple usage
  12     -- $usage
  13
  14     -- ** Lens usage
  15     -- $lenses
  16
  17
  18     -- * Basic lens operations
  19     Codec
  20     , decode
  21     , eof
  22     -- * Decoding lenses
  23     , utf8
  24     , utf8Pure
  25     , utf16LE
  26     , utf16BE
  27     , utf32LE
  28     , utf32BE
  29     -- * Non-lens decoding functions
  30     -- $decoders
  31     , decodeUtf8
  32     , decodeUtf8Pure
  33     , decodeUtf16LE
  34     , decodeUtf16BE
  35     , decodeUtf32LE
  36     , decodeUtf32BE
  37     -- * Re-encoding functions
  38     -- $encoders
  39     , encodeUtf8
  40     , encodeUtf16LE
  41     , encodeUtf16BE
  42     , encodeUtf32LE
  43     , encodeUtf32BE
  44     -- * Functions for latin and ascii text
  45     -- $ascii
  46     , encodeAscii
  47     , decodeAscii
  48     , encodeIso8859_1
  49     , decodeIso8859_1
  50     )
  51     where
  52
  53 import Data.Functor.Constant (Constant(..))
  54 import Data.Char (ord)
  55 import Data.ByteString as B
  56 import Data.ByteString.Char8 as B8
  57 import Data.Text (Text)
  58 import qualified Data.Text as T
  59 import qualified Data.Text.Encoding as TE
  60 import qualified Data.Streaming.Text as Stream
  61 import Data.Streaming.Text (DecodeResult(..))
  62 import Control.Monad (join, liftM)
  63 import Pipes
  64
  65
  66
  67 {- $usage
  68     Encoding is of course simple. Given
  69
  70 >   text :: Producer Text IO ()
  71
  72     we can encode it with @Data.Text.Encoding.encodeUtf8@
  73
  74 >   TE.encodeUtf8 :: Text -> ByteString
  75
  76     and ordinary pipe operations:
  77
  78 >   text >-> P.map TE.encodeUtf8 :: Producer.ByteString IO ()
  79
  80     or, equivalently
  81
  82 >   for text (yield . TE.encodeUtf8)
  83
  84     But, using this module, we might use
  85
  86 >   encodeUtf8 :: Text -> Producer ByteString m ()
  87
  88     to write
  89
  90 >   for text encodeUtf8 :: Producer.ByteString IO ()
  91
  92     All of the above come to the same.
  93
  94
  95     Given
  96
  97 >   bytes :: Producer ByteString IO ()
  98
  99     we can apply a decoding function from this module:
 100
 101 >   decodeUtf8 bytes :: Producer Text IO (Producer ByteString IO ())
 102
 103     The Text producer ends wherever decoding first fails. The un-decoded
 104     material is returned. If we are confident it is of no interest, we can
 105     write:
 106
 107 >   void $ decodeUtf8 bytes :: Producer Text IO ()
 108
 109     Thus we can re-encode
 110     as uft8 as much of our byte stream as is decodeUtf16BE decodable, with, e.g.
 111
 112 >   for (decodeUtf16BE bytes) encodeUtf8 :: Producer ByteString IO (Producer ByteString IO ())
 113
 114     The bytestring producer that is returned begins with where utf16BE decoding
 115     failed; if it didn't fail the producer is empty.
 116
 117 -}
 118
 119 {- $lenses
 120     We get a bit more flexibility, particularly in the use of pipes-style "parsers",
 121     if we use a lens like @utf8@ or @utf16BE@
 122     that focusses on the text in an appropriately encoded byte stream.
 123
 124 >   type Lens' a b = forall f . Functor f => (b -> f b) -> (a -> f a)
 125
 126     is just an alias for a Prelude type.  We abbreviate this further, for our use case, as
 127
 128 >   type Codec
 129 >     =  forall m r .  Monad m => Lens' (Producer ByteString m r) (Producer Text m (Producer ByteString m r))
 130
 131     and call the decoding lenses @utf8@, @utf16BE@ \"codecs\", since they can
 132     re-encode what they have decoded.  Thus you use any particular codec with
 133     the @view@ / @(^.)@ , @zoom@ and @over@ functions from the standard lens libraries;
 134     <http://hackage.haskell.org/package/lens lens>,
 135     <http://hackage.haskell.org/package/lens-family lens-family>,
 136     <http://hackage.haskell.org/package/lens-simple lens-simple>, or one of the
 137     and <http://hackage.haskell.org/package/microlens microlens> packages will all work
 138     the same, since we already have access to the types they require.
 139
 140     Each decoding lens looks into a byte stream that is supposed to contain text.
 141     The particular lenses are named in accordance with the expected
 142     encoding, 'utf8', 'utf16LE' etc. To turn a such a lens or @Codec@
 143     into an ordinary function, use @view@ / @(^.)@ -- here also called 'decode':
 144
 145 >   view utf8 :: Producer ByteString m r -> Producer Text m (Producer ByteString m r)
 146 >   decode utf8 Byte.stdin :: Producer Text IO (Producer ByteString IO r)
 147 >   Bytes.stdin ^. utf8 ::  Producer Text IO (Producer ByteString IO r)
 148
 149     Of course, we could always do this with the specialized decoding functions, e.g.
 150
 151 >   decodeUtf8 ::  Producer ByteString m r -> Producer Text m (Producer ByteString m r)
 152 >   decodeUtf8 Byte.stdin :: Producer Text IO (Producer ByteString IO r)
 153
 154     As with these functions, the stream of text that a @Codec@ \'sees\'
 155     in the stream of bytes begins at its head.
 156     At any point of decoding failure, the stream of text ends and reverts to (returns)
 157     the original byte stream. Thus if the first bytes are already
 158     un-decodable, the whole ByteString producer will be returned, i.e.
 159
 160 >   view utf8 bad_bytestream
 161
 162     will just come to the same as
 163
 164 >   return bad_bytestream
 165
 166     Where there is no decoding failure, the return value of the text stream will be
 167     an empty byte stream followed by its own return value.  In all cases you must
 168     deal with the fact that it is a /ByteString producer/ that is returned, even if
 169     it can be thrown away with @Control.Monad.void@
 170
 171 >   void (Bytes.stdin ^. utf8) :: Producer Text IO ()
 172
 173     The @eof@ lens permits you to pattern match: if there is a Right value,
 174     it is the leftover bytestring producer, if there is a Right value, it
 175     is the return value of the original bytestring producer:
 176
 177 >   Bytes.stdin ^. utf8 . eof :: Producer Text IO (Either (Producer ByteString IO IO) ())
 178
 179     Thus for the stream of un-decodable bytes mentioned above,
 180
 181 >   view (utf8 . eof) bad_bytestream
 182
 183     will be the same as
 184
 185 >   return (Left bad_bytestream)
 186
 187     @zoom utf8@ converts a Text parser into a ByteString parser:
 188
 189 >   zoom utf8 drawChar :: Monad m => StateT (Producer ByteString m r) m (Maybe Char)
 190
 191     or, using the type synonymn from @Pipes.Parse@:
 192
 193 >   zoom utf8 drawChar :: Monad m => Parser ByteString m (Maybe Char)
 194
 195     Thus we can define a ByteString parser (in the pipes-parse sense) like this:
 196
 197 >   charPlusByte :: Parser ByteString m (Maybe Char, Maybe Word8)))
 198 >   charPlusByte = do char_ <- zoom utf8 Text.drawChar
 199 >                     byte_ <- Bytes.peekByte
 200 >                     return (char_, byte_)
 201
 202      Though @charPlusByte@ is partly defined with a Text parser 'drawChar';
 203      but it is a ByteString parser; it will return the first valid utf8-encoded
 204      Char in a ByteString, /whatever its byte-length/,
 205      and the first byte following, if both exist. Because
 206      we \'draw\' one and \'peek\' at the other, the parser as a whole only
 207      advances one Char's length along the bytestring, whatever that length may be.
 208      See the slightly more complex example \'decode.hs\' in the
 209      <http://www.haskellforall.com/2014/02/pipes-parse-30-lens-based-parsing.html#batteries-included haskellforall blog>
 210      discussion of this type of byte stream parsing.
 211     -}
 212
 213 type Lens' a b = forall f . Functor f => (b -> f b) -> (a -> f a)
 214
 215 type Codec
 216     =  forall m r
 217     .  Monad m
 218     => Lens' (Producer ByteString m r)
 219              (Producer Text m (Producer ByteString m r))
 220
 221
 222 {- | @decode@ is just the ordinary @view@ or @(^.)@ of the lens libraries;
 223    exported here under a name appropriate to the material. Thus
 224
 225 >    decode utf8 bytes :: Producer Text IO (Producer ByteString IO ())
 226
 227     All of these are thus the same:
 228
 229 >    decode utf8 bytes = view utf8 bytes = bytes ^. utf8 = decodeUtf8 bytes
 230
 231
 232 -}
 233
 234 decode :: ((b -> Constant b b) -> (a -> Constant b a)) -> a -> b
 235 decode codec a = getConstant (codec Constant a)
 236
 237 {- | @eof@ tells you explicitly when decoding stops due to bad bytes or
 238     instead reaches end-of-file happily. (Without it one just makes an explicit
 239     test for emptiness of the resulting bytestring production using next) Thus
 240
 241 >    decode (utf8 . eof) bytes :: Producer T.Text IO (Either (Producer B.ByteString IO ()) ())
 242
 243     If we hit undecodable bytes, the remaining bytestring producer will be
 244     returned as a Left value; in the happy case, a Right value is returned
 245     with the anticipated return value for the original bytestring producer.
 246
 247     Again, all of these are the same
 248
 249 >    decode (utf8 . eof) bytes = view (utf8 . eof) p = p^.utf8.eof
 250
 251 -}
 252
 253 eof :: (Monad m, Monad (t m), MonadTrans t) => Lens' (t m (Producer ByteString m r))
 254                        (t m (Either (Producer ByteString m r) r))
 255 eof k p0 = fmap fromEither (k (toEither p0)) where
 256
 257  fromEither = liftM (either id return)
 258
 259  toEither pp = do p <- pp
 260                   check p
 261
 262  check p = do e <- lift (next p)
 263               case e of
 264                 Left r -> return (Right r)
 265                 Right (bs,pb) ->  if B.null bs
 266                                     then check pb
 267                                     else return (Left (do yield bs
 268                                                           pb))
 269
 270 utf8 :: Codec
 271 utf8 = mkCodec decodeUtf8 TE.encodeUtf8
 272
 273 utf8Pure :: Codec
 274 utf8Pure = mkCodec decodeUtf8Pure TE.encodeUtf8
 275
 276 utf16LE :: Codec
 277 utf16LE = mkCodec decodeUtf16LE TE.encodeUtf16LE
 278
 279 utf16BE :: Codec
 280 utf16BE = mkCodec decodeUtf16BE TE.encodeUtf16BE
 281
 282 utf32LE :: Codec
 283 utf32LE = mkCodec decodeUtf32LE TE.encodeUtf32LE
 284
 285 utf32BE :: Codec
 286 utf32BE = mkCodec decodeUtf32BE TE.encodeUtf32BE
 287
 288 decodeStream :: Monad m
 289        => (B.ByteString -> DecodeResult)
 290        -> Producer ByteString m r -> Producer Text m (Producer ByteString m r)
 291 decodeStream = loop where
 292   loop dec0 p =
 293     do x <- lift (next p)
 294        case x of
 295          Left r -> return (return r)
 296          Right (chunk, p') -> case dec0 chunk of
 297            DecodeResultSuccess text dec -> do yield text
 298                                               loop dec p'
 299            DecodeResultFailure text bs -> do yield text
 300                                              return (do yield bs
 301                                                         p')
 302 {-# INLINABLE decodeStream#-}
 303
 304
 305 {- $decoders
 306    These are functions with the simple type:
 307
 308 >   decodeUtf8 :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r)
 309
 310    Thus in general
 311
 312 >     decodeUtf8 = view utf8
 313 >     decodeUtf16LE = view utf16LE
 314
 315    and so forth, but these forms
 316    may be more convenient (and give better type errors!) where lenses are
 317    not desired.
 318 -}
 319
 320
 321 decodeUtf8 :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r)
 322 decodeUtf8 = decodeStream Stream.decodeUtf8
 323 {-# INLINE decodeUtf8 #-}
 324
 325 decodeUtf8Pure :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r)
 326 decodeUtf8Pure = decodeStream Stream.decodeUtf8Pure
 327 {-# INLINE decodeUtf8Pure #-}
 328
 329 decodeUtf16LE :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r)
 330 decodeUtf16LE = decodeStream Stream.decodeUtf16LE
 331 {-# INLINE decodeUtf16LE #-}
 332
 333 decodeUtf16BE :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r)
 334 decodeUtf16BE = decodeStream Stream.decodeUtf16BE
 335 {-# INLINE decodeUtf16BE #-}
 336
 337 decodeUtf32LE :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r)
 338 decodeUtf32LE = decodeStream Stream.decodeUtf32LE
 339 {-# INLINE decodeUtf32LE #-}
 340
 341 decodeUtf32BE :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r)
 342 decodeUtf32BE = decodeStream Stream.decodeUtf32BE
 343 {-# INLINE decodeUtf32BE #-}
 344
 345
 346 {- $encoders
 347    These are simply defined
 348
 349 >      encodeUtf8 = yield . TE.encodeUtf8
 350
 351    They are intended for use with 'for'
 352
 353 >      for Text.stdin encodeUtf8 :: Producer ByteString IO ()
 354
 355    which would have the effect of
 356
 357 >      Text.stdin >-> Pipes.Prelude.map (TE.encodeUtf8)
 358
 359    using the encoding functions from Data.Text.Encoding
 360 -}
 361
 362 encodeUtf8 :: Monad m => Text -> Producer' ByteString m ()
 363 encodeUtf8 = yield . TE.encodeUtf8
 364 encodeUtf16LE :: Monad m => Text -> Producer' ByteString m ()
 365 encodeUtf16LE = yield . TE.encodeUtf16LE
 366 encodeUtf16BE :: Monad m => Text -> Producer' ByteString m ()
 367 encodeUtf16BE = yield . TE.encodeUtf16BE
 368 encodeUtf32LE :: Monad m => Text -> Producer' ByteString m ()
 369 encodeUtf32LE = yield . TE.encodeUtf32LE
 370 encodeUtf32BE :: Monad m => Text -> Producer' ByteString m ()
 371 encodeUtf32BE = yield . TE.encodeUtf32BE
 372
 373 mkCodec :: (forall r m . Monad m =>
 374            Producer ByteString m r -> Producer Text m (Producer ByteString m r ))
 375         -> (Text -> ByteString)
 376         -> Codec
 377 mkCodec dec enc = \k p0 -> fmap (\p -> join (for p (yield . enc)))  (k (dec p0))
 378
 379
 380
 381 {- $ascii
 382    ascii and latin encodings only use a small number of the characters 'Text'
 383      recognizes; thus we cannot use the pipes @Lens@ style to work with them.
 384      Rather we simply define functions each way.
 385 -}
 386
 387
 388 -- | 'encodeAscii' reduces as much of your stream of 'Text' actually is ascii to a byte stream,
 389 --   returning the rest of the 'Text' at the first non-ascii 'Char'
 390
 391 encodeAscii :: Monad m => Producer Text m r -> Producer ByteString m (Producer Text m r)
 392 encodeAscii = go where
 393   go p = do e <- lift (next p)
 394             case e of
 395               Left r -> return (return r)
 396               Right (chunk, p') ->
 397                  if T.null chunk
 398                    then go p'
 399                    else let (safe, unsafe)  = T.span (\c -> ord c <= 0x7F) chunk
 400                         in do yield (B8.pack (T.unpack safe))
 401                               if T.null unsafe
 402                                 then go p'
 403                                 else return $ do yield unsafe
 404                                                  p'
 405
 406 {- | Reduce as much of your stream of 'Text' actually is iso8859 or latin1 to a byte stream,
 407      returning the rest of the 'Text' upon hitting any non-latin 'Char'
 408    -}
 409 encodeIso8859_1 :: Monad m => Producer Text m r -> Producer ByteString m (Producer Text m r)
 410 encodeIso8859_1 = go where
 411   go p = do e <- lift (next p)
 412             case e of
 413               Left r -> return (return r)
 414               Right (txt, p') ->
 415                  if T.null txt
 416                    then go p'
 417                    else let (safe, unsafe)  = T.span (\c -> ord c <= 0xFF) txt
 418                         in do yield (B8.pack (T.unpack safe))
 419                               if T.null unsafe
 420                                 then go p'
 421                                 else return $ do yield unsafe
 422                                                  p'
 423
 424 {- | Reduce a byte stream to a corresponding stream of ascii chars, returning the
 425      unused 'ByteString' upon hitting an un-ascii byte.
 426    -}
 427 decodeAscii :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r)
 428 decodeAscii = go where
 429   go p = do e <- lift (next p)
 430             case e of
 431               Left r -> return (return r)
 432               Right (chunk, p') ->
 433                  if B.null chunk
 434                    then go p'
 435                    else let (safe, unsafe) = B.span (<= 0x7F) chunk
 436                         in do yield (T.pack (B8.unpack safe))
 437                               if B.null unsafe
 438                                 then go p'
 439                                 else return (do yield unsafe
 440                                                 p')
 441
 442 {- | Reduce a byte stream to a corresponding stream of ascii chars, returning the
 443      unused 'ByteString' upon hitting the rare un-latinizable byte.
 444      -}
 445 decodeIso8859_1 :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r)
 446 decodeIso8859_1 = go where
 447   go p = do e <- lift (next p)
 448             case e of
 449               Left r -> return (return r)
 450               Right (chunk, p') ->
 451                  if B.null chunk
 452                     then go p'
 453                     else do let (safe, unsafe) = B.span (<= 0xFF) chunk
 454                             yield (T.pack (B8.unpack safe))
 455                             if B.null unsafe
 456                                then go p'
 457                                else return (do yield unsafe
 458                                                p')
 459
 460
 461