Pipes/Text/Encoding.hs

   1 {-# LANGUAGE RankNTypes, BangPatterns #-}
   2
   3 -- | This module uses the stream decoding functions from
   4 --  <http://hackage.haskell.org/package/streaming-commons streaming-commons>
   5 --  package to define decoding functions and lenses.  The exported names
   6 --  conflict with names in @Data.Text.Encoding@ but not with the @Prelude@
   7
   8 module Pipes.Text.Encoding
   9     (
  10     -- * Decoding ByteStrings and Encoding Texts
  11     -- ** Simple usage
  12     -- $usage
  13
  14     -- ** Lens usage
  15     -- $lenses
  16
  17
  18     -- * Basic lens operations
  19     Codec
  20     , decode
  21     , eof
  22     -- * Decoding lenses
  23     , utf8
  24     , utf8Pure
  25     , utf16LE
  26     , utf16BE
  27     , utf32LE
  28     , utf32BE
  29     -- * Non-lens decoding functions
  30     -- $decoders
  31     , decodeUtf8
  32     , decodeUtf8Pure
  33     , decodeUtf16LE
  34     , decodeUtf16BE
  35     , decodeUtf32LE
  36     , decodeUtf32BE
  37     -- * Re-encoding functions
  38     -- $encoders
  39     , encodeUtf8
  40     , encodeUtf16LE
  41     , encodeUtf16BE
  42     , encodeUtf32LE
  43     , encodeUtf32BE
  44     -- * Functions for latin and ascii text
  45     -- $ascii
  46     , encodeAscii
  47     , decodeAscii
  48     , encodeIso8859_1
  49     , decodeIso8859_1
  50     )
  51     where
  52
  53 import Data.Functor.Constant (Constant(..))
  54 import Data.Char (ord)
  55 import Data.ByteString as B
  56 import Data.ByteString.Char8 as B8
  57 import Data.Text (Text)
  58 import qualified Data.Text as T
  59 import qualified Data.Text.Encoding as TE
  60 import qualified Data.Streaming.Text as Stream
  61 import Data.Streaming.Text (DecodeResult(..))
  62 import Control.Monad (join, liftM)
  63 import Pipes
  64
  65
  66
  67 {- $usage
  68     Encoding is of course simple. Given
  69
  70 >   text :: Producer Text IO ()
  71
  72     we can encode it with @Data.Text.Encoding.encodeUtf8@
  73
  74 >   TE.encodeUtf8 :: Text -> ByteString
  75
  76     and ordinary pipe operations:
  77
  78 >   text >-> P.map TE.encodeUtf8 :: Producer.ByteString IO ()
  79
  80     or, equivalently
  81
  82 >   for text (yield . TE.encodeUtf8)
  83
  84     But, using this module, we might use
  85
  86 >   encodeUtf8 :: Text -> Producer ByteString m ()
  87
  88     to write
  89
  90 >   for text encodeUtf8 :: Producer.ByteString IO ()
  91
  92     All of the above come to the same.
  93
  94
  95     Given
  96
  97 >   bytes :: Producer ByteString IO ()
  98
  99     we can apply a decoding function from this module:
 100
 101 >   decodeUtf8 bytes :: Producer Text IO (Producer ByteString IO ())
 102
 103     The Text producer ends wherever decoding first fails. The un-decoded
 104     material is returned. If we are confident it is of no interest, we can
 105     write:
 106
 107 >   void $ decodeUtf8 bytes :: Producer Text IO ()
 108
 109     Thus we can re-encode
 110     as uft8 as much of our byte stream as is decodeUtf16BE decodable, with, e.g.
 111
 112 >   for (decodeUtf16BE bytes) encodeUtf8 :: Producer ByteString IO (Producer ByteString IO ())
 113
 114     The bytestring producer that is returned begins with where utf16BE decoding
 115     failed; if it didn't fail the producer is empty.
 116
 117 -}
 118
 119 {- $lenses
 120     We get a bit more flexibility, particularly in the use of pipes-style "parsers",
 121     if we use a lens like @utf8@ or @utf16BE@
 122     that focusses on the text in an appropriately encoded byte stream.
 123
 124 >   type Lens' a b = forall f . Functor f => (b -> f b) -> (a -> f a)
 125
 126     is just an alias for a Prelude type.  We abbreviate this further, for our use case, as
 127
 128 >   type Codec
 129 >     =  forall m r .  Monad m => Lens' (Producer ByteString m r) (Producer Text m (Producer ByteString m r))
 130
 131     and call the decoding lenses @utf8@, @utf16BE@ \"codecs\", since they can
 132     re-encode what they have decoded.  Thus you use any particular codec with
 133     the @view@ / @(^.)@ , @zoom@ and @over@ functions from the standard lens libraries;
 134     <http://hackage.haskell.org/package/lens lens>,
 135     <http://hackage.haskell.org/package/lens-family lens-family>,
 136     <http://hackage.haskell.org/package/lens-simple lens-simple>, or one of the
 137     and <http://hackage.haskell.org/package/microlens microlens> packages will all work
 138     the same, since we already have access to the types they require.
 139
 140     Each decoding lens looks into a byte stream that is supposed to contain text.
 141     The particular lenses are named in accordance with the expected
 142     encoding, 'utf8', 'utf16LE' etc. To turn a such a lens or @Codec@
 143     into an ordinary function, use @view@ / @(^.)@ -- here also called 'decode':
 144
 145 >   view utf8 :: Producer ByteString m r -> Producer Text m (Producer ByteString m r)
 146 >   decode utf8 Byte.stdin :: Producer Text IO (Producer ByteString IO r)
 147 >   Bytes.stdin ^. utf8 ::  Producer Text IO (Producer ByteString IO r)
 148
 149     Of course, we could always do this with the specialized decoding functions, e.g.
 150
 151 >   decodeUtf8 ::  Producer ByteString m r -> Producer Text m (Producer ByteString m r)
 152 >   decodeUtf8 Byte.stdin :: Producer Text IO (Producer ByteString IO r)
 153
 154     As with these functions, the stream of text that a @Codec@ \'sees\'
 155     in the stream of bytes begins at its head.
 156     At any point of decoding failure, the stream of text ends and reverts to (returns)
 157     the original byte stream. Thus if the first bytes are already
 158     un-decodable, the whole ByteString producer will be returned, i.e.
 159
 160 >   view utf8 bad_bytestream
 161
 162     will just come to the same as
 163
 164 >   return bad_bytestream
 165
 166     Where there is no decoding failure, the return value of the text stream will be
 167     an empty byte stream followed by its own return value.  In all cases you must
 168     deal with the fact that it is a /ByteString producer/ that is returned, even if
 169     it can be thrown away with @Control.Monad.void@
 170
 171 >   void (Bytes.stdin ^. utf8) :: Producer Text IO ()
 172
 173     The @eof@ lens permits you to pattern match: if there is a Right value,
 174     it is the leftover bytestring producer, if there is a Right value, it
 175     is the return value of the original bytestring producer:
 176
 177 >   Bytes.stdin ^. utf8 . eof :: Producer Text IO (Either (Producer ByteString IO IO) ())
 178
 179     Thus for the stream of un-decodable bytes mentioned above,
 180
 181 >   view (utf8 . eof) bad_bytestream
 182
 183     will be the same as
 184
 185 >   return (Left bad_bytestream)
 186
 187     @zoom utf8@ converts a Text parser into a ByteString parser:
 188
 189 >   zoom utf8 drawChar :: Monad m => StateT (Producer ByteString m r) m (Maybe Char)
 190
 191     or, using the type synonymn from @Pipes.Parse@:
 192
 193 >   zoom utf8 drawChar :: Monad m => Parser ByteString m (Maybe Char)
 194
 195     Thus we can define a ByteString parser (in the pipes-parse sense) like this:
 196
 197 >   charPlusByte :: Parser ByteString m (Maybe Char, Maybe Word8)))
 198 >   charPlusByte = do char_ <- zoom utf8 Text.drawChar
 199 >                     byte_ <- Bytes.peekByte
 200 >                     return (char_, byte_)
 201
 202      Though @charPlusByte@ is partly defined with a Text parser 'drawChar';
 203      but it is a ByteString parser; it will return the first valid utf8-encoded
 204      Char in a ByteString, /whatever its byte-length/,
 205      and the first byte following, if both exist. Because
 206      we \'draw\' one and \'peek\' at the other, the parser as a whole only
 207      advances one Char's length along the bytestring, whatever that length may be.
 208      See the slightly more complex example \'decode.hs\' in the
 209      <http://www.haskellforall.com/2014/02/pipes-parse-30-lens-based-parsing.html#batteries-included haskellforall blog>
 210      discussion of this type of byte stream parsing.
 211     -}
 212
 213 type Lens' a b = forall f . Functor f => (b -> f b) -> (a -> f a)
 214
 215 type Codec
 216     =  forall m r
 217     .  Monad m
 218     => Lens' (Producer ByteString m r)
 219              (Producer Text m (Producer ByteString m r))
 220
 221
 222 {- | @decode@ is just the ordinary @view@ or @(^.)@ of the lens libraries;
 223    exported here under a name appropriate to the material.
 224    Thus given a bytestring producer called @bytes@ we have
 225
 226 >    decode utf8 bytes :: Producer Text IO (Producer ByteString IO ())
 227
 228     All of these are thus the same:
 229
 230 >    decode utf8 bytes
 231 >    view utf8 bytes
 232 >    bytes ^. utf8
 233 >    decodeUtf8 bytes
 234
 235 -}
 236
 237 decode :: ((b -> Constant b b) -> (a -> Constant b a)) -> a -> b
 238 decode codec a = getConstant (codec Constant a)
 239
 240 {- | @eof@ tells you explicitly when decoding stops due to bad bytes or
 241     instead reaches end-of-file happily. (Without it one just makes an explicit
 242     test for emptiness of the resulting bytestring production using next) Thus
 243
 244 >    decode (utf8 . eof) bytes :: Producer T.Text IO (Either (Producer B.ByteString IO ()) ())
 245
 246     If we hit undecodable bytes, the remaining bytestring producer will be
 247     returned as a Left value; in the happy case, a Right value is returned
 248     with the anticipated return value for the original bytestring producer.
 249
 250     Given a bytestring producer called @bytes@ all of these will be the same:
 251
 252 >    decode (utf8 . eof) bytes
 253 >    view (utf8 . eof) bytes
 254 >    bytes^.utf8.eof
 255
 256 -}
 257
 258 eof :: (Monad m, Monad (t m), MonadTrans t) => Lens' (t m (Producer ByteString m r))
 259                        (t m (Either (Producer ByteString m r) r))
 260 eof k p0 = fmap fromEither (k (toEither p0)) where
 261
 262  fromEither = liftM (either id return)
 263
 264  toEither pp = do p <- pp
 265                   check p
 266
 267  check p = do e <- lift (next p)
 268               case e of
 269                 Left r -> return (Right r)
 270                 Right (bs,pb) ->  if B.null bs
 271                                     then check pb
 272                                     else return (Left (do yield bs
 273                                                           pb))
 274
 275 utf8 :: Codec
 276 utf8 = mkCodec decodeUtf8 TE.encodeUtf8
 277
 278 utf8Pure :: Codec
 279 utf8Pure = mkCodec decodeUtf8Pure TE.encodeUtf8
 280
 281 utf16LE :: Codec
 282 utf16LE = mkCodec decodeUtf16LE TE.encodeUtf16LE
 283
 284 utf16BE :: Codec
 285 utf16BE = mkCodec decodeUtf16BE TE.encodeUtf16BE
 286
 287 utf32LE :: Codec
 288 utf32LE = mkCodec decodeUtf32LE TE.encodeUtf32LE
 289
 290 utf32BE :: Codec
 291 utf32BE = mkCodec decodeUtf32BE TE.encodeUtf32BE
 292
 293 decodeStream :: Monad m
 294        => (B.ByteString -> DecodeResult)
 295        -> Producer ByteString m r -> Producer Text m (Producer ByteString m r)
 296 decodeStream = loop where
 297   loop dec0 p =
 298     do x <- lift (next p)
 299        case x of
 300          Left r -> return (return r)
 301          Right (chunk, p') -> case dec0 chunk of
 302            DecodeResultSuccess text dec -> do yield text
 303                                               loop dec p'
 304            DecodeResultFailure text bs -> do yield text
 305                                              return (do yield bs
 306                                                         p')
 307 {-# INLINABLE decodeStream#-}
 308
 309
 310 {- $decoders
 311    These are functions with the simple type:
 312
 313 >   decodeUtf8 :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r)
 314
 315    Thus in general
 316
 317 >     decodeUtf8 = view utf8
 318 >     decodeUtf16LE = view utf16LE
 319
 320    and so forth, but these forms
 321    may be more convenient (and give better type errors!) where lenses are
 322    not desired.
 323 -}
 324
 325
 326 decodeUtf8 :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r)
 327 decodeUtf8 = decodeStream Stream.decodeUtf8
 328 {-# INLINE decodeUtf8 #-}
 329
 330 decodeUtf8Pure :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r)
 331 decodeUtf8Pure = decodeStream Stream.decodeUtf8Pure
 332 {-# INLINE decodeUtf8Pure #-}
 333
 334 decodeUtf16LE :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r)
 335 decodeUtf16LE = decodeStream Stream.decodeUtf16LE
 336 {-# INLINE decodeUtf16LE #-}
 337
 338 decodeUtf16BE :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r)
 339 decodeUtf16BE = decodeStream Stream.decodeUtf16BE
 340 {-# INLINE decodeUtf16BE #-}
 341
 342 decodeUtf32LE :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r)
 343 decodeUtf32LE = decodeStream Stream.decodeUtf32LE
 344 {-# INLINE decodeUtf32LE #-}
 345
 346 decodeUtf32BE :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r)
 347 decodeUtf32BE = decodeStream Stream.decodeUtf32BE
 348 {-# INLINE decodeUtf32BE #-}
 349
 350
 351 {- $encoders
 352    These are simply defined
 353
 354 >      encodeUtf8 = yield . TE.encodeUtf8
 355
 356    They are intended for use with 'for'
 357
 358 >      for Text.stdin encodeUtf8 :: Producer ByteString IO ()
 359
 360    which would have the effect of
 361
 362 >      Text.stdin >-> Pipes.Prelude.map (TE.encodeUtf8)
 363
 364    using the encoding functions from Data.Text.Encoding
 365 -}
 366
 367 encodeUtf8 :: Monad m => Text -> Producer' ByteString m ()
 368 encodeUtf8 = yield . TE.encodeUtf8
 369 encodeUtf16LE :: Monad m => Text -> Producer' ByteString m ()
 370 encodeUtf16LE = yield . TE.encodeUtf16LE
 371 encodeUtf16BE :: Monad m => Text -> Producer' ByteString m ()
 372 encodeUtf16BE = yield . TE.encodeUtf16BE
 373 encodeUtf32LE :: Monad m => Text -> Producer' ByteString m ()
 374 encodeUtf32LE = yield . TE.encodeUtf32LE
 375 encodeUtf32BE :: Monad m => Text -> Producer' ByteString m ()
 376 encodeUtf32BE = yield . TE.encodeUtf32BE
 377
 378 mkCodec :: (forall r m . Monad m =>
 379            Producer ByteString m r -> Producer Text m (Producer ByteString m r ))
 380         -> (Text -> ByteString)
 381         -> Codec
 382 mkCodec dec enc = \k p0 -> fmap (\p -> join (for p (yield . enc)))  (k (dec p0))
 383
 384
 385
 386 {- $ascii
 387    ascii and latin encodings only use a small number of the characters 'Text'
 388      recognizes; thus we cannot use the pipes @Lens@ style to work with them.
 389      Rather we simply define functions each way.
 390 -}
 391
 392
 393 -- | 'encodeAscii' reduces as much of your stream of 'Text' actually is ascii to a byte stream,
 394 --   returning the rest of the 'Text' at the first non-ascii 'Char'
 395
 396 encodeAscii :: Monad m => Producer Text m r -> Producer ByteString m (Producer Text m r)
 397 encodeAscii = go where
 398   go p = do e <- lift (next p)
 399             case e of
 400               Left r -> return (return r)
 401               Right (chunk, p') ->
 402                  if T.null chunk
 403                    then go p'
 404                    else let (safe, unsafe)  = T.span (\c -> ord c <= 0x7F) chunk
 405                         in do yield (B8.pack (T.unpack safe))
 406                               if T.null unsafe
 407                                 then go p'
 408                                 else return $ do yield unsafe
 409                                                  p'
 410
 411 {- | Reduce as much of your stream of 'Text' actually is iso8859 or latin1 to a byte stream,
 412      returning the rest of the 'Text' upon hitting any non-latin 'Char'
 413    -}
 414 encodeIso8859_1 :: Monad m => Producer Text m r -> Producer ByteString m (Producer Text m r)
 415 encodeIso8859_1 = go where
 416   go p = do e <- lift (next p)
 417             case e of
 418               Left r -> return (return r)
 419               Right (txt, p') ->
 420                  if T.null txt
 421                    then go p'
 422                    else let (safe, unsafe)  = T.span (\c -> ord c <= 0xFF) txt
 423                         in do yield (B8.pack (T.unpack safe))
 424                               if T.null unsafe
 425                                 then go p'
 426                                 else return $ do yield unsafe
 427                                                  p'
 428
 429 {- | Reduce a byte stream to a corresponding stream of ascii chars, returning the
 430      unused 'ByteString' upon hitting an un-ascii byte.
 431    -}
 432 decodeAscii :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r)
 433 decodeAscii = go where
 434   go p = do e <- lift (next p)
 435             case e of
 436               Left r -> return (return r)
 437               Right (chunk, p') ->
 438                  if B.null chunk
 439                    then go p'
 440                    else let (safe, unsafe) = B.span (<= 0x7F) chunk
 441                         in do yield (T.pack (B8.unpack safe))
 442                               if B.null unsafe
 443                                 then go p'
 444                                 else return (do yield unsafe
 445                                                 p')
 446
 447 {- | Reduce a byte stream to a corresponding stream of ascii chars, returning the
 448      unused 'ByteString' upon hitting the rare un-latinizable byte.
 449      -}
 450 decodeIso8859_1 :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r)
 451 decodeIso8859_1 = go where
 452   go p = do e <- lift (next p)
 453             case e of
 454               Left r -> return (return r)
 455               Right (chunk, p') ->
 456                  if B.null chunk
 457                     then go p'
 458                     else do let (safe, unsafe) = B.span (<= 0xFF) chunk
 459                             yield (T.pack (B8.unpack safe))
 460                             if B.null unsafe
 461                                then go p'
 462                                else return (do yield unsafe
 463                                                p')
 464
 465
 466