Pipes/Text/Encoding.hs

   1 {-# LANGUAGE RankNTypes, BangPatterns #-}
   2
   3 -- | This module uses the stream decoding functions from
   4 --  <http://hackage.haskell.org/package/streaming-commons streaming-commons>
   5 --  package to define decoding functions and lenses.  The exported names
   6 --  conflict with names in @Data.Text.Encoding@ but not with the @Prelude@
   7
   8 module Pipes.Text.Encoding
   9     (
  10     -- * Decoding ByteStrings and Encoding Texts
  11     -- ** Simple usage
  12     -- $usage
  13
  14     -- ** Lens usage
  15     -- $lenses
  16
  17
  18     -- * Basic lens operations
  19     Codec
  20     , decode
  21     , eof
  22     -- * Decoding lenses
  23     , utf8
  24     , utf8Pure
  25     , utf16LE
  26     , utf16BE
  27     , utf32LE
  28     , utf32BE
  29     -- * Non-lens decoding functions
  30     -- $decoders
  31     , decodeUtf8
  32     , decodeUtf8Pure
  33     , decodeUtf16LE
  34     , decodeUtf16BE
  35     , decodeUtf32LE
  36     , decodeUtf32BE
  37     -- * Re-encoding functions
  38     -- $encoders
  39     , encodeUtf8
  40     , encodeUtf16LE
  41     , encodeUtf16BE
  42     , encodeUtf32LE
  43     , encodeUtf32BE
  44     -- * Functions for latin and ascii text
  45     -- $ascii
  46     , encodeAscii
  47     , decodeAscii
  48     , encodeIso8859_1
  49     , decodeIso8859_1
  50     )
  51     where
  52
  53 import Data.Functor.Constant (Constant(..))
  54 import Data.Char (ord)
  55 import Data.ByteString as B
  56 import Data.ByteString (ByteString)
  57 import Data.ByteString.Char8 as B8
  58 import Data.Text (Text)
  59 import qualified Data.Text as T
  60 import qualified Data.Text.Encoding as TE
  61 import qualified Data.Streaming.Text as Stream
  62 import Data.Streaming.Text (DecodeResult(..))
  63 import Control.Monad (join, liftM)
  64 import Data.Word (Word8)
  65 import Pipes
  66
  67
  68
  69 {- $usage
  70     Given
  71
  72 >   text :: Producer Text IO ()
  73
  74     we can encode it with @Data.Text.Encoding@ and ordinary pipe operations:
  75
  76 >   text >-> P.map TE.encodeUtf8 :: Producer.ByteString IO ()
  77
  78     or, using this module, with
  79
  80 >   for text encodeUtf8 :: Producer.ByteString IO ()
  81
  82     Given
  83
  84 >   bytes :: Producer ByteString Text IO ()
  85
  86     we can apply a decoding function from this module:
  87
  88 >   decodeUtf8 bytes :: Producer Text IO (Producer ByteString IO ())
  89
  90     The Text producer ends wherever decoding first fails. Thus we can re-encode
  91     as uft8 as much of our byte stream as is decodeUtf16BE decodable, with, e.g.
  92
  93 >   for (decodeUtf16BE bytes) encodeUtf8 :: Producer ByteString IO (Producer ByteString IO ())
  94
  95     The bytestring producer that is returned begins with where utf16BE decoding
  96     failed; it it didn't fail the producer is empty.
  97
  98 -}
  99
 100 {- $lenses
 101     We get a bit more flexibility, though, if we use a lens like @utf8@ or @utf16BE@
 102     that looks for text in an appropriately encoded byte stream.
 103
 104 >   type Lens' a b = forall f . Functor f => (b -> f b) -> (a -> f a)
 105
 106     is just an alias for a Prelude type.   We abbreviate this further, for our use case, as
 107
 108 >   type Codec
 109 >     =  forall m r .  Monad m => Lens' (Producer ByteString m r) (Producer Text m (Producer ByteString m r))
 110
 111     and call the decoding lenses @utf8@, @utf16BE@ \"codecs\", since they can
 112     re-encode what they have decoded.  Thus you use any particular codec with
 113     the @view@ / @(^.)@ , @zoom@ and @over@ functions from the standard lens libraries;
 114     we presuppose neither <http://hackage.haskell.org/package/lens lens>
 115     nor  <http://hackage.haskell.org/package/lens-family lens-family>
 116     since we already have access to the types they require.
 117
 118     Each decoding lens looks into a byte stream that is supposed to contain text.
 119     The particular lenses are named in accordance with the expected
 120     encoding, 'utf8', 'utf16LE' etc. To turn a such a lens or @Codec@
 121     into an ordinary function, use @view@ / @(^.)@ -- here also called 'decode':
 122
 123 >   view utf8 :: Producer ByteString m r -> Producer Text m (Producer ByteString m r)
 124 >   decode utf8 Byte.stdin :: Producer Text IO (Producer ByteString IO r)
 125 >   Bytes.stdin ^. utf8 ::  Producer Text IO (Producer ByteString IO r)
 126
 127     These simple uses of a codec with @view@ or @(^.)@ or 'decode' can always be replaced by
 128     the specialized decoding functions exported here, e.g.
 129
 130 >   decodeUtf8 ::  Producer ByteString m r -> Producer Text m (Producer ByteString m r)
 131 >   decodeUtf8 Byte.stdin :: Producer Text IO (Producer ByteString IO r)
 132
 133     As with these functions, the stream of text that a @Codec@ \'sees\'
 134     in the stream of bytes begins at its head.
 135     At any point of decoding failure, the stream of text ends and reverts to (returns)
 136     the original byte stream. Thus if the first bytes are already
 137     un-decodable, the whole ByteString producer will be returned, i.e.
 138
 139 >   view utf8 bad_bytestream
 140
 141     will just come to the same as
 142
 143 >   return bad_bytestream
 144
 145     Where there is no decoding failure, the return value of the text stream will be
 146     an empty byte stream followed by its own return value.  In all cases you must
 147     deal with the fact that it is a /ByteString producer/ that is returned, even if
 148     it can be thrown away with @Control.Monad.void@
 149
 150 >   void (Bytes.stdin ^. utf8) :: Producer Text IO ()
 151
 152     The @eof@ lens permits you to pattern match: if there is a Right value,
 153     it is the leftover bytestring producer, if there is a Right value, it
 154     is the return value of the original bytestring producer:
 155
 156 >   Bytes.stdin ^. utf8 . eof :: Producer Text IO (Either (Producer ByteString IO IO) ())
 157
 158     Thus for the stream of un-decodable bytes mentioned above,
 159
 160 >   view (utf8 . eof) bad_bytestream
 161
 162     will be the same as
 163
 164 >   return (Left bad_bytestream)
 165
 166     @zoom@ converts a Text parser into a ByteString parser:
 167
 168 >   zoom utf8 drawChar :: Monad m => StateT (Producer ByteString m r) m (Maybe Char)
 169
 170     or, using the type synonymn from @Pipes.Parse@:
 171
 172 >   zoom utf8 drawChar :: Monad m => Parser ByteString m (Maybe Char)
 173
 174     Thus we can define a ByteString parser (in the pipes-parse sense) like this:
 175
 176 >   charPlusByte :: Parser ByteString m (Maybe Char, Maybe Word8)))
 177 >   charPlusByte = do char_ <- zoom utf8 Text.drawChar
 178 >                     byte_ <- Bytes.peekByte
 179 >                     return (char_, byte_)
 180
 181      Though @charPlusByte@ is partly defined with a Text parser 'drawChar';
 182      but it is a ByteString parser; it will return the first valid utf8-encoded
 183      Char in a ByteString, whatever its byte-length,
 184      and the first byte following, if both exist. Because
 185      we \'draw\' one and \'peek\' at the other, the parser as a whole only
 186      advances one Char's length along the bytestring, whatever that length may be.
 187      See the slightly more complex example \'decode.hs\' in the
 188      <http://www.haskellforall.com/2014/02/pipes-parse-30-lens-based-parsing.html#batteries-included haskellforall blog>
 189      discussion of this type of byte stream parsing.
 190     -}
 191
 192 type Lens' a b = forall f . Functor f => (b -> f b) -> (a -> f a)
 193
 194 type Codec
 195     =  forall m r
 196     .  Monad m
 197     => Lens' (Producer ByteString m r)
 198              (Producer Text m (Producer ByteString m r))
 199
 200
 201 {- | @decode@ is just the ordinary @view@ or @(^.)@ of the lens libraries;
 202    exported here under a name appropriate to the material. Thus
 203
 204 >    decode utf8 bytes :: Producer Text IO (Producer ByteString IO ())
 205
 206     All of these are thus the same:
 207
 208 >    decode utf8 bytes = view utf8 bytes = bytes ^. utf8 = decodeUtf8 bytes
 209
 210
 211 -}
 212
 213 decode :: ((b -> Constant b b) -> (a -> Constant b a)) -> a -> b
 214 decode codec a = getConstant (codec Constant a)
 215
 216 {- | @eof@ tells you explicitly when decoding stops due to bad bytes or
 217     instead reaches end-of-file happily. (Without it one just makes an explicit
 218     test for emptiness of the resulting bytestring production using next) Thus
 219
 220 >    decode (utf8 . eof) bytes :: Producer T.Text IO (Either (Producer B.ByteString IO ()) ())
 221
 222     If we hit undecodable bytes, the remaining bytestring producer will be
 223     returned as a Left value; in the happy case, a Right value is returned
 224     with the anticipated return value for the original bytestring producer.
 225
 226     Again, all of these are the same
 227
 228 >    decode (utf8 . eof) bytes = view (utf8 . eof) p = p^.utf8.eof
 229
 230 -}
 231
 232 eof :: Monad m => Lens' (Producer Text m (Producer ByteString m r))
 233                        (Producer Text m (Either (Producer ByteString m r) r))
 234 eof k p = fmap fromEither (k (toEither p)) where
 235
 236  fromEither = liftM (either id return)
 237
 238  toEither pp = do p <- pp
 239                   check p
 240
 241  check p = do e <- lift (next p)
 242               case e of
 243                 Left r -> return (Right r)
 244                 Right (bs,pb) ->  if B.null bs
 245                                     then check pb
 246                                     else return (Left (do yield bs
 247                                                           pb))
 248
 249 utf8 :: Codec
 250 utf8 = mkCodec decodeUtf8 TE.encodeUtf8
 251
 252 utf8Pure :: Codec
 253 utf8Pure = mkCodec decodeUtf8Pure TE.encodeUtf8
 254
 255 utf16LE :: Codec
 256 utf16LE = mkCodec decodeUtf16LE TE.encodeUtf16LE
 257
 258 utf16BE :: Codec
 259 utf16BE = mkCodec decodeUtf16BE TE.encodeUtf16BE
 260
 261 utf32LE :: Codec
 262 utf32LE = mkCodec decodeUtf32LE TE.encodeUtf32LE
 263
 264 utf32BE :: Codec
 265 utf32BE = mkCodec decodeUtf32BE TE.encodeUtf32BE
 266
 267 decodeStream :: Monad m
 268        => (B.ByteString -> DecodeResult)
 269        -> Producer ByteString m r -> Producer Text m (Producer ByteString m r)
 270 decodeStream = loop where
 271   loop dec0 p =
 272     do x <- lift (next p)
 273        case x of Left r -> return (return r)
 274                  Right (chunk, p') -> case dec0 chunk of
 275                     DecodeResultSuccess text dec -> do yield text
 276                                                        loop dec p'
 277                     DecodeResultFailure text bs -> do yield text
 278                                                       return (do yield bs
 279                                                                  p')
 280 {-# INLINABLE decodeStream#-}
 281
 282
 283 {- $decoders
 284    These are functions with the simple type:
 285
 286 >   decodeUtf8 :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r)
 287
 288    Thus in general
 289
 290 >     decodeUtf8 = view utf8
 291 >     decodeUtf16LE = view utf16LE
 292
 293    and so forth, but these forms
 294    may be more convenient (and give better type errors!) where lenses are
 295    not desired.
 296 -}
 297
 298
 299 decodeUtf8 :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r)
 300 decodeUtf8 = decodeStream Stream.decodeUtf8
 301 {-# INLINE decodeUtf8 #-}
 302
 303 decodeUtf8Pure :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r)
 304 decodeUtf8Pure = decodeStream Stream.decodeUtf8Pure
 305 {-# INLINE decodeUtf8Pure #-}
 306
 307 decodeUtf16LE :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r)
 308 decodeUtf16LE = decodeStream Stream.decodeUtf16LE
 309 {-# INLINE decodeUtf16LE #-}
 310
 311 decodeUtf16BE :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r)
 312 decodeUtf16BE = decodeStream Stream.decodeUtf16BE
 313 {-# INLINE decodeUtf16BE #-}
 314
 315 decodeUtf32LE :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r)
 316 decodeUtf32LE = decodeStream Stream.decodeUtf32LE
 317 {-# INLINE decodeUtf32LE #-}
 318
 319 decodeUtf32BE :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r)
 320 decodeUtf32BE = decodeStream Stream.decodeUtf32BE
 321 {-# INLINE decodeUtf32BE #-}
 322
 323
 324 {- $encoders
 325    These are simply defined
 326
 327 >      encodeUtf8 = yield . TE.encodeUtf8
 328
 329    They are intended for use with 'for'
 330
 331 >      for Text.stdin encodeUtf8 :: Producer ByteString IO ()
 332
 333    which would have the effect of
 334
 335 >      Text.stdin >-> Pipes.Prelude.map (TE.encodeUtf8)
 336
 337    using the encoding functions from Data.Text.Encoding
 338 -}
 339
 340 encodeUtf8 :: Monad m => Text -> Producer' ByteString m ()
 341 encodeUtf8 = yield . TE.encodeUtf8
 342 encodeUtf16LE :: Monad m => Text -> Producer' ByteString m ()
 343 encodeUtf16LE = yield . TE.encodeUtf16LE
 344 encodeUtf16BE :: Monad m => Text -> Producer' ByteString m ()
 345 encodeUtf16BE = yield . TE.encodeUtf16BE
 346 encodeUtf32LE :: Monad m => Text -> Producer' ByteString m ()
 347 encodeUtf32LE = yield . TE.encodeUtf32LE
 348 encodeUtf32BE :: Monad m => Text -> Producer' ByteString m ()
 349 encodeUtf32BE = yield . TE.encodeUtf32BE
 350
 351 mkCodec :: (forall r m . Monad m =>
 352            Producer ByteString m r -> Producer Text m (Producer ByteString m r ))
 353         -> (Text -> ByteString)
 354         -> Codec
 355 mkCodec dec enc = \k p0 -> fmap (\p -> join (for p (yield . enc)))  (k (dec p0))
 356
 357
 358
 359 {- $ascii
 360    ascii and latin encodings only use a small number of the characters 'Text'
 361      recognizes; thus we cannot use the pipes @Lens@ style to work with them.
 362      Rather we simply define functions each way.
 363 -}
 364
 365
 366 -- | 'encodeAscii' reduces as much of your stream of 'Text' actually is ascii to a byte stream,
 367 --   returning the rest of the 'Text' at the first non-ascii 'Char'
 368
 369 encodeAscii :: Monad m => Producer Text m r -> Producer ByteString m (Producer Text m r)
 370 encodeAscii = go where
 371   go p = do e <- lift (next p)
 372             case e of
 373               Left r -> return (return r)
 374               Right (chunk, p') ->
 375                  if T.null chunk
 376                    then go p'
 377                    else let (safe, unsafe)  = T.span (\c -> ord c <= 0x7F) chunk
 378                         in do yield (B8.pack (T.unpack safe))
 379                               if T.null unsafe
 380                                 then go p'
 381                                 else return $ do yield unsafe
 382                                                  p'
 383
 384 {- | Reduce as much of your stream of 'Text' actually is iso8859 or latin1 to a byte stream,
 385      returning the rest of the 'Text' upon hitting any non-latin 'Char'
 386    -}
 387 encodeIso8859_1 :: Monad m => Producer Text m r -> Producer ByteString m (Producer Text m r)
 388 encodeIso8859_1 = go where
 389   go p = do e <- lift (next p)
 390             case e of
 391               Left r -> return (return r)
 392               Right (txt, p') ->
 393                  if T.null txt
 394                    then go p'
 395                    else let (safe, unsafe)  = T.span (\c -> ord c <= 0xFF) txt
 396                         in do yield (B8.pack (T.unpack safe))
 397                               if T.null unsafe
 398                                 then go p'
 399                                 else return $ do yield unsafe
 400                                                  p'
 401
 402 {- | Reduce a byte stream to a corresponding stream of ascii chars, returning the
 403      unused 'ByteString' upon hitting an un-ascii byte.
 404    -}
 405 decodeAscii :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r)
 406 decodeAscii = go where
 407   go p = do e <- lift (next p)
 408             case e of
 409               Left r -> return (return r)
 410               Right (chunk, p') ->
 411                  if B.null chunk
 412                    then go p'
 413                    else let (safe, unsafe) = B.span (<= 0x7F) chunk
 414                         in do yield (T.pack (B8.unpack safe))
 415                               if B.null unsafe
 416                                 then go p'
 417                                 else return (do yield unsafe
 418                                                 p')
 419
 420 {- | Reduce a byte stream to a corresponding stream of ascii chars, returning the
 421      unused 'ByteString' upon hitting the rare un-latinizable byte.
 422      -}
 423 decodeIso8859_1 :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r)
 424 decodeIso8859_1 = go where
 425   go p = do e <- lift (next p)
 426             case e of
 427               Left r -> return (return r)
 428               Right (chunk, p') ->
 429                  if B.null chunk
 430                     then go p'
 431                     else do let (safe, unsafe) = B.span (<= 0xFF) chunk
 432                             yield (T.pack (B8.unpack safe))
 433                             if B.null unsafe
 434                                then go p'
 435                                else return (do yield unsafe
 436                                                p')
 437
 438
 439