Pipes/Text/Encoding.hs

   1 {-# LANGUAGE RankNTypes, BangPatterns #-}
   2
   3 -- | This module uses the stream decoding functions from
   4 --  <http://hackage.haskell.org/package/text-stream-decode text-stream-decode>
   5 --  package to define decoding functions and lenses.  The exported names
   6 --  conflict with names in @Data.Text.Encoding@ but not with the @Prelude@
   7
   8 module Pipes.Text.Encoding
   9     (
  10     -- * The Lens or Codec type
  11     -- $lenses
  12     Codec
  13     , decode
  14     , eof
  15     -- * \'Viewing\' the Text in a byte stream
  16     -- $codecs
  17     , utf8
  18     , utf8Pure
  19     , utf16LE
  20     , utf16BE
  21     , utf32LE
  22     , utf32BE
  23     -- * Non-lens decoding functions
  24     -- $decoders
  25     , decodeUtf8
  26     , decodeUtf8Pure
  27     , decodeUtf16LE
  28     , decodeUtf16BE
  29     , decodeUtf32LE
  30     , decodeUtf32BE
  31     -- * Re-encoding functions
  32     -- $encoders
  33     , encodeUtf8
  34     , encodeUtf16LE
  35     , encodeUtf16BE
  36     , encodeUtf32LE
  37     , encodeUtf32BE
  38     -- * Functions for latin and ascii text
  39     -- $ascii
  40     , encodeAscii
  41     , decodeAscii
  42     , encodeIso8859_1
  43     , decodeIso8859_1
  44     )
  45     where
  46
  47 import Data.Functor.Constant (Constant(..))
  48 import Data.Char (ord)
  49 import Data.ByteString as B
  50 import Data.ByteString (ByteString)
  51 import Data.ByteString.Char8 as B8
  52 import Data.Text (Text)
  53 import qualified Data.Text as T
  54 import qualified Data.Text.Encoding as TE
  55 import qualified Data.Streaming.Text as Stream
  56 import Data.Streaming.Text (DecodeResult(..))
  57 import Control.Monad (join, liftM)
  58 import Data.Word (Word8)
  59 import Pipes
  60
  61 type Lens' a b = forall f . Functor f => (b -> f b) -> (a -> f a)
  62
  63 {- $lenses
  64     The 'Codec' type is a simple specializion of
  65     the @Lens'@ type synonymn used by the standard lens libraries,
  66     <http://hackage.haskell.org/package/lens lens> and
  67     <http://hackage.haskell.org/package/lens-family lens-family>. That type,
  68
  69 >   type Lens' a b = forall f . Functor f => (b -> f b) -> (a -> f a)
  70
  71     is just an alias for a Prelude type. Thus you use any particular codec with
  72     the @view@ / @(^.)@ , @zoom@ and @over@ functions from either of those libraries;
  73     we presuppose neither library since we already have access to the types they require.
  74     -}
  75
  76 type Codec
  77     =  forall m r
  78     .  Monad m
  79     => Lens' (Producer ByteString m r)
  80              (Producer Text m (Producer ByteString m r))
  81
  82 {- | 'decode' is just the ordinary @view@ or @(^.)@ of the lens libraries;
  83       exported here under a name appropriate to the material. All of these are
  84       the same:
  85
  86 >    decode utf8 p = decodeUtf8 p = view utf8 p = p ^. utf8
  87
  88 -}
  89
  90
  91 decode :: ((b -> Constant b b) -> (a -> Constant b a)) -> a -> b
  92 decode codec a = getConstant (codec Constant a)
  93
  94 {- | 'eof' tells you explicitly when decoding stops due to bad bytes or instead
  95       reaches end-of-file happily. (Without it one just makes an explicit test
  96       for emptiness of the resulting bytestring production using 'next')
  97       Thus
  98
  99 >     decode (utf8 . eof) p =  view (utf8 . eof) p = p^.utf8.eof
 100
 101       will be a text producer. If we hit undecodable bytes, the remaining
 102       bytestring producer will be returned as a 'Left' value;
 103       in the happy case, a 'Right' value is returned with the anticipated
 104       return value for the original bytestring producer.
 105       )
 106
 107 -}
 108
 109 eof :: Monad m => Lens' (Producer Text m (Producer ByteString m r))
 110                         (Producer Text m (Either (Producer ByteString m r) r))
 111 eof k p = fmap fromEither (k (toEither p)) where
 112
 113   fromEither = liftM (either id return)
 114
 115   toEither pp = do p <- pp
 116                    check p
 117
 118   check p = do e <- lift (next p)
 119                case e of
 120                  Left r -> return (Right r)
 121                  Right (bs,pb) ->  if B.null bs
 122                                      then check pb
 123                                      else return (Left (do yield bs
 124                                                            pb))
 125
 126
 127 {- $codecs
 128
 129     Each Codec-lens looks into a byte stream that is supposed to contain text.
 130     The particular \'Codec\' lenses are named in accordance with the expected
 131     encoding, 'utf8', 'utf16LE' etc. To turn a Codec into an ordinary function,
 132     use @view@ / @(^.)@ -- here also called 'decode':
 133
 134 >   view utf8 :: Producer ByteString m r -> Producer Text m (Producer ByteString m r)
 135 >   decode utf8 Byte.stdin :: Producer Text IO (Producer ByteString IO r)
 136 >   Bytes.stdin ^. utf8 ::  Producer Text IO (Producer ByteString IO r)
 137
 138     Uses of a codec with @view@ or @(^.)@ or 'decode' can always be replaced by the specialized
 139     decoding functions exported here, e.g.
 140
 141 >   decodeUtf8 ::  Producer ByteString m r -> Producer Text m (Producer ByteString m r)
 142 >   decodeUtf8 Byte.stdin :: Producer Text IO (Producer ByteString IO r)
 143
 144     The stream of text that a @Codec@ \'sees\' in the stream of bytes begins at its head.
 145     At any point of decoding failure, the stream of text ends and reverts to (returns)
 146     the original byte stream. Thus if the first bytes are already
 147     un-decodable, the whole ByteString producer will be returned, i.e.
 148
 149 >   view utf8 bytestream
 150
 151     will just come to the same as
 152
 153 >   return bytestream
 154
 155     Where there is no decoding failure, the return value of the text stream will be
 156     an empty byte stream followed by its own return value.  In all cases you must
 157     deal with the fact that it is a /ByteString producer/ that is returned, even if
 158     it can be thrown away with @Control.Monad.void@
 159
 160 >   void (Bytes.stdin ^. utf8) :: Producer Text IO ()
 161
 162     @zoom@ converts a Text parser into a ByteString parser:
 163
 164 >   zoom utf8 drawChar :: Monad m => StateT (Producer ByteString m r) m (Maybe Char)
 165
 166     or, using the type synonymn from @Pipes.Parse@:
 167
 168 >   zoom utf8 drawChar :: Monad m => Parser ByteString m (Maybe Char)
 169
 170     Thus we can define a ByteString parser like this:
 171
 172 >   withNextByte :: Parser ByteString m (Maybe Char, Maybe Word8)))
 173 >   withNextByte = do char_ <- zoom utf8 Text.drawChar
 174 >                     byte_ <- Bytes.peekByte
 175 >                     return (char_, byte_)
 176
 177      Though @withNextByte@ is partly defined with a Text parser 'drawChar';
 178      but it is a ByteString parser; it will return the first valid utf8-encoded
 179      Char in a ByteString, whatever its length,
 180      and the first byte of the next character, if they exist. Because
 181      we \'draw\' one and \'peek\' at the other, the parser as a whole only
 182      advances one Char's length along the bytestring, whatever that length may be.
 183      See the slightly more complex example \'decode.hs\' in the
 184      <http://www.haskellforall.com/2014/02/pipes-parse-30-lens-based-parsing.html#batteries-included haskellforall>
 185      discussion of this type of byte stream parsing.
 186     -}
 187
 188 utf8 :: Codec
 189 utf8 = mkCodec decodeUtf8 TE.encodeUtf8
 190
 191 utf8Pure :: Codec
 192 utf8Pure = mkCodec decodeUtf8Pure TE.encodeUtf8
 193
 194 utf16LE :: Codec
 195 utf16LE = mkCodec decodeUtf16LE TE.encodeUtf16LE
 196
 197 utf16BE :: Codec
 198 utf16BE = mkCodec decodeUtf16BE TE.encodeUtf16BE
 199
 200 utf32LE :: Codec
 201 utf32LE = mkCodec decodeUtf32LE TE.encodeUtf32LE
 202
 203 utf32BE :: Codec
 204 utf32BE = mkCodec decodeUtf32BE TE.encodeUtf32BE
 205
 206 decodeStream :: Monad m
 207        => (B.ByteString -> DecodeResult)
 208        -> Producer ByteString m r -> Producer Text m (Producer ByteString m r)
 209 decodeStream = loop where
 210   loop dec0 p =
 211     do x <- lift (next p)
 212        case x of Left r -> return (return r)
 213                  Right (chunk, p') -> case dec0 chunk of
 214                     DecodeResultSuccess text dec -> do yield text
 215                                                        loop dec p'
 216                     DecodeResultFailure text bs -> do yield text
 217                                                       return (do yield bs
 218                                                                  p')
 219 {-# INLINABLE decodeStream#-}
 220
 221
 222 {- $decoders
 223    These are functions with the simple type:
 224
 225 >   decodeUtf8 :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r)
 226
 227    Thus in general
 228
 229 >     decodeUtf8 = view utf8
 230 >     decodeUtf16LE = view utf16LE
 231
 232    and so forth, but these forms
 233    may be more convenient (and give better type errors!) where lenses are
 234    not desired.
 235 -}
 236
 237
 238 decodeUtf8 :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r)
 239 decodeUtf8 = decodeStream Stream.decodeUtf8
 240 {-# INLINE decodeUtf8 #-}
 241
 242 decodeUtf8Pure :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r)
 243 decodeUtf8Pure = decodeStream Stream.decodeUtf8Pure
 244 {-# INLINE decodeUtf8Pure #-}
 245
 246 decodeUtf16LE :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r)
 247 decodeUtf16LE = decodeStream Stream.decodeUtf16LE
 248 {-# INLINE decodeUtf16LE #-}
 249
 250 decodeUtf16BE :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r)
 251 decodeUtf16BE = decodeStream Stream.decodeUtf16BE
 252 {-# INLINE decodeUtf16BE #-}
 253
 254 decodeUtf32LE :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r)
 255 decodeUtf32LE = decodeStream Stream.decodeUtf32LE
 256 {-# INLINE decodeUtf32LE #-}
 257
 258 decodeUtf32BE :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r)
 259 decodeUtf32BE = decodeStream Stream.decodeUtf32BE
 260 {-# INLINE decodeUtf32BE #-}
 261
 262
 263 {- $encoders
 264    These are simply defined
 265
 266 >      encodeUtf8 = yield . TE.encodeUtf8
 267
 268    They are intended for use with 'for'
 269
 270 >      for Text.stdin encodeUtf8 :: Producer ByteString IO ()
 271
 272    which would have the effect of
 273
 274 >      Text.stdin >-> Pipes.Prelude.map (TE.encodeUtf8)
 275
 276    using the encoding functions from Data.Text.Encoding
 277 -}
 278
 279 encodeUtf8 :: Monad m => Text -> Producer' ByteString m ()
 280 encodeUtf8 = yield . TE.encodeUtf8
 281 encodeUtf16LE :: Monad m => Text -> Producer' ByteString m ()
 282 encodeUtf16LE = yield . TE.encodeUtf16LE
 283 encodeUtf16BE :: Monad m => Text -> Producer' ByteString m ()
 284 encodeUtf16BE = yield . TE.encodeUtf16BE
 285 encodeUtf32LE :: Monad m => Text -> Producer' ByteString m ()
 286 encodeUtf32LE = yield . TE.encodeUtf32LE
 287 encodeUtf32BE :: Monad m => Text -> Producer' ByteString m ()
 288 encodeUtf32BE = yield . TE.encodeUtf32BE
 289
 290 mkCodec :: (forall r m . Monad m =>
 291            Producer ByteString m r -> Producer Text m (Producer ByteString m r ))
 292         -> (Text -> ByteString)
 293         -> Codec
 294 mkCodec dec enc = \k p0 -> fmap (\p -> join (for p (yield . enc)))  (k (dec p0))
 295
 296
 297
 298 {- $ascii
 299    ascii and latin encodings only use a small number of the characters 'Text'
 300      recognizes; thus we cannot use the pipes @Lens@ style to work with them.
 301      Rather we simply define functions each way.
 302 -}
 303
 304
 305 -- | 'encodeAscii' reduces as much of your stream of 'Text' actually is ascii to a byte stream,
 306 --   returning the rest of the 'Text' at the first non-ascii 'Char'
 307
 308 encodeAscii :: Monad m => Producer Text m r -> Producer ByteString m (Producer Text m r)
 309 encodeAscii = go where
 310   go p = do e <- lift (next p)
 311             case e of
 312               Left r -> return (return r)
 313               Right (chunk, p') ->
 314                  if T.null chunk
 315                    then go p'
 316                    else let (safe, unsafe)  = T.span (\c -> ord c <= 0x7F) chunk
 317                         in do yield (B8.pack (T.unpack safe))
 318                               if T.null unsafe
 319                                 then go p'
 320                                 else return $ do yield unsafe
 321                                                  p'
 322
 323 {- | Reduce as much of your stream of 'Text' actually is iso8859 or latin1 to a byte stream,
 324      returning the rest of the 'Text' upon hitting any non-latin 'Char'
 325    -}
 326 encodeIso8859_1 :: Monad m => Producer Text m r -> Producer ByteString m (Producer Text m r)
 327 encodeIso8859_1 = go where
 328   go p = do e <- lift (next p)
 329             case e of
 330               Left r -> return (return r)
 331               Right (txt, p') ->
 332                  if T.null txt
 333                    then go p'
 334                    else let (safe, unsafe)  = T.span (\c -> ord c <= 0xFF) txt
 335                         in do yield (B8.pack (T.unpack safe))
 336                               if T.null unsafe
 337                                 then go p'
 338                                 else return $ do yield unsafe
 339                                                  p'
 340
 341 {- | Reduce a byte stream to a corresponding stream of ascii chars, returning the
 342      unused 'ByteString' upon hitting an un-ascii byte.
 343    -}
 344 decodeAscii :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r)
 345 decodeAscii = go where
 346   go p = do e <- lift (next p)
 347             case e of
 348               Left r -> return (return r)
 349               Right (chunk, p') ->
 350                  if B.null chunk
 351                    then go p'
 352                    else let (safe, unsafe) = B.span (<= 0x7F) chunk
 353                         in do yield (T.pack (B8.unpack safe))
 354                               if B.null unsafe
 355                                 then go p'
 356                                 else return (do yield unsafe
 357                                                 p')
 358
 359 {- | Reduce a byte stream to a corresponding stream of ascii chars, returning the
 360      unused 'ByteString' upon hitting the rare un-latinizable byte.
 361      -}
 362 decodeIso8859_1 :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r)
 363 decodeIso8859_1 = go where
 364   go p = do e <- lift (next p)
 365             case e of
 366               Left r -> return (return r)
 367               Right (chunk, p') ->
 368                  if B.null chunk
 369                     then go p'
 370                     else do let (safe, unsafe) = B.span (<= 0xFF) chunk
 371                             yield (T.pack (B8.unpack safe))
 372                             if B.null unsafe
 373                                then go p'
 374                                else return (do yield unsafe
 375                                                p')
 376
 377
 378