Pipes/Text/Encoding.hs

   1 {-# LANGUAGE RankNTypes, BangPatterns #-}
   2
   3 -- | This module uses the stream decoding functions from Michael Snoyman's new
   4 --  <http://hackage.haskell.org/package/text-stream-decode text-stream-decode>
   5 --  package to define decoding functions and lenses.  The exported names
   6 --  conflict with names in @Data.Text.Encoding@ but the module can otherwise be
   7 --  imported unqualified.
   8
   9 module Pipes.Text.Encoding
  10     (
  11     -- * The Lens or Codec type
  12     -- $lenses
  13     Codec
  14     , decode
  15     , eof
  16     -- * \'Viewing\' the Text in a byte stream
  17     -- $codecs
  18     , utf8
  19     , utf8Pure
  20     , utf16LE
  21     , utf16BE
  22     , utf32LE
  23     , utf32BE
  24     -- * Non-lens decoding functions
  25     -- $decoders
  26     , decodeUtf8
  27     , decodeUtf8Pure
  28     , decodeUtf16LE
  29     , decodeUtf16BE
  30     , decodeUtf32LE
  31     , decodeUtf32BE
  32     -- * Re-encoding functions
  33     -- $encoders
  34     , encodeUtf8
  35     , encodeUtf16LE
  36     , encodeUtf16BE
  37     , encodeUtf32LE
  38     , encodeUtf32BE
  39     -- * Functions for latin and ascii text
  40     -- $ascii
  41     , encodeAscii
  42     , decodeAscii
  43     , encodeIso8859_1
  44     , decodeIso8859_1
  45     )
  46     where
  47
  48 import Data.Functor.Constant (Constant(..))
  49 import Data.Char (ord)
  50 import Data.ByteString as B
  51 import Data.ByteString (ByteString)
  52 import Data.ByteString.Char8 as B8
  53 import Data.Text (Text)
  54 import qualified Data.Text as T
  55 import qualified Data.Text.Encoding as TE
  56 import qualified Data.Streaming.Text as Stream
  57 import Data.Streaming.Text (DecodeResult(..))
  58 import Control.Monad (join)
  59 import Data.Word (Word8)
  60 import Pipes
  61
  62 type Lens' a b = forall f . Functor f => (b -> f b) -> (a -> f a)
  63
  64 {- $lenses
  65     The 'Codec' type is a simple specializion of
  66     the @Lens'@ type synonymn used by the standard lens libraries,
  67     <http://hackage.haskell.org/package/lens lens> and
  68     <http://hackage.haskell.org/package/lens-family lens-family>. That type,
  69
  70 >   type Lens' a b = forall f . Functor f => (b -> f b) -> (a -> f a)
  71
  72     is just an alias for a Prelude type. Thus you use any particular codec with
  73     the @view@ / @(^.)@ , @zoom@ and @over@ functions from either of those libraries;
  74     we presuppose neither since we already have access to the types they require.
  75
  76     -}
  77
  78 type Codec
  79     =  forall m r
  80     .  Monad m
  81     => Lens' (Producer ByteString m r)
  82              (Producer Text m (Producer ByteString m r))
  83
  84 {- | 'decode' is just the ordinary @view@ or @(^.)@ of the lens libraries;
  85       exported here under a name appropriate to the material. All of these are
  86       the same:
  87
  88 >    decode utf8 p = decodeUtf8 p = view utf8 p = p ^. utf8
  89
  90 -}
  91
  92
  93 decode :: ((b -> Constant b b) -> (a -> Constant b a)) -> a -> b
  94 decode codec a = getConstant (codec Constant a)
  95
  96 {- | 'eof' tells you explicitly when decoding stops due to bad bytes or instead
  97       reaches end-of-file happily. (Without it one just makes an explicit test
  98       for emptiness of the resulting bytestring production using 'next')
  99       Thus
 100
 101 >     decode (utf8 . eof) p =  view (utf8 . eof) p = p^.utf8.eof
 102
 103       will be a text producer. If we hit undecodable bytes, the remaining
 104       bytestring producer will be returned as a 'Left' value;
 105       in the happy case, a 'Right' value is returned with the anticipated
 106       return value for the original bytestring producer.
 107       )
 108
 109 -}
 110
 111 eof :: Monad m => Lens' (Producer Text m (Producer ByteString m r))
 112                         (Producer Text m (Either (Producer ByteString m r) r))
 113 eof k p = fmap fromEither (k (toEither p)) where
 114
 115   fromEither = liftM (either id return)
 116
 117   toEither pp = do p <- pp
 118                    check p
 119
 120   check p = do e <- lift (next p)
 121                case e of
 122                  Left r -> return (Right r)
 123                  Right (bs,pb) ->  if B.null bs
 124                                      then check pb
 125                                      else return (Left (do yield bs
 126                                                            pb))
 127
 128
 129 {- $codecs
 130
 131     Each Codec-lens looks into a byte stream that is supposed to contain text.
 132     The particular \'Codec\' lenses are named in accordance with the expected
 133     encoding, 'utf8', 'utf16LE' etc. To turn a Codec into an ordinary function,
 134     use @view@ / @(^.)@ -- here also called 'decode':
 135
 136 >   view utf8 :: Producer ByteString m r -> Producer Text m (Producer ByteString m r)
 137 >   decode utf8 Byte.stdin :: Producer Text IO (Producer ByteString IO r)
 138 >   Bytes.stdin ^. utf8 ::  Producer Text IO (Producer ByteString IO r)
 139
 140     Uses of a codec with @view@ or @(^.)@ or 'decode' can always be replaced by the specialized
 141     decoding functions exported here, e.g.
 142
 143 >   decodeUtf8 ::  Producer ByteString m r -> Producer Text m (Producer ByteString m r)
 144 >   decodeUtf8 Byte.stdin :: Producer Text IO (Producer ByteString IO r)
 145
 146     The stream of text that a @Codec@ \'sees\' in the stream of bytes begins at its head.
 147     At any point of decoding failure, the stream of text ends and reverts to (returns)
 148     the original byte stream. Thus if the first bytes are already
 149     un-decodable, the whole ByteString producer will be returned, i.e.
 150
 151 >   view utf8 bytestream
 152
 153     will just come to the same as
 154
 155 >   return bytestream
 156
 157     Where there is no decoding failure, the return value of the text stream will be
 158     an empty byte stream followed by its own return value.  In all cases you must
 159     deal with the fact that it is a /ByteString producer/ that is returned, even if
 160     it can be thrown away with @Control.Monad.void@
 161
 162 >   void (Bytes.stdin ^. utf8) :: Producer Text IO ()
 163
 164     @zoom@ converts a Text parser into a ByteString parser:
 165
 166 >   zoom utf8 drawChar :: Monad m => StateT (Producer ByteString m r) m (Maybe Char)
 167
 168     or, using the type synonymn from @Pipes.Parse@:
 169
 170 >   zoom utf8 drawChar :: Monad m => Parser ByteString m (Maybe Char)
 171
 172     Thus we can define a ByteString parser like this:
 173
 174 >   withNextByte :: Parser ByteString m (Maybe Char, Maybe Word8)))
 175 >   withNextByte = do char_ <- zoom utf8 Text.drawChar
 176 >                     byte_ <- Bytes.peekByte
 177 >                     return (char_, byte_)
 178
 179      Though @withNextByte@ is partly defined with a Text parser 'drawChar';
 180      but it is a ByteString parser; it will return the first valid utf8-encoded
 181      Char in a ByteString, whatever its length,
 182      and the first byte of the next character, if they exist. Because
 183      we \'draw\' one and \'peek\' at the other, the parser as a whole only
 184      advances one Char's length along the bytestring, whatever that length may be.
 185      See the slightly more complex example \'decode.hs\' in the
 186      <http://www.haskellforall.com/2014/02/pipes-parse-30-lens-based-parsing.html#batteries-included haskellforall>
 187      discussion of this type of byte stream parsing.
 188     -}
 189
 190 utf8 :: Codec
 191 utf8 = mkCodec decodeUtf8 TE.encodeUtf8
 192
 193 utf8Pure :: Codec
 194 utf8Pure = mkCodec decodeUtf8Pure TE.encodeUtf8
 195
 196 utf16LE :: Codec
 197 utf16LE = mkCodec decodeUtf16LE TE.encodeUtf16LE
 198
 199 utf16BE :: Codec
 200 utf16BE = mkCodec decodeUtf16BE TE.encodeUtf16BE
 201
 202 utf32LE :: Codec
 203 utf32LE = mkCodec decodeUtf32LE TE.encodeUtf32LE
 204
 205 utf32BE :: Codec
 206 utf32BE = mkCodec decodeUtf32BE TE.encodeUtf32BE
 207
 208 decodeStream :: Monad m
 209        => (B.ByteString -> DecodeResult)
 210        -> Producer ByteString m r -> Producer Text m (Producer ByteString m r)
 211 decodeStream = loop where
 212   loop dec0 p =
 213     do x <- lift (next p)
 214        case x of Left r -> return (return r)
 215                  Right (chunk, p') -> case dec0 chunk of
 216                     DecodeResultSuccess text dec -> do yield text
 217                                                        loop dec p'
 218                     DecodeResultFailure text bs -> do yield text
 219                                                       return (do yield bs
 220                                                                  p')
 221 {-# INLINABLE decodeStream#-}
 222
 223
 224 {- $decoders
 225    These are functions with the simple type:
 226
 227 >   decodeUtf8 :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r)
 228
 229    Thus in general
 230
 231 >     decodeUtf8 = view utf8
 232 >     decodeUtf16LE = view utf16LE
 233
 234    and so forth, but these forms
 235    may be more convenient (and give better type errors!) where lenses are
 236    not desired.
 237 -}
 238
 239
 240 decodeUtf8 :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r)
 241 decodeUtf8 = decodeStream Stream.decodeUtf8
 242 {-# INLINE decodeUtf8 #-}
 243
 244 decodeUtf8Pure :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r)
 245 decodeUtf8Pure = decodeStream Stream.decodeUtf8Pure
 246 {-# INLINE decodeUtf8Pure #-}
 247
 248 decodeUtf16LE :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r)
 249 decodeUtf16LE = decodeStream Stream.decodeUtf16LE
 250 {-# INLINE decodeUtf16LE #-}
 251
 252 decodeUtf16BE :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r)
 253 decodeUtf16BE = decodeStream Stream.decodeUtf16BE
 254 {-# INLINE decodeUtf16BE #-}
 255
 256 decodeUtf32LE :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r)
 257 decodeUtf32LE = decodeStream Stream.decodeUtf32LE
 258 {-# INLINE decodeUtf32LE #-}
 259
 260 decodeUtf32BE :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r)
 261 decodeUtf32BE = decodeStream Stream.decodeUtf32BE
 262 {-# INLINE decodeUtf32BE #-}
 263
 264
 265 {- $encoders
 266    These are simply defined
 267
 268 >      encodeUtf8 = yield . TE.encodeUtf8
 269
 270    They are intended for use with 'for'
 271
 272 >      for Text.stdin encodeUtf8 :: Producer ByteString IO ()
 273
 274    which would have the effect of
 275
 276 >      Text.stdin >-> Pipes.Prelude.map (TE.encodeUtf8)
 277
 278    using the encoding functions from Data.Text.Encoding
 279 -}
 280
 281 encodeUtf8 :: Monad m => Text -> Producer' ByteString m ()
 282 encodeUtf8 = yield . TE.encodeUtf8
 283 encodeUtf16LE :: Monad m => Text -> Producer' ByteString m ()
 284 encodeUtf16LE = yield . TE.encodeUtf16LE
 285 encodeUtf16BE :: Monad m => Text -> Producer' ByteString m ()
 286 encodeUtf16BE = yield . TE.encodeUtf16BE
 287 encodeUtf32LE :: Monad m => Text -> Producer' ByteString m ()
 288 encodeUtf32LE = yield . TE.encodeUtf32LE
 289 encodeUtf32BE :: Monad m => Text -> Producer' ByteString m ()
 290 encodeUtf32BE = yield . TE.encodeUtf32BE
 291
 292 mkCodec :: (forall r m . Monad m =>
 293            Producer ByteString m r -> Producer Text m (Producer ByteString m r ))
 294         -> (Text -> ByteString)
 295         -> Codec
 296 mkCodec dec enc = \k p0 -> fmap (\p -> join (for p (yield . enc)))  (k (dec p0))
 297
 298
 299
 300 {- $ascii
 301    ascii and latin encodings only use a small number of the characters 'Text'
 302      recognizes; thus we cannot use the pipes @Lens@ style to work with them.
 303      Rather we simply define functions each way.
 304 -}
 305
 306
 307 -- | 'encodeAscii' reduces as much of your stream of 'Text' actually is ascii to a byte stream,
 308 --   returning the rest of the 'Text' at the first non-ascii 'Char'
 309
 310 encodeAscii :: Monad m => Producer Text m r -> Producer ByteString m (Producer Text m r)
 311 encodeAscii = go where
 312   go p = do e <- lift (next p)
 313             case e of
 314               Left r -> return (return r)
 315               Right (chunk, p') ->
 316                  if T.null chunk
 317                    then go p'
 318                    else let (safe, unsafe)  = T.span (\c -> ord c <= 0x7F) chunk
 319                         in do yield (B8.pack (T.unpack safe))
 320                               if T.null unsafe
 321                                 then go p'
 322                                 else return $ do yield unsafe
 323                                                  p'
 324
 325 {- | Reduce as much of your stream of 'Text' actually is iso8859 or latin1 to a byte stream,
 326      returning the rest of the 'Text' upon hitting any non-latin 'Char'
 327    -}
 328 encodeIso8859_1 :: Monad m => Producer Text m r -> Producer ByteString m (Producer Text m r)
 329 encodeIso8859_1 = go where
 330   go p = do e <- lift (next p)
 331             case e of
 332               Left r -> return (return r)
 333               Right (txt, p') ->
 334                  if T.null txt
 335                    then go p'
 336                    else let (safe, unsafe)  = T.span (\c -> ord c <= 0xFF) txt
 337                         in do yield (B8.pack (T.unpack safe))
 338                               if T.null unsafe
 339                                 then go p'
 340                                 else return $ do yield unsafe
 341                                                  p'
 342
 343 {- | Reduce a byte stream to a corresponding stream of ascii chars, returning the
 344      unused 'ByteString' upon hitting an un-ascii byte.
 345    -}
 346 decodeAscii :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r)
 347 decodeAscii = go where
 348   go p = do e <- lift (next p)
 349             case e of
 350               Left r -> return (return r)
 351               Right (chunk, p') ->
 352                  if B.null chunk
 353                    then go p'
 354                    else let (safe, unsafe) = B.span (<= 0x7F) chunk
 355                         in do yield (T.pack (B8.unpack safe))
 356                               if B.null unsafe
 357                                 then go p'
 358                                 else return (do yield unsafe
 359                                                 p')
 360
 361 {- | Reduce a byte stream to a corresponding stream of ascii chars, returning the
 362      unused 'ByteString' upon hitting the rare un-latinizable byte.
 363      -}
 364 decodeIso8859_1 :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r)
 365 decodeIso8859_1 = go where
 366   go p = do e <- lift (next p)
 367             case e of
 368               Left r -> return (return r)
 369               Right (chunk, p') ->
 370                  if B.null chunk
 371                     then go p'
 372                     else do let (safe, unsafe) = B.span (<= 0xFF) chunk
 373                             yield (T.pack (B8.unpack safe))
 374                             if B.null unsafe
 375                                then go p'
 376                                else return (do yield unsafe
 377                                                p')
 378
 379
 380