Pipes/Text/Encoding.hs

   1 {-# LANGUAGE RankNTypes, BangPatterns #-}
   2
   3 -- | This module uses the stream decoding functions from Michael Snoyman's new
   4 --  <http://hackage.haskell.org/package/text-stream-decode text-stream-decode>
   5 --  package to define decoding functions and lenses.  The exported names
   6 --  conflict with names in @Data.Text.Encoding@ but the module can otherwise be
   7 --  imported unqualified.
   8
   9 module Pipes.Text.Encoding
  10     (
  11     -- * The Lens or Codec type
  12     -- $lenses
  13     Codec
  14     , decode
  15     -- * Viewing the Text in a ByteString
  16     -- $codecs
  17     , utf8
  18     , utf8Pure
  19     , utf16LE
  20     , utf16BE
  21     , utf32LE
  22     , utf32BE
  23     -- * Non-lens decoding functions
  24     -- $decoders
  25     , decodeUtf8
  26     , decodeUtf8Pure
  27     , decodeUtf16LE
  28     , decodeUtf16BE
  29     , decodeUtf32LE
  30     , decodeUtf32BE
  31     -- * Re-encoding functions
  32     -- $encoders
  33     , encodeUtf8
  34     , encodeUtf16LE
  35     , encodeUtf16BE
  36     , encodeUtf32LE
  37     , encodeUtf32BE
  38     -- * Functions for latin and ascii text
  39     -- $ascii
  40     , encodeAscii
  41     , decodeAscii
  42     , encodeIso8859_1
  43     , decodeIso8859_1
  44     )
  45     where
  46
  47 import Data.Functor.Constant (Constant(..))
  48 import Data.Char (ord)
  49 import Data.ByteString as B
  50 import Data.ByteString (ByteString)
  51 import Data.ByteString.Char8 as B8
  52 import Data.Text (Text)
  53 import qualified Data.Text as T
  54 import qualified Data.Text.Encoding as TE
  55 import Data.Text.StreamDecoding
  56 import Control.Monad (join)
  57 import Data.Word (Word8)
  58 import Pipes
  59
  60
  61 type Lens' a b = forall f . Functor f => (b -> f b) -> (a -> f a)
  62
  63 {- $lenses
  64     The 'Codec' type is a simple specializion of
  65     the @Lens'@ type synonymn used by the standard lens libraries,
  66     <http://hackage.haskell.org/package/lens lens> and
  67     <http://hackage.haskell.org/package/lens-family lens-family>. That type,
  68
  69 >   type Lens' a b = forall f . Functor f => (b -> f b) -> (a -> f a)
  70
  71     is just an alias for a Prelude type. Thus you use any particular codec with
  72     the @view@ / @(^.)@ , @zoom@ and @over@ functions from either of those libraries;
  73     we presuppose neither since we already have access to the types they require.
  74
  75     -}
  76
  77 type Codec
  78     =  forall m r
  79     .  Monad m
  80     => Lens' (Producer ByteString m r)
  81              (Producer Text m (Producer ByteString m r))
  82
  83 {- | 'decode' is just the ordinary @view@ or @(^.)@ of the lens libraries;
  84       exported here under a name appropriate to the material. All of these are
  85       the same:
  86
  87 >    decode utf8 p = decodeUtf8 p = view utf8 p = p ^. utf8
  88
  89 -}
  90
  91 decode :: ((b -> Constant b b) -> (a -> Constant b a)) -> a -> b
  92 decode codec a = getConstant (codec Constant a)
  93
  94
  95 {- $codecs
  96
  97     Each Codec-lens looks into a byte stream that is supposed to contain text.
  98     The particular \'Codec\' lenses are named in accordance with the expected
  99     encoding, 'utf8', 'utf16LE' etc. To turn a Codec into an ordinary function,
 100     use @view@ / @(^.)@ -- here also called 'decode':
 101
 102 >   view utf8 :: Producer ByteString m r -> Producer Text m (Producer ByteString m r)
 103 >   decode utf8 Byte.stdin :: Producer Text IO (Producer ByteString IO r)
 104 >   Bytes.stdin ^. utf8 ::  Producer Text IO (Producer ByteString IO r)
 105
 106     Uses of a codec with @view@ or @(^.)@ or 'decode' can always be replaced by the specialized
 107     decoding functions exported here, e.g.
 108
 109 >   decodeUtf8 ::  Producer ByteString m r -> Producer Text m (Producer ByteString m r)
 110 >   decodeUtf8 Byte.stdin :: Producer Text IO (Producer ByteString IO r)
 111
 112     The stream of text that a @Codec@ \'sees\' in the stream of bytes begins at its head.
 113     At any point of decoding failure, the stream of text ends and reverts to (returns)
 114     the original byte stream. Thus if the first bytes are already
 115     un-decodable, the whole ByteString producer will be returned, i.e.
 116
 117 >   view utf8 bytestream
 118
 119     will just come to the same as
 120
 121 >   return bytestream
 122
 123     Where there is no decoding failure, the return value of the text stream will be
 124     an empty byte stream followed by its own return value.  In all cases you must
 125     deal with the fact that it is a /ByteString producer/ that is returned, even if
 126     it can be thrown away with @Control.Monad.void@
 127
 128 >   void (Bytes.stdin ^. utf8) :: Producer Text IO ()
 129
 130     @zoom@ converts a Text parser into a ByteString parser:
 131
 132 >   zoom utf8 drawChar :: Monad m => StateT (Producer ByteString m r) m (Maybe Char)
 133
 134     or, with the type synonymn of @Pipes.Parse@:
 135
 136 >   zoom utf8 drawChar :: Monad m => Parser ByteString m (Maybe Char)
 137
 138     Thus we can define ByteString like this:
 139
 140 >   withNextByte :: Parser ByteString m (Maybe Char, Maybe Word8)))
 141 >   withNextByte = do char_ <- zoom utf8 Text.drawChar
 142 >                     byte_ <- Bytes.peekByte
 143 >                     return (char_, byte_)
 144
 145      Though @withNextByte@ is partly defined with a Text parser 'drawChar';
 146      but it is a ByteString parser; it will return the first valid utf8-encoded
 147      Char in a ByteString, whatever its length,
 148      and the first byte of the next character, if they exist. Because
 149      we \'draw\' one and \'peek\' at the other, the parser as a whole only
 150      advances one Char's length along the bytestring, whatever that length may be.
 151      See the slightly more complex example \'decode.hs\' in the
 152      <http://www.haskellforall.com/2014/02/pipes-parse-30-lens-based-parsing.html#batteries-included haskellforall>
 153      discussion of this type of byte stream parsing.
 154     -}
 155
 156 utf8 :: Codec
 157 utf8 = mkCodec decodeUtf8 TE.encodeUtf8
 158
 159 utf8Pure :: Codec
 160 utf8Pure = mkCodec decodeUtf8Pure TE.encodeUtf8
 161
 162 utf16LE :: Codec
 163 utf16LE = mkCodec decodeUtf16LE TE.encodeUtf16LE
 164
 165 utf16BE :: Codec
 166 utf16BE = mkCodec decodeUtf16BE TE.encodeUtf16BE
 167
 168 utf32LE :: Codec
 169 utf32LE = mkCodec decodeUtf32LE TE.encodeUtf32LE
 170
 171 utf32BE :: Codec
 172 utf32BE = mkCodec decodeUtf32BE TE.encodeUtf32BE
 173
 174 decodeStream :: Monad m
 175        => (B.ByteString -> DecodeResult)
 176        -> Producer ByteString m r -> Producer Text m (Producer ByteString m r)
 177 decodeStream = loop where
 178   loop dec0 p =
 179     do x <- lift (next p)
 180        case x of Left r -> return (return r)
 181                  Right (chunk, p') -> case dec0 chunk of
 182                     DecodeResultSuccess text dec -> do yield text
 183                                                        loop dec p'
 184                     DecodeResultFailure text bs -> do yield text
 185                                                       return (do yield bs
 186                                                                  p')
 187 {-# INLINABLE decodeStream#-}
 188
 189 {- $decoders
 190    These are functions with the simple type:
 191
 192 >   decodeUtf8 :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r)
 193
 194    Thus in general
 195
 196 >     decodeUtf8 = view utf8
 197 >     decodeUtf16LE = view utf16LE
 198
 199    and so forth, but these forms
 200    may be more convenient (and give better type errors!) where lenses are
 201    not desired.
 202 -}
 203
 204
 205 decodeUtf8 :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r)
 206 decodeUtf8 = decodeStream streamUtf8
 207 {-# INLINE decodeUtf8 #-}
 208
 209 decodeUtf8Pure :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r)
 210 decodeUtf8Pure = decodeStream streamUtf8Pure
 211 {-# INLINE decodeUtf8Pure #-}
 212
 213 decodeUtf16LE :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r)
 214 decodeUtf16LE = decodeStream streamUtf16LE
 215 {-# INLINE decodeUtf16LE #-}
 216
 217 decodeUtf16BE :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r)
 218 decodeUtf16BE = decodeStream streamUtf16BE
 219 {-# INLINE decodeUtf16BE #-}
 220
 221 decodeUtf32LE :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r)
 222 decodeUtf32LE = decodeStream streamUtf32LE
 223 {-# INLINE decodeUtf32LE #-}
 224
 225 decodeUtf32BE :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r)
 226 decodeUtf32BE = decodeStream streamUtf32BE
 227 {-# INLINE decodeUtf32BE #-}
 228
 229
 230 {- $encoders
 231    These are simply defined
 232
 233 >      encodeUtf8 = yield . TE.encodeUtf8
 234
 235    They are intended for use with 'for'
 236
 237 >      for Text.stdin encodeUtf8 :: Producer ByteString IO ()
 238
 239    which would have the effect of
 240
 241 >      Text.stdin >-> Pipes.Prelude.map (TE.encodeUtf8)
 242
 243    using the encoding functions from Data.Text.Encoding
 244 -}
 245
 246 encodeUtf8 :: Monad m => Text -> Producer ByteString m ()
 247 encodeUtf8 = yield . TE.encodeUtf8
 248 encodeUtf16LE :: Monad m => Text -> Producer ByteString m ()
 249 encodeUtf16LE = yield . TE.encodeUtf16LE
 250 encodeUtf16BE :: Monad m => Text -> Producer ByteString m ()
 251 encodeUtf16BE = yield . TE.encodeUtf16BE
 252 encodeUtf32LE :: Monad m => Text -> Producer ByteString m ()
 253 encodeUtf32LE = yield . TE.encodeUtf32LE
 254 encodeUtf32BE :: Monad m => Text -> Producer ByteString m ()
 255 encodeUtf32BE = yield . TE.encodeUtf32BE
 256
 257 mkCodec :: (forall r m . Monad m =>
 258            Producer ByteString m r -> Producer Text m (Producer ByteString m r ))
 259         -> (Text -> ByteString)
 260         -> Codec
 261 mkCodec dec enc = \k p0 -> fmap (\p -> join (for p (yield . enc)))  (k (dec p0))
 262
 263
 264
 265 {- $ascii
 266    ascii and latin encodings only use a small number of the characters 'Text'
 267      recognizes; thus we cannot use the pipes @Lens@ style to work with them.
 268      Rather we simply define functions each way.
 269 -}
 270
 271
 272 -- | 'encodeAscii' reduces as much of your stream of 'Text' actually is ascii to a byte stream,
 273 --   returning the rest of the 'Text' at the first non-ascii 'Char'
 274
 275 encodeAscii :: Monad m => Producer Text m r -> Producer ByteString m (Producer Text m r)
 276 encodeAscii = go where
 277   go p = do e <- lift (next p)
 278             case e of
 279               Left r -> return (return r)
 280               Right (chunk, p') ->
 281                  if T.null chunk
 282                    then go p'
 283                    else let (safe, unsafe)  = T.span (\c -> ord c <= 0x7F) chunk
 284                         in do yield (B8.pack (T.unpack safe))
 285                               if T.null unsafe
 286                                 then go p'
 287                                 else return $ do yield unsafe
 288                                                  p'
 289
 290 {- | Reduce as much of your stream of 'Text' actually is iso8859 or latin1 to a byte stream,
 291      returning the rest of the 'Text' upon hitting any non-latin 'Char'
 292    -}
 293 encodeIso8859_1 :: Monad m => Producer Text m r -> Producer ByteString m (Producer Text m r)
 294 encodeIso8859_1 = go where
 295   go p = do e <- lift (next p)
 296             case e of
 297               Left r -> return (return r)
 298               Right (txt, p') ->
 299                  if T.null txt
 300                    then go p'
 301                    else let (safe, unsafe)  = T.span (\c -> ord c <= 0xFF) txt
 302                         in do yield (B8.pack (T.unpack safe))
 303                               if T.null unsafe
 304                                 then go p'
 305                                 else return $ do yield unsafe
 306                                                  p'
 307
 308 {- | Reduce a byte stream to a corresponding stream of ascii chars, returning the
 309      unused 'ByteString' upon hitting an un-ascii byte.
 310    -}
 311 decodeAscii :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r)
 312 decodeAscii = go where
 313   go p = do e <- lift (next p)
 314             case e of
 315               Left r -> return (return r)
 316               Right (chunk, p') ->
 317                  if B.null chunk
 318                    then go p'
 319                    else let (safe, unsafe) = B.span (<= 0x7F) chunk
 320                         in do yield (T.pack (B8.unpack safe))
 321                               if B.null unsafe
 322                                 then go p'
 323                                 else return (do yield unsafe
 324                                                 p')
 325
 326 {- | Reduce a byte stream to a corresponding stream of ascii chars, returning the
 327      unused 'ByteString' upon hitting the rare un-latinizable byte.
 328      -}
 329 decodeIso8859_1 :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r)
 330 decodeIso8859_1 = go where
 331   go p = do e <- lift (next p)
 332             case e of
 333               Left r -> return (return r)
 334               Right (chunk, p') ->
 335                  if B.null chunk
 336                     then go p'
 337                     else do let (safe, unsafe) = B.span (<= 0xFF) chunk
 338                             yield (T.pack (B8.unpack safe))
 339                             if B.null unsafe
 340                                then go p'
 341                                else return (do yield unsafe
 342                                                p')
 343
 344
 345