encoding documentation

author michaelt <what_is_it_to_do_anything@yahoo.com>

Mon, 4 May 2015 18:16:55 +0000 (14:16 -0400)

committer michaelt <what_is_it_to_do_anything@yahoo.com>

Mon, 4 May 2015 18:16:55 +0000 (14:16 -0400)
author michaelt <what_is_it_to_do_anything@yahoo.com>
Mon, 4 May 2015 18:16:55 +0000 (14:16 -0400)
committer michaelt <what_is_it_to_do_anything@yahoo.com>
Mon, 4 May 2015 18:16:55 +0000 (14:16 -0400)
diff --git a/Pipes/Text/Encoding.hs b/Pipes/Text/Encoding.hs

index 97a9c2391ee015710119a9dc8dddd98391d5f252..b6aa709d313caac8fccf5ae1d85fe5f65b04ca91 100644 (file)
--- a/Pipes/Text/Encoding.hs
+++ b/Pipes/Text/Encoding.hs
@@ -1,19 +1,25 @@
  {-# LANGUAGE RankNTypes, BangPatterns #-}
  
  -- | This module uses the stream decoding functions from
---  <http://hackage.haskell.org/package/text-stream-decode text-stream-decode> 
+--  <http://hackage.haskell.org/package/streaming-commons streaming-commons> 
  --  package to define decoding functions and lenses.  The exported names
  --  conflict with names in @Data.Text.Encoding@ but not with the @Prelude@ 
  
  module Pipes.Text.Encoding
      ( 
-    -- * The Lens or Codec type
+    -- * Decoding ByteStrings and Encoding Texts
+    -- ** Simple usage
+    -- $usage
+    
+    -- ** Lens usage
      -- $lenses
+  
+    
+    -- * Basic lens operations
      Codec
      , decode
      , eof
-    -- * \'Viewing\' the Text in a byte stream
-    -- $codecs
+    -- * Decoding lenses
      , utf8
      , utf8Pure
      , utf16LE
@@ -58,99 +64,83 @@ import Control.Monad (join, liftM)
  import Data.Word (Word8)
  import Pipes
  
-type Lens' a b = forall f . Functor f => (b -> f b) -> (a -> f a)
  
-{- $lenses
-    The 'Codec' type is a simple specializion of 
-    the @Lens'@ type synonymn used by the standard lens libraries, 
-    <http://hackage.haskell.org/package/lens lens> and 
-    <http://hackage.haskell.org/package/lens-family lens-family>. That type, 
-    
->   type Lens' a b = forall f . Functor f => (b -> f b) -> (a -> f a)
  
-    is just an alias for a Prelude type. Thus you use any particular codec with
-    the @view@ / @(^.)@ , @zoom@ and @over@ functions from either of those libraries;
-    we presuppose neither library since we already have access to the types they require.
-    -}
+{- $usage
+    Given 
  
-type Codec
-    =  forall m r
-    .  Monad m
-    => Lens' (Producer ByteString m r)
-             (Producer Text m (Producer ByteString m r))
+>   text :: Producer Text IO ()
  
-{- | 'decode' is just the ordinary @view@ or @(^.)@ of the lens libraries;
-      exported here under a name appropriate to the material. All of these are
-      the same: 
+    we can encode it with @Data.Text.Encoding@ and ordinary pipe operations:
  
->    decode utf8 p = decodeUtf8 p = view utf8 p = p ^. utf8
+>   text >-> P.map TE.encodeUtf8 :: Producer.ByteString IO ()
  
--}
+    or, using this module, with
  
+>   for text encodeUtf8 :: Producer.ByteString IO ()
  
-decode :: ((b -> Constant b b) -> (a -> Constant b a)) -> a -> b
-decode codec a = getConstant (codec Constant a)
+    Given 
+
+>   bytes :: Producer ByteString Text IO ()
  
-{- | 'eof' tells you explicitly when decoding stops due to bad bytes or instead
-      reaches end-of-file happily. (Without it one just makes an explicit test 
-      for emptiness of the resulting bytestring production using 'next') 
-      Thus
+    we can apply a decoding function from this module:
  
->     decode (utf8 . eof) p =  view (utf8 . eof) p = p^.utf8.eof
- 
-      will be a text producer. If we hit undecodable bytes, the remaining
-      bytestring producer will be returned as a 'Left' value; 
-      in the happy case, a 'Right' value is returned with the anticipated 
-      return value for the original bytestring producer. 
-      ) 
+>   decodeUtf8 bytes :: Producer Text IO (Producer ByteString IO ())
+
+    The Text producer ends wherever decoding first fails. Thus we can re-encode
+    as uft8 as much of our byte stream as is decodeUtf16BE decodable, with, e.g.
+
+>   for (decodeUtf16BE bytes) encodeUtf8 :: Producer ByteString IO (Producer ByteString IO ())
+    
+    The bytestring producer that is returned begins with where utf16BE decoding
+    failed; it it didn't fail the producer is empty. 
  
  -}
  
-eof :: Monad m => Lens' (Producer Text m (Producer ByteString m r))
-                        (Producer Text m (Either (Producer ByteString m r) r))
-eof k p = fmap fromEither (k (toEither p)) where
+{- $lenses
+    We get a bit more flexibility, though, if we use a lens like @utf8@ or @utf16BE@ 
+    that looks for text in an appropriately encoded byte stream.
  
-  fromEither = liftM (either id return)
+>   type Lens' a b = forall f . Functor f => (b -> f b) -> (a -> f a)
  
-  toEither pp = do p <- pp
-                   check p
+    is just an alias for a Prelude type.   We abbreviate this further, for our use case, as
  
-  check p = do e <- lift (next p)
-               case e of 
-                 Left r -> return (Right r)
-                 Right (bs,pb) ->  if B.null bs 
-                                     then check pb
-                                     else return (Left (do yield bs
-                                                           pb))
+>   type Codec
+>     =  forall m r .  Monad m => Lens' (Producer ByteString m r) (Producer Text m (Producer ByteString m r))
  
+    and call the decoding lenses @utf8@, @utf16BE@ \"codecs\", since they can 
+    re-encode what they have decoded.  Thus you use any particular codec with
+    the @view@ / @(^.)@ , @zoom@ and @over@ functions from the standard lens libraries;
+    we presuppose neither <http://hackage.haskell.org/package/lens lens> 
+    nor  <http://hackage.haskell.org/package/lens-family lens-family> 
+    since we already have access to the types they require.             
  
-{- $codecs
-    
-    Each Codec-lens looks into a byte stream that is supposed to contain text.
-    The particular \'Codec\' lenses are named in accordance with the expected 
-    encoding, 'utf8', 'utf16LE' etc. To turn a Codec into an ordinary function, 
-    use @view@ / @(^.)@ -- here also called 'decode':
+    Each decoding lens looks into a byte stream that is supposed to contain text.
+    The particular lenses are named in accordance with the expected 
+    encoding, 'utf8', 'utf16LE' etc. To turn a such a lens or @Codec@ 
+    into an ordinary function, use @view@ / @(^.)@ -- here also called 'decode':
  
  >   view utf8 :: Producer ByteString m r -> Producer Text m (Producer ByteString m r)
  >   decode utf8 Byte.stdin :: Producer Text IO (Producer ByteString IO r)
  >   Bytes.stdin ^. utf8 ::  Producer Text IO (Producer ByteString IO r)
  
-    Uses of a codec with @view@ or @(^.)@ or 'decode' can always be replaced by the specialized 
-    decoding functions exported here, e.g. 
+    These simple uses of a codec with @view@ or @(^.)@ or 'decode' can always be replaced by 
+    the specialized decoding functions exported here, e.g. 
  
  >   decodeUtf8 ::  Producer ByteString m r -> Producer Text m (Producer ByteString m r)
  >   decodeUtf8 Byte.stdin :: Producer Text IO (Producer ByteString IO r)
  
-    The stream of text that a @Codec@ \'sees\' in the stream of bytes begins at its head. 
+    As with these functions, the stream of text that a @Codec@ \'sees\' 
+    in the stream of bytes begins at its head. 
      At any point of decoding failure, the stream of text ends and reverts to (returns) 
      the original byte stream. Thus if the first bytes are already
      un-decodable, the whole ByteString producer will be returned, i.e.
  
->   view utf8 bytestream 
+>   view utf8 bad_bytestream 
  
      will just come to the same as 
  
->   return bytestream
+>   return bad_bytestream
  
      Where there is no decoding failure, the return value of the text stream will be
      an empty byte stream followed by its own return value.  In all cases you must
@@ -158,7 +148,21 @@ eof k p = fmap fromEither (k (toEither p)) where
      it can be thrown away with @Control.Monad.void@
  
  >   void (Bytes.stdin ^. utf8) :: Producer Text IO ()
+
+    The @eof@ lens permits you to pattern match: if there is a Right value,
+    it is the leftover bytestring producer, if there is a Right value, it 
+    is the return value of the original bytestring producer:
+
+>   Bytes.stdin ^. utf8 . eof :: Producer Text IO (Either (Producer ByteString IO IO) ())
      
+    Thus for the stream of un-decodable bytes mentioned above,
+
+>   view (utf8 . eof) bad_bytestream
+
+    will be the same as 
+
+>   return (Left bad_bytestream)
+
      @zoom@ converts a Text parser into a ByteString parser:
  
  >   zoom utf8 drawChar :: Monad m => StateT (Producer ByteString m r) m (Maybe Char)
@@ -167,24 +171,81 @@ eof k p = fmap fromEither (k (toEither p)) where
      
  >   zoom utf8 drawChar :: Monad m => Parser ByteString m (Maybe Char)
  
-    Thus we can define a ByteString parser like this:
+    Thus we can define a ByteString parser (in the pipes-parse sense) like this:
      
->   withNextByte :: Parser ByteString m (Maybe Char, Maybe Word8))) 
->   withNextByte = do char_ <- zoom utf8 Text.drawChar
+>   charPlusByte :: Parser ByteString m (Maybe Char, Maybe Word8))) 
+>   charPlusByte = do char_ <- zoom utf8 Text.drawChar
  >                     byte_ <- Bytes.peekByte
  >                     return (char_, byte_)
  
-     Though @withNextByte@ is partly defined with a Text parser 'drawChar'; 
+     Though @charPlusByte@ is partly defined with a Text parser 'drawChar'; 
       but it is a ByteString parser; it will return the first valid utf8-encoded 
-     Char in a ByteString, whatever its length, 
-     and the first byte of the next character, if they exist. Because 
+     Char in a ByteString, whatever its byte-length, 
+     and the first byte following, if both exist. Because 
       we \'draw\' one and \'peek\' at the other, the parser as a whole only 
       advances one Char's length along the bytestring, whatever that length may be.
       See the slightly more complex example \'decode.hs\' in the 
-     <http://www.haskellforall.com/2014/02/pipes-parse-30-lens-based-parsing.html#batteries-included haskellforall> 
-     discussion of this type of byte stream parsing.
+     <http://www.haskellforall.com/2014/02/pipes-parse-30-lens-based-parsing.html#batteries-included haskellforall blog> 
+     discussion of this type of byte stream parsing. 
      -}
  
+type Lens' a b = forall f . Functor f => (b -> f b) -> (a -> f a)
+
+type Codec
+    =  forall m r
+    .  Monad m
+    => Lens' (Producer ByteString m r)
+             (Producer Text m (Producer ByteString m r))
+
+
+{- | @decode@ is just the ordinary @view@ or @(^.)@ of the lens libraries;
+   exported here under a name appropriate to the material. Thus
+
+>    decode utf8 bytes :: Producer Text IO (Producer ByteString IO ())
+
+    All of these are thus the same:
+
+>    decode utf8 bytes = view utf8 bytes = bytes ^. utf8 = decodeUtf8 bytes
+
+
+-}
+
+decode :: ((b -> Constant b b) -> (a -> Constant b a)) -> a -> b
+decode codec a = getConstant (codec Constant a)
+
+{- | @eof@ tells you explicitly when decoding stops due to bad bytes or 
+    instead reaches end-of-file happily. (Without it one just makes an explicit 
+    test for emptiness of the resulting bytestring production using next) Thus
+
+>    decode (utf8 . eof) bytes :: Producer T.Text IO (Either (Producer B.ByteString IO ()) ())
+
+    If we hit undecodable bytes, the remaining bytestring producer will be 
+    returned as a Left value; in the happy case, a Right value is returned 
+    with the anticipated return value for the original bytestring producer.
+
+    Again, all of these are the same
+
+>    decode (utf8 . eof) bytes = view (utf8 . eof) p = p^.utf8.eof
+
+-}
+
+eof :: Monad m => Lens' (Producer Text m (Producer ByteString m r))
+                       (Producer Text m (Either (Producer ByteString m r) r))
+eof k p = fmap fromEither (k (toEither p)) where
+
+ fromEither = liftM (either id return)
+
+ toEither pp = do p <- pp
+                  check p
+
+ check p = do e <- lift (next p)
+              case e of 
+                Left r -> return (Right r)
+                Right (bs,pb) ->  if B.null bs 
+                                    then check pb
+                                    else return (Left (do yield bs
+                                                          pb))
+
  utf8 :: Codec
  utf8 = mkCodec decodeUtf8 TE.encodeUtf8
  
diff --git a/examples/attoparser.hs b/examples/attoparser.hs

index 6328991a815bbe1eb5d946614b267eca5d5ae237..b7d8c11093327b3aa163a06c137309829825288c 100644 (file)
--- a/examples/attoparser.hs
+++ b/examples/attoparser.hs
@@ -18,10 +18,10 @@ testParser = do
    return $ Test a b
    
  main = IO.withFile "./testfile" IO.ReadMode $ \handle -> runEffect $
-    for  (parsed testParser (fromHandle handle)) 
-         (lift . print)
-
-
+   do leftover <- for (parsed testParser (fromHandle handle)) 
+                   (lift . print)
+      return () -- ignore unparsed material
+      
  -- >>> :! cat testfile
  -- 1 1
  -- 2 2
diff --git a/pipes-text.cabal b/pipes-text.cabal

index 0691ba3e13423223f7a8a8438d8e1c7ac9f6bbb2..673d1221395f0a15d8523f6bda4bfc5edadcfa24 100644 (file)
--- a/pipes-text.cabal
+++ b/pipes-text.cabal
@@ -1,5 +1,5 @@
  name:                pipes-text
-version:             0.0.0.15
+version:             0.0.0.16
  synopsis:            Text pipes.
  description:         * This package will be in a draft, or testing, phase until version 0.0.1. Please report any installation difficulties, or any wisdom about the api, on the github page or the <https://groups.google.com/forum/#!forum/haskell-pipes pipes list>
                       .
author	michaelt <what_is_it_to_do_anything@yahoo.com>
	Mon, 4 May 2015 18:16:55 +0000 (14:16 -0400)
committer	michaelt <what_is_it_to_do_anything@yahoo.com>
	Mon, 4 May 2015 18:16:55 +0000 (14:16 -0400)
Pipes/Text/Encoding.hs		patch \| blob \| blame \| history
examples/attoparser.hs		patch \| blob \| blame \| history
pipes-text.cabal		patch \| blob \| blame \| history