[github/fretlink/text-pipes.git] / examples / zoom.hs

-- this file illustrates several uses of `zoom` 
-- one of them is quadratic in the length of the file
-- since it has to decode and encode repeatedly,
-- and is thus no good on long files. 

{-# LANGUAGE OverloadedStrings  #-}
{-# LANGUAGE BangPatterns#-}
{-# LANGUAGE RankNTypes #-}
import           Blaze.ByteString.Builder  (Builder, fromByteString, toByteString)
import           Control.Exception         (Exception)
import           Control.Monad.Trans.Class (lift)
import           Data.ByteString           (ByteString)
import qualified Data.ByteString           as S
import qualified Data.ByteString.Lazy      as L
import           Data.Monoid
import           Data.Text                 (Text)
import qualified Data.Text                 as T
import qualified Data.Text.Encoding        as TEE
import qualified Data.Text.Lazy            as TL
import qualified Data.Text.Lazy.Encoding   as TLE

import Pipes
import Pipes.Parse
import qualified Pipes.Prelude as PP
import qualified Pipes.ByteString as Bytes
import qualified Pipes.Text as Txt
import Pipes.Text.Encoding (utf8)

import Control.Lens -- we use 'zoom' with MonadState, not just StateT
import Control.Monad
import qualified System.IO as IO
import Control.Monad.Trans.Maybe
import Control.Monad.State.Class

-- | Write a sample archive to disk, then stream it back through one of the
-- parsers, printing every decoded 'File' and finally whatever bytes the
-- parser left unconsumed.
main :: IO ()
main = do
    -- 10000 repetitions cannot be handled by fileParser0 and fileParser1
    S.writeFile archivePath (contents 10000)
    -- parse_file fileParser0  -- pathological
    -- parse_file fileParser1  -- programs
    parse_file fileParser2     -- good program
  where
    archivePath = "encoded.fileformat"

    -- n copies of the encoded sample files followed by ten bytes of value 250
    -- (presumably there so that parsing stops and leaves a remainder to print).
    contents n =
        toByteString (mconcat (replicate n (encodeFiles input)))
          <> S.pack (replicate 10 250)

    -- Open the archive, print each parsed file, then print the leftover
    -- byte chunks returned by the parsing loop.
    parse_file parser =
        IO.withBinaryFile archivePath IO.ReadMode $ \hdl -> do
            remainder <- runEffect (parseWith parser (Bytes.fromHandle hdl) >-> PP.print)
            runEffect (remainder >-> PP.print)

    -- Run the parser repeatedly against the producer, yielding each result,
    -- until it fails; the unconsumed producer is the return value.
    parseWith parser = go
      where
        go producer = do
            (result, rest) <- lift (runStateT (runMaybeT parser) producer)
            case result of
                Just file -> yield file >> go rest
                Nothing   -> return rest


fileParser0, fileParser1, fileParser2 :: Monad m => MaybeT (StateT (Producer ByteString m x) m) File
-- Parse the textual header (name and content length) in one go under
-- `zoom utf8`, then cut the announced number of raw bytes out of the stream.
fileParser0 = do
    (name, len) <- zoom utf8 parseText
    chunks      <- zoom (Bytes.splitAt len) (lift drawAll)
    return (File name (S.concat chunks))
  where
    -- This parser aggregates all Text parsing into one preliminary parser,
    -- which is then applied with `zoom utf8`; we cannot tell in advance how
    -- long, e.g., the file name will be.
    --
    -- NOTE(review): the prefix written by `encodeFile` is the byte length of
    -- the UTF-8 encoded name, while the split below is made on the decoded
    -- Text stream; the two presumably only agree for ASCII names.
    parseText :: Monad m => MaybeT (StateT (Producer Text m x) m) (Text, Int)
    parseText = do
        nameLength    <- parseNumber
        pieces        <- zoom (Txt.splitAt nameLength) (lift drawAll)
        contentLength <- parseNumber
        return $! (T.concat pieces, contentLength)

-- Here we disaggregate the little Text parsers but still apply the length
-- parsers with a bare `zoom utf8` over the whole remaining stream; relative
-- to fileParser0 this is not expected to make any difference to the cost.
--
-- The name itself is cut out by byte count *before* decoding: the prefix
-- written by `encodeFile` is the length in bytes of the UTF-8 encoded name,
-- whereas `Txt.splitAt` counts characters and would over-read any name that
-- contains a multi-byte character.
fileParser1 = do
    nameLength    <- zoom utf8 parseNumber
    names         <- zoom (Bytes.splitAt nameLength . utf8) (lift drawAll)
    contentLength <- zoom utf8 parseNumber
    contents      <- zoom (Bytes.splitAt contentLength) (lift drawAll)
    return (File (T.concat names) (S.concat contents))

-- This is the good program. File names should be shorter than 1000 bytes and
-- file contents shorter than 10 ^ 10 bytes, so a length field is at most 3
-- (respectively 10) ASCII digits followed by the ':' terminator. That lets us
-- restrict the length of the byte stream to which we apply `utf8`, instead of
-- zooming into the whole remaining stream.
fileParser2 = do
    nameLength <- zoom (Bytes.splitAt nameLengthWindow . utf8) parseNumber
    names      <- zoom (Bytes.splitAt nameLength . utf8) (lift drawAll)
    len        <- zoom (Bytes.splitAt contentLengthWindow . utf8) parseNumber
    contents   <- zoom (Bytes.splitAt len) (lift drawAll)
    return (File (T.concat names) (S.concat contents))
  where
    -- Window sizes in bytes: the maximum digit count plus one for the ':'
    -- that `parseNumber` has to read in order to succeed. Without the extra
    -- byte a maximal-width field (e.g. "100:") could never be parsed.
    nameLengthWindow, contentLengthWindow :: Int
    nameLengthWindow    = 3 + 1
    contentLengthWindow = 10 + 1


-- | Read a run of ASCII decimal digits terminated by ':' and return its
-- value. The parser fails (yields 'Nothing' in the 'MaybeT' layer) when the
-- stream ends first or when a character other than a digit or ':' is drawn.
parseNumber :: Monad m =>  MaybeT (StateT (Producer Text m x) m) Int
parseNumber = go 0
  where
    -- The accumulator is kept strict so no chain of thunks builds up.
    go !acc = MaybeT Txt.drawChar >>= step
      where
        step ':' = return acc
        step d   = do
            guard (d >= '0' && d <= '9')
            go $! acc * 10 + (fromEnum d - fromEnum '0')


-- --- Michael S's `File` type and its binary encoding, etc.


-- | One entry of the encoded archive: a textual name plus raw contents.
-- Both fields are strict.
data File = File
  { fileName     :: !Text        -- ^ name of the entry
  , fileContents :: !ByteString  -- ^ raw bytes of the entry
  } deriving Show

-- | Serialise a single 'File' as
-- @\<name byte length\>:\<name as UTF-8\>\<content byte length\>:\<content\>@.
encodeFile :: File -> Builder
encodeFile (File name contents) =
    mconcat
      [ lengthPrefix nameBytes
      , fromByteString nameBytes
      , lengthPrefix contents
      , fromByteString contents
      ]
  where
    nameBytes = TEE.encodeUtf8 name

    -- Decimal byte length of the argument followed by a ':' separator.
    lengthPrefix bytes =
        fromByteString (TEE.encodeUtf8 (T.pack (show (S.length bytes) ++ ":")))

-- | Serialise a list of files by concatenating their individual encodings
-- in order.
encodeFiles :: [File] -> Builder
encodeFiles files = mconcat [encodeFile file | file <- files]

-- | Sample files used to build the test archive: one with UTF-8 contents,
-- one with UTF-16LE contents, and one given as a plain byte-string literal.
input :: [File]
input = [utf8File, utf16File, binaryFile]
  where
    utf8File   = File "utf8.txt"   (TEE.encodeUtf8 "This file is in UTF-8")
    utf16File  = File "utf16.txt"  (TEE.encodeUtf16LE "This file is in UTF-16")
    binaryFile = File "binary.dat" "we'll pretend to be binary"
Commit	Line	Data
955edd33	1	-- this file illustrates several uses of `zoom`
	2	-- one of them is quadratic in the length of the file
	3	-- since it has to decode and encode repeatedly,
	4	-- and is thus no good on long files.
	5
	6	{-# LANGUAGE OverloadedStrings #-}
	7	{-# LANGUAGE BangPatterns#-}
	8	{-# LANGUAGE RankNTypes #-}
	9	import Blaze.ByteString.Builder (Builder, fromByteString, toByteString)
	10	import Control.Exception (Exception)
	11	import Control.Monad.Trans.Class (lift)
	12	import Data.ByteString (ByteString)
	13	import qualified Data.ByteString as S
	14	import qualified Data.ByteString.Lazy as L
	15	import Data.Monoid
	16	import Data.Text (Text)
	17	import qualified Data.Text as T
	18	import qualified Data.Text.Encoding as TEE
	19	import qualified Data.Text.Lazy as TL
	20	import qualified Data.Text.Lazy.Encoding as TLE
	21
	22	import Pipes
	23	import Pipes.Parse
	24	import qualified Pipes.Prelude as PP
	25	import qualified Pipes.ByteString as Bytes
	26	import qualified Pipes.Text as Txt
	27	import Pipes.Text.Encoding (utf8)
	28
8197d6e0	29	import Control.Lens -- we use 'zoom' with MonadState, not just StateT
955edd33	30	import Control.Monad
	31	import qualified System.IO as IO
	32	import Control.Monad.Trans.Maybe
	33	import Control.Monad.State.Class
	34
	35	main :: IO ()
8197d6e0	36	main = do S.writeFile fp $ contents 10000 -- 10000 cannot be handled fileParser0 and 1
955edd33	37	-- parse_file fileParser0 -- pathological
	38	-- parse_file fileParser1 -- programs
	39	parse_file fileParser2 -- good program
	40
	41	where
	42	parse_file parser = IO.withBinaryFile fp IO.ReadMode $ \h ->
	43	do p' <- runEffect $ parseWith parser ( Bytes.fromHandle h ) >-> PP.print
	44	runEffect $ p' >-> PP.print
	45	parseWith parser = loop where
	46	loop p = do (m,p') <- lift (runStateT (runMaybeT parser) p)
	47	case m of Nothing -> return p'
	48	Just file -> do yield file
	49	loop p'
	50	fp = "encoded.fileformat"
	51	contents n = (toByteString . mconcat . replicate n . encodeFiles) input
	52	<> S.pack (replicate 10 250)
	53
	54
	55
	56	fileParser0, fileParser1, fileParser2 :: Monad m => MaybeT (StateT (Producer ByteString m x) m) File
	57	fileParser0 = do (name, len) <- zoom utf8 parseText
	58	contents <- zoom (Bytes.splitAt len) (lift drawAll)
	59	return (File name (S.concat contents))
	60	where
	61	-- this parser aggregates all Text parsing into one preliminary parser
	62	-- which is then applied with `zoom utf8`
	63	-- we cannot tell in advance how long, e.g. the file name will be
	64	parseText :: Monad m => MaybeT (StateT (Producer Text m x) m) (Text, Int)
	65	parseText = do nameLength <- parseNumber
	66	names <- zoom (Txt.splitAt nameLength) $ (lift drawAll)
	67	contentLength <- parseNumber
	68	return $! (T.concat names, contentLength)
	69
	70	-- here we disaggregate the little Text parsers but still apply them with `zoom utf8`
	71	-- this makes no difference
	72	fileParser1 = do nameLength <- zoom utf8 parseNumber
	73	names <- zoom (utf8 . Txt.splitAt nameLength) (lift drawAll)
	74	contentLength <- zoom utf8 parseNumber
	75	contents <- zoom (Bytes.splitAt contentLength) (lift drawAll)
	76	return (File (T.concat names) (S.concat contents))
	77
8197d6e0	78	-- This is the good program; by reflecting on the fact that file names
955edd33	79	-- should not be a 1000 bytes long, and binary files longer than e.g. 10 ^ 10
955edd33	80	-- we can restrict the length of the byte stream to which we apply `zoom utf8`
8197d6e0	81	fileParser2 = do nameLength <- zoom (Bytes.splitAt 3 . utf8) parseNumber
	82	names <- zoom (Bytes.splitAt nameLength . utf8) (lift drawAll)
	83	len <- zoom (Bytes.splitAt 10 . utf8) parseNumber
	84	contents <- zoom (Bytes.splitAt len) (lift drawAll)
955edd33	85	return (File (T.concat names) (S.concat contents))
955edd33	86
8197d6e0	87
955edd33	88
	89	parseNumber :: Monad m => MaybeT (StateT (Producer Text m x) m) Int
	90	parseNumber = loop 0 where
	91	loop !n = do c <- MaybeT Txt.drawChar
	92	case c of ':' -> return n
	93	_ -> do guard ('0' <= c && c <= '9')
	94	loop $! n * 10 + (fromEnum c - fromEnum '0')
	95
	96
	97
	98	-- --- Michael S's `File` type and its binary encoding, etc.
	99
	100
	101	data File = File
	102	{ fileName :: !Text
	103	, fileContents :: !ByteString
	104	}
	105	deriving Show
	106
	107	encodeFile :: File -> Builder
	108	encodeFile (File name contents) =
	109	tellLength (S.length bytesname) <>
	110	fromByteString bytesname <>
	111	tellLength (S.length contents) <>
	112	fromByteString contents
	113	where
	114	tellLength i = fromByteString $ TEE.encodeUtf8 (T.pack (shows i ":"))
	115	bytesname = TEE.encodeUtf8 name
	116
	117	encodeFiles :: [File] -> Builder
	118	encodeFiles = mconcat . map encodeFile
	119
	120	input :: [File]
	121	input =
	122	[ File "utf8.txt" $ TEE.encodeUtf8 "This file is in UTF-8"
	123	, File "utf16.txt" $ TEE.encodeUtf16LE "This file is in UTF-16"
	124	, File "binary.dat" "we'll pretend to be binary"
	125	]
	126
	127