blob: c3afc8365a618cd7a0b12fecce26aecb6d83579f (
plain) (
tree)
|
|
-- this file illustrates several uses of `zoom`
-- one of them is quadratic in the length of the file
-- since it has to decode and encode repeatedly,
-- and is thus no good on long files.
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE BangPatterns#-}
{-# LANGUAGE RankNTypes #-}
import Blaze.ByteString.Builder (Builder, fromByteString, toByteString)
import Control.Exception (Exception)
import Control.Monad.Trans.Class (lift)
import Data.ByteString (ByteString)
import qualified Data.ByteString as S
import qualified Data.ByteString.Lazy as L
import Data.Monoid
import Data.Text (Text)
import qualified Data.Text as T
import qualified Data.Text.Encoding as TEE
import qualified Data.Text.Lazy as TL
import qualified Data.Text.Lazy.Encoding as TLE
import Pipes
import Pipes.Parse
import qualified Pipes.Prelude as PP
import qualified Pipes.ByteString as Bytes
import qualified Pipes.Text as Txt
import Pipes.Text.Encoding (utf8)
import Control.Lens -- we use 'zoom' with MonadState, not just StateT
import Control.Monad
import qualified System.IO as IO
import Control.Monad.Trans.Maybe
import Control.Monad.State.Class
main :: IO ()
main = do S.writeFile fp $ contents 10000 -- 10000 cannot be handled fileParser0 and 1
-- parse_file fileParser0 -- pathological
-- parse_file fileParser1 -- programs
parse_file fileParser2 -- good program
where
parse_file parser = IO.withBinaryFile fp IO.ReadMode $ \h ->
do p' <- runEffect $ parseWith parser ( Bytes.fromHandle h ) >-> PP.print
runEffect $ p' >-> PP.print
parseWith parser = loop where
loop p = do (m,p') <- lift (runStateT (runMaybeT parser) p)
case m of Nothing -> return p'
Just file -> do yield file
loop p'
fp = "encoded.fileformat"
contents n = (toByteString . mconcat . replicate n . encodeFiles) input
<> S.pack (replicate 10 250)
fileParser0, fileParser1, fileParser2 :: Monad m => MaybeT (StateT (Producer ByteString m x) m) File
fileParser0 = do (name, len) <- zoom utf8 parseText
contents <- zoom (Bytes.splitAt len) (lift drawAll)
return (File name (S.concat contents))
where
-- this parser aggregates all Text parsing into one preliminary parser
-- which is then applied with `zoom utf8`
-- we cannot tell in advance how long, e.g. the file name will be
parseText :: Monad m => MaybeT (StateT (Producer Text m x) m) (Text, Int)
parseText = do nameLength <- parseNumber
names <- zoom (Txt.splitAt nameLength) $ (lift drawAll)
contentLength <- parseNumber
return $! (T.concat names, contentLength)
-- here we disaggregate the little Text parsers but still apply them with `zoom utf8`
-- this makes no difference
fileParser1 = do nameLength <- zoom utf8 parseNumber
names <- zoom (utf8 . Txt.splitAt nameLength) (lift drawAll)
contentLength <- zoom utf8 parseNumber
contents <- zoom (Bytes.splitAt contentLength) (lift drawAll)
return (File (T.concat names) (S.concat contents))
-- This is the good program; by reflecting on the fact that file names
-- should not be a 1000 bytes long, and binary files longer than e.g. 10 ^ 10
-- we can restrict the length of the byte stream to which we apply `zoom utf8`
fileParser2 = do nameLength <- zoom (Bytes.splitAt 3 . utf8) parseNumber
names <- zoom (Bytes.splitAt nameLength . utf8) (lift drawAll)
len <- zoom (Bytes.splitAt 10 . utf8) parseNumber
contents <- zoom (Bytes.splitAt len) (lift drawAll)
return (File (T.concat names) (S.concat contents))
parseNumber :: Monad m => MaybeT (StateT (Producer Text m x) m) Int
parseNumber = loop 0 where
loop !n = do c <- MaybeT Txt.drawChar
case c of ':' -> return n
_ -> do guard ('0' <= c && c <= '9')
loop $! n * 10 + (fromEnum c - fromEnum '0')
-- --- Michael S's `File` type and its binary encoding, etc.
data File = File
{ fileName :: !Text
, fileContents :: !ByteString
}
deriving Show
encodeFile :: File -> Builder
encodeFile (File name contents) =
tellLength (S.length bytesname) <>
fromByteString bytesname <>
tellLength (S.length contents) <>
fromByteString contents
where
tellLength i = fromByteString $ TEE.encodeUtf8 (T.pack (shows i ":"))
bytesname = TEE.encodeUtf8 name
encodeFiles :: [File] -> Builder
encodeFiles = mconcat . map encodeFile
input :: [File]
input =
[ File "utf8.txt" $ TEE.encodeUtf8 "This file is in UTF-8"
, File "utf16.txt" $ TEE.encodeUtf16LE "This file is in UTF-16"
, File "binary.dat" "we'll pretend to be binary"
]
|