From 64e03122e6ecc4898cb1b193cdcf3b26d3e71b14 Mon Sep 17 00:00:00 2001 From: michaelt Date: Sat, 25 Jan 2014 21:42:54 -0500 Subject: renamed fold foldChars and began updating documentation --- Pipes/Text.hs | 74 +++++++++++--- Pipes/Text/Internal.hs | 264 ++++++++++++++++++++++++++++++++++++++++++------- Pipes/Text/Parse.hs | 18 ++-- 3 files changed, 299 insertions(+), 57 deletions(-) (limited to 'Pipes') diff --git a/Pipes/Text.hs b/Pipes/Text.hs index cf493e9..99e4ed6 100644 --- a/Pipes/Text.hs +++ b/Pipes/Text.hs @@ -1,9 +1,12 @@ {-# LANGUAGE RankNTypes, TypeFamilies, BangPatterns #-} - +#if __GLASGOW_HASKELL__ >= 702 +{-# LANGUAGE Trustworthy #-} +#endif {-| This module provides @pipes@ utilities for \"text streams\", which are - streams of 'Text' chunks. The individual chunks are uniformly @strict@, but - a 'Producer' can be converted to and from lazy 'Text's; an 'IO.Handle' can - be associated with a 'Producer' or 'Consumer' according as it is read or written to. + streams of 'Text' chunks. The individual chunks are uniformly @strict@, but + a 'Producer' can be converted to and from lazy 'Text's, though this is generally + unwise. Where pipes IO replaces lazy IO, 'Producer Text m r' replaces lazy 'Text'. + An 'IO.Handle' can be associated with a 'Producer' or 'Consumer' according as it is read or written to. To stream to or from 'IO.Handle's, one can use 'fromHandle' or 'toHandle'. For example, the following program copies a document from one file to another: @@ -52,9 +55,9 @@ To stream from files, the following is perhaps more Prelude-like (note that it u Note that functions in this library are designed to operate on streams that are insensitive to text boundaries. This means that they may freely split - text into smaller texts and /discard empty texts/. However, they will - /never concatenate texts/ in order to provide strict upper bounds on memory - usage. + text into smaller texts, /discard empty texts/. However, apart from the + special case of 'concatMap', they will /never concatenate texts/ in order + to provide strict upper bounds on memory usage -- with the single exception of 'concatMap'. -} module Pipes.Text ( @@ -91,7 +94,7 @@ module Pipes.Text ( -- * Folds toLazy, toLazyM, - fold, + foldChars, head, last, null, @@ -116,6 +119,7 @@ module Pipes.Text ( lines, words, decodeUtf8, + decode, -- * Transformations intersperse, @@ -139,7 +143,7 @@ module Pipes.Text ( ) where import Control.Exception (throwIO, try) -import Control.Monad (liftM, unless) +import Control.Monad (liftM, unless, join) import Control.Monad.Trans.State.Strict (StateT(..)) import Data.Monoid ((<>)) import qualified Data.Text as T @@ -160,13 +164,14 @@ import Foreign.C.Error (Errno(Errno), ePIPE) import qualified GHC.IO.Exception as G import Pipes import qualified Pipes.ByteString as PB -import qualified Pipes.ByteString.Parse as PBP +import qualified Pipes.ByteString as PBP import qualified Pipes.Text.Internal as PE +import Pipes.Text.Internal (Codec(..)) import Pipes.Text.Parse ( nextChar, drawChar, unDrawChar, peekChar, isEndOfChars ) import Pipes.Core (respond, Server') import qualified Pipes.Parse as PP -import Pipes.Parse (input, concat, FreeT) +import Pipes.Parse ( FreeT) import qualified Pipes.Safe.Prelude as Safe import qualified Pipes.Safe as Safe import Pipes.Safe (MonadSafe(..), Base(..)) @@ -499,10 +504,10 @@ toLazyM = liftM TL.fromChunks . P.toListM {-# INLINABLE toLazyM #-} -- | Reduce the text stream using a strict left fold over characters -fold +foldChars :: Monad m => (x -> Char -> x) -> x -> (x -> r) -> Producer Text m () -> m r -fold step begin done = P.fold (T.foldl' step) begin done +foldChars step begin done = P.fold (T.foldl' step) begin done {-# INLINABLE fold #-} -- | Retrieve the first 'Char' @@ -879,4 +884,45 @@ unwords = intercalate (yield $ T.pack " ") @Data.Text@ re-exports the 'Text' type. @Pipes.Parse@ re-exports 'input', 'concat', and 'FreeT' (the type). --} \ No newline at end of file +-} + + + +decode :: Monad m => PE.Decoding -> Producer ByteString m r -> Producer Text m (Producer ByteString m r) +-- decode codec = go B.empty where +-- go extra p0 = +-- do x <- lift (next p0) +-- case x of Right (chunk, p) -> +-- do let (text, stuff) = codecDecode codec (B.append extra chunk) +-- yield text +-- case stuff of Right extra' -> go extra' p +-- Left (exc,bs) -> do yield text +-- return (do yield bs +-- p) +-- Left r -> return (do yield extra +-- return r) + +decode d p0 = case d of + PE.Other txt bad -> do yield txt + return (do yield bad + p0) + PE.Some txt extra dec -> do yield txt + x <- lift (next p0) + case x of Left r -> return (do yield extra + return r) + Right (chunk,p1) -> decode (dec chunk) p1 + +-- go !carry dec0 p = do +-- x <- lift (next p) +-- case x of Left r -> if B.null carry +-- then return (return r) -- all bytestrinput was consumed +-- else return (do yield carry -- a potentially valid fragment remains +-- return r) +-- +-- Right (chunk, p') -> case dec0 chunk of +-- PE.Some text carry2 dec -> do yield text +-- go carry2 dec p' +-- PE.Other text bs -> do yield text +-- return (do yield bs -- an invalid blob remains +-- p') +-- {-# INLINABLE decodeUtf8 #-} diff --git a/Pipes/Text/Internal.hs b/Pipes/Text/Internal.hs index 7e5b044..76c2f4f 100644 --- a/Pipes/Text/Internal.hs +++ b/Pipes/Text/Internal.hs @@ -1,5 +1,7 @@ -{-# LANGUAGE BangPatterns, CPP, ForeignFunctionInterface, GeneralizedNewtypeDeriving, MagicHash, - UnliftedFFITypes #-} +{-# LANGUAGE BangPatterns, CPP, ForeignFunctionInterface #-} +{-# LANGUAGE GeneralizedNewtypeDeriving, MagicHash, UnliftedFFITypes #-} +{-# LANGUAGE DeriveDataTypeable, RankNTypes #-} + -- This module lifts assorted materials from Brian O'Sullivan's text package -- especially Data.Text.Encoding in order to define a pipes-appropriate -- streamDecodeUtf8 @@ -7,13 +9,20 @@ module Pipes.Text.Internal ( Decoding(..) , streamDecodeUtf8 , decodeSomeUtf8 + , Codec(..) + , TextException(..) + , utf8 ) where import Control.Monad.ST.Unsafe (unsafeIOToST, unsafeSTToIO) import Control.Monad.ST (ST, runST) import Data.Bits ((.&.)) import Data.ByteString as B +import Data.ByteString (ByteString) import Data.ByteString.Internal as B -import qualified Data.Text as T (null) +import Data.ByteString.Char8 as B8 +import Data.Text (Text) +import qualified Data.Text as T +import qualified Data.Text.Encoding as TE import Data.Text.Encoding.Error () import Data.Text.Internal (Text, textP) import Foreign.C.Types (CSize) @@ -24,9 +33,226 @@ import Foreign.Storable (Storable, peek, poke) import GHC.Base (Char(..), Int(..), MutableByteArray#, ord#, iShiftRA#) import GHC.Word (Word8, Word32) import qualified Data.Text.Array as A - +import Data.Word (Word8, Word16) +import System.IO.Unsafe (unsafePerformIO) +import qualified Control.Exception as Exc +import Data.Bits ((.&.), (.|.), shiftL) +import Data.Typeable +import Control.Arrow (first) +import Data.Maybe (catMaybes) #include "pipes_text_cbits.h" + +-- | A specific character encoding. +-- +-- Since 0.3.0 +data Codec = Codec + { codecName :: Text + , codecEncode :: Text -> (ByteString, Maybe (TextException, Text)) + , codecDecode :: ByteString -> Decoding -- (Text, Either (TextException, ByteString) ByteString) + } + +instance Show Codec where + showsPrec d c = showParen (d > 10) $ showString "Codec " . shows (codecName c) + +-- Since 0.3.0 +data TextException = DecodeException Codec Word8 + | EncodeException Codec Char + | LengthExceeded Int + | TextException Exc.SomeException + deriving (Show, Typeable) +instance Exc.Exception TextException + +toDecoding :: (ByteString -> (Text, Either (TextException, ByteString) ByteString)) + -> (ByteString -> Decoding) +toDecoding op = loop B.empty where + loop extra bs0 = case op (B.append extra bs0) of + (txt, Right bs) -> Some txt bs (loop bs) + (txt, Left (_,bs)) -> Other txt bs + + +splitSlowly :: (ByteString -> Text) + -> ByteString + -> (Text, Either (TextException, ByteString) ByteString) +splitSlowly dec bytes = valid where + valid:_ = catMaybes $ Prelude.map decFirst $ splits (B.length bytes) + splits 0 = [(B.empty, bytes)] + splits n = B.splitAt n bytes : splits (n - 1) + decFirst (a, b) = case tryEvaluate (dec a) of + Left _ -> Nothing + Right text -> let trouble = case tryEvaluate (dec b) of + Left exc -> Left (TextException exc, b) + Right _ -> Right B.empty + in Just (text, trouble) + -- this case shouldn't occur, + -- since splitSlowly is only called + -- when parsing failed somewhere + +utf8 :: Codec +utf8 = Codec name enc (toDecoding dec) where + name = T.pack "UTF-8" + enc text = (TE.encodeUtf8 text, Nothing) + dec bytes = case decodeSomeUtf8 bytes of + (t,b) -> (t, Right b) + +-- -- Whether the given byte is a continuation byte. +-- isContinuation byte = byte .&. 0xC0 == 0x80 +-- +-- -- The number of continuation bytes needed by the given +-- -- non-continuation byte. Returns -1 for an illegal UTF-8 +-- -- non-continuation byte and the whole split quickly must fail so +-- -- as the input is passed to TE.decodeUtf8, which will issue a +-- -- suitable error. +-- required x0 +-- | x0 .&. 0x80 == 0x00 = 0 +-- | x0 .&. 0xE0 == 0xC0 = 1 +-- | x0 .&. 0xF0 == 0xE0 = 2 +-- | x0 .&. 0xF8 == 0xF0 = 3 +-- | otherwise = -1 +-- +-- splitQuickly bytes +-- | B.null l || req == -1 = Nothing +-- | req == B.length r = Just (TE.decodeUtf8 bytes, B.empty) +-- | otherwise = Just (TE.decodeUtf8 l', r') +-- where +-- (l, r) = B.spanEnd isContinuation bytes +-- req = required (B.last l) +-- l' = B.init l +-- r' = B.cons (B.last l) r + +-- | +-- Since 0.3.0 +utf16_le :: Codec +utf16_le = Codec name enc (toDecoding dec) where + name = T.pack "UTF-16-LE" + enc text = (TE.encodeUtf16LE text, Nothing) + dec bytes = case splitQuickly bytes of + Just (text, extra) -> (text, Right extra) + Nothing -> splitSlowly TE.decodeUtf16LE bytes + + splitQuickly bytes = maybeDecode (loop 0) where + maxN = B.length bytes + + loop n | n == maxN = decodeAll + | (n + 1) == maxN = decodeTo n + loop n = let + req = utf16Required + (B.index bytes n) + (B.index bytes (n + 1)) + decodeMore = loop $! n + req + in if n + req > maxN + then decodeTo n + else decodeMore + + decodeTo n = first TE.decodeUtf16LE (B.splitAt n bytes) + decodeAll = (TE.decodeUtf16LE bytes, B.empty) + +-- | +-- Since 0.3.0 +utf16_be :: Codec +utf16_be = Codec name enc (toDecoding dec) where + name = T.pack "UTF-16-BE" + enc text = (TE.encodeUtf16BE text, Nothing) + dec bytes = case splitQuickly bytes of + Just (text, extra) -> (text, Right extra) + Nothing -> splitSlowly TE.decodeUtf16BE bytes + + splitQuickly bytes = maybeDecode (loop 0) where + maxN = B.length bytes + + loop n | n == maxN = decodeAll + | (n + 1) == maxN = decodeTo n + loop n = let + req = utf16Required + (B.index bytes (n + 1)) + (B.index bytes n) + decodeMore = loop $! n + req + in if n + req > maxN + then decodeTo n + else decodeMore + + decodeTo n = first TE.decodeUtf16BE (B.splitAt n bytes) + decodeAll = (TE.decodeUtf16BE bytes, B.empty) + +utf16Required :: Word8 -> Word8 -> Int +utf16Required x0 x1 = if x >= 0xD800 && x <= 0xDBFF then 4 else 2 where + x :: Word16 + x = (fromIntegral x1 `shiftL` 8) .|. fromIntegral x0 + +-- | +-- Since 0.3.0 +utf32_le :: Codec +utf32_le = Codec name enc (toDecoding dec) where + name = T.pack "UTF-32-LE" + enc text = (TE.encodeUtf32LE text, Nothing) + dec bs = case utf32SplitBytes TE.decodeUtf32LE bs of + Just (text, extra) -> (text, Right extra) + Nothing -> splitSlowly TE.decodeUtf32LE bs + +-- | +-- Since 0.3.0 +utf32_be :: Codec +utf32_be = Codec name enc (toDecoding dec) where + name = T.pack "UTF-32-BE" + enc text = (TE.encodeUtf32BE text, Nothing) + dec bs = case utf32SplitBytes TE.decodeUtf32BE bs of + Just (text, extra) -> (text, Right extra) + Nothing -> splitSlowly TE.decodeUtf32BE bs + +utf32SplitBytes :: (ByteString -> Text) + -> ByteString + -> Maybe (Text, ByteString) +utf32SplitBytes dec bytes = split where + split = maybeDecode (dec toDecode, extra) + len = B.length bytes + lenExtra = mod len 4 + + lenToDecode = len - lenExtra + (toDecode, extra) = if lenExtra == 0 + then (bytes, B.empty) + else B.splitAt lenToDecode bytes + +-- | +-- Since 0.3.0 +ascii :: Codec +ascii = Codec name enc (toDecoding dec) where + name = T.pack "ASCII" + enc text = (bytes, extra) where + (safe, unsafe) = T.span (\c -> ord c <= 0x7F) text + bytes = B8.pack (T.unpack safe) + extra = if T.null unsafe + then Nothing + else Just (EncodeException ascii (T.head unsafe), unsafe) + + dec bytes = (text, extra) where + (safe, unsafe) = B.span (<= 0x7F) bytes + text = T.pack (B8.unpack safe) + extra = if B.null unsafe + then Right B.empty + else Left (DecodeException ascii (B.head unsafe), unsafe) + +-- | +-- Since 0.3.0 +iso8859_1 :: Codec +iso8859_1 = Codec name enc (toDecoding dec) where + name = T.pack "ISO-8859-1" + enc text = (bytes, extra) where + (safe, unsafe) = T.span (\c -> ord c <= 0xFF) text + bytes = B8.pack (T.unpack safe) + extra = if T.null unsafe + then Nothing + else Just (EncodeException iso8859_1 (T.head unsafe), unsafe) + + dec bytes = (T.pack (B8.unpack bytes), Right B.empty) + +tryEvaluate :: a -> Either Exc.SomeException a +tryEvaluate = unsafePerformIO . Exc.try . Exc.evaluate + +maybeDecode :: (a, b) -> Maybe (a, b) +maybeDecode (a, b) = case tryEvaluate a of + Left _ -> Nothing + Right _ -> Just (a, b) + -- | A stream oriented decoding result. data Decoding = Some Text ByteString (ByteString -> Decoding) | Other Text ByteString @@ -103,36 +329,6 @@ decodeSomeUtf8 bs@(PS fp off len) = runST $ do return $! (chunkText, remaining) {-# INLINE decodeSomeUtf8 #-} --- decodeSomeUtf8 :: ByteString -> (Text, ByteString) --- decodeSomeUtf8 bs@(PS fp off len) = --- runST $ do marray <- A.new (len+1) --- unsafeIOToST (decodeChunkToBuffer marray) --- --- where --- decodeChunkToBuffer :: A.MArray s -> IO (Text, ByteString) --- decodeChunkToBuffer dest = withForeignPtr fp $ \ptr -> --- with (0::CSize) $ \destOffPtr -> --- with (0::CodePoint) $ \codepointPtr -> --- with (0::DecoderState) $ \statePtr -> --- with nullPtr $ \curPtrPtr -> --- do let end = ptr `plusPtr` (off + len) --- curPtr = ptr `plusPtr` off --- poke curPtrPtr curPtr --- c_decode_utf8_with_state (A.maBA dest) destOffPtr curPtrPtr end codepointPtr statePtr --- state <- peek statePtr --- lastPtr <- peek curPtrPtr --- codepoint <- peek codepointPtr --- n <- peek destOffPtr --- chunkText <- unsafeSTToIO $ do arr <- A.unsafeFreeze dest --- return $! textP arr 0 (fromIntegral n) --- let left = lastPtr `minusPtr` curPtr --- remaining = B.drop left bs --- return $! (chunkText, remaining) --- {-# INLINE decodeChunkToBuffer #-} --- {-# INLINE decodeSomeUtf8 #-} - - - mkText :: A.MArray s -> CSize -> IO Text mkText dest n = unsafeSTToIO $ do arr <- A.unsafeFreeze dest return $! textP arr 0 (fromIntegral n) diff --git a/Pipes/Text/Parse.hs b/Pipes/Text/Parse.hs index ed0afa1..9cabaa6 100644 --- a/Pipes/Text/Parse.hs +++ b/Pipes/Text/Parse.hs @@ -44,16 +44,16 @@ nextChar = go {-| Draw one 'Char' from the underlying 'Producer', returning 'Left' if the 'Producer' is empty -} -drawChar :: (Monad m) => StateT (Producer Text m r) m (Either r Char) +drawChar :: (Monad m) => StateT (Producer Text m r) m (Maybe Char) drawChar = do x <- PP.draw case x of - Left r -> return (Left r) - Right txt -> case (T.uncons txt) of + Nothing -> return Nothing + Just txt -> case (T.uncons txt) of Nothing -> drawChar Just (c, txt') -> do PP.unDraw txt' - return (Right c) + return (Just c) {-# INLINABLE drawChar #-} -- | Push back a 'Char' onto the underlying 'Producer' @@ -71,12 +71,12 @@ unDrawChar c = modify (yield (T.singleton c) >>) > Right c -> unDrawChar c > return x -} -peekChar :: (Monad m) => StateT (Producer Text m r) m (Either r Char) +peekChar :: (Monad m) => StateT (Producer Text m r) m (Maybe Char) peekChar = do x <- drawChar case x of - Left _ -> return () - Right c -> unDrawChar c + Nothing -> return () + Just c -> unDrawChar c return x {-# INLINABLE peekChar #-} @@ -91,8 +91,8 @@ isEndOfChars :: (Monad m) => StateT (Producer Text m r) m Bool isEndOfChars = do x <- peekChar return (case x of - Left _ -> True - Right _ -> False ) + Nothing -> True + Just _-> False ) {-# INLINABLE isEndOfChars #-} {-| @(take n)@ only allows @n@ characters to pass -- cgit v1.2.3