From: michaelt Date: Sat, 15 Feb 2014 15:01:48 +0000 (-0500) Subject: use new text-stream-decoding X-Git-Url: https://git.immae.eu/?a=commitdiff_plain;h=bbdfd3056da4992e18d3983fd5992bee23af93af;p=github%2Ffretlink%2Ftext-pipes.git use new text-stream-decoding --- diff --git a/Pipes/Text.hs b/Pipes/Text.hs index 4b2d2b0..8221c01 100644 --- a/Pipes/Text.hs +++ b/Pipes/Text.hs @@ -10,7 +10,8 @@ example, the following program copies a document from one file to another: > import Pipes -> import qualified Data.Text.Pipes as Text +> import qualified Pipes.Text as Text +> import qualified Pipes.Text.IO as Text > import System.IO > > main = @@ -21,7 +22,8 @@ To stream from files, the following is perhaps more Prelude-like (note that it uses Pipes.Safe): > import Pipes -> import qualified Data.Text.Pipes as Text +> import qualified Pipes.Text as Text +> import qualified Pipes.Text.IO as Text > import Pipes.Safe > > main = runSafeT $ runEffect $ Text.readFile "inFile.txt" >-> Text.writeFile "outFile.txt" @@ -61,14 +63,14 @@ To stream from files, the following is perhaps more Prelude-like (note that it u module Pipes.Text ( -- * Producers fromLazy - , stdin - , fromHandle - , readFile + -- , stdin + -- , fromHandle + -- , readFile -- * Consumers - , stdout - , toHandle - , writeFile + -- , stdout + -- , toHandle + -- , writeFile -- * Pipes , map @@ -79,7 +81,7 @@ module Pipes.Text ( , dropWhile , filter , scan - , encodeUtf8 +-- , encodeUtf8 , pack , unpack , toCaseFold @@ -120,22 +122,22 @@ module Pipes.Text ( , word , line - -- * Decoding Lenses - , decodeUtf8 - , codec - - -- * Codecs - , utf8 - , utf16_le - , utf16_be - , utf32_le - , utf32_be - - -- * Other Decoding/Encoding Functions - , decodeIso8859_1 - , decodeAscii - , encodeIso8859_1 - , encodeAscii + -- -- * Decoding Lenses + -- , decodeUtf8 + -- , codec + -- + -- -- * Codecs + -- , utf8 + -- , utf16_le + -- , utf16_be + -- , utf32_le + -- , utf32_be + -- + -- -- * Other Decoding/Encoding Functions + -- , decodeIso8859_1 + -- , decodeAscii + -- , encodeIso8859_1 + -- , encodeAscii -- * FreeT Splitters , chunksOf @@ -157,11 +159,9 @@ module Pipes.Text ( -- * Re-exports -- $reexports - , Decoding(..) - , streamDecodeUtf8 - , decodeSomeUtf8 - , Codec(..) - , TextException(..) + -- , DecodeResult(..) + -- , Codec + -- , TextException(..) , module Data.ByteString , module Data.Text , module Data.Profunctor @@ -170,7 +170,6 @@ module Pipes.Text ( , module Pipes.Group ) where -import Control.Exception (throwIO, try) import Control.Applicative ((<*)) import Control.Monad (liftM, unless, join) import Control.Monad.Trans.State.Strict (StateT(..), modify) @@ -193,24 +192,20 @@ import Data.Functor.Identity (Identity) import Data.Profunctor (Profunctor) import qualified Data.Profunctor import qualified Data.List as List -import Foreign.C.Error (Errno(Errno), ePIPE) -import qualified GHC.IO.Exception as G import Pipes import qualified Pipes.ByteString as PB -import qualified Pipes.Text.Internal as PI -import Pipes.Text.Internal +-- import Pipes.Text.Decoding import Pipes.Core (respond, Server') import Pipes.Group (concats, intercalates, FreeT(..), FreeF(..)) import qualified Pipes.Group as PG import qualified Pipes.Parse as PP import Pipes.Parse (Parser) -import qualified Pipes.Safe.Prelude as Safe -import qualified Pipes.Safe as Safe -import Pipes.Safe (MonadSafe(..), Base(..)) + import qualified Pipes.Prelude as P import qualified System.IO as IO import Data.Char (isSpace) import Data.Word (Word8) +import Data.Text.StreamDecoding import Prelude hiding ( all, @@ -246,78 +241,6 @@ fromLazy :: (Monad m) => TL.Text -> Producer' Text m () fromLazy = foldrChunks (\e a -> yield e >> a) (return ()) {-# INLINE fromLazy #-} --- | Stream text from 'stdin' -stdin :: MonadIO m => Producer Text m () -stdin = fromHandle IO.stdin -{-# INLINE stdin #-} - -{-| Convert a 'IO.Handle' into a text stream using a text size - determined by the good sense of the text library; note that this - is distinctly slower than @decideUtf8 (Pipes.ByteString.fromHandle h)@ - but uses the system encoding and has other `Data.Text.IO` features --} - -fromHandle :: MonadIO m => IO.Handle -> Producer Text m () -fromHandle h = go where - go = do txt <- liftIO (T.hGetChunk h) - unless (T.null txt) ( do yield txt - go ) -{-# INLINABLE fromHandle#-} - - -{-| Stream text from a file in the simple fashion of @Data.Text.IO@ - ->>> runSafeT $ runEffect $ Text.readFile "hello.hs" >-> Text.map toUpper >-> hoist lift Text.stdout -MAIN = PUTSTRLN "HELLO WORLD" --} - -readFile :: MonadSafe m => FilePath -> Producer Text m () -readFile file = Safe.withFile file IO.ReadMode fromHandle -{-# INLINE readFile #-} - - -{-| Stream text to 'stdout' - - Unlike 'toHandle', 'stdout' gracefully terminates on a broken output pipe. - - Note: For best performance, it might be best just to use @(for source (liftIO . putStr))@ - instead of @(source >-> stdout)@ . --} -stdout :: MonadIO m => Consumer' Text m () -stdout = go - where - go = do - txt <- await - x <- liftIO $ try (T.putStr txt) - case x of - Left (G.IOError { G.ioe_type = G.ResourceVanished - , G.ioe_errno = Just ioe }) - | Errno ioe == ePIPE - -> return () - Left e -> liftIO (throwIO e) - Right () -> go -{-# INLINABLE stdout #-} - - -{-| Convert a text stream into a 'Handle' - - Note: again, for best performance, where possible use - @(for source (liftIO . hPutStr handle))@ instead of @(source >-> toHandle handle)@. --} -toHandle :: MonadIO m => IO.Handle -> Consumer' Text m r -toHandle h = for cat (liftIO . T.hPutStr h) -{-# INLINABLE toHandle #-} - -{-# RULES "p >-> toHandle h" forall p h . - p >-> toHandle h = for p (\txt -> liftIO (T.hPutStr h txt)) - #-} - - --- | Stream text into a file. Uses @pipes-safe@. -writeFile :: (MonadSafe m) => FilePath -> Consumer' Text m () -writeFile file = Safe.withFile file IO.WriteMode toHandle -{-# INLINE writeFile #-} - type Lens' a b = forall f . Functor f => (b -> f b) -> (a -> f a) @@ -690,28 +613,6 @@ isEndOfChars = do {-# INLINABLE isEndOfChars #-} -{- | An improper lens into a stream of 'ByteString' expected to be UTF-8 encoded; the associated - stream of Text ends by returning a stream of ByteStrings beginning at the point of failure. - -} - -decodeUtf8 :: Monad m => Lens' (Producer ByteString m r) - (Producer Text m (Producer ByteString m r)) -decodeUtf8 k p0 = fmap (\p -> join (for p (yield . TE.encodeUtf8))) - (k (go B.empty PI.streamDecodeUtf8 p0)) where - go !carry dec0 p = do - x <- lift (next p) - case x of Left r -> return (if B.null carry - then return r -- all bytestring input was consumed - else (do yield carry -- a potentially valid fragment remains - return r)) - - Right (chunk, p') -> case dec0 chunk of - PI.Some text carry2 dec -> do yield text - go carry2 dec p' - PI.Other text bs -> do yield text - return (do yield bs -- an invalid blob remains - p') -{-# INLINABLE decodeUtf8 #-} -- | Splits a 'Producer' after the given number of characters @@ -1057,106 +958,4 @@ unwords = intercalate (yield $ T.singleton ' ') @Pipes.Parse@ re-exports 'input', 'concat', 'FreeT' (the type) and the 'Parse' synonym. -} -{- | Use a 'Codec' as a pipes-style 'Lens' into a byte stream; the available 'Codec' s are - 'utf8', 'utf16_le', 'utf16_be', 'utf32_le', 'utf32_be' . The 'Codec' concept and the - individual 'Codec' definitions follow the enumerator and conduit libraries. - - Utf8 is handled differently in this library -- without the use of 'unsafePerformIO' &co - to catch 'Text' exceptions; but the same 'mypipe ^. codec utf8' interface can be used. - 'mypipe ^. decodeUtf8' should be the same, but has a somewhat more direct and thus perhaps - better implementation. - - -} -codec :: Monad m => Codec -> Lens' (Producer ByteString m r) (Producer Text m (Producer ByteString m r)) -codec (Codec _ enc dec) k p0 = fmap (\p -> join (for p (yield . fst . enc))) - (k (decoder (dec B.empty) p0) ) where - decoder :: Monad m => PI.Decoding -> Producer ByteString m r -> Producer Text m (Producer ByteString m r) - decoder !d p0 = case d of - PI.Other txt bad -> do yield txt - return (do yield bad - p0) - PI.Some txt extra dec -> do yield txt - x <- lift (next p0) - case x of Left r -> return (do yield extra - return r) - Right (chunk,p1) -> decoder (dec chunk) p1 - -{- | ascii and latin encodings only represent a small fragment of 'Text'; thus we cannot - use the pipes 'Lens' style to work with them. Rather we simply define functions - each way. - - 'encodeAscii' : Reduce as much of your stream of 'Text' actually is ascii to a byte stream, - returning the rest of the 'Text' at the first non-ascii 'Char' --} -encodeAscii :: Monad m => Producer Text m r -> Producer ByteString m (Producer Text m r) -encodeAscii = go where - go p = do echunk <- lift (next p) - case echunk of - Left r -> return (return r) - Right (chunk, p') -> - if T.null chunk - then go p' - else let (safe, unsafe) = T.span (\c -> ord c <= 0x7F) chunk - in do yield (B8.pack (T.unpack safe)) - if T.null unsafe - then go p' - else return $ do yield unsafe - p' -{- | Reduce as much of your stream of 'Text' actually is iso8859 or latin1 to a byte stream, - returning the rest of the 'Text' upon hitting any non-latin 'Char' - -} -encodeIso8859_1 :: Monad m => Producer Text m r -> Producer ByteString m (Producer Text m r) -encodeIso8859_1 = go where - go p = do etxt <- lift (next p) - case etxt of - Left r -> return (return r) - Right (txt, p') -> - if T.null txt - then go p' - else let (safe, unsafe) = T.span (\c -> ord c <= 0xFF) txt - in do yield (B8.pack (T.unpack safe)) - if T.null unsafe - then go p' - else return $ do yield unsafe - p' - -{- | Reduce a byte stream to a corresponding stream of ascii chars, returning the - unused 'ByteString' upon hitting an un-ascii byte. - -} -decodeAscii :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r) -decodeAscii = go where - go p = do echunk <- lift (next p) - case echunk of - Left r -> return (return r) - Right (chunk, p') -> - if B.null chunk - then go p' - else let (safe, unsafe) = B.span (<= 0x7F) chunk - in do yield (T.pack (B8.unpack safe)) - if B.null unsafe - then go p' - else return $ do yield unsafe - p' - -{- | Reduce a byte stream to a corresponding stream of ascii chars, returning the - unused 'ByteString' upon hitting the rare un-latinizable byte. - -} -decodeIso8859_1 :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r) -decodeIso8859_1 = go where - go p = do echunk <- lift (next p) - case echunk of - Left r -> return (return r) - Right (chunk, p') -> - if B.null chunk - then go p' - else let (safe, unsafe) = B.span (<= 0xFF) chunk - in do yield (T.pack (B8.unpack safe)) - if B.null unsafe - then go p' - else return $ do yield unsafe - p' - - - - - + diff --git a/Pipes/Text/Encoding.hs b/Pipes/Text/Encoding.hs new file mode 100644 index 0000000..2bb5807 --- /dev/null +++ b/Pipes/Text/Encoding.hs @@ -0,0 +1,205 @@ + +{-# LANGUAGE RankNTypes, BangPatterns #-} +-- | +-- Copyright: 2014 Michael Thompson +-- +-- This module uses the stream decoding functions from the text-stream-decoding package +-- to define pipes decoding functions and lenses. + +module Pipes.Text.Encoding + ( DecodeResult (..) + , Codec + , decodeUtf8 + , decodeUtf8Pure + , decodeUtf16LE + , decodeUtf16BE + , decodeUtf32LE + , decodeUtf32BE + , utf8 + , utf8Pure + , utf16LE + , utf16BE + , utf32LE + , utf32BE + , encodeAscii + , decodeAscii + , encodeIso8859_1 + , decodeIso8859_1 + ) + where + +import Data.Char (ord) +import Data.ByteString as B +import Data.ByteString (ByteString) +import Data.ByteString.Internal as B +import Data.ByteString.Char8 as B8 +import Data.Text (Text) +import qualified Data.Text as T +import qualified Data.Text.Encoding as TE +import Data.Text.StreamDecoding +import GHC.Word (Word8, Word32) +import Data.Word (Word8, Word16) +import Control.Monad +import Pipes +import Pipes.Core + + + +{- | A 'Codec' is just an improper lens into a byte stream that is expected to contain text. + They are named in accordance with the expected encoding, 'utf8', 'utf16LE' etc. + The stream of text they 'see' in a bytestream ends by returning the original byte stream + beginning at the point of failure, or the empty bytestream with its return value. + -} +type Codec = forall f m r . (Functor f , Monad m ) => + (Producer Text m (Producer ByteString m r) -> f (Producer Text m (Producer ByteString m r))) + -> Producer ByteString m r -> f (Producer ByteString m r ) + +decodeStream :: Monad m + => (B.ByteString -> DecodeResult) + -> Producer ByteString m r -> Producer Text m (Producer ByteString m r) +decodeStream = loop where + loop dec0 p = + do x <- lift (next p) + case x of Left r -> return (return r) + Right (chunk, p') -> case dec0 chunk of + DecodeResultSuccess text dec -> do yield text + loop dec p' + DecodeResultFailure text bs -> do yield text + return (do yield bs + p') +{-# INLINABLE decodeStream#-} + +decodeUtf8 :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r) +decodeUtf8 = decodeStream streamUtf8 +{-# INLINE decodeUtf8 #-} + +decodeUtf8Pure :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r) +decodeUtf8Pure = decodeStream streamUtf8Pure +{-# INLINE decodeUtf8Pure #-} + +decodeUtf16LE :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r) +decodeUtf16LE = decodeStream streamUtf16LE +{-# INLINE decodeUtf16LE #-} + +decodeUtf16BE :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r) +decodeUtf16BE = decodeStream streamUtf16BE +{-# INLINE decodeUtf16BE #-} + +decodeUtf32LE :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r) +decodeUtf32LE = decodeStream streamUtf32LE +{-# INLINE decodeUtf32LE #-} + +decodeUtf32BE :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r) +decodeUtf32BE = decodeStream streamUtf32BE +{-# INLINE decodeUtf32BE #-} + +mkCodec :: (forall r m . Monad m => + Producer ByteString m r -> Producer Text m (Producer ByteString m r )) + -> (Text -> ByteString) + -> Codec +mkCodec dec enc = \k p0 -> fmap (\p -> join (for p (yield . enc))) (k (dec p0)) + + +{- | An improper lens into a byte stream expected to be UTF-8 encoded; the associated + text stream ends by returning the original bytestream beginning at the point of failure, + or the empty bytestring for a well-encoded text. + -} + +utf8 :: Codec +utf8 = mkCodec decodeUtf8 TE.encodeUtf8 + +utf8Pure :: Codec +utf8Pure = mkCodec decodeUtf8Pure TE.encodeUtf8 + +utf16LE :: Codec +utf16LE = mkCodec decodeUtf16LE TE.encodeUtf16LE + +utf16BE :: Codec +utf16BE = mkCodec decodeUtf16BE TE.encodeUtf16BE + +utf32LE :: Codec +utf32LE = mkCodec decodeUtf32LE TE.encodeUtf32LE + +utf32BE :: Codec +utf32BE = mkCodec decodeUtf32BE TE.encodeUtf32BE + + +{- | ascii and latin encodings only use a small number of the characters 'Text' + recognizes; thus we cannot use the pipes 'Lens' style to work with them. + Rather we simply define functions each way. + + 'encodeAscii' : Reduce as much of your stream of 'Text' actually is ascii to a byte stream, + returning the rest of the 'Text' at the first non-ascii 'Char' +-} + +encodeAscii :: Monad m => Producer Text m r -> Producer ByteString m (Producer Text m r) +encodeAscii = go where + go p = do e <- lift (next p) + case e of + Left r -> return (return r) + Right (chunk, p') -> + if T.null chunk + then go p' + else let (safe, unsafe) = T.span (\c -> ord c <= 0x7F) chunk + in do yield (B8.pack (T.unpack safe)) + if T.null unsafe + then go p' + else return $ do yield unsafe + p' + +{- | Reduce as much of your stream of 'Text' actually is iso8859 or latin1 to a byte stream, + returning the rest of the 'Text' upon hitting any non-latin 'Char' + -} +encodeIso8859_1 :: Monad m => Producer Text m r -> Producer ByteString m (Producer Text m r) +encodeIso8859_1 = go where + go p = do e <- lift (next p) + case e of + Left r -> return (return r) + Right (txt, p') -> + if T.null txt + then go p' + else let (safe, unsafe) = T.span (\c -> ord c <= 0xFF) txt + in do yield (B8.pack (T.unpack safe)) + if T.null unsafe + then go p' + else return $ do yield unsafe + p' + +{- | Reduce a byte stream to a corresponding stream of ascii chars, returning the + unused 'ByteString' upon hitting an un-ascii byte. + -} +decodeAscii :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r) +decodeAscii = go where + go p = do e <- lift (next p) + case e of + Left r -> return (return r) + Right (chunk, p') -> + if B.null chunk + then go p' + else let (safe, unsafe) = B.span (<= 0x7F) chunk + in do yield (T.pack (B8.unpack safe)) + if B.null unsafe + then go p' + else return (do yield unsafe + p') + +{- | Reduce a byte stream to a corresponding stream of ascii chars, returning the + unused 'ByteString' upon hitting the rare un-latinizable byte. + -} +decodeIso8859_1 :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r) +decodeIso8859_1 = go where + go p = do e <- lift (next p) + case e of + Left r -> return (return r) + Right (chunk, p') -> + if B.null chunk + then go p' + else do let (safe, unsafe) = B.span (<= 0xFF) chunk + yield (T.pack (B8.unpack safe)) + if B.null unsafe + then go p' + else return (do yield unsafe + p') + + + diff --git a/Pipes/Text/IO.hs b/Pipes/Text/IO.hs new file mode 100644 index 0000000..3c9ac98 --- /dev/null +++ b/Pipes/Text/IO.hs @@ -0,0 +1,96 @@ +{-#LANGUAGE RankNTypes#-} + +module Pipes.Text.IO + ( stdin + , stdout + , fromHandle + , toHandle + , readFile + , writeFile + ) where + +import qualified System.IO as IO +import Control.Exception (throwIO, try) +import Foreign.C.Error (Errno(Errno), ePIPE) +import qualified GHC.IO.Exception as G +import Data.Text (Text) +import qualified Data.Text as T +import qualified Data.Text.IO as T +import Pipes +import qualified Pipes.Safe.Prelude as Safe +import qualified Pipes.Safe as Safe +import Pipes.Safe (MonadSafe(..), Base(..)) +import Prelude hiding (readFile, writeFile) + +-- | Stream text from 'stdin' +stdin :: MonadIO m => Producer Text m () +stdin = fromHandle IO.stdin +{-# INLINE stdin #-} + +{-| Convert a 'IO.Handle' into a text stream using a text size + determined by the good sense of the text library; note that this + is distinctly slower than @decideUtf8 (Pipes.ByteString.fromHandle h)@ + but uses the system encoding and has other `Data.Text.IO` features +-} + +fromHandle :: MonadIO m => IO.Handle -> Producer Text m () +fromHandle h = go where + go = do txt <- liftIO (T.hGetChunk h) + if T.null txt then return () + else do yield txt + go +{-# INLINABLE fromHandle#-} + + +{-| Stream text from a file in the simple fashion of @Data.Text.IO@ + +>>> runSafeT $ runEffect $ Text.readFile "hello.hs" >-> Text.map toUpper >-> hoist lift Text.stdout +MAIN = PUTSTRLN "HELLO WORLD" +-} + +readFile :: MonadSafe m => FilePath -> Producer Text m () +readFile file = Safe.withFile file IO.ReadMode fromHandle +{-# INLINE readFile #-} + + +{-| Stream text to 'stdout' + + Unlike 'toHandle', 'stdout' gracefully terminates on a broken output pipe. + + Note: For best performance, it might be best just to use @(for source (liftIO . putStr))@ + instead of @(source >-> stdout)@ . +-} +stdout :: MonadIO m => Consumer' Text m () +stdout = go + where + go = do + txt <- await + x <- liftIO $ try (T.putStr txt) + case x of + Left (G.IOError { G.ioe_type = G.ResourceVanished + , G.ioe_errno = Just ioe }) + | Errno ioe == ePIPE + -> return () + Left e -> liftIO (throwIO e) + Right () -> go +{-# INLINABLE stdout #-} + + +{-| Convert a text stream into a 'Handle' + + Note: again, for best performance, where possible use + @(for source (liftIO . hPutStr handle))@ instead of @(source >-> toHandle handle)@. +-} +toHandle :: MonadIO m => IO.Handle -> Consumer' Text m r +toHandle h = for cat (liftIO . T.hPutStr h) +{-# INLINABLE toHandle #-} + +{-# RULES "p >-> toHandle h" forall p h . + p >-> toHandle h = for p (\txt -> liftIO (T.hPutStr h txt)) + #-} + + +-- | Stream text into a file. Uses @pipes-safe@. +writeFile :: (MonadSafe m) => FilePath -> Consumer' Text m () +writeFile file = Safe.withFile file IO.WriteMode toHandle +{-# INLINE writeFile #-} diff --git a/Pipes/Text/Internal.hs b/Pipes/Text/Internal.hs deleted file mode 100644 index 582ef14..0000000 --- a/Pipes/Text/Internal.hs +++ /dev/null @@ -1,7 +0,0 @@ -module Pipes.Text.Internal - (module Pipes.Text.Internal.Codec - , module Pipes.Text.Internal.Decoding - ) where - -import Pipes.Text.Internal.Codec -import Pipes.Text.Internal.Decoding \ No newline at end of file diff --git a/Pipes/Text/Internal/Codec.hs b/Pipes/Text/Internal/Codec.hs deleted file mode 100644 index 075a152..0000000 --- a/Pipes/Text/Internal/Codec.hs +++ /dev/null @@ -1,216 +0,0 @@ - -{-# LANGUAGE DeriveDataTypeable, RankNTypes, BangPatterns #-} --- | --- Copyright: 2014 Michael Thompson, 2011 Michael Snoyman, 2010-2011 John Millikin --- License: MIT --- This Parts of this code were taken from enumerator and conduits, and adapted for pipes - --- This module follows the model of the enumerator and conduits libraries, and defines --- 'Codec' s for various encodings. Note that we do not export a 'Codec' for ascii and --- iso8859_1. A 'Lens' in the sense of the pipes library cannot be defined for these, so --- special functions appear in @Pipes.Text@ - - -module Pipes.Text.Internal.Codec - ( Codec(..) - , TextException(..) - , utf8 - , utf16_le - , utf16_be - , utf32_le - , utf32_be - ) where - -import Data.Bits ((.&.)) -import Data.Char (ord) -import Data.ByteString as B -import Data.ByteString (ByteString) -import Data.ByteString.Internal as B -import Data.ByteString.Char8 as B8 -import Data.Text (Text) -import qualified Data.Text as T -import qualified Data.Text.Encoding as TE -import Data.Text.Encoding.Error () -import GHC.Word (Word8, Word32) -import qualified Data.Text.Array as A -import Data.Word (Word8, Word16) -import System.IO.Unsafe (unsafePerformIO) -import qualified Control.Exception as Exc -import Data.Bits ((.&.), (.|.), shiftL) -import Data.Typeable -import Control.Arrow (first) -import Data.Maybe (catMaybes) -import Pipes.Text.Internal.Decoding -import Pipes --- | A specific character encoding. - -data Codec = Codec - { codecName :: Text - , codecEncode :: Text -> (ByteString, Maybe (TextException, Text)) - , codecDecode :: ByteString -> Decoding - } - -instance Show Codec where - showsPrec d c = showParen (d > 10) $ - showString "Codec " . shows (codecName c) - -data TextException = DecodeException Codec Word8 - | EncodeException Codec Char - | LengthExceeded Int - | TextException Exc.SomeException - deriving (Show, Typeable) -instance Exc.Exception TextException - - -toDecoding :: (ByteString -> (Text, Either (TextException, ByteString) ByteString)) - -> (ByteString -> Decoding) -toDecoding op = loop B.empty where - loop !extra bs0 = case op (B.append extra bs0) of - (txt, Right bs) -> Some txt bs (loop bs) - (txt, Left (_,bs)) -> Other txt bs --- To do: toDecoding should be inlined in each of the 'Codec' definitions --- or else Codec changed to the conduit/enumerator definition. We have --- altered it to use 'streamDecodeUtf8' - -splitSlowly :: (ByteString -> Text) - -> ByteString - -> (Text, Either (TextException, ByteString) ByteString) -splitSlowly dec bytes = valid where - valid:_ = catMaybes $ Prelude.map decFirst $ splits (B.length bytes) - splits 0 = [(B.empty, bytes)] - splits n = B.splitAt n bytes : splits (n - 1) - decFirst (a, b) = case tryEvaluate (dec a) of - Left _ -> Nothing - Right text -> let trouble = case tryEvaluate (dec b) of - Left exc -> Left (TextException exc, b) - Right _ -> Right B.empty - in Just (text, trouble) -- this case shouldn't occur, - -- since splitSlowly is only called - -- when parsing failed somewhere - -utf8 :: Codec -utf8 = Codec name enc (toDecoding dec) where - name = T.pack "UTF-8" - enc text = (TE.encodeUtf8 text, Nothing) - dec bytes = case decodeSomeUtf8 bytes of (t,b) -> (t, Right b) - --- -- Whether the given byte is a continuation byte. --- isContinuation byte = byte .&. 0xC0 == 0x80 --- --- -- The number of continuation bytes needed by the given --- -- non-continuation byte. Returns -1 for an illegal UTF-8 --- -- non-continuation byte and the whole split quickly must fail so --- -- as the input is passed to TE.decodeUtf8, which will issue a --- -- suitable error. --- required x0 --- | x0 .&. 0x80 == 0x00 = 0 --- | x0 .&. 0xE0 == 0xC0 = 1 --- | x0 .&. 0xF0 == 0xE0 = 2 --- | x0 .&. 0xF8 == 0xF0 = 3 --- | otherwise = -1 --- --- splitQuickly bytes --- | B.null l || req == -1 = Nothing --- | req == B.length r = Just (TE.decodeUtf8 bytes, B.empty) --- | otherwise = Just (TE.decodeUtf8 l', r') --- where --- (l, r) = B.spanEnd isContinuation bytes --- req = required (B.last l) --- l' = B.init l --- r' = B.cons (B.last l) r - - -utf16_le :: Codec -utf16_le = Codec name enc (toDecoding dec) where - name = T.pack "UTF-16-LE" - enc text = (TE.encodeUtf16LE text, Nothing) - dec bytes = case splitQuickly bytes of - Just (text, extra) -> (text, Right extra) - Nothing -> splitSlowly TE.decodeUtf16LE bytes - - splitQuickly bytes = maybeDecode (loop 0) where - maxN = B.length bytes - - loop n | n == maxN = decodeAll - | (n + 1) == maxN = decodeTo n - loop n = let - req = utf16Required - (B.index bytes n) - (B.index bytes (n + 1)) - decodeMore = loop $! n + req - in if n + req > maxN - then decodeTo n - else decodeMore - - decodeTo n = first TE.decodeUtf16LE (B.splitAt n bytes) - decodeAll = (TE.decodeUtf16LE bytes, B.empty) - -utf16_be :: Codec -utf16_be = Codec name enc (toDecoding dec) where - name = T.pack "UTF-16-BE" - enc text = (TE.encodeUtf16BE text, Nothing) - dec bytes = case splitQuickly bytes of - Just (text, extra) -> (text, Right extra) - Nothing -> splitSlowly TE.decodeUtf16BE bytes - - splitQuickly bytes = maybeDecode (loop 0) where - maxN = B.length bytes - - loop n | n == maxN = decodeAll - | (n + 1) == maxN = decodeTo n - loop n = let - req = utf16Required - (B.index bytes (n + 1)) - (B.index bytes n) - decodeMore = loop $! n + req - in if n + req > maxN - then decodeTo n - else decodeMore - - decodeTo n = first TE.decodeUtf16BE (B.splitAt n bytes) - decodeAll = (TE.decodeUtf16BE bytes, B.empty) - -utf16Required :: Word8 -> Word8 -> Int -utf16Required x0 x1 = if x >= 0xD800 && x <= 0xDBFF then 4 else 2 where - x :: Word16 - x = (fromIntegral x1 `shiftL` 8) .|. fromIntegral x0 - - -utf32_le :: Codec -utf32_le = Codec name enc (toDecoding dec) where - name = T.pack "UTF-32-LE" - enc text = (TE.encodeUtf32LE text, Nothing) - dec bs = case utf32SplitBytes TE.decodeUtf32LE bs of - Just (text, extra) -> (text, Right extra) - Nothing -> splitSlowly TE.decodeUtf32LE bs - - -utf32_be :: Codec -utf32_be = Codec name enc (toDecoding dec) where - name = T.pack "UTF-32-BE" - enc text = (TE.encodeUtf32BE text, Nothing) - dec bs = case utf32SplitBytes TE.decodeUtf32BE bs of - Just (text, extra) -> (text, Right extra) - Nothing -> splitSlowly TE.decodeUtf32BE bs - -utf32SplitBytes :: (ByteString -> Text) - -> ByteString - -> Maybe (Text, ByteString) -utf32SplitBytes dec bytes = split where - split = maybeDecode (dec toDecode, extra) - len = B.length bytes - lenExtra = mod len 4 - - lenToDecode = len - lenExtra - (toDecode, extra) = if lenExtra == 0 - then (bytes, B.empty) - else B.splitAt lenToDecode bytes - - -tryEvaluate :: a -> Either Exc.SomeException a -tryEvaluate = unsafePerformIO . Exc.try . Exc.evaluate - -maybeDecode :: (a, b) -> Maybe (a, b) -maybeDecode (a, b) = case tryEvaluate a of - Left _ -> Nothing - Right _ -> Just (a, b) diff --git a/Pipes/Text/Internal/Decoding.hs b/Pipes/Text/Internal/Decoding.hs deleted file mode 100644 index b5d928a..0000000 --- a/Pipes/Text/Internal/Decoding.hs +++ /dev/null @@ -1,154 +0,0 @@ -{-# LANGUAGE BangPatterns, CPP, ForeignFunctionInterface #-} -{-# LANGUAGE GeneralizedNewtypeDeriving, MagicHash, UnliftedFFITypes #-} -{-# LANGUAGE DeriveDataTypeable, RankNTypes #-} - --- This module lifts assorted materials from Brian O'Sullivan's text package --- especially @Data.Text.Encoding@ in order to define a pipes-appropriate --- 'streamDecodeUtf8' - -module Pipes.Text.Internal.Decoding - ( Decoding(..) - , streamDecodeUtf8 - , decodeSomeUtf8 - ) where -import Control.Monad.ST.Unsafe (unsafeIOToST, unsafeSTToIO) -import Control.Monad.ST (ST, runST) -import Data.Bits ((.&.)) -import Data.ByteString as B -import Data.ByteString (ByteString) -import Data.ByteString.Internal as B -import Data.ByteString.Char8 as B8 -import Data.Text (Text) -import qualified Data.Text as T -import qualified Data.Text.Encoding as TE -import Data.Text.Encoding.Error () -import Data.Text.Internal (Text, textP) -import Foreign.C.Types (CSize) -import Foreign.ForeignPtr (withForeignPtr) -import Foreign.Marshal.Utils (with) -import Foreign.Ptr (Ptr, minusPtr, nullPtr, plusPtr) -import Foreign.Storable (Storable, peek, poke) -import GHC.Base (Char(..), Int(..), MutableByteArray#, ord#, iShiftRA#) -import GHC.Word (Word8, Word32) -import qualified Data.Text.Array as A -import Data.Word (Word8, Word16) -import System.IO.Unsafe (unsafePerformIO) -import qualified Control.Exception as Exc -import Data.Bits ((.&.), (.|.), shiftL) -import Data.Typeable -import Control.Arrow (first) -import Data.Maybe (catMaybes) -#include "pipes_text_cbits.h" - - - --- A stream oriented decoding result. Distinct from the similar type in Data.Text.Encoding - -data Decoding = Some Text ByteString (ByteString -> Decoding) - -- Text, continuation and any undecoded fragment. - | Other Text ByteString - -- Text followed by an undecodable ByteString - -instance Show Decoding where - showsPrec d (Some t bs _) = showParen (d > prec) $ - showString "Some " . showsPrec prec' t . - showChar ' ' . showsPrec prec' bs . - showString " _" - where prec = 10; prec' = prec + 1 - showsPrec d (Other t bs) = showParen (d > prec) $ - showString "Other " . showsPrec prec' t . - showChar ' ' . showsPrec prec' bs . - showString " _" - where prec = 10; prec' = prec + 1 - -newtype CodePoint = CodePoint Word32 deriving (Eq, Show, Num, Storable) -newtype DecoderState = DecoderState Word32 deriving (Eq, Show, Num, Storable) - --- Resolve a 'ByteString' into 'Text' and a continuation that can handle further 'ByteStrings'. -streamDecodeUtf8 :: ByteString -> Decoding -streamDecodeUtf8 = decodeChunkUtf8 B.empty 0 0 - where - decodeChunkUtf8 :: ByteString -> CodePoint -> DecoderState -> ByteString -> Decoding - decodeChunkUtf8 old codepoint0 state0 bs@(PS fp off len) = - runST $ do marray <- A.new (len+1) - unsafeIOToST (decodeChunkToBuffer marray) - where - decodeChunkToBuffer :: A.MArray s -> IO Decoding - decodeChunkToBuffer dest = withForeignPtr fp $ \ptr -> - with (0::CSize) $ \destOffPtr -> - with codepoint0 $ \codepointPtr -> - with state0 $ \statePtr -> - with nullPtr $ \curPtrPtr -> - do let end = ptr `plusPtr` (off + len) - curPtr = ptr `plusPtr` off - poke curPtrPtr curPtr - c_decode_utf8_with_state (A.maBA dest) destOffPtr curPtrPtr end codepointPtr statePtr - state <- peek statePtr - lastPtr <- peek curPtrPtr - codepoint <- peek codepointPtr - n <- peek destOffPtr - chunkText <- mkText dest n - let left = lastPtr `minusPtr` curPtr - remaining = B.drop left bs - accum = if T.null chunkText then B.append old remaining else remaining - return $! case state of - UTF8_REJECT -> Other chunkText accum -- We encountered an encoding error - _ -> Some chunkText accum (decodeChunkUtf8 accum codepoint state) - {-# INLINE decodeChunkToBuffer #-} - {-# INLINE decodeChunkUtf8 #-} -{-# INLINE streamDecodeUtf8 #-} - --- Resolve a ByteString into an initial segment of intelligible 'Text' and whatever is unintelligble -decodeSomeUtf8 :: ByteString -> (Text, ByteString) -decodeSomeUtf8 bs@(PS fp off len) = runST $ do - dest <- A.new (len+1) - unsafeIOToST $ - withForeignPtr fp $ \ptr -> - with (0::CSize) $ \destOffPtr -> - with (0::CodePoint) $ \codepointPtr -> - with (0::DecoderState) $ \statePtr -> - with nullPtr $ \curPtrPtr -> - do let end = ptr `plusPtr` (off + len) - curPtr = ptr `plusPtr` off - poke curPtrPtr curPtr - c_decode_utf8_with_state (A.maBA dest) destOffPtr - curPtrPtr end codepointPtr statePtr - state <- peek statePtr - lastPtr <- peek curPtrPtr - codepoint <- peek codepointPtr - n <- peek destOffPtr - chunkText <- unsafeSTToIO $ do arr <- A.unsafeFreeze dest - return $! textP arr 0 (fromIntegral n) - let left = lastPtr `minusPtr` curPtr - remaining = B.drop left bs - return $! (chunkText, remaining) -{-# INLINE decodeSomeUtf8 #-} - -mkText :: A.MArray s -> CSize -> IO Text -mkText dest n = unsafeSTToIO $ do arr <- A.unsafeFreeze dest - return $! textP arr 0 (fromIntegral n) -{-# INLINE mkText #-} - -ord :: Char -> Int -ord (C# c#) = I# (ord# c#) -{-# INLINE ord #-} - -unsafeWrite :: A.MArray s -> Int -> Char -> ST s Int -unsafeWrite marr i c - | n < 0x10000 = do A.unsafeWrite marr i (fromIntegral n) - return 1 - | otherwise = do A.unsafeWrite marr i lo - A.unsafeWrite marr (i+1) hi - return 2 - where n = ord c - m = n - 0x10000 - lo = fromIntegral $ (m `shiftR` 10) + 0xD800 - hi = fromIntegral $ (m .&. 0x3FF) + 0xDC00 - shiftR (I# x#) (I# i#) = I# (x# `iShiftRA#` i#) - {-# INLINE shiftR #-} -{-# INLINE unsafeWrite #-} - -foreign import ccall unsafe "_hs_pipes_text_decode_utf8_state" c_decode_utf8_with_state - :: MutableByteArray# s -> Ptr CSize - -> Ptr (Ptr Word8) -> Ptr Word8 - -> Ptr CodePoint -> Ptr DecoderState -> IO (Ptr Word8) \ No newline at end of file diff --git a/cbits/cbits.c b/cbits/cbits.c deleted file mode 100644 index c11645b..0000000 --- a/cbits/cbits.c +++ /dev/null @@ -1,168 +0,0 @@ -/* - * Copyright (c) 2011 Bryan O'Sullivan . - * - * Portions copyright (c) 2008-2010 Björn Höhrmann . - * - * See http://bjoern.hoehrmann.de/utf-8/decoder/dfa/ for details. - */ - -#include -#include -#include -#include "pipes_text_cbits.h" - - - -#define UTF8_ACCEPT 0 -#define UTF8_REJECT 12 - -static const uint8_t utf8d[] = { - /* - * The first part of the table maps bytes to character classes that - * to reduce the size of the transition table and create bitmasks. - */ - 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, - 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, - 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, - 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, - 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, - 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, - 8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, - 10,3,3,3,3,3,3,3,3,3,3,3,3,4,3,3, 11,6,6,6,5,8,8,8,8,8,8,8,8,8,8,8, - - /* - * The second part is a transition table that maps a combination of - * a state of the automaton and a character class to a state. - */ - 0,12,24,36,60,96,84,12,12,12,48,72, 12,12,12,12,12,12,12,12,12,12,12,12, - 12, 0,12,12,12,12,12, 0,12, 0,12,12, 12,24,12,12,12,12,12,24,12,24,12,12, - 12,12,12,12,12,12,12,24,12,12,12,12, 12,24,12,12,12,12,12,12,12,24,12,12, - 12,12,12,12,12,12,12,36,12,36,12,12, 12,36,12,12,12,12,12,36,12,36,12,12, - 12,36,12,12,12,12,12,12,12,12,12,12, -}; - -static inline uint32_t -decode(uint32_t *state, uint32_t* codep, uint32_t byte) { - uint32_t type = utf8d[byte]; - - *codep = (*state != UTF8_ACCEPT) ? - (byte & 0x3fu) | (*codep << 6) : - (0xff >> type) & (byte); - - return *state = utf8d[256 + *state + type]; -} - -/* - * A best-effort decoder. Runs until it hits either end of input or - * the start of an invalid byte sequence. - * - * At exit, we update *destoff with the next offset to write to, *src - * with the next source location past the last one successfully - * decoded, and return the next source location to read from. - * - * Moreover, we expose the internal decoder state (state0 and - * codepoint0), allowing one to restart the decoder after it - * terminates (say, due to a partial codepoint). - * - * In particular, there are a few possible outcomes, - * - * 1) We decoded the buffer entirely: - * In this case we return srcend - * state0 == UTF8_ACCEPT - * - * 2) We met an invalid encoding - * In this case we return the address of the first invalid byte - * state0 == UTF8_REJECT - * - * 3) We reached the end of the buffer while decoding a codepoint - * In this case we return a pointer to the first byte of the partial codepoint - * state0 != UTF8_ACCEPT, UTF8_REJECT - * - */ - - #if defined(__GNUC__) || defined(__clang__) - static inline uint8_t const * - _hs_pipes_text_decode_utf8_int(uint16_t *const dest, size_t *destoff, - const uint8_t const **src, const uint8_t const *srcend, - uint32_t *codepoint0, uint32_t *state0) - __attribute((always_inline)); - #endif - -static inline uint8_t const * -_hs_pipes_text_decode_utf8_int(uint16_t *const dest, size_t *destoff, - const uint8_t const **src, const uint8_t const *srcend, - uint32_t *codepoint0, uint32_t *state0) -{ - uint16_t *d = dest + *destoff; - const uint8_t *s = *src, *last = *src; - uint32_t state = *state0; - uint32_t codepoint = *codepoint0; - - while (s < srcend) { -#if defined(__i386__) || defined(__x86_64__) - /* - * This code will only work on a little-endian system that - * supports unaligned loads. - * - * It gives a substantial speed win on data that is purely or - * partly ASCII (e.g. HTML), at only a slight cost on purely - * non-ASCII text. - */ - - if (state == UTF8_ACCEPT) { - while (s < srcend - 4) { - codepoint = *((uint32_t *) s); - if ((codepoint & 0x80808080) != 0) - break; - s += 4; - - /* - * Tried 32-bit stores here, but the extra bit-twiddling - * slowed the code down. - */ - - *d++ = (uint16_t) (codepoint & 0xff); - *d++ = (uint16_t) ((codepoint >> 8) & 0xff); - *d++ = (uint16_t) ((codepoint >> 16) & 0xff); - *d++ = (uint16_t) ((codepoint >> 24) & 0xff); - } - last = s; - } -#endif - - if (decode(&state, &codepoint, *s++) != UTF8_ACCEPT) { - if (state != UTF8_REJECT) - continue; - break; - } - - if (codepoint <= 0xffff) - *d++ = (uint16_t) codepoint; - else { - *d++ = (uint16_t) (0xD7C0 + (codepoint >> 10)); - *d++ = (uint16_t) (0xDC00 + (codepoint & 0x3FF)); - } - last = s; - } - - *destoff = d - dest; - *codepoint0 = codepoint; - *state0 = state; - *src = last; - - return s; -} - -uint8_t const * -_hs_pipes_text_decode_utf8_state(uint16_t *const dest, size_t *destoff, - const uint8_t const **src, - const uint8_t const *srcend, - uint32_t *codepoint0, uint32_t *state0) -{ - uint8_t const *ret = _hs_pipes_text_decode_utf8_int(dest, destoff, src, srcend, - codepoint0, state0); - if (*state0 == UTF8_REJECT) - ret -=1; - return ret; -} - diff --git a/include/pipes_text_cbits.h b/include/pipes_text_cbits.h deleted file mode 100644 index b9ab670..0000000 --- a/include/pipes_text_cbits.h +++ /dev/null @@ -1,11 +0,0 @@ -/* - * Copyright (c) 2013 Bryan O'Sullivan . - */ - -#ifndef _pipes_text_cbits_h -#define _pipes_text_cbits_h - -#define UTF8_ACCEPT 0 -#define UTF8_REJECT 12 - -#endif diff --git a/pipes-text.cabal b/pipes-text.cabal index a1d57bb..017d41c 100644 --- a/pipes-text.cabal +++ b/pipes-text.cabal @@ -1,5 +1,5 @@ name: pipes-text -version: 0.0.0.6 +version: 0.0.0.7 synopsis: Text pipes. description: Many of the pipes and other operations defined here mirror those in the `pipes-bytestring` library. Folds like `length` and grouping @@ -36,23 +36,28 @@ source-repository head type: git location: https://github.com/michaelt/text-pipes +flag noio + default: False + Description: Use a version of text earlier than 0.11.3 library - c-sources: cbits/cbits.c - include-dirs: include - exposed-modules: Pipes.Text, Pipes.Text.Internal - other-modules: Pipes.Text.Internal.Decoding, Pipes.Text.Internal.Codec - other-extensions: RankNTypes - build-depends: base >= 4 && < 5 , - bytestring >=0.10 && < 0.11, - text >=0.11.3 && < 1.2, - profunctors >= 3.1.1 && < 4.1 , - pipes >=4.0 && < 4.2, - pipes-group >= 1.0.0 && < 1.1 , - pipes-parse >=2.0 && < 3.1, + exposed-modules: Pipes.Text, Pipes.Text.Encoding + build-depends: base >= 4 && < 5 , + bytestring >= 0.9 , + text >=0.11.3 && < 1.2, + text-stream-decode >= 0.1 && < 0.2, + profunctors >= 3.1.1 && < 4.1, + pipes >=4.0 && < 4.2, + pipes-group >= 1.0.0 && < 1.1, + pipes-parse >=2.0 && < 3.1, pipes-safe, - pipes-bytestring >= 1.0 && < 2.1, - transformers >= 0.2.0.0 && < 0.4 - -- hs-source-dirs: + pipes-bytestring >= 1.0 && < 2.1, + transformers >= 0.2.0.0 && < 0.4 + other-extensions: RankNTypes default-language: Haskell2010 - ghc-options: -O2 + ghc-options: -O2 + + if !flag(noio) + exposed-modules: Pipes.Text.IO + build-depends: text >=0.11.3 && < 1.2 +