import Pipes
import qualified Pipes.ByteString as PB
import qualified Pipes.Text.Internal as PE
-import Pipes.Text.Internal (Codec(..))
+import Pipes.Text.Codec (Codec(..))
import Pipes.Core (respond, Server')
import Pipes.Group (concats, intercalates, FreeT(..), FreeF(..))
import qualified Pipes.Group as PG
fromHandle :: MonadIO m => IO.Handle -> Producer Text m ()
fromHandle h = go where
go = do txt <- liftIO (T.hGetChunk h)
- unless (T.null txt) $ do yield txt
- go
+ unless (T.null txt) ( do yield txt
+ go )
{-# INLINABLE fromHandle#-}
readFile file = Safe.withFile file IO.ReadMode fromHandle
{-# INLINE readFile #-}
-{-| Stream lines of text from stdin (for testing in ghci etc.)
+{-| Crudely stream lines of input from stdin in the style of Pipes.Prelude.
+ This is for testing in ghci etc.; obviously it will be unsound if used to recieve
+ the contents of immense files with few newlines.
>>> let safely = runSafeT . runEffect
>>> safely $ for Text.stdinLn (lift . lift . print . T.length)
Unlike 'toHandle', 'stdout' gracefully terminates on a broken output pipe.
- Note: For best performance, use @(for source (liftIO . putStr))@ instead of
- @(source >-> stdout)@ in suitable cases.
+ Note: For best performance, it might be best just to use @(for source (liftIO . putStr))@
+ instead of @(source >-> stdout)@ .
-}
stdout :: MonadIO m => Consumer' Text m ()
stdout = go
{-# INLINABLE isEndOfChars #-}
-
-
-
--- | Transform a Pipe of 'ByteString's expected to be UTF-8 encoded into a Pipe of Text
--- returning a Pipe of ByteStrings that begins at the point of failure.
+-- | An improper lens into a stream of 'ByteString' expected to be UTF-8 encoded; the associated
+-- stream of Text ends by returning a stream of ByteStrings beginning at the point of failure.
decodeUtf8 :: Monad m => Lens' (Producer ByteString m r)
(Producer Text m (Producer ByteString m r))
(k (go B.empty PE.streamDecodeUtf8 p0)) where
go !carry dec0 p = do
x <- lift (next p)
- case x of Left r -> if B.null carry
- then return (return r) -- all bytestrinput was consumed
- else return (do yield carry -- a potentially valid fragment remains
- return r)
+ case x of Left r -> return (if B.null carry
+ then return r -- all bytestring input was consumed
+ else (do yield carry -- a potentially valid fragment remains
+ return r))
Right (chunk, p') -> case dec0 chunk of
PE.Some text carry2 dec -> do yield text
--- /dev/null
+
+{-# LANGUAGE DeriveDataTypeable, RankNTypes #-}
+-- |
+-- Copyright: 2014 Michael Thompson, 2011 Michael Snoyman, 2010-2011 John Millikin
+-- License: MIT
+--
+-- Handle streams of text.
+--
+-- Parts of this code were taken from enumerator and conduits, and adapted for pipes.
+
+module Pipes.Text.Codec
+ ( Decoding(..)
+ , streamDecodeUtf8
+ , decodeSomeUtf8
+ , Codec(..)
+ , TextException(..)
+ , utf8
+ ) where
+
+import Data.Bits ((.&.))
+import Data.Char (ord)
+import Data.ByteString as B
+import Data.ByteString (ByteString)
+import Data.ByteString.Internal as B
+import Data.ByteString.Char8 as B8
+import Data.Text (Text)
+import qualified Data.Text as T
+import qualified Data.Text.Encoding as TE
+import Data.Text.Encoding.Error ()
+import GHC.Word (Word8, Word32)
+import qualified Data.Text.Array as A
+import Data.Word (Word8, Word16)
+import System.IO.Unsafe (unsafePerformIO)
+import qualified Control.Exception as Exc
+import Data.Bits ((.&.), (.|.), shiftL)
+import Data.Typeable
+import Control.Arrow (first)
+import Data.Maybe (catMaybes)
+import Pipes.Text.Internal
+
+-- | A specific character encoding.
+--
+-- Since 0.3.0
+data Codec = Codec
+ { codecName :: Text
+ , codecEncode :: Text -> (ByteString, Maybe (TextException, Text))
+ , codecDecode :: ByteString -> Decoding -- (Text, Either (TextException, ByteString) ByteString)
+ }
+
+instance Show Codec where
+ showsPrec d c = showParen (d > 10) $
+ showString "Codec " . shows (codecName c)
+
+data TextException = DecodeException Codec Word8
+ | EncodeException Codec Char
+ | LengthExceeded Int
+ | TextException Exc.SomeException
+ deriving (Show, Typeable)
+instance Exc.Exception TextException
+
+
+toDecoding :: (ByteString -> (Text, Either (TextException, ByteString) ByteString))
+ -> (ByteString -> Decoding)
+toDecoding op = loop B.empty where
+ loop extra bs0 = case op (B.append extra bs0) of
+ (txt, Right bs) -> Some txt bs (loop bs)
+ (txt, Left (_,bs)) -> Other txt bs
+
+
+splitSlowly :: (ByteString -> Text)
+ -> ByteString
+ -> (Text, Either (TextException, ByteString) ByteString)
+splitSlowly dec bytes = valid where
+ valid:_ = catMaybes $ Prelude.map decFirst $ splits (B.length bytes)
+ splits 0 = [(B.empty, bytes)]
+ splits n = B.splitAt n bytes : splits (n - 1)
+ decFirst (a, b) = case tryEvaluate (dec a) of
+ Left _ -> Nothing
+ Right text -> let trouble = case tryEvaluate (dec b) of
+ Left exc -> Left (TextException exc, b)
+ Right _ -> Right B.empty
+ in Just (text, trouble) -- this case shouldn't occur,
+ -- since splitSlowly is only called
+ -- when parsing failed somewhere
+
+utf8 :: Codec
+utf8 = Codec name enc (toDecoding dec) where
+ name = T.pack "UTF-8"
+ enc text = (TE.encodeUtf8 text, Nothing)
+ dec bytes = case decodeSomeUtf8 bytes of
+ (t,b) -> (t, Right b)
+
+-- -- Whether the given byte is a continuation byte.
+-- isContinuation byte = byte .&. 0xC0 == 0x80
+--
+-- -- The number of continuation bytes needed by the given
+-- -- non-continuation byte. Returns -1 for an illegal UTF-8
+-- -- non-continuation byte and the whole split quickly must fail so
+-- -- as the input is passed to TE.decodeUtf8, which will issue a
+-- -- suitable error.
+-- required x0
+-- | x0 .&. 0x80 == 0x00 = 0
+-- | x0 .&. 0xE0 == 0xC0 = 1
+-- | x0 .&. 0xF0 == 0xE0 = 2
+-- | x0 .&. 0xF8 == 0xF0 = 3
+-- | otherwise = -1
+--
+-- splitQuickly bytes
+-- | B.null l || req == -1 = Nothing
+-- | req == B.length r = Just (TE.decodeUtf8 bytes, B.empty)
+-- | otherwise = Just (TE.decodeUtf8 l', r')
+-- where
+-- (l, r) = B.spanEnd isContinuation bytes
+-- req = required (B.last l)
+-- l' = B.init l
+-- r' = B.cons (B.last l) r
+
+
+utf16_le :: Codec
+utf16_le = Codec name enc (toDecoding dec) where
+ name = T.pack "UTF-16-LE"
+ enc text = (TE.encodeUtf16LE text, Nothing)
+ dec bytes = case splitQuickly bytes of
+ Just (text, extra) -> (text, Right extra)
+ Nothing -> splitSlowly TE.decodeUtf16LE bytes
+
+ splitQuickly bytes = maybeDecode (loop 0) where
+ maxN = B.length bytes
+
+ loop n | n == maxN = decodeAll
+ | (n + 1) == maxN = decodeTo n
+ loop n = let
+ req = utf16Required
+ (B.index bytes n)
+ (B.index bytes (n + 1))
+ decodeMore = loop $! n + req
+ in if n + req > maxN
+ then decodeTo n
+ else decodeMore
+
+ decodeTo n = first TE.decodeUtf16LE (B.splitAt n bytes)
+ decodeAll = (TE.decodeUtf16LE bytes, B.empty)
+
+utf16_be :: Codec
+utf16_be = Codec name enc (toDecoding dec) where
+ name = T.pack "UTF-16-BE"
+ enc text = (TE.encodeUtf16BE text, Nothing)
+ dec bytes = case splitQuickly bytes of
+ Just (text, extra) -> (text, Right extra)
+ Nothing -> splitSlowly TE.decodeUtf16BE bytes
+
+ splitQuickly bytes = maybeDecode (loop 0) where
+ maxN = B.length bytes
+
+ loop n | n == maxN = decodeAll
+ | (n + 1) == maxN = decodeTo n
+ loop n = let
+ req = utf16Required
+ (B.index bytes (n + 1))
+ (B.index bytes n)
+ decodeMore = loop $! n + req
+ in if n + req > maxN
+ then decodeTo n
+ else decodeMore
+
+ decodeTo n = first TE.decodeUtf16BE (B.splitAt n bytes)
+ decodeAll = (TE.decodeUtf16BE bytes, B.empty)
+
+utf16Required :: Word8 -> Word8 -> Int
+utf16Required x0 x1 = if x >= 0xD800 && x <= 0xDBFF then 4 else 2 where
+ x :: Word16
+ x = (fromIntegral x1 `shiftL` 8) .|. fromIntegral x0
+
+
+utf32_le :: Codec
+utf32_le = Codec name enc (toDecoding dec) where
+ name = T.pack "UTF-32-LE"
+ enc text = (TE.encodeUtf32LE text, Nothing)
+ dec bs = case utf32SplitBytes TE.decodeUtf32LE bs of
+ Just (text, extra) -> (text, Right extra)
+ Nothing -> splitSlowly TE.decodeUtf32LE bs
+
+
+utf32_be :: Codec
+utf32_be = Codec name enc (toDecoding dec) where
+ name = T.pack "UTF-32-BE"
+ enc text = (TE.encodeUtf32BE text, Nothing)
+ dec bs = case utf32SplitBytes TE.decodeUtf32BE bs of
+ Just (text, extra) -> (text, Right extra)
+ Nothing -> splitSlowly TE.decodeUtf32BE bs
+
+utf32SplitBytes :: (ByteString -> Text)
+ -> ByteString
+ -> Maybe (Text, ByteString)
+utf32SplitBytes dec bytes = split where
+ split = maybeDecode (dec toDecode, extra)
+ len = B.length bytes
+ lenExtra = mod len 4
+
+ lenToDecode = len - lenExtra
+ (toDecode, extra) = if lenExtra == 0
+ then (bytes, B.empty)
+ else B.splitAt lenToDecode bytes
+
+ascii :: Codec
+ascii = Codec name enc (toDecoding dec) where
+ name = T.pack "ASCII"
+ enc text = (bytes, extra) where
+ (safe, unsafe) = T.span (\c -> ord c <= 0x7F) text
+ bytes = B8.pack (T.unpack safe)
+ extra = if T.null unsafe
+ then Nothing
+ else Just (EncodeException ascii (T.head unsafe), unsafe)
+
+ dec bytes = (text, extra) where
+ (safe, unsafe) = B.span (<= 0x7F) bytes
+ text = T.pack (B8.unpack safe)
+ extra = if B.null unsafe
+ then Right B.empty
+ else Left (DecodeException ascii (B.head unsafe), unsafe)
+
+iso8859_1 :: Codec
+iso8859_1 = Codec name enc (toDecoding dec) where
+ name = T.pack "ISO-8859-1"
+ enc text = (bytes, extra) where
+ (safe, unsafe) = T.span (\c -> ord c <= 0xFF) text
+ bytes = B8.pack (T.unpack safe)
+ extra = if T.null unsafe
+ then Nothing
+ else Just (EncodeException iso8859_1 (T.head unsafe), unsafe)
+
+ dec bytes = (T.pack (B8.unpack bytes), Right B.empty)
+
+tryEvaluate :: a -> Either Exc.SomeException a
+tryEvaluate = unsafePerformIO . Exc.try . Exc.evaluate
+
+maybeDecode :: (a, b) -> Maybe (a, b)
+maybeDecode (a, b) = case tryEvaluate a of
+ Left _ -> Nothing
+ Right _ -> Just (a, b)
( Decoding(..)
, streamDecodeUtf8
, decodeSomeUtf8
- , Codec(..)
- , TextException(..)
- , utf8
) where
import Control.Monad.ST.Unsafe (unsafeIOToST, unsafeSTToIO)
import Control.Monad.ST (ST, runST)
#include "pipes_text_cbits.h"
--- | A specific character encoding.
---
--- Since 0.3.0
-data Codec = Codec
- { codecName :: Text
- , codecEncode :: Text -> (ByteString, Maybe (TextException, Text))
- , codecDecode :: ByteString -> Decoding -- (Text, Either (TextException, ByteString) ByteString)
- }
-
-instance Show Codec where
- showsPrec d c = showParen (d > 10) $ showString "Codec " . shows (codecName c)
-
--- Since 0.3.0
-data TextException = DecodeException Codec Word8
- | EncodeException Codec Char
- | LengthExceeded Int
- | TextException Exc.SomeException
- deriving (Show, Typeable)
-instance Exc.Exception TextException
-
-toDecoding :: (ByteString -> (Text, Either (TextException, ByteString) ByteString))
- -> (ByteString -> Decoding)
-toDecoding op = loop B.empty where
- loop extra bs0 = case op (B.append extra bs0) of
- (txt, Right bs) -> Some txt bs (loop bs)
- (txt, Left (_,bs)) -> Other txt bs
-
-
-splitSlowly :: (ByteString -> Text)
- -> ByteString
- -> (Text, Either (TextException, ByteString) ByteString)
-splitSlowly dec bytes = valid where
- valid:_ = catMaybes $ Prelude.map decFirst $ splits (B.length bytes)
- splits 0 = [(B.empty, bytes)]
- splits n = B.splitAt n bytes : splits (n - 1)
- decFirst (a, b) = case tryEvaluate (dec a) of
- Left _ -> Nothing
- Right text -> let trouble = case tryEvaluate (dec b) of
- Left exc -> Left (TextException exc, b)
- Right _ -> Right B.empty
- in Just (text, trouble)
- -- this case shouldn't occur,
- -- since splitSlowly is only called
- -- when parsing failed somewhere
-
-utf8 :: Codec
-utf8 = Codec name enc (toDecoding dec) where
- name = T.pack "UTF-8"
- enc text = (TE.encodeUtf8 text, Nothing)
- dec bytes = case decodeSomeUtf8 bytes of
- (t,b) -> (t, Right b)
-
--- -- Whether the given byte is a continuation byte.
--- isContinuation byte = byte .&. 0xC0 == 0x80
---
--- -- The number of continuation bytes needed by the given
--- -- non-continuation byte. Returns -1 for an illegal UTF-8
--- -- non-continuation byte and the whole split quickly must fail so
--- -- as the input is passed to TE.decodeUtf8, which will issue a
--- -- suitable error.
--- required x0
--- | x0 .&. 0x80 == 0x00 = 0
--- | x0 .&. 0xE0 == 0xC0 = 1
--- | x0 .&. 0xF0 == 0xE0 = 2
--- | x0 .&. 0xF8 == 0xF0 = 3
--- | otherwise = -1
---
--- splitQuickly bytes
--- | B.null l || req == -1 = Nothing
--- | req == B.length r = Just (TE.decodeUtf8 bytes, B.empty)
--- | otherwise = Just (TE.decodeUtf8 l', r')
--- where
--- (l, r) = B.spanEnd isContinuation bytes
--- req = required (B.last l)
--- l' = B.init l
--- r' = B.cons (B.last l) r
-
--- |
--- Since 0.3.0
-utf16_le :: Codec
-utf16_le = Codec name enc (toDecoding dec) where
- name = T.pack "UTF-16-LE"
- enc text = (TE.encodeUtf16LE text, Nothing)
- dec bytes = case splitQuickly bytes of
- Just (text, extra) -> (text, Right extra)
- Nothing -> splitSlowly TE.decodeUtf16LE bytes
-
- splitQuickly bytes = maybeDecode (loop 0) where
- maxN = B.length bytes
-
- loop n | n == maxN = decodeAll
- | (n + 1) == maxN = decodeTo n
- loop n = let
- req = utf16Required
- (B.index bytes n)
- (B.index bytes (n + 1))
- decodeMore = loop $! n + req
- in if n + req > maxN
- then decodeTo n
- else decodeMore
-
- decodeTo n = first TE.decodeUtf16LE (B.splitAt n bytes)
- decodeAll = (TE.decodeUtf16LE bytes, B.empty)
-
--- |
--- Since 0.3.0
-utf16_be :: Codec
-utf16_be = Codec name enc (toDecoding dec) where
- name = T.pack "UTF-16-BE"
- enc text = (TE.encodeUtf16BE text, Nothing)
- dec bytes = case splitQuickly bytes of
- Just (text, extra) -> (text, Right extra)
- Nothing -> splitSlowly TE.decodeUtf16BE bytes
-
- splitQuickly bytes = maybeDecode (loop 0) where
- maxN = B.length bytes
-
- loop n | n == maxN = decodeAll
- | (n + 1) == maxN = decodeTo n
- loop n = let
- req = utf16Required
- (B.index bytes (n + 1))
- (B.index bytes n)
- decodeMore = loop $! n + req
- in if n + req > maxN
- then decodeTo n
- else decodeMore
-
- decodeTo n = first TE.decodeUtf16BE (B.splitAt n bytes)
- decodeAll = (TE.decodeUtf16BE bytes, B.empty)
-
-utf16Required :: Word8 -> Word8 -> Int
-utf16Required x0 x1 = if x >= 0xD800 && x <= 0xDBFF then 4 else 2 where
- x :: Word16
- x = (fromIntegral x1 `shiftL` 8) .|. fromIntegral x0
-
--- |
--- Since 0.3.0
-utf32_le :: Codec
-utf32_le = Codec name enc (toDecoding dec) where
- name = T.pack "UTF-32-LE"
- enc text = (TE.encodeUtf32LE text, Nothing)
- dec bs = case utf32SplitBytes TE.decodeUtf32LE bs of
- Just (text, extra) -> (text, Right extra)
- Nothing -> splitSlowly TE.decodeUtf32LE bs
-
--- |
--- Since 0.3.0
-utf32_be :: Codec
-utf32_be = Codec name enc (toDecoding dec) where
- name = T.pack "UTF-32-BE"
- enc text = (TE.encodeUtf32BE text, Nothing)
- dec bs = case utf32SplitBytes TE.decodeUtf32BE bs of
- Just (text, extra) -> (text, Right extra)
- Nothing -> splitSlowly TE.decodeUtf32BE bs
-
-utf32SplitBytes :: (ByteString -> Text)
- -> ByteString
- -> Maybe (Text, ByteString)
-utf32SplitBytes dec bytes = split where
- split = maybeDecode (dec toDecode, extra)
- len = B.length bytes
- lenExtra = mod len 4
-
- lenToDecode = len - lenExtra
- (toDecode, extra) = if lenExtra == 0
- then (bytes, B.empty)
- else B.splitAt lenToDecode bytes
-
--- |
--- Since 0.3.0
-ascii :: Codec
-ascii = Codec name enc (toDecoding dec) where
- name = T.pack "ASCII"
- enc text = (bytes, extra) where
- (safe, unsafe) = T.span (\c -> ord c <= 0x7F) text
- bytes = B8.pack (T.unpack safe)
- extra = if T.null unsafe
- then Nothing
- else Just (EncodeException ascii (T.head unsafe), unsafe)
-
- dec bytes = (text, extra) where
- (safe, unsafe) = B.span (<= 0x7F) bytes
- text = T.pack (B8.unpack safe)
- extra = if B.null unsafe
- then Right B.empty
- else Left (DecodeException ascii (B.head unsafe), unsafe)
-
--- |
--- Since 0.3.0
-iso8859_1 :: Codec
-iso8859_1 = Codec name enc (toDecoding dec) where
- name = T.pack "ISO-8859-1"
- enc text = (bytes, extra) where
- (safe, unsafe) = T.span (\c -> ord c <= 0xFF) text
- bytes = B8.pack (T.unpack safe)
- extra = if T.null unsafe
- then Nothing
- else Just (EncodeException iso8859_1 (T.head unsafe), unsafe)
-
- dec bytes = (T.pack (B8.unpack bytes), Right B.empty)
-
-tryEvaluate :: a -> Either Exc.SomeException a
-tryEvaluate = unsafePerformIO . Exc.try . Exc.evaluate
-
-maybeDecode :: (a, b) -> Maybe (a, b)
-maybeDecode (a, b) = case tryEvaluate a of
- Left _ -> Nothing
- Right _ -> Just (a, b)
-- | A stream oriented decoding result.
data Decoding = Some Text ByteString (ByteString -> Decoding)
library
c-sources: cbits/cbits.c
include-dirs: include
- exposed-modules: Pipes.Text, Pipes.Text.Internal
+ exposed-modules: Pipes.Text, Pipes.Text.Internal, Pipes.Text.Codec
-- other-modules:
other-extensions: RankNTypes
build-depends: base >= 4 && < 5 ,