{-# LANGUAGE RankNTypes, TypeFamilies, BangPatterns #-}
-
+#if __GLASGOW_HASKELL__ >= 702
+{-# LANGUAGE Trustworthy #-}
+#endif
{-| This module provides @pipes@ utilities for \"text streams\", which are
- streams of 'Text' chunks. The individual chunks are uniformly @strict@, but
- a 'Producer' can be converted to and from lazy 'Text's; an 'IO.Handle' can
- be associated with a 'Producer' or 'Consumer' according as it is read or written to.
+ streams of 'Text' chunks. The individual chunks are uniformly @strict@, but
+ a 'Producer' can be converted to and from lazy 'Text's, though this is generally
+ unwise. Where pipes IO replaces lazy IO, 'Producer Text m r' replaces lazy 'Text'.
+ An 'IO.Handle' can be associated with a 'Producer' or 'Consumer' according as it is read or written to.
To stream to or from 'IO.Handle's, one can use 'fromHandle' or 'toHandle'. For
example, the following program copies a document from one file to another:
Note that functions in this library are designed to operate on streams that
are insensitive to text boundaries. This means that they may freely split
- text into smaller texts and /discard empty texts/. However, they will
- /never concatenate texts/ in order to provide strict upper bounds on memory
- usage.
+ text into smaller texts, /discard empty texts/. However, apart from the
+ special case of 'concatMap', they will /never concatenate texts/ in order
+ to provide strict upper bounds on memory usage -- with the single exception of 'concatMap'.
-}
module Pipes.Text (
-- * Folds
toLazy,
toLazyM,
- fold,
+ foldChars,
head,
last,
null,
lines,
words,
decodeUtf8,
+ decode,
-- * Transformations
intersperse,
) where
import Control.Exception (throwIO, try)
-import Control.Monad (liftM, unless)
+import Control.Monad (liftM, unless, join)
import Control.Monad.Trans.State.Strict (StateT(..))
import Data.Monoid ((<>))
import qualified Data.Text as T
import qualified GHC.IO.Exception as G
import Pipes
import qualified Pipes.ByteString as PB
-import qualified Pipes.ByteString.Parse as PBP
+import qualified Pipes.ByteString as PBP
import qualified Pipes.Text.Internal as PE
+import Pipes.Text.Internal (Codec(..))
import Pipes.Text.Parse (
nextChar, drawChar, unDrawChar, peekChar, isEndOfChars )
import Pipes.Core (respond, Server')
import qualified Pipes.Parse as PP
-import Pipes.Parse (input, concat, FreeT)
+import Pipes.Parse ( FreeT)
import qualified Pipes.Safe.Prelude as Safe
import qualified Pipes.Safe as Safe
import Pipes.Safe (MonadSafe(..), Base(..))
{-# INLINABLE toLazyM #-}
-- | Reduce the text stream using a strict left fold over characters
-fold
+foldChars
:: Monad m
=> (x -> Char -> x) -> x -> (x -> r) -> Producer Text m () -> m r
-fold step begin done = P.fold (T.foldl' step) begin done
+foldChars step begin done = P.fold (T.foldl' step) begin done
{-# INLINABLE fold #-}
-- | Retrieve the first 'Char'
@Data.Text@ re-exports the 'Text' type.
@Pipes.Parse@ re-exports 'input', 'concat', and 'FreeT' (the type).
--}
\ No newline at end of file
+-}
+
+
+
+decode :: Monad m => PE.Decoding -> Producer ByteString m r -> Producer Text m (Producer ByteString m r)
+-- decode codec = go B.empty where
+-- go extra p0 =
+-- do x <- lift (next p0)
+-- case x of Right (chunk, p) ->
+-- do let (text, stuff) = codecDecode codec (B.append extra chunk)
+-- yield text
+-- case stuff of Right extra' -> go extra' p
+-- Left (exc,bs) -> do yield text
+-- return (do yield bs
+-- p)
+-- Left r -> return (do yield extra
+-- return r)
+
+decode d p0 = case d of
+ PE.Other txt bad -> do yield txt
+ return (do yield bad
+ p0)
+ PE.Some txt extra dec -> do yield txt
+ x <- lift (next p0)
+ case x of Left r -> return (do yield extra
+ return r)
+ Right (chunk,p1) -> decode (dec chunk) p1
+
+-- go !carry dec0 p = do
+-- x <- lift (next p)
+-- case x of Left r -> if B.null carry
+-- then return (return r) -- all bytestrinput was consumed
+-- else return (do yield carry -- a potentially valid fragment remains
+-- return r)
+--
+-- Right (chunk, p') -> case dec0 chunk of
+-- PE.Some text carry2 dec -> do yield text
+-- go carry2 dec p'
+-- PE.Other text bs -> do yield text
+-- return (do yield bs -- an invalid blob remains
+-- p')
+-- {-# INLINABLE decodeUtf8 #-}
-{-# LANGUAGE BangPatterns, CPP, ForeignFunctionInterface, GeneralizedNewtypeDeriving, MagicHash,
- UnliftedFFITypes #-}
+{-# LANGUAGE BangPatterns, CPP, ForeignFunctionInterface #-}
+{-# LANGUAGE GeneralizedNewtypeDeriving, MagicHash, UnliftedFFITypes #-}
+{-# LANGUAGE DeriveDataTypeable, RankNTypes #-}
+
-- This module lifts assorted materials from Brian O'Sullivan's text package
-- especially Data.Text.Encoding in order to define a pipes-appropriate
-- streamDecodeUtf8
( Decoding(..)
, streamDecodeUtf8
, decodeSomeUtf8
+ , Codec(..)
+ , TextException(..)
+ , utf8
) where
import Control.Monad.ST.Unsafe (unsafeIOToST, unsafeSTToIO)
import Control.Monad.ST (ST, runST)
import Data.Bits ((.&.))
import Data.ByteString as B
+import Data.ByteString (ByteString)
import Data.ByteString.Internal as B
-import qualified Data.Text as T (null)
+import Data.ByteString.Char8 as B8
+import Data.Text (Text)
+import qualified Data.Text as T
+import qualified Data.Text.Encoding as TE
import Data.Text.Encoding.Error ()
import Data.Text.Internal (Text, textP)
import Foreign.C.Types (CSize)
import GHC.Base (Char(..), Int(..), MutableByteArray#, ord#, iShiftRA#)
import GHC.Word (Word8, Word32)
import qualified Data.Text.Array as A
-
+import Data.Word (Word8, Word16)
+import System.IO.Unsafe (unsafePerformIO)
+import qualified Control.Exception as Exc
+import Data.Bits ((.&.), (.|.), shiftL)
+import Data.Typeable
+import Control.Arrow (first)
+import Data.Maybe (catMaybes)
#include "pipes_text_cbits.h"
+
+-- | A specific character encoding.
+--
+-- Since 0.3.0
+data Codec = Codec
+ { codecName :: Text
+ , codecEncode :: Text -> (ByteString, Maybe (TextException, Text))
+ , codecDecode :: ByteString -> Decoding -- (Text, Either (TextException, ByteString) ByteString)
+ }
+
+instance Show Codec where
+ showsPrec d c = showParen (d > 10) $ showString "Codec " . shows (codecName c)
+
+-- Since 0.3.0
+data TextException = DecodeException Codec Word8
+ | EncodeException Codec Char
+ | LengthExceeded Int
+ | TextException Exc.SomeException
+ deriving (Show, Typeable)
+instance Exc.Exception TextException
+
+toDecoding :: (ByteString -> (Text, Either (TextException, ByteString) ByteString))
+ -> (ByteString -> Decoding)
+toDecoding op = loop B.empty where
+ loop extra bs0 = case op (B.append extra bs0) of
+ (txt, Right bs) -> Some txt bs (loop bs)
+ (txt, Left (_,bs)) -> Other txt bs
+
+
+splitSlowly :: (ByteString -> Text)
+ -> ByteString
+ -> (Text, Either (TextException, ByteString) ByteString)
+splitSlowly dec bytes = valid where
+ valid:_ = catMaybes $ Prelude.map decFirst $ splits (B.length bytes)
+ splits 0 = [(B.empty, bytes)]
+ splits n = B.splitAt n bytes : splits (n - 1)
+ decFirst (a, b) = case tryEvaluate (dec a) of
+ Left _ -> Nothing
+ Right text -> let trouble = case tryEvaluate (dec b) of
+ Left exc -> Left (TextException exc, b)
+ Right _ -> Right B.empty
+ in Just (text, trouble)
+ -- this case shouldn't occur,
+ -- since splitSlowly is only called
+ -- when parsing failed somewhere
+
+utf8 :: Codec
+utf8 = Codec name enc (toDecoding dec) where
+ name = T.pack "UTF-8"
+ enc text = (TE.encodeUtf8 text, Nothing)
+ dec bytes = case decodeSomeUtf8 bytes of
+ (t,b) -> (t, Right b)
+
+-- -- Whether the given byte is a continuation byte.
+-- isContinuation byte = byte .&. 0xC0 == 0x80
+--
+-- -- The number of continuation bytes needed by the given
+-- -- non-continuation byte. Returns -1 for an illegal UTF-8
+-- -- non-continuation byte and the whole split quickly must fail so
+-- -- as the input is passed to TE.decodeUtf8, which will issue a
+-- -- suitable error.
+-- required x0
+-- | x0 .&. 0x80 == 0x00 = 0
+-- | x0 .&. 0xE0 == 0xC0 = 1
+-- | x0 .&. 0xF0 == 0xE0 = 2
+-- | x0 .&. 0xF8 == 0xF0 = 3
+-- | otherwise = -1
+--
+-- splitQuickly bytes
+-- | B.null l || req == -1 = Nothing
+-- | req == B.length r = Just (TE.decodeUtf8 bytes, B.empty)
+-- | otherwise = Just (TE.decodeUtf8 l', r')
+-- where
+-- (l, r) = B.spanEnd isContinuation bytes
+-- req = required (B.last l)
+-- l' = B.init l
+-- r' = B.cons (B.last l) r
+
+-- |
+-- Since 0.3.0
+utf16_le :: Codec
+utf16_le = Codec name enc (toDecoding dec) where
+ name = T.pack "UTF-16-LE"
+ enc text = (TE.encodeUtf16LE text, Nothing)
+ dec bytes = case splitQuickly bytes of
+ Just (text, extra) -> (text, Right extra)
+ Nothing -> splitSlowly TE.decodeUtf16LE bytes
+
+ splitQuickly bytes = maybeDecode (loop 0) where
+ maxN = B.length bytes
+
+ loop n | n == maxN = decodeAll
+ | (n + 1) == maxN = decodeTo n
+ loop n = let
+ req = utf16Required
+ (B.index bytes n)
+ (B.index bytes (n + 1))
+ decodeMore = loop $! n + req
+ in if n + req > maxN
+ then decodeTo n
+ else decodeMore
+
+ decodeTo n = first TE.decodeUtf16LE (B.splitAt n bytes)
+ decodeAll = (TE.decodeUtf16LE bytes, B.empty)
+
+-- |
+-- Since 0.3.0
+utf16_be :: Codec
+utf16_be = Codec name enc (toDecoding dec) where
+ name = T.pack "UTF-16-BE"
+ enc text = (TE.encodeUtf16BE text, Nothing)
+ dec bytes = case splitQuickly bytes of
+ Just (text, extra) -> (text, Right extra)
+ Nothing -> splitSlowly TE.decodeUtf16BE bytes
+
+ splitQuickly bytes = maybeDecode (loop 0) where
+ maxN = B.length bytes
+
+ loop n | n == maxN = decodeAll
+ | (n + 1) == maxN = decodeTo n
+ loop n = let
+ req = utf16Required
+ (B.index bytes (n + 1))
+ (B.index bytes n)
+ decodeMore = loop $! n + req
+ in if n + req > maxN
+ then decodeTo n
+ else decodeMore
+
+ decodeTo n = first TE.decodeUtf16BE (B.splitAt n bytes)
+ decodeAll = (TE.decodeUtf16BE bytes, B.empty)
+
+utf16Required :: Word8 -> Word8 -> Int
+utf16Required x0 x1 = if x >= 0xD800 && x <= 0xDBFF then 4 else 2 where
+ x :: Word16
+ x = (fromIntegral x1 `shiftL` 8) .|. fromIntegral x0
+
+-- |
+-- Since 0.3.0
+utf32_le :: Codec
+utf32_le = Codec name enc (toDecoding dec) where
+ name = T.pack "UTF-32-LE"
+ enc text = (TE.encodeUtf32LE text, Nothing)
+ dec bs = case utf32SplitBytes TE.decodeUtf32LE bs of
+ Just (text, extra) -> (text, Right extra)
+ Nothing -> splitSlowly TE.decodeUtf32LE bs
+
+-- |
+-- Since 0.3.0
+utf32_be :: Codec
+utf32_be = Codec name enc (toDecoding dec) where
+ name = T.pack "UTF-32-BE"
+ enc text = (TE.encodeUtf32BE text, Nothing)
+ dec bs = case utf32SplitBytes TE.decodeUtf32BE bs of
+ Just (text, extra) -> (text, Right extra)
+ Nothing -> splitSlowly TE.decodeUtf32BE bs
+
+utf32SplitBytes :: (ByteString -> Text)
+ -> ByteString
+ -> Maybe (Text, ByteString)
+utf32SplitBytes dec bytes = split where
+ split = maybeDecode (dec toDecode, extra)
+ len = B.length bytes
+ lenExtra = mod len 4
+
+ lenToDecode = len - lenExtra
+ (toDecode, extra) = if lenExtra == 0
+ then (bytes, B.empty)
+ else B.splitAt lenToDecode bytes
+
+-- |
+-- Since 0.3.0
+ascii :: Codec
+ascii = Codec name enc (toDecoding dec) where
+ name = T.pack "ASCII"
+ enc text = (bytes, extra) where
+ (safe, unsafe) = T.span (\c -> ord c <= 0x7F) text
+ bytes = B8.pack (T.unpack safe)
+ extra = if T.null unsafe
+ then Nothing
+ else Just (EncodeException ascii (T.head unsafe), unsafe)
+
+ dec bytes = (text, extra) where
+ (safe, unsafe) = B.span (<= 0x7F) bytes
+ text = T.pack (B8.unpack safe)
+ extra = if B.null unsafe
+ then Right B.empty
+ else Left (DecodeException ascii (B.head unsafe), unsafe)
+
+-- |
+-- Since 0.3.0
+iso8859_1 :: Codec
+iso8859_1 = Codec name enc (toDecoding dec) where
+ name = T.pack "ISO-8859-1"
+ enc text = (bytes, extra) where
+ (safe, unsafe) = T.span (\c -> ord c <= 0xFF) text
+ bytes = B8.pack (T.unpack safe)
+ extra = if T.null unsafe
+ then Nothing
+ else Just (EncodeException iso8859_1 (T.head unsafe), unsafe)
+
+ dec bytes = (T.pack (B8.unpack bytes), Right B.empty)
+
+tryEvaluate :: a -> Either Exc.SomeException a
+tryEvaluate = unsafePerformIO . Exc.try . Exc.evaluate
+
+maybeDecode :: (a, b) -> Maybe (a, b)
+maybeDecode (a, b) = case tryEvaluate a of
+ Left _ -> Nothing
+ Right _ -> Just (a, b)
+
-- | A stream oriented decoding result.
data Decoding = Some Text ByteString (ByteString -> Decoding)
| Other Text ByteString
return $! (chunkText, remaining)
{-# INLINE decodeSomeUtf8 #-}
--- decodeSomeUtf8 :: ByteString -> (Text, ByteString)
--- decodeSomeUtf8 bs@(PS fp off len) =
--- runST $ do marray <- A.new (len+1)
--- unsafeIOToST (decodeChunkToBuffer marray)
---
--- where
--- decodeChunkToBuffer :: A.MArray s -> IO (Text, ByteString)
--- decodeChunkToBuffer dest = withForeignPtr fp $ \ptr ->
--- with (0::CSize) $ \destOffPtr ->
--- with (0::CodePoint) $ \codepointPtr ->
--- with (0::DecoderState) $ \statePtr ->
--- with nullPtr $ \curPtrPtr ->
--- do let end = ptr `plusPtr` (off + len)
--- curPtr = ptr `plusPtr` off
--- poke curPtrPtr curPtr
--- c_decode_utf8_with_state (A.maBA dest) destOffPtr curPtrPtr end codepointPtr statePtr
--- state <- peek statePtr
--- lastPtr <- peek curPtrPtr
--- codepoint <- peek codepointPtr
--- n <- peek destOffPtr
--- chunkText <- unsafeSTToIO $ do arr <- A.unsafeFreeze dest
--- return $! textP arr 0 (fromIntegral n)
--- let left = lastPtr `minusPtr` curPtr
--- remaining = B.drop left bs
--- return $! (chunkText, remaining)
--- {-# INLINE decodeChunkToBuffer #-}
--- {-# INLINE decodeSomeUtf8 #-}
-
-
-
mkText :: A.MArray s -> CSize -> IO Text
mkText dest n = unsafeSTToIO $ do arr <- A.unsafeFreeze dest
return $! textP arr 0 (fromIntegral n)