, module Data.Word
, module Pipes.Parse
, module Pipes.Group
+ , module Pipes.Text.Internal.Codec
) where
import Control.Exception (throwIO, try)
import qualified GHC.IO.Exception as G
import Pipes
import qualified Pipes.ByteString as PB
-import qualified Pipes.Text.Internal as PE
-import Pipes.Text.Codec
+import qualified Pipes.Text.Internal.Decoding as PE
+import Pipes.Text.Internal.Codec
import Pipes.Core (respond, Server')
import Pipes.Group (concats, intercalates, FreeT(..), FreeF(..))
import qualified Pipes.Group as PG
--- /dev/null
+
+{-# LANGUAGE DeriveDataTypeable, RankNTypes, BangPatterns #-}
+-- |
+-- Copyright: 2014 Michael Thompson, 2011 Michael Snoyman, 2010-2011 John Millikin
+-- License: MIT
+--
+-- Parts of this code were taken from enumerator and conduits, and adapted for pipes.
+
+module Pipes.Text.Internal.Codec
+ ( Decoding(..)
+ , streamDecodeUtf8
+ , decodeSomeUtf8
+ , Codec(..)
+ , TextException(..)
+ , utf8
+ , utf16_le
+ , utf16_be
+ , utf32_le
+ , utf32_be
+ ) where
+
+import Data.Bits ((.&.))
+import Data.Char (ord)
+import Data.ByteString as B
+import Data.ByteString (ByteString)
+import Data.ByteString.Internal as B
+import Data.ByteString.Char8 as B8
+import Data.Text (Text)
+import qualified Data.Text as T
+import qualified Data.Text.Encoding as TE
+import Data.Text.Encoding.Error ()
+import GHC.Word (Word8, Word32)
+import qualified Data.Text.Array as A
+import Data.Word (Word8, Word16)
+import System.IO.Unsafe (unsafePerformIO)
+import qualified Control.Exception as Exc
+import Data.Bits ((.&.), (.|.), shiftL)
+import Data.Typeable
+import Control.Arrow (first)
+import Data.Maybe (catMaybes)
+import Pipes.Text.Internal.Decoding
+import Pipes
+-- | A specific character encoding.
+--
+-- Since 0.3.0
+data Codec = Codec
+ { codecName :: Text
+ , codecEncode :: Text -> (ByteString, Maybe (TextException, Text))
+ , codecDecode :: ByteString -> Decoding -- (Text, Either (TextException, ByteString) ByteString)
+ }
+
+instance Show Codec where
+ showsPrec d c = showParen (d > 10) $
+ showString "Codec " . shows (codecName c)
+
+data TextException = DecodeException Codec Word8
+ | EncodeException Codec Char
+ | LengthExceeded Int
+ | TextException Exc.SomeException
+ deriving (Show, Typeable)
+instance Exc.Exception TextException
+
+
+toDecoding :: (ByteString -> (Text, Either (TextException, ByteString) ByteString))
+ -> (ByteString -> Decoding)
+toDecoding op = loop B.empty where
+ loop !extra bs0 = case op (B.append extra bs0) of
+ (txt, Right bs) -> Some txt bs (loop bs)
+ (txt, Left (_,bs)) -> Other txt bs
+-- To do: toDecoding should be inlined in each of the 'Codec' definitions
+-- or else Codec changed to the conduit/enumerator definition. We have
+-- altered it to use 'streamDecodeUtf8'
+
+splitSlowly :: (ByteString -> Text)
+ -> ByteString
+ -> (Text, Either (TextException, ByteString) ByteString)
+splitSlowly dec bytes = valid where
+ valid:_ = catMaybes $ Prelude.map decFirst $ splits (B.length bytes)
+ splits 0 = [(B.empty, bytes)]
+ splits n = B.splitAt n bytes : splits (n - 1)
+ decFirst (a, b) = case tryEvaluate (dec a) of
+ Left _ -> Nothing
+ Right text -> let trouble = case tryEvaluate (dec b) of
+ Left exc -> Left (TextException exc, b)
+ Right _ -> Right B.empty
+ in Just (text, trouble) -- this case shouldn't occur,
+ -- since splitSlowly is only called
+ -- when parsing failed somewhere
+
+utf8 :: Codec
+utf8 = Codec name enc (toDecoding dec) where
+ name = T.pack "UTF-8"
+ enc text = (TE.encodeUtf8 text, Nothing)
+ dec bytes = case decodeSomeUtf8 bytes of (t,b) -> (t, Right b)
+
+-- -- Whether the given byte is a continuation byte.
+-- isContinuation byte = byte .&. 0xC0 == 0x80
+--
+-- -- The number of continuation bytes needed by the given
+-- -- non-continuation byte. Returns -1 for an illegal UTF-8
+-- -- non-continuation byte and the whole split quickly must fail so
+-- -- as the input is passed to TE.decodeUtf8, which will issue a
+-- -- suitable error.
+-- required x0
+-- | x0 .&. 0x80 == 0x00 = 0
+-- | x0 .&. 0xE0 == 0xC0 = 1
+-- | x0 .&. 0xF0 == 0xE0 = 2
+-- | x0 .&. 0xF8 == 0xF0 = 3
+-- | otherwise = -1
+--
+-- splitQuickly bytes
+-- | B.null l || req == -1 = Nothing
+-- | req == B.length r = Just (TE.decodeUtf8 bytes, B.empty)
+-- | otherwise = Just (TE.decodeUtf8 l', r')
+-- where
+-- (l, r) = B.spanEnd isContinuation bytes
+-- req = required (B.last l)
+-- l' = B.init l
+-- r' = B.cons (B.last l) r
+
+
+utf16_le :: Codec
+utf16_le = Codec name enc (toDecoding dec) where
+ name = T.pack "UTF-16-LE"
+ enc text = (TE.encodeUtf16LE text, Nothing)
+ dec bytes = case splitQuickly bytes of
+ Just (text, extra) -> (text, Right extra)
+ Nothing -> splitSlowly TE.decodeUtf16LE bytes
+
+ splitQuickly bytes = maybeDecode (loop 0) where
+ maxN = B.length bytes
+
+ loop n | n == maxN = decodeAll
+ | (n + 1) == maxN = decodeTo n
+ loop n = let
+ req = utf16Required
+ (B.index bytes n)
+ (B.index bytes (n + 1))
+ decodeMore = loop $! n + req
+ in if n + req > maxN
+ then decodeTo n
+ else decodeMore
+
+ decodeTo n = first TE.decodeUtf16LE (B.splitAt n bytes)
+ decodeAll = (TE.decodeUtf16LE bytes, B.empty)
+
+utf16_be :: Codec
+utf16_be = Codec name enc (toDecoding dec) where
+ name = T.pack "UTF-16-BE"
+ enc text = (TE.encodeUtf16BE text, Nothing)
+ dec bytes = case splitQuickly bytes of
+ Just (text, extra) -> (text, Right extra)
+ Nothing -> splitSlowly TE.decodeUtf16BE bytes
+
+ splitQuickly bytes = maybeDecode (loop 0) where
+ maxN = B.length bytes
+
+ loop n | n == maxN = decodeAll
+ | (n + 1) == maxN = decodeTo n
+ loop n = let
+ req = utf16Required
+ (B.index bytes (n + 1))
+ (B.index bytes n)
+ decodeMore = loop $! n + req
+ in if n + req > maxN
+ then decodeTo n
+ else decodeMore
+
+ decodeTo n = first TE.decodeUtf16BE (B.splitAt n bytes)
+ decodeAll = (TE.decodeUtf16BE bytes, B.empty)
+
+utf16Required :: Word8 -> Word8 -> Int
+utf16Required x0 x1 = if x >= 0xD800 && x <= 0xDBFF then 4 else 2 where
+ x :: Word16
+ x = (fromIntegral x1 `shiftL` 8) .|. fromIntegral x0
+
+
+utf32_le :: Codec
+utf32_le = Codec name enc (toDecoding dec) where
+ name = T.pack "UTF-32-LE"
+ enc text = (TE.encodeUtf32LE text, Nothing)
+ dec bs = case utf32SplitBytes TE.decodeUtf32LE bs of
+ Just (text, extra) -> (text, Right extra)
+ Nothing -> splitSlowly TE.decodeUtf32LE bs
+
+
+utf32_be :: Codec
+utf32_be = Codec name enc (toDecoding dec) where
+ name = T.pack "UTF-32-BE"
+ enc text = (TE.encodeUtf32BE text, Nothing)
+ dec bs = case utf32SplitBytes TE.decodeUtf32BE bs of
+ Just (text, extra) -> (text, Right extra)
+ Nothing -> splitSlowly TE.decodeUtf32BE bs
+
+utf32SplitBytes :: (ByteString -> Text)
+ -> ByteString
+ -> Maybe (Text, ByteString)
+utf32SplitBytes dec bytes = split where
+ split = maybeDecode (dec toDecode, extra)
+ len = B.length bytes
+ lenExtra = mod len 4
+
+ lenToDecode = len - lenExtra
+ (toDecode, extra) = if lenExtra == 0
+ then (bytes, B.empty)
+ else B.splitAt lenToDecode bytes
+
+
+tryEvaluate :: a -> Either Exc.SomeException a
+tryEvaluate = unsafePerformIO . Exc.try . Exc.evaluate
+
+maybeDecode :: (a, b) -> Maybe (a, b)
+maybeDecode (a, b) = case tryEvaluate a of
+ Left _ -> Nothing
+ Right _ -> Just (a, b)
--- /dev/null
+{-# LANGUAGE BangPatterns, CPP, ForeignFunctionInterface #-}
+{-# LANGUAGE GeneralizedNewtypeDeriving, MagicHash, UnliftedFFITypes #-}
+{-# LANGUAGE DeriveDataTypeable, RankNTypes #-}
+
+-- This module lifts assorted materials from Brian O'Sullivan's text package
+-- especially Data.Text.Encoding in order to define a pipes-appropriate
+-- streamDecodeUtf8
+module Pipes.Text.Internal.Decoding
+ ( Decoding(..)
+ , streamDecodeUtf8
+ , decodeSomeUtf8
+ ) where
+import Control.Monad.ST.Unsafe (unsafeIOToST, unsafeSTToIO)
+import Control.Monad.ST (ST, runST)
+import Data.Bits ((.&.))
+import Data.ByteString as B
+import Data.ByteString (ByteString)
+import Data.ByteString.Internal as B
+import Data.ByteString.Char8 as B8
+import Data.Text (Text)
+import qualified Data.Text as T
+import qualified Data.Text.Encoding as TE
+import Data.Text.Encoding.Error ()
+import Data.Text.Internal (Text, textP)
+import Foreign.C.Types (CSize)
+import Foreign.ForeignPtr (withForeignPtr)
+import Foreign.Marshal.Utils (with)
+import Foreign.Ptr (Ptr, minusPtr, nullPtr, plusPtr)
+import Foreign.Storable (Storable, peek, poke)
+import GHC.Base (Char(..), Int(..), MutableByteArray#, ord#, iShiftRA#)
+import GHC.Word (Word8, Word32)
+import qualified Data.Text.Array as A
+import Data.Word (Word8, Word16)
+import System.IO.Unsafe (unsafePerformIO)
+import qualified Control.Exception as Exc
+import Data.Bits ((.&.), (.|.), shiftL)
+import Data.Typeable
+import Control.Arrow (first)
+import Data.Maybe (catMaybes)
+#include "pipes_text_cbits.h"
+
+
+
+-- | A stream oriented decoding result.
+data Decoding = Some Text ByteString (ByteString -> Decoding)
+ | Other Text ByteString
+instance Show Decoding where
+ showsPrec d (Some t bs _) = showParen (d > prec) $
+ showString "Some " . showsPrec prec' t .
+ showChar ' ' . showsPrec prec' bs .
+ showString " _"
+ where prec = 10; prec' = prec + 1
+ showsPrec d (Other t bs) = showParen (d > prec) $
+ showString "Other " . showsPrec prec' t .
+ showChar ' ' . showsPrec prec' bs .
+ showString " _"
+ where prec = 10; prec' = prec + 1
+
+newtype CodePoint = CodePoint Word32 deriving (Eq, Show, Num, Storable)
+newtype DecoderState = DecoderState Word32 deriving (Eq, Show, Num, Storable)
+
+streamDecodeUtf8 :: ByteString -> Decoding
+streamDecodeUtf8 = decodeChunkUtf8 B.empty 0 0
+ where
+ decodeChunkUtf8 :: ByteString -> CodePoint -> DecoderState -> ByteString -> Decoding
+ decodeChunkUtf8 old codepoint0 state0 bs@(PS fp off len) =
+ runST $ do marray <- A.new (len+1)
+ unsafeIOToST (decodeChunkToBuffer marray)
+ where
+ decodeChunkToBuffer :: A.MArray s -> IO Decoding
+ decodeChunkToBuffer dest = withForeignPtr fp $ \ptr ->
+ with (0::CSize) $ \destOffPtr ->
+ with codepoint0 $ \codepointPtr ->
+ with state0 $ \statePtr ->
+ with nullPtr $ \curPtrPtr ->
+ do let end = ptr `plusPtr` (off + len)
+ curPtr = ptr `plusPtr` off
+ poke curPtrPtr curPtr
+ c_decode_utf8_with_state (A.maBA dest) destOffPtr curPtrPtr end codepointPtr statePtr
+ state <- peek statePtr
+ lastPtr <- peek curPtrPtr
+ codepoint <- peek codepointPtr
+ n <- peek destOffPtr
+ chunkText <- mkText dest n
+ let left = lastPtr `minusPtr` curPtr
+ remaining = B.drop left bs
+ accum = if T.null chunkText then B.append old remaining else remaining
+ return $! case state of
+ UTF8_REJECT -> Other chunkText accum -- We encountered an encoding error
+ _ -> Some chunkText accum (decodeChunkUtf8 accum codepoint state)
+ {-# INLINE decodeChunkToBuffer #-}
+ {-# INLINE decodeChunkUtf8 #-}
+{-# INLINE streamDecodeUtf8 #-}
+
+decodeSomeUtf8 :: ByteString -> (Text, ByteString)
+decodeSomeUtf8 bs@(PS fp off len) = runST $ do
+ dest <- A.new (len+1)
+ unsafeIOToST $
+ withForeignPtr fp $ \ptr ->
+ with (0::CSize) $ \destOffPtr ->
+ with (0::CodePoint) $ \codepointPtr ->
+ with (0::DecoderState) $ \statePtr ->
+ with nullPtr $ \curPtrPtr ->
+ do let end = ptr `plusPtr` (off + len)
+ curPtr = ptr `plusPtr` off
+ poke curPtrPtr curPtr
+ c_decode_utf8_with_state (A.maBA dest) destOffPtr
+ curPtrPtr end codepointPtr statePtr
+ state <- peek statePtr
+ lastPtr <- peek curPtrPtr
+ codepoint <- peek codepointPtr
+ n <- peek destOffPtr
+ chunkText <- unsafeSTToIO $ do arr <- A.unsafeFreeze dest
+ return $! textP arr 0 (fromIntegral n)
+ let left = lastPtr `minusPtr` curPtr
+ remaining = B.drop left bs
+ return $! (chunkText, remaining)
+{-# INLINE decodeSomeUtf8 #-}
+
+mkText :: A.MArray s -> CSize -> IO Text
+mkText dest n = unsafeSTToIO $ do arr <- A.unsafeFreeze dest
+ return $! textP arr 0 (fromIntegral n)
+{-# INLINE mkText #-}
+
+ord :: Char -> Int
+ord (C# c#) = I# (ord# c#)
+{-# INLINE ord #-}
+
+unsafeWrite :: A.MArray s -> Int -> Char -> ST s Int
+unsafeWrite marr i c
+ | n < 0x10000 = do A.unsafeWrite marr i (fromIntegral n)
+ return 1
+ | otherwise = do A.unsafeWrite marr i lo
+ A.unsafeWrite marr (i+1) hi
+ return 2
+ where n = ord c
+ m = n - 0x10000
+ lo = fromIntegral $ (m `shiftR` 10) + 0xD800
+ hi = fromIntegral $ (m .&. 0x3FF) + 0xDC00
+ shiftR (I# x#) (I# i#) = I# (x# `iShiftRA#` i#)
+ {-# INLINE shiftR #-}
+{-# INLINE unsafeWrite #-}
+
+foreign import ccall unsafe "_hs_pipes_text_decode_utf8_state" c_decode_utf8_with_state
+ :: MutableByteArray# s -> Ptr CSize
+ -> Ptr (Ptr Word8) -> Ptr Word8
+ -> Ptr CodePoint -> Ptr DecoderState -> IO (Ptr Word8)
\ No newline at end of file
name: pipes-text
version: 0.1.0.0
synopsis: Text pipes.
-description: Text pipes.
+description: Most of the pipes and other operations defined in `pipes-text`
+ closely mirrors the `pipes-bytestring` library, simply adjusting for
+ the difference between strict `ByteString` and strict `Text`, and between
+ `Word8` and `Char`. To this core are added some simple functions
+ some akin to the `String` operations in `Pipes.Prelude`, some like the
+ utilities in `Data.Text`.
+
+ All of the `IO` operations defined here - e.g `readFile`, `stdout` etc.
+ - are conveniences akin to those in `Data.Text.IO` which e.g. try to
+ find the system encoding and use the exceptions defined in the `text`
+ library. Proper `IO` in the sense of this library will employ
+ `pipes-bytestring` in conjuntion with 'pure' operations like
+ `decodeUtf8` and `encodeUtf8` that are defined here.
+
homepage: github.com/michaelt/text-pipes
license: BSD3
license-file: LICENSE
-author: michaelt
+author: Michael Thompson
maintainer: what_is_it_to_do_anything@yahoo.com
-category: Text
+category: Text, Pipes
build-type: Simple
cabal-version: >=1.10
+extra-source-files: README.md
+ include/*.h
library
c-sources: cbits/cbits.c
include-dirs: include
- exposed-modules: Pipes.Text, Pipes.Text.Internal, Pipes.Text.Codec
+ exposed-modules: Pipes.Text, Pipes.Text.Internal.Decoding, Pipes.Text.Internal.Codec
-- other-modules:
other-extensions: RankNTypes
build-depends: base >= 4 && < 5 ,