moved internals

author michaelt <what_is_it_to_do_anything@yahoo.com>

Wed, 5 Feb 2014 02:47:27 +0000 (21:47 -0500)

committer michaelt <what_is_it_to_do_anything@yahoo.com>

Wed, 5 Feb 2014 02:47:27 +0000 (21:47 -0500)
author michaelt <what_is_it_to_do_anything@yahoo.com>
Wed, 5 Feb 2014 02:47:27 +0000 (21:47 -0500)
committer michaelt <what_is_it_to_do_anything@yahoo.com>
Wed, 5 Feb 2014 02:47:27 +0000 (21:47 -0500)
diff --git a/.DS_Store b/.DS_Store

new file mode 100644 (file)

index 0000000..1538af8

Binary files /dev/null and b/.DS_Store differ
diff --git a/Pipes/Text.hs b/Pipes/Text.hs

index 18ec8ec6ec0640d8f6c393e5e28f7b82943f7e33..0957a7d40b7975f54bfa55cc8ce2b4ee6bddeabd 100644 (file)
--- a/Pipes/Text.hs
+++ b/Pipes/Text.hs
@@ -167,6 +167,7 @@ module Pipes.Text  (
      , module Data.Word
      , module Pipes.Parse
      , module Pipes.Group
+    , module Pipes.Text.Internal.Codec
      ) where
  
  import Control.Exception (throwIO, try)
@@ -196,8 +197,8 @@ import Foreign.C.Error (Errno(Errno), ePIPE)
  import qualified GHC.IO.Exception as G
  import Pipes
  import qualified Pipes.ByteString as PB
-import qualified Pipes.Text.Internal as PE
-import Pipes.Text.Codec 
+import qualified Pipes.Text.Internal.Decoding as PE
+import Pipes.Text.Internal.Codec 
  import Pipes.Core (respond, Server')
  import Pipes.Group (concats, intercalates, FreeT(..), FreeF(..))
  import qualified Pipes.Group as PG
diff --git a/Pipes/Text/Internal/Codec.hs b/Pipes/Text/Internal/Codec.hs

new file mode 100644 (file)

index 0000000..4b9367f
--- /dev/null
+++ b/Pipes/Text/Internal/Codec.hs
@@ -0,0 +1,215 @@
+
+{-# LANGUAGE DeriveDataTypeable, RankNTypes, BangPatterns #-}
+-- |
+-- Copyright: 2014 Michael Thompson, 2011 Michael Snoyman, 2010-2011 John Millikin
+-- License: MIT
+--
+-- Parts of this code were taken from enumerator and conduits, and adapted for pipes.
+
+module Pipes.Text.Internal.Codec
+    ( Decoding(..)
+    , streamDecodeUtf8
+    , decodeSomeUtf8
+    , Codec(..)
+    , TextException(..)
+    , utf8
+    , utf16_le
+    , utf16_be
+    , utf32_le
+    , utf32_be
+    ) where
+
+import Data.Bits ((.&.))
+import Data.Char (ord)
+import Data.ByteString as B 
+import Data.ByteString (ByteString)
+import Data.ByteString.Internal as B 
+import Data.ByteString.Char8 as B8
+import Data.Text (Text)
+import qualified Data.Text as T 
+import qualified Data.Text.Encoding as TE 
+import Data.Text.Encoding.Error ()
+import GHC.Word (Word8, Word32)
+import qualified Data.Text.Array as A
+import Data.Word (Word8, Word16)
+import System.IO.Unsafe (unsafePerformIO)
+import qualified Control.Exception as Exc
+import Data.Bits ((.&.), (.|.), shiftL)
+import Data.Typeable
+import Control.Arrow (first)
+import Data.Maybe (catMaybes)
+import Pipes.Text.Internal.Decoding
+import Pipes
+-- | A specific character encoding: a display name bundled with an encoder
+-- and a streaming decoder.
+--
+-- Since 0.3.0
+data Codec = Codec
+  { codecName :: Text
+    -- ^ Name of the encoding; this is what the 'Show' instance prints.
+  , codecEncode :: Text -> (ByteString, Maybe (TextException, Text))
+    -- ^ Encode text to bytes. Every codec defined in this module returns
+    -- 'Nothing' in the second component. NOTE(review): presumably the
+    -- 'Maybe' is meant to carry an encoding error together with the text
+    -- that could not be encoded -- confirm against callers.
+  , codecDecode :: ByteString -> Decoding -- (Text, Either (TextException, ByteString) ByteString)
+    -- ^ Decode one chunk of bytes into a 'Decoding' step. The trailing
+    -- comment above records the tuple shape this field was adapted from.
+  }
+
+-- | Shows only the codec's name, as @Codec \"<name>\"@; the encode and decode
+-- functions cannot be shown. Parenthesised when used in argument position.
+instance Show Codec where
+    showsPrec outer codec = showParen (outer > 10) rendered
+      where
+        rendered = showString "Codec " . shows (codecName codec)
+
+-- | Errors associated with the codecs in this module.
+--
+-- Only the 'TextException' constructor is built in this module (by
+-- 'splitSlowly', wrapping whatever exception the underlying decoder threw).
+-- NOTE(review): the remaining constructors are presumably retained for
+-- parity with the enumerator/conduit codec API this code was adapted from --
+-- confirm before relying on them.
+data TextException = DecodeException Codec Word8
+                   | EncodeException Codec Char
+                   | LengthExceeded Int
+                   | TextException Exc.SomeException
+    deriving (Show, Typeable)
+instance Exc.Exception TextException
+
+
+-- | Turn a one-shot "decode what you can" function into a streaming
+-- 'Decoding' step. Bytes left over from one chunk are prepended to the next
+-- chunk before it is decoded. A 'Right' result continues the stream with
+-- 'Some'; a 'Left' result ends it with 'Other' (the exception itself is
+-- dropped, only the offending bytes are kept).
+toDecoding :: (ByteString -> (Text, Either (TextException, ByteString) ByteString))
+           -> (ByteString -> Decoding)
+toDecoding op = go B.empty
+  where
+    go !leftover chunk =
+      let (decoded, outcome) = op (B.append leftover chunk)
+      in case outcome of
+           Right rest     -> Some decoded rest (go rest)
+           Left (_, rest) -> Other decoded rest
+-- To do: toDecoding should be inlined in each of the 'Codec' definitions
+-- or else Codec changed to the conduit/enumerator definition.  We have
+-- altered it to use 'streamDecodeUtf8'
+
+-- | Fallback used when a codec's fast split fails: find the longest prefix
+-- of @bytes@ that @dec@ accepts without throwing, and report what happened
+-- to the remainder.
+--
+-- Split points are tried from the full length down to zero, so in the worst
+-- case @dec@ is attempted once per split point.
+--
+-- Returns the decoded prefix together with either the exception raised by
+-- decoding the remainder (plus that remainder), or an empty leftover.
+splitSlowly :: (ByteString -> Text)
+            -> ByteString 
+            -> (Text, Either (TextException, ByteString) ByteString)
+splitSlowly dec bytes = valid where
+    -- First (hence longest) prefix that decodes. NOTE(review): this pattern
+    -- is partial; it relies on @dec B.empty@ not throwing, so that the last
+    -- candidate produced by 'splits' always succeeds.
+    valid:_ = catMaybes $ Prelude.map decFirst $ splits (B.length bytes)
+    -- Every split of the input, longest prefix first, ending with the empty
+    -- prefix.
+    splits 0 = [(B.empty, bytes)]
+    splits n = B.splitAt n bytes : splits (n - 1)
+    -- 'Nothing' if the prefix does not decode; otherwise the decoded prefix
+    -- and the outcome of trying to decode the suffix on its own.
+    decFirst (a, b) = case tryEvaluate (dec a) of
+        Left _ -> Nothing
+        Right text -> let trouble = case tryEvaluate (dec b) of
+                            Left exc -> Left (TextException exc, b)
+                            -- This case is not expected to occur, since
+                            -- splitSlowly is only called when parsing
+                            -- failed somewhere.
+                            Right _  -> Right B.empty 
+                      in Just (text, trouble)
+
+-- | UTF-8 codec. Decoding goes through 'decodeSomeUtf8' and always reports
+-- its leftover bytes as 'Right', so the resulting stream only ever yields
+-- 'Some' steps.
+utf8 :: Codec
+utf8 = Codec (T.pack "UTF-8") encode (toDecoding decode)
+  where
+    encode txt = (TE.encodeUtf8 txt, Nothing)
+    decode chunk = case decodeSomeUtf8 chunk of
+        (decoded, rest) -> (decoded, Right rest)
+
+--     -- Whether the given byte is a continuation byte.
+--     isContinuation byte = byte .&. 0xC0 == 0x80
+-- 
+--     -- The number of continuation bytes needed by the given
+--     -- non-continuation byte. Returns -1 for an illegal UTF-8
+--     -- non-continuation byte and the whole split quickly must fail so
+--     -- as the input is passed to TE.decodeUtf8, which will issue a
+--     -- suitable error.
+--     required x0
+--         | x0 .&. 0x80 == 0x00 = 0
+--         | x0 .&. 0xE0 == 0xC0 = 1
+--         | x0 .&. 0xF0 == 0xE0 = 2
+--         | x0 .&. 0xF8 == 0xF0 = 3
+--         | otherwise           = -1
+-- 
+--     splitQuickly bytes
+--         | B.null l || req == -1 = Nothing
+--         | req == B.length r = Just (TE.decodeUtf8 bytes, B.empty)
+--         | otherwise = Just (TE.decodeUtf8 l', r')
+--       where
+--         (l, r) = B.spanEnd isContinuation bytes
+--         req = required (B.last l)
+--         l' = B.init l
+--         r' = B.cons (B.last l) r
+
+
+-- | UTF-16 little-endian codec. Decoding first scans for the last complete
+-- code unit (or surrogate pair) boundary and decodes up to it; if that
+-- decode throws, it falls back to 'splitSlowly'.
+utf16_le :: Codec
+utf16_le = Codec (T.pack "UTF-16-LE") encode (toDecoding decode)
+  where
+    encode txt = (TE.encodeUtf16LE txt, Nothing)
+
+    decode chunk = case quickSplit chunk of
+        Nothing          -> splitSlowly TE.decodeUtf16LE chunk
+        Just (txt, rest) -> (txt, Right rest)
+
+    quickSplit chunk = maybeDecode (scan 0)
+      where
+        total = B.length chunk
+
+        -- Advance one code unit or surrogate pair at a time, stopping in
+        -- front of anything the chunk does not fully contain.
+        scan pos
+          | pos == total        = wholeChunk
+          | pos + 1 == total    = upTo pos
+          | pos + width > total = upTo pos
+          | otherwise           = scan $! pos + width
+          where
+            -- Little-endian: the low byte comes first.
+            width = utf16Required (B.index chunk pos) (B.index chunk (pos + 1))
+
+        upTo pos = first TE.decodeUtf16LE (B.splitAt pos chunk)
+        wholeChunk = (TE.decodeUtf16LE chunk, B.empty)
+
+-- | UTF-16 big-endian codec. Decoding first scans for the last complete
+-- code unit (or surrogate pair) boundary and decodes up to it; if that
+-- decode throws, it falls back to 'splitSlowly'.
+utf16_be :: Codec
+utf16_be = Codec (T.pack "UTF-16-BE") encode (toDecoding decode)
+  where
+    encode txt = (TE.encodeUtf16BE txt, Nothing)
+
+    decode chunk = case quickSplit chunk of
+        Nothing          -> splitSlowly TE.decodeUtf16BE chunk
+        Just (txt, rest) -> (txt, Right rest)
+
+    quickSplit chunk = maybeDecode (scan 0)
+      where
+        total = B.length chunk
+
+        -- Advance one code unit or surrogate pair at a time, stopping in
+        -- front of anything the chunk does not fully contain.
+        scan pos
+          | pos == total        = wholeChunk
+          | pos + 1 == total    = upTo pos
+          | pos + width > total = upTo pos
+          | otherwise           = scan $! pos + width
+          where
+            -- Big-endian: the low byte is the second one.
+            width = utf16Required (B.index chunk (pos + 1)) (B.index chunk pos)
+
+        upTo pos = first TE.decodeUtf16BE (B.splitAt pos chunk)
+        wholeChunk = (TE.decodeUtf16BE chunk, B.empty)
+
+-- | Number of bytes occupied by the UTF-16 sequence that starts with the
+-- given code unit: 4 when the unit lies in @0xD800..0xDBFF@ (it opens a
+-- surrogate pair), otherwise 2. The first argument is the unit's low byte,
+-- the second its high byte.
+utf16Required :: Word8 -> Word8 -> Int
+utf16Required x0 x1
+    | opensPair = 4
+    | otherwise = 2
+  where
+    unit :: Word16
+    unit = (fromIntegral x1 `shiftL` 8) .|. fromIntegral x0
+    opensPair = unit >= 0xD800 && unit <= 0xDBFF
+
+
+-- | UTF-32 little-endian codec. Decoding handles the largest prefix whose
+-- length is a multiple of four (see 'utf32SplitBytes') and carries the
+-- trailing bytes over; if that decode throws, it falls back to
+-- 'splitSlowly'.
+utf32_le :: Codec
+utf32_le = Codec
+    { codecName   = T.pack "UTF-32-LE"
+    , codecEncode = \txt -> (TE.encodeUtf32LE txt, Nothing)
+    , codecDecode = toDecoding step
+    }
+  where
+    step chunk = case utf32SplitBytes TE.decodeUtf32LE chunk of
+        Nothing          -> splitSlowly TE.decodeUtf32LE chunk
+        Just (txt, rest) -> (txt, Right rest)
+
+
+-- | UTF-32 big-endian codec. Decoding handles the largest prefix whose
+-- length is a multiple of four (see 'utf32SplitBytes') and carries the
+-- trailing bytes over; if that decode throws, it falls back to
+-- 'splitSlowly'.
+utf32_be :: Codec
+utf32_be = Codec
+    { codecName   = T.pack "UTF-32-BE"
+    , codecEncode = \txt -> (TE.encodeUtf32BE txt, Nothing)
+    , codecDecode = toDecoding step
+    }
+  where
+    step chunk = case utf32SplitBytes TE.decodeUtf32BE chunk of
+        Nothing          -> splitSlowly TE.decodeUtf32BE chunk
+        Just (txt, rest) -> (txt, Right rest)
+
+-- | Decode the longest prefix of the input whose length is a multiple of
+-- four bytes, returning it with the 0-3 trailing bytes that did not fit.
+-- Yields 'Nothing' when evaluating the decoded prefix throws.
+utf32SplitBytes :: (ByteString -> Text)
+                -> ByteString
+                -> Maybe (Text, ByteString)
+utf32SplitBytes dec bytes = maybeDecode (dec aligned, trailing)
+  where
+    total = B.length bytes
+    spare = total `mod` 4
+
+    (aligned, trailing)
+        | spare == 0 = (bytes, B.empty)
+        | otherwise  = B.splitAt (total - spare) bytes
+
+
+-- | Force a value to weak head normal form and capture any exception thrown
+-- while doing so, instead of letting it propagate.
+--
+-- NOTE(review): 'unsafePerformIO' is used only to run 'Exc.try' around
+-- 'Exc.evaluate'; this is presumably considered safe because the callers in
+-- this module pass pure decoding thunks -- confirm for any new caller.
+tryEvaluate :: a -> Either Exc.SomeException a
+tryEvaluate value = unsafePerformIO (Exc.try (Exc.evaluate value))
+
+-- | Keep the pair only if its first component can be evaluated (to weak
+-- head normal form) without throwing; the second component is not forced.
+maybeDecode :: (a, b) -> Maybe (a, b)
+maybeDecode pair@(a, _) =
+    either (const Nothing) (const (Just pair)) (tryEvaluate a)
diff --git a/Pipes/Text/Internal/Decoding.hs b/Pipes/Text/Internal/Decoding.hs

new file mode 100644 (file)

index 0000000..531104a
--- /dev/null
+++ b/Pipes/Text/Internal/Decoding.hs
@@ -0,0 +1,147 @@
+{-# LANGUAGE BangPatterns, CPP, ForeignFunctionInterface #-}
+{-# LANGUAGE GeneralizedNewtypeDeriving, MagicHash, UnliftedFFITypes #-}
+{-# LANGUAGE DeriveDataTypeable, RankNTypes #-}
+
+-- This module lifts assorted material from Bryan O'Sullivan's text package,
+-- especially Data.Text.Encoding, in order to define a pipes-appropriate
+-- streamDecodeUtf8.
+module Pipes.Text.Internal.Decoding 
+    ( Decoding(..)
+    , streamDecodeUtf8
+    , decodeSomeUtf8
+    ) where
+import Control.Monad.ST.Unsafe (unsafeIOToST, unsafeSTToIO)
+import Control.Monad.ST (ST, runST)
+import Data.Bits ((.&.))
+import Data.ByteString as B 
+import Data.ByteString (ByteString)
+import Data.ByteString.Internal as B 
+import Data.ByteString.Char8 as B8
+import Data.Text (Text)
+import qualified Data.Text as T 
+import qualified Data.Text.Encoding as TE 
+import Data.Text.Encoding.Error ()
+import Data.Text.Internal (Text, textP)
+import Foreign.C.Types (CSize)
+import Foreign.ForeignPtr (withForeignPtr)
+import Foreign.Marshal.Utils (with)
+import Foreign.Ptr (Ptr, minusPtr, nullPtr, plusPtr)
+import Foreign.Storable (Storable, peek, poke)
+import GHC.Base  (Char(..), Int(..), MutableByteArray#, ord#, iShiftRA#)
+import GHC.Word (Word8, Word32)
+import qualified Data.Text.Array as A
+import Data.Word (Word8, Word16)
+import System.IO.Unsafe (unsafePerformIO)
+import qualified Control.Exception as Exc
+import Data.Bits ((.&.), (.|.), shiftL)
+import Data.Typeable
+import Control.Arrow (first)
+import Data.Maybe (catMaybes)
+#include "pipes_text_cbits.h"
+
+
+
+-- | A stream oriented decoding result.
+--
+-- * 'Some' carries the text decoded from the current chunk, the bytes that
+--   were not turned into text, and a continuation to be fed the next chunk.
+--
+-- * 'Other' carries the text decoded so far and the bytes that were not
+--   decoded, with no continuation; 'streamDecodeUtf8' produces it when the
+--   decoder reports an encoding error.
+data Decoding = Some Text ByteString (ByteString -> Decoding)
+              | Other Text ByteString
+-- | The continuation held by 'Some' cannot be shown and is rendered as @_@.
+-- 'Other' has no continuation, so only its two fields are rendered.
+instance Show Decoding where
+    showsPrec d (Some t bs _) = showParen (d > prec) $
+                                showString "Some " . showsPrec prec' t .
+                                showChar ' ' . showsPrec prec' bs .
+                                showString " _"
+      where prec = 10; prec' = prec + 1
+    -- No trailing " _" here: 'Other' has exactly two fields, and printing a
+    -- placeholder would suggest a third argument that does not exist.
+    showsPrec d (Other t bs)  = showParen (d > prec) $
+                                showString "Other " . showsPrec prec' t .
+                                showChar ' ' . showsPrec prec' bs
+      where prec = 10; prec' = prec + 1
+
+-- | Code point accumulator shared with the C decoder. 'Storable' lets it be
+-- passed by pointer ('with' / 'peek'); 'Num' lets callers write the initial
+-- value as the literal @0@.
+newtype CodePoint = CodePoint Word32 deriving (Eq, Show, Num, Storable)
+-- | Decoder state shared with the C decoder, passed by pointer in the same
+-- way as 'CodePoint' and likewise initialised with the literal @0@.
+newtype DecoderState = DecoderState Word32 deriving (Eq, Show, Num, Storable)
+
+-- | Decode a stream of UTF-8 bytes chunk by chunk.
+--
+-- Each step yields the text decoded from the chunk plus the bytes that were
+-- not consumed. While the decoder has not rejected the input the result is
+-- 'Some', whose continuation resumes with the saved code point and decoder
+-- state; once the decoder reports a reject state the result is 'Other'.
+streamDecodeUtf8 :: ByteString -> Decoding
+streamDecodeUtf8 = decodeChunkUtf8 B.empty 0 0 
+  where
+  -- Arguments: bytes carried over from earlier chunks, the code point and
+  -- decoder state saved by the previous step, and the chunk to decode.
+  decodeChunkUtf8 :: ByteString -> CodePoint -> DecoderState -> ByteString -> Decoding
+  decodeChunkUtf8 old codepoint0 state0 bs@(PS fp off len) = 
+                    -- The destination array is sized from the chunk length.
+                    runST $ do marray <- A.new (len+1) 
+                               unsafeIOToST (decodeChunkToBuffer marray)
+     where
+     decodeChunkToBuffer :: A.MArray s -> IO Decoding
+     decodeChunkToBuffer dest = withForeignPtr fp $ \ptr ->
+       -- Scratch cells the C routine reads and updates in place.
+       with (0::CSize) $ \destOffPtr ->
+       with codepoint0 $ \codepointPtr ->
+       with state0     $ \statePtr ->
+       with nullPtr    $ \curPtrPtr ->
+         do let end = ptr `plusPtr` (off + len)
+                curPtr = ptr `plusPtr` off
+            -- Start decoding at the first byte of this chunk.
+            poke curPtrPtr curPtr
+            c_decode_utf8_with_state (A.maBA dest) destOffPtr curPtrPtr end codepointPtr statePtr
+            -- Read back what the C routine left in the scratch cells.
+            state <- peek statePtr
+            lastPtr <- peek curPtrPtr
+            codepoint <- peek codepointPtr
+            n <- peek destOffPtr
+            chunkText <- mkText dest n
+            -- 'left' is how far the input pointer advanced; 'remaining' is
+            -- the part of the chunk after that point.
+            let left      = lastPtr `minusPtr` curPtr
+                remaining = B.drop left bs
+                -- If this chunk produced no text, the previously carried
+                -- bytes are kept in front of the unconsumed ones; otherwise
+                -- only the unconsumed bytes are carried forward.
+                accum = if T.null chunkText then B.append old remaining  else remaining 
+            -- NOTE(review): UTF8_REJECT is presumably a CPP macro supplied
+            -- by the included "pipes_text_cbits.h" -- confirm.
+            return $! case state of 
+              UTF8_REJECT -> Other chunkText accum -- We encountered an encoding error
+              _ ->           Some  chunkText accum (decodeChunkUtf8 accum codepoint state)
+     {-# INLINE decodeChunkToBuffer #-}
+  {-# INLINE decodeChunkUtf8 #-}
+{-# INLINE streamDecodeUtf8 #-}
+
+-- | One-shot UTF-8 decoding step: decode as much of the input as the C
+-- routine consumes, starting from a fresh code point and decoder state, and
+-- return the decoded text together with the unconsumed bytes.
+--
+-- NOTE(review): the final decoder state is not inspected here, so an
+-- invalid sequence is not distinguished from an incomplete one; callers
+-- only see undecoded leftovers. Use 'streamDecodeUtf8' when the distinction
+-- matters.
+decodeSomeUtf8 :: ByteString -> (Text, ByteString)
+decodeSomeUtf8 bs@(PS fp off len) = runST $ do 
+  -- The destination array is sized from the input length.
+  dest <- A.new (len+1) 
+  unsafeIOToST $ 
+     withForeignPtr fp $ \ptr ->
+     -- Scratch cells the C routine reads and updates in place.
+     with (0::CSize)        $ \destOffPtr ->
+     with (0::CodePoint)    $ \codepointPtr ->
+     with (0::DecoderState) $ \statePtr ->
+     with nullPtr           $ \curPtrPtr ->
+       do let end = ptr `plusPtr` (off + len)
+              curPtr = ptr `plusPtr` off
+          -- Start decoding at the first byte of the input.
+          poke curPtrPtr curPtr
+          c_decode_utf8_with_state (A.maBA dest) destOffPtr 
+                                   curPtrPtr end codepointPtr statePtr
+          -- Only the input position and output length are needed; the code
+          -- point and state cells are deliberately not read back.
+          lastPtr <- peek curPtrPtr
+          n <- peek destOffPtr
+          chunkText <- mkText dest n
+          let left      = lastPtr `minusPtr` curPtr
+              remaining = B.drop left bs
+          return $! (chunkText, remaining)
+{-# INLINE decodeSomeUtf8 #-}
+
+-- | Freeze the destination array and wrap its first @n@ elements as a
+-- 'Text' (via 'textP', starting at offset 0).
+--
+-- NOTE(review): 'A.unsafeFreeze' presumably does not copy, so the array
+-- must not be written to after this call -- confirm against the text
+-- package's documentation.
+mkText :: A.MArray s -> CSize -> IO Text
+mkText dest n =  unsafeSTToIO $ do arr <- A.unsafeFreeze dest
+                                   return $! textP arr 0 (fromIntegral n)
+{-# INLINE mkText #-}
+
+-- | Numeric value of a character, written directly with the 'ord#' primop
+-- on the unboxed representation. In this module it is used only by
+-- 'unsafeWrite'.
+ord :: Char -> Int
+ord (C# c#) = I# (ord# c#)
+{-# INLINE ord #-}
+
+-- | Write a character into the array at index @i@ as UTF-16 and return the
+-- number of array elements written: 1 for characters below @0x10000@, 2 (a
+-- surrogate pair) otherwise. No bounds checking is done here.
+--
+-- This helper is not exported and is not called by the exported functions
+-- of this module.
+unsafeWrite :: A.MArray s -> Int -> Char -> ST s Int
+unsafeWrite marr i c
+    | n < 0x10000 = do A.unsafeWrite marr i (fromIntegral n)
+                       return 1
+    | otherwise   = do A.unsafeWrite marr i lo
+                       A.unsafeWrite marr (i+1) hi
+                       return 2
+    where n = ord c
+          -- Offset of the code point above the 16-bit range.
+          m = n - 0x10000
+          -- Despite the names, 'lo' is the element written first (top ten
+          -- bits, based at 0xD800) and 'hi' the element written second
+          -- (bottom ten bits, based at 0xDC00).
+          lo = fromIntegral $ (m `shiftR` 10) + 0xD800
+          hi = fromIntegral $ (m .&. 0x3FF) + 0xDC00
+          -- Local arithmetic right shift on 'Int', via the primop.
+          shiftR (I# x#) (I# i#) = I# (x# `iShiftRA#` i#)
+          {-# INLINE shiftR #-}
+{-# INLINE unsafeWrite #-}
+
+-- | Binding to the C UTF-8 decoder @_hs_pipes_text_decode_utf8_state@.
+--
+-- As used by the callers in this module, the arguments are, in order: the
+-- destination array, a cell holding the output length, a cell holding the
+-- current input pointer, the end-of-input pointer, a cell holding the code
+-- point, and a cell holding the decoder state. The callers read the cells
+-- back after the call and ignore the returned pointer.
+--
+-- NOTE(review): the meaning of the returned pointer and the exact contract
+-- of each cell are defined by the C source, which is not shown here.
+foreign import ccall unsafe "_hs_pipes_text_decode_utf8_state" c_decode_utf8_with_state
+    :: MutableByteArray# s -> Ptr CSize
+    -> Ptr (Ptr Word8) -> Ptr Word8
+    -> Ptr CodePoint -> Ptr DecoderState -> IO (Ptr Word8)
+\ No newline at end of file
diff --git a/pipes-text.cabal b/pipes-text.cabal

index 44bc5512b57c6c79e1f0eadbf3efabdef00ee9a2..fda6f5927f3cb3e0c667f2cd7a9497f7de30ae58 100644 (file)
--- a/pipes-text.cabal
+++ b/pipes-text.cabal
@@ -1,20 +1,35 @@
  name:                pipes-text
  version:             0.1.0.0
  synopsis:            Text pipes.
-description:         Text pipes.
+description:         Most of the pipes and other operations defined in `pipes-text`
+                     closely mirror those of the `pipes-bytestring` library, simply adjusting for
+                     the difference between strict `ByteString` and strict `Text`, and between
+                     `Word8` and `Char`. To this core are added some simple functions,
+                     some akin to the `String` operations in `Pipes.Prelude`, some like the
+                     utilities in `Data.Text`.
+                     
+                     All of the `IO` operations defined here - e.g. `readFile`, `stdout` etc.
+                     - are conveniences akin to those in `Data.Text.IO` which, for example, try to
+                     find the system encoding and use the exceptions defined in the `text`
+                     library. Proper `IO` in the sense of this library will employ
+                     `pipes-bytestring` in conjunction with 'pure' operations like
+                     `decodeUtf8` and `encodeUtf8` that are defined here.
+
  homepage:            github.com/michaelt/text-pipes
  license:             BSD3
  license-file:        LICENSE
-author:              michaelt
+author:              Michael Thompson
  maintainer:          what_is_it_to_do_anything@yahoo.com
-category:            Text
+category:            Text, Pipes
  build-type:          Simple
  cabal-version:       >=1.10
+extra-source-files: README.md
+                    include/*.h 
  
  library
    c-sources:    cbits/cbits.c
    include-dirs: include
-  exposed-modules:     Pipes.Text, Pipes.Text.Internal, Pipes.Text.Codec
+  exposed-modules:     Pipes.Text, Pipes.Text.Internal.Decoding, Pipes.Text.Internal.Codec
    -- other-modules:       
    other-extensions:    RankNTypes
    build-depends:       base         >= 4       && < 5  ,
author	michaelt <what_is_it_to_do_anything@yahoo.com>
	Wed, 5 Feb 2014 02:47:27 +0000 (21:47 -0500)
committer	michaelt <what_is_it_to_do_anything@yahoo.com>
	Wed, 5 Feb 2014 02:47:27 +0000 (21:47 -0500)
.DS_Store	[new file with mode: 0644]	patch \| blob
Pipes/Text.hs		patch \| blob \| blame \| history
Pipes/Text/Internal/Codec.hs	[new file with mode: 0644]	patch \| blob
Pipes/Text/Internal/Decoding.hs	[new file with mode: 0644]	patch \| blob
pipes-text.cabal		patch \| blob \| blame \| history