use new text-stream-decoding

author: michaelt <what_is_it_to_do_anything@yahoo.com> 2014-02-15 10:01:48 -0500
committer: michaelt <what_is_it_to_do_anything@yahoo.com> 2014-02-15 10:01:48 -0500
commit: bbdfd3056da4992e18d3983fd5992bee23af93af (patch)
tree: 5765f3f224708890cf50b400c675c23e759595a2 /Pipes/Text
parent: fd27d0c43e2995437cf1f3e2d6f292597371ebb0 (diff)
download: text-pipes-bbdfd3056da4992e18d3983fd5992bee23af93af.tar.gz
text-pipes-bbdfd3056da4992e18d3983fd5992bee23af93af.tar.zst
text-pipes-bbdfd3056da4992e18d3983fd5992bee23af93af.zip
5 files changed, 301 insertions, 377 deletions
diff --git a/Pipes/Text/Encoding.hs b/Pipes/Text/Encoding.hs
new file mode 100644
index 0000000..2bb5807
--- /dev/null
+++ b/Pipes/Text/Encoding.hs
@@ -0,0 +1,205 @@
+{-# LANGUAGE RankNTypes, BangPatterns #-}
+-- |
+-- Copyright: 2014 Michael Thompson
+--
+-- This module uses the stream decoding functions from the text-stream-decoding package
+-- to define pipes decoding functions and lenses.
+module Pipes.Text.Encoding
+    ( DecodeResult (..)
+    , Codec
+    , decodeUtf8
+    , decodeUtf8Pure
+    , decodeUtf16LE
+    , decodeUtf16BE
+    , decodeUtf32LE
+    , decodeUtf32BE
+    , utf8
+    , utf8Pure
+    , utf16LE
+    , utf16BE
+    , utf32LE
+    , utf32BE
+    , encodeAscii
+    , decodeAscii
+    , encodeIso8859_1
+    , decodeIso8859_1
+    ) 
+    where
+import Data.Char (ord)
+import Data.ByteString as B 
+import Data.ByteString (ByteString)
+import Data.ByteString.Internal as B 
+import Data.ByteString.Char8 as B8
+import Data.Text (Text)
+import qualified Data.Text as T 
+import qualified Data.Text.Encoding as TE 
+import Data.Text.StreamDecoding
+import GHC.Word (Word8, Word32)
+import Data.Word (Word8, Word16)
+import Control.Monad
+import Pipes
+import Pipes.Core
+{- | A 'Codec' is just an improper lens into a byte stream that is expected to contain text.
+    They are named in accordance with the expected encoding, 'utf8', 'utf16LE' etc.
+    The stream of text they 'see' in a bytestream ends by returning the original byte stream 
+    beginning at the point of failure, or the empty bytestream with its return value.
+   -}
+type Codec  = forall f m r . (Functor f , Monad m ) => 
+     (Producer Text m (Producer ByteString m r) -> f (Producer Text m (Producer ByteString m r)))
+     -> Producer ByteString m r -> f (Producer ByteString m r )
+decodeStream :: Monad m 
+       => (B.ByteString -> DecodeResult) 
+       -> Producer ByteString m r -> Producer Text m (Producer ByteString m r)
+decodeStream = loop where
+  loop dec0 p = 
+    do x <- lift (next p) 
+       case x of Left r -> return (return r)
+                 Right (chunk, p') -> case dec0 chunk of 
+                    DecodeResultSuccess text dec -> do yield text
+                                                       loop dec p'
+                    DecodeResultFailure text bs -> do yield text 
+                                                      return (do yield bs 
+                                                                 p')
+{-# INLINABLE decodeStream#-}
+decodeUtf8 :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r)
+decodeUtf8 = decodeStream streamUtf8
+{-# INLINE decodeUtf8 #-}
+decodeUtf8Pure :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r)
+decodeUtf8Pure = decodeStream streamUtf8Pure
+{-# INLINE decodeUtf8Pure #-}
+decodeUtf16LE :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r)
+decodeUtf16LE = decodeStream streamUtf16LE
+{-# INLINE decodeUtf16LE #-}
+decodeUtf16BE :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r)
+decodeUtf16BE = decodeStream streamUtf16BE
+{-# INLINE decodeUtf16BE #-}
+decodeUtf32LE :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r)
+decodeUtf32LE = decodeStream streamUtf32LE
+{-# INLINE decodeUtf32LE #-}
+decodeUtf32BE :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r)
+decodeUtf32BE = decodeStream streamUtf32BE
+{-# INLINE decodeUtf32BE #-}
+mkCodec :: (forall r m . Monad m => 
+           Producer ByteString m r -> Producer Text m (Producer ByteString m r ))
+        -> (Text -> ByteString)
+        -> Codec
+mkCodec dec enc = \k p0 -> fmap (\p -> join (for p (yield . enc)))  (k (dec p0))
+{- | An improper lens into a byte stream expected to be UTF-8 encoded; the associated
+   text stream ends by returning the original bytestream beginning at the point of failure,
+   or the empty bytestring for a well-encoded text. 
+   -}
+utf8 :: Codec
+utf8 = mkCodec decodeUtf8 TE.encodeUtf8
+utf8Pure :: Codec
+utf8Pure = mkCodec decodeUtf8Pure TE.encodeUtf8
+utf16LE :: Codec
+utf16LE = mkCodec decodeUtf16LE TE.encodeUtf16LE
+utf16BE :: Codec
+utf16BE = mkCodec decodeUtf16BE TE.encodeUtf16BE
+utf32LE :: Codec
+utf32LE = mkCodec decodeUtf32LE TE.encodeUtf32LE
+utf32BE :: Codec
+utf32BE = mkCodec decodeUtf32BE TE.encodeUtf32BE
+{- | ascii and latin encodings only use a small number of the characters 'Text'
+     recognizes; thus we cannot use the pipes 'Lens' style to work with them. 
+     Rather we simply define functions each way. 
+     'encodeAscii' : Reduce as much of your stream of 'Text' actually is ascii to a byte stream,
+     returning the rest of the 'Text' at the first non-ascii 'Char'
+-}
+encodeAscii :: Monad m => Producer Text m r -> Producer ByteString m (Producer Text m r)
+encodeAscii = go where
+  go p = do e <- lift (next p)
+            case e of 
+              Left r -> return (return r)
+              Right (chunk, p') -> 
+                 if T.null chunk 
+                   then go p'
+                   else let (safe, unsafe)  = T.span (\c -> ord c <= 0x7F) chunk
+                        in do yield (B8.pack (T.unpack safe))
+                              if T.null unsafe
+                                then go p'
+                                else return $ do yield unsafe 
+                                                 p'
+                                                 
+{- | Reduce as much of your stream of 'Text' actually is iso8859 or latin1 to a byte stream,
+     returning the rest of the 'Text' upon hitting any non-latin 'Char'
+   -}
+encodeIso8859_1 :: Monad m => Producer Text m r -> Producer ByteString m (Producer Text m r)
+encodeIso8859_1 = go where
+  go p = do e <- lift (next p)
+            case e of 
+              Left r -> return (return r)
+              Right (txt, p') -> 
+                 if T.null txt 
+                   then go p'
+                   else let (safe, unsafe)  = T.span (\c -> ord c <= 0xFF) txt
+                        in do yield (B8.pack (T.unpack safe))
+                              if T.null unsafe
+                                then go p'
+                                else return $ do yield unsafe 
+                                                 p'
+{- | Reduce a byte stream to a corresponding stream of ascii chars, returning the
+     unused 'ByteString' upon hitting an un-ascii byte.
+   -}
+decodeAscii :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r)
+decodeAscii = go where
+  go p = do e <- lift (next p)
+            case e of 
+              Left r -> return (return r)
+              Right (chunk, p') -> 
+                 if B.null chunk 
+                   then go p'
+                   else let (safe, unsafe) = B.span (<= 0x7F) chunk
+                        in do yield (T.pack (B8.unpack safe))
+                              if B.null unsafe
+                                then go p'
+                                else return (do yield unsafe 
+                                                p')
+{- | Reduce a byte stream to a corresponding stream of ascii chars, returning the
+     unused 'ByteString' upon hitting the rare un-latinizable byte.
+     -}
+decodeIso8859_1 :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r)
+decodeIso8859_1 = go where
+  go p = do e <- lift (next p)
+            case e of 
+              Left r -> return (return r)
+              Right (chunk, p') -> 
+                 if B.null chunk 
+                    then go p'
+                    else do let (safe, unsafe) = B.span (<= 0xFF) chunk
+                            yield (T.pack (B8.unpack safe))
+                            if B.null unsafe 
+                               then go p'
+                               else return (do yield unsafe 
+                                               p')
diff --git a/Pipes/Text/IO.hs b/Pipes/Text/IO.hs
new file mode 100644
index 0000000..3c9ac98
--- /dev/null
+++ b/Pipes/Text/IO.hs
@@ -0,0 +1,96 @@
+{-#LANGUAGE RankNTypes#-}
+module Pipes.Text.IO 
+   ( stdin
+   , stdout
+   , fromHandle
+   , toHandle
+   , readFile
+   , writeFile
+   ) where
+import qualified System.IO as IO
+import Control.Exception (throwIO, try)
+import Foreign.C.Error (Errno(Errno), ePIPE)
+import qualified GHC.IO.Exception as G
+import Data.Text (Text)
+import qualified Data.Text as T
+import qualified Data.Text.IO as T
+import Pipes
+import qualified Pipes.Safe.Prelude as Safe
+import qualified Pipes.Safe as Safe
+import Pipes.Safe (MonadSafe(..), Base(..))
+import Prelude hiding (readFile, writeFile)
+-- | Stream text from 'stdin'
+stdin :: MonadIO m => Producer Text m ()
+stdin = fromHandle IO.stdin
+{-# INLINE stdin #-}
+{-| Convert a 'IO.Handle' into a text stream using a text size 
+    determined by the good sense of the text library; note that this
+    is distinctly slower than @decideUtf8 (Pipes.ByteString.fromHandle h)@
+    but uses the system encoding and has other `Data.Text.IO` features
+-}
+fromHandle :: MonadIO m => IO.Handle -> Producer Text m ()
+fromHandle h =  go where
+      go = do txt <- liftIO (T.hGetChunk h)
+              if T.null txt then return ()
+                            else do yield txt
+                                    go 
+{-# INLINABLE fromHandle#-}
+{-| Stream text from a file in the simple fashion of @Data.Text.IO@ 
+>>> runSafeT $ runEffect $ Text.readFile "hello.hs" >-> Text.map toUpper >-> hoist lift Text.stdout
+MAIN = PUTSTRLN "HELLO WORLD"
+-}
+readFile :: MonadSafe m => FilePath -> Producer Text m ()
+readFile file = Safe.withFile file IO.ReadMode fromHandle
+{-# INLINE readFile #-}
+{-| Stream text to 'stdout'
+    Unlike 'toHandle', 'stdout' gracefully terminates on a broken output pipe.
+    Note: For best performance, it might be best just to use @(for source (liftIO . putStr))@ 
+    instead of @(source >-> stdout)@ .
+-}
+stdout :: MonadIO m => Consumer' Text m ()
+stdout = go
+  where
+    go = do
+        txt <- await
+        x  <- liftIO $ try (T.putStr txt)
+        case x of
+            Left (G.IOError { G.ioe_type  = G.ResourceVanished
+                            , G.ioe_errno = Just ioe })
+                 | Errno ioe == ePIPE
+                     -> return ()
+            Left  e  -> liftIO (throwIO e)
+            Right () -> go
+{-# INLINABLE stdout #-}
+{-| Convert a text stream into a 'Handle'
+    Note: again, for best performance, where possible use 
+    @(for source (liftIO . hPutStr handle))@ instead of @(source >-> toHandle handle)@.
+-}
+toHandle :: MonadIO m => IO.Handle -> Consumer' Text m r
+toHandle h = for cat (liftIO . T.hPutStr h)
+{-# INLINABLE toHandle #-}
+{-# RULES "p >-> toHandle h" forall p h .
+        p >-> toHandle h = for p (\txt -> liftIO (T.hPutStr h txt))
+  #-}
+-- | Stream text into a file. Uses @pipes-safe@.
+writeFile :: (MonadSafe m) => FilePath -> Consumer' Text m ()
+writeFile file = Safe.withFile file IO.WriteMode toHandle
+{-# INLINE writeFile #-}
diff --git a/Pipes/Text/Internal.hs b/Pipes/Text/Internal.hs
deleted file mode 100644
index 582ef14..0000000
--- a/Pipes/Text/Internal.hs
+++ /dev/null
@@ -1,7 +0,0 @@
-module Pipes.Text.Internal
-    (module Pipes.Text.Internal.Codec
-    , module Pipes.Text.Internal.Decoding
-    ) where
-import Pipes.Text.Internal.Codec
-import Pipes.Text.Internal.Decoding
-\ No newline at end of file
diff --git a/Pipes/Text/Internal/Codec.hs b/Pipes/Text/Internal/Codec.hs
deleted file mode 100644
index 075a152..0000000
--- a/Pipes/Text/Internal/Codec.hs
+++ /dev/null
@@ -1,216 +0,0 @@
-{-# LANGUAGE DeriveDataTypeable, RankNTypes, BangPatterns #-}
-- |
-- Copyright: 2014 Michael Thompson, 2011 Michael Snoyman, 2010-2011 John Millikin
-- License: MIT
--  This Parts of this code were taken from enumerator and conduits, and adapted for pipes
-- This module follows the model of the enumerator and conduits libraries, and defines
-- 'Codec' s for various encodings. Note that we do not export a 'Codec' for ascii and 
-- iso8859_1. A 'Lens' in the sense of the pipes library cannot be defined for these, so
-- special functions appear in @Pipes.Text@
-module Pipes.Text.Internal.Codec
-    ( Codec(..)
-    , TextException(..)
-    , utf8
-    , utf16_le
-    , utf16_be
-    , utf32_le
-    , utf32_be
-    ) where
-import Data.Bits ((.&.))
-import Data.Char (ord)
-import Data.ByteString as B 
-import Data.ByteString (ByteString)
-import Data.ByteString.Internal as B 
-import Data.ByteString.Char8 as B8
-import Data.Text (Text)
-import qualified Data.Text as T 
-import qualified Data.Text.Encoding as TE 
-import Data.Text.Encoding.Error ()
-import GHC.Word (Word8, Word32)
-import qualified Data.Text.Array as A
-import Data.Word (Word8, Word16)
-import System.IO.Unsafe (unsafePerformIO)
-import qualified Control.Exception as Exc
-import Data.Bits ((.&.), (.|.), shiftL)
-import Data.Typeable
-import Control.Arrow (first)
-import Data.Maybe (catMaybes)
-import Pipes.Text.Internal.Decoding
-import Pipes
-- | A specific character encoding.
-data Codec = Codec
-  { codecName :: Text
-  , codecEncode :: Text -> (ByteString, Maybe (TextException, Text))
-  , codecDecode :: ByteString -> Decoding 
-  }
-instance Show Codec where
-    showsPrec d c = showParen (d > 10) $ 
-                    showString "Codec " . shows (codecName c)
-data TextException = DecodeException Codec Word8
-                   | EncodeException Codec Char
-                   | LengthExceeded Int
-                   | TextException Exc.SomeException
-    deriving (Show, Typeable)
-instance Exc.Exception TextException
-toDecoding :: (ByteString -> (Text, Either (TextException, ByteString) ByteString))
-           -> (ByteString -> Decoding)
-toDecoding op = loop B.empty where
-  loop !extra bs0 = case op (B.append extra bs0) of
-                      (txt, Right bs) -> Some txt bs (loop bs)
-                      (txt, Left (_,bs)) -> Other txt bs
-- To do: toDecoding should be inlined in each of the 'Codec' definitions
-- or else Codec changed to the conduit/enumerator definition.  We have
-- altered it to use 'streamDecodeUtf8'
-splitSlowly :: (ByteString -> Text)
-            -> ByteString 
-            -> (Text, Either (TextException, ByteString) ByteString)
-splitSlowly dec bytes = valid where
-    valid:_ = catMaybes $ Prelude.map decFirst $ splits (B.length bytes)
-    splits 0 = [(B.empty, bytes)]
-    splits n = B.splitAt n bytes : splits (n - 1)
-    decFirst (a, b) = case tryEvaluate (dec a) of
-        Left _ -> Nothing
-        Right text -> let trouble = case tryEvaluate (dec b) of
-                            Left exc -> Left (TextException exc, b)
-                            Right _  -> Right B.empty 
-                      in Just (text, trouble) -- this case shouldn't occur, 
-                                      -- since splitSlowly is only called
-                                      -- when parsing failed somewhere
-utf8 :: Codec
-utf8 = Codec name enc (toDecoding dec) where
-    name = T.pack "UTF-8"
-    enc text = (TE.encodeUtf8 text, Nothing)
-    dec bytes = case decodeSomeUtf8 bytes of (t,b) -> (t, Right b)
--     -- Whether the given byte is a continuation byte.
--     isContinuation byte = byte .&. 0xC0 == 0x80
-- 
--     -- The number of continuation bytes needed by the given
--     -- non-continuation byte. Returns -1 for an illegal UTF-8
--     -- non-continuation byte and the whole split quickly must fail so
--     -- as the input is passed to TE.decodeUtf8, which will issue a
--     -- suitable error.
--     required x0
--         | x0 .&. 0x80 == 0x00 = 0
--         | x0 .&. 0xE0 == 0xC0 = 1
--         | x0 .&. 0xF0 == 0xE0 = 2
--         | x0 .&. 0xF8 == 0xF0 = 3
--         | otherwise           = -1
-- 
--     splitQuickly bytes
--         | B.null l || req == -1 = Nothing
--         | req == B.length r = Just (TE.decodeUtf8 bytes, B.empty)
--         | otherwise = Just (TE.decodeUtf8 l', r')
--       where
--         (l, r) = B.spanEnd isContinuation bytes
--         req = required (B.last l)
--         l' = B.init l
--         r' = B.cons (B.last l) r
-utf16_le :: Codec
-utf16_le = Codec name enc (toDecoding dec) where
-    name = T.pack "UTF-16-LE"
-    enc text = (TE.encodeUtf16LE text, Nothing)
-    dec bytes = case splitQuickly bytes of
-        Just (text, extra) -> (text, Right extra)
-        Nothing -> splitSlowly TE.decodeUtf16LE bytes
-    splitQuickly bytes = maybeDecode (loop 0) where
-        maxN = B.length bytes
-        loop n |  n      == maxN = decodeAll
-               | (n + 1) == maxN = decodeTo n
-        loop n = let
-            req = utf16Required
-                (B.index bytes n)
-                (B.index bytes (n + 1))
-            decodeMore = loop $! n + req
-            in if n + req > maxN
-                then decodeTo n
-                else decodeMore
-        decodeTo n = first TE.decodeUtf16LE (B.splitAt n bytes)
-        decodeAll = (TE.decodeUtf16LE bytes, B.empty)
-utf16_be :: Codec
-utf16_be = Codec name enc (toDecoding dec) where
-    name = T.pack "UTF-16-BE"
-    enc text = (TE.encodeUtf16BE text, Nothing)
-    dec bytes = case splitQuickly bytes of
-        Just (text, extra) -> (text, Right extra)
-        Nothing -> splitSlowly TE.decodeUtf16BE bytes
-    splitQuickly bytes = maybeDecode (loop 0) where
-        maxN = B.length bytes
-        loop n |  n      == maxN = decodeAll
-               | (n + 1) == maxN = decodeTo n
-        loop n = let
-            req = utf16Required
-                (B.index bytes (n + 1))
-                (B.index bytes n)
-            decodeMore = loop $! n + req
-            in if n + req > maxN
-                then decodeTo n
-                else decodeMore
-        decodeTo n = first TE.decodeUtf16BE (B.splitAt n bytes)
-        decodeAll = (TE.decodeUtf16BE bytes, B.empty)
-utf16Required :: Word8 -> Word8 -> Int
-utf16Required x0 x1 = if x >= 0xD800 && x <= 0xDBFF then 4 else 2 where
-    x :: Word16
-    x = (fromIntegral x1 `shiftL` 8) .|. fromIntegral x0
-utf32_le :: Codec
-utf32_le = Codec name enc (toDecoding dec) where
-    name = T.pack "UTF-32-LE"
-    enc text = (TE.encodeUtf32LE text, Nothing)
-    dec bs = case utf32SplitBytes TE.decodeUtf32LE bs of
-        Just (text, extra) -> (text, Right extra)
-        Nothing -> splitSlowly TE.decodeUtf32LE bs
-utf32_be :: Codec
-utf32_be = Codec name enc (toDecoding dec) where
-    name = T.pack "UTF-32-BE"
-    enc text = (TE.encodeUtf32BE text, Nothing)
-    dec bs = case utf32SplitBytes TE.decodeUtf32BE bs of
-        Just (text, extra) -> (text, Right extra)
-        Nothing -> splitSlowly TE.decodeUtf32BE bs
-utf32SplitBytes :: (ByteString -> Text)
-                -> ByteString
-                -> Maybe (Text, ByteString)
-utf32SplitBytes dec bytes = split where
-    split = maybeDecode (dec toDecode, extra)
-    len = B.length bytes
-    lenExtra = mod len 4
-    lenToDecode = len - lenExtra
-    (toDecode, extra) = if lenExtra == 0
-        then (bytes, B.empty)
-        else B.splitAt lenToDecode bytes
-tryEvaluate :: a -> Either Exc.SomeException a
-tryEvaluate = unsafePerformIO . Exc.try . Exc.evaluate
-maybeDecode :: (a, b) -> Maybe (a, b)
-maybeDecode (a, b) = case tryEvaluate a of
-    Left _ -> Nothing
-    Right _ -> Just (a, b)
diff --git a/Pipes/Text/Internal/Decoding.hs b/Pipes/Text/Internal/Decoding.hs
deleted file mode 100644
index b5d928a..0000000
--- a/Pipes/Text/Internal/Decoding.hs
+++ /dev/null
@@ -1,154 +0,0 @@
-{-# LANGUAGE BangPatterns, CPP, ForeignFunctionInterface #-}
-{-# LANGUAGE GeneralizedNewtypeDeriving, MagicHash, UnliftedFFITypes #-}
-{-# LANGUAGE DeriveDataTypeable, RankNTypes #-}
-- This module lifts assorted materials from Brian O'Sullivan's text package 
-- especially @Data.Text.Encoding@ in order to define a pipes-appropriate
-- 'streamDecodeUtf8'
-module Pipes.Text.Internal.Decoding 
-    ( Decoding(..)
-    , streamDecodeUtf8
-    , decodeSomeUtf8
-    ) where
-import Control.Monad.ST.Unsafe (unsafeIOToST, unsafeSTToIO)
-import Control.Monad.ST (ST, runST)
-import Data.Bits ((.&.))
-import Data.ByteString as B 
-import Data.ByteString (ByteString)
-import Data.ByteString.Internal as B 
-import Data.ByteString.Char8 as B8
-import Data.Text (Text)
-import qualified Data.Text as T 
-import qualified Data.Text.Encoding as TE 
-import Data.Text.Encoding.Error ()
-import Data.Text.Internal (Text, textP)
-import Foreign.C.Types (CSize)
-import Foreign.ForeignPtr (withForeignPtr)
-import Foreign.Marshal.Utils (with)
-import Foreign.Ptr (Ptr, minusPtr, nullPtr, plusPtr)
-import Foreign.Storable (Storable, peek, poke)
-import GHC.Base  (Char(..), Int(..), MutableByteArray#, ord#, iShiftRA#)
-import GHC.Word (Word8, Word32)
-import qualified Data.Text.Array as A
-import Data.Word (Word8, Word16)
-import System.IO.Unsafe (unsafePerformIO)
-import qualified Control.Exception as Exc
-import Data.Bits ((.&.), (.|.), shiftL)
-import Data.Typeable
-import Control.Arrow (first)
-import Data.Maybe (catMaybes)
-#include "pipes_text_cbits.h"
--  A stream oriented decoding result. Distinct from the similar type in Data.Text.Encoding
-data Decoding = Some Text ByteString (ByteString -> Decoding) 
-               -- Text, continuation and any undecoded fragment.
-              | Other Text ByteString  
-              --  Text followed by an undecodable ByteString
-              
-instance Show Decoding where
-    showsPrec d (Some t bs _) = showParen (d > prec) $
-                                showString "Some " . showsPrec prec' t .
-                                showChar ' ' . showsPrec prec' bs .
-                                showString " _"
-      where prec = 10; prec' = prec + 1
-    showsPrec d (Other t bs)  = showParen (d > prec) $
-                                showString "Other " . showsPrec prec' t .
-                                showChar ' ' . showsPrec prec' bs .
-                                showString " _"
-      where prec = 10; prec' = prec + 1
-newtype CodePoint = CodePoint Word32 deriving (Eq, Show, Num, Storable)
-newtype DecoderState = DecoderState Word32 deriving (Eq, Show, Num, Storable)
--  Resolve a 'ByteString' into 'Text' and a continuation that can handle further 'ByteStrings'. 
-streamDecodeUtf8 :: ByteString -> Decoding
-streamDecodeUtf8 = decodeChunkUtf8 B.empty 0 0 
-  where
-  decodeChunkUtf8 :: ByteString -> CodePoint -> DecoderState -> ByteString -> Decoding
-  decodeChunkUtf8 old codepoint0 state0 bs@(PS fp off len) = 
-                    runST $ do marray <- A.new (len+1) 
-                               unsafeIOToST (decodeChunkToBuffer marray)
-     where
-     decodeChunkToBuffer :: A.MArray s -> IO Decoding
-     decodeChunkToBuffer dest = withForeignPtr fp $ \ptr ->
-       with (0::CSize) $ \destOffPtr ->
-       with codepoint0 $ \codepointPtr ->
-       with state0     $ \statePtr ->
-       with nullPtr    $ \curPtrPtr ->
-         do let end = ptr `plusPtr` (off + len)
-                curPtr = ptr `plusPtr` off
-            poke curPtrPtr curPtr
-            c_decode_utf8_with_state (A.maBA dest) destOffPtr curPtrPtr end codepointPtr statePtr
-            state <- peek statePtr
-            lastPtr <- peek curPtrPtr
-            codepoint <- peek codepointPtr
-            n <- peek destOffPtr
-            chunkText <- mkText dest n
-            let left      = lastPtr `minusPtr` curPtr
-                remaining = B.drop left bs
-                accum = if T.null chunkText then B.append old remaining  else remaining 
-            return $! case state of 
-              UTF8_REJECT -> Other chunkText accum -- We encountered an encoding error
-              _ ->           Some  chunkText accum (decodeChunkUtf8 accum codepoint state)
-     {-# INLINE decodeChunkToBuffer #-}
-  {-# INLINE decodeChunkUtf8 #-}
-{-# INLINE streamDecodeUtf8 #-}
--  Resolve a ByteString into an initial segment of intelligible 'Text' and whatever is unintelligble
-decodeSomeUtf8 :: ByteString -> (Text, ByteString)
-decodeSomeUtf8 bs@(PS fp off len) = runST $ do 
-  dest <- A.new (len+1) 
-  unsafeIOToST $ 
-     withForeignPtr fp $ \ptr ->
-     with (0::CSize)        $ \destOffPtr ->
-     with (0::CodePoint)    $ \codepointPtr ->
-     with (0::DecoderState) $ \statePtr ->
-     with nullPtr           $ \curPtrPtr ->
-       do let end = ptr `plusPtr` (off + len)
-              curPtr = ptr `plusPtr` off
-          poke curPtrPtr curPtr
-          c_decode_utf8_with_state (A.maBA dest) destOffPtr 
-                                   curPtrPtr end codepointPtr statePtr
-          state <- peek statePtr
-          lastPtr <- peek curPtrPtr
-          codepoint <- peek codepointPtr
-          n <- peek destOffPtr
-          chunkText <- unsafeSTToIO $ do arr <- A.unsafeFreeze dest
-                                         return $! textP arr 0 (fromIntegral n)
-          let left      = lastPtr `minusPtr` curPtr
-              remaining = B.drop left bs
-          return $! (chunkText, remaining)
-{-# INLINE decodeSomeUtf8 #-}
-mkText :: A.MArray s -> CSize -> IO Text
-mkText dest n =  unsafeSTToIO $ do arr <- A.unsafeFreeze dest
-                                   return $! textP arr 0 (fromIntegral n)
-{-# INLINE mkText #-}
-ord :: Char -> Int
-ord (C# c#) = I# (ord# c#)
-{-# INLINE ord #-}
-unsafeWrite :: A.MArray s -> Int -> Char -> ST s Int
-unsafeWrite marr i c
-    | n < 0x10000 = do A.unsafeWrite marr i (fromIntegral n)
-                       return 1
-    | otherwise   = do A.unsafeWrite marr i lo
-                       A.unsafeWrite marr (i+1) hi
-                       return 2
-    where n = ord c
-          m = n - 0x10000
-          lo = fromIntegral $ (m `shiftR` 10) + 0xD800
-          hi = fromIntegral $ (m .&. 0x3FF) + 0xDC00
-          shiftR (I# x#) (I# i#) = I# (x# `iShiftRA#` i#)
-          {-# INLINE shiftR #-}
-{-# INLINE unsafeWrite #-}
-foreign import ccall unsafe "_hs_pipes_text_decode_utf8_state" c_decode_utf8_with_state
-    :: MutableByteArray# s -> Ptr CSize
-    -> Ptr (Ptr Word8) -> Ptr Word8
-    -> Ptr CodePoint -> Ptr DecoderState -> IO (Ptr Word8)
-\ No newline at end of file
author	michaelt <what_is_it_to_do_anything@yahoo.com>	2014-02-15 10:01:48 -0500
committer	michaelt <what_is_it_to_do_anything@yahoo.com>	2014-02-15 10:01:48 -0500
commit	bbdfd3056da4992e18d3983fd5992bee23af93af (patch)
tree	5765f3f224708890cf50b400c675c23e759595a2 /Pipes/Text
parent	fd27d0c43e2995437cf1f3e2d6f292597371ebb0 (diff)
download	text-pipes-bbdfd3056da4992e18d3983fd5992bee23af93af.tar.gz text-pipes-bbdfd3056da4992e18d3983fd5992bee23af93af.tar.zst text-pipes-bbdfd3056da4992e18d3983fd5992bee23af93af.zip