merge home made decodeUtf8

author: michaelt <what_is_it_to_do_anything@yahoo.com> 2014-01-14 22:17:25 -0500
committer: michaelt <what_is_it_to_do_anything@yahoo.com> 2014-01-14 22:17:25 -0500
commit: 7381c94f47c76833972565ee8d15d86216b214ce (patch)
tree: 38ddadda59a3808422fc432d37b886c456adcb1d
parent: ca6f90a05bee6471d6837d629ddaee9b0a75bd50 (diff)
parent: 3694350ac7b9c42fd64e0092a74cf0471a080058 (diff)
download: text-pipes-7381c94f47c76833972565ee8d15d86216b214ce.tar.gz
text-pipes-7381c94f47c76833972565ee8d15d86216b214ce.tar.zst
text-pipes-7381c94f47c76833972565ee8d15d86216b214ce.zip
7 files changed, 580 insertions, 3 deletions
diff --git a/Pipes/Text/Internal.hs b/Pipes/Text/Internal.hs
new file mode 100644
index 0000000..7e5b044
--- /dev/null
+++ b/Pipes/Text/Internal.hs
@@ -0,0 +1,163 @@
+{-# LANGUAGE BangPatterns, CPP, ForeignFunctionInterface, GeneralizedNewtypeDeriving, MagicHash,
+    UnliftedFFITypes #-}
+-- This module lifts assorted materials from Brian O'Sullivan's text package 
+-- especially Data.Text.Encoding in order to define a pipes-appropriate
+-- streamDecodeUtf8
+module Pipes.Text.Internal 
+    ( Decoding(..)
+    , streamDecodeUtf8
+    , decodeSomeUtf8
+    ) where
+import Control.Monad.ST.Unsafe (unsafeIOToST, unsafeSTToIO)
+import Control.Monad.ST (ST, runST)
+import Data.Bits ((.&.))
+import Data.ByteString as B 
+import Data.ByteString.Internal as B 
+import qualified Data.Text as T (null)
+import Data.Text.Encoding.Error ()
+import Data.Text.Internal (Text, textP)
+import Foreign.C.Types (CSize)
+import Foreign.ForeignPtr (withForeignPtr)
+import Foreign.Marshal.Utils (with)
+import Foreign.Ptr (Ptr, minusPtr, nullPtr, plusPtr)
+import Foreign.Storable (Storable, peek, poke)
+import GHC.Base  (Char(..), Int(..), MutableByteArray#, ord#, iShiftRA#)
+import GHC.Word (Word8, Word32)
+import qualified Data.Text.Array as A
+    
+#include "pipes_text_cbits.h"
+-- | A stream oriented decoding result.
+data Decoding = Some Text ByteString (ByteString -> Decoding)
+              | Other Text ByteString
+instance Show Decoding where
+    showsPrec d (Some t bs _) = showParen (d > prec) $
+                                showString "Some " . showsPrec prec' t .
+                                showChar ' ' . showsPrec prec' bs .
+                                showString " _"
+      where prec = 10; prec' = prec + 1
+    showsPrec d (Other t bs)  = showParen (d > prec) $
+                                showString "Other " . showsPrec prec' t .
+                                showChar ' ' . showsPrec prec' bs .
+                                showString " _"
+      where prec = 10; prec' = prec + 1
+newtype CodePoint = CodePoint Word32 deriving (Eq, Show, Num, Storable)
+newtype DecoderState = DecoderState Word32 deriving (Eq, Show, Num, Storable)
+streamDecodeUtf8 :: ByteString -> Decoding
+streamDecodeUtf8 = decodeChunkUtf8 B.empty 0 0 
+  where
+  decodeChunkUtf8 :: ByteString -> CodePoint -> DecoderState -> ByteString -> Decoding
+  decodeChunkUtf8 old codepoint0 state0 bs@(PS fp off len) = 
+                    runST $ do marray <- A.new (len+1) 
+                               unsafeIOToST (decodeChunkToBuffer marray)
+     where
+     decodeChunkToBuffer :: A.MArray s -> IO Decoding
+     decodeChunkToBuffer dest = withForeignPtr fp $ \ptr ->
+       with (0::CSize) $ \destOffPtr ->
+       with codepoint0 $ \codepointPtr ->
+       with state0     $ \statePtr ->
+       with nullPtr    $ \curPtrPtr ->
+         do let end = ptr `plusPtr` (off + len)
+                curPtr = ptr `plusPtr` off
+            poke curPtrPtr curPtr
+            c_decode_utf8_with_state (A.maBA dest) destOffPtr curPtrPtr end codepointPtr statePtr
+            state <- peek statePtr
+            lastPtr <- peek curPtrPtr
+            codepoint <- peek codepointPtr
+            n <- peek destOffPtr
+            chunkText <- mkText dest n
+            let left      = lastPtr `minusPtr` curPtr
+                remaining = B.drop left bs
+                accum = if T.null chunkText then B.append old remaining  else remaining 
+            return $! case state of 
+              UTF8_REJECT -> Other chunkText accum -- We encountered an encoding error
+              _ ->           Some  chunkText accum (decodeChunkUtf8 accum codepoint state)
+     {-# INLINE decodeChunkToBuffer #-}
+  {-# INLINE decodeChunkUtf8 #-}
+{-# INLINE streamDecodeUtf8 #-}
+decodeSomeUtf8 :: ByteString -> (Text, ByteString)
+decodeSomeUtf8 bs@(PS fp off len) = runST $ do 
+  dest <- A.new (len+1) 
+  unsafeIOToST $ 
+     withForeignPtr fp $ \ptr ->
+     with (0::CSize)        $ \destOffPtr ->
+     with (0::CodePoint)    $ \codepointPtr ->
+     with (0::DecoderState) $ \statePtr ->
+     with nullPtr           $ \curPtrPtr ->
+       do let end = ptr `plusPtr` (off + len)
+              curPtr = ptr `plusPtr` off
+          poke curPtrPtr curPtr
+          c_decode_utf8_with_state (A.maBA dest) destOffPtr 
+                                   curPtrPtr end codepointPtr statePtr
+          state <- peek statePtr
+          lastPtr <- peek curPtrPtr
+          codepoint <- peek codepointPtr
+          n <- peek destOffPtr
+          chunkText <- unsafeSTToIO $ do arr <- A.unsafeFreeze dest
+                                         return $! textP arr 0 (fromIntegral n)
+          let left      = lastPtr `minusPtr` curPtr
+              remaining = B.drop left bs
+          return $! (chunkText, remaining)
+{-# INLINE decodeSomeUtf8 #-}
+-- decodeSomeUtf8 :: ByteString -> (Text, ByteString)
+-- decodeSomeUtf8 bs@(PS fp off len) = 
+--                   runST $ do marray <- A.new (len+1) 
+--                              unsafeIOToST (decodeChunkToBuffer marray)
+--   
+--      where
+--      decodeChunkToBuffer :: A.MArray s -> IO (Text, ByteString)
+--      decodeChunkToBuffer dest = withForeignPtr fp $ \ptr ->
+--        with (0::CSize)        $ \destOffPtr ->
+--        with (0::CodePoint)    $ \codepointPtr ->
+--        with (0::DecoderState) $ \statePtr ->
+--        with nullPtr           $ \curPtrPtr ->
+--          do let end = ptr `plusPtr` (off + len)
+--                 curPtr = ptr `plusPtr` off
+--             poke curPtrPtr curPtr
+--             c_decode_utf8_with_state (A.maBA dest) destOffPtr curPtrPtr end codepointPtr statePtr
+--             state <- peek statePtr
+--             lastPtr <- peek curPtrPtr
+--             codepoint <- peek codepointPtr
+--             n <- peek destOffPtr
+--             chunkText <- unsafeSTToIO $ do arr <- A.unsafeFreeze dest
+--                                            return $! textP arr 0 (fromIntegral n)
+--             let left      = lastPtr `minusPtr` curPtr
+--                 remaining = B.drop left bs
+--             return $! (chunkText, remaining)
+--      {-# INLINE decodeChunkToBuffer #-}
+-- {-# INLINE decodeSomeUtf8 #-}
+mkText :: A.MArray s -> CSize -> IO Text
+mkText dest n =  unsafeSTToIO $ do arr <- A.unsafeFreeze dest
+                                   return $! textP arr 0 (fromIntegral n)
+{-# INLINE mkText #-}
+ord :: Char -> Int
+ord (C# c#) = I# (ord# c#)
+{-# INLINE ord #-}
+unsafeWrite :: A.MArray s -> Int -> Char -> ST s Int
+unsafeWrite marr i c
+    | n < 0x10000 = do A.unsafeWrite marr i (fromIntegral n)
+                       return 1
+    | otherwise   = do A.unsafeWrite marr i lo
+                       A.unsafeWrite marr (i+1) hi
+                       return 2
+    where n = ord c
+          m = n - 0x10000
+          lo = fromIntegral $ (m `shiftR` 10) + 0xD800
+          hi = fromIntegral $ (m .&. 0x3FF) + 0xDC00
+          shiftR (I# x#) (I# i#) = I# (x# `iShiftRA#` i#)
+          {-# INLINE shiftR #-}
+{-# INLINE unsafeWrite #-}
+foreign import ccall unsafe "_hs_pipes_text_decode_utf8_state" c_decode_utf8_with_state
+    :: MutableByteArray# s -> Ptr CSize
+    -> Ptr (Ptr Word8) -> Ptr Word8
+    -> Ptr CodePoint -> Ptr DecoderState -> IO (Ptr Word8)
+\ No newline at end of file
diff --git a/bench/IO.hs b/bench/IO.hs
new file mode 100644
index 0000000..b3a52f6
--- /dev/null
+++ b/bench/IO.hs
@@ -0,0 +1,20 @@
+import qualified Data.Text.IO as T
+import qualified Data.Text as T
+import qualified Data.Text.Lazy.IO as TL
+import qualified Data.Text.Lazy as TL
+import Pipes
+import qualified Pipes.Text as TP
+import qualified Pipes.ByteString as BP
+import Pipes.Safe
+main = textaction
+big = "../../examples/txt/words2.txt"
+textaction = T.readFile big >>= T.putStrLn
+pipeaction =  runEffect $ for ((TP.readFile big) >> return ()) (lift . T.putStrLn)
diff --git a/cbits/cbits.c b/cbits/cbits.c
new file mode 100644
index 0000000..c11645b
--- /dev/null
+++ b/cbits/cbits.c
@@ -0,0 +1,168 @@
+/*
+ * Copyright (c) 2011 Bryan O'Sullivan <bos@serpentine.com>.
+ *
+ * Portions copyright (c) 2008-2010 Björn Höhrmann <bjoern@hoehrmann.de>.
+ *
+ * See http://bjoern.hoehrmann.de/utf-8/decoder/dfa/ for details.
+ */
+#include <string.h>
+#include <stdint.h>
+#include <stdio.h>
+#include "pipes_text_cbits.h"
+#define UTF8_ACCEPT 0
+#define UTF8_REJECT 12
+static const uint8_t utf8d[] = {
+  /*
+   * The first part of the table maps bytes to character classes that
+   * to reduce the size of the transition table and create bitmasks.
+   */
+   0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+   0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+   0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+   0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+   1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,  9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,
+   7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,  7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,
+   8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2,  2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
+  10,3,3,3,3,3,3,3,3,3,3,3,3,4,3,3, 11,6,6,6,5,8,8,8,8,8,8,8,8,8,8,8,
+  /*
+   * The second part is a transition table that maps a combination of
+   * a state of the automaton and a character class to a state.
+   */
+   0,12,24,36,60,96,84,12,12,12,48,72, 12,12,12,12,12,12,12,12,12,12,12,12,
+  12, 0,12,12,12,12,12, 0,12, 0,12,12, 12,24,12,12,12,12,12,24,12,24,12,12,
+  12,12,12,12,12,12,12,24,12,12,12,12, 12,24,12,12,12,12,12,12,12,24,12,12,
+  12,12,12,12,12,12,12,36,12,36,12,12, 12,36,12,12,12,12,12,36,12,36,12,12,
+  12,36,12,12,12,12,12,12,12,12,12,12,
+};
+static inline uint32_t
+decode(uint32_t *state, uint32_t* codep, uint32_t byte) {
+  uint32_t type = utf8d[byte];
+  *codep = (*state != UTF8_ACCEPT) ?
+    (byte & 0x3fu) | (*codep << 6) :
+    (0xff >> type) & (byte);
+  return *state = utf8d[256 + *state + type];
+}
+/*
+ * A best-effort decoder. Runs until it hits either end of input or
+ * the start of an invalid byte sequence.
+ *
+ * At exit, we update *destoff with the next offset to write to, *src
+ * with the next source location past the last one successfully
+ * decoded, and return the next source location to read from.
+ *
+ * Moreover, we expose the internal decoder state (state0 and
+ * codepoint0), allowing one to restart the decoder after it
+ * terminates (say, due to a partial codepoint).
+ *
+ * In particular, there are a few possible outcomes,
+ *
+ *   1) We decoded the buffer entirely:
+ *      In this case we return srcend
+ *      state0 == UTF8_ACCEPT
+ *
+ *   2) We met an invalid encoding
+ *      In this case we return the address of the first invalid byte
+ *      state0 == UTF8_REJECT
+ *
+ *   3) We reached the end of the buffer while decoding a codepoint
+ *      In this case we return a pointer to the first byte of the partial codepoint
+ *      state0 != UTF8_ACCEPT, UTF8_REJECT
+ *
+ */
+ #if defined(__GNUC__) || defined(__clang__)
+ static inline uint8_t const *
+ _hs_pipes_text_decode_utf8_int(uint16_t *const dest, size_t *destoff,
+                         const uint8_t const **src, const uint8_t const *srcend,
+                         uint32_t *codepoint0, uint32_t *state0)
+   __attribute((always_inline));
+ #endif
+static inline uint8_t const *
+_hs_pipes_text_decode_utf8_int(uint16_t *const dest, size_t *destoff,
+                         const uint8_t const **src, const uint8_t const *srcend,
+                         uint32_t *codepoint0, uint32_t *state0)
+{
+ uint16_t *d = dest + *destoff;
+ const uint8_t *s = *src, *last = *src;
+ uint32_t state = *state0;
+ uint32_t codepoint = *codepoint0;
+ while (s < srcend) {
+#if defined(__i386__) || defined(__x86_64__)
+   /*
+    * This code will only work on a little-endian system that
+    * supports unaligned loads.
+    *
+    * It gives a substantial speed win on data that is purely or
+    * partly ASCII (e.g. HTML), at only a slight cost on purely
+    * non-ASCII text.
+    */
+   if (state == UTF8_ACCEPT) {
+     while (s < srcend - 4) {
+        codepoint = *((uint32_t *) s);
+        if ((codepoint & 0x80808080) != 0)
+          break;
+        s += 4;
+        /*
+         * Tried 32-bit stores here, but the extra bit-twiddling
+         * slowed the code down.
+         */
+        *d++ = (uint16_t) (codepoint & 0xff);
+        *d++ = (uint16_t) ((codepoint >> 8) & 0xff);
+        *d++ = (uint16_t) ((codepoint >> 16) & 0xff);
+        *d++ = (uint16_t) ((codepoint >> 24) & 0xff);
+     }
+     last = s;
+   }
+#endif
+   if (decode(&state, &codepoint, *s++) != UTF8_ACCEPT) {
+     if (state != UTF8_REJECT)
+        continue;
+     break;
+   }
+   if (codepoint <= 0xffff)
+     *d++ = (uint16_t) codepoint;
+   else {
+     *d++ = (uint16_t) (0xD7C0 + (codepoint >> 10));
+     *d++ = (uint16_t) (0xDC00 + (codepoint & 0x3FF));
+   }
+   last = s;
+ }
+ *destoff = d - dest;
+ *codepoint0 = codepoint;
+ *state0 = state;
+ *src = last;
+ return s;
+}
+uint8_t const *
+_hs_pipes_text_decode_utf8_state(uint16_t *const dest, size_t *destoff,
+                          const uint8_t const **src,
+                           const uint8_t const *srcend,
+                          uint32_t *codepoint0, uint32_t *state0)
+{
+ uint8_t const *ret = _hs_pipes_text_decode_utf8_int(dest, destoff, src, srcend,
+                                                codepoint0, state0);
+ if (*state0 == UTF8_REJECT)
+   ret -=1;
+ return ret;
+}
diff --git a/include/pipes_text_cbits.h b/include/pipes_text_cbits.h
new file mode 100644
index 0000000..b9ab670
--- /dev/null
+++ b/include/pipes_text_cbits.h
@@ -0,0 +1,11 @@
+/*
+ * Copyright (c) 2013 Bryan O'Sullivan <bos@serpentine.com>.
+ */
+#ifndef _pipes_text_cbits_h
+#define _pipes_text_cbits_h
+#define UTF8_ACCEPT 0
+#define UTF8_REJECT 12
+#endif
diff --git a/pipes-text.cabal b/pipes-text.cabal
index e79f168..b4388be 100644
--- a/pipes-text.cabal
+++ b/pipes-text.cabal
@@ -12,7 +12,9 @@ build-type:          Simple
 cabal-version:       >=1.10
 library
-  exposed-modules:     Pipes.Text, Pipes.Text.Parse
+  c-sources:    cbits/cbits.c
+  include-dirs: include
+  exposed-modules:     Pipes.Text, Pipes.Text.Parse, Pipes.Text.Internal
  -- other-modules:       
  other-extensions:    RankNTypes
  build-depends:       base         >= 4       && < 5  ,
@@ -23,6 +25,9 @@ library
                       pipes-bytestring >= 1.0 && < 1.2,
                       transformers >= 0.3     && < 0.4,
                       text >=0.11             && < 0.12,
-                       bytestring >=0.10       && < 0.11
+                       bytestring >=0.10       && < 0.11,
+                       vector,
+                       void
  -- hs-source-dirs:      
-  default-language:    Haskell2010
-\ No newline at end of file
+  default-language:    Haskell2010
+  ghc-options: -O2 
diff --git a/test/Test.hs b/test/Test.hs
new file mode 100644
index 0000000..373bafb
--- /dev/null
+++ b/test/Test.hs
@@ -0,0 +1,101 @@
+import Utils
+import Test.QuickCheck
+import Test.QuickCheck.Monadic
+import Test.Framework (Test, testGroup, defaultMain)
+import Test.Framework.Providers.QuickCheck2 (testProperty)
+import Control.Exception (catch)
+import Data.Char (chr, isDigit, isHexDigit, isLower, isSpace, isUpper, ord)
+import Data.Monoid (Monoid(..))
+import Control.Monad
+import Data.String (fromString)
+import Data.Text.Encoding.Error
+import qualified Data.List as L
+import qualified Data.Bits as Bits (shiftL, shiftR)
+import qualified Data.ByteString as B
+import qualified Data.ByteString.Lazy as BL
+import qualified Data.Text as T
+import qualified Data.Text.Lazy as TL
+import qualified Data.Text.Encoding as E
+import qualified Pipes.Text.Internal as PE
+import qualified Pipes.Text as TP
+import qualified Pipes.ByteString as BP 
+import qualified Pipes as P 
+main :: IO ()
+main = defaultMain [tests]
+-- >>> :main  -a 10000
+tests = testGroup "stream_decode" [
+  -- testProperty "t_utf8_incr_valid" t_utf8_incr_valid,
+  testProperty "t_utf8_incr_mixed" t_utf8_incr_mixed ,
+  testProperty "t_utf8_incr_pipe" t_utf8_incr_pipe,
+  testProperty "t_utf8_dec_some" t_utf8_dec_some]
+t_utf8_incr_valid  = do
+        Positive n <- arbitrary
+        forAll genUnicode $ recode n `eq` id
+    where recode n = T.concat . feedChunksOf n PE.streamDecodeUtf8 . E.encodeUtf8
+          feedChunksOf :: Int -> (B.ByteString -> PE.Decoding) -> B.ByteString
+                       -> [T.Text]
+          feedChunksOf n f bs
+            | B.null bs  = []
+            | otherwise  = let (a,b) = B.splitAt n bs
+                               PE.Some t _ f' = f a
+                           in case f a of 
+                              PE.Some t _ f' -> t : feedChunksOf n f' b
+                              _             -> []
+t_utf8_incr_mixed  = do
+       Positive n <- arbitrary
+       txt <- genUnicode
+       let chunkSize = mod n 7 + 1
+       forAll (vector 9) $ 
+              (roundtrip . chunk chunkSize . appendBytes txt) `eq` (appendBytes txt)
+    where 
+    roundtrip :: [B.ByteString] -> B.ByteString
+    roundtrip bss = go PE.streamDecodeUtf8 B.empty bss where    
+       go dec acc [] = acc   
+       go dec acc [bs]  = case dec bs of 
+          PE.Some t l dec' -> acc <> E.encodeUtf8 t <> l
+          PE.Other t bs'   -> acc <> E.encodeUtf8 t <> bs' 
+       go dec acc (bs:bss) = case dec bs of 
+         PE.Some t l dec' -> go dec' (acc <> E.encodeUtf8 t) bss
+         PE.Other t bs'   -> acc <> E.encodeUtf8 t <> bs' <> B.concat bss
+    chunk n bs = let (a,b) = B.splitAt n bs in if B.null a then [] else a : chunk n b
+    appendBytes txt bts = E.encodeUtf8 txt <> B.pack bts ; (<>) = B.append
+t_utf8_incr_pipe  = do    
+       Positive  m <- arbitrary
+       Positive n  <- arbitrary  
+       txt         <- genUnicode
+       let chunkSize = mod n 7 + 1
+           bytesLength = mod 10 m
+       forAll (vector bytesLength) $ 
+              (BL.toStrict . BP.toLazy . roundtrip . P.each . chunk chunkSize . appendBytes txt) 
+              `eq` 
+              appendBytes txt
+    where 
+    roundtrip :: Monad m => P.Producer B.ByteString m r -> P.Producer B.ByteString m r
+    roundtrip p = join (TP.decodeUtf8 p P.>-> TP.encodeUtf8) 
+    chunk n bs = let (a,b) = B.splitAt n bs in if B.null a then [] else a : chunk n b
+    appendBytes txt bts = E.encodeUtf8 txt <> B.pack bts ; (<>) = B.append
+--
+t_utf8_dec_some = do    
+       Positive  m <- arbitrary
+       txt         <- genUnicode
+       let bytesLength = mod 10 m :: Int
+       forAll (vector bytesLength) $ 
+              (roundtrip . appendBytes txt) 
+              `eq` 
+              appendBytes txt
+    where 
+    roundtrip bs = case PE.decodeSomeUtf8 bs of
+                        (txt,bys) -> E.encodeUtf8 txt <> bys
+    appendBytes txt bts = E.encodeUtf8 txt <> B.pack bts ; (<>) = B.append
diff --git a/test/Utils.hs b/test/Utils.hs
new file mode 100644
index 0000000..75cd1db
--- /dev/null
+++ b/test/Utils.hs
@@ -0,0 +1,109 @@
+{-#LANGUAGE ScopedTypeVariables#-}
+module Utils where
+import Control.Exception (SomeException, bracket, bracket_, evaluate, try)
+import System.IO.Unsafe (unsafePerformIO)
+import Debug.Trace (trace)
+import Data.Bits ((.&.))
+import Data.Char (chr)
+import Data.String (IsString, fromString)
+import System.Random (Random (..), RandomGen)
+import Test.QuickCheck hiding ((.&.))
+import Test.QuickCheck.Monadic (assert, monadicIO, run)
+import qualified Data.ByteString as B
+import Pipes.Text.Internal
+-- Ensure that two potentially bottom values (in the sense of crashing
+-- for some inputs, not looping infinitely) either both crash, or both
+-- give comparable results for some input.
+(=^=) :: (Eq a, Show a) => a -> a -> Bool
+i =^= j = unsafePerformIO $ do
+  x <- try (evaluate i)
+  y <- try (evaluate j)
+  case (x,y) of
+    (Left (_ :: SomeException), Left (_ :: SomeException))
+                       -> return True
+    (Right a, Right b) -> return (a == b)
+    e                  -> trace ("*** Divergence: " ++ show e) return False
+infix 4 =^=
+{-# NOINLINE (=^=) #-}
+-- Do two functions give the same answer?
+eq :: (Eq a, Show a) => (t -> a) -> (t -> a) -> t -> Bool
+eq a b s  = a s =^= b s
+-- What about with the RHS packed?
+-- eqP :: (Eq a, Show a, Stringy s) =>
+--        (String -> a) -> (s -> a) -> String -> Word8 -> Bool
+-- eqP f g s w  = eql "orig" (f s) (g t) &&
+--                eql "mini" (f s) (g mini) &&
+--                eql "head" (f sa) (g ta) &&
+--                eql "tail" (f sb) (g tb)
+--     where t             = packS s
+--           mini          = packSChunkSize 10 s
+--           (sa,sb)       = splitAt m s
+--           (ta,tb)       = splitAtS m t
+--           l             = length s
+--           m | l == 0    = n
+--             | otherwise = n `mod` l
+--           n             = fromIntegral w
+--           eql d a b
+--             | a =^= b   = True
+--             | otherwise = trace (d ++ ": " ++ show a ++ " /= " ++ show b) False
+instance Arbitrary B.ByteString where
+    arbitrary     = B.pack `fmap` arbitrary
+genUnicode :: IsString a => Gen a
+genUnicode = fmap fromString string where
+    string = sized $ \n ->
+        do k <- choose (0,n)
+           sequence [ char | _ <- [1..k] ]
+    excluding :: [a -> Bool] -> Gen a -> Gen a
+    excluding bad gen = loop
+      where
+        loop = do
+          x <- gen
+          if or (map ($ x) bad)
+            then loop
+            else return x
+    reserved = [lowSurrogate, highSurrogate, noncharacter]
+    lowSurrogate c = c >= 0xDC00 && c <= 0xDFFF
+    highSurrogate c = c >= 0xD800 && c <= 0xDBFF
+    noncharacter c = masked == 0xFFFE || masked == 0xFFFF
+      where
+        masked = c .&. 0xFFFF
+    ascii = choose (0,0x7F)
+    plane0 = choose (0xF0, 0xFFFF)
+    plane1 = oneof [ choose (0x10000, 0x10FFF)
+                   , choose (0x11000, 0x11FFF)
+                   , choose (0x12000, 0x12FFF)
+                   , choose (0x13000, 0x13FFF)
+                   , choose (0x1D000, 0x1DFFF)
+                   , choose (0x1F000, 0x1FFFF)
+                   ]
+    plane2 = oneof [ choose (0x20000, 0x20FFF)
+                   , choose (0x21000, 0x21FFF)
+                   , choose (0x22000, 0x22FFF)
+                   , choose (0x23000, 0x23FFF)
+                   , choose (0x24000, 0x24FFF)
+                   , choose (0x25000, 0x25FFF)
+                   , choose (0x26000, 0x26FFF)
+                   , choose (0x27000, 0x27FFF)
+                   , choose (0x28000, 0x28FFF)
+                   , choose (0x29000, 0x29FFF)
+                   , choose (0x2A000, 0x2AFFF)
+                   , choose (0x2B000, 0x2BFFF)
+                   , choose (0x2F000, 0x2FFFF)
+                   ]
+    plane14 = choose (0xE0000, 0xE0FFF)
+    planes = [ascii, plane0, plane1, plane2, plane14]
+    char = chr `fmap` excluding reserved (oneof planes)
author	michaelt <what_is_it_to_do_anything@yahoo.com>	2014-01-14 22:17:25 -0500
committer	michaelt <what_is_it_to_do_anything@yahoo.com>	2014-01-14 22:17:25 -0500
commit	7381c94f47c76833972565ee8d15d86216b214ce (patch)
tree	38ddadda59a3808422fc432d37b886c456adcb1d
parent	ca6f90a05bee6471d6837d629ddaee9b0a75bd50 (diff)
parent	3694350ac7b9c42fd64e0092a74cf0471a080058 (diff)
download	text-pipes-7381c94f47c76833972565ee8d15d86216b214ce.tar.gz text-pipes-7381c94f47c76833972565ee8d15d86216b214ce.tar.zst text-pipes-7381c94f47c76833972565ee8d15d86216b214ce.zip

diff --git a/Pipes/Text/Internal.hs b/Pipes/Text/Internal.hs new file mode 100644 index 0000000..7e5b044 --- /dev/null +++ b/Pipes/Text/Internal.hs
@@ -0,0 +1,163 @@
		1	{-# LANGUAGE BangPatterns, CPP, ForeignFunctionInterface, GeneralizedNewtypeDeriving, MagicHash,
		2	UnliftedFFITypes #-}
		3	-- This module lifts assorted materials from Brian O'Sullivan's text package
		4	-- especially Data.Text.Encoding in order to define a pipes-appropriate
		5	-- streamDecodeUtf8
		6	module Pipes.Text.Internal
		7	( Decoding(..)
		8	, streamDecodeUtf8
		9	, decodeSomeUtf8
		10	) where
		11	import Control.Monad.ST.Unsafe (unsafeIOToST, unsafeSTToIO)
		12	import Control.Monad.ST (ST, runST)
		13	import Data.Bits ((.&.))
		14	import Data.ByteString as B
		15	import Data.ByteString.Internal as B
		16	import qualified Data.Text as T (null)
		17	import Data.Text.Encoding.Error ()
		18	import Data.Text.Internal (Text, textP)
		19	import Foreign.C.Types (CSize)
		20	import Foreign.ForeignPtr (withForeignPtr)
		21	import Foreign.Marshal.Utils (with)
		22	import Foreign.Ptr (Ptr, minusPtr, nullPtr, plusPtr)
		23	import Foreign.Storable (Storable, peek, poke)
		24	import GHC.Base (Char(..), Int(..), MutableByteArray#, ord#, iShiftRA#)
		25	import GHC.Word (Word8, Word32)
		26	import qualified Data.Text.Array as A
		27
		28	#include "pipes_text_cbits.h"
		29
		30	-- \| A stream oriented decoding result.
		31	data Decoding = Some Text ByteString (ByteString -> Decoding)
		32	\| Other Text ByteString
		33	instance Show Decoding where
		34	showsPrec d (Some t bs _) = showParen (d > prec) $
		35	showString "Some " . showsPrec prec' t .
		36	showChar ' ' . showsPrec prec' bs .
		37	showString " _"
		38	where prec = 10; prec' = prec + 1
		39	showsPrec d (Other t bs) = showParen (d > prec) $
		40	showString "Other " . showsPrec prec' t .
		41	showChar ' ' . showsPrec prec' bs .
		42	showString " _"
		43	where prec = 10; prec' = prec + 1
		44
		45	newtype CodePoint = CodePoint Word32 deriving (Eq, Show, Num, Storable)
		46	newtype DecoderState = DecoderState Word32 deriving (Eq, Show, Num, Storable)
		47
		48	streamDecodeUtf8 :: ByteString -> Decoding
		49	streamDecodeUtf8 = decodeChunkUtf8 B.empty 0 0
		50	where
		51	decodeChunkUtf8 :: ByteString -> CodePoint -> DecoderState -> ByteString -> Decoding
		52	decodeChunkUtf8 old codepoint0 state0 bs@(PS fp off len) =
		53	runST $ do marray <- A.new (len+1)
		54	unsafeIOToST (decodeChunkToBuffer marray)
		55	where
		56	decodeChunkToBuffer :: A.MArray s -> IO Decoding
		57	decodeChunkToBuffer dest = withForeignPtr fp $ \ptr ->
		58	with (0::CSize) $ \destOffPtr ->
		59	with codepoint0 $ \codepointPtr ->
		60	with state0 $ \statePtr ->
		61	with nullPtr $ \curPtrPtr ->
		62	do let end = ptr `plusPtr` (off + len)
		63	curPtr = ptr `plusPtr` off
		64	poke curPtrPtr curPtr
		65	c_decode_utf8_with_state (A.maBA dest) destOffPtr curPtrPtr end codepointPtr statePtr
		66	state <- peek statePtr
		67	lastPtr <- peek curPtrPtr
		68	codepoint <- peek codepointPtr
		69	n <- peek destOffPtr
		70	chunkText <- mkText dest n
		71	let left = lastPtr `minusPtr` curPtr
		72	remaining = B.drop left bs
		73	accum = if T.null chunkText then B.append old remaining else remaining
		74	return $! case state of
		75	UTF8_REJECT -> Other chunkText accum -- We encountered an encoding error
		76	_ -> Some chunkText accum (decodeChunkUtf8 accum codepoint state)
		77	{-# INLINE decodeChunkToBuffer #-}
		78	{-# INLINE decodeChunkUtf8 #-}
		79	{-# INLINE streamDecodeUtf8 #-}
		80
		81	decodeSomeUtf8 :: ByteString -> (Text, ByteString)
		82	decodeSomeUtf8 bs@(PS fp off len) = runST $ do
		83	dest <- A.new (len+1)
		84	unsafeIOToST $
		85	withForeignPtr fp $ \ptr ->
		86	with (0::CSize) $ \destOffPtr ->
		87	with (0::CodePoint) $ \codepointPtr ->
		88	with (0::DecoderState) $ \statePtr ->
		89	with nullPtr $ \curPtrPtr ->
		90	do let end = ptr `plusPtr` (off + len)
		91	curPtr = ptr `plusPtr` off
		92	poke curPtrPtr curPtr
		93	c_decode_utf8_with_state (A.maBA dest) destOffPtr
		94	curPtrPtr end codepointPtr statePtr
		95	state <- peek statePtr
		96	lastPtr <- peek curPtrPtr
		97	codepoint <- peek codepointPtr
		98	n <- peek destOffPtr
		99	chunkText <- unsafeSTToIO $ do arr <- A.unsafeFreeze dest
		100	return $! textP arr 0 (fromIntegral n)
		101	let left = lastPtr `minusPtr` curPtr
		102	remaining = B.drop left bs
		103	return $! (chunkText, remaining)
		104	{-# INLINE decodeSomeUtf8 #-}
		105
		106	-- decodeSomeUtf8 :: ByteString -> (Text, ByteString)
		107	-- decodeSomeUtf8 bs@(PS fp off len) =
		108	-- runST $ do marray <- A.new (len+1)
		109	-- unsafeIOToST (decodeChunkToBuffer marray)
		110	--
		111	-- where
		112	-- decodeChunkToBuffer :: A.MArray s -> IO (Text, ByteString)
		113	-- decodeChunkToBuffer dest = withForeignPtr fp $ \ptr ->
		114	-- with (0::CSize) $ \destOffPtr ->
		115	-- with (0::CodePoint) $ \codepointPtr ->
		116	-- with (0::DecoderState) $ \statePtr ->
		117	-- with nullPtr $ \curPtrPtr ->
		118	-- do let end = ptr `plusPtr` (off + len)
		119	-- curPtr = ptr `plusPtr` off
		120	-- poke curPtrPtr curPtr
		121	-- c_decode_utf8_with_state (A.maBA dest) destOffPtr curPtrPtr end codepointPtr statePtr
		122	-- state <- peek statePtr
		123	-- lastPtr <- peek curPtrPtr
		124	-- codepoint <- peek codepointPtr
		125	-- n <- peek destOffPtr
		126	-- chunkText <- unsafeSTToIO $ do arr <- A.unsafeFreeze dest
		127	-- return $! textP arr 0 (fromIntegral n)
		128	-- let left = lastPtr `minusPtr` curPtr
		129	-- remaining = B.drop left bs
		130	-- return $! (chunkText, remaining)
		131	-- {-# INLINE decodeChunkToBuffer #-}
		132	-- {-# INLINE decodeSomeUtf8 #-}
		133
		134
		135
		136	mkText :: A.MArray s -> CSize -> IO Text
		137	mkText dest n = unsafeSTToIO $ do arr <- A.unsafeFreeze dest
		138	return $! textP arr 0 (fromIntegral n)
		139	{-# INLINE mkText #-}
		140
		141	ord :: Char -> Int
		142	ord (C# c#) = I# (ord# c#)
		143	{-# INLINE ord #-}
		144
		145	unsafeWrite :: A.MArray s -> Int -> Char -> ST s Int
		146	unsafeWrite marr i c
		147	\| n < 0x10000 = do A.unsafeWrite marr i (fromIntegral n)
		148	return 1
		149	\| otherwise = do A.unsafeWrite marr i lo
		150	A.unsafeWrite marr (i+1) hi
		151	return 2
		152	where n = ord c
		153	m = n - 0x10000
		154	lo = fromIntegral $ (m `shiftR` 10) + 0xD800
		155	hi = fromIntegral $ (m .&. 0x3FF) + 0xDC00
		156	shiftR (I# x#) (I# i#) = I# (x# `iShiftRA#` i#)
		157	{-# INLINE shiftR #-}
		158	{-# INLINE unsafeWrite #-}
		159
		160	foreign import ccall unsafe "_hs_pipes_text_decode_utf8_state" c_decode_utf8_with_state
		161	:: MutableByteArray# s -> Ptr CSize
		162	-> Ptr (Ptr Word8) -> Ptr Word8
		163	-> Ptr CodePoint -> Ptr DecoderState -> IO (Ptr Word8) \ No newline at end of file


diff --git a/bench/IO.hs b/bench/IO.hs new file mode 100644 index 0000000..b3a52f6 --- /dev/null +++ b/bench/IO.hs
@@ -0,0 +1,20 @@
		1	import qualified Data.Text.IO as T
		2	import qualified Data.Text as T
		3	import qualified Data.Text.Lazy.IO as TL
		4	import qualified Data.Text.Lazy as TL
		5
		6	import Pipes
		7	import qualified Pipes.Text as TP
		8	import qualified Pipes.ByteString as BP
		9	import Pipes.Safe
		10
		11	main = textaction
		12	big = "../../examples/txt/words2.txt"
		13
		14	textaction = T.readFile big >>= T.putStrLn
		15	pipeaction = runEffect $ for ((TP.readFile big) >> return ()) (lift . T.putStrLn)
		16
		17
		18
		19
		20


diff --git a/cbits/cbits.c b/cbits/cbits.c new file mode 100644 index 0000000..c11645b --- /dev/null +++ b/cbits/cbits.c
@@ -0,0 +1,168 @@
		1	/*
		2	* Copyright (c) 2011 Bryan O'Sullivan <bos@serpentine.com>.
		3	*
		4	* Portions copyright (c) 2008-2010 Björn Höhrmann <bjoern@hoehrmann.de>.
		5	*
		6	* See http://bjoern.hoehrmann.de/utf-8/decoder/dfa/ for details.
		7	*/
		8
		9	#include <string.h>
		10	#include <stdint.h>
		11	#include <stdio.h>
		12	#include "pipes_text_cbits.h"
		13
		14
		15
		16	#define UTF8_ACCEPT 0
		17	#define UTF8_REJECT 12
		18
		19	static const uint8_t utf8d[] = {
		20	/*
		21	* The first part of the table maps bytes to character classes that
		22	* to reduce the size of the transition table and create bitmasks.
		23	*/
		24	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
		25	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
		26	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
		27	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
		28	1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,
		29	7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,
		30	8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
		31	10,3,3,3,3,3,3,3,3,3,3,3,3,4,3,3, 11,6,6,6,5,8,8,8,8,8,8,8,8,8,8,8,
		32
		33	/*
		34	* The second part is a transition table that maps a combination of
		35	* a state of the automaton and a character class to a state.
		36	*/
		37	0,12,24,36,60,96,84,12,12,12,48,72, 12,12,12,12,12,12,12,12,12,12,12,12,
		38	12, 0,12,12,12,12,12, 0,12, 0,12,12, 12,24,12,12,12,12,12,24,12,24,12,12,
		39	12,12,12,12,12,12,12,24,12,12,12,12, 12,24,12,12,12,12,12,12,12,24,12,12,
		40	12,12,12,12,12,12,12,36,12,36,12,12, 12,36,12,12,12,12,12,36,12,36,12,12,
		41	12,36,12,12,12,12,12,12,12,12,12,12,
		42	};
		43
		44	static inline uint32_t
		45	decode(uint32_t state, uint32_t codep, uint32_t byte) {
		46	uint32_t type = utf8d[byte];
		47
		48	codep = (state != UTF8_ACCEPT) ?
		49	(byte & 0x3fu) \| (*codep << 6) :
		50	(0xff >> type) & (byte);
		51
		52	return state = utf8d[256 + state + type];
		53	}
		54
		55	/*
		56	* A best-effort decoder. Runs until it hits either end of input or
		57	* the start of an invalid byte sequence.
		58	*
		59	* At exit, we update destoff with the next offset to write to, src
		60	* with the next source location past the last one successfully
		61	* decoded, and return the next source location to read from.
		62	*
		63	* Moreover, we expose the internal decoder state (state0 and
		64	* codepoint0), allowing one to restart the decoder after it
		65	* terminates (say, due to a partial codepoint).
		66	*
		67	* In particular, there are a few possible outcomes,
		68	*
		69	* 1) We decoded the buffer entirely:
		70	* In this case we return srcend
		71	* state0 == UTF8_ACCEPT
		72	*
		73	* 2) We met an invalid encoding
		74	* In this case we return the address of the first invalid byte
		75	* state0 == UTF8_REJECT
		76	*
		77	* 3) We reached the end of the buffer while decoding a codepoint
		78	* In this case we return a pointer to the first byte of the partial codepoint
		79	* state0 != UTF8_ACCEPT, UTF8_REJECT
		80	*
		81	*/
		82
		83	#if defined(__GNUC__) \|\| defined(__clang__)
		84	static inline uint8_t const *
		85	_hs_pipes_text_decode_utf8_int(uint16_t const dest, size_t destoff,
		86	const uint8_t const *src, const uint8_t const srcend,
		87	uint32_t codepoint0, uint32_t state0)
		88	__attribute((always_inline));
		89	#endif
		90
		91	static inline uint8_t const *
		92	_hs_pipes_text_decode_utf8_int(uint16_t const dest, size_t destoff,
		93	const uint8_t const *src, const uint8_t const srcend,
		94	uint32_t codepoint0, uint32_t state0)
		95	{
		96	uint16_t d = dest + destoff;
		97	const uint8_t s = src, last = src;
		98	uint32_t state = *state0;
		99	uint32_t codepoint = *codepoint0;
		100
		101	while (s < srcend) {
		102	#if defined(__i386__) \|\| defined(__x86_64__)
		103	/*
		104	* This code will only work on a little-endian system that
		105	* supports unaligned loads.
		106	*
		107	* It gives a substantial speed win on data that is purely or
		108	* partly ASCII (e.g. HTML), at only a slight cost on purely
		109	* non-ASCII text.
		110	*/
		111
		112	if (state == UTF8_ACCEPT) {
		113	while (s < srcend - 4) {
		114	codepoint = ((uint32_t ) s);
		115	if ((codepoint & 0x80808080) != 0)
		116	break;
		117	s += 4;
		118
		119	/*
		120	* Tried 32-bit stores here, but the extra bit-twiddling
		121	* slowed the code down.
		122	*/
		123
		124	*d++ = (uint16_t) (codepoint & 0xff);
		125	*d++ = (uint16_t) ((codepoint >> 8) & 0xff);
		126	*d++ = (uint16_t) ((codepoint >> 16) & 0xff);
		127	*d++ = (uint16_t) ((codepoint >> 24) & 0xff);
		128	}
		129	last = s;
		130	}
		131	#endif
		132
		133	if (decode(&state, &codepoint, *s++) != UTF8_ACCEPT) {
		134	if (state != UTF8_REJECT)
		135	continue;
		136	break;
		137	}
		138
		139	if (codepoint <= 0xffff)
		140	*d++ = (uint16_t) codepoint;
		141	else {
		142	*d++ = (uint16_t) (0xD7C0 + (codepoint >> 10));
		143	*d++ = (uint16_t) (0xDC00 + (codepoint & 0x3FF));
		144	}
		145	last = s;
		146	}
		147
		148	*destoff = d - dest;
		149	*codepoint0 = codepoint;
		150	*state0 = state;
		151	*src = last;
		152
		153	return s;
		154	}
		155
		156	uint8_t const *
		157	_hs_pipes_text_decode_utf8_state(uint16_t const dest, size_t destoff,
		158	const uint8_t const **src,
		159	const uint8_t const *srcend,
		160	uint32_t codepoint0, uint32_t state0)
		161	{
		162	uint8_t const *ret = _hs_pipes_text_decode_utf8_int(dest, destoff, src, srcend,
		163	codepoint0, state0);
		164	if (*state0 == UTF8_REJECT)
		165	ret -=1;
		166	return ret;
		167	}
		168


diff --git a/include/pipes_text_cbits.h b/include/pipes_text_cbits.h new file mode 100644 index 0000000..b9ab670 --- /dev/null +++ b/include/pipes_text_cbits.h
@@ -0,0 +1,11 @@
		1	/*
		2	* Copyright (c) 2013 Bryan O'Sullivan <bos@serpentine.com>.
		3	*/
		4
		5	#ifndef _pipes_text_cbits_h
		6	#define _pipes_text_cbits_h
		7
		8	#define UTF8_ACCEPT 0
		9	#define UTF8_REJECT 12
		10
		11	#endif


diff --git a/pipes-text.cabal b/pipes-text.cabal index e79f168..b4388be 100644 --- a/pipes-text.cabal +++ b/pipes-text.cabal
@@ -12,7 +12,9 @@ build-type: Simple
12	cabal-version: >=1.10	12	cabal-version: >=1.10
13		13
14	library	14	library
15	exposed-modules: Pipes.Text, Pipes.Text.Parse	15	c-sources: cbits/cbits.c
		16	include-dirs: include
		17	exposed-modules: Pipes.Text, Pipes.Text.Parse, Pipes.Text.Internal
16	-- other-modules:	18	-- other-modules:
17	other-extensions: RankNTypes	19	other-extensions: RankNTypes
18	build-depends: base >= 4 && < 5 ,	20	build-depends: base >= 4 && < 5 ,
@@ -23,6 +25,9 @@ library
23	pipes-bytestring >= 1.0 && < 1.2,	25	pipes-bytestring >= 1.0 && < 1.2,
24	transformers >= 0.3 && < 0.4,	26	transformers >= 0.3 && < 0.4,
25	text >=0.11 && < 0.12,	27	text >=0.11 && < 0.12,
26	bytestring >=0.10 && < 0.11	28	bytestring >=0.10 && < 0.11,
		29	vector,
		30	void
27	-- hs-source-dirs:	31	-- hs-source-dirs:
28	default-language: Haskell2010 \ No newline at end of file	32	default-language: Haskell2010
		33	ghc-options: -O2


diff --git a/test/Test.hs b/test/Test.hs new file mode 100644 index 0000000..373bafb --- /dev/null +++ b/test/Test.hs
@@ -0,0 +1,101 @@
		1	import Utils
		2
		3	import Test.QuickCheck
		4	import Test.QuickCheck.Monadic
		5	import Test.Framework (Test, testGroup, defaultMain)
		6	import Test.Framework.Providers.QuickCheck2 (testProperty)
		7
		8	import Control.Exception (catch)
		9	import Data.Char (chr, isDigit, isHexDigit, isLower, isSpace, isUpper, ord)
		10	import Data.Monoid (Monoid(..))
		11	import Control.Monad
		12	import Data.String (fromString)
		13	import Data.Text.Encoding.Error
		14	import qualified Data.List as L
		15
		16	import qualified Data.Bits as Bits (shiftL, shiftR)
		17	import qualified Data.ByteString as B
		18	import qualified Data.ByteString.Lazy as BL
		19	import qualified Data.Text as T
		20	import qualified Data.Text.Lazy as TL
		21	import qualified Data.Text.Encoding as E
		22	import qualified Pipes.Text.Internal as PE
		23	import qualified Pipes.Text as TP
		24	import qualified Pipes.ByteString as BP
		25	import qualified Pipes as P
		26
		27	main :: IO ()
		28	main = defaultMain [tests]
		29	-- >>> :main -a 10000
		30	tests = testGroup "stream_decode" [
		31	-- testProperty "t_utf8_incr_valid" t_utf8_incr_valid,
		32	testProperty "t_utf8_incr_mixed" t_utf8_incr_mixed ,
		33	testProperty "t_utf8_incr_pipe" t_utf8_incr_pipe,
		34	testProperty "t_utf8_dec_some" t_utf8_dec_some]
		35
		36	t_utf8_incr_valid = do
		37	Positive n <- arbitrary
		38	forAll genUnicode $ recode n `eq` id
		39	where recode n = T.concat . feedChunksOf n PE.streamDecodeUtf8 . E.encodeUtf8
		40	feedChunksOf :: Int -> (B.ByteString -> PE.Decoding) -> B.ByteString
		41	-> [T.Text]
		42	feedChunksOf n f bs
		43	\| B.null bs = []
		44	\| otherwise = let (a,b) = B.splitAt n bs
		45	PE.Some t _ f' = f a
		46	in case f a of
		47	PE.Some t _ f' -> t : feedChunksOf n f' b
		48	_ -> []
		49
		50	t_utf8_incr_mixed = do
		51	Positive n <- arbitrary
		52	txt <- genUnicode
		53	let chunkSize = mod n 7 + 1
		54	forAll (vector 9) $
		55	(roundtrip . chunk chunkSize . appendBytes txt) `eq` (appendBytes txt)
		56	where
		57	roundtrip :: [B.ByteString] -> B.ByteString
		58	roundtrip bss = go PE.streamDecodeUtf8 B.empty bss where
		59	go dec acc [] = acc
		60	go dec acc [bs] = case dec bs of
		61	PE.Some t l dec' -> acc <> E.encodeUtf8 t <> l
		62	PE.Other t bs' -> acc <> E.encodeUtf8 t <> bs'
		63	go dec acc (bs:bss) = case dec bs of
		64	PE.Some t l dec' -> go dec' (acc <> E.encodeUtf8 t) bss
		65	PE.Other t bs' -> acc <> E.encodeUtf8 t <> bs' <> B.concat bss
		66	chunk n bs = let (a,b) = B.splitAt n bs in if B.null a then [] else a : chunk n b
		67	appendBytes txt bts = E.encodeUtf8 txt <> B.pack bts ; (<>) = B.append
		68
		69	t_utf8_incr_pipe = do
		70	Positive m <- arbitrary
		71	Positive n <- arbitrary
		72	txt <- genUnicode
		73	let chunkSize = mod n 7 + 1
		74	bytesLength = mod 10 m
		75	forAll (vector bytesLength) $
		76	(BL.toStrict . BP.toLazy . roundtrip . P.each . chunk chunkSize . appendBytes txt)
		77	`eq`
		78	appendBytes txt
		79	where
		80	roundtrip :: Monad m => P.Producer B.ByteString m r -> P.Producer B.ByteString m r
		81	roundtrip p = join (TP.decodeUtf8 p P.>-> TP.encodeUtf8)
		82	chunk n bs = let (a,b) = B.splitAt n bs in if B.null a then [] else a : chunk n b
		83	appendBytes txt bts = E.encodeUtf8 txt <> B.pack bts ; (<>) = B.append
		84
		85	--
		86	t_utf8_dec_some = do
		87	Positive m <- arbitrary
		88	txt <- genUnicode
		89	let bytesLength = mod 10 m :: Int
		90	forAll (vector bytesLength) $
		91	(roundtrip . appendBytes txt)
		92	`eq`
		93	appendBytes txt
		94	where
		95	roundtrip bs = case PE.decodeSomeUtf8 bs of
		96	(txt,bys) -> E.encodeUtf8 txt <> bys
		97	appendBytes txt bts = E.encodeUtf8 txt <> B.pack bts ; (<>) = B.append
		98
		99
		100
		101


diff --git a/test/Utils.hs b/test/Utils.hs new file mode 100644 index 0000000..75cd1db --- /dev/null +++ b/test/Utils.hs
@@ -0,0 +1,109 @@
		1	{-#LANGUAGE ScopedTypeVariables#-}
		2	module Utils where
		3	import Control.Exception (SomeException, bracket, bracket_, evaluate, try)
		4	import System.IO.Unsafe (unsafePerformIO)
		5	import Debug.Trace (trace)
		6	import Data.Bits ((.&.))
		7	import Data.Char (chr)
		8	import Data.String (IsString, fromString)
		9	import System.Random (Random (..), RandomGen)
		10	import Test.QuickCheck hiding ((.&.))
		11	import Test.QuickCheck.Monadic (assert, monadicIO, run)
		12	import qualified Data.ByteString as B
		13	import Pipes.Text.Internal
		14
		15
		16
		17
		18
		19	-- Ensure that two potentially bottom values (in the sense of crashing
		20	-- for some inputs, not looping infinitely) either both crash, or both
		21	-- give comparable results for some input.
		22	(=^=) :: (Eq a, Show a) => a -> a -> Bool
		23	i =^= j = unsafePerformIO $ do
		24	x <- try (evaluate i)
		25	y <- try (evaluate j)
		26	case (x,y) of
		27	(Left (_ :: SomeException), Left (_ :: SomeException))
		28	-> return True
		29	(Right a, Right b) -> return (a == b)
		30	e -> trace ("*** Divergence: " ++ show e) return False
		31	infix 4 =^=
		32	{-# NOINLINE (=^=) #-}
		33
		34	-- Do two functions give the same answer?
		35	eq :: (Eq a, Show a) => (t -> a) -> (t -> a) -> t -> Bool
		36	eq a b s = a s =^= b s
		37
		38	-- What about with the RHS packed?
		39	-- eqP :: (Eq a, Show a, Stringy s) =>
		40	-- (String -> a) -> (s -> a) -> String -> Word8 -> Bool
		41	-- eqP f g s w = eql "orig" (f s) (g t) &&
		42	-- eql "mini" (f s) (g mini) &&
		43	-- eql "head" (f sa) (g ta) &&
		44	-- eql "tail" (f sb) (g tb)
		45	-- where t = packS s
		46	-- mini = packSChunkSize 10 s
		47	-- (sa,sb) = splitAt m s
		48	-- (ta,tb) = splitAtS m t
		49	-- l = length s
		50	-- m \| l == 0 = n
		51	-- \| otherwise = n `mod` l
		52	-- n = fromIntegral w
		53	-- eql d a b
		54	-- \| a =^= b = True
		55	-- \| otherwise = trace (d ++ ": " ++ show a ++ " /= " ++ show b) False
		56
		57
		58	instance Arbitrary B.ByteString where
		59	arbitrary = B.pack `fmap` arbitrary
		60
		61	genUnicode :: IsString a => Gen a
		62	genUnicode = fmap fromString string where
		63	string = sized $ \n ->
		64	do k <- choose (0,n)
		65	sequence [ char \| _ <- [1..k] ]
		66
		67	excluding :: [a -> Bool] -> Gen a -> Gen a
		68	excluding bad gen = loop
		69	where
		70	loop = do
		71	x <- gen
		72	if or (map ($ x) bad)
		73	then loop
		74	else return x
		75
		76	reserved = [lowSurrogate, highSurrogate, noncharacter]
		77	lowSurrogate c = c >= 0xDC00 && c <= 0xDFFF
		78	highSurrogate c = c >= 0xD800 && c <= 0xDBFF
		79	noncharacter c = masked == 0xFFFE \|\| masked == 0xFFFF
		80	where
		81	masked = c .&. 0xFFFF
		82
		83	ascii = choose (0,0x7F)
		84	plane0 = choose (0xF0, 0xFFFF)
		85	plane1 = oneof [ choose (0x10000, 0x10FFF)
		86	, choose (0x11000, 0x11FFF)
		87	, choose (0x12000, 0x12FFF)
		88	, choose (0x13000, 0x13FFF)
		89	, choose (0x1D000, 0x1DFFF)
		90	, choose (0x1F000, 0x1FFFF)
		91	]
		92	plane2 = oneof [ choose (0x20000, 0x20FFF)
		93	, choose (0x21000, 0x21FFF)
		94	, choose (0x22000, 0x22FFF)
		95	, choose (0x23000, 0x23FFF)
		96	, choose (0x24000, 0x24FFF)
		97	, choose (0x25000, 0x25FFF)
		98	, choose (0x26000, 0x26FFF)
		99	, choose (0x27000, 0x27FFF)
		100	, choose (0x28000, 0x28FFF)
		101	, choose (0x29000, 0x29FFF)
		102	, choose (0x2A000, 0x2AFFF)
		103	, choose (0x2B000, 0x2BFFF)
		104	, choose (0x2F000, 0x2FFFF)
		105	]
		106	plane14 = choose (0xE0000, 0xE0FFF)
		107	planes = [ascii, plane0, plane1, plane2, plane14]
		108
		109	char = chr `fmap` excluding reserved (oneof planes)