1 files changed, 65 insertions, 48 deletions
diff --git a/cbits/cbits.c b/cbits/cbits.c
index e0fdfd5..c11645b 100644
--- a/cbits/cbits.c
+++ b/cbits/cbits.c
@@ -79,30 +79,38 @@ decode(uint32_t *state, uint32_t* codep, uint32_t byte) {
 *      state0 != UTF8_ACCEPT, UTF8_REJECT
 *
 */
-const uint8_t *
-_hs_pipes_text_decode_utf8_state(uint16_t *const dest, size_t *destoff,
+ #if defined(__GNUC__) || defined(__clang__)
-                           const uint8_t **const src,
+ static inline uint8_t const *
-                           const uint8_t *const srcend,
+ _hs_pipes_text_decode_utf8_int(uint16_t *const dest, size_t *destoff,
-                           uint32_t *codepoint0, uint32_t *state0)
+                         const uint8_t const **src, const uint8_t const *srcend,
+                         uint32_t *codepoint0, uint32_t *state0)
+   __attribute((always_inline));
+ #endif
+static inline uint8_t const *
+_hs_pipes_text_decode_utf8_int(uint16_t *const dest, size_t *destoff,
+                         const uint8_t const **src, const uint8_t const *srcend,
+                         uint32_t *codepoint0, uint32_t *state0)
 {
-  uint16_t *d = dest + *destoff;
+ uint16_t *d = dest + *destoff;
-  const uint8_t *s = *src, *last = *src;
+ const uint8_t *s = *src, *last = *src;
-  uint32_t state = *state0;
+ uint32_t state = *state0;
-  uint32_t codepoint = *codepoint0;
+ uint32_t codepoint = *codepoint0;
-  while (s < srcend) {
+ while (s < srcend) {
 #if defined(__i386__) || defined(__x86_64__)
-    /*
+   /*
-     * This code will only work on a little-endian system that
+    * This code will only work on a little-endian system that
-     * supports unaligned loads.
+    * supports unaligned loads.
-     *
+    *
-     * It gives a substantial speed win on data that is purely or
+    * It gives a substantial speed win on data that is purely or
-     * partly ASCII (e.g. HTML), at only a slight cost on purely
+    * partly ASCII (e.g. HTML), at only a slight cost on purely
-     * non-ASCII text.
+    * non-ASCII text.
-     */
+    */
-    if (state == UTF8_ACCEPT) {
+   if (state == UTF8_ACCEPT) {
-      while (s < srcend - 4) {
+     while (s < srcend - 4) {
        codepoint = *((uint32_t *) s);
        if ((codepoint & 0x80808080) != 0)
          break;
@@ -117,35 +125,44 @@ _hs_pipes_text_decode_utf8_state(uint16_t *const dest, size_t *destoff,
        *d++ = (uint16_t) ((codepoint >> 8) & 0xff);
        *d++ = (uint16_t) ((codepoint >> 16) & 0xff);
        *d++ = (uint16_t) ((codepoint >> 24) & 0xff);
-      }
+     }
-      last = s;
+     last = s;
-    }
+   }
 #endif
-    if (decode(&state, &codepoint, *s++) != UTF8_ACCEPT) {
+   if (decode(&state, &codepoint, *s++) != UTF8_ACCEPT) {
-      if (state != UTF8_REJECT)
+     if (state != UTF8_REJECT)
        continue;
-      break;
+     break;
-    }
+   }
-    if (codepoint <= 0xffff)
+   if (codepoint <= 0xffff)
-      *d++ = (uint16_t) codepoint;
+     *d++ = (uint16_t) codepoint;
-    else {
+   else {
-      *d++ = (uint16_t) (0xD7C0 + (codepoint >> 10));
+     *d++ = (uint16_t) (0xD7C0 + (codepoint >> 10));
-      *d++ = (uint16_t) (0xDC00 + (codepoint & 0x3FF));
+     *d++ = (uint16_t) (0xDC00 + (codepoint & 0x3FF));
-    }
+   }
-    last = s;
+   last = s;
-  }
+ }
-  /* Invalid encoding, back up to the errant character */
+ *destoff = d - dest;
-  if (state == UTF8_REJECT)
+ *codepoint0 = codepoint;
-    s -= 1;
+ *state0 = state;
+ *src = last;
-  *destoff = d - dest;
-  *codepoint0 = codepoint;
+ return s;
-  *state0 = state;
+}
-  *src = last;
+uint8_t const *
-  return s;
+_hs_pipes_text_decode_utf8_state(uint16_t *const dest, size_t *destoff,
+                          const uint8_t const **src,
+                           const uint8_t const *srcend,
+                          uint32_t *codepoint0, uint32_t *state0)
+{
+ uint8_t const *ret = _hs_pipes_text_decode_utf8_int(dest, destoff, src, srcend,
+                                                codepoint0, state0);
+ if (*state0 == UTF8_REJECT)
+   ret -=1;
+ return ret;
 }

diff --git a/cbits/cbits.c b/cbits/cbits.c
index e0fdfd5..c11645b 100644
--- a/cbits/cbits.c
+++ b/cbits/cbits.c
@@ -79,30 +79,38 @@ decode(uint32_t *state, uint32_t* codep, uint32_t byte) {
79	* state0 != UTF8_ACCEPT, UTF8_REJECT	79	* state0 != UTF8_ACCEPT, UTF8_REJECT
80	*	80	*
81	*/	81	*/
82	const uint8_t *	82
83	_hs_pipes_text_decode_utf8_state(uint16_t *const dest, size_t *destoff,	83	#if defined(__GNUC__) || defined(__clang__)
84	const uint8_t **const src,	84	static inline uint8_t const *
85	const uint8_t *const srcend,	85	_hs_pipes_text_decode_utf8_int(uint16_t *const dest, size_t *destoff,
86	uint32_t *codepoint0, uint32_t *state0)	86	const uint8_t const **src, const uint8_t const *srcend,
		87	uint32_t *codepoint0, uint32_t *state0)
		88	__attribute((always_inline));
		89	#endif
		90
		91	static inline uint8_t const *
		92	_hs_pipes_text_decode_utf8_int(uint16_t *const dest, size_t *destoff,
		93	const uint8_t const **src, const uint8_t const *srcend,
		94	uint32_t *codepoint0, uint32_t *state0)
87	{	95	{
88	uint16_t *d = dest + *destoff;	96	uint16_t *d = dest + *destoff;
89	const uint8_t *s = *src, *last = *src;	97	const uint8_t *s = *src, *last = *src;
90	uint32_t state = *state0;	98	uint32_t state = *state0;
91	uint32_t codepoint = *codepoint0;	99	uint32_t codepoint = *codepoint0;
92		100
93	while (s < srcend) {	101	while (s < srcend) {
94	#if defined(__i386__) || defined(__x86_64__)	102	#if defined(__i386__) || defined(__x86_64__)
95	/*	103	/*
96	* This code will only work on a little-endian system that	104	* This code will only work on a little-endian system that
97	* supports unaligned loads.	105	* supports unaligned loads.
98	*	106	*
99	* It gives a substantial speed win on data that is purely or	107	* It gives a substantial speed win on data that is purely or
100	* partly ASCII (e.g. HTML), at only a slight cost on purely	108	* partly ASCII (e.g. HTML), at only a slight cost on purely
101	* non-ASCII text.	109	* non-ASCII text.
102	*/	110	*/
103		111
104	if (state == UTF8_ACCEPT) {	112	if (state == UTF8_ACCEPT) {
105	while (s < srcend - 4) {	113	while (s < srcend - 4) {
106	codepoint = *((uint32_t *) s);	114	codepoint = *((uint32_t *) s);
107	if ((codepoint & 0x80808080) != 0)	115	if ((codepoint & 0x80808080) != 0)
108	break;	116	break;
@@ -117,35 +125,44 @@ _hs_pipes_text_decode_utf8_state(uint16_t *const dest, size_t *destoff,
117	*d++ = (uint16_t) ((codepoint >> 8) & 0xff);	125	*d++ = (uint16_t) ((codepoint >> 8) & 0xff);
118	*d++ = (uint16_t) ((codepoint >> 16) & 0xff);	126	*d++ = (uint16_t) ((codepoint >> 16) & 0xff);
119	*d++ = (uint16_t) ((codepoint >> 24) & 0xff);	127	*d++ = (uint16_t) ((codepoint >> 24) & 0xff);
120	}	128	}
121	last = s;	129	last = s;
122	}	130	}
123	#endif	131	#endif
124		132
125	if (decode(&state, &codepoint, *s++) != UTF8_ACCEPT) {	133	if (decode(&state, &codepoint, *s++) != UTF8_ACCEPT) {
126	if (state != UTF8_REJECT)	134	if (state != UTF8_REJECT)
127	continue;	135	continue;
128	break;	136	break;
129	}	137	}
130		138
131	if (codepoint <= 0xffff)	139	if (codepoint <= 0xffff)
132	*d++ = (uint16_t) codepoint;	140	*d++ = (uint16_t) codepoint;
133	else {	141	else {
134	*d++ = (uint16_t) (0xD7C0 + (codepoint >> 10));	142	*d++ = (uint16_t) (0xD7C0 + (codepoint >> 10));
135	*d++ = (uint16_t) (0xDC00 + (codepoint & 0x3FF));	143	*d++ = (uint16_t) (0xDC00 + (codepoint & 0x3FF));
136	}	144	}
137	last = s;	145	last = s;
138	}	146	}
139		147
140	/* Invalid encoding, back up to the errant character */	148	*destoff = d - dest;
141	if (state == UTF8_REJECT)	149	*codepoint0 = codepoint;
142	s -= 1;	150	*state0 = state;
143		151	*src = last;
144	*destoff = d - dest;	152
145	*codepoint0 = codepoint;	153	return s;
146	*state0 = state;	154	}
147	*src = last;	155
148		156	uint8_t const *
149	return s;	157	_hs_pipes_text_decode_utf8_state(uint16_t *const dest, size_t *destoff,
		158	const uint8_t const **src,
		159	const uint8_t const *srcend,
		160	uint32_t *codepoint0, uint32_t *state0)
		161	{
		162	uint8_t const *ret = _hs_pipes_text_decode_utf8_int(dest, destoff, src, srcend,
		163	codepoint0, state0);
		164	if (*state0 == UTF8_REJECT)
		165	ret -=1;
		166	return ret;
150	}	167	}
151		168