merge home made decodeUtf8

author: michaelt <what_is_it_to_do_anything@yahoo.com> 2014-01-14 22:17:25 -0500
committer: michaelt <what_is_it_to_do_anything@yahoo.com> 2014-01-14 22:17:25 -0500
commit: 7381c94f47c76833972565ee8d15d86216b214ce (patch)
tree: 38ddadda59a3808422fc432d37b886c456adcb1d /cbits/cbits.c
parent: ca6f90a05bee6471d6837d629ddaee9b0a75bd50 (diff)
parent: 3694350ac7b9c42fd64e0092a74cf0471a080058 (diff)
download: text-pipes-7381c94f47c76833972565ee8d15d86216b214ce.tar.gz
text-pipes-7381c94f47c76833972565ee8d15d86216b214ce.tar.zst
text-pipes-7381c94f47c76833972565ee8d15d86216b214ce.zip
1 files changed, 168 insertions, 0 deletions
diff --git a/cbits/cbits.c b/cbits/cbits.c
new file mode 100644
index 0000000..c11645b
--- /dev/null
+++ b/cbits/cbits.c
@@ -0,0 +1,168 @@
+/*
+ * Copyright (c) 2011 Bryan O'Sullivan <bos@serpentine.com>.
+ *
+ * Portions copyright (c) 2008-2010 Björn Höhrmann <bjoern@hoehrmann.de>.
+ *
+ * See http://bjoern.hoehrmann.de/utf-8/decoder/dfa/ for details.
+ */
+#include <string.h>
+#include <stdint.h>
+#include <stdio.h>
+#include "pipes_text_cbits.h"
+#define UTF8_ACCEPT 0
+#define UTF8_REJECT 12
+static const uint8_t utf8d[] = {
+  /*
+   * The first part of the table (256 entries, indexed by byte value)
+   * maps bytes to character classes (0..11), which reduces the size of
+   * the transition table; decode() also uses the class number as the
+   * shift count of its first-byte bitmask (0xff >> class).
+   */
+   0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+   0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+   0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+   0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+   1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,  9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,
+   7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,  7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,
+   8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2,  2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
+  10,3,3,3,3,3,3,3,3,3,3,3,3,4,3,3, 11,6,6,6,5,8,8,8,8,8,8,8,8,8,8,8,
+  /*
+   * The second part is a transition table that maps a combination of
+   * a state of the automaton and a character class to a state.
+   *
+   * Each state owns 12 consecutive entries (one per class) and states
+   * are stored as row offsets (0, 12, 24, ...), so the lookup is
+   * utf8d[256 + state + class]. Offset 0 is UTF8_ACCEPT and offset 12
+   * is UTF8_REJECT; every entry of the UTF8_REJECT row is 12 again.
+   */
+   0,12,24,36,60,96,84,12,12,12,48,72, 12,12,12,12,12,12,12,12,12,12,12,12,
+  12, 0,12,12,12,12,12, 0,12, 0,12,12, 12,24,12,12,12,12,12,24,12,24,12,12,
+  12,12,12,12,12,12,12,24,12,12,12,12, 12,24,12,12,12,12,12,12,12,24,12,12,
+  12,12,12,12,12,12,12,36,12,36,12,12, 12,36,12,12,12,12,12,36,12,36,12,12,
+  12,36,12,12,12,12,12,12,12,12,12,12,
+};
+static inline uint32_t
+decode(uint32_t *state, uint32_t* codep, uint32_t byte) { /*
+   * Feed one input byte to the UTF-8 automaton.
+   *
+   * state  in/out: current automaton state (a row offset into the
+   *        transition part of utf8d); overwritten with the new state.
+   * codep  in/out: codepoint accumulated so far; overwritten.
+   * byte   the input byte. It indexes utf8d directly, so it is
+   *        expected to be in 0..255 (the caller in this file passes a
+   *        dereferenced uint8_t pointer).
+   *
+   * Returns the new state: UTF8_ACCEPT when this byte completed a
+   * codepoint (then available in *codep), UTF8_REJECT when the
+   * transition table rejects the byte, any other value when more
+   * bytes are needed.
+   */
+  uint32_t type = utf8d[byte];          /* character class of this byte */
+  *codep = (*state != UTF8_ACCEPT) ?
+    (byte & 0x3fu) | (*codep << 6) :    /* mid-sequence: append the low 6 bits */
+    (0xff >> type) & (byte);            /* start of a sequence: keep the bits selected by the class-derived mask */
+  return *state = utf8d[256 + *state + type];   /* row = old state, column = class */
+}
+/*
+ * A best-effort decoder. Runs until it hits either end of input or
+ * the start of an invalid byte sequence.
+ *
+ * Parameters:
+ *   dest, destoff       UTF-16 output buffer and the index (*destoff)
+ *                       of the next code unit to write. No capacity is
+ *                       passed in and the writes are not bounds-checked
+ *                       here.
+ *   src, srcend         *src is the first input byte to read; srcend is
+ *                       one past the last.
+ *   codepoint0, state0  decoder state read on entry and written back on
+ *                       exit.
+ *
+ * At exit, we update *destoff with the next offset to write to, *src
+ * with the next source location past the last one successfully
+ * decoded, and return the next source location to read from.
+ *
+ * Moreover, we expose the internal decoder state (state0 and
+ * codepoint0), allowing one to restart the decoder after it
+ * terminates (say, due to a partial codepoint).
+ *
+ * In particular, there are a few possible outcomes,
+ *
+ *   1) We decoded the buffer entirely:
+ *      In this case we return srcend
+ *      state0 == UTF8_ACCEPT
+ *
+ *   2) We met an invalid encoding
+ *      This helper returns one past the byte on which the automaton
+ *      rejected; the exported _hs_pipes_text_decode_utf8_state wrapper
+ *      steps back by one so that its callers get the address of that
+ *      byte. *src stays just past the last codepoint completed in this
+ *      call (unchanged if none was).
+ *      state0 == UTF8_REJECT
+ *
+ *   3) We reached the end of the buffer while decoding a codepoint
+ *      In this case we return srcend, and *src is left at the first
+ *      byte of the partial codepoint within this buffer (unchanged if
+ *      no codepoint was completed in this call).
+ *      state0 != UTF8_ACCEPT, UTF8_REJECT
+ *
+ * On GCC and Clang the forward declaration below marks the function
+ * always_inline.
+ */
+ #if defined(__GNUC__) || defined(__clang__)
+ static inline uint8_t const *
+ _hs_pipes_text_decode_utf8_int(uint16_t *const dest, size_t *destoff,
+                         const uint8_t const **src, const uint8_t const *srcend,
+                         uint32_t *codepoint0, uint32_t *state0)
+   __attribute((always_inline));
+ #endif
+static inline uint8_t const *
+_hs_pipes_text_decode_utf8_int(uint16_t *const dest, size_t *destoff,
+                         const uint8_t const **src, const uint8_t const *srcend,
+                         uint32_t *codepoint0, uint32_t *state0)
+{
+ uint16_t *d = dest + *destoff;           /* next UTF-16 code unit to write */
+ const uint8_t *s = *src, *last = *src;   /* s: read cursor; last: just past the last complete codepoint */
+ uint32_t state = *state0;
+ uint32_t codepoint = *codepoint0;
+ while (s < srcend) {
+#if defined(__i386__) || defined(__x86_64__)
+   /*
+    * This code will only work on a little-endian system that
+    * supports unaligned loads.
+    *
+    * It gives a substantial speed win on data that is purely or
+    * partly ASCII (e.g. HTML), at only a slight cost on purely
+    * non-ASCII text.
+    *
+    * NOTE(review): the load below reads the uint8_t input through a
+    * uint32_t pointer cast, and srcend - 4 may point before the start
+    * of the input when fewer than four bytes precede srcend. A 4-byte
+    * memcpy into codepoint and a (srcend - s > 4) test look equivalent
+    * without relying on either -- confirm before changing.
+    */
+   if (state == UTF8_ACCEPT) {
+     while (s < srcend - 4) {
+        codepoint = *((uint32_t *) s);
+        if ((codepoint & 0x80808080) != 0)   /* some byte is >= 0x80: leave the fast path */
+          break;
+        s += 4;
+        /*
+         * Tried 32-bit stores here, but the extra bit-twiddling
+         * slowed the code down.
+         */
+        *d++ = (uint16_t) (codepoint & 0xff);           /* lowest byte first (little-endian load) */
+        *d++ = (uint16_t) ((codepoint >> 8) & 0xff);
+        *d++ = (uint16_t) ((codepoint >> 16) & 0xff);
+        *d++ = (uint16_t) ((codepoint >> 24) & 0xff);
+     }
+     last = s;   /* everything consumed by the ASCII run is complete */
+   }
+#endif
+   if (decode(&state, &codepoint, *s++) != UTF8_ACCEPT) {   /* consumes exactly one byte */
+     if (state != UTF8_REJECT)
+        continue;   /* mid-sequence: more bytes needed */
+     break;         /* rejected; s is already one past the offending byte */
+   }
+   if (codepoint <= 0xffff)
+     *d++ = (uint16_t) codepoint;
+   else {   /* above 0xffff: written as two UTF-16 code units */
+     *d++ = (uint16_t) (0xD7C0 + (codepoint >> 10));   /* 0xD7C0 == 0xD800 - (0x10000 >> 10) */
+     *d++ = (uint16_t) (0xDC00 + (codepoint & 0x3FF));
+   }
+   last = s;   /* a whole codepoint has been written */
+ }
+ *destoff = d - dest;   /* measured in uint16_t units */
+ *codepoint0 = codepoint;
+ *state0 = state;
+ *src = last;
+ return s;
+}
+uint8_t const *
+_hs_pipes_text_decode_utf8_state(uint16_t *const dest, size_t *destoff,
+                          const uint8_t const **src,
+                           const uint8_t const *srcend,
+                          uint32_t *codepoint0, uint32_t *state0)
+{ /*
+   * Non-static entry point around _hs_pipes_text_decode_utf8_int; it
+   * takes the same arguments and forwards them unchanged, so *destoff,
+   * *src, *codepoint0 and *state0 are all written by the inner call.
+   *
+   * Returns the inner call's result, except that when the decoder
+   * stopped in UTF8_REJECT the result is moved back by one byte: the
+   * inner loop has already stepped past the byte it rejected, and this
+   * makes the return value address that byte instead.
+   */
+ uint8_t const *ret = _hs_pipes_text_decode_utf8_int(dest, destoff, src, srcend,
+                                                codepoint0, state0);
+ if (*state0 == UTF8_REJECT)
+   ret -=1;   /* back onto the rejected byte */
+ return ret;
+}
author	michaelt <what_is_it_to_do_anything@yahoo.com>	2014-01-14 22:17:25 -0500
committer	michaelt <what_is_it_to_do_anything@yahoo.com>	2014-01-14 22:17:25 -0500
commit	7381c94f47c76833972565ee8d15d86216b214ce (patch)
tree	38ddadda59a3808422fc432d37b886c456adcb1d /cbits/cbits.c
parent	ca6f90a05bee6471d6837d629ddaee9b0a75bd50 (diff)
parent	3694350ac7b9c42fd64e0092a74cf0471a080058 (diff)
download	text-pipes-7381c94f47c76833972565ee8d15d86216b214ce.tar.gz text-pipes-7381c94f47c76833972565ee8d15d86216b214ce.tar.zst text-pipes-7381c94f47c76833972565ee8d15d86216b214ce.zip

diff --git a/cbits/cbits.c b/cbits/cbits.c new file mode 100644 index 0000000..c11645b --- /dev/null +++ b/cbits/cbits.c
@@ -0,0 +1,168 @@
	1	/*
	2	* Copyright (c) 2011 Bryan O'Sullivan <bos@serpentine.com>.
	3	*
	4	* Portions copyright (c) 2008-2010 Björn Höhrmann <bjoern@hoehrmann.de>.
	5	*
	6	* See http://bjoern.hoehrmann.de/utf-8/decoder/dfa/ for details.
	7	*/
	8
	9	#include <string.h>
	10	#include <stdint.h>
	11	#include <stdio.h>
	12	#include "pipes_text_cbits.h"
	13
	14
	15
	16	#define UTF8_ACCEPT 0
	17	#define UTF8_REJECT 12
	18
	19	static const uint8_t utf8d[] = {
	20	/*
	21	* The first part of the table (256 entries, indexed by byte value) maps bytes to character classes
	22	* to reduce the size of the transition table and create bitmasks.
	23	*/
	24	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
	25	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
	26	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
	27	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
	28	1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,
	29	7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,
	30	8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
	31	10,3,3,3,3,3,3,3,3,3,3,3,3,4,3,3, 11,6,6,6,5,8,8,8,8,8,8,8,8,8,8,8,
	32
	33	/*
	34	* The second part is a transition table that maps a combination of
	35	* a state of the automaton and a character class to a state (12 entries per state; states are stored as row offsets 0, 12, 24, ...).
	36	*/
	37	0,12,24,36,60,96,84,12,12,12,48,72, 12,12,12,12,12,12,12,12,12,12,12,12,
	38	12, 0,12,12,12,12,12, 0,12, 0,12,12, 12,24,12,12,12,12,12,24,12,24,12,12,
	39	12,12,12,12,12,12,12,24,12,12,12,12, 12,24,12,12,12,12,12,12,12,24,12,12,
	40	12,12,12,12,12,12,12,36,12,36,12,12, 12,36,12,12,12,12,12,36,12,36,12,12,
	41	12,36,12,12,12,12,12,12,12,12,12,12,
	42	};
	43	/* Feed one byte to the UTF-8 automaton: updates *state and *codep in place and returns the new state. */
	44	static inline uint32_t
	45	decode(uint32_t *state, uint32_t* codep, uint32_t byte) {
	46	  uint32_t type = utf8d[byte];  /* character class of this byte */
	47	  /* Mid-sequence: append the low 6 bits. Otherwise: keep the bits selected by the class-derived mask. */
	48	  *codep = (*state != UTF8_ACCEPT) ?
	49	    (byte & 0x3fu) | (*codep << 6) :
	50	    (0xff >> type) & (byte);
	51	  /* Row = old state, column = class; the stored state is also the return value. */
	52	  return *state = utf8d[256 + *state + type];
	53	}
	54
	55	/*
	56	 * A best-effort decoder. Runs until it hits either end of input or
	57	 * the start of an invalid byte sequence.
	58	 *
	59	 * At exit, we update *destoff with the next offset to write to, *src
	60	 * with the next source location past the last one successfully
	61	 * decoded, and return the next source location to read from.
	62	 *
	63	 * Moreover, we expose the internal decoder state (state0 and
	64	 * codepoint0), allowing one to restart the decoder after it
	65	 * terminates (say, due to a partial codepoint).
	66	 *
	67	 * In particular, there are a few possible outcomes,
	68	 *
	69	 *   1) We decoded the buffer entirely:
	70	 *      In this case we return srcend
	71	 *      state0 == UTF8_ACCEPT
	72	 *
	73	 *   2) We met an invalid encoding
	74	 *      In this case this helper returns one past the rejecting byte (the exported wrapper steps back onto that byte)
	75	 *      state0 == UTF8_REJECT
	76	 *
	77	 *   3) We reached the end of the buffer while decoding a codepoint
	78	 *      In this case we return srcend and *src is left at the first byte of the partial codepoint
	79	 *      state0 != UTF8_ACCEPT, UTF8_REJECT
	80	 *
	81	 */
	82
	83	 #if defined(__GNUC__) || defined(__clang__)
	84	 static inline uint8_t const *
	85	 _hs_pipes_text_decode_utf8_int(uint16_t *const dest, size_t *destoff,
	86	                         const uint8_t const **src, const uint8_t const *srcend,
	87	                         uint32_t *codepoint0, uint32_t *state0)
	88	   __attribute((always_inline));
	89	 #endif
	90
	91	static inline uint8_t const *
	92	_hs_pipes_text_decode_utf8_int(uint16_t *const dest, size_t *destoff,
	93	                         const uint8_t const **src, const uint8_t const *srcend,
	94	                         uint32_t *codepoint0, uint32_t *state0)
	95	{
	96	 uint16_t *d = dest + *destoff;           /* next UTF-16 code unit to write */
	97	 const uint8_t *s = *src, *last = *src;   /* s: read cursor; last: just past the last complete codepoint */
	98	 uint32_t state = *state0;
	99	 uint32_t codepoint = *codepoint0;
	100
	101	 while (s < srcend) {
	102	#if defined(__i386__) || defined(__x86_64__)
	103	   /*
	104	    * This code will only work on a little-endian system that
	105	    * supports unaligned loads.
	106	    *
	107	    * It gives a substantial speed win on data that is purely or
	108	    * partly ASCII (e.g. HTML), at only a slight cost on purely
	109	    * non-ASCII text.
	110	    */
	111
	112	   if (state == UTF8_ACCEPT) {
	113	     while (s < srcend - 4) {
	114	        codepoint = *((uint32_t *) s);
	115	        if ((codepoint & 0x80808080) != 0)   /* some byte is >= 0x80: leave the fast path */
	116	          break;
	117	        s += 4;
	118
	119	        /*
	120	         * Tried 32-bit stores here, but the extra bit-twiddling
	121	         * slowed the code down.
	122	         */
	123
	124	        *d++ = (uint16_t) (codepoint & 0xff);
	125	        *d++ = (uint16_t) ((codepoint >> 8) & 0xff);
	126	        *d++ = (uint16_t) ((codepoint >> 16) & 0xff);
	127	        *d++ = (uint16_t) ((codepoint >> 24) & 0xff);
	128	     }
	129	     last = s;   /* everything consumed by the ASCII run is complete */
	130	   }
	131	#endif
	132
	133	   if (decode(&state, &codepoint, *s++) != UTF8_ACCEPT) {
	134	     if (state != UTF8_REJECT)
	135	        continue;   /* mid-sequence: more bytes needed */
	136	     break;         /* rejected; s is already one past the offending byte */
	137	   }
	138
	139	   if (codepoint <= 0xffff)
	140	     *d++ = (uint16_t) codepoint;
	141	   else {   /* above 0xffff: written as two UTF-16 code units */
	142	     *d++ = (uint16_t) (0xD7C0 + (codepoint >> 10));
	143	     *d++ = (uint16_t) (0xDC00 + (codepoint & 0x3FF));
	144	   }
	145	   last = s;
	146	 }
	147
	148	 *destoff = d - dest;
	149	 *codepoint0 = codepoint;
	150	 *state0 = state;
	151	 *src = last;
	152
	153	 return s;
	154	}
	155	/* Non-static wrapper: forwards to _hs_pipes_text_decode_utf8_int and, on UTF8_REJECT, moves the result back onto the rejected byte. */
	156	uint8_t const *
	157	_hs_pipes_text_decode_utf8_state(uint16_t *const dest, size_t *destoff,
	158	                          const uint8_t const **src,
	159	                           const uint8_t const *srcend,
	160	                          uint32_t *codepoint0, uint32_t *state0)
	161	{
	162	 uint8_t const *ret = _hs_pipes_text_decode_utf8_int(dest, destoff, src, srcend,
	163	                                                codepoint0, state0);
	164	 if (*state0 == UTF8_REJECT)
	165	   ret -=1;   /* the inner loop has already stepped past the byte it rejected */
	166	 return ret;
	167	}
	168