aboutsummaryrefslogtreecommitdiffhomepage
path: root/cbits/cbits.c
diff options
context:
space:
mode:
authormichaelt <what_is_it_to_do_anything@yahoo.com>2014-01-14 22:17:25 -0500
committermichaelt <what_is_it_to_do_anything@yahoo.com>2014-01-14 22:17:25 -0500
commit7381c94f47c76833972565ee8d15d86216b214ce (patch)
tree38ddadda59a3808422fc432d37b886c456adcb1d /cbits/cbits.c
parentca6f90a05bee6471d6837d629ddaee9b0a75bd50 (diff)
parent3694350ac7b9c42fd64e0092a74cf0471a080058 (diff)
downloadtext-pipes-7381c94f47c76833972565ee8d15d86216b214ce.tar.gz
text-pipes-7381c94f47c76833972565ee8d15d86216b214ce.tar.zst
text-pipes-7381c94f47c76833972565ee8d15d86216b214ce.zip
merge home made decodeUtf8
Diffstat (limited to 'cbits/cbits.c')
-rw-r--r--cbits/cbits.c168
1 files changed, 168 insertions, 0 deletions
diff --git a/cbits/cbits.c b/cbits/cbits.c
new file mode 100644
index 0000000..c11645b
--- /dev/null
+++ b/cbits/cbits.c
@@ -0,0 +1,168 @@
1/*
2 * Copyright (c) 2011 Bryan O'Sullivan <bos@serpentine.com>.
3 *
4 * Portions copyright (c) 2008-2010 Björn Höhrmann <bjoern@hoehrmann.de>.
5 *
6 * See http://bjoern.hoehrmann.de/utf-8/decoder/dfa/ for details.
7 */
8
9#include <string.h>
10#include <stdint.h>
11#include <stdio.h>
12#include "pipes_text_cbits.h"
13
14
15
16#define UTF8_ACCEPT 0
17#define UTF8_REJECT 12
18
/*
 * Combined lookup table for the UTF-8 decoding automaton.
 *
 * Entries [0, 256) map each input byte to a character class; entries
 * from 256 onwards form the state-transition table, indexed by
 * 256 + state + class (states are stored pre-multiplied by 12, the
 * number of character classes).
 */
static const uint8_t utf8d[] = {
  /*
   * Part one: byte -> character class.  Grouping bytes into classes
   * keeps the transition table small and lets the class double as a
   * shift count for masking the payload bits of a lead byte.
   */
  /* 0x00-0x0f */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
  /* 0x10-0x1f */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
  /* 0x20-0x2f */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
  /* 0x30-0x3f */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
  /* 0x40-0x4f */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
  /* 0x50-0x5f */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
  /* 0x60-0x6f */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
  /* 0x70-0x7f */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
  /* 0x80-0x8f */ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
  /* 0x90-0x9f */ 9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,
  /* 0xa0-0xaf */ 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,
  /* 0xb0-0xbf */ 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,
  /* 0xc0-0xcf */ 8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
  /* 0xd0-0xdf */ 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
  /* 0xe0-0xef */ 10,3,3,3,3,3,3,3,3,3,3,3,3,4,3,3,
  /* 0xf0-0xff */ 11,6,6,6,5,8,8,8,8,8,8,8,8,8,8,8,

  /*
   * Part two: (state, character class) -> next state.  One row of 12
   * entries per state; state 0 is UTF8_ACCEPT and state 12 is
   * UTF8_REJECT.
   */
  /* state  0 */  0,12,24,36,60,96,84,12,12,12,48,72,
  /* state 12 */ 12,12,12,12,12,12,12,12,12,12,12,12,
  /* state 24 */ 12, 0,12,12,12,12,12, 0,12, 0,12,12,
  /* state 36 */ 12,24,12,12,12,12,12,24,12,24,12,12,
  /* state 48 */ 12,12,12,12,12,12,12,24,12,12,12,12,
  /* state 60 */ 12,24,12,12,12,12,12,12,12,24,12,12,
  /* state 72 */ 12,12,12,12,12,12,12,36,12,36,12,12,
  /* state 84 */ 12,36,12,12,12,12,12,36,12,36,12,12,
  /* state 96 */ 12,36,12,12,12,12,12,12,12,12,12,12,
};
43
44static inline uint32_t
45decode(uint32_t *state, uint32_t* codep, uint32_t byte) {
46 uint32_t type = utf8d[byte];
47
48 *codep = (*state != UTF8_ACCEPT) ?
49 (byte & 0x3fu) | (*codep << 6) :
50 (0xff >> type) & (byte);
51
52 return *state = utf8d[256 + *state + type];
53}
54
55/*
56 * A best-effort decoder. Runs until it hits either end of input or
57 * the start of an invalid byte sequence.
58 *
59 * At exit, we update *destoff with the next offset to write to, *src
60 * with the next source location past the last one successfully
61 * decoded, and return the next source location to read from.
62 *
63 * Moreover, we expose the internal decoder state (state0 and
64 * codepoint0), allowing one to restart the decoder after it
65 * terminates (say, due to a partial codepoint).
66 *
67 * In particular, there are a few possible outcomes,
68 *
69 * 1) We decoded the buffer entirely:
70 * In this case we return srcend
71 * state0 == UTF8_ACCEPT
72 *
73 * 2) We met an invalid encoding
74 * In this case we return the address of the first invalid byte
75 * state0 == UTF8_REJECT
76 *
77 * 3) We reached the end of the buffer while decoding a codepoint
78 * In this case we return a pointer to the first byte of the partial codepoint
79 * state0 != UTF8_ACCEPT, UTF8_REJECT
80 *
81 */
82
83 #if defined(__GNUC__) || defined(__clang__)
84 static inline uint8_t const *
85 _hs_pipes_text_decode_utf8_int(uint16_t *const dest, size_t *destoff,
86 const uint8_t const **src, const uint8_t const *srcend,
87 uint32_t *codepoint0, uint32_t *state0)
88 __attribute((always_inline));
89 #endif
90
91static inline uint8_t const *
92_hs_pipes_text_decode_utf8_int(uint16_t *const dest, size_t *destoff,
93 const uint8_t const **src, const uint8_t const *srcend,
94 uint32_t *codepoint0, uint32_t *state0)
95{
96 uint16_t *d = dest + *destoff;
97 const uint8_t *s = *src, *last = *src;
98 uint32_t state = *state0;
99 uint32_t codepoint = *codepoint0;
100
101 while (s < srcend) {
102#if defined(__i386__) || defined(__x86_64__)
103 /*
104 * This code will only work on a little-endian system that
105 * supports unaligned loads.
106 *
107 * It gives a substantial speed win on data that is purely or
108 * partly ASCII (e.g. HTML), at only a slight cost on purely
109 * non-ASCII text.
110 */
111
112 if (state == UTF8_ACCEPT) {
113 while (s < srcend - 4) {
114 codepoint = *((uint32_t *) s);
115 if ((codepoint & 0x80808080) != 0)
116 break;
117 s += 4;
118
119 /*
120 * Tried 32-bit stores here, but the extra bit-twiddling
121 * slowed the code down.
122 */
123
124 *d++ = (uint16_t) (codepoint & 0xff);
125 *d++ = (uint16_t) ((codepoint >> 8) & 0xff);
126 *d++ = (uint16_t) ((codepoint >> 16) & 0xff);
127 *d++ = (uint16_t) ((codepoint >> 24) & 0xff);
128 }
129 last = s;
130 }
131#endif
132
133 if (decode(&state, &codepoint, *s++) != UTF8_ACCEPT) {
134 if (state != UTF8_REJECT)
135 continue;
136 break;
137 }
138
139 if (codepoint <= 0xffff)
140 *d++ = (uint16_t) codepoint;
141 else {
142 *d++ = (uint16_t) (0xD7C0 + (codepoint >> 10));
143 *d++ = (uint16_t) (0xDC00 + (codepoint & 0x3FF));
144 }
145 last = s;
146 }
147
148 *destoff = d - dest;
149 *codepoint0 = codepoint;
150 *state0 = state;
151 *src = last;
152
153 return s;
154}
155
156uint8_t const *
157_hs_pipes_text_decode_utf8_state(uint16_t *const dest, size_t *destoff,
158 const uint8_t const **src,
159 const uint8_t const *srcend,
160 uint32_t *codepoint0, uint32_t *state0)
161{
162 uint8_t const *ret = _hs_pipes_text_decode_utf8_int(dest, destoff, src, srcend,
163 codepoint0, state0);
164 if (*state0 == UTF8_REJECT)
165 ret -=1;
166 return ret;
167}
168