diff options
author | michaelt <what_is_it_to_do_anything@yahoo.com> | 2014-01-14 22:17:25 -0500 |
---|---|---|
committer | michaelt <what_is_it_to_do_anything@yahoo.com> | 2014-01-14 22:17:25 -0500 |
commit | 7381c94f47c76833972565ee8d15d86216b214ce (patch) | |
tree | 38ddadda59a3808422fc432d37b886c456adcb1d /cbits/cbits.c | |
parent | ca6f90a05bee6471d6837d629ddaee9b0a75bd50 (diff) | |
parent | 3694350ac7b9c42fd64e0092a74cf0471a080058 (diff) | |
download | text-pipes-7381c94f47c76833972565ee8d15d86216b214ce.tar.gz text-pipes-7381c94f47c76833972565ee8d15d86216b214ce.tar.zst text-pipes-7381c94f47c76833972565ee8d15d86216b214ce.zip |
merge home made decodeUtf8
Diffstat (limited to 'cbits/cbits.c')
-rw-r--r-- | cbits/cbits.c | 168 |
1 files changed, 168 insertions, 0 deletions
diff --git a/cbits/cbits.c b/cbits/cbits.c new file mode 100644 index 0000000..c11645b --- /dev/null +++ b/cbits/cbits.c | |||
@@ -0,0 +1,168 @@ | |||
1 | /* | ||
2 | * Copyright (c) 2011 Bryan O'Sullivan <bos@serpentine.com>. | ||
3 | * | ||
4 | * Portions copyright (c) 2008-2010 Björn Höhrmann <bjoern@hoehrmann.de>. | ||
5 | * | ||
6 | * See http://bjoern.hoehrmann.de/utf-8/decoder/dfa/ for details. | ||
7 | */ | ||
8 | |||
9 | #include <string.h> | ||
10 | #include <stdint.h> | ||
11 | #include <stdio.h> | ||
12 | #include "pipes_text_cbits.h" | ||
13 | |||
14 | |||
15 | |||
/* DFA states with special meaning: a complete code point / a malformed sequence. */
#define UTF8_ACCEPT 0
#define UTF8_REJECT 12

/*
 * Lookup table for the UTF-8 decoding automaton.
 *
 * Entries [0, 256) map each input byte to a character class; grouping
 * bytes into classes keeps the transition table small and lets the
 * class value double as a shift count for masking lead bytes.
 *
 * Entries [256, 364) are the transition table: for a current state
 * (always a multiple of 12) and a character class (0..11), the next
 * state is found at index 256 + state + class.
 */
static const uint8_t utf8d[] = {
  /* 0x00 - 0x0F */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
  /* 0x10 - 0x1F */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
  /* 0x20 - 0x2F */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
  /* 0x30 - 0x3F */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
  /* 0x40 - 0x4F */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
  /* 0x50 - 0x5F */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
  /* 0x60 - 0x6F */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
  /* 0x70 - 0x7F */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
  /* 0x80 - 0x8F */ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
  /* 0x90 - 0x9F */ 9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,
  /* 0xA0 - 0xAF */ 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,
  /* 0xB0 - 0xBF */ 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,
  /* 0xC0 - 0xCF */ 8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
  /* 0xD0 - 0xDF */ 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
  /* 0xE0 - 0xEF */ 10,3,3,3,3,3,3,3,3,3,3,3,3,4,3,3,
  /* 0xF0 - 0xFF */ 11,6,6,6,5,8,8,8,8,8,8,8,8,8,8,8,

  /* Transitions, one row per state, one column per character class. */
  /* state  0 (UTF8_ACCEPT) */  0,12,24,36,60,96,84,12,12,12,48,72,
  /* state 12 (UTF8_REJECT) */ 12,12,12,12,12,12,12,12,12,12,12,12,
  /* state 24 */               12, 0,12,12,12,12,12, 0,12, 0,12,12,
  /* state 36 */               12,24,12,12,12,12,12,24,12,24,12,12,
  /* state 48 */               12,12,12,12,12,12,12,24,12,12,12,12,
  /* state 60 */               12,24,12,12,12,12,12,12,12,24,12,12,
  /* state 72 */               12,12,12,12,12,12,12,36,12,36,12,12,
  /* state 84 */               12,36,12,12,12,12,12,36,12,36,12,12,
  /* state 96 */               12,36,12,12,12,12,12,12,12,12,12,12,
};
43 | |||
44 | static inline uint32_t | ||
45 | decode(uint32_t *state, uint32_t* codep, uint32_t byte) { | ||
46 | uint32_t type = utf8d[byte]; | ||
47 | |||
48 | *codep = (*state != UTF8_ACCEPT) ? | ||
49 | (byte & 0x3fu) | (*codep << 6) : | ||
50 | (0xff >> type) & (byte); | ||
51 | |||
52 | return *state = utf8d[256 + *state + type]; | ||
53 | } | ||
54 | |||
55 | /* | ||
56 | * A best-effort decoder. Runs until it hits either end of input or | ||
57 | * the start of an invalid byte sequence. | ||
58 | * | ||
59 | * At exit, we update *destoff with the next offset to write to, *src | ||
60 | * with the next source location past the last one successfully | ||
61 | * decoded, and return the next source location to read from. | ||
62 | * | ||
63 | * Moreover, we expose the internal decoder state (state0 and | ||
64 | * codepoint0), allowing one to restart the decoder after it | ||
65 | * terminates (say, due to a partial codepoint). | ||
66 | * | ||
67 | * In particular, there are a few possible outcomes, | ||
68 | * | ||
69 | * 1) We decoded the buffer entirely: | ||
70 | * In this case we return srcend | ||
71 | * state0 == UTF8_ACCEPT | ||
72 | * | ||
73 | * 2) We met an invalid encoding | ||
74 | * In this case we return the address of the first invalid byte | ||
75 | * state0 == UTF8_REJECT | ||
76 | * | ||
77 | * 3) We reached the end of the buffer while decoding a codepoint | ||
78 | * In this case we return a pointer to the first byte of the partial codepoint | ||
79 | * state0 != UTF8_ACCEPT, UTF8_REJECT | ||
80 | * | ||
81 | */ | ||
82 | |||
83 | #if defined(__GNUC__) || defined(__clang__) | ||
84 | static inline uint8_t const * | ||
85 | _hs_pipes_text_decode_utf8_int(uint16_t *const dest, size_t *destoff, | ||
86 | const uint8_t const **src, const uint8_t const *srcend, | ||
87 | uint32_t *codepoint0, uint32_t *state0) | ||
88 | __attribute((always_inline)); | ||
89 | #endif | ||
90 | |||
91 | static inline uint8_t const * | ||
92 | _hs_pipes_text_decode_utf8_int(uint16_t *const dest, size_t *destoff, | ||
93 | const uint8_t const **src, const uint8_t const *srcend, | ||
94 | uint32_t *codepoint0, uint32_t *state0) | ||
95 | { | ||
96 | uint16_t *d = dest + *destoff; | ||
97 | const uint8_t *s = *src, *last = *src; | ||
98 | uint32_t state = *state0; | ||
99 | uint32_t codepoint = *codepoint0; | ||
100 | |||
101 | while (s < srcend) { | ||
102 | #if defined(__i386__) || defined(__x86_64__) | ||
103 | /* | ||
104 | * This code will only work on a little-endian system that | ||
105 | * supports unaligned loads. | ||
106 | * | ||
107 | * It gives a substantial speed win on data that is purely or | ||
108 | * partly ASCII (e.g. HTML), at only a slight cost on purely | ||
109 | * non-ASCII text. | ||
110 | */ | ||
111 | |||
112 | if (state == UTF8_ACCEPT) { | ||
113 | while (s < srcend - 4) { | ||
114 | codepoint = *((uint32_t *) s); | ||
115 | if ((codepoint & 0x80808080) != 0) | ||
116 | break; | ||
117 | s += 4; | ||
118 | |||
119 | /* | ||
120 | * Tried 32-bit stores here, but the extra bit-twiddling | ||
121 | * slowed the code down. | ||
122 | */ | ||
123 | |||
124 | *d++ = (uint16_t) (codepoint & 0xff); | ||
125 | *d++ = (uint16_t) ((codepoint >> 8) & 0xff); | ||
126 | *d++ = (uint16_t) ((codepoint >> 16) & 0xff); | ||
127 | *d++ = (uint16_t) ((codepoint >> 24) & 0xff); | ||
128 | } | ||
129 | last = s; | ||
130 | } | ||
131 | #endif | ||
132 | |||
133 | if (decode(&state, &codepoint, *s++) != UTF8_ACCEPT) { | ||
134 | if (state != UTF8_REJECT) | ||
135 | continue; | ||
136 | break; | ||
137 | } | ||
138 | |||
139 | if (codepoint <= 0xffff) | ||
140 | *d++ = (uint16_t) codepoint; | ||
141 | else { | ||
142 | *d++ = (uint16_t) (0xD7C0 + (codepoint >> 10)); | ||
143 | *d++ = (uint16_t) (0xDC00 + (codepoint & 0x3FF)); | ||
144 | } | ||
145 | last = s; | ||
146 | } | ||
147 | |||
148 | *destoff = d - dest; | ||
149 | *codepoint0 = codepoint; | ||
150 | *state0 = state; | ||
151 | *src = last; | ||
152 | |||
153 | return s; | ||
154 | } | ||
155 | |||
156 | uint8_t const * | ||
157 | _hs_pipes_text_decode_utf8_state(uint16_t *const dest, size_t *destoff, | ||
158 | const uint8_t const **src, | ||
159 | const uint8_t const *srcend, | ||
160 | uint32_t *codepoint0, uint32_t *state0) | ||
161 | { | ||
162 | uint8_t const *ret = _hs_pipes_text_decode_utf8_int(dest, destoff, src, srcend, | ||
163 | codepoint0, state0); | ||
164 | if (*state0 == UTF8_REJECT) | ||
165 | ret -=1; | ||
166 | return ret; | ||
167 | } | ||
168 | |||