about | summary | refs | log | tree | commit | diff | homepage
path: root/cbits
diff options
context:
space:
mode:
Diffstat (limited to 'cbits')
-rw-r--r-- cbits/cbits.c 113
1 files changed, 65 insertions, 48 deletions
diff --git a/cbits/cbits.c b/cbits/cbits.c
index e0fdfd5..c11645b 100644
--- a/cbits/cbits.c
+++ b/cbits/cbits.c
@@ -79,30 +79,38 @@ decode(uint32_t *state, uint32_t* codep, uint32_t byte) {
79 * state0 != UTF8_ACCEPT, UTF8_REJECT 79 * state0 != UTF8_ACCEPT, UTF8_REJECT
80 * 80 *
81 */ 81 */
82const uint8_t * 82
83_hs_pipes_text_decode_utf8_state(uint16_t *const dest, size_t *destoff, 83 #if defined(__GNUC__) || defined(__clang__)
84 const uint8_t **const src, 84 static inline uint8_t const *
85 const uint8_t *const srcend, 85 _hs_pipes_text_decode_utf8_int(uint16_t *const dest, size_t *destoff,
86 uint32_t *codepoint0, uint32_t *state0) 86 const uint8_t const **src, const uint8_t const *srcend,
87 uint32_t *codepoint0, uint32_t *state0)
88 __attribute((always_inline));
89 #endif
90
91static inline uint8_t const *
92_hs_pipes_text_decode_utf8_int(uint16_t *const dest, size_t *destoff,
93 const uint8_t const **src, const uint8_t const *srcend,
94 uint32_t *codepoint0, uint32_t *state0)
87{ 95{
88 uint16_t *d = dest + *destoff; 96 uint16_t *d = dest + *destoff;
89 const uint8_t *s = *src, *last = *src; 97 const uint8_t *s = *src, *last = *src;
90 uint32_t state = *state0; 98 uint32_t state = *state0;
91 uint32_t codepoint = *codepoint0; 99 uint32_t codepoint = *codepoint0;
92 100
93 while (s < srcend) { 101 while (s < srcend) {
94#if defined(__i386__) || defined(__x86_64__) 102#if defined(__i386__) || defined(__x86_64__)
95 /* 103 /*
96 * This code will only work on a little-endian system that 104 * This code will only work on a little-endian system that
97 * supports unaligned loads. 105 * supports unaligned loads.
98 * 106 *
99 * It gives a substantial speed win on data that is purely or 107 * It gives a substantial speed win on data that is purely or
100 * partly ASCII (e.g. HTML), at only a slight cost on purely 108 * partly ASCII (e.g. HTML), at only a slight cost on purely
101 * non-ASCII text. 109 * non-ASCII text.
102 */ 110 */
103 111
104 if (state == UTF8_ACCEPT) { 112 if (state == UTF8_ACCEPT) {
105 while (s < srcend - 4) { 113 while (s < srcend - 4) {
106 codepoint = *((uint32_t *) s); 114 codepoint = *((uint32_t *) s);
107 if ((codepoint & 0x80808080) != 0) 115 if ((codepoint & 0x80808080) != 0)
108 break; 116 break;
@@ -117,35 +125,44 @@ _hs_pipes_text_decode_utf8_state(uint16_t *const dest, size_t *destoff,
117 *d++ = (uint16_t) ((codepoint >> 8) & 0xff); 125 *d++ = (uint16_t) ((codepoint >> 8) & 0xff);
118 *d++ = (uint16_t) ((codepoint >> 16) & 0xff); 126 *d++ = (uint16_t) ((codepoint >> 16) & 0xff);
119 *d++ = (uint16_t) ((codepoint >> 24) & 0xff); 127 *d++ = (uint16_t) ((codepoint >> 24) & 0xff);
120 } 128 }
121 last = s; 129 last = s;
122 } 130 }
123#endif 131#endif
124 132
125 if (decode(&state, &codepoint, *s++) != UTF8_ACCEPT) { 133 if (decode(&state, &codepoint, *s++) != UTF8_ACCEPT) {
126 if (state != UTF8_REJECT) 134 if (state != UTF8_REJECT)
127 continue; 135 continue;
128 break; 136 break;
129 } 137 }
130 138
131 if (codepoint <= 0xffff) 139 if (codepoint <= 0xffff)
132 *d++ = (uint16_t) codepoint; 140 *d++ = (uint16_t) codepoint;
133 else { 141 else {
134 *d++ = (uint16_t) (0xD7C0 + (codepoint >> 10)); 142 *d++ = (uint16_t) (0xD7C0 + (codepoint >> 10));
135 *d++ = (uint16_t) (0xDC00 + (codepoint & 0x3FF)); 143 *d++ = (uint16_t) (0xDC00 + (codepoint & 0x3FF));
136 } 144 }
137 last = s; 145 last = s;
138 } 146 }
139 147
140 /* Invalid encoding, back up to the errant character */ 148 *destoff = d - dest;
141 if (state == UTF8_REJECT) 149 *codepoint0 = codepoint;
142 s -= 1; 150 *state0 = state;
143 151 *src = last;
144 *destoff = d - dest; 152
145 *codepoint0 = codepoint; 153 return s;
146 *state0 = state; 154}
147 *src = last; 155
148 156uint8_t const *
149 return s; 157_hs_pipes_text_decode_utf8_state(uint16_t *const dest, size_t *destoff,
158 const uint8_t const **src,
159 const uint8_t const *srcend,
160 uint32_t *codepoint0, uint32_t *state0)
161{
162 uint8_t const *ret = _hs_pipes_text_decode_utf8_int(dest, destoff, src, srcend,
163 codepoint0, state0);
164 if (*state0 == UTF8_REJECT)
165 ret -=1;
166 return ret;
150} 167}
151 168