diff options
Diffstat (limited to 'cbits')
-rw-r--r-- | cbits/cbits.c | 113 |
1 files changed, 65 insertions, 48 deletions
diff --git a/cbits/cbits.c b/cbits/cbits.c index e0fdfd5..c11645b 100644 --- a/cbits/cbits.c +++ b/cbits/cbits.c | |||
@@ -79,30 +79,38 @@ decode(uint32_t *state, uint32_t* codep, uint32_t byte) { | |||
79 | * state0 != UTF8_ACCEPT, UTF8_REJECT | 79 | * state0 != UTF8_ACCEPT, UTF8_REJECT |
80 | * | 80 | * |
81 | */ | 81 | */ |
82 | const uint8_t * | 82 | |
83 | _hs_pipes_text_decode_utf8_state(uint16_t *const dest, size_t *destoff, | 83 | #if defined(__GNUC__) || defined(__clang__) |
84 | const uint8_t **const src, | 84 | static inline uint8_t const * |
85 | const uint8_t *const srcend, | 85 | _hs_pipes_text_decode_utf8_int(uint16_t *const dest, size_t *destoff, |
86 | uint32_t *codepoint0, uint32_t *state0) | 86 | const uint8_t const **src, const uint8_t const *srcend, |
87 | uint32_t *codepoint0, uint32_t *state0) | ||
88 | __attribute((always_inline)); | ||
89 | #endif | ||
90 | |||
91 | static inline uint8_t const * | ||
92 | _hs_pipes_text_decode_utf8_int(uint16_t *const dest, size_t *destoff, | ||
93 | const uint8_t const **src, const uint8_t const *srcend, | ||
94 | uint32_t *codepoint0, uint32_t *state0) | ||
87 | { | 95 | { |
88 | uint16_t *d = dest + *destoff; | 96 | uint16_t *d = dest + *destoff; |
89 | const uint8_t *s = *src, *last = *src; | 97 | const uint8_t *s = *src, *last = *src; |
90 | uint32_t state = *state0; | 98 | uint32_t state = *state0; |
91 | uint32_t codepoint = *codepoint0; | 99 | uint32_t codepoint = *codepoint0; |
92 | 100 | ||
93 | while (s < srcend) { | 101 | while (s < srcend) { |
94 | #if defined(__i386__) || defined(__x86_64__) | 102 | #if defined(__i386__) || defined(__x86_64__) |
95 | /* | 103 | /* |
96 | * This code will only work on a little-endian system that | 104 | * This code will only work on a little-endian system that |
97 | * supports unaligned loads. | 105 | * supports unaligned loads. |
98 | * | 106 | * |
99 | * It gives a substantial speed win on data that is purely or | 107 | * It gives a substantial speed win on data that is purely or |
100 | * partly ASCII (e.g. HTML), at only a slight cost on purely | 108 | * partly ASCII (e.g. HTML), at only a slight cost on purely |
101 | * non-ASCII text. | 109 | * non-ASCII text. |
102 | */ | 110 | */ |
103 | 111 | ||
104 | if (state == UTF8_ACCEPT) { | 112 | if (state == UTF8_ACCEPT) { |
105 | while (s < srcend - 4) { | 113 | while (s < srcend - 4) { |
106 | codepoint = *((uint32_t *) s); | 114 | codepoint = *((uint32_t *) s); |
107 | if ((codepoint & 0x80808080) != 0) | 115 | if ((codepoint & 0x80808080) != 0) |
108 | break; | 116 | break; |
@@ -117,35 +125,44 @@ _hs_pipes_text_decode_utf8_state(uint16_t *const dest, size_t *destoff, | |||
117 | *d++ = (uint16_t) ((codepoint >> 8) & 0xff); | 125 | *d++ = (uint16_t) ((codepoint >> 8) & 0xff); |
118 | *d++ = (uint16_t) ((codepoint >> 16) & 0xff); | 126 | *d++ = (uint16_t) ((codepoint >> 16) & 0xff); |
119 | *d++ = (uint16_t) ((codepoint >> 24) & 0xff); | 127 | *d++ = (uint16_t) ((codepoint >> 24) & 0xff); |
120 | } | 128 | } |
121 | last = s; | 129 | last = s; |
122 | } | 130 | } |
123 | #endif | 131 | #endif |
124 | 132 | ||
125 | if (decode(&state, &codepoint, *s++) != UTF8_ACCEPT) { | 133 | if (decode(&state, &codepoint, *s++) != UTF8_ACCEPT) { |
126 | if (state != UTF8_REJECT) | 134 | if (state != UTF8_REJECT) |
127 | continue; | 135 | continue; |
128 | break; | 136 | break; |
129 | } | 137 | } |
130 | 138 | ||
131 | if (codepoint <= 0xffff) | 139 | if (codepoint <= 0xffff) |
132 | *d++ = (uint16_t) codepoint; | 140 | *d++ = (uint16_t) codepoint; |
133 | else { | 141 | else { |
134 | *d++ = (uint16_t) (0xD7C0 + (codepoint >> 10)); | 142 | *d++ = (uint16_t) (0xD7C0 + (codepoint >> 10)); |
135 | *d++ = (uint16_t) (0xDC00 + (codepoint & 0x3FF)); | 143 | *d++ = (uint16_t) (0xDC00 + (codepoint & 0x3FF)); |
136 | } | 144 | } |
137 | last = s; | 145 | last = s; |
138 | } | 146 | } |
139 | 147 | ||
140 | /* Invalid encoding, back up to the errant character */ | 148 | *destoff = d - dest; |
141 | if (state == UTF8_REJECT) | 149 | *codepoint0 = codepoint; |
142 | s -= 1; | 150 | *state0 = state; |
143 | 151 | *src = last; | |
144 | *destoff = d - dest; | 152 | |
145 | *codepoint0 = codepoint; | 153 | return s; |
146 | *state0 = state; | 154 | } |
147 | *src = last; | 155 | |
148 | 156 | uint8_t const * | |
149 | return s; | 157 | _hs_pipes_text_decode_utf8_state(uint16_t *const dest, size_t *destoff, |
158 | const uint8_t const **src, | ||
159 | const uint8_t const *srcend, | ||
160 | uint32_t *codepoint0, uint32_t *state0) | ||
161 | { | ||
162 | uint8_t const *ret = _hs_pipes_text_decode_utf8_int(dest, destoff, src, srcend, | ||
163 | codepoint0, state0); | ||
164 | if (*state0 == UTF8_REJECT) | ||
165 | ret -=1; | ||
166 | return ret; | ||
150 | } | 167 | } |
151 | 168 | ||