]>
git.immae.eu Git - github/fretlink/text-pipes.git/blob - cbits/cbits.c
/*
 * Copyright (c) 2011 Bryan O'Sullivan <bos@serpentine.com>.
 *
 * Portions copyright (c) 2008-2010 Björn Höhrmann <bjoern@hoehrmann.de>.
 *
 * See http://bjoern.hoehrmann.de/utf-8/decoder/dfa/ for details.
 */
12 #include "pipes_text_cbits.h"
/*
 * DFA states.  Every state is a multiple of 12 because it is used directly
 * as a row offset into the 12-column transition table below.
 */
#ifndef UTF8_ACCEPT
#define UTF8_ACCEPT 0
#endif
#define UTF8_REJECT 12

static const uint8_t utf8d[] = {
  /*
   * The first part of the table (256 entries) maps each byte to a
   * character class.  Classes keep the transition table small and double
   * as shift counts for masking the payload bits out of a lead byte.
   */
   0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
   0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
   0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
   0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
   1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,  9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,
   7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,  7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,
   8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2,  2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
  10,3,3,3,3,3,3,3,3,3,3,3,3,4,3,3, 11,6,6,6,5,8,8,8,8,8,8,8,8,8,8,8,

  /*
   * The second part is the transition table: it maps the combination of
   * an automaton state (row offset) and a character class (column) to the
   * next state.
   */
   0,12,24,36,60,96,84,12,12,12,48,72, 12,12,12,12,12,12,12,12,12,12,12,12,
  12, 0,12,12,12,12,12, 0,12, 0,12,12, 12,24,12,12,12,12,12,24,12,24,12,12,
  12,12,12,12,12,12,12,24,12,12,12,12, 12,24,12,12,12,12,12,12,12,24,12,12,
  12,12,12,12,12,12,12,36,12,36,12,12, 12,36,12,12,12,12,12,36,12,36,12,12,
  12,36,12,12,12,12,12,12,12,12,12,12,
};

/*
 * Feed one byte to the UTF-8 decoding automaton.
 *
 *   state  in/out: current DFA state (UTF8_ACCEPT before the first byte).
 *   codep  in/out: code point accumulated so far; holds a complete code
 *                  point whenever the returned state is UTF8_ACCEPT.
 *   byte   the next input byte; must be in the range 0..255 because it
 *          indexes the class table directly.
 *
 * Returns the new state, which is also stored in *state.
 */
static inline uint32_t
decode(uint32_t *state, uint32_t *codep, uint32_t byte)
{
  uint32_t type = utf8d[byte];

  if (*state != UTF8_ACCEPT) {
    /* Continuation byte: append its low six payload bits. */
    *codep = (byte & 0x3fu) | (*codep << 6);
  } else {
    /* Lead byte: the class doubles as the count of prefix bits to drop. */
    *codep = (0xff >> type) & byte;
  }

  return *state = utf8d[256 + *state + type];
}
56 * A best-effort decoder. Runs until it hits either end of input or
57 * the start of an invalid byte sequence.
59 * At exit, we update *destoff with the next offset to write to, *src
60 * with the next source location past the last one successfully
61 * decoded, and return the next source location to read from.
63 * Moreover, we expose the internal decoder state (state0 and
64 * codepoint0), allowing one to restart the decoder after it
65 * terminates (say, due to a partial codepoint).
67 * In particular, there are a few possible outcomes,
69 * 1) We decoded the buffer entirely:
70 * In this case we return srcend
71 * state0 == UTF8_ACCEPT
73 * 2) We met an invalid encoding
74 * In this case we return the address of the first invalid byte
75 * state0 == UTF8_REJECT
77 * 3) We reached the end of the buffer while decoding a codepoint
78 * In this case we return a pointer to the first byte of the partial codepoint
79 * state0 != UTF8_ACCEPT, UTF8_REJECT
83 #if defined(__GNUC__) || defined(__clang__)
84 static inline uint8_t const *
85 _hs_pipes_text_decode_utf8_int(uint16_t *const dest
, size_t *destoff
,
86 const uint8_t const **src
, const uint8_t const *srcend
,
87 uint32_t *codepoint0
, uint32_t *state0
)
88 __attribute((always_inline
));
91 static inline uint8_t const *
92 _hs_pipes_text_decode_utf8_int(uint16_t *const dest
, size_t *destoff
,
93 const uint8_t const **src
, const uint8_t const *srcend
,
94 uint32_t *codepoint0
, uint32_t *state0
)
96 uint16_t *d
= dest
+ *destoff
;
97 const uint8_t *s
= *src
, *last
= *src
;
98 uint32_t state
= *state0
;
99 uint32_t codepoint
= *codepoint0
;
102 #if defined(__i386__) || defined(__x86_64__)
104 * This code will only work on a little-endian system that
105 * supports unaligned loads.
107 * It gives a substantial speed win on data that is purely or
108 * partly ASCII (e.g. HTML), at only a slight cost on purely
112 if (state
== UTF8_ACCEPT
) {
113 while (s
< srcend
- 4) {
114 codepoint
= *((uint32_t *) s
);
115 if ((codepoint
& 0x80808080) != 0)
120 * Tried 32-bit stores here, but the extra bit-twiddling
121 * slowed the code down.
124 *d
++ = (uint16_t) (codepoint
& 0xff);
125 *d
++ = (uint16_t) ((codepoint
>> 8) & 0xff);
126 *d
++ = (uint16_t) ((codepoint
>> 16) & 0xff);
127 *d
++ = (uint16_t) ((codepoint
>> 24) & 0xff);
133 if (decode(&state
, &codepoint
, *s
++) != UTF8_ACCEPT
) {
134 if (state
!= UTF8_REJECT
)
139 if (codepoint
<= 0xffff)
140 *d
++ = (uint16_t) codepoint
;
142 *d
++ = (uint16_t) (0xD7C0 + (codepoint
>> 10));
143 *d
++ = (uint16_t) (0xDC00 + (codepoint
& 0x3FF));
149 *codepoint0
= codepoint
;
157 _hs_pipes_text_decode_utf8_state(uint16_t *const dest
, size_t *destoff
,
158 const uint8_t const **src
,
159 const uint8_t const *srcend
,
160 uint32_t *codepoint0
, uint32_t *state0
)
162 uint8_t const *ret
= _hs_pipes_text_decode_utf8_int(dest
, destoff
, src
, srcend
,
164 if (*state0
== UTF8_REJECT
)