]>
Commit | Line | Data |
---|---|---|
8c482809 | 1 | /* |
2 | * Copyright (c) 2011 Bryan O'Sullivan <bos@serpentine.com>. | |
3 | * | |
4 | * Portions copyright (c) 2008-2010 Björn Höhrmann <bjoern@hoehrmann.de>. | |
5 | * | |
6 | * See http://bjoern.hoehrmann.de/utf-8/decoder/dfa/ for details. | |
7 | */ | |
8 | ||
9 | #include <string.h> | |
10 | #include <stdint.h> | |
11 | #include <stdio.h> | |
12 | #include "pipes_text_cbits.h" | |
13 | ||
14 | ||
15 | ||
/* DFA states with special meaning; every state is a multiple of 12. */
#define UTF8_ACCEPT 0   /* a complete code point has just been decoded */
#define UTF8_REJECT 12  /* the input is not valid UTF-8 */

static const uint8_t utf8d[] = {
  /*
   * Part 1 (indices 0..255): maps each input byte to a character
   * class.  Grouping bytes into 12 classes keeps the transition table
   * small, and the class number doubles as the shift count used to
   * mask the payload bits out of a lead byte.
   */
  /* 0x00 */  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  /* 0x10 */  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  /* 0x20 */  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  /* 0x30 */  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  /* 0x40 */  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  /* 0x50 */  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  /* 0x60 */  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  /* 0x70 */  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  /* 0x80 */  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x90 */  9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
  /* 0xA0 */  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
  /* 0xB0 */  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
  /* 0xC0 */  8, 8, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  /* 0xD0 */  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  /* 0xE0 */ 10, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 3, 3,
  /* 0xF0 */ 11, 6, 6, 6, 5, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,

  /*
   * Part 2 (indices 256..363): the transition table.  The next state
   * for (state, class) lives at index 256 + state + class; each row
   * below holds the 12 per-class entries for one state.
   */
  /* state  0 */  0, 12, 24, 36, 60, 96, 84, 12, 12, 12, 48, 72,
  /* state 12 */ 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
  /* state 24 */ 12,  0, 12, 12, 12, 12, 12,  0, 12,  0, 12, 12,
  /* state 36 */ 12, 24, 12, 12, 12, 12, 12, 24, 12, 24, 12, 12,
  /* state 48 */ 12, 12, 12, 12, 12, 12, 12, 24, 12, 12, 12, 12,
  /* state 60 */ 12, 24, 12, 12, 12, 12, 12, 12, 12, 24, 12, 12,
  /* state 72 */ 12, 12, 12, 12, 12, 12, 12, 36, 12, 36, 12, 12,
  /* state 84 */ 12, 36, 12, 12, 12, 12, 12, 36, 12, 36, 12, 12,
  /* state 96 */ 12, 36, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
};
43 | ||
44 | static inline uint32_t | |
45 | decode(uint32_t *state, uint32_t* codep, uint32_t byte) { | |
46 | uint32_t type = utf8d[byte]; | |
47 | ||
48 | *codep = (*state != UTF8_ACCEPT) ? | |
49 | (byte & 0x3fu) | (*codep << 6) : | |
50 | (0xff >> type) & (byte); | |
51 | ||
52 | return *state = utf8d[256 + *state + type]; | |
53 | } | |
54 | ||
55 | /* | |
56 | * A best-effort decoder. Runs until it hits either end of input or | |
57 | * the start of an invalid byte sequence. | |
58 | * | |
59 | * At exit, we update *destoff with the next offset to write to, *src | |
60 | * with the next source location past the last one successfully | |
61 | * decoded, and return the next source location to read from. | |
62 | * | |
63 | * Moreover, we expose the internal decoder state (state0 and | |
64 | * codepoint0), allowing one to restart the decoder after it | |
65 | * terminates (say, due to a partial codepoint). | |
66 | * | |
67 | * In particular, there are a few possible outcomes, | |
68 | * | |
69 | * 1) We decoded the buffer entirely: | |
70 | * In this case we return srcend | |
71 | * state0 == UTF8_ACCEPT | |
72 | * | |
73 | * 2) We met an invalid encoding | |
74 | * In this case we return the address of the first invalid byte | |
75 | * state0 == UTF8_REJECT | |
76 | * | |
77 | * 3) We reached the end of the buffer while decoding a codepoint | |
78 | * In this case we return a pointer to the first byte of the partial codepoint | |
79 | * state0 != UTF8_ACCEPT, UTF8_REJECT | |
80 | * | |
81 | */ | |
82 | const uint8_t * | |
83 | _hs_pipes_text_decode_utf8_state(uint16_t *const dest, size_t *destoff, | |
84 | const uint8_t **const src, | |
85 | const uint8_t *const srcend, | |
86 | uint32_t *codepoint0, uint32_t *state0) | |
87 | { | |
88 | uint16_t *d = dest + *destoff; | |
89 | const uint8_t *s = *src, *last = *src; | |
90 | uint32_t state = *state0; | |
91 | uint32_t codepoint = *codepoint0; | |
92 | ||
93 | while (s < srcend) { | |
94 | #if defined(__i386__) || defined(__x86_64__) | |
95 | /* | |
96 | * This code will only work on a little-endian system that | |
97 | * supports unaligned loads. | |
98 | * | |
99 | * It gives a substantial speed win on data that is purely or | |
100 | * partly ASCII (e.g. HTML), at only a slight cost on purely | |
101 | * non-ASCII text. | |
102 | */ | |
103 | ||
104 | if (state == UTF8_ACCEPT) { | |
105 | while (s < srcend - 4) { | |
106 | codepoint = *((uint32_t *) s); | |
107 | if ((codepoint & 0x80808080) != 0) | |
108 | break; | |
109 | s += 4; | |
110 | ||
111 | /* | |
112 | * Tried 32-bit stores here, but the extra bit-twiddling | |
113 | * slowed the code down. | |
114 | */ | |
115 | ||
116 | *d++ = (uint16_t) (codepoint & 0xff); | |
117 | *d++ = (uint16_t) ((codepoint >> 8) & 0xff); | |
118 | *d++ = (uint16_t) ((codepoint >> 16) & 0xff); | |
119 | *d++ = (uint16_t) ((codepoint >> 24) & 0xff); | |
120 | } | |
121 | last = s; | |
122 | } | |
123 | #endif | |
124 | ||
125 | if (decode(&state, &codepoint, *s++) != UTF8_ACCEPT) { | |
126 | if (state != UTF8_REJECT) | |
127 | continue; | |
128 | break; | |
129 | } | |
130 | ||
131 | if (codepoint <= 0xffff) | |
132 | *d++ = (uint16_t) codepoint; | |
133 | else { | |
134 | *d++ = (uint16_t) (0xD7C0 + (codepoint >> 10)); | |
135 | *d++ = (uint16_t) (0xDC00 + (codepoint & 0x3FF)); | |
136 | } | |
137 | last = s; | |
138 | } | |
139 | ||
140 | /* Invalid encoding, back up to the errant character */ | |
141 | if (state == UTF8_REJECT) | |
142 | s -= 1; | |
143 | ||
144 | *destoff = d - dest; | |
145 | *codepoint0 = codepoint; | |
146 | *state0 = state; | |
147 | *src = last; | |
148 | ||
149 | return s; | |
150 | } | |
151 |