[github/fretlink/text-pipes.git] / cbits / cbits.c

/*
 * Copyright (c) 2011 Bryan O'Sullivan <bos@serpentine.com>.
 *
 * Portions copyright (c) 2008-2010 Björn Höhrmann <bjoern@hoehrmann.de>.
 *
 * See http://bjoern.hoehrmann.de/utf-8/decoder/dfa/ for details.
 */

#include <string.h>
#include <stdint.h>
#include <stdio.h>
#include "pipes_text_cbits.h"


/*
 * Distinguished states of the UTF-8 decoding automaton.  State values
 * are offsets into the transition part of the utf8d table.
 *
 * UTF8_ACCEPT: a complete code point has just been decoded; this is
 *              also the state in which a new code point begins.
 * UTF8_REJECT: the byte sequence is not valid UTF-8.  Every transition
 *              out of this state leads back to it.
 */
#define UTF8_ACCEPT 0
#define UTF8_REJECT 12

/*
 * Lookup table driving the UTF-8 decoding automaton used by decode().
 *
 * Entries [0, 256) map an input byte to its character class (0..11).
 * Entries [256, ...) form the transition table: the next state is
 * utf8d[256 + state + class], where each state value is a multiple of
 * 12 (the number of character classes).
 */
static const uint8_t utf8d[] = {
  /*
   * The first part of the table maps bytes to character classes.
   * Classes reduce the size of the transition table, and the class of
   * a lead byte also serves as a shift count to build the bitmask that
   * extracts its payload bits (see decode()).
   */
   0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
   0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
   0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
   0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
   1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,  9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,
   7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,  7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,
   8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2,  2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
  10,3,3,3,3,3,3,3,3,3,3,3,3,4,3,3, 11,6,6,6,5,8,8,8,8,8,8,8,8,8,8,8,

  /*
   * The second part is a transition table that maps a combination of
   * a state of the automaton and a character class to a state.
   * Each group of 12 entries is the row for one state (0, 12, 24, ...).
   */
   0,12,24,36,60,96,84,12,12,12,48,72, 12,12,12,12,12,12,12,12,12,12,12,12,
  12, 0,12,12,12,12,12, 0,12, 0,12,12, 12,24,12,12,12,12,12,24,12,24,12,12,
  12,12,12,12,12,12,12,24,12,12,12,12, 12,24,12,12,12,12,12,12,12,24,12,12,
  12,12,12,12,12,12,12,36,12,36,12,12, 12,36,12,12,12,12,12,36,12,36,12,12,
  12,36,12,12,12,12,12,12,12,12,12,12,
};

/*
 * Advance the UTF-8 automaton by one input byte.
 *
 *   state - in/out: current automaton state; replaced by the next state.
 *   codep - in/out: code point accumulator.  When a new code point
 *           starts (state is UTF8_ACCEPT on entry) it is reset to the
 *           payload bits of the lead byte; otherwise the low six bits
 *           of the byte are appended to it.
 *   byte  - the input byte; used as a table index, so it must be in
 *           the range 0..255.
 *
 * Returns the new state (the same value stored in *state).
 */
static inline uint32_t
decode(uint32_t *state, uint32_t* codep, uint32_t byte) {
  const uint32_t cls = utf8d[byte];

  if (*state == UTF8_ACCEPT) {
    /* Lead byte: the class doubles as the shift that masks off the
     * length-marker bits, leaving only the payload. */
    *codep = byte & (0xffu >> cls);
  } else {
    /* Continuation byte: shift in six more payload bits. */
    *codep = (*codep << 6) | (byte & 0x3fu);
  }

  *state = utf8d[256 + *state + cls];
  return *state;
}

/*
 * A best-effort, restartable UTF-8 to UTF-16 decoder. Runs until it
 * hits either the end of input or an invalid byte sequence.
 *
 * Parameters:
 *   dest       - base of the UTF-16 output buffer.  No bound is checked
 *                here; the caller must supply enough room.
 *   destoff    - in/out: index (in uint16_t units) into dest at which
 *                to start writing; on exit, the next index to write.
 *   src        - in/out: start of the input; on exit, the location
 *                just past the last completely decoded code point.
 *   srcend     - one past the last input byte.
 *   codepoint0 - in/out: code point accumulator of the decoder.
 *   state0     - in/out: automaton state of the decoder.
 *
 * The decoder state (state0 and codepoint0) is exposed so that one can
 * restart the decoder after it terminates (say, due to a partial
 * code point at the end of a chunk).
 *
 * Returns the next source location to read from.  The possible
 * outcomes are:
 *
 *   1) We decoded the buffer entirely:
 *      returns srcend, *src == srcend
 *      *state0 == UTF8_ACCEPT
 *
 *   2) We met an invalid encoding:
 *      returns the address of the byte that was rejected;
 *      *src is just past the last completely decoded code point
 *      *state0 == UTF8_REJECT
 *
 *   3) We reached the end of the buffer while decoding a code point:
 *      returns srcend; *src points to the first byte of the partial
 *      code point that lies within this buffer
 *      *state0 != UTF8_ACCEPT, UTF8_REJECT
 *
 */
const uint8_t *
_hs_pipes_text_decode_utf8_state(uint16_t *const dest, size_t *destoff,
                           const uint8_t **const src,
                           const uint8_t *const srcend,
                           uint32_t *codepoint0, uint32_t *state0)
{
  const uint8_t *const start = *src;
  uint16_t *d = dest + *destoff;
  const uint8_t *s = start, *last = start;
  uint32_t state = *state0;
  uint32_t codepoint = *codepoint0;

  while (s < srcend) {
#if defined(__i386__) || defined(__x86_64__)
    /*
     * ASCII fast path: examine four bytes at a time.  The order in
     * which the bytes are unpacked below assumes a little-endian
     * system, hence the architecture guard.
     *
     * It gives a substantial speed win on data that is purely or
     * partly ASCII (e.g. HTML), at only a slight cost on purely
     * non-ASCII text.
     */

    if (state == UTF8_ACCEPT) {
      /*
       * Compare lengths rather than computing "srcend - 4", which
       * would form an out-of-bounds pointer for short buffers.
       * Requiring strictly more than four bytes guarantees at least
       * one byte is left for the byte-wise decode step below.
       */
      while ((size_t) (srcend - s) > 4) {
        /*
         * memcpy instead of a pointer cast: the source is a byte
         * buffer with no alignment guarantee, and reading it through
         * a uint32_t lvalue would violate the effective-type rules.
         */
        memcpy(&codepoint, s, sizeof codepoint);
        if ((codepoint & 0x80808080u) != 0) {
          break;
        }
        s += 4;

        /*
         * Tried 32-bit stores here, but the extra bit-twiddling
         * slowed the code down.
         */

        *d++ = (uint16_t) (codepoint & 0xff);
        *d++ = (uint16_t) ((codepoint >> 8) & 0xff);
        *d++ = (uint16_t) ((codepoint >> 16) & 0xff);
        *d++ = (uint16_t) ((codepoint >> 24) & 0xff);
      }
      last = s;
    }
#endif

    if (decode(&state, &codepoint, *s++) != UTF8_ACCEPT) {
      if (state != UTF8_REJECT) {
        /* In the middle of a multi-byte sequence: keep reading. */
        continue;
      }
      break;
    }

    if (codepoint <= 0xffff) {
      *d++ = (uint16_t) codepoint;
    } else {
      /* Surrogate pair; 0xD7C0 is 0xD800 - (0x10000 >> 10). */
      *d++ = (uint16_t) (0xD7C0 + (codepoint >> 10));
      *d++ = (uint16_t) (0xDC00 + (codepoint & 0x3FF));
    }
    last = s;
  }

  /*
   * Invalid encoding, back up to the errant character.  If no byte was
   * consumed (empty input with a reject state carried in), there is
   * nothing to back up over and s must not move before the buffer.
   */
  if (state == UTF8_REJECT && s > start) {
    s -= 1;
  }

  *destoff = (size_t) (d - dest);
  *codepoint0 = codepoint;
  *state0 = state;
  *src = last;

  return s;
}
Commit	Line	Data
8c482809	1	/*
	2	* Copyright (c) 2011 Bryan O'Sullivan <bos@serpentine.com>.
	3	*
	4	* Portions copyright (c) 2008-2010 Björn Höhrmann <bjoern@hoehrmann.de>.
	5	*
	6	* See http://bjoern.hoehrmann.de/utf-8/decoder/dfa/ for details.
	7	*/
	8
	9	#include <string.h>
	10	#include <stdint.h>
	11	#include <stdio.h>
	12	#include "pipes_text_cbits.h"
	13
	14
	15
	16	#define UTF8_ACCEPT 0
	17	#define UTF8_REJECT 12
	18
	19	static const uint8_t utf8d[] = {
	20	/*
	21	* The first part of the table maps bytes to character classes that
	22	* to reduce the size of the transition table and create bitmasks.
	23	*/
	24	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
	25	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
	26	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
	27	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
	28	1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,
	29	7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,
	30	8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
	31	10,3,3,3,3,3,3,3,3,3,3,3,3,4,3,3, 11,6,6,6,5,8,8,8,8,8,8,8,8,8,8,8,
	32
	33	/*
	34	* The second part is a transition table that maps a combination of
	35	* a state of the automaton and a character class to a state.
	36	*/
	37	0,12,24,36,60,96,84,12,12,12,48,72, 12,12,12,12,12,12,12,12,12,12,12,12,
	38	12, 0,12,12,12,12,12, 0,12, 0,12,12, 12,24,12,12,12,12,12,24,12,24,12,12,
	39	12,12,12,12,12,12,12,24,12,12,12,12, 12,24,12,12,12,12,12,12,12,24,12,12,
	40	12,12,12,12,12,12,12,36,12,36,12,12, 12,36,12,12,12,12,12,36,12,36,12,12,
	41	12,36,12,12,12,12,12,12,12,12,12,12,
	42	};
	43
	44	static inline uint32_t
	45	decode(uint32_t state, uint32_t codep, uint32_t byte) {
	46	uint32_t type = utf8d[byte];
	47
	48	codep = (state != UTF8_ACCEPT) ?
	49	(byte & 0x3fu) \| (*codep << 6) :
	50	(0xff >> type) & (byte);
	51
	52	return state = utf8d[256 + state + type];
	53	}
	54
	55	/*
	56	* A best-effort decoder. Runs until it hits either end of input or
	57	* the start of an invalid byte sequence.
	58	*
	59	* At exit, we update destoff with the next offset to write to, src
	60	* with the next source location past the last one successfully
	61	* decoded, and return the next source location to read from.
	62	*
	63	* Moreover, we expose the internal decoder state (state0 and
	64	* codepoint0), allowing one to restart the decoder after it
65	* terminates (say, due to a partial codepoint).
66	*
67	* In particular, there are a few possible outcomes,
68	*
69	* 1) We decoded the buffer entirely:
70	* In this case we return srcend
71	* state0 == UTF8_ACCEPT
72	*
73	* 2) We met an invalid encoding
74	* In this case we return the address of the first invalid byte
75	* state0 == UTF8_REJECT
76	*
77	* 3) We reached the end of the buffer while decoding a codepoint
78	* In this case we return a pointer to the first byte of the partial codepoint
79	* state0 != UTF8_ACCEPT, UTF8_REJECT
80	*
81	*/
82	const uint8_t *
83	_hs_pipes_text_decode_utf8_state(uint16_t const dest, size_t destoff,
84	const uint8_t **const src,
85	const uint8_t *const srcend,
86	uint32_t codepoint0, uint32_t state0)
87	{
88	uint16_t d = dest + destoff;
89	const uint8_t s = src, last = src;
90	uint32_t state = *state0;
91	uint32_t codepoint = *codepoint0;
92
93	while (s < srcend) {
94	#if defined(__i386__) \|\| defined(__x86_64__)
95	/*
96	* This code will only work on a little-endian system that
97	* supports unaligned loads.
98	*
99	* It gives a substantial speed win on data that is purely or
100	* partly ASCII (e.g. HTML), at only a slight cost on purely
101	* non-ASCII text.
102	*/
103
104	if (state == UTF8_ACCEPT) {
105	while (s < srcend - 4) {
106	codepoint = ((uint32_t ) s);
107	if ((codepoint & 0x80808080) != 0)
108	break;
109	s += 4;
110
111	/*
112	* Tried 32-bit stores here, but the extra bit-twiddling
113	* slowed the code down.
114	*/
115
116	*d++ = (uint16_t) (codepoint & 0xff);
117	*d++ = (uint16_t) ((codepoint >> 8) & 0xff);
118	*d++ = (uint16_t) ((codepoint >> 16) & 0xff);
119	*d++ = (uint16_t) ((codepoint >> 24) & 0xff);
120	}
121	last = s;
122	}
123	#endif
124
125	if (decode(&state, &codepoint, *s++) != UTF8_ACCEPT) {
126	if (state != UTF8_REJECT)
127	continue;
128	break;
129	}
130
131	if (codepoint <= 0xffff)
132	*d++ = (uint16_t) codepoint;
133	else {
134	*d++ = (uint16_t) (0xD7C0 + (codepoint >> 10));
135	*d++ = (uint16_t) (0xDC00 + (codepoint & 0x3FF));
136	}
137	last = s;
138	}
139
140	/* Invalid encoding, back up to the errant character */
141	if (state == UTF8_REJECT)
142	s -= 1;
143
144	*destoff = d - dest;
145	*codepoint0 = codepoint;
146	*state0 = state;
147	*src = last;
148
149	return s;
150	}
151