cbits/cbits.c

   1 /*
   2  * Copyright (c) 2011 Bryan O'Sullivan <bos@serpentine.com>.
   3  *
   4  * Portions copyright (c) 2008-2010 Björn Höhrmann <bjoern@hoehrmann.de>.
   5  *
   6  * See http://bjoern.hoehrmann.de/utf-8/decoder/dfa/ for details.
   7  */
   8
   9 #include <string.h>
  10 #include <stdint.h>
  11 #include <stdio.h>
  12 #include "pipes_text_cbits.h"
  13
  14
  15
  16 #define UTF8_ACCEPT 0
  17 #define UTF8_REJECT 12
  18
  19 static const uint8_t utf8d[] = {
  20   /*
  21    * The first part of the table (256 entries) maps each input byte to a
  22    * character class. Classes shrink the transition table and also serve as the shift count for the lead-byte payload mask in decode().
  23    */
  24    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
  25    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
  26    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
  27    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
  28    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,  9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,
  29    7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,  7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,
  30    8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2,  2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
  31   10,3,3,3,3,3,3,3,3,3,3,3,3,4,3,3, 11,6,6,6,5,8,8,8,8,8,8,8,8,8,8,8,
  32
  33   /*
  34    * The second part is a transition table that maps a combination of
  35    * a state of the automaton and a character class to a state. States are stored pre-multiplied by 12 (the number of classes), so the entry for (state, class) lives at index 256 + state + class.
  36    */
  37    0,12,24,36,60,96,84,12,12,12,48,72, 12,12,12,12,12,12,12,12,12,12,12,12,
  38   12, 0,12,12,12,12,12, 0,12, 0,12,12, 12,24,12,12,12,12,12,24,12,24,12,12,
  39   12,12,12,12,12,12,12,24,12,12,12,12, 12,24,12,12,12,12,12,12,12,24,12,12,
  40   12,12,12,12,12,12,12,36,12,36,12,12, 12,36,12,12,12,12,12,36,12,36,12,12,
  41   12,36,12,12,12,12,12,12,12,12,12,12,
  42 };
  43
  44 static inline uint32_t
  45 decode(uint32_t *state, uint32_t* codep, uint32_t byte) {
  46   uint32_t type = utf8d[byte];
  47
  48   *codep = (*state != UTF8_ACCEPT) ?
  49     (byte & 0x3fu) | (*codep << 6) :
  50     (0xff >> type) & (byte);
  51
  52   return *state = utf8d[256 + *state + type];
  53 }
  54
  55 /*
  56  * A best-effort decoder. Runs until it hits either end of input or
  57  * the start of an invalid byte sequence.
  58  *
  59  * At exit, we update *destoff with the next offset to write to, *src
  60  * with the next source location past the last one successfully
  61  * decoded, and return the next source location to read from.
  62  *
  63  * Moreover, we expose the internal decoder state (state0 and
  64  * codepoint0), allowing one to restart the decoder after it
  65  * terminates (say, due to a partial codepoint).
  66  *
  67  * In particular, there are a few possible outcomes,
  68  *
  69  *   1) We decoded the buffer entirely:
  70  *      In this case we return srcend
  71  *      state0 == UTF8_ACCEPT
  72  *
  73  *   2) We met an invalid encoding
  74  *      In this case we return the address of the first invalid byte
  75  *      state0 == UTF8_REJECT
  76  *
  77  *   3) We reached the end of the buffer while decoding a codepoint
  78  *      In this case we return a pointer to the first byte of the partial codepoint
  79  *      state0 != UTF8_ACCEPT, UTF8_REJECT
  80  *
  81  */
  82
  83  #if defined(__GNUC__) || defined(__clang__)
  84  static inline uint8_t const *
  85  _hs_pipes_text_decode_utf8_int(uint16_t *const dest, size_t *destoff,
  86                          const uint8_t const **src, const uint8_t const *srcend,
  87                          uint32_t *codepoint0, uint32_t *state0)
  88    __attribute((always_inline));
  89  #endif
  90
  91 static inline uint8_t const *
  92 _hs_pipes_text_decode_utf8_int(uint16_t *const dest, size_t *destoff,
  93                          const uint8_t const **src, const uint8_t const *srcend,
  94                          uint32_t *codepoint0, uint32_t *state0)
  95 {
  96  uint16_t *d = dest + *destoff;
  97  const uint8_t *s = *src, *last = *src;
  98  uint32_t state = *state0;
  99  uint32_t codepoint = *codepoint0;
 100
 101  while (s < srcend) {
 102 #if defined(__i386__) || defined(__x86_64__)
 103    /*
 104     * This code will only work on a little-endian system that
 105     * supports unaligned loads.
 106     *
 107     * It gives a substantial speed win on data that is purely or
 108     * partly ASCII (e.g. HTML), at only a slight cost on purely
 109     * non-ASCII text.
 110     */
 111
 112    if (state == UTF8_ACCEPT) {
 113      while (s < srcend - 4) {
 114         codepoint = *((uint32_t *) s);
 115         if ((codepoint & 0x80808080) != 0)
 116           break;
 117         s += 4;
 118
 119         /*
 120          * Tried 32-bit stores here, but the extra bit-twiddling
 121          * slowed the code down.
 122          */
 123
 124         *d++ = (uint16_t) (codepoint & 0xff);
 125         *d++ = (uint16_t) ((codepoint >> 8) & 0xff);
 126         *d++ = (uint16_t) ((codepoint >> 16) & 0xff);
 127         *d++ = (uint16_t) ((codepoint >> 24) & 0xff);
 128      }
 129      last = s;
 130    }
 131 #endif
 132
 133    if (decode(&state, &codepoint, *s++) != UTF8_ACCEPT) {
 134      if (state != UTF8_REJECT)
 135         continue;
 136      break;
 137    }
 138
 139    if (codepoint <= 0xffff)
 140      *d++ = (uint16_t) codepoint;
 141    else {
 142      *d++ = (uint16_t) (0xD7C0 + (codepoint >> 10));
 143      *d++ = (uint16_t) (0xDC00 + (codepoint & 0x3FF));
 144    }
 145    last = s;
 146  }
 147
 148  *destoff = d - dest;
 149  *codepoint0 = codepoint;
 150  *state0 = state;
 151  *src = last;
 152
 153  return s;
 154 }
 155
 156 uint8_t const *
 157 _hs_pipes_text_decode_utf8_state(uint16_t *const dest, size_t *destoff,
 158                           const uint8_t const **src,
 159                            const uint8_t const *srcend,
 160                           uint32_t *codepoint0, uint32_t *state0)
 161 {
 162  uint8_t const *ret = _hs_pipes_text_decode_utf8_int(dest, destoff, src, srcend,
 163                                                 codepoint0, state0);
 164  if (*state0 == UTF8_REJECT)
 165    ret -=1;
 166  return ret;
 167 }
 168