aboutsummaryrefslogtreecommitdiffhomepage
path: root/vendor/github.com/apparentlymart/go-textseg/textseg/grapheme_clusters.rl
blob: 003ffbf5948080ec7687a2ab50975ff5cc70471b (plain) (blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
package textseg

import (
    "errors"
    "unicode/utf8"
)

// Generated from grapheme_clusters.rl. DO NOT EDIT
%%{
  # (except you are actually in grapheme_clusters.rl here, so edit away!)

  # Name the state machine. The Ragel blocks further down in this file
  # belong to the same machine.
  machine graphclust;
  # Ask Ragel to emit the machine's static data at this point in the file,
  # i.e. outside of any function.
  write data;
}%%

// Error is an exported error value carrying the message "invalid UTF8 text".
// NOTE(review): the code visible alongside this declaration never returns
// it; presumably it is meant for (or used by) code elsewhere -- confirm.
var Error = errors.New("invalid UTF8 text")

// ScanGraphemeClusters is a split function for bufio.Scanner that splits
// on grapheme cluster boundaries.
//
// It has the shape of a bufio.SplitFunc: the results are the number of bytes
// to advance, the token, and an error. The token is a sub-slice of data, not
// a copy. Every return path in this function yields a nil error.
//
// A result of (0, nil, nil) asks the caller for more data. That happens when
// data is empty, or when none of the cluster patterns produced a token and
// atEOF is false. When atEOF is true and no pattern produced a token, the
// first UTF-8 sequence in data is returned on its own.
func ScanGraphemeClusters(data []byte, atEOF bool) (int, []byte, error) {
    // Nothing to scan; this also guarantees len(data) >= 1 for the
    // utf8.DecodeRune fallback at the bottom of the function.
    if len(data) == 0 {
        return 0, nil, nil
    }

    // Ragel state. These are the variable names the code generated by
    // "write init" / "write exec" below operates on.
	cs := 0 // Current State
	p := 0  // "Pointer" into data
	pe := len(data) // End-of-data "pointer"
    // ts, te and act are the extra variables Ragel requires for a scanner
    // (the |* ... *| construct used for "main" below).
    ts := 0
    te := 0
    act := 0
    // NOTE(review): eof is always set to pe, independent of atEOF, so the
    // machine is told the buffer end is the end of input on every call.
    // Confirm this is intended for clusters that straddle a buffer boundary.
    eof := pe

    // Make Go compiler happy: Go rejects unused local variables, and the
    // generated code presumably does not reference every one of these.
    _ = ts
    _ = te
    _ = act
    _ = eof

    // Byte offsets into data recorded by the "start" and "end" actions and
    // consumed by the "emit" action.
    startPos := 0
    endPos := 0

    %%{
        # Pulls in the GraphemeCluster definitions from the table file. The
        # character classes used below (CR, LF, Control, Extend, ZWJ, L, V, T,
        # LV, LVT, Prepend, SpacingMark, Regional_Indicator, E_Base,
        # E_Base_GAZ, E_Modifier, Glue_After_Zwj) are not defined in this
        # file; presumably they come from this include -- confirm.
        include GraphemeCluster "grapheme_clusters_table.rl";

        # Records the position at which a pattern begins matching.
        action start {
            startPos = p
        }

        # Records the position of the most recent byte that completed a match.
        action end {
            endPos = p
        }

        # Returns from ScanGraphemeClusters with the recorded span as the
        # token. endPos is the index of the last byte of the match, hence the
        # +1 for both the advance count and the exclusive slice bound.
        # NOTE(review): the advance count is endPos+1 rather than
        # endPos+1-startPos, which is only equivalent when startPos is 0 --
        # presumably always the case since scanning begins at p = 0 and the
        # function returns on the first emit; confirm.
        action emit {
            return endPos+1, data[startPos:endPos+1], nil
        }

        # A ZWJ, optionally followed by a Glue_After_Zwj character or by an
        # E_Base_GAZ with its optional Extend run and modifier.
        ZWJGlue = ZWJ (Glue_After_Zwj | E_Base_GAZ Extend* E_Modifier?)?;
        # Anything that may continue an already-started cluster.
        AnyExtender = Extend | ZWJGlue | SpacingMark;
        # Zero or more continuations; appended to most sequences below.
        Extension = AnyExtender*;
        # The three-byte UTF-8 encoding of U+FFFD (REPLACEMENT CHARACTER).
        ReplacementChar = (0xEF 0xBF 0xBD);

        # CR immediately followed by LF is a single unit.
        CRLFSeq = CR LF;
        # Controls and the replacement character stand alone: note that no
        # Extension is appended here, unlike the sequences that follow.
        ControlSeq = Control | ReplacementChar;
        # Combinations of the L / V / T / LV / LVT classes, followed by any
        # extension.
        HangulSeq = (
            L+ (((LV? V+ | LVT) T*)?|LV?) |
            LV V* T* |
            V+ T* |
            LVT T* |
            T+
        ) Extension;
        # An emoji base with optional Extend run and modifier, then extension.
        EmojiSeq = (E_Base | E_Base_GAZ) Extend* E_Modifier? Extension;
        # A cluster that begins with ZWJ itself.
        ZWJSeq = ZWJGlue Extension;
        # One or two regional indicators, then extension.
        EmojiFlagSeq = Regional_Indicator Regional_Indicator? Extension;

        # Structural description of a UTF-8 sequence: a lead byte followed by
        # the number of continuation bytes its range implies (1- to 4-byte
        # forms). This checks byte ranges only; it does not decode or
        # range-check the code point.
        UTF8Cont = 0x80 .. 0xBF;
        AnyUTF8 = (
            0x00..0x7F |
            0xC0..0xDF . UTF8Cont |
            0xE0..0xEF . UTF8Cont . UTF8Cont |
            0xF0..0xF7 . UTF8Cont . UTF8Cont . UTF8Cont
        );

        # OtherSeq is any character that isn't at the start of one of the extended sequences above, followed by extension
        OtherSeq = (AnyUTF8 - (CR|LF|Control|ReplacementChar|L|LV|V|LVT|T|E_Base|E_Base_GAZ|ZWJ|Regional_Indicator|Prepend)) Extension;

        # PrependSeq is prepend followed by any of the other patterns above, except control characters which explicitly break
        # (CRLFSeq and ControlSeq are deliberately absent from the alternation;
        # the trailing "?" also lets a run of Prepend characters stand alone).
        PrependSeq = Prepend+ (HangulSeq|EmojiSeq|ZWJSeq|EmojiFlagSeq|OtherSeq)?;

        # Attach the bookkeeping actions to every sequence: "start" as the
        # entering action (>) and "end" as the finishing action (@).
        CRLFTok = CRLFSeq >start @end;
        ControlTok = ControlSeq >start @end;
        HangulTok = HangulSeq >start @end;
        EmojiTok = EmojiSeq >start @end;
        ZWJTok = ZWJSeq >start @end;
        EmojiFlagTok = EmojiFlagSeq >start @end;
        OtherTok = OtherSeq >start @end;
        PrependTok = PrependSeq >start @end;

        # The scanner proper. Every alternative runs "emit", which returns
        # from the enclosing Go function, so at most one token is produced
        # per call.
        main := |*
            CRLFTok => emit;
            ControlTok => emit;
            HangulTok => emit;
            EmojiTok => emit;
            ZWJTok => emit;
            EmojiFlagTok => emit;
            PrependTok => emit;
            OtherTok => emit;

            # any single valid UTF-8 character would also be valid per spec,
            # but we'll handle that separately after the loop so we can deal
            # with requesting more bytes if we're not at EOF.
        *|;

        # Generate the state initialisation and the execution loop in place.
        write init;
        write exec;
    }%%

    // If we fall out here then we were unable to complete a sequence.
    // If we weren't able to complete a sequence then either we've
    // reached the end of a partial buffer (so there's more data to come)
    // or we have an isolated symbol that would normally be part of a
    // grapheme cluster but has appeared in isolation here.

    if !atEOF {
        // Request more: advancing by 0 with a nil token and nil error tells
        // the caller to supply a larger buffer and call again.
        return 0, nil, nil
    }

    // Just take the first UTF-8 sequence and return that.
    // data is non-empty here (see the guard at the top), so DecodeRune
    // reports a width of at least 1 -- exactly 1 for an invalid encoding --
    // and the scan always makes progress.
    _, seqLen := utf8.DecodeRune(data)
    return seqLen, data[:seqLen], nil
}