]>
Commit | Line | Data |
---|---|---|
1 | // Copyright 2015 The Go Authors. All rights reserved. | |
2 | // Use of this source code is governed by a BSD-style | |
3 | // license that can be found in the LICENSE file. | |
4 | ||
5 | package bidi | |
6 | ||
7 | import ( | |
8 | "container/list" | |
9 | "fmt" | |
10 | "sort" | |
11 | ) | |
12 | ||
13 | // This file contains a port of the reference implementation of the | |
14 | // Bidi Parentheses Algorithm: | |
15 | // https://www.unicode.org/Public/PROGRAMS/BidiReferenceJava/BidiPBAReference.java | |
16 | // | |
17 | // The implementation in this file covers definitions BD14-BD16 and rule N0 | |
18 | // of UAX#9. | |
19 | // | |
20 | // Some preprocessing is done for each rune before data is passed to this | |
21 | // algorithm: | |
22 | // - opening and closing brackets are identified | |
23 | // - a bracket pair type, like '(' and ')' is assigned a unique identifier that | |
24 | // is identical for the opening and closing bracket. It is left to do these | |
25 | // mappings. | |
26 | // - The BPA algorithm requires that bracket characters that are canonical | |
27 | // equivalents of each other be able to be substituted for each other. | |
28 | // It is the responsibility of the caller to do this canonicalization. | |
29 | // | |
30 | // In implementing BD16, this implementation departs slightly from the "logical" | |
31 | // algorithm defined in UAX#9. In particular, the stack referenced there | |
32 | // supports operations that go beyond a "basic" stack. An equivalent | |
33 | // implementation based on a linked list is used here. | |
34 | ||
35 | // Bidi_Paired_Bracket_Type | |
36 | // BD14. An opening paired bracket is a character whose | |
37 | // Bidi_Paired_Bracket_Type property value is Open. | |
38 | // | |
39 | // BD15. A closing paired bracket is a character whose | |
40 | // Bidi_Paired_Bracket_Type property value is Close. | |
41 | type bracketType byte | |
42 | ||
43 | const ( | |
44 | bpNone bracketType = iota | |
45 | bpOpen | |
46 | bpClose | |
47 | ) | |
48 | ||
49 | // bracketPair holds a pair of index values for opening and closing bracket | |
50 | // location of a bracket pair. | |
51 | type bracketPair struct { | |
52 | opener int | |
53 | closer int | |
54 | } | |
55 | ||
56 | func (b *bracketPair) String() string { | |
57 | return fmt.Sprintf("(%v, %v)", b.opener, b.closer) | |
58 | } | |
59 | ||
60 | // bracketPairs is a slice of bracketPairs with a sort.Interface implementation. | |
61 | type bracketPairs []bracketPair | |
62 | ||
63 | func (b bracketPairs) Len() int { return len(b) } | |
64 | func (b bracketPairs) Swap(i, j int) { b[i], b[j] = b[j], b[i] } | |
65 | func (b bracketPairs) Less(i, j int) bool { return b[i].opener < b[j].opener } | |
66 | ||
67 | // resolvePairedBrackets runs the paired bracket part of the UBA algorithm. | |
68 | // | |
69 | // For each rune, it takes the indexes into the original string, the class the | |
70 | // bracket type (in pairTypes) and the bracket identifier (pairValues). It also | |
71 | // takes the direction type for the start-of-sentence and the embedding level. | |
72 | // | |
73 | // The identifiers for bracket types are the rune of the canonicalized opening | |
74 | // bracket for brackets (open or close) or 0 for runes that are not brackets. | |
75 | func resolvePairedBrackets(s *isolatingRunSequence) { | |
76 | p := bracketPairer{ | |
77 | sos: s.sos, | |
78 | openers: list.New(), | |
79 | codesIsolatedRun: s.types, | |
80 | indexes: s.indexes, | |
81 | } | |
82 | dirEmbed := L | |
83 | if s.level&1 != 0 { | |
84 | dirEmbed = R | |
85 | } | |
86 | p.locateBrackets(s.p.pairTypes, s.p.pairValues) | |
87 | p.resolveBrackets(dirEmbed, s.p.initialTypes) | |
88 | } | |
89 | ||
90 | type bracketPairer struct { | |
91 | sos Class // direction corresponding to start of sequence | |
92 | ||
93 | // The following is a restatement of BD 16 using non-algorithmic language. | |
94 | // | |
95 | // A bracket pair is a pair of characters consisting of an opening | |
96 | // paired bracket and a closing paired bracket such that the | |
97 | // Bidi_Paired_Bracket property value of the former equals the latter, | |
98 | // subject to the following constraints. | |
99 | // - both characters of a pair occur in the same isolating run sequence | |
100 | // - the closing character of a pair follows the opening character | |
101 | // - any bracket character can belong at most to one pair, the earliest possible one | |
102 | // - any bracket character not part of a pair is treated like an ordinary character | |
103 | // - pairs may nest properly, but their spans may not overlap otherwise | |
104 | ||
105 | // Bracket characters with canonical decompositions are supposed to be | |
106 | // treated as if they had been normalized, to allow normalized and non- | |
107 | // normalized text to give the same result. In this implementation that step | |
108 | // is pushed out to the caller. The caller has to ensure that the pairValue | |
109 | // slices contain the rune of the opening bracket after normalization for | |
110 | // any opening or closing bracket. | |
111 | ||
112 | openers *list.List // list of positions for opening brackets | |
113 | ||
114 | // bracket pair positions sorted by location of opening bracket | |
115 | pairPositions bracketPairs | |
116 | ||
117 | codesIsolatedRun []Class // directional bidi codes for an isolated run | |
118 | indexes []int // array of index values into the original string | |
119 | ||
120 | } | |
121 | ||
122 | // matchOpener reports whether characters at given positions form a matching | |
123 | // bracket pair. | |
124 | func (p *bracketPairer) matchOpener(pairValues []rune, opener, closer int) bool { | |
125 | return pairValues[p.indexes[opener]] == pairValues[p.indexes[closer]] | |
126 | } | |
127 | ||
128 | const maxPairingDepth = 63 | |
129 | ||
130 | // locateBrackets locates matching bracket pairs according to BD16. | |
131 | // | |
132 | // This implementation uses a linked list instead of a stack, because, while | |
133 | // elements are added at the front (like a push) they are not generally removed | |
134 | // in atomic 'pop' operations, reducing the benefit of the stack archetype. | |
135 | func (p *bracketPairer) locateBrackets(pairTypes []bracketType, pairValues []rune) { | |
136 | // traverse the run | |
137 | // do that explicitly (not in a for-each) so we can record position | |
138 | for i, index := range p.indexes { | |
139 | ||
140 | // look at the bracket type for each character | |
141 | if pairTypes[index] == bpNone || p.codesIsolatedRun[i] != ON { | |
142 | // continue scanning | |
143 | continue | |
144 | } | |
145 | switch pairTypes[index] { | |
146 | case bpOpen: | |
147 | // check if maximum pairing depth reached | |
148 | if p.openers.Len() == maxPairingDepth { | |
149 | p.openers.Init() | |
150 | return | |
151 | } | |
152 | // remember opener location, most recent first | |
153 | p.openers.PushFront(i) | |
154 | ||
155 | case bpClose: | |
156 | // see if there is a match | |
157 | count := 0 | |
158 | for elem := p.openers.Front(); elem != nil; elem = elem.Next() { | |
159 | count++ | |
160 | opener := elem.Value.(int) | |
161 | if p.matchOpener(pairValues, opener, i) { | |
162 | // if the opener matches, add nested pair to the ordered list | |
163 | p.pairPositions = append(p.pairPositions, bracketPair{opener, i}) | |
164 | // remove up to and including matched opener | |
165 | for ; count > 0; count-- { | |
166 | p.openers.Remove(p.openers.Front()) | |
167 | } | |
168 | break | |
169 | } | |
170 | } | |
171 | sort.Sort(p.pairPositions) | |
172 | // if we get here, the closing bracket matched no openers | |
173 | // and gets ignored | |
174 | } | |
175 | } | |
176 | } | |
177 | ||
178 | // Bracket pairs within an isolating run sequence are processed as units so | |
179 | // that both the opening and the closing paired bracket in a pair resolve to | |
180 | // the same direction. | |
181 | // | |
182 | // N0. Process bracket pairs in an isolating run sequence sequentially in | |
183 | // the logical order of the text positions of the opening paired brackets | |
184 | // using the logic given below. Within this scope, bidirectional types EN | |
185 | // and AN are treated as R. | |
186 | // | |
187 | // Identify the bracket pairs in the current isolating run sequence | |
188 | // according to BD16. For each bracket-pair element in the list of pairs of | |
189 | // text positions: | |
190 | // | |
191 | // a Inspect the bidirectional types of the characters enclosed within the | |
192 | // bracket pair. | |
193 | // | |
194 | // b If any strong type (either L or R) matching the embedding direction is | |
195 | // found, set the type for both brackets in the pair to match the embedding | |
196 | // direction. | |
197 | // | |
198 | // o [ e ] o -> o e e e o | |
199 | // | |
200 | // o [ o e ] -> o e o e e | |
201 | // | |
202 | // o [ NI e ] -> o e NI e e | |
203 | // | |
204 | // c Otherwise, if a strong type (opposite the embedding direction) is | |
205 | // found, test for adjacent strong types as follows: 1 First, check | |
206 | // backwards before the opening paired bracket until the first strong type | |
207 | // (L, R, or sos) is found. If that first preceding strong type is opposite | |
208 | // the embedding direction, then set the type for both brackets in the pair | |
209 | // to that type. 2 Otherwise, set the type for both brackets in the pair to | |
210 | // the embedding direction. | |
211 | // | |
212 | // o [ o ] e -> o o o o e | |
213 | // | |
214 | // o [ o NI ] o -> o o o NI o o | |
215 | // | |
216 | // e [ o ] o -> e e o e o | |
217 | // | |
218 | // e [ o ] e -> e e o e e | |
219 | // | |
220 | // e ( o [ o ] NI ) e -> e e o o o o NI e e | |
221 | // | |
222 | // d Otherwise, do not set the type for the current bracket pair. Note that | |
223 | // if the enclosed text contains no strong types the paired brackets will | |
224 | // both resolve to the same level when resolved individually using rules N1 | |
225 | // and N2. | |
226 | // | |
227 | // e ( NI ) o -> e ( NI ) o | |
228 | ||
229 | // getStrongTypeN0 maps character's directional code to strong type as required | |
230 | // by rule N0. | |
231 | // | |
232 | // TODO: have separate type for "strong" directionality. | |
233 | func (p *bracketPairer) getStrongTypeN0(index int) Class { | |
234 | switch p.codesIsolatedRun[index] { | |
235 | // in the scope of N0, number types are treated as R | |
236 | case EN, AN, AL, R: | |
237 | return R | |
238 | case L: | |
239 | return L | |
240 | default: | |
241 | return ON | |
242 | } | |
243 | } | |
244 | ||
245 | // classifyPairContent reports the strong types contained inside a Bracket Pair, | |
246 | // assuming the given embedding direction. | |
247 | // | |
248 | // It returns ON if no strong type is found. If a single strong type is found, | |
249 | // it returns this type. Otherwise it returns the embedding direction. | |
250 | // | |
251 | // TODO: use separate type for "strong" directionality. | |
252 | func (p *bracketPairer) classifyPairContent(loc bracketPair, dirEmbed Class) Class { | |
253 | dirOpposite := ON | |
254 | for i := loc.opener + 1; i < loc.closer; i++ { | |
255 | dir := p.getStrongTypeN0(i) | |
256 | if dir == ON { | |
257 | continue | |
258 | } | |
259 | if dir == dirEmbed { | |
260 | return dir // type matching embedding direction found | |
261 | } | |
262 | dirOpposite = dir | |
263 | } | |
264 | // return ON if no strong type found, or class opposite to dirEmbed | |
265 | return dirOpposite | |
266 | } | |
267 | ||
268 | // classBeforePair determines which strong types are present before a Bracket | |
269 | // Pair. Return R or L if strong type found, otherwise ON. | |
270 | func (p *bracketPairer) classBeforePair(loc bracketPair) Class { | |
271 | for i := loc.opener - 1; i >= 0; i-- { | |
272 | if dir := p.getStrongTypeN0(i); dir != ON { | |
273 | return dir | |
274 | } | |
275 | } | |
276 | // no strong types found, return sos | |
277 | return p.sos | |
278 | } | |
279 | ||
280 | // assignBracketType implements rule N0 for a single bracket pair. | |
281 | func (p *bracketPairer) assignBracketType(loc bracketPair, dirEmbed Class, initialTypes []Class) { | |
282 | // rule "N0, a", inspect contents of pair | |
283 | dirPair := p.classifyPairContent(loc, dirEmbed) | |
284 | ||
285 | // dirPair is now L, R, or N (no strong type found) | |
286 | ||
287 | // the following logical tests are performed out of order compared to | |
288 | // the statement of the rules but yield the same results | |
289 | if dirPair == ON { | |
290 | return // case "d" - nothing to do | |
291 | } | |
292 | ||
293 | if dirPair != dirEmbed { | |
294 | // case "c": strong type found, opposite - check before (c.1) | |
295 | dirPair = p.classBeforePair(loc) | |
296 | if dirPair == dirEmbed || dirPair == ON { | |
297 | // no strong opposite type found before - use embedding (c.2) | |
298 | dirPair = dirEmbed | |
299 | } | |
300 | } | |
301 | // else: case "b", strong type found matching embedding, | |
302 | // no explicit action needed, as dirPair is already set to embedding | |
303 | // direction | |
304 | ||
305 | // set the bracket types to the type found | |
306 | p.setBracketsToType(loc, dirPair, initialTypes) | |
307 | } | |
308 | ||
309 | func (p *bracketPairer) setBracketsToType(loc bracketPair, dirPair Class, initialTypes []Class) { | |
310 | p.codesIsolatedRun[loc.opener] = dirPair | |
311 | p.codesIsolatedRun[loc.closer] = dirPair | |
312 | ||
313 | for i := loc.opener + 1; i < loc.closer; i++ { | |
314 | index := p.indexes[i] | |
315 | if initialTypes[index] != NSM { | |
316 | break | |
317 | } | |
318 | p.codesIsolatedRun[i] = dirPair | |
319 | } | |
320 | ||
321 | for i := loc.closer + 1; i < len(p.indexes); i++ { | |
322 | index := p.indexes[i] | |
323 | if initialTypes[index] != NSM { | |
324 | break | |
325 | } | |
326 | p.codesIsolatedRun[i] = dirPair | |
327 | } | |
328 | } | |
329 | ||
330 | // resolveBrackets implements rule N0 for a list of pairs. | |
331 | func (p *bracketPairer) resolveBrackets(dirEmbed Class, initialTypes []Class) { | |
332 | for _, loc := range p.pairPositions { | |
333 | p.assignBracketType(loc, dirEmbed, initialTypes) | |
334 | } | |
335 | } |