]>
Commit | Line | Data |
---|---|---|
15c0b25d AP |
1 | // Copyright 2015 The Go Authors. All rights reserved. |
2 | // Use of this source code is governed by a BSD-style | |
3 | // license that can be found in the LICENSE file. | |
4 | ||
5 | package bidi | |
6 | ||
7 | import "log" | |
8 | ||
9 | // This implementation is a port based on the reference implementation found at: | |
107c1cdb | 10 | // https://www.unicode.org/Public/PROGRAMS/BidiReferenceJava/ |
15c0b25d AP |
11 | // |
12 | // described in Unicode Bidirectional Algorithm (UAX #9). | |
13 | // | |
14 | // Input: | |
15 | // There are two levels of input to the algorithm, since clients may prefer to | |
16 | // supply some information from out-of-band sources rather than relying on the | |
17 | // default behavior. | |
18 | // | |
19 | // - Bidi class array | |
20 | // - Bidi class array, with externally supplied base line direction | |
21 | // | |
22 | // Output: | |
23 | // Output is separated into several stages: | |
24 | // | |
25 | // - levels array over entire paragraph | |
26 | // - reordering array over entire paragraph | |
27 | // - levels array over line | |
28 | // - reordering array over line | |
29 | // | |
30 | // Note that for conformance to the Unicode Bidirectional Algorithm, | |
31 | // implementations are only required to generate correct reordering and | |
32 | // character directionality (odd or even levels) over a line. Generating | |
33 | // identical level arrays over a line is not required. Bidi explicit format | |
34 | // codes (LRE, RLE, LRO, RLO, PDF) and BN can be assigned arbitrary levels and | |
35 | // positions as long as the rest of the input is properly reordered. | |
36 | // | |
37 | // As the algorithm is defined to operate on a single paragraph at a time, this | |
38 | // implementation is written to handle single paragraphs. Thus rule P1 is | |
39 | // presumed by this implementation-- the data provided to the implementation is | |
40 | // assumed to be a single paragraph, and either contains no 'B' codes, or a | |
41 | // single 'B' code at the end of the input. 'B' is allowed as input to | |
42 | // illustrate how the algorithm assigns it a level. | |
43 | // | |
44 | // Also note that rules L3 and L4 depend on the rendering engine that uses the | |
45 | // result of the bidi algorithm. This implementation assumes that the rendering | |
46 | // engine expects combining marks in visual order (e.g. to the left of their | |
47 | // base character in RTL runs) and that it adjusts the glyphs used to render | |
48 | // mirrored characters that are in RTL runs so that they render appropriately. | |
49 | ||
50 | // level is the embedding level of a character. Even embedding levels indicate | |
51 | // left-to-right order and odd levels indicate right-to-left order. The special | |
52 | // level of -1 is reserved for undefined order. | |
53 | type level int8 | |
54 | ||
55 | const implicitLevel level = -1 | |
56 | ||
57 | // in returns if x is equal to any of the values in set. | |
58 | func (c Class) in(set ...Class) bool { | |
59 | for _, s := range set { | |
60 | if c == s { | |
61 | return true | |
62 | } | |
63 | } | |
64 | return false | |
65 | } | |
66 | ||
67 | // A paragraph contains the state of a paragraph. | |
68 | type paragraph struct { | |
69 | initialTypes []Class | |
70 | ||
71 | // Arrays of properties needed for paired bracket evaluation in N0 | |
72 | pairTypes []bracketType // paired Bracket types for paragraph | |
73 | pairValues []rune // rune for opening bracket or pbOpen and pbClose; 0 for pbNone | |
74 | ||
75 | embeddingLevel level // default: = implicitLevel; | |
76 | ||
77 | // at the paragraph levels | |
78 | resultTypes []Class | |
79 | resultLevels []level | |
80 | ||
81 | // Index of matching PDI for isolate initiator characters. For other | |
82 | // characters, the value of matchingPDI will be set to -1. For isolate | |
83 | // initiators with no matching PDI, matchingPDI will be set to the length of | |
84 | // the input string. | |
85 | matchingPDI []int | |
86 | ||
87 | // Index of matching isolate initiator for PDI characters. For other | |
88 | // characters, and for PDIs with no matching isolate initiator, the value of | |
89 | // matchingIsolateInitiator will be set to -1. | |
90 | matchingIsolateInitiator []int | |
91 | } | |
92 | ||
93 | // newParagraph initializes a paragraph. The user needs to supply a few arrays | |
94 | // corresponding to the preprocessed text input. The types correspond to the | |
95 | // Unicode BiDi classes for each rune. pairTypes indicates the bracket type for | |
96 | // each rune. pairValues provides a unique bracket class identifier for each | |
97 | // rune (suggested is the rune of the open bracket for opening and matching | |
98 | // close brackets, after normalization). The embedding levels are optional, but | |
99 | // may be supplied to encode embedding levels of styled text. | |
100 | // | |
101 | // TODO: return an error. | |
102 | func newParagraph(types []Class, pairTypes []bracketType, pairValues []rune, levels level) *paragraph { | |
103 | validateTypes(types) | |
104 | validatePbTypes(pairTypes) | |
105 | validatePbValues(pairValues, pairTypes) | |
106 | validateParagraphEmbeddingLevel(levels) | |
107 | ||
108 | p := ¶graph{ | |
109 | initialTypes: append([]Class(nil), types...), | |
110 | embeddingLevel: levels, | |
111 | ||
112 | pairTypes: pairTypes, | |
113 | pairValues: pairValues, | |
114 | ||
115 | resultTypes: append([]Class(nil), types...), | |
116 | } | |
117 | p.run() | |
118 | return p | |
119 | } | |
120 | ||
121 | func (p *paragraph) Len() int { return len(p.initialTypes) } | |
122 | ||
123 | // The algorithm. Does not include line-based processing (Rules L1, L2). | |
124 | // These are applied later in the line-based phase of the algorithm. | |
125 | func (p *paragraph) run() { | |
126 | p.determineMatchingIsolates() | |
127 | ||
128 | // 1) determining the paragraph level | |
129 | // Rule P1 is the requirement for entering this algorithm. | |
130 | // Rules P2, P3. | |
131 | // If no externally supplied paragraph embedding level, use default. | |
132 | if p.embeddingLevel == implicitLevel { | |
133 | p.embeddingLevel = p.determineParagraphEmbeddingLevel(0, p.Len()) | |
134 | } | |
135 | ||
136 | // Initialize result levels to paragraph embedding level. | |
137 | p.resultLevels = make([]level, p.Len()) | |
138 | setLevels(p.resultLevels, p.embeddingLevel) | |
139 | ||
140 | // 2) Explicit levels and directions | |
141 | // Rules X1-X8. | |
142 | p.determineExplicitEmbeddingLevels() | |
143 | ||
144 | // Rule X9. | |
145 | // We do not remove the embeddings, the overrides, the PDFs, and the BNs | |
146 | // from the string explicitly. But they are not copied into isolating run | |
147 | // sequences when they are created, so they are removed for all | |
148 | // practical purposes. | |
149 | ||
150 | // Rule X10. | |
151 | // Run remainder of algorithm one isolating run sequence at a time | |
152 | for _, seq := range p.determineIsolatingRunSequences() { | |
153 | // 3) resolving weak types | |
154 | // Rules W1-W7. | |
155 | seq.resolveWeakTypes() | |
156 | ||
157 | // 4a) resolving paired brackets | |
158 | // Rule N0 | |
159 | resolvePairedBrackets(seq) | |
160 | ||
161 | // 4b) resolving neutral types | |
162 | // Rules N1-N3. | |
163 | seq.resolveNeutralTypes() | |
164 | ||
165 | // 5) resolving implicit embedding levels | |
166 | // Rules I1, I2. | |
167 | seq.resolveImplicitLevels() | |
168 | ||
169 | // Apply the computed levels and types | |
170 | seq.applyLevelsAndTypes() | |
171 | } | |
172 | ||
173 | // Assign appropriate levels to 'hide' LREs, RLEs, LROs, RLOs, PDFs, and | |
174 | // BNs. This is for convenience, so the resulting level array will have | |
175 | // a value for every character. | |
176 | p.assignLevelsToCharactersRemovedByX9() | |
177 | } | |
178 | ||
179 | // determineMatchingIsolates determines the matching PDI for each isolate | |
180 | // initiator and vice versa. | |
181 | // | |
182 | // Definition BD9. | |
183 | // | |
184 | // At the end of this function: | |
185 | // | |
186 | // - The member variable matchingPDI is set to point to the index of the | |
187 | // matching PDI character for each isolate initiator character. If there is | |
188 | // no matching PDI, it is set to the length of the input text. For other | |
189 | // characters, it is set to -1. | |
190 | // - The member variable matchingIsolateInitiator is set to point to the | |
191 | // index of the matching isolate initiator character for each PDI character. | |
192 | // If there is no matching isolate initiator, or the character is not a PDI, | |
193 | // it is set to -1. | |
194 | func (p *paragraph) determineMatchingIsolates() { | |
195 | p.matchingPDI = make([]int, p.Len()) | |
196 | p.matchingIsolateInitiator = make([]int, p.Len()) | |
197 | ||
198 | for i := range p.matchingIsolateInitiator { | |
199 | p.matchingIsolateInitiator[i] = -1 | |
200 | } | |
201 | ||
202 | for i := range p.matchingPDI { | |
203 | p.matchingPDI[i] = -1 | |
204 | ||
205 | if t := p.resultTypes[i]; t.in(LRI, RLI, FSI) { | |
206 | depthCounter := 1 | |
207 | for j := i + 1; j < p.Len(); j++ { | |
208 | if u := p.resultTypes[j]; u.in(LRI, RLI, FSI) { | |
209 | depthCounter++ | |
210 | } else if u == PDI { | |
211 | if depthCounter--; depthCounter == 0 { | |
212 | p.matchingPDI[i] = j | |
213 | p.matchingIsolateInitiator[j] = i | |
214 | break | |
215 | } | |
216 | } | |
217 | } | |
218 | if p.matchingPDI[i] == -1 { | |
219 | p.matchingPDI[i] = p.Len() | |
220 | } | |
221 | } | |
222 | } | |
223 | } | |
224 | ||
225 | // determineParagraphEmbeddingLevel reports the resolved paragraph direction of | |
226 | // the substring limited by the given range [start, end). | |
227 | // | |
228 | // Determines the paragraph level based on rules P2, P3. This is also used | |
229 | // in rule X5c to find if an FSI should resolve to LRI or RLI. | |
230 | func (p *paragraph) determineParagraphEmbeddingLevel(start, end int) level { | |
231 | var strongType Class = unknownClass | |
232 | ||
233 | // Rule P2. | |
234 | for i := start; i < end; i++ { | |
235 | if t := p.resultTypes[i]; t.in(L, AL, R) { | |
236 | strongType = t | |
237 | break | |
238 | } else if t.in(FSI, LRI, RLI) { | |
239 | i = p.matchingPDI[i] // skip over to the matching PDI | |
240 | if i > end { | |
241 | log.Panic("assert (i <= end)") | |
242 | } | |
243 | } | |
244 | } | |
245 | // Rule P3. | |
246 | switch strongType { | |
247 | case unknownClass: // none found | |
248 | // default embedding level when no strong types found is 0. | |
249 | return 0 | |
250 | case L: | |
251 | return 0 | |
252 | default: // AL, R | |
253 | return 1 | |
254 | } | |
255 | } | |
256 | ||
257 | const maxDepth = 125 | |
258 | ||
259 | // This stack will store the embedding levels and override and isolated | |
260 | // statuses | |
261 | type directionalStatusStack struct { | |
262 | stackCounter int | |
263 | embeddingLevelStack [maxDepth + 1]level | |
264 | overrideStatusStack [maxDepth + 1]Class | |
265 | isolateStatusStack [maxDepth + 1]bool | |
266 | } | |
267 | ||
268 | func (s *directionalStatusStack) empty() { s.stackCounter = 0 } | |
269 | func (s *directionalStatusStack) pop() { s.stackCounter-- } | |
270 | func (s *directionalStatusStack) depth() int { return s.stackCounter } | |
271 | ||
272 | func (s *directionalStatusStack) push(level level, overrideStatus Class, isolateStatus bool) { | |
273 | s.embeddingLevelStack[s.stackCounter] = level | |
274 | s.overrideStatusStack[s.stackCounter] = overrideStatus | |
275 | s.isolateStatusStack[s.stackCounter] = isolateStatus | |
276 | s.stackCounter++ | |
277 | } | |
278 | ||
279 | func (s *directionalStatusStack) lastEmbeddingLevel() level { | |
280 | return s.embeddingLevelStack[s.stackCounter-1] | |
281 | } | |
282 | ||
283 | func (s *directionalStatusStack) lastDirectionalOverrideStatus() Class { | |
284 | return s.overrideStatusStack[s.stackCounter-1] | |
285 | } | |
286 | ||
287 | func (s *directionalStatusStack) lastDirectionalIsolateStatus() bool { | |
288 | return s.isolateStatusStack[s.stackCounter-1] | |
289 | } | |
290 | ||
291 | // Determine explicit levels using rules X1 - X8 | |
292 | func (p *paragraph) determineExplicitEmbeddingLevels() { | |
293 | var stack directionalStatusStack | |
294 | var overflowIsolateCount, overflowEmbeddingCount, validIsolateCount int | |
295 | ||
296 | // Rule X1. | |
297 | stack.push(p.embeddingLevel, ON, false) | |
298 | ||
299 | for i, t := range p.resultTypes { | |
300 | // Rules X2, X3, X4, X5, X5a, X5b, X5c | |
301 | switch t { | |
302 | case RLE, LRE, RLO, LRO, RLI, LRI, FSI: | |
303 | isIsolate := t.in(RLI, LRI, FSI) | |
304 | isRTL := t.in(RLE, RLO, RLI) | |
305 | ||
306 | // override if this is an FSI that resolves to RLI | |
307 | if t == FSI { | |
308 | isRTL = (p.determineParagraphEmbeddingLevel(i+1, p.matchingPDI[i]) == 1) | |
309 | } | |
310 | if isIsolate { | |
311 | p.resultLevels[i] = stack.lastEmbeddingLevel() | |
312 | if stack.lastDirectionalOverrideStatus() != ON { | |
313 | p.resultTypes[i] = stack.lastDirectionalOverrideStatus() | |
314 | } | |
315 | } | |
316 | ||
317 | var newLevel level | |
318 | if isRTL { | |
319 | // least greater odd | |
320 | newLevel = (stack.lastEmbeddingLevel() + 1) | 1 | |
321 | } else { | |
322 | // least greater even | |
323 | newLevel = (stack.lastEmbeddingLevel() + 2) &^ 1 | |
324 | } | |
325 | ||
326 | if newLevel <= maxDepth && overflowIsolateCount == 0 && overflowEmbeddingCount == 0 { | |
327 | if isIsolate { | |
328 | validIsolateCount++ | |
329 | } | |
330 | // Push new embedding level, override status, and isolated | |
331 | // status. | |
332 | // No check for valid stack counter, since the level check | |
333 | // suffices. | |
334 | switch t { | |
335 | case LRO: | |
336 | stack.push(newLevel, L, isIsolate) | |
337 | case RLO: | |
338 | stack.push(newLevel, R, isIsolate) | |
339 | default: | |
340 | stack.push(newLevel, ON, isIsolate) | |
341 | } | |
342 | // Not really part of the spec | |
343 | if !isIsolate { | |
344 | p.resultLevels[i] = newLevel | |
345 | } | |
346 | } else { | |
347 | // This is an invalid explicit formatting character, | |
348 | // so apply the "Otherwise" part of rules X2-X5b. | |
349 | if isIsolate { | |
350 | overflowIsolateCount++ | |
351 | } else { // !isIsolate | |
352 | if overflowIsolateCount == 0 { | |
353 | overflowEmbeddingCount++ | |
354 | } | |
355 | } | |
356 | } | |
357 | ||
358 | // Rule X6a | |
359 | case PDI: | |
360 | if overflowIsolateCount > 0 { | |
361 | overflowIsolateCount-- | |
362 | } else if validIsolateCount == 0 { | |
363 | // do nothing | |
364 | } else { | |
365 | overflowEmbeddingCount = 0 | |
366 | for !stack.lastDirectionalIsolateStatus() { | |
367 | stack.pop() | |
368 | } | |
369 | stack.pop() | |
370 | validIsolateCount-- | |
371 | } | |
372 | p.resultLevels[i] = stack.lastEmbeddingLevel() | |
373 | ||
374 | // Rule X7 | |
375 | case PDF: | |
376 | // Not really part of the spec | |
377 | p.resultLevels[i] = stack.lastEmbeddingLevel() | |
378 | ||
379 | if overflowIsolateCount > 0 { | |
380 | // do nothing | |
381 | } else if overflowEmbeddingCount > 0 { | |
382 | overflowEmbeddingCount-- | |
383 | } else if !stack.lastDirectionalIsolateStatus() && stack.depth() >= 2 { | |
384 | stack.pop() | |
385 | } | |
386 | ||
387 | case B: // paragraph separator. | |
388 | // Rule X8. | |
389 | ||
390 | // These values are reset for clarity, in this implementation B | |
391 | // can only occur as the last code in the array. | |
392 | stack.empty() | |
393 | overflowIsolateCount = 0 | |
394 | overflowEmbeddingCount = 0 | |
395 | validIsolateCount = 0 | |
396 | p.resultLevels[i] = p.embeddingLevel | |
397 | ||
398 | default: | |
399 | p.resultLevels[i] = stack.lastEmbeddingLevel() | |
400 | if stack.lastDirectionalOverrideStatus() != ON { | |
401 | p.resultTypes[i] = stack.lastDirectionalOverrideStatus() | |
402 | } | |
403 | } | |
404 | } | |
405 | } | |
406 | ||
407 | type isolatingRunSequence struct { | |
408 | p *paragraph | |
409 | ||
410 | indexes []int // indexes to the original string | |
411 | ||
412 | types []Class // type of each character using the index | |
413 | resolvedLevels []level // resolved levels after application of rules | |
414 | level level | |
415 | sos, eos Class | |
416 | } | |
417 | ||
418 | func (i *isolatingRunSequence) Len() int { return len(i.indexes) } | |
419 | ||
420 | func maxLevel(a, b level) level { | |
421 | if a > b { | |
422 | return a | |
423 | } | |
424 | return b | |
425 | } | |
426 | ||
427 | // Rule X10, second bullet: Determine the start-of-sequence (sos) and end-of-sequence (eos) types, | |
428 | // either L or R, for each isolating run sequence. | |
429 | func (p *paragraph) isolatingRunSequence(indexes []int) *isolatingRunSequence { | |
430 | length := len(indexes) | |
431 | types := make([]Class, length) | |
432 | for i, x := range indexes { | |
433 | types[i] = p.resultTypes[x] | |
434 | } | |
435 | ||
436 | // assign level, sos and eos | |
437 | prevChar := indexes[0] - 1 | |
438 | for prevChar >= 0 && isRemovedByX9(p.initialTypes[prevChar]) { | |
439 | prevChar-- | |
440 | } | |
441 | prevLevel := p.embeddingLevel | |
442 | if prevChar >= 0 { | |
443 | prevLevel = p.resultLevels[prevChar] | |
444 | } | |
445 | ||
446 | var succLevel level | |
447 | lastType := types[length-1] | |
448 | if lastType.in(LRI, RLI, FSI) { | |
449 | succLevel = p.embeddingLevel | |
450 | } else { | |
451 | // the first character after the end of run sequence | |
452 | limit := indexes[length-1] + 1 | |
453 | for ; limit < p.Len() && isRemovedByX9(p.initialTypes[limit]); limit++ { | |
454 | ||
455 | } | |
456 | succLevel = p.embeddingLevel | |
457 | if limit < p.Len() { | |
458 | succLevel = p.resultLevels[limit] | |
459 | } | |
460 | } | |
461 | level := p.resultLevels[indexes[0]] | |
462 | return &isolatingRunSequence{ | |
463 | p: p, | |
464 | indexes: indexes, | |
465 | types: types, | |
466 | level: level, | |
467 | sos: typeForLevel(maxLevel(prevLevel, level)), | |
468 | eos: typeForLevel(maxLevel(succLevel, level)), | |
469 | } | |
470 | } | |
471 | ||
472 | // Resolving weak types Rules W1-W7. | |
473 | // | |
474 | // Note that some weak types (EN, AN) remain after this processing is | |
475 | // complete. | |
476 | func (s *isolatingRunSequence) resolveWeakTypes() { | |
477 | ||
478 | // on entry, only these types remain | |
479 | s.assertOnly(L, R, AL, EN, ES, ET, AN, CS, B, S, WS, ON, NSM, LRI, RLI, FSI, PDI) | |
480 | ||
481 | // Rule W1. | |
482 | // Changes all NSMs. | |
483 | preceedingCharacterType := s.sos | |
484 | for i, t := range s.types { | |
485 | if t == NSM { | |
486 | s.types[i] = preceedingCharacterType | |
487 | } else { | |
488 | if t.in(LRI, RLI, FSI, PDI) { | |
489 | preceedingCharacterType = ON | |
490 | } | |
491 | preceedingCharacterType = t | |
492 | } | |
493 | } | |
494 | ||
495 | // Rule W2. | |
496 | // EN does not change at the start of the run, because sos != AL. | |
497 | for i, t := range s.types { | |
498 | if t == EN { | |
499 | for j := i - 1; j >= 0; j-- { | |
500 | if t := s.types[j]; t.in(L, R, AL) { | |
501 | if t == AL { | |
502 | s.types[i] = AN | |
503 | } | |
504 | break | |
505 | } | |
506 | } | |
507 | } | |
508 | } | |
509 | ||
510 | // Rule W3. | |
511 | for i, t := range s.types { | |
512 | if t == AL { | |
513 | s.types[i] = R | |
514 | } | |
515 | } | |
516 | ||
517 | // Rule W4. | |
518 | // Since there must be values on both sides for this rule to have an | |
519 | // effect, the scan skips the first and last value. | |
520 | // | |
521 | // Although the scan proceeds left to right, and changes the type | |
522 | // values in a way that would appear to affect the computations | |
523 | // later in the scan, there is actually no problem. A change in the | |
524 | // current value can only affect the value to its immediate right, | |
525 | // and only affect it if it is ES or CS. But the current value can | |
526 | // only change if the value to its right is not ES or CS. Thus | |
527 | // either the current value will not change, or its change will have | |
528 | // no effect on the remainder of the analysis. | |
529 | ||
530 | for i := 1; i < s.Len()-1; i++ { | |
531 | t := s.types[i] | |
532 | if t == ES || t == CS { | |
533 | prevSepType := s.types[i-1] | |
534 | succSepType := s.types[i+1] | |
535 | if prevSepType == EN && succSepType == EN { | |
536 | s.types[i] = EN | |
537 | } else if s.types[i] == CS && prevSepType == AN && succSepType == AN { | |
538 | s.types[i] = AN | |
539 | } | |
540 | } | |
541 | } | |
542 | ||
543 | // Rule W5. | |
544 | for i, t := range s.types { | |
545 | if t == ET { | |
546 | // locate end of sequence | |
547 | runStart := i | |
548 | runEnd := s.findRunLimit(runStart, ET) | |
549 | ||
550 | // check values at ends of sequence | |
551 | t := s.sos | |
552 | if runStart > 0 { | |
553 | t = s.types[runStart-1] | |
554 | } | |
555 | if t != EN { | |
556 | t = s.eos | |
557 | if runEnd < len(s.types) { | |
558 | t = s.types[runEnd] | |
559 | } | |
560 | } | |
561 | if t == EN { | |
562 | setTypes(s.types[runStart:runEnd], EN) | |
563 | } | |
564 | // continue at end of sequence | |
565 | i = runEnd | |
566 | } | |
567 | } | |
568 | ||
569 | // Rule W6. | |
570 | for i, t := range s.types { | |
571 | if t.in(ES, ET, CS) { | |
572 | s.types[i] = ON | |
573 | } | |
574 | } | |
575 | ||
576 | // Rule W7. | |
577 | for i, t := range s.types { | |
578 | if t == EN { | |
579 | // set default if we reach start of run | |
580 | prevStrongType := s.sos | |
581 | for j := i - 1; j >= 0; j-- { | |
582 | t = s.types[j] | |
583 | if t == L || t == R { // AL's have been changed to R | |
584 | prevStrongType = t | |
585 | break | |
586 | } | |
587 | } | |
588 | if prevStrongType == L { | |
589 | s.types[i] = L | |
590 | } | |
591 | } | |
592 | } | |
593 | } | |
594 | ||
595 | // 6) resolving neutral types Rules N1-N2. | |
596 | func (s *isolatingRunSequence) resolveNeutralTypes() { | |
597 | ||
598 | // on entry, only these types can be in resultTypes | |
599 | s.assertOnly(L, R, EN, AN, B, S, WS, ON, RLI, LRI, FSI, PDI) | |
600 | ||
601 | for i, t := range s.types { | |
602 | switch t { | |
603 | case WS, ON, B, S, RLI, LRI, FSI, PDI: | |
604 | // find bounds of run of neutrals | |
605 | runStart := i | |
606 | runEnd := s.findRunLimit(runStart, B, S, WS, ON, RLI, LRI, FSI, PDI) | |
607 | ||
608 | // determine effective types at ends of run | |
609 | var leadType, trailType Class | |
610 | ||
611 | // Note that the character found can only be L, R, AN, or | |
612 | // EN. | |
613 | if runStart == 0 { | |
614 | leadType = s.sos | |
615 | } else { | |
616 | leadType = s.types[runStart-1] | |
617 | if leadType.in(AN, EN) { | |
618 | leadType = R | |
619 | } | |
620 | } | |
621 | if runEnd == len(s.types) { | |
622 | trailType = s.eos | |
623 | } else { | |
624 | trailType = s.types[runEnd] | |
625 | if trailType.in(AN, EN) { | |
626 | trailType = R | |
627 | } | |
628 | } | |
629 | ||
630 | var resolvedType Class | |
631 | if leadType == trailType { | |
632 | // Rule N1. | |
633 | resolvedType = leadType | |
634 | } else { | |
635 | // Rule N2. | |
636 | // Notice the embedding level of the run is used, not | |
637 | // the paragraph embedding level. | |
638 | resolvedType = typeForLevel(s.level) | |
639 | } | |
640 | ||
641 | setTypes(s.types[runStart:runEnd], resolvedType) | |
642 | ||
643 | // skip over run of (former) neutrals | |
644 | i = runEnd | |
645 | } | |
646 | } | |
647 | } | |
648 | ||
649 | func setLevels(levels []level, newLevel level) { | |
650 | for i := range levels { | |
651 | levels[i] = newLevel | |
652 | } | |
653 | } | |
654 | ||
655 | func setTypes(types []Class, newType Class) { | |
656 | for i := range types { | |
657 | types[i] = newType | |
658 | } | |
659 | } | |
660 | ||
661 | // 7) resolving implicit embedding levels Rules I1, I2. | |
662 | func (s *isolatingRunSequence) resolveImplicitLevels() { | |
663 | ||
664 | // on entry, only these types can be in resultTypes | |
665 | s.assertOnly(L, R, EN, AN) | |
666 | ||
667 | s.resolvedLevels = make([]level, len(s.types)) | |
668 | setLevels(s.resolvedLevels, s.level) | |
669 | ||
670 | if (s.level & 1) == 0 { // even level | |
671 | for i, t := range s.types { | |
672 | // Rule I1. | |
673 | if t == L { | |
674 | // no change | |
675 | } else if t == R { | |
676 | s.resolvedLevels[i] += 1 | |
677 | } else { // t == AN || t == EN | |
678 | s.resolvedLevels[i] += 2 | |
679 | } | |
680 | } | |
681 | } else { // odd level | |
682 | for i, t := range s.types { | |
683 | // Rule I2. | |
684 | if t == R { | |
685 | // no change | |
686 | } else { // t == L || t == AN || t == EN | |
687 | s.resolvedLevels[i] += 1 | |
688 | } | |
689 | } | |
690 | } | |
691 | } | |
692 | ||
693 | // Applies the levels and types resolved in rules W1-I2 to the | |
694 | // resultLevels array. | |
695 | func (s *isolatingRunSequence) applyLevelsAndTypes() { | |
696 | for i, x := range s.indexes { | |
697 | s.p.resultTypes[x] = s.types[i] | |
698 | s.p.resultLevels[x] = s.resolvedLevels[i] | |
699 | } | |
700 | } | |
701 | ||
702 | // Return the limit of the run consisting only of the types in validSet | |
703 | // starting at index. This checks the value at index, and will return | |
704 | // index if that value is not in validSet. | |
705 | func (s *isolatingRunSequence) findRunLimit(index int, validSet ...Class) int { | |
706 | loop: | |
707 | for ; index < len(s.types); index++ { | |
708 | t := s.types[index] | |
709 | for _, valid := range validSet { | |
710 | if t == valid { | |
711 | continue loop | |
712 | } | |
713 | } | |
714 | return index // didn't find a match in validSet | |
715 | } | |
716 | return len(s.types) | |
717 | } | |
718 | ||
719 | // Algorithm validation. Assert that all values in types are in the | |
720 | // provided set. | |
721 | func (s *isolatingRunSequence) assertOnly(codes ...Class) { | |
722 | loop: | |
723 | for i, t := range s.types { | |
724 | for _, c := range codes { | |
725 | if t == c { | |
726 | continue loop | |
727 | } | |
728 | } | |
729 | log.Panicf("invalid bidi code %v present in assertOnly at position %d", t, s.indexes[i]) | |
730 | } | |
731 | } | |
732 | ||
733 | // determineLevelRuns returns an array of level runs. Each level run is | |
734 | // described as an array of indexes into the input string. | |
735 | // | |
736 | // Determines the level runs. Rule X9 will be applied in determining the | |
737 | // runs, in the way that makes sure the characters that are supposed to be | |
738 | // removed are not included in the runs. | |
739 | func (p *paragraph) determineLevelRuns() [][]int { | |
740 | run := []int{} | |
741 | allRuns := [][]int{} | |
742 | currentLevel := implicitLevel | |
743 | ||
744 | for i := range p.initialTypes { | |
745 | if !isRemovedByX9(p.initialTypes[i]) { | |
746 | if p.resultLevels[i] != currentLevel { | |
747 | // we just encountered a new run; wrap up last run | |
748 | if currentLevel >= 0 { // only wrap it up if there was a run | |
749 | allRuns = append(allRuns, run) | |
750 | run = nil | |
751 | } | |
752 | // Start new run | |
753 | currentLevel = p.resultLevels[i] | |
754 | } | |
755 | run = append(run, i) | |
756 | } | |
757 | } | |
758 | // Wrap up the final run, if any | |
759 | if len(run) > 0 { | |
760 | allRuns = append(allRuns, run) | |
761 | } | |
762 | return allRuns | |
763 | } | |
764 | ||
765 | // Definition BD13. Determine isolating run sequences. | |
766 | func (p *paragraph) determineIsolatingRunSequences() []*isolatingRunSequence { | |
767 | levelRuns := p.determineLevelRuns() | |
768 | ||
769 | // Compute the run that each character belongs to | |
770 | runForCharacter := make([]int, p.Len()) | |
771 | for i, run := range levelRuns { | |
772 | for _, index := range run { | |
773 | runForCharacter[index] = i | |
774 | } | |
775 | } | |
776 | ||
777 | sequences := []*isolatingRunSequence{} | |
778 | ||
779 | var currentRunSequence []int | |
780 | ||
781 | for _, run := range levelRuns { | |
782 | first := run[0] | |
783 | if p.initialTypes[first] != PDI || p.matchingIsolateInitiator[first] == -1 { | |
784 | currentRunSequence = nil | |
785 | // int run = i; | |
786 | for { | |
787 | // Copy this level run into currentRunSequence | |
788 | currentRunSequence = append(currentRunSequence, run...) | |
789 | ||
790 | last := currentRunSequence[len(currentRunSequence)-1] | |
791 | lastT := p.initialTypes[last] | |
792 | if lastT.in(LRI, RLI, FSI) && p.matchingPDI[last] != p.Len() { | |
793 | run = levelRuns[runForCharacter[p.matchingPDI[last]]] | |
794 | } else { | |
795 | break | |
796 | } | |
797 | } | |
798 | sequences = append(sequences, p.isolatingRunSequence(currentRunSequence)) | |
799 | } | |
800 | } | |
801 | return sequences | |
802 | } | |
803 | ||
804 | // Assign level information to characters removed by rule X9. This is for | |
805 | // ease of relating the level information to the original input data. Note | |
806 | // that the levels assigned to these codes are arbitrary, they're chosen so | |
807 | // as to avoid breaking level runs. | |
808 | func (p *paragraph) assignLevelsToCharactersRemovedByX9() { | |
809 | for i, t := range p.initialTypes { | |
810 | if t.in(LRE, RLE, LRO, RLO, PDF, BN) { | |
811 | p.resultTypes[i] = t | |
812 | p.resultLevels[i] = -1 | |
813 | } | |
814 | } | |
815 | // now propagate forward the levels information (could have | |
816 | // propagated backward, the main thing is not to introduce a level | |
817 | // break where one doesn't already exist). | |
818 | ||
819 | if p.resultLevels[0] == -1 { | |
820 | p.resultLevels[0] = p.embeddingLevel | |
821 | } | |
822 | for i := 1; i < len(p.initialTypes); i++ { | |
823 | if p.resultLevels[i] == -1 { | |
824 | p.resultLevels[i] = p.resultLevels[i-1] | |
825 | } | |
826 | } | |
827 | // Embedding information is for informational purposes only so need not be | |
828 | // adjusted. | |
829 | } | |
830 | ||
831 | // | |
832 | // Output | |
833 | // | |
834 | ||
835 | // getLevels computes levels array breaking lines at offsets in linebreaks. | |
836 | // Rule L1. | |
837 | // | |
838 | // The linebreaks array must include at least one value. The values must be | |
839 | // in strictly increasing order (no duplicates) between 1 and the length of | |
840 | // the text, inclusive. The last value must be the length of the text. | |
841 | func (p *paragraph) getLevels(linebreaks []int) []level { | |
842 | // Note that since the previous processing has removed all | |
843 | // P, S, and WS values from resultTypes, the values referred to | |
844 | // in these rules are the initial types, before any processing | |
845 | // has been applied (including processing of overrides). | |
846 | // | |
847 | // This example implementation has reinserted explicit format codes | |
848 | // and BN, in order that the levels array correspond to the | |
849 | // initial text. Their final placement is not normative. | |
850 | // These codes are treated like WS in this implementation, | |
851 | // so they don't interrupt sequences of WS. | |
852 | ||
853 | validateLineBreaks(linebreaks, p.Len()) | |
854 | ||
855 | result := append([]level(nil), p.resultLevels...) | |
856 | ||
857 | // don't worry about linebreaks since if there is a break within | |
858 | // a series of WS values preceding S, the linebreak itself | |
859 | // causes the reset. | |
860 | for i, t := range p.initialTypes { | |
861 | if t.in(B, S) { | |
862 | // Rule L1, clauses one and two. | |
863 | result[i] = p.embeddingLevel | |
864 | ||
865 | // Rule L1, clause three. | |
866 | for j := i - 1; j >= 0; j-- { | |
867 | if isWhitespace(p.initialTypes[j]) { // including format codes | |
868 | result[j] = p.embeddingLevel | |
869 | } else { | |
870 | break | |
871 | } | |
872 | } | |
873 | } | |
874 | } | |
875 | ||
876 | // Rule L1, clause four. | |
877 | start := 0 | |
878 | for _, limit := range linebreaks { | |
879 | for j := limit - 1; j >= start; j-- { | |
880 | if isWhitespace(p.initialTypes[j]) { // including format codes | |
881 | result[j] = p.embeddingLevel | |
882 | } else { | |
883 | break | |
884 | } | |
885 | } | |
886 | start = limit | |
887 | } | |
888 | ||
889 | return result | |
890 | } | |
891 | ||
892 | // getReordering returns the reordering of lines from a visual index to a | |
893 | // logical index for line breaks at the given offsets. | |
894 | // | |
895 | // Lines are concatenated from left to right. So for example, the fifth | |
896 | // character from the left on the third line is | |
897 | // | |
898 | // getReordering(linebreaks)[linebreaks[1] + 4] | |
899 | // | |
900 | // (linebreaks[1] is the position after the last character of the second | |
901 | // line, which is also the index of the first character on the third line, | |
902 | // and adding four gets the fifth character from the left). | |
903 | // | |
904 | // The linebreaks array must include at least one value. The values must be | |
905 | // in strictly increasing order (no duplicates) between 1 and the length of | |
906 | // the text, inclusive. The last value must be the length of the text. | |
907 | func (p *paragraph) getReordering(linebreaks []int) []int { | |
908 | validateLineBreaks(linebreaks, p.Len()) | |
909 | ||
910 | return computeMultilineReordering(p.getLevels(linebreaks), linebreaks) | |
911 | } | |
912 | ||
913 | // Return multiline reordering array for a given level array. Reordering | |
914 | // does not occur across a line break. | |
915 | func computeMultilineReordering(levels []level, linebreaks []int) []int { | |
916 | result := make([]int, len(levels)) | |
917 | ||
918 | start := 0 | |
919 | for _, limit := range linebreaks { | |
920 | tempLevels := make([]level, limit-start) | |
921 | copy(tempLevels, levels[start:]) | |
922 | ||
923 | for j, order := range computeReordering(tempLevels) { | |
924 | result[start+j] = order + start | |
925 | } | |
926 | start = limit | |
927 | } | |
928 | return result | |
929 | } | |
930 | ||
931 | // Return reordering array for a given level array. This reorders a single | |
932 | // line. The reordering is a visual to logical map. For example, the | |
933 | // leftmost char is string.charAt(order[0]). Rule L2. | |
934 | func computeReordering(levels []level) []int { | |
935 | result := make([]int, len(levels)) | |
936 | // initialize order | |
937 | for i := range result { | |
938 | result[i] = i | |
939 | } | |
940 | ||
941 | // locate highest level found on line. | |
942 | // Note the rules say text, but no reordering across line bounds is | |
943 | // performed, so this is sufficient. | |
944 | highestLevel := level(0) | |
945 | lowestOddLevel := level(maxDepth + 2) | |
946 | for _, level := range levels { | |
947 | if level > highestLevel { | |
948 | highestLevel = level | |
949 | } | |
950 | if level&1 != 0 && level < lowestOddLevel { | |
951 | lowestOddLevel = level | |
952 | } | |
953 | } | |
954 | ||
955 | for level := highestLevel; level >= lowestOddLevel; level-- { | |
956 | for i := 0; i < len(levels); i++ { | |
957 | if levels[i] >= level { | |
958 | // find range of text at or above this level | |
959 | start := i | |
960 | limit := i + 1 | |
961 | for limit < len(levels) && levels[limit] >= level { | |
962 | limit++ | |
963 | } | |
964 | ||
965 | for j, k := start, limit-1; j < k; j, k = j+1, k-1 { | |
966 | result[j], result[k] = result[k], result[j] | |
967 | } | |
968 | // skip to end of level run | |
969 | i = limit | |
970 | } | |
971 | } | |
972 | } | |
973 | ||
974 | return result | |
975 | } | |
976 | ||
977 | // isWhitespace reports whether the type is considered a whitespace type for the | |
978 | // line break rules. | |
979 | func isWhitespace(c Class) bool { | |
980 | switch c { | |
981 | case LRE, RLE, LRO, RLO, PDF, LRI, RLI, FSI, PDI, BN, WS: | |
982 | return true | |
983 | } | |
984 | return false | |
985 | } | |
986 | ||
987 | // isRemovedByX9 reports whether the type is one of the types removed in X9. | |
988 | func isRemovedByX9(c Class) bool { | |
989 | switch c { | |
990 | case LRE, RLE, LRO, RLO, PDF, BN: | |
991 | return true | |
992 | } | |
993 | return false | |
994 | } | |
995 | ||
996 | // typeForLevel reports the strong type (L or R) corresponding to the level. | |
997 | func typeForLevel(level level) Class { | |
998 | if (level & 0x1) == 0 { | |
999 | return L | |
1000 | } | |
1001 | return R | |
1002 | } | |
1003 | ||
1004 | // TODO: change validation to not panic | |
1005 | ||
1006 | func validateTypes(types []Class) { | |
1007 | if len(types) == 0 { | |
1008 | log.Panic("types is null") | |
1009 | } | |
1010 | for i, t := range types[:len(types)-1] { | |
1011 | if t == B { | |
1012 | log.Panicf("B type before end of paragraph at index: %d", i) | |
1013 | } | |
1014 | } | |
1015 | } | |
1016 | ||
1017 | func validateParagraphEmbeddingLevel(embeddingLevel level) { | |
1018 | if embeddingLevel != implicitLevel && | |
1019 | embeddingLevel != 0 && | |
1020 | embeddingLevel != 1 { | |
1021 | log.Panicf("illegal paragraph embedding level: %d", embeddingLevel) | |
1022 | } | |
1023 | } | |
1024 | ||
1025 | func validateLineBreaks(linebreaks []int, textLength int) { | |
1026 | prev := 0 | |
1027 | for i, next := range linebreaks { | |
1028 | if next <= prev { | |
1029 | log.Panicf("bad linebreak: %d at index: %d", next, i) | |
1030 | } | |
1031 | prev = next | |
1032 | } | |
1033 | if prev != textLength { | |
1034 | log.Panicf("last linebreak was %d, want %d", prev, textLength) | |
1035 | } | |
1036 | } | |
1037 | ||
1038 | func validatePbTypes(pairTypes []bracketType) { | |
1039 | if len(pairTypes) == 0 { | |
1040 | log.Panic("pairTypes is null") | |
1041 | } | |
1042 | for i, pt := range pairTypes { | |
1043 | switch pt { | |
1044 | case bpNone, bpOpen, bpClose: | |
1045 | default: | |
1046 | log.Panicf("illegal pairType value at %d: %v", i, pairTypes[i]) | |
1047 | } | |
1048 | } | |
1049 | } | |
1050 | ||
1051 | func validatePbValues(pairValues []rune, pairTypes []bracketType) { | |
1052 | if pairValues == nil { | |
1053 | log.Panic("pairValues is null") | |
1054 | } | |
1055 | if len(pairTypes) != len(pairValues) { | |
1056 | log.Panic("pairTypes is different length from pairValues") | |
1057 | } | |
1058 | } |