unicode_bidi/
prepare.rs

1// Copyright 2015 The Servo Project Developers. See the
2// COPYRIGHT file at the top-level directory of this distribution.
3//
4// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
5// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
6// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
7// option. This file may not be copied, modified, or distributed
8// except according to those terms.
9
10//! 3.3.3 Preparations for Implicit Processing
11//!
12//! <http://www.unicode.org/reports/tr9/#Preparations_for_Implicit_Processing>
13
14use std::cmp::max;
15use std::ops::Range;
16
17use super::char_data::BidiClass;
18use super::level::Level;
19
20use BidiClass::*;
21
22/// A maximal substring of characters with the same embedding level.
23///
24/// Represented as a range of byte indices.
25pub type LevelRun = Range<usize>;
26
27
28/// Output of `isolating_run_sequences` (steps X9-X10)
29#[derive(Debug, PartialEq)]
30pub struct IsolatingRunSequence {
31    pub runs: Vec<LevelRun>,
32    pub sos: BidiClass, // Start-of-sequence type.
33    pub eos: BidiClass, // End-of-sequence type.
34}
35
36
37/// Compute the set of isolating run sequences.
38///
39/// An isolating run sequence is a maximal sequence of level runs such that for all level runs
40/// except the last one in the sequence, the last character of the run is an isolate initiator
41/// whose matching PDI is the first character of the next level run in the sequence.
42///
43/// Note: This function does *not* return the sequences in order by their first characters.
44#[cfg_attr(feature = "flame_it", flame)]
45pub fn isolating_run_sequences(
46    para_level: Level,
47    original_classes: &[BidiClass],
48    levels: &[Level],
49) -> Vec<IsolatingRunSequence> {
50    let runs = level_runs(levels, original_classes);
51
52    // Compute the set of isolating run sequences.
53    // <http://www.unicode.org/reports/tr9/#BD13>
54    let mut sequences = Vec::with_capacity(runs.len());
55
56    // When we encounter an isolate initiator, we push the current sequence onto the
57    // stack so we can resume it after the matching PDI.
58    let mut stack = vec![Vec::new()];
59
60    for run in runs {
61        assert!(run.len() > 0);
62        assert!(!stack.is_empty());
63
64        let start_class = original_classes[run.start];
65        let end_class = original_classes[run.end - 1];
66
67        let mut sequence = if start_class == PDI && stack.len() > 1 {
68            // Continue a previous sequence interrupted by an isolate.
69            stack.pop().unwrap()
70        } else {
71            // Start a new sequence.
72            Vec::new()
73        };
74
75        sequence.push(run);
76
77        if matches!(end_class, RLI | LRI | FSI) {
78            // Resume this sequence after the isolate.
79            stack.push(sequence);
80        } else {
81            // This sequence is finished.
82            sequences.push(sequence);
83        }
84    }
85    // Pop any remaning sequences off the stack.
86    sequences.extend(stack.into_iter().rev().filter(|seq| !seq.is_empty()));
87
88    // Determine the `sos` and `eos` class for each sequence.
89    // <http://www.unicode.org/reports/tr9/#X10>
90    sequences
91        .into_iter()
92        .map(|sequence: Vec<LevelRun>| {
93            assert!(!sequence.is_empty());
94
95            let start_of_seq = sequence[0].start;
96            let end_of_seq = sequence[sequence.len() - 1].end;
97            let seq_level = levels[start_of_seq];
98
99            #[cfg(test)]
100            for run in sequence.clone() {
101                for idx in run {
102                    if not_removed_by_x9(&original_classes[idx]) {
103                        assert_eq!(seq_level, levels[idx]);
104                    }
105                }
106            }
107
108            // Get the level of the last non-removed char before the runs.
109            let pred_level = match original_classes[..start_of_seq].iter().rposition(
110                not_removed_by_x9,
111            ) {
112                Some(idx) => levels[idx],
113                None => para_level,
114            };
115
116            // Get the level of the next non-removed char after the runs.
117            let succ_level = if matches!(original_classes[end_of_seq - 1], RLI | LRI | FSI) {
118                para_level
119            } else {
120                match original_classes[end_of_seq..].iter().position(
121                    not_removed_by_x9,
122                ) {
123                    Some(idx) => levels[end_of_seq + idx],
124                    None => para_level,
125                }
126            };
127
128            IsolatingRunSequence {
129                runs: sequence,
130                sos: max(seq_level, pred_level).bidi_class(),
131                eos: max(seq_level, succ_level).bidi_class(),
132            }
133        })
134        .collect()
135}
136
137/// Finds the level runs in a paragraph.
138///
139/// <http://www.unicode.org/reports/tr9/#BD7>
140fn level_runs(levels: &[Level], original_classes: &[BidiClass]) -> Vec<LevelRun> {
141    assert_eq!(levels.len(), original_classes.len());
142
143    let mut runs = Vec::new();
144    if levels.is_empty() {
145        return runs;
146    }
147
148    let mut current_run_level = levels[0];
149    let mut current_run_start = 0;
150    for i in 1..levels.len() {
151        if !removed_by_x9(original_classes[i]) && levels[i] != current_run_level {
152            // End the last run and start a new one.
153            runs.push(current_run_start..i);
154            current_run_level = levels[i];
155            current_run_start = i;
156        }
157    }
158    runs.push(current_run_start..levels.len());
159
160    runs
161}
162
163/// Should this character be ignored in steps after X9?
164///
165/// <http://www.unicode.org/reports/tr9/#X9>
166pub fn removed_by_x9(class: BidiClass) -> bool {
167    matches!(class, RLE | LRE | RLO | LRO | PDF | BN)
168}
169
170// For use as a predicate for `position` / `rposition`
171pub fn not_removed_by_x9(class: &BidiClass) -> bool {
172    !removed_by_x9(*class)
173}
174
175#[cfg(test)]
176mod tests {
177    use super::*;
178
179    #[test]
180    fn test_level_runs() {
181        assert_eq!(level_runs(&Level::vec(&[]), &[]), &[]);
182        assert_eq!(
183            level_runs(&Level::vec(&[0, 0, 0, 1, 1, 2, 0, 0]), &[L; 8]),
184            &[0..3, 3..5, 5..6, 6..8]
185        );
186    }
187
188    // From <http://www.unicode.org/reports/tr9/#BD13>
189    #[cfg_attr(rustfmt, rustfmt_skip)]
190    #[test]
191    fn test_isolating_run_sequences() {
192
193        // == Example 1 ==
194        // text1·RLE·text2·PDF·RLE·text3·PDF·text4
195        // index        0    1  2    3    4  5    6  7
196        let classes = &[L, RLE, L, PDF, RLE, L, PDF, L];
197        let levels =  &[0,   1, 1,   1,   1, 1,   1, 0];
198        let para_level = Level::ltr();
199        let mut sequences = isolating_run_sequences(para_level, classes, &Level::vec(levels));
200        sequences.sort_by(|a, b| a.runs[0].clone().cmp(b.runs[0].clone()));
201        assert_eq!(
202            sequences.iter().map(|s| s.runs.clone()).collect::<Vec<_>>(),
203            vec![vec![0..2], vec![2..7], vec![7..8]]
204        );
205
206        // == Example 2 ==
207        // text1·RLI·text2·PDI·RLI·text3·PDI·text4
208        // index        0    1  2    3    4  5    6  7
209        let classes = &[L, RLI, L, PDI, RLI, L, PDI, L];
210        let levels =  &[0,   0, 1,   0,   0, 1,   0, 0];
211        let para_level = Level::ltr();
212        let mut sequences = isolating_run_sequences(para_level, classes, &Level::vec(levels));
213        sequences.sort_by(|a, b| a.runs[0].clone().cmp(b.runs[0].clone()));
214        assert_eq!(
215            sequences.iter().map(|s| s.runs.clone()).collect::<Vec<_>>(),
216            vec![vec![0..2, 3..5, 6..8], vec![2..3], vec![5..6]]
217        );
218
219        // == Example 3 ==
220        // text1·RLI·text2·LRI·text3·RLE·text4·PDF·text5·PDI·text6·PDI·text7
221        // index        0    1  2    3  4    5  6    7  8    9  10  11  12
222        let classes = &[L, RLI, L, LRI, L, RLE, L, PDF, L, PDI, L, PDI,  L];
223        let levels =  &[0,   0, 1,   1, 2,   3, 3,   3, 2,   1, 1,   0,  0];
224        let para_level = Level::ltr();
225        let mut sequences = isolating_run_sequences(para_level, classes, &Level::vec(levels));
226        sequences.sort_by(|a, b| a.runs[0].clone().cmp(b.runs[0].clone()));
227        assert_eq!(
228            sequences.iter().map(|s| s.runs.clone()).collect::<Vec<_>>(),
229            vec![vec![0..2, 11..13], vec![2..4, 9..11], vec![4..6], vec![6..8], vec![8..9]]
230        );
231    }
232
233    // From <http://www.unicode.org/reports/tr9/#X10>
234    #[cfg_attr(rustfmt, rustfmt_skip)]
235    #[test]
236    fn test_isolating_run_sequences_sos_and_eos() {
237
238        // == Example 1 ==
239        // text1·RLE·text2·LRE·text3·PDF·text4·PDF·RLE·text5·PDF·text6
240        // index        0    1  2    3  4    5  6    7    8  9   10  11
241        let classes = &[L, RLE, L, LRE, L, PDF, L, PDF, RLE, L, PDF,  L];
242        let levels =  &[0,   1, 1,   2, 2,   2, 1,   1,   1, 1,   1,  0];
243        let para_level = Level::ltr();
244        let mut sequences = isolating_run_sequences(para_level, classes, &Level::vec(levels));
245        sequences.sort_by(|a, b| a.runs[0].clone().cmp(b.runs[0].clone()));
246
247        // text1
248        assert_eq!(
249            &sequences[0],
250            &IsolatingRunSequence {
251                runs: vec![0..2],
252                sos: L,
253                eos: R,
254            }
255        );
256
257        // text2
258        assert_eq!(
259            &sequences[1],
260            &IsolatingRunSequence {
261                runs: vec![2..4],
262                sos: R,
263                eos: L,
264            }
265        );
266
267        // text3
268        assert_eq!(
269            &sequences[2],
270            &IsolatingRunSequence {
271                runs: vec![4..6],
272                sos: L,
273                eos: L,
274            }
275        );
276
277        // text4 text5
278        assert_eq!(
279            &sequences[3],
280            &IsolatingRunSequence {
281                runs: vec![6..11],
282                sos: L,
283                eos: R,
284            }
285        );
286
287        // text6
288        assert_eq!(
289            &sequences[4],
290            &IsolatingRunSequence {
291                runs: vec![11..12],
292                sos: R,
293                eos: L,
294            }
295        );
296
297        // == Example 2 ==
298        // text1·RLI·text2·LRI·text3·PDI·text4·PDI·RLI·text5·PDI·text6
299        // index        0    1  2    3  4    5  6    7    8  9   10  11
300        let classes = &[L, RLI, L, LRI, L, PDI, L, PDI, RLI, L, PDI,  L];
301        let levels =  &[0,   0, 1,   1, 2,   1, 1,   0,   0, 1,   0,  0];
302        let para_level = Level::ltr();
303        let mut sequences = isolating_run_sequences(para_level, classes, &Level::vec(levels));
304        sequences.sort_by(|a, b| a.runs[0].clone().cmp(b.runs[0].clone()));
305
306        // text1·RLI·PDI·RLI·PDI·text6
307        assert_eq!(
308            &sequences[0],
309            &IsolatingRunSequence {
310                runs: vec![0..2, 7..9, 10..12],
311                sos: L,
312                eos: L,
313            }
314        );
315
316        // text2·LRI·PDI·text4
317        assert_eq!(
318            &sequences[1],
319            &IsolatingRunSequence {
320                runs: vec![2..4, 5..7],
321                sos: R,
322                eos: R,
323            }
324        );
325
326        // text3
327        assert_eq!(
328            &sequences[2],
329            &IsolatingRunSequence {
330                runs: vec![4..5],
331                sos: L,
332                eos: L,
333            }
334        );
335
336        // text5
337        assert_eq!(
338            &sequences[3],
339            &IsolatingRunSequence {
340                runs: vec![9..10],
341                sos: R,
342                eos: R,
343            }
344        );
345    }
346
347    #[test]
348    fn test_removed_by_x9() {
349        let rem_classes = &[RLE, LRE, RLO, LRO, PDF, BN];
350        let not_classes = &[L, RLI, AL, LRI, PDI];
351        for x in rem_classes {
352            assert_eq!(removed_by_x9(*x), true);
353        }
354        for x in not_classes {
355            assert_eq!(removed_by_x9(*x), false);
356        }
357    }
358
359    #[test]
360    fn test_not_removed_by_x9() {
361        let non_x9_classes = &[L, R, AL, EN, ES, ET, AN, CS, NSM, B, S, WS, ON, LRI, RLI, FSI, PDI];
362        for x in non_x9_classes {
363            assert_eq!(not_removed_by_x9(&x), true);
364        }
365    }
366}