unicode_segmentation/
grapheme.rs

1// Copyright 2012-2014 The Rust Project Developers. See the COPYRIGHT
2// file at the top-level directory of this distribution and at
3// http://rust-lang.org/COPYRIGHT.
4//
5// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
6// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
7// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
8// option. This file may not be copied, modified, or distributed
9// except according to those terms.
10
11use core::cmp;
12
13use crate::tables::grapheme::GraphemeCat;
14
15/// External iterator for grapheme clusters and byte offsets.
16///
17/// This struct is created by the [`grapheme_indices`] method on the [`UnicodeSegmentation`]
18/// trait. See its documentation for more.
19///
20/// [`grapheme_indices`]: trait.UnicodeSegmentation.html#tymethod.grapheme_indices
21/// [`UnicodeSegmentation`]: trait.UnicodeSegmentation.html
22#[derive(Clone)]
23pub struct GraphemeIndices<'a> {
24    start_offset: usize,
25    iter: Graphemes<'a>,
26}
27
28impl<'a> GraphemeIndices<'a> {
29    #[inline]
30    /// View the underlying data (the part yet to be iterated) as a slice of the original string.
31    ///
32    /// ```rust
33    /// # use unicode_segmentation::UnicodeSegmentation;
34    /// let mut iter = "abc".grapheme_indices(true);
35    /// assert_eq!(iter.as_str(), "abc");
36    /// iter.next();
37    /// assert_eq!(iter.as_str(), "bc");
38    /// iter.next();
39    /// iter.next();
40    /// assert_eq!(iter.as_str(), "");
41    /// ```
42    pub fn as_str(&self) -> &'a str {
43        self.iter.as_str()
44    }
45}
46
47impl<'a> Iterator for GraphemeIndices<'a> {
48    type Item = (usize, &'a str);
49
50    #[inline]
51    fn next(&mut self) -> Option<(usize, &'a str)> {
52        self.iter
53            .next()
54            .map(|s| (s.as_ptr() as usize - self.start_offset, s))
55    }
56
57    #[inline]
58    fn size_hint(&self) -> (usize, Option<usize>) {
59        self.iter.size_hint()
60    }
61}
62
63impl<'a> DoubleEndedIterator for GraphemeIndices<'a> {
64    #[inline]
65    fn next_back(&mut self) -> Option<(usize, &'a str)> {
66        self.iter
67            .next_back()
68            .map(|s| (s.as_ptr() as usize - self.start_offset, s))
69    }
70}
71
72/// External iterator for a string's
73/// [grapheme clusters](http://www.unicode.org/reports/tr29/#Grapheme_Cluster_Boundaries).
74///
75/// This struct is created by the [`graphemes`] method on the [`UnicodeSegmentation`] trait. See its
76/// documentation for more.
77///
78/// [`graphemes`]: trait.UnicodeSegmentation.html#tymethod.graphemes
79/// [`UnicodeSegmentation`]: trait.UnicodeSegmentation.html
80#[derive(Clone, Debug)]
81pub struct Graphemes<'a> {
82    string: &'a str,
83    cursor: GraphemeCursor,
84    cursor_back: GraphemeCursor,
85}
86
87impl<'a> Graphemes<'a> {
88    #[inline]
89    /// View the underlying data (the part yet to be iterated) as a slice of the original string.
90    ///
91    /// ```rust
92    /// # use unicode_segmentation::UnicodeSegmentation;
93    /// let mut iter = "abc".graphemes(true);
94    /// assert_eq!(iter.as_str(), "abc");
95    /// iter.next();
96    /// assert_eq!(iter.as_str(), "bc");
97    /// iter.next();
98    /// iter.next();
99    /// assert_eq!(iter.as_str(), "");
100    /// ```
101    pub fn as_str(&self) -> &'a str {
102        &self.string[self.cursor.cur_cursor()..self.cursor_back.cur_cursor()]
103    }
104}
105
106impl<'a> Iterator for Graphemes<'a> {
107    type Item = &'a str;
108
109    #[inline]
110    fn size_hint(&self) -> (usize, Option<usize>) {
111        let slen = self.cursor_back.cur_cursor() - self.cursor.cur_cursor();
112        (cmp::min(slen, 1), Some(slen))
113    }
114
115    #[inline]
116    fn next(&mut self) -> Option<&'a str> {
117        let start = self.cursor.cur_cursor();
118        if start == self.cursor_back.cur_cursor() {
119            return None;
120        }
121        let next = self.cursor.next_boundary(self.string, 0).unwrap().unwrap();
122        Some(&self.string[start..next])
123    }
124}
125
126impl<'a> DoubleEndedIterator for Graphemes<'a> {
127    #[inline]
128    fn next_back(&mut self) -> Option<&'a str> {
129        let end = self.cursor_back.cur_cursor();
130        if end == self.cursor.cur_cursor() {
131            return None;
132        }
133        let prev = self
134            .cursor_back
135            .prev_boundary(self.string, 0)
136            .unwrap()
137            .unwrap();
138        Some(&self.string[prev..end])
139    }
140}
141
142#[inline]
143pub fn new_graphemes<'b>(s: &'b str, is_extended: bool) -> Graphemes<'b> {
144    let len = s.len();
145    Graphemes {
146        string: s,
147        cursor: GraphemeCursor::new(0, len, is_extended),
148        cursor_back: GraphemeCursor::new(len, len, is_extended),
149    }
150}
151
152#[inline]
153pub fn new_grapheme_indices<'b>(s: &'b str, is_extended: bool) -> GraphemeIndices<'b> {
154    GraphemeIndices {
155        start_offset: s.as_ptr() as usize,
156        iter: new_graphemes(s, is_extended),
157    }
158}
159
// maybe unify with PairResult?
// What is currently known about the potential boundary at the cursor.
#[derive(Debug, Clone, PartialEq, Eq)]
enum GraphemeState {
    // Nothing has been determined yet.
    Unknown,
    // Determined: there is no boundary here.
    NotBreak,
    // Determined: there is a boundary here.
    Break,
    // The following codepoint is a Regional Indicator Symbol, so this is a
    // boundary iff an even number of RIS codepoints precede it. (GB12, GB13)
    Regional,
    // The following codepoint is Extended_Pictographic, so the answer depends
    // on the preceding context as described by GB11.
    Emoji,
}
177
178/// Cursor-based segmenter for grapheme clusters.
179///
180/// This allows working with ropes and other datastructures where the string is not contiguous or
181/// fully known at initialization time.
182#[derive(Clone, Debug)]
183pub struct GraphemeCursor {
184    // Current cursor position.
185    offset: usize,
186    // Total length of the string.
187    len: usize,
188    // A config flag indicating whether this cursor computes legacy or extended
189    // grapheme cluster boundaries (enables GB9a and GB9b if set).
190    is_extended: bool,
191    // Information about the potential boundary at `offset`
192    state: GraphemeState,
193    // Category of codepoint immediately preceding cursor, if known.
194    cat_before: Option<GraphemeCat>,
195    // Category of codepoint immediately after cursor, if known.
196    cat_after: Option<GraphemeCat>,
197    // If set, at least one more codepoint immediately preceding this offset
198    // is needed to resolve whether there's a boundary at `offset`.
199    pre_context_offset: Option<usize>,
200    // The number of RIS codepoints preceding `offset`. If `pre_context_offset`
201    // is set, then counts the number of RIS between that and `offset`, otherwise
202    // is an accurate count relative to the string.
203    ris_count: Option<usize>,
204    // Set if a call to `prev_boundary` or `next_boundary` was suspended due
205    // to needing more input.
206    resuming: bool,
207    // Cached grapheme category and associated scalar value range.
208    grapheme_cat_cache: (u32, u32, GraphemeCat),
209}
210
/// An error return indicating that the provided chunk did not contain enough
/// content to satisfy the query, and that more content must be provided.
#[derive(Debug, PartialEq, Eq)]
pub enum GraphemeIncomplete {
    /// More pre-context is needed. The caller should call `provide_context`
    /// with a chunk ending at the offset given, then retry the query. This
    /// will only be returned if the `chunk_start` parameter is nonzero.
    PreContext(usize),

    /// When requesting `prev_boundary`, the cursor is moving past the beginning
    /// of the current chunk, so the chunk before that is requested. This will
    /// only be returned if the `chunk_start` parameter is nonzero.
    PrevChunk,

    /// When requesting `next_boundary`, the cursor is moving past the end of the
    /// current chunk, so the chunk following the one given is requested. This
    /// will only be returned if the chunk ends before the `len` parameter
    /// provided on creation of the cursor.
    NextChunk,

    /// An error returned when the chunk given does not contain the cursor position.
    InvalidOffset,
}
234
// Outcome of looking up a (before, after) pair of categories.
#[derive(Eq, PartialEq)]
enum PairResult {
    // Never a break.
    NotBreak,
    // Always a break.
    Break,
    // A break iff the cursor is not in extended mode.
    Extended,
    // A break if preceded by an even number of RIS.
    Regional,
    // A break if preceded by emoji base and (Extend)*.
    Emoji,
}
244
245#[inline]
246fn check_pair(before: GraphemeCat, after: GraphemeCat) -> PairResult {
247    use self::PairResult::*;
248    use crate::tables::grapheme::GraphemeCat::*;
249    match (before, after) {
250        (GC_CR, GC_LF) => NotBreak,                                 // GB3
251        (GC_Control, _) => Break,                                   // GB4
252        (GC_CR, _) => Break,                                        // GB4
253        (GC_LF, _) => Break,                                        // GB4
254        (_, GC_Control) => Break,                                   // GB5
255        (_, GC_CR) => Break,                                        // GB5
256        (_, GC_LF) => Break,                                        // GB5
257        (GC_L, GC_L) => NotBreak,                                   // GB6
258        (GC_L, GC_V) => NotBreak,                                   // GB6
259        (GC_L, GC_LV) => NotBreak,                                  // GB6
260        (GC_L, GC_LVT) => NotBreak,                                 // GB6
261        (GC_LV, GC_V) => NotBreak,                                  // GB7
262        (GC_LV, GC_T) => NotBreak,                                  // GB7
263        (GC_V, GC_V) => NotBreak,                                   // GB7
264        (GC_V, GC_T) => NotBreak,                                   // GB7
265        (GC_LVT, GC_T) => NotBreak,                                 // GB8
266        (GC_T, GC_T) => NotBreak,                                   // GB8
267        (_, GC_Extend) => NotBreak,                                 // GB9
268        (_, GC_ZWJ) => NotBreak,                                    // GB9
269        (_, GC_SpacingMark) => Extended,                            // GB9a
270        (GC_Prepend, _) => Extended,                                // GB9b
271        (GC_ZWJ, GC_Extended_Pictographic) => Emoji,                // GB11
272        (GC_Regional_Indicator, GC_Regional_Indicator) => Regional, // GB12, GB13
273        (_, _) => Break,                                            // GB999
274    }
275}
276
277impl GraphemeCursor {
278    /// Create a new cursor. The string and initial offset are given at creation
279    /// time, but the contents of the string are not. The `is_extended` parameter
280    /// controls whether extended grapheme clusters are selected.
281    ///
282    /// The `offset` parameter must be on a codepoint boundary.
283    ///
284    /// ```rust
285    /// # use unicode_segmentation::GraphemeCursor;
286    /// let s = "हिन्दी";
287    /// let mut legacy = GraphemeCursor::new(0, s.len(), false);
288    /// assert_eq!(legacy.next_boundary(s, 0), Ok(Some("ह".len())));
289    /// let mut extended = GraphemeCursor::new(0, s.len(), true);
290    /// assert_eq!(extended.next_boundary(s, 0), Ok(Some("हि".len())));
291    /// ```
292    pub fn new(offset: usize, len: usize, is_extended: bool) -> GraphemeCursor {
293        let state = if offset == 0 || offset == len {
294            GraphemeState::Break
295        } else {
296            GraphemeState::Unknown
297        };
298        GraphemeCursor {
299            offset: offset,
300            len: len,
301            state: state,
302            is_extended: is_extended,
303            cat_before: None,
304            cat_after: None,
305            pre_context_offset: None,
306            ris_count: None,
307            resuming: false,
308            grapheme_cat_cache: (0, 0, GraphemeCat::GC_Control),
309        }
310    }
311
312    fn grapheme_category(&mut self, ch: char) -> GraphemeCat {
313        use crate::tables::grapheme as gr;
314        use crate::tables::grapheme::GraphemeCat::*;
315
316        if ch <= '\u{7e}' {
317            // Special-case optimization for ascii, except U+007F.  This
318            // improves performance even for many primarily non-ascii texts,
319            // due to use of punctuation and white space characters from the
320            // ascii range.
321            if ch >= '\u{20}' {
322                GC_Any
323            } else if ch == '\n' {
324                GC_LF
325            } else if ch == '\r' {
326                GC_CR
327            } else {
328                GC_Control
329            }
330        } else {
331            // If this char isn't within the cached range, update the cache to the
332            // range that includes it.
333            if (ch as u32) < self.grapheme_cat_cache.0 || (ch as u32) > self.grapheme_cat_cache.1 {
334                self.grapheme_cat_cache = gr::grapheme_category(ch);
335            }
336            self.grapheme_cat_cache.2
337        }
338    }
339
340    // Not sure I'm gonna keep this, the advantage over new() seems thin.
341
342    /// Set the cursor to a new location in the same string.
343    ///
344    /// ```rust
345    /// # use unicode_segmentation::GraphemeCursor;
346    /// let s = "abcd";
347    /// let mut cursor = GraphemeCursor::new(0, s.len(), false);
348    /// assert_eq!(cursor.cur_cursor(), 0);
349    /// cursor.set_cursor(2);
350    /// assert_eq!(cursor.cur_cursor(), 2);
351    /// ```
352    pub fn set_cursor(&mut self, offset: usize) {
353        if offset != self.offset {
354            self.offset = offset;
355            self.state = if offset == 0 || offset == self.len {
356                GraphemeState::Break
357            } else {
358                GraphemeState::Unknown
359            };
360            // reset state derived from text around cursor
361            self.cat_before = None;
362            self.cat_after = None;
363            self.ris_count = None;
364        }
365    }
366
367    #[inline]
368    /// The current offset of the cursor. Equal to the last value provided to
369    /// `new()` or `set_cursor()`, or returned from `next_boundary()` or
370    /// `prev_boundary()`.
371    ///
372    /// ```rust
373    /// # use unicode_segmentation::GraphemeCursor;
374    /// // Two flags (🇷🇸🇮🇴), each flag is two RIS codepoints, each RIS is 4 bytes.
375    /// let flags = "\u{1F1F7}\u{1F1F8}\u{1F1EE}\u{1F1F4}";
376    /// let mut cursor = GraphemeCursor::new(4, flags.len(), false);
377    /// assert_eq!(cursor.cur_cursor(), 4);
378    /// assert_eq!(cursor.next_boundary(flags, 0), Ok(Some(8)));
379    /// assert_eq!(cursor.cur_cursor(), 8);
380    /// ```
381    pub fn cur_cursor(&self) -> usize {
382        self.offset
383    }
384
385    /// Provide additional pre-context when it is needed to decide a boundary.
386    /// The end of the chunk must coincide with the value given in the
387    /// `GraphemeIncomplete::PreContext` request.
388    ///
389    /// ```rust
390    /// # use unicode_segmentation::{GraphemeCursor, GraphemeIncomplete};
391    /// let flags = "\u{1F1F7}\u{1F1F8}\u{1F1EE}\u{1F1F4}";
392    /// let mut cursor = GraphemeCursor::new(8, flags.len(), false);
393    /// // Not enough pre-context to decide if there's a boundary between the two flags.
394    /// assert_eq!(cursor.is_boundary(&flags[8..], 8), Err(GraphemeIncomplete::PreContext(8)));
395    /// // Provide one more Regional Indicator Symbol of pre-context
396    /// cursor.provide_context(&flags[4..8], 4);
397    /// // Still not enough context to decide.
398    /// assert_eq!(cursor.is_boundary(&flags[8..], 8), Err(GraphemeIncomplete::PreContext(4)));
399    /// // Provide additional requested context.
400    /// cursor.provide_context(&flags[0..4], 0);
401    /// // That's enough to decide (it always is when context goes to the start of the string)
402    /// assert_eq!(cursor.is_boundary(&flags[8..], 8), Ok(true));
403    /// ```
404    pub fn provide_context(&mut self, chunk: &str, chunk_start: usize) {
405        use crate::tables::grapheme as gr;
406        assert!(chunk_start + chunk.len() == self.pre_context_offset.unwrap());
407        self.pre_context_offset = None;
408        if self.is_extended && chunk_start + chunk.len() == self.offset {
409            let ch = chunk.chars().rev().next().unwrap();
410            if self.grapheme_category(ch) == gr::GC_Prepend {
411                self.decide(false); // GB9b
412                return;
413            }
414        }
415        match self.state {
416            GraphemeState::Regional => self.handle_regional(chunk, chunk_start),
417            GraphemeState::Emoji => self.handle_emoji(chunk, chunk_start),
418            _ => {
419                if self.cat_before.is_none() && self.offset == chunk.len() + chunk_start {
420                    let ch = chunk.chars().rev().next().unwrap();
421                    self.cat_before = Some(self.grapheme_category(ch));
422                }
423            }
424        }
425    }
426
427    #[inline]
428    fn decide(&mut self, is_break: bool) {
429        self.state = if is_break {
430            GraphemeState::Break
431        } else {
432            GraphemeState::NotBreak
433        };
434    }
435
436    #[inline]
437    fn decision(&mut self, is_break: bool) -> Result<bool, GraphemeIncomplete> {
438        self.decide(is_break);
439        Ok(is_break)
440    }
441
442    #[inline]
443    fn is_boundary_result(&self) -> Result<bool, GraphemeIncomplete> {
444        if self.state == GraphemeState::Break {
445            Ok(true)
446        } else if self.state == GraphemeState::NotBreak {
447            Ok(false)
448        } else if let Some(pre_context_offset) = self.pre_context_offset {
449            Err(GraphemeIncomplete::PreContext(pre_context_offset))
450        } else {
451            unreachable!("inconsistent state");
452        }
453    }
454
455    #[inline]
456    fn handle_regional(&mut self, chunk: &str, chunk_start: usize) {
457        use crate::tables::grapheme as gr;
458        let mut ris_count = self.ris_count.unwrap_or(0);
459        for ch in chunk.chars().rev() {
460            if self.grapheme_category(ch) != gr::GC_Regional_Indicator {
461                self.ris_count = Some(ris_count);
462                self.decide((ris_count % 2) == 0);
463                return;
464            }
465            ris_count += 1;
466        }
467        self.ris_count = Some(ris_count);
468        if chunk_start == 0 {
469            self.decide((ris_count % 2) == 0);
470            return;
471        }
472        self.pre_context_offset = Some(chunk_start);
473        self.state = GraphemeState::Regional;
474    }
475
476    #[inline]
477    fn handle_emoji(&mut self, chunk: &str, chunk_start: usize) {
478        use crate::tables::grapheme as gr;
479        let mut iter = chunk.chars().rev();
480        if let Some(ch) = iter.next() {
481            if self.grapheme_category(ch) != gr::GC_ZWJ {
482                self.decide(true);
483                return;
484            }
485        }
486        for ch in iter {
487            match self.grapheme_category(ch) {
488                gr::GC_Extend => (),
489                gr::GC_Extended_Pictographic => {
490                    self.decide(false);
491                    return;
492                }
493                _ => {
494                    self.decide(true);
495                    return;
496                }
497            }
498        }
499        if chunk_start == 0 {
500            self.decide(true);
501            return;
502        }
503        self.pre_context_offset = Some(chunk_start);
504        self.state = GraphemeState::Emoji;
505    }
506
507    #[inline]
508    /// Determine whether the current cursor location is a grapheme cluster boundary.
509    /// Only a part of the string need be supplied. If `chunk_start` is nonzero or
510    /// the length of `chunk` is not equal to `len` on creation, then this method
511    /// may return `GraphemeIncomplete::PreContext`. The caller should then
512    /// call `provide_context` with the requested chunk, then retry calling this
513    /// method.
514    ///
515    /// For partial chunks, if the cursor is not at the beginning or end of the
516    /// string, the chunk should contain at least the codepoint following the cursor.
517    /// If the string is nonempty, the chunk must be nonempty.
518    ///
519    /// All calls should have consistent chunk contents (ie, if a chunk provides
520    /// content for a given slice, all further chunks covering that slice must have
521    /// the same content for it).
522    ///
523    /// ```rust
524    /// # use unicode_segmentation::GraphemeCursor;
525    /// let flags = "\u{1F1F7}\u{1F1F8}\u{1F1EE}\u{1F1F4}";
526    /// let mut cursor = GraphemeCursor::new(8, flags.len(), false);
527    /// assert_eq!(cursor.is_boundary(flags, 0), Ok(true));
528    /// cursor.set_cursor(12);
529    /// assert_eq!(cursor.is_boundary(flags, 0), Ok(false));
530    /// ```
531    pub fn is_boundary(
532        &mut self,
533        chunk: &str,
534        chunk_start: usize,
535    ) -> Result<bool, GraphemeIncomplete> {
536        use crate::tables::grapheme as gr;
537        if self.state == GraphemeState::Break {
538            return Ok(true);
539        }
540        if self.state == GraphemeState::NotBreak {
541            return Ok(false);
542        }
543        if self.offset < chunk_start || self.offset >= chunk_start + chunk.len() {
544            if self.offset > chunk_start + chunk.len() || self.cat_after.is_none() {
545                return Err(GraphemeIncomplete::InvalidOffset);
546            }
547        }
548        if let Some(pre_context_offset) = self.pre_context_offset {
549            return Err(GraphemeIncomplete::PreContext(pre_context_offset));
550        }
551        let offset_in_chunk = self.offset - chunk_start;
552        if self.cat_after.is_none() {
553            let ch = chunk[offset_in_chunk..].chars().next().unwrap();
554            self.cat_after = Some(self.grapheme_category(ch));
555        }
556        if self.offset == chunk_start {
557            let mut need_pre_context = true;
558            match self.cat_after.unwrap() {
559                gr::GC_Regional_Indicator => self.state = GraphemeState::Regional,
560                gr::GC_Extended_Pictographic => self.state = GraphemeState::Emoji,
561                _ => need_pre_context = self.cat_before.is_none(),
562            }
563            if need_pre_context {
564                self.pre_context_offset = Some(chunk_start);
565                return Err(GraphemeIncomplete::PreContext(chunk_start));
566            }
567        }
568        if self.cat_before.is_none() {
569            let ch = chunk[..offset_in_chunk].chars().rev().next().unwrap();
570            self.cat_before = Some(self.grapheme_category(ch));
571        }
572        match check_pair(self.cat_before.unwrap(), self.cat_after.unwrap()) {
573            PairResult::NotBreak => return self.decision(false),
574            PairResult::Break => return self.decision(true),
575            PairResult::Extended => {
576                let is_extended = self.is_extended;
577                return self.decision(!is_extended);
578            }
579            PairResult::Regional => {
580                if let Some(ris_count) = self.ris_count {
581                    return self.decision((ris_count % 2) == 0);
582                }
583                self.handle_regional(&chunk[..offset_in_chunk], chunk_start);
584                self.is_boundary_result()
585            }
586            PairResult::Emoji => {
587                self.handle_emoji(&chunk[..offset_in_chunk], chunk_start);
588                self.is_boundary_result()
589            }
590        }
591    }
592
593    #[inline]
594    /// Find the next boundary after the current cursor position. Only a part of
595    /// the string need be supplied. If the chunk is incomplete, then this
596    /// method might return `GraphemeIncomplete::PreContext` or
597    /// `GraphemeIncomplete::NextChunk`. In the former case, the caller should
598    /// call `provide_context` with the requested chunk, then retry. In the
599    /// latter case, the caller should provide the chunk following the one
600    /// given, then retry.
601    ///
602    /// See `is_boundary` for expectations on the provided chunk.
603    ///
604    /// ```rust
605    /// # use unicode_segmentation::GraphemeCursor;
606    /// let flags = "\u{1F1F7}\u{1F1F8}\u{1F1EE}\u{1F1F4}";
607    /// let mut cursor = GraphemeCursor::new(4, flags.len(), false);
608    /// assert_eq!(cursor.next_boundary(flags, 0), Ok(Some(8)));
609    /// assert_eq!(cursor.next_boundary(flags, 0), Ok(Some(16)));
610    /// assert_eq!(cursor.next_boundary(flags, 0), Ok(None));
611    /// ```
612    ///
613    /// And an example that uses partial strings:
614    ///
615    /// ```rust
616    /// # use unicode_segmentation::{GraphemeCursor, GraphemeIncomplete};
617    /// let s = "abcd";
618    /// let mut cursor = GraphemeCursor::new(0, s.len(), false);
619    /// assert_eq!(cursor.next_boundary(&s[..2], 0), Ok(Some(1)));
620    /// assert_eq!(cursor.next_boundary(&s[..2], 0), Err(GraphemeIncomplete::NextChunk));
621    /// assert_eq!(cursor.next_boundary(&s[2..4], 2), Ok(Some(2)));
622    /// assert_eq!(cursor.next_boundary(&s[2..4], 2), Ok(Some(3)));
623    /// assert_eq!(cursor.next_boundary(&s[2..4], 2), Ok(Some(4)));
624    /// assert_eq!(cursor.next_boundary(&s[2..4], 2), Ok(None));
625    /// ```
626    pub fn next_boundary(
627        &mut self,
628        chunk: &str,
629        chunk_start: usize,
630    ) -> Result<Option<usize>, GraphemeIncomplete> {
631        if self.offset == self.len {
632            return Ok(None);
633        }
634        let mut iter = chunk[self.offset - chunk_start..].chars();
635        let mut ch = iter.next().unwrap();
636        loop {
637            if self.resuming {
638                if self.cat_after.is_none() {
639                    self.cat_after = Some(self.grapheme_category(ch));
640                }
641            } else {
642                self.offset += ch.len_utf8();
643                self.state = GraphemeState::Unknown;
644                self.cat_before = self.cat_after.take();
645                if self.cat_before.is_none() {
646                    self.cat_before = Some(self.grapheme_category(ch));
647                }
648                if self.cat_before.unwrap() == GraphemeCat::GC_Regional_Indicator {
649                    self.ris_count = self.ris_count.map(|c| c + 1);
650                } else {
651                    self.ris_count = Some(0);
652                }
653                if let Some(next_ch) = iter.next() {
654                    ch = next_ch;
655                    self.cat_after = Some(self.grapheme_category(ch));
656                } else if self.offset == self.len {
657                    self.decide(true);
658                } else {
659                    self.resuming = true;
660                    return Err(GraphemeIncomplete::NextChunk);
661                }
662            }
663            self.resuming = true;
664            if self.is_boundary(chunk, chunk_start)? {
665                self.resuming = false;
666                return Ok(Some(self.offset));
667            }
668            self.resuming = false;
669        }
670    }
671
672    /// Find the previous boundary after the current cursor position. Only a part
673    /// of the string need be supplied. If the chunk is incomplete, then this
674    /// method might return `GraphemeIncomplete::PreContext` or
675    /// `GraphemeIncomplete::PrevChunk`. In the former case, the caller should
676    /// call `provide_context` with the requested chunk, then retry. In the
677    /// latter case, the caller should provide the chunk preceding the one
678    /// given, then retry.
679    ///
680    /// See `is_boundary` for expectations on the provided chunk.
681    ///
682    /// ```rust
683    /// # use unicode_segmentation::GraphemeCursor;
684    /// let flags = "\u{1F1F7}\u{1F1F8}\u{1F1EE}\u{1F1F4}";
685    /// let mut cursor = GraphemeCursor::new(12, flags.len(), false);
686    /// assert_eq!(cursor.prev_boundary(flags, 0), Ok(Some(8)));
687    /// assert_eq!(cursor.prev_boundary(flags, 0), Ok(Some(0)));
688    /// assert_eq!(cursor.prev_boundary(flags, 0), Ok(None));
689    /// ```
690    ///
691    /// And an example that uses partial strings (note the exact return is not
692    /// guaranteed, and may be `PrevChunk` or `PreContext` arbitrarily):
693    ///
694    /// ```rust
695    /// # use unicode_segmentation::{GraphemeCursor, GraphemeIncomplete};
696    /// let s = "abcd";
697    /// let mut cursor = GraphemeCursor::new(4, s.len(), false);
698    /// assert_eq!(cursor.prev_boundary(&s[2..4], 2), Ok(Some(3)));
699    /// assert_eq!(cursor.prev_boundary(&s[2..4], 2), Err(GraphemeIncomplete::PrevChunk));
700    /// assert_eq!(cursor.prev_boundary(&s[0..2], 0), Ok(Some(2)));
701    /// assert_eq!(cursor.prev_boundary(&s[0..2], 0), Ok(Some(1)));
702    /// assert_eq!(cursor.prev_boundary(&s[0..2], 0), Ok(Some(0)));
703    /// assert_eq!(cursor.prev_boundary(&s[0..2], 0), Ok(None));
704    /// ```
705    pub fn prev_boundary(
706        &mut self,
707        chunk: &str,
708        chunk_start: usize,
709    ) -> Result<Option<usize>, GraphemeIncomplete> {
710        if self.offset == 0 {
711            return Ok(None);
712        }
713        if self.offset == chunk_start {
714            return Err(GraphemeIncomplete::PrevChunk);
715        }
716        let mut iter = chunk[..self.offset - chunk_start].chars().rev();
717        let mut ch = iter.next().unwrap();
718        loop {
719            if self.offset == chunk_start {
720                self.resuming = true;
721                return Err(GraphemeIncomplete::PrevChunk);
722            }
723            if self.resuming {
724                self.cat_before = Some(self.grapheme_category(ch));
725            } else {
726                self.offset -= ch.len_utf8();
727                self.cat_after = self.cat_before.take();
728                self.state = GraphemeState::Unknown;
729                if let Some(ris_count) = self.ris_count {
730                    self.ris_count = if ris_count > 0 {
731                        Some(ris_count - 1)
732                    } else {
733                        None
734                    };
735                }
736                if let Some(prev_ch) = iter.next() {
737                    ch = prev_ch;
738                    self.cat_before = Some(self.grapheme_category(ch));
739                } else if self.offset == 0 {
740                    self.decide(true);
741                } else {
742                    self.resuming = true;
743                    self.cat_after = Some(self.grapheme_category(ch));
744                    return Err(GraphemeIncomplete::PrevChunk);
745                }
746            }
747            self.resuming = true;
748            if self.is_boundary(chunk, chunk_start)? {
749                self.resuming = false;
750                return Ok(Some(self.offset));
751            }
752            self.resuming = false;
753        }
754    }
755}
756
/// A cursor between Regional Indicators must request pre-context when the chunk
/// does not reach far enough back, and decide once that context is provided.
#[test]
fn test_grapheme_cursor_ris_precontext() {
    let flags = "\u{1f1fa}\u{1f1f8}\u{1f1fa}\u{1f1f8}\u{1f1fa}\u{1f1f8}";
    let mut cursor = GraphemeCursor::new(8, flags.len(), true);
    let tail = &flags[4..];

    let first_attempt = cursor.is_boundary(tail, 4);
    assert_eq!(first_attempt, Err(GraphemeIncomplete::PreContext(4)));

    cursor.provide_context(&flags[..4], 0);
    let second_attempt = cursor.is_boundary(tail, 4);
    assert_eq!(second_attempt, Ok(true));
}
768
/// A cursor sitting exactly at the start of its chunk must request pre-context;
/// with the preceding CR supplied, CR LF is reported as not a boundary.
#[test]
fn test_grapheme_cursor_chunk_start_require_precontext() {
    let crlf = "\r\n";
    let mut cursor = GraphemeCursor::new(1, crlf.len(), true);
    let tail = &crlf[1..];

    let first_attempt = cursor.is_boundary(tail, 1);
    assert_eq!(first_attempt, Err(GraphemeIncomplete::PreContext(1)));

    cursor.provide_context(&crlf[..1], 0);
    let second_attempt = cursor.is_boundary(tail, 1);
    assert_eq!(second_attempt, Ok(false));
}
780
/// Stepping backwards to the start of a chunk asks for the previous chunk, and
/// the retry with that chunk reports the boundary at the chunk seam.
#[test]
fn test_grapheme_cursor_prev_boundary() {
    let text = "abcd";
    let mut cursor = GraphemeCursor::new(3, text.len(), true);

    let suspended = cursor.prev_boundary(&text[2..], 2);
    assert_eq!(suspended, Err(GraphemeIncomplete::PrevChunk));

    let resumed = cursor.prev_boundary(&text[..2], 0);
    assert_eq!(resumed, Ok(Some(2)));
}
791
/// A cursor that already sits at the start of its chunk asks for the previous
/// chunk right away, and the retry finds the boundary inside that chunk.
#[test]
fn test_grapheme_cursor_prev_boundary_chunk_start() {
    let text = "abcd";
    let mut cursor = GraphemeCursor::new(2, text.len(), true);

    let suspended = cursor.prev_boundary(&text[2..], 2);
    assert_eq!(suspended, Err(GraphemeIncomplete::PrevChunk));

    let resumed = cursor.prev_boundary(&text[..2], 0);
    assert_eq!(resumed, Ok(Some(1)));
}