unicode_segmentation/
word.rs

Help
1// Copyright 2012-2014 The Rust Project Developers. See the COPYRIGHT
2// file at the top-level directory of this distribution and at
3// http://rust-lang.org/COPYRIGHT.
4//
5// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
6// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
7// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
8// option. This file may not be copied, modified, or distributed
9// except according to those terms.
10
11use core::cmp;
12use core::iter::Filter;
13
14use crate::tables::word::WordCat;
15
/// An iterator over the substrings of a string which, after splitting the string on
/// [word boundaries](http://www.unicode.org/reports/tr29/#Word_Boundaries),
/// contain any characters with the
/// [Alphabetic](http://unicode.org/reports/tr44/#Alphabetic)
/// property, or with
/// [General_Category=Number](http://unicode.org/reports/tr44/#General_Category_Values).
///
/// This struct is created by the [`unicode_words`] method on the [`UnicodeSegmentation`] trait. See
/// its documentation for more.
///
/// [`unicode_words`]: trait.UnicodeSegmentation.html#tymethod.unicode_words
/// [`UnicodeSegmentation`]: trait.UnicodeSegmentation.html
#[derive(Debug)]
pub struct UnicodeWords<'a> {
    // Word-bound segments filtered through a plain function-pointer predicate;
    // `new_unicode_words` installs `has_alphanumeric` as that predicate.
    inner: Filter<UWordBounds<'a>, fn(&&str) -> bool>,
}
32
33impl<'a> Iterator for UnicodeWords<'a> {
34    type Item = &'a str;
35
36    #[inline]
37    fn next(&mut self) -> Option<&'a str> {
38        self.inner.next()
39    }
40
41    #[inline]
42    fn size_hint(&self) -> (usize, Option<usize>) {
43        self.inner.size_hint()
44    }
45}
46impl<'a> DoubleEndedIterator for UnicodeWords<'a> {
47    #[inline]
48    fn next_back(&mut self) -> Option<&'a str> {
49        self.inner.next_back()
50    }
51}
52
/// An iterator over the substrings of a string which, after splitting the string on
/// [word boundaries](http://www.unicode.org/reports/tr29/#Word_Boundaries),
/// contain any characters with the
/// [Alphabetic](http://unicode.org/reports/tr44/#Alphabetic)
/// property, or with
/// [General_Category=Number](http://unicode.org/reports/tr44/#General_Category_Values).
/// This iterator also provides the byte offsets for each substring.
///
/// This struct is created by the [`unicode_word_indices`] method on the [`UnicodeSegmentation`] trait. See
/// its documentation for more.
///
/// [`unicode_word_indices`]: trait.UnicodeSegmentation.html#tymethod.unicode_word_indices
/// [`UnicodeSegmentation`]: trait.UnicodeSegmentation.html
#[derive(Debug)]
pub struct UnicodeWordIndices<'a> {
    // `(byte offset, segment)` pairs filtered through a plain function-pointer
    // predicate; `new_unicode_word_indices` supplies a predicate that applies
    // `has_alphanumeric` to the segment.
    #[allow(clippy::type_complexity)]
    inner: Filter<UWordBoundIndices<'a>, fn(&(usize, &str)) -> bool>,
}
71
72impl<'a> Iterator for UnicodeWordIndices<'a> {
73    type Item = (usize, &'a str);
74
75    #[inline]
76    fn next(&mut self) -> Option<(usize, &'a str)> {
77        self.inner.next()
78    }
79
80    #[inline]
81    fn size_hint(&self) -> (usize, Option<usize>) {
82        self.inner.size_hint()
83    }
84}
85impl<'a> DoubleEndedIterator for UnicodeWordIndices<'a> {
86    #[inline]
87    fn next_back(&mut self) -> Option<(usize, &'a str)> {
88        self.inner.next_back()
89    }
90}
91
/// External iterator for a string's
/// [word boundaries](http://www.unicode.org/reports/tr29/#Word_Boundaries).
///
/// This struct is created by the [`split_word_bounds`] method on the [`UnicodeSegmentation`]
/// trait. See its documentation for more.
///
/// [`split_word_bounds`]: trait.UnicodeSegmentation.html#tymethod.split_word_bounds
/// [`UnicodeSegmentation`]: trait.UnicodeSegmentation.html
#[derive(Debug, Clone)]
pub struct UWordBounds<'a> {
    // The part of the input not yet yielded; `next` trims it from the front and
    // `next_back` trims it from the back.
    string: &'a str,
    // Word category cached by `next` for a character it inspected but did not
    // consume; taken (and cleared) at the start of the following forward scan.
    cat: Option<WordCat>,
    // Counterpart of `cat` for backward iteration, cached by `next_back`.
    catb: Option<WordCat>,
}
106
/// External iterator for word boundaries and byte offsets.
///
/// This struct is created by the [`split_word_bound_indices`] method on the
/// [`UnicodeSegmentation`] trait. See its documentation for more.
///
/// [`split_word_bound_indices`]: trait.UnicodeSegmentation.html#tymethod.split_word_bound_indices
/// [`UnicodeSegmentation`]: trait.UnicodeSegmentation.html
#[derive(Debug, Clone)]
pub struct UWordBoundIndices<'a> {
    // Address of the first byte of the string the iterator was created from, stored
    // as an integer; each segment's address minus this value is its byte offset.
    start_offset: usize,
    // The segment iterator whose items get paired with their offsets.
    iter: UWordBounds<'a>,
}
119
120impl<'a> UWordBoundIndices<'a> {
121    #[inline]
122    /// View the underlying data (the part yet to be iterated) as a slice of the original string.
123    ///
124    /// ```rust
125    /// # use unicode_segmentation::UnicodeSegmentation;
126    /// let mut iter = "Hello world".split_word_bound_indices();
127    /// assert_eq!(iter.as_str(), "Hello world");
128    /// iter.next();
129    /// assert_eq!(iter.as_str(), " world");
130    /// iter.next();
131    /// assert_eq!(iter.as_str(), "world");
132    /// ```
133    pub fn as_str(&self) -> &'a str {
134        self.iter.as_str()
135    }
136}
137
138impl<'a> Iterator for UWordBoundIndices<'a> {
139    type Item = (usize, &'a str);
140
141    #[inline]
142    fn next(&mut self) -> Option<(usize, &'a str)> {
143        self.iter
144            .next()
145            .map(|s| (s.as_ptr() as usize - self.start_offset, s))
146    }
147
148    #[inline]
149    fn size_hint(&self) -> (usize, Option<usize>) {
150        self.iter.size_hint()
151    }
152}
153
154impl<'a> DoubleEndedIterator for UWordBoundIndices<'a> {
155    #[inline]
156    fn next_back(&mut self) -> Option<(usize, &'a str)> {
157        self.iter
158            .next_back()
159            .map(|s| (s.as_ptr() as usize - self.start_offset, s))
160    }
161}
162
// State machine for the word boundary rules, shared by the forward (`next`) and
// backward (`next_back`) scans of `UWordBounds`. Each scan starts in `Start`.
#[derive(Clone, Copy, PartialEq, Eq, Debug)]
enum UWordBoundsState {
    // No character of the current segment has been classified yet.
    Start,
    // Entered on an ALetter character.
    Letter,
    // Entered on a Hebrew_Letter character; enables the quote rules WB7a-WB7c.
    HLetter,
    // Entered on a Numeric character.
    Numeric,
    // Entered on a Katakana character.
    Katakana,
    // Entered on an ExtendNumLet character.
    ExtendNumLet,
    // Entered on a Regional_Indicator; the payload tracks pairing (rule WB13c).
    Regional(RegionalState),
    // Handling Format/Extend/ZWJ characters or mid-word punctuation; the payload
    // says what is allowed to come next.
    FormatExtend(FormatExtendType),
    // Forward scan: the segment started with a ZWJ. Backward scan: entered on an
    // Extended_Pictographic character, so that a ZWJ before it can attach (rule WB3c).
    Zwj,
    // Forward scan: set when an Extended_Pictographic character directly follows a
    // ZWJ (rule WB3c).
    Emoji,
    // Entered on a WSegSpace character (rule WB3d).
    WSegSpace,
}
178
// Subtypes for the FormatExtend state in UWordBoundsState. They describe what the
// scan will accept as the next (non-skipped) character.
#[derive(Clone, Copy, PartialEq, Eq, Debug)]
enum FormatExtendType {
    // Backward scan only: the next character is handled exactly as in `Start`.
    AcceptAny,
    // Forward scan: the segment ends at the next non-skipped character. Backward
    // scan: a run of Format/Extend/ZWJ is being skipped and the saved state is
    // restored once the run ends.
    AcceptNone,
    // After MidLetter/MidNumLet/Single_Quote next to a letter: an ALetter or
    // Hebrew_Letter must come next, otherwise the scan rewinds to the saved index.
    RequireLetter,
    // After a Double_Quote next to a Hebrew_Letter: a Hebrew_Letter must come next,
    // otherwise the scan rewinds to the saved index.
    RequireHLetter,
    // Single_Quote handling for rule WB7a. Forward: a letter may follow the quote,
    // but the segment ends without rewinding if it does not. Backward: only a
    // Hebrew_Letter before the quote attaches; otherwise the scan backtracks.
    AcceptQLetter,
    // After MidNum/MidNumLet/Single_Quote next to a Numeric: a Numeric must come
    // next, otherwise the scan rewinds to the saved index.
    RequireNumeric,
}
189
// Pairing progress for Regional_Indicator characters (rule WB13c).
#[derive(Clone, Copy, PartialEq, Eq, Debug)]
enum RegionalState {
    // One indicator seen; a second one may still join it.
    Half,
    // A pair is complete; another indicator starts a new segment.
    Full,
    // Backward scan only: pairing is not known yet and is resolved by counting the
    // Regional_Indicator characters that precede the current position.
    Unknown,
}
196
197fn is_emoji(ch: char) -> bool {
198    use crate::tables::emoji;
199    emoji::emoji_category(ch).2 == emoji::EmojiCat::EC_Extended_Pictographic
200}
201
impl<'a> Iterator for UWordBounds<'a> {
    type Item = &'a str;

    /// Lower bound is 1 for a non-empty remainder (0 otherwise); upper bound is the
    /// remainder's length in bytes.
    #[inline]
    fn size_hint(&self) -> (usize, Option<usize>) {
        let slen = self.string.len();
        (cmp::min(slen, 1), Some(slen))
    }

    /// Splits the next segment off the front of the remaining string and returns it,
    /// or returns `None` when nothing is left.
    ///
    /// Characters are scanned front to back while a `UWordBoundsState` machine tracks
    /// which boundary rules can still apply; the scan stops at the first boundary.
    #[inline]
    fn next(&mut self) -> Option<&'a str> {
        use self::FormatExtendType::*;
        use self::UWordBoundsState::*;
        use crate::tables::word as wd;
        if self.string.is_empty() {
            return None;
        }

        // take_curr: the character at `idx` belongs to the returned segment.
        let mut take_curr = true;
        // take_cat: `cat` should be cached in `self.cat` for the next call.
        let mut take_cat = true;
        // Byte index of the character currently being examined.
        let mut idx = 0;
        // Index/category of mid-word punctuation to rewind to if what it requires
        // does not follow.
        let mut saveidx = 0;
        let mut state = Start;
        let mut cat = wd::WC_Any;
        let mut savecat = wd::WC_Any;

        // If extend/format/zwj were skipped. Handles precedence of WB3d over WB4
        let mut skipped_format_extend = false;
        for (curr, ch) in self.string.char_indices() {
            idx = curr;
            // Whether or not the previous category was ZWJ
            // ZWJs get collapsed, so this handles precedence of WB3c over WB4
            let prev_zwj = cat == wd::WC_ZWJ;
            // if there's a category cached, grab it
            cat = match self.cat {
                None => wd::word_category(ch).2,
                _ => self.cat.take().unwrap(),
            };
            take_cat = true;

            // handle rule WB4
            // just skip all format, extend, and zwj chars
            // note that Start is a special case: if there's a bunch of Format | Extend
            // characters at the beginning of a block of text, dump them out as one unit.
            //
            // (This is not obvious from the wording of UAX#29, but if you look at the
            // test cases http://www.unicode.org/Public/UNIDATA/auxiliary/WordBreakTest.txt
            // then the "correct" interpretation of WB4 becomes apparent.)
            if state != Start {
                match cat {
                    wd::WC_Extend | wd::WC_Format | wd::WC_ZWJ => {
                        skipped_format_extend = true;
                        continue;
                    }
                    _ => {}
                }
            }

            // rule WB3c
            // WB4 makes all ZWJs collapse into the previous state
            // but you can still be in a Zwj state if you started with Zwj
            //
            // This means that an EP + Zwj will collapse into EP, which is wrong,
            // since EP+EP is not a boundary but EP+ZWJ+EP is
            //
            // Thus, we separately keep track of whether or not the last character
            // was a ZWJ. This is an additional bit of state tracked outside of the
            // state enum; the state enum represents the last non-zwj state encountered.
            // When prev_zwj is true, for the purposes of WB3c, we are in the Zwj state,
            // however we are in the previous state for the purposes of all other rules.
            if prev_zwj && is_emoji(ch) {
                state = Emoji;
                continue;
            }
            // Don't use `continue` in this match without updating `cat`
            state = match state {
                Start if cat == wd::WC_CR => {
                    idx += match self.get_next_cat(idx) {
                        Some(wd::WC_LF) => 1, // rule WB3
                        _ => 0,
                    };
                    break; // rule WB3a
                }
                Start => match cat {
                    wd::WC_ALetter => Letter,            // rule WB5, WB6, WB9, WB13a
                    wd::WC_Hebrew_Letter => HLetter,     // rule WB5, WB6, WB7a, WB7b, WB9, WB13a
                    wd::WC_Numeric => Numeric,           // rule WB8, WB10, WB12, WB13a
                    wd::WC_Katakana => Katakana,         // rule WB13, WB13a
                    wd::WC_ExtendNumLet => ExtendNumLet, // rule WB13a, WB13b
                    wd::WC_Regional_Indicator => Regional(RegionalState::Half), // rule WB13c
                    wd::WC_LF | wd::WC_Newline => break, // rule WB3a
                    wd::WC_ZWJ => Zwj,                   // rule WB3c
                    wd::WC_WSegSpace => WSegSpace,       // rule WB3d
                    _ => {
                        if let Some(ncat) = self.get_next_cat(idx) {
                            // rule WB4
                            if ncat == wd::WC_Format || ncat == wd::WC_Extend || ncat == wd::WC_ZWJ
                            {
                                state = FormatExtend(AcceptNone);
                                // hand the looked-up category to the next loop iteration
                                self.cat = Some(ncat);
                                continue;
                            }
                        }
                        break; // rule WB999
                    }
                },
                WSegSpace => match cat {
                    wd::WC_WSegSpace if !skipped_format_extend => WSegSpace,
                    _ => {
                        take_curr = false;
                        break;
                    }
                },
                Zwj => {
                    // We already handle WB3c above.
                    take_curr = false;
                    break;
                }
                Letter | HLetter => match cat {
                    wd::WC_ALetter => Letter,            // rule WB5
                    wd::WC_Hebrew_Letter => HLetter,     // rule WB5
                    wd::WC_Numeric => Numeric,           // rule WB9
                    wd::WC_ExtendNumLet => ExtendNumLet, // rule WB13a
                    wd::WC_Double_Quote if state == HLetter => {
                        savecat = cat;
                        saveidx = idx;
                        FormatExtend(RequireHLetter) // rule WB7b
                    }
                    wd::WC_Single_Quote if state == HLetter => {
                        FormatExtend(AcceptQLetter) // rule WB7a
                    }
                    wd::WC_MidLetter | wd::WC_MidNumLet | wd::WC_Single_Quote => {
                        savecat = cat;
                        saveidx = idx;
                        FormatExtend(RequireLetter) // rule WB6
                    }
                    _ => {
                        take_curr = false;
                        break;
                    }
                },
                Numeric => match cat {
                    wd::WC_Numeric => Numeric,           // rule WB8
                    wd::WC_ALetter => Letter,            // rule WB10
                    wd::WC_Hebrew_Letter => HLetter,     // rule WB10
                    wd::WC_ExtendNumLet => ExtendNumLet, // rule WB13a
                    wd::WC_MidNum | wd::WC_MidNumLet | wd::WC_Single_Quote => {
                        savecat = cat;
                        saveidx = idx;
                        FormatExtend(RequireNumeric) // rule WB12
                    }
                    _ => {
                        take_curr = false;
                        break;
                    }
                },
                Katakana => match cat {
                    wd::WC_Katakana => Katakana,         // rule WB13
                    wd::WC_ExtendNumLet => ExtendNumLet, // rule WB13a
                    _ => {
                        take_curr = false;
                        break;
                    }
                },
                ExtendNumLet => match cat {
                    wd::WC_ExtendNumLet => ExtendNumLet, // rule WB13a
                    wd::WC_ALetter => Letter,            // rule WB13b
                    wd::WC_Hebrew_Letter => HLetter,     // rule WB13b
                    wd::WC_Numeric => Numeric,           // rule WB13b
                    wd::WC_Katakana => Katakana,         // rule WB13b
                    _ => {
                        take_curr = false;
                        break;
                    }
                },
                Regional(RegionalState::Full) => {
                    // if it reaches here we've gone too far,
                    // a full flag can only compose with ZWJ/Extend/Format
                    // following it.
                    take_curr = false;
                    break;
                }
                Regional(RegionalState::Half) => match cat {
                    wd::WC_Regional_Indicator => Regional(RegionalState::Full), // rule WB13c
                    _ => {
                        take_curr = false;
                        break;
                    }
                },
                Regional(_) => {
                    unreachable!("RegionalState::Unknown should not occur on forward iteration")
                }
                Emoji => {
                    // We already handle WB3c above. If you've reached this point, the emoji sequence is over.
                    take_curr = false;
                    break;
                }
                FormatExtend(t) => match t {
                    // handle FormatExtends depending on what type
                    RequireNumeric if cat == wd::WC_Numeric => Numeric, // rule WB11
                    RequireLetter | AcceptQLetter if cat == wd::WC_ALetter => Letter, // rule WB7
                    RequireLetter | AcceptQLetter if cat == wd::WC_Hebrew_Letter => HLetter, // WB7a
                    RequireHLetter if cat == wd::WC_Hebrew_Letter => HLetter, // rule WB7b
                    AcceptNone | AcceptQLetter => {
                        take_curr = false; // emit all the Format|Extend characters
                        take_cat = false;
                        break;
                    }
                    _ => break, // rewind (in if statement below)
                },
            }
        }

        if let FormatExtend(t) = state {
            // we were looking for something and didn't find it; we have to back up
            if t == RequireLetter || t == RequireHLetter || t == RequireNumeric {
                idx = saveidx;
                cat = savecat;
                take_curr = false;
            }
        }

        // Either step `idx` past the current character (it is part of the segment),
        // or leave it for the next call and optionally remember its category.
        self.cat = if take_curr {
            idx = idx + self.string[idx..].chars().next().unwrap().len_utf8();
            None
        } else if take_cat {
            Some(cat)
        } else {
            None
        };

        // Split at `idx`: the front is returned, the rest stays for later calls.
        let retstr = &self.string[..idx];
        self.string = &self.string[idx..];
        Some(retstr)
    }
}
438
impl<'a> DoubleEndedIterator for UWordBounds<'a> {
    /// Splits the next segment off the back of the remaining string and returns it,
    /// or returns `None` when nothing is left.
    ///
    /// Characters are scanned back to front while a `UWordBoundsState` machine tracks
    /// which boundary rules can still apply; the scan stops at the first boundary.
    #[inline]
    fn next_back(&mut self) -> Option<&'a str> {
        use self::FormatExtendType::*;
        use self::UWordBoundsState::*;
        use crate::tables::word as wd;
        if self.string.is_empty() {
            return None;
        }

        // take_curr: the character at `idx` belongs to the returned segment.
        let mut take_curr = true;
        // take_cat: `cat` should be cached in `self.catb` for the next call.
        let mut take_cat = true;
        // Start at the byte index of the last character.
        let mut idx = self.string.len();
        idx -= self.string.chars().next_back().unwrap().len_utf8();
        // previdx: index of the character examined just before the current one
        // (i.e. the one after it in the string).
        let mut previdx = idx;
        // saveidx/savestate: where to cut, and which state to resume, when a
        // tentative match has to be undone.
        let mut saveidx = idx;
        let mut state = Start;
        let mut savestate = Start;
        let mut cat = wd::WC_Any;

        // Set once a run of Format/Extend/ZWJ has been skipped; blocks WB3d below.
        let mut skipped_format_extend = false;

        for (curr, ch) in self.string.char_indices().rev() {
            previdx = idx;
            idx = curr;

            // if there's a category cached, grab it
            cat = match self.catb {
                None => wd::word_category(ch).2,
                _ => self.catb.take().unwrap(),
            };
            take_cat = true;

            // backward iterator over word boundaries. Mostly the same as the forward
            // iterator, with two weirdnesses:
            // (1) If we encounter a single quote in the Start state, we have to check for a
            //     Hebrew Letter immediately before it.
            // (2) Format and Extend char handling takes some gymnastics.

            if cat == wd::WC_Extend || cat == wd::WC_Format || (cat == wd::WC_ZWJ && state != Zwj) {
                // WB3c has more priority so we should not
                // fold in that case
                if !matches!(state, FormatExtend(_) | Start) {
                    saveidx = previdx;
                    savestate = state;
                    state = FormatExtend(AcceptNone);
                }

                if state != Start {
                    continue;
                }
            } else if state == FormatExtend(AcceptNone) {
                // finished a scan of some Format|Extend chars, restore previous state
                state = savestate;
                previdx = saveidx;
                take_cat = false;
                skipped_format_extend = true;
            }

            // Don't use `continue` in this match without updating `catb`
            state = match state {
                Start | FormatExtend(AcceptAny) => match cat {
                    _ if is_emoji(ch) => Zwj,
                    wd::WC_ALetter => Letter, // rule WB5, WB7, WB10, WB13b
                    wd::WC_Hebrew_Letter => HLetter, // rule WB5, WB7, WB7c, WB10, WB13b
                    wd::WC_Numeric => Numeric, // rule WB8, WB9, WB11, WB13b
                    wd::WC_Katakana => Katakana, // rule WB13, WB13b
                    wd::WC_ExtendNumLet => ExtendNumLet, // rule WB13a
                    wd::WC_Regional_Indicator => Regional(RegionalState::Unknown), // rule WB13c
                    // rule WB4:
                    wd::WC_Extend | wd::WC_Format | wd::WC_ZWJ => FormatExtend(AcceptAny),
                    wd::WC_Single_Quote => {
                        saveidx = idx;
                        FormatExtend(AcceptQLetter) // rule WB7a
                    }
                    wd::WC_WSegSpace => WSegSpace,
                    wd::WC_CR | wd::WC_LF | wd::WC_Newline => {
                        if state == Start {
                            if cat == wd::WC_LF {
                                idx -= match self.get_prev_cat(idx) {
                                    Some(wd::WC_CR) => 1, // rule WB3
                                    _ => 0,
                                };
                            }
                        } else {
                            take_curr = false;
                        }
                        break; // rule WB3a
                    }
                    _ => break, // rule WB999
                },
                Zwj => match cat {
                    // rule WB3c
                    wd::WC_ZWJ => FormatExtend(AcceptAny),
                    _ => {
                        take_curr = false;
                        break;
                    }
                },
                WSegSpace => match cat {
                    // rule WB3d
                    wd::WC_WSegSpace if !skipped_format_extend => WSegSpace,
                    _ => {
                        take_curr = false;
                        break;
                    }
                },
                Letter | HLetter => match cat {
                    wd::WC_ALetter => Letter,            // rule WB5
                    wd::WC_Hebrew_Letter => HLetter,     // rule WB5
                    wd::WC_Numeric => Numeric,           // rule WB10
                    wd::WC_ExtendNumLet => ExtendNumLet, // rule WB13b
                    wd::WC_Double_Quote if state == HLetter => {
                        saveidx = previdx;
                        FormatExtend(RequireHLetter) // rule WB7c
                    }
                    wd::WC_MidLetter | wd::WC_MidNumLet | wd::WC_Single_Quote => {
                        saveidx = previdx;
                        FormatExtend(RequireLetter) // rule WB7
                    }
                    _ => {
                        take_curr = false;
                        break;
                    }
                },
                Numeric => match cat {
                    wd::WC_Numeric => Numeric,           // rule WB8
                    wd::WC_ALetter => Letter,            // rule WB9
                    wd::WC_Hebrew_Letter => HLetter,     // rule WB9
                    wd::WC_ExtendNumLet => ExtendNumLet, // rule WB13b
                    wd::WC_MidNum | wd::WC_MidNumLet | wd::WC_Single_Quote => {
                        saveidx = previdx;
                        FormatExtend(RequireNumeric) // rule WB11
                    }
                    _ => {
                        take_curr = false;
                        break;
                    }
                },
                Katakana => match cat {
                    wd::WC_Katakana => Katakana,         // rule WB13
                    wd::WC_ExtendNumLet => ExtendNumLet, // rule WB13b
                    _ => {
                        take_curr = false;
                        break;
                    }
                },
                ExtendNumLet => match cat {
                    wd::WC_ExtendNumLet => ExtendNumLet, // rule WB13a
                    wd::WC_ALetter => Letter,            // rule WB13a
                    wd::WC_Hebrew_Letter => HLetter,     // rule WB13a
                    wd::WC_Numeric => Numeric,           // rule WB13a
                    wd::WC_Katakana => Katakana,         // rule WB13a
                    _ => {
                        take_curr = false;
                        break;
                    }
                },
                Regional(mut regional_state) => match cat {
                    // rule WB13c
                    wd::WC_Regional_Indicator => {
                        if regional_state == RegionalState::Unknown {
                            // Count the Regional_Indicator characters that come before
                            // the one seen on the previous step (ignoring
                            // ZWJ/Extend/Format); the parity of that count decides
                            // whether the previous one already closes a pair.
                            let count = self.string[..previdx]
                                .chars()
                                .rev()
                                .map(|c| wd::word_category(c).2)
                                .filter(|&c| {
                                    !(c == wd::WC_ZWJ || c == wd::WC_Extend || c == wd::WC_Format)
                                })
                                .take_while(|&c| c == wd::WC_Regional_Indicator)
                                .count();
                            regional_state = if count % 2 == 0 {
                                RegionalState::Full
                            } else {
                                RegionalState::Half
                            };
                        }
                        if regional_state == RegionalState::Full {
                            take_curr = false;
                            break;
                        } else {
                            Regional(RegionalState::Full)
                        }
                    }
                    _ => {
                        take_curr = false;
                        break;
                    }
                },
                Emoji => {
                    if is_emoji(ch) {
                        // rule WB3c
                        Zwj
                    } else {
                        take_curr = false;
                        break;
                    }
                }
                FormatExtend(t) => match t {
                    RequireNumeric if cat == wd::WC_Numeric => Numeric, // rule WB12
                    RequireLetter if cat == wd::WC_ALetter => Letter,   // rule WB6
                    RequireLetter if cat == wd::WC_Hebrew_Letter => HLetter, // rule WB6
                    AcceptQLetter if cat == wd::WC_Hebrew_Letter => HLetter, // rule WB7a
                    RequireHLetter if cat == wd::WC_Hebrew_Letter => HLetter, // rule WB7b
                    _ => break,                                         // backtrack will happen
                },
            }
        }

        if let FormatExtend(t) = state {
            // if we required something but didn't find it, backtrack
            if t == RequireLetter
                || t == RequireHLetter
                || t == RequireNumeric
                || t == AcceptNone
                || t == AcceptQLetter
            {
                previdx = saveidx;
                take_cat = false;
                take_curr = false;
            }
        }

        // Either cut at the current character (it is part of the segment), or cut at
        // `previdx` and optionally remember the current character's category.
        self.catb = if take_curr {
            None
        } else {
            idx = previdx;
            if take_cat {
                Some(cat)
            } else {
                None
            }
        };

        // Split at `idx`: the back is returned, the front stays for later calls.
        let retstr = &self.string[idx..];
        self.string = &self.string[..idx];
        Some(retstr)
    }
}
678
679impl<'a> UWordBounds<'a> {
680    #[inline]
681    /// View the underlying data (the part yet to be iterated) as a slice of the original string.
682    ///
683    /// ```rust
684    /// # use unicode_segmentation::UnicodeSegmentation;
685    /// let mut iter = "Hello world".split_word_bounds();
686    /// assert_eq!(iter.as_str(), "Hello world");
687    /// iter.next();
688    /// assert_eq!(iter.as_str(), " world");
689    /// iter.next();
690    /// assert_eq!(iter.as_str(), "world");
691    /// ```
692    pub fn as_str(&self) -> &'a str {
693        self.string
694    }
695
696    #[inline]
697    fn get_next_cat(&self, idx: usize) -> Option<WordCat> {
698        use crate::tables::word as wd;
699        let nidx = idx + self.string[idx..].chars().next().unwrap().len_utf8();
700        if nidx < self.string.len() {
701            let nch = self.string[nidx..].chars().next().unwrap();
702            Some(wd::word_category(nch).2)
703        } else {
704            None
705        }
706    }
707
708    #[inline]
709    fn get_prev_cat(&self, idx: usize) -> Option<WordCat> {
710        use crate::tables::word as wd;
711        if idx > 0 {
712            let nch = self.string[..idx].chars().next_back().unwrap();
713            Some(wd::word_category(nch).2)
714        } else {
715            None
716        }
717    }
718}
719
720#[inline]
721pub fn new_word_bounds(s: &str) -> UWordBounds<'_> {
722    UWordBounds {
723        string: s,
724        cat: None,
725        catb: None,
726    }
727}
728
729#[inline]
730pub fn new_word_bound_indices(s: &str) -> UWordBoundIndices<'_> {
731    UWordBoundIndices {
732        start_offset: s.as_ptr() as usize,
733        iter: new_word_bounds(s),
734    }
735}
736
737#[inline]
738fn has_alphanumeric(s: &&str) -> bool {
739    use crate::tables::util::is_alphanumeric;
740
741    s.chars().any(is_alphanumeric)
742}
743
744#[inline]
745pub fn new_unicode_words(s: &str) -> UnicodeWords<'_> {
746    use super::UnicodeSegmentation;
747
748    UnicodeWords {
749        inner: s.split_word_bounds().filter(has_alphanumeric),
750    }
751}
752
753#[inline]
754pub fn new_unicode_word_indices(s: &str) -> UnicodeWordIndices<'_> {
755    use super::UnicodeSegmentation;
756
757    UnicodeWordIndices {
758        inner: s
759            .split_word_bound_indices()
760            .filter(|(_, c)| has_alphanumeric(c)),
761    }
762}
763
#[cfg(test)]
mod tests {
    use crate::tables::word as wd;

    // U+070F must be reported with the ALetter word category.
    #[test]
    fn test_syriac_abbr_mark() {
        let cat = wd::word_category('\u{70f}').2;
        assert_eq!(cat, wd::WC_ALetter);
    }

    // U+06DD must be reported with the Numeric word category.
    #[test]
    fn test_end_of_ayah_cat() {
        let cat = wd::word_category('\u{6dd}').2;
        assert_eq!(cat, wd::WC_Numeric);
    }
}
unicode_segmentation/word.rs

unicode_segmentation/
word.rs