unicode_segmentation/
word.rs

1// Copyright 2012-2014 The Rust Project Developers. See the COPYRIGHT
2// file at the top-level directory of this distribution and at
3// http://rust-lang.org/COPYRIGHT.
4//
5// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
6// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
7// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
8// option. This file may not be copied, modified, or distributed
9// except according to those terms.
10
11use core::cmp;
12use core::iter::Filter;
13
14use crate::tables::word::WordCat;
15
16/// An iterator over the substrings of a string which, after splitting the string on
17/// [word boundaries](http://www.unicode.org/reports/tr29/#Word_Boundaries),
18/// contain any characters with the
19/// [Alphabetic](http://unicode.org/reports/tr44/#Alphabetic)
20/// property, or with
21/// [General_Category=Number](http://unicode.org/reports/tr44/#General_Category_Values).
22///
23/// This struct is created by the [`unicode_words`] method on the [`UnicodeSegmentation`] trait. See
24/// its documentation for more.
25///
26/// [`unicode_words`]: trait.UnicodeSegmentation.html#tymethod.unicode_words
27/// [`UnicodeSegmentation`]: trait.UnicodeSegmentation.html
pub struct UnicodeWords<'a> {
    /// Word-boundary segments of the input, narrowed by a predicate stored as a plain
    /// function pointer (so the iterator type stays nameable).
    inner: Filter<UWordBounds<'a>, fn(&&str) -> bool>,
}
31
32impl<'a> Iterator for UnicodeWords<'a> {
33    type Item = &'a str;
34
35    #[inline]
36    fn next(&mut self) -> Option<&'a str> {
37        self.inner.next()
38    }
39}
40impl<'a> DoubleEndedIterator for UnicodeWords<'a> {
41    #[inline]
42    fn next_back(&mut self) -> Option<&'a str> {
43        self.inner.next_back()
44    }
45}
46
47/// An iterator over the substrings of a string which, after splitting the string on
48/// [word boundaries](http://www.unicode.org/reports/tr29/#Word_Boundaries),
49/// contain any characters with the
50/// [Alphabetic](http://unicode.org/reports/tr44/#Alphabetic)
51/// property, or with
52/// [General_Category=Number](http://unicode.org/reports/tr44/#General_Category_Values).
53/// This iterator also provides the byte offsets for each substring.
54///
55/// This struct is created by the [`unicode_word_indices`] method on the [`UnicodeSegmentation`] trait. See
56/// its documentation for more.
57///
58/// [`unicode_word_indices`]: trait.UnicodeSegmentation.html#tymethod.unicode_word_indices
59/// [`UnicodeSegmentation`]: trait.UnicodeSegmentation.html
pub struct UnicodeWordIndices<'a> {
    /// `(byte offset, segment)` pairs of the input, narrowed by a predicate stored as a
    /// plain function pointer (so the iterator type stays nameable).
    inner: Filter<UWordBoundIndices<'a>, fn(&(usize, &str)) -> bool>,
}
63
64impl<'a> Iterator for UnicodeWordIndices<'a> {
65    type Item = (usize, &'a str);
66
67    #[inline]
68    fn next(&mut self) -> Option<(usize, &'a str)> {
69        self.inner.next()
70    }
71}
72impl<'a> DoubleEndedIterator for UnicodeWordIndices<'a> {
73    #[inline]
74    fn next_back(&mut self) -> Option<(usize, &'a str)> {
75        self.inner.next_back()
76    }
77}
78
79/// External iterator for a string's
80/// [word boundaries](http://www.unicode.org/reports/tr29/#Word_Boundaries).
81///
82/// This struct is created by the [`split_word_bounds`] method on the [`UnicodeSegmentation`]
83/// trait. See its documentation for more.
84///
85/// [`split_word_bounds`]: trait.UnicodeSegmentation.html#tymethod.split_word_bounds
86/// [`UnicodeSegmentation`]: trait.UnicodeSegmentation.html
#[derive(Clone)]
pub struct UWordBounds<'a> {
    /// The part of the input not yet yielded; `next` trims it from the front and
    /// `next_back` trims it from the back.
    string: &'a str,
    /// Word category already looked up for the next character the forward iterator
    /// will examine, if any, so it is not looked up a second time.
    cat: Option<WordCat>,
    /// Same as `cat`, but for the next character the backward iterator will examine.
    catb: Option<WordCat>,
}
93
94/// External iterator for word boundaries and byte offsets.
95///
96/// This struct is created by the [`split_word_bound_indices`] method on the
97/// [`UnicodeSegmentation`] trait. See its documentation for more.
98///
99/// [`split_word_bound_indices`]: trait.UnicodeSegmentation.html#tymethod.split_word_bound_indices
100/// [`UnicodeSegmentation`]: trait.UnicodeSegmentation.html
#[derive(Clone)]
pub struct UWordBoundIndices<'a> {
    /// Address of the first byte of the original string (`as_ptr() as usize`); it is
    /// subtracted from each segment's address to obtain that segment's byte offset.
    start_offset: usize,
    /// The underlying boundary iterator that produces the segments.
    iter: UWordBounds<'a>,
}
106
107impl<'a> UWordBoundIndices<'a> {
108    #[inline]
109    /// View the underlying data (the part yet to be iterated) as a slice of the original string.
110    ///
111    /// ```rust
112    /// # use unicode_segmentation::UnicodeSegmentation;
113    /// let mut iter = "Hello world".split_word_bound_indices();
114    /// assert_eq!(iter.as_str(), "Hello world");
115    /// iter.next();
116    /// assert_eq!(iter.as_str(), " world");
117    /// iter.next();
118    /// assert_eq!(iter.as_str(), "world");
119    /// ```
120    pub fn as_str(&self) -> &'a str {
121        self.iter.as_str()
122    }
123}
124
125impl<'a> Iterator for UWordBoundIndices<'a> {
126    type Item = (usize, &'a str);
127
128    #[inline]
129    fn next(&mut self) -> Option<(usize, &'a str)> {
130        self.iter
131            .next()
132            .map(|s| (s.as_ptr() as usize - self.start_offset, s))
133    }
134
135    #[inline]
136    fn size_hint(&self) -> (usize, Option<usize>) {
137        self.iter.size_hint()
138    }
139}
140
141impl<'a> DoubleEndedIterator for UWordBoundIndices<'a> {
142    #[inline]
143    fn next_back(&mut self) -> Option<(usize, &'a str)> {
144        self.iter
145            .next_back()
146            .map(|s| (s.as_ptr() as usize - self.start_offset, s))
147    }
148}
149
// State machine for the word boundary rules.
//
// Each variant records what the iterator has most recently seen inside the segment it
// is currently building; the forward and backward iterators share this enum.
#[derive(Clone, Copy, PartialEq, Eq, Debug)]
enum UWordBoundsState {
    // Nothing of the current segment has been classified yet.
    Start,
    // Last significant character was an ALetter.
    Letter,
    // Last significant character was a Hebrew_Letter.
    HLetter,
    // Last significant character was Numeric.
    Numeric,
    // Last significant character was Katakana.
    Katakana,
    // Last significant character was ExtendNumLet.
    ExtendNumLet,
    // Inside a run of Regional_Indicator characters; the payload tracks pairing.
    Regional(RegionalState),
    // Tentative state; the payload says what, if anything, may or must come next.
    FormatExtend(FormatExtendType),
    // Forward: the segment began with a ZWJ. Backward: an Extended_Pictographic
    // character was seen, so a ZWJ found next would join it (rule WB3c).
    Zwj,
    // Forward: a ZWJ followed by an Extended_Pictographic character was consumed.
    Emoji,
    // Last significant character was WSegSpace (rule WB3d).
    WSegSpace,
}
165
// Subtypes for the FormatExtend state in UWordBoundsState.
#[derive(Clone, Copy, PartialEq, Eq, Debug)]
enum FormatExtendType {
    // Only entered by the backward iterator: the next character is classified with
    // the same rules as in the `Start` state.
    AcceptAny,
    // A run of Extend/Format/ZWJ characters is being absorbed. The forward iterator
    // ends the segment at the first other character; the backward iterator restores
    // the state it saved before the run.
    AcceptNone,
    // A MidLetter, MidNumLet or Single_Quote followed a letter; a letter must come
    // next, otherwise the iterator rewinds to the saved position.
    RequireLetter,
    // A Double_Quote was seen next to a Hebrew_Letter; another Hebrew_Letter must
    // come next, otherwise the iterator rewinds to the saved position.
    RequireHLetter,
    // A Single_Quote that may attach to a Hebrew_Letter (rule WB7a).
    AcceptQLetter,
    // A MidNum, MidNumLet or Single_Quote was seen next to a Numeric; another Numeric
    // must come next, otherwise the iterator rewinds to the saved position.
    RequireNumeric,
}
176
// Pairing status inside a run of Regional_Indicator characters (rule WB13c).
#[derive(Clone, Copy, PartialEq, Eq, Debug)]
enum RegionalState {
    // One indicator of a pair has been seen; one more may join it.
    Half,
    // A complete pair has been seen; another indicator starts a new segment.
    Full,
    // Backward iteration only: the indicators before this one have not been counted
    // yet, so it is not known whether this one completes a pair.
    Unknown,
}
183
184fn is_emoji(ch: char) -> bool {
185    use crate::tables::emoji;
186    emoji::emoji_category(ch).2 == emoji::EmojiCat::EC_Extended_Pictographic
187}
188
189impl<'a> Iterator for UWordBounds<'a> {
190    type Item = &'a str;
191
192    #[inline]
193    fn size_hint(&self) -> (usize, Option<usize>) {
194        let slen = self.string.len();
195        (cmp::min(slen, 1), Some(slen))
196    }
197
198    #[inline]
199    fn next(&mut self) -> Option<&'a str> {
200        use self::FormatExtendType::*;
201        use self::UWordBoundsState::*;
202        use crate::tables::word as wd;
203        if self.string.len() == 0 {
204            return None;
205        }
206
207        let mut take_curr = true;
208        let mut take_cat = true;
209        let mut idx = 0;
210        let mut saveidx = 0;
211        let mut state = Start;
212        let mut cat = wd::WC_Any;
213        let mut savecat = wd::WC_Any;
214
215        // If extend/format/zwj were skipped. Handles precedence of WB3d over WB4
216        let mut skipped_format_extend = false;
217        for (curr, ch) in self.string.char_indices() {
218            idx = curr;
219            // Whether or not the previous category was ZWJ
220            // ZWJs get collapsed, so this handles precedence of WB3c over WB4
221            let prev_zwj = cat == wd::WC_ZWJ;
222            // if there's a category cached, grab it
223            cat = match self.cat {
224                None => wd::word_category(ch).2,
225                _ => self.cat.take().unwrap(),
226            };
227            take_cat = true;
228
229            // handle rule WB4
230            // just skip all format, extend, and zwj chars
231            // note that Start is a special case: if there's a bunch of Format | Extend
232            // characters at the beginning of a block of text, dump them out as one unit.
233            //
234            // (This is not obvious from the wording of UAX#29, but if you look at the
235            // test cases http://www.unicode.org/Public/UNIDATA/auxiliary/WordBreakTest.txt
236            // then the "correct" interpretation of WB4 becomes apparent.)
237            if state != Start {
238                match cat {
239                    wd::WC_Extend | wd::WC_Format | wd::WC_ZWJ => {
240                        skipped_format_extend = true;
241                        continue;
242                    }
243                    _ => {}
244                }
245            }
246
247            // rule WB3c
248            // WB4 makes all ZWJs collapse into the previous state
249            // but you can still be in a Zwj state if you started with Zwj
250            //
251            // This means that an EP + Zwj will collapse into EP, which is wrong,
252            // since EP+EP is not a boundary but EP+ZWJ+EP is
253            //
254            // Thus, we separately keep track of whether or not the last character
255            // was a ZWJ. This is an additional bit of state tracked outside of the
256            // state enum; the state enum represents the last non-zwj state encountered.
257            // When prev_zwj is true, for the purposes of WB3c, we are in the Zwj state,
258            // however we are in the previous state for the purposes of all other rules.
259            if prev_zwj {
260                if is_emoji(ch) {
261                    state = Emoji;
262                    continue;
263                }
264            }
265            // Don't use `continue` in this match without updating `cat`
266            state = match state {
267                Start if cat == wd::WC_CR => {
268                    idx += match self.get_next_cat(idx) {
269                        Some(ncat) if ncat == wd::WC_LF => 1, // rule WB3
270                        _ => 0,
271                    };
272                    break; // rule WB3a
273                }
274                Start => match cat {
275                    wd::WC_ALetter => Letter,            // rule WB5, WB6, WB9, WB13a
276                    wd::WC_Hebrew_Letter => HLetter,     // rule WB5, WB6, WB7a, WB7b, WB9, WB13a
277                    wd::WC_Numeric => Numeric,           // rule WB8, WB10, WB12, WB13a
278                    wd::WC_Katakana => Katakana,         // rule WB13, WB13a
279                    wd::WC_ExtendNumLet => ExtendNumLet, // rule WB13a, WB13b
280                    wd::WC_Regional_Indicator => Regional(RegionalState::Half), // rule WB13c
281                    wd::WC_LF | wd::WC_Newline => break, // rule WB3a
282                    wd::WC_ZWJ => Zwj,                   // rule WB3c
283                    wd::WC_WSegSpace => WSegSpace,       // rule WB3d
284                    _ => {
285                        if let Some(ncat) = self.get_next_cat(idx) {
286                            // rule WB4
287                            if ncat == wd::WC_Format || ncat == wd::WC_Extend || ncat == wd::WC_ZWJ
288                            {
289                                state = FormatExtend(AcceptNone);
290                                self.cat = Some(ncat);
291                                continue;
292                            }
293                        }
294                        break; // rule WB999
295                    }
296                },
297                WSegSpace => match cat {
298                    wd::WC_WSegSpace if !skipped_format_extend => WSegSpace,
299                    _ => {
300                        take_curr = false;
301                        break;
302                    }
303                },
304                Zwj => {
305                    // We already handle WB3c above.
306                    take_curr = false;
307                    break;
308                }
309                Letter | HLetter => match cat {
310                    wd::WC_ALetter => Letter,            // rule WB5
311                    wd::WC_Hebrew_Letter => HLetter,     // rule WB5
312                    wd::WC_Numeric => Numeric,           // rule WB9
313                    wd::WC_ExtendNumLet => ExtendNumLet, // rule WB13a
314                    wd::WC_Double_Quote if state == HLetter => {
315                        savecat = cat;
316                        saveidx = idx;
317                        FormatExtend(RequireHLetter) // rule WB7b
318                    }
319                    wd::WC_Single_Quote if state == HLetter => {
320                        FormatExtend(AcceptQLetter) // rule WB7a
321                    }
322                    wd::WC_MidLetter | wd::WC_MidNumLet | wd::WC_Single_Quote => {
323                        savecat = cat;
324                        saveidx = idx;
325                        FormatExtend(RequireLetter) // rule WB6
326                    }
327                    _ => {
328                        take_curr = false;
329                        break;
330                    }
331                },
332                Numeric => match cat {
333                    wd::WC_Numeric => Numeric,           // rule WB8
334                    wd::WC_ALetter => Letter,            // rule WB10
335                    wd::WC_Hebrew_Letter => HLetter,     // rule WB10
336                    wd::WC_ExtendNumLet => ExtendNumLet, // rule WB13a
337                    wd::WC_MidNum | wd::WC_MidNumLet | wd::WC_Single_Quote => {
338                        savecat = cat;
339                        saveidx = idx;
340                        FormatExtend(RequireNumeric) // rule WB12
341                    }
342                    _ => {
343                        take_curr = false;
344                        break;
345                    }
346                },
347                Katakana => match cat {
348                    wd::WC_Katakana => Katakana,         // rule WB13
349                    wd::WC_ExtendNumLet => ExtendNumLet, // rule WB13a
350                    _ => {
351                        take_curr = false;
352                        break;
353                    }
354                },
355                ExtendNumLet => match cat {
356                    wd::WC_ExtendNumLet => ExtendNumLet, // rule WB13a
357                    wd::WC_ALetter => Letter,            // rule WB13b
358                    wd::WC_Hebrew_Letter => HLetter,     // rule WB13b
359                    wd::WC_Numeric => Numeric,           // rule WB13b
360                    wd::WC_Katakana => Katakana,         // rule WB13b
361                    _ => {
362                        take_curr = false;
363                        break;
364                    }
365                },
366                Regional(RegionalState::Full) => {
367                    // if it reaches here we've gone too far,
368                    // a full flag can only compose with ZWJ/Extend/Format
369                    // proceeding it.
370                    take_curr = false;
371                    break;
372                }
373                Regional(RegionalState::Half) => match cat {
374                    wd::WC_Regional_Indicator => Regional(RegionalState::Full), // rule WB13c
375                    _ => {
376                        take_curr = false;
377                        break;
378                    }
379                },
380                Regional(_) => {
381                    unreachable!("RegionalState::Unknown should not occur on forward iteration")
382                }
383                Emoji => {
384                    // We already handle WB3c above. If you've reached this point, the emoji sequence is over.
385                    take_curr = false;
386                    break;
387                }
388                FormatExtend(t) => match t {
389                    // handle FormatExtends depending on what type
390                    RequireNumeric if cat == wd::WC_Numeric => Numeric, // rule WB11
391                    RequireLetter | AcceptQLetter if cat == wd::WC_ALetter => Letter, // rule WB7
392                    RequireLetter | AcceptQLetter if cat == wd::WC_Hebrew_Letter => HLetter, // WB7a
393                    RequireHLetter if cat == wd::WC_Hebrew_Letter => HLetter, // rule WB7b
394                    AcceptNone | AcceptQLetter => {
395                        take_curr = false; // emit all the Format|Extend characters
396                        take_cat = false;
397                        break;
398                    }
399                    _ => break, // rewind (in if statement below)
400                },
401            }
402        }
403
404        if let FormatExtend(t) = state {
405            // we were looking for something and didn't find it; we have to back up
406            if t == RequireLetter || t == RequireHLetter || t == RequireNumeric {
407                idx = saveidx;
408                cat = savecat;
409                take_curr = false;
410            }
411        }
412
413        self.cat = if take_curr {
414            idx = idx + self.string[idx..].chars().next().unwrap().len_utf8();
415            None
416        } else if take_cat {
417            Some(cat)
418        } else {
419            None
420        };
421
422        let retstr = &self.string[..idx];
423        self.string = &self.string[idx..];
424        Some(retstr)
425    }
426}
427
428impl<'a> DoubleEndedIterator for UWordBounds<'a> {
429    #[inline]
430    fn next_back(&mut self) -> Option<&'a str> {
431        use self::FormatExtendType::*;
432        use self::UWordBoundsState::*;
433        use crate::tables::word as wd;
434        if self.string.len() == 0 {
435            return None;
436        }
437
438        let mut take_curr = true;
439        let mut take_cat = true;
440        let mut idx = self.string.len();
441        idx -= self.string.chars().next_back().unwrap().len_utf8();
442        let mut previdx = idx;
443        let mut saveidx = idx;
444        let mut state = Start;
445        let mut savestate = Start;
446        let mut cat = wd::WC_Any;
447
448        let mut skipped_format_extend = false;
449
450        for (curr, ch) in self.string.char_indices().rev() {
451            previdx = idx;
452            idx = curr;
453
454            // if there's a category cached, grab it
455            cat = match self.catb {
456                None => wd::word_category(ch).2,
457                _ => self.catb.take().unwrap(),
458            };
459            take_cat = true;
460
461            // backward iterator over word boundaries. Mostly the same as the forward
462            // iterator, with two weirdnesses:
463            // (1) If we encounter a single quote in the Start state, we have to check for a
464            //     Hebrew Letter immediately before it.
465            // (2) Format and Extend char handling takes some gymnastics.
466
467            if cat == wd::WC_Extend || cat == wd::WC_Format || (cat == wd::WC_ZWJ && state != Zwj) {
468                // WB3c has more priority so we should not
469                // fold in that case
470                if match state {
471                    FormatExtend(_) | Start => false,
472                    _ => true,
473                } {
474                    saveidx = previdx;
475                    savestate = state;
476                    state = FormatExtend(AcceptNone);
477                }
478
479                if state != Start {
480                    continue;
481                }
482            } else if state == FormatExtend(AcceptNone) {
483                // finished a scan of some Format|Extend chars, restore previous state
484                state = savestate;
485                previdx = saveidx;
486                take_cat = false;
487                skipped_format_extend = true;
488            }
489
490            // Don't use `continue` in this match without updating `catb`
491            state = match state {
492                Start | FormatExtend(AcceptAny) => match cat {
493                    _ if is_emoji(ch) => Zwj,
494                    wd::WC_ALetter => Letter, // rule WB5, WB7, WB10, WB13b
495                    wd::WC_Hebrew_Letter => HLetter, // rule WB5, WB7, WB7c, WB10, WB13b
496                    wd::WC_Numeric => Numeric, // rule WB8, WB9, WB11, WB13b
497                    wd::WC_Katakana => Katakana, // rule WB13, WB13b
498                    wd::WC_ExtendNumLet => ExtendNumLet, // rule WB13a
499                    wd::WC_Regional_Indicator => Regional(RegionalState::Unknown), // rule WB13c
500                    // rule WB4:
501                    wd::WC_Extend | wd::WC_Format | wd::WC_ZWJ => FormatExtend(AcceptAny),
502                    wd::WC_Single_Quote => {
503                        saveidx = idx;
504                        FormatExtend(AcceptQLetter) // rule WB7a
505                    }
506                    wd::WC_WSegSpace => WSegSpace,
507                    wd::WC_CR | wd::WC_LF | wd::WC_Newline => {
508                        if state == Start {
509                            if cat == wd::WC_LF {
510                                idx -= match self.get_prev_cat(idx) {
511                                    Some(pcat) if pcat == wd::WC_CR => 1, // rule WB3
512                                    _ => 0,
513                                };
514                            }
515                        } else {
516                            take_curr = false;
517                        }
518                        break; // rule WB3a
519                    }
520                    _ => break, // rule WB999
521                },
522                Zwj => match cat {
523                    // rule WB3c
524                    wd::WC_ZWJ => FormatExtend(AcceptAny),
525                    _ => {
526                        take_curr = false;
527                        break;
528                    }
529                },
530                WSegSpace => match cat {
531                    // rule WB3d
532                    wd::WC_WSegSpace if !skipped_format_extend => WSegSpace,
533                    _ => {
534                        take_curr = false;
535                        break;
536                    }
537                },
538                Letter | HLetter => match cat {
539                    wd::WC_ALetter => Letter,            // rule WB5
540                    wd::WC_Hebrew_Letter => HLetter,     // rule WB5
541                    wd::WC_Numeric => Numeric,           // rule WB10
542                    wd::WC_ExtendNumLet => ExtendNumLet, // rule WB13b
543                    wd::WC_Double_Quote if state == HLetter => {
544                        saveidx = previdx;
545                        FormatExtend(RequireHLetter) // rule WB7c
546                    }
547                    wd::WC_MidLetter | wd::WC_MidNumLet | wd::WC_Single_Quote => {
548                        saveidx = previdx;
549                        FormatExtend(RequireLetter) // rule WB7
550                    }
551                    _ => {
552                        take_curr = false;
553                        break;
554                    }
555                },
556                Numeric => match cat {
557                    wd::WC_Numeric => Numeric,           // rule WB8
558                    wd::WC_ALetter => Letter,            // rule WB9
559                    wd::WC_Hebrew_Letter => HLetter,     // rule WB9
560                    wd::WC_ExtendNumLet => ExtendNumLet, // rule WB13b
561                    wd::WC_MidNum | wd::WC_MidNumLet | wd::WC_Single_Quote => {
562                        saveidx = previdx;
563                        FormatExtend(RequireNumeric) // rule WB11
564                    }
565                    _ => {
566                        take_curr = false;
567                        break;
568                    }
569                },
570                Katakana => match cat {
571                    wd::WC_Katakana => Katakana,         // rule WB13
572                    wd::WC_ExtendNumLet => ExtendNumLet, // rule WB13b
573                    _ => {
574                        take_curr = false;
575                        break;
576                    }
577                },
578                ExtendNumLet => match cat {
579                    wd::WC_ExtendNumLet => ExtendNumLet, // rule WB13a
580                    wd::WC_ALetter => Letter,            // rule WB13a
581                    wd::WC_Hebrew_Letter => HLetter,     // rule WB13a
582                    wd::WC_Numeric => Numeric,           // rule WB13a
583                    wd::WC_Katakana => Katakana,         // rule WB13a
584                    _ => {
585                        take_curr = false;
586                        break;
587                    }
588                },
589                Regional(mut regional_state) => match cat {
590                    // rule WB13c
591                    wd::WC_Regional_Indicator => {
592                        if regional_state == RegionalState::Unknown {
593                            let count = self.string[..previdx]
594                                .chars()
595                                .rev()
596                                .map(|c| wd::word_category(c).2)
597                                .filter(|&c| {
598                                    !(c == wd::WC_ZWJ || c == wd::WC_Extend || c == wd::WC_Format)
599                                })
600                                .take_while(|&c| c == wd::WC_Regional_Indicator)
601                                .count();
602                            regional_state = if count % 2 == 0 {
603                                RegionalState::Full
604                            } else {
605                                RegionalState::Half
606                            };
607                        }
608                        if regional_state == RegionalState::Full {
609                            take_curr = false;
610                            break;
611                        } else {
612                            Regional(RegionalState::Full)
613                        }
614                    }
615                    _ => {
616                        take_curr = false;
617                        break;
618                    }
619                },
620                Emoji => {
621                    if is_emoji(ch) {
622                        // rule WB3c
623                        Zwj
624                    } else {
625                        take_curr = false;
626                        break;
627                    }
628                }
629                FormatExtend(t) => match t {
630                    RequireNumeric if cat == wd::WC_Numeric => Numeric, // rule WB12
631                    RequireLetter if cat == wd::WC_ALetter => Letter,   // rule WB6
632                    RequireLetter if cat == wd::WC_Hebrew_Letter => HLetter, // rule WB6
633                    AcceptQLetter if cat == wd::WC_Hebrew_Letter => HLetter, // rule WB7a
634                    RequireHLetter if cat == wd::WC_Hebrew_Letter => HLetter, // rule WB7b
635                    _ => break,                                         // backtrack will happens
636                },
637            }
638        }
639
640        if let FormatExtend(t) = state {
641            // if we required something but didn't find it, backtrack
642            if t == RequireLetter
643                || t == RequireHLetter
644                || t == RequireNumeric
645                || t == AcceptNone
646                || t == AcceptQLetter
647            {
648                previdx = saveidx;
649                take_cat = false;
650                take_curr = false;
651            }
652        }
653
654        self.catb = if take_curr {
655            None
656        } else {
657            idx = previdx;
658            if take_cat {
659                Some(cat)
660            } else {
661                None
662            }
663        };
664
665        let retstr = &self.string[idx..];
666        self.string = &self.string[..idx];
667        Some(retstr)
668    }
669}
670
671impl<'a> UWordBounds<'a> {
672    #[inline]
673    /// View the underlying data (the part yet to be iterated) as a slice of the original string.
674    ///
675    /// ```rust
676    /// # use unicode_segmentation::UnicodeSegmentation;
677    /// let mut iter = "Hello world".split_word_bounds();
678    /// assert_eq!(iter.as_str(), "Hello world");
679    /// iter.next();
680    /// assert_eq!(iter.as_str(), " world");
681    /// iter.next();
682    /// assert_eq!(iter.as_str(), "world");
683    /// ```
684    pub fn as_str(&self) -> &'a str {
685        self.string
686    }
687
688    #[inline]
689    fn get_next_cat(&self, idx: usize) -> Option<WordCat> {
690        use crate::tables::word as wd;
691        let nidx = idx + self.string[idx..].chars().next().unwrap().len_utf8();
692        if nidx < self.string.len() {
693            let nch = self.string[nidx..].chars().next().unwrap();
694            Some(wd::word_category(nch).2)
695        } else {
696            None
697        }
698    }
699
700    #[inline]
701    fn get_prev_cat(&self, idx: usize) -> Option<WordCat> {
702        use crate::tables::word as wd;
703        if idx > 0 {
704            let nch = self.string[..idx].chars().next_back().unwrap();
705            Some(wd::word_category(nch).2)
706        } else {
707            None
708        }
709    }
710}
711
712#[inline]
713pub fn new_word_bounds<'b>(s: &'b str) -> UWordBounds<'b> {
714    UWordBounds {
715        string: s,
716        cat: None,
717        catb: None,
718    }
719}
720
721#[inline]
722pub fn new_word_bound_indices<'b>(s: &'b str) -> UWordBoundIndices<'b> {
723    UWordBoundIndices {
724        start_offset: s.as_ptr() as usize,
725        iter: new_word_bounds(s),
726    }
727}
728
729#[inline]
730fn has_alphanumeric(s: &&str) -> bool {
731    use crate::tables::util::is_alphanumeric;
732
733    s.chars().any(|c| is_alphanumeric(c))
734}
735
736#[inline]
737pub fn new_unicode_words<'b>(s: &'b str) -> UnicodeWords<'b> {
738    use super::UnicodeSegmentation;
739
740    UnicodeWords {
741        inner: s.split_word_bounds().filter(has_alphanumeric),
742    }
743}
744
745#[inline]
746pub fn new_unicode_word_indices<'b>(s: &'b str) -> UnicodeWordIndices<'b> {
747    use super::UnicodeSegmentation;
748
749    UnicodeWordIndices {
750        inner: s
751            .split_word_bound_indices()
752            .filter(|(_, c)| has_alphanumeric(c)),
753    }
754}