unicode_segmentation/
sentence.rs

1// Copyright 2012-2014 The Rust Project Developers. See the COPYRIGHT
2// file at the top-level directory of this distribution and at
3// http://rust-lang.org/COPYRIGHT.
4//
5// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
6// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
7// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
8// option. This file may not be copied, modified, or distributed
9// except according to those terms.
10
11use core::cmp;
12use core::iter::Filter;
13
14// All of the logic for forward iteration over sentences
15mod fwd {
16    use crate::tables::sentence::SentenceCat;
17    use core::cmp;
18
19    // Describe a parsed part of source string as described in this table:
20    // https://unicode.org/reports/tr29/#Default_Sentence_Boundaries
21    #[derive(Debug, Clone, Copy, PartialEq, Eq)]
22    enum StatePart {
23        Sot,
24        Eot,
25        Other,
26        CR,
27        LF,
28        Sep,
29        ATerm,
30        UpperLower,
31        ClosePlus,
32        SpPlus,
33        STerm,
34    }
35
36    #[derive(Debug, Clone, PartialEq, Eq)]
37    struct SentenceBreaksState(pub [StatePart; 4]);
38
39    const INITIAL_STATE: SentenceBreaksState = SentenceBreaksState([
40        StatePart::Sot,
41        StatePart::Sot,
42        StatePart::Sot,
43        StatePart::Sot,
44    ]);
45
46    #[derive(Debug, Clone)]
47    pub struct SentenceBreaks<'a> {
48        pub string: &'a str,
49        pos: usize,
50        state: SentenceBreaksState,
51    }
52
53    impl SentenceBreaksState {
54        // Attempt to advance the internal state by one part
55        // Whitespace and some punctutation will be collapsed
56        fn next(&self, cat: SentenceCat) -> SentenceBreaksState {
57            let &SentenceBreaksState(parts) = self;
58            let parts = match (parts[3], cat) {
59                (StatePart::ClosePlus, SentenceCat::SC_Close) => parts,
60                (StatePart::SpPlus, SentenceCat::SC_Sp) => parts,
61                _ => [
62                    parts[1],
63                    parts[2],
64                    parts[3],
65                    match cat {
66                        SentenceCat::SC_CR => StatePart::CR,
67                        SentenceCat::SC_LF => StatePart::LF,
68                        SentenceCat::SC_Sep => StatePart::Sep,
69                        SentenceCat::SC_ATerm => StatePart::ATerm,
70                        SentenceCat::SC_Upper | SentenceCat::SC_Lower => StatePart::UpperLower,
71                        SentenceCat::SC_Close => StatePart::ClosePlus,
72                        SentenceCat::SC_Sp => StatePart::SpPlus,
73                        SentenceCat::SC_STerm => StatePart::STerm,
74                        _ => StatePart::Other,
75                    },
76                ],
77            };
78            SentenceBreaksState(parts)
79        }
80
81        fn end(&self) -> SentenceBreaksState {
82            let &SentenceBreaksState(parts) = self;
83            SentenceBreaksState([parts[1], parts[2], parts[3], StatePart::Eot])
84        }
85
86        // Helper function to check if state head matches a single `StatePart`
87        fn match1(&self, part: StatePart) -> bool {
88            let &SentenceBreaksState(parts) = self;
89            part == parts[3]
90        }
91
92        // Helper function to check if first two `StateParts` in state match
93        // the given two
94        fn match2(&self, part1: StatePart, part2: StatePart) -> bool {
95            let &SentenceBreaksState(parts) = self;
96            part1 == parts[2] && part2 == parts[3]
97        }
98    }
99
100    // https://unicode.org/reports/tr29/#SB8
101    // TODO cache this, it is currently quadratic
102    fn match_sb8(state: &SentenceBreaksState, ahead: &str) -> bool {
103        let &SentenceBreaksState(parts) = state;
104        let mut idx = if parts[3] == StatePart::SpPlus { 2 } else { 3 };
105        if parts[idx] == StatePart::ClosePlus {
106            idx -= 1
107        }
108
109        if parts[idx] == StatePart::ATerm {
110            use crate::tables::sentence as se;
111
112            for next_char in ahead.chars() {
113                //( ¬(OLetter | Upper | Lower | ParaSep | SATerm) )* Lower
114                match se::sentence_category(next_char).2 {
115                    se::SC_Lower => return true,
116                    se::SC_OLetter
117                    | se::SC_Upper
118                    | se::SC_Sep
119                    | se::SC_CR
120                    | se::SC_LF
121                    | se::SC_STerm
122                    | se::SC_ATerm => return false,
123                    _ => continue,
124                }
125            }
126        }
127
128        false
129    }
130
131    // https://unicode.org/reports/tr29/#SB8a
132    fn match_sb8a(state: &SentenceBreaksState) -> bool {
133        // SATerm Close* Sp*
134        let &SentenceBreaksState(parts) = state;
135        let mut idx = if parts[3] == StatePart::SpPlus { 2 } else { 3 };
136        if parts[idx] == StatePart::ClosePlus {
137            idx -= 1
138        }
139        parts[idx] == StatePart::STerm || parts[idx] == StatePart::ATerm
140    }
141
142    // https://unicode.org/reports/tr29/#SB9
143    fn match_sb9(state: &SentenceBreaksState) -> bool {
144        // SATerm Close*
145        let &SentenceBreaksState(parts) = state;
146        let idx = if parts[3] == StatePart::ClosePlus {
147            2
148        } else {
149            3
150        };
151        parts[idx] == StatePart::STerm || parts[idx] == StatePart::ATerm
152    }
153
154    // https://unicode.org/reports/tr29/#SB11
155    fn match_sb11(state: &SentenceBreaksState) -> bool {
156        // SATerm Close* Sp* ParaSep?
157        let &SentenceBreaksState(parts) = state;
158        let mut idx = match parts[3] {
159            StatePart::Sep | StatePart::CR | StatePart::LF => 2,
160            _ => 3,
161        };
162
163        if parts[idx] == StatePart::SpPlus {
164            idx -= 1
165        }
166        if parts[idx] == StatePart::ClosePlus {
167            idx -= 1
168        }
169
170        parts[idx] == StatePart::STerm || parts[idx] == StatePart::ATerm
171    }
172
173    impl<'a> Iterator for SentenceBreaks<'a> {
174        // Returns the index of the character which follows a break
175        type Item = usize;
176
177        #[inline]
178        fn size_hint(&self) -> (usize, Option<usize>) {
179            let slen = self.string.len();
180            // A sentence could be one character
181            (cmp::min(slen, 2), Some(slen + 1))
182        }
183
184        #[inline]
185        fn next(&mut self) -> Option<usize> {
186            use crate::tables::sentence as se;
187
188            for next_char in self.string[self.pos..].chars() {
189                let position_before = self.pos;
190                let state_before = self.state.clone();
191
192                let next_cat = se::sentence_category(next_char).2;
193
194                self.pos += next_char.len_utf8();
195                self.state = self.state.next(next_cat);
196
197                match next_cat {
198                    // SB1 https://unicode.org/reports/tr29/#SB1
199                    _ if state_before.match1(StatePart::Sot) => return Some(position_before),
200
201                    // SB2 is handled when inner iterator (chars) is finished
202
203                    // SB3 https://unicode.org/reports/tr29/#SB3
204                    SentenceCat::SC_LF if state_before.match1(StatePart::CR) => continue,
205
206                    // SB4 https://unicode.org/reports/tr29/#SB4
207                    _ if state_before.match1(StatePart::Sep)
208                        || state_before.match1(StatePart::CR)
209                        || state_before.match1(StatePart::LF) =>
210                    {
211                        return Some(position_before)
212                    }
213
214                    // SB5 https://unicode.org/reports/tr29/#SB5
215                    SentenceCat::SC_Extend | SentenceCat::SC_Format => self.state = state_before,
216
217                    // SB6 https://unicode.org/reports/tr29/#SB6
218                    SentenceCat::SC_Numeric if state_before.match1(StatePart::ATerm) => continue,
219
220                    // SB7 https://unicode.org/reports/tr29/#SB7
221                    SentenceCat::SC_Upper
222                        if state_before.match2(StatePart::UpperLower, StatePart::ATerm) =>
223                    {
224                        continue
225                    }
226
227                    // SB8 https://unicode.org/reports/tr29/#SB8
228                    _ if match_sb8(&state_before, &self.string[position_before..]) => continue,
229
230                    // SB8a https://unicode.org/reports/tr29/#SB8a
231                    SentenceCat::SC_SContinue | SentenceCat::SC_STerm | SentenceCat::SC_ATerm
232                        if match_sb8a(&state_before) =>
233                    {
234                        continue
235                    }
236
237                    // SB9 https://unicode.org/reports/tr29/#SB9
238                    SentenceCat::SC_Close
239                    | SentenceCat::SC_Sp
240                    | SentenceCat::SC_Sep
241                    | SentenceCat::SC_CR
242                    | SentenceCat::SC_LF
243                        if match_sb9(&state_before) =>
244                    {
245                        continue
246                    }
247
248                    // SB10 https://unicode.org/reports/tr29/#SB10
249                    SentenceCat::SC_Sp
250                    | SentenceCat::SC_Sep
251                    | SentenceCat::SC_CR
252                    | SentenceCat::SC_LF
253                        if match_sb8a(&state_before) =>
254                    {
255                        continue
256                    }
257
258                    // SB11 https://unicode.org/reports/tr29/#SB11
259                    _ if match_sb11(&state_before) => return Some(position_before),
260
261                    // SB998 https://unicode.org/reports/tr29/#SB998
262                    _ => continue,
263                }
264            }
265
266            // SB2 https://unicode.org/reports/tr29/#SB2
267            if self.state.match1(StatePart::Sot) || self.state.match1(StatePart::Eot) {
268                None
269            } else {
270                self.state = self.state.end();
271                Some(self.pos)
272            }
273        }
274    }
275
276    pub fn new_sentence_breaks(source: &str) -> SentenceBreaks<'_> {
277        SentenceBreaks {
278            string: source,
279            pos: 0,
280            state: INITIAL_STATE,
281        }
282    }
283}
284
285/// An iterator over the substrings of a string which, after splitting the string on
286/// [sentence boundaries](http://www.unicode.org/reports/tr29/#Sentence_Boundaries),
287/// contain any characters with the
288/// [Alphabetic](http://unicode.org/reports/tr44/#Alphabetic)
289/// property, or with
290/// [General_Category=Number](http://unicode.org/reports/tr44/#General_Category_Values).
291///
292/// This struct is created by the [`unicode_sentences`] method on the [`UnicodeSegmentation`]
293/// trait. See its documentation for more.
294///
295/// [`unicode_sentences`]: trait.UnicodeSegmentation.html#tymethod.unicode_sentences
296/// [`UnicodeSegmentation`]: trait.UnicodeSegmentation.html
297#[derive(Debug, Clone)]
298pub struct UnicodeSentences<'a> {
299    inner: Filter<USentenceBounds<'a>, fn(&&str) -> bool>,
300}
301
302/// External iterator for a string's
303/// [sentence boundaries](http://www.unicode.org/reports/tr29/#Sentence_Boundaries).
304///
305/// This struct is created by the [`split_sentence_bounds`] method on the [`UnicodeSegmentation`]
306/// trait. See its documentation for more.
307///
308/// [`split_sentence_bounds`]: trait.UnicodeSegmentation.html#tymethod.split_sentence_bounds
309/// [`UnicodeSegmentation`]: trait.UnicodeSegmentation.html
310#[derive(Debug, Clone)]
311pub struct USentenceBounds<'a> {
312    iter: fwd::SentenceBreaks<'a>,
313    sentence_start: Option<usize>,
314}
315
316/// External iterator for sentence boundaries and byte offsets.
317///
318/// This struct is created by the [`split_sentence_bound_indices`] method on the
319/// [`UnicodeSegmentation`] trait. See its documentation for more.
320///
321/// [`split_sentence_bound_indices`]: trait.UnicodeSegmentation.html#tymethod.split_sentence_bound_indices
322/// [`UnicodeSegmentation`]: trait.UnicodeSegmentation.html
323#[derive(Debug, Clone)]
324pub struct USentenceBoundIndices<'a> {
325    start_offset: usize,
326    iter: USentenceBounds<'a>,
327}
328
329#[inline]
330pub fn new_sentence_bounds(source: &str) -> USentenceBounds<'_> {
331    USentenceBounds {
332        iter: fwd::new_sentence_breaks(source),
333        sentence_start: None,
334    }
335}
336
337#[inline]
338pub fn new_sentence_bound_indices(source: &str) -> USentenceBoundIndices<'_> {
339    USentenceBoundIndices {
340        start_offset: source.as_ptr() as usize,
341        iter: new_sentence_bounds(source),
342    }
343}
344
345#[inline]
346pub fn new_unicode_sentences(s: &str) -> UnicodeSentences<'_> {
347    use super::UnicodeSegmentation;
348    use crate::tables::util::is_alphanumeric;
349
350    fn has_alphanumeric(s: &&str) -> bool {
351        s.chars().any(is_alphanumeric)
352    }
353    let has_alphanumeric: fn(&&str) -> bool = has_alphanumeric; // coerce to fn pointer
354
355    UnicodeSentences {
356        inner: s.split_sentence_bounds().filter(has_alphanumeric),
357    }
358}
359
360impl<'a> Iterator for UnicodeSentences<'a> {
361    type Item = &'a str;
362
363    #[inline]
364    fn next(&mut self) -> Option<&'a str> {
365        self.inner.next()
366    }
367
368    #[inline]
369    fn size_hint(&self) -> (usize, Option<usize>) {
370        self.inner.size_hint()
371    }
372}
373
374impl<'a> Iterator for USentenceBounds<'a> {
375    type Item = &'a str;
376
377    #[inline]
378    fn size_hint(&self) -> (usize, Option<usize>) {
379        let (lower, upper) = self.iter.size_hint();
380        (cmp::max(0, lower - 1), upper.map(|u| cmp::max(0, u - 1)))
381    }
382
383    #[inline]
384    fn next(&mut self) -> Option<&'a str> {
385        if self.sentence_start.is_none() {
386            if let Some(start_pos) = self.iter.next() {
387                self.sentence_start = Some(start_pos)
388            } else {
389                return None;
390            }
391        }
392
393        if let Some(break_pos) = self.iter.next() {
394            let start_pos = self.sentence_start.unwrap();
395            let sentence = &self.iter.string[start_pos..break_pos];
396            self.sentence_start = Some(break_pos);
397            Some(sentence)
398        } else {
399            None
400        }
401    }
402}
403
404impl<'a> Iterator for USentenceBoundIndices<'a> {
405    type Item = (usize, &'a str);
406
407    #[inline]
408    fn next(&mut self) -> Option<(usize, &'a str)> {
409        self.iter
410            .next()
411            .map(|s| (s.as_ptr() as usize - self.start_offset, s))
412    }
413
414    #[inline]
415    fn size_hint(&self) -> (usize, Option<usize>) {
416        self.iter.size_hint()
417    }
418}
unicode_segmentation/sentence.rs

unicode_segmentation/
sentence.rs