unicode_segmentation/
sentence.rs

1// Copyright 2012-2014 The Rust Project Developers. See the COPYRIGHT
2// file at the top-level directory of this distribution and at
3// http://rust-lang.org/COPYRIGHT.
4//
5// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
6// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
7// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
8// option. This file may not be copied, modified, or distributed
9// except according to those terms.
10
11use core::cmp;
12use core::iter::Filter;
13
14// All of the logic for forward iteration over sentences
15mod fwd {
16    use crate::tables::sentence::SentenceCat;
17    use core::cmp;
18
19    // Describe a parsed part of source string as described in this table:
20    // https://unicode.org/reports/tr29/#Default_Sentence_Boundaries
21    #[derive(Clone, Copy, PartialEq, Eq)]
22    enum StatePart {
23        Sot,
24        Eot,
25        Other,
26        CR,
27        LF,
28        Sep,
29        ATerm,
30        UpperLower,
31        ClosePlus,
32        SpPlus,
33        STerm,
34    }
35
36    #[derive(Clone, PartialEq, Eq)]
37    struct SentenceBreaksState(pub [StatePart; 4]);
38
39    const INITIAL_STATE: SentenceBreaksState = SentenceBreaksState([
40        StatePart::Sot,
41        StatePart::Sot,
42        StatePart::Sot,
43        StatePart::Sot,
44    ]);
45
46    #[derive(Clone)]
47    pub struct SentenceBreaks<'a> {
48        pub string: &'a str,
49        pos: usize,
50        state: SentenceBreaksState,
51    }
52
53    impl SentenceBreaksState {
54        // Attempt to advance the internal state by one part
55        // Whitespace and some punctutation will be collapsed
56        fn next(&self, cat: SentenceCat) -> SentenceBreaksState {
57            let &SentenceBreaksState(parts) = self;
58            let parts = match (parts[3], cat) {
59                (StatePart::ClosePlus, SentenceCat::SC_Close) => parts,
60                (StatePart::SpPlus, SentenceCat::SC_Sp) => parts,
61                _ => [
62                    parts[1],
63                    parts[2],
64                    parts[3],
65                    match cat {
66                        SentenceCat::SC_CR => StatePart::CR,
67                        SentenceCat::SC_LF => StatePart::LF,
68                        SentenceCat::SC_Sep => StatePart::Sep,
69                        SentenceCat::SC_ATerm => StatePart::ATerm,
70                        SentenceCat::SC_Upper | SentenceCat::SC_Lower => StatePart::UpperLower,
71                        SentenceCat::SC_Close => StatePart::ClosePlus,
72                        SentenceCat::SC_Sp => StatePart::SpPlus,
73                        SentenceCat::SC_STerm => StatePart::STerm,
74                        _ => StatePart::Other,
75                    },
76                ],
77            };
78            SentenceBreaksState(parts)
79        }
80
81        fn end(&self) -> SentenceBreaksState {
82            let &SentenceBreaksState(parts) = self;
83            SentenceBreaksState([parts[1], parts[2], parts[3], StatePart::Eot])
84        }
85
86        // Helper function to check if state head matches a single `StatePart`
87        fn match1(&self, part: StatePart) -> bool {
88            let &SentenceBreaksState(parts) = self;
89            part == parts[3]
90        }
91
92        // Helper function to check if first two `StateParts` in state match
93        // the given two
94        fn match2(&self, part1: StatePart, part2: StatePart) -> bool {
95            let &SentenceBreaksState(parts) = self;
96            part1 == parts[2] && part2 == parts[3]
97        }
98    }
99
100    // https://unicode.org/reports/tr29/#SB8
101    // TODO cache this, it is currently quadratic
102    fn match_sb8(state: &SentenceBreaksState, ahead: &str) -> bool {
103        let &SentenceBreaksState(parts) = state;
104        let mut idx = if parts[3] == StatePart::SpPlus { 2 } else { 3 };
105        if parts[idx] == StatePart::ClosePlus {
106            idx -= 1
107        }
108
109        if parts[idx] == StatePart::ATerm {
110            use crate::tables::sentence as se;
111
112            for next_char in ahead.chars() {
113                //( ¬(OLetter | Upper | Lower | ParaSep | SATerm) )* Lower
114                match se::sentence_category(next_char).2 {
115                    se::SC_Lower => return true,
116                    se::SC_OLetter
117                    | se::SC_Upper
118                    | se::SC_Sep
119                    | se::SC_CR
120                    | se::SC_LF
121                    | se::SC_STerm
122                    | se::SC_ATerm => return false,
123                    _ => continue,
124                }
125            }
126        }
127
128        false
129    }
130
131    // https://unicode.org/reports/tr29/#SB8a
132    fn match_sb8a(state: &SentenceBreaksState) -> bool {
133        // SATerm Close* Sp*
134        let &SentenceBreaksState(parts) = state;
135        let mut idx = if parts[3] == StatePart::SpPlus { 2 } else { 3 };
136        if parts[idx] == StatePart::ClosePlus {
137            idx -= 1
138        }
139        parts[idx] == StatePart::STerm || parts[idx] == StatePart::ATerm
140    }
141
142    // https://unicode.org/reports/tr29/#SB9
143    fn match_sb9(state: &SentenceBreaksState) -> bool {
144        // SATerm Close*
145        let &SentenceBreaksState(parts) = state;
146        let idx = if parts[3] == StatePart::ClosePlus {
147            2
148        } else {
149            3
150        };
151        parts[idx] == StatePart::STerm || parts[idx] == StatePart::ATerm
152    }
153
154    // https://unicode.org/reports/tr29/#SB11
155    fn match_sb11(state: &SentenceBreaksState) -> bool {
156        // SATerm Close* Sp* ParaSep?
157        let &SentenceBreaksState(parts) = state;
158        let mut idx = match parts[3] {
159            StatePart::Sep | StatePart::CR | StatePart::LF => 2,
160            _ => 3,
161        };
162
163        if parts[idx] == StatePart::SpPlus {
164            idx -= 1
165        }
166        if parts[idx] == StatePart::ClosePlus {
167            idx -= 1
168        }
169
170        parts[idx] == StatePart::STerm || parts[idx] == StatePart::ATerm
171    }
172
173    impl<'a> Iterator for SentenceBreaks<'a> {
174        // Returns the index of the character which follows a break
175        type Item = usize;
176
177        #[inline]
178        fn size_hint(&self) -> (usize, Option<usize>) {
179            let slen = self.string.len();
180            // A sentence could be one character
181            (cmp::min(slen, 2), Some(slen + 1))
182        }
183
184        #[inline]
185        fn next(&mut self) -> Option<usize> {
186            use crate::tables::sentence as se;
187
188            for next_char in self.string[self.pos..].chars() {
189                let position_before = self.pos;
190                let state_before = self.state.clone();
191
192                let next_cat = se::sentence_category(next_char).2;
193
194                self.pos += next_char.len_utf8();
195                self.state = self.state.next(next_cat);
196
197                match next_cat {
198                    // SB1 https://unicode.org/reports/tr29/#SB1
199                    _ if state_before.match1(StatePart::Sot) => return Some(position_before),
200
201                    // SB2 is handled when inner iterator (chars) is finished
202
203                    // SB3 https://unicode.org/reports/tr29/#SB3
204                    SentenceCat::SC_LF if state_before.match1(StatePart::CR) => continue,
205
206                    // SB4 https://unicode.org/reports/tr29/#SB4
207                    _ if state_before.match1(StatePart::Sep)
208                        || state_before.match1(StatePart::CR)
209                        || state_before.match1(StatePart::LF) =>
210                    {
211                        return Some(position_before)
212                    }
213
214                    // SB5 https://unicode.org/reports/tr29/#SB5
215                    SentenceCat::SC_Extend | SentenceCat::SC_Format => self.state = state_before,
216
217                    // SB6 https://unicode.org/reports/tr29/#SB6
218                    SentenceCat::SC_Numeric if state_before.match1(StatePart::ATerm) => continue,
219
220                    // SB7 https://unicode.org/reports/tr29/#SB7
221                    SentenceCat::SC_Upper
222                        if state_before.match2(StatePart::UpperLower, StatePart::ATerm) =>
223                    {
224                        continue
225                    }
226
227                    // SB8 https://unicode.org/reports/tr29/#SB8
228                    _ if match_sb8(&state_before, &self.string[position_before..]) => continue,
229
230                    // SB8a https://unicode.org/reports/tr29/#SB8a
231                    SentenceCat::SC_SContinue | SentenceCat::SC_STerm | SentenceCat::SC_ATerm
232                        if match_sb8a(&state_before) =>
233                    {
234                        continue
235                    }
236
237                    // SB9 https://unicode.org/reports/tr29/#SB9
238                    SentenceCat::SC_Close
239                    | SentenceCat::SC_Sp
240                    | SentenceCat::SC_Sep
241                    | SentenceCat::SC_CR
242                    | SentenceCat::SC_LF
243                        if match_sb9(&state_before) =>
244                    {
245                        continue
246                    }
247
248                    // SB10 https://unicode.org/reports/tr29/#SB10
249                    SentenceCat::SC_Sp
250                    | SentenceCat::SC_Sep
251                    | SentenceCat::SC_CR
252                    | SentenceCat::SC_LF
253                        if match_sb8a(&state_before) =>
254                    {
255                        continue
256                    }
257
258                    // SB11 https://unicode.org/reports/tr29/#SB11
259                    _ if match_sb11(&state_before) => return Some(position_before),
260
261                    // SB998 https://unicode.org/reports/tr29/#SB998
262                    _ => continue,
263                }
264            }
265
266            // SB2 https://unicode.org/reports/tr29/#SB2
267            if self.state.match1(StatePart::Sot) {
268                None
269            } else if self.state.match1(StatePart::Eot) {
270                None
271            } else {
272                self.state = self.state.end();
273                Some(self.pos)
274            }
275        }
276    }
277
278    pub fn new_sentence_breaks<'a>(source: &'a str) -> SentenceBreaks<'a> {
279        SentenceBreaks {
280            string: source,
281            pos: 0,
282            state: INITIAL_STATE,
283        }
284    }
285}
286
287/// An iterator over the substrings of a string which, after splitting the string on
288/// [sentence boundaries](http://www.unicode.org/reports/tr29/#Sentence_Boundaries),
289/// contain any characters with the
290/// [Alphabetic](http://unicode.org/reports/tr44/#Alphabetic)
291/// property, or with
292/// [General_Category=Number](http://unicode.org/reports/tr44/#General_Category_Values).
293///
294/// This struct is created by the [`unicode_sentences`] method on the [`UnicodeSegmentation`]
295/// trait. See its documentation for more.
296///
297/// [`unicode_sentences`]: trait.UnicodeSegmentation.html#tymethod.unicode_sentences
298/// [`UnicodeSegmentation`]: trait.UnicodeSegmentation.html
299#[derive(Clone)]
300pub struct UnicodeSentences<'a> {
301    inner: Filter<USentenceBounds<'a>, fn(&&str) -> bool>,
302}
303
304/// External iterator for a string's
305/// [sentence boundaries](http://www.unicode.org/reports/tr29/#Sentence_Boundaries).
306///
307/// This struct is created by the [`split_sentence_bounds`] method on the [`UnicodeSegmentation`]
308/// trait. See its documentation for more.
309///
310/// [`split_sentence_bounds`]: trait.UnicodeSegmentation.html#tymethod.split_sentence_bounds
311/// [`UnicodeSegmentation`]: trait.UnicodeSegmentation.html
312#[derive(Clone)]
313pub struct USentenceBounds<'a> {
314    iter: fwd::SentenceBreaks<'a>,
315    sentence_start: Option<usize>,
316}
317
318/// External iterator for sentence boundaries and byte offsets.
319///
320/// This struct is created by the [`split_sentence_bound_indices`] method on the
321/// [`UnicodeSegmentation`] trait. See its documentation for more.
322///
323/// [`split_sentence_bound_indices`]: trait.UnicodeSegmentation.html#tymethod.split_sentence_bound_indices
324/// [`UnicodeSegmentation`]: trait.UnicodeSegmentation.html
325#[derive(Clone)]
326pub struct USentenceBoundIndices<'a> {
327    start_offset: usize,
328    iter: USentenceBounds<'a>,
329}
330
331#[inline]
332pub fn new_sentence_bounds<'a>(source: &'a str) -> USentenceBounds<'a> {
333    USentenceBounds {
334        iter: fwd::new_sentence_breaks(source),
335        sentence_start: None,
336    }
337}
338
339#[inline]
340pub fn new_sentence_bound_indices<'a>(source: &'a str) -> USentenceBoundIndices<'a> {
341    USentenceBoundIndices {
342        start_offset: source.as_ptr() as usize,
343        iter: new_sentence_bounds(source),
344    }
345}
346
347#[inline]
348pub fn new_unicode_sentences<'b>(s: &'b str) -> UnicodeSentences<'b> {
349    use super::UnicodeSegmentation;
350    use crate::tables::util::is_alphanumeric;
351
352    fn has_alphanumeric(s: &&str) -> bool {
353        s.chars().any(|c| is_alphanumeric(c))
354    }
355    let has_alphanumeric: fn(&&str) -> bool = has_alphanumeric; // coerce to fn pointer
356
357    UnicodeSentences {
358        inner: s.split_sentence_bounds().filter(has_alphanumeric),
359    }
360}
361
362impl<'a> Iterator for UnicodeSentences<'a> {
363    type Item = &'a str;
364
365    #[inline]
366    fn next(&mut self) -> Option<&'a str> {
367        self.inner.next()
368    }
369}
370
371impl<'a> Iterator for USentenceBounds<'a> {
372    type Item = &'a str;
373
374    #[inline]
375    fn size_hint(&self) -> (usize, Option<usize>) {
376        let (lower, upper) = self.iter.size_hint();
377        (cmp::max(0, lower - 1), upper.map(|u| cmp::max(0, u - 1)))
378    }
379
380    #[inline]
381    fn next(&mut self) -> Option<&'a str> {
382        if self.sentence_start == None {
383            if let Some(start_pos) = self.iter.next() {
384                self.sentence_start = Some(start_pos)
385            } else {
386                return None;
387            }
388        }
389
390        if let Some(break_pos) = self.iter.next() {
391            let start_pos = self.sentence_start.unwrap();
392            let sentence = &self.iter.string[start_pos..break_pos];
393            self.sentence_start = Some(break_pos);
394            Some(sentence)
395        } else {
396            None
397        }
398    }
399}
400
401impl<'a> Iterator for USentenceBoundIndices<'a> {
402    type Item = (usize, &'a str);
403
404    #[inline]
405    fn next(&mut self) -> Option<(usize, &'a str)> {
406        self.iter
407            .next()
408            .map(|s| (s.as_ptr() as usize - self.start_offset, s))
409    }
410
411    #[inline]
412    fn size_hint(&self) -> (usize, Option<usize>) {
413        self.iter.size_hint()
414    }
415}