encode_unicode/
utf16_iterators.rs

1/* Copyright 2016 The encode_unicode Developers
2 *
3 * Licensed under the Apache License, Version 2.0, <LICENSE-APACHE or
4 * http://apache.org/licenses/LICENSE-2.0> or the MIT license <LICENSE-MIT or
5 * http://opensource.org/licenses/MIT>, at your option. This file may not be
6 * copied, modified, or distributed except according to those terms.
7 */
8
9use traits::CharExt;
10use utf16_char::Utf16Char;
11use errors::EmptyStrError;
12extern crate core;
13use self::core::fmt;
14use self::core::borrow::Borrow;
15
// Sentinel values marking a field as consumed or empty.
// Both are invalid in the position they guard: 0xDC00 is a trailing
// surrogate and so can never be a first unit, and a second unit is always
// a trailing surrogate and so can never be zero.
const FIRST_USED: u16 = 0xDC00;
const SECOND_USED: u16 = 0x0000;
19
20/// Iterate over the units of the UTF-16 representation of a codepoint.
21#[derive(Clone)]
22pub struct Utf16Iterator {
23    first: u16,
24    second: u16,
25}
26impl From<char> for Utf16Iterator {
27    fn from(c: char) -> Self {
28        let (first, second) = c.to_utf16_tuple();
29        Utf16Iterator{ first: first,  second: second.unwrap_or(SECOND_USED) }
30    }
31}
32impl From<Utf16Char> for Utf16Iterator {
33    fn from(uc: Utf16Char) -> Self {
34        let (first, second) = uc.to_tuple();
35        Utf16Iterator{ first: first,  second: second.unwrap_or(SECOND_USED) }
36    }
37}
38impl Iterator for Utf16Iterator {
39    type Item=u16;
40    fn next(&mut self) -> Option<u16> {
41        match (self.first, self.second) {
42            (FIRST_USED, SECOND_USED)  =>  {                            None        },
43            (FIRST_USED, second     )  =>  {self.second = SECOND_USED;  Some(second)},
44            (first     ,      _     )  =>  {self.first = FIRST_USED;    Some(first )},
45        }
46    }
47    fn size_hint(&self) -> (usize, Option<usize>) {
48        (self.len(), Some(self.len()))
49    }
50}
51impl ExactSizeIterator for Utf16Iterator {
52    fn len(&self) -> usize {
53        (if self.first == FIRST_USED {0} else {1}) +
54        (if self.second == SECOND_USED {0} else {1})
55    }
56}
57impl fmt::Debug for Utf16Iterator {
58    fn fmt(&self,  fmtr: &mut fmt::Formatter) -> fmt::Result {
59        let mut clone = self.clone();
60        match (clone.next(), clone.next()) {
61            (Some(one), None)  => write!(fmtr, "[{}]", one),
62            (Some(a), Some(b)) => write!(fmtr, "[{}, {}]", a, b),
63            (None,  _)         => write!(fmtr, "[]"),
64        }
65    }
66}
67
68
69
70/// Converts an iterator of `Utf16Char` (or `&Utf16Char`)
71/// to an iterator of `u16`s.  
72/// Is equivalent to calling `.flat_map()` on the original iterator,
73/// but the returned iterator is about twice as fast.
74///
75/// The exact number of units cannot be known in advance, but `size_hint()`
76/// gives the possible range.
77///
78/// # Examples
79///
80/// From iterator of values:
81///
82/// ```
83/// use encode_unicode::{iter_units, CharExt};
84///
85/// let iterator = "foo".chars().map(|c| c.to_utf16() );
86/// let mut units = [0; 4];
87/// for (u,dst) in iter_units(iterator).zip(&mut units) {*dst=u;}
88/// assert_eq!(units, ['f' as u16, 'o' as u16, 'o' as u16, 0]);
89/// ```
90///
91/// From iterator of references:
92///
93#[cfg_attr(feature="std", doc=" ```")]
94#[cfg_attr(not(feature="std"), doc=" ```no_compile")]
95/// use encode_unicode::{iter_units, CharExt, Utf16Char};
96///
97/// // (💣 takes two units)
98/// let chars: Vec<Utf16Char> = "💣 bomb 💣".chars().map(|c| c.to_utf16() ).collect();
99/// let units: Vec<u16> = iter_units(&chars).collect();
100/// let flat_map: Vec<u16> = chars.iter().flat_map(|u16c| *u16c ).collect();
101/// assert_eq!(units, flat_map);
102/// ```
103pub fn iter_units<U:Borrow<Utf16Char>, I:IntoIterator<Item=U>>
104(iterable: I) -> Utf16CharSplitter<U, I::IntoIter> {
105    Utf16CharSplitter{ inner: iterable.into_iter(),  prev_second: 0 }
106}
107
108/// The iterator type returned by `iter_units()`
109#[derive(Clone)]
110pub struct Utf16CharSplitter<U:Borrow<Utf16Char>, I:Iterator<Item=U>> {
111    inner: I,
112    prev_second: u16,
113}
114impl<I:Iterator<Item=Utf16Char>> From<I> for Utf16CharSplitter<Utf16Char,I> {
115    /// A less generic constructor than `iter_units()`
116    fn from(iter: I) -> Self {
117        iter_units(iter)
118    }
119}
120impl<U:Borrow<Utf16Char>, I:Iterator<Item=U>> Utf16CharSplitter<U,I> {
121    /// Extracts the source iterator.
122    ///
123    /// Note that `iter_units(iter.into_inner())` is not a no-op:  
124    /// If the last returned unit from `next()` was a leading surrogate,
125    /// the trailing surrogate is lost.
126    pub fn into_inner(self) -> I {
127        self.inner
128    }
129}
130impl<U:Borrow<Utf16Char>, I:Iterator<Item=U>> Iterator for Utf16CharSplitter<U,I> {
131    type Item = u16;
132    fn next(&mut self) -> Option<Self::Item> {
133        if self.prev_second == 0 {
134            self.inner.next().map(|u16c| {
135                let (first, second) = u16c.borrow().to_tuple();
136                self.prev_second = second.unwrap_or(0);
137                first
138            })
139        } else {
140            let prev_second = self.prev_second;
141            self.prev_second = 0;
142            Some(prev_second)
143        }
144    }
145    fn size_hint(&self) -> (usize,Option<usize>) {
146        // Doesn't need to handle unlikely overflows correctly because
147        // size_hint() cannot be relied upon anyway. (the trait isn't unsafe)
148        let (min, max) = self.inner.size_hint();
149        let add = if self.prev_second == 0 {0} else {1};
150        (min.wrapping_add(add), max.map(|max| max.wrapping_mul(2).wrapping_add(add) ))
151    }
152}
153
154
155
/// An iterator over the codepoints in a `str` represented as `Utf16Char`,
/// each paired with the byte offset at which it starts in the `str`.
#[derive(Clone)]
pub struct Utf16CharIndices<'a>{
    // The source `str`; shortened from the end by `next_back()`.
    str: &'a str,
    // Byte offset into `str` of the codepoint `next()` will return.
    index: usize,
}
impl<'a> From<&'a str> for Utf16CharIndices<'a> {
    /// Creates an iterator positioned at the start of `s`.
    fn from(s: &'a str) -> Self {
        Utf16CharIndices{index: 0, str: s}
    }
}
167impl<'a> Utf16CharIndices<'a> {
168    /// Extract the remainder of the source `str`.
169    ///
170    /// # Examples
171    ///
172    /// ```
173    /// use encode_unicode::{StrExt, Utf16Char};
174    /// let mut iter = "abc".utf16char_indices();
175    /// assert_eq!(iter.next_back(), Some((2, Utf16Char::from('c'))));
176    /// assert_eq!(iter.next(), Some((0, Utf16Char::from('a'))));
177    /// assert_eq!(iter.as_str(), "b");
178    /// ```
179    pub fn as_str(&self) -> &'a str {
180        &self.str[self.index..]
181    }
182}
183impl<'a> Iterator for Utf16CharIndices<'a> {
184    type Item = (usize,Utf16Char);
185    fn next(&mut self) -> Option<(usize,Utf16Char)> {
186        match Utf16Char::from_str_start(&self.str[self.index..]) {
187            Ok((u16c, bytes)) => {
188                let item = (self.index, u16c);
189                self.index += bytes;
190                Some(item)
191            },
192            Err(EmptyStrError) => None
193        }
194    }
195    fn size_hint(&self) -> (usize,Option<usize>) {
196        let len = self.str.len() - self.index;
197        // For len+3 to overflow, the slice must fill all but two bytes of
198        // addressable memory, and size_hint() doesn't need to be correct.
199        (len.wrapping_add(3)/4, Some(len))
200    }
201}
202impl<'a> DoubleEndedIterator for Utf16CharIndices<'a> {
203    fn next_back(&mut self) -> Option<(usize,Utf16Char)> {
204        if self.index < self.str.len() {
205            let rev = self.str.bytes().rev();
206            let len = 1 + rev.take_while(|b| b & 0b1100_0000 == 0b1000_0000 ).count();
207            let starts = self.str.len() - len;
208            let (u16c,_) = Utf16Char::from_str_start(&self.str[starts..]).unwrap();
209            self.str = &self.str[..starts];
210            Some((starts, u16c))
211        } else {
212            None
213        }
214    }
215}
216impl<'a> fmt::Debug for Utf16CharIndices<'a> {
217    fn fmt(&self,  fmtr: &mut fmt::Formatter) -> fmt::Result {
218        fmtr.debug_tuple("Utf16CharIndices")
219            .field(&self.index)
220            .field(&self.as_str())
221            .finish()
222    }
223}
224
225
226/// An iterator over the codepoints in a `str` represented as `Utf16Char`.
227#[derive(Clone)]
228pub struct Utf16Chars<'a>(Utf16CharIndices<'a>);
229impl<'a> From<&'a str> for Utf16Chars<'a> {
230    fn from(s: &str) -> Utf16Chars {
231        Utf16Chars(Utf16CharIndices::from(s))
232    }
233}
234impl<'a> Utf16Chars<'a> {
235    /// Extract the remainder of the source `str`.
236    ///
237    /// # Examples
238    ///
239    /// ```
240    /// use encode_unicode::{StrExt, Utf16Char};
241    /// let mut iter = "abc".utf16chars();
242    /// assert_eq!(iter.next(), Some(Utf16Char::from('a')));
243    /// assert_eq!(iter.next_back(), Some(Utf16Char::from('c')));
244    /// assert_eq!(iter.as_str(), "b");
245    /// ```
246    pub fn as_str(&self) -> &'a str {
247        self.0.as_str()
248    }
249}
250impl<'a> Iterator for Utf16Chars<'a> {
251    type Item = Utf16Char;
252    fn next(&mut self) -> Option<Utf16Char> {
253        self.0.next().map(|(_,u16c)| u16c )
254    }
255    fn size_hint(&self) -> (usize,Option<usize>) {
256        self.0.size_hint()
257    }
258}
259impl<'a> DoubleEndedIterator for Utf16Chars<'a> {
260    fn next_back(&mut self) -> Option<Utf16Char> {
261        self.0.next_back().map(|(_,u16c)| u16c )
262    }
263}
264impl<'a> fmt::Debug for Utf16Chars<'a> {
265    fn fmt(&self,  fmtr: &mut fmt::Formatter) -> fmt::Result {
266        fmtr.debug_tuple("Utf16Chars")
267            .field(&self.as_str())
268            .finish()
269    }
270}