encode_unicode/
utf8_iterators.rs

1/* Copyright 2016 The encode_unicode Developers
2 *
3 * Licensed under the Apache License, Version 2.0, <LICENSE-APACHE or
4 * http://apache.org/licenses/LICENSE-2.0> or the MIT license <LICENSE-MIT or
5 * http://opensource.org/licenses/MIT>, at your option. This file may not be
6 * copied, modified, or distributed except according to those terms.
7 */
8
9use utf8_char::Utf8Char;
10use errors::EmptyStrError;
11extern crate core;
12use self::core::{mem, u32, u64};
13use self::core::ops::Not;
14use self::core::fmt;
15use self::core::borrow::Borrow;
16#[cfg(feature="std")]
17use std::io::{Read, Error as ioError};
18
19
20
/// Read or iterate over the bytes of the UTF-8 representation of a codepoint.
///
/// The bytes that remain to be yielded are packed into the wrapped `u32`
/// with the next byte in the lowest position. Every slot that holds no byte
/// is filled with `0xff`, which the iterator uses as its end marker.
#[derive(Clone)]
pub struct Utf8Iterator(u32);
24
25impl From<Utf8Char> for Utf8Iterator {
26    fn from(uc: Utf8Char) -> Self {
27        let used = u32::from_le(unsafe{ mem::transmute(uc.to_array().0) });
28        // uses u64 because shifting an u32 by 32 bits is a no-op.
29        let unused_set = (u64::MAX  <<  uc.len() as u64*8) as u32;
30        Utf8Iterator(used | unused_set)
31    }
32}
33impl From<char> for Utf8Iterator {
34    fn from(c: char) -> Self {
35        Self::from(Utf8Char::from(c))
36    }
37}
38impl Iterator for Utf8Iterator {
39    type Item=u8;
40    fn next(&mut self) -> Option<u8> {
41        let next = self.0 as u8;
42        if next == 0xff {
43            None
44        } else {
45            self.0 = (self.0 >> 8)  |  0xff_00_00_00;
46            Some(next)
47        }
48    }
49    fn size_hint(&self) -> (usize, Option<usize>) {
50        (self.len(),  Some(self.len()))
51    }
52}
53impl ExactSizeIterator for Utf8Iterator {
54    fn len(&self) -> usize {// not straightforward, but possible
55        let unused_bytes = self.0.not().leading_zeros() / 8;
56        4 - unused_bytes as usize
57    }
58}
#[cfg(feature="std")]
impl Read for Utf8Iterator {
    /// Always returns Ok
    ///
    /// Copies remaining bytes into `buf` until either the buffer is full
    /// or the iterator is exhausted, and returns how many were copied.
    fn read(&mut self, buf: &mut [u8]) -> Result<usize, ioError> {
        let mut copied = 0;
        // A byte is only taken from the iterator once there is a
        // destination slot to put it in, so nothing is ever dropped.
        for slot in buf.iter_mut() {
            match self.next() {
                Some(byte) => *slot = byte,
                None => break,
            }
            copied += 1;
        }
        Ok(copied)
    }
}
73impl fmt::Debug for Utf8Iterator {
74    fn fmt(&self,  fmtr: &mut fmt::Formatter) -> fmt::Result {
75        let mut content = [0; 4];
76        let mut i = 0;
77        for b in self.clone() {
78            content[i] = b;
79            i += 1;
80        }
81        write!(fmtr, "{:?}", &content[..i])
82    }
83}
84
85
86
87/// Converts an iterator of `Utf8Char` (or `&Utf8Char`)
88/// to an iterator of `u8`s.  
89/// Is equivalent to calling `.flat_map()` on the original iterator,
90/// but the returned iterator is ~40% faster.
91///
92/// The iterator also implements `Read` (if the `std` feature isn't disabled).
93/// Reading will never produce an error, and calls to `.read()` and `.next()`
94/// can be mixed.
95///
96/// The exact number of bytes cannot be known in advance, but `size_hint()`
97/// gives the possible range.
98/// (min: all remaining characters are ASCII, max: all require four bytes)
99///
100/// # Examples
101///
102/// From iterator of values:
103///
104/// ```
105/// use encode_unicode::{iter_bytes, CharExt};
106///
107/// let iterator = "foo".chars().map(|c| c.to_utf8() );
108/// let mut bytes = [0; 4];
109/// for (u,dst) in iter_bytes(iterator).zip(&mut bytes) {*dst=u;}
110/// assert_eq!(&bytes, b"foo\0");
111/// ```
112///
113/// From iterator of references:
114///
115#[cfg_attr(feature="std", doc=" ```")]
116#[cfg_attr(not(feature="std"), doc=" ```no_compile")]
117/// use encode_unicode::{iter_bytes, CharExt, Utf8Char};
118///
119/// let chars: Vec<Utf8Char> = "💣 bomb 💣".chars().map(|c| c.to_utf8() ).collect();
120/// let bytes: Vec<u8> = iter_bytes(&chars).collect();
121/// let flat_map: Vec<u8> = chars.iter().flat_map(|u8c| *u8c ).collect();
122/// assert_eq!(bytes, flat_map);
123/// ```
124///
125/// `Read`ing from it:
126///
127#[cfg_attr(feature="std", doc=" ```")]
128#[cfg_attr(not(feature="std"), doc=" ```no_compile")]
129/// use encode_unicode::{iter_bytes, CharExt};
130/// use std::io::Read;
131///
132/// let s = "Ååh‽";
133/// assert_eq!(s.len(), 8);
134/// let mut buf = [b'E'; 9];
135/// let mut reader = iter_bytes(s.chars().map(|c| c.to_utf8() ));
136/// assert_eq!(reader.read(&mut buf[..]).unwrap(), 8);
137/// assert_eq!(reader.read(&mut buf[..]).unwrap(), 0);
138/// assert_eq!(&buf[..8], s.as_bytes());
139/// assert_eq!(buf[8], b'E');
140/// ```
141pub fn iter_bytes<U:Borrow<Utf8Char>, I:IntoIterator<Item=U>>
142(iterable: I) -> Utf8CharSplitter<U, I::IntoIter> {
143    Utf8CharSplitter{ inner: iterable.into_iter(),  prev: 0 }
144}
145
146/// The iterator type returned by `iter_bytes()`
147///
148/// See its documentation for details.
149#[derive(Clone)]
150pub struct Utf8CharSplitter<U:Borrow<Utf8Char>, I:Iterator<Item=U>> {
151    inner: I,
152    prev: u32,
153}
154impl<I:Iterator<Item=Utf8Char>> From<I> for Utf8CharSplitter<Utf8Char,I> {
155    /// A less generic constructor than `iter_bytes()`
156    fn from(iter: I) -> Self {
157        iter_bytes(iter)
158    }
159}
160impl<U:Borrow<Utf8Char>, I:Iterator<Item=U>> Utf8CharSplitter<U,I> {
161    /// Extracts the source iterator.
162    ///
163    /// Note that `iter_bytes(iter.into_inner())` is not a no-op:  
164    /// If the last returned byte from `next()` was not an ASCII by,
165    /// the remaining bytes of that codepoint is lost.
166    pub fn into_inner(self) -> I {
167        self.inner
168    }
169}
170impl<U:Borrow<Utf8Char>, I:Iterator<Item=U>> Iterator for Utf8CharSplitter<U,I> {
171    type Item = u8;
172    fn next(&mut self) -> Option<Self::Item> {
173        if self.prev == 0 {
174            self.inner.next().map(|u8c| {
175                let array = u8c.borrow().to_array().0;
176                self.prev = unsafe{ u32::from_le(mem::transmute(array)) } >> 8;
177                array[0]
178            })
179        } else {
180            let next = self.prev as u8;
181            self.prev >>= 8;
182            Some(next)
183        }
184    }
185    fn size_hint(&self) -> (usize,Option<usize>) {
186        // Doesn't need to handle unlikely overflows correctly because
187        // size_hint() cannot be relied upon anyway. (the trait isn't unsafe)
188        let (min, max) = self.inner.size_hint();
189        let add = 4 - (self.prev.leading_zeros() / 8) as usize;
190        (min.wrapping_add(add), max.map(|max| max.wrapping_mul(4).wrapping_add(add) ))
191    }
192}
#[cfg(feature="std")]
impl<U, I> Read for Utf8CharSplitter<U, I>
where
    U: Borrow<Utf8Char>,
    I: Iterator<Item = U>,
{
    /// Always returns `Ok`
    ///
    /// Fills `buf` with as many bytes as are available and fit. If the
    /// buffer ends in the middle of a character, the bytes that did not
    /// fit are kept in `prev` for the next call to `read()` or `next()`.
    fn read(&mut self, buf: &mut [u8]) -> Result<usize, ioError> {
        let capacity = buf.len();
        let mut filled = 0;
        // First hand out what is left of a previously started codepoint.
        while filled < capacity && self.prev != 0 {
            buf[filled] = self.prev as u8;
            self.prev >>= 8;
            filled += 1;
        }
        // Then copy characters from the source for as long as there is room.
        while filled < capacity {
            let bytes = match self.inner.next() {
                Some(u8c) => u8c.borrow().to_array().0,
                None => break,
            };
            buf[filled] = bytes[0];
            filled += 1;
            if bytes[1] == 0 {
                // No second byte, so this character is complete.
                continue;
            }
            // The number of leading one-bits in the first byte of a
            // multi-byte sequence is the length of the sequence.
            let len = (!bytes[0]).leading_zeros() as usize;
            for written in 1..len {
                if filled == capacity {
                    // Out of room: stash the bytes that were not written.
                    // SAFETY: the transmute only compiles when the array is
                    // exactly four bytes, and every bit pattern is a valid u32.
                    let raw: u32 = unsafe { mem::transmute(bytes) };
                    self.prev = u32::from_le(raw) >> (8 * written);
                    return Ok(filled);
                }
                buf[filled] = bytes[written];
                filled += 1;
            }
        }
        Ok(filled)
    }
}
231
232
233
/// An iterator over the `Utf8Char` of a string slice, and their positions.
///
/// This struct is created by the `utf8char_indices()` method from the
/// [`StrExt`] trait. See its documentation for more.
#[derive(Clone)]
pub struct Utf8CharIndices<'a> {
    /// The source string; it is shortened from the end by `next_back()`.
    str: &'a str,
    /// Byte offset into `str` of the next character `next()` will return.
    index: usize,
}
242impl<'a> From<&'a str> for Utf8CharIndices<'a> {
243    fn from(s: &str) -> Utf8CharIndices {
244        Utf8CharIndices{str: s, index: 0}
245    }
246}
247impl<'a> Utf8CharIndices<'a> {
248    /// Extract the remainder of the source `str`.
249    ///
250    /// # Examples
251    ///
252    /// ```
253    /// use encode_unicode::{StrExt, Utf8Char};
254    /// let mut iter = "abc".utf8char_indices();
255    /// assert_eq!(iter.next_back(), Some((2, Utf8Char::from('c'))));
256    /// assert_eq!(iter.next(), Some((0, Utf8Char::from('a'))));
257    /// assert_eq!(iter.as_str(), "b");
258    /// ```
259    pub fn as_str(&self) -> &'a str {
260        &self.str[self.index..]
261    }
262}
263impl<'a> Iterator for Utf8CharIndices<'a> {
264    type Item = (usize,Utf8Char);
265    fn next(&mut self) -> Option<(usize,Utf8Char)> {
266        match Utf8Char::from_str_start(&self.str[self.index..]) {
267            Ok((u8c, len)) => {
268                let item = (self.index, u8c);
269                self.index += len;
270                Some(item)
271            },
272            Err(EmptyStrError) => None
273        }
274    }
275    fn size_hint(&self) -> (usize,Option<usize>) {
276        let len = self.str.len() - self.index;
277        // For len+3 to overflow, the slice must fill all but two bytes of
278        // addressable memory, and size_hint() doesn't need to be correct.
279        (len.wrapping_add(3)/4, Some(len))
280    }
281}
282impl<'a> DoubleEndedIterator for Utf8CharIndices<'a> {
283    fn next_back(&mut self) -> Option<(usize,Utf8Char)> {
284        // Cannot refactor out the unwrap without switching to ::from_slice()
285        // since slicing the str panics if not on a boundary.
286        if self.index < self.str.len() {
287            let rev = self.str.bytes().rev();
288            let len = 1 + rev.take_while(|b| b & 0b1100_0000 == 0b1000_0000 ).count();
289            let starts = self.str.len() - len;
290            let (u8c,_) = Utf8Char::from_str_start(&self.str[starts..]).unwrap();
291            self.str = &self.str[..starts];
292            Some((starts, u8c))
293        } else {
294            None
295        }
296    }
297}
298impl<'a> fmt::Debug for Utf8CharIndices<'a> {
299    fn fmt(&self,  fmtr: &mut fmt::Formatter) -> fmt::Result {
300        fmtr.debug_tuple("Utf8CharIndices")
301            .field(&self.index)
302            .field(&self.as_str())
303            .finish()
304    }
305}
306
307
308/// An iterator over the codepoints in a `str` represented as `Utf8Char`.
309#[derive(Clone)]
310pub struct Utf8Chars<'a>(Utf8CharIndices<'a>);
311impl<'a> From<&'a str> for Utf8Chars<'a> {
312    fn from(s: &str) -> Utf8Chars {
313        Utf8Chars(Utf8CharIndices::from(s))
314    }
315}
316impl<'a> Utf8Chars<'a> {
317    /// Extract the remainder of the source `str`.
318    ///
319    /// # Examples
320    ///
321    /// ```
322    /// use encode_unicode::{StrExt, Utf8Char};
323    /// let mut iter = "abc".utf8chars();
324    /// assert_eq!(iter.next(), Some(Utf8Char::from('a')));
325    /// assert_eq!(iter.next_back(), Some(Utf8Char::from('c')));
326    /// assert_eq!(iter.as_str(), "b");
327    /// ```
328    pub fn as_str(&self) -> &'a str {
329        self.0.as_str()
330    }
331}
332impl<'a> Iterator for Utf8Chars<'a> {
333    type Item = Utf8Char;
334    fn next(&mut self) -> Option<Utf8Char> {
335        self.0.next().map(|(_,u8c)| u8c )
336    }
337    fn size_hint(&self) -> (usize,Option<usize>) {
338        self.0.size_hint()
339    }
340}
341impl<'a> DoubleEndedIterator for Utf8Chars<'a> {
342    fn next_back(&mut self) -> Option<Utf8Char> {
343        self.0.next_back().map(|(_,u8c)| u8c )
344    }
345}
346impl<'a> fmt::Debug for Utf8Chars<'a> {
347    fn fmt(&self,  fmtr: &mut fmt::Formatter) -> fmt::Result {
348        fmtr.debug_tuple("Utf8CharIndices")
349            .field(&self.as_str())
350            .finish()
351    }
352}