encode_unicode/
decoding_iterators.rs

1/* Copyright 2018 The encode_unicode Developers
2 *
3 * Licensed under the Apache License, Version 2.0, <LICENSE-APACHE or
4 * http://apache.org/licenses/LICENSE-2.0> or the MIT license <LICENSE-MIT or
5 * http://opensource.org/licenses/MIT>, at your option. This file may not be
6 * copied, modified, or distributed except according to those terms.
7 */
8
9//! Iterators that turn multiple `u8`s or `u16`s into `Utf*Char`s, but can fail.
10//!
11//! To be predictable, all errors consume one element each.
12//!
13//! The iterator adaptors produce neither offset nor element length to work
14//! well with other adaptors,
15//! while the slice iterators yield both to make more advanced use cases easy.
16
17use errors::{InvalidUtf8Slice, InvalidUtf16FirstUnit, Utf16PairError};
18use errors::InvalidUtf8Slice::*;
19use errors::InvalidUtf8::*;
20use errors::InvalidUtf8FirstByte::*;
21use errors::InvalidUtf16Slice::*;
22use errors::InvalidCodepoint::*;
23use errors::Utf16PairError::*;
24use utf8_char::Utf8Char;
25use utf16_char::Utf16Char;
26use traits::U16UtfExt;
27extern crate core;
28use self::core::borrow::Borrow;
29use self::core::fmt::{self, Debug};
30use self::core::iter::Chain;
31use self::core::option;
32
33
/// Iterator adaptor that merges the bytes of an inner iterator into
/// `Utf8Char`s, yielding an error for input that is not valid UTF-8.
///
/// See [`IterExt::to_utf8chars()`](../trait.IterExt.html#tymethod.to_utf8chars)
/// for examples and error handling.
#[derive(Clone, Default)]
pub struct Utf8CharMerger<B: Borrow<u8>, I: Iterator<Item = B>> {
    /// The wrapped source of bytes.
    iter: I,
    /// How many entries of `after_err_stack` are currently in use,
    /// i.e. bytes that were read ahead before an error was detected.
    after_err_leftover: u8,
    /// Read-ahead bytes that must be yielded before touching `iter` again.
    /// Kept as a stack (next byte on top) because that simplifies popping.
    after_err_stack: [u8; 3],
}
46impl<B:Borrow<u8>, I:Iterator<Item=B>, T:IntoIterator<IntoIter=I,Item=B>>
47From<T> for Utf8CharMerger<B, I> {
48    fn from(t: T) -> Self {
49        Utf8CharMerger {
50            iter: t.into_iter(),
51            after_err_leftover: 0,
52            after_err_stack: [0; 3],
53        }
54    }
55}
56impl<B:Borrow<u8>, I:Iterator<Item=B>> Utf8CharMerger<B,I> {
57    /// Extract the inner iterator.
58    ///
59    /// If the last item produced by `.next()` was an `Err`,
60    /// up to three following bytes might be missing.  
61    /// The exact number of missing bytes for each error type should not be relied on.
62    ///
63    /// # Examples
64    ///
65    /// Three bytes swallowed:
66    /// ```
67    /// # use encode_unicode::IterExt;
68    /// let mut merger = b"\xf4\xa1\xb2FS".iter().to_utf8chars();
69    /// assert!(merger.next().unwrap().is_err());
70    /// let mut inner: std::slice::Iter<u8> = merger.into_inner();
71    /// assert_eq!(inner.next(), Some(&b'S')); // b'\xa1', b'\xb2' and b'F' disappeared
72    /// ```
73    ///
74    /// All bytes present:
75    /// ```
76    /// # use encode_unicode::IterExt;
77    /// let mut merger = b"\xb0FS".iter().to_utf8chars();
78    /// assert!(merger.next().unwrap().is_err());
79    /// assert_eq!(merger.into_inner().next(), Some(&b'F'));
80    /// ```
81    ///
82    /// Two bytes missing:
83    /// ```
84    /// # use encode_unicode::IterExt;
85    /// let mut merger = b"\xe0\x80\x80FS".iter().to_utf8chars();
86    /// assert!(merger.next().unwrap().is_err());
87    /// assert_eq!(merger.into_inner().next(), Some(&b'F'));
88    /// ```
89    pub fn into_inner(self) -> I {
90        self.iter
91    }
92
93    fn save(&mut self,  bytes: &[u8;4],  len: usize) {
94        // forget bytes[0] and push the others onto self.after_err_stack (in reverse).
95        for &after_err in bytes[1..len].iter().rev() {
96            self.after_err_stack[self.after_err_leftover as usize] = after_err;
97            self.after_err_leftover += 1;
98        }
99    }
100    /// Reads len-1 bytes into bytes[1..]
101    fn extra(&mut self,  bytes: &mut[u8;4],  len: usize) -> Result<(),InvalidUtf8Slice> {
102        // This is the only function that pushes onto after_err_stack,
103        // and it checks that all bytes are continuation bytes before fetching the next one.
104        // Therefore only the last byte retrieved can be a non-continuation byte.
105        // That last byte is also the last to be retrieved from after_err.
106        //
107        // Before this function is called, there has been retrieved at least one byte.
108        // If that byte was a continuation byte, next() produces an error
109        // and won't call this function.
110        // Therefore, we know that after_err is empty at this point.
111        // This means that we can use self.iter directly, and knows where to start pushing
112        debug_assert_eq!(self.after_err_leftover, 0, "first: {:#02x}, stack: {:?}", bytes[0], self.after_err_stack);
113        for i in 1..len {
114            if let Some(extra) = self.iter.next() {
115                let extra = *extra.borrow();
116                bytes[i] = extra;
117                if extra & 0b1100_0000 != 0b1000_0000 {
118                    // not a continuation byte
119                    self.save(bytes, i+1);
120                    return Err(InvalidUtf8Slice::Utf8(NotAContinuationByte(i)))
121                }
122            } else {
123                self.save(bytes, i);
124                return Err(TooShort(len));
125            }
126        }
127        Ok(())
128    }
129}
130impl<B:Borrow<u8>, I:Iterator<Item=B>> Iterator for Utf8CharMerger<B,I> {
131    type Item = Result<Utf8Char,InvalidUtf8Slice>;
132    fn next(&mut self) -> Option<Self::Item> {
133        let first: u8;
134        if self.after_err_leftover != 0 {
135            self.after_err_leftover -= 1;
136            first = self.after_err_stack[self.after_err_leftover as usize];
137        } else if let Some(next) = self.iter.next() {
138            first = *next.borrow();
139        } else {
140            return None;
141        }
142
143        unsafe {
144            let mut bytes = [first, 0, 0, 0];
145            let ok = match first {
146                0b0000_0000...0b0111_1111 => {/*1 and */Ok(())},
147                0b1100_0010...0b1101_1111 => {//2 and not overlong
148                    self.extra(&mut bytes, 2) // no extra validation required
149                },
150                0b1110_0000...0b1110_1111 => {//3
151                    if let Err(e) = self.extra(&mut bytes, 3) {
152                        Err(e)
153                    } else if bytes[0] == 0b1110_0000  &&  bytes[1] <= 0b10_011111 {
154                        self.save(&bytes, 3);
155                        Err(Utf8(OverLong))
156                    } else if bytes[0] == 0b1110_1101  &&  bytes[1] & 0b11_100000 == 0b10_100000 {
157                        self.save(&bytes, 3);
158                        Err(Codepoint(Utf16Reserved))
159                    } else {
160                        Ok(())
161                    }
162                },
163                0b1111_0000...0b1111_0100 => {//4
164                    if let Err(e) = self.extra(&mut bytes, 4) {
165                        Err(e)
166                    } else if bytes[0] == 0b11110_000  &&  bytes[1] <= 0b10_001111 {
167                        self.save(&bytes, 4);
168                        Err(InvalidUtf8Slice::Utf8(OverLong))
169                    } else if bytes[0] == 0b11110_100  &&  bytes[1] > 0b10_001111 {
170                        self.save(&bytes, 4);
171                        Err(InvalidUtf8Slice::Codepoint(TooHigh))
172                    } else {
173                        Ok(())
174                    }
175                },
176                0b1000_0000...0b1011_1111 => {// continuation byte
177                    Err(Utf8(FirstByte(ContinuationByte)))
178                },
179                0b1100_0000...0b1100_0001 => {// 2 and overlong
180                    Err(Utf8(OverLong))
181                },
182                0b1111_0101...0b1111_0111 => {// 4 and too high codepoint
183                    Err(Codepoint(TooHigh))
184                },
185                0b1111_1000...0b1111_1111 => {
186                    Err(Utf8(FirstByte(TooLongSeqence)))
187                },
188                _ => unreachable!("all possible byte values should be covered")
189            };
190            Some(ok.map(|()| Utf8Char::from_array_unchecked(bytes) ))
191        }
192    }
193    fn size_hint(&self) -> (usize,Option<usize>) {
194        let (iter_min, iter_max) = self.iter.size_hint();
195        // cannot be exact, so KISS
196        let min = iter_min / 4; // don't bother rounding up or accounting for after_err
197        // handle edge case of max > usize::MAX-3 just in case.
198        // Using wrapping_add() wouldn't violate any API contract as the trait isn't unsafe.
199        let max = iter_max.and_then(|max| {
200            max.checked_add(self.after_err_leftover as usize)
201        });
202        (min, max)
203    }
204}
205impl<B:Borrow<u8>, I:Iterator<Item=B>+Debug> Debug for Utf8CharMerger<B,I> {
206    fn fmt(&self,  fmtr: &mut fmt::Formatter) -> fmt::Result {
207        let mut in_order = [0u8; 3];
208        for i in 0..self.after_err_leftover as usize {
209            in_order[i] = self.after_err_stack[self.after_err_leftover as usize - i - 1];
210        }
211        fmtr.debug_struct("Utf8CharMerger")
212            .field("buffered", &&in_order[..self.after_err_leftover as usize])
213            .field("inner", &self.iter)
214            .finish()
215    }
216}
217
218
/// Slice-only counterpart of [`Utf8CharMerger`](struct.Utf8CharMerger.html)
/// which also yields the offset and length of every item.
///
/// See [`SliceExt::utf8char_indices()`](../trait.SliceExt.html#tymethod.utf8char_indices)
/// for examples and error handling.
#[derive(Clone, Default)]
pub struct Utf8CharDecoder<'a> {
    /// The bytes being decoded.
    slice: &'a [u8],
    /// Offset into `slice` where forward iteration continues.
    index: usize,
}
229impl<'a> From<&'a[u8]> for Utf8CharDecoder<'a> {
230    fn from(s: &[u8]) -> Utf8CharDecoder {
231        Utf8CharDecoder { slice: s, index: 0 }
232    }
233}
234impl<'a> Utf8CharDecoder<'a> {
235    /// Extract the remainder of the source slice.
236    ///
237    /// # Examples
238    ///
239    /// Unlike `Utf8CharMerger::into_inner()`, bytes directly after an error
240    /// are never swallowed:
241    /// ```
242    /// # use encode_unicode::SliceExt;
243    /// let mut iter = b"\xf4\xa1\xb2FS".utf8char_indices();
244    /// assert!(iter.next().unwrap().1.is_err());
245    /// assert_eq!(iter.as_slice(), b"\xa1\xb2FS");
246    /// ```
247    pub fn as_slice(&self) -> &'a[u8] {
248        &self.slice[self.index..]
249    }
250}
251impl<'a> Iterator for Utf8CharDecoder<'a> {
252    type Item = (usize, Result<Utf8Char,InvalidUtf8Slice>, usize);
253    fn next(&mut self) -> Option<Self::Item> {
254        let start = self.index;
255        match Utf8Char::from_slice_start(&self.slice[self.index..]) {
256            Ok((u8c, len)) => {
257                self.index += len;
258                Some((start, Ok(u8c), len))
259            },
260            Err(TooShort(1)) => None,
261            Err(e) => {
262                self.index += 1;
263                Some((start, Err(e), 1))
264            }
265        }
266    }
267    #[inline]
268    fn size_hint(&self) -> (usize,Option<usize>) {
269        let bytes = self.slice.len() - self.index;
270        // Cannot be exact, so KISS and don't bother rounding up.
271        // The slice is unlikely be full of 4-byte codepoints, so buffers
272        // allocated with the lower bound will have to be grown anyway.
273        (bytes/4, Some(bytes))
274    }
275}
276impl<'a> DoubleEndedIterator for Utf8CharDecoder<'a> {
277    fn next_back(&mut self) -> Option<Self::Item> {
278        if self.index < self.slice.len() {
279            let extras = self.slice.iter()
280                .rev()
281                .take_while(|&b| b & 0b1100_0000 == 0b1000_0000 )
282                .count();
283            let starts = self.slice.len() - (extras+1);
284            match Utf8Char::from_slice_start(&self.slice[starts..]) {
285                Ok((u8c,len)) if len == 1+extras => {
286                    self.slice = &self.slice[..starts];
287                    Some((starts, Ok(u8c), len))
288                },
289                // This enures errors for every byte in both directions,
290                // but means overlong and codepoint errors will be turned into
291                // tooshort errors.
292                Err(e) if extras == 0 => {
293                    self.slice = &self.slice[..self.slice.len()-1];
294                    Some((self.slice.len()-1, Err(e), 1))
295                },
296                _ => {
297                    self.slice = &self.slice[..self.slice.len()-1];
298                    Some((self.slice.len()-1, Err(Utf8(FirstByte(ContinuationByte))), 1))
299                },
300            }
301        } else {
302            None
303        }
304    }
305}
306impl<'a> Debug for Utf8CharDecoder<'a> {
307    fn fmt(&self,  fmtr: &mut fmt::Formatter) -> fmt::Result {
308        write!(fmtr, "Utf8CharDecoder {{ bytes[{}..]: {:?} }}", self.index, self.as_slice())
309    }
310}
311
312
313
/// Iterator adaptor that merges the units of an inner `u16` iterator into
/// `Utf16Char`s, yielding an error for input that is not valid UTF-16.
///
/// See [`IterExt::to_utf16chars()`](../trait.IterExt.html#tymethod.to_utf16chars)
/// for examples and error handling.
#[derive(Clone, Default)]
pub struct Utf16CharMerger<B: Borrow<u16>, I: Iterator<Item = B>> {
    /// The wrapped source of units.
    iter: I,
    /// Unit that was read where a trailing surrogate was expected but turned
    /// out to be something else. It can be any value and is decoded next.
    prev: Option<B>,
}
324impl<B:Borrow<u16>, I:Iterator<Item=B>, T:IntoIterator<IntoIter=I,Item=B>>
325From<T> for Utf16CharMerger<B,I> {
326    fn from(t: T) -> Self {
327        Utf16CharMerger { iter: t.into_iter(),  prev: None }
328    }
329}
330impl<B:Borrow<u16>, I:Iterator<Item=B>> Utf16CharMerger<B,I> {
331    /// Extract the inner iterator.
332    ///
333    /// If the last item produced was an `Err`, the first unit might be missing.
334    ///
335    /// # Examples
336    ///
337    /// Unit right after an error missing
338    /// ```
339    /// # use encode_unicode::IterExt;
340    /// # use encode_unicode::error::Utf16PairError;
341    /// let mut merger = [0xd901, 'F' as u16, 'S' as u16].iter().to_utf16chars();
342    /// assert_eq!(merger.next(), Some(Err(Utf16PairError::UnmatchedLeadingSurrogate)));
343    /// let mut inner: std::slice::Iter<u16> = merger.into_inner();
344    /// assert_eq!(inner.next(), Some('S' as u16).as_ref()); // 'F' was consumed by Utf16CharMerger
345    /// ```
346    ///
347    /// Error that doesn't swallow any units
348    /// ```
349    /// # use encode_unicode::IterExt;
350    /// # use encode_unicode::error::Utf16PairError;
351    /// let mut merger = [0xde00, 'F' as u16, 'S' as u16].iter().to_utf16chars();
352    /// assert_eq!(merger.next(), Some(Err(Utf16PairError::UnexpectedTrailingSurrogate)));
353    /// let mut inner: std::slice::Iter<u16> = merger.into_inner();
354    /// assert_eq!(inner.next(), Some('F' as u16).as_ref()); // not consumed
355    /// ```
356    pub fn into_inner(self) -> I {
357        self.iter
358    }
359    /// Returns an iterator over the remaining units.
360    /// Unlike `into_inner()` this will never drop any units.
361    ///
362    /// The exact type of the returned iterator should not be depended on.
363    ///
364    /// # Examples
365    ///
366    /// ```
367    /// # use encode_unicode::IterExt;
368    /// # use encode_unicode::error::Utf16PairError;
369    /// let slice = [0xd901, 'F' as u16, 'S' as u16];
370    /// let mut merger = slice.iter().to_utf16chars();
371    /// assert_eq!(merger.next(), Some(Err(Utf16PairError::UnmatchedLeadingSurrogate)));
372    /// let mut remaining = merger.into_remaining_units();
373    /// assert_eq!(remaining.next(), Some('F' as u16).as_ref());
374    /// ```
375    pub fn into_remaining_units(self) -> Chain<option::IntoIter<B>,I> {
376        self.prev.into_iter().chain(self.iter)
377    }
378}
379impl<B:Borrow<u16>, I:Iterator<Item=B>> Iterator for Utf16CharMerger<B,I> {
380    type Item = Result<Utf16Char,Utf16PairError>;
381    fn next(&mut self) -> Option<Self::Item> {
382        let first = self.prev.take().or_else(|| self.iter.next() );
383        first.map(|first| unsafe {
384            match first.borrow().utf16_needs_extra_unit() {
385                Ok(false) => Ok(Utf16Char::from_tuple_unchecked((*first.borrow(), None))),
386                Ok(true) => match self.iter.next() {
387                    Some(second) => match second.borrow().utf16_needs_extra_unit() {
388                        Err(InvalidUtf16FirstUnit) => Ok(Utf16Char::from_tuple_unchecked((
389                            *first.borrow(),
390                            Some(*second.borrow())
391                        ))),
392                        Ok(_) => {
393                            self.prev = Some(second);
394                            Err(Utf16PairError::UnmatchedLeadingSurrogate)
395                        }
396                    },
397                    None => Err(Utf16PairError::Incomplete)
398                },
399                Err(InvalidUtf16FirstUnit) => Err(Utf16PairError::UnexpectedTrailingSurrogate),
400            }
401        })
402    }
403    fn size_hint(&self) -> (usize,Option<usize>) {
404        let (iter_min, iter_max) = self.iter.size_hint();
405        // cannot be exact, so KISS
406        let min = iter_min / 2; // don't bother rounding up or accounting for self.prev
407        let max = match (iter_max, &self.prev) {
408            (Some(max), &Some(_)) => max.checked_add(1),
409            (max, _) => max,
410        };
411        (min, max)
412    }
413}
414impl<B:Borrow<u16>, I:Iterator<Item=B>+Debug> Debug for Utf16CharMerger<B,I> {
415    fn fmt(&self,  fmtr: &mut fmt::Formatter) -> fmt::Result {
416        fmtr.debug_struct("Utf16CharMerger")
417            .field("buffered", &self.prev.as_ref().map(|b| *b.borrow() ))
418            .field("inner", &self.iter)
419            .finish()
420    }
421}
422
423
/// Slice-only counterpart of [`Utf16CharMerger`](struct.Utf16CharMerger.html)
/// which also yields the offset and length of every item.
///
/// See [`SliceExt::utf16char_indices()`](../trait.SliceExt.html#tymethod.utf16char_indices)
/// for examples and error handling.
#[derive(Clone, Default)]
pub struct Utf16CharDecoder<'a> {
    /// The units being decoded.
    slice: &'a [u16],
    /// Offset into `slice` where iteration continues.
    index: usize,
}
434impl<'a> From<&'a[u16]> for Utf16CharDecoder<'a> {
435    fn from(s: &'a[u16]) -> Self {
436        Utf16CharDecoder{ slice: s,  index: 0 }
437    }
438}
439impl<'a> Utf16CharDecoder<'a> {
440    /// Extract the remainder of the source slice.
441    ///
442    /// # Examples
443    ///
444    /// Unlike `Utf16CharMerger::into_inner()`, the unit after an error is never swallowed:
445    /// ```
446    /// # use encode_unicode::SliceExt;
447    /// # use encode_unicode::error::Utf16PairError;
448    /// let mut iter = [0xd901, 'F' as u16, 'S' as u16].utf16char_indices();
449    /// assert_eq!(iter.next(), Some((0, Err(Utf16PairError::UnmatchedLeadingSurrogate), 1)));
450    /// assert_eq!(iter.as_slice(), &['F' as u16, 'S' as u16]);
451    /// ```
452    pub fn as_slice(&self) -> &[u16] {
453        &self.slice[self.index..]
454    }
455}
456impl<'a> Iterator for Utf16CharDecoder<'a> {
457    type Item = (usize,Result<Utf16Char,Utf16PairError>,usize);
458    #[inline]
459    fn next(&mut self) -> Option<Self::Item>  {
460        let start = self.index;
461        match Utf16Char::from_slice_start(self.as_slice()) {
462            Ok((u16c,len)) => {
463                self.index += len;
464                Some((start, Ok(u16c), len))
465            },
466            Err(EmptySlice) => None,
467            Err(FirstLowSurrogate) => {
468                self.index += 1;
469                Some((start, Err(UnexpectedTrailingSurrogate), 1))
470            },
471            Err(SecondNotLowSurrogate) => {
472                self.index += 1;
473                Some((start, Err(UnmatchedLeadingSurrogate), 1))
474            },
475            Err(MissingSecond) => {
476                self.index = self.slice.len();
477                Some((start, Err(Incomplete), 1))
478            }
479        }
480    }
481    #[inline]
482    fn size_hint(&self) -> (usize,Option<usize>) {
483        let units = self.slice.len() - self.index;
484        // Cannot be exact, so KISS and don't bother rounding up.
485        // The slice is unlikely be full of surrogate pairs, so buffers
486        // allocated with the lower bound will have to be grown anyway.
487        (units/2, Some(units))
488    }
489}
490impl<'a> Debug for Utf16CharDecoder<'a> {
491    fn fmt(&self,  fmtr: &mut fmt::Formatter) -> fmt::Result {
492        write!(fmtr, "Utf16CharDecoder {{ units[{}..]: {:?} }}", self.index, self.as_slice())
493    }
494}