encode_unicode/decoding_iterators.rs
1/* Copyright 2018 The encode_unicode Developers
2 *
3 * Licensed under the Apache License, Version 2.0, <LICENSE-APACHE or
4 * http://apache.org/licenses/LICENSE-2.0> or the MIT license <LICENSE-MIT or
5 * http://opensource.org/licenses/MIT>, at your option. This file may not be
6 * copied, modified, or distributed except according to those terms.
7 */
8
9//! Iterators that turn multiple `u8`s or `u16`s into `Utf*Char`s, but can fail.
10//!
11//! To be predictable, all errors consume one element each.
12//!
13//! The iterator adaptors produce neither offset nor element length to work
14//! well with other adaptors,
15//! while the slice iterators yield both to make more advanced use cases easy.
16
17use errors::{InvalidUtf8Slice, InvalidUtf16FirstUnit, Utf16PairError};
18use errors::InvalidUtf8Slice::*;
19use errors::InvalidUtf8::*;
20use errors::InvalidUtf8FirstByte::*;
21use errors::InvalidUtf16Slice::*;
22use errors::InvalidCodepoint::*;
23use errors::Utf16PairError::*;
24use utf8_char::Utf8Char;
25use utf16_char::Utf16Char;
26use traits::U16UtfExt;
27extern crate core;
28use self::core::borrow::Borrow;
29use self::core::fmt::{self, Debug};
30use self::core::iter::Chain;
31use self::core::option;
32
33
/// Iterator adaptor that merges the bytes of another iterator into `Utf8Char`s.
///
/// Each item is a `Result`, because the bytes are validated while decoding.
///
/// See [`IterExt::to_utf8chars()`](../trait.IterExt.html#tymethod.to_utf8chars)
/// for examples and error handling.
#[derive(Clone, Default)]
pub struct Utf8CharMerger<B, I>
where
    B: Borrow<u8>,
    I: Iterator<Item=B>,
{
    /// The wrapped source of bytes.
    iter: I,
    /// Number of bytes in `after_err_stack` that still have to be handed out
    /// again; these were read from `iter` before an error was detected.
    after_err_leftover: u8,
    /// Bytes to re-examine after an error.
    /// Kept as a stack (reverse order) because that makes popping trivial.
    after_err_stack: [u8; 3],
}
46impl<B:Borrow<u8>, I:Iterator<Item=B>, T:IntoIterator<IntoIter=I,Item=B>>
47From<T> for Utf8CharMerger<B, I> {
48 fn from(t: T) -> Self {
49 Utf8CharMerger {
50 iter: t.into_iter(),
51 after_err_leftover: 0,
52 after_err_stack: [0; 3],
53 }
54 }
55}
56impl<B:Borrow<u8>, I:Iterator<Item=B>> Utf8CharMerger<B,I> {
57 /// Extract the inner iterator.
58 ///
59 /// If the last item produced by `.next()` was an `Err`,
60 /// up to three following bytes might be missing.
61 /// The exact number of missing bytes for each error type should not be relied on.
62 ///
63 /// # Examples
64 ///
65 /// Three bytes swallowed:
66 /// ```
67 /// # use encode_unicode::IterExt;
68 /// let mut merger = b"\xf4\xa1\xb2FS".iter().to_utf8chars();
69 /// assert!(merger.next().unwrap().is_err());
70 /// let mut inner: std::slice::Iter<u8> = merger.into_inner();
71 /// assert_eq!(inner.next(), Some(&b'S')); // b'\xa1', b'\xb2' and b'F' disappeared
72 /// ```
73 ///
74 /// All bytes present:
75 /// ```
76 /// # use encode_unicode::IterExt;
77 /// let mut merger = b"\xb0FS".iter().to_utf8chars();
78 /// assert!(merger.next().unwrap().is_err());
79 /// assert_eq!(merger.into_inner().next(), Some(&b'F'));
80 /// ```
81 ///
82 /// Two bytes missing:
83 /// ```
84 /// # use encode_unicode::IterExt;
85 /// let mut merger = b"\xe0\x80\x80FS".iter().to_utf8chars();
86 /// assert!(merger.next().unwrap().is_err());
87 /// assert_eq!(merger.into_inner().next(), Some(&b'F'));
88 /// ```
89 pub fn into_inner(self) -> I {
90 self.iter
91 }
92
93 fn save(&mut self, bytes: &[u8;4], len: usize) {
94 // forget bytes[0] and push the others onto self.after_err_stack (in reverse).
95 for &after_err in bytes[1..len].iter().rev() {
96 self.after_err_stack[self.after_err_leftover as usize] = after_err;
97 self.after_err_leftover += 1;
98 }
99 }
100 /// Reads len-1 bytes into bytes[1..]
101 fn extra(&mut self, bytes: &mut[u8;4], len: usize) -> Result<(),InvalidUtf8Slice> {
102 // This is the only function that pushes onto after_err_stack,
103 // and it checks that all bytes are continuation bytes before fetching the next one.
104 // Therefore only the last byte retrieved can be a non-continuation byte.
105 // That last byte is also the last to be retrieved from after_err.
106 //
107 // Before this function is called, there has been retrieved at least one byte.
108 // If that byte was a continuation byte, next() produces an error
109 // and won't call this function.
110 // Therefore, we know that after_err is empty at this point.
111 // This means that we can use self.iter directly, and knows where to start pushing
112 debug_assert_eq!(self.after_err_leftover, 0, "first: {:#02x}, stack: {:?}", bytes[0], self.after_err_stack);
113 for i in 1..len {
114 if let Some(extra) = self.iter.next() {
115 let extra = *extra.borrow();
116 bytes[i] = extra;
117 if extra & 0b1100_0000 != 0b1000_0000 {
118 // not a continuation byte
119 self.save(bytes, i+1);
120 return Err(InvalidUtf8Slice::Utf8(NotAContinuationByte(i)))
121 }
122 } else {
123 self.save(bytes, i);
124 return Err(TooShort(len));
125 }
126 }
127 Ok(())
128 }
129}
130impl<B:Borrow<u8>, I:Iterator<Item=B>> Iterator for Utf8CharMerger<B,I> {
131 type Item = Result<Utf8Char,InvalidUtf8Slice>;
132 fn next(&mut self) -> Option<Self::Item> {
133 let first: u8;
134 if self.after_err_leftover != 0 {
135 self.after_err_leftover -= 1;
136 first = self.after_err_stack[self.after_err_leftover as usize];
137 } else if let Some(next) = self.iter.next() {
138 first = *next.borrow();
139 } else {
140 return None;
141 }
142
143 unsafe {
144 let mut bytes = [first, 0, 0, 0];
145 let ok = match first {
146 0b0000_0000...0b0111_1111 => {/*1 and */Ok(())},
147 0b1100_0010...0b1101_1111 => {//2 and not overlong
148 self.extra(&mut bytes, 2) // no extra validation required
149 },
150 0b1110_0000...0b1110_1111 => {//3
151 if let Err(e) = self.extra(&mut bytes, 3) {
152 Err(e)
153 } else if bytes[0] == 0b1110_0000 && bytes[1] <= 0b10_011111 {
154 self.save(&bytes, 3);
155 Err(Utf8(OverLong))
156 } else if bytes[0] == 0b1110_1101 && bytes[1] & 0b11_100000 == 0b10_100000 {
157 self.save(&bytes, 3);
158 Err(Codepoint(Utf16Reserved))
159 } else {
160 Ok(())
161 }
162 },
163 0b1111_0000...0b1111_0100 => {//4
164 if let Err(e) = self.extra(&mut bytes, 4) {
165 Err(e)
166 } else if bytes[0] == 0b11110_000 && bytes[1] <= 0b10_001111 {
167 self.save(&bytes, 4);
168 Err(InvalidUtf8Slice::Utf8(OverLong))
169 } else if bytes[0] == 0b11110_100 && bytes[1] > 0b10_001111 {
170 self.save(&bytes, 4);
171 Err(InvalidUtf8Slice::Codepoint(TooHigh))
172 } else {
173 Ok(())
174 }
175 },
176 0b1000_0000...0b1011_1111 => {// continuation byte
177 Err(Utf8(FirstByte(ContinuationByte)))
178 },
179 0b1100_0000...0b1100_0001 => {// 2 and overlong
180 Err(Utf8(OverLong))
181 },
182 0b1111_0101...0b1111_0111 => {// 4 and too high codepoint
183 Err(Codepoint(TooHigh))
184 },
185 0b1111_1000...0b1111_1111 => {
186 Err(Utf8(FirstByte(TooLongSeqence)))
187 },
188 _ => unreachable!("all possible byte values should be covered")
189 };
190 Some(ok.map(|()| Utf8Char::from_array_unchecked(bytes) ))
191 }
192 }
193 fn size_hint(&self) -> (usize,Option<usize>) {
194 let (iter_min, iter_max) = self.iter.size_hint();
195 // cannot be exact, so KISS
196 let min = iter_min / 4; // don't bother rounding up or accounting for after_err
197 // handle edge case of max > usize::MAX-3 just in case.
198 // Using wrapping_add() wouldn't violate any API contract as the trait isn't unsafe.
199 let max = iter_max.and_then(|max| {
200 max.checked_add(self.after_err_leftover as usize)
201 });
202 (min, max)
203 }
204}
205impl<B:Borrow<u8>, I:Iterator<Item=B>+Debug> Debug for Utf8CharMerger<B,I> {
206 fn fmt(&self, fmtr: &mut fmt::Formatter) -> fmt::Result {
207 let mut in_order = [0u8; 3];
208 for i in 0..self.after_err_leftover as usize {
209 in_order[i] = self.after_err_stack[self.after_err_leftover as usize - i - 1];
210 }
211 fmtr.debug_struct("Utf8CharMerger")
212 .field("buffered", &&in_order[..self.after_err_leftover as usize])
213 .field("inner", &self.iter)
214 .finish()
215 }
216}
217
218
/// Slice-only counterpart of [`Utf8CharMerger`](struct.Utf8CharMerger.html)
/// which yields the offset and length of every item in addition to the
/// decoding result.
///
/// See [`SliceExt::utf8char_indices()`](../trait.SliceExt.html#tymethod.utf8char_indices)
/// for examples and error handling.
#[derive(Clone, Default)]
pub struct Utf8CharDecoder<'a> {
    /// The bytes being decoded.
    slice: &'a [u8],
    /// Offset into `slice` of the next byte to decode from the front.
    index: usize,
}
229impl<'a> From<&'a[u8]> for Utf8CharDecoder<'a> {
230 fn from(s: &[u8]) -> Utf8CharDecoder {
231 Utf8CharDecoder { slice: s, index: 0 }
232 }
233}
234impl<'a> Utf8CharDecoder<'a> {
235 /// Extract the remainder of the source slice.
236 ///
237 /// # Examples
238 ///
239 /// Unlike `Utf8CharMerger::into_inner()`, bytes directly after an error
240 /// are never swallowed:
241 /// ```
242 /// # use encode_unicode::SliceExt;
243 /// let mut iter = b"\xf4\xa1\xb2FS".utf8char_indices();
244 /// assert!(iter.next().unwrap().1.is_err());
245 /// assert_eq!(iter.as_slice(), b"\xa1\xb2FS");
246 /// ```
247 pub fn as_slice(&self) -> &'a[u8] {
248 &self.slice[self.index..]
249 }
250}
251impl<'a> Iterator for Utf8CharDecoder<'a> {
252 type Item = (usize, Result<Utf8Char,InvalidUtf8Slice>, usize);
253 fn next(&mut self) -> Option<Self::Item> {
254 let start = self.index;
255 match Utf8Char::from_slice_start(&self.slice[self.index..]) {
256 Ok((u8c, len)) => {
257 self.index += len;
258 Some((start, Ok(u8c), len))
259 },
260 Err(TooShort(1)) => None,
261 Err(e) => {
262 self.index += 1;
263 Some((start, Err(e), 1))
264 }
265 }
266 }
267 #[inline]
268 fn size_hint(&self) -> (usize,Option<usize>) {
269 let bytes = self.slice.len() - self.index;
270 // Cannot be exact, so KISS and don't bother rounding up.
271 // The slice is unlikely be full of 4-byte codepoints, so buffers
272 // allocated with the lower bound will have to be grown anyway.
273 (bytes/4, Some(bytes))
274 }
275}
276impl<'a> DoubleEndedIterator for Utf8CharDecoder<'a> {
277 fn next_back(&mut self) -> Option<Self::Item> {
278 if self.index < self.slice.len() {
279 let extras = self.slice.iter()
280 .rev()
281 .take_while(|&b| b & 0b1100_0000 == 0b1000_0000 )
282 .count();
283 let starts = self.slice.len() - (extras+1);
284 match Utf8Char::from_slice_start(&self.slice[starts..]) {
285 Ok((u8c,len)) if len == 1+extras => {
286 self.slice = &self.slice[..starts];
287 Some((starts, Ok(u8c), len))
288 },
289 // This enures errors for every byte in both directions,
290 // but means overlong and codepoint errors will be turned into
291 // tooshort errors.
292 Err(e) if extras == 0 => {
293 self.slice = &self.slice[..self.slice.len()-1];
294 Some((self.slice.len()-1, Err(e), 1))
295 },
296 _ => {
297 self.slice = &self.slice[..self.slice.len()-1];
298 Some((self.slice.len()-1, Err(Utf8(FirstByte(ContinuationByte))), 1))
299 },
300 }
301 } else {
302 None
303 }
304 }
305}
306impl<'a> Debug for Utf8CharDecoder<'a> {
307 fn fmt(&self, fmtr: &mut fmt::Formatter) -> fmt::Result {
308 write!(fmtr, "Utf8CharDecoder {{ bytes[{}..]: {:?} }}", self.index, self.as_slice())
309 }
310}
311
312
313
/// Iterator adaptor that merges the `u16`s of another iterator into `Utf16Char`s.
///
/// Each item is a `Result`, because the units are validated while decoding.
///
/// See [`IterExt::to_utf16chars()`](../trait.IterExt.html#tymethod.to_utf16chars)
/// for examples and error handling.
#[derive(Clone, Default)]
pub struct Utf16CharMerger<B, I>
where
    B: Borrow<u16>,
    I: Iterator<Item=B>,
{
    /// The wrapped source of units.
    iter: I,
    /// A unit that was read where a trailing surrogate was expected but
    /// turned out not to be one. It can be any value, and is decoded first
    /// by the next call to `next()`.
    prev: Option<B>,
}
324impl<B:Borrow<u16>, I:Iterator<Item=B>, T:IntoIterator<IntoIter=I,Item=B>>
325From<T> for Utf16CharMerger<B,I> {
326 fn from(t: T) -> Self {
327 Utf16CharMerger { iter: t.into_iter(), prev: None }
328 }
329}
330impl<B:Borrow<u16>, I:Iterator<Item=B>> Utf16CharMerger<B,I> {
331 /// Extract the inner iterator.
332 ///
333 /// If the last item produced was an `Err`, the first unit might be missing.
334 ///
335 /// # Examples
336 ///
337 /// Unit right after an error missing
338 /// ```
339 /// # use encode_unicode::IterExt;
340 /// # use encode_unicode::error::Utf16PairError;
341 /// let mut merger = [0xd901, 'F' as u16, 'S' as u16].iter().to_utf16chars();
342 /// assert_eq!(merger.next(), Some(Err(Utf16PairError::UnmatchedLeadingSurrogate)));
343 /// let mut inner: std::slice::Iter<u16> = merger.into_inner();
344 /// assert_eq!(inner.next(), Some('S' as u16).as_ref()); // 'F' was consumed by Utf16CharMerger
345 /// ```
346 ///
347 /// Error that doesn't swallow any units
348 /// ```
349 /// # use encode_unicode::IterExt;
350 /// # use encode_unicode::error::Utf16PairError;
351 /// let mut merger = [0xde00, 'F' as u16, 'S' as u16].iter().to_utf16chars();
352 /// assert_eq!(merger.next(), Some(Err(Utf16PairError::UnexpectedTrailingSurrogate)));
353 /// let mut inner: std::slice::Iter<u16> = merger.into_inner();
354 /// assert_eq!(inner.next(), Some('F' as u16).as_ref()); // not consumed
355 /// ```
356 pub fn into_inner(self) -> I {
357 self.iter
358 }
359 /// Returns an iterator over the remaining units.
360 /// Unlike `into_inner()` this will never drop any units.
361 ///
362 /// The exact type of the returned iterator should not be depended on.
363 ///
364 /// # Examples
365 ///
366 /// ```
367 /// # use encode_unicode::IterExt;
368 /// # use encode_unicode::error::Utf16PairError;
369 /// let slice = [0xd901, 'F' as u16, 'S' as u16];
370 /// let mut merger = slice.iter().to_utf16chars();
371 /// assert_eq!(merger.next(), Some(Err(Utf16PairError::UnmatchedLeadingSurrogate)));
372 /// let mut remaining = merger.into_remaining_units();
373 /// assert_eq!(remaining.next(), Some('F' as u16).as_ref());
374 /// ```
375 pub fn into_remaining_units(self) -> Chain<option::IntoIter<B>,I> {
376 self.prev.into_iter().chain(self.iter)
377 }
378}
379impl<B:Borrow<u16>, I:Iterator<Item=B>> Iterator for Utf16CharMerger<B,I> {
380 type Item = Result<Utf16Char,Utf16PairError>;
381 fn next(&mut self) -> Option<Self::Item> {
382 let first = self.prev.take().or_else(|| self.iter.next() );
383 first.map(|first| unsafe {
384 match first.borrow().utf16_needs_extra_unit() {
385 Ok(false) => Ok(Utf16Char::from_tuple_unchecked((*first.borrow(), None))),
386 Ok(true) => match self.iter.next() {
387 Some(second) => match second.borrow().utf16_needs_extra_unit() {
388 Err(InvalidUtf16FirstUnit) => Ok(Utf16Char::from_tuple_unchecked((
389 *first.borrow(),
390 Some(*second.borrow())
391 ))),
392 Ok(_) => {
393 self.prev = Some(second);
394 Err(Utf16PairError::UnmatchedLeadingSurrogate)
395 }
396 },
397 None => Err(Utf16PairError::Incomplete)
398 },
399 Err(InvalidUtf16FirstUnit) => Err(Utf16PairError::UnexpectedTrailingSurrogate),
400 }
401 })
402 }
403 fn size_hint(&self) -> (usize,Option<usize>) {
404 let (iter_min, iter_max) = self.iter.size_hint();
405 // cannot be exact, so KISS
406 let min = iter_min / 2; // don't bother rounding up or accounting for self.prev
407 let max = match (iter_max, &self.prev) {
408 (Some(max), &Some(_)) => max.checked_add(1),
409 (max, _) => max,
410 };
411 (min, max)
412 }
413}
414impl<B:Borrow<u16>, I:Iterator<Item=B>+Debug> Debug for Utf16CharMerger<B,I> {
415 fn fmt(&self, fmtr: &mut fmt::Formatter) -> fmt::Result {
416 fmtr.debug_struct("Utf16CharMerger")
417 .field("buffered", &self.prev.as_ref().map(|b| *b.borrow() ))
418 .field("inner", &self.iter)
419 .finish()
420 }
421}
422
423
/// Slice-only counterpart of [`Utf16CharMerger`](struct.Utf16CharMerger.html)
/// which yields the offset and length of every item in addition to the
/// decoding result.
///
/// See [`SliceExt::utf16char_indices()`](../trait.SliceExt.html#tymethod.utf16char_indices)
/// for examples and error handling.
#[derive(Clone, Default)]
pub struct Utf16CharDecoder<'a> {
    /// The units being decoded.
    slice: &'a [u16],
    /// Offset into `slice` of the next unit to decode.
    index: usize,
}
434impl<'a> From<&'a[u16]> for Utf16CharDecoder<'a> {
435 fn from(s: &'a[u16]) -> Self {
436 Utf16CharDecoder{ slice: s, index: 0 }
437 }
438}
439impl<'a> Utf16CharDecoder<'a> {
440 /// Extract the remainder of the source slice.
441 ///
442 /// # Examples
443 ///
444 /// Unlike `Utf16CharMerger::into_inner()`, the unit after an error is never swallowed:
445 /// ```
446 /// # use encode_unicode::SliceExt;
447 /// # use encode_unicode::error::Utf16PairError;
448 /// let mut iter = [0xd901, 'F' as u16, 'S' as u16].utf16char_indices();
449 /// assert_eq!(iter.next(), Some((0, Err(Utf16PairError::UnmatchedLeadingSurrogate), 1)));
450 /// assert_eq!(iter.as_slice(), &['F' as u16, 'S' as u16]);
451 /// ```
452 pub fn as_slice(&self) -> &[u16] {
453 &self.slice[self.index..]
454 }
455}
456impl<'a> Iterator for Utf16CharDecoder<'a> {
457 type Item = (usize,Result<Utf16Char,Utf16PairError>,usize);
458 #[inline]
459 fn next(&mut self) -> Option<Self::Item> {
460 let start = self.index;
461 match Utf16Char::from_slice_start(self.as_slice()) {
462 Ok((u16c,len)) => {
463 self.index += len;
464 Some((start, Ok(u16c), len))
465 },
466 Err(EmptySlice) => None,
467 Err(FirstLowSurrogate) => {
468 self.index += 1;
469 Some((start, Err(UnexpectedTrailingSurrogate), 1))
470 },
471 Err(SecondNotLowSurrogate) => {
472 self.index += 1;
473 Some((start, Err(UnmatchedLeadingSurrogate), 1))
474 },
475 Err(MissingSecond) => {
476 self.index = self.slice.len();
477 Some((start, Err(Incomplete), 1))
478 }
479 }
480 }
481 #[inline]
482 fn size_hint(&self) -> (usize,Option<usize>) {
483 let units = self.slice.len() - self.index;
484 // Cannot be exact, so KISS and don't bother rounding up.
485 // The slice is unlikely be full of surrogate pairs, so buffers
486 // allocated with the lower bound will have to be grown anyway.
487 (units/2, Some(units))
488 }
489}
490impl<'a> Debug for Utf16CharDecoder<'a> {
491 fn fmt(&self, fmtr: &mut fmt::Formatter) -> fmt::Result {
492 write!(fmtr, "Utf16CharDecoder {{ units[{}..]: {:?} }}", self.index, self.as_slice())
493 }
494}