bstr/
utf8.rs

Help
1use core::{char, cmp, fmt, str};
2
3#[cfg(feature = "std")]
4use std::error;
5
6use crate::{ascii, bstr::BStr, ext_slice::ByteSlice};
7
8// The UTF-8 decoder provided here is based on the one presented here:
9// https://bjoern.hoehrmann.de/utf-8/decoder/dfa/
10//
11// We *could* have done UTF-8 decoding by using a DFA generated by `\p{any}`
12// using regex-automata that is roughly the same size. The real benefit of
13// Hoehrmann's formulation is that the byte class mapping below is manually
14// tailored such that each byte's class doubles as a shift to mask out the
15// bits necessary for constructing the leading bits of each codepoint value
16// from the initial byte.
17//
18// There are some minor differences between this implementation and Hoehrmann's
19// formulation.
20//
21// Firstly, we make REJECT have state ID 0, since it makes the state table
22// itself a little easier to read and is consistent with the notion that 0
23// means "false" or "bad."
24//
25// Secondly, when doing bulk decoding, we add a SIMD accelerated ASCII fast
26// path.
27//
28// Thirdly, we pre-multiply the state IDs to avoid a multiplication instruction
29// in the core decoding loop. (Which is what regex-automata would do by
30// default.)
31//
32// Fourthly, we split the byte class mapping and transition table into two
33// arrays because it's clearer.
34//
35// It is unlikely that this is the fastest way to do UTF-8 decoding, however,
36// it is fairly simple.
37
// State IDs are pre-multiplied by the number of byte classes (12), so each ID
// is directly the offset of that state's row in `STATES_FORWARD`.

/// The state reached after a complete, valid codepoint. The decoders in this
/// module also use it as their start state.
const ACCEPT: usize = 12;
/// The dead state. Every transition out of it leads back to itself.
const REJECT: usize = 0;
40
/// Maps every byte value to one of 12 equivalence classes (`0..=11`).
///
/// The class selects the column of `STATES_FORWARD`, and for leading bytes it
/// is also used by `decode_step` as a shift (`0xFF >> class`) that masks the
/// payload bits out of the byte.
///
/// Classes as laid out in the table below:
///
/// * `0`: `0x00..=0x7F`
/// * `1`: `0x80..=0x8F`
/// * `9`: `0x90..=0x9F`
/// * `7`: `0xA0..=0xBF`
/// * `8`: `0xC0`, `0xC1` and `0xF5..=0xFF` (rejected from every state)
/// * `2`: `0xC2..=0xDF`
/// * `10`: `0xE0`
/// * `3`: `0xE1..=0xEC`, `0xEE` and `0xEF`
/// * `4`: `0xED`
/// * `11`: `0xF0`
/// * `6`: `0xF1..=0xF3`
/// * `5`: `0xF4`
///
/// SAFETY: The decode function below relies on the correctness of these
/// equivalence classes.
#[cfg_attr(rustfmt, rustfmt::skip)]
const CLASSES: [u8; 256] = [
   0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 0x00..=0x1F
   0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 0x20..=0x3F
   0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 0x40..=0x5F
   0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 0x60..=0x7F
   1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,  9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, // 0x80..=0x9F
   7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,  7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, // 0xA0..=0xBF
   8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2,  2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, // 0xC0..=0xDF
  10,3,3,3,3,3,3,3,3,3,3,3,3,4,3,3, 11,6,6,6,5,8,8,8,8,8,8,8,8,8,8,8, // 0xE0..=0xFF
];
54
/// The forward transition table of the UTF-8 automaton.
///
/// The table has 9 rows of 12 entries each (108 entries in total). A row is
/// selected by a pre-multiplied state ID (`0`, `12`, ..., `96`) and a column
/// by a byte class from `CLASSES`, so the next state is found at index
/// `state + class`. Every entry is itself a pre-multiplied state ID.
///
/// SAFETY: The decode function below relies on the correctness of this state
/// machine.
#[cfg_attr(rustfmt, rustfmt::skip)]
const STATES_FORWARD: &'static [u8] = &[
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,            //  0: REJECT (dead state)
  12, 0, 24, 36, 60, 96, 84, 0, 0, 0, 48, 72,    // 12: ACCEPT (start state)
  0, 12, 0, 0, 0, 0, 0, 12, 0, 12, 0, 0,         // 24: one continuation byte remaining
  0, 24, 0, 0, 0, 0, 0, 24, 0, 24, 0, 0,         // 36: two continuation bytes remaining
  0, 0, 0, 0, 0, 0, 0, 24, 0, 0, 0, 0,           // 48: after 0xE0 (next must be 0xA0..=0xBF)
  0, 24, 0, 0, 0, 0, 0, 0, 0, 24, 0, 0,          // 60: after 0xED (next must be 0x80..=0x9F)
  0, 0, 0, 0, 0, 0, 0, 36, 0, 36, 0, 0,          // 72: after 0xF0 (next must be 0x90..=0xBF)
  0, 36, 0, 0, 0, 0, 0, 36, 0, 36, 0, 0,         // 84: after 0xF1..=0xF3
  0, 36, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,           // 96: after 0xF4 (next must be 0x80..=0x8F)
];
69
/// An iterator over Unicode scalar values in a byte string.
///
/// When invalid UTF-8 byte sequences are found, they are substituted with the
/// Unicode replacement codepoint (`U+FFFD`) using the
/// ["maximal subpart" strategy](https://www.unicode.org/review/pr-121.html).
///
/// This iterator is created by the
/// [`chars`](trait.ByteSlice.html#method.chars) method provided by the
/// [`ByteSlice`](trait.ByteSlice.html) extension trait for `&[u8]`.
#[derive(Clone, Debug)]
pub struct Chars<'a> {
    // The bytes that have not been yielded yet. Forward iteration trims the
    // front of this slice and reverse iteration trims the back.
    bs: &'a [u8],
}
83
84impl<'a> Chars<'a> {
85    pub(crate) fn new(bs: &'a [u8]) -> Chars<'a> {
86        Chars { bs }
87    }
88
89    /// View the underlying data as a subslice of the original data.
90    ///
91    /// The slice returned has the same lifetime as the original slice, and so
92    /// the iterator can continue to be used while this exists.
93    ///
94    /// # Examples
95    ///
96    /// ```
97    /// use bstr::ByteSlice;
98    ///
99    /// let mut chars = b"abc".chars();
100    ///
101    /// assert_eq!(b"abc", chars.as_bytes());
102    /// chars.next();
103    /// assert_eq!(b"bc", chars.as_bytes());
104    /// chars.next();
105    /// chars.next();
106    /// assert_eq!(b"", chars.as_bytes());
107    /// ```
108    #[inline]
109    pub fn as_bytes(&self) -> &'a [u8] {
110        self.bs
111    }
112}
113
114impl<'a> Iterator for Chars<'a> {
115    type Item = char;
116
117    #[inline]
118    fn next(&mut self) -> Option<char> {
119        let (ch, size) = decode_lossy(self.bs);
120        if size == 0 {
121            return None;
122        }
123        self.bs = &self.bs[size..];
124        Some(ch)
125    }
126}
127
128impl<'a> DoubleEndedIterator for Chars<'a> {
129    #[inline]
130    fn next_back(&mut self) -> Option<char> {
131        let (ch, size) = decode_last_lossy(self.bs);
132        if size == 0 {
133            return None;
134        }
135        self.bs = &self.bs[..self.bs.len() - size];
136        Some(ch)
137    }
138}
139
/// An iterator over Unicode scalar values in a byte string and their
/// byte index positions.
///
/// When invalid UTF-8 byte sequences are found, they are substituted with the
/// Unicode replacement codepoint (`U+FFFD`) using the
/// ["maximal subpart" strategy](https://www.unicode.org/review/pr-121.html).
///
/// Note that this is slightly different from the `CharIndices` iterator
/// provided by the standard library. Aside from working on possibly invalid
/// UTF-8, this iterator provides both the corresponding starting and ending
/// byte indices of each codepoint yielded. The ending position is necessary to
/// slice the original byte string when invalid UTF-8 bytes are converted into
/// a Unicode replacement codepoint, since a single replacement codepoint can
/// substitute anywhere from 1 to 3 invalid bytes (inclusive).
///
/// This iterator is created by the
/// [`char_indices`](trait.ByteSlice.html#method.char_indices) method provided
/// by the [`ByteSlice`](trait.ByteSlice.html) extension trait for `&[u8]`.
#[derive(Clone, Debug)]
pub struct CharIndices<'a> {
    // The bytes that have not been yielded yet from either end.
    bs: &'a [u8],
    // Offset, in the original byte string, of the first byte of `bs`. It
    // starts at 0 and grows as codepoints are yielded from the front.
    forward_index: usize,
    // Offset, in the original byte string, one past the last byte of `bs`.
    // It starts at the original length and shrinks as codepoints are yielded
    // from the back.
    reverse_index: usize,
}
164
165impl<'a> CharIndices<'a> {
166    pub(crate) fn new(bs: &'a [u8]) -> CharIndices<'a> {
167        CharIndices { bs, forward_index: 0, reverse_index: bs.len() }
168    }
169
170    /// View the underlying data as a subslice of the original data.
171    ///
172    /// The slice returned has the same lifetime as the original slice, and so
173    /// the iterator can continue to be used while this exists.
174    ///
175    /// # Examples
176    ///
177    /// ```
178    /// use bstr::ByteSlice;
179    ///
180    /// let mut it = b"abc".char_indices();
181    ///
182    /// assert_eq!(b"abc", it.as_bytes());
183    /// it.next();
184    /// assert_eq!(b"bc", it.as_bytes());
185    /// it.next();
186    /// it.next();
187    /// assert_eq!(b"", it.as_bytes());
188    /// ```
189    #[inline]
190    pub fn as_bytes(&self) -> &'a [u8] {
191        self.bs
192    }
193}
194
195impl<'a> Iterator for CharIndices<'a> {
196    type Item = (usize, usize, char);
197
198    #[inline]
199    fn next(&mut self) -> Option<(usize, usize, char)> {
200        let index = self.forward_index;
201        let (ch, size) = decode_lossy(self.bs);
202        if size == 0 {
203            return None;
204        }
205        self.bs = &self.bs[size..];
206        self.forward_index += size;
207        Some((index, index + size, ch))
208    }
209}
210
211impl<'a> DoubleEndedIterator for CharIndices<'a> {
212    #[inline]
213    fn next_back(&mut self) -> Option<(usize, usize, char)> {
214        let (ch, size) = decode_last_lossy(self.bs);
215        if size == 0 {
216            return None;
217        }
218        self.bs = &self.bs[..self.bs.len() - size];
219        self.reverse_index -= size;
220        Some((self.reverse_index, self.reverse_index + size, ch))
221    }
222}
223
// `next` and `next_back` return `None` only when the decoder reports a length
// of zero, and in that case they leave `bs` untouched, so the same `None` is
// produced on every later call.
impl<'a> ::core::iter::FusedIterator for CharIndices<'a> {}
225
/// An iterator over chunks of valid UTF-8 in a byte slice.
///
/// Each item pairs a (possibly empty) run of valid UTF-8 with the invalid
/// bytes that immediately follow it.
///
/// See [`utf8_chunks`](trait.ByteSlice.html#method.utf8_chunks).
#[derive(Clone, Debug)]
pub struct Utf8Chunks<'a> {
    // The bytes that have not been split into chunks yet. Iteration ends once
    // this is empty.
    pub(super) bytes: &'a [u8],
}
233
/// A chunk of valid UTF-8, possibly followed by invalid UTF-8 bytes.
///
/// This is yielded by the
/// [`Utf8Chunks`](struct.Utf8Chunks.html)
/// iterator, which can be created via the
/// [`ByteSlice::utf8_chunks`](trait.ByteSlice.html#method.utf8_chunks)
/// method.
///
/// The `'a` lifetime parameter corresponds to the lifetime of the bytes that
/// are being iterated over.
#[cfg_attr(test, derive(Debug, PartialEq))]
pub struct Utf8Chunk<'a> {
    /// A valid UTF-8 piece, at the start, end, or between invalid UTF-8 bytes.
    ///
    /// This is empty between adjacent invalid UTF-8 byte sequences.
    valid: &'a str,
    /// A sequence of invalid UTF-8 bytes that directly follows `valid`.
    ///
    /// Can only be empty in the last chunk.
    ///
    /// Should be replaced by a single Unicode replacement character, if not
    /// empty.
    invalid: &'a BStr,
    /// Indicates whether the invalid sequence could've been valid if there
    /// were more bytes.
    ///
    /// Can only be true in the last chunk.
    incomplete: bool,
}
263
// Read-only accessors. Each returns the corresponding field unchanged.
impl<'a> Utf8Chunk<'a> {
    /// Returns the (possibly empty) valid UTF-8 bytes in this chunk.
    ///
    /// This may be empty if there are consecutive sequences of invalid UTF-8
    /// bytes.
    #[inline]
    pub fn valid(&self) -> &'a str {
        self.valid
    }

    /// Returns the (possibly empty) invalid UTF-8 bytes in this chunk that
    /// immediately follow the valid UTF-8 bytes in this chunk.
    ///
    /// This is only empty when this chunk corresponds to the last chunk in
    /// the original bytes.
    ///
    /// The maximum length of this slice is 3. That is, invalid UTF-8 byte
    /// sequences longer than 1 byte always correspond to a valid _prefix_ of
    /// a valid UTF-8 encoded codepoint. This corresponds to the "substitution
    /// of maximal subparts" strategy that is described in more detail in the
    /// docs for the
    /// [`ByteSlice::to_str_lossy`](trait.ByteSlice.html#method.to_str_lossy)
    /// method.
    #[inline]
    pub fn invalid(&self) -> &'a [u8] {
        self.invalid.as_bytes()
    }

    /// Returns whether the invalid sequence might still become valid if more
    /// bytes are added.
    ///
    /// Returns true if the end of the input was reached unexpectedly,
    /// without encountering an unexpected byte.
    ///
    /// This can only be the case for the last chunk.
    #[inline]
    pub fn incomplete(&self) -> bool {
        self.incomplete
    }
}
304
305impl<'a> Iterator for Utf8Chunks<'a> {
306    type Item = Utf8Chunk<'a>;
307
308    #[inline]
309    fn next(&mut self) -> Option<Utf8Chunk<'a>> {
310        if self.bytes.is_empty() {
311            return None;
312        }
313        match validate(self.bytes) {
314            Ok(()) => {
315                let valid = self.bytes;
316                self.bytes = &[];
317                Some(Utf8Chunk {
318                    // SAFETY: This is safe because of the guarantees provided
319                    // by utf8::validate.
320                    valid: unsafe { str::from_utf8_unchecked(valid) },
321                    invalid: [].as_bstr(),
322                    incomplete: false,
323                })
324            }
325            Err(e) => {
326                let (valid, rest) = self.bytes.split_at(e.valid_up_to());
327                // SAFETY: This is safe because of the guarantees provided by
328                // utf8::validate.
329                let valid = unsafe { str::from_utf8_unchecked(valid) };
330                let (invalid_len, incomplete) = match e.error_len() {
331                    Some(n) => (n, false),
332                    None => (rest.len(), true),
333                };
334                let (invalid, rest) = rest.split_at(invalid_len);
335                self.bytes = rest;
336                Some(Utf8Chunk {
337                    valid,
338                    invalid: invalid.as_bstr(),
339                    incomplete,
340                })
341            }
342        }
343    }
344
345    #[inline]
346    fn size_hint(&self) -> (usize, Option<usize>) {
347        if self.bytes.is_empty() {
348            (0, Some(0))
349        } else {
350            (1, Some(self.bytes.len()))
351        }
352    }
353}
354
// `next` returns `None` only when `bytes` is empty and does not modify
// `bytes` in that case, so every later call returns `None` as well.
impl<'a> ::core::iter::FusedIterator for Utf8Chunks<'a> {}
356
/// An error that occurs when UTF-8 decoding fails.
///
/// This error occurs when attempting to convert a non-UTF-8 byte
/// string to a Rust string that must be valid UTF-8. For example,
/// [`to_str`](trait.ByteSlice.html#method.to_str) is one such method.
///
/// # Example
///
/// This example shows what happens when a given byte sequence is invalid,
/// but ends with a sequence that is a possible prefix of valid UTF-8.
///
/// ```
/// use bstr::{B, ByteSlice};
///
/// let s = B(b"foobar\xF1\x80\x80");
/// let err = s.to_str().unwrap_err();
/// assert_eq!(err.valid_up_to(), 6);
/// assert_eq!(err.error_len(), None);
/// ```
///
/// This example shows what happens when a given byte sequence contains
/// invalid UTF-8.
///
/// ```
/// use bstr::ByteSlice;
///
/// let s = b"foobar\xF1\x80\x80quux";
/// let err = s.to_str().unwrap_err();
/// assert_eq!(err.valid_up_to(), 6);
/// // The error length reports the maximum number of bytes that correspond to
/// // a valid prefix of a UTF-8 encoded codepoint.
/// assert_eq!(err.error_len(), Some(3));
///
/// // In contrast to the above which contains a single invalid prefix,
/// // consider the case of multiple individual bytes that are never valid
/// // prefixes. Note how the value of error_len changes!
/// let s = b"foobar\xFF\xFFquux";
/// let err = s.to_str().unwrap_err();
/// assert_eq!(err.valid_up_to(), 6);
/// assert_eq!(err.error_len(), Some(1));
///
/// // The fact that it's an invalid prefix does not change error_len even
/// // when it immediately precedes the end of the string.
/// let s = b"foobar\xFF";
/// let err = s.to_str().unwrap_err();
/// assert_eq!(err.valid_up_to(), 6);
/// assert_eq!(err.error_len(), Some(1));
/// ```
#[derive(Clone, Debug, Eq, PartialEq)]
pub struct Utf8Error {
    // Byte offset just past the last complete, valid codepoint.
    valid_up_to: usize,
    // Number of invalid bytes that follow `valid_up_to`, or `None` when the
    // input ended before the sequence starting there could be completed.
    error_len: Option<usize>,
}
410
impl Utf8Error {
    /// Returns the byte index of the position immediately following the last
    /// valid UTF-8 byte.
    ///
    /// # Example
    ///
    /// This example shows how `valid_up_to` can be used to retrieve a
    /// possibly empty prefix that is guaranteed to be valid UTF-8:
    ///
    /// ```
    /// use bstr::ByteSlice;
    ///
    /// let s = b"foobar\xF1\x80\x80quux";
    /// let err = s.to_str().unwrap_err();
    ///
    /// // This is guaranteed to never panic.
    /// let string = s[..err.valid_up_to()].to_str().unwrap();
    /// assert_eq!(string, "foobar");
    /// ```
    #[inline]
    pub fn valid_up_to(&self) -> usize {
        self.valid_up_to
    }

    /// Returns the total number of invalid UTF-8 bytes immediately following
    /// the position returned by `valid_up_to`. This value is always at least
    /// `1`, but can be up to `3` if the bytes form a valid prefix of some
    /// UTF-8 encoded codepoint.
    ///
    /// If the end of the original input was found before a valid UTF-8 encoded
    /// codepoint could be completed, then this returns `None`. This is useful
    /// when processing streams, where a `None` value signals that more input
    /// might be needed.
    #[inline]
    pub fn error_len(&self) -> Option<usize> {
        self.error_len
    }
}
449
// Only available with the `std` feature, since this uses `std::error`.
#[cfg(feature = "std")]
impl error::Error for Utf8Error {
    // A fixed summary that carries no position information. The `Display`
    // impl is the one that reports the byte offset of the failure.
    fn description(&self) -> &str {
        "invalid UTF-8"
    }
}
456
457impl fmt::Display for Utf8Error {
458    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
459        write!(f, "invalid UTF-8 found at byte offset {}", self.valid_up_to)
460    }
461}
462
/// Returns OK if and only if the given slice is completely valid UTF-8.
///
/// If the slice isn't valid UTF-8, then an error is returned that explains
/// the first location at which invalid UTF-8 was detected.
///
/// The work is split across nested helpers: `fast` scans the whole input
/// without tracking positions, and on failure `find_valid_up_to` re-runs
/// `slow` over a small window to compute the error details.
pub fn validate(slice: &[u8]) -> Result<(), Utf8Error> {
    // The fast path for validating UTF-8. It steps through a UTF-8 automaton
    // and uses a SIMD accelerated ASCII fast path on x86_64. If an error is
    // detected, it backs up and runs the slower version of the UTF-8 automaton
    // to determine correct error information.
    fn fast(slice: &[u8]) -> Result<(), Utf8Error> {
        let mut state = ACCEPT;
        let mut i = 0;

        while i < slice.len() {
            let b = slice[i];

            // ASCII fast path. If we see two consecutive ASCII bytes, then try
            // to validate as much ASCII as possible very quickly.
            if state == ACCEPT
                && b <= 0x7F
                && slice.get(i + 1).map_or(false, |&b| b <= 0x7F)
            {
                // Skip the whole ASCII run. The automaton state is still
                // ACCEPT afterwards, so it does not need to be stepped.
                i += ascii::first_non_ascii_byte(&slice[i..]);
                continue;
            }

            state = step(state, b);
            if state == REJECT {
                // `i` is the index of the byte that caused the rejection.
                return Err(find_valid_up_to(slice, i));
            }
            i += 1;
        }
        // Ending in any state other than ACCEPT means the input stopped in
        // the middle of a multi-byte sequence.
        if state != ACCEPT {
            Err(find_valid_up_to(slice, slice.len()))
        } else {
            Ok(())
        }
    }

    // Given the first position at which a UTF-8 sequence was determined to be
    // invalid, return an error that correctly reports the position at which
    // the last complete UTF-8 sequence ends.
    #[inline(never)]
    fn find_valid_up_to(slice: &[u8], rejected_at: usize) -> Utf8Error {
        // In order to find the last valid byte, we need to back up an amount
        // that guarantees every preceding byte is part of a valid UTF-8
        // code unit sequence. To do this, we simply locate the last leading
        // byte that occurs before rejected_at.
        let mut backup = rejected_at.saturating_sub(1);
        while backup > 0 && !is_leading_or_invalid_utf8_byte(slice[backup]) {
            backup -= 1;
        }
        // The window ends one past the rejected byte so that `slow` gets to
        // see it. It is clamped to the slice length for the case where the
        // input simply ended too early.
        let upto = cmp::min(slice.len(), rejected_at.saturating_add(1));
        // `slow` is expected to fail on this window, since `fast` already
        // observed a rejection or a truncated sequence inside it.
        let mut err = slow(&slice[backup..upto]).unwrap_err();
        // Translate the window-relative offset back to an offset into `slice`.
        err.valid_up_to += backup;
        err
    }

    // Like top-level UTF-8 decoding, except it correctly reports a UTF-8 error
    // when an invalid sequence is found. This is split out from validate so
    // that the fast path doesn't need to keep track of the position of the
    // last valid UTF-8 byte. In particular, tracking this requires checking
    // for an ACCEPT state on each byte, which degrades throughput pretty
    // badly.
    fn slow(slice: &[u8]) -> Result<(), Utf8Error> {
        let mut state = ACCEPT;
        let mut valid_up_to = 0;
        for (i, &b) in slice.iter().enumerate() {
            state = step(state, b);
            if state == ACCEPT {
                // A codepoint was completed by the byte at `i`.
                valid_up_to = i + 1;
            } else if state == REJECT {
                // Our error length must always be at least 1.
                let error_len = Some(cmp::max(1, i - valid_up_to));
                return Err(Utf8Error { valid_up_to, error_len });
            }
        }
        if state != ACCEPT {
            // The input ended mid-sequence: report an unknown error length.
            Err(Utf8Error { valid_up_to, error_len: None })
        } else {
            Ok(())
        }
    }

    // Advance to the next state given the current state and current byte.
    fn step(state: usize, b: u8) -> usize {
        let class = CLASSES[b as usize];
        // SAFETY: This is safe because 'class' is always <=11 and 'state' is
        // always <=96. Therefore, the maximal index is 96+11 = 107, where
        // STATES_FORWARD.len() = 108 such that every index is guaranteed to be
        // valid by construction of the state machine and the byte equivalence
        // classes.
        unsafe {
            *STATES_FORWARD.get_unchecked(state + class as usize) as usize
        }
    }

    fast(slice)
}
562
563/// UTF-8 decode a single Unicode scalar value from the beginning of a slice.
564///
565/// When successful, the corresponding Unicode scalar value is returned along
566/// with the number of bytes it was encoded with. The number of bytes consumed
567/// for a successful decode is always between 1 and 4, inclusive.
568///
569/// When unsuccessful, `None` is returned along with the number of bytes that
570/// make up a maximal prefix of a valid UTF-8 code unit sequence. In this case,
571/// the number of bytes consumed is always between 0 and 3, inclusive, where
572/// 0 is only returned when `slice` is empty.
573///
574/// # Examples
575///
576/// Basic usage:
577///
578/// ```
579/// use bstr::decode_utf8;
580///
581/// // Decoding a valid codepoint.
582/// let (ch, size) = decode_utf8(b"\xE2\x98\x83");
583/// assert_eq!(Some('☃'), ch);
584/// assert_eq!(3, size);
585///
586/// // Decoding an incomplete codepoint.
587/// let (ch, size) = decode_utf8(b"\xE2\x98");
588/// assert_eq!(None, ch);
589/// assert_eq!(2, size);
590/// ```
591///
592/// This example shows how to iterate over all codepoints in UTF-8 encoded
593/// bytes, while replacing invalid UTF-8 sequences with the replacement
594/// codepoint:
595///
596/// ```
597/// use bstr::{B, decode_utf8};
598///
599/// let mut bytes = B(b"\xE2\x98\x83\xFF\xF0\x9D\x9E\x83\xE2\x98\x61");
600/// let mut chars = vec![];
601/// while !bytes.is_empty() {
602///     let (ch, size) = decode_utf8(bytes);
603///     bytes = &bytes[size..];
604///     chars.push(ch.unwrap_or('\u{FFFD}'));
605/// }
606/// assert_eq!(vec!['☃', '\u{FFFD}', '𝞃', '\u{FFFD}', 'a'], chars);
607/// ```
608#[inline]
609pub fn decode<B: AsRef<[u8]>>(slice: B) -> (Option<char>, usize) {
610    let slice = slice.as_ref();
611    match slice.get(0) {
612        None => return (None, 0),
613        Some(&b) if b <= 0x7F => return (Some(b as char), 1),
614        _ => {}
615    }
616
617    let (mut state, mut cp, mut i) = (ACCEPT, 0, 0);
618    while i < slice.len() {
619        decode_step(&mut state, &mut cp, slice[i]);
620        i += 1;
621
622        if state == ACCEPT {
623            // SAFETY: This is safe because `decode_step` guarantees that
624            // `cp` is a valid Unicode scalar value in an ACCEPT state.
625            let ch = unsafe { char::from_u32_unchecked(cp) };
626            return (Some(ch), i);
627        } else if state == REJECT {
628            // At this point, we always want to advance at least one byte.
629            return (None, cmp::max(1, i.saturating_sub(1)));
630        }
631    }
632    (None, i)
633}
634
635/// Lossily UTF-8 decode a single Unicode scalar value from the beginning of a
636/// slice.
637///
638/// When successful, the corresponding Unicode scalar value is returned along
639/// with the number of bytes it was encoded with. The number of bytes consumed
640/// for a successful decode is always between 1 and 4, inclusive.
641///
642/// When unsuccessful, the Unicode replacement codepoint (`U+FFFD`) is returned
643/// along with the number of bytes that make up a maximal prefix of a valid
644/// UTF-8 code unit sequence. In this case, the number of bytes consumed is
645/// always between 0 and 3, inclusive, where 0 is only returned when `slice` is
646/// empty.
647///
648/// # Examples
649///
650/// Basic usage:
651///
652/// ```ignore
653/// use bstr::decode_utf8_lossy;
654///
655/// // Decoding a valid codepoint.
656/// let (ch, size) = decode_utf8_lossy(b"\xE2\x98\x83");
657/// assert_eq!('☃', ch);
658/// assert_eq!(3, size);
659///
660/// // Decoding an incomplete codepoint.
661/// let (ch, size) = decode_utf8_lossy(b"\xE2\x98");
662/// assert_eq!('\u{FFFD}', ch);
663/// assert_eq!(2, size);
664/// ```
665///
666/// This example shows how to iterate over all codepoints in UTF-8 encoded
667/// bytes, while replacing invalid UTF-8 sequences with the replacement
668/// codepoint:
669///
670/// ```ignore
671/// use bstr::{B, decode_utf8_lossy};
672///
673/// let mut bytes = B(b"\xE2\x98\x83\xFF\xF0\x9D\x9E\x83\xE2\x98\x61");
674/// let mut chars = vec![];
675/// while !bytes.is_empty() {
676///     let (ch, size) = decode_utf8_lossy(bytes);
677///     bytes = &bytes[size..];
678///     chars.push(ch);
679/// }
680/// assert_eq!(vec!['☃', '\u{FFFD}', '𝞃', '\u{FFFD}', 'a'], chars);
681/// ```
682#[inline]
683pub fn decode_lossy<B: AsRef<[u8]>>(slice: B) -> (char, usize) {
684    match decode(slice) {
685        (Some(ch), size) => (ch, size),
686        (None, size) => ('\u{FFFD}', size),
687    }
688}
689
690/// UTF-8 decode a single Unicode scalar value from the end of a slice.
691///
692/// When successful, the corresponding Unicode scalar value is returned along
693/// with the number of bytes it was encoded with. The number of bytes consumed
694/// for a successful decode is always between 1 and 4, inclusive.
695///
696/// When unsuccessful, `None` is returned along with the number of bytes that
697/// make up a maximal prefix of a valid UTF-8 code unit sequence. In this case,
698/// the number of bytes consumed is always between 0 and 3, inclusive, where
699/// 0 is only returned when `slice` is empty.
700///
701/// # Examples
702///
703/// Basic usage:
704///
705/// ```
706/// use bstr::decode_last_utf8;
707///
708/// // Decoding a valid codepoint.
709/// let (ch, size) = decode_last_utf8(b"\xE2\x98\x83");
710/// assert_eq!(Some('☃'), ch);
711/// assert_eq!(3, size);
712///
713/// // Decoding an incomplete codepoint.
714/// let (ch, size) = decode_last_utf8(b"\xE2\x98");
715/// assert_eq!(None, ch);
716/// assert_eq!(2, size);
717/// ```
718///
719/// This example shows how to iterate over all codepoints in UTF-8 encoded
720/// bytes in reverse, while replacing invalid UTF-8 sequences with the
721/// replacement codepoint:
722///
723/// ```
724/// use bstr::{B, decode_last_utf8};
725///
726/// let mut bytes = B(b"\xE2\x98\x83\xFF\xF0\x9D\x9E\x83\xE2\x98\x61");
727/// let mut chars = vec![];
728/// while !bytes.is_empty() {
729///     let (ch, size) = decode_last_utf8(bytes);
730///     bytes = &bytes[..bytes.len()-size];
731///     chars.push(ch.unwrap_or('\u{FFFD}'));
732/// }
733/// assert_eq!(vec!['a', '\u{FFFD}', '𝞃', '\u{FFFD}', '☃'], chars);
734/// ```
735#[inline]
736pub fn decode_last<B: AsRef<[u8]>>(slice: B) -> (Option<char>, usize) {
737    // TODO: We could implement this by reversing the UTF-8 automaton, but for
738    // now, we do it the slow way by using the forward automaton.
739
740    let slice = slice.as_ref();
741    if slice.is_empty() {
742        return (None, 0);
743    }
744    let mut start = slice.len() - 1;
745    let limit = slice.len().saturating_sub(4);
746    while start > limit && !is_leading_or_invalid_utf8_byte(slice[start]) {
747        start -= 1;
748    }
749    let (ch, size) = decode(&slice[start..]);
750    // If we didn't consume all of the bytes, then that means there's at least
751    // one stray byte that never occurs in a valid code unit prefix, so we can
752    // advance by one byte.
753    if start + size != slice.len() {
754        (None, 1)
755    } else {
756        (ch, size)
757    }
758}
759
760/// Lossily UTF-8 decode a single Unicode scalar value from the end of a slice.
761///
762/// When successful, the corresponding Unicode scalar value is returned along
763/// with the number of bytes it was encoded with. The number of bytes consumed
764/// for a successful decode is always between 1 and 4, inclusive.
765///
766/// When unsuccessful, the Unicode replacement codepoint (`U+FFFD`) is returned
767/// along with the number of bytes that make up a maximal prefix of a valid
768/// UTF-8 code unit sequence. In this case, the number of bytes consumed is
769/// always between 0 and 3, inclusive, where 0 is only returned when `slice` is
770/// empty.
771///
772/// # Examples
773///
774/// Basic usage:
775///
776/// ```ignore
777/// use bstr::decode_last_utf8_lossy;
778///
779/// // Decoding a valid codepoint.
780/// let (ch, size) = decode_last_utf8_lossy(b"\xE2\x98\x83");
781/// assert_eq!('☃', ch);
782/// assert_eq!(3, size);
783///
784/// // Decoding an incomplete codepoint.
785/// let (ch, size) = decode_last_utf8_lossy(b"\xE2\x98");
786/// assert_eq!('\u{FFFD}', ch);
787/// assert_eq!(2, size);
788/// ```
789///
790/// This example shows how to iterate over all codepoints in UTF-8 encoded
791/// bytes in reverse, while replacing invalid UTF-8 sequences with the
792/// replacement codepoint:
793///
794/// ```ignore
795/// use bstr::decode_last_utf8_lossy;
796///
797/// let mut bytes = B(b"\xE2\x98\x83\xFF\xF0\x9D\x9E\x83\xE2\x98\x61");
798/// let mut chars = vec![];
799/// while !bytes.is_empty() {
800///     let (ch, size) = decode_last_utf8_lossy(bytes);
801///     bytes = &bytes[..bytes.len()-size];
802///     chars.push(ch);
803/// }
804/// assert_eq!(vec!['a', '\u{FFFD}', '𝞃', '\u{FFFD}', '☃'], chars);
805/// ```
806#[inline]
807pub fn decode_last_lossy<B: AsRef<[u8]>>(slice: B) -> (char, usize) {
808    match decode_last(slice) {
809        (Some(ch), size) => (ch, size),
810        (None, size) => ('\u{FFFD}', size),
811    }
812}
813
814/// SAFETY: The decode function relies on state being equal to ACCEPT only if
815/// cp is a valid Unicode scalar value.
816#[inline]
817pub fn decode_step(state: &mut usize, cp: &mut u32, b: u8) {
818    let class = CLASSES[b as usize];
819    if *state == ACCEPT {
820        *cp = (0xFF >> class) & (b as u32);
821    } else {
822        *cp = (b as u32 & 0b111111) | (*cp << 6);
823    }
824    *state = STATES_FORWARD[*state + class as usize] as usize;
825}
826
/// Returns true if and only if the given byte is not a UTF-8 continuation
/// byte. That is, it is either a valid leading byte (ASCII included) or a
/// byte that can never appear anywhere in a valid UTF-8 sequence.
fn is_leading_or_invalid_utf8_byte(b: u8) -> bool {
    // Continuation bytes are exactly the bytes whose two most significant
    // bits are `10`. ASCII bytes have a clear top bit, leading bytes of
    // 2/3/4-byte sequences start with `11`, and so does every byte that is
    // never valid in UTF-8:
    //
    //     \xC0 :: 11000000
    //     \xC1 :: 11000001
    //     \xF5 :: 11110101
    //     \xF6 :: 11110110
    //     \xF7 :: 11110111
    //     \xF8 :: 11111000
    //     \xF9 :: 11111001
    //     \xFA :: 11111010
    //     \xFB :: 11111011
    //     \xFC :: 11111100
    //     \xFD :: 11111101
    //     \xFE :: 11111110
    //     \xFF :: 11111111
    let top_two_bits = b >> 6;
    top_two_bits != 0b10
}
852
#[cfg(all(test, feature = "std"))]
mod tests {
    use std::char;

    use crate::{
        ext_slice::{ByteSlice, B},
        tests::LOSSY_TESTS,
        utf8::{self, Utf8Error},
    };

    // Strings consisting only of valid UTF-8, shared by the forward and
    // reverse decoding tests.
    const VALID_TEXTS: &[&str] = &["☃", "☃☃", "αβγδε", "☃⛄⛇", "𝗮𝗯𝗰𝗱𝗲"];

    // Inputs whose *last* codepoint is invalid, paired with the number of
    // trailing bytes the reverse decoders are expected to consume. Shared by
    // the strict and lossy reverse decoding tests.
    const DECODE_LAST_INVALID: &[(&[u8], usize)] = &[
        (b"", 0),
        (b"\xFF", 1),
        (b"\xCE\xF0", 1),
        (b"\xCE", 1),
        (b"\xE2\x98\xF0", 1),
        (b"\xE2\x98", 2),
        (b"\xF0\x9D\x9D\xF0", 1),
        (b"\xF0\x9D\x9D", 3),
        (b"\xF0\x82\x82\xAC", 1),
        (b"\xED\xA0\x80", 1),
        (b"\xED\xA0", 1),
        (b"\xED", 1),
        (b"a\xCE", 1),
        (b"a\xE2\x98", 2),
        (b"a\xF0\x9D\x9C", 3),
    ];

    // Every Unicode scalar value, once encoded by std, must validate.
    #[test]
    #[cfg(not(miri))]
    fn validate_all_codepoints() {
        // `char::from_u32` filters out the surrogate range for us.
        for cp in (0..=0x10FFFF).filter_map(char::from_u32) {
            let mut buf = [0; 4];
            let encoded = cp.encode_utf8(&mut buf);
            assert_eq!(Ok(()), utf8::validate(encoded.as_bytes()));
        }
    }

    // Valid inputs mixing 1, 2, 3 and 4 byte sequences.
    #[test]
    fn validate_multiple_codepoints() {
        let inputs: &[&[u8]] = &[
            b"abc",
            b"a\xE2\x98\x83a",
            b"a\xF0\x9D\x9C\xB7a",
            b"\xE2\x98\x83\xF0\x9D\x9C\xB7",
            b"a\xE2\x98\x83a\xF0\x9D\x9C\xB7a",
            b"\xEF\xBF\xBD\xE2\x98\x83\xEF\xBF\xBD",
        ];
        for &input in inputs {
            assert_eq!(Ok(()), utf8::validate(input), "validate({:?})", input);
        }
    }

    #[test]
    fn validate_errors() {
        // Each case is (valid_up_to, error_len, input). An error_len of None
        // is used for inputs that end in the middle of a sequence.
        let cases: &[(usize, Option<usize>, &[u8])] = &[
            // A single invalid byte: alone, after ASCII, and after a 2, 3
            // and 4 byte sequence.
            (0, Some(1), b"\xFF"),
            (1, Some(1), b"a\xFF"),
            (2, Some(1), b"\xCE\xB2\xFF"),
            (3, Some(1), b"\xE2\x98\x83\xFF"),
            (4, Some(1), b"\xF0\x9D\x9D\xB1\xFF"),
            // Invalid 2, 3 and 4 byte sequences whose prefix (everything but
            // the final byte) is valid.
            (0, Some(1), b"\xCE\xF0"),
            (0, Some(2), b"\xE2\x98\xF0"),
            (0, Some(3), b"\xF0\x9D\x9D\xF0"),
            // An overlong sequence: the codepoint that should be encoded as
            // \xE2\x82\xAC, spelled with 4 bytes instead. Besides rejecting
            // it, valid_up_to must be reported correctly.
            (0, Some(1), b"\xF0\x82\x82\xAC"),
            (1, Some(1), b"a\xF0\x82\x82\xAC"),
            (3, Some(1), b"\xE2\x98\x83\xF0\x82\x82\xAC"),
            // A surrogate codepoint encoded with the UTF-8 scheme.
            (0, Some(1), b"\xED\xA0\x80"),
            (1, Some(1), b"a\xED\xA0\x80"),
            (3, Some(1), b"\xE2\x98\x83\xED\xA0\x80"),
            // An incomplete 2-byte sequence followed by more input.
            (0, Some(1), b"\xCEa"),
            (1, Some(1), b"a\xCEa"),
            (3, Some(1), b"\xE2\x98\x83\xCE\xE2\x98\x83"),
            // An incomplete 3-byte sequence followed by more input.
            (0, Some(2), b"\xE2\x98a"),
            (1, Some(2), b"a\xE2\x98a"),
            (3, Some(2), b"\xE2\x98\x83\xE2\x98\xE2\x98\x83"),
            // An incomplete 4-byte sequence followed by more input.
            (0, Some(3), b"\xF0\x9D\x9Ca"),
            (1, Some(3), b"a\xF0\x9D\x9Ca"),
            (4, Some(3), b"\xF0\x9D\x9C\xB1\xF0\x9D\x9C\xE2\x98\x83"),
            (6, Some(3), b"foobar\xF1\x80\x80quux"),
            // An incomplete 2-byte sequence cut off by the end of input.
            (0, None, b"\xCE"),
            (1, None, b"a\xCE"),
            (3, None, b"\xE2\x98\x83\xCE"),
            // An incomplete 3-byte sequence cut off by the end of input.
            (0, None, b"\xE2\x98"),
            (1, None, b"a\xE2\x98"),
            (3, None, b"\xE2\x98\x83\xE2\x98"),
            // An incomplete 4-byte sequence cut off by the end of input.
            (0, None, b"\xF0\x9D\x9C"),
            (1, None, b"a\xF0\x9D\x9C"),
            (4, None, b"\xF0\x9D\x9C\xB1\xF0\x9D\x9C"),
            // Errors must still be reported correctly after a long run of
            // valid sequences. This exercises the "backup" logic used for
            // locating errors.
            (8, Some(1), b"\xe2\x98\x83\xce\xb2\xe3\x83\x84\xFF"),
        ];
        for &(valid_up_to, error_len, input) in cases {
            assert_eq!(
                Err(Utf8Error { valid_up_to, error_len }),
                utf8::validate(input),
                "validate({:?})",
                input,
            );
        }
    }

    #[test]
    fn decode_valid() {
        // Decodes `text` front to back, one codepoint per call.
        fn decode_all(text: &str) -> Vec<char> {
            let mut rest = text;
            let mut decoded = Vec::new();
            while !rest.is_empty() {
                let (ch, len) = utf8::decode(rest.as_bytes());
                rest = &rest[len..];
                decoded.push(ch.unwrap());
            }
            decoded
        }

        for &text in VALID_TEXTS {
            let expected: Vec<char> = text.chars().collect();
            assert_eq!(expected, decode_all(text));
        }
    }

    #[test]
    fn decode_invalid() {
        // Each case is (input, number of bytes reported as consumed). No
        // input here yields a codepoint.
        let cases: &[(&[u8], usize)] = &[
            (b"", 0),
            (b"\xFF", 1),
            (b"\xCE\xF0", 1),
            (b"\xE2\x98\xF0", 2),
            (b"\xF0\x9D\x9D", 3),
            (b"\xF0\x9D\x9D\xF0", 3),
            (b"\xF0\x82\x82\xAC", 1),
            (b"\xED\xA0\x80", 1),
            (b"\xCEa", 1),
            (b"\xE2\x98a", 2),
            (b"\xF0\x9D\x9Ca", 3),
        ];
        for &(input, expected_size) in cases {
            let (ch, size) = utf8::decode(input);
            assert_eq!(None, ch, "decode({:?})", input);
            assert_eq!(expected_size, size, "decode({:?})", input);
        }
    }

    #[test]
    fn decode_lossy() {
        // Each case is (input, number of bytes reported as consumed). Every
        // input here decodes to the replacement codepoint.
        let cases: &[(&[u8], usize)] = &[
            (b"", 0),
            (b"\xFF", 1),
            (b"\xCE\xF0", 1),
            (b"\xE2\x98\xF0", 2),
            (b"\xF0\x9D\x9D\xF0", 3),
            (b"\xF0\x82\x82\xAC", 1),
            (b"\xED\xA0\x80", 1),
            (b"\xCEa", 1),
            (b"\xE2\x98a", 2),
            (b"\xF0\x9D\x9Ca", 3),
        ];
        for &(input, expected_size) in cases {
            let (ch, size) = utf8::decode_lossy(input);
            assert_eq!('\u{FFFD}', ch, "decode_lossy({:?})", input);
            assert_eq!(expected_size, size, "decode_lossy({:?})", input);
        }
    }

    #[test]
    fn decode_last_valid() {
        // Decodes `text` back to front, one codepoint per call, so the
        // result is in reverse order.
        fn decode_all_rev(text: &str) -> Vec<char> {
            let mut rest = text;
            let mut decoded = Vec::new();
            while !rest.is_empty() {
                let (ch, len) = utf8::decode_last(rest.as_bytes());
                rest = &rest[..rest.len() - len];
                decoded.push(ch.unwrap());
            }
            decoded
        }

        for &text in VALID_TEXTS {
            let expected: Vec<char> = text.chars().rev().collect();
            assert_eq!(expected, decode_all_rev(text));
        }
    }

    #[test]
    fn decode_last_invalid() {
        for &(input, expected_size) in DECODE_LAST_INVALID {
            let (ch, size) = utf8::decode_last(input);
            assert_eq!(None, ch, "decode_last({:?})", input);
            assert_eq!(expected_size, size, "decode_last({:?})", input);
        }
    }

    #[test]
    fn decode_last_lossy() {
        for &(input, expected_size) in DECODE_LAST_INVALID {
            let (ch, size) = utf8::decode_last_lossy(input);
            assert_eq!('\u{FFFD}', ch, "decode_last_lossy({:?})", input);
            assert_eq!(expected_size, size, "decode_last_lossy({:?})", input);
        }
    }

    // The char iterators must agree with the shared lossy-decoding fixtures
    // in both directions.
    #[test]
    fn chars() {
        for (i, &(expected, input)) in LOSSY_TESTS.iter().enumerate() {
            let bytes = B(input);

            let forward = bytes.chars().collect::<String>();
            assert_eq!(
                expected, forward,
                "chars(ith: {:?}, given: {:?})",
                i, input,
            );
            let forward_indexed = bytes
                .char_indices()
                .map(|(_start, _end, ch)| ch)
                .collect::<String>();
            assert_eq!(
                expected, forward_indexed,
                "char_indices(ith: {:?}, given: {:?})",
                i, input,
            );

            // Reverse iteration must produce the same characters backwards.
            let expected_rev = expected.chars().rev().collect::<String>();

            let backward = bytes.chars().rev().collect::<String>();
            assert_eq!(
                expected_rev, backward,
                "chars.rev(ith: {:?}, given: {:?})",
                i, input,
            );
            let backward_indexed = bytes
                .char_indices()
                .rev()
                .map(|(_start, _end, ch)| ch)
                .collect::<String>();
            assert_eq!(
                expected_rev, backward_indexed,
                "char_indices.rev(ith: {:?}, given: {:?})",
                i, input,
            );
        }
    }

    #[test]
    fn utf8_chunks() {
        // Each case is (input, expected chunks), where a chunk is written as
        // (valid, invalid, incomplete). After the listed chunks the iterator
        // must be exhausted.
        let cases: &[(&[u8], &[(&str, &[u8], bool)])] = &[
            // A byte that is never valid.
            (b"123\xC0", &[("123", b"\xC0", false)]),
            // Two invalid bytes in a row are reported one chunk at a time.
            (
                b"123\xFF\xFF",
                &[("123", b"\xFF", false), ("", b"\xFF", false)],
            ),
            // A 2-byte sequence cut off by the end of input is incomplete.
            (b"123\xD0", &[("123", b"\xD0", true)]),
            // The same leading byte followed by more input is not flagged
            // as incomplete.
            (
                b"123\xD0456",
                &[("123", b"\xD0", false), ("456", b"", false)],
            ),
            // A 3-byte sequence cut off by the end of input.
            (b"123\xE2\x98", &[("123", b"\xE2\x98", true)]),
            // A 4-byte sequence cut off by the end of input.
            (b"123\xF4\x8F\xBF", &[("123", b"\xF4\x8F\xBF", true)]),
        ];
        for &(input, expected) in cases {
            let mut chunks = utf8::Utf8Chunks { bytes: input };
            for &(valid, invalid, incomplete) in expected {
                assert_eq!(
                    Some(utf8::Utf8Chunk {
                        valid,
                        invalid: invalid.as_bstr(),
                        incomplete,
                    }),
                    chunks.next(),
                    "utf8_chunks({:?})",
                    input,
                );
            }
            assert_eq!(None, chunks.next(), "utf8_chunks({:?})", input);
        }
    }
}
bstr/utf8.rs

bstr/
utf8.rs