Trait SliceExt

Source

pub trait SliceExt: Index<RangeFull> {
    // Required methods
    fn utf8char_indices(&self) -> Utf8CharDecoder<'_> ⓘ
       where Self::Output: Borrow<[u8]>;
    fn utf16char_indices(&self) -> Utf16CharDecoder<'_> ⓘ
       where Self::Output: Borrow<[u16]>;
}

Expand description

Methods for iterating over u8 and u16 slices as UTF-8 or UTF-16 characters.

The iterators are slightly faster than the similar methods in IterExt because they can “push back” items for free after errors and don’t need a separate buffer that must be checked on every call to .next().

Required Methods§

Source

fn utf8char_indices(&self) -> Utf8CharDecoder<'_> ⓘ
where Self::Output: Borrow<[u8]>,

Decode u8 slices as UTF-8 and iterate over the codepoints as Utf8Chars. Each item is an (offset, result, length) tuple, as the examples below show.

§Examples

Get the index and error type of the first error:

use encode_unicode::{SliceExt, Utf8Char, error::Utf8ErrorKind};

let slice = b"ab\0\xe0\xbc\xa9 \xf3\x80\x77";
let result = slice.utf8char_indices()
   .map(|(offset,r,length)| r.map_err(|e| (offset,e.kind(),length) ) )
   .collect::<Result<String,(usize,Utf8ErrorKind,usize)>>();

assert_eq!(result, Err((7, Utf8ErrorKind::TooFewBytes, 1)));

use encode_unicode::{SliceExt, Utf8Char};
use std::error::Error;

let slice = b"\xf0\xbf\xbf\xbfXY\xdd\xbb\xe1\x80\x99quux123";
let mut fixed_size = [Utf8Char::default(); 8];
for (cp_i, (byte_index, r, _)) in slice.utf8char_indices().enumerate().take(8) {
    match r {
        Ok(u8c) => fixed_size[cp_i] = u8c,
        Err(e) => panic!("Invalid codepoint at index {} ({})", cp_i, e),
    }
}
let chars = ['\u{3ffff}', 'X', 'Y', '\u{77b}', '\u{1019}', 'q', 'u', 'u'];
assert_eq!(fixed_size, chars);

use encode_unicode::{SliceExt, Utf8Char, error::Utf8ErrorKind};

let bytes = b"\xfa-\xf4\x8f\xee\xa1\x8f-\xed\xa9\x87\xf0\xcc\xbb";
let mut errors = Vec::new();
let mut lengths = Vec::new();
let mut string = String::new();
for (offset,result,length) in bytes.utf8char_indices() {
   lengths.push((offset,length));
   let c = result.unwrap_or_else(|error| {
       errors.push((offset, error.kind()));
       Utf8Char::from('\u{fffd}') // replacement character
   });
   string.push_str(c.as_str());
}

assert_eq!(string, "�-��\u{e84f}-����\u{33b}");
assert_eq!(lengths, [(0,1), (1,1), (2,1), (3,1), (4,3), (7,1),
                    (8,1), (9,1), (10,1), (11,1), (12,2)]);
assert_eq!(errors, [
   ( 0, Utf8ErrorKind::NonUtf8Byte),
   ( 2, Utf8ErrorKind::InterruptedSequence),
   ( 3, Utf8ErrorKind::UnexpectedContinuationByte),
   ( 8, Utf8ErrorKind::Utf16ReservedCodepoint),
   ( 9, Utf8ErrorKind::UnexpectedContinuationByte),
   (10, Utf8ErrorKind::UnexpectedContinuationByte),
   (11, Utf8ErrorKind::TooFewBytes), // (but it was not the last element returned!)
]);

Source

fn utf16char_indices(&self) -> Utf16CharDecoder<'_> ⓘ
where Self::Output: Borrow<[u16]>,

Decode u16 slices as UTF-16 and iterate over the codepoints as Utf16Chars.

The iterator produces (usize,Result<Utf16Char,Utf16Error>,usize), and the slice is validated as you go.

The first usize contains the offset from the start of the slice and the last usize contains the length of the codepoint or error. The length is either 1 or 2, and always 1 for errors.

§Examples

use encode_unicode::{SliceExt, Utf8Char};

let slice = &['a' as u16, 0xdf00, 0xd83c, 0xdca0][..];
let mut errors = Vec::new();
let string = slice.utf16char_indices().map(|(offset,r,_)| match r {
   Ok(u16c) => Utf8Char::from(u16c),
   Err(_) => {
       errors.push(offset);
       Utf8Char::from('\u{fffd}') // REPLACEMENT_CHARACTER
   }
}).collect::<String>();

assert_eq!(string, "a�🂠");
assert_eq!(errors, [1]);

Search for a codepoint and return its unit and codepoint index.

use encode_unicode::{SliceExt, Utf16Char};

let slice = [0xd835,/*'𝕏'*/ 0xdd4f, '≈' as u16, '2' as u16];
let position = slice.utf16char_indices()
    .enumerate()
    .find(|&(_,(_,r,_))| r == Ok(Utf16Char::from('≈')) )
    .map(|(codepoint, (offset, _, _))| (codepoint, offset) );

assert_eq!(position, Some((1,2)));

Error types:

use encode_unicode::{SliceExt, Utf16Char};
use encode_unicode::error::Utf16PairError::*;

let slice = [0xdcba, 0xdeff, 0xd8be, 0xdeee, 'λ' as u16, 0xdab1, 0xdab1];
let mut iter = slice.utf16char_indices();
assert_eq!(iter.next(), Some((0, Err(UnexpectedTrailingSurrogate), 1)));
assert_eq!(iter.next(), Some((1, Err(UnexpectedTrailingSurrogate), 1)));
assert_eq!(iter.next(), Some((2, Ok(Utf16Char::from('\u{3faee}')), 2)));
assert_eq!(iter.next(), Some((4, Ok(Utf16Char::from('λ')), 1)));
assert_eq!(iter.next(), Some((5, Err(UnmatchedLeadingSurrogate), 1)));
assert_eq!(iter.next(), Some((6, Err(Incomplete), 1)));
assert_eq!(iter.next(), None);
assert_eq!(iter.as_slice(), [])

Dyn Compatibility§

This trait is dyn compatible.

In older versions of Rust, dyn compatibility was called "object safety".

Implementors§

Source §

SliceExt

Trait SliceExt Copy item path

Required Methods§

fn utf8char_indices(&self) -> Utf8CharDecoder<'_> ⓘwhere Self::Output: Borrow<[u8]>,

§Examples

fn utf16char_indices(&self) -> Utf16CharDecoder<'_> ⓘwhere Self::Output: Borrow<[u16]>,

§Examples

Dyn Compatibility§

Implementors§

impl<S: ?Sized + Index<RangeFull>> SliceExt for S

Trait SliceExt

fn utf8char_indices(&self) -> Utf8CharDecoder<'_> ⓘ
where Self::Output: Borrow<[u8]>,

fn utf16char_indices(&self) -> Utf16CharDecoder<'_> ⓘ
where Self::Output: Borrow<[u16]>,