encode_unicode

Trait SliceExt

Source
pub trait SliceExt: Index<RangeFull> {
    // Required methods
    fn utf8char_indices(&self) -> Utf8CharDecoder<'_> 
       where Self::Output: Borrow<[u8]>;
    fn utf16char_indices(&self) -> Utf16CharDecoder<'_> 
       where Self::Output: Borrow<[u16]>;
}
Expand description

Methods for iterating over u8 and u16 slices as UTF-8 or UTF-16 characters.

The iterators are slightly faster than the similar methods in IterExt because they can “push back” items for free after errors and don’t need a separate buffer that must be checked on every call to .next().

Required Methods§

Source

fn utf8char_indices(&self) -> Utf8CharDecoder<'_>
where Self::Output: Borrow<[u8]>,

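Decode u8 slices as UTF-8 and iterate over the codepoints as Utf8Chars, together with the offset and length of each codepoint or error.

The iterator produces (usize,Result<Utf8Char,InvalidUtf8Slice>,usize), and the slice is validated as you go.

The first usize contains the offset from the start of the slice and the last usize contains the length of the codepoint or error.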

§Examples

Get the index and error type of the first error:

use encode_unicode::{SliceExt, Utf8Char};
use encode_unicode::error::InvalidUtf8Slice;

let slice = b"ab\0\xe0\xbc\xa9 \xf3\x80\x77";
let result = slice.utf8char_indices()
   .map(|(offset,r,length)| r.map_err(|e| (offset,e,length) ) )
   .collect::<Result<String,(usize,InvalidUtf8Slice,usize)>>();

assert_eq!(result, Err((7, InvalidUtf8Slice::TooShort(4), 1)));
use encode_unicode::{SliceExt, Utf8Char};
use std::error::Error;

let slice = b"\xf0\xbf\xbf\xbfXY\xdd\xbb\xe1\x80\x99quux123";
let mut fixed_size = [Utf8Char::default(); 8];
for (cp_i, (byte_index, r, _)) in slice.utf8char_indices().enumerate().take(8) {
    match r {
        Ok(u8c) => fixed_size[cp_i] = u8c,
        Err(e) => panic!("Invalid codepoint at index {} ({})", cp_i, e.description()),
    }
}
let chars = ['\u{3ffff}', 'X', 'Y', '\u{77b}', '\u{1019}', 'q', 'u', 'u'];
assert_eq!(fixed_size, chars);
use encode_unicode::{SliceExt, Utf8Char};
use encode_unicode::error::InvalidUtf8Slice::*;
use encode_unicode::error::{InvalidUtf8, InvalidUtf8FirstByte, InvalidCodepoint};

let bytes = b"\xfa-\xf4\x8f\xee\xa1\x8f-\xed\xa9\x87\xf0\xcc\xbb";
let mut errors = Vec::new();
let mut lengths = Vec::new();
let mut string = String::new();
for (offset,result,length) in bytes.utf8char_indices() {
   lengths.push((offset,length));
   let c = result.unwrap_or_else(|error| {
       errors.push((offset,error));
       Utf8Char::from('\u{fffd}') // replacement character
   });
   string.push_str(c.as_str());
}

assert_eq!(string, "�-��\u{e84f}-����\u{33b}");
assert_eq!(lengths, [(0,1), (1,1), (2,1), (3,1), (4,3), (7,1),
                    (8,1), (9,1), (10,1), (11,1), (12,2)]);
assert_eq!(errors, [
   ( 0, Utf8(InvalidUtf8::FirstByte(InvalidUtf8FirstByte::TooLongSeqence))),
   ( 2, Utf8(InvalidUtf8::NotAContinuationByte(2))),
   ( 3, Utf8(InvalidUtf8::FirstByte(InvalidUtf8FirstByte::ContinuationByte))),
   ( 8, Codepoint(InvalidCodepoint::Utf16Reserved)),
   ( 9, Utf8(InvalidUtf8::FirstByte(InvalidUtf8FirstByte::ContinuationByte))),
   (10, Utf8(InvalidUtf8::FirstByte(InvalidUtf8FirstByte::ContinuationByte))),
   (11, TooShort(4)), // (but it was not the last element returned!)
]);
Source

fn utf16char_indices(&self) -> Utf16CharDecoder<'_>
where Self::Output: Borrow<[u16]>,

Decode u16 slices as UTF-16 and iterate over the codepoints as Utf16Chars, together with the offset and length of each codepoint or error.

The iterator produces (usize,Result<Utf16Char,Utf16PairError>,usize), and the slice is validated as you go.

The first usize contains the offset from the start of the slice and the last usize contains the length of the codepoint or error. The length is either 1 or 2, and always 1 for errors.

§Examples
use encode_unicode::{SliceExt, Utf8Char};

let slice = &['a' as u16, 0xdf00, 0xd83c, 0xdca0][..];
let mut errors = Vec::new();
let string = slice.utf16char_indices().map(|(offset,r,_)| match r {
   Ok(u16c) => Utf8Char::from(u16c),
   Err(_) => {
       errors.push(offset);
       Utf8Char::from('\u{fffd}') // REPLACEMENT_CHARACTER
   }
}).collect::<String>();

assert_eq!(string, "a�🂠");
assert_eq!(errors, [1]);

Search for a codepoint and return its unit and codepoint index.

use encode_unicode::{SliceExt, Utf16Char};

let slice = [0xd875,/*'𝕏'*/ 0xdd4f, '≈' as u16, '2' as u16];
let position = slice.utf16char_indices()
    .enumerate()
    .find(|&(_,(_,r,_))| r == Ok(Utf16Char::from('≈')) )
    .map(|(codepoint, (offset, _, _))| (codepoint, offset) );

assert_eq!(position, Some((1,2)));

Error types:

use encode_unicode::{SliceExt, Utf16Char};
use encode_unicode::error::Utf16PairError::*;

let slice = [0xdcba, 0xdeff, 0xd8be, 0xdeee, 'λ' as u16, 0xdab1, 0xdab1];
let mut iter = slice.utf16char_indices();
assert_eq!(iter.next(), Some((0, Err(UnexpectedTrailingSurrogate), 1)));
assert_eq!(iter.next(), Some((1, Err(UnexpectedTrailingSurrogate), 1)));
assert_eq!(iter.next(), Some((2, Ok(Utf16Char::from('\u{3faee}')), 2)));
assert_eq!(iter.next(), Some((4, Ok(Utf16Char::from('λ')), 1)));
assert_eq!(iter.next(), Some((5, Err(UnmatchedLeadingSurrogate), 1)));
assert_eq!(iter.next(), Some((6, Err(Incomplete), 1)));
assert_eq!(iter.next(), None);
assert_eq!(iter.as_slice(), [])

Implementors§