pub trait SliceExt: Index<RangeFull> {
// Required methods
fn utf8char_indices(&self) -> Utf8CharDecoder<'_> ⓘ
where Self::Output: Borrow<[u8]>;
fn utf16char_indices(&self) -> Utf16CharDecoder<'_> ⓘ
where Self::Output: Borrow<[u16]>;
}
Expand description
Methods for iterating over `u8` and `u16` slices as UTF-8 or UTF-16 characters.
The iterators are slightly faster than the similar methods in `IterExt`
because they can “push back” items for free after errors and don’t need a
separate buffer that must be checked on every call to `.next()`.
Required Methods§
Sourcefn utf8char_indices(&self) -> Utf8CharDecoder<'_> ⓘ
fn utf8char_indices(&self) -> Utf8CharDecoder<'_> ⓘ
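Decode `u8` slices as UTF-8 and iterate over the codepoints as `Utf8Char`s.
As the examples below show, each item is an `(offset, result, length)` tuple,
where `result` is a `Result<Utf8Char, InvalidUtf8Slice>`.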
§Examples
Get the index and error type of the first error:
use encode_unicode::{SliceExt, Utf8Char};
use encode_unicode::error::InvalidUtf8Slice;
let slice = b"ab\0\xe0\xbc\xa9 \xf3\x80\x77";
let result = slice.utf8char_indices()
.map(|(offset,r,length)| r.map_err(|e| (offset,e,length) ) )
.collect::<Result<String,(usize,InvalidUtf8Slice,usize)>>();
assert_eq!(result, Err((7, InvalidUtf8Slice::TooShort(4), 1)));
use encode_unicode::{SliceExt, Utf8Char};
use std::error::Error;
let slice = b"\xf0\xbf\xbf\xbfXY\xdd\xbb\xe1\x80\x99quux123";
let mut fixed_size = [Utf8Char::default(); 8];
for (cp_i, (byte_index, r, _)) in slice.utf8char_indices().enumerate().take(8) {
match r {
Ok(u8c) => fixed_size[cp_i] = u8c,
Err(e) => panic!("Invalid codepoint at index {} ({})", cp_i, e.description()),
}
}
let chars = ['\u{3ffff}', 'X', 'Y', '\u{77b}', '\u{1019}', 'q', 'u', 'u'];
assert_eq!(fixed_size, chars);
use encode_unicode::{SliceExt, Utf8Char};
use encode_unicode::error::InvalidUtf8Slice::*;
use encode_unicode::error::{InvalidUtf8, InvalidUtf8FirstByte, InvalidCodepoint};
let bytes = b"\xfa-\xf4\x8f\xee\xa1\x8f-\xed\xa9\x87\xf0\xcc\xbb";
let mut errors = Vec::new();
let mut lengths = Vec::new();
let mut string = String::new();
for (offset,result,length) in bytes.utf8char_indices() {
lengths.push((offset,length));
let c = result.unwrap_or_else(|error| {
errors.push((offset,error));
Utf8Char::from('\u{fffd}') // replacement character
});
string.push_str(c.as_str());
}
assert_eq!(string, "�-��\u{e84f}-����\u{33b}");
assert_eq!(lengths, [(0,1), (1,1), (2,1), (3,1), (4,3), (7,1),
(8,1), (9,1), (10,1), (11,1), (12,2)]);
assert_eq!(errors, [
( 0, Utf8(InvalidUtf8::FirstByte(InvalidUtf8FirstByte::TooLongSeqence))),
( 2, Utf8(InvalidUtf8::NotAContinuationByte(2))),
( 3, Utf8(InvalidUtf8::FirstByte(InvalidUtf8FirstByte::ContinuationByte))),
( 8, Codepoint(InvalidCodepoint::Utf16Reserved)),
( 9, Utf8(InvalidUtf8::FirstByte(InvalidUtf8FirstByte::ContinuationByte))),
(10, Utf8(InvalidUtf8::FirstByte(InvalidUtf8FirstByte::ContinuationByte))),
(11, TooShort(4)), // (but it was not the last element returned!)
]);
Sourcefn utf16char_indices(&self) -> Utf16CharDecoder<'_> ⓘ
fn utf16char_indices(&self) -> Utf16CharDecoder<'_> ⓘ
Decode `u16` slices as UTF-16 and iterate over the codepoints as `Utf16Char`s.
The iterator produces `(usize, Result<Utf16Char, Utf16PairError>, usize)`,
and the slice is validated as you go.
The first `usize` contains the offset from the start of the slice and
the last `usize` contains the length of the codepoint or error.
The length is either 1 or 2, and always 1 for errors.
§Examples
use encode_unicode::{SliceExt, Utf8Char};
let slice = &['a' as u16, 0xdf00, 0xd83c, 0xdca0][..];
let mut errors = Vec::new();
let string = slice.utf16char_indices().map(|(offset,r,_)| match r {
Ok(u16c) => Utf8Char::from(u16c),
Err(_) => {
errors.push(offset);
Utf8Char::from('\u{fffd}') // REPLACEMENT_CHARACTER
}
}).collect::<String>();
assert_eq!(string, "a�🂠");
assert_eq!(errors, [1]);
Search for a codepoint and return its unit and codepoint index.
use encode_unicode::{SliceExt, Utf16Char};
let slice = [0xd875,/*'𝕏'*/ 0xdd4f, '≈' as u16, '2' as u16];
let position = slice.utf16char_indices()
.enumerate()
.find(|&(_,(_,r,_))| r == Ok(Utf16Char::from('≈')) )
.map(|(codepoint, (offset, _, _))| (codepoint, offset) );
assert_eq!(position, Some((1,2)));
Error types:
use encode_unicode::{SliceExt, Utf16Char};
use encode_unicode::error::Utf16PairError::*;
let slice = [0xdcba, 0xdeff, 0xd8be, 0xdeee, 'λ' as u16, 0xdab1, 0xdab1];
let mut iter = slice.utf16char_indices();
assert_eq!(iter.next(), Some((0, Err(UnexpectedTrailingSurrogate), 1)));
assert_eq!(iter.next(), Some((1, Err(UnexpectedTrailingSurrogate), 1)));
assert_eq!(iter.next(), Some((2, Ok(Utf16Char::from('\u{3faee}')), 2)));
assert_eq!(iter.next(), Some((4, Ok(Utf16Char::from('λ')), 1)));
assert_eq!(iter.next(), Some((5, Err(UnmatchedLeadingSurrogate), 1)));
assert_eq!(iter.next(), Some((6, Err(Incomplete), 1)));
assert_eq!(iter.next(), None);
assert_eq!(iter.as_slice(), [])