encode_unicode/
utf8_iterators.rs
1use utf8_char::Utf8Char;
10use errors::EmptyStrError;
11extern crate core;
12use self::core::{mem, u32, u64};
13use self::core::ops::Not;
14use self::core::fmt;
15use self::core::borrow::Borrow;
16#[cfg(feature="std")]
17use std::io::{Read, Error as ioError};
18
19
20
/// Iterator over the bytes of one UTF-8 encoded codepoint.
///
/// All bytes that are still to be returned are kept in a single `u32`, with
/// the next byte in the lowest eight bits. Byte slots that hold no data are
/// filled with `0xff`, a value that never occurs in UTF-8, so the low byte
/// being `0xff` marks the end of the iteration.
#[derive(Clone)]
pub struct Utf8Iterator(u32);
24
25impl From<Utf8Char> for Utf8Iterator {
26 fn from(uc: Utf8Char) -> Self {
27 let used = u32::from_le(unsafe{ mem::transmute(uc.to_array().0) });
28 let unused_set = (u64::MAX << uc.len() as u64*8) as u32;
30 Utf8Iterator(used | unused_set)
31 }
32}
33impl From<char> for Utf8Iterator {
34 fn from(c: char) -> Self {
35 Self::from(Utf8Char::from(c))
36 }
37}
38impl Iterator for Utf8Iterator {
39 type Item=u8;
40 fn next(&mut self) -> Option<u8> {
41 let next = self.0 as u8;
42 if next == 0xff {
43 None
44 } else {
45 self.0 = (self.0 >> 8) | 0xff_00_00_00;
46 Some(next)
47 }
48 }
49 fn size_hint(&self) -> (usize, Option<usize>) {
50 (self.len(), Some(self.len()))
51 }
52}
53impl ExactSizeIterator for Utf8Iterator {
54 fn len(&self) -> usize {let unused_bytes = self.0.not().leading_zeros() / 8;
56 4 - unused_bytes as usize
57 }
58}
#[cfg(feature="std")]
impl Read for Utf8Iterator {
    /// Copies as many remaining bytes as fit into `buf` and returns how many
    /// were written. Always returns `Ok`.
    fn read(&mut self, buf: &mut [u8]) -> Result<usize, ioError> {
        let mut filled = 0;
        // A byte is only taken from the iterator once there is a slot to put
        // it in, so nothing is lost when `buf` is shorter than what remains.
        for slot in buf.iter_mut() {
            if let Some(byte) = self.next() {
                *slot = byte;
                filled += 1;
            } else {
                break;
            }
        }
        Ok(filled)
    }
}
73impl fmt::Debug for Utf8Iterator {
74 fn fmt(&self, fmtr: &mut fmt::Formatter) -> fmt::Result {
75 let mut content = [0; 4];
76 let mut i = 0;
77 for b in self.clone() {
78 content[i] = b;
79 i += 1;
80 }
81 write!(fmtr, "{:?}", &content[..i])
82 }
83}
84
85
86
87#[cfg_attr(feature="std", doc=" ```")]
116#[cfg_attr(not(feature="std"), doc=" ```no_compile")]
117#[cfg_attr(feature="std", doc=" ```")]
128#[cfg_attr(not(feature="std"), doc=" ```no_compile")]
129pub fn iter_bytes<U:Borrow<Utf8Char>, I:IntoIterator<Item=U>>
142(iterable: I) -> Utf8CharSplitter<U, I::IntoIter> {
143 Utf8CharSplitter{ inner: iterable.into_iter(), prev: 0 }
144}
145
146#[derive(Clone)]
150pub struct Utf8CharSplitter<U:Borrow<Utf8Char>, I:Iterator<Item=U>> {
151 inner: I,
152 prev: u32,
153}
154impl<I:Iterator<Item=Utf8Char>> From<I> for Utf8CharSplitter<Utf8Char,I> {
155 fn from(iter: I) -> Self {
157 iter_bytes(iter)
158 }
159}
160impl<U:Borrow<Utf8Char>, I:Iterator<Item=U>> Utf8CharSplitter<U,I> {
161 pub fn into_inner(self) -> I {
167 self.inner
168 }
169}
170impl<U:Borrow<Utf8Char>, I:Iterator<Item=U>> Iterator for Utf8CharSplitter<U,I> {
171 type Item = u8;
172 fn next(&mut self) -> Option<Self::Item> {
173 if self.prev == 0 {
174 self.inner.next().map(|u8c| {
175 let array = u8c.borrow().to_array().0;
176 self.prev = unsafe{ u32::from_le(mem::transmute(array)) } >> 8;
177 array[0]
178 })
179 } else {
180 let next = self.prev as u8;
181 self.prev >>= 8;
182 Some(next)
183 }
184 }
185 fn size_hint(&self) -> (usize,Option<usize>) {
186 let (min, max) = self.inner.size_hint();
189 let add = 4 - (self.prev.leading_zeros() / 8) as usize;
190 (min.wrapping_add(add), max.map(|max| max.wrapping_mul(4).wrapping_add(add) ))
191 }
192}
#[cfg(feature="std")]
impl<U: Borrow<Utf8Char>, I: Iterator<Item = U>> Read for Utf8CharSplitter<U, I> {
    /// Fills `buf` with bytes and returns how many were written.
    ///
    /// Always returns `Ok`. A character that does not fit completely is
    /// split: the bytes that did not fit are kept and returned by the next
    /// call to `read()` or `next()`.
    fn read(&mut self, buf: &mut [u8]) -> Result<usize, ioError> {
        let capacity = buf.len();
        let mut filled = 0;

        // First hand out what is left of a previously split character.
        while filled < capacity && self.prev != 0 {
            buf[filled] = self.prev as u8;
            self.prev >>= 8;
            filled += 1;
        }

        // Then copy whole characters for as long as there is room.
        while filled < capacity {
            let bytes = match self.inner.next() {
                Some(u8c) => u8c.borrow().to_array().0,
                None => break,
            };
            buf[filled] = bytes[0];
            filled += 1;
            if bytes[1] == 0 {
                // No second byte, so the character is complete.
                continue;
            }
            // The number of leading one bits in the first byte is the
            // length of a multi-byte sequence.
            let len = bytes[0].not().leading_zeros() as usize;
            for written in 1..len {
                if filled == capacity {
                    // Out of room: stash the bytes that were not written.
                    // SAFETY: the transmute only compiles if `bytes` is
                    // exactly as large as a `u32`, and any bit pattern is a
                    // valid `u32`.
                    let packed = unsafe { u32::from_le(mem::transmute(bytes)) };
                    self.prev = packed >> (8 * written);
                    return Ok(filled);
                }
                buf[filled] = bytes[written];
                filled += 1;
            }
        }
        Ok(filled)
    }
}
231
232
233
/// An iterator over the codepoints in a `str`, represented as `Utf8Char`s,
/// together with the byte offset at which each one starts.
#[derive(Clone)]
pub struct Utf8CharIndices<'a> {
    /// The source string; it is shortened when iterating from the back.
    str: &'a str,
    /// Byte offset into `str` of the next character from the front.
    index: usize,
}
242impl<'a> From<&'a str> for Utf8CharIndices<'a> {
243 fn from(s: &str) -> Utf8CharIndices {
244 Utf8CharIndices{str: s, index: 0}
245 }
246}
247impl<'a> Utf8CharIndices<'a> {
248 pub fn as_str(&self) -> &'a str {
260 &self.str[self.index..]
261 }
262}
263impl<'a> Iterator for Utf8CharIndices<'a> {
264 type Item = (usize,Utf8Char);
265 fn next(&mut self) -> Option<(usize,Utf8Char)> {
266 match Utf8Char::from_str_start(&self.str[self.index..]) {
267 Ok((u8c, len)) => {
268 let item = (self.index, u8c);
269 self.index += len;
270 Some(item)
271 },
272 Err(EmptyStrError) => None
273 }
274 }
275 fn size_hint(&self) -> (usize,Option<usize>) {
276 let len = self.str.len() - self.index;
277 (len.wrapping_add(3)/4, Some(len))
280 }
281}
282impl<'a> DoubleEndedIterator for Utf8CharIndices<'a> {
283 fn next_back(&mut self) -> Option<(usize,Utf8Char)> {
284 if self.index < self.str.len() {
287 let rev = self.str.bytes().rev();
288 let len = 1 + rev.take_while(|b| b & 0b1100_0000 == 0b1000_0000 ).count();
289 let starts = self.str.len() - len;
290 let (u8c,_) = Utf8Char::from_str_start(&self.str[starts..]).unwrap();
291 self.str = &self.str[..starts];
292 Some((starts, u8c))
293 } else {
294 None
295 }
296 }
297}
298impl<'a> fmt::Debug for Utf8CharIndices<'a> {
299 fn fmt(&self, fmtr: &mut fmt::Formatter) -> fmt::Result {
300 fmtr.debug_tuple("Utf8CharIndices")
301 .field(&self.index)
302 .field(&self.as_str())
303 .finish()
304 }
305}
306
307
308#[derive(Clone)]
310pub struct Utf8Chars<'a>(Utf8CharIndices<'a>);
311impl<'a> From<&'a str> for Utf8Chars<'a> {
312 fn from(s: &str) -> Utf8Chars {
313 Utf8Chars(Utf8CharIndices::from(s))
314 }
315}
316impl<'a> Utf8Chars<'a> {
317 pub fn as_str(&self) -> &'a str {
329 self.0.as_str()
330 }
331}
332impl<'a> Iterator for Utf8Chars<'a> {
333 type Item = Utf8Char;
334 fn next(&mut self) -> Option<Utf8Char> {
335 self.0.next().map(|(_,u8c)| u8c )
336 }
337 fn size_hint(&self) -> (usize,Option<usize>) {
338 self.0.size_hint()
339 }
340}
341impl<'a> DoubleEndedIterator for Utf8Chars<'a> {
342 fn next_back(&mut self) -> Option<Utf8Char> {
343 self.0.next_back().map(|(_,u8c)| u8c )
344 }
345}
346impl<'a> fmt::Debug for Utf8Chars<'a> {
347 fn fmt(&self, fmtr: &mut fmt::Formatter) -> fmt::Result {
348 fmtr.debug_tuple("Utf8CharIndices")
349 .field(&self.as_str())
350 .finish()
351 }
352}