utf8parse/
lib.rs

1//! A table-driven UTF-8 Parser
2//!
3//! This module implements a table-driven UTF-8 parser which should
4//! theoretically contain the minimal number of branches (1). The only branch is
5//! on the `Action` returned from unpacking a transition.
6#![deny(clippy::all, clippy::if_not_else, clippy::enum_glob_use)]
7#![cfg_attr(all(feature = "nightly", test), feature(test))]
8#![no_std]
9
10use core::char;
11
12mod types;
13
14use types::{Action, State};
15
/// Sink for the events produced while decoding a UTF-8 byte stream.
///
/// The parser invokes exactly one of these callbacks each time it finishes a
/// character or gives up on a malformed sequence.
pub trait Receiver {
    /// Invoked with each successfully decoded character.
    fn codepoint(&mut self, c: char);

    /// Invoked when the bytes seen so far cannot form a valid character.
    fn invalid_sequence(&mut self);
}
24
25/// A parser for Utf8 Characters
26///
27/// Repeatedly call `advance` with bytes to emit Utf8 characters
28#[derive(Clone, Default, PartialEq, Eq, Debug)]
29pub struct Parser {
30    point: u32,
31    state: State,
32}
33
/// Mask applied to continuation bytes: keeps the six low-order bits and
/// discards the two high-order bits.
const CONTINUATION_MASK: u8 = (1 << 6) - 1;
36
37impl Parser {
38    /// Create a new Parser
39    pub fn new() -> Parser {
40        Parser { point: 0, state: State::Ground }
41    }
42
43    /// Advance the parser
44    ///
45    /// The provider receiver will be called whenever a codepoint is completed or an invalid
46    /// sequence is detected.
47    pub fn advance<R>(&mut self, receiver: &mut R, byte: u8)
48    where
49        R: Receiver,
50    {
51        let (state, action) = self.state.advance(byte);
52        self.perform_action(receiver, byte, action);
53        self.state = state;
54    }
55
56    fn perform_action<R>(&mut self, receiver: &mut R, byte: u8, action: Action)
57    where
58        R: Receiver,
59    {
60        match action {
61            Action::InvalidSequence => {
62                self.point = 0;
63                receiver.invalid_sequence();
64            },
65            Action::EmitByte => {
66                receiver.codepoint(byte as char);
67            },
68            Action::SetByte1 => {
69                let point = self.point | ((byte & CONTINUATION_MASK) as u32);
70                let c = unsafe { char::from_u32_unchecked(point) };
71                self.point = 0;
72
73                receiver.codepoint(c);
74            },
75            Action::SetByte2 => {
76                self.point |= ((byte & CONTINUATION_MASK) as u32) << 6;
77            },
78            Action::SetByte2Top => {
79                self.point |= ((byte & 0b0001_1111) as u32) << 6;
80            },
81            Action::SetByte3 => {
82                self.point |= ((byte & CONTINUATION_MASK) as u32) << 12;
83            },
84            Action::SetByte3Top => {
85                self.point |= ((byte & 0b0000_1111) as u32) << 12;
86            },
87            Action::SetByte4 => {
88                self.point |= ((byte & 0b0000_0111) as u32) << 18;
89            },
90        }
91    }
92}
93
#[cfg(all(feature = "nightly", test))]
mod benches {
    // The crate is `no_std`, so `std` has to be linked explicitly for the comparison benchmark.
    extern crate std;
    extern crate test;

    use self::test::{black_box, Bencher};
    use super::{Parser, Receiver};

    /// Sample document used as input by both benchmarks.
    static UTF8_DEMO: &[u8] = include_bytes!("../tests/UTF-8-demo.txt");

    // Receiver that discards its input; decoded characters are routed through `black_box`
    // so the decoding work is not optimized away.
    impl Receiver for () {
        fn codepoint(&mut self, c: char) {
            black_box(c);
        }

        fn invalid_sequence(&mut self) {}
    }

    /// Decode the whole demo document with this crate's `Parser`.
    #[bench]
    fn parse_bench_utf8_demo(bencher: &mut Bencher) {
        let mut parser = Parser::new();
        let mut sink = ();

        bencher.iter(|| {
            UTF8_DEMO.iter().for_each(|&byte| parser.advance(&mut sink, byte));
        })
    }

    /// Baseline: validate and iterate the same document with the standard library.
    #[bench]
    fn std_string_parse_utf8(bencher: &mut Bencher) {
        bencher.iter(|| {
            let text = std::str::from_utf8(UTF8_DEMO).unwrap();
            text.chars().for_each(|c| {
                black_box(c);
            });
        });
    }
}