#![no_std]
use core::char;
mod types;
use self::types::{State, Action, unpack};
mod table;
use self::table::TRANSITIONS;
/// Callbacks through which a `Parser` reports what it decoded.
///
/// An implementor is handed to `Parser::advance` together with each input
/// byte and is notified whenever a character has been completed or the
/// input turned out not to be decodable.
pub trait Receiver {
    /// Called with each character the parser finishes decoding.
    fn codepoint(&mut self, c: char);

    /// Called when the parser's transition table rejects the current byte.
    fn invalid_sequence(&mut self);
}
/// Incremental, byte-at-a-time UTF-8 decoder.
///
/// Bytes are fed through `advance`; decoded characters and decoding errors
/// are reported to a `Receiver`.
pub struct Parser {
/// Bits of the code point gathered so far from the current multi-byte
/// sequence; reset to 0 once a character is emitted or an invalid sequence
/// is reported.
point: u32,
/// Current state of the decoder; used as the row index into `TRANSITIONS`.
state: State,
}
/// Selects the low six bits of a byte, i.e. the payload carried by a UTF-8
/// continuation byte (`10xxxxxx`).
const CONTINUATION_MASK: u8 = 0b0011_1111;
impl Parser {
pub fn new() -> Parser {
Parser {
point: 0,
state: State::Ground,
}
}
pub fn advance<R>(&mut self, receiver: &mut R, byte: u8)
where R: Receiver
{
let cur = self.state as usize;
let change = TRANSITIONS[cur][byte as usize];
let (state, action) = unsafe { unpack(change) };
self.perform_action(receiver, byte, action);
self.state = state;
}
fn perform_action<R>(&mut self, receiver: &mut R, byte: u8, action: Action)
where R: Receiver
{
match action {
Action::InvalidSequence => {
self.point = 0;
receiver.invalid_sequence();
},
Action::EmitByte => {
receiver.codepoint(byte as char);
},
Action::SetByte1 => {
let point = self.point | ((byte & CONTINUATION_MASK) as u32);
let c = unsafe { char::from_u32_unchecked(point) };
self.point = 0;
receiver.codepoint(c);
},
Action::SetByte2 => {
self.point |= ((byte & CONTINUATION_MASK) as u32) << 6;
},
Action::SetByte2Top => {
self.point |= ((byte & 0b0001_1111) as u32) << 6;
},
Action::SetByte3 => {
self.point |= ((byte & CONTINUATION_MASK) as u32) << 12;
},
Action::SetByte3Top => {
self.point |= ((byte & 0b0000_1111) as u32) << 12;
},
Action::SetByte4 => {
self.point |= ((byte & 0b0000_0111) as u32) << 18;
},
}
}
}
#[cfg(test)]
#[macro_use]
extern crate std;
#[cfg(test)]
mod tests {
    use std::io::Read;
    use std::fs::File;
    use std::string::String;
    use super::{Parser, Receiver};

    // Test receiver: collects every decoded character into the string.
    impl Receiver for String {
        fn codepoint(&mut self, c: char) {
            self.push(c);
        }

        fn invalid_sequence(&mut self) {
            // Intentionally a no-op: a dropped sequence leaves `actual`
            // shorter than `expected`, so the comparison in the test below
            // still fails.
        }
    }

    /// Decoding the demo file byte by byte must reproduce exactly the text
    /// the standard library decodes from it.
    #[test]
    fn utf8parse_test() {
        // `read_to_string` validates the input as UTF-8, so the resulting
        // string is the reference decoding.
        let mut expected = String::new();
        let mut file = File::open("src/UTF-8-demo.txt").unwrap();
        file.read_to_string(&mut expected).expect("Reading file to string");

        let mut parser = Parser::new();
        let mut actual = String::new();

        // Iterate over the bytes in place; no copy of the file contents is
        // needed.
        for byte in expected.bytes() {
            parser.advance(&mut actual, byte);
        }

        assert_eq!(actual, expected);
    }
}