use crate::lookup::{self, ccc};
use std::ops::Range;
/// Half-open range of code points handled by `hangul_decomposition`:
/// `U+AC00` is the first member and `U+D7A3` the last (`U+D7A4` is excluded).
const HANGUL_RANGE: Range<char> = Range {
    start: '\u{ac00}',
    end: '\u{d7a4}',
};
/// Appends the arithmetic decomposition of the precomposed Hangul syllable `s` to `out`.
///
/// A syllable in `U+AC00..=U+D7A3` is split into a leading-consonant jamo, a vowel jamo
/// and, when present, a trailing-consonant jamo, pushed in that order.
///
/// Any character outside that block is appended to `out` unchanged, so the function is
/// safe to call with arbitrary input.
fn hangul_decomposition(s: char, out: &mut Vec<char>) {
    const SBASE: u32 = 0xac00;
    const LBASE: u32 = 0x1100;
    const VBASE: u32 = 0x1161;
    const TBASE: u32 = 0x11a7;
    const TCOUNT: u32 = 28;
    const NCOUNT: u32 = 588;
    // Number of precomposed syllables: 19 leading consonants * NCOUNT = 0xd7a4 - 0xac00.
    const SCOUNT: u32 = 19 * NCOUNT;

    // Reject anything that is not a precomposed syllable instead of underflowing the
    // subtraction or computing a bogus code point.
    let s_index = match (s as u32).checked_sub(SBASE) {
        Some(index) if index < SCOUNT => index,
        _ => {
            out.push(s);
            return;
        }
    };

    // With `s_index < SCOUNT` every value computed below lies in 0x1100..=0x11c2, which
    // contains no surrogates, so the checked conversion cannot fail.
    let jamo = |code: u32| char::from_u32(code).expect("jamo code points are valid chars");

    out.push(jamo(LBASE + s_index / NCOUNT));
    out.push(jamo(VBASE + (s_index % NCOUNT) / TCOUNT));

    // A trailing index of 0 means the syllable has no trailing consonant.
    let t_index = s_index % TCOUNT;
    if t_index != 0 {
        out.push(jamo(TBASE + t_index));
    }
}
/// Appends the decomposition of `ch` to `out`.
///
/// Characters inside `HANGUL_RANGE` are decomposed by `hangul_decomposition`. Every
/// other character is replaced by the mapping returned from `lookup::decomposition`,
/// or appended as-is when the lookup has no mapping for it.
fn decompose(ch: char, out: &mut Vec<char>) {
    if HANGUL_RANGE.contains(&ch) {
        hangul_decomposition(ch, out);
        return;
    }

    match lookup::decomposition(ch) {
        Some(mapping) => out.extend(mapping.chars()),
        None => out.push(ch),
    }
}
/// Iterator adaptor returned by `nfd`: wraps a `char` iterator and yields characters
/// from an internal buffer that is refilled from the input on demand.
pub struct NfdIterator<I>
where
    I: Iterator<Item = char>,
{
    /// Remaining input; peekable so a character can be inspected before it is consumed.
    input: std::iter::Peekable<I>,
    /// Characters that are ready to be handed out.
    buf: Vec<char>,
    /// Index into `buf` of the next character to yield.
    pos: usize,
}
// Marker impl: promises that once `next` has returned `None` it keeps returning `None`.
impl<I> std::iter::FusedIterator for NfdIterator<I> where I: Iterator<Item = char> {}
impl<I: Iterator<Item = char>> NfdIterator<I> {
    /// Replaces the buffer contents with the next chunk of decomposed input.
    ///
    /// A chunk is at most one input character whose `ccc` is 0, followed by every
    /// directly following input character whose `ccc` is non-zero. Each is passed
    /// through `decompose`, and the buffer is then ordered by `ccc`. The buffer is left
    /// empty only when the input has nothing more to offer.
    fn refill(&mut self) {
        self.buf.clear();
        self.pos = 0;

        // NOTE(review): chunk boundaries are decided on the ccc of the *undecomposed*
        // character. A character with ccc 0 whose decomposition starts with a non-zero
        // ccc character (UAX #15 lists e.g. U+0F73) would open a new chunk here instead
        // of being reordered with preceding marks — confirm against `lookup`.
        if let Some(starter) = self.input.next_if(|&c| ccc(c) == 0) {
            decompose(starter, &mut self.buf);
        }
        while let Some(mark) = self.input.next_if(|&c| ccc(c) != 0) {
            decompose(mark, &mut self.buf);
        }

        // `sort_by` is stable, so characters with equal ccc keep their input order.
        self.buf.sort_by(|&lhs, &rhs| ccc(lhs).cmp(&ccc(rhs)));
    }
}

impl<I: Iterator<Item = char>> Iterator for NfdIterator<I> {
    type Item = char;

    /// Yields the next buffered character, refilling the buffer from the input first
    /// when it has been fully consumed. Returns `None` when a refill produces nothing.
    fn next(&mut self) -> Option<char> {
        if self.pos >= self.buf.len() {
            self.refill();
        }

        let ch = self.buf.get(self.pos).copied()?;
        self.pos += 1;
        Some(ch)
    }
}
/// Wraps `input` in an [`NfdIterator`] that starts with an empty buffer, so no input
/// is consumed until the returned iterator is first advanced.
pub fn nfd<I: Iterator<Item = char>>(input: I) -> NfdIterator<I> {
    let input = input.peekable();
    NfdIterator {
        input,
        buf: Vec::new(),
        pos: 0,
    }
}
// Tests for `nfd`, including a data-driven run over `NormalizationTest.txt` read from
// the zip archive whose path is supplied at build time through the `UCD_ZIP` variable.
#[cfg(test)]
mod test {
    use super::nfd;
    use regex::Regex;
    use std::io::Read;
    use zip::read::ZipArchive;

    /// Reads the archive entry named `path` and returns its contents as a `String`.
    ///
    /// The archive location is taken from the `UCD_ZIP` environment variable at
    /// compile time.
    ///
    /// # Errors
    /// Returns `ErrorKind::NotFound` when the entry cannot be obtained from the archive.
    ///
    /// # Panics
    /// Panics when the archive cannot be opened or parsed, or when the entry cannot be
    /// read as UTF-8 text.
    pub fn read_zip(path: &str) -> Result<String, std::io::Error> {
        const ZIP_PATH: &str = std::env!("UCD_ZIP");
        let archive = std::fs::File::open(ZIP_PATH).expect("UCD_ZIP archive must be openable");
        let mut zip = ZipArchive::new(archive).expect("UCD_ZIP must be a valid zip archive");
        let mut file = zip.by_name(path).map_err(|_| std::io::ErrorKind::NotFound)?;
        let mut out = String::new();
        file.read_to_string(&mut out).expect("zip entry must be readable UTF-8 text");
        Ok(out)
    }

    /// Parses one space-separated field of hexadecimal code points.
    fn parse_code_points(field: &str) -> Vec<u32> {
        field
            .split(' ')
            .map(|hex| u32::from_str_radix(hex, 16).expect("field must hold hex code points"))
            .collect()
    }

    /// Loads every data line of `NormalizationTest.txt` as a tuple of its five
    /// code-point columns, in file order. The tests in this module read column 0 as the
    /// source text and column 2 as the expected NFD form.
    ///
    /// Lines that do not match the five-column pattern (comments, part headers, blank
    /// lines) are skipped.
    pub fn normalization_test(
    ) -> Result<Vec<(Vec<u32>, Vec<u32>, Vec<u32>, Vec<u32>, Vec<u32>)>, std::io::Error> {
        let data = read_zip("NormalizationTest.txt")?;
        let re =
            Regex::new(r"([A-F0-9 ]+);([A-F0-9 ]+);([A-F0-9 ]+);([A-F0-9 ]+);([A-F0-9 ]+); #.*$")
                .expect("pattern is a valid regex");
        let tests = data
            .lines()
            .filter_map(|line| re.captures(line))
            .map(|cap| {
                (
                    parse_code_points(&cap[1]),
                    parse_code_points(&cap[2]),
                    parse_code_points(&cap[3]),
                    parse_code_points(&cap[4]),
                    parse_code_points(&cap[5]),
                )
            })
            .collect();
        Ok(tests)
    }

    #[test]
    fn test_decomposition_of_non_starters() {
        assert_eq!(nfd("\u{0360}\u{0340}a".chars()).collect::<String>(), "\u{0300}\u{0360}a");
        assert_eq!(nfd("\u{09CB}\u{0300}".chars()).collect::<String>(), "\u{09c7}\u{09be}\u{0300}");
    }

    #[test]
    fn test_nfd() {
        // `case` is the zero-based index among parsed data lines, not a file line number.
        for (case, testdata) in normalization_test().unwrap().into_iter().enumerate() {
            let source: String = testdata.0.iter().map(|x| char::from_u32(*x).unwrap()).collect();
            let expected: String = testdata.2.iter().map(|x| char::from_u32(*x).unwrap()).collect();
            assert_eq!(
                nfd(source.chars()).collect::<String>(),
                expected,
                "Failure in case {case} of NormalizationTest.txt: {:#x} {testdata:?}..",
                testdata.0[0],
            );
        }
    }
}