fxfs_unicode/
nfd.rs
1use crate::lookup::{self, ccc};
12use std::ops::Range;
13
/// Code points handled by the arithmetic Hangul decomposition instead of the lookup table.
///
/// The range is half-open: U+AC00 is the first member and U+D7A3 the last.
const HANGUL_RANGE: Range<char> = '\u{AC00}'..'\u{D7A4}';
17
/// Appends the decomposition of the precomposed Hangul syllable `s` to `out`.
///
/// The decomposition is computed arithmetically from the syllable's index: a leading
/// consonant and a vowel are always pushed, followed by a trailing consonant when the
/// syllable has one. Existing contents of `out` are left untouched.
///
/// `s` must lie in U+AC00..=U+D7A3. The `unsafe` conversions below rely on that, so the
/// precondition is checked in debug builds.
fn hangul_decomposition(s: char, out: &mut Vec<char>) {
    const SBASE: u32 = 0xac00;
    const LBASE: u32 = 0x1100;
    const VBASE: u32 = 0x1161;
    const TBASE: u32 = 0x11a7;
    const TCOUNT: u32 = 28;
    const NCOUNT: u32 = 588;
    // Number of precomposed syllables: 19 leading * 21 vowel * 28 trailing slots.
    const SCOUNT: u32 = 11172;

    // Outside this range the subtraction below either underflows or yields indices
    // that no longer map to the intended jamo blocks.
    debug_assert!(
        (SBASE..SBASE + SCOUNT).contains(&(s as u32)),
        "hangul_decomposition called with non-Hangul-syllable {s:?}"
    );

    let s_index = s as u32 - SBASE;
    let l_index = s_index / NCOUNT;
    let v_index = (s_index % NCOUNT) / TCOUNT;
    let t_index = s_index % TCOUNT;

    out.push(unsafe {
        // SAFETY: s_index < SCOUNT, so l_index < 19 and the sum is in U+1100..=U+1112,
        // which are valid Unicode scalar values.
        char::from_u32_unchecked(LBASE + l_index)
    });
    out.push(unsafe {
        // SAFETY: s_index % NCOUNT < 588, so v_index < 21 and the sum is in
        // U+1161..=U+1175, which are valid Unicode scalar values.
        char::from_u32_unchecked(VBASE + v_index)
    });
    // A zero trailing index means the syllable has no trailing consonant.
    if t_index != 0 {
        out.push(unsafe {
            // SAFETY: 1 <= t_index < 28, so the sum is in U+11A8..=U+11C2, which are
            // valid Unicode scalar values.
            char::from_u32_unchecked(TBASE + t_index)
        });
    }
}
42
43fn decompose(ch: char, out: &mut Vec<char>) {
45 if HANGUL_RANGE.contains(&ch) {
46 hangul_decomposition(ch, out);
47 } else {
48 if let Some(mapping) = lookup::decomposition(ch) {
49 out.extend(mapping.chars());
50 } else {
51 out.push(ch);
52 }
53 }
54}
55
/// Iterator adaptor that yields the characters of an underlying `char` iterator in
/// decomposed (NFD) form. Construct one with [`nfd`].
pub struct NfdIterator<I>
where
    I: Iterator<Item = char>,
{
    /// Source characters; peekable so the next character can be inspected before
    /// deciding whether to consume it.
    input: std::iter::Peekable<I>,
    /// Decomposed characters waiting to be handed out.
    buf: Vec<char>,
    /// Index into `buf` of the next character to return.
    pos: usize,
}
65
66impl<I: Iterator<Item = char>> std::iter::FusedIterator for NfdIterator<I> {}
67
68impl<I: Iterator<Item = char>> Iterator for NfdIterator<I> {
69 type Item = char;
70
71 fn next(&mut self) -> Option<char> {
72 if self.pos < self.buf.len() {
73 let ch = self.buf[self.pos];
74 self.pos += 1;
75 return Some(ch);
76 }
77 self.buf.clear();
78 self.pos = 0;
79
80 if let Some(ch) = self.input.next_if(|&x| ccc(x) == 0) {
82 decompose(ch, &mut self.buf);
83 }
84 while let Some(ch) = self.input.next_if(|&x| ccc(x) != 0) {
86 decompose(ch, &mut self.buf);
87 }
88 self.buf[0..].sort_by(|a, b| ccc(*a).cmp(&ccc(*b)));
90
91 if self.pos < self.buf.len() {
92 let ch = self.buf[self.pos];
93 self.pos += 1;
94 Some(ch)
95 } else {
96 None
97 }
98 }
99}
100
101pub fn nfd<I: Iterator<Item = char>>(input: I) -> NfdIterator<I> {
102 NfdIterator { input: input.peekable(), buf: Vec::new(), pos: 0 }
103}
104
#[cfg(test)]
mod test {
    use super::nfd;
    use regex::Regex;
    use std::io::Read;
    use zip::read::ZipArchive;

    /// Reads the archive member `path` from the zip file named by the compile-time
    /// `UCD_ZIP` environment variable and returns its contents as a `String`.
    ///
    /// Returns `NotFound` when the archive has no such member. Failing to open the
    /// archive, read the member or decode it as UTF-8 panics.
    pub fn read_zip(path: &str) -> Result<String, std::io::Error> {
        const ZIP_PATH: &str = std::env!("UCD_ZIP");
        let mut zip = ZipArchive::new(std::fs::File::open(ZIP_PATH).unwrap()).unwrap();
        let mut file = zip.by_name(path).map_err(|_| std::io::ErrorKind::NotFound)?;
        let mut out = Vec::new();
        file.read_to_end(&mut out).unwrap();
        Ok(String::from_utf8(out).unwrap())
    }

    /// Parses a space-separated list of hexadecimal code points.
    fn parse_code_points(field: &str) -> Vec<u32> {
        field.split(' ').map(|x| u32::from_str_radix(x, 16).unwrap()).collect()
    }

    /// Loads `NormalizationTest.txt` and returns one tuple per data line, holding the
    /// code points of the five columns in file order: (source, NFC, NFD, NFKC, NFKD).
    ///
    /// Lines that do not match the five-column pattern are skipped.
    pub fn normalization_test(
    ) -> Result<Vec<(Vec<u32>, Vec<u32>, Vec<u32>, Vec<u32>, Vec<u32>)>, std::io::Error> {
        let data = read_zip("NormalizationTest.txt")?;

        let mut tests = Vec::new();
        let re =
            Regex::new(r"([A-F0-9 ]+);([A-F0-9 ]+);([A-F0-9 ]+);([A-F0-9 ]+);([A-F0-9 ]+); #.*$")
                .unwrap();
        for line in data.lines().filter(|line| !line.is_empty()) {
            if let Some(cap) = re.captures(line) {
                tests.push((
                    parse_code_points(&cap[1]),
                    parse_code_points(&cap[2]),
                    parse_code_points(&cap[3]),
                    parse_code_points(&cap[4]),
                    parse_code_points(&cap[5]),
                ));
            }
        }
        Ok(tests)
    }

    #[test]
    fn test_decomposition_of_non_starters() {
        // A combining mark that itself decomposes (U+0340 -> U+0300) must be reordered
        // against the preceding mark after decomposition.
        assert_eq!(nfd("\u{0360}\u{0340}a".chars()).collect::<String>(), "\u{0300}\u{0360}a");

        // A character that decomposes into two characters, followed by a combining mark
        // that must stay behind both of them.
        assert_eq!(nfd("\u{09CB}\u{0300}".chars()).collect::<String>(), "\u{09c7}\u{09be}\u{0300}");
    }

    #[test]
    fn test_nfd() {
        for (line, testdata) in normalization_test().unwrap().into_iter().enumerate() {
            let source: String = testdata.0.iter().map(|x| char::from_u32(*x).unwrap()).collect();
            let expected: String = testdata.2.iter().map(|x| char::from_u32(*x).unwrap()).collect();
            assert_eq!(
                nfd(source.chars()).collect::<String>(),
                expected,
                "Failure in case {line} of NormalizationTest.txt: {:#x} {testdata:?}..",
                testdata.0[0],
            );
        }
    }
}