fxfs_unicode/
nfd.rs

1// Copyright 2024 The Fuchsia Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style license that can be
3// found in the LICENSE file.
4
5//! Normalized decomposition involves:
6//!   1. Recursively applying character decompositions.
7//!   1. Special-case Hangul decomposition.
8//!   2. Reordering via the 'canonical ordering algorithm'.
9//!
10//! We expose a simple character-by-charcter iterator interface for NFD.
11use crate::lookup::{self, ccc};
12use std::ops::Range;
13
14/// Hangul and CJK have assigned ranges but no per-character data within those ranges but
15/// only Hangul needs algorithmic decomposition.
16const HANGUL_RANGE: Range<char> = '\u{ac00}'..'\u{d7a4}';
17
18/// Decomposition of Hangul syllables as according to the unicode 15.0 standard.
19fn hangul_decomposition(s: char, out: &mut Vec<char>) {
20    const SBASE: u32 = 0xac00;
21    const LBASE: u32 = 0x1100;
22    const VBASE: u32 = 0x1161;
23    const TBASE: u32 = 0x11a7;
24    const TCOUNT: u32 = 28;
25    const NCOUNT: u32 = 588;
26    let s_index = s as u32 - SBASE;
27    out.push(unsafe {
28        // Safety: Known unicode codepoint
29        char::from_u32_unchecked(LBASE + s_index / NCOUNT)
30    });
31    out.push(unsafe {
32        // Safety: Known unicode codepoint
33        char::from_u32_unchecked(VBASE + (s_index % NCOUNT) / TCOUNT)
34    });
35    if s_index % TCOUNT != 0 {
36        out.push(unsafe {
37            // Safety: Known unicode codepoint
38            char::from_u32_unchecked(TBASE + s_index % TCOUNT)
39        });
40    }
41}
42
43/// Perform decomposition (Hangul, then lookup), appending result to 'out'
44fn decompose(ch: char, out: &mut Vec<char>) {
45    if HANGUL_RANGE.contains(&ch) {
46        hangul_decomposition(ch, out);
47    } else {
48        if let Some(mapping) = lookup::decomposition(ch) {
49            out.extend(mapping.chars());
50        } else {
51            out.push(ch);
52        }
53    }
54}
55
56/// Filters a valid sequence of unicode characters, outputting an normalized (NFD) sequence.
57pub struct NfdIterator<I: Iterator<Item = char>> {
58    /// The not-yet-normalized input sequence.
59    input: std::iter::Peekable<I>,
60    /// A working area used to normalize characters prior to emitting them.
61    buf: Vec<char>,
62    /// Tracks the cursor position in 'buf'.
63    pos: usize,
64}
65
66impl<I: Iterator<Item = char>> std::iter::FusedIterator for NfdIterator<I> {}
67
68impl<I: Iterator<Item = char>> Iterator for NfdIterator<I> {
69    type Item = char;
70
71    fn next(&mut self) -> Option<char> {
72        if self.pos < self.buf.len() {
73            let ch = self.buf[self.pos];
74            self.pos += 1;
75            return Some(ch);
76        }
77        self.buf.clear();
78        self.pos = 0;
79
80        // Printable entities start with a ccc=0 and their decompositions start with ccc=0
81        if let Some(ch) = self.input.next_if(|&x| ccc(x) == 0) {
82            decompose(ch, &mut self.buf);
83        }
84        // Fill buffer with any ccc>0 (modifiers) and then sort it.
85        while let Some(ch) = self.input.next_if(|&x| ccc(x) != 0) {
86            decompose(ch, &mut self.buf);
87        }
88        // Rust's sort is stable so we can rely on it not to reorder the ccc=0 elements.
89        self.buf[0..].sort_by(|a, b| ccc(*a).cmp(&ccc(*b)));
90
91        if self.pos < self.buf.len() {
92            let ch = self.buf[self.pos];
93            self.pos += 1;
94            Some(ch)
95        } else {
96            None
97        }
98    }
99}
100
101pub fn nfd<I: Iterator<Item = char>>(input: I) -> NfdIterator<I> {
102    NfdIterator { input: input.peekable(), buf: Vec::new(), pos: 0 }
103}
104
105#[cfg(test)]
106mod test {
107    use super::nfd;
108    use regex::Regex;
109    use std::io::Read;
110    use zip::read::ZipArchive;
111
112    pub fn read_zip(path: &str) -> Result<String, std::io::Error> {
113        const ZIP_PATH: &'static str = std::env!("UCD_ZIP");
114        let mut zip = ZipArchive::new(std::fs::File::open(ZIP_PATH).unwrap()).unwrap();
115        let mut file = zip.by_name(path).map_err(|_| std::io::ErrorKind::NotFound)?;
116        let mut out = Vec::new();
117        file.read_to_end(&mut out).unwrap();
118        Ok(String::from_utf8(out).unwrap())
119    }
120
121    /// Test suite of codepoint to normalized forms.
122    /// Value contains different types of normalization (original, nfc, nfd, nfkc, nfkd)
123    pub fn normalization_test(
124    ) -> Result<Vec<(Vec<u32>, Vec<u32>, Vec<u32>, Vec<u32>, Vec<u32>)>, std::io::Error> {
125        // We use the UCD provided test data for validating our implementation.
126        let data = read_zip("NormalizationTest.txt")?;
127
128        let mut tests = Vec::new();
129        let re =
130            Regex::new(r"([A-F0-9 ]+);([A-F0-9 ]+);([A-F0-9 ]+);([A-F0-9 ]+);([A-F0-9 ]+); #.*$")
131                .unwrap();
132        for line in data.split("\n") {
133            if line.len() == 0 {
134                continue;
135            }
136            if let Some(cap) = re.captures(&line) {
137                let source: Vec<u32> =
138                    cap[1].split(" ").map(|x| u32::from_str_radix(x, 16).unwrap()).collect();
139                let nfc: Vec<u32> =
140                    cap[2].split(" ").map(|x| u32::from_str_radix(x, 16).unwrap()).collect();
141                let nfd: Vec<u32> =
142                    cap[3].split(" ").map(|x| u32::from_str_radix(x, 16).unwrap()).collect();
143                let nfkc: Vec<u32> =
144                    cap[4].split(" ").map(|x| u32::from_str_radix(x, 16).unwrap()).collect();
145                let nfkd: Vec<u32> =
146                    cap[5].split(" ").map(|x| u32::from_str_radix(x, 16).unwrap()).collect();
147                tests.push((source, nfc, nfd, nfkc, nfkd));
148            }
149        }
150        Ok(tests)
151    }
152
153    #[test]
154    fn test_decomposition_of_non_starters() {
155        // '\u{0340}' decomposes to '\u{0300}'. Neither are ccc=0.
156        // This tests that we don't make the assumption that decompositions are ccc=0.
157        assert_eq!(nfd("\u{0360}\u{0340}a".chars()).collect::<String>(), "\u{0300}\u{0360}a");
158
159        assert_eq!(nfd("\u{09CB}\u{0300}".chars()).collect::<String>(), "\u{09c7}\u{09be}\u{0300}");
160    }
161
162    #[test]
163    fn test_nfd() {
164        for (line, testdata) in normalization_test().unwrap().into_iter().enumerate() {
165            let source: String = testdata.0.iter().map(|x| char::from_u32(*x).unwrap()).collect();
166            let expected: String = testdata.2.iter().map(|x| char::from_u32(*x).unwrap()).collect();
167            assert_eq!(
168                nfd(source.chars()).collect::<String>(),
169                expected,
170                "Failute in case {line} of NormalizationTest.txt: {:#x} {testdata:?}..",
171                testdata.0[0],
172            );
173        }
174    }
175}