Skip to main content

fxfs_unicode/
lib.rs

1// Copyright 2024 The Fuchsia Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style license that can be
3// found in the LICENSE file.
4use fprint::TypeFingerprint;
5use serde::{Deserialize, Serialize};
6use std::collections::VecDeque;
7use std::hash::{Hash, Hasher};
8
9mod lookup;
10mod nfd;
11use unicode_gen;
12
13/// Filters a valid sequence of unicode characters, casefolding.
14pub struct CaseFoldIterator<I: Iterator<Item = char>> {
15    /// The not-yet-normalized input sequence.
16    input: I,
17    buf: VecDeque<char>,
18}
19
20impl<I: Iterator<Item = char>> Iterator for CaseFoldIterator<I> {
21    type Item = char;
22
23    fn next(&mut self) -> Option<char> {
24        if let Some(ch) = self.buf.pop_front() {
25            return Some(ch);
26        }
27        self.input.next().map(|ch| {
28            if let Some(mapping) = crate::lookup::casefold(ch) {
29                let mut chars = mapping.chars();
30                let first = chars.next().unwrap();
31                self.buf.extend(chars);
32                first
33            } else {
34                ch
35            }
36        })
37    }
38}
39
40pub fn casefold<I: Iterator<Item = char>>(input: I) -> CaseFoldIterator<I> {
41    CaseFoldIterator { input, buf: VecDeque::new() }
42}
43
44/// Helper function to convert a `char` to an iterator over its UTF-8 bytes
45/// without any heap allocations.
46pub fn utf8_bytes(c: char) -> impl Iterator<Item = u8> {
47    let mut buf = [0; 4];
48    let len = c.encode_utf8(&mut buf).len();
49    buf.into_iter().take(len)
50}
51
52/// A comparison function that:
53///  * Applies casefolding.
54///  * Removes default ignorable characters.
55///  * Applies nfd normalization.
56/// That function will early-out at the first non-matching character.
57pub fn casefold_cmp(a: &str, b: &str) -> std::cmp::Ordering {
58    let a_it = nfd::nfd(casefold(a.chars()).filter(|x| !lookup::default_ignorable(*x)));
59    let b_it = nfd::nfd(casefold(b.chars()).filter(|x| !lookup::default_ignorable(*x)));
60    a_it.cmp(b_it)
61}
62
63// A simple wrapper around String that provides casefolding comparison.
64#[derive(arbitrary::Arbitrary, Clone, Eq, Serialize, Deserialize, TypeFingerprint)]
65pub struct CasefoldString(String);
66impl CasefoldString {
67    pub fn new(s: String) -> Self {
68        Self(s)
69    }
70}
71impl std::fmt::Debug for CasefoldString {
72    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> Result<(), std::fmt::Error> {
73        write!(f, "CasefoldString(\"{}\")", self.0)
74    }
75}
76impl std::fmt::Display for CasefoldString {
77    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> Result<(), std::fmt::Error> {
78        write!(f, "{}", self.0)
79    }
80}
81/// A borrowed slice of a casefolded string.
82///
83/// It is designed to be the borrowed counterpart to `CasefoldString`, mirroring
84/// the relationship between `str` and `String`.
85///
86/// We wrap `str` instead of a struct like `CasefoldStr<'a>(&'a str)`
87/// so that `CasefoldString` can implement `Deref<Target = CasefoldStr>`. This allows
88/// `CasefoldString` to coerce to `&CasefoldStr` automatically, and enables zero-allocation
89/// map lookups via `Borrow<CasefoldStr>`.
90#[repr(transparent)]
91pub struct CasefoldStr(str);
92
93impl CasefoldStr {
94    pub fn new(s: &str) -> &Self {
95        // SAFETY: `CasefoldStr` is `#[repr(transparent)]` around `str`, so it has the same layout
96        // and alignment.
97        unsafe { &*(s as *const str as *const CasefoldStr) }
98    }
99
100    pub fn as_str(&self) -> &str {
101        &self.0
102    }
103
104    pub fn casefold_normalized_chars(&self) -> impl Iterator<Item = char> + '_ {
105        nfd::nfd(casefold(self.0.chars()).filter(|x| !lookup::default_ignorable(*x)))
106    }
107}
108
109impl std::fmt::Debug for CasefoldStr {
110    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> Result<(), std::fmt::Error> {
111        write!(f, "CasefoldStr(\"{}\")", &self.0)
112    }
113}
114
115impl std::fmt::Display for CasefoldStr {
116    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> Result<(), std::fmt::Error> {
117        write!(f, "{}", &self.0)
118    }
119}
120
121impl std::cmp::PartialEq for CasefoldStr {
122    fn eq(&self, rhs: &Self) -> bool {
123        casefold_cmp(self.as_str(), rhs.as_str()).is_eq()
124    }
125}
126impl std::cmp::Eq for CasefoldStr {}
127
128impl std::cmp::Ord for CasefoldStr {
129    fn cmp(&self, rhs: &Self) -> std::cmp::Ordering {
130        casefold_cmp(self.as_str(), rhs.as_str())
131    }
132}
133impl std::cmp::PartialOrd for CasefoldStr {
134    fn partial_cmp(&self, rhs: &Self) -> Option<std::cmp::Ordering> {
135        Some(self.cmp(rhs))
136    }
137}
138
139impl ToOwned for CasefoldStr {
140    type Owned = CasefoldString;
141    fn to_owned(&self) -> Self::Owned {
142        CasefoldString::new(self.as_str().to_owned())
143    }
144}
145
146impl Hash for CasefoldStr {
147    fn hash<H: Hasher>(&self, state: &mut H) {
148        for ch in self.casefold_normalized_chars() {
149            ch.hash(state);
150        }
151    }
152}
153
154impl CasefoldString {
155    pub fn as_str(&self) -> &str {
156        &self.0
157    }
158}
159
160impl std::ops::Deref for CasefoldString {
161    type Target = CasefoldStr;
162    fn deref(&self) -> &CasefoldStr {
163        CasefoldStr::new(&self.0)
164    }
165}
166
167impl std::borrow::Borrow<CasefoldStr> for CasefoldString {
168    fn borrow(&self) -> &CasefoldStr {
169        &**self
170    }
171}
172
173impl std::cmp::PartialEq for CasefoldString {
174    fn eq(&self, rhs: &Self) -> bool {
175        **self == **rhs
176    }
177}
178
179impl std::cmp::Ord for CasefoldString {
180    fn cmp(&self, rhs: &Self) -> std::cmp::Ordering {
181        (**self).cmp(&**rhs)
182    }
183}
184
185impl std::cmp::PartialOrd for CasefoldString {
186    fn partial_cmp(&self, rhs: &Self) -> Option<std::cmp::Ordering> {
187        Some(self.cmp(rhs))
188    }
189}
190
191// Nb: This trait is provided for completeness but is NOT intended to be performant.
192impl Hash for CasefoldString {
193    fn hash<H>(&self, state: &mut H)
194    where
195        H: Hasher,
196    {
197        (**self).hash(state);
198    }
199}
200
201impl From<&str> for CasefoldString {
202    fn from(item: &str) -> Self {
203        CasefoldString(item.into())
204    }
205}
206
207impl<'a> From<&'a str> for &'a CasefoldStr {
208    fn from(item: &'a str) -> Self {
209        CasefoldStr::new(item)
210    }
211}
212#[cfg(test)]
213mod test {
214    use super::*;
215    use std::hash::{Hash, Hasher};
216
217    fn get_hash<T: Hash + ?Sized>(t: &T) -> u64 {
218        let mut s = std::collections::hash_map::DefaultHasher::new();
219        t.hash(&mut s);
220        s.finish()
221    }
222
223    #[test]
224    fn test_casefold() {
225        assert_eq!(casefold("Hello There".chars()).collect::<String>(), "hello there");
226        assert_eq!(casefold("HELLO There".chars()).collect::<String>(), "hello there");
227    }
228
229    #[test]
230    fn test_casefold_cmp() {
231        assert_eq!(casefold_cmp("Hello", "hello"), std::cmp::Ordering::Equal);
232        assert_eq!(casefold_cmp("Hello There", "hello"), std::cmp::Ordering::Greater);
233        assert_eq!(casefold_cmp("hello there", "hello"), std::cmp::Ordering::Greater);
234        assert_eq!(casefold_cmp("hello\u{00AD}", "hello"), std::cmp::Ordering::Equal);
235        assert_eq!(casefold_cmp("\u{03AA}", "\u{0399}\u{0308}"), std::cmp::Ordering::Equal);
236
237        // Gracefully handle the degenerate case where we start with modifiers
238        assert_eq!(
239            casefold_cmp("\u{308}\u{05ae}Hello", "\u{05ae}\u{308}hello"),
240            std::cmp::Ordering::Equal
241        );
242    }
243
244    #[test]
245    fn test_casefoldstring() {
246        let a = CasefoldString::new("Hello There".to_owned());
247        let b = CasefoldString::new("hello there".to_owned());
248        let c = CasefoldString::new("hello".to_owned());
249        let d = CasefoldString::new("\u{03AA}".to_owned());
250        let e = CasefoldString::new("\u{0399}\u{0308}".to_owned());
251        // Check some comparisons.
252        assert_eq!(a, a);
253        assert_eq!(a, b);
254        assert_eq!(d, e);
255        assert!(a > c);
256        assert!(b > c);
257        assert_eq!(a.0, format!("{}", a));
258        // Debug::fmt should show type to avoid confusion.
259        assert_eq!("CasefoldString(\"Hello There\")", format!("{:?}", a));
260        // Displays the same as String.
261        assert_eq!(format!("{}", "Hello There".to_owned()), format!("{}", a));
262    }
263
264    #[test]
265    fn test_casefold_hash_equality() {
266        // hello and hello with soft hyphen (ignorable) should be equal and have identical hashes
267        let a = CasefoldString::new("hello\u{00AD}".to_owned());
268        let b = CasefoldString::new("hello".to_owned());
269        assert_eq!(a, b);
270        assert_eq!(get_hash(&a), get_hash(&b));
271    }
272
273    #[test]
274    fn test_casefoldstr() {
275        use std::borrow::Borrow;
276
277        let a = CasefoldString::new("Hello There".to_owned());
278        let b = CasefoldStr::new("hello there");
279        let c = CasefoldStr::new("hello");
280
281        // Compare CasefoldString with CasefoldStr (via Borrow)
282        let borrowed_a: &CasefoldStr = a.borrow();
283        assert_eq!(borrowed_a, b);
284        assert!(borrowed_a > c);
285
286        // Compare CasefoldStr with CasefoldStr
287        assert_eq!(b, b);
288        assert_eq!(CasefoldStr::new("Hello"), CasefoldStr::new("hello"));
289        assert!(b > c);
290
291        // Verify as_str() returns the original case-sensitive string
292        assert_eq!(b.as_str(), "hello there");
293        assert_ne!(b.as_str(), "Hello There");
294
295        // Test ToOwned
296        let owned_b: CasefoldString = b.to_owned();
297        assert_eq!(owned_b, a);
298
299        // Test Hash consistency
300        assert_eq!(get_hash(&a), get_hash(borrowed_a));
301        assert_eq!(get_hash(&owned_b), get_hash(b));
302    }
303
304    #[test]
305    fn test_casefold_str_normalized() {
306        // soft hyphen (ignorable), ohm sign (decomposes to omega)
307        let raw = "Hello\u{00AD} World\u{2126}";
308        let cf_str = CasefoldStr::new(raw);
309
310        let normalized_chars: String = cf_str.casefold_normalized_chars().collect();
311
312        // expected:
313        // "Hello" -> "hello"
314        // "\u{00AD}" -> stripped
315        // " " -> " "
316        // "World" -> "world"
317        // "\u{2126}" -> \u{03c9} (omega lowercase in NFD)
318        let expected_str = "hello world\u{03c9}";
319        assert_eq!(normalized_chars, expected_str);
320    }
321}