Skip to main content

fxfs_unicode/
lib.rs

1// Copyright 2024 The Fuchsia Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style license that can be
3// found in the LICENSE file.
4use fprint::TypeFingerprint;
5use serde::{Deserialize, Serialize};
6use std::collections::VecDeque;
7use std::hash::{Hash, Hasher};
8
9mod lookup;
10mod nfd;
11use unicode_gen;
12
13/// Filters a valid sequence of unicode characters, casefolding.
14pub struct CaseFoldIterator<I: Iterator<Item = char>> {
15    /// The not-yet-normalized input sequence.
16    input: I,
17    buf: VecDeque<char>,
18}
19
20impl<I: Iterator<Item = char>> Iterator for CaseFoldIterator<I> {
21    type Item = char;
22
23    fn next(&mut self) -> Option<char> {
24        if let Some(ch) = self.buf.pop_front() {
25            return Some(ch);
26        }
27        self.input.next().map(|ch| {
28            if let Some(mapping) = crate::lookup::casefold(ch) {
29                let mut chars = mapping.chars();
30                let first = chars.next().unwrap();
31                self.buf.extend(chars);
32                first
33            } else {
34                ch
35            }
36        })
37    }
38}
39
40/// Wraps an Iterator<Item=char> so that it case folds.
41pub fn casefold<I: Iterator<Item = char>>(input: I) -> CaseFoldIterator<I> {
42    CaseFoldIterator { input, buf: VecDeque::new() }
43}
44
45/// A comparison function that:
46///  * Applies casefolding.
47///  * Removes default ignorable characters.
48///  * Applies nfd normalization.
49/// That function will early-out at the first non-matching character.
50pub fn casefold_cmp(a: &str, b: &str) -> std::cmp::Ordering {
51    let a_it = nfd::nfd(casefold(a.chars()).filter(|x| !lookup::default_ignorable(*x)));
52    let b_it = nfd::nfd(casefold(b.chars()).filter(|x| !lookup::default_ignorable(*x)));
53    a_it.cmp(b_it)
54}
55
56// A simple wrapper around String that provides casefolding comparison.
57#[derive(arbitrary::Arbitrary, Clone, Eq, Serialize, Deserialize, TypeFingerprint)]
58pub struct CasefoldString(String);
59impl CasefoldString {
60    pub fn new(s: String) -> Self {
61        Self(s)
62    }
63}
64impl std::fmt::Debug for CasefoldString {
65    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> Result<(), std::fmt::Error> {
66        write!(f, "CasefoldString(\"{}\")", self.0)
67    }
68}
69impl std::fmt::Display for CasefoldString {
70    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> Result<(), std::fmt::Error> {
71        write!(f, "{}", self.0)
72    }
73}
74impl std::ops::Deref for CasefoldString {
75    type Target = str;
76    fn deref(&self) -> &str {
77        &self.0
78    }
79}
80impl std::cmp::PartialEq for CasefoldString {
81    fn eq(&self, rhs: &Self) -> bool {
82        casefold_cmp(&self.0, &rhs.0) == std::cmp::Ordering::Equal
83    }
84}
85impl std::cmp::Ord for CasefoldString {
86    fn cmp(&self, rhs: &Self) -> std::cmp::Ordering {
87        casefold_cmp(&self.0, &rhs.0)
88    }
89}
90impl std::cmp::PartialOrd for CasefoldString {
91    fn partial_cmp(&self, rhs: &Self) -> Option<std::cmp::Ordering> {
92        Some(casefold_cmp(&self.0, &rhs.0))
93    }
94}
95
96pub fn casefold_hash(s: &str, state: &mut impl Hasher) {
97    let normalized_chars = nfd::nfd(casefold(s.chars()).filter(|x| !lookup::default_ignorable(*x)));
98    for ch in normalized_chars {
99        ch.hash(state);
100    }
101}
102
103// Nb: This trait is provided for completeness but is NOT intended to be performant.
104impl Hash for CasefoldString {
105    fn hash<H>(&self, state: &mut H)
106    where
107        H: std::hash::Hasher,
108    {
109        casefold_hash(&self.0, state);
110    }
111}
112
113impl From<&str> for CasefoldString {
114    fn from(item: &str) -> Self {
115        CasefoldString(item.into())
116    }
117}
118
119#[cfg(test)]
120mod test {
121    use super::*;
122
123    #[test]
124    fn test_casefold() {
125        assert_eq!(casefold("Hello There".chars()).collect::<String>(), "hello there");
126        assert_eq!(casefold("HELLO There".chars()).collect::<String>(), "hello there");
127    }
128
129    #[test]
130    fn test_casefold_cmp() {
131        assert_eq!(casefold_cmp("Hello", "hello"), std::cmp::Ordering::Equal);
132        assert_eq!(casefold_cmp("Hello There", "hello"), std::cmp::Ordering::Greater);
133        assert_eq!(casefold_cmp("hello there", "hello"), std::cmp::Ordering::Greater);
134        assert_eq!(casefold_cmp("hello\u{00AD}", "hello"), std::cmp::Ordering::Equal);
135        assert_eq!(casefold_cmp("\u{03AA}", "\u{0399}\u{0308}"), std::cmp::Ordering::Equal);
136
137        // Gracefully handle the degenerate case where we start with modifiers
138        assert_eq!(
139            casefold_cmp("\u{308}\u{05ae}Hello", "\u{05ae}\u{308}hello"),
140            std::cmp::Ordering::Equal
141        );
142    }
143
144    #[test]
145    fn test_casefoldstring() {
146        let a = CasefoldString::new("Hello There".to_owned());
147        let b = CasefoldString::new("hello there".to_owned());
148        let c = CasefoldString::new("hello".to_owned());
149        let d = CasefoldString::new("\u{03AA}".to_owned());
150        let e = CasefoldString::new("\u{0399}\u{0308}".to_owned());
151        // Check some comparisons.
152        assert_eq!(a, a);
153        assert_eq!(a, b);
154        assert_eq!(d, e);
155        assert!(a > c);
156        assert!(b > c);
157        assert_eq!(a.0, format!("{}", a));
158        // Debug::fmt should show type to avoid confusion.
159        assert_eq!("CasefoldString(\"Hello There\")", format!("{:?}", a));
160        // Displays the same as String.
161        assert_eq!(format!("{}", "Hello There".to_owned()), format!("{}", a));
162    }
163
164    #[test]
165    fn test_casefold_hash_equality() {
166        use std::hash::{Hash, Hasher};
167        fn get_hash<T: Hash>(t: &T) -> u64 {
168            let mut s = std::collections::hash_map::DefaultHasher::new();
169            t.hash(&mut s);
170            s.finish()
171        }
172        // hello and hello with soft hyphen (ignorable) should be equal and have identical hashes
173        let a = CasefoldString::new("hello\u{00AD}".to_owned());
174        let b = CasefoldString::new("hello".to_owned());
175        assert_eq!(a, b);
176        assert_eq!(get_hash(&a), get_hash(&b));
177    }
178}