fxfs_unicode/
lib.rs

1// Copyright 2024 The Fuchsia Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style license that can be
3// found in the LICENSE file.
4use fprint::TypeFingerprint;
5use serde::{Deserialize, Serialize};
6use std::collections::VecDeque;
7
8mod lookup;
9mod nfd;
10use unicode_gen;
11
12/// Filters a valid sequence of unicode characters, casefolding.
13pub struct CaseFoldIterator<I: Iterator<Item = char>> {
14    /// The not-yet-normalized input sequence.
15    input: I,
16    buf: VecDeque<char>,
17}
18
19impl<I: Iterator<Item = char>> Iterator for CaseFoldIterator<I> {
20    type Item = char;
21
22    fn next(&mut self) -> Option<char> {
23        if let Some(ch) = self.buf.pop_front() {
24            return Some(ch);
25        }
26        self.input.next().map(|ch| {
27            if let Some(mapping) = crate::lookup::casefold(ch) {
28                let mut chars = mapping.chars();
29                let first = chars.next().unwrap();
30                self.buf.extend(chars);
31                first
32            } else {
33                ch
34            }
35        })
36    }
37}
38
39/// Wraps an Iterator<Item=char> so that it case folds.
40pub fn casefold<I: Iterator<Item = char>>(input: I) -> CaseFoldIterator<I> {
41    CaseFoldIterator { input, buf: VecDeque::new() }
42}
43
44/// A comparison function that:
45///  * Applies casefolding.
46///  * Removes default ignorable characters.
47///  * Applies nfd normalization.
48/// That function will early-out at the first non-matching character.
49pub fn casefold_cmp(a: &str, b: &str) -> std::cmp::Ordering {
50    let a_it = nfd::nfd(casefold(a.chars()).filter(|x| !lookup::default_ignorable(*x)));
51    let b_it = nfd::nfd(casefold(b.chars()).filter(|x| !lookup::default_ignorable(*x)));
52    a_it.cmp(b_it)
53}
54
55// A simple wrapper around String that provides casefolding comparison.
56#[derive(arbitrary::Arbitrary, Clone, Eq, Serialize, Deserialize, TypeFingerprint)]
57pub struct CasefoldString(String);
58impl CasefoldString {
59    pub fn new(s: String) -> Self {
60        Self(s)
61    }
62}
63impl std::fmt::Debug for CasefoldString {
64    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> Result<(), std::fmt::Error> {
65        write!(f, "CasefoldString(\"{}\")", self.0)
66    }
67}
68impl std::fmt::Display for CasefoldString {
69    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> Result<(), std::fmt::Error> {
70        write!(f, "{}", self.0)
71    }
72}
73impl std::ops::Deref for CasefoldString {
74    type Target = str;
75    fn deref(&self) -> &str {
76        &self.0
77    }
78}
79impl std::cmp::PartialEq for CasefoldString {
80    fn eq(&self, rhs: &Self) -> bool {
81        casefold_cmp(&self.0, &rhs.0) == std::cmp::Ordering::Equal
82    }
83}
84impl std::cmp::Ord for CasefoldString {
85    fn cmp(&self, rhs: &Self) -> std::cmp::Ordering {
86        casefold_cmp(&self.0, &rhs.0)
87    }
88}
89impl std::cmp::PartialOrd for CasefoldString {
90    fn partial_cmp(&self, rhs: &Self) -> Option<std::cmp::Ordering> {
91        Some(casefold_cmp(&self.0, &rhs.0))
92    }
93}
94
95// Nb: This trait is provided for completeness but is NOT intended to be performant.
96impl std::hash::Hash for CasefoldString {
97    fn hash<H>(&self, state: &mut H)
98    where
99        H: std::hash::Hasher,
100    {
101        for ch in casefold(self.0.chars()) {
102            ch.hash(state);
103        }
104    }
105}
106
107impl From<&str> for CasefoldString {
108    fn from(item: &str) -> Self {
109        CasefoldString(item.into())
110    }
111}
112
113#[cfg(test)]
114mod test {
115    use super::*;
116
117    #[test]
118    fn test_casefold() {
119        assert_eq!(casefold("Hello There".chars()).collect::<String>(), "hello there");
120        assert_eq!(casefold("HELLO There".chars()).collect::<String>(), "hello there");
121    }
122
123    #[test]
124    fn test_casefold_cmp() {
125        assert_eq!(casefold_cmp("Hello", "hello"), std::cmp::Ordering::Equal);
126        assert_eq!(casefold_cmp("Hello There", "hello"), std::cmp::Ordering::Greater);
127        assert_eq!(casefold_cmp("hello there", "hello"), std::cmp::Ordering::Greater);
128        assert_eq!(casefold_cmp("hello\u{00AD}", "hello"), std::cmp::Ordering::Equal);
129        assert_eq!(casefold_cmp("\u{03AA}", "\u{0399}\u{0308}"), std::cmp::Ordering::Equal);
130
131        // Gracefully handle the degenerate case where we start with modifiers
132        assert_eq!(
133            casefold_cmp("\u{308}\u{05ae}Hello", "\u{05ae}\u{308}hello"),
134            std::cmp::Ordering::Equal
135        );
136    }
137
138    #[test]
139    fn test_casefoldstring() {
140        let a = CasefoldString::new("Hello There".to_owned());
141        let b = CasefoldString::new("hello there".to_owned());
142        let c = CasefoldString::new("hello".to_owned());
143        let d = CasefoldString::new("\u{03AA}".to_owned());
144        let e = CasefoldString::new("\u{0399}\u{0308}".to_owned());
145        // Check some comparisons.
146        assert_eq!(a, a);
147        assert_eq!(a, b);
148        assert_eq!(d, e);
149        assert!(a > c);
150        assert!(b > c);
151        assert_eq!(a.0, format!("{}", a));
152        // Debug::fmt should show type to avoid confusion.
153        assert_eq!("CasefoldString(\"Hello There\")", format!("{:?}", a));
154        // Displays the same as String.
155        assert_eq!(format!("{}", "Hello There".to_owned()), format!("{}", a));
156    }
157}