unicase/unicode/
mod.rs

1use alloc::string::String;
2use core::cmp::Ordering;
3use core::hash::{Hash, Hasher};
4
5use self::map::lookup;
6mod map;
7
/// Newtype around a string-like value.
///
/// The trait impls in this module (`PartialEq`, `Ord`, `Hash`) operate on the
/// characters obtained by passing each `char` of the wrapped string through
/// the case-folding `lookup` table, rather than on the raw string.
#[derive(Debug, Default, Clone, Copy)]
pub struct Unicode<S>(pub S);
10
11impl<S: AsRef<str>> Unicode<S> {
12    pub fn to_folded_case(&self) -> String {
13        self.0.as_ref().chars().flat_map(lookup).collect()
14    }
15}
16
17impl<S1: AsRef<str>, S2: AsRef<str>> PartialEq<Unicode<S2>> for Unicode<S1> {
18    #[inline]
19    fn eq(&self, other: &Unicode<S2>) -> bool {
20        let mut left = self.0.as_ref().chars().flat_map(lookup);
21        let mut right = other.0.as_ref().chars().flat_map(lookup);
22
23        // inline Iterator::eq since not added until Rust 1.5
24        loop {
25            let x = match left.next() {
26                None => return right.next().is_none(),
27                Some(val) => val,
28            };
29
30            let y = match right.next() {
31                None => return false,
32                Some(val) => val,
33            };
34
35            if x != y {
36                return false;
37            }
38        }
39    }
40}
41
42impl<S: AsRef<str>> Eq for Unicode<S> {}
43
44impl<T: AsRef<str>> PartialOrd for Unicode<T> {
45    #[inline]
46    fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
47        Some(self.cmp(other))
48    }
49}
50
51impl<T: AsRef<str>> Ord for Unicode<T> {
52    #[inline]
53    fn cmp(&self, other: &Self) -> Ordering {
54        let self_chars = self.0.as_ref().chars().flat_map(lookup);
55        let other_chars = other.0.as_ref().chars().flat_map(lookup);
56        self_chars.cmp(other_chars)
57    }
58}
59
60impl<S: AsRef<str>> Hash for Unicode<S> {
61    #[inline]
62    fn hash<H: Hasher>(&self, hasher: &mut H) {
63        let mut buf = [0; 4];
64        for c in self.0.as_ref().chars().flat_map(|c| lookup(c)) {
65            let len = char_to_utf8(c, &mut buf);
66            // we can't use `write(buf)` because the ASCII variant uses
67            // `write_u8`. The docs for Hash say that's technically different.
68            // ¯\_(ツ)_/¯
69            for &b in &buf[..len] {
70                hasher.write_u8(b);
71            }
72        }
73        // prefix-freedom
74        hasher.write_u8(0xFF);
75    }
76}
77
/// Encodes `c` as UTF-8 into the front of `dst`.
///
/// Returns the number of bytes written (between 1 and 4). Bytes of `dst`
/// beyond the returned length are left untouched.
#[inline]
fn char_to_utf8(c: char, dst: &mut [u8; 4]) -> usize {
    // A `char` never needs more than four UTF-8 bytes, so the four-byte
    // buffer is always large enough and `encode_utf8` cannot panic here.
    c.encode_utf8(&mut dst[..]).len()
}
106
// Private module wrapping the enum so that the enum itself can be declared
// `pub` (needed to keep the privacy checker happy).
mod fold {
    /// A tiny by-value iterator over zero to three `char`s.
    ///
    /// Characters are yielded front to back, in the order they appear in the
    /// variant's fields. Each call to `next` shrinks the value to the next
    /// smaller variant until it reaches `Zero`.
    #[derive(Clone, Copy)]
    pub enum Fold {
        /// Exhausted: nothing left to yield.
        Zero,
        /// One character remaining.
        One(char),
        /// Two characters remaining.
        Two(char, char),
        /// Three characters remaining.
        Three(char, char, char),
    }

    impl Iterator for Fold {
        type Item = char;

        /// Yields the first remaining character and keeps the rest, if any.
        #[inline]
        fn next(&mut self) -> Option<char> {
            match *self {
                Fold::Zero => None,
                Fold::One(one) => {
                    *self = Fold::Zero;
                    Some(one)
                }
                Fold::Two(one, two) => {
                    *self = Fold::One(two);
                    Some(one)
                }
                Fold::Three(one, two, three) => {
                    // Yield the leading char and keep the trailing pair, the
                    // same front-to-back order `Two` uses. Yielding `three`
                    // first would scramble every three-character folding.
                    *self = Fold::Two(two, three);
                    Some(one)
                }
            }
        }

        /// Exact number of characters still to be yielded.
        #[inline]
        fn size_hint(&self) -> (usize, Option<usize>) {
            match *self {
                Fold::Zero => (0, Some(0)),
                Fold::One(..) => (1, Some(1)),
                Fold::Two(..) => (2, Some(2)),
                Fold::Three(..) => (3, Some(3)),
            }
        }
    }

    impl From<(char,)> for Fold {
        /// Builds a one-character fold.
        #[inline]
        fn from((one,): (char,)) -> Fold {
            Fold::One(one)
        }
    }

    impl From<(char, char)> for Fold {
        /// Builds a two-character fold, preserving tuple order.
        #[inline]
        fn from((one, two): (char, char)) -> Fold {
            Fold::Two(one, two)
        }
    }

    impl From<(char, char, char)> for Fold {
        /// Builds a three-character fold, preserving tuple order.
        #[inline]
        fn from((one, two, three): (char, char, char)) -> Fold {
            Fold::Three(one, two, three)
        }
    }
}
170
#[cfg(test)]
mod tests {
    use super::Unicode;

    // Asserts that two strings compare equal once both are wrapped in
    // `Unicode`, i.e. after case folding.
    macro_rules! eq {
        ($left:expr, $right:expr) => {{
            assert_eq!(Unicode($left), Unicode($right));
        }};
    }

    #[test]
    fn test_ascii_folding() {
        eq!("foo bar", "FoO BAR");
    }

    #[test]
    fn test_simple_case_folding() {
        eq!("στιγμας", "στιγμασ");
    }

    #[test]
    fn test_full_case_folding() {
        // U+FB02 (LATIN SMALL LIGATURE FL) full-folds to "fl". Written as an
        // escape so that text normalisation cannot silently turn the check
        // into a comparison of two identical literals.
        eq!("\u{FB02}our", "flour");
        eq!("Maße", "MASSE");
        eq!("ᾲ στο διάολο", "ὰι στο διάολο");
    }

    #[test]
    fn test_to_folded_case() {
        assert_eq!(Unicode("Maße").to_folded_case(), "masse");
    }

    #[cfg(feature = "nightly")]
    #[bench]
    fn bench_ascii_folding(b: &mut ::test::Bencher) {
        b.bytes = b"foo bar".len() as u64;
        b.iter(|| eq!("foo bar", "FoO BAR"));
    }

    #[cfg(feature = "nightly")]
    #[bench]
    fn bench_simple_case_folding(b: &mut ::test::Bencher) {
        b.bytes = "στιγμας".len() as u64;
        b.iter(|| eq!("στιγμας", "στιγμασ"));
    }
}
216}