fxfs_unicode/
lookup.rs

1// Copyright 2024 The Fuchsia Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style license that can be
3// found in the LICENSE file.
4
5//! A set of functions for parsing generated unicode data (see generator.rs and unicode_gen.rs)
6use crate::unicode_gen::*;
7use std::cmp::Ordering;
8use types::MappingOffset;
9
10/// Lookup the CCC for a code point.
11/// This aims to minimise data set size whilst maintaining performance.
12/// Worst case is O(log2(2^16)) = O(16).
13pub fn ccc(ch: char) -> u8 {
14    // CCC data is encoded in 'planes' using LUT for top two bytes of u32 unicode char.
15    let plane_offset = CCC_PLANE_OFFSETS[ch as u32 as usize >> 16];
16    let plane_end = CCC_PLANE_OFFSETS[((ch as u32 >> 16) + 1) as usize];
17    let plane = &CCC_PLANE_DATA[plane_offset..plane_end];
18    let ch_lsb = ch as u32 as u16;
19    let ix = plane
20        .binary_search_by(|entry| match entry.low_plane_end_ix.cmp(&ch_lsb) {
21            Ordering::Equal => Ordering::Less,
22            x => x,
23        })
24        .unwrap_or_else(|x| x);
25    if ix < plane.len() {
26        let entry = &plane[ix];
27        if entry.range().contains(&ch_lsb) {
28            return entry.ccc;
29        }
30    }
31    0
32}
33
34/// Both decomposition and casefold use the same lookup mechanism
35fn string_table_lookup(
36    ch: char,
37    plane_offsets: &[usize],
38    plane_data: &[MappingOffset],
39    string_data: &'static [u8],
40) -> Option<&'static str> {
41    // Unicode has 17 planes (0..=16). we store one extra entry to calculate plane_end easily.
42    let plane_start = plane_offsets[(ch as u32 >> 16) as usize];
43    let plane_end = plane_offsets[((ch as u32 >> 16) + 1) as usize];
44    let plane = &plane_data[plane_start..plane_end];
45    // Look up the lower 16-bit in the plane using binary search.
46    let ch_lsb = ch as u32 as u16;
47    if let Ok(ix) = plane.binary_search_by(|entry| entry.low_plane_ix.cmp(&ch_lsb)) {
48        // Read the start of the string and look at the next entry to find the end of the string.
49        let offset = plane[ix].offset as usize;
50        let end = if plane.len() == ix + 1 {
51            if plane_end < plane_data.len() {
52                plane_data[plane_end].offset as usize
53            } else {
54                string_data.len()
55            }
56        } else {
57            plane[ix + 1].offset as usize
58        };
59        let out = &string_data[offset..end];
60        return Some(unsafe {
61            // Safety: This comes from official unicode decompositions.
62            std::str::from_utf8_unchecked(out)
63        });
64    }
65    return None;
66}
67
68/// Returns the unicode decomposition of a given codepoint.
69pub fn decomposition(ch: char) -> Option<&'static str> {
70    string_table_lookup(ch, DECOMP_PLANE_OFFSETS, DECOMP_PLANE_DATA, DECOMP_STRINGS)
71}
72
73/// Returns true if a codepoint is a default ignorable codepoint.
74pub fn default_ignorable(ch: char) -> bool {
75    // Unicode has 17 planes (0..=16). we store one extra entry to calculate plane_end easily.
76    let plane_offset = IGNORABLE_PLANE_OFFSETS[ch as u32 as usize >> 16];
77    let plane_end = IGNORABLE_PLANE_OFFSETS[((ch as u32 >> 16) + 1) as usize];
78    let plane = &IGNORABLE_PLANE_DATA[plane_offset..plane_end];
79    let ch_lsb = ch as u32 as u16;
80    let ix = plane
81        .binary_search_by(|entry| match entry.end.cmp(&ch_lsb) {
82            Ordering::Equal => Ordering::Less,
83            x => x,
84        })
85        .unwrap_or_else(|x| x);
86    if ix < plane.len() {
87        plane[ix].contains(&ch_lsb)
88    } else {
89        false
90    }
91}
92
93pub fn casefold(ch: char) -> Option<&'static str> {
94    string_table_lookup(ch, CASEFOLD_PLANE_OFFSETS, CASEFOLD_PLANE_DATA, CASEFOLD_STRINGS)
95}
96
97#[cfg(test)]
98mod test {
99    use super::*;
100
101    #[test]
102    fn test_ccc() {
103        assert_eq!(ccc('\u{0}'), 0);
104        assert_eq!(ccc('A'), 0);
105        assert_eq!(ccc('\u{059a}'), 222);
106        assert_eq!(ccc('\u{2cee}'), 0);
107        assert_eq!(ccc('\u{2cef}'), 230);
108        assert_eq!(ccc('\u{2cf0}'), 230);
109        assert_eq!(ccc('\u{2cf1}'), 230);
110        assert_eq!(ccc('\u{2cf2}'), 0);
111    }
112
113    #[test]
114    fn test_decomposition() {
115        assert_eq!(decomposition('\u{f900}'), Some("\u{8c48}"));
116        assert_eq!(decomposition('A'), None);
117        assert_eq!(decomposition('\u{1f82}'), Some("α\u{313}\u{300}\u{345}"));
118    }
119
120    #[test]
121    fn test_deafault_ignorable() {
122        assert!(!default_ignorable('\u{00ac}'));
123        assert!(default_ignorable('\u{00ad}'));
124        assert!(!default_ignorable('\u{00ae}'));
125        assert!(!default_ignorable('\u{115e}'));
126        assert!(default_ignorable('\u{115f}'));
127        assert!(default_ignorable('\u{1160}'));
128        assert!(!default_ignorable('\u{1161}'));
129    }
130
131    #[test]
132    fn test_casefold() {
133        assert_eq!(casefold('A'), Some("a"));
134        assert_eq!(casefold('a'), None);
135    }
136
137    #[test]
138    fn test_last_char() {
139        // We look up strings end by incrementing offset + 1 from string start so we need make sure
140        // we handle the case where offset is the last string in the plane or the last string
141        // overall.
142        assert_eq!(decomposition('\u{10000}'), None);
143        assert_eq!(decomposition('\u{1fbca}'), None);
144        assert_eq!(decomposition('\u{1fbf9}'), None);
145        assert_eq!(decomposition('\u{20000}'), None);
146        assert_eq!(decomposition('\u{2fa1d}'), Some("\u{2a600}"));
147        assert_eq!(casefold('\u{ff39}'), Some("\u{ff59}"));
148        assert_eq!(casefold('\u{ff3a}'), Some("\u{ff5a}"));
149        assert_eq!(casefold('\u{ff3b}'), None);
150        assert_eq!(casefold('\u{10400}'), Some("\u{10428}"));
151        assert_eq!(casefold('\u{1e920}'), Some("\u{1e942}"));
152        assert_eq!(casefold('\u{1e921}'), Some("\u{1e943}"));
153        assert_eq!(casefold('\u{1e922}'), None);
154    }
155
156    #[test]
157    fn test_high_char() {
158        // This is the maximum unicode codepoint.
159        assert_eq!(decomposition('\u{10FFFF}'), None);
160        assert_eq!(casefold('\u{10FFFF}'), None);
161        assert_eq!(default_ignorable('\u{10FFFF}'), false);
162    }
163}