char_set/
lib.rs

1// Copyright 2019 The Fuchsia Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style license that can be
3// found in the LICENSE file.
4
5use char_collection::CharCollection;
6use std::cmp::Ordering;
7
8type BitmapElement = u64;
9const BITMAP_ELEMENT_SIZE: usize = 64;
10/// This value is optimal for memory use, based on some non-scientific experimentation with
11/// real-world font files. (Though compared to `2048`, it does add a few nanoseconds to the average
12/// `contains` call.)
13const MAX_RANGE_GAP: u32 = 256;
14
15/// Represents an ordered set of code points that begin at [CharSetRange.start]. The largest
16/// allowed discontinuity between two consecutive code points in the set is [MAX_RANGE_GAP].
17#[derive(Debug, Clone, Hash, Eq, PartialEq)]
18struct CharSetRange {
19    start: u32,
20    bitmap: Vec<BitmapElement>,
21}
22
23impl CharSetRange {
24    fn new() -> CharSetRange {
25        CharSetRange { start: 0, bitmap: vec![] }
26    }
27
28    fn start(&self) -> u32 {
29        self.start
30    }
31
32    fn end(&self) -> u32 {
33        self.start + (self.bitmap.len() * BITMAP_ELEMENT_SIZE) as u32
34    }
35
36    fn is_empty(&self) -> bool {
37        self.bitmap.is_empty()
38    }
39
40    fn add(&mut self, val: u32) {
41        assert!(val >= self.start);
42        assert!(char::try_from(val).is_ok());
43
44        if self.bitmap.is_empty() {
45            self.start = val;
46        }
47
48        let pos = (val - self.start) as usize;
49        let element_pos = pos / BITMAP_ELEMENT_SIZE;
50
51        if element_pos >= self.bitmap.len() {
52            self.bitmap.resize(element_pos + 1, 0);
53        }
54
55        self.bitmap[element_pos] |= 1 << (pos % BITMAP_ELEMENT_SIZE);
56    }
57
58    fn contains(&self, c: u32) -> bool {
59        if c < self.start || c >= self.end() {
60            false
61        } else {
62            let index = c as usize - self.start as usize;
63            (self.bitmap[index / BITMAP_ELEMENT_SIZE] & (1 << (index % BITMAP_ELEMENT_SIZE))) > 0
64        }
65    }
66
67    fn iter(&self) -> CharSetRangeIterator<'_> {
68        CharSetRangeIterator { char_set_range: &self, position: self.start.clone() }
69    }
70}
71
72struct CharSetRangeIterator<'a> {
73    char_set_range: &'a CharSetRange,
74    position: u32,
75}
76
77impl Iterator for CharSetRangeIterator<'_> {
78    type Item = char;
79
80    fn next(&mut self) -> Option<char> {
81        while self.position < self.char_set_range.end() {
82            if self.char_set_range.contains(self.position) {
83                self.position += 1;
84                return Some(std::char::from_u32(self.position - 1).unwrap());
85            }
86            self.position += 1;
87        }
88        None
89    }
90}
91
92/// Represents an ordered set of code points.
93///
94/// TODO(kpozin): Add factory method that takes lazy `Iterator<Item = char>` instead of `Vec`.
95/// TODO(kpozin): Enforce `char` values.
96#[derive(Debug, Clone, Hash, Eq, PartialEq)]
97pub struct CharSet {
98    ranges: Vec<CharSetRange>,
99    len: usize,
100}
101
102impl CharSet {
103    pub fn new(mut code_points: Vec<u32>) -> CharSet {
104        let len = code_points.len();
105        code_points.sort_unstable();
106
107        let mut ranges = vec![];
108        let mut range = CharSetRange::new();
109        for c in code_points {
110            if c != 0 && !range.is_empty() && c >= range.end() + MAX_RANGE_GAP {
111                ranges.push(range);
112                range = CharSetRange::new();
113            }
114            range.add(c);
115        }
116        if !range.is_empty() {
117            ranges.push(range)
118        }
119        CharSet { ranges, len }
120    }
121
122    pub fn contains(&self, c: u32) -> bool {
123        match self.ranges.binary_search_by(|r| {
124            if r.end() < c {
125                Ordering::Less
126            } else if r.start() > c {
127                Ordering::Greater
128            } else {
129                Ordering::Equal
130            }
131        }) {
132            Ok(r) => self.ranges[r].contains(c),
133            Err(_) => false,
134        }
135    }
136
137    pub fn is_empty(&self) -> bool {
138        self.ranges.is_empty()
139    }
140
141    /// Returns the number of code points in the set.
142    pub fn len(&self) -> usize {
143        self.len
144    }
145
146    /// Iterate over all the characters in the the `CharSet` in code point order.
147    pub fn iter(&self) -> impl Iterator<Item = char> + '_ {
148        self.ranges.iter().flat_map(CharSetRange::iter)
149    }
150}
151
152impl Default for CharSet {
153    fn default() -> Self {
154        CharSet::new(vec![])
155    }
156}
157
158impl Into<CharCollection> for &CharSet {
159    fn into(self) -> CharCollection {
160        // Unwrapping is safe because we know `CharSet` iterates in order.
161        CharCollection::from_sorted_chars(self.iter()).unwrap()
162    }
163}
164
165impl From<CharCollection> for CharSet {
166    fn from(value: CharCollection) -> CharSet {
167        CharSet::from(&value)
168    }
169}
170
171impl From<&CharCollection> for CharSet {
172    fn from(value: &CharCollection) -> CharSet {
173        CharSet::new(value.iter().map(|ch| ch as u32).collect::<Vec<u32>>())
174    }
175}
176
177#[cfg(test)]
178mod tests {
179    use super::*;
180    use char_collection::char_collect;
181
182    #[test]
183    fn test_charset() {
184        let charset = CharSet::new(vec![1, 2, 3, 10, 500, 5000, 5001, 10000]);
185        assert!(!charset.contains(0));
186        assert!(charset.contains(1));
187        assert!(charset.contains(2));
188        assert!(charset.contains(3));
189        assert!(!charset.contains(4));
190        assert!(!charset.contains(9));
191        assert!(charset.contains(10));
192        assert!(charset.contains(500));
193        assert!(!charset.contains(501));
194        assert!(charset.contains(5000));
195        assert!(charset.contains(5001));
196        assert!(!charset.contains(5002));
197        assert!(!charset.contains(9999));
198        assert!(charset.contains(10000));
199        assert!(!charset.contains(10001));
200
201        assert_eq!(
202            charset.iter().map(|ch| ch as u32).collect::<Vec<u32>>(),
203            vec![1, 2, 3, 10, 500, 5000, 5001, 10000]
204        );
205    }
206
207    #[test]
208    fn test_charset_from_char_collection() {
209        let collection = char_collect!(0..=0, 2..=2, 13..=13, 32..=126);
210        let charset = CharSet::from(&collection);
211        assert!([0, 2, 13, 32, 54, 126].iter().all(|c| charset.contains(*c)));
212        assert!([1, 11, 19, 127, 10000].iter().all(|c| !charset.contains(*c)));
213    }
214}