1use char_collection::CharCollection;
6use std::cmp::Ordering;
7
8type BitmapElement = u64;
9const BITMAP_ELEMENT_SIZE: usize = 64;
10const MAX_RANGE_GAP: u32 = 256;
14
15#[derive(Debug, Clone, Hash, Eq, PartialEq)]
18struct CharSetRange {
19 start: u32,
20 bitmap: Vec<BitmapElement>,
21}
22
23impl CharSetRange {
24 fn new() -> CharSetRange {
25 CharSetRange { start: 0, bitmap: vec![] }
26 }
27
28 fn start(&self) -> u32 {
29 self.start
30 }
31
32 fn end(&self) -> u32 {
33 self.start + (self.bitmap.len() * BITMAP_ELEMENT_SIZE) as u32
34 }
35
36 fn is_empty(&self) -> bool {
37 self.bitmap.is_empty()
38 }
39
40 fn add(&mut self, val: u32) {
41 assert!(val >= self.start);
42 assert!(char::try_from(val).is_ok());
43
44 if self.bitmap.is_empty() {
45 self.start = val;
46 }
47
48 let pos = (val - self.start) as usize;
49 let element_pos = pos / BITMAP_ELEMENT_SIZE;
50
51 if element_pos >= self.bitmap.len() {
52 self.bitmap.resize(element_pos + 1, 0);
53 }
54
55 self.bitmap[element_pos] |= 1 << (pos % BITMAP_ELEMENT_SIZE);
56 }
57
58 fn contains(&self, c: u32) -> bool {
59 if c < self.start || c >= self.end() {
60 false
61 } else {
62 let index = c as usize - self.start as usize;
63 (self.bitmap[index / BITMAP_ELEMENT_SIZE] & (1 << (index % BITMAP_ELEMENT_SIZE))) > 0
64 }
65 }
66
67 fn iter(&self) -> CharSetRangeIterator<'_> {
68 CharSetRangeIterator { char_set_range: &self, position: self.start.clone() }
69 }
70}
71
72struct CharSetRangeIterator<'a> {
73 char_set_range: &'a CharSetRange,
74 position: u32,
75}
76
77impl Iterator for CharSetRangeIterator<'_> {
78 type Item = char;
79
80 fn next(&mut self) -> Option<char> {
81 while self.position < self.char_set_range.end() {
82 if self.char_set_range.contains(self.position) {
83 self.position += 1;
84 return Some(std::char::from_u32(self.position - 1).unwrap());
85 }
86 self.position += 1;
87 }
88 None
89 }
90}
91
92#[derive(Debug, Clone, Hash, Eq, PartialEq)]
97pub struct CharSet {
98 ranges: Vec<CharSetRange>,
99 len: usize,
100}
101
102impl CharSet {
103 pub fn new(mut code_points: Vec<u32>) -> CharSet {
104 let len = code_points.len();
105 code_points.sort_unstable();
106
107 let mut ranges = vec![];
108 let mut range = CharSetRange::new();
109 for c in code_points {
110 if c != 0 && !range.is_empty() && c >= range.end() + MAX_RANGE_GAP {
111 ranges.push(range);
112 range = CharSetRange::new();
113 }
114 range.add(c);
115 }
116 if !range.is_empty() {
117 ranges.push(range)
118 }
119 CharSet { ranges, len }
120 }
121
122 pub fn contains(&self, c: u32) -> bool {
123 match self.ranges.binary_search_by(|r| {
124 if r.end() < c {
125 Ordering::Less
126 } else if r.start() > c {
127 Ordering::Greater
128 } else {
129 Ordering::Equal
130 }
131 }) {
132 Ok(r) => self.ranges[r].contains(c),
133 Err(_) => false,
134 }
135 }
136
137 pub fn is_empty(&self) -> bool {
138 self.ranges.is_empty()
139 }
140
141 pub fn len(&self) -> usize {
143 self.len
144 }
145
146 pub fn iter(&self) -> impl Iterator<Item = char> + '_ {
148 self.ranges.iter().flat_map(CharSetRange::iter)
149 }
150}
151
152impl Default for CharSet {
153 fn default() -> Self {
154 CharSet::new(vec![])
155 }
156}
157
158impl Into<CharCollection> for &CharSet {
159 fn into(self) -> CharCollection {
160 CharCollection::from_sorted_chars(self.iter()).unwrap()
162 }
163}
164
165impl From<CharCollection> for CharSet {
166 fn from(value: CharCollection) -> CharSet {
167 CharSet::from(&value)
168 }
169}
170
171impl From<&CharCollection> for CharSet {
172 fn from(value: &CharCollection) -> CharSet {
173 CharSet::new(value.iter().map(|ch| ch as u32).collect::<Vec<u32>>())
174 }
175}
176
177#[cfg(test)]
178mod tests {
179 use super::*;
180 use char_collection::char_collect;
181
182 #[test]
183 fn test_charset() {
184 let charset = CharSet::new(vec![1, 2, 3, 10, 500, 5000, 5001, 10000]);
185 assert!(!charset.contains(0));
186 assert!(charset.contains(1));
187 assert!(charset.contains(2));
188 assert!(charset.contains(3));
189 assert!(!charset.contains(4));
190 assert!(!charset.contains(9));
191 assert!(charset.contains(10));
192 assert!(charset.contains(500));
193 assert!(!charset.contains(501));
194 assert!(charset.contains(5000));
195 assert!(charset.contains(5001));
196 assert!(!charset.contains(5002));
197 assert!(!charset.contains(9999));
198 assert!(charset.contains(10000));
199 assert!(!charset.contains(10001));
200
201 assert_eq!(
202 charset.iter().map(|ch| ch as u32).collect::<Vec<u32>>(),
203 vec![1, 2, 3, 10, 500, 5000, 5001, 10000]
204 );
205 }
206
207 #[test]
208 fn test_charset_from_char_collection() {
209 let collection = char_collect!(0..=0, 2..=2, 13..=13, 32..=126);
210 let charset = CharSet::from(&collection);
211 assert!([0, 2, 13, 32, 54, 126].iter().all(|c| charset.contains(*c)));
212 assert!([1, 11, 19, 127, 10000].iter().all(|c| !charset.contains(*c)));
213 }
214}